summaryrefslogtreecommitdiffstats
path: root/src/EigenUnsupported
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/EigenUnsupported/AdolcForward159
-rw-r--r--src/EigenUnsupported/AlignedVector3234
-rw-r--r--src/EigenUnsupported/ArpackSupport30
-rw-r--r--src/EigenUnsupported/AutoDiff46
-rw-r--r--src/EigenUnsupported/BVH95
-rw-r--r--src/EigenUnsupported/CMakeLists.txt32
-rw-r--r--src/EigenUnsupported/CXX11/CMakeLists.txt8
-rw-r--r--src/EigenUnsupported/CXX11/Tensor137
-rw-r--r--src/EigenUnsupported/CXX11/TensorSymmetry42
-rw-r--r--src/EigenUnsupported/CXX11/ThreadPool74
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/README.md1815
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/Tensor.h554
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorArgMax.h329
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorAssign.h247
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorBase.h1176
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorBlock.h1559
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorBroadcasting.h1093
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorChipping.h518
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorConcatenation.h377
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorContraction.h1023
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorContractionBlocking.h73
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorContractionCuda.h6
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorContractionGpu.h1413
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorContractionMapper.h575
-rwxr-xr-xsrc/EigenUnsupported/CXX11/src/Tensor/TensorContractionSycl.h1650
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorContractionThreadPool.h1679
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorConversion.h456
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorConvolution.h1132
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorConvolutionSycl.h544
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorCostModel.h214
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorCustomOp.h347
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorDevice.h137
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceCuda.h6
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceDefault.h104
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceGpu.h389
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceSycl.h1048
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceThreadPool.h409
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorDimensionList.h236
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorDimensions.h490
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorEvalTo.h236
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorEvaluator.h983
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorExecutor.h703
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorExpr.h388
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorFFT.h669
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorFixedSize.h379
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorForcedEval.h237
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorForwardDeclarations.h191
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorFunctors.h488
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorGenerator.h302
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorGlobalFunctions.h33
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorGpuHipCudaDefines.h99
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h44
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorIO.h79
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorImagePatch.h603
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorIndexList.h738
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorInflation.h247
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorInitializer.h82
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorIntDiv.h263
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorLayoutSwap.h216
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorMacros.h98
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorMap.h327
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorMeta.h311
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorMorphing.h1102
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorPadding.h708
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorPatch.h291
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorRandom.h322
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorReduction.h998
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorReductionCuda.h6
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorReductionGpu.h966
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorReductionSycl.h582
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorRef.h454
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorReverse.h465
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorScan.h528
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorScanSycl.h513
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorShuffling.h471
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorStorage.h161
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorStriding.h346
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorTrace.h303
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorTraits.h264
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorUInt128.h249
-rw-r--r--src/EigenUnsupported/CXX11/src/Tensor/TensorVolumePatch.h629
-rw-r--r--src/EigenUnsupported/CXX11/src/TensorSymmetry/DynamicSymmetry.h293
-rw-r--r--src/EigenUnsupported/CXX11/src/TensorSymmetry/StaticSymmetry.h236
-rw-r--r--src/EigenUnsupported/CXX11/src/TensorSymmetry/Symmetry.h338
-rw-r--r--src/EigenUnsupported/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h669
-rw-r--r--src/EigenUnsupported/CXX11/src/ThreadPool/Barrier.h67
-rw-r--r--src/EigenUnsupported/CXX11/src/ThreadPool/EventCount.h249
-rw-r--r--src/EigenUnsupported/CXX11/src/ThreadPool/NonBlockingThreadPool.h486
-rw-r--r--src/EigenUnsupported/CXX11/src/ThreadPool/RunQueue.h236
-rw-r--r--src/EigenUnsupported/CXX11/src/ThreadPool/ThreadCancel.h23
-rw-r--r--src/EigenUnsupported/CXX11/src/ThreadPool/ThreadEnvironment.h40
-rw-r--r--src/EigenUnsupported/CXX11/src/ThreadPool/ThreadLocal.h301
-rw-r--r--src/EigenUnsupported/CXX11/src/ThreadPool/ThreadPoolInterface.h48
-rw-r--r--src/EigenUnsupported/CXX11/src/ThreadPool/ThreadYield.h20
-rw-r--r--src/EigenUnsupported/CXX11/src/util/CXX11Meta.h537
-rw-r--r--src/EigenUnsupported/CXX11/src/util/CXX11Workarounds.h88
-rw-r--r--src/EigenUnsupported/CXX11/src/util/EmulateArray.h261
-rw-r--r--src/EigenUnsupported/CXX11/src/util/MaxSizeVector.h158
-rw-r--r--src/EigenUnsupported/EulerAngles43
-rw-r--r--src/EigenUnsupported/FFT419
-rw-r--r--src/EigenUnsupported/IterativeSolvers51
-rw-r--r--src/EigenUnsupported/KroneckerProduct36
-rw-r--r--src/EigenUnsupported/LevenbergMarquardt49
-rw-r--r--src/EigenUnsupported/MPRealSupport213
-rw-r--r--src/EigenUnsupported/MatrixFunctions504
-rw-r--r--src/EigenUnsupported/MoreVectorization24
-rw-r--r--src/EigenUnsupported/NonLinearOptimization140
-rw-r--r--src/EigenUnsupported/NumericalDiff56
-rw-r--r--src/EigenUnsupported/OpenGLSupport322
-rw-r--r--src/EigenUnsupported/Polynomials137
-rw-r--r--src/EigenUnsupported/Skyline39
-rw-r--r--src/EigenUnsupported/SparseExtra54
-rw-r--r--src/EigenUnsupported/SpecialFunctions103
-rw-r--r--src/EigenUnsupported/Splines35
-rw-r--r--src/EigenUnsupported/src/AutoDiff/AutoDiffJacobian.h108
-rwxr-xr-xsrc/EigenUnsupported/src/AutoDiff/AutoDiffScalar.h730
-rw-r--r--src/EigenUnsupported/src/AutoDiff/AutoDiffVector.h220
-rw-r--r--src/EigenUnsupported/src/BVH/BVAlgorithms.h293
-rw-r--r--src/EigenUnsupported/src/BVH/KdBVH.h223
-rw-r--r--src/EigenUnsupported/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h790
-rw-r--r--src/EigenUnsupported/src/EulerAngles/CMakeLists.txt6
-rw-r--r--src/EigenUnsupported/src/EulerAngles/EulerAngles.h355
-rw-r--r--src/EigenUnsupported/src/EulerAngles/EulerSystem.h305
-rw-r--r--src/EigenUnsupported/src/FFT/ei_fftw_impl.h261
-rw-r--r--src/EigenUnsupported/src/FFT/ei_kissfft_impl.h449
-rw-r--r--src/EigenUnsupported/src/IterativeSolvers/ConstrainedConjGrad.h187
-rw-r--r--src/EigenUnsupported/src/IterativeSolvers/DGMRES.h511
-rw-r--r--src/EigenUnsupported/src/IterativeSolvers/GMRES.h335
-rwxr-xr-xsrc/EigenUnsupported/src/IterativeSolvers/IDRS.h436
-rw-r--r--src/EigenUnsupported/src/IterativeSolvers/IncompleteLU.h90
-rw-r--r--src/EigenUnsupported/src/IterativeSolvers/IterationController.h154
-rw-r--r--src/EigenUnsupported/src/IterativeSolvers/MINRES.h267
-rw-r--r--src/EigenUnsupported/src/IterativeSolvers/Scaling.h193
-rw-r--r--src/EigenUnsupported/src/KroneckerProduct/KroneckerTensorProduct.h305
-rw-r--r--src/EigenUnsupported/src/LevenbergMarquardt/CopyrightMINPACK.txt52
-rw-r--r--src/EigenUnsupported/src/LevenbergMarquardt/LMcovar.h84
-rw-r--r--src/EigenUnsupported/src/LevenbergMarquardt/LMonestep.h202
-rw-r--r--src/EigenUnsupported/src/LevenbergMarquardt/LMpar.h160
-rw-r--r--src/EigenUnsupported/src/LevenbergMarquardt/LMqrsolv.h188
-rw-r--r--src/EigenUnsupported/src/LevenbergMarquardt/LevenbergMarquardt.h396
-rw-r--r--src/EigenUnsupported/src/MatrixFunctions/MatrixExponential.h441
-rw-r--r--src/EigenUnsupported/src/MatrixFunctions/MatrixFunction.h569
-rw-r--r--src/EigenUnsupported/src/MatrixFunctions/MatrixLogarithm.h373
-rw-r--r--src/EigenUnsupported/src/MatrixFunctions/MatrixPower.h705
-rw-r--r--src/EigenUnsupported/src/MatrixFunctions/MatrixSquareRoot.h368
-rw-r--r--src/EigenUnsupported/src/MatrixFunctions/StemFunction.h117
-rw-r--r--src/EigenUnsupported/src/MoreVectorization/MathFunctions.h95
-rw-r--r--src/EigenUnsupported/src/NonLinearOptimization/HybridNonLinearSolver.h601
-rw-r--r--src/EigenUnsupported/src/NonLinearOptimization/LevenbergMarquardt.h657
-rw-r--r--src/EigenUnsupported/src/NonLinearOptimization/chkder.h66
-rw-r--r--src/EigenUnsupported/src/NonLinearOptimization/covar.h70
-rw-r--r--src/EigenUnsupported/src/NonLinearOptimization/dogleg.h107
-rw-r--r--src/EigenUnsupported/src/NonLinearOptimization/fdjac1.h79
-rw-r--r--src/EigenUnsupported/src/NonLinearOptimization/lmpar.h298
-rw-r--r--src/EigenUnsupported/src/NonLinearOptimization/qrsolv.h91
-rw-r--r--src/EigenUnsupported/src/NonLinearOptimization/r1mpyq.h30
-rw-r--r--src/EigenUnsupported/src/NonLinearOptimization/r1updt.h99
-rw-r--r--src/EigenUnsupported/src/NonLinearOptimization/rwupdt.h49
-rw-r--r--src/EigenUnsupported/src/NumericalDiff/NumericalDiff.h130
-rw-r--r--src/EigenUnsupported/src/Polynomials/Companion.h280
-rw-r--r--src/EigenUnsupported/src/Polynomials/PolynomialSolver.h428
-rw-r--r--src/EigenUnsupported/src/Polynomials/PolynomialUtils.h143
-rw-r--r--src/EigenUnsupported/src/Skyline/SkylineInplaceLU.h352
-rw-r--r--src/EigenUnsupported/src/Skyline/SkylineMatrix.h862
-rw-r--r--src/EigenUnsupported/src/Skyline/SkylineMatrixBase.h212
-rw-r--r--src/EigenUnsupported/src/Skyline/SkylineProduct.h295
-rw-r--r--src/EigenUnsupported/src/Skyline/SkylineStorage.h259
-rw-r--r--src/EigenUnsupported/src/Skyline/SkylineUtil.h89
-rw-r--r--src/EigenUnsupported/src/SparseExtra/BlockOfDynamicSparseMatrix.h122
-rw-r--r--src/EigenUnsupported/src/SparseExtra/BlockSparseMatrix.h1079
-rw-r--r--src/EigenUnsupported/src/SparseExtra/DynamicSparseMatrix.h404
-rw-r--r--src/EigenUnsupported/src/SparseExtra/MarketIO.h282
-rw-r--r--src/EigenUnsupported/src/SparseExtra/MatrixMarketIterator.h247
-rw-r--r--src/EigenUnsupported/src/SparseExtra/RandomSetter.h349
-rw-r--r--src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsArrayAPI.h286
-rw-r--r--src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsBFloat16.h68
-rw-r--r--src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsFunctors.h357
-rw-r--r--src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsHalf.h66
-rw-r--r--src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsImpl.h1959
-rw-r--r--src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsPacketMath.h118
-rw-r--r--src/EigenUnsupported/src/SpecialFunctions/HipVectorCompatibility.h67
-rw-r--r--src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsArrayAPI.h167
-rw-r--r--src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsBFloat16.h58
-rw-r--r--src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsFunctors.h330
-rw-r--r--src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsHalf.h58
-rw-r--r--src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsImpl.h2045
-rw-r--r--src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsPacketMath.h79
-rw-r--r--src/EigenUnsupported/src/SpecialFunctions/arch/AVX/BesselFunctions.h46
-rw-r--r--src/EigenUnsupported/src/SpecialFunctions/arch/AVX/SpecialFunctions.h16
-rw-r--r--src/EigenUnsupported/src/SpecialFunctions/arch/AVX512/BesselFunctions.h46
-rw-r--r--src/EigenUnsupported/src/SpecialFunctions/arch/AVX512/SpecialFunctions.h16
-rw-r--r--src/EigenUnsupported/src/SpecialFunctions/arch/GPU/SpecialFunctions.h369
-rw-r--r--src/EigenUnsupported/src/SpecialFunctions/arch/NEON/BesselFunctions.h54
-rw-r--r--src/EigenUnsupported/src/SpecialFunctions/arch/NEON/SpecialFunctions.h34
-rw-r--r--src/EigenUnsupported/src/Splines/Spline.h507
-rw-r--r--src/EigenUnsupported/src/Splines/SplineFitting.h431
-rw-r--r--src/EigenUnsupported/src/Splines/SplineFwd.h93
197 files changed, 68723 insertions, 0 deletions
diff --git a/src/EigenUnsupported/AdolcForward b/src/EigenUnsupported/AdolcForward
new file mode 100644
index 0000000..56caeae
--- /dev/null
+++ b/src/EigenUnsupported/AdolcForward
@@ -0,0 +1,159 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <g.gael@free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ADLOC_FORWARD
+#define EIGEN_ADLOC_FORWARD
+
+//--------------------------------------------------------------------------------
+//
+// This file provides support for adolc's adouble type in forward mode.
+// ADOL-C is a C++ automatic differentiation library,
+// see https://projects.coin-or.org/ADOL-C for more information.
+//
+// Note that the maximal number of directions is controlled by
+// the preprocessor token NUMBER_DIRECTIONS. The default is 2.
+//
+//--------------------------------------------------------------------------------
+
+#define ADOLC_TAPELESS
+#ifndef NUMBER_DIRECTIONS
+# define NUMBER_DIRECTIONS 2
+#endif
+#include <adolc/adtl.h>
+
+// adolc defines some very stupid macros:
+#if defined(malloc)
+# undef malloc
+#endif
+
+#if defined(calloc)
+# undef calloc
+#endif
+
+#if defined(realloc)
+# undef realloc
+#endif
+
+#include "../../Eigen/Core"
+
+namespace Eigen {
+
+/**
+ * \defgroup AdolcForward_Module Adolc forward module
+ * This module provides support for adolc's adouble type in forward mode.
+ * ADOL-C is a C++ automatic differentiation library,
+ * see https://projects.coin-or.org/ADOL-C for more information.
+ * It mainly consists in:
+ * - a struct Eigen::NumTraits<adtl::adouble> specialization
+ * - overloads of internal::* math function for adtl::adouble type.
+ *
+ * Note that the maximal number of directions is controlled by
+ * the preprocessor token NUMBER_DIRECTIONS. The default is 2.
+ *
+ * \code
+ * #include <unsupported/Eigen/AdolcSupport>
+ * \endcode
+ */
+ //@{
+
+} // namespace Eigen
+
+// Eigen's require a few additional functions which must be defined in the same namespace
+// than the custom scalar type own namespace
+namespace adtl {
+
+inline const adouble& conj(const adouble& x) { return x; }
+inline const adouble& real(const adouble& x) { return x; }
+inline adouble imag(const adouble&) { return 0.; }
+inline adouble abs(const adouble& x) { return fabs(x); }
+inline adouble abs2(const adouble& x) { return x*x; }
+
+inline bool (isinf)(const adouble& x) { return (Eigen::numext::isinf)(x.getValue()); }
+inline bool (isnan)(const adouble& x) { return (Eigen::numext::isnan)(x.getValue()); }
+
+}
+
+namespace Eigen {
+
+template<> struct NumTraits<adtl::adouble>
+ : NumTraits<double>
+{
+ typedef adtl::adouble Real;
+ typedef adtl::adouble NonInteger;
+ typedef adtl::adouble Nested;
+ enum {
+ IsComplex = 0,
+ IsInteger = 0,
+ IsSigned = 1,
+ RequireInitialization = 1,
+ ReadCost = 1,
+ AddCost = 1,
+ MulCost = 1
+ };
+};
+
+template<typename Functor> class AdolcForwardJacobian : public Functor
+{
+ typedef adtl::adouble ActiveScalar;
+public:
+
+ AdolcForwardJacobian() : Functor() {}
+ AdolcForwardJacobian(const Functor& f) : Functor(f) {}
+
+ // forward constructors
+ template<typename T0>
+ AdolcForwardJacobian(const T0& a0) : Functor(a0) {}
+ template<typename T0, typename T1>
+ AdolcForwardJacobian(const T0& a0, const T1& a1) : Functor(a0, a1) {}
+ template<typename T0, typename T1, typename T2>
+ AdolcForwardJacobian(const T0& a0, const T1& a1, const T1& a2) : Functor(a0, a1, a2) {}
+
+ typedef typename Functor::InputType InputType;
+ typedef typename Functor::ValueType ValueType;
+ typedef typename Functor::JacobianType JacobianType;
+
+ typedef Matrix<ActiveScalar, InputType::SizeAtCompileTime, 1> ActiveInput;
+ typedef Matrix<ActiveScalar, ValueType::SizeAtCompileTime, 1> ActiveValue;
+
+ void operator() (const InputType& x, ValueType* v, JacobianType* _jac) const
+ {
+ eigen_assert(v!=0);
+ if (!_jac)
+ {
+ Functor::operator()(x, v);
+ return;
+ }
+
+ JacobianType& jac = *_jac;
+
+ ActiveInput ax = x.template cast<ActiveScalar>();
+ ActiveValue av(jac.rows());
+
+ for (int j=0; j<jac.cols(); j++)
+ for (int i=0; i<jac.cols(); i++)
+ ax[i].setADValue(j, i==j ? 1 : 0);
+
+ Functor::operator()(ax, &av);
+
+ for (int i=0; i<jac.rows(); i++)
+ {
+ (*v)[i] = av[i].getValue();
+ for (int j=0; j<jac.cols(); j++)
+ jac.coeffRef(i,j) = av[i].getADValue(j);
+ }
+ }
+protected:
+
+};
+
+//@}
+
+}
+
+#endif // EIGEN_ADLOC_FORWARD
diff --git a/src/EigenUnsupported/AlignedVector3 b/src/EigenUnsupported/AlignedVector3
new file mode 100644
index 0000000..4fa1842
--- /dev/null
+++ b/src/EigenUnsupported/AlignedVector3
@@ -0,0 +1,234 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <g.gael@free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ALIGNED_VECTOR3
+#define EIGEN_ALIGNED_VECTOR3
+
+#include "../../Eigen/Geometry"
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+namespace Eigen {
+
+/**
+ * \defgroup AlignedVector3_Module Aligned vector3 module
+ *
+ * \code
+ * #include <unsupported/Eigen/AlignedVector3>
+ * \endcode
+ */
+ //@{
+
+
+/** \class AlignedVector3
+ *
+ * \brief A vectorization friendly 3D vector
+ *
+ * This class represents a 3D vector internally using a 4D vector
+ * such that vectorization can be seamlessly enabled. Of course,
+ * the same result can be achieved by directly using a 4D vector.
+ * This class makes this process simpler.
+ *
+ */
+// TODO specialize Cwise
+template<typename _Scalar> class AlignedVector3;
+
+namespace internal {
+template<typename _Scalar> struct traits<AlignedVector3<_Scalar> >
+ : traits<Matrix<_Scalar,3,1,0,4,1> >
+{
+};
+}
+
+template<typename _Scalar> class AlignedVector3
+ : public MatrixBase<AlignedVector3<_Scalar> >
+{
+ typedef Matrix<_Scalar,4,1> CoeffType;
+ CoeffType m_coeffs;
+ public:
+
+ typedef MatrixBase<AlignedVector3<_Scalar> > Base;
+ EIGEN_DENSE_PUBLIC_INTERFACE(AlignedVector3)
+ using Base::operator*;
+
+ inline Index rows() const { return 3; }
+ inline Index cols() const { return 1; }
+
+ Scalar* data() { return m_coeffs.data(); }
+ const Scalar* data() const { return m_coeffs.data(); }
+ Index innerStride() const { return 1; }
+ Index outerStride() const { return 3; }
+
+ inline const Scalar& coeff(Index row, Index col) const
+ { return m_coeffs.coeff(row, col); }
+
+ inline Scalar& coeffRef(Index row, Index col)
+ { return m_coeffs.coeffRef(row, col); }
+
+ inline const Scalar& coeff(Index index) const
+ { return m_coeffs.coeff(index); }
+
+ inline Scalar& coeffRef(Index index)
+ { return m_coeffs.coeffRef(index);}
+
+
+ inline AlignedVector3()
+ {}
+
+ inline AlignedVector3(const Scalar& x, const Scalar& y, const Scalar& z)
+ : m_coeffs(x, y, z, Scalar(0))
+ {}
+
+ inline AlignedVector3(const AlignedVector3& other)
+ : Base(), m_coeffs(other.m_coeffs)
+ {}
+
+ template<typename XprType, int Size=XprType::SizeAtCompileTime>
+ struct generic_assign_selector {};
+
+ template<typename XprType> struct generic_assign_selector<XprType,4>
+ {
+ inline static void run(AlignedVector3& dest, const XprType& src)
+ {
+ dest.m_coeffs = src;
+ }
+ };
+
+ template<typename XprType> struct generic_assign_selector<XprType,3>
+ {
+ inline static void run(AlignedVector3& dest, const XprType& src)
+ {
+ dest.m_coeffs.template head<3>() = src;
+ dest.m_coeffs.w() = Scalar(0);
+ }
+ };
+
+ template<typename Derived>
+ inline AlignedVector3(const MatrixBase<Derived>& other)
+ {
+ generic_assign_selector<Derived>::run(*this,other.derived());
+ }
+
+ inline AlignedVector3& operator=(const AlignedVector3& other)
+ { m_coeffs = other.m_coeffs; return *this; }
+
+ template <typename Derived>
+ inline AlignedVector3& operator=(const MatrixBase<Derived>& other)
+ {
+ generic_assign_selector<Derived>::run(*this,other.derived());
+ return *this;
+ }
+
+ inline AlignedVector3 operator+(const AlignedVector3& other) const
+ { return AlignedVector3(m_coeffs + other.m_coeffs); }
+
+ inline AlignedVector3& operator+=(const AlignedVector3& other)
+ { m_coeffs += other.m_coeffs; return *this; }
+
+ inline AlignedVector3 operator-(const AlignedVector3& other) const
+ { return AlignedVector3(m_coeffs - other.m_coeffs); }
+
+ inline AlignedVector3 operator-() const
+ { return AlignedVector3(-m_coeffs); }
+
+ inline AlignedVector3 operator-=(const AlignedVector3& other)
+ { m_coeffs -= other.m_coeffs; return *this; }
+
+ inline AlignedVector3 operator*(const Scalar& s) const
+ { return AlignedVector3(m_coeffs * s); }
+
+ inline friend AlignedVector3 operator*(const Scalar& s,const AlignedVector3& vec)
+ { return AlignedVector3(s * vec.m_coeffs); }
+
+ inline AlignedVector3& operator*=(const Scalar& s)
+ { m_coeffs *= s; return *this; }
+
+ inline AlignedVector3 operator/(const Scalar& s) const
+ { return AlignedVector3(m_coeffs / s); }
+
+ inline AlignedVector3& operator/=(const Scalar& s)
+ { m_coeffs /= s; return *this; }
+
+ inline Scalar dot(const AlignedVector3& other) const
+ {
+ eigen_assert(m_coeffs.w()==Scalar(0));
+ eigen_assert(other.m_coeffs.w()==Scalar(0));
+ return m_coeffs.dot(other.m_coeffs);
+ }
+
+ inline void normalize()
+ {
+ m_coeffs /= norm();
+ }
+
+ inline AlignedVector3 normalized() const
+ {
+ return AlignedVector3(m_coeffs / norm());
+ }
+
+ inline Scalar sum() const
+ {
+ eigen_assert(m_coeffs.w()==Scalar(0));
+ return m_coeffs.sum();
+ }
+
+ inline Scalar squaredNorm() const
+ {
+ eigen_assert(m_coeffs.w()==Scalar(0));
+ return m_coeffs.squaredNorm();
+ }
+
+ inline Scalar norm() const
+ {
+ using std::sqrt;
+ return sqrt(squaredNorm());
+ }
+
+ inline AlignedVector3 cross(const AlignedVector3& other) const
+ {
+ return AlignedVector3(m_coeffs.cross3(other.m_coeffs));
+ }
+
+ template<typename Derived>
+ inline bool isApprox(const MatrixBase<Derived>& other, const RealScalar& eps=NumTraits<Scalar>::dummy_precision()) const
+ {
+ return m_coeffs.template head<3>().isApprox(other,eps);
+ }
+
+ CoeffType& coeffs() { return m_coeffs; }
+ const CoeffType& coeffs() const { return m_coeffs; }
+};
+
+namespace internal {
+
+template<typename _Scalar>
+struct eval<AlignedVector3<_Scalar>, Dense>
+{
+ typedef const AlignedVector3<_Scalar>& type;
+};
+
+template<typename Scalar>
+struct evaluator<AlignedVector3<Scalar> >
+ : evaluator<Matrix<Scalar,4,1> >
+{
+ typedef AlignedVector3<Scalar> XprType;
+ typedef evaluator<Matrix<Scalar,4,1> > Base;
+
+ evaluator(const XprType &m) : Base(m.coeffs()) {}
+};
+
+}
+
+//@}
+
+}
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_ALIGNED_VECTOR3
diff --git a/src/EigenUnsupported/ArpackSupport b/src/EigenUnsupported/ArpackSupport
new file mode 100644
index 0000000..67c4ac8
--- /dev/null
+++ b/src/EigenUnsupported/ArpackSupport
@@ -0,0 +1,30 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ARPACKSUPPORT_MODULE_H
+#define EIGEN_ARPACKSUPPORT_MODULE_H
+
+#include "../../Eigen/Core"
+
+/** \defgroup ArpackSupport_Module Arpack support module
+ *
+ * This module provides a wrapper to Arpack, a library for sparse eigenvalue decomposition.
+ *
+ * \code
+ * #include <Eigen/ArpackSupport>
+ * \endcode
+ */
+
+#include "../../Eigen/SparseCholesky"
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+#include "src/Eigenvalues/ArpackSelfAdjointEigenSolver.h"
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_ARPACKSUPPORT_MODULE_H
diff --git a/src/EigenUnsupported/AutoDiff b/src/EigenUnsupported/AutoDiff
new file mode 100644
index 0000000..7a4ff46
--- /dev/null
+++ b/src/EigenUnsupported/AutoDiff
@@ -0,0 +1,46 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <g.gael@free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_AUTODIFF_MODULE
+#define EIGEN_AUTODIFF_MODULE
+
+namespace Eigen {
+
+/**
+ * \defgroup AutoDiff_Module Auto Diff module
+ *
+ * This module features forward automatic differentation via a simple
+ * templated scalar type wrapper AutoDiffScalar.
+ *
+ * Warning : this should NOT be confused with numerical differentiation, which
+ * is a different method and has its own module in Eigen : \ref NumericalDiff_Module.
+ *
+ * \code
+ * #include <unsupported/Eigen/AutoDiff>
+ * \endcode
+ */
+//@{
+
+}
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+
+#include "src/AutoDiff/AutoDiffScalar.h"
+// #include "src/AutoDiff/AutoDiffVector.h"
+#include "src/AutoDiff/AutoDiffJacobian.h"
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+
+
+namespace Eigen {
+//@}
+}
+
+#endif // EIGEN_AUTODIFF_MODULE
diff --git a/src/EigenUnsupported/BVH b/src/EigenUnsupported/BVH
new file mode 100644
index 0000000..666c983
--- /dev/null
+++ b/src/EigenUnsupported/BVH
@@ -0,0 +1,95 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Ilya Baran <ibaran@mit.edu>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BVH_MODULE_H
+#define EIGEN_BVH_MODULE_H
+
+#include "../../Eigen/Core"
+#include "../../Eigen/Geometry"
+#include "../../Eigen/StdVector"
+#include <algorithm>
+#include <queue>
+
+namespace Eigen {
+
+/**
+ * \defgroup BVH_Module BVH module
+ * \brief This module provides generic bounding volume hierarchy algorithms
+ * and reference tree implementations.
+ *
+ *
+ * \code
+ * #include <unsupported/Eigen/BVH>
+ * \endcode
+ *
+ * A bounding volume hierarchy (BVH) can accelerate many geometric queries. This module provides a generic implementation
+ * of the two basic algorithms over a BVH: intersection of a query object against all objects in the hierarchy and minimization
+ * of a function over the objects in the hierarchy. It also provides intersection and minimization over a cartesian product of
+ * two BVH's. A BVH accelerates intersection by using the fact that if a query object does not intersect a volume, then it cannot
+ * intersect any object contained in that volume. Similarly, a BVH accelerates minimization because the minimum of a function
+ * over a volume is no greater than the minimum of a function over any object contained in it.
+ *
+ * Some sample queries that can be written in terms of intersection are:
+ * - Determine all points where a ray intersects a triangle mesh
+ * - Given a set of points, determine which are contained in a query sphere
+ * - Given a set of spheres, determine which contain the query point
+ * - Given a set of disks, determine if any is completely contained in a query rectangle (represent each 2D disk as a point \f$(x,y,r)\f$
+ * in 3D and represent the rectangle as a pyramid based on the original rectangle and shrinking in the \f$r\f$ direction)
+ * - Given a set of points, count how many pairs are \f$d\pm\epsilon\f$ apart (done by looking at the cartesian product of the set
+ * of points with itself)
+ *
+ * Some sample queries that can be written in terms of function minimization over a set of objects are:
+ * - Find the intersection between a ray and a triangle mesh closest to the ray origin (function is infinite off the ray)
+ * - Given a polyline and a query point, determine the closest point on the polyline to the query
+ * - Find the diameter of a point cloud (done by looking at the cartesian product and using negative distance as the function)
+ * - Determine how far two meshes are from colliding (this is also a cartesian product query)
+ *
+ * This implementation decouples the basic algorithms both from the type of hierarchy (and the types of the bounding volumes) and
+ * from the particulars of the query. To enable abstraction from the BVH, the BVH is required to implement a generic mechanism
+ * for traversal. To abstract from the query, the query is responsible for keeping track of results.
+ *
+ * To be used in the algorithms, a hierarchy must implement the following traversal mechanism (see KdBVH for a sample implementation): \code
+ typedef Volume //the type of bounding volume
+ typedef Object //the type of object in the hierarchy
+ typedef Index //a reference to a node in the hierarchy--typically an int or a pointer
+ typedef VolumeIterator //an iterator type over node children--returns Index
+ typedef ObjectIterator //an iterator over object (leaf) children--returns const Object &
+ Index getRootIndex() const //returns the index of the hierarchy root
+ const Volume &getVolume(Index index) const //returns the bounding volume of the node at given index
+ void getChildren(Index index, VolumeIterator &outVBegin, VolumeIterator &outVEnd,
+ ObjectIterator &outOBegin, ObjectIterator &outOEnd) const
+ //getChildren takes a node index and makes [outVBegin, outVEnd) range over its node children
+ //and [outOBegin, outOEnd) range over its object children
+ \endcode
+ *
+ * To use the hierarchy, call BVIntersect or BVMinimize, passing it a BVH (or two, for cartesian product) and a minimizer or intersector.
+ * For an intersection query on a single BVH, the intersector encapsulates the query and must provide two functions:
+ * \code
+ bool intersectVolume(const Volume &volume) //returns true if the query intersects the volume
+ bool intersectObject(const Object &object) //returns true if the intersection search should terminate immediately
+ \endcode
+ * The guarantee that BVIntersect provides is that intersectObject will be called on every object whose bounding volume
+ * intersects the query (but possibly on other objects too) unless the search is terminated prematurely. It is the
+ * responsibility of the intersectObject function to keep track of the results in whatever manner is appropriate.
+ * The cartesian product intersection and the BVMinimize queries are similar--see their individual documentation.
+ *
+ * The following is a simple but complete example for how to use the BVH to accelerate the search for a closest red-blue point pair:
+ * \include BVH_Example.cpp
+ * Output: \verbinclude BVH_Example.out
+ */
+}
+
+//@{
+
+#include "src/BVH/BVAlgorithms.h"
+#include "src/BVH/KdBVH.h"
+
+//@}
+
+#endif // EIGEN_BVH_MODULE_H
diff --git a/src/EigenUnsupported/CMakeLists.txt b/src/EigenUnsupported/CMakeLists.txt
new file mode 100644
index 0000000..631a060
--- /dev/null
+++ b/src/EigenUnsupported/CMakeLists.txt
@@ -0,0 +1,32 @@
+set(Eigen_HEADERS
+ AdolcForward
+ AlignedVector3
+ ArpackSupport
+ AutoDiff
+ BVH
+ EulerAngles
+ FFT
+ IterativeSolvers
+ KroneckerProduct
+ LevenbergMarquardt
+ MatrixFunctions
+ MoreVectorization
+ MPRealSupport
+ NonLinearOptimization
+ NumericalDiff
+ OpenGLSupport
+ Polynomials
+ Skyline
+ SparseExtra
+ SpecialFunctions
+ Splines
+ )
+
+install(FILES
+ ${Eigen_HEADERS}
+ DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel
+ )
+
+install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel FILES_MATCHING PATTERN "*.h")
+
+add_subdirectory(CXX11)
diff --git a/src/EigenUnsupported/CXX11/CMakeLists.txt b/src/EigenUnsupported/CXX11/CMakeLists.txt
new file mode 100644
index 0000000..385ed24
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/CMakeLists.txt
@@ -0,0 +1,8 @@
+set(Eigen_CXX11_HEADERS Tensor TensorSymmetry ThreadPool)
+
+install(FILES
+ ${Eigen_CXX11_HEADERS}
+ DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel
+ )
+
+install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel FILES_MATCHING PATTERN "*.h")
diff --git a/src/EigenUnsupported/CXX11/Tensor b/src/EigenUnsupported/CXX11/Tensor
new file mode 100644
index 0000000..0938bb5
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/Tensor
@@ -0,0 +1,137 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+//#ifndef EIGEN_CXX11_TENSOR_MODULE
+//#define EIGEN_CXX11_TENSOR_MODULE
+
+#include "../../../Eigen/Core"
+
+#if EIGEN_HAS_CXX11
+
+#include "../SpecialFunctions"
+
+#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h"
+#include "src/util/CXX11Meta.h"
+#include "src/util/MaxSizeVector.h"
+
+/** \defgroup CXX11_Tensor_Module Tensor Module
+ *
+ * This module provides a Tensor class for storing arbitrarily indexed
+ * objects.
+ *
+ * \code
+ * #include <Eigen/CXX11/Tensor>
+ * \endcode
+ *
+ * Much of the documentation can be found \ref eigen_tensors "here".
+ */
+
+#include <atomic>
+#include <chrono>
+#include <cmath>
+#include <cstddef>
+#include <cstring>
+#include <random>
+#include <thread>
+
+#if defined(EIGEN_USE_THREADS) || defined(EIGEN_USE_SYCL)
+#include "ThreadPool"
+#endif
+
+#ifdef EIGEN_USE_GPU
+ #include <iostream>
+ #if defined(EIGEN_USE_HIP)
+ #include <hip/hip_runtime.h>
+ #else
+ #include <cuda_runtime.h>
+ #endif
+#endif
+
+#include "src/Tensor/TensorMacros.h"
+#include "src/Tensor/TensorForwardDeclarations.h"
+#include "src/Tensor/TensorMeta.h"
+#include "src/Tensor/TensorFunctors.h"
+#include "src/Tensor/TensorCostModel.h"
+#include "src/Tensor/TensorDeviceDefault.h"
+#include "src/Tensor/TensorDeviceThreadPool.h"
+#include "src/Tensor/TensorDeviceGpu.h"
+#ifndef gpu_assert
+#define gpu_assert(x)
+#endif
+#include "src/Tensor/TensorDeviceSycl.h"
+#include "src/Tensor/TensorIndexList.h"
+#include "src/Tensor/TensorDimensionList.h"
+#include "src/Tensor/TensorDimensions.h"
+#include "src/Tensor/TensorInitializer.h"
+#include "src/Tensor/TensorTraits.h"
+#include "src/Tensor/TensorRandom.h"
+#include "src/Tensor/TensorUInt128.h"
+#include "src/Tensor/TensorIntDiv.h"
+#include "src/Tensor/TensorGlobalFunctions.h"
+
+#include "src/Tensor/TensorBase.h"
+#include "src/Tensor/TensorBlock.h"
+
+#include "src/Tensor/TensorEvaluator.h"
+#include "src/Tensor/TensorExpr.h"
+#include "src/Tensor/TensorReduction.h"
+#include "src/Tensor/TensorReductionGpu.h"
+#include "src/Tensor/TensorArgMax.h"
+#include "src/Tensor/TensorConcatenation.h"
+#include "src/Tensor/TensorContractionMapper.h"
+#include "src/Tensor/TensorContractionBlocking.h"
+#include "src/Tensor/TensorContraction.h"
+#include "src/Tensor/TensorContractionThreadPool.h"
+#include "src/Tensor/TensorContractionGpu.h"
+#include "src/Tensor/TensorConversion.h"
+#include "src/Tensor/TensorConvolution.h"
+#include "src/Tensor/TensorFFT.h"
+#include "src/Tensor/TensorPatch.h"
+#include "src/Tensor/TensorImagePatch.h"
+#include "src/Tensor/TensorVolumePatch.h"
+#include "src/Tensor/TensorBroadcasting.h"
+#include "src/Tensor/TensorChipping.h"
+#include "src/Tensor/TensorInflation.h"
+#include "src/Tensor/TensorLayoutSwap.h"
+#include "src/Tensor/TensorMorphing.h"
+#include "src/Tensor/TensorPadding.h"
+#include "src/Tensor/TensorReverse.h"
+#include "src/Tensor/TensorShuffling.h"
+#include "src/Tensor/TensorStriding.h"
+#include "src/Tensor/TensorCustomOp.h"
+#include "src/Tensor/TensorEvalTo.h"
+#include "src/Tensor/TensorForcedEval.h"
+#include "src/Tensor/TensorGenerator.h"
+#include "src/Tensor/TensorAssign.h"
+#include "src/Tensor/TensorScan.h"
+#include "src/Tensor/TensorTrace.h"
+
+#ifdef EIGEN_USE_SYCL
+#include "src/Tensor/TensorReductionSycl.h"
+#include "src/Tensor/TensorConvolutionSycl.h"
+#include "src/Tensor/TensorContractionSycl.h"
+#include "src/Tensor/TensorScanSycl.h"
+#endif
+
+#include "src/Tensor/TensorExecutor.h"
+#include "src/Tensor/TensorDevice.h"
+
+#include "src/Tensor/TensorStorage.h"
+#include "src/Tensor/Tensor.h"
+#include "src/Tensor/TensorFixedSize.h"
+#include "src/Tensor/TensorMap.h"
+#include "src/Tensor/TensorRef.h"
+
+#include "src/Tensor/TensorIO.h"
+
+#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_HAS_CXX11
+//#endif // EIGEN_CXX11_TENSOR_MODULE
diff --git a/src/EigenUnsupported/CXX11/TensorSymmetry b/src/EigenUnsupported/CXX11/TensorSymmetry
new file mode 100644
index 0000000..b09c5e4
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/TensorSymmetry
@@ -0,0 +1,42 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSORSYMMETRY_MODULE
+#define EIGEN_CXX11_TENSORSYMMETRY_MODULE
+
+#include "Tensor"
+
+#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+#include "src/util/CXX11Meta.h"
+
+/** \defgroup CXX11_TensorSymmetry_Module Tensor Symmetry Module
+ *
+ * This module provides a classes that allow for the definition of
+ * symmetries w.r.t. tensor indices.
+ *
+ * Including this module will implicitly include the Tensor module.
+ *
+ * \code
+ * #include <Eigen/TensorSymmetry>
+ * \endcode
+ */
+
+#include "src/TensorSymmetry/util/TemplateGroupTheory.h"
+#include "src/TensorSymmetry/Symmetry.h"
+#include "src/TensorSymmetry/StaticSymmetry.h"
+#include "src/TensorSymmetry/DynamicSymmetry.h"
+
+#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_CXX11_TENSORSYMMETRY_MODULE
+
+/*
+ * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
+ */
diff --git a/src/EigenUnsupported/CXX11/ThreadPool b/src/EigenUnsupported/CXX11/ThreadPool
new file mode 100644
index 0000000..c5cafb2
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/ThreadPool
@@ -0,0 +1,74 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_MODULE
+#define EIGEN_CXX11_THREADPOOL_MODULE
+
+#include "../../../Eigen/Core"
+
+#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+/** \defgroup CXX11_ThreadPool_Module C++11 ThreadPool Module
+ *
+ * This module provides 2 threadpool implementations
+ * - a simple reference implementation
+ * - a faster non blocking implementation
+ *
+ * This module requires C++11.
+ *
+ * \code
+ * #include <Eigen/CXX11/ThreadPool>
+ * \endcode
+ */
+
+
+// The code depends on CXX11, so only include the module if the
+// compiler supports it.
+#if (EIGEN_COMP_CXXVER >= 11)
+#include <cstddef>
+#include <cstring>
+#include <time.h>
+
+#include <vector>
+#include <atomic>
+#include <condition_variable>
+#include <deque>
+#include <mutex>
+#include <thread>
+#include <functional>
+#include <memory>
+#include <utility>
+
+// There are non-parenthesized calls to "max" in the <unordered_map> header,
+// which trigger a check in test/main.h causing compilation to fail.
+// We work around the check here by removing the check for max in
+// the case where we have to emulate thread_local.
+#ifdef max
+#undef max
+#endif
+#include <unordered_map>
+
+#include "src/util/CXX11Meta.h"
+#include "src/util/MaxSizeVector.h"
+
+#include "src/ThreadPool/ThreadLocal.h"
+#include "src/ThreadPool/ThreadYield.h"
+#include "src/ThreadPool/ThreadCancel.h"
+#include "src/ThreadPool/EventCount.h"
+#include "src/ThreadPool/RunQueue.h"
+#include "src/ThreadPool/ThreadPoolInterface.h"
+#include "src/ThreadPool/ThreadEnvironment.h"
+#include "src/ThreadPool/Barrier.h"
+#include "src/ThreadPool/NonBlockingThreadPool.h"
+
+#endif
+
+#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_CXX11_THREADPOOL_MODULE
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/README.md b/src/EigenUnsupported/CXX11/src/Tensor/README.md
new file mode 100644
index 0000000..2f65b1b
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/README.md
@@ -0,0 +1,1815 @@
+# Eigen Tensors {#eigen_tensors}
+
+Tensors are multidimensional arrays of elements. Elements are typically scalars,
+but more complex types such as strings are also supported.
+
+## Tensor Classes
+
+You can manipulate a tensor with one of the following classes. They all are in
+the namespace `::Eigen.`
+
+
+### Class Tensor<data_type, rank>
+
+This is the class to use to create a tensor and allocate memory for it. The
+class is templatized with the tensor datatype, such as float or int, and the
+tensor rank. The rank is the number of dimensions, for example rank 2 is a
+matrix.
+
+Tensors of this class are resizable. For example, if you assign a tensor of a
+different size to a Tensor, that tensor is resized to match its new value.
+
+#### Constructor Tensor<data_type, rank>(size0, size1, ...)
+
+Constructor for a Tensor. The constructor must be passed `rank` integers
+indicating the sizes of the instance along each of the the `rank`
+dimensions.
+
+ // Create a tensor of rank 3 of sizes 2, 3, 4. This tensor owns
+ // memory to hold 24 floating point values (24 = 2 x 3 x 4).
+ Tensor<float, 3> t_3d(2, 3, 4);
+
+ // Resize t_3d by assigning a tensor of different sizes, but same rank.
+ t_3d = Tensor<float, 3>(3, 4, 3);
+
+#### Constructor Tensor<data_type, rank>(size_array)
+
+Constructor where the sizes for the constructor are specified as an array of
+values instead of an explicitly list of parameters. The array type to use is
+`Eigen::array<Eigen::Index>`. The array can be constructed automatically
+from an initializer list.
+
+ // Create a tensor of strings of rank 2 with sizes 5, 7.
+ Tensor<string, 2> t_2d({5, 7});
+
+
+### Class TensorFixedSize<data_type, Sizes<size0, size1, ...>>
+
+Class to use for tensors of fixed size, where the size is known at compile
+time. Fixed sized tensors can provide very fast computations because all their
+dimensions are known by the compiler. FixedSize tensors are not resizable.
+
+If the total number of elements in a fixed size tensor is small enough the
+tensor data is held onto the stack and does not cause heap allocation and free.
+
+ // Create a 4 x 3 tensor of floats.
+ TensorFixedSize<float, Sizes<4, 3>> t_4x3;
+
+### Class TensorMap<Tensor<data_type, rank>>
+
+This is the class to use to create a tensor on top of memory allocated and
+owned by another part of your code. It allows to view any piece of allocated
+memory as a Tensor. Instances of this class do not own the memory where the
+data are stored.
+
+A TensorMap is not resizable because it does not own the memory where its data
+are stored.
+
+#### Constructor TensorMap<Tensor<data_type, rank>>(data, size0, size1, ...)
+
+Constructor for a Tensor. The constructor must be passed a pointer to the
+storage for the data, and "rank" size attributes. The storage has to be
+large enough to hold all the data.
+
+ // Map a tensor of ints on top of stack-allocated storage.
+ int storage[128]; // 2 x 4 x 2 x 8 = 128
+ TensorMap<Tensor<int, 4>> t_4d(storage, 2, 4, 2, 8);
+
+ // The same storage can be viewed as a different tensor.
+ // You can also pass the sizes as an array.
+ TensorMap<Tensor<int, 2>> t_2d(storage, 16, 8);
+
+ // You can also map fixed-size tensors. Here we get a 1d view of
+ // the 2d fixed-size tensor.
+ TensorFixedSize<float, Sizes<4, 3>> t_4x3;
+ TensorMap<Tensor<float, 1>> t_12(t_4x3.data(), 12);
+
+
+#### Class TensorRef
+
+See Assigning to a TensorRef below.
+
+## Accessing Tensor Elements
+
+#### <data_type> tensor(index0, index1...)
+
+Return the element at position `(index0, index1...)` in tensor
+`tensor`. You must pass as many parameters as the rank of `tensor`.
+The expression can be used as an l-value to set the value of the element at the
+specified position. The value returned is of the datatype of the tensor.
+
+ // Set the value of the element at position (0, 1, 0);
+ Tensor<float, 3> t_3d(2, 3, 4);
+ t_3d(0, 1, 0) = 12.0f;
+
+ // Initialize all elements to random values.
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 3; ++j) {
+ for (int k = 0; k < 4; ++k) {
+ t_3d(i, j, k) = ...some random value...;
+ }
+ }
+ }
+
+ // Print elements of a tensor.
+ for (int i = 0; i < 2; ++i) {
+ LOG(INFO) << t_3d(i, 0, 0);
+ }
+
+
+## TensorLayout
+
+The tensor library supports 2 layouts: `ColMajor` (the default) and
+`RowMajor`. Only the default column major layout is currently fully
+supported, and it is therefore not recommended to attempt to use the row major
+layout at the moment.
+
+The layout of a tensor is optionally specified as part of its type. If not
+specified explicitly column major is assumed.
+
+ Tensor<float, 3, ColMajor> col_major; // equivalent to Tensor<float, 3>
+ TensorMap<Tensor<float, 3, RowMajor> > row_major(data, ...);
+
+All the arguments to an expression must use the same layout. Attempting to mix
+different layouts will result in a compilation error.
+
+It is possible to change the layout of a tensor or an expression using the
+`swap_layout()` method. Note that this will also reverse the order of the
+dimensions.
+
+ Tensor<float, 2, ColMajor> col_major(2, 4);
+ Tensor<float, 2, RowMajor> row_major(2, 4);
+
+ Tensor<float, 2> col_major_result = col_major; // ok, layouts match
+ Tensor<float, 2> col_major_result = row_major; // will not compile
+
+ // Simple layout swap
+ col_major_result = row_major.swap_layout();
+ eigen_assert(col_major_result.dimension(0) == 4);
+ eigen_assert(col_major_result.dimension(1) == 2);
+
+ // Swap the layout and preserve the order of the dimensions
+ array<int, 2> shuffle(1, 0);
+ col_major_result = row_major.swap_layout().shuffle(shuffle);
+ eigen_assert(col_major_result.dimension(0) == 2);
+ eigen_assert(col_major_result.dimension(1) == 4);
+
+
+## Tensor Operations
+
+The Eigen Tensor library provides a vast library of operations on Tensors:
+numerical operations such as addition and multiplication, geometry operations
+such as slicing and shuffling, etc. These operations are available as methods
+of the Tensor classes, and in some cases as operator overloads. For example
+the following code computes the elementwise addition of two tensors:
+
+ Tensor<float, 3> t1(2, 3, 4);
+ ...set some values in t1...
+ Tensor<float, 3> t2(2, 3, 4);
+ ...set some values in t2...
+ // Set t3 to the element wise sum of t1 and t2
+ Tensor<float, 3> t3 = t1 + t2;
+
+While the code above looks easy enough, it is important to understand that the
+expression `t1 + t2` is not actually adding the values of the tensors. The
+expression instead constructs a "tensor operator" object of the class
+TensorCwiseBinaryOp<scalar_sum>, which has references to the tensors
+`t1` and `t2`. This is a small C++ object that knows how to add
+`t1` and `t2`. It is only when the value of the expression is assigned
+to the tensor `t3` that the addition is actually performed. Technically,
+this happens through the overloading of `operator=()` in the Tensor class.
+
+This mechanism for computing tensor expressions allows for lazy evaluation and
+optimizations which are what make the tensor library very fast.
+
+Of course, the tensor operators do nest, and the expression `t1 + t2 * 0.3f`
+is actually represented with the (approximate) tree of operators:
+
+ TensorCwiseBinaryOp<scalar_sum>(t1, TensorCwiseUnaryOp<scalar_mul>(t2, 0.3f))
+
+
+### Tensor Operations and C++ "auto"
+
+Because Tensor operations create tensor operators, the C++ `auto` keyword
+does not have its intuitive meaning. Consider these 2 lines of code:
+
+ Tensor<float, 3> t3 = t1 + t2;
+ auto t4 = t1 + t2;
+
+In the first line we allocate the tensor `t3` and it will contain the
+result of the addition of `t1` and `t2`. In the second line, `t4`
+is actually the tree of tensor operators that will compute the addition of
+`t1` and `t2`. In fact, `t4` is *not* a tensor and you cannot get
+the values of its elements:
+
+ Tensor<float, 3> t3 = t1 + t2;
+ cout << t3(0, 0, 0); // OK prints the value of t1(0, 0, 0) + t2(0, 0, 0)
+
+ auto t4 = t1 + t2;
+ cout << t4(0, 0, 0); // Compilation error!
+
+When you use `auto` you do not get a Tensor as a result but instead a
+non-evaluated expression. So only use `auto` to delay evaluation.
+
+Unfortunately, there is no single underlying concrete type for holding
+non-evaluated expressions, hence you have to use auto in the case when you do
+want to hold non-evaluated expressions.
+
+When you need the results of set of tensor computations you have to assign the
+result to a Tensor that will be capable of holding onto them. This can be
+either a normal Tensor, a fixed size Tensor, or a TensorMap on an existing
+piece of memory. All the following will work:
+
+ auto t4 = t1 + t2;
+
+ Tensor<float, 3> result = t4; // Could also be: result(t4);
+ cout << result(0, 0, 0);
+
+ TensorMap<float, 4> result(<a float* with enough space>, <size0>, ...) = t4;
+ cout << result(0, 0, 0);
+
+ TensorFixedSize<float, Sizes<size0, ...>> result = t4;
+ cout << result(0, 0, 0);
+
+Until you need the results, you can keep the operation around, and even reuse
+it for additional operations. As long as you keep the expression as an
+operation, no computation is performed.
+
+ // One way to compute exp((t1 + t2) * 0.2f);
+ auto t3 = t1 + t2;
+ auto t4 = t3 * 0.2f;
+ auto t5 = t4.exp();
+ Tensor<float, 3> result = t5;
+
+ // Another way, exactly as efficient as the previous one:
+ Tensor<float, 3> result = ((t1 + t2) * 0.2f).exp();
+
+### Controlling When Expression are Evaluated
+
+There are several ways to control when expressions are evaluated:
+
+* Assignment to a Tensor, TensorFixedSize, or TensorMap.
+* Use of the eval() method.
+* Assignment to a TensorRef.
+
+#### Assigning to a Tensor, TensorFixedSize, or TensorMap.
+
+The most common way to evaluate an expression is to assign it to a Tensor. In
+the example below, the `auto` declarations make the intermediate values
+"Operations", not Tensors, and do not cause the expressions to be evaluated.
+The assignment to the Tensor `result` causes the evaluation of all the
+operations.
+
+ auto t3 = t1 + t2; // t3 is an Operation.
+ auto t4 = t3 * 0.2f; // t4 is an Operation.
+ auto t5 = t4.exp(); // t5 is an Operation.
+ Tensor<float, 3> result = t5; // The operations are evaluated.
+
+If you know the ranks and sizes of the Operation value you can assign the
+Operation to a TensorFixedSize instead of a Tensor, which is a bit more
+efficient.
+
+ // We know that the result is a 4x4x2 tensor!
+ TensorFixedSize<float, Sizes<4, 4, 2>> result = t5;
+
+Simiarly, assigning an expression to a TensorMap causes its evaluation. Like
+tensors of type TensorFixedSize, TensorMaps cannot be resized so they have to
+have the rank and sizes of the expression that are assigned to them.
+
+#### Calling eval().
+
+When you compute large composite expressions, you sometimes want to tell Eigen
+that an intermediate value in the expression tree is worth evaluating ahead of
+time. This is done by inserting a call to the `eval()` method of the
+expression Operation.
+
+ // The previous example could have been written:
+ Tensor<float, 3> result = ((t1 + t2) * 0.2f).exp();
+
+ // If you want to compute (t1 + t2) once ahead of time you can write:
+ Tensor<float, 3> result = ((t1 + t2).eval() * 0.2f).exp();
+
+Semantically, calling `eval()` is equivalent to materializing the value of
+the expression in a temporary Tensor of the right size. The code above in
+effect does:
+
+ // .eval() knows the size!
+ TensorFixedSize<float, Sizes<4, 4, 2>> tmp = t1 + t2;
+ Tensor<float, 3> result = (tmp * 0.2f).exp();
+
+Note that the return value of `eval()` is itself an Operation, so the
+following code does not do what you may think:
+
+ // Here t3 is an evaluation Operation. t3 has not been evaluated yet.
+ auto t3 = (t1 + t2).eval();
+
+ // You can use t3 in another expression. Still no evaluation.
+ auto t4 = (t3 * 0.2f).exp();
+
+ // The value is evaluated when you assign the Operation to a Tensor, using
+ // an intermediate tensor to represent t3.x
+ Tensor<float, 3> result = t4;
+
+While in the examples above calling `eval()` does not make a difference in
+performance, in other cases it can make a huge difference. In the expression
+below the `broadcast()` expression causes the `X.maximum()` expression
+to be evaluated many times:
+
+ Tensor<...> X ...;
+ Tensor<...> Y = ((X - X.maximum(depth_dim).reshape(dims2d).broadcast(bcast))
+ * beta).exp();
+
+Inserting a call to `eval()` between the `maximum()` and
+`reshape()` calls guarantees that maximum() is only computed once and
+greatly speeds-up execution:
+
+ Tensor<...> Y =
+ ((X - X.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast))
+ * beta).exp();
+
+In the other example below, the tensor `Y` is both used in the expression
+and its assignment. This is an aliasing problem and if the evaluation is not
+done in the right order Y will be updated incrementally during the evaluation
+resulting in bogus results:
+
+ Tensor<...> Y ...;
+ Y = Y / (Y.sum(depth_dim).reshape(dims2d).broadcast(bcast));
+
+Inserting a call to `eval()` between the `sum()` and `reshape()`
+expressions ensures that the sum is computed before any updates to `Y` are
+done.
+
+ Y = Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast));
+
+Note that an eval around the full right hand side expression is not needed
+because the generated has to compute the i-th value of the right hand side
+before assigning it to the left hand side.
+
+However, if you were assigning the expression value to a shuffle of `Y`
+then you would need to force an eval for correctness by adding an `eval()`
+call for the right hand side:
+
+ Y.shuffle(...) =
+ (Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast))).eval();
+
+
+#### Assigning to a TensorRef.
+
+If you need to access only a few elements from the value of an expression you
+can avoid materializing the value in a full tensor by using a TensorRef.
+
+A TensorRef is a small wrapper class for any Eigen Operation. It provides
+overloads for the `()` operator that let you access individual values in
+the expression. TensorRef is convenient, because the Operation themselves do
+not provide a way to access individual elements.
+
+ // Create a TensorRef for the expression. The expression is not
+ // evaluated yet.
+ TensorRef<Tensor<float, 3> > ref = ((t1 + t2) * 0.2f).exp();
+
+ // Use "ref" to access individual elements. The expression is evaluated
+ // on the fly.
+ float at_0 = ref(0, 0, 0);
+ cout << ref(0, 1, 0);
+
+Only use TensorRef when you need a subset of the values of the expression.
+TensorRef only computes the values you access. However note that if you are
+going to access all the values it will be much faster to materialize the
+results in a Tensor first.
+
+In some cases, if the full Tensor result would be very large, you may save
+memory by accessing it as a TensorRef. But not always. So don't count on it.
+
+
+### Controlling How Expressions Are Evaluated
+
+The tensor library provides several implementations of the various operations
+such as contractions and convolutions. The implementations are optimized for
+different environments: single threaded on CPU, multi threaded on CPU, or on a
+GPU using cuda. Additional implementations may be added later.
+
+You can choose which implementation to use with the `device()` call. If
+you do not choose an implementation explicitly the default implementation that
+uses a single thread on the CPU is used.
+
+The default implementation has been optimized for recent Intel CPUs, taking
+advantage of SSE, AVX, and FMA instructions. Work is ongoing to tune the
+library on ARM CPUs. Note that you need to pass compiler-dependent flags
+to enable the use of SSE, AVX, and other instructions.
+
+For example, the following code adds two tensors using the default
+single-threaded CPU implementation:
+
+ Tensor<float, 2> a(30, 40);
+ Tensor<float, 2> b(30, 40);
+ Tensor<float, 2> c = a + b;
+
+To choose a different implementation you have to insert a `device()` call
+before the assignment of the result. For technical C++ reasons this requires
+that the Tensor for the result be declared on its own. This means that you
+have to know the size of the result.
+
+ Eigen::Tensor<float, 2> c(30, 40);
+ c.device(...) = a + b;
+
+The call to `device()` must be the last call on the left of the operator=.
+
+You must pass to the `device()` call an Eigen device object. There are
+presently three devices you can use: DefaultDevice, ThreadPoolDevice and
+GpuDevice.
+
+
+#### Evaluating With the DefaultDevice
+
+This is exactly the same as not inserting a `device()` call.
+
+ DefaultDevice my_device;
+ c.device(my_device) = a + b;
+
+#### Evaluating with a Thread Pool
+
+ // Create the Eigen ThreadPool
+ Eigen::ThreadPool pool(8 /* number of threads in pool */)
+
+ // Create the Eigen ThreadPoolDevice.
+ Eigen::ThreadPoolDevice my_device(&pool, 4 /* number of threads to use */);
+
+ // Now just use the device when evaluating expressions.
+ Eigen::Tensor<float, 2> c(30, 50);
+ c.device(my_device) = a.contract(b, dot_product_dims);
+
+
+#### Evaluating On GPU
+
+This is presently a bit more complicated than just using a thread pool device.
+You need to create a GPU device but you also need to explicitly allocate the
+memory for tensors with cuda.
+
+
+## API Reference
+
+### Datatypes
+
+In the documentation of the tensor methods and Operation we mention datatypes
+that are tensor-type specific:
+
+#### <Tensor-Type>::Dimensions
+
+Acts like an array of ints. Has an `int size` attribute, and can be
+indexed like an array to access individual values. Used to represent the
+dimensions of a tensor. See `dimensions()`.
+
+#### <Tensor-Type>::Index
+
+Acts like an `int`. Used for indexing tensors along their dimensions. See
+`operator()`, `dimension()`, and `size()`.
+
+#### <Tensor-Type>::Scalar
+
+Represents the datatype of individual tensor elements. For example, for a
+`Tensor<float>`, `Scalar` is the type `float`. See
+`setConstant()`.
+
+#### <Operation>
+
+We use this pseudo type to indicate that a tensor Operation is returned by a
+method. We indicate in the text the type and dimensions of the tensor that the
+Operation returns after evaluation.
+
+The Operation will have to be evaluated, for example by assigning it to a
+tensor, before you can access the values of the resulting tensor. You can also
+access the values through a TensorRef.
+
+
+## Built-in Tensor Methods
+
+These are usual C++ methods that act on tensors immediately. They are not
+Operations which provide delayed evaluation of their results. Unless specified
+otherwise, all the methods listed below are available on all tensor classes:
+Tensor, TensorFixedSize, and TensorMap.
+
+## Metadata
+
+### int NumDimensions
+
+Constant value indicating the number of dimensions of a Tensor. This is also
+known as the tensor "rank".
+
+ Eigen::Tensor<float, 2> a(3, 4);
+ cout << "Dims " << a.NumDimensions;
+ => Dims 2
+
+### Dimensions dimensions()
+
+Returns an array-like object representing the dimensions of the tensor.
+The actual type of the `dimensions()` result is `<Tensor-Type>::``Dimensions`.
+
+ Eigen::Tensor<float, 2> a(3, 4);
+ const Eigen::Tensor<float, 2>::Dimensions& d = a.dimensions();
+ cout << "Dim size: " << d.size << ", dim 0: " << d[0]
+ << ", dim 1: " << d[1];
+ => Dim size: 2, dim 0: 3, dim 1: 4
+
+If you use a C++11 compiler, you can use `auto` to simplify the code:
+
+ const auto& d = a.dimensions();
+ cout << "Dim size: " << d.size << ", dim 0: " << d[0]
+ << ", dim 1: " << d[1];
+ => Dim size: 2, dim 0: 3, dim 1: 4
+
+### Index dimension(Index n)
+
+Returns the n-th dimension of the tensor. The actual type of the
+`dimension()` result is `<Tensor-Type>::``Index`, but you can
+always use it like an int.
+
+ Eigen::Tensor<float, 2> a(3, 4);
+ int dim1 = a.dimension(1);
+ cout << "Dim 1: " << dim1;
+ => Dim 1: 4
+
+### Index size()
+
+Returns the total number of elements in the tensor. This is the product of all
+the tensor dimensions. The actual type of the `size()` result is
+`<Tensor-Type>::``Index`, but you can always use it like an int.
+
+ Eigen::Tensor<float, 2> a(3, 4);
+ cout << "Size: " << a.size();
+ => Size: 12
+
+
+### Getting Dimensions From An Operation
+
+A few operations provide `dimensions()` directly,
+e.g. `TensorReslicingOp`. Most operations defer calculating dimensions
+until the operation is being evaluated. If you need access to the dimensions
+of a deferred operation, you can wrap it in a TensorRef (see Assigning to a
+TensorRef above), which provides `dimensions()` and `dimension()` as
+above.
+
+TensorRef can also wrap the plain Tensor types, so this is a useful idiom in
+templated contexts where the underlying object could be either a raw Tensor
+or some deferred operation (e.g. a slice of a Tensor). In this case, the
+template code can wrap the object in a TensorRef and reason about its
+dimensionality while remaining agnostic to the underlying type.
+
+
+## Constructors
+
+### Tensor
+
+Creates a tensor of the specified size. The number of arguments must be equal
+to the rank of the tensor. The content of the tensor is not initialized.
+
+ Eigen::Tensor<float, 2> a(3, 4);
+ cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl;
+ => NumRows: 3 NumCols: 4
+
+### TensorFixedSize
+
+Creates a tensor of the specified size. The number of arguments in the Sizes<>
+template parameter determines the rank of the tensor. The content of the tensor
+is not initialized.
+
+ Eigen::TensorFixedSize<float, Sizes<3, 4>> a;
+ cout << "Rank: " << a.rank() << endl;
+ => Rank: 2
+ cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl;
+ => NumRows: 3 NumCols: 4
+
+### TensorMap
+
+Creates a tensor mapping an existing array of data. The data must not be freed
+until the TensorMap is discarded, and the size of the data must be large enough
+to accommodate the coefficients of the tensor.
+
+ float data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+ Eigen::TensorMap<Tensor<float, 2>> a(data, 3, 4);
+ cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl;
+ => NumRows: 3 NumCols: 4
+ cout << "a(1, 2): " << a(1, 2) << endl;
+ => a(1, 2): 7
+
+
+## Contents Initialization
+
+When a new Tensor or a new TensorFixedSize are created, memory is allocated to
+hold all the tensor elements, but the memory is not initialized. Similarly,
+when a new TensorMap is created on top of non-initialized memory the memory its
+contents are not initialized.
+
+You can use one of the methods below to initialize the tensor memory. These
+have an immediate effect on the tensor and return the tensor itself as a
+result. These are not tensor Operations which delay evaluation.
+
+### <Tensor-Type> setConstant(const Scalar& val)
+
+Sets all elements of the tensor to the constant value `val`. `Scalar`
+is the type of data stored in the tensor. You can pass any value that is
+convertible to that type.
+
+Returns the tensor itself in case you want to chain another call.
+
+ a.setConstant(12.3f);
+ cout << "Constant: " << endl << a << endl << endl;
+ =>
+ Constant:
+ 12.3 12.3 12.3 12.3
+ 12.3 12.3 12.3 12.3
+ 12.3 12.3 12.3 12.3
+
+Note that `setConstant()` can be used on any tensor where the element type
+has a copy constructor and an `operator=()`:
+
+ Eigen::Tensor<string, 2> a(2, 3);
+ a.setConstant("yolo");
+ cout << "String tensor: " << endl << a << endl << endl;
+ =>
+ String tensor:
+ yolo yolo yolo
+ yolo yolo yolo
+
+
+### <Tensor-Type> setZero()
+
+Fills the tensor with zeros. Equivalent to `setConstant(Scalar(0))`.
+Returns the tensor itself in case you want to chain another call.
+
+ a.setZero();
+ cout << "Zeros: " << endl << a << endl << endl;
+ =>
+ Zeros:
+ 0 0 0 0
+ 0 0 0 0
+ 0 0 0 0
+
+
+### <Tensor-Type> setValues({..initializer_list})
+
+Fills the tensor with explicit values specified in a std::initializer_list.
+The type of the initializer list depends on the type and rank of the tensor.
+
+If the tensor has rank N, the initializer list must be nested N times. The
+most deeply nested lists must contains P scalars of the Tensor type where P is
+the size of the last dimension of the Tensor.
+
+For example, for a `TensorFixedSize<float, 2, 3>` the initializer list must
+contains 2 lists of 3 floats each.
+
+`setValues()` returns the tensor itself in case you want to chain another
+call.
+
+ Eigen::Tensor<float, 2> a(2, 3);
+ a.setValues({{0.0f, 1.0f, 2.0f}, {3.0f, 4.0f, 5.0f}});
+ cout << "a" << endl << a << endl << endl;
+ =>
+ a
+ 0 1 2
+ 3 4 5
+
+If a list is too short, the corresponding elements of the tensor will not be
+changed. This is valid at each level of nesting. For example the following
+code only sets the values of the first row of the tensor.
+
+ Eigen::Tensor<int, 2> a(2, 3);
+ a.setConstant(1000);
+ a.setValues({{10, 20, 30}});
+ cout << "a" << endl << a << endl << endl;
+ =>
+ a
+ 10 20 30
+ 1000 1000 1000
+
+### <Tensor-Type> setRandom()
+
+Fills the tensor with random values. Returns the tensor itself in case you
+want to chain another call.
+
+ a.setRandom();
+ cout << "Random: " << endl << a << endl << endl;
+ =>
+ Random:
+ 0.680375 0.59688 -0.329554 0.10794
+ -0.211234 0.823295 0.536459 -0.0452059
+ 0.566198 -0.604897 -0.444451 0.257742
+
+You can customize `setRandom()` by providing your own random number
+generator as a template argument:
+
+ a.setRandom<MyRandomGenerator>();
+
+Here, `MyRandomGenerator` must be a struct with the following member
+functions, where Scalar and Index are the same as `<Tensor-Type>::``Scalar`
+and `<Tensor-Type>::``Index`.
+
+See `struct UniformRandomGenerator` in TensorFunctors.h for an example.
+
+ // Custom number generator for use with setRandom().
+ struct MyRandomGenerator {
+ // Default and copy constructors. Both are needed
+ MyRandomGenerator() { }
+ MyRandomGenerator(const MyRandomGenerator& ) { }
+
+ // Return a random value to be used. "element_location" is the
+ // location of the entry to set in the tensor, it can typically
+ // be ignored.
+ Scalar operator()(Eigen::DenseIndex element_location,
+ Eigen::DenseIndex /*unused*/ = 0) const {
+ return <randomly generated value of type T>;
+ }
+
+ // Same as above but generates several numbers at a time.
+ typename internal::packet_traits<Scalar>::type packetOp(
+ Eigen::DenseIndex packet_location, Eigen::DenseIndex /*unused*/ = 0) const {
+ return <a packet of randomly generated values>;
+ }
+ };
+
+You can also use one of the 2 random number generators that are part of the
+tensor library:
+* UniformRandomGenerator
+* NormalRandomGenerator
+
+
+## Data Access
+
+The Tensor, TensorFixedSize, and TensorRef classes provide the following
+accessors to access the tensor coefficients:
+
+ const Scalar& operator()(const array<Index, NumIndices>& indices)
+ const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices)
+ Scalar& operator()(const array<Index, NumIndices>& indices)
+ Scalar& operator()(Index firstIndex, IndexTypes... otherIndices)
+
+The number of indices must be equal to the rank of the tensor. Moreover, these
+accessors are not available on tensor expressions. In order to access the
+values of a tensor expression, the expression must either be evaluated or
+wrapped in a TensorRef.
+
+
+### Scalar* data() and const Scalar* data() const
+
+Returns a pointer to the storage for the tensor. The pointer is const if the
+tensor was const. This allows direct access to the data. The layout of the
+data depends on the tensor layout: RowMajor or ColMajor.
+
+This access is usually only needed for special cases, for example when mixing
+Eigen Tensor code with other libraries.
+
+Scalar is the type of data stored in the tensor.
+
+ Eigen::Tensor<float, 2> a(3, 4);
+ float* a_data = a.data();
+ a_data[0] = 123.45f;
+ cout << "a(0, 0): " << a(0, 0);
+ => a(0, 0): 123.45
+
+
+## Tensor Operations
+
+All the methods documented below return non evaluated tensor `Operations`.
+These can be chained: you can apply another Tensor Operation to the value
+returned by the method.
+
+The chain of Operation is evaluated lazily, typically when it is assigned to a
+tensor. See "Controlling when Expression are Evaluated" for more details about
+their evaluation.
+
+### <Operation> constant(const Scalar& val)
+
+Returns a tensor of the same type and dimensions as the original tensor but
+where all elements have the value `val`.
+
+This is useful, for example, when you want to add or subtract a constant from a
+tensor, or multiply every element of a tensor by a scalar.
+
+ Eigen::Tensor<float, 2> a(2, 3);
+ a.setConstant(1.0f);
+ Eigen::Tensor<float, 2> b = a + a.constant(2.0f);
+ Eigen::Tensor<float, 2> c = b * b.constant(0.2f);
+ cout << "a" << endl << a << endl << endl;
+ cout << "b" << endl << b << endl << endl;
+ cout << "c" << endl << c << endl << endl;
+ =>
+ a
+ 1 1 1
+ 1 1 1
+
+ b
+ 3 3 3
+ 3 3 3
+
+ c
+ 0.6 0.6 0.6
+ 0.6 0.6 0.6
+
+### <Operation> random()
+
+Returns a tensor of the same type and dimensions as the current tensor
+but where all elements have random values.
+
+This is for example useful to add random values to an existing tensor.
+The generation of random values can be customized in the same manner
+as for `setRandom()`.
+
+ Eigen::Tensor<float, 2> a(2, 3);
+ a.setConstant(1.0f);
+ Eigen::Tensor<float, 2> b = a + a.random();
+ cout << "a" << endl << a << endl << endl;
+ cout << "b" << endl << b << endl << endl;
+ =>
+ a
+ 1 1 1
+ 1 1 1
+
+ b
+ 1.68038 1.5662 1.82329
+ 0.788766 1.59688 0.395103
+
+
+## Unary Element Wise Operations
+
+All these operations take a single input tensor as argument and return a tensor
+of the same type and dimensions as the tensor to which they are applied. The
+requested operations are applied to each element independently.
+
+### <Operation> operator-()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the opposite values of the original tensor.
+
+ Eigen::Tensor<float, 2> a(2, 3);
+ a.setConstant(1.0f);
+ Eigen::Tensor<float, 2> b = -a;
+ cout << "a" << endl << a << endl << endl;
+ cout << "b" << endl << b << endl << endl;
+ =>
+ a
+ 1 1 1
+ 1 1 1
+
+ b
+ -1 -1 -1
+ -1 -1 -1
+
+### <Operation> sqrt()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the square roots of the original tensor.
+
+### <Operation> rsqrt()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the inverse square roots of the original tensor.
+
+### <Operation> square()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the squares of the original tensor values.
+
+### <Operation> inverse()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the inverse of the original tensor values.
+
+### <Operation> exp()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the exponential of the original tensor.
+
+### <Operation> log()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the natural logarithms of the original tensor.
+
+### <Operation> abs()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the absolute values of the original tensor.
+
+### <Operation> pow(Scalar exponent)
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the coefficients of the original tensor to the power of the
+exponent.
+
+The type of the exponent, Scalar, is always the same as the type of the
+tensor coefficients. For example, only integer exponents can be used in
+conjuntion with tensors of integer values.
+
+You can use cast() to lift this restriction. For example this computes
+cubic roots of an int Tensor:
+
+ Eigen::Tensor<int, 2> a(2, 3);
+ a.setValues({{0, 1, 8}, {27, 64, 125}});
+ Eigen::Tensor<double, 2> b = a.cast<double>().pow(1.0 / 3.0);
+ cout << "a" << endl << a << endl << endl;
+ cout << "b" << endl << b << endl << endl;
+ =>
+ a
+ 0 1 8
+ 27 64 125
+
+ b
+ 0 1 2
+ 3 4 5
+
+### <Operation> operator * (Scalar scale)
+
+Multiplies all the coefficients of the input tensor by the provided scale.
+
+### <Operation> cwiseMax(Scalar threshold)
+TODO
+
+### <Operation> cwiseMin(Scalar threshold)
+TODO
+
+### <Operation> unaryExpr(const CustomUnaryOp& func)
+TODO
+
+
+## Binary Element Wise Operations
+
+These operations take two input tensors as arguments. The 2 input tensors should
+be of the same type and dimensions. The result is a tensor of the same
+dimensions as the tensors to which they are applied, and unless otherwise
+specified it is also of the same type. The requested operations are applied to
+each pair of elements independently.
+
+### <Operation> operator+(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise sums of the inputs.
+
+### <Operation> operator-(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise differences of the inputs.
+
+### <Operation> operator*(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise products of the inputs.
+
+### <Operation> operator/(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise quotients of the inputs.
+
+This operator is not supported for integer types.
+
+### <Operation> cwiseMax(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise maximums of the inputs.
+
+### <Operation> cwiseMin(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise mimimums of the inputs.
+
+### <Operation> Logical operators
+
+The following logical operators are supported as well:
+
+* operator&&(const OtherDerived& other)
+* operator||(const OtherDerived& other)
+* operator<(const OtherDerived& other)
+* operator<=(const OtherDerived& other)
+* operator>(const OtherDerived& other)
+* operator>=(const OtherDerived& other)
+* operator==(const OtherDerived& other)
+* operator!=(const OtherDerived& other)
+
+They all return a tensor of boolean values.
+
+
+## Selection (select(const ThenDerived& thenTensor, const ElseDerived& elseTensor)
+
+Selection is a coefficient-wise ternary operator that is the tensor equivalent
+to the if-then-else operation.
+
+ Tensor<bool, 3> if = ...;
+ Tensor<float, 3> then = ...;
+ Tensor<float, 3> else = ...;
+ Tensor<float, 3> result = if.select(then, else);
+
+The 3 arguments must be of the same dimensions, which will also be the dimension
+of the result. The 'if' tensor must be of type boolean, the 'then' and the
+'else' tensor must be of the same type, which will also be the type of the
+result.
+
+Each coefficient in the result is equal to the corresponding coefficient in the
+'then' tensor if the corresponding value in the 'if' tensor is true. If not, the
+resulting coefficient will come from the 'else' tensor.
+
+
+## Contraction
+
+Tensor *contractions* are a generalization of the matrix product to the
+multidimensional case.
+
+ // Create 2 matrices using tensors of rank 2
+ Eigen::Tensor<int, 2> a(2, 3);
+ a.setValues({{1, 2, 3}, {6, 5, 4}});
+ Eigen::Tensor<int, 2> b(3, 2);
+ b.setValues({{1, 2}, {4, 5}, {5, 6}});
+
+ // Compute the traditional matrix product
+ Eigen::array<Eigen::IndexPair<int>, 1> product_dims = { Eigen::IndexPair<int>(1, 0) };
+ Eigen::Tensor<int, 2> AB = a.contract(b, product_dims);
+
+ // Compute the product of the transpose of the matrices
+ Eigen::array<Eigen::IndexPair<int>, 1> transposed_product_dims = { Eigen::IndexPair<int>(0, 1) };
+ Eigen::Tensor<int, 2> AtBt = a.contract(b, transposed_product_dims);
+
+ // Contraction to scalar value using a double contraction.
+ // First coordinate of both tensors are contracted as well as both second coordinates, i.e., this computes the sum of the squares of the elements.
+ Eigen::array<Eigen::IndexPair<int>, 2> double_contraction_product_dims = { Eigen::IndexPair<int>(0, 0), Eigen::IndexPair<int>(1, 1) };
+ Eigen::Tensor<int, 0> AdoubleContractedA = a.contract(a, double_contraction_product_dims);
+
+ // Extracting the scalar value of the tensor contraction for further usage
+ int value = AdoubleContractedA(0);
+
+## Reduction Operations
+
+A *Reduction* operation returns a tensor with fewer dimensions than the
+original tensor. The values in the returned tensor are computed by applying a
+*reduction operator* to slices of values from the original tensor. You specify
+the dimensions along which the slices are made.
+
+The Eigen Tensor library provides a set of predefined reduction operators such
+as `maximum()` and `sum()` and lets you define additional operators by
+implementing a few methods from a reductor template.
+
+### Reduction Dimensions
+
+All reduction operations take a single parameter of type
+`<TensorType>::``Dimensions` which can always be specified as an array of
+ints. These are called the "reduction dimensions." The values are the indices
+of the dimensions of the input tensor over which the reduction is done. The
+parameter can have at most as many element as the rank of the input tensor;
+each element must be less than the tensor rank, as it indicates one of the
+dimensions to reduce.
+
+Each dimension of the input tensor should occur at most once in the reduction
+dimensions as the implementation does not remove duplicates.
+
+The order of the values in the reduction dimensions does not affect the
+results, but the code may execute faster if you list the dimensions in
+increasing order.
+
+Example: Reduction along one dimension.
+
+ // Create a tensor of 2 dimensions
+ Eigen::Tensor<int, 2> a(2, 3);
+ a.setValues({{1, 2, 3}, {6, 5, 4}});
+ // Reduce it along the second dimension (1)...
+ Eigen::array<int, 1> dims({1 /* dimension to reduce */});
+ // ...using the "maximum" operator.
+ // The result is a tensor with one dimension. The size of
+ // that dimension is the same as the first (non-reduced) dimension of a.
+ Eigen::Tensor<int, 1> b = a.maximum(dims);
+ cout << "a" << endl << a << endl << endl;
+ cout << "b" << endl << b << endl << endl;
+ =>
+ a
+ 1 2 3
+ 6 5 4
+
+ b
+ 3
+ 6
+
+Example: Reduction along two dimensions.
+
+ Eigen::Tensor<float, 3, Eigen::ColMajor> a(2, 3, 4);
+ a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f},
+ {7.0f, 6.0f, 5.0f, 4.0f},
+ {8.0f, 9.0f, 10.0f, 11.0f}},
+ {{12.0f, 13.0f, 14.0f, 15.0f},
+ {19.0f, 18.0f, 17.0f, 16.0f},
+ {20.0f, 21.0f, 22.0f, 23.0f}}});
+ // The tensor a has 3 dimensions. We reduce along the
+ // first 2, resulting in a tensor with a single dimension
+ // of size 4 (the last dimension of a.)
+ // Note that we pass the array of reduction dimensions
+ // directly to the maximum() call.
+ Eigen::Tensor<float, 1, Eigen::ColMajor> b =
+ a.maximum(Eigen::array<int, 2>({0, 1}));
+ cout << "b" << endl << b << endl << endl;
+ =>
+ b
+ 20
+ 21
+ 22
+ 23
+
+#### Reduction along all dimensions
+
+As a special case, if you pass no parameter to a reduction operation the
+original tensor is reduced along *all* its dimensions. The result is a
+scalar, represented as a zero-dimension tensor.
+
+ Eigen::Tensor<float, 3> a(2, 3, 4);
+ a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f},
+ {7.0f, 6.0f, 5.0f, 4.0f},
+ {8.0f, 9.0f, 10.0f, 11.0f}},
+ {{12.0f, 13.0f, 14.0f, 15.0f},
+ {19.0f, 18.0f, 17.0f, 16.0f},
+ {20.0f, 21.0f, 22.0f, 23.0f}}});
+ // Reduce along all dimensions using the sum() operator.
+ Eigen::Tensor<float, 0> b = a.sum();
+ cout << "b" << endl << b << endl << endl;
+ =>
+ b
+ 276
+
+
+### <Operation> sum(const Dimensions& new_dims)
+### <Operation> sum()
+
+Reduce a tensor using the sum() operator. The resulting values
+are the sum of the reduced values.
+
+### <Operation> mean(const Dimensions& new_dims)
+### <Operation> mean()
+
+Reduce a tensor using the mean() operator. The resulting values
+are the mean of the reduced values.
+
+### <Operation> maximum(const Dimensions& new_dims)
+### <Operation> maximum()
+
+Reduce a tensor using the maximum() operator. The resulting values are the
+largest of the reduced values.
+
+### <Operation> minimum(const Dimensions& new_dims)
+### <Operation> minimum()
+
+Reduce a tensor using the minimum() operator. The resulting values
+are the smallest of the reduced values.
+
+### <Operation> prod(const Dimensions& new_dims)
+### <Operation> prod()
+
+Reduce a tensor using the prod() operator. The resulting values
+are the product of the reduced values.
+
+### <Operation> all(const Dimensions& new_dims)
+### <Operation> all()
+Reduce a tensor using the all() operator. Casts tensor to bool and then checks
+whether all elements are true. Runs through all elements rather than
+short-circuiting, so may be significantly inefficient.
+
+### <Operation> any(const Dimensions& new_dims)
+### <Operation> any()
+Reduce a tensor using the any() operator. Casts tensor to bool and then checks
+whether any element is true. Runs through all elements rather than
+short-circuiting, so may be significantly inefficient.
+
+
+### <Operation> reduce(const Dimensions& new_dims, const Reducer& reducer)
+
+Reduce a tensor using a user-defined reduction operator. See `SumReducer`
+in TensorFunctors.h for information on how to implement a reduction operator.
+
+
+## Trace
+
+A *Trace* operation returns a tensor with fewer dimensions than the original
+tensor. It returns a tensor whose elements are the sum of the elements of the
+original tensor along the main diagonal for a list of specified dimensions, the
+"trace dimensions". Similar to the `Reduction Dimensions`, the trace dimensions
+are passed as an input parameter to the operation, are of type `<TensorType>::``Dimensions`
+, and have the same requirements when passed as an input parameter. In addition,
+the trace dimensions must have the same size.
+
+Example: Trace along 2 dimensions.
+
+ // Create a tensor of 3 dimensions
+ Eigen::Tensor<int, 3> a(2, 2, 3);
+ a.setValues({{{1, 2, 3}, {4, 5, 6}}, {{7, 8, 9}, {10, 11, 12}}});
+ // Specify the dimensions along which the trace will be computed.
+ // In this example, the trace can only be computed along the dimensions
+ // with indices 0 and 1
+ Eigen::array<int, 2> dims({0, 1});
+ // The output tensor contains all but the trace dimensions.
+ Tensor<int, 1> a_trace = a.trace(dims);
+ cout << "a_trace:" << endl;
+ cout << a_trace << endl;
+ =>
+ a_trace:
+ 11
+ 13
+ 15
+
+
+### <Operation> trace(const Dimensions& new_dims)
+### <Operation> trace()
+
+As a special case, if no parameter is passed to the operation, trace is computed
+along *all* dimensions of the input tensor.
+
+Example: Trace along all dimensions.
+
+ // Create a tensor of 3 dimensions, with all dimensions having the same size.
+ Eigen::Tensor<int, 3> a(3, 3, 3);
+ a.setValues({{{1, 2, 3}, {4, 5, 6}, {7, 8, 9}},
+ {{10, 11, 12}, {13, 14, 15}, {16, 17, 18}},
+ {{19, 20, 21}, {22, 23, 24}, {25, 26, 27}}});
+ // Result is a zero dimension tensor
+ Tensor<int, 0> a_trace = a.trace();
+ cout<<"a_trace:"<<endl;
+ cout<<a_trace<<endl;
+ =>
+ a_trace:
+ 42
+
+
+## Scan Operations
+
+A *Scan* operation returns a tensor with the same dimensions as the original
+tensor. The operation performs an inclusive scan along the specified
+axis, which means it computes a running total along the axis for a given
+reduction operation.
+If the reduction operation corresponds to summation, then this computes the
+prefix sum of the tensor along the given axis.
+
+Example:
+dd a comment to this line
+
+ // Create a tensor of 2 dimensions
+ Eigen::Tensor<int, 2> a(2, 3);
+ a.setValues({{1, 2, 3}, {4, 5, 6}});
+ // Scan it along the second dimension (1) using summation
+ Eigen::Tensor<int, 2> b = a.cumsum(1);
+ // The result is a tensor with the same size as the input
+ cout << "a" << endl << a << endl << endl;
+ cout << "b" << endl << b << endl << endl;
+ =>
+ a
+ 1 2 3
+ 4 5 6
+
+ b
+ 1 3 6
+ 4 9 15
+
+### <Operation> cumsum(const Index& axis)
+
+Perform a scan by summing consecutive entries.
+
+### <Operation> cumprod(const Index& axis)
+
+Perform a scan by multiplying consecutive entries.
+
+
+## Convolutions
+
+### <Operation> convolve(const Kernel& kernel, const Dimensions& dims)
+
+Returns a tensor that is the output of the convolution of the input tensor with the kernel,
+along the specified dimensions of the input tensor. The dimension size for dimensions of the output tensor
+which were part of the convolution will be reduced by the formula:
+output_dim_size = input_dim_size - kernel_dim_size + 1 (requires: input_dim_size >= kernel_dim_size).
+The dimension sizes for dimensions that were not part of the convolution will remain the same.
+Performance of the convolution can depend on the length of the stride(s) of the input tensor dimension(s) along which the
+convolution is computed (the first dimension has the shortest stride for ColMajor, whereas RowMajor's shortest stride is
+for the last dimension).
+
+ // Compute convolution along the second and third dimension.
+ Tensor<float, 4, DataLayout> input(3, 3, 7, 11);
+ Tensor<float, 2, DataLayout> kernel(2, 2);
+ Tensor<float, 4, DataLayout> output(3, 2, 6, 11);
+ input.setRandom();
+ kernel.setRandom();
+
+ Eigen::array<ptrdiff_t, 2> dims({1, 2}); // Specify second and third dimension for convolution.
+ output = input.convolve(kernel, dims);
+
+ for (int i = 0; i < 3; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ for (int k = 0; k < 6; ++k) {
+ for (int l = 0; l < 11; ++l) {
+ const float result = output(i,j,k,l);
+ const float expected = input(i,j+0,k+0,l) * kernel(0,0) +
+ input(i,j+1,k+0,l) * kernel(1,0) +
+ input(i,j+0,k+1,l) * kernel(0,1) +
+ input(i,j+1,k+1,l) * kernel(1,1);
+ VERIFY_IS_APPROX(result, expected);
+ }
+ }
+ }
+ }
+
+
+## Geometrical Operations
+
+These operations return a Tensor with different dimensions than the original
+Tensor. They can be used to access slices of tensors, see them with different
+dimensions, or pad tensors with additional data.
+
+### <Operation> reshape(const Dimensions& new_dims)
+
+Returns a view of the input tensor that has been reshaped to the specified
+new dimensions. The argument new_dims is an array of Index values. The
+rank of the resulting tensor is equal to the number of elements in new_dims.
+
+The product of all the sizes in the new dimension array must be equal to
+the number of elements in the input tensor.
+
+ // Increase the rank of the input tensor by introducing a new dimension
+ // of size 1.
+ Tensor<float, 2> input(7, 11);
+ array<int, 3> three_dims{{7, 11, 1}};
+ Tensor<float, 3> result = input.reshape(three_dims);
+
+ // Decrease the rank of the input tensor by merging 2 dimensions;
+ array<int, 1> one_dim{{7 * 11}};
+ Tensor<float, 1> result = input.reshape(one_dim);
+
+This operation does not move any data in the input tensor, so the resulting
+contents of a reshaped Tensor depend on the data layout of the original Tensor.
+
+For example this is what happens when you `reshape()` a 2D ColMajor tensor
+to one dimension:
+
+ Eigen::Tensor<float, 2, Eigen::ColMajor> a(2, 3);
+ a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}});
+ Eigen::array<Eigen::DenseIndex, 1> one_dim({3 * 2});
+ Eigen::Tensor<float, 1, Eigen::ColMajor> b = a.reshape(one_dim);
+ cout << "b" << endl << b << endl;
+ =>
+ b
+ 0
+ 300
+ 100
+ 400
+ 200
+ 500
+
+This is what happens when the 2D Tensor is RowMajor:
+
+ Eigen::Tensor<float, 2, Eigen::RowMajor> a(2, 3);
+ a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}});
+ Eigen::array<Eigen::DenseIndex, 1> one_dim({3 * 2});
+ Eigen::Tensor<float, 1, Eigen::RowMajor> b = a.reshape(one_dim);
+ cout << "b" << endl << b << endl;
+ =>
+ b
+ 0
+ 100
+ 200
+ 300
+ 400
+ 500
+
+The reshape operation is a lvalue. In other words, it can be used on the left
+side of the assignment operator.
+
+The previous example can be rewritten as follow:
+
+ Eigen::Tensor<float, 2, Eigen::ColMajor> a(2, 3);
+ a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}});
+ Eigen::array<Eigen::DenseIndex, 2> two_dim({2, 3});
+ Eigen::Tensor<float, 1, Eigen::ColMajor> b(6);
+ b.reshape(two_dim) = a;
+ cout << "b" << endl << b << endl;
+ =>
+ b
+ 0
+ 300
+ 100
+ 400
+ 200
+ 500
+
+Note that "b" itself was not reshaped but that instead the assignment is done to
+the reshape view of b.
+
+
+### <Operation> shuffle(const Shuffle& shuffle)
+
+Returns a copy of the input tensor whose dimensions have been
+reordered according to the specified permutation. The argument shuffle
+is an array of Index values. Its size is the rank of the input
+tensor. It must contain a permutation of 0, 1, ..., rank - 1. The i-th
+dimension of the output tensor equals to the size of the shuffle[i]-th
+dimension of the input tensor. For example:
+
+ // Shuffle all dimensions to the left by 1.
+ Tensor<float, 3> input(20, 30, 50);
+ // ... set some values in input.
+ Tensor<float, 3> output = input.shuffle({1, 2, 0})
+
+ eigen_assert(output.dimension(0) == 30);
+ eigen_assert(output.dimension(1) == 50);
+ eigen_assert(output.dimension(2) == 20);
+
+Indices into the output tensor are shuffled accordingly to formulate
+indices into the input tensor. For example, one can assert in the above
+code snippet that:
+
+ eigen_assert(output(3, 7, 11) == input(11, 3, 7));
+
+In general, one can assert that
+
+ eigen_assert(output(..., indices[shuffle[i]], ...) ==
+ input(..., indices[i], ...))
+
+The shuffle operation results in a lvalue, which means that it can be assigned
+to. In other words, it can be used on the left side of the assignment operator.
+
+Let's rewrite the previous example to take advantage of this feature:
+
+ // Shuffle all dimensions to the left by 1.
+ Tensor<float, 3> input(20, 30, 50);
+ // ... set some values in input.
+ Tensor<float, 3> output(30, 50, 20);
+ output.shuffle({2, 0, 1}) = input;
+
+
+### <Operation> stride(const Strides& strides)
+
+Returns a view of the input tensor that strides (skips stride-1
+elements) along each of the dimensions. The argument strides is an
+array of Index values. The dimensions of the resulting tensor are
+ceil(input_dimensions[i] / strides[i]).
+
+For example this is what happens when you `stride()` a 2D tensor:
+
+ Eigen::Tensor<int, 2> a(4, 3);
+ a.setValues({{0, 100, 200}, {300, 400, 500}, {600, 700, 800}, {900, 1000, 1100}});
+ Eigen::array<Eigen::DenseIndex, 2> strides({3, 2});
+ Eigen::Tensor<int, 2> b = a.stride(strides);
+ cout << "b" << endl << b << endl;
+ =>
+ b
+ 0 200
+ 900 1100
+
+It is possible to assign a tensor to a stride:
+ Tensor<float, 3> input(20, 30, 50);
+ // ... set some values in input.
+ Tensor<float, 3> output(40, 90, 200);
+ output.stride({2, 3, 4}) = input;
+
+
+### <Operation> slice(const StartIndices& offsets, const Sizes& extents)
+
+Returns a sub-tensor of the given tensor. For each dimension i, the slice is
+made of the coefficients stored between offset[i] and offset[i] + extents[i] in
+the input tensor.
+
+ Eigen::Tensor<int, 2> a(4, 3);
+ a.setValues({{0, 100, 200}, {300, 400, 500},
+ {600, 700, 800}, {900, 1000, 1100}});
+ Eigen::array<int, 2> offsets = {1, 0};
+ Eigen::array<int, 2> extents = {2, 2};
+ Eigen::Tensor<int, 1> slice = a.slice(offsets, extents);
+ cout << "a" << endl << a << endl;
+ =>
+ a
+ 0 100 200
+ 300 400 500
+ 600 700 800
+ 900 1000 1100
+ cout << "slice" << endl << slice << endl;
+ =>
+ slice
+ 300 400
+ 600 700
+
+
+### <Operation> chip(const Index offset, const Index dim)
+
+A chip is a special kind of slice. It is the subtensor at the given offset in
+the dimension dim. The returned tensor has one fewer dimension than the input
+tensor: the dimension dim is removed.
+
+For example, a matrix chip would be either a row or a column of the input
+matrix.
+
+ Eigen::Tensor<int, 2> a(4, 3);
+ a.setValues({{0, 100, 200}, {300, 400, 500},
+ {600, 700, 800}, {900, 1000, 1100}});
+ Eigen::Tensor<int, 1> row_3 = a.chip(2, 0);
+ Eigen::Tensor<int, 1> col_2 = a.chip(1, 1);
+ cout << "a" << endl << a << endl;
+ =>
+ a
+ 0 100 200
+ 300 400 500
+ 600 700 800
+ 900 1000 1100
+ cout << "row_3" << endl << row_3 << endl;
+ =>
+ row_3
+ 600 700 800
+ cout << "col_2" << endl << col_2 << endl;
+ =>
+ col_2
+ 100 400 700 1000
+
+It is possible to assign values to a tensor chip since the chip operation is a
+lvalue. For example:
+
+ Eigen::Tensor<int, 1> a(3);
+ a.setValues({{100, 200, 300}});
+ Eigen::Tensor<int, 2> b(2, 3);
+ b.setZero();
+ b.chip(0, 0) = a;
+ cout << "a" << endl << a << endl;
+ =>
+ a
+ 100
+ 200
+ 300
+ cout << "b" << endl << b << endl;
+ =>
+ b
+ 100 200 300
+ 0 0 0
+
+
+### <Operation> reverse(const ReverseDimensions& reverse)
+
+Returns a view of the input tensor that reverses the order of the coefficients
+along a subset of the dimensions. The argument reverse is an array of boolean
+values that indicates whether or not the order of the coefficients should be
+reversed along each of the dimensions. This operation preserves the dimensions
+of the input tensor.
+
+For example this is what happens when you `reverse()` the first dimension
+of a 2D tensor:
+
+ Eigen::Tensor<int, 2> a(4, 3);
+ a.setValues({{0, 100, 200}, {300, 400, 500},
+ {600, 700, 800}, {900, 1000, 1100}});
+ Eigen::array<bool, 2> reverse({true, false});
+ Eigen::Tensor<int, 2> b = a.reverse(reverse);
+ cout << "a" << endl << a << endl << "b" << endl << b << endl;
+ =>
+ a
+ 0 100 200
+ 300 400 500
+ 600 700 800
+ 900 1000 1100
+ b
+ 900 1000 1100
+ 600 700 800
+ 300 400 500
+ 0 100 200
+
+
+### <Operation> broadcast(const Broadcast& broadcast)
+
+Returns a view of the input tensor in which the input is replicated one to many
+times.
+The broadcast argument specifies how many copies of the input tensor need to be
+made in each of the dimensions.
+
+ Eigen::Tensor<int, 2> a(2, 3);
+ a.setValues({{0, 100, 200}, {300, 400, 500}});
+ Eigen::array<int, 2> bcast({3, 2});
+ Eigen::Tensor<int, 2> b = a.broadcast(bcast);
+ cout << "a" << endl << a << endl << "b" << endl << b << endl;
+ =>
+ a
+ 0 100 200
+ 300 400 500
+ b
+ 0 100 200 0 100 200
+ 300 400 500 300 400 500
+ 0 100 200 0 100 200
+ 300 400 500 300 400 500
+ 0 100 200 0 100 200
+ 300 400 500 300 400 500
+
+### <Operation> concatenate(const OtherDerived& other, Axis axis)
+
+TODO
+
+### <Operation> pad(const PaddingDimensions& padding)
+
+Returns a view of the input tensor in which the input is padded with zeros.
+
+ Eigen::Tensor<int, 2> a(2, 3);
+ a.setValues({{0, 100, 200}, {300, 400, 500}});
+ Eigen::array<pair<int, int>, 2> paddings;
+ paddings[0] = make_pair(0, 1);
+ paddings[1] = make_pair(2, 3);
+ Eigen::Tensor<int, 2> b = a.pad(paddings);
+ cout << "a" << endl << a << endl << "b" << endl << b << endl;
+ =>
+ a
+ 0 100 200
+ 300 400 500
+ b
+ 0 0 0 0
+ 0 0 0 0
+ 0 100 200 0
+ 300 400 500 0
+ 0 0 0 0
+ 0 0 0 0
+ 0 0 0 0
+
+
+### <Operation> extract_patches(const PatchDims& patch_dims)
+
+Returns a tensor of coefficient patches extracted from the input tensor, where
+each patch is of dimension specified by 'patch_dims'. The returned tensor has
+one greater dimension than the input tensor, which is used to index each patch.
+The patch index in the output tensor depends on the data layout of the input
+tensor: the patch index is the last dimension ColMajor layout, and the first
+dimension in RowMajor layout.
+
+For example, given the following input tensor:
+
+ Eigen::Tensor<float, 2, DataLayout> tensor(3,4);
+ tensor.setValues({{0.0f, 1.0f, 2.0f, 3.0f},
+ {4.0f, 5.0f, 6.0f, 7.0f},
+ {8.0f, 9.0f, 10.0f, 11.0f}});
+
+ cout << "tensor: " << endl << tensor << endl;
+ =>
+ tensor:
+ 0 1 2 3
+ 4 5 6 7
+ 8 9 10 11
+
+Six 2x2 patches can be extracted and indexed using the following code:
+
+ Eigen::Tensor<float, 3, DataLayout> patch;
+ Eigen::array<ptrdiff_t, 2> patch_dims;
+ patch_dims[0] = 2;
+ patch_dims[1] = 2;
+ patch = tensor.extract_patches(patch_dims);
+ for (int k = 0; k < 6; ++k) {
+ cout << "patch index: " << k << endl;
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ if (DataLayout == ColMajor) {
+ cout << patch(i, j, k) << " ";
+ } else {
+ cout << patch(k, i, j) << " ";
+ }
+ }
+ cout << endl;
+ }
+ }
+
+This code results in the following output when the data layout is ColMajor:
+
+ patch index: 0
+ 0 1
+ 4 5
+ patch index: 1
+ 4 5
+ 8 9
+ patch index: 2
+ 1 2
+ 5 6
+ patch index: 3
+ 5 6
+ 9 10
+ patch index: 4
+ 2 3
+ 6 7
+ patch index: 5
+ 6 7
+ 10 11
+
+This code results in the following output when the data layout is RowMajor:
+(NOTE: the set of patches is the same as in ColMajor, but are indexed differently).
+
+ patch index: 0
+ 0 1
+ 4 5
+ patch index: 1
+ 1 2
+ 5 6
+ patch index: 2
+ 2 3
+ 6 7
+ patch index: 3
+ 4 5
+ 8 9
+ patch index: 4
+ 5 6
+ 9 10
+ patch index: 5
+ 6 7
+ 10 11
+
+### <Operation> extract_image_patches(const Index patch_rows, const Index patch_cols, const Index row_stride, const Index col_stride, const PaddingType padding_type)
+
+Returns a tensor of coefficient image patches extracted from the input tensor,
+which is expected to have dimensions ordered as follows (depending on the data
+layout of the input tensor, and the number of additional dimensions 'N'):
+
+*) ColMajor
+1st dimension: channels (of size d)
+2nd dimension: rows (of size r)
+3rd dimension: columns (of size c)
+4th-Nth dimension: time (for video) or batch (for bulk processing).
+
+*) RowMajor (reverse order of ColMajor)
+1st-Nth dimension: time (for video) or batch (for bulk processing).
+N+1'th dimension: columns (of size c)
+N+2'th dimension: rows (of size r)
+N+3'th dimension: channels (of size d)
+
+The returned tensor has one greater dimension than the input tensor, which is
+used to index each patch. The patch index in the output tensor depends on the
+data layout of the input tensor: the patch index is the 4'th dimension in
+ColMajor layout, and the 4'th from the last dimension in RowMajor layout.
+
+For example, given the following input tensor with the following dimension
+sizes:
+ *) depth: 2
+ *) rows: 3
+ *) columns: 5
+ *) batch: 7
+
+ Tensor<float, 4> tensor(2,3,5,7);
+ Tensor<float, 4, RowMajor> tensor_row_major = tensor.swap_layout();
+
+2x2 image patches can be extracted and indexed using the following code:
+
+*) 2D patch: ColMajor (patch indexed by second-to-last dimension)
+
+ Tensor<float, 5> twod_patch;
+ twod_patch = tensor.extract_image_patches<2, 2>();
+ // twod_patch.dimension(0) == 2
+ // twod_patch.dimension(1) == 2
+ // twod_patch.dimension(2) == 2
+ // twod_patch.dimension(3) == 3*5
+ // twod_patch.dimension(4) == 7
+
+*) 2D patch: RowMajor (patch indexed by the second dimension)
+
+ Tensor<float, 5, RowMajor> twod_patch_row_major;
+ twod_patch_row_major = tensor_row_major.extract_image_patches<2, 2>();
+ // twod_patch_row_major.dimension(0) == 7
+ // twod_patch_row_major.dimension(1) == 3*5
+ // twod_patch_row_major.dimension(2) == 2
+ // twod_patch_row_major.dimension(3) == 2
+ // twod_patch_row_major.dimension(4) == 2
+
+## Special Operations
+
+### <Operation> cast<T>()
+
+Returns a tensor of type T with the same dimensions as the original tensor.
+The returned tensor contains the values of the original tensor converted to
+type T.
+
+ Eigen::Tensor<float, 2> a(2, 3);
+ Eigen::Tensor<int, 2> b = a.cast<int>();
+
+This can be useful for example if you need to do element-wise division of
+Tensors of integers. This is not currently supported by the Tensor library
+but you can easily cast the tensors to floats to do the division:
+
+ Eigen::Tensor<int, 2> a(2, 3);
+ a.setValues({{0, 1, 2}, {3, 4, 5}});
+ Eigen::Tensor<int, 2> b =
+ (a.cast<float>() / a.constant(2).cast<float>()).cast<int>();
+ cout << "a" << endl << a << endl << endl;
+ cout << "b" << endl << b << endl << endl;
+ =>
+ a
+ 0 1 2
+ 3 4 5
+
+ b
+ 0 0 1
+ 1 2 2
+
+
+### <Operation> eval()
+
+TODO
+
+
+## Representation of scalar values
+
+Scalar values are often represented by tensors of size 1 and rank 0.For example
+Tensor<T, N>::maximum() currently returns a Tensor<T, 0>. Similarly, the inner
+product of 2 1d tensors (through contractions) returns a 0d tensor.
+
+## Limitations
+
+* The number of tensor dimensions is currently limited to 250 when using a
+ compiler that supports cxx11. It is limited to only 5 for older compilers.
+* The IndexList class requires a cxx11 compliant compiler. You can use an
+ array of indices instead if you don't have access to a modern compiler.
+* On GPUs only floating point values are properly tested and optimized for.
+* Complex and integer values are known to be broken on GPUs. If you try to use
+ them you'll most likely end up triggering a static assertion failure such as
+ EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+
+
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/Tensor.h b/src/EigenUnsupported/CXX11/src/Tensor/Tensor.h
new file mode 100644
index 0000000..8cac2bb
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/Tensor.h
@@ -0,0 +1,554 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_H
+#define EIGEN_CXX11_TENSOR_TENSOR_H
+
+namespace Eigen {
+
+/** \class Tensor
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief The tensor class.
+ *
+ * The %Tensor class is the work-horse for all \em dense tensors within Eigen.
+ *
+ * The %Tensor class encompasses only dynamic-size objects so far.
+ *
+ * The first two template parameters are required:
+ * \tparam Scalar_ Numeric type, e.g. float, double, int or `std::complex<float>`.
+ * User defined scalar types are supported as well (see \ref user_defined_scalars "here").
+ * \tparam NumIndices_ Number of indices (i.e. rank of the tensor)
+ *
+ * The remaining template parameters are optional -- in most cases you don't have to worry about them.
+ * \tparam Options_ A combination of either \b #RowMajor or \b #ColMajor, and of either
+ * \b #AutoAlign or \b #DontAlign.
+ * The former controls \ref TopicStorageOrders "storage order", and defaults to column-major. The latter controls alignment, which is required
+ * for vectorization. It defaults to aligning tensors. Note that tensors currently do not support any operations that profit from vectorization.
+ * Support for such operations (i.e. adding two tensors etc.) is planned.
+ *
+ * You can access elements of tensors using normal subscripting:
+ *
+ * \code
+ * Eigen::Tensor<double, 4> t(10, 10, 10, 10);
+ * t(0, 1, 2, 3) = 42.0;
+ * \endcode
+ *
+ * This class can be extended with the help of the plugin mechanism described on the page
+ * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_TENSOR_PLUGIN.
+ *
+ * <i><b>Some notes:</b></i>
+ *
+ * <dl>
+ * <dt><b>Relation to other parts of Eigen:</b></dt>
+ * <dd>The midterm development goal for this class is to have a similar hierarchy as Eigen uses for matrices, so that
+ * taking blocks or using tensors in expressions is easily possible, including an interface with the vector/matrix code
+ * by providing .asMatrix() and .asVector() (or similar) methods for rank 2 and 1 tensors. However, currently, the %Tensor
+ * class does not provide any of these features and is only available as a stand-alone class that just allows for
+ * coefficient access. Also, when fixed-size tensors are implemented, the number of template arguments is likely to
+ * change dramatically.</dd>
+ * </dl>
+ *
+ * \ref TopicStorageOrders
+ */
+
+template<typename Scalar_, int NumIndices_, int Options_, typename IndexType_>
+class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
+{
+ public:
+ typedef Tensor<Scalar_, NumIndices_, Options_, IndexType_> Self;
+ typedef TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexType_> > Base;
+ typedef typename Eigen::internal::nested<Self>::type Nested;
+ typedef typename internal::traits<Self>::StorageKind StorageKind;
+ typedef typename internal::traits<Self>::Index Index;
+ typedef Scalar_ Scalar;
+ typedef typename NumTraits<Scalar>::Real RealScalar;
+ typedef typename Base::CoeffReturnType CoeffReturnType;
+
+ enum {
+ IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0) & !(Options_&DontAlign),
+ Layout = Options_ & RowMajor ? RowMajor : ColMajor,
+ CoordAccess = true,
+ RawAccess = true
+ };
+
+ static const int Options = Options_;
+ static const int NumIndices = NumIndices_;
+ typedef DSizes<Index, NumIndices_> Dimensions;
+
+ protected:
+ TensorStorage<Scalar, Dimensions, Options> m_storage;
+
+#ifdef EIGEN_HAS_SFINAE
+ template<typename CustomIndices>
+ struct isOfNormalIndex{
+ static const bool is_array = internal::is_base_of<array<Index, NumIndices>, CustomIndices>::value;
+ static const bool is_int = NumTraits<CustomIndices>::IsInteger;
+ static const bool value = is_array | is_int;
+ };
+#endif
+
+ public:
+ // Metadata
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); }
+
+ // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+ // work, because that uses base().coeffRef() - and we don't yet
+ // implement a similar class hierarchy
+ inline Self& base() { return *this; }
+ inline const Self& base() const { return *this; }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+ template<typename... IndexTypes>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
+ {
+ // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ return coeff(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
+ }
+#endif
+
+ // normal indices
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(const array<Index, NumIndices>& indices) const
+ {
+ eigen_internal_assert(checkIndexRange(indices));
+ return m_storage.data()[linearizedIndex(indices)];
+ }
+
+ // custom indices
+#ifdef EIGEN_HAS_SFINAE
+ template<typename CustomIndices,
+ EIGEN_SFINAE_ENABLE_IF( !(isOfNormalIndex<CustomIndices>::value) )
+ >
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(CustomIndices& indices) const
+ {
+ return coeff(internal::customIndices2Array<Index,NumIndices>(indices));
+ }
+#endif
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff() const
+ {
+ EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ return m_storage.data()[0];
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const
+ {
+ eigen_internal_assert(index >= 0 && index < size());
+ return m_storage.data()[index];
+ }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+ template<typename... IndexTypes>
+ inline Scalar& coeffRef(Index firstIndex, Index secondIndex, IndexTypes... otherIndices)
+ {
+ // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ return coeffRef(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
+ }
+#endif
+
+ // normal indices
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices)
+ {
+ eigen_internal_assert(checkIndexRange(indices));
+ return m_storage.data()[linearizedIndex(indices)];
+ }
+
+ // custom indices
+#ifdef EIGEN_HAS_SFINAE
+ template<typename CustomIndices,
+ EIGEN_SFINAE_ENABLE_IF( !(isOfNormalIndex<CustomIndices>::value) )
+ >
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(CustomIndices& indices)
+ {
+ return coeffRef(internal::customIndices2Array<Index,NumIndices>(indices));
+ }
+#endif
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef()
+ {
+ EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ return m_storage.data()[0];
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
+ {
+ eigen_internal_assert(index >= 0 && index < size());
+ return m_storage.data()[index];
+ }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+ template<typename... IndexTypes>
+ inline const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
+ {
+ // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ return this->operator()(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
+ }
+#else
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const
+ {
+ return coeff(array<Index, 2>(i0, i1));
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const
+ {
+ return coeff(array<Index, 3>(i0, i1, i2));
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const
+ {
+ return coeff(array<Index, 4>(i0, i1, i2, i3));
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const
+ {
+ return coeff(array<Index, 5>(i0, i1, i2, i3, i4));
+ }
+#endif
+
+ // custom indices
+#ifdef EIGEN_HAS_SFINAE
+ template<typename CustomIndices,
+ EIGEN_SFINAE_ENABLE_IF( !(isOfNormalIndex<CustomIndices>::value) )
+ >
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(CustomIndices& indices) const
+ {
+ return coeff(internal::customIndices2Array<Index,NumIndices>(indices));
+ }
+#endif
+
+ // normal indices
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const
+ {
+ return coeff(indices);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const
+ {
+ eigen_internal_assert(index >= 0 && index < size());
+ return coeff(index);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()() const
+ {
+ EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ return coeff();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const
+ {
+ // The bracket operator is only for vectors, use the parenthesis operator instead.
+ EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ return coeff(index);
+ }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+ template<typename... IndexTypes>
+ inline Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices)
+ {
+ // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ return operator()(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
+ }
+#else
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1)
+ {
+ return coeffRef(array<Index, 2>(i0, i1));
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2)
+ {
+ return coeffRef(array<Index, 3>(i0, i1, i2));
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3)
+ {
+ return coeffRef(array<Index, 4>(i0, i1, i2, i3));
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4)
+ {
+ return coeffRef(array<Index, 5>(i0, i1, i2, i3, i4));
+ }
+#endif
+
+ // normal indices
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices)
+ {
+ return coeffRef(indices);
+ }
+
+ // custom indices
+#ifdef EIGEN_HAS_SFINAE
+ template<typename CustomIndices,
+ EIGEN_SFINAE_ENABLE_IF( !(isOfNormalIndex<CustomIndices>::value) )
+ >
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(CustomIndices& indices)
+ {
+ return coeffRef(internal::customIndices2Array<Index,NumIndices>(indices));
+ }
+#endif
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index index)
+ {
+ eigen_assert(index >= 0 && index < size());
+ return coeffRef(index);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()()
+ {
+ EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ return coeffRef();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator[](Index index)
+ {
+ // The bracket operator is only for vectors, use the parenthesis operator instead
+ EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ return coeffRef(index);
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Tensor()
+ : m_storage()
+ {
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Tensor(const Self& other)
+ : m_storage(other.m_storage)
+ {
+ }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+ template<typename... IndexTypes>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index firstDimension, IndexTypes... otherDimensions)
+ : m_storage(firstDimension, otherDimensions...)
+ {
+ // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
+ EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ }
+#else
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1)
+ : m_storage(dim1, array<Index, 1>(dim1))
+ {
+ EIGEN_STATIC_ASSERT(1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2)
+ : m_storage(dim1*dim2, array<Index, 2>(dim1, dim2))
+ {
+ EIGEN_STATIC_ASSERT(2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3)
+ : m_storage(dim1*dim2*dim3, array<Index, 3>(dim1, dim2, dim3))
+ {
+ EIGEN_STATIC_ASSERT(3 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3, Index dim4)
+ : m_storage(dim1*dim2*dim3*dim4, array<Index, 4>(dim1, dim2, dim3, dim4))
+ {
+ EIGEN_STATIC_ASSERT(4 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3, Index dim4, Index dim5)
+ : m_storage(dim1*dim2*dim3*dim4*dim5, array<Index, 5>(dim1, dim2, dim3, dim4, dim5))
+ {
+ EIGEN_STATIC_ASSERT(5 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ }
+#endif
+
+ /** Normal Dimension */
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(const array<Index, NumIndices>& dimensions)
+ : m_storage(internal::array_prod(dimensions), dimensions)
+ {
+ EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+ }
+
+ template<typename OtherDerived>
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Tensor(const TensorBase<OtherDerived, ReadOnlyAccessors>& other)
+ {
+ typedef TensorAssignOp<Tensor, const OtherDerived> Assign;
+ Assign assign(*this, other.derived());
+ resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
+ internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+ }
+
+ template<typename OtherDerived>
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Tensor(const TensorBase<OtherDerived, WriteAccessors>& other)
+ {
+ typedef TensorAssignOp<Tensor, const OtherDerived> Assign;
+ Assign assign(*this, other.derived());
+ resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
+ internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+ }
+
+ #if EIGEN_HAS_RVALUE_REFERENCES
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Tensor(Self&& other)
+ : m_storage(std::move(other.m_storage))
+ {
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Tensor& operator=(Self&& other)
+ {
+ m_storage = std::move(other.m_storage);
+ return *this;
+ }
+ #endif
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Tensor& operator=(const Tensor& other)
+ {
+ typedef TensorAssignOp<Tensor, const Tensor> Assign;
+ Assign assign(*this, other);
+ resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
+ internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+ return *this;
+ }
+ template<typename OtherDerived>
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Tensor& operator=(const OtherDerived& other)
+ {
+ typedef TensorAssignOp<Tensor, const OtherDerived> Assign;
+ Assign assign(*this, other);
+ resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
+ internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+ return *this;
+ }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+ template<typename... IndexTypes> EIGEN_DEVICE_FUNC
+ void resize(Index firstDimension, IndexTypes... otherDimensions)
+ {
+ // The number of dimensions used to resize a tensor must be equal to the rank of the tensor.
+ EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ resize(array<Index, NumIndices>{{firstDimension, otherDimensions...}});
+ }
+#endif
+
+ /** Normal Dimension */
+ EIGEN_DEVICE_FUNC void resize(const array<Index, NumIndices>& dimensions)
+ {
+ int i;
+ Index size = Index(1);
+ for (i = 0; i < NumIndices; i++) {
+ internal::check_rows_cols_for_overflow<Dynamic>::run(size, dimensions[i]);
+ size *= dimensions[i];
+ }
+ #ifdef EIGEN_INITIALIZE_COEFFS
+ bool size_changed = size != this->size();
+ m_storage.resize(size, dimensions);
+ if(size_changed) EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+ #else
+ m_storage.resize(size, dimensions);
+ #endif
+ }
+
+ // Why this overload, DSizes is derived from array ??? //
+ EIGEN_DEVICE_FUNC void resize(const DSizes<Index, NumIndices>& dimensions) {
+ array<Index, NumIndices> dims;
+ for (int i = 0; i < NumIndices; ++i) {
+ dims[i] = dimensions[i];
+ }
+ resize(dims);
+ }
+
+ EIGEN_DEVICE_FUNC
+ void resize()
+ {
+ EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ // Nothing to do: rank 0 tensors have fixed size
+ }
+
+#ifdef EIGEN_HAS_INDEX_LIST
+ template <typename FirstType, typename... OtherTypes>
+ EIGEN_DEVICE_FUNC
+ void resize(const Eigen::IndexList<FirstType, OtherTypes...>& dimensions) {
+ array<Index, NumIndices> dims;
+ for (int i = 0; i < NumIndices; ++i) {
+ dims[i] = static_cast<Index>(dimensions[i]);
+ }
+ resize(dims);
+ }
+#endif
+
+ /** Custom Dimension */
+#ifdef EIGEN_HAS_SFINAE
+ template<typename CustomDimension,
+ EIGEN_SFINAE_ENABLE_IF( !(isOfNormalIndex<CustomDimension>::value) )
+ >
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize(CustomDimension& dimensions)
+ {
+ resize(internal::customIndices2Array<Index,NumIndices>(dimensions));
+ }
+#endif
+
+#ifndef EIGEN_EMULATE_CXX11_META_H
+ template <typename std::ptrdiff_t... Indices>
+ EIGEN_DEVICE_FUNC
+ void resize(const Sizes<Indices...>& dimensions) {
+ array<Index, NumIndices> dims;
+ for (int i = 0; i < NumIndices; ++i) {
+ dims[i] = static_cast<Index>(dimensions[i]);
+ }
+ resize(dims);
+ }
+#else
+ template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5>
+ EIGEN_DEVICE_FUNC
+ void resize(const Sizes<V1, V2, V3, V4, V5>& dimensions) {
+ array<Index, NumIndices> dims;
+ for (int i = 0; i < NumIndices; ++i) {
+ dims[i] = static_cast<Index>(dimensions[i]);
+ }
+ resize(dims);
+ }
+#endif
+
+ protected:
+
+ bool checkIndexRange(const array<Index, NumIndices>& indices) const
+ {
+ using internal::array_apply_and_reduce;
+ using internal::array_zip_and_reduce;
+ using internal::greater_equal_zero_op;
+ using internal::logical_and_op;
+ using internal::lesser_op;
+
+ return
+ // check whether the indices are all >= 0
+ array_apply_and_reduce<logical_and_op, greater_equal_zero_op>(indices) &&
+ // check whether the indices fit in the dimensions
+ array_zip_and_reduce<logical_and_op, lesser_op>(indices, m_storage.dimensions());
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index linearizedIndex(const array<Index, NumIndices>& indices) const
+ {
+ if (Options&RowMajor) {
+ return m_storage.dimensions().IndexOfRowMajor(indices);
+ } else {
+ return m_storage.dimensions().IndexOfColMajor(indices);
+ }
+ }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorArgMax.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorArgMax.h
new file mode 100644
index 0000000..8b8fb92
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorArgMax.h
@@ -0,0 +1,329 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Eugene Brevdo <ebrevdo@gmail.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H
+#define EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H
+
+namespace Eigen {
+namespace internal {
+
+/** \class TensorIndexTuple
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor + Index Tuple class.
+ *
+ *
+ */
+template<typename XprType>
+struct traits<TensorIndexTupleOp<XprType> > : public traits<XprType>
+{
+ typedef traits<XprType> XprTraits;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprTraits::Index Index;
+ typedef Tuple<Index, typename XprTraits::Scalar> Scalar;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = XprTraits::NumDimensions;
+ static const int Layout = XprTraits::Layout;
+};
+
+template<typename XprType>
+struct eval<TensorIndexTupleOp<XprType>, Eigen::Dense>
+{
+ typedef const TensorIndexTupleOp<XprType>EIGEN_DEVICE_REF type;
+};
+
+template<typename XprType>
+struct nested<TensorIndexTupleOp<XprType>, 1,
+ typename eval<TensorIndexTupleOp<XprType> >::type>
+{
+ typedef TensorIndexTupleOp<XprType> type;
+};
+
+} // end namespace internal
+
+template<typename XprType>
+class TensorIndexTupleOp : public TensorBase<TensorIndexTupleOp<XprType>, ReadOnlyAccessors>
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorIndexTupleOp>::Scalar Scalar;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename Eigen::internal::nested<TensorIndexTupleOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorIndexTupleOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorIndexTupleOp>::Index Index;
+ typedef Tuple<Index, typename XprType::CoeffReturnType> CoeffReturnType;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIndexTupleOp(const XprType& expr)
+ : m_xpr(expr) {}
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ expression() const { return m_xpr; }
+
+ protected:
+ typename XprType::Nested m_xpr;
+};
+
+// Eval as rvalue
+template<typename ArgType, typename Device>
+struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device>
+{
+ typedef TensorIndexTupleOp<ArgType> XprType;
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+ typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
+ static const int NumDims = internal::array_size<Dimensions>::value;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+
+ enum {
+ IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
+ PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
+ BlockAccess = false,
+ PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
+ };
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_impl(op.expression(), device) { }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
+ return m_impl.dimensions();
+ }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
+ m_impl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+ EIGEN_STRONG_INLINE void cleanup() {
+ m_impl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ return CoeffReturnType(index, m_impl.coeff(index));
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, 1);
+ }
+
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_impl.bind(cgh);
+ }
+#endif
+
+ protected:
+ TensorEvaluator<ArgType, Device> m_impl;
+};
+
+namespace internal {
+
+/** \class TensorTupleIndex
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Converts to Tensor<Tuple<Index, Scalar> > and reduces to Tensor<Index>.
+ *
+ */
+template<typename ReduceOp, typename Dims, typename XprType>
+struct traits<TensorTupleReducerOp<ReduceOp, Dims, XprType> > : public traits<XprType>
+{
+ typedef traits<XprType> XprTraits;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprTraits::Index Index;
+ typedef Index Scalar;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
+ static const int Layout = XprTraits::Layout;
+};
+
+template<typename ReduceOp, typename Dims, typename XprType>
+struct eval<TensorTupleReducerOp<ReduceOp, Dims, XprType>, Eigen::Dense>
+{
+ typedef const TensorTupleReducerOp<ReduceOp, Dims, XprType>EIGEN_DEVICE_REF type;
+};
+
+template<typename ReduceOp, typename Dims, typename XprType>
+struct nested<TensorTupleReducerOp<ReduceOp, Dims, XprType>, 1,
+ typename eval<TensorTupleReducerOp<ReduceOp, Dims, XprType> >::type>
+{
+ typedef TensorTupleReducerOp<ReduceOp, Dims, XprType> type;
+};
+
+} // end namespace internal
+
+template<typename ReduceOp, typename Dims, typename XprType>
+class TensorTupleReducerOp : public TensorBase<TensorTupleReducerOp<ReduceOp, Dims, XprType>, ReadOnlyAccessors>
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorTupleReducerOp>::Scalar Scalar;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename Eigen::internal::nested<TensorTupleReducerOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorTupleReducerOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorTupleReducerOp>::Index Index;
+ typedef Index CoeffReturnType;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTupleReducerOp(const XprType& expr,
+ const ReduceOp& reduce_op,
+ const Index return_dim,
+ const Dims& reduce_dims)
+ : m_xpr(expr), m_reduce_op(reduce_op), m_return_dim(return_dim), m_reduce_dims(reduce_dims) {}
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ expression() const { return m_xpr; }
+
+ EIGEN_DEVICE_FUNC
+ const ReduceOp& reduce_op() const { return m_reduce_op; }
+
+ EIGEN_DEVICE_FUNC
+ const Dims& reduce_dims() const { return m_reduce_dims; }
+
+ EIGEN_DEVICE_FUNC
+ Index return_dim() const { return m_return_dim; }
+
+ protected:
+ typename XprType::Nested m_xpr;
+ const ReduceOp m_reduce_op;
+ const Index m_return_dim;
+ const Dims m_reduce_dims;
+};
+
+// Eval as rvalue
+template<typename ReduceOp, typename Dims, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Device>
+{
+ typedef TensorTupleReducerOp<ReduceOp, Dims, ArgType> XprType;
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename TensorIndexTupleOp<ArgType>::CoeffReturnType TupleType;
+ typedef typename TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device>::Dimensions Dimensions;
+ typedef typename TensorEvaluator<const TensorIndexTupleOp<ArgType> , Device>::Dimensions InputDimensions;
+ static const int NumDims = internal::array_size<InputDimensions>::value;
+ typedef array<Index, NumDims> StrideDims;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+ typedef StorageMemory<TupleType, Device> TupleStorageMem;
+
+ enum {
+ IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
+ PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
+ BlockAccess = false,
+ PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+ Layout = TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
+ };
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_orig_impl(op.expression(), device),
+ m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device),
+ m_return_dim(op.return_dim())
+ {
+ gen_strides(m_orig_impl.dimensions(), m_strides);
+ if (Layout == static_cast<int>(ColMajor)) {
+ const Index total_size = internal::array_prod(m_orig_impl.dimensions());
+ m_stride_mod = (m_return_dim < NumDims - 1) ? m_strides[m_return_dim + 1] : total_size;
+ } else {
+ const Index total_size = internal::array_prod(m_orig_impl.dimensions());
+ m_stride_mod = (m_return_dim > 0) ? m_strides[m_return_dim - 1] : total_size;
+ }
+ // If m_return_dim is not a valid index, returns 1 or this can crash on Windows.
+ m_stride_div = ((m_return_dim >= 0) &&
+ (m_return_dim < static_cast<Index>(m_strides.size())))
+ ? m_strides[m_return_dim] : 1;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
+ return m_impl.dimensions();
+ }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
+ m_impl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+ EIGEN_STRONG_INLINE void cleanup() {
+ m_impl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+ const TupleType v = m_impl.coeff(index);
+ return (m_return_dim < 0) ? v.first : (v.first % m_stride_mod) / m_stride_div;
+ }
+
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+#ifdef EIGEN_USE_SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_impl.bind(cgh);
+ m_orig_impl.bind(cgh);
+ }
+#endif
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ const double compute_cost = 1.0 +
+ (m_return_dim < 0 ? 0.0 : (TensorOpCost::ModCost<Index>() + TensorOpCost::DivCost<Index>()));
+ return m_orig_impl.costPerCoeff(vectorized) +
+ m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost);
+ }
+
+ private:
+ EIGEN_DEVICE_FUNC void gen_strides(const InputDimensions& dims, StrideDims& strides) {
+ if (m_return_dim < 0) {
+ return; // Won't be using the strides.
+ }
+ eigen_assert(m_return_dim < NumDims &&
+ "Asking to convert index to a dimension outside of the rank");
+
+ // Calculate m_stride_div and m_stride_mod, which are used to
+ // calculate the value of an index w.r.t. the m_return_dim.
+ if (Layout == static_cast<int>(ColMajor)) {
+ strides[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ strides[i] = strides[i-1] * dims[i-1];
+ }
+ } else {
+ strides[NumDims-1] = 1;
+ for (int i = NumDims - 2; i >= 0; --i) {
+ strides[i] = strides[i+1] * dims[i+1];
+ }
+ }
+ }
+
+ protected:
+ TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device> m_orig_impl;
+ TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device> m_impl;
+ const Index m_return_dim;
+ StrideDims m_strides;
+ Index m_stride_mod;
+ Index m_stride_div;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorAssign.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorAssign.h
new file mode 100644
index 0000000..e5811d6
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorAssign.h
@@ -0,0 +1,247 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H
+#define EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H
+
+namespace Eigen {
+
+/** \class TensorAssign
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief The tensor assignment class.
+ *
+ * This class is represents the assignment of the values resulting from the evaluation of
+ * the rhs expression to the memory locations denoted by the lhs expression.
+ */
+namespace internal {
+template<typename LhsXprType, typename RhsXprType>
+struct traits<TensorAssignOp<LhsXprType, RhsXprType> >
+{
+ typedef typename LhsXprType::Scalar Scalar;
+ typedef typename traits<LhsXprType>::StorageKind StorageKind;
+ typedef typename promote_index_type<typename traits<LhsXprType>::Index,
+ typename traits<RhsXprType>::Index>::type Index;
+ typedef typename LhsXprType::Nested LhsNested;
+ typedef typename RhsXprType::Nested RhsNested;
+ typedef typename remove_reference<LhsNested>::type _LhsNested;
+ typedef typename remove_reference<RhsNested>::type _RhsNested;
+ static const std::size_t NumDimensions = internal::traits<LhsXprType>::NumDimensions;
+ static const int Layout = internal::traits<LhsXprType>::Layout;
+ typedef typename traits<LhsXprType>::PointerType PointerType;
+
+ enum {
+ Flags = 0
+ };
+};
+
+template<typename LhsXprType, typename RhsXprType>
+struct eval<TensorAssignOp<LhsXprType, RhsXprType>, Eigen::Dense>
+{
+ typedef const TensorAssignOp<LhsXprType, RhsXprType>& type;
+};
+
+template<typename LhsXprType, typename RhsXprType>
+struct nested<TensorAssignOp<LhsXprType, RhsXprType>, 1, typename eval<TensorAssignOp<LhsXprType, RhsXprType> >::type>
+{
+ typedef TensorAssignOp<LhsXprType, RhsXprType> type;
+};
+
+} // end namespace internal
+
+
+
+template<typename LhsXprType, typename RhsXprType>
+class TensorAssignOp : public TensorBase<TensorAssignOp<LhsXprType, RhsXprType> >
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorAssignOp>::Scalar Scalar;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename LhsXprType::CoeffReturnType CoeffReturnType;
+ typedef typename Eigen::internal::nested<TensorAssignOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorAssignOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorAssignOp>::Index Index;
+
+ static const int NumDims = Eigen::internal::traits<TensorAssignOp>::NumDimensions;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorAssignOp(LhsXprType& lhs, const RhsXprType& rhs)
+ : m_lhs_xpr(lhs), m_rhs_xpr(rhs) {}
+
+ /** \returns the nested expressions */
+ EIGEN_DEVICE_FUNC
+ typename internal::remove_all<typename LhsXprType::Nested>::type&
+ lhsExpression() const { return *((typename internal::remove_all<typename LhsXprType::Nested>::type*)&m_lhs_xpr); }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename RhsXprType::Nested>::type&
+ rhsExpression() const { return m_rhs_xpr; }
+
+ protected:
+ typename internal::remove_all<typename LhsXprType::Nested>::type& m_lhs_xpr;
+ const typename internal::remove_all<typename RhsXprType::Nested>::type& m_rhs_xpr;
+};
+
+
+template<typename LeftArgType, typename RightArgType, typename Device>
+struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
+{
+ typedef TensorAssignOp<LeftArgType, RightArgType> XprType;
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+
+ static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+ static const int NumDims = XprType::NumDims;
+
+ enum {
+ IsAligned = int(TensorEvaluator<LeftArgType, Device>::IsAligned) &
+ int(TensorEvaluator<RightArgType, Device>::IsAligned),
+ PacketAccess = int(TensorEvaluator<LeftArgType, Device>::PacketAccess) &
+ int(TensorEvaluator<RightArgType, Device>::PacketAccess),
+ BlockAccess = int(TensorEvaluator<LeftArgType, Device>::BlockAccess) &
+ int(TensorEvaluator<RightArgType, Device>::BlockAccess),
+ PreferBlockAccess = int(TensorEvaluator<LeftArgType, Device>::PreferBlockAccess) |
+ int(TensorEvaluator<RightArgType, Device>::PreferBlockAccess),
+ Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+ RawAccess = TensorEvaluator<LeftArgType, Device>::RawAccess
+ };
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+ typedef typename TensorEvaluator<const RightArgType, Device>::TensorBlock
+ RightTensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ TensorEvaluator(const XprType& op, const Device& device) :
+ m_leftImpl(op.lhsExpression(), device),
+ m_rightImpl(op.rhsExpression(), device)
+ {
+ EIGEN_STATIC_ASSERT(
+ (static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) ==
+ static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
+ YOU_MADE_A_PROGRAMMING_MISTAKE);
+ }
+
+ EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
+ {
+ // The dimensions of the lhs and the rhs tensors should be equal to prevent
+ // overflows and ensure the result is fully initialized.
+ // TODO: use left impl instead if right impl dimensions are known at compile time.
+ return m_rightImpl.dimensions();
+ }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
+ eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions()));
+ m_leftImpl.evalSubExprsIfNeeded(NULL);
+ // If the lhs provides raw access to its storage area (i.e. if m_leftImpl.data() returns a non
+ // null value), attempt to evaluate the rhs expression in place. Returns true iff in place
+ // evaluation isn't supported and the caller still needs to manually assign the values generated
+ // by the rhs to the lhs.
+ return m_rightImpl.evalSubExprsIfNeeded(m_leftImpl.data());
+ }
+
+#ifdef EIGEN_USE_THREADS
+ template <typename EvalSubExprsCallback>
+ EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+ EvaluatorPointerType, EvalSubExprsCallback done) {
+ m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) {
+ m_rightImpl.evalSubExprsIfNeededAsync(
+ m_leftImpl.data(), [done](bool need_assign) { done(need_assign); });
+ });
+ }
+#endif // EIGEN_USE_THREADS
+
+ EIGEN_STRONG_INLINE void cleanup() {
+ m_leftImpl.cleanup();
+ m_rightImpl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) {
+ m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) {
+
+ const int LhsStoreMode = TensorEvaluator<LeftArgType, Device>::IsAligned ? Aligned : Unaligned;
+ const int RhsLoadMode = TensorEvaluator<RightArgType, Device>::IsAligned ? Aligned : Unaligned;
+ m_leftImpl.template writePacket<LhsStoreMode>(i, m_rightImpl.template packet<RhsLoadMode>(i));
+ }
+ EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+ {
+ return m_leftImpl.coeff(index);
+ }
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
+ {
+ return m_leftImpl.template packet<LoadMode>(index);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ // We assume that evalPacket or evalScalar is called to perform the
+ // assignment and account for the cost of the write here, but reduce left
+ // cost by one load because we are using m_leftImpl.coeffRef.
+ TensorOpCost left = m_leftImpl.costPerCoeff(vectorized);
+ return m_rightImpl.costPerCoeff(vectorized) +
+ TensorOpCost(
+ numext::maxi(0.0, left.bytes_loaded() - sizeof(CoeffReturnType)),
+ left.bytes_stored(), left.compute_cycles()) +
+ TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ internal::TensorBlockResourceRequirements getResourceRequirements() const {
+ return internal::TensorBlockResourceRequirements::merge(
+ m_leftImpl.getResourceRequirements(),
+ m_rightImpl.getResourceRequirements());
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(
+ TensorBlockDesc& desc, TensorBlockScratch& scratch) {
+ if (TensorEvaluator<LeftArgType, Device>::RawAccess &&
+ m_leftImpl.data() != NULL) {
+ // If destination has raw data access, we pass it as a potential
+ // destination for a block descriptor evaluation.
+ desc.template AddDestinationBuffer<Layout>(
+ /*dst_base=*/m_leftImpl.data() + desc.offset(),
+ /*dst_strides=*/internal::strides<Layout>(m_leftImpl.dimensions()));
+ }
+
+ RightTensorBlock block = m_rightImpl.block(desc, scratch, /*root_of_expr_ast=*/true);
+ // If block was evaluated into a destination, there is no need to do assignment.
+ if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) {
+ m_leftImpl.writeBlock(desc, block);
+ }
+ block.cleanup();
+ }
+
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_leftImpl.bind(cgh);
+ m_rightImpl.bind(cgh);
+ }
+#endif
+
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_leftImpl.data(); }
+
+ private:
+ TensorEvaluator<LeftArgType, Device> m_leftImpl;
+ TensorEvaluator<RightArgType, Device> m_rightImpl;
+};
+
+}
+
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorBase.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorBase.h
new file mode 100644
index 0000000..35b6458
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorBase.h
@@ -0,0 +1,1176 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_BASE_H
+#define EIGEN_CXX11_TENSOR_TENSOR_BASE_H
+
+// clang-format off
+
+namespace Eigen {
+
+/** \class TensorBase
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief The tensor base class.
+ *
+ * This class is the common parent of the Tensor and TensorMap class, thus
+ * making it possible to use either class interchangeably in expressions.
+ */
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+// FIXME Doxygen does not like the inheritance with different template parameters
+// Since there is no doxygen documentation inside, we disable it for now
+template<typename Derived>
+class TensorBase<Derived, ReadOnlyAccessors>
+{
+ public:
+ typedef internal::traits<Derived> DerivedTraits;
+ typedef typename DerivedTraits::Scalar Scalar;
+ typedef typename DerivedTraits::Index Index;
+ typedef typename internal::remove_const<Scalar>::type CoeffReturnType;
+ static const int NumDimensions = DerivedTraits::NumDimensions;
+
+ // Generic nullary operation support.
+ template <typename CustomNullaryOp> EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<CustomNullaryOp, const Derived>
+ nullaryExpr(const CustomNullaryOp& func) const {
+ return TensorCwiseNullaryOp<CustomNullaryOp, const Derived>(derived(), func);
+ }
+
+ // Coefficient-wise nullary operators
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived>
+ constant(const Scalar& value) const {
+ return nullaryExpr(internal::scalar_constant_op<Scalar>(value));
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<internal::UniformRandomGenerator<Scalar>, const Derived>
+ random() const {
+ return nullaryExpr(internal::UniformRandomGenerator<Scalar>());
+ }
+ template <typename RandomGenerator> EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<RandomGenerator, const Derived>
+ random(const RandomGenerator& gen = RandomGenerator()) const {
+ return nullaryExpr(gen);
+ }
+
+ // Tensor generation
+ template <typename Generator> EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorGeneratorOp<Generator, const Derived>
+ generate(const Generator& generator) const {
+ return TensorGeneratorOp<Generator, const Derived>(derived(), generator);
+ }
+
+ // Generic unary operation support.
+ template <typename CustomUnaryOp> EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<CustomUnaryOp, const Derived>
+ unaryExpr(const CustomUnaryOp& func) const {
+ return TensorCwiseUnaryOp<CustomUnaryOp, const Derived>(derived(), func);
+ }
+
+ // Coefficient-wise unary operators
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const Derived>
+ operator-() const {
+ return unaryExpr(internal::scalar_opposite_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived>
+ sqrt() const {
+ return unaryExpr(internal::scalar_sqrt_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sign_op<Scalar>, const Derived>
+ sign() const {
+ return unaryExpr(internal::scalar_sign_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_rsqrt_op<Scalar>, const Derived>
+ rsqrt() const {
+ return unaryExpr(internal::scalar_rsqrt_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_square_op<Scalar>, const Derived>
+ square() const {
+ return unaryExpr(internal::scalar_square_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_cube_op<Scalar>, const Derived>
+ cube() const {
+ return unaryExpr(internal::scalar_cube_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived>
+ inverse() const {
+ return unaryExpr(internal::scalar_inverse_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_tanh_op<Scalar>, const Derived>
+ tanh() const {
+ return unaryExpr(internal::scalar_tanh_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_lgamma_op<Scalar>, const Derived>
+ lgamma() const {
+ return unaryExpr(internal::scalar_lgamma_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_digamma_op<Scalar>, const Derived>
+ digamma() const {
+ return unaryExpr(internal::scalar_digamma_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_i0_op<Scalar>, const Derived>
+ bessel_i0() const {
+ return unaryExpr(internal::scalar_bessel_i0_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_i0e_op<Scalar>, const Derived>
+ bessel_i0e() const {
+ return unaryExpr(internal::scalar_bessel_i0e_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_i1_op<Scalar>, const Derived>
+ bessel_i1() const {
+ return unaryExpr(internal::scalar_bessel_i1_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_i1e_op<Scalar>, const Derived>
+ bessel_i1e() const {
+ return unaryExpr(internal::scalar_bessel_i1e_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_j0_op<Scalar>, const Derived>
+ bessel_j0() const {
+ return unaryExpr(internal::scalar_bessel_j0_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_y0_op<Scalar>, const Derived>
+ bessel_y0() const {
+ return unaryExpr(internal::scalar_bessel_y0_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_j1_op<Scalar>, const Derived>
+ bessel_j1() const {
+ return unaryExpr(internal::scalar_bessel_j1_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_y1_op<Scalar>, const Derived>
+ bessel_y1() const {
+ return unaryExpr(internal::scalar_bessel_y1_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_k0_op<Scalar>, const Derived>
+ bessel_k0() const {
+ return unaryExpr(internal::scalar_bessel_k0_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_k0e_op<Scalar>, const Derived>
+ bessel_k0e() const {
+ return unaryExpr(internal::scalar_bessel_k0e_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_k1_op<Scalar>, const Derived>
+ bessel_k1() const {
+ return unaryExpr(internal::scalar_bessel_k1_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_k1e_op<Scalar>, const Derived>
+ bessel_k1e() const {
+ return unaryExpr(internal::scalar_bessel_k1e_op<Scalar>());
+ }
+
+ // igamma(a = this, x = other)
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorCwiseBinaryOp<internal::scalar_igamma_op<Scalar>, const Derived, const OtherDerived>
+ igamma(const OtherDerived& other) const {
+ return binaryExpr(other.derived(), internal::scalar_igamma_op<Scalar>());
+ }
+
+ // igamma_der_a(a = this, x = other)
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorCwiseBinaryOp<internal::scalar_igamma_der_a_op<Scalar>, const Derived, const OtherDerived>
+ igamma_der_a(const OtherDerived& other) const {
+ return binaryExpr(other.derived(), internal::scalar_igamma_der_a_op<Scalar>());
+ }
+
+ // gamma_sample_der_alpha(alpha = this, sample = other)
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorCwiseBinaryOp<internal::scalar_gamma_sample_der_alpha_op<Scalar>, const Derived, const OtherDerived>
+ gamma_sample_der_alpha(const OtherDerived& other) const {
+ return binaryExpr(other.derived(), internal::scalar_gamma_sample_der_alpha_op<Scalar>());
+ }
+
+ // igammac(a = this, x = other)
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorCwiseBinaryOp<internal::scalar_igammac_op<Scalar>, const Derived, const OtherDerived>
+ igammac(const OtherDerived& other) const {
+ return binaryExpr(other.derived(), internal::scalar_igammac_op<Scalar>());
+ }
+
+ // zeta(x = this, q = other)
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorCwiseBinaryOp<internal::scalar_zeta_op<Scalar>, const Derived, const OtherDerived>
+ zeta(const OtherDerived& other) const {
+ return binaryExpr(other.derived(), internal::scalar_zeta_op<Scalar>());
+ }
+
+ // polygamma(n = this, x = other)
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorCwiseBinaryOp<internal::scalar_polygamma_op<Scalar>, const Derived, const OtherDerived>
+ polygamma(const OtherDerived& other) const {
+ return binaryExpr(other.derived(), internal::scalar_polygamma_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_erf_op<Scalar>, const Derived>
+ erf() const {
+ return unaryExpr(internal::scalar_erf_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_erfc_op<Scalar>, const Derived>
+ erfc() const {
+ return unaryExpr(internal::scalar_erfc_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_ndtri_op<Scalar>, const Derived>
+ ndtri() const {
+ return unaryExpr(internal::scalar_ndtri_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_logistic_op<Scalar>, const Derived>
+ sigmoid() const {
+ return unaryExpr(internal::scalar_logistic_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_exp_op<Scalar>, const Derived>
+ exp() const {
+ return unaryExpr(internal::scalar_exp_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_expm1_op<Scalar>, const Derived>
+ expm1() const {
+ return unaryExpr(internal::scalar_expm1_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log_op<Scalar>, const Derived>
+ log() const {
+ return unaryExpr(internal::scalar_log_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log1p_op<Scalar>, const Derived>
+ log1p() const {
+ return unaryExpr(internal::scalar_log1p_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log2_op<Scalar>, const Derived>
+ log2() const {
+ return unaryExpr(internal::scalar_log2_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived>
+ abs() const {
+ return unaryExpr(internal::scalar_abs_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_clamp_op<Scalar>, const Derived>
+ clip(Scalar min, Scalar max) const {
+ return unaryExpr(internal::scalar_clamp_op<Scalar>(min, max));
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const typename internal::conditional<NumTraits<CoeffReturnType>::IsComplex,
+ TensorCwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, const Derived>,
+ Derived>::type
+ conjugate() const {
+ return choose(Cond<NumTraits<CoeffReturnType>::IsComplex>(), unaryExpr(internal::scalar_conjugate_op<Scalar>()), derived());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_pow_op<Scalar,Scalar> >, const Derived>
+ pow(Scalar exponent) const {
+ return unaryExpr(internal::bind2nd_op<internal::scalar_pow_op<Scalar,Scalar> >(exponent));
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_real_op<Scalar>, const Derived>
+ real() const {
+ return unaryExpr(internal::scalar_real_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_imag_op<Scalar>, const Derived>
+ imag() const {
+ return unaryExpr(internal::scalar_imag_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_sum_op<Scalar,Scalar> >, const Derived>
+ operator+ (Scalar rhs) const {
+ return unaryExpr(internal::bind2nd_op<internal::scalar_sum_op<Scalar,Scalar> >(rhs));
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE friend
+ const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_sum_op<Scalar> >, const Derived>
+ operator+ (Scalar lhs, const Derived& rhs) {
+ return rhs.unaryExpr(internal::bind1st_op<internal::scalar_sum_op<Scalar> >(lhs));
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_difference_op<Scalar,Scalar> >, const Derived>
+ operator- (Scalar rhs) const {
+ EIGEN_STATIC_ASSERT((NumTraits<Scalar>::IsSigned || internal::is_same<Scalar, const std::complex<float> >::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ return unaryExpr(internal::bind2nd_op<internal::scalar_difference_op<Scalar,Scalar> >(rhs));
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE friend
+ const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_difference_op<Scalar> >, const Derived>
+ operator- (Scalar lhs, const Derived& rhs) {
+ return rhs.unaryExpr(internal::bind1st_op<internal::scalar_difference_op<Scalar> >(lhs));
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_product_op<Scalar,Scalar> >, const Derived>
+ operator* (Scalar rhs) const {
+ return unaryExpr(internal::bind2nd_op<internal::scalar_product_op<Scalar,Scalar> >(rhs));
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE friend
+ const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_product_op<Scalar> >, const Derived>
+ operator* (Scalar lhs, const Derived& rhs) {
+ return rhs.unaryExpr(internal::bind1st_op<internal::scalar_product_op<Scalar> >(lhs));
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_quotient_op<Scalar,Scalar> >, const Derived>
+ operator/ (Scalar rhs) const {
+ return unaryExpr(internal::bind2nd_op<internal::scalar_quotient_op<Scalar,Scalar> >(rhs));
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE friend
+ const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_quotient_op<Scalar> >, const Derived>
+ operator/ (Scalar lhs, const Derived& rhs) {
+ return rhs.unaryExpr(internal::bind1st_op<internal::scalar_quotient_op<Scalar> >(lhs));
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_mod_op<Scalar>, const Derived>
+ operator% (Scalar rhs) const {
+ EIGEN_STATIC_ASSERT(NumTraits<Scalar>::IsInteger, YOU_MADE_A_PROGRAMMING_MISTAKE_TRY_MOD);
+ return unaryExpr(internal::scalar_mod_op<Scalar>(rhs));
+ }
+
+ template <int NanPropagation=PropagateFast>
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar,NanPropagation>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+ cwiseMax(Scalar threshold) const {
+ return cwiseMax<NanPropagation>(constant(threshold));
+ }
+
+ template <int NanPropagation=PropagateFast>
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar,NanPropagation>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+ cwiseMin(Scalar threshold) const {
+ return cwiseMin<NanPropagation>(constant(threshold));
+ }
+
+ template<typename NewType>
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const typename internal::conditional<internal::is_same<NewType, CoeffReturnType>::value,
+ Derived,
+ TensorConversionOp<NewType, const Derived> >::type
+ cast() const {
+ return choose(Cond<internal::is_same<NewType, CoeffReturnType>::value>(), derived(), TensorConversionOp<NewType, const Derived>(derived()));
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_round_op<Scalar>, const Derived>
+ round() const {
+ return unaryExpr(internal::scalar_round_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_rint_op<Scalar>, const Derived>
+ rint() const {
+ return unaryExpr(internal::scalar_rint_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_ceil_op<Scalar>, const Derived>
+ ceil() const {
+ return unaryExpr(internal::scalar_ceil_op<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_floor_op<Scalar>, const Derived>
+ floor() const {
+ return unaryExpr(internal::scalar_floor_op<Scalar>());
+ }
+
+ // Generic binary operation support.
+ template <typename CustomBinaryOp, typename OtherDerived> EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived>
+ binaryExpr(const OtherDerived& other, const CustomBinaryOp& func) const {
+ return TensorCwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived>(derived(), other, func);
+ }
+
+ // Coefficient-wise binary operators.
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const Derived, const OtherDerived>
+ operator+(const OtherDerived& other) const {
+ return binaryExpr(other.derived(), internal::scalar_sum_op<Scalar>());
+ }
+
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorCwiseBinaryOp<internal::scalar_difference_op<Scalar>, const Derived, const OtherDerived>
+ operator-(const OtherDerived& other) const {
+ return binaryExpr(other.derived(), internal::scalar_difference_op<Scalar>());
+ }
+
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorCwiseBinaryOp<internal::scalar_product_op<Scalar>, const Derived, const OtherDerived>
+ operator*(const OtherDerived& other) const {
+ return binaryExpr(other.derived(), internal::scalar_product_op<Scalar>());
+ }
+
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorCwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const Derived, const OtherDerived>
+ operator/(const OtherDerived& other) const {
+ return binaryExpr(other.derived(), internal::scalar_quotient_op<Scalar>());
+ }
+
+ template<int NaNPropagation=PropagateFast, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorCwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar, NaNPropagation>, const Derived, const OtherDerived>
+ cwiseMax(const OtherDerived& other) const {
+ return binaryExpr(other.derived(), internal::scalar_max_op<Scalar,Scalar, NaNPropagation>());
+ }
+
+ template<int NaNPropagation=PropagateFast, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorCwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar, NaNPropagation>, const Derived, const OtherDerived>
+ cwiseMin(const OtherDerived& other) const {
+ return binaryExpr(other.derived(), internal::scalar_min_op<Scalar,Scalar, NaNPropagation>());
+ }
+
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorCwiseBinaryOp<internal::scalar_boolean_and_op, const Derived, const OtherDerived>
+ operator&&(const OtherDerived& other) const {
+ return binaryExpr(other.derived(), internal::scalar_boolean_and_op());
+ }
+
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorCwiseBinaryOp<internal::scalar_boolean_or_op, const Derived, const OtherDerived>
+ operator||(const OtherDerived& other) const {
+ return binaryExpr(other.derived(), internal::scalar_boolean_or_op());
+ }
+
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorCwiseBinaryOp<internal::scalar_boolean_xor_op, const Derived, const OtherDerived>
+ operator^(const OtherDerived& other) const {
+ return binaryExpr(other.derived(), internal::scalar_boolean_xor_op());
+ }
+
+ // Comparisons and tests.
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>, const Derived, const OtherDerived>
+ operator<(const OtherDerived& other) const {
+ return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>());
+ }
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>, const Derived, const OtherDerived>
+ operator<=(const OtherDerived& other) const {
+ return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>());
+ }
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>, const Derived, const OtherDerived>
+ operator>(const OtherDerived& other) const {
+ return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>());
+ }
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>, const Derived, const OtherDerived>
+ operator>=(const OtherDerived& other) const {
+ return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>());
+ }
+
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>, const Derived, const OtherDerived>
+ operator==(const OtherDerived& other) const {
+ return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>());
+ }
+
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>, const Derived, const OtherDerived>
+ operator!=(const OtherDerived& other) const {
+ return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>());
+ }
+
+ // comparisons and tests for Scalars
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+ operator<(Scalar threshold) const {
+ return operator<(constant(threshold));
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+ operator<=(Scalar threshold) const {
+ return operator<=(constant(threshold));
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+ operator>(Scalar threshold) const {
+ return operator>(constant(threshold));
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+ operator>=(Scalar threshold) const {
+ return operator>=(constant(threshold));
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+ operator==(Scalar threshold) const {
+ return operator==(constant(threshold));
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+ operator!=(Scalar threshold) const {
+ return operator!=(constant(threshold));
+ }
+
+ // Checks
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_isnan_op<Scalar>, const Derived>
+ (isnan)() const {
+ return unaryExpr(internal::scalar_isnan_op<Scalar>());
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_isinf_op<Scalar>, const Derived>
+ (isinf)() const {
+ return unaryExpr(internal::scalar_isinf_op<Scalar>());
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_isfinite_op<Scalar>, const Derived>
+ (isfinite)() const {
+ return unaryExpr(internal::scalar_isfinite_op<Scalar>());
+ }
+
+ // Coefficient-wise ternary operators.
+ template<typename ThenDerived, typename ElseDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorSelectOp<const Derived, const ThenDerived, const ElseDerived>
+ select(const ThenDerived& thenTensor, const ElseDerived& elseTensor) const {
+ return TensorSelectOp<const Derived, const ThenDerived, const ElseDerived>(derived(), thenTensor.derived(), elseTensor.derived());
+ }
+
+ // Contractions.
+ typedef Eigen::IndexPair<Index> DimensionPair;
+
+ template<typename OtherDerived, typename Dimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorContractionOp<const Dimensions, const Derived, const OtherDerived, const NoOpOutputKernel>
+ contract(const OtherDerived& other, const Dimensions& dims) const {
+ return TensorContractionOp<const Dimensions, const Derived, const OtherDerived, const NoOpOutputKernel>(derived(), other.derived(), dims);
+ }
+
+ template<typename OtherDerived, typename Dimensions, typename OutputKernel> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorContractionOp<const Dimensions, const Derived, const OtherDerived, const OutputKernel>
+ contract(const OtherDerived& other, const Dimensions& dims, const OutputKernel& output_kernel) const {
+ return TensorContractionOp<const Dimensions, const Derived, const OtherDerived, const OutputKernel>(derived(), other.derived(), dims, output_kernel);
+ }
+
+ // Convolutions.
+ template<typename KernelDerived, typename Dimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorConvolutionOp<const Dimensions, const Derived, const KernelDerived>
+ convolve(const KernelDerived& kernel, const Dimensions& dims) const {
+ return TensorConvolutionOp<const Dimensions, const Derived, const KernelDerived>(derived(), kernel.derived(), dims);
+ }
+
+ // Fourier transforms
+ template <int FFTDataType, int FFTDirection, typename FFT> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorFFTOp<const FFT, const Derived, FFTDataType, FFTDirection>
+ fft(const FFT& dims) const {
+ return TensorFFTOp<const FFT, const Derived, FFTDataType, FFTDirection>(derived(), dims);
+ }
+
+ // Scan.
+ typedef TensorScanOp<internal::SumReducer<CoeffReturnType>, const Derived> TensorScanSumOp;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorScanSumOp
+ cumsum(const Index& axis, bool exclusive = false) const {
+ return TensorScanSumOp(derived(), axis, exclusive);
+ }
+
+ typedef TensorScanOp<internal::ProdReducer<CoeffReturnType>, const Derived> TensorScanProdOp;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorScanProdOp
+ cumprod(const Index& axis, bool exclusive = false) const {
+ return TensorScanProdOp(derived(), axis, exclusive);
+ }
+
+ template <typename Reducer>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorScanOp<Reducer, const Derived>
+ scan(const Index& axis, const Reducer& reducer, bool exclusive = false) const {
+ return TensorScanOp<Reducer, const Derived>(derived(), axis, exclusive, reducer);
+ }
+
+ // Reductions.
+ template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorReductionOp<internal::SumReducer<CoeffReturnType>, const Dims, const Derived>
+ sum(const Dims& dims) const {
+ return TensorReductionOp<internal::SumReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::SumReducer<CoeffReturnType>());
+ }
+
+ const TensorReductionOp<internal::SumReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>
+ sum() const {
+ DimensionList<Index, NumDimensions> in_dims;
+ return TensorReductionOp<internal::SumReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::SumReducer<CoeffReturnType>());
+ }
+
+ template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const Dims, const Derived>
+ mean(const Dims& dims) const {
+ return TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::MeanReducer<CoeffReturnType>());
+ }
+
+ const TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>
+ mean() const {
+ DimensionList<Index, NumDimensions> in_dims;
+ return TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MeanReducer<CoeffReturnType>());
+ }
+
+ template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const Dims, const Derived>
+ prod(const Dims& dims) const {
+ return TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::ProdReducer<CoeffReturnType>());
+ }
+
+ const TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>
+ prod() const {
+ DimensionList<Index, NumDimensions> in_dims;
+ return TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::ProdReducer<CoeffReturnType>());
+ }
+
+ template <typename Dims,int NanPropagation=PropagateFast> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorReductionOp<internal::MaxReducer<CoeffReturnType,NanPropagation>, const Dims, const Derived>
+ maximum(const Dims& dims) const {
+ return TensorReductionOp<internal::MaxReducer<CoeffReturnType,NanPropagation>, const Dims, const Derived>(derived(), dims, internal::MaxReducer<CoeffReturnType,NanPropagation>());
+ }
+
+ template <int NanPropagation=PropagateFast>
+ const TensorReductionOp<internal::MaxReducer<CoeffReturnType,NanPropagation>, const DimensionList<Index, NumDimensions>, const Derived>
+ maximum() const {
+ DimensionList<Index, NumDimensions> in_dims;
+ return TensorReductionOp<internal::MaxReducer<CoeffReturnType,NanPropagation>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MaxReducer<CoeffReturnType,NanPropagation>());
+ }
+
+ template <typename Dims,int NanPropagation=PropagateFast> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorReductionOp<internal::MinReducer<CoeffReturnType,NanPropagation>, const Dims, const Derived>
+ minimum(const Dims& dims) const {
+ return TensorReductionOp<internal::MinReducer<CoeffReturnType,NanPropagation>, const Dims, const Derived>(derived(), dims, internal::MinReducer<CoeffReturnType,NanPropagation>());
+ }
+
+ template <int NanPropagation=PropagateFast>
+ const TensorReductionOp<internal::MinReducer<CoeffReturnType,NanPropagation>, const DimensionList<Index, NumDimensions>, const Derived>
+ minimum() const {
+ DimensionList<Index, NumDimensions> in_dims;
+ return TensorReductionOp<internal::MinReducer<CoeffReturnType,NanPropagation>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MinReducer<CoeffReturnType,NanPropagation>());
+ }
+
+ template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorReductionOp<internal::AndReducer, const Dims, const typename internal::conditional<internal::is_same<bool, CoeffReturnType>::value, Derived, TensorConversionOp<bool, const Derived> >::type >
+ all(const Dims& dims) const {
+ return cast<bool>().reduce(dims, internal::AndReducer());
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorReductionOp<internal::AndReducer, const DimensionList<Index, NumDimensions>, const typename internal::conditional<internal::is_same<bool, CoeffReturnType>::value, Derived, TensorConversionOp<bool, const Derived> >::type >
+ all() const {
+ DimensionList<Index, NumDimensions> in_dims;
+ return cast<bool>().reduce(in_dims, internal::AndReducer());
+ }
+
+ template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorReductionOp<internal::OrReducer, const Dims, const typename internal::conditional<internal::is_same<bool, CoeffReturnType>::value, Derived, TensorConversionOp<bool, const Derived> >::type >
+ any(const Dims& dims) const {
+ return cast<bool>().reduce(dims, internal::OrReducer());
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorReductionOp<internal::OrReducer, const DimensionList<Index, NumDimensions>, const typename internal::conditional<internal::is_same<bool, CoeffReturnType>::value, Derived, TensorConversionOp<bool, const Derived> >::type >
+ any() const {
+ DimensionList<Index, NumDimensions> in_dims;
+ return cast<bool>().reduce(in_dims, internal::OrReducer());
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorTupleReducerOp<
+ internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >,
+ const array<Index, NumDimensions>, const Derived>
+ argmax() const {
+ array<Index, NumDimensions> in_dims;
+ for (Index d = 0; d < NumDimensions; ++d) in_dims[d] = d;
+ return TensorTupleReducerOp<
+ internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >,
+ const array<Index, NumDimensions>,
+ const Derived>(derived(), internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >(), -1, in_dims);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorTupleReducerOp<
+ internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >,
+ const array<Index, NumDimensions>, const Derived>
+ argmin() const {
+ array<Index, NumDimensions> in_dims;
+ for (Index d = 0; d < NumDimensions; ++d) in_dims[d] = d;
+ return TensorTupleReducerOp<
+ internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >,
+ const array<Index, NumDimensions>,
+ const Derived>(derived(), internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >(), -1, in_dims);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorTupleReducerOp<
+ internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >,
+ const array<Index, 1>, const Derived>
+ argmax(const Index return_dim) const {
+ array<Index, 1> in_dims;
+ in_dims[0] = return_dim;
+ return TensorTupleReducerOp<
+ internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >,
+ const array<Index, 1>,
+ const Derived>(derived(), internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >(), return_dim, in_dims);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorTupleReducerOp<
+ internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >,
+ const array<Index, 1>, const Derived>
+ argmin(const Index return_dim) const {
+ array<Index, 1> in_dims;
+ in_dims[0] = return_dim;
+ return TensorTupleReducerOp<
+ internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >,
+ const array<Index, 1>,
+ const Derived>(derived(), internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >(), return_dim, in_dims);
+ }
+
+ template <typename Reducer, typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorReductionOp<Reducer, const Dims, const Derived>
+ reduce(const Dims& dims, const Reducer& reducer) const {
+ return TensorReductionOp<Reducer, const Dims, const Derived>(derived(), dims, reducer);
+ }
+
+ template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorTraceOp<const Dims, const Derived>
+ trace(const Dims& dims) const {
+ return TensorTraceOp<const Dims, const Derived>(derived(), dims);
+ }
+
+ const TensorTraceOp<const DimensionList<Index, NumDimensions>, const Derived>
+ trace() const {
+ DimensionList<Index, NumDimensions> in_dims;
+ return TensorTraceOp<const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims);
+ }
+
+ template <typename Broadcast> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorBroadcastingOp<const Broadcast, const Derived>
+ broadcast(const Broadcast& bcast) const {
+ return TensorBroadcastingOp<const Broadcast, const Derived>(derived(), bcast);
+ }
+
+ template <typename Axis, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorConcatenationOp<Axis, const Derived, const OtherDerived>
+ concatenate(const OtherDerived& other, Axis axis) const {
+ return TensorConcatenationOp<Axis, const Derived, const OtherDerived>(derived(), other.derived(), axis);
+ }
+
+ template <typename PatchDims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorPatchOp<const PatchDims, const Derived>
+ extract_patches(const PatchDims& patch_dims) const {
+ return TensorPatchOp<const PatchDims, const Derived>(derived(), patch_dims);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorImagePatchOp<Dynamic, Dynamic, const Derived>
+ extract_image_patches(const Index patch_rows = 1, const Index patch_cols = 1,
+ const Index row_stride = 1, const Index col_stride = 1,
+ const Index in_row_stride = 1, const Index in_col_stride = 1,
+ const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = Scalar(0)) const {
+ return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride,
+ in_row_stride, in_col_stride, 1, 1, padding_type, padding_value);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorImagePatchOp<Dynamic, Dynamic, const Derived>
+ extract_image_patches(const Index patch_rows, const Index patch_cols,
+ const Index row_stride, const Index col_stride,
+ const Index in_row_stride, const Index in_col_stride,
+ const Index row_inflate_stride, const Index col_inflate_stride,
+ const Index padding_top, const Index padding_bottom,
+ const Index padding_left,const Index padding_right,
+ const Scalar padding_value) const {
+ return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride,
+ in_row_stride, in_col_stride, row_inflate_stride, col_inflate_stride,
+ padding_top, padding_bottom, padding_left, padding_right, padding_value);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived>
+ extract_volume_patches(const Index patch_planes, const Index patch_rows, const Index patch_cols,
+ const Index plane_stride = 1, const Index row_stride = 1, const Index col_stride = 1,
+ const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = Scalar(0)) const {
+ return TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived>(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, 1, 1, 1, padding_type, padding_value);
+ }
+
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived>
+ extract_volume_patches(const Index patch_planes, const Index patch_rows, const Index patch_cols,
+ const Index plane_stride, const Index row_stride, const Index col_stride,
+ const Index plane_inflate_stride, const Index row_inflate_stride, const Index col_inflate_stride,
+ const Index padding_top_z, const Index padding_bottom_z,
+ const Index padding_top, const Index padding_bottom,
+ const Index padding_left, const Index padding_right, const Scalar padding_value = Scalar(0)) const {
+ return TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived>(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, plane_inflate_stride, row_inflate_stride, col_inflate_stride, padding_top_z, padding_bottom_z, padding_top, padding_bottom, padding_left, padding_right, padding_value);
+ }
+
+ // Morphing operators.
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorLayoutSwapOp<const Derived>
+ swap_layout() const {
+ return TensorLayoutSwapOp<const Derived>(derived());
+ }
+ template <typename NewDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorReshapingOp<const NewDimensions, const Derived>
+ reshape(const NewDimensions& newDimensions) const {
+ return TensorReshapingOp<const NewDimensions, const Derived>(derived(), newDimensions);
+ }
+ template <typename StartIndices, typename Sizes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorSlicingOp<const StartIndices, const Sizes, const Derived>
+ slice(const StartIndices& startIndices, const Sizes& sizes) const {
+ return TensorSlicingOp<const StartIndices, const Sizes, const Derived>(derived(), startIndices, sizes);
+ }
+ template <typename StartIndices, typename StopIndices, typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides, const Derived>
+ stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) const {
+ return TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides,
+ const Derived>(derived(), startIndices, stopIndices, strides);
+ }
+ template <Index DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorChippingOp<DimId, const Derived>
+ chip(const Index offset) const {
+ return TensorChippingOp<DimId, const Derived>(derived(), offset, DimId);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorChippingOp<Dynamic, const Derived>
+ chip(const Index offset, const Index dim) const {
+ return TensorChippingOp<Dynamic, const Derived>(derived(), offset, dim);
+ }
+ template <typename ReverseDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorReverseOp<const ReverseDimensions, const Derived>
+ reverse(const ReverseDimensions& rev) const {
+ return TensorReverseOp<const ReverseDimensions, const Derived>(derived(), rev);
+ }
+ template <typename PaddingDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorPaddingOp<const PaddingDimensions, const Derived>
+ pad(const PaddingDimensions& padding) const {
+ return TensorPaddingOp<const PaddingDimensions, const Derived>(derived(), padding, internal::scalar_cast_op<int, Scalar>()(0));
+ }
+ template <typename PaddingDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorPaddingOp<const PaddingDimensions, const Derived>
+ pad(const PaddingDimensions& padding, const Scalar padding_value) const {
+ return TensorPaddingOp<const PaddingDimensions, const Derived>(derived(), padding, padding_value);
+ }
+ template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorShufflingOp<const Shuffle, const Derived>
+ shuffle(const Shuffle& shfl) const {
+ return TensorShufflingOp<const Shuffle, const Derived>(derived(), shfl);
+ }
+ template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorStridingOp<const Strides, const Derived>
+ stride(const Strides& strides) const {
+ return TensorStridingOp<const Strides, const Derived>(derived(), strides);
+ }
+ template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorInflationOp<const Strides, const Derived>
+ inflate(const Strides& strides) const {
+ return TensorInflationOp<const Strides, const Derived>(derived(), strides);
+ }
+
+ // Returns a tensor containing index/value tuples
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorIndexTupleOp<const Derived>
+ index_tuples() const {
+ return TensorIndexTupleOp<const Derived>(derived());
+ }
+
+ // Support for custom unary and binary operations
+ template <typename CustomUnaryFunc>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorCustomUnaryOp<const CustomUnaryFunc, const Derived> customOp(const CustomUnaryFunc& op) const {
+ return TensorCustomUnaryOp<const CustomUnaryFunc, const Derived>(derived(), op);
+ }
+ template <typename OtherDerived, typename CustomBinaryFunc>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorCustomBinaryOp<const CustomBinaryFunc, const Derived, const OtherDerived> customOp(const OtherDerived& other, const CustomBinaryFunc& op) const {
+ return TensorCustomBinaryOp<const CustomBinaryFunc, const Derived, const OtherDerived>(derived(), other, op);
+ }
+
+ // Force the evaluation of the expression.
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorForcedEvalOp<const Derived> eval() const {
+ return TensorForcedEvalOp<const Derived>(derived());
+ }
+
+ protected:
+ template <typename Scalar, int NumIndices, int Options, typename IndexType> friend class Tensor;
+ template <typename Scalar, typename Dimensions, int Option, typename IndexTypes> friend class TensorFixedSize;
+ // the Eigen:: prefix is required to workaround a compilation issue with nvcc 9.0
+ template <typename OtherDerived, int AccessLevel> friend class Eigen::TensorBase;
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast<const Derived*>(this); }
+};
+
+template<typename Derived, int AccessLevel = internal::accessors_level<Derived>::value>
+class TensorBase : public TensorBase<Derived, ReadOnlyAccessors> {
+ public:
+ typedef TensorBase<Derived, ReadOnlyAccessors> Base;
+ typedef internal::traits<Derived> DerivedTraits;
+ typedef typename DerivedTraits::Scalar Scalar;
+ typedef typename DerivedTraits::Index Index;
+ typedef Scalar CoeffReturnType;
+ static const int NumDimensions = DerivedTraits::NumDimensions;
+
+ template <typename Scalar, int NumIndices, int Options, typename IndexType> friend class Tensor;
+ template <typename Scalar, typename Dimensions, int Option, typename IndexTypes> friend class TensorFixedSize;
+ // the Eigen:: prefix is required to workaround a compilation issue with nvcc 9.0
+ template <typename OtherDerived, int OtherAccessLevel> friend class Eigen::TensorBase;
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Derived& setZero() {
+ return setConstant(Scalar(0));
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Derived& setConstant(const Scalar& val) {
+ return derived() = this->constant(val);
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Derived& setRandom() {
+ return derived() = this->random();
+ }
+ template <typename RandomGenerator> EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Derived& setRandom() {
+ return derived() = this->template random<RandomGenerator>();
+ }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Derived& setValues(
+ const typename internal::Initializer<Derived, NumDimensions>::InitList& vals) {
+ TensorEvaluator<Derived, DefaultDevice> eval(derived(), DefaultDevice());
+ internal::initialize_tensor<Derived, NumDimensions>(eval, vals);
+ return derived();
+ }
+#endif // EIGEN_HAS_VARIADIC_TEMPLATES
+
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Derived& operator+=(const OtherDerived& other) {
+ return derived() = derived() + other.derived();
+ }
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Derived& operator-=(const OtherDerived& other) {
+ return derived() = derived() - other.derived();
+ }
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Derived& operator*=(const OtherDerived& other) {
+ return derived() = derived() * other.derived();
+ }
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Derived& operator/=(const OtherDerived& other) {
+ return derived() = derived() / other.derived();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorLayoutSwapOp<const Derived>
+ swap_layout() const {
+ return TensorLayoutSwapOp<const Derived>(derived());
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ TensorLayoutSwapOp<Derived>
+ swap_layout() {
+ return TensorLayoutSwapOp<Derived>(derived());
+ }
+
+ template <typename Axis, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorConcatenationOp<const Axis, const Derived, const OtherDerived>
+ concatenate(const OtherDerived& other, const Axis& axis) const {
+ return TensorConcatenationOp<const Axis, const Derived, const OtherDerived>(derived(), other, axis);
+ }
+ template <typename Axis, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ TensorConcatenationOp<const Axis, Derived, OtherDerived>
+ concatenate(const OtherDerived& other, const Axis& axis) {
+ return TensorConcatenationOp<const Axis, Derived, OtherDerived>(derived(), other, axis);
+ }
+
+ template <typename NewDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorReshapingOp<const NewDimensions, const Derived>
+ reshape(const NewDimensions& newDimensions) const {
+ return TensorReshapingOp<const NewDimensions, const Derived>(derived(), newDimensions);
+ }
+ template <typename NewDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ TensorReshapingOp<const NewDimensions, Derived>
+ reshape(const NewDimensions& newDimensions) {
+ return TensorReshapingOp<const NewDimensions, Derived>(derived(), newDimensions);
+ }
+
+ template <typename StartIndices, typename Sizes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorSlicingOp<const StartIndices, const Sizes, const Derived>
+ slice(const StartIndices& startIndices, const Sizes& sizes) const {
+ return TensorSlicingOp<const StartIndices, const Sizes, const Derived>(derived(), startIndices, sizes);
+ }
+ template <typename StartIndices, typename Sizes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ TensorSlicingOp<const StartIndices, const Sizes, Derived>
+ slice(const StartIndices& startIndices, const Sizes& sizes) {
+ return TensorSlicingOp<const StartIndices, const Sizes, Derived>(derived(), startIndices, sizes);
+ }
+
+ template <typename StartIndices, typename StopIndices, typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides, const Derived>
+ stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) const {
+ return TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides,
+ const Derived>(derived(), startIndices, stopIndices, strides);
+ }
+ template <typename StartIndices, typename StopIndices, typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides, Derived>
+ stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) {
+ return TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides,
+ Derived>(derived(), startIndices, stopIndices, strides);
+ }
+
+ template <DenseIndex DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorChippingOp<DimId, const Derived>
+ chip(const Index offset) const {
+ return TensorChippingOp<DimId, const Derived>(derived(), offset, DimId);
+ }
+ template <Index DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ TensorChippingOp<DimId, Derived>
+ chip(const Index offset) {
+ return TensorChippingOp<DimId, Derived>(derived(), offset, DimId);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorChippingOp<Dynamic, const Derived>
+ chip(const Index offset, const Index dim) const {
+ return TensorChippingOp<Dynamic, const Derived>(derived(), offset, dim);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ TensorChippingOp<Dynamic, Derived>
+ chip(const Index offset, const Index dim) {
+ return TensorChippingOp<Dynamic, Derived>(derived(), offset, dim);
+ }
+
+ template <typename ReverseDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorReverseOp<const ReverseDimensions, const Derived>
+ reverse(const ReverseDimensions& rev) const {
+ return TensorReverseOp<const ReverseDimensions, const Derived>(derived(), rev);
+ }
+ template <typename ReverseDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ TensorReverseOp<const ReverseDimensions, Derived>
+ reverse(const ReverseDimensions& rev) {
+ return TensorReverseOp<const ReverseDimensions, Derived>(derived(), rev);
+ }
+
+ template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorShufflingOp<const Shuffle, const Derived>
+ shuffle(const Shuffle& shfl) const {
+ return TensorShufflingOp<const Shuffle, const Derived>(derived(), shfl);
+ }
+ template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ TensorShufflingOp<const Shuffle, Derived>
+ shuffle(const Shuffle& shfl) {
+ return TensorShufflingOp<const Shuffle, Derived>(derived(), shfl);
+ }
+
+ template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const TensorStridingOp<const Strides, const Derived>
+ stride(const Strides& strides) const {
+ return TensorStridingOp<const Strides, const Derived>(derived(), strides);
+ }
+ template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ TensorStridingOp<const Strides, Derived>
+ stride(const Strides& strides) {
+ return TensorStridingOp<const Strides, Derived>(derived(), strides);
+ }
+
+ // Select the device on which to evaluate the expression.
+ template <typename DeviceType>
+ TensorDevice<Derived, DeviceType> device(const DeviceType& dev) {
+ return TensorDevice<Derived, DeviceType>(dev, derived());
+ }
+
+ // Select the async device on which to evaluate the expression.
+ template <typename DeviceType, typename DoneCallback>
+ TensorAsyncDevice<Derived, DeviceType, DoneCallback> device(const DeviceType& dev, DoneCallback done) {
+ return TensorAsyncDevice<Derived, DeviceType, DoneCallback>(dev, derived(), std::move(done));
+ }
+
+ protected:
+ EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TensorBase)
+ EIGEN_DEFAULT_COPY_CONSTRUCTOR(TensorBase)
+
+ template<typename OtherDerived> EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Derived& operator=(const OtherDerived& other)
+ {
+ typedef TensorAssignOp<Derived, const OtherDerived> Assign;
+ Assign assign(derived(), other.derived());
+ internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+ return derived();
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Derived& derived() { return *static_cast<Derived*>(this); }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast<const Derived*>(this); }
+};
+#endif // EIGEN_PARSED_BY_DOXYGEN
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_BASE_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorBlock.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorBlock.h
new file mode 100644
index 0000000..1e55d12
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorBlock.h
@@ -0,0 +1,1559 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
+#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
+
+namespace Eigen {
+namespace internal {
+
+// -------------------------------------------------------------------------- //
+// Forward declarations for templates defined below.
+template <typename Scalar, typename IndexType, int NumDims, int Layout>
+class TensorBlockIO;
+
+// -------------------------------------------------------------------------- //
+// Helper function to compute strides for densely stored buffer of given
+// dimensions.
+
+// TODO(ezhulenev): We compute strides 1000 times in different evaluators, use
+// this function instead everywhere.
+template <int Layout, typename IndexType, int NumDims>
+EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
+ const DSizes<IndexType, NumDims>& dimensions) {
+ DSizes<IndexType, NumDims> strides;
+ if (NumDims == 0) return strides;
+
+ // TODO(ezhulenev): Use templates to unroll this loop (similar to
+ // h_array_reduce in CXX11meta.h)? Benchmark it.
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ strides[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ strides[i] = strides[i - 1] * dimensions[i - 1];
+ }
+ } else {
+ strides[NumDims - 1] = 1;
+ for (int i = NumDims - 2; i >= 0; --i) {
+ strides[i] = strides[i + 1] * dimensions[i + 1];
+ }
+ }
+
+ return strides;
+}
+
+template <int Layout, typename IndexType, size_t NumDims>
+EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
+ const Eigen::array<IndexType, NumDims>& dimensions) {
+ return strides<Layout>(DSizes<IndexType, NumDims>(dimensions));
+}
+
+template <int Layout, std::ptrdiff_t... Indices>
+EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides(
+ const Sizes<Indices...>& sizes) {
+ return strides<Layout>(DSizes<std::ptrdiff_t, sizeof...(Indices)>(sizes));
+}
+
+// -------------------------------------------------------------------------- //
+
+// Tensor block shape type defines what are the shape preference for the blocks
+// extracted from the larger tensor.
+//
+// Example: blocks of 100 elements from the large 100x100 tensor:
+// - tensor: 100x100
+// - target_block_size: 100
+//
+// TensorBlockShapeType:
+// - kUniformAllDims: 100 blocks of size 10x10
+// - kSkewedInnerDims: 100 blocks of size 100x1 (or 1x100 depending on a column
+// or row major layout)
+enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims };
+
+struct TensorBlockResourceRequirements {
+ TensorBlockShapeType shape_type; // target block shape
+ size_t size; // target block size
+ TensorOpCost cost_per_coeff; // cost of computing a single block element
+
+#ifdef EIGEN_HIPCC
+ // For HIPCC, we need to explicitly declare as a "device fun", the constructor
+ // which is implicitly invoked in the "merge" / "any" routines. else HIPCC
+ // errors out complaining about the lack of a matching constructor
+ EIGEN_DEVICE_FUNC
+ TensorBlockResourceRequirements(TensorBlockShapeType shape_type_, size_t size_,
+ TensorOpCost cost_)
+ : shape_type(shape_type_), size(size_), cost_per_coeff(cost_)
+ {}
+#endif
+
+ template <typename Scalar>
+ EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
+ TensorBlockShapeType shape_type, size_t size_in_bytes,
+ TensorOpCost cost) {
+ const size_t size = numext::maxi(size_t(1), size_in_bytes / sizeof(Scalar));
+ return {shape_type, size, cost};
+ }
+
+ template <typename Scalar>
+ EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
+ TensorBlockShapeType shape_type, size_t size_in_bytes) {
+ // This default cost per coefficient is valid for most materialized tensor
+ // block evaluation implementations, because they typically just read
+ // coefficients from the underlying tensor storage, and write to the tensor
+ // block buffer (scratch or destination memory, reads and writes have linear
+ // access pattern). We ignore the fixed cost of block evaluation, because in
+ // practice it should negligible.
+ //
+ // Lazy block evaluation adds the cost of calling a functor for each
+ // coefficient.
+ //
+ // All non-trivial block evaluation implementations must provide their own
+ // cost approximation (e.g. shuffling inner dimension has a much higher cost
+ // because it reads memory randomly, although the total number of moved
+ // bytes is the same).
+ return withShapeAndSize<Scalar>(shape_type, size_in_bytes,
+ {/*bytes_loaded=*/sizeof(Scalar),
+ /*bytes_stored=*/sizeof(Scalar),
+ /*compute_cycles=*/0});
+ }
+
+ template <typename Scalar>
+ EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements skewed(
+ size_t size_in_bytes) {
+ return withShapeAndSize<Scalar>(TensorBlockShapeType::kSkewedInnerDims,
+ size_in_bytes);
+ }
+
+ template <typename Scalar>
+ EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements uniform(
+ size_t size_in_bytes) {
+ return withShapeAndSize<Scalar>(TensorBlockShapeType::kUniformAllDims,
+ size_in_bytes);
+ }
+
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE TensorBlockResourceRequirements
+ merge(const TensorBlockResourceRequirements& lhs,
+ const TensorBlockResourceRequirements& rhs) {
+ return {merge(lhs.shape_type, rhs.shape_type), // shape_type
+ merge(lhs.size, rhs.size), // size
+ merge(lhs.cost_per_coeff, rhs.cost_per_coeff)}; // cost_per_coeff
+ }
+
+ EIGEN_DEVICE_FUNC TensorBlockResourceRequirements& addCostPerCoeff(
+ TensorOpCost cost) {
+ cost_per_coeff += cost;
+ return *this;
+ }
+
+ // This is a resource requirement that should be returned from expressions
+ // that do not have any block evaluation preference (e.g. default tensor
+ // expression with raw buffer access).
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() {
+ return {TensorBlockShapeType::kUniformAllDims, 1, {0, 0, 0}};
+ }
+
+ private:
+ using Requirements = TensorBlockResourceRequirements;
+
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) {
+ return numext::maxi(lhs_size, rhs_size);
+ }
+
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE TensorBlockShapeType
+ merge(TensorBlockShapeType lhs, TensorBlockShapeType rhs) {
+ return (lhs == TensorBlockShapeType::kSkewedInnerDims ||
+ rhs == TensorBlockShapeType::kSkewedInnerDims)
+ ? TensorBlockShapeType::kSkewedInnerDims
+ : TensorBlockShapeType::kUniformAllDims;
+ }
+
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost,
+ TensorOpCost rhs_cost) {
+ return lhs_cost + rhs_cost;
+ }
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockDescriptor specifies a block offset within a tensor and the block
+// sizes along each of the tensor dimensions.
+
+template <int NumDims, typename IndexType = Eigen::Index>
+class TensorBlockDescriptor {
+ public:
+ typedef DSizes<IndexType, NumDims> Dimensions;
+
+ // If we evaluate a Tensor assignment, and expression on the left, already has
+ // a memory buffer, then we might do performance optimization, and evaluate
+ // the root expression directly into the final output memory. Some time it's
+ // possible to reuse it for materializing subexpressions inside an expression
+ // tree, to to avoid dynamic memory allocation.
+ //
+ // The pointer type of the underlying storage is erased, because passing
+ // Scalar type through all the expression evaluation layers is way too many
+ // templates. In practice destination buffer type should always match the
+ // evaluated expression scalar type.
+ class DestinationBuffer {
+ public:
+ enum DestinationBufferKind : int {
+ // The above explicit specification of "int" as the enum basetype is
+ // needed to get around a HIPCC link error ("the field type is not
+ // amp-compatible")
+ // which is issued for class members with the enum type.
+ // TODO(rocm):
+ // remove the "int" basetype once HIPCC has been fixed to not error out
+ // in the above scenario.
+
+ // Destination buffer is not defined (`m_data` == nullptr).
+ kEmpty,
+
+ // Tensor block defined by an owning tensor block descriptor can fit
+ // contiguously into the destination buffer. In this case it's safe to
+ // materialize tensor block in the destination buffer, wrap it in a
+ // TensorMap, and use to build Eigen expression on top of it.
+ kContiguous,
+
+ // Destination buffer strides do not match strides of the contiguously
+ // stored block, and it's impossible to define a TensorMap over this
+ // buffer. However if we are evaluating a root of an expression tree, we
+ // still can materialize an output into this destination, because we can
+ // guarantee that no one will ever access it through block API.
+ //
+ // In theory it is possible to build valid TensorStriding<TensorMap>
+ // expression on top of this destination buffer, however it has
+ // inefficient coeff/packet access, and defeats the purpose of fast block
+ // evaluation API.
+ kStrided
+ };
+
+ template <typename Scalar>
+ Scalar* data() const {
+ eigen_assert(m_data_type_size == sizeof(Scalar));
+ return static_cast<Scalar*>(m_data);
+ }
+
+ const Dimensions& strides() const { return m_strides; }
+ const DestinationBufferKind& kind() const { return m_kind; }
+
+ private:
+ friend class TensorBlockDescriptor;
+
+ DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {}
+
+ template <typename Scalar>
+ DestinationBuffer(Scalar* data, const Dimensions& strides,
+ DestinationBufferKind kind)
+ : m_data(static_cast<void*>(data)),
+ m_data_type_size(sizeof(Scalar)),
+ m_strides(strides),
+ m_kind(kind) {}
+
+ template <int Layout, typename Scalar>
+ static DestinationBuffer make(const TensorBlockDescriptor& desc,
+ Scalar* data, const Dimensions& strides) {
+ return DestinationBuffer(data, strides, kind<Layout>(desc, strides));
+ }
+
+ template <int Layout>
+ static DestinationBufferKind kind(const TensorBlockDescriptor& desc,
+ const Dimensions& strides) {
+ const Dimensions& desc_dims = desc.dimensions();
+ const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);
+ for (int i = 0; i < NumDims; ++i) {
+ if (desc_dims[i] == 1) continue;
+ if (desc_strides[i] != strides[i]) return kStrided;
+ }
+ return kContiguous;
+ }
+
+ // Storage pointer is type erased, to reduce template bloat, but we still
+ // keep the size of the underlying element type for error checking.
+ void* m_data;
+ size_t m_data_type_size;
+
+ // Destination buffer dimensions always match the dimensions of a tensor
+ // block descriptor it belongs to, however strides might be different.
+ Dimensions m_strides;
+
+ DestinationBufferKind m_kind;
+ };
+
+ TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions,
+ const DestinationBuffer& destination)
+ : m_offset(offset),
+ m_dimensions(dimensions),
+ m_destination(destination) {}
+
+ TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions)
+ : m_offset(offset),
+ m_dimensions(dimensions),
+ m_destination(DestinationBuffer()) {}
+
+ IndexType offset() const { return m_offset; }
+ const Dimensions& dimensions() const { return m_dimensions; }
+ IndexType dimension(int index) const { return m_dimensions[index]; }
+ IndexType size() const { return array_prod<IndexType>(m_dimensions); }
+
+ const DestinationBuffer& destination() const { return m_destination; }
+
+ template <int Layout, typename Scalar>
+ void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) {
+ eigen_assert(dst_base != NULL);
+ m_destination =
+ DestinationBuffer::template make<Layout>(*this, dst_base, dst_strides);
+ }
+
+ template <int Layout, typename Scalar, typename DstStridesIndexType>
+ void AddDestinationBuffer(
+ Scalar* dst_base,
+ const DSizes<DstStridesIndexType, NumDims>& dst_strides) {
+ // DSizes constructor will do index type promotion if it's safe.
+ AddDestinationBuffer<Layout>(dst_base, Dimensions(dst_strides));
+ }
+
+ TensorBlockDescriptor& DropDestinationBuffer() {
+ m_destination.m_data = NULL;
+ m_destination.m_kind = DestinationBuffer::kEmpty;
+ return *this;
+ }
+
+ bool HasDestinationBuffer() const {
+ return m_destination.kind() != DestinationBuffer::kEmpty;
+ }
+
+ // Returns a copy of `*this` with updated offset.
+ TensorBlockDescriptor WithOffset(IndexType offset) const {
+ return TensorBlockDescriptor(offset, m_dimensions, m_destination);
+ }
+
+ private:
+ // Offset and dimensions are immutable after construction. Block descriptor
+ // can only be mutated by adding or dropping destination.
+ const IndexType m_offset;
+ const Dimensions m_dimensions;
+ DestinationBuffer m_destination;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockMapper is responsible for iterating over the blocks of a tensor.
+
+template <int NumDims, int Layout, typename IndexType = Eigen::Index>
+class TensorBlockMapper {
+ typedef TensorBlockDescriptor<NumDims, IndexType> BlockDescriptor;
+
+ public:
+ typedef DSizes<IndexType, NumDims> Dimensions;
+
+ TensorBlockMapper() = default;
+ TensorBlockMapper(const DSizes<IndexType, NumDims>& dimensions,
+ const TensorBlockResourceRequirements& requirements)
+ : m_tensor_dimensions(dimensions), m_requirements(requirements) {
+ // Compute block dimensions and the total number of blocks.
+ InitializeBlockDimensions();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const {
+ return m_total_block_count;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const {
+ return m_block_dimensions.TotalSize();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes<IndexType, NumDims>&
+ blockDimensions() const {
+ return m_block_dimensions;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor
+ blockDescriptor(IndexType block_index) const {
+ static const bool isColMajor = Layout == static_cast<int>(ColMajor);
+
+ IndexType offset = 0;
+ DSizes<IndexType, NumDims> dimensions;
+
+ if (NumDims == 0) return BlockDescriptor(offset, dimensions);
+
+ // Iterate outer -> inner dimensions.
+ for (int i = NumDims - 1; i >= 0; --i) {
+ const int dim = isColMajor ? i : NumDims - i - 1;
+
+ const IndexType idx = block_index / m_block_strides[dim];
+ block_index -= idx * m_block_strides[dim];
+
+ const IndexType coord = idx * m_block_dimensions[dim];
+ dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord,
+ m_block_dimensions[dim]);
+ offset += coord * m_tensor_strides[dim];
+ }
+
+ return {offset, dimensions};
+ }
+
+ private:
+ void InitializeBlockDimensions() {
+ // Requested block shape and size.
+ const TensorBlockShapeType shape_type = m_requirements.shape_type;
+ IndexType target_block_size =
+ numext::maxi<IndexType>(1, static_cast<IndexType>(m_requirements.size));
+
+ IndexType tensor_size = m_tensor_dimensions.TotalSize();
+
+ // Corner case: one of the dimensions is zero. Logic below is too complex
+ // to handle this case on a general basis, just use unit block size.
+ // Note: we must not yield blocks with zero dimensions (recipe for
+ // overflows/underflows, divisions by zero and NaNs later).
+ if (tensor_size == 0) {
+ for (int i = 0; i < NumDims; ++i) {
+ m_block_dimensions[i] = 1;
+ }
+ m_total_block_count = 0;
+ return;
+ }
+
+ // If tensor fits into a target block size, evaluate it as a single block.
+ if (tensor_size <= target_block_size) {
+ m_block_dimensions = m_tensor_dimensions;
+ m_total_block_count = 1;
+ // The only valid block index is `0`, and in this case we do not need
+ // to compute real strides for tensor or blocks (see blockDescriptor).
+ for (int i = 0; i < NumDims; ++i) {
+ m_tensor_strides[i] = 0;
+ m_block_strides[i] = 1;
+ }
+ return;
+ }
+
+ static const bool isColMajor = Layout == static_cast<int>(ColMajor);
+
+ // Block shape skewed towards inner dimension.
+ if (shape_type == TensorBlockShapeType::kSkewedInnerDims) {
+ IndexType coeff_to_allocate = target_block_size;
+
+ for (int i = 0; i < NumDims; ++i) {
+ const int dim = isColMajor ? i : NumDims - i - 1;
+ m_block_dimensions[dim] =
+ numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]);
+ coeff_to_allocate = divup(
+ coeff_to_allocate,
+ numext::maxi(static_cast<IndexType>(1), m_block_dimensions[dim]));
+ }
+ eigen_assert(coeff_to_allocate == 1);
+
+ } else if (shape_type == TensorBlockShapeType::kUniformAllDims) {
+ // Tensor will not fit within 'target_block_size' budget: calculate tensor
+ // block dimension sizes based on "square" dimension size target.
+ const IndexType dim_size_target = convert_index<IndexType>(
+ std::pow(static_cast<float>(target_block_size),
+ 1.0f / static_cast<float>(m_block_dimensions.rank())));
+
+ for (int i = 0; i < NumDims; ++i) {
+ // TODO(andydavis) Adjust the inner most 'block_dim_size' to make it
+ // a multiple of the packet size. Note that reducing
+ // 'block_dim_size' in this manner can increase the number of
+ // blocks, and so will amplify any per-block overhead.
+ m_block_dimensions[i] =
+ numext::mini(dim_size_target, m_tensor_dimensions[i]);
+ }
+
+ // Add any un-allocated coefficients to inner dimension(s).
+ IndexType total_size = m_block_dimensions.TotalSize();
+ for (int i = 0; i < NumDims; ++i) {
+ const int dim = isColMajor ? i : NumDims - i - 1;
+
+ if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) {
+ const IndexType total_size_other_dims =
+ total_size / m_block_dimensions[dim];
+ const IndexType alloc_avail =
+ divup<IndexType>(target_block_size, total_size_other_dims);
+ if (alloc_avail == m_block_dimensions[dim]) {
+ // Insufficient excess coefficients to allocate.
+ break;
+ }
+ m_block_dimensions[dim] =
+ numext::mini(m_tensor_dimensions[dim], alloc_avail);
+ total_size = total_size_other_dims * m_block_dimensions[dim];
+ }
+ }
+
+ } else {
+ eigen_assert(false); // unknown block shape
+ }
+
+ eigen_assert(m_block_dimensions.TotalSize() >=
+ numext::mini<IndexType>(target_block_size,
+ m_tensor_dimensions.TotalSize()));
+
+ // Calculate block counts by dimension and total block count.
+ DSizes<IndexType, NumDims> block_count;
+ for (int i = 0; i < NumDims; ++i) {
+ block_count[i] = divup(m_tensor_dimensions[i], m_block_dimensions[i]);
+ }
+ m_total_block_count = array_prod(block_count);
+
+ // Calculate block strides (used for enumerating blocks).
+ m_tensor_strides = strides<Layout>(m_tensor_dimensions);
+ m_block_strides = strides<Layout>(block_count);
+ }
+
+ DSizes<IndexType, NumDims> m_tensor_dimensions;
+ TensorBlockResourceRequirements m_requirements;
+
+ DSizes<IndexType, NumDims> m_block_dimensions;
+ IndexType m_total_block_count;
+
+ DSizes<IndexType, NumDims> m_tensor_strides;
+ DSizes<IndexType, NumDims> m_block_strides;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockScratchAllocator is responsible for allocating temporary buffers
+// for block evaluation (output or input block materialization). Given that
+// Eigen expression traversal order is deterministic, all temporary allocations
+// are happening in the same order, and usually have exactly the same size.
+// Scratch allocator keeps a trace of all dynamic allocations, and after the
+// first block evaluation is completed, we should be able to reuse all the
+// temporary buffers for the next block evaluation.
+
+template <typename Device>
+class TensorBlockScratchAllocator {
+ public:
+ explicit TensorBlockScratchAllocator(const Device& device)
+ : m_device(device), m_allocation_index(0) {}
+
+ ~TensorBlockScratchAllocator() {
+ for (size_t i = 0; i < m_allocations.size(); ++i) {
+ m_device.deallocate(m_allocations[i].ptr);
+ }
+ }
+
+ void* allocate(size_t size) {
+ // TODO(ezhulenev): Remove when replaced with inlined vector.
+ if (m_allocations.capacity() == 0) m_allocations.reserve(8);
+
+ // Check if we already have an existing allocation att current index.
+ const int num_allocations = static_cast<int>(m_allocations.size());
+ const bool has_allocation = m_allocation_index < num_allocations;
+
+ // Allocation index can't be larger than the number of allocations.
+ eigen_assert(m_allocation_index <= num_allocations);
+
+ // If we have existing allocation, and its size is larger or equal to
+ // requested size, we do nothing.
+
+ // If current allocation can't fit requested size, we deallocate it, and
+ // replace with a larger allocation.
+ if (has_allocation && m_allocations[m_allocation_index].size < size) {
+ m_device.deallocate(m_allocations[m_allocation_index].ptr);
+ m_allocations[m_allocation_index].ptr = m_device.allocate(size);
+ m_allocations[m_allocation_index].size = size;
+ }
+
+ // Make a new allocation if we don't have and existing one.
+ if (!has_allocation) {
+ Allocation allocation;
+ allocation.ptr = m_device.allocate(size);
+ allocation.size = size;
+ m_allocations.push_back(allocation);
+ }
+
+ eigen_assert(m_allocations[m_allocation_index].ptr != NULL);
+ eigen_assert(m_allocations[m_allocation_index].size >= size);
+
+ return m_allocations[m_allocation_index++].ptr;
+ }
+
+ void reset() { m_allocation_index = 0; }
+
+ private:
+ struct Allocation {
+ void* ptr;
+ size_t size;
+ };
+
+ const Device& m_device;
+ int m_allocation_index;
+ // TODO(ezhulenev): This should be an inlined vector.
+ std::vector<Allocation> m_allocations;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockKind represents all possible block kinds, that can be produced by
+// TensorEvaluator::evalBlock function.
+enum TensorBlockKind {
+ // Tensor block that is a lazy expression that must be assigned to a
+ // destination using TensorBlockAssign.
+ kExpr,
+
+ // Tensor block that is a view into a memory buffer owned by an underlying
+ // Tensor expression (e.g. it can be a view into a Tensor buffer).
+ kView,
+
+ // Tensor block that was materialized in a scratch memory buffer, allocated
+ // with TensorBlockScratchAllocator. This block must be copied to a
+ // destination, similar to a block of `kExpr` type.
+ kMaterializedInScratch,
+
+ // Tensor block that was materialized directly into the final output memory
+ // buffer. For example if the left side of an assignment is a Tensor, we can
+ // directly materialize the block in the destination memory.
+ //
+ // If strides in the output buffer do not match tensor block strides, the
+ // Tensor expression will be invalid, and should not be used by
+ // TensorBlockAssign or for constructing another block expression.
+ kMaterializedInOutput
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockNotImplemented should be used to defined TensorBlock typedef in
+// TensorEvaluators that do not support block evaluation.
+
+class TensorBlockNotImplemented {
+ public:
+ typedef void XprType;
+};
+
+// -------------------------------------------------------------------------- //
+// XprScalar extracts Scalar type from the Eigen expressions (if expression type
+// is not void). It's required to be able to define lazy block expression for
+// argument types, that do not support block evaluation.
+
+template <typename XprType>
+struct XprScalar {
+ typedef typename XprType::Scalar type;
+};
+template <>
+struct XprScalar<void> {
+ typedef void type;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorMaterializedBlock is a fully evaluated block of the original tensor,
+// and XprType is just a TensorMap over the data. This block type is typically
+// used to materialize blocks of tensor expressions, that can't be efficiently
+// represented as lazy Tensor expressions with fast coeff/packet operations,
+// e.g. we materialize all broadcasts into evaluated blocks.
+//
+// TensorMaterializedBlock does not own its memory buffer, it's either a memory
+// buffer that backs the original expression (e.g. block is just a view into a
+// Tensor), or a memory buffer allocated with scratch allocator, and in this
+// case the scratch allocator will deallocate it at the end of block based
+// expression execution.
+//
+// If the block was evaluated directly into the output buffer, and strides in
+// the output buffer do not match block strides, the TensorMap expression will
+// be invalid, and should never be used in block assignment or any other tensor
+// expression.
+
+template <typename Scalar, int NumDims, int Layout,
+ typename IndexType = Eigen::Index>
+class TensorMaterializedBlock {
+ public:
+ typedef DSizes<IndexType, NumDims> Dimensions;
+ typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType;
+
+ TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data,
+ const Dimensions& dimensions, bool valid_expr = true)
+ : m_kind(kind),
+ m_data(data),
+ m_dimensions(dimensions),
+ m_expr(m_data, m_dimensions),
+ m_valid_expr(valid_expr) {
+ eigen_assert(m_kind == internal::TensorBlockKind::kView ||
+ m_kind == internal::TensorBlockKind::kMaterializedInScratch ||
+ m_kind == internal::TensorBlockKind::kMaterializedInOutput);
+ }
+
+ TensorBlockKind kind() const { return m_kind; }
+ // NOTE(ezhulenev): Returning XprType by value like in other block types
+ // causes asan failures. The theory is that XprType::Nested doesn't work
+ // properly for TensorMap.
+ const XprType& expr() const {
+ eigen_assert(m_valid_expr);
+ return m_expr;
+ }
+ const Scalar* data() const { return m_data; }
+ void cleanup() {}
+
+ typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
+
+ // TensorMaterializedBlock can be backed by different types of storage:
+ //
+ // (1) Contiguous block of memory allocated with scratch allocator.
+ // (2) Contiguous block of memory reused from tensor block descriptor
+ // destination buffer.
+ // (3) Strided block of memory reused from tensor block descriptor
+ // destination buffer.
+ //
+ class Storage {
+ public:
+ Scalar* data() const { return m_data; }
+ const Dimensions& dimensions() const { return m_dimensions; }
+ const Dimensions& strides() const { return m_strides; }
+
+ TensorMaterializedBlock AsTensorMaterializedBlock() const {
+ return TensorMaterializedBlock(
+ m_materialized_in_output
+ ? internal::TensorBlockKind::kMaterializedInOutput
+ : internal::TensorBlockKind::kMaterializedInScratch,
+ m_data, m_dimensions, !m_strided_storage);
+ }
+
+ private:
+ friend class TensorMaterializedBlock;
+
+ Storage(Scalar* data, const Dimensions& dimensions,
+ const Dimensions& strides, bool materialized_in_output,
+ bool strided_storage)
+ : m_data(data),
+ m_dimensions(dimensions),
+ m_strides(strides),
+ m_materialized_in_output(materialized_in_output),
+ m_strided_storage(strided_storage) {}
+
+ Scalar* m_data;
+ Dimensions m_dimensions;
+ Dimensions m_strides;
+ bool m_materialized_in_output;
+ bool m_strided_storage;
+ };
+
+ // Creates a storage for materialized block either from the block descriptor
+ // destination buffer, or allocates a new buffer with scratch allocator.
+ template <typename TensorBlockScratch>
+ EIGEN_STRONG_INLINE static Storage prepareStorage(
+ TensorBlockDesc& desc, TensorBlockScratch& scratch,
+ bool allow_strided_storage = false) {
+ // Try to reuse destination as an output block buffer.
+ typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer;
+
+ if (desc.destination().kind() == DestinationBuffer::kContiguous) {
+ Scalar* buffer = desc.destination().template data<Scalar>();
+ desc.DropDestinationBuffer();
+ return Storage(buffer, desc.dimensions(),
+ internal::strides<Layout>(desc.dimensions()),
+ /*materialized_in_output=*/true,
+ /*strided_storage=*/false);
+
+ } else if (desc.destination().kind() == DestinationBuffer::kStrided &&
+ allow_strided_storage) {
+ Scalar* buffer = desc.destination().template data<Scalar>();
+ desc.DropDestinationBuffer();
+ return Storage(buffer, desc.dimensions(), desc.destination().strides(),
+ /*materialized_in_output=*/true, /*strided_storage=*/true);
+
+ } else {
+ void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
+ return Storage(static_cast<Scalar*>(mem), desc.dimensions(),
+ internal::strides<Layout>(desc.dimensions()),
+ /*materialized_in_output=*/false,
+ /*strided_storage=*/false);
+ }
+ }
+
+ // Creates a materialized block for the given descriptor from a memory buffer.
+ template <typename DataDimensions, typename TensorBlockScratch>
+ EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize(
+ const Scalar* data, const DataDimensions& data_dims,
+ TensorBlockDesc& desc, TensorBlockScratch& scratch) {
+ eigen_assert(array_size<DataDimensions>::value == desc.dimensions().size());
+
+ // If a tensor block dimensions covers a contiguous block of the underlying
+ // memory, we can skip block buffer memory allocation, and construct a block
+ // from existing `data` memory buffer.
+ //
+ // Example: (RowMajor layout)
+ // data_dims: [11, 12, 13, 14]
+ // desc.dimensions(): [1, 1, 3, 14]
+ //
+ // In this case we can construct a TensorBlock starting at
+ // `data + desc.offset()`, with a `desc.dimensions()` block sizes.
+ static const bool is_col_major = Layout == ColMajor;
+
+ // Find out how many inner dimensions have a matching size.
+ int num_matching_inner_dims = 0;
+ for (int i = 0; i < NumDims; ++i) {
+ int dim = is_col_major ? i : NumDims - i - 1;
+ if (data_dims[dim] != desc.dimensions()[dim]) break;
+ ++num_matching_inner_dims;
+ }
+
+ // All the outer dimensions must be of size `1`, except a single dimension
+ // before the matching inner dimension (`3` in the example above).
+ bool can_use_direct_access = true;
+ for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) {
+ int dim = is_col_major ? i : NumDims - i - 1;
+ if (desc.dimension(dim) != 1) {
+ can_use_direct_access = false;
+ break;
+ }
+ }
+
+ if (can_use_direct_access) {
+ const Scalar* block_start = data + desc.offset();
+ return TensorMaterializedBlock(internal::TensorBlockKind::kView,
+ block_start, desc.dimensions());
+
+ } else {
+ // Reuse destination buffer or allocate new buffer with scratch allocator.
+ const Storage storage = prepareStorage(desc, scratch);
+
+ typedef internal::TensorBlockIO<Scalar, IndexType, NumDims, Layout>
+ TensorBlockIO;
+ typedef typename TensorBlockIO::Dst TensorBlockIODst;
+ typedef typename TensorBlockIO::Src TensorBlockIOSrc;
+
+ TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)),
+ data, desc.offset());
+ TensorBlockIODst dst(storage.dimensions(), storage.strides(),
+ storage.data());
+
+ TensorBlockIO::Copy(dst, src);
+ return storage.AsTensorMaterializedBlock();
+ }
+ }
+
+ private:
+ TensorBlockKind m_kind;
+ const Scalar* m_data;
+ Dimensions m_dimensions;
+ XprType m_expr;
+ bool m_valid_expr;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorCwiseUnaryBlock is a lazy tensor expression block that applies UnaryOp
+// functor to the blocks produced by the underlying Tensor expression.
+
+template <typename UnaryOp, typename ArgTensorBlock>
+class TensorCwiseUnaryBlock {
+ static const bool NoArgBlockAccess =
+ internal::is_void<typename ArgTensorBlock::XprType>::value;
+
+ public:
+ typedef typename conditional<
+ NoArgBlockAccess, void,
+ TensorCwiseUnaryOp<UnaryOp, const typename ArgTensorBlock::XprType> >::
+ type XprType;
+
+ typedef typename XprScalar<XprType>::type Scalar;
+
+ TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor)
+ : m_arg_block(arg_block), m_functor(functor) {}
+
+ TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
+
+ XprType expr() const { return XprType(m_arg_block.expr(), m_functor); }
+ const Scalar* data() const { return NULL; }
+ void cleanup() { m_arg_block.cleanup(); }
+
+ private:
+ ArgTensorBlock m_arg_block;
+ UnaryOp m_functor;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorCwiseUnaryBlock is a lazy tensor expression block that applies BinaryOp
+// functor to the blocks produced by the underlying Tensor expression.
+
+template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>
+class TensorCwiseBinaryBlock {
+ static const bool NoArgBlockAccess =
+ internal::is_void<typename LhsTensorBlock::XprType>::value ||
+ internal::is_void<typename RhsTensorBlock::XprType>::value;
+
+ public:
+ typedef typename conditional<
+ NoArgBlockAccess, void,
+ TensorCwiseBinaryOp<BinaryOp, const typename LhsTensorBlock::XprType,
+ const typename RhsTensorBlock::XprType> >::type
+ XprType;
+
+ typedef typename XprScalar<XprType>::type Scalar;
+
+ TensorCwiseBinaryBlock(const LhsTensorBlock& left_block,
+ const RhsTensorBlock& right_block,
+ const BinaryOp& functor)
+ : m_left_block(left_block),
+ m_right_block(right_block),
+ m_functor(functor) {}
+
+ TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
+
+ XprType expr() const {
+ return XprType(m_left_block.expr(), m_right_block.expr(), m_functor);
+ }
+
+ const Scalar* data() const { return NULL; }
+
+ void cleanup() {
+ m_left_block.cleanup();
+ m_right_block.cleanup();
+ }
+
+ private:
+ LhsTensorBlock m_left_block;
+ RhsTensorBlock m_right_block;
+ BinaryOp m_functor;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorUnaryExprBlock is a lazy tensor expression block that can construct
+// an arbitrary tensor expression from a block of the underlying type (this is a
+// generalization of the TensorCwiseUnaryBlock for arbitrary expressions).
+
+template <typename BlockFactory, typename ArgTensorBlock>
+class TensorUnaryExprBlock {
+ typedef typename ArgTensorBlock::XprType ArgXprType;
+ static const bool NoArgBlockAccess = internal::is_void<ArgXprType>::value;
+
+ public:
+ typedef typename conditional<
+ NoArgBlockAccess, void,
+ typename BlockFactory::template XprType<ArgXprType>::type>::type XprType;
+
+ typedef typename XprScalar<XprType>::type Scalar;
+
+ TensorUnaryExprBlock(const ArgTensorBlock& arg_block,
+ const BlockFactory& factory)
+ : m_arg_block(arg_block), m_factory(factory) {}
+
+ TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
+ XprType expr() const { return m_factory.expr(m_arg_block.expr()); }
+ const Scalar* data() const { return NULL; }
+ void cleanup() { m_arg_block.cleanup(); }
+
+ private:
+ ArgTensorBlock m_arg_block;
+ BlockFactory m_factory;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorTernaryExprBlock is a lazy tensor expression block that can construct
+// an arbitrary tensor expression from three blocks of the underlying type.
+
+template <typename BlockFactory, typename Arg1TensorBlock,
+ typename Arg2TensorBlock, typename Arg3TensorBlock>
+class TensorTernaryExprBlock {
+ typedef typename Arg1TensorBlock::XprType Arg1XprType;
+ typedef typename Arg2TensorBlock::XprType Arg2XprType;
+ typedef typename Arg3TensorBlock::XprType Arg3XprType;
+
+ static const bool NoArgBlockAccess = internal::is_void<Arg1XprType>::value ||
+ internal::is_void<Arg2XprType>::value ||
+ internal::is_void<Arg3XprType>::value;
+
+ public:
+ typedef typename conditional<
+ NoArgBlockAccess, void,
+ typename BlockFactory::template XprType<Arg1XprType, Arg2XprType,
+ Arg3XprType>::type>::type XprType;
+
+ typedef typename XprScalar<XprType>::type Scalar;
+
+ TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block,
+ const Arg2TensorBlock& arg2_block,
+ const Arg3TensorBlock& arg3_block,
+ const BlockFactory& factory)
+ : m_arg1_block(arg1_block),
+ m_arg2_block(arg2_block),
+ m_arg3_block(arg3_block),
+ m_factory(factory) {}
+
+ TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
+ XprType expr() const {
+ return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(),
+ m_arg3_block.expr());
+ }
+ const Scalar* data() const { return NULL; }
+ void cleanup() {
+ m_arg1_block.cleanup();
+ m_arg2_block.cleanup();
+ m_arg3_block.cleanup();
+ }
+
+ private:
+ Arg1TensorBlock m_arg1_block;
+ Arg2TensorBlock m_arg2_block;
+ Arg3TensorBlock m_arg3_block;
+ BlockFactory m_factory;
+};
+
+// -------------------------------------------------------------------------- //
+// StridedLinearBufferCopy provides a method to copy data between two linear
+// buffers with different strides, with optimized paths for scatter/gather.
+
+template <typename Scalar, typename IndexType>
+class StridedLinearBufferCopy {
+ typedef typename packet_traits<Scalar>::type Packet;
+ enum {
+ Vectorizable = packet_traits<Scalar>::Vectorizable,
+ PacketSize = packet_traits<Scalar>::size
+ };
+
+ public:
+ // Specifying linear copy kind statically gives ~30% speedup for small sizes.
+ enum class Kind {
+ Linear = 0, // src_stride == 1 && dst_stride == 1
+ Scatter = 1, // src_stride == 1 && dst_stride != 1
+ FillLinear = 2, // src_stride == 0 && dst_stride == 1
+ FillScatter = 3, // src_stride == 0 && dst_stride != 1
+ Gather = 4, // dst_stride == 1
+ Random = 5 // everything else
+ };
+
+ struct Dst {
+ Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {}
+
+ IndexType offset;
+ IndexType stride;
+ Scalar* data;
+ };
+
+ struct Src {
+ Src(IndexType o, IndexType s, const Scalar* d)
+ : offset(o), stride(s), data(d) {}
+
+ IndexType offset;
+ IndexType stride;
+ const Scalar* data;
+ };
+
+ template <typename StridedLinearBufferCopy::Kind kind>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst,
+ const Src& src,
+ const size_t count) {
+ Run<kind>(count, dst.offset, dst.stride, dst.data, src.offset, src.stride,
+ src.data);
+ }
+
+ private:
+ template <typename StridedLinearBufferCopy::Kind kind>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+ const IndexType count, const IndexType dst_offset,
+ const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data,
+ const IndexType src_offset, const IndexType src_stride,
+ const Scalar* EIGEN_RESTRICT src_data) {
+ const Scalar* src = &src_data[src_offset];
+ Scalar* dst = &dst_data[dst_offset];
+
+ if (!Vectorizable) {
+ for (Index i = 0; i < count; ++i) {
+ dst[i * dst_stride] = src[i * src_stride];
+ }
+ return;
+ }
+
+ const IndexType vectorized_size = count - PacketSize;
+ IndexType i = 0;
+
+ if (kind == StridedLinearBufferCopy::Kind::Linear) {
+ // ******************************************************************** //
+ // Linear copy from `src` to `dst`.
+ const IndexType unrolled_size = count - 4 * PacketSize;
+ eigen_assert(src_stride == 1 && dst_stride == 1);
+ for (; i <= unrolled_size; i += 4 * PacketSize) {
+ for (int j = 0; j < 4; ++j) {
+ Packet p = ploadu<Packet>(src + i + j * PacketSize);
+ pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
+ }
+ }
+ for (; i <= vectorized_size; i += PacketSize) {
+ Packet p = ploadu<Packet>(src + i);
+ pstoreu<Scalar, Packet>(dst + i, p);
+ }
+ for (; i < count; ++i) {
+ dst[i] = src[i];
+ }
+ // ******************************************************************** //
+ } else if (kind == StridedLinearBufferCopy::Kind::Scatter) {
+ // Scatter from `src` to `dst`.
+ eigen_assert(src_stride == 1 && dst_stride != 1);
+ for (; i <= vectorized_size; i += PacketSize) {
+ Packet p = ploadu<Packet>(src + i);
+ pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
+ }
+ for (; i < count; ++i) {
+ dst[i * dst_stride] = src[i];
+ }
+ // ******************************************************************** //
+ } else if (kind == StridedLinearBufferCopy::Kind::FillLinear) {
+ // Fill `dst` with value at `*src`.
+ eigen_assert(src_stride == 0 && dst_stride == 1);
+ const IndexType unrolled_size = count - 4 * PacketSize;
+ Packet p = pload1<Packet>(src);
+ for (; i <= unrolled_size; i += 4 * PacketSize) {
+ for (int j = 0; j < 4; ++j) {
+ pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
+ }
+ }
+ for (; i <= vectorized_size; i += PacketSize) {
+ pstoreu<Scalar, Packet>(dst + i, p);
+ }
+ for (; i < count; ++i) {
+ dst[i] = *src;
+ }
+ // ******************************************************************** //
+ } else if (kind == StridedLinearBufferCopy::Kind::FillScatter) {
+ // Scatter `*src` into `dst`.
+ eigen_assert(src_stride == 0 && dst_stride != 1);
+ Packet p = pload1<Packet>(src);
+ for (; i <= vectorized_size; i += PacketSize) {
+ pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
+ }
+ for (; i < count; ++i) {
+ dst[i * dst_stride] = *src;
+ }
+ // ******************************************************************** //
+ } else if (kind == StridedLinearBufferCopy::Kind::Gather) {
+ // Gather from `src` into `dst`.
+ eigen_assert(dst_stride == 1);
+ for (; i <= vectorized_size; i += PacketSize) {
+ Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
+ pstoreu<Scalar, Packet>(dst + i, p);
+ }
+ for (; i < count; ++i) {
+ dst[i] = src[i * src_stride];
+ }
+ // ******************************************************************** //
+ } else if (kind == StridedLinearBufferCopy::Kind::Random) {
+ // Random.
+ for (; i < count; ++i) {
+ dst[i * dst_stride] = src[i * src_stride];
+ }
+ } else {
+ eigen_assert(false);
+ }
+ }
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockIO copies data from `src` tensor block, to the `dst` tensor block.
+// It's possible to specify src->dst dimension mapping for the copy operation.
+// Dimensions of `dst` specify how many elements have to be copied, for the
+// `src` we need to know only stride to navigate through source memory buffer.
+
+template <typename Scalar, typename IndexType, int NumDims, int Layout>
+class TensorBlockIO {
+ static const bool IsColMajor = (Layout == ColMajor);
+
+ typedef StridedLinearBufferCopy<Scalar, IndexType> LinCopy;
+
+ public:
+ typedef DSizes<IndexType, NumDims> Dimensions;
+ typedef DSizes<int, NumDims> DimensionsMap;
+
+ struct Dst {
+ Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst,
+ IndexType dst_offset = 0)
+ : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {}
+
+ Dimensions dims;
+ Dimensions strides;
+ Scalar* data;
+ IndexType offset;
+ };
+
+ struct Src {
+ Src(const Dimensions& src_strides, const Scalar* src,
+ IndexType src_offset = 0)
+ : strides(src_strides), data(src), offset(src_offset) {}
+
+ Dimensions strides;
+ const Scalar* data;
+ IndexType offset;
+ };
+
+ // Copies data to `dst` from `src`, using provided dimensions mapping:
+ //
+ // src_dimension_index = dst_to_src_dim_map[dst_dimension_index]
+ //
+ // Returns the number of copied elements.
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy(
+ const Dst& dst, const Src& src, const DimensionsMap& dst_to_src_dim_map) {
+ // Copy single scalar value from `src` to `dst`.
+ if (NumDims == 0) {
+ *(dst.data + dst.offset) = *(src.data + src.offset);
+ return 1;
+ }
+
+ // Both `dst` and `src` must have contiguous innermost dimension. We also
+ // accept the special case with stride '0', because it's used as a trick to
+ // implement broadcasting.
+ {
+ int inner_dim = IsColMajor ? 0 : NumDims - 1;
+ EIGEN_UNUSED_VARIABLE(inner_dim);
+ eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0);
+ eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0);
+ }
+
+ // Give a shorter name to `dst_to_src_dim_map`.
+ const DimensionsMap& dim_map = dst_to_src_dim_map;
+
+ // Do not squeeze reordered inner dimensions.
+ int num_squeezable_dims = NumSqueezableInnerDims(dim_map);
+
+ // NOTE: We find the innermost dimension (contiguous in memory) in the dst
+ // block, and we write data linearly into that dimension, reading it from
+ // the src. If dimensions are reordered, we might end up reading data from
+ // the src with `stride != 1`.
+ //
+ // NOTE: Random-Read/Linear-Write can be up to ~2X faster than
+ // Linear-Read/Random-Write: https://stackoverflow.com/a/54935680
+
+ // Find the innermost dimension in the dst whose size is not 1. This is the
+ // effective inner dim.
+ int num_size_one_inner_dims = 0;
+ for (int i = 0; i < num_squeezable_dims; ++i) {
+ const int dst_dim = IsColMajor ? i : NumDims - i - 1;
+ if (dst.dims[dst_dim] != 1) break;
+ num_size_one_inner_dims++;
+ }
+
+ // If all dimensions are of size 1, just copy a scalar from `src` to `dst`.
+ if (num_size_one_inner_dims == NumDims) {
+ *(dst.data + dst.offset) = *(src.data + src.offset);
+ return 1;
+ }
+
+ // Outermost dimension in the dst with `stride == 1` (contiguous in memory).
+ const int dst_stride1_dim = IsColMajor
+ ? num_size_one_inner_dims
+ : NumDims - num_size_one_inner_dims - 1;
+
+ // Dimension in the src that corresponds to the dst innermost dimension.
+ const int src_dim_for_dst_stride1_dim =
+ NumDims == 0 ? 1 : dim_map[dst_stride1_dim];
+
+ // Size of the innermost dimension (length of contiguous blocks of memory).
+ IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim];
+
+ // Squeeze multiple inner dims into one if they are contiguous in `dst` and
+ // `src` memory, so we can do less linear copy calls.
+ for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) {
+ const int dst_dim = IsColMajor ? i : NumDims - i - 1;
+ const IndexType dst_stride = dst.strides[dst_dim];
+ const IndexType src_stride = src.strides[dim_map[dst_dim]];
+ if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) {
+ dst_inner_dim_size *= dst.dims[dst_dim];
+ ++num_size_one_inner_dims;
+ } else {
+ break;
+ }
+ }
+
+ // Setup strides to read data from `src` and write to `dst`.
+ IndexType input_offset = src.offset;
+ IndexType output_offset = dst.offset;
+ IndexType input_stride =
+ NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim];
+ IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim];
+
+ const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
+ array<BlockIteratorState, at_least_1_dim> it;
+
+ // Initialize block iterator state. Squeeze away any dimension of size 1.
+ int idx = 0; // currently initialized iterator state index
+ for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
+ const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2;
+ if (dst.dims[dst_dim] == 1) continue;
+
+ it[idx].size = dst.dims[dst_dim];
+ it[idx].input_stride = src.strides[dim_map[dst_dim]];
+ it[idx].output_stride = dst.strides[dst_dim];
+
+ it[idx].input_span = it[idx].input_stride * (it[idx].size - 1);
+ it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
+
+ idx++;
+ }
+
+ // Iterate copying data from src to dst.
+ const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize();
+
+#define COPY_INNER_DIM(KIND) \
+ IndexType num_copied = 0; \
+ for (num_copied = 0; num_copied < block_total_size; \
+ num_copied += dst_inner_dim_size) { \
+ LinCopy::template Run<KIND>( \
+ typename LinCopy::Dst(output_offset, output_stride, dst.data), \
+ typename LinCopy::Src(input_offset, input_stride, src.data), \
+ dst_inner_dim_size); \
+ \
+ for (int j = 0; j < idx; ++j) { \
+ if (++it[j].count < it[j].size) { \
+ input_offset += it[j].input_stride; \
+ output_offset += it[j].output_stride; \
+ break; \
+ } \
+ it[j].count = 0; \
+ input_offset -= it[j].input_span; \
+ output_offset -= it[j].output_span; \
+ } \
+ } \
+ return num_copied;
+
+ if (input_stride == 1 && output_stride == 1) {
+ COPY_INNER_DIM(LinCopy::Kind::Linear);
+ } else if (input_stride == 1 && output_stride != 1) {
+ COPY_INNER_DIM(LinCopy::Kind::Scatter);
+ } else if (input_stride == 0 && output_stride == 1) {
+ COPY_INNER_DIM(LinCopy::Kind::FillLinear);
+ } else if (input_stride == 0 && output_stride != 1) {
+ COPY_INNER_DIM(LinCopy::Kind::FillScatter);
+ } else if (output_stride == 1) {
+ COPY_INNER_DIM(LinCopy::Kind::Gather);
+ } else {
+ COPY_INNER_DIM(LinCopy::Kind::Random);
+ }
+
+#undef COPY_INNER_DIM
+ }
+
+ // Copy from `src` to `dst` with an identity src->dst dimension map. Returns
+ // the number of copied elements.
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst,
+ const Src& src) {
+ DimensionsMap dst_to_src_map;
+ for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i;
+ return Copy(dst, src, dst_to_src_map);
+ }
+
+ private:
+ struct BlockIteratorState {
+ BlockIteratorState()
+ : size(0),
+ count(0),
+ input_stride(0),
+ output_stride(0),
+ input_span(0),
+ output_span(0) {}
+
+ IndexType size;
+ IndexType count;
+ IndexType input_stride;
+ IndexType output_stride;
+ IndexType input_span;
+ IndexType output_span;
+ };
+
+ // Compute how many inner dimensions it's allowed to squeeze when doing IO
+ // between two tensor blocks. It's safe to squeeze inner dimensions, only
+ // if they are not reordered.
+ static int NumSqueezableInnerDims(const DimensionsMap& dim_map) {
+ int num_squeezable_dims = 0;
+ for (int i = 0; i < NumDims; ++i) {
+ const int dim = IsColMajor ? i : NumDims - i - 1;
+ if (dim_map[dim] != dim) break;
+ num_squeezable_dims++;
+ }
+ return num_squeezable_dims;
+ }
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to
+// a Tensor block defined by `desc`, backed by a memory buffer at `target`.
+//
+// Currently there is no way to write from a Tensor expression to a block of
+// memory, if dimensions are reordered. If you need to do that, you should
+// materialize a Tensor block expression into a memory buffer, and then use
+// TensorBlockIO to copy data between two memory buffers with a custom
+// `target->src` dimension map (see definition above).
+//
+// Also currently the innermost dimension of `target` must have a stride '1'
+// (contiguous in memory). This restriction could be lifted with a `pscatter`,
+// but in practice it's never needed, and there is a similar TensorBlockIO
+// workaround for that.
+//
+// TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO
+// where `src` is a tensor expression. Explore if it is possible to rewrite IO
+// to use expressions instead of pointers, and after that TensorBlockAssignment
+// will become an alias to IO.
+template <typename Scalar, int NumDims, typename TensorBlockExpr,
+ typename IndexType = Eigen::Index>
+class TensorBlockAssignment {
+ // We will use coeff/packet path to evaluate block expressions.
+ typedef TensorEvaluator<const TensorBlockExpr, DefaultDevice>
+ TensorBlockEvaluator;
+
+ typedef DSizes<IndexType, NumDims> Dimensions;
+
+ enum {
+ Vectorizable = packet_traits<Scalar>::Vectorizable,
+ PacketSize = packet_traits<Scalar>::size
+ };
+
+ template <bool Vectorizable, typename Evaluator>
+ struct InnerDimAssign {
+ EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
+ const Evaluator& eval,
+ IndexType eval_offset) {
+ for (IndexType i = 0; i < count; ++i) {
+ target[i] = eval.coeff(eval_offset + i);
+ }
+ }
+ };
+
+ template <typename Evaluator>
+ struct InnerDimAssign<true, Evaluator> {
+ EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
+ const Evaluator& eval,
+ IndexType eval_offset) {
+ typedef typename packet_traits<Scalar>::type Packet;
+
+ const IndexType unrolled_size = count - 4 * PacketSize;
+ const IndexType vectorized_size = count - PacketSize;
+ IndexType i = 0;
+
+ for (; i <= unrolled_size; i += 4 * PacketSize) {
+ for (int j = 0; j < 4; ++j) {
+ const IndexType idx = eval_offset + i + j * PacketSize;
+ Packet p = eval.template packet<Unaligned>(idx);
+ pstoreu<Scalar>(target + i + j * PacketSize, p);
+ }
+ }
+
+ for (; i <= vectorized_size; i += PacketSize) {
+ Packet p = eval.template packet<Unaligned>(eval_offset + i);
+ pstoreu<Scalar>(target + i, p);
+ }
+
+ for (; i < count; ++i) {
+ target[i] = eval.coeff(eval_offset + i);
+ }
+ }
+ };
+
+ public:
+ struct Target {
+ Target(const Dimensions& target_dims, const Dimensions& target_strides,
+ Scalar* target_data, IndexType target_offset = 0)
+ : dims(target_dims),
+ strides(target_strides),
+ data(target_data),
+ offset(target_offset) {}
+
+ Dimensions dims;
+ Dimensions strides;
+ Scalar* data;
+ IndexType offset;
+ };
+
+ static Target target(const Dimensions& target_dims,
+ const Dimensions& target_strides, Scalar* target_data,
+ IndexType target_offset = 0) {
+ return Target(target_dims, target_strides, target_data, target_offset);
+ }
+
+ template <typename TargetDimsIndexType, typename TargetStridesIndexType>
+ static Target target(
+ const DSizes<TargetDimsIndexType, NumDims>& target_dims,
+ const DSizes<TargetStridesIndexType, NumDims>& target_strides,
+ Scalar* target_data, IndexType target_offset = 0) {
+ // DSizes constructor will do index type promotion if it's safe.
+ return Target(Dimensions(target_dims), Dimensions(target_strides),
+ target_data, target_offset);
+ }
+
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+ const Target& target, const TensorBlockExpr& expr) {
+ // Prepare evaluator for block expression.
+ DefaultDevice default_device;
+ TensorBlockEvaluator eval(expr, default_device);
+
+ // Tensor block expression dimension should match destination dimensions.
+ eigen_assert(dimensions_match(target.dims, eval.dimensions()));
+
+ static const int Layout = TensorBlockEvaluator::Layout;
+ static const bool is_col_major = Layout == ColMajor;
+
+ // Initialize output inner dimension size based on a layout.
+ const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize();
+ const int inner_dim_idx = is_col_major ? 0 : NumDims - 1;
+ IndexType output_inner_dim_size = target.dims[inner_dim_idx];
+
+ // Target inner dimension stride must be '1'.
+ eigen_assert(target.strides[inner_dim_idx] == 1);
+
+ // Squeeze multiple inner dims into one if they are contiguous in `target`.
+ IndexType num_squeezed_dims = 0;
+ for (Index i = 1; i < NumDims; ++i) {
+ const Index dim = is_col_major ? i : NumDims - i - 1;
+ const IndexType target_stride = target.strides[dim];
+
+ if (output_inner_dim_size == target_stride) {
+ output_inner_dim_size *= target.dims[dim];
+ num_squeezed_dims++;
+ } else {
+ break;
+ }
+ }
+
+ // Initialize output block iterator state. Dimension in this array are
+ // always in inner_most -> outer_most order (col major layout).
+ array<BlockIteratorState, NumDims> it;
+
+ int idx = 0; // currently initialized iterator state index
+ for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) {
+ const Index dim = is_col_major ? i + 1 : NumDims - i - 2;
+
+ it[idx].count = 0;
+ it[idx].size = target.dims[dim];
+ it[idx].output_stride = target.strides[dim];
+ it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
+ idx++;
+ }
+
+ // We read block expression from the beginning, and start writing data to
+ // `target` at given offset.
+ IndexType input_offset = 0;
+ IndexType output_offset = target.offset;
+
+ // Iterate copying data from `eval` to `target`.
+ for (IndexType i = 0; i < output_size; i += output_inner_dim_size) {
+ // Assign to `target` at current offset.
+ InnerDimAssign<Vectorizable && TensorBlockEvaluator::PacketAccess,
+ TensorBlockEvaluator>::Run(target.data + output_offset,
+ output_inner_dim_size, eval,
+ input_offset);
+
+ // Move input offset forward by the number of assigned coefficients.
+ input_offset += output_inner_dim_size;
+
+ // Update index.
+ for (int j = 0; j < idx; ++j) {
+ if (++it[j].count < it[j].size) {
+ output_offset += it[j].output_stride;
+ break;
+ }
+ it[j].count = 0;
+ output_offset -= it[j].output_span;
+ }
+ }
+ }
+
+ private:
+ struct BlockIteratorState {
+ BlockIteratorState()
+ : count(0), size(0), output_stride(0), output_span(0) {}
+
+ IndexType count;
+ IndexType size;
+ IndexType output_stride;
+ IndexType output_span;
+ };
+};
+
+// -------------------------------------------------------------------------- //
+
+} // namespace internal
+} // namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorBroadcasting.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorBroadcasting.h
new file mode 100644
index 0000000..a354132
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorBroadcasting.h
@@ -0,0 +1,1093 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H
+#define EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H
+
+namespace Eigen {
+
+/** \class TensorBroadcasting
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor broadcasting class.
+ *
+ *
+ */
+namespace internal {
+template<typename Broadcast, typename XprType>
+struct traits<TensorBroadcastingOp<Broadcast, XprType> > : public traits<XprType>
+{
+ typedef typename XprType::Scalar Scalar;
+ typedef traits<XprType> XprTraits;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprTraits::Index Index;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = XprTraits::NumDimensions;
+ static const int Layout = XprTraits::Layout;
+ typedef typename XprTraits::PointerType PointerType;
+};
+
+template<typename Broadcast, typename XprType>
+struct eval<TensorBroadcastingOp<Broadcast, XprType>, Eigen::Dense>
+{
+ typedef const TensorBroadcastingOp<Broadcast, XprType> EIGEN_DEVICE_REF type;
+};
+
+template<typename Broadcast, typename XprType>
+struct nested<TensorBroadcastingOp<Broadcast, XprType>, 1, typename eval<TensorBroadcastingOp<Broadcast, XprType> >::type>
+{
+ typedef TensorBroadcastingOp<Broadcast, XprType> type;
+};
+
+template <typename Dims>
+struct is_input_scalar {
+ static const bool value = false;
+};
+template <>
+struct is_input_scalar<Sizes<> > {
+ static const bool value = true;
+};
+#ifndef EIGEN_EMULATE_CXX11_META_H
+template <typename std::ptrdiff_t... Indices>
+struct is_input_scalar<Sizes<Indices...> > {
+ static const bool value = (Sizes<Indices...>::total_size == 1);
+};
+#endif
+
+} // end namespace internal
+
+
+
+template<typename Broadcast, typename XprType>
+class TensorBroadcastingOp : public TensorBase<TensorBroadcastingOp<Broadcast, XprType>, ReadOnlyAccessors>
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Scalar Scalar;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename Eigen::internal::nested<TensorBroadcastingOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorBroadcastingOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBroadcastingOp(const XprType& expr, const Broadcast& broadcast)
+ : m_xpr(expr), m_broadcast(broadcast) {}
+
+ EIGEN_DEVICE_FUNC
+ const Broadcast& broadcast() const { return m_broadcast; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ expression() const { return m_xpr; }
+
+ protected:
+ typename XprType::Nested m_xpr;
+ const Broadcast m_broadcast;
+};
+
+
+// Eval as rvalue
+template<typename Broadcast, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
+{
+ typedef TensorBroadcastingOp<Broadcast, ArgType> XprType;
+ typedef typename XprType::Index Index;
+ static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+ typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+ protected: // all the non-static fields must have the same access control, otherwise the TensorEvaluator wont be standard layout;
+ bool isCopy, nByOne, oneByN;
+ public:
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+
+ enum {
+ IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
+ PreferBlockAccess = true,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ RawAccess = false
+ };
+
+ typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+ // We do block based broadcasting using a trick with 2x tensor rank and 0
+ // strides. See block method implementation for details.
+ typedef DSizes<Index, 2 * NumDims> BroadcastDimensions;
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+ typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
+ ArgTensorBlock;
+
+ typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
+ Layout, Index>
+ TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : isCopy(false), nByOne(false), oneByN(false),
+ m_device(device), m_broadcast(op.broadcast()), m_impl(op.expression(), device)
+ {
+
+ // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar
+ // and store the result in a scalar. Instead one should reshape the scalar into a a N-D
+ // tensor with N >= 1 of 1 element first and then broadcast.
+ EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ const InputDimensions& input_dims = m_impl.dimensions();
+ isCopy = true;
+ for (int i = 0; i < NumDims; ++i) {
+ eigen_assert(input_dims[i] > 0);
+ m_dimensions[i] = input_dims[i] * m_broadcast[i];
+ if (m_broadcast[i] != 1) {
+ isCopy = false;
+ }
+ }
+
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ m_inputStrides[0] = 1;
+ m_outputStrides[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
+ m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
+ }
+ } else {
+ m_inputStrides[NumDims-1] = 1;
+ m_outputStrides[NumDims-1] = 1;
+ for (int i = NumDims-2; i >= 0; --i) {
+ m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
+ m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
+ }
+ }
+
+ if (input_dims[0] == 1) {
+ oneByN = true;
+ for (int i = 1; i < NumDims; ++i) {
+ if (m_broadcast[i] != 1) {
+ oneByN = false;
+ break;
+ }
+ }
+ } else if (input_dims[NumDims-1] == 1) {
+ nByOne = true;
+ for (int i = 0; i < NumDims-1; ++i) {
+ if (m_broadcast[i] != 1) {
+ nByOne = false;
+ break;
+ }
+ }
+ }
+
+ // Handle special format like NCHW, its input shape is '[1, N..., 1]' and
+ // broadcast shape is '[N, 1..., N]'
+ if (!oneByN && !nByOne) {
+ if (input_dims[0] == 1 && input_dims[NumDims-1] == 1 && NumDims > 2) {
+ nByOne = true;
+ oneByN = true;
+ for (int i = 1; i < NumDims-1; ++i) {
+ if (m_broadcast[i] != 1) {
+ nByOne = false;
+ oneByN = false;
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
+ m_impl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+
+#ifdef EIGEN_USE_THREADS
+ template <typename EvalSubExprsCallback>
+ EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+ EvaluatorPointerType, EvalSubExprsCallback done) {
+ m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
+ }
+#endif // EIGEN_USE_THREADS
+
+ EIGEN_STRONG_INLINE void cleanup() {
+ m_impl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const
+ {
+ if (internal::is_input_scalar<typename internal::remove_all<InputDimensions>::type>::value) {
+ return m_impl.coeff(0);
+ }
+
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ if (isCopy) {
+ return m_impl.coeff(index);
+ } else {
+ return coeffColMajor(index);
+ }
+ } else {
+ if (isCopy) {
+ return m_impl.coeff(index);
+ } else {
+ return coeffRowMajor(index);
+ }
+ }
+ }
+
+ // TODO: attempt to speed this up. The integer divisions and modulo are slow
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexColMajor(Index index) const {
+ Index inputIndex = 0;
+ EIGEN_UNROLL_LOOP
+ for (int i = NumDims - 1; i > 0; --i) {
+ const Index idx = index / m_outputStrides[i];
+ if (internal::index_statically_eq<Broadcast>(i, 1)) {
+ eigen_assert(idx < m_impl.dimensions()[i]);
+ inputIndex += idx * m_inputStrides[i];
+ } else {
+ if (internal::index_statically_eq<InputDimensions>(i, 1)) {
+ eigen_assert(idx % m_impl.dimensions()[i] == 0);
+ } else {
+ inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
+ }
+ }
+ index -= idx * m_outputStrides[i];
+ }
+ if (internal::index_statically_eq<Broadcast>(0, 1)) {
+ eigen_assert(index < m_impl.dimensions()[0]);
+ inputIndex += index;
+ } else {
+ if (internal::index_statically_eq<InputDimensions>(0, 1)) {
+ eigen_assert(index % m_impl.dimensions()[0] == 0);
+ } else {
+ inputIndex += (index % m_impl.dimensions()[0]);
+ }
+ }
+ return inputIndex;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const
+ {
+ return m_impl.coeff(indexColMajor(index));
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexRowMajor(Index index) const {
+ Index inputIndex = 0;
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < NumDims - 1; ++i) {
+ const Index idx = index / m_outputStrides[i];
+ if (internal::index_statically_eq<Broadcast>(i, 1)) {
+ eigen_assert(idx < m_impl.dimensions()[i]);
+ inputIndex += idx * m_inputStrides[i];
+ } else {
+ if (internal::index_statically_eq<InputDimensions>(i, 1)) {
+ eigen_assert(idx % m_impl.dimensions()[i] == 0);
+ } else {
+ inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
+ }
+ }
+ index -= idx * m_outputStrides[i];
+ }
+ if (internal::index_statically_eq<Broadcast>(NumDims - 1, 1)) {
+ eigen_assert(index < m_impl.dimensions()[NumDims - 1]);
+ inputIndex += index;
+ } else {
+ if (internal::index_statically_eq<InputDimensions>(NumDims - 1, 1)) {
+ eigen_assert(index % m_impl.dimensions()[NumDims - 1] == 0);
+ } else {
+ inputIndex += (index % m_impl.dimensions()[NumDims - 1]);
+ }
+ }
+ return inputIndex;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const
+ {
+ return m_impl.coeff(indexRowMajor(index));
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const
+ {
+ if (internal::is_input_scalar<typename internal::remove_all<InputDimensions>::type>::value) {
+ return internal::pset1<PacketReturnType>(m_impl.coeff(0));
+ }
+
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ if (isCopy) {
+ #ifdef EIGEN_GPU_COMPILE_PHASE
+ // See PR 437: on NVIDIA P100 and K20m we observed a x3-4 speed up by enforcing
+ // unaligned loads here. The reason is unclear though.
+ return m_impl.template packet<Unaligned>(index);
+ #else
+ return m_impl.template packet<LoadMode>(index);
+ #endif
+ } else if (oneByN && !nByOne) {
+ return packetNByOne<LoadMode>(index);
+ } else if (!oneByN && nByOne) {
+ return packetOneByN<LoadMode>(index);
+ } else if (oneByN && nByOne) {
+ return packetOneByNByOne<LoadMode>(index);
+ } else {
+ return packetColMajor<LoadMode>(index);
+ }
+ } else {
+ if (isCopy) {
+ #ifdef EIGEN_GPU_COMPILE_PHASE
+ // See above.
+ return m_impl.template packet<Unaligned>(index);
+ #else
+ return m_impl.template packet<LoadMode>(index);
+ #endif
+ } else if (oneByN && !nByOne) {
+ return packetOneByN<LoadMode>(index);
+ } else if (!oneByN && nByOne) {
+ return packetNByOne<LoadMode>(index);
+ } else if (oneByN && nByOne) {
+ return packetOneByNByOne<LoadMode>(index);
+ } else {
+ return packetRowMajor<LoadMode>(index);
+ }
+ }
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByNByOne
+ (Index index) const
+ {
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ Index startDim, endDim;
+ Index inputIndex, outputOffset, batchedIndex;
+
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ startDim = NumDims - 1;
+ endDim = 1;
+ } else {
+ startDim = 0;
+ endDim = NumDims - 2;
+ }
+
+ batchedIndex = index % m_outputStrides[startDim];
+ inputIndex = batchedIndex / m_outputStrides[endDim];
+ outputOffset = batchedIndex % m_outputStrides[endDim];
+
+ if (outputOffset + PacketSize <= m_outputStrides[endDim]) {
+ values[0] = m_impl.coeff(inputIndex);
+ return internal::pload1<PacketReturnType>(values);
+ } else {
+ EIGEN_UNROLL_LOOP
+ for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) {
+ if (outputOffset + cur < m_outputStrides[endDim]) {
+ values[i] = m_impl.coeff(inputIndex);
+ } else {
+ ++inputIndex;
+ inputIndex = (inputIndex == m_inputStrides[startDim] ? 0 : inputIndex);
+ values[i] = m_impl.coeff(inputIndex);
+ outputOffset = 0;
+ cur = 0;
+ }
+ }
+ return internal::pload<PacketReturnType>(values);
+ }
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByN(Index index) const
+ {
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+ Index dim, inputIndex;
+
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ dim = NumDims - 1;
+ } else {
+ dim = 0;
+ }
+
+ inputIndex = index % m_inputStrides[dim];
+ if (inputIndex + PacketSize <= m_inputStrides[dim]) {
+ return m_impl.template packet<Unaligned>(inputIndex);
+ } else {
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < PacketSize; ++i) {
+ if (inputIndex > m_inputStrides[dim]-1) {
+ inputIndex = 0;
+ }
+ values[i] = m_impl.coeff(inputIndex++);
+ }
+ return internal::pload<PacketReturnType>(values);
+ }
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetNByOne(Index index) const
+ {
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ Index dim, inputIndex, outputOffset;
+
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ dim = 1;
+ } else {
+ dim = NumDims - 2;
+ }
+
+ inputIndex = index / m_outputStrides[dim];
+ outputOffset = index % m_outputStrides[dim];
+ if (outputOffset + PacketSize <= m_outputStrides[dim]) {
+ values[0] = m_impl.coeff(inputIndex);
+ return internal::pload1<PacketReturnType>(values);
+ } else {
+ EIGEN_UNROLL_LOOP
+ for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) {
+ if (outputOffset + cur < m_outputStrides[dim]) {
+ values[i] = m_impl.coeff(inputIndex);
+ } else {
+ values[i] = m_impl.coeff(++inputIndex);
+ outputOffset = 0;
+ cur = 0;
+ }
+ }
+ return internal::pload<PacketReturnType>(values);
+ }
+ }
+
+ // Ignore the LoadMode and always use unaligned loads since we can't guarantee
+ // the alignment at compile time.
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
+ {
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+ const Index originalIndex = index;
+
+ Index inputIndex = 0;
+ EIGEN_UNROLL_LOOP
+ for (int i = NumDims - 1; i > 0; --i) {
+ const Index idx = index / m_outputStrides[i];
+ if (internal::index_statically_eq<Broadcast>(i, 1)) {
+ eigen_assert(idx < m_impl.dimensions()[i]);
+ inputIndex += idx * m_inputStrides[i];
+ } else {
+ if (internal::index_statically_eq<InputDimensions>(i, 1)) {
+ eigen_assert(idx % m_impl.dimensions()[i] == 0);
+ } else {
+ inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
+ }
+ }
+ index -= idx * m_outputStrides[i];
+ }
+ Index innermostLoc;
+ if (internal::index_statically_eq<Broadcast>(0, 1)) {
+ eigen_assert(index < m_impl.dimensions()[0]);
+ innermostLoc = index;
+ } else {
+ if (internal::index_statically_eq<InputDimensions>(0, 1)) {
+ eigen_assert(index % m_impl.dimensions()[0] == 0);
+ innermostLoc = 0;
+ } else {
+ innermostLoc = index % m_impl.dimensions()[0];
+ }
+ }
+ inputIndex += innermostLoc;
+
+ // Todo: this could be extended to the second dimension if we're not
+ // broadcasting alongside the first dimension, and so on.
+ if (innermostLoc + PacketSize <= m_impl.dimensions()[0]) {
+ return m_impl.template packet<Unaligned>(inputIndex);
+ } else {
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ values[0] = m_impl.coeff(inputIndex);
+ EIGEN_UNROLL_LOOP
+ for (int i = 1; i < PacketSize; ++i) {
+ if (innermostLoc + i < m_impl.dimensions()[0]) {
+ values[i] = m_impl.coeff(inputIndex+i);
+ } else {
+ values[i] = coeffColMajor(originalIndex+i);
+ }
+ }
+ PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+ return rslt;
+ }
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
+ {
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+ const Index originalIndex = index;
+
+ Index inputIndex = 0;
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < NumDims - 1; ++i) {
+ const Index idx = index / m_outputStrides[i];
+ if (internal::index_statically_eq<Broadcast>(i, 1)) {
+ eigen_assert(idx < m_impl.dimensions()[i]);
+ inputIndex += idx * m_inputStrides[i];
+ } else {
+ if (internal::index_statically_eq<InputDimensions>(i, 1)) {
+ eigen_assert(idx % m_impl.dimensions()[i] == 0);
+ } else {
+ inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
+ }
+ }
+ index -= idx * m_outputStrides[i];
+ }
+ Index innermostLoc;
+ if (internal::index_statically_eq<Broadcast>(NumDims-1, 1)) {
+ eigen_assert(index < m_impl.dimensions()[NumDims-1]);
+ innermostLoc = index;
+ } else {
+ if (internal::index_statically_eq<InputDimensions>(NumDims-1, 1)) {
+ eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0);
+ innermostLoc = 0;
+ } else {
+ innermostLoc = index % m_impl.dimensions()[NumDims-1];
+ }
+ }
+ inputIndex += innermostLoc;
+
+ // Todo: this could be extended to the second dimension if we're not
+ // broadcasting alongside the first dimension, and so on.
+ if (innermostLoc + PacketSize <= m_impl.dimensions()[NumDims-1]) {
+ return m_impl.template packet<Unaligned>(inputIndex);
+ } else {
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ values[0] = m_impl.coeff(inputIndex);
+ EIGEN_UNROLL_LOOP
+ for (int i = 1; i < PacketSize; ++i) {
+ if (innermostLoc + i < m_impl.dimensions()[NumDims-1]) {
+ values[i] = m_impl.coeff(inputIndex+i);
+ } else {
+ values[i] = coeffRowMajor(originalIndex+i);
+ }
+ }
+ PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+ return rslt;
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ double compute_cost = TensorOpCost::AddCost<Index>();
+ if (!isCopy && NumDims > 0) {
+ EIGEN_UNROLL_LOOP
+ for (int i = NumDims - 1; i > 0; --i) {
+ compute_cost += TensorOpCost::DivCost<Index>();
+ if (internal::index_statically_eq<Broadcast>(i, 1)) {
+ compute_cost +=
+ TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
+ } else {
+ if (!internal::index_statically_eq<InputDimensions>(i, 1)) {
+ compute_cost += TensorOpCost::MulCost<Index>() +
+ TensorOpCost::ModCost<Index>() +
+ TensorOpCost::AddCost<Index>();
+ }
+ }
+ compute_cost +=
+ TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
+ }
+ }
+ return m_impl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ internal::TensorBlockResourceRequirements getResourceRequirements() const {
+ // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large
+ // tensors. But this might need further tuning.
+ const size_t target_size = m_device.firstLevelCacheSize();
+ return internal::TensorBlockResourceRequirements::merge(
+ m_impl.getResourceRequirements(),
+ internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size));
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+ block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+ bool /*root_of_expr_ast*/ = false) const {
+ BlockBroadcastingParams params = blockBroadcastingParams(desc);
+
+ if (params.inner_dim_size == 0 || params.bcast_dim_size == 0) {
+ return emptyBlock();
+ }
+
+ // Prepare storage for the materialized broadcasting result.
+ const typename TensorBlock::Storage block_storage =
+ TensorBlock::prepareStorage(desc, scratch);
+ ScalarNoConst* materialized_output = block_storage.data();
+
+ // We potentially will need to materialize input blocks.
+ size_t materialized_input_size = 0;
+ ScalarNoConst* materialized_input = NULL;
+
+ // Initialize block broadcating iterator state for outer dimensions (outer
+ // with regard to bcast dimension). Dimension in this array are always in
+ // inner_most -> outer_most order (col major layout).
+ array<BlockBroadcastingIteratorState, NumDims> it;
+ int idx = 0;
+
+ for (int i = params.inner_dim_count + 1; i < NumDims; ++i) {
+ const Index dim = IsColMajor ? i : NumDims - 1 - i;
+ it[idx].size = params.output_dims[dim];
+ it[idx].count = 0;
+ it[idx].output_stride = m_outputStrides[dim];
+ it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
+ idx++;
+ }
+
+ // Write output into the beginning of `materialized_output`.
+ Index output_offset = 0;
+
+ // We will fill output block by broadcasting along the bcast dim, and
+ // iterating over outer dimension.
+ const Index output_size = NumDims == 0 ? 1 : params.output_dims.TotalSize();
+
+ for (Index num_output_coeffs = 0; num_output_coeffs < output_size;) {
+ ScalarNoConst* bcast_output = materialized_output + num_output_coeffs;
+ Index bcast_offset = desc.offset() + output_offset;
+
+ // Broadcast along the bcast dimension.
+ num_output_coeffs += BroadcastBlockAlongBcastDim(
+ params, bcast_offset, scratch, bcast_output, &materialized_input,
+ &materialized_input_size);
+
+ // Switch to the next outer dimension.
+ for (int j = 0; j < idx; ++j) {
+ if (++it[j].count < it[j].size) {
+ output_offset += it[j].output_stride;
+ break;
+ }
+ it[j].count = 0;
+ output_offset -= it[j].output_span;
+ }
+ }
+
+ return block_storage.AsTensorMaterializedBlock();
+ }
+
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+ const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+
+ Broadcast functor() const { return m_broadcast; }
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(
+ cl::sycl::handler& cgh) const {
+ m_impl.bind(cgh);
+ }
+#endif
+ private:
+ static const bool IsColMajor =
+ static_cast<int>(Layout) == static_cast<int>(ColMajor);
+
+ // We will build a general case block broadcasting on top of broadcasting
+ // primitive that will do broadcasting only for the inner dimension(s) along
+ // the first dimension smaller than the input size (it's called `bcast_dim`).
+ //
+ // Example:
+ // dim: 0 1 2 (ColMajor)
+ // input size: [9, 3, 6]
+ // block size: [9, 2, 6]
+ //
+ // We will compute broadcasted block by iterating over the outer dimensions
+ // before `bcast_dim` (only dimension `2` in this example) and computing
+ // broadcasts along the `bcast_dim` (dimension `1` in this example).
+
+ // BlockBroadcastingParams holds precomputed parameters for broadcasting a
+ // single block along the broadcasting dimension. Sizes and strides along the
+ // `bcast_dim` might be invalid, they will be adjusted later in
+ // `BroadcastBlockAlongBcastDim`.
+ struct BlockBroadcastingParams {
+ Dimensions input_dims; // input expression dimensions
+ Dimensions output_dims; // output block sizes
+ Dimensions output_strides; // output block strides
+
+ int inner_dim_count; // count inner dimensions matching in size
+ int bcast_dim; // broadcasting dimension index
+ Index bcast_dim_size; // broadcasting dimension size
+ Index inner_dim_size; // inner dimensions size
+
+ // Block sizes and strides for the input block where all dimensions before
+ // `bcast_dim` are equal to `1`.
+ Dimensions input_block_sizes;
+ Dimensions input_block_strides;
+
+ // Block sizes and strides for blocks with extra dimensions and strides `0`.
+ BroadcastDimensions bcast_block_sizes;
+ BroadcastDimensions bcast_block_strides;
+ BroadcastDimensions bcast_input_strides;
+ };
+
+ struct BlockBroadcastingIteratorState {
+ Index size;
+ Index count;
+ Index output_stride;
+ Index output_span;
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlockBroadcastingParams
+ blockBroadcastingParams(TensorBlockDesc& desc) const {
+ BlockBroadcastingParams params;
+
+ params.input_dims = Dimensions(m_impl.dimensions());
+
+ // Output block sizes and strides.
+ params.output_dims = desc.dimensions();
+ params.output_strides = internal::strides<Layout>(params.output_dims);
+
+ // Find the broadcasting dimension (first dimension with output size smaller
+ // that the input size).
+ params.bcast_dim = 0;
+ params.bcast_dim_size = 1;
+ params.inner_dim_size = 1;
+
+ // Count the number of inner dimensions that have the same size in the block
+ // and in the broadcast expression.
+ params.inner_dim_count = 0;
+
+ for (int i = 0; i < NumDims; ++i) {
+ const int dim = IsColMajor ? i : NumDims - i - 1;
+
+ if (params.output_dims[dim] == m_dimensions[dim]) {
+ params.inner_dim_size *= params.output_dims[dim];
+ ++params.inner_dim_count;
+ continue;
+ }
+
+ // First non-matching dimension is the broadcasting dimension.
+ eigen_assert(params.output_dims[dim] < m_dimensions[dim]);
+ params.bcast_dim = dim;
+ params.bcast_dim_size = params.output_dims[dim];
+ break;
+ }
+
+ // Calculate the input block size for looking into the input.
+ for (int i = 0; i < params.inner_dim_count; ++i) {
+ const int dim = IsColMajor ? i : NumDims - i - 1;
+ params.input_block_sizes[dim] = params.input_dims[dim];
+ }
+ for (int i = params.inner_dim_count; i < NumDims; ++i) {
+ const int dim = IsColMajor ? i : NumDims - i - 1;
+ params.input_block_sizes[dim] = 1;
+ }
+ params.input_block_strides =
+ internal::strides<Layout>(params.input_block_sizes);
+
+ // Broadcast with the 0-stride trick: Create 1 extra dim for each
+ // broadcast, set the input stride to 0.
+ //
+ // When ColMajor:
+ //
+ // - bcast_block_sizes:
+ // [d_0, b_0, d_1, b_1, ...]
+ //
+ // - bcast_block_strides:
+ // [output_block_strides[0], output_block_strides[0] * d_0,
+ // output_block_strides[1], output_block_strides[1] * d_1,
+ // ...]
+ //
+ // - bcast_input_strides:
+ // [input_block_strides[0], 0,
+ // input_block_strides[1], 0,
+ // ...].
+ //
+ for (int i = 0; i < params.inner_dim_count; ++i) {
+ const int dim = IsColMajor ? i : NumDims - i - 1;
+
+ const int copy_dim = IsColMajor ? 2 * i : 2 * NumDims - 2 * i - 1;
+ const int broadcast_dim = IsColMajor ? copy_dim + 1 : copy_dim - 1;
+
+ params.bcast_block_sizes[copy_dim] = params.input_dims[dim];
+ params.bcast_block_sizes[broadcast_dim] = m_broadcast[dim];
+ params.bcast_block_strides[copy_dim] = params.output_strides[dim];
+ params.bcast_block_strides[broadcast_dim] =
+ params.output_strides[dim] * params.input_dims[dim];
+ params.bcast_input_strides[copy_dim] = params.input_block_strides[dim];
+ params.bcast_input_strides[broadcast_dim] = 0;
+ }
+
+ for (int i = 2 * params.inner_dim_count; i < 2 * NumDims; ++i) {
+ const int dim = IsColMajor ? i : 2 * NumDims - i - 1;
+ params.bcast_block_sizes[dim] = 1;
+ params.bcast_block_strides[dim] = 0;
+ params.bcast_input_strides[dim] = 0;
+ }
+
+ return params;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock emptyBlock() const {
+ DSizes<Index, NumDims> dimensions;
+ for (int i = 0; i < NumDims; ++i) dimensions[i] = 0;
+ return TensorBlock(internal::TensorBlockKind::kView, NULL, dimensions);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlockAlongBcastDim(
+ BlockBroadcastingParams params, Index bcast_offset,
+ TensorBlockScratch& scratch, ScalarNoConst* materialized_output,
+ ScalarNoConst** materialized_input,
+ size_t* materialized_input_size) const {
+ if (params.bcast_dim_size == 1) {
+ // We just need one block read using the ready-set values above.
+ return BroadcastBlock(
+ params.input_block_sizes, params.input_block_strides,
+ params.bcast_block_sizes, params.bcast_block_strides,
+ params.bcast_input_strides, bcast_offset, 0, scratch,
+ materialized_output, materialized_input, materialized_input_size);
+
+ } else if (params.input_dims[params.bcast_dim] == 1) {
+ // Broadcast bcast dimension (< NumDims) by bcast_dim_size.
+ const int broadcast_bcast_dim =
+ IsColMajor ? 2 * params.inner_dim_count + 1
+ : 2 * NumDims - 2 * params.inner_dim_count - 2;
+
+ params.bcast_block_sizes[broadcast_bcast_dim] = params.bcast_dim_size;
+ params.bcast_input_strides[broadcast_bcast_dim] = 0;
+ params.bcast_block_strides[broadcast_bcast_dim] =
+ params.output_strides[params.bcast_dim];
+
+ return BroadcastBlock(
+ params.input_block_sizes, params.input_block_strides,
+ params.bcast_block_sizes, params.bcast_block_strides,
+ params.bcast_input_strides, bcast_offset, 0, scratch,
+ materialized_output, materialized_input, materialized_input_size);
+
+ } else {
+ // Keep track of the total number of the coefficients written to the
+ // output block.
+ Index num_output_coeffs = 0;
+
+ // The general case. Let's denote the output block as
+ //
+ // x[..., a:a+bcast_dim_size, :, ..., :]
+ //
+ // where a:a+bcast_dim_size is a slice on the bcast_dim dimension
+ // (< NumDims). We need to split the a:a+bcast_dim_size into possibly 3
+ // sub-blocks:
+ //
+ // (1) a:b, where b is the smallest multiple of
+ // input_dims[bcast_dim_start] in [a, a+bcast_dim_size].
+ //
+ // (2) b:c, where c is the largest multiple of input_dims[bcast_dim_start]
+ // in [a, a+bcast_dim_size].
+ //
+ // (3) c:a+bcast_dim_size .
+ //
+ // Or, when b and c do not exist, we just need to process the whole block
+ // together.
+
+ // Find a.
+ const Index bcast_dim_left_index =
+ bcast_offset / m_outputStrides[params.bcast_dim];
+
+ // Find b and c.
+ const Index input_bcast_dim_size = params.input_dims[params.bcast_dim];
+
+ // First multiple after a. This is b when <= bcast_dim_left_index +
+ // bcast_dim_size.
+ const Index first_multiple =
+ divup<Index>(bcast_dim_left_index, input_bcast_dim_size) *
+ input_bcast_dim_size;
+
+ if (first_multiple <= bcast_dim_left_index + params.bcast_dim_size) {
+ // b exists, so does c. Find it.
+ const Index last_multiple =
+ (bcast_dim_left_index + params.bcast_dim_size) /
+ input_bcast_dim_size * input_bcast_dim_size;
+ const int copy_bcast_dim =
+ IsColMajor ? 2 * params.inner_dim_count
+ : 2 * NumDims - 2 * params.inner_dim_count - 1;
+ const int broadcast_bcast_dim =
+ IsColMajor ? 2 * params.inner_dim_count + 1
+ : 2 * NumDims - 2 * params.inner_dim_count - 2;
+
+ if (first_multiple > bcast_dim_left_index) {
+ const Index head_size = first_multiple - bcast_dim_left_index;
+ params.input_block_sizes[params.bcast_dim] = head_size;
+ params.bcast_block_sizes[copy_bcast_dim] = head_size;
+ params.bcast_input_strides[copy_bcast_dim] =
+ params.input_block_strides[params.bcast_dim];
+ params.bcast_block_strides[copy_bcast_dim] =
+ params.output_strides[params.bcast_dim];
+ params.bcast_block_sizes[broadcast_bcast_dim] = 1;
+ params.bcast_input_strides[broadcast_bcast_dim] = 0;
+ params.bcast_block_strides[broadcast_bcast_dim] =
+ params.output_strides[params.bcast_dim] *
+ params.input_dims[params.bcast_dim];
+
+ num_output_coeffs += BroadcastBlock(
+ params.input_block_sizes, params.input_block_strides,
+ params.bcast_block_sizes, params.bcast_block_strides,
+ params.bcast_input_strides, bcast_offset, 0, scratch,
+ materialized_output, materialized_input, materialized_input_size);
+ }
+ if (first_multiple < last_multiple) {
+ params.input_block_sizes[params.bcast_dim] = input_bcast_dim_size;
+ params.bcast_block_sizes[copy_bcast_dim] = input_bcast_dim_size;
+ params.bcast_input_strides[copy_bcast_dim] =
+ params.input_block_strides[params.bcast_dim];
+ params.bcast_block_strides[copy_bcast_dim] =
+ params.output_strides[params.bcast_dim];
+ params.bcast_block_sizes[broadcast_bcast_dim] =
+ (last_multiple - first_multiple) / input_bcast_dim_size;
+ params.bcast_input_strides[broadcast_bcast_dim] = 0;
+ params.bcast_block_strides[broadcast_bcast_dim] =
+ params.output_strides[params.bcast_dim] *
+ params.input_dims[params.bcast_dim];
+ const Index offset = (first_multiple - bcast_dim_left_index) *
+ m_outputStrides[params.bcast_dim];
+
+ num_output_coeffs += BroadcastBlock(
+ params.input_block_sizes, params.input_block_strides,
+ params.bcast_block_sizes, params.bcast_block_strides,
+ params.bcast_input_strides, bcast_offset, offset, scratch,
+ materialized_output, materialized_input, materialized_input_size);
+ }
+ if (last_multiple < bcast_dim_left_index + params.bcast_dim_size) {
+ const Index tail_size =
+ bcast_dim_left_index + params.bcast_dim_size - last_multiple;
+ params.input_block_sizes[params.bcast_dim] = tail_size;
+ params.bcast_block_sizes[copy_bcast_dim] = tail_size;
+ params.bcast_input_strides[copy_bcast_dim] =
+ params.input_block_strides[params.bcast_dim];
+ params.bcast_block_strides[copy_bcast_dim] =
+ params.output_strides[params.bcast_dim];
+ params.bcast_block_sizes[broadcast_bcast_dim] = 1;
+ params.bcast_input_strides[broadcast_bcast_dim] = 0;
+ params.bcast_block_strides[broadcast_bcast_dim] =
+ params.output_strides[params.bcast_dim] *
+ params.input_dims[params.bcast_dim];
+ const Index offset = (last_multiple - bcast_dim_left_index) *
+ m_outputStrides[params.bcast_dim];
+
+ num_output_coeffs += BroadcastBlock(
+ params.input_block_sizes, params.input_block_strides,
+ params.bcast_block_sizes, params.bcast_block_strides,
+ params.bcast_input_strides, bcast_offset, offset, scratch,
+ materialized_output, materialized_input, materialized_input_size);
+ }
+ } else {
+ // b and c do not exist.
+ const int copy_bcast_dim =
+ IsColMajor ? 2 * params.inner_dim_count
+ : 2 * NumDims - 2 * params.inner_dim_count - 1;
+ params.input_block_sizes[params.bcast_dim] = params.bcast_dim_size;
+ params.bcast_block_sizes[copy_bcast_dim] = params.bcast_dim_size;
+ params.bcast_input_strides[copy_bcast_dim] =
+ params.input_block_strides[params.bcast_dim];
+ params.bcast_block_strides[copy_bcast_dim] =
+ params.output_strides[params.bcast_dim];
+
+ num_output_coeffs += BroadcastBlock(
+ params.input_block_sizes, params.input_block_strides,
+ params.bcast_block_sizes, params.bcast_block_strides,
+ params.bcast_input_strides, bcast_offset, 0, scratch,
+ materialized_output, materialized_input, materialized_input_size);
+ }
+
+ return num_output_coeffs;
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlock(
+ const Dimensions& input_block_sizes,
+ const Dimensions& input_block_strides,
+ const BroadcastDimensions& bcast_block_sizes,
+ const BroadcastDimensions& bcast_block_strides,
+ const BroadcastDimensions& bcast_input_strides, Index bcast_offset,
+ Index offset, TensorBlockScratch& scratch,
+ ScalarNoConst* materialized_output, ScalarNoConst** materialized_input,
+ size_t* materialized_input_size) const {
+ // ---------------------------------------------------------------------- //
+ // Tensor block descriptor for reading block from the input.
+ const Index input_offset = bcast_offset + offset;
+ TensorBlockDesc input_desc(
+ IsColMajor ? indexColMajor(input_offset) : indexRowMajor(input_offset),
+ input_block_sizes);
+
+ ArgTensorBlock input_block = m_impl.block(input_desc, scratch);
+
+ // ---------------------------------------------------------------------- //
+ // Materialize input block into a temporary memory buffer only if it's not
+ // already available in the arg block.
+ const ScalarNoConst* input_buffer = NULL;
+
+ if (input_block.data() != NULL) {
+ // Input block already has raw data, there is no need to materialize it.
+ input_buffer = input_block.data();
+
+ } else {
+ // Otherwise we have to do block assignment into a temporary buffer.
+
+ // Maybe reuse previously allocated buffer, or allocate a new one with a
+ // scratch allocator.
+ const size_t input_total_size = input_block_sizes.TotalSize();
+ if (*materialized_input == NULL ||
+ *materialized_input_size < input_total_size) {
+ *materialized_input_size = input_total_size;
+ void* mem = scratch.allocate(*materialized_input_size * sizeof(Scalar));
+ *materialized_input = static_cast<ScalarNoConst*>(mem);
+ }
+
+ typedef internal::TensorBlockAssignment<
+ ScalarNoConst, NumDims, typename ArgTensorBlock::XprType, Index>
+ TensorBlockAssignment;
+
+ TensorBlockAssignment::Run(
+ TensorBlockAssignment::target(input_block_sizes, input_block_strides,
+ *materialized_input),
+ input_block.expr());
+
+ input_buffer = *materialized_input;
+ }
+
+ // ---------------------------------------------------------------------- //
+ // Copy data from materialized input block to the materialized output, using
+ // given broadcast strides (strides with zeroes).
+ typedef internal::TensorBlockIO<ScalarNoConst, Index, 2 * NumDims, Layout>
+ TensorBlockIO;
+
+ typename TensorBlockIO::Src src(bcast_input_strides, input_buffer);
+ typename TensorBlockIO::Dst dst(bcast_block_sizes, bcast_block_strides,
+ materialized_output + offset);
+
+ return TensorBlockIO::Copy(dst, src);
+ }
+
+protected:
+ const Device EIGEN_DEVICE_REF m_device;
+ const typename internal::remove_reference<Broadcast>::type m_broadcast;
+ Dimensions m_dimensions;
+ array<Index, NumDims> m_outputStrides;
+ array<Index, NumDims> m_inputStrides;
+ TensorEvaluator<ArgType, Device> m_impl;
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorChipping.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorChipping.h
new file mode 100644
index 0000000..3764573
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorChipping.h
@@ -0,0 +1,518 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H
+
+namespace Eigen {
+
+/** \class TensorKChippingReshaping
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief A chip is a thin slice, corresponding to a column or a row in a 2-d tensor.
+ *
+ *
+ */
+
+namespace internal {
+template<DenseIndex DimId, typename XprType>
+struct traits<TensorChippingOp<DimId, XprType> > : public traits<XprType>
+{
+ typedef typename XprType::Scalar Scalar;
+ typedef traits<XprType> XprTraits;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprTraits::Index Index;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = XprTraits::NumDimensions - 1;
+ static const int Layout = XprTraits::Layout;
+ typedef typename XprTraits::PointerType PointerType;
+};
+
+template<DenseIndex DimId, typename XprType>
+struct eval<TensorChippingOp<DimId, XprType>, Eigen::Dense>
+{
+ typedef const TensorChippingOp<DimId, XprType> EIGEN_DEVICE_REF type;
+};
+
+template<DenseIndex DimId, typename XprType>
+struct nested<TensorChippingOp<DimId, XprType>, 1, typename eval<TensorChippingOp<DimId, XprType> >::type>
+{
+ typedef TensorChippingOp<DimId, XprType> type;
+};
+
+template <DenseIndex DimId>
+struct DimensionId
+{
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) {
+ EIGEN_UNUSED_VARIABLE(dim);
+ eigen_assert(dim == DimId);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const {
+ return DimId;
+ }
+};
+template <>
+struct DimensionId<Dynamic>
+{
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) : actual_dim(dim) {
+ eigen_assert(dim >= 0);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const {
+ return actual_dim;
+ }
+ private:
+ const DenseIndex actual_dim;
+};
+
+
+} // end namespace internal
+
+
+
+template<DenseIndex DimId, typename XprType>
+class TensorChippingOp : public TensorBase<TensorChippingOp<DimId, XprType> >
+{
+ public:
+ typedef TensorBase<TensorChippingOp<DimId, XprType> > Base;
+ typedef typename Eigen::internal::traits<TensorChippingOp>::Scalar Scalar;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename Eigen::internal::nested<TensorChippingOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorChippingOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorChippingOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset, const Index dim)
+ : m_xpr(expr), m_offset(offset), m_dim(dim) {
+ }
+
+ EIGEN_DEVICE_FUNC
+ const Index offset() const { return m_offset; }
+ EIGEN_DEVICE_FUNC
+ const Index dim() const { return m_dim.actualDim(); }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ expression() const { return m_xpr; }
+
+ EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorChippingOp)
+
+ protected:
+ typename XprType::Nested m_xpr;
+ const Index m_offset;
+ const internal::DimensionId<DimId> m_dim;
+};
+
+
+// Eval as rvalue
+template<DenseIndex DimId, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
+{
+ typedef TensorChippingOp<DimId, ArgType> XprType;
+ static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+ static const int NumDims = NumInputDims-1;
+ typedef typename XprType::Index Index;
+ typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+
+ enum {
+ // Alignment can't be guaranteed at compile time since it depends on the
+ // slice offsets.
+ IsAligned = false,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
+ // Chipping of outer-most dimension is a trivial operation, because we can
+ // read and write directly from the underlying tensor using single offset.
+ IsOuterChipping = (static_cast<int>(Layout) == ColMajor && DimId == NumInputDims - 1) ||
+ (static_cast<int>(Layout) == RowMajor && DimId == 0),
+ // Chipping inner-most dimension.
+ IsInnerChipping = (static_cast<int>(Layout) == ColMajor && DimId == 0) ||
+ (static_cast<int>(Layout) == RowMajor && DimId == NumInputDims - 1),
+ // Prefer block access if the underlying expression prefers it, otherwise
+ // only if chipping is not trivial.
+ PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess ||
+ !IsOuterChipping,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
+ };
+
+ typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+ typedef internal::TensorBlockDescriptor<NumInputDims, Index>
+ ArgTensorBlockDesc;
+ typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
+ ArgTensorBlock;
+
+ typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
+ Layout, Index>
+ TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device)
+ {
+ EIGEN_STATIC_ASSERT((NumInputDims >= 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ eigen_assert(NumInputDims > m_dim.actualDim());
+
+ const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+ eigen_assert(op.offset() < input_dims[m_dim.actualDim()]);
+
+ int j = 0;
+ for (int i = 0; i < NumInputDims; ++i) {
+ if (i != m_dim.actualDim()) {
+ m_dimensions[j] = input_dims[i];
+ ++j;
+ }
+ }
+
+ m_stride = 1;
+ m_inputStride = 1;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = 0; i < m_dim.actualDim(); ++i) {
+ m_stride *= input_dims[i];
+ m_inputStride *= input_dims[i];
+ }
+ } else {
+ for (int i = NumInputDims-1; i > m_dim.actualDim(); --i) {
+ m_stride *= input_dims[i];
+ m_inputStride *= input_dims[i];
+ }
+ }
+ m_inputStride *= input_dims[m_dim.actualDim()];
+ m_inputOffset = m_stride * op.offset();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
+ m_impl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+
+ EIGEN_STRONG_INLINE void cleanup() {
+ m_impl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ return m_impl.coeff(srcCoeff(index));
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ {
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+ if (isInnerChipping()) {
+ // m_stride is equal to 1, so let's avoid the integer division.
+ eigen_assert(m_stride == 1);
+ Index inputIndex = index * m_inputStride + m_inputOffset;
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < PacketSize; ++i) {
+ values[i] = m_impl.coeff(inputIndex);
+ inputIndex += m_inputStride;
+ }
+ PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+ return rslt;
+ } else if (isOuterChipping()) {
+ // m_stride is always greater than index, so let's avoid the integer division.
+ eigen_assert(m_stride > index);
+ return m_impl.template packet<LoadMode>(index + m_inputOffset);
+ } else {
+ const Index idx = index / m_stride;
+ const Index rem = index - idx * m_stride;
+ if (rem + PacketSize <= m_stride) {
+ Index inputIndex = idx * m_inputStride + m_inputOffset + rem;
+ return m_impl.template packet<LoadMode>(inputIndex);
+ } else {
+ // Cross the stride boundary. Fallback to slow path.
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < PacketSize; ++i) {
+ values[i] = coeff(index);
+ ++index;
+ }
+ PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+ return rslt;
+ }
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ double cost = 0;
+ if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) &&
+ m_dim.actualDim() == 0) ||
+ (static_cast<int>(Layout) == static_cast<int>(RowMajor) &&
+ m_dim.actualDim() == NumInputDims - 1)) {
+ cost += TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
+ } else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) &&
+ m_dim.actualDim() == NumInputDims - 1) ||
+ (static_cast<int>(Layout) == static_cast<int>(RowMajor) &&
+ m_dim.actualDim() == 0)) {
+ cost += TensorOpCost::AddCost<Index>();
+ } else {
+ cost += 3 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>() +
+ 3 * TensorOpCost::AddCost<Index>();
+ }
+
+ return m_impl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, cost, vectorized, PacketSize);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ internal::TensorBlockResourceRequirements getResourceRequirements() const {
+ const size_t target_size = m_device.lastLevelCacheSize();
+ return internal::TensorBlockResourceRequirements::merge(
+ internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size),
+ m_impl.getResourceRequirements());
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+ block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+ bool root_of_expr_ast = false) const {
+ const Index chip_dim = m_dim.actualDim();
+
+ DSizes<Index, NumInputDims> input_block_dims;
+ for (int i = 0; i < NumInputDims; ++i) {
+ input_block_dims[i]
+ = i < chip_dim ? desc.dimension(i)
+ : i > chip_dim ? desc.dimension(i - 1)
+ : 1;
+ }
+
+ ArgTensorBlockDesc arg_desc(srcCoeff(desc.offset()), input_block_dims);
+
+ // Try to reuse destination buffer for materializing argument block.
+ if (desc.HasDestinationBuffer()) {
+ DSizes<Index, NumInputDims> arg_destination_strides;
+ for (int i = 0; i < NumInputDims; ++i) {
+ arg_destination_strides[i]
+ = i < chip_dim ? desc.destination().strides()[i]
+ : i > chip_dim ? desc.destination().strides()[i - 1]
+ : 0; // for dimensions of size `1` stride should never be used.
+ }
+
+ arg_desc.template AddDestinationBuffer<Layout>(
+ desc.destination().template data<ScalarNoConst>(),
+ arg_destination_strides);
+ }
+
+ ArgTensorBlock arg_block = m_impl.block(arg_desc, scratch, root_of_expr_ast);
+ if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer();
+
+ if (arg_block.data() != NULL) {
+ // Forward argument block buffer if possible.
+ return TensorBlock(arg_block.kind(), arg_block.data(),
+ desc.dimensions());
+
+ } else {
+ // Assign argument block expression to a buffer.
+
+ // Prepare storage for the materialized chipping result.
+ const typename TensorBlock::Storage block_storage =
+ TensorBlock::prepareStorage(desc, scratch);
+
+ typedef internal::TensorBlockAssignment<
+ ScalarNoConst, NumInputDims, typename ArgTensorBlock::XprType, Index>
+ TensorBlockAssignment;
+
+ TensorBlockAssignment::Run(
+ TensorBlockAssignment::target(
+ arg_desc.dimensions(),
+ internal::strides<Layout>(arg_desc.dimensions()),
+ block_storage.data()),
+ arg_block.expr());
+
+ return block_storage.AsTensorMaterializedBlock();
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const {
+ typename Storage::Type result = constCast(m_impl.data());
+ if (isOuterChipping() && result) {
+ return result + m_inputOffset;
+ } else {
+ return NULL;
+ }
+ }
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_impl.bind(cgh);
+ }
+#endif
+
+ protected:
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
+ {
+ Index inputIndex;
+ if (isInnerChipping()) {
+ // m_stride is equal to 1, so let's avoid the integer division.
+ eigen_assert(m_stride == 1);
+ inputIndex = index * m_inputStride + m_inputOffset;
+ } else if (isOuterChipping()) {
+ // m_stride is always greater than index, so let's avoid the integer
+ // division.
+ eigen_assert(m_stride > index);
+ inputIndex = index + m_inputOffset;
+ } else {
+ const Index idx = index / m_stride;
+ inputIndex = idx * m_inputStride + m_inputOffset;
+ index -= idx * m_stride;
+ inputIndex += index;
+ }
+ return inputIndex;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isInnerChipping() const {
+ return IsInnerChipping ||
+ (static_cast<int>(Layout) == ColMajor && m_dim.actualDim() == 0) ||
+ (static_cast<int>(Layout) == RowMajor && m_dim.actualDim() == NumInputDims - 1);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isOuterChipping() const {
+ return IsOuterChipping ||
+ (static_cast<int>(Layout) == ColMajor && m_dim.actualDim() == NumInputDims-1) ||
+ (static_cast<int>(Layout) == RowMajor && m_dim.actualDim() == 0);
+ }
+
+ Dimensions m_dimensions;
+ Index m_stride;
+ Index m_inputOffset;
+ Index m_inputStride;
+ TensorEvaluator<ArgType, Device> m_impl;
+ const internal::DimensionId<DimId> m_dim;
+ const Device EIGEN_DEVICE_REF m_device;
+};
+
+
+// Eval as lvalue
+template<DenseIndex DimId, typename ArgType, typename Device>
+struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
+ : public TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
+{
+ typedef TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> Base;
+ typedef TensorChippingOp<DimId, ArgType> XprType;
+ static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+ static const int NumDims = NumInputDims-1;
+ typedef typename XprType::Index Index;
+ typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+
+ enum {
+ IsAligned = false,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = TensorEvaluator<ArgType, Device>::RawAccess,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ RawAccess = false
+ };
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : Base(op, device)
+ { }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
+ {
+ return this->m_impl.coeffRef(this->srcCoeff(index));
+ }
+
+ template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ void writePacket(Index index, const PacketReturnType& x)
+ {
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+
+ if (this->isInnerChipping()) {
+ // m_stride is equal to 1, so let's avoid the integer division.
+ eigen_assert(this->m_stride == 1);
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
+ Index inputIndex = index * this->m_inputStride + this->m_inputOffset;
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < PacketSize; ++i) {
+ this->m_impl.coeffRef(inputIndex) = values[i];
+ inputIndex += this->m_inputStride;
+ }
+ } else if (this->isOuterChipping()) {
+ // m_stride is always greater than index, so let's avoid the integer division.
+ eigen_assert(this->m_stride > index);
+ this->m_impl.template writePacket<StoreMode>(index + this->m_inputOffset, x);
+ } else {
+ const Index idx = index / this->m_stride;
+ const Index rem = index - idx * this->m_stride;
+ if (rem + PacketSize <= this->m_stride) {
+ const Index inputIndex = idx * this->m_inputStride + this->m_inputOffset + rem;
+ this->m_impl.template writePacket<StoreMode>(inputIndex, x);
+ } else {
+ // Cross stride boundary. Fallback to slow path.
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < PacketSize; ++i) {
+ this->coeffRef(index) = values[i];
+ ++index;
+ }
+ }
+ }
+ }
+
+ template <typename TensorBlock>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
+ const TensorBlockDesc& desc, const TensorBlock& block) {
+ assert(this->m_impl.data() != NULL);
+
+ const Index chip_dim = this->m_dim.actualDim();
+
+ DSizes<Index, NumInputDims> input_block_dims;
+ for (int i = 0; i < NumInputDims; ++i) {
+ input_block_dims[i] = i < chip_dim ? desc.dimension(i)
+ : i > chip_dim ? desc.dimension(i - 1)
+ : 1;
+ }
+
+ typedef TensorReshapingOp<const DSizes<Index, NumInputDims>,
+ const typename TensorBlock::XprType>
+ TensorBlockExpr;
+
+ typedef internal::TensorBlockAssignment<Scalar, NumInputDims,
+ TensorBlockExpr, Index>
+ TensorBlockAssign;
+
+ TensorBlockAssign::Run(
+ TensorBlockAssign::target(
+ input_block_dims,
+ internal::strides<Layout>(this->m_impl.dimensions()),
+ this->m_impl.data(), this->srcCoeff(desc.offset())),
+ block.expr().reshape(input_block_dims));
+ }
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorConcatenation.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorConcatenation.h
new file mode 100644
index 0000000..5235a8e
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorConcatenation.h
@@ -0,0 +1,377 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H
+
+namespace Eigen {
+
+/** \class TensorConcatenationOp
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor concatenation class.
+ *
+ *
+ */
+namespace internal {
+template<typename Axis, typename LhsXprType, typename RhsXprType>
+struct traits<TensorConcatenationOp<Axis, LhsXprType, RhsXprType> >
+{
+ // Type promotion to handle the case where the types of the lhs and the rhs are different.
+ typedef typename promote_storage_type<typename LhsXprType::Scalar,
+ typename RhsXprType::Scalar>::ret Scalar;
+ typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind,
+ typename traits<RhsXprType>::StorageKind>::ret StorageKind;
+ typedef typename promote_index_type<typename traits<LhsXprType>::Index,
+ typename traits<RhsXprType>::Index>::type Index;
+ typedef typename LhsXprType::Nested LhsNested;
+ typedef typename RhsXprType::Nested RhsNested;
+ typedef typename remove_reference<LhsNested>::type _LhsNested;
+ typedef typename remove_reference<RhsNested>::type _RhsNested;
+ static const int NumDimensions = traits<LhsXprType>::NumDimensions;
+ static const int Layout = traits<LhsXprType>::Layout;
+ enum { Flags = 0 };
+ typedef typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val,
+ typename traits<LhsXprType>::PointerType, typename traits<RhsXprType>::PointerType>::type PointerType;
+};
+
+template<typename Axis, typename LhsXprType, typename RhsXprType>
+struct eval<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, Eigen::Dense>
+{
+ typedef const TensorConcatenationOp<Axis, LhsXprType, RhsXprType>& type;
+};
+
+template<typename Axis, typename LhsXprType, typename RhsXprType>
+struct nested<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, 1, typename eval<TensorConcatenationOp<Axis, LhsXprType, RhsXprType> >::type>
+{
+ typedef TensorConcatenationOp<Axis, LhsXprType, RhsXprType> type;
+};
+
+} // end namespace internal
+
+
+template<typename Axis, typename LhsXprType, typename RhsXprType>
+class TensorConcatenationOp : public TensorBase<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, WriteAccessors>
+{
+ public:
+ typedef TensorBase<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, WriteAccessors> Base;
+ typedef typename internal::traits<TensorConcatenationOp>::Scalar Scalar;
+ typedef typename internal::traits<TensorConcatenationOp>::StorageKind StorageKind;
+ typedef typename internal::traits<TensorConcatenationOp>::Index Index;
+ typedef typename internal::nested<TensorConcatenationOp>::type Nested;
+ typedef typename internal::promote_storage_type<typename LhsXprType::CoeffReturnType,
+ typename RhsXprType::CoeffReturnType>::ret CoeffReturnType;
+ typedef typename NumTraits<Scalar>::Real RealScalar;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConcatenationOp(const LhsXprType& lhs, const RhsXprType& rhs, Axis axis)
+ : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_axis(axis) {}
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename LhsXprType::Nested>::type&
+ lhsExpression() const { return m_lhs_xpr; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename RhsXprType::Nested>::type&
+ rhsExpression() const { return m_rhs_xpr; }
+
+ EIGEN_DEVICE_FUNC const Axis& axis() const { return m_axis; }
+
+ EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorConcatenationOp)
+ protected:
+ typename LhsXprType::Nested m_lhs_xpr;
+ typename RhsXprType::Nested m_rhs_xpr;
+ const Axis m_axis;
+};
+
+
+// Eval as rvalue
+template<typename Axis, typename LeftArgType, typename RightArgType, typename Device>
+struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device>
+{
+ typedef TensorConcatenationOp<Axis, LeftArgType, RightArgType> XprType;
+ typedef typename XprType::Index Index;
+ static const int NumDims = internal::array_size<typename TensorEvaluator<LeftArgType, Device>::Dimensions>::value;
+ static const int RightNumDims = internal::array_size<typename TensorEvaluator<RightArgType, Device>::Dimensions>::value;
+ typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+ enum {
+ IsAligned = false,
+ PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess &&
+ TensorEvaluator<RightArgType, Device>::PacketAccess,
+ BlockAccess = false,
+ PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess ||
+ TensorEvaluator<RightArgType, Device>::PreferBlockAccess,
+ Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+ RawAccess = false
+ };
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis())
+ {
+ EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout) || NumDims == 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ EIGEN_STATIC_ASSERT((NumDims == RightNumDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+ eigen_assert(0 <= m_axis && m_axis < NumDims);
+ const Dimensions& lhs_dims = m_leftImpl.dimensions();
+ const Dimensions& rhs_dims = m_rightImpl.dimensions();
+ {
+ int i = 0;
+ for (; i < m_axis; ++i) {
+ eigen_assert(lhs_dims[i] > 0);
+ eigen_assert(lhs_dims[i] == rhs_dims[i]);
+ m_dimensions[i] = lhs_dims[i];
+ }
+ eigen_assert(lhs_dims[i] > 0); // Now i == m_axis.
+ eigen_assert(rhs_dims[i] > 0);
+ m_dimensions[i] = lhs_dims[i] + rhs_dims[i];
+ for (++i; i < NumDims; ++i) {
+ eigen_assert(lhs_dims[i] > 0);
+ eigen_assert(lhs_dims[i] == rhs_dims[i]);
+ m_dimensions[i] = lhs_dims[i];
+ }
+ }
+
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ m_leftStrides[0] = 1;
+ m_rightStrides[0] = 1;
+ m_outputStrides[0] = 1;
+
+ for (int j = 1; j < NumDims; ++j) {
+ m_leftStrides[j] = m_leftStrides[j-1] * lhs_dims[j-1];
+ m_rightStrides[j] = m_rightStrides[j-1] * rhs_dims[j-1];
+ m_outputStrides[j] = m_outputStrides[j-1] * m_dimensions[j-1];
+ }
+ } else {
+ m_leftStrides[NumDims - 1] = 1;
+ m_rightStrides[NumDims - 1] = 1;
+ m_outputStrides[NumDims - 1] = 1;
+
+ for (int j = NumDims - 2; j >= 0; --j) {
+ m_leftStrides[j] = m_leftStrides[j+1] * lhs_dims[j+1];
+ m_rightStrides[j] = m_rightStrides[j+1] * rhs_dims[j+1];
+ m_outputStrides[j] = m_outputStrides[j+1] * m_dimensions[j+1];
+ }
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+ // TODO(phli): Add short-circuit memcpy evaluation if underlying data are linear?
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType)
+ {
+ m_leftImpl.evalSubExprsIfNeeded(NULL);
+ m_rightImpl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+
+ EIGEN_STRONG_INLINE void cleanup()
+ {
+ m_leftImpl.cleanup();
+ m_rightImpl.cleanup();
+ }
+
+ // TODO(phli): attempt to speed this up. The integer divisions and modulo are slow.
+ // See CL/76180724 comments for more ideas.
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ // Collect dimension-wise indices (subs).
+ array<Index, NumDims> subs;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = NumDims - 1; i > 0; --i) {
+ subs[i] = index / m_outputStrides[i];
+ index -= subs[i] * m_outputStrides[i];
+ }
+ subs[0] = index;
+ } else {
+ for (int i = 0; i < NumDims - 1; ++i) {
+ subs[i] = index / m_outputStrides[i];
+ index -= subs[i] * m_outputStrides[i];
+ }
+ subs[NumDims - 1] = index;
+ }
+
+ const Dimensions& left_dims = m_leftImpl.dimensions();
+ if (subs[m_axis] < left_dims[m_axis]) {
+ Index left_index;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ left_index = subs[0];
+ EIGEN_UNROLL_LOOP
+ for (int i = 1; i < NumDims; ++i) {
+ left_index += (subs[i] % left_dims[i]) * m_leftStrides[i];
+ }
+ } else {
+ left_index = subs[NumDims - 1];
+ EIGEN_UNROLL_LOOP
+ for (int i = NumDims - 2; i >= 0; --i) {
+ left_index += (subs[i] % left_dims[i]) * m_leftStrides[i];
+ }
+ }
+ return m_leftImpl.coeff(left_index);
+ } else {
+ subs[m_axis] -= left_dims[m_axis];
+ const Dimensions& right_dims = m_rightImpl.dimensions();
+ Index right_index;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ right_index = subs[0];
+ EIGEN_UNROLL_LOOP
+ for (int i = 1; i < NumDims; ++i) {
+ right_index += (subs[i] % right_dims[i]) * m_rightStrides[i];
+ }
+ } else {
+ right_index = subs[NumDims - 1];
+ EIGEN_UNROLL_LOOP
+ for (int i = NumDims - 2; i >= 0; --i) {
+ right_index += (subs[i] % right_dims[i]) * m_rightStrides[i];
+ }
+ }
+ return m_rightImpl.coeff(right_index);
+ }
+ }
+
+ // TODO(phli): Add a real vectorization.
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ {
+ const int packetSize = PacketType<CoeffReturnType, Device>::size;
+ EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index + packetSize - 1 < dimensions().TotalSize());
+
+ EIGEN_ALIGN_MAX CoeffReturnType values[packetSize];
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < packetSize; ++i) {
+ values[i] = coeff(index+i);
+ }
+ PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+ return rslt;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ const double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
+ 2 * TensorOpCost::MulCost<Index>() +
+ TensorOpCost::DivCost<Index>() +
+ TensorOpCost::ModCost<Index>());
+ const double lhs_size = m_leftImpl.dimensions().TotalSize();
+ const double rhs_size = m_rightImpl.dimensions().TotalSize();
+ return (lhs_size / (lhs_size + rhs_size)) *
+ m_leftImpl.costPerCoeff(vectorized) +
+ (rhs_size / (lhs_size + rhs_size)) *
+ m_rightImpl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, compute_cost);
+ }
+
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+ #ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_leftImpl.bind(cgh);
+ m_rightImpl.bind(cgh);
+ }
+ #endif
+
+ protected:
+ Dimensions m_dimensions;
+ array<Index, NumDims> m_outputStrides;
+ array<Index, NumDims> m_leftStrides;
+ array<Index, NumDims> m_rightStrides;
+ TensorEvaluator<LeftArgType, Device> m_leftImpl;
+ TensorEvaluator<RightArgType, Device> m_rightImpl;
+ const Axis m_axis;
+};
+
+// Eval as lvalue
+template<typename Axis, typename LeftArgType, typename RightArgType, typename Device>
+ struct TensorEvaluator<TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device>
+ : public TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device>
+{
+ typedef TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device> Base;
+ typedef TensorConcatenationOp<Axis, LeftArgType, RightArgType> XprType;
+ typedef typename Base::Dimensions Dimensions;
+ enum {
+ IsAligned = false,
+ PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess &&
+ TensorEvaluator<RightArgType, Device>::PacketAccess,
+ BlockAccess = false,
+ PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess ||
+ TensorEvaluator<RightArgType, Device>::PreferBlockAccess,
+ Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+ RawAccess = false
+ };
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_STRONG_INLINE TensorEvaluator(XprType& op, const Device& device)
+ : Base(op, device)
+ {
+ EIGEN_STATIC_ASSERT((static_cast<int>(Layout) == static_cast<int>(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ }
+
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
+ {
+ // Collect dimension-wise indices (subs).
+ array<Index, Base::NumDims> subs;
+ for (int i = Base::NumDims - 1; i > 0; --i) {
+ subs[i] = index / this->m_outputStrides[i];
+ index -= subs[i] * this->m_outputStrides[i];
+ }
+ subs[0] = index;
+
+ const Dimensions& left_dims = this->m_leftImpl.dimensions();
+ if (subs[this->m_axis] < left_dims[this->m_axis]) {
+ Index left_index = subs[0];
+ for (int i = 1; i < Base::NumDims; ++i) {
+ left_index += (subs[i] % left_dims[i]) * this->m_leftStrides[i];
+ }
+ return this->m_leftImpl.coeffRef(left_index);
+ } else {
+ subs[this->m_axis] -= left_dims[this->m_axis];
+ const Dimensions& right_dims = this->m_rightImpl.dimensions();
+ Index right_index = subs[0];
+ for (int i = 1; i < Base::NumDims; ++i) {
+ right_index += (subs[i] % right_dims[i]) * this->m_rightStrides[i];
+ }
+ return this->m_rightImpl.coeffRef(right_index);
+ }
+ }
+
+ template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ void writePacket(Index index, const PacketReturnType& x)
+ {
+ const int packetSize = PacketType<CoeffReturnType, Device>::size;
+ EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index + packetSize - 1 < this->dimensions().TotalSize());
+
+ EIGEN_ALIGN_MAX CoeffReturnType values[packetSize];
+ internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
+ for (int i = 0; i < packetSize; ++i) {
+ coeffRef(index+i) = values[i];
+ }
+ }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorContraction.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorContraction.h
new file mode 100644
index 0000000..8b35f79
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorContraction.h
@@ -0,0 +1,1023 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H
+
+namespace Eigen {
+
+/** \class TensorContraction
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor contraction class.
+ *
+ *
+ */
+namespace internal {
+
+template<typename Dimensions, typename LhsXprType, typename RhsXprType, typename OutputKernelType>
+struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType> >
+{
+ // Type promotion to handle the case where the types of the lhs and the rhs are different.
+ typedef typename gebp_traits<typename remove_const<typename LhsXprType::Scalar>::type,
+ typename remove_const<typename RhsXprType::Scalar>::type>::ResScalar Scalar;
+
+ typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind,
+ typename traits<RhsXprType>::StorageKind>::ret StorageKind;
+ typedef typename promote_index_type<typename traits<LhsXprType>::Index,
+ typename traits<RhsXprType>::Index>::type Index;
+ typedef typename LhsXprType::Nested LhsNested;
+ typedef typename RhsXprType::Nested RhsNested;
+ typedef typename remove_reference<LhsNested>::type _LhsNested;
+ typedef typename remove_reference<RhsNested>::type _RhsNested;
+
+ // From NumDims below.
+ static const int NumDimensions = traits<LhsXprType>::NumDimensions + traits<RhsXprType>::NumDimensions - 2 * array_size<Dimensions>::value;
+ static const int Layout = traits<LhsXprType>::Layout;
+ typedef typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val,
+ typename traits<LhsXprType>::PointerType,
+ typename traits<RhsXprType>::PointerType>::type
+ PointerType;
+
+ enum {
+ Flags = 0
+ };
+};
+
+template<typename Dimensions, typename LhsXprType, typename RhsXprType, typename OutputKernelType>
+struct eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType>, Eigen::Dense>
+{
+ typedef const TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType>& type;
+};
+
+template<typename Dimensions, typename LhsXprType, typename RhsXprType, typename OutputKernelType>
+struct nested<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType>, 1, typename eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType> >::type>
+{
+ typedef TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType> type;
+};
+
+template<typename Indices_, typename LeftArgType_, typename RightArgType_, typename OutputKernelType_, typename Device_>
+struct traits<TensorEvaluator<const TensorContractionOp<Indices_, LeftArgType_, RightArgType_, OutputKernelType_>, Device_> > {
+ typedef Indices_ Indices;
+ typedef LeftArgType_ LeftArgType;
+ typedef RightArgType_ RightArgType;
+ typedef OutputKernelType_ OutputKernelType;
+ typedef Device_ Device;
+
+ // From NumDims below.
+ static const int NumDimensions = traits<LeftArgType_>::NumDimensions + traits<RightArgType_>::NumDimensions - 2 * array_size<Indices_>::value;
+};
+
+// Helper class to allocate and deallocate temporary memory for packed buffers.
+template <typename LhsScalar, typename RhsScalar>
+struct TensorContractionBlockMemAllocator {
+ typedef void* BlockMemHandle;
+
+ template <typename Device>
+ EIGEN_DEVICE_FUNC static BlockMemHandle allocate(Device& d, const Index bm,
+ const Index bk,
+ const Index bn,
+ LhsScalar** lhs_block,
+ RhsScalar** rhs_block) {
+ eigen_assert(lhs_block);
+ eigen_assert(rhs_block);
+ BlockSizes sz = ComputeLhsRhsBlockSizes(bm, bk, bn);
+ char* block_mem = static_cast<char*>(d.allocate(sz.lhs_size + sz.rhs_size));
+ eigen_assert(block_mem);
+ *lhs_block = reinterpret_cast<LhsScalar*>(block_mem);
+ *rhs_block = reinterpret_cast<RhsScalar*>(block_mem + sz.lhs_size);
+ return block_mem;
+ }
+
+ template <typename Device>
+ EIGEN_DEVICE_FUNC static BlockMemHandle allocateSlices(
+ Device& d, const Index bm, const Index bk, const Index bn,
+ const Index num_lhs, const Index num_rhs, const Index num_slices,
+ std::vector<LhsScalar*>* lhs_blocks,
+ std::vector<RhsScalar*>* rhs_blocks) {
+ eigen_assert(num_slices > 0);
+ eigen_assert(num_lhs >= 0 && num_rhs >= 0);
+ eigen_assert(num_lhs == 0 || lhs_blocks);
+ eigen_assert(num_rhs == 0 || rhs_blocks);
+ BlockSizes sz = ComputeLhsRhsBlockSizes(bm, bk, bn);
+ void* block_mem = d.allocate(
+ (num_lhs * sz.lhs_size + num_rhs * sz.rhs_size) * num_slices);
+ eigen_assert(block_mem);
+ char* mem = static_cast<char*>(block_mem);
+
+ for (Index x = 0; x < num_slices; x++) {
+ if (num_lhs > 0) lhs_blocks[x].resize(num_lhs);
+ for (Index m = 0; m < num_lhs; m++) {
+ lhs_blocks[x][m] = reinterpret_cast<LhsScalar*>(mem);
+ mem += sz.lhs_size;
+ }
+ if (num_rhs > 0) rhs_blocks[x].resize(num_rhs);
+ for (Index n = 0; n < num_rhs; n++) {
+ rhs_blocks[x][n] = reinterpret_cast<RhsScalar*>(mem);
+ mem += sz.rhs_size;
+ }
+ }
+
+ return block_mem;
+ }
+
+ template <typename Device>
+ EIGEN_DEVICE_FUNC static void deallocate(Device& d, BlockMemHandle handle) {
+ d.deallocate(handle);
+ }
+
+ private:
+ struct BlockSizes {
+ Index lhs_size;
+ Index rhs_size;
+ };
+ EIGEN_DEVICE_FUNC static BlockSizes ComputeLhsRhsBlockSizes(const Index bm,
+ const Index bk,
+ const Index bn) {
+ Index align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
+ BlockSizes sz;
+ sz.lhs_size = divup<Index>(bm * bk * sizeof(LhsScalar), align) * align;
+ sz.rhs_size = divup<Index>(bn * bk * sizeof(RhsScalar), align) * align;
+ return sz;
+ }
+};
+
+// WARNING: In this code we assume that Lhs and Rhs tensor expressions are in
+// ColMajor storage order. This property is guaranteed by the
+// TensorContractionOp evaluator. TensorContractionKernel specifies how we pack
+// blocks of Lhs and Rhs tensor expressions, and how we invoke matrix
+// multiplication for these blocks. Default tensor contraction uses
+// gemm_pack_rhs, gemm_pack_lhs and gebp_kernel from Eigen Core (see
+// GeneralBlocPanelKernel.h for details).
+//
+// By specializing contraction kernels we can use other low level libraries to
+// perform matrix multiplication, and still rely on Eigen contraction evaluator.
+// This also includes full support in TensorContractionThreadPool, assuming that
+// underlying gemm do not use it's own threading.
+//
+// - ResScalar/LhsScalar/RhsScalar - scalar type for the result of
+// multiplication, lhs tensor and rhs tensor respectively.
+//
+// - StorageIndex - index type for the tensor expressions. In practice almost
+// always is Eigen::Index.
+//
+// - OutputMapper provides access to the memory of the output matrix. In
+// practice it's always column major blas_data_mapper (it must be of ResScalar
+// type).
+//
+// - LhsMapper/RhsMapper similarly to blas_data_mapper provide a two dimensional
+// view into the Lhs/Rhs tensor expressions. In practice it's
+// TensorContractionInputMapper, or some specialization of it based on the
+// type of tensor expression (e.g. TensorImagePatchOp has optimized input
+// mapper).
+template <typename ResScalar, typename LhsScalar, typename RhsScalar,
+ typename StorageIndex, typename OutputMapper, typename LhsMapper,
+ typename RhsMapper>
+struct TensorContractionKernel {
+ // True if `invoke()` supports `beta` in `C <- alpha * A * B + beta * C`
+ // (otherwise beta should be always equal to 1).
+ enum { HasBeta = false };
+
+ EIGEN_DEVICE_FUNC
+ TensorContractionKernel(StorageIndex m_, StorageIndex k_, StorageIndex n_,
+ StorageIndex bm_, StorageIndex bk_, StorageIndex bn_)
+ : m(m_), k(k_), n(n_), bm(bm_), bk(bk_), bn(bn_) {}
+
+ // Pack blocks of Lhs and Rhs into contiguous blocks in memory.
+ typedef LhsScalar* LhsBlock;
+ typedef RhsScalar* RhsBlock;
+
+ // Packed Lhs/Rhs block memory allocator.
+ typedef TensorContractionBlockMemAllocator<LhsScalar, RhsScalar>
+ BlockMemAllocator;
+ typedef typename BlockMemAllocator::BlockMemHandle BlockMemHandle;
+
+ typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
+
+ typedef internal::gemm_pack_lhs<
+ LhsScalar, StorageIndex, typename LhsMapper::SubMapper, Traits::mr,
+ Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor>
+ LhsPacker;
+
+ typedef internal::gemm_pack_rhs<RhsScalar, StorageIndex,
+ typename RhsMapper::SubMapper, Traits::nr,
+ ColMajor>
+ RhsPacker;
+
+ typedef internal::gebp_kernel<LhsScalar, RhsScalar, StorageIndex,
+ OutputMapper, Traits::mr, Traits::nr,
+ /*ConjugateLhs*/ false, /*ConjugateRhs*/ false>
+ GebpKernel;
+
+ template <typename Device>
+ EIGEN_DEVICE_FUNC BlockMemHandle allocate(Device& d, LhsBlock* lhs_block,
+ RhsBlock* rhs_block) {
+ return BlockMemAllocator::allocate(d, bm, bk, bn, lhs_block, rhs_block);
+ }
+
+ template <typename Device>
+ EIGEN_DEVICE_FUNC BlockMemHandle allocateSlices(
+ Device& d, const StorageIndex num_lhs, const StorageIndex num_rhs,
+ const StorageIndex num_slices, std::vector<LhsBlock>* lhs_blocks,
+ std::vector<RhsBlock>* rhs_blocks) {
+ return BlockMemAllocator::allocateSlices(
+ d, bm, bk, bn, num_lhs, num_rhs, num_slices, lhs_blocks, rhs_blocks);
+ }
+
+ template <typename Device>
+ EIGEN_DEVICE_FUNC static void deallocate(Device& d, BlockMemHandle handle) {
+ BlockMemAllocator::deallocate(d, handle);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void packLhs(
+ LhsBlock* lhsBlock, const typename LhsMapper::SubMapper& data_mapper,
+ const StorageIndex depth, const StorageIndex rows) {
+ LhsPacker()(*lhsBlock, data_mapper, depth, rows, /*stride*/ 0,
+ /*offset*/ 0);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void packRhs(
+ RhsBlock* rhsBlock, const typename RhsMapper::SubMapper& data_mapper,
+ const StorageIndex depth, const StorageIndex cols) {
+ RhsPacker()(*rhsBlock, data_mapper, depth, cols);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void invoke(
+ const OutputMapper& output_mapper, const LhsBlock& lhsBlock,
+ const RhsBlock& rhsBlock, const StorageIndex rows,
+ const StorageIndex depth, const StorageIndex cols,
+ const ResScalar alpha, const ResScalar beta) {
+ // Default GEBP kernel does not support beta.
+ eigen_assert(beta == ResScalar(1));
+ static const int kComputeStrideFromBlockDimensions = -1;
+ GebpKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha,
+ /*strideA*/ kComputeStrideFromBlockDimensions,
+ /*strideB*/ kComputeStrideFromBlockDimensions,
+ /*offsetA*/ 0, /*offsetB*/ 0);
+ }
+
+ private:
+ // These are dimensions of the original Tensors, and selected block sizes. The
+ // actual block sizes passed to all function above might be smaller because of
+ // the partial blocks at the end.
+ const StorageIndex m;
+ const StorageIndex k;
+ const StorageIndex n;
+ const StorageIndex bm;
+ const StorageIndex bk;
+ const StorageIndex bn;
+};
+
+} // end namespace internal
+
+// Tensor contraction params that should enable to get from output matrix
+// 2-dimensional coordinates to the output tensor dimensions.
+struct TensorContractionParams {
+ // TensorContraction evaluator assumes that both tensors are in ColMajor
+ // layout, if tensors are in RowMajor evaluator swap lhs with rhs.
+ bool swapped_arguments;
+};
+
+// Output kernel allows to fuse operations into the tensor contraction.
+//
+// Examples:
+// 1. Elementwise Relu transformation following Conv2D.
+// 2. AddBias to the Conv2D output channels dimension.
+//
+// The NoOpOutputKernel implements an output kernel that does absolutely nothing.
+struct NoOpOutputKernel {
+ /**
+ * Tensor contraction evaluator calls this kernel after finishing each block
+ * of output matrix. Output blocks belong to the 2-dimensional output tensor.
+ *
+ * TensorContractionParams contains contraction dimensions information
+ * required to map output 2-d space into the expected output tensor space
+ * (potentially higher dimensional).
+ *
+ * \param[in] output_mapper Access to output tensor memory
+ * \param[in] params Tensor contraction parameters
+ * \param[in] i Index of a first row available through output_mapper
+ * \param[in] j Index of a first column available through output_mapper
+ * \param[in] num_rows Number of available rows
+ * \param[in] num_cols Number of available columns
+ */
+ template <typename Index, typename Scalar>
+ EIGEN_ALWAYS_INLINE void operator()(
+ const internal::blas_data_mapper<Scalar, Index, ColMajor>& output_mapper,
+ const TensorContractionParams& params, Index i,
+ Index j, Index num_rows, Index num_cols) const {
+ EIGEN_UNUSED_VARIABLE(output_mapper);
+ EIGEN_UNUSED_VARIABLE(params);
+ EIGEN_UNUSED_VARIABLE(i);
+ EIGEN_UNUSED_VARIABLE(j);
+ EIGEN_UNUSED_VARIABLE(num_rows);
+ EIGEN_UNUSED_VARIABLE(num_cols);
+ }
+};
+
+template<typename Indices, typename LhsXprType, typename RhsXprType, typename OutputKernelType = const NoOpOutputKernel>
+class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXprType, RhsXprType, OutputKernelType>, ReadOnlyAccessors>
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorContractionOp>::Scalar Scalar;
+ typedef typename internal::gebp_traits<typename LhsXprType::CoeffReturnType,
+ typename RhsXprType::CoeffReturnType>::ResScalar CoeffReturnType;
+ typedef typename Eigen::internal::nested<TensorContractionOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorContractionOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorContractionOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp(
+ const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims,
+ const OutputKernelType& output_kernel = OutputKernelType())
+ : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims),
+ m_output_kernel(output_kernel) {}
+
+ EIGEN_DEVICE_FUNC
+ const Indices& indices() const { return m_indices; }
+
+ /** \returns the nested expressions */
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename LhsXprType::Nested>::type&
+ lhsExpression() const { return m_lhs_xpr; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename RhsXprType::Nested>::type&
+ rhsExpression() const { return m_rhs_xpr; }
+
+ EIGEN_DEVICE_FUNC
+ const OutputKernelType& outputKernel() const { return m_output_kernel; }
+
+ protected:
+ typename LhsXprType::Nested m_lhs_xpr;
+ typename RhsXprType::Nested m_rhs_xpr;
+ const Indices m_indices;
+ const OutputKernelType m_output_kernel;
+};
+
+
+template<typename Derived>
+struct TensorContractionEvaluatorBase : internal::no_assignment_operator
+{
+ typedef typename internal::traits<Derived>::Indices Indices;
+ typedef typename internal::traits<Derived>::LeftArgType LeftArgType;
+ typedef typename internal::traits<Derived>::RightArgType RightArgType;
+ typedef typename internal::traits<Derived>::OutputKernelType OutputKernelType;
+ typedef typename internal::traits<Derived>::Device Device;
+
+ typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
+ typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+ typedef typename XprType::Index Index;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ typedef StorageMemory<Scalar, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+
+ enum {
+ IsAligned = true,
+ PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
+ BlockAccess = false,
+ PreferBlockAccess = false,
+ Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = true
+ };
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ // Most of the code is assuming that both input tensors are ColMajor. If the
+ // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
+ // If we want to compute A * B = C, where A is LHS and B is RHS, the code
+ // will pretend B is LHS and A is RHS.
+ typedef typename internal::conditional<
+ static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
+ typedef typename internal::conditional<
+ static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
+
+ typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluatorType;
+ typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluatorType;
+
+ static const int LDims =
+ internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
+ static const int RDims =
+ internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
+ static const int ContractDims = internal::array_size<Indices>::value;
+ static const int NumDims = LDims + RDims - 2 * ContractDims;
+
+ typedef array<Index, ContractDims> contract_t;
+ typedef array<Index, LDims - ContractDims> left_nocontract_t;
+ typedef array<Index, RDims - ContractDims> right_nocontract_t;
+
+ typedef DSizes<Index, NumDims> Dimensions;
+
+ EIGEN_STRONG_INLINE
+ TensorContractionEvaluatorBase(const XprType& op, const Device& device)
+ : m_leftImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
+ op.lhsExpression(), op.rhsExpression()), device),
+ m_rightImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
+ op.rhsExpression(), op.lhsExpression()), device),
+ m_device(device),
+ m_output_kernel(op.outputKernel()),
+ m_result(NULL) {
+ EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) ==
+ static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
+ YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+
+ DSizes<Index, LDims> eval_left_dims;
+ DSizes<Index, RDims> eval_right_dims;
+ array<IndexPair<Index>, ContractDims> eval_op_indices;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ // For ColMajor, we keep using the existing dimensions
+ for (int i = 0; i < LDims; i++) {
+ eval_left_dims[i] = m_leftImpl.dimensions()[i];
+ }
+ for (int i = 0; i < RDims; i++) {
+ eval_right_dims[i] = m_rightImpl.dimensions()[i];
+ }
+ // We keep the pairs of contracting indices.
+ for (int i = 0; i < ContractDims; i++) {
+ eval_op_indices[i].first = op.indices()[i].first;
+ eval_op_indices[i].second = op.indices()[i].second;
+ }
+ } else {
+ // For RowMajor, we need to reverse the existing dimensions
+ for (int i = 0; i < LDims; i++) {
+ eval_left_dims[i] = m_leftImpl.dimensions()[LDims - i - 1];
+ }
+ for (int i = 0; i < RDims; i++) {
+ eval_right_dims[i] = m_rightImpl.dimensions()[RDims - i - 1];
+ }
+ // We need to flip all the pairs of contracting indices as well as
+ // reversing the dimensions.
+ for (int i = 0; i < ContractDims; i++) {
+ eval_op_indices[i].first = LDims - 1 - op.indices()[ContractDims - 1 - i].second;
+ eval_op_indices[i].second = RDims - 1 - op.indices()[ContractDims - 1 - i].first;
+ }
+ }
+
+ // Check for duplicate axes and make sure the first index in eval_op_indices
+ // is increasing. Using O(n^2) sorting is OK since ContractDims is small
+ for (int i = 0; i < ContractDims; i++) {
+ for (int j = i + 1; j < ContractDims; j++) {
+ eigen_assert(eval_op_indices[j].first != eval_op_indices[i].first &&
+ eval_op_indices[j].second != eval_op_indices[i].second &&
+ "contraction axes should be unique");
+ if (eval_op_indices[j].first < eval_op_indices[i].first) {
+ numext::swap(eval_op_indices[j], eval_op_indices[i]);
+ }
+ }
+ }
+
+ array<Index, LDims> lhs_strides;
+ lhs_strides[0] = 1;
+ for (int i = 0; i < LDims-1; ++i) {
+ lhs_strides[i+1] = lhs_strides[i] * eval_left_dims[i];
+ }
+
+ array<Index, RDims> rhs_strides;
+ rhs_strides[0] = 1;
+ for (int i = 0; i < RDims-1; ++i) {
+ rhs_strides[i+1] = rhs_strides[i] * eval_right_dims[i];
+ }
+
+ if (m_i_strides.size() > 0) m_i_strides[0] = 1;
+ if (m_j_strides.size() > 0) m_j_strides[0] = 1;
+ if (m_k_strides.size() > 0) m_k_strides[0] = 1;
+
+ m_i_size = 1;
+ m_j_size = 1;
+ m_k_size = 1;
+
+ // To compute the dimension, we simply concatenate the non-contracting
+ // dimensions of the left and then the right tensor. Additionally, we also
+ // compute the strides corresponding to the left non-contracting
+ // dimensions and right non-contracting dimensions.
+ m_lhs_inner_dim_contiguous = true;
+ int dim_idx = 0;
+ Index nocontract_idx = 0;
+
+ for (int i = 0; i < LDims; i++) {
+ // find if we are contracting on index i of left tensor
+ bool contracting = false;
+ for (int j = 0; j < ContractDims; j++) {
+ if (eval_op_indices[j].first == i) {
+ contracting = true;
+ break;
+ }
+ }
+ if (!contracting) {
+ // add dimension size to output dimensions
+ m_dimensions[dim_idx] = eval_left_dims[i];
+ m_left_nocontract_strides[nocontract_idx] = lhs_strides[i];
+ if (dim_idx != i) {
+ m_lhs_inner_dim_contiguous = false;
+ }
+ if (nocontract_idx+1 < internal::array_size<left_nocontract_t>::value) {
+ m_i_strides[nocontract_idx+1] =
+ m_i_strides[nocontract_idx] * eval_left_dims[i];
+ } else {
+ m_i_size = m_i_strides[nocontract_idx] * eval_left_dims[i];
+ }
+ dim_idx++;
+ nocontract_idx++;
+ }
+ }
+
+ nocontract_idx = 0;
+ for (int i = 0; i < RDims; i++) {
+ bool contracting = false;
+ // find if we are contracting on index i of right tensor
+ for (int j = 0; j < ContractDims; j++) {
+ if (eval_op_indices[j].second == i) {
+ contracting = true;
+ break;
+ }
+ }
+ if (!contracting) {
+ m_dimensions[dim_idx] = eval_right_dims[i];
+ if (nocontract_idx+1 < internal::array_size<right_nocontract_t>::value) {
+ m_j_strides[nocontract_idx+1] =
+ m_j_strides[nocontract_idx] * eval_right_dims[i];
+ } else {
+ m_j_size = m_j_strides[nocontract_idx] * eval_right_dims[i];
+ }
+ m_right_nocontract_strides[nocontract_idx] = rhs_strides[i];
+ dim_idx++;
+ nocontract_idx++;
+ }
+ }
+
+ // Now compute the strides corresponding to the contracting dimensions. We
+ // assumed above that non-contracting axes are represented in the same order
+ // in the matrix as they are in the tensor. This is not the case for
+ // contracting axes. As the contracting axes must be of the same size in
+ // each tensor, we'll only look at the first tensor here.
+ m_rhs_inner_dim_contiguous = true;
+ m_rhs_inner_dim_reordered = false;
+ for (int i = 0; i < ContractDims; i++) {
+ Index left = eval_op_indices[i].first;
+ Index right = eval_op_indices[i].second;
+
+ Index size = eval_left_dims[left];
+ eigen_assert(size == eval_right_dims[right] &&
+ "Contraction axes must be same size");
+
+ if (i+1 < static_cast<int>(internal::array_size<contract_t>::value)) {
+ m_k_strides[i+1] = m_k_strides[i] * size;
+ } else {
+ m_k_size = m_k_strides[i] * size;
+ }
+ m_left_contracting_strides[i] = lhs_strides[left];
+ m_right_contracting_strides[i] = rhs_strides[right];
+
+ if (i > 0 && right < eval_op_indices[i-1].second) {
+ m_rhs_inner_dim_reordered = true;
+ }
+ if (right != i) {
+ m_rhs_inner_dim_contiguous = false;
+ }
+ }
+
+ // If the layout is RowMajor, we need to reverse the m_dimensions
+ if (static_cast<int>(Layout) == static_cast<int>(RowMajor)) {
+ for (int i = 0, j = NumDims - 1; i < j; i++, j--) {
+ numext::swap(m_dimensions[i], m_dimensions[j]);
+ }
+ }
+
+ // A set of parameters that will allow output kernel to get from output
+ // tensor dimensions (i, j) into the original tensor dimensions.
+ // TODO(ezhulenev): Add parameters required to infer output tensor index for
+ // more complex contractions than 2x2 on internal dimension.
+ m_tensor_contraction_params.swapped_arguments = static_cast<int>(Layout) == RowMajor;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+ m_leftImpl.evalSubExprsIfNeeded(NULL);
+ m_rightImpl.evalSubExprsIfNeeded(NULL);
+ if (data) {
+ evalTo(data);
+ return false;
+ } else {
+ m_result = static_cast<EvaluatorPointerType>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
+ evalTo(m_result);
+ return true;
+ }
+ }
+
+#ifdef EIGEN_USE_THREADS
+ template <typename EvalSubExprsCallback>
+ EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+ EvaluatorPointerType dest, EvalSubExprsCallback done) {
+ m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done, dest](bool) {
+ m_rightImpl.evalSubExprsIfNeededAsync(nullptr, [this, done, dest](bool) {
+ if (dest) {
+ evalToAsync(dest, [done]() { done(false); });
+ } else {
+ m_result = static_cast<EvaluatorPointerType>(
+ m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
+ evalToAsync(m_result, [done]() { done(true); });
+ }
+ });
+ });
+ }
+#endif // EIGEN_USE_THREADS
+
+#ifndef TENSOR_CONTRACTION_DISPATCH
+#define TENSOR_CONTRACTION_DISPATCH(METHOD, ALIGNMENT, ARGS) \
+ if (this->m_lhs_inner_dim_contiguous) { \
+ if (this->m_rhs_inner_dim_contiguous) { \
+ if (this->m_rhs_inner_dim_reordered) { \
+ METHOD<true, true, true, ALIGNMENT> ARGS; \
+ } else { \
+ METHOD<true, true, false, ALIGNMENT> ARGS; \
+ } \
+ } else { \
+ if (this->m_rhs_inner_dim_reordered) { \
+ METHOD<true, false, true, ALIGNMENT> ARGS; \
+ } else { \
+ METHOD<true, false, false, ALIGNMENT> ARGS; \
+ } \
+ } \
+ } else { \
+ if (this->m_rhs_inner_dim_contiguous) { \
+ if (this->m_rhs_inner_dim_reordered) { \
+ METHOD<false, true, true, ALIGNMENT> ARGS; \
+ } else { \
+ METHOD<false, true, false, ALIGNMENT> ARGS; \
+ } \
+ } else { \
+ if (this->m_rhs_inner_dim_reordered) { \
+ METHOD<false, false, true, ALIGNMENT> ARGS; \
+ } else { \
+ METHOD<false, false, false, ALIGNMENT> ARGS; \
+ } \
+ } \
+ }
+#endif
+
+#ifndef TENSOR_CONTRACTION_ASYNC_DISPATCH
+#define TENSOR_CONTRACTION_ASYNC_DISPATCH(METHOD, DONE, ALIGNMENT, ARGS, FN) \
+ if (this->m_lhs_inner_dim_contiguous) { \
+ if (this->m_rhs_inner_dim_contiguous) { \
+ if (this->m_rhs_inner_dim_reordered) { \
+ (new METHOD<DONE, true, true, true, ALIGNMENT> ARGS)->FN; \
+ } else { \
+ (new METHOD<DONE, true, true, false, ALIGNMENT> ARGS)->FN; \
+ } \
+ } else { \
+ if (this->m_rhs_inner_dim_reordered) { \
+ (new METHOD<DONE, true, false, true, ALIGNMENT> ARGS)->FN; \
+ } else { \
+ (new METHOD<DONE, true, false, false, ALIGNMENT> ARGS)->FN; \
+ } \
+ } \
+ } else { \
+ if (this->m_rhs_inner_dim_contiguous) { \
+ if (this->m_rhs_inner_dim_reordered) { \
+ (new METHOD<DONE, false, true, true, ALIGNMENT> ARGS)->FN; \
+ } else { \
+ (new METHOD<DONE, false, true, false, ALIGNMENT> ARGS)->FN; \
+ } \
+ } else { \
+ if (this->m_rhs_inner_dim_reordered) { \
+ (new METHOD<DONE, false, false, true, ALIGNMENT> ARGS)->FN; \
+ } else { \
+ (new METHOD<DONE, false, false, false, ALIGNMENT> ARGS)->FN; \
+ } \
+ } \
+ }
+#endif
+
+ EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const {
+ static_cast<const Derived*>(this)->template evalProduct<Unaligned>(buffer);
+ }
+
+#ifdef EIGEN_USE_THREADS
+ template <typename EvalToCallback>
+ void evalToAsync(Scalar* buffer, EvalToCallback done) const {
+ static_cast<const Derived*>(this)
+ ->template evalProductAsync<EvalToCallback, Unaligned>(buffer,
+ std::move(done));
+ }
+#endif // EIGEN_USE_THREADS
+
+ template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
+ bool rhs_inner_dim_reordered, int Alignment>
+ void evalProductSequential(Scalar* buffer) const {
+ if (this->m_j_size == 1) {
+ this->template evalGemv<lhs_inner_dim_contiguous,
+ rhs_inner_dim_contiguous, rhs_inner_dim_reordered,
+ Alignment>(buffer);
+ } else {
+ this->template evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous,
+ rhs_inner_dim_reordered, Alignment>(buffer);
+ }
+ }
+
+ template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+ #if !defined(EIGEN_HIPCC)
+ EIGEN_DEVICE_FUNC
+ #endif
+ void evalGemv(Scalar* buffer) const {
+ const Index rows = m_i_size;
+ const Index cols = m_k_size;
+
+ typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
+ typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
+ typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
+ typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
+ const Index lhs_packet_size = internal::unpacket_traits<typename LeftEvaluator::PacketReturnType>::size;
+ const Index rhs_packet_size = internal::unpacket_traits<typename RightEvaluator::PacketReturnType>::size;
+ const int lhs_alignment = LeftEvaluator::IsAligned ? Aligned : Unaligned;
+ const int rhs_alignment = RightEvaluator::IsAligned ? Aligned : Unaligned;
+ typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
+ LeftEvaluator, left_nocontract_t,
+ contract_t, lhs_packet_size,
+ lhs_inner_dim_contiguous,
+ false, lhs_alignment> LhsMapper;
+
+ typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
+ RightEvaluator, right_nocontract_t,
+ contract_t, rhs_packet_size,
+ rhs_inner_dim_contiguous,
+ rhs_inner_dim_reordered, rhs_alignment> RhsMapper;
+
+ LhsMapper lhs(m_leftImpl, m_left_nocontract_strides, m_i_strides,
+ m_left_contracting_strides, m_k_strides);
+ RhsMapper rhs(m_rightImpl, m_right_nocontract_strides, m_j_strides,
+ m_right_contracting_strides, m_k_strides);
+
+ const Scalar alpha(1);
+ const Index resIncr(1);
+
+ // zero out the result buffer (which must be of size at least rows * sizeof(Scalar)
+ m_device.memset(buffer, 0, rows * sizeof(Scalar));
+
+ internal::general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,false,RhsScalar,RhsMapper,false>::run(
+ rows, cols, lhs, rhs,
+ buffer, resIncr, alpha);
+
+ typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+ m_output_kernel(OutputMapper(buffer, rows), m_tensor_contraction_params,
+ static_cast<Index>(0), static_cast<Index>(0), rows,
+ static_cast<Index>(1));
+ }
+
+ template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+ #if !defined(EIGEN_HIPCC)
+ EIGEN_DEVICE_FUNC
+ #endif
+ void evalGemm(Scalar* buffer) const {
+ // columns in left side, rows in right side
+ const Index k = this->m_k_size;
+ this->template evalGemmPartial<lhs_inner_dim_contiguous,
+ rhs_inner_dim_contiguous,
+ rhs_inner_dim_reordered,
+ Alignment, true>(buffer, 0, k, 1);
+ }
+
+ template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
+ bool rhs_inner_dim_reordered, int Alignment>
+ EIGEN_DEVICE_FUNC void evalGemmPartialWithoutOutputKernel(
+ Scalar* buffer, Index k_start, Index k_end, int num_threads) const {
+ evalGemmPartial<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous,
+ rhs_inner_dim_reordered, Alignment,
+ /*use_output_kernel*/ false>(buffer, k_start, k_end,
+ num_threads);
+ }
+
+ template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment, bool use_output_kernel>
+ EIGEN_DEVICE_FUNC void evalGemmPartial(Scalar* buffer, Index k_start, Index k_end, int num_threads) const {
+ eigen_assert(k_end >= k_start && k_start >= 0 && k_end <= this->m_k_size);
+ // columns in slice on left side, rows on right side
+ const Index k_slice = k_end - k_start;
+
+ // rows in left side
+ const Index m = this->m_i_size;
+
+ // columns in right side
+ const Index n = this->m_j_size;
+
+ // define data mappers for Lhs and Rhs
+ typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
+ typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
+
+ typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
+ typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
+
+ const Index lhs_packet_size = internal::unpacket_traits<typename LeftEvaluator::PacketReturnType>::size;
+ const Index rhs_packet_size = internal::unpacket_traits<typename RightEvaluator::PacketReturnType>::size;
+
+ typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
+ LeftEvaluator, left_nocontract_t,
+ contract_t, lhs_packet_size,
+ lhs_inner_dim_contiguous,
+ false, Unaligned> LhsMapper;
+
+ typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
+ RightEvaluator, right_nocontract_t,
+ contract_t, rhs_packet_size,
+ rhs_inner_dim_contiguous,
+ rhs_inner_dim_reordered, Unaligned> RhsMapper;
+
+ typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+
+ typedef internal::TensorContractionKernel<
+ Scalar, LhsScalar, RhsScalar, Index, OutputMapper, LhsMapper, RhsMapper>
+ TensorContractionKernel;
+
+ // initialize data mappers
+ LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
+ this->m_left_contracting_strides, this->m_k_strides);
+
+ RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
+ this->m_right_contracting_strides, this->m_k_strides);
+
+ OutputMapper output(buffer, m);
+
+ // Sizes of the blocks to load in cache. See the Goto paper for details.
+ internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar,
+ Index, internal::ShardByCol>
+ blocking(k_slice, m, n, num_threads);
+ const Index kc = blocking.kc();
+ const Index mc = numext::mini(m, blocking.mc());
+ const Index nc = numext::mini(n, blocking.nc());
+
+ typedef typename TensorContractionKernel::LhsBlock LhsBlock;
+ typedef typename TensorContractionKernel::RhsBlock RhsBlock;
+
+ LhsBlock blockA;
+ RhsBlock blockB;
+
+ TensorContractionKernel kernel(m, k_slice, n, mc, kc, nc);
+
+ typedef typename TensorContractionKernel::BlockMemHandle BlockMemHandle;
+ const BlockMemHandle packed_mem =
+ kernel.allocate(this->m_device, &blockA, &blockB);
+
+ // If a contraction kernel does not support beta, explicitly initialize
+ // output buffer with zeroes.
+ if (!TensorContractionKernel::HasBeta) {
+ this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
+ }
+
+ for(Index i2=0; i2<m; i2+=mc)
+ {
+ const Index actual_mc = numext::mini(i2+mc,m)-i2;
+ for (Index k2 = k_start; k2 < k_end; k2 += kc) {
+ // make sure we don't overshoot right edge of left matrix, then pack vertical panel
+ const Index actual_kc = numext::mini(k2 + kc, k_end) - k2;
+ kernel.packLhs(&blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
+
+ // If kernel supports beta, there is no need to initialize output
+ // buffer with zeroes.
+ const Scalar alpha = Scalar(1);
+ const Scalar beta = (TensorContractionKernel::HasBeta && k2 == k_start)
+ ? Scalar(0)
+ : Scalar(1);
+
+ // series of horizontal blocks
+ for (Index j2 = 0; j2 < n; j2 += nc) {
+ // make sure we don't overshoot right edge of right matrix, then pack block
+ const Index actual_nc = numext::mini(j2 + nc, n) - j2;
+ kernel.packRhs(&blockB, rhs.getSubMapper(k2, j2), actual_kc,
+ actual_nc);
+
+ // call gebp (matrix kernel)
+ // The parameters here are copied from Eigen's GEMM implementation
+ const OutputMapper output_mapper = output.getSubMapper(i2, j2);
+ kernel.invoke(output_mapper, blockA, blockB, actual_mc, actual_kc,
+ actual_nc, alpha, beta);
+
+ // We are done with this [i2, j2] output block.
+ if (use_output_kernel && k2 + kc >= k_end) {
+ m_output_kernel(output_mapper, m_tensor_contraction_params, i2, j2,
+ actual_mc, actual_nc);
+ }
+ }
+ }
+ }
+
+ kernel.deallocate(this->m_device, packed_mem);
+ }
+
+ EIGEN_STRONG_INLINE void cleanup() {
+ m_leftImpl.cleanup();
+ m_rightImpl.cleanup();
+
+ if (m_result != NULL) {
+ m_device.deallocate(m_result);
+ m_result = NULL;
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+ return m_result[index];
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
+ return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+ return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_result; }
+
+protected:
+ Dimensions m_dimensions;
+
+ contract_t m_k_strides;
+ contract_t m_left_contracting_strides;
+ contract_t m_right_contracting_strides;
+
+ bool m_lhs_inner_dim_contiguous;
+ bool m_rhs_inner_dim_contiguous;
+ bool m_rhs_inner_dim_reordered;
+
+ left_nocontract_t m_i_strides;
+ right_nocontract_t m_j_strides;
+ left_nocontract_t m_left_nocontract_strides;
+ right_nocontract_t m_right_nocontract_strides;
+
+ Index m_i_size;
+ Index m_j_size;
+ Index m_k_size;
+
+ TensorContractionParams m_tensor_contraction_params;
+
+ TensorEvaluator<EvalLeftArgType, Device> m_leftImpl;
+ TensorEvaluator<EvalRightArgType, Device> m_rightImpl;
+ const Device EIGEN_DEVICE_REF m_device;
+ OutputKernelType m_output_kernel;
+ EvaluatorPointerType m_result;
+};
+
+
+// evaluator for default device
+template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType, typename Device>
+struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> :
+ public TensorContractionEvaluatorBase<
+ TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> > {
+ typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self;
+ typedef TensorContractionEvaluatorBase<Self> Base;
+
+ typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
+ typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+ typedef typename XprType::Index Index;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+
+ enum {
+ Layout = TensorEvaluator<LeftArgType, Device>::Layout
+ };
+
+ // Most of the code is assuming that both input tensors are ColMajor. If the
+ // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
+ // If we want to compute A * B = C, where A is LHS and B is RHS, the code
+ // will pretend B is LHS and A is RHS.
+ typedef typename internal::conditional<
+ static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
+ typedef typename internal::conditional<
+ static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
+
+ static const int LDims =
+ internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
+ static const int RDims =
+ internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
+ static const int ContractDims = internal::array_size<Indices>::value;
+
+ typedef array<Index, ContractDims> contract_t;
+ typedef array<Index, LDims - ContractDims> left_nocontract_t;
+ typedef array<Index, RDims - ContractDims> right_nocontract_t;
+
+ static const int NumDims = LDims + RDims - 2 * ContractDims;
+
+ // Could we use NumDimensions here?
+ typedef DSizes<Index, NumDims> Dimensions;
+
+ TensorEvaluator(const XprType& op, const Device& device) :
+ Base(op, device) { }
+
+ template <int Alignment>
+ void evalProduct(Scalar* buffer) const {
+ TENSOR_CONTRACTION_DISPATCH(this->template evalProductSequential, Alignment, (buffer));
+ }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionBlocking.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionBlocking.h
new file mode 100644
index 0000000..974feb0
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionBlocking.h
@@ -0,0 +1,73 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H
+
+
+namespace Eigen {
+namespace internal {
+
+enum {
+ ShardByRow = 0,
+ ShardByCol = 1
+};
+
+
+// Default Blocking Strategy
+template<typename ResScalar, typename LhsScalar, typename RhsScalar, typename StorageIndex, int ShardingType = ShardByCol>
+class TensorContractionBlocking {
+ public:
+
+ /*
+ adding EIGEN_DEVICE_FUNC unconditionally to 'TensorContractionBlocking' constructor in `TensorContractionBlocking.h`
+ requires adding EIGEN_DEVICE_FUNC to `computeProductBlockingSizes` in `GeneralBlockPanelKernel.h`
+ which in turn, requires adding EIGEN_DEVICE_FUNC to `evaluateProductBlockingSizesHeuristic` in `GeneralBlockPanelKernel.h`
+ which in turn, requires adding EIGEN_DEVICE_FUNC to `manage_caching_sizes` in `GeneralBlockPanelKernel.h`
+ (else HIPCC will error out)
+
+ However adding EIGEN_DEVICE_FUNC to `manage_caching_sizes` in `GeneralBlockPanelKernel.h`
+ results in NVCC erroring out with the following error
+
+ ../Eigen/src/Core/products/GeneralBlockPanelKernel.h(57): error #2901:
+ dynamic initialization is not supported for function-scope static variables within a __device__/__global__ function
+ */
+
+ #if !defined(EIGEN_HIPCC)
+ EIGEN_DEVICE_FUNC
+ #endif
+ TensorContractionBlocking(StorageIndex k, StorageIndex m, StorageIndex n, StorageIndex num_threads = 1) :
+ kc_(k), mc_(m), nc_(n)
+ {
+ if (ShardingType == ShardByCol) {
+ computeProductBlockingSizes<LhsScalar, RhsScalar, 1>(kc_, mc_, nc_, num_threads);
+ }
+ else {
+ computeProductBlockingSizes<LhsScalar, RhsScalar, 1>(kc_, nc_, mc_, num_threads);
+ }
+
+ const int rhs_packet_size = internal::packet_traits<RhsScalar>::size;
+ kc_ = (rhs_packet_size <= 8 || kc_ <= rhs_packet_size) ?
+ kc_ : (kc_ / rhs_packet_size) * rhs_packet_size;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex kc() const { return kc_; }
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex mc() const { return mc_; }
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex nc() const { return nc_; }
+
+ private:
+ StorageIndex kc_;
+ StorageIndex mc_;
+ StorageIndex nc_;
+};
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionCuda.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionCuda.h
new file mode 100644
index 0000000..3f315fe
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionCuda.h
@@ -0,0 +1,6 @@
+
+#if defined(__clang__) || defined(__GNUC__)
+#warning "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorContractionGpu.h file"
+#endif
+
+#include "TensorContractionGpu.h"
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionGpu.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionGpu.h
new file mode 100644
index 0000000..c818038
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionGpu.h
@@ -0,0 +1,1413 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014-2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2015 Navdeep Jaitly <ndjaitly@google.com>
+// Copyright (C) 2014 Eric Martin <eric@ericmart.in>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H
+
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
+
+namespace Eigen {
+
+template<typename Scalar, typename Index, typename LhsMapper,
+ typename RhsMapper, typename OutputMapper, bool needs_edge_check>
+__device__ EIGEN_STRONG_INLINE void
+EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
+ const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem,
+ const Index m_size, const Index n_size, const Index k_size) {
+
+ const Index m_block_idx = blockIdx.x;
+ const Index n_block_idx = blockIdx.y;
+
+ const Index base_m = 64 * m_block_idx;
+ const Index base_n = 64 * n_block_idx;
+
+ // declare and initialize 64 registers for output 8x8 block
+
+ // prefetch registers
+ Scalar lhs_pf0;
+ Scalar lhs_pf1;
+ Scalar lhs_pf2;
+ Scalar lhs_pf3;
+ Scalar lhs_pf4;
+ Scalar lhs_pf5;
+ Scalar lhs_pf6;
+ Scalar lhs_pf7;
+
+ Scalar rhs_pf0;
+ Scalar rhs_pf1;
+ Scalar rhs_pf2;
+ Scalar rhs_pf3;
+ Scalar rhs_pf4;
+ Scalar rhs_pf5;
+ Scalar rhs_pf6;
+ Scalar rhs_pf7;
+
+ // shared memory is formatted
+ // (contract idx in block, nocontract idx in block, block idx)
+ // where block idx is column major. This transposition limits the number of
+ // bank conflicts when reading the LHS. The core idea is that since the contracting
+ // index is shared by both sides, then the contracting index should be in threadIdx.x.
+
+ // On the LHS, we pad each row inside of each block with an extra element. This makes
+ // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts
+ // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks.
+
+ // On the RHS we just add 8 padding elements to the end of each block. This gives no bank
+ // conflicts on writes and also none on reads.
+
+ // storage indices
+ const Index lhs_store_idx_base = threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z;
+ const Index rhs_store_idx_base = threadIdx.y * 72 + threadIdx.z * 8 + threadIdx.x;
+
+ const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0;
+ const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1;
+ const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2;
+ const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3;
+ const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4;
+ const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5;
+ const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6;
+ const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7;
+
+ const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0;
+ const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1;
+ const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2;
+ const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3;
+ const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4;
+ const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5;
+ const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6;
+ const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7;
+
+ // in the loading code, the following variables are important:
+ // threadIdx.x: the vertical position in an 8x8 block
+ // threadIdx.y: the vertical index of the 8x8 block in the grid
+ // threadIdx.z: the horizontal position in an 8x8 block
+ // k: the horizontal index of the 8x8 block in the grid
+ //
+ // The k parameter is implicit (it was the loop counter for a loop that went
+ // from 0 to <8, but now that loop is unrolled in the below code.
+
+ const Index load_idx_vert = threadIdx.x + 8 * threadIdx.y;
+ const Index lhs_vert = base_m + load_idx_vert;
+
+#define prefetchIntoRegisters(base_k) \
+ { \
+ lhs_pf0 = conv(0); \
+ lhs_pf1 = conv(0); \
+ lhs_pf2 = conv(0); \
+ lhs_pf3 = conv(0); \
+ lhs_pf4 = conv(0); \
+ lhs_pf5 = conv(0); \
+ lhs_pf6 = conv(0); \
+ lhs_pf7 = conv(0); \
+ \
+ rhs_pf0 = conv(0); \
+ rhs_pf1 = conv(0); \
+ rhs_pf2 = conv(0); \
+ rhs_pf3 = conv(0); \
+ rhs_pf4 = conv(0); \
+ rhs_pf5 = conv(0); \
+ rhs_pf6 = conv(0); \
+ rhs_pf7 = conv(0); \
+ \
+ if (!needs_edge_check || lhs_vert < m_size) { \
+ const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8; \
+ const Index lhs_horiz_1 = base_k + threadIdx.z + 1 * 8; \
+ const Index lhs_horiz_2 = base_k + threadIdx.z + 2 * 8; \
+ const Index lhs_horiz_3 = base_k + threadIdx.z + 3 * 8; \
+ const Index lhs_horiz_4 = base_k + threadIdx.z + 4 * 8; \
+ const Index lhs_horiz_5 = base_k + threadIdx.z + 5 * 8; \
+ const Index lhs_horiz_6 = base_k + threadIdx.z + 6 * 8; \
+ const Index lhs_horiz_7 = base_k + threadIdx.z + 7 * 8; \
+ \
+ if (!needs_edge_check || lhs_horiz_7 < k_size) { \
+ lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
+ lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
+ lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
+ lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
+ lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
+ lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \
+ lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \
+ lhs_pf7 = lhs(lhs_vert, lhs_horiz_7); \
+ } else if (lhs_horiz_6 < k_size) { \
+ lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
+ lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
+ lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
+ lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
+ lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
+ lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \
+ lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \
+ } else if (lhs_horiz_5 < k_size) { \
+ lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
+ lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
+ lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
+ lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
+ lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
+ lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \
+ } else if (lhs_horiz_4 < k_size) { \
+ lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
+ lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
+ lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
+ lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
+ lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
+ } else if (lhs_horiz_3 < k_size) { \
+ lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
+ lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
+ lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
+ lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
+ } else if (lhs_horiz_2 < k_size) { \
+ lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
+ lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
+ lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
+ } else if (lhs_horiz_1 < k_size) { \
+ lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
+ lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
+ } else if (lhs_horiz_0 < k_size) { \
+ lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
+ } \
+ } \
+ \
+ const Index rhs_vert = base_k + load_idx_vert; \
+ if (!needs_edge_check || rhs_vert < k_size) { \
+ const Index rhs_horiz_0 = base_n + threadIdx.z + 0 * 8; \
+ const Index rhs_horiz_1 = base_n + threadIdx.z + 1 * 8; \
+ const Index rhs_horiz_2 = base_n + threadIdx.z + 2 * 8; \
+ const Index rhs_horiz_3 = base_n + threadIdx.z + 3 * 8; \
+ const Index rhs_horiz_4 = base_n + threadIdx.z + 4 * 8; \
+ const Index rhs_horiz_5 = base_n + threadIdx.z + 5 * 8; \
+ const Index rhs_horiz_6 = base_n + threadIdx.z + 6 * 8; \
+ const Index rhs_horiz_7 = base_n + threadIdx.z + 7 * 8; \
+ \
+ if (rhs_horiz_7 < n_size) { \
+ rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
+ rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
+ rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
+ rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
+ rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
+ rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \
+ rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \
+ rhs_pf7 = rhs(rhs_vert, rhs_horiz_7); \
+ } else if (rhs_horiz_6 < n_size) { \
+ rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
+ rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
+ rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
+ rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
+ rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
+ rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \
+ rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \
+ } else if (rhs_horiz_5 < n_size) { \
+ rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
+ rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
+ rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
+ rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
+ rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
+ rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \
+ } else if (rhs_horiz_4 < n_size) { \
+ rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
+ rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
+ rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
+ rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
+ rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
+ } else if (rhs_horiz_3 < n_size) { \
+ rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
+ rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
+ rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
+ rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
+ } else if (rhs_horiz_2 < n_size) { \
+ rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
+ rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
+ rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
+ } else if (rhs_horiz_1 < n_size) { \
+ rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
+ rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
+ } else if (rhs_horiz_0 < n_size) { \
+ rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
+ } \
+ } \
+ } \
+
+#define writeRegToShmem(_) \
+ lhs_shmem[lhs_store_idx_0] = lhs_pf0; \
+ rhs_shmem[rhs_store_idx_0] = rhs_pf0; \
+ \
+ lhs_shmem[lhs_store_idx_1] = lhs_pf1; \
+ rhs_shmem[rhs_store_idx_1] = rhs_pf1; \
+ \
+ lhs_shmem[lhs_store_idx_2] = lhs_pf2; \
+ rhs_shmem[rhs_store_idx_2] = rhs_pf2; \
+ \
+ lhs_shmem[lhs_store_idx_3] = lhs_pf3; \
+ rhs_shmem[rhs_store_idx_3] = rhs_pf3; \
+ \
+ lhs_shmem[lhs_store_idx_4] = lhs_pf4; \
+ rhs_shmem[rhs_store_idx_4] = rhs_pf4; \
+ \
+ lhs_shmem[lhs_store_idx_5] = lhs_pf5; \
+ rhs_shmem[rhs_store_idx_5] = rhs_pf5; \
+ \
+ lhs_shmem[lhs_store_idx_6] = lhs_pf6; \
+ rhs_shmem[rhs_store_idx_6] = rhs_pf6; \
+ \
+ lhs_shmem[lhs_store_idx_7] = lhs_pf7; \
+ rhs_shmem[rhs_store_idx_7] = rhs_pf7; \
+
+ // declare and initialize result array
+#define res(i, j) _res_##i##j
+#define initResultRow(i) \
+ Scalar res(i, 0) = conv(0); \
+ Scalar res(i, 1) = conv(0); \
+ Scalar res(i, 2) = conv(0); \
+ Scalar res(i, 3) = conv(0); \
+ Scalar res(i, 4) = conv(0); \
+ Scalar res(i, 5) = conv(0); \
+ Scalar res(i, 6) = conv(0); \
+ Scalar res(i, 7) = conv(0); \
+
+ internal::scalar_cast_op<int, Scalar> conv;
+ initResultRow(0);
+ initResultRow(1);
+ initResultRow(2);
+ initResultRow(3);
+ initResultRow(4);
+ initResultRow(5);
+ initResultRow(6);
+ initResultRow(7);
+#undef initResultRow
+
+ for (Index base_k = 0; base_k < k_size; base_k += 64) {
+ // wait for previous iteration to finish with shmem. Despite common sense,
+ // the code is a bit faster with this here then at bottom of loop
+ __syncthreads();
+
+ prefetchIntoRegisters(base_k);
+ writeRegToShmem();
+
+ #undef prefetchIntoRegisters
+ #undef writeRegToShmem
+
+ // wait for shared mem packing to be done before starting computation
+ __syncthreads();
+
+ // compute 8x8 matrix product by outer product. This involves packing one column
+ // of LHS and one row of RHS into registers (takes 16 registers).
+
+#define lcol(i) _lcol##i
+ Scalar lcol(0);
+ Scalar lcol(1);
+ Scalar lcol(2);
+ Scalar lcol(3);
+ Scalar lcol(4);
+ Scalar lcol(5);
+ Scalar lcol(6);
+ Scalar lcol(7);
+
+#define rrow(j) _rrow##j
+ Scalar rrow(0);
+ Scalar rrow(1);
+ Scalar rrow(2);
+ Scalar rrow(3);
+ Scalar rrow(4);
+ Scalar rrow(5);
+ Scalar rrow(6);
+ Scalar rrow(7);
+
+ // Now x corresponds to k, y to m, and z to n
+ const Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y];
+ const Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z];
+
+#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))]
+#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))]
+
+#define loadData(i, j) \
+ lcol(0) = lhs_element(0, j); \
+ rrow(0) = rhs_element(i, 0); \
+ lcol(1) = lhs_element(1, j); \
+ rrow(1) = rhs_element(i, 1); \
+ lcol(2) = lhs_element(2, j); \
+ rrow(2) = rhs_element(i, 2); \
+ lcol(3) = lhs_element(3, j); \
+ rrow(3) = rhs_element(i, 3); \
+ lcol(4) = lhs_element(4, j); \
+ rrow(4) = rhs_element(i, 4); \
+ lcol(5) = lhs_element(5, j); \
+ rrow(5) = rhs_element(i, 5); \
+ lcol(6) = lhs_element(6, j); \
+ rrow(6) = rhs_element(i, 6); \
+ lcol(7) = lhs_element(7, j); \
+ rrow(7) = rhs_element(i, 7); \
+
+#define computeCol(j) \
+ res(0, j) += lcol(0) * rrow(j); \
+ res(1, j) += lcol(1) * rrow(j); \
+ res(2, j) += lcol(2) * rrow(j); \
+ res(3, j) += lcol(3) * rrow(j); \
+ res(4, j) += lcol(4) * rrow(j); \
+ res(5, j) += lcol(5) * rrow(j); \
+ res(6, j) += lcol(6) * rrow(j); \
+ res(7, j) += lcol(7) * rrow(j); \
+
+#define computePass(i) \
+ loadData(i, i); \
+ \
+ computeCol(0); \
+ computeCol(1); \
+ computeCol(2); \
+ computeCol(3); \
+ computeCol(4); \
+ computeCol(5); \
+ computeCol(6); \
+ computeCol(7); \
+
+ computePass(0);
+ computePass(1);
+ computePass(2);
+ computePass(3);
+ computePass(4);
+ computePass(5);
+ computePass(6);
+ computePass(7);
+
+#undef lcol
+#undef rrow
+#undef lhs_element
+#undef rhs_element
+#undef loadData
+#undef computeCol
+#undef computePass
+ } // end loop over k
+
+ // we've now iterated over all of the large (ie width 64) k blocks and
+ // accumulated results in registers. At this point thread (x, y, z) contains
+ // the sum across all big k blocks of the product of little k block of index (x, y)
+ // with block of index (y, z). To compute the final output, we need to reduce
+ // the 8 threads over y by summation.
+#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000)
+#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
+#else
+#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask)
+#endif
+
+#define reduceRow(i, mask) \
+ shuffleInc(i, 0, mask); \
+ shuffleInc(i, 1, mask); \
+ shuffleInc(i, 2, mask); \
+ shuffleInc(i, 3, mask); \
+ shuffleInc(i, 4, mask); \
+ shuffleInc(i, 5, mask); \
+ shuffleInc(i, 6, mask); \
+ shuffleInc(i, 7, mask); \
+
+#define reduceMatrix(mask) \
+ reduceRow(0, mask); \
+ reduceRow(1, mask); \
+ reduceRow(2, mask); \
+ reduceRow(3, mask); \
+ reduceRow(4, mask); \
+ reduceRow(5, mask); \
+ reduceRow(6, mask); \
+ reduceRow(7, mask); \
+
+ // actually perform the reduction, now each thread of index (_, y, z)
+ // contains the correct values in its registers that belong in the output
+ // block
+ reduceMatrix(1);
+ reduceMatrix(2);
+ reduceMatrix(4);
+
+#undef shuffleInc
+#undef reduceRow
+#undef reduceMatrix
+
+ // now we need to copy the 64 values into main memory. We can't split work
+ // among threads because all variables are in registers. There's 2 ways
+ // to do this:
+ // (1) have 1 thread do 64 writes from registers into global memory
+ // (2) have 1 thread do 64 writes into shared memory, and then 8 threads
+ // each do 8 writes into global memory. We can just overwrite the shared
+ // memory from the problem we just solved.
+ // (2) is slightly faster than (1) due to less branching and more ILP
+
+ // TODO: won't yield much gain, but could just use currently unused shared mem
+ // and then we won't have to sync
+ // wait for shared mem to be out of use
+ __syncthreads();
+
+#define writeResultShmem(i, j) \
+ lhs_shmem[i + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j] = res(i, j); \
+
+#define writeRow(i) \
+ writeResultShmem(i, 0); \
+ writeResultShmem(i, 1); \
+ writeResultShmem(i, 2); \
+ writeResultShmem(i, 3); \
+ writeResultShmem(i, 4); \
+ writeResultShmem(i, 5); \
+ writeResultShmem(i, 6); \
+ writeResultShmem(i, 7); \
+
+ if (threadIdx.x == 0) {
+ writeRow(0);
+ writeRow(1);
+ writeRow(2);
+ writeRow(3);
+ writeRow(4);
+ writeRow(5);
+ writeRow(6);
+ writeRow(7);
+ }
+#undef writeResultShmem
+#undef writeRow
+
+ const int max_i_write = numext::mini((int)((m_size - base_m - threadIdx.y + 7) / 8), 8);
+ const int max_j_write = numext::mini((int)((n_size - base_n - threadIdx.z + 7) / 8), 8);
+
+ if (threadIdx.x < max_i_write) {
+ if (max_j_write == 8) {
+ // TODO: can i trade bank conflicts for coalesced writes?
+ Scalar val0 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 0];
+ Scalar val1 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 1];
+ Scalar val2 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 2];
+ Scalar val3 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 3];
+ Scalar val4 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 4];
+ Scalar val5 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 5];
+ Scalar val6 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 6];
+ Scalar val7 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 7];
+
+ output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 0) = val0;
+ output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 1) = val1;
+ output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 2) = val2;
+ output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 3) = val3;
+ output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 4) = val4;
+ output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 5) = val5;
+ output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 6) = val6;
+ output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 7) = val7;
+ } else {
+#pragma unroll 7
+ for (int j = 0; j < max_j_write; j++) {
+ Scalar val = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j];
+ output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * j) = val;
+ }
+ }
+ }
+#undef res
+}
+
+
+template<typename Scalar, typename Index, typename LhsMapper,
+ typename RhsMapper, typename OutputMapper>
+__global__ void
+#if defined(EIGEN_HIPCC)
+__launch_bounds__(512, 1)
+#else
+__launch_bounds__(512)
+#endif
+EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
+ const OutputMapper output,
+ const Index m_size, const Index n_size, const Index k_size) {
+ __shared__ Scalar lhs_shmem[72 * 64];
+ __shared__ Scalar rhs_shmem[72 * 64];
+
+ const Index m_block_idx = blockIdx.x;
+ const Index n_block_idx = blockIdx.y;
+
+ const Index base_m = 64 * m_block_idx;
+ const Index base_n = 64 * n_block_idx;
+
+ if (base_m + 63 < m_size && base_n + 63 < n_size) {
+ EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
+ } else {
+ EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
+ }
+}
+
+
+template<typename Index, typename LhsMapper,
+ typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
+ bool CHECK_RHS_BOUNDARY>
+__device__ __forceinline__ void
+EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs,
+ const OutputMapper output, float2 lhs_shmem2[][16],
+ float2 rhs_shmem2[][8], const Index m_size,
+ const Index n_size, const Index k_size,
+ const Index base_m, const Index base_n) {
+
+ // prefetch registers
+ float4 lhs_pf0, rhs_pf0;
+
+ float4 results[4];
+ for (int i=0; i < 4; i++) {
+ results[i].x = results[i].y = results[i].z = results[i].w = 0;
+ }
+
+#define prefetch_lhs(reg, row, col) \
+ if (!CHECK_LHS_BOUNDARY) { \
+ if (col < k_size) { \
+ reg =lhs.template loadPacket<float4,Unaligned>(row, col); \
+ } \
+ } else { \
+ if (col < k_size) { \
+ if (row + 3 < m_size) { \
+ reg =lhs.template loadPacket<float4,Unaligned>(row, col); \
+ } else if (row + 2 < m_size) { \
+ reg.x =lhs(row + 0, col); \
+ reg.y =lhs(row + 1, col); \
+ reg.z =lhs(row + 2, col); \
+ } else if (row + 1 < m_size) { \
+ reg.x =lhs(row + 0, col); \
+ reg.y =lhs(row + 1, col); \
+ } else if (row < m_size) { \
+ reg.x =lhs(row + 0, col); \
+ } \
+ } \
+ } \
+
+ Index lhs_vert = base_m+threadIdx.x*4;
+
+ for (Index k = 0; k < k_size; k += 16) {
+
+ lhs_pf0 = internal::pset1<float4>(0);
+ rhs_pf0 = internal::pset1<float4>(0);
+
+ Index lhs_horiz = threadIdx.y+k;
+ prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz)
+
+ Index rhs_vert = k+(threadIdx.x%4)*4;
+ Index rhs_horiz0 = (threadIdx.x>>2)+threadIdx.y*4+base_n;
+
+ if (!CHECK_RHS_BOUNDARY) {
+ if ((rhs_vert + 3) < k_size) {
+ // just CHECK_RHS_BOUNDARY
+ rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
+ } else if (rhs_vert + 2 < k_size) {
+ // just CHECK_RHS_BOUNDARY
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+ } else if (rhs_vert + 1 < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ } else if (rhs_vert < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ }
+ } else {
+ if (rhs_horiz0 < n_size) {
+ if ((rhs_vert + 3) < k_size) {
+ rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
+ } else if ((rhs_vert + 2) < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+ } else if ((rhs_vert + 1) < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ } else if (rhs_vert < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ }
+ }
+ }
+ float x1, x2 ;
+ // the following can be a bitwise operation..... some day.
+ if((threadIdx.x%8) < 4) {
+ x1 = rhs_pf0.y;
+ x2 = rhs_pf0.w;
+ } else {
+ x1 = rhs_pf0.x;
+ x2 = rhs_pf0.z;
+ }
+ #if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000)
+ x1 = __shfl_xor(x1, 4);
+ x2 = __shfl_xor(x2, 4);
+ #else
+ x1 = __shfl_xor_sync(0xFFFFFFFF, x1, 4);
+ x2 = __shfl_xor_sync(0xFFFFFFFF, x2, 4);
+ #endif
+ if((threadIdx.x%8) < 4) {
+ rhs_pf0.y = x1;
+ rhs_pf0.w = x2;
+ } else {
+ rhs_pf0.x = x1;
+ rhs_pf0.z = x2;
+ }
+
+ // We have 64 features.
+ // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1.
+ // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3.
+ // ...
+ // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63
+ // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1
+ // ...
+ rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2][threadIdx.x%8] = make_float2(rhs_pf0.x, rhs_pf0.y);
+ rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2+32][threadIdx.x%8] = make_float2(rhs_pf0.z, rhs_pf0.w);
+
+ // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61)
+ // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61)
+ // ...
+ // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61)
+ // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63)
+ // ...
+
+ lhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(lhs_pf0.x, lhs_pf0.y);
+ lhs_shmem2[threadIdx.y+16][threadIdx.x] = make_float2(lhs_pf0.z, lhs_pf0.w);
+
+
+#define add_vals(fl1, fl2, fr1, fr2)\
+ results[0].x += fl1.x * fr1.x;\
+ results[0].y += fl1.y * fr1.x;\
+ results[0].z += fl2.x * fr1.x;\
+ results[0].w += fl2.y * fr1.x;\
+\
+ results[1].x += fl1.x * fr1.y;\
+ results[1].y += fl1.y * fr1.y;\
+ results[1].z += fl2.x * fr1.y;\
+ results[1].w += fl2.y * fr1.y;\
+\
+ results[2].x += fl1.x * fr2.x;\
+ results[2].y += fl1.y * fr2.x;\
+ results[2].z += fl2.x * fr2.x;\
+ results[2].w += fl2.y * fr2.x;\
+\
+ results[3].x += fl1.x * fr2.y;\
+ results[3].y += fl1.y * fr2.y;\
+ results[3].z += fl2.x * fr2.y;\
+ results[3].w += fl2.y * fr2.y;\
+
+ __syncthreads();
+
+ // Do the multiplies.
+ #pragma unroll
+ for (int koff = 0; koff < 16; koff ++) {
+ // 32 x threads.
+ float2 fl1 = lhs_shmem2[koff][threadIdx.x];
+ float2 fl2 = lhs_shmem2[koff + 16][threadIdx.x];
+
+ int start_feature = threadIdx.y * 4;
+ float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
+ float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
+
+ add_vals(fl1, fl2, fr1, fr2)
+ }
+ __syncthreads();
+ }
+
+#undef prefetch_lhs
+#undef add_vals
+
+ Index horiz_base = threadIdx.y*4+base_n;
+ if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
+ for (int i = 0; i < 4; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }
+ } else if (!CHECK_RHS_BOUNDARY) {
+ // CHECK LHS
+ if (lhs_vert + 3 < m_size) {
+ for (int i = 0; i < 4; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }
+ } else if (lhs_vert + 2 < m_size) {
+ for (int i = 0; i < 4; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ }
+ } else if (lhs_vert + 1 < m_size) {
+ for (int i = 0; i < 4; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ }
+ } else if (lhs_vert < m_size) {
+ for (int i = 0; i < 4; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ }
+ }
+ } else if (!CHECK_LHS_BOUNDARY) {
+ // CHECK RHS
+ /*
+ int ncols_rem = fminf(n_size- horiz_base, 4);
+ for (int i = 0; i < ncols_rem; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }*/
+ for (int i = 0; i < 4; i++) {
+ if (horiz_base+i < n_size) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }
+ }
+ } else {
+ // CHECK both boundaries.
+ for (int i = 0; i < 4; i++) {
+ if (horiz_base+i < n_size) {
+ if (lhs_vert < m_size)
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ if (lhs_vert + 1 < m_size)
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ if (lhs_vert + 2 < m_size)
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ if (lhs_vert + 3 < m_size)
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }
+ }
+ }
+}
+
+
+template<typename Index, typename LhsMapper,
+ typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
+ bool CHECK_RHS_BOUNDARY>
+__device__ __forceinline__ void
+EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
+ const OutputMapper output, float2 lhs_shmem2[][32],
+ float2 rhs_shmem2[][8], const Index m_size,
+ const Index n_size, const Index k_size,
+ const Index base_m, const Index base_n) {
+
+ // prefetch registers
+ float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3;
+ float4 rhs_pf0, rhs_pf1;
+
+ float4 results[8];
+ for (int i=0; i < 8; i++) {
+ results[i].x = results[i].y = results[i].z = results[i].w = 0;
+ }
+
+ Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32;
+ for (Index k = 0; k < k_size; k += 32) {
+ lhs_pf0 = internal::pset1<float4>(0);
+ lhs_pf1 = internal::pset1<float4>(0);
+ lhs_pf2 = internal::pset1<float4>(0);
+ lhs_pf3 = internal::pset1<float4>(0);
+
+ rhs_pf0 = internal::pset1<float4>(0);
+ rhs_pf1 = internal::pset1<float4>(0);
+
+ if (!CHECK_LHS_BOUNDARY) {
+ if ((threadIdx.y/4+k+24) < k_size) {
+ lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+ lhs_pf3 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
+ } else if ((threadIdx.y/4+k+16) < k_size) {
+ lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+ } else if ((threadIdx.y/4+k+8) < k_size) {
+ lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ } else if ((threadIdx.y/4+k) < k_size) {
+ lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ }
+ } else {
+ // just CHECK_LHS_BOUNDARY
+ if (lhs_vert + 3 < m_size) {
+ if ((threadIdx.y/4+k+24) < k_size) {
+ lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+ lhs_pf3 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
+ } else if ((threadIdx.y/4+k+16) < k_size) {
+ lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+ } else if ((threadIdx.y/4+k+8) < k_size) {
+ lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+ } else if ((threadIdx.y/4+k) < k_size) {
+ lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
+ }
+ } else if (lhs_vert + 2 < m_size) {
+ if ((threadIdx.y/4+k+24) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+ lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+ lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
+ lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+ lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
+ lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16));
+ lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
+ lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24));
+ lhs_pf3.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+24));
+ } else if ((threadIdx.y/4+k+16) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+ lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+ lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
+ lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+ lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
+ lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16));
+ } else if ((threadIdx.y/4+k+8) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+ lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+ lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
+ } else if ((threadIdx.y/4+k) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+ lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
+ }
+ } else if (lhs_vert + 1 < m_size) {
+ if ((threadIdx.y/4+k+24) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+ lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+ lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
+ lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
+ lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24));
+ } else if ((threadIdx.y/4+k+16) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+ lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+ lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
+ } else if ((threadIdx.y/4+k+8) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+ lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+ } else if ((threadIdx.y/4+k) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+ lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+ }
+ } else if (lhs_vert < m_size) {
+ if ((threadIdx.y/4+k+24) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+ lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+ lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
+ } else if ((threadIdx.y/4+k+16) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+ lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+ } else if ((threadIdx.y/4+k+8) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+ lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+ } else if ((threadIdx.y/4+k) < k_size) {
+ lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+ }
+ }
+ }
+ __syncthreads();
+ Index rhs_vert = k+threadIdx.x*4;
+ Index rhs_horiz0 = threadIdx.y*2+base_n;
+ Index rhs_horiz1 = threadIdx.y*2+1+base_n;
+ if (!CHECK_RHS_BOUNDARY) {
+ if ((rhs_vert + 3) < k_size) {
+ // just CHECK_RHS_BOUNDARY
+ rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
+ rhs_pf1 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz1);
+ } else if (rhs_vert + 2 < k_size) {
+ // just CHECK_RHS_BOUNDARY
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+ rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+ rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+ rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
+ } else if (rhs_vert + 1 < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+ rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+ } else if (rhs_vert < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+ }
+ } else {
+ if (rhs_horiz1 < n_size) {
+ if ((rhs_vert + 3) < k_size) {
+ // just CHECK_RHS_BOUNDARY
+ rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
+ rhs_pf1 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz1);
+ } else if (rhs_vert + 2 < k_size) {
+ // just CHECK_RHS_BOUNDARY
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+ rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+ rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+ rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
+ } else if (k+threadIdx.x*4 + 1 < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+ rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+ } else if (k+threadIdx.x*4 < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+ }
+ } else if (rhs_horiz0 < n_size) {
+ if ((rhs_vert + 3) < k_size) {
+ // just CHECK_RHS_BOUNDARY
+ rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
+ } else if ((rhs_vert + 2) < k_size) {
+ // just CHECK_RHS_BOUNDARY
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+ } else if ((rhs_vert + 1) < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+ } else if (rhs_vert < k_size) {
+ rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+ }
+ }
+ }
+ __syncthreads();
+ // Loaded. Do computation
+ // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1.
+ // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3.
+ // ..
+ // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63
+ rhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(rhs_pf0.x, rhs_pf1.x);
+ // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1.
+ // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3.
+ // ..
+ rhs_shmem2[threadIdx.y+32][threadIdx.x] = make_float2(rhs_pf0.y, rhs_pf1.y);
+ // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1.
+ // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3.
+ rhs_shmem2[threadIdx.y+64][threadIdx.x] = make_float2(rhs_pf0.z, rhs_pf1.z);
+ // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1.
+ // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3.
+ rhs_shmem2[threadIdx.y+96][threadIdx.x] = make_float2(rhs_pf0.w, rhs_pf1.w);
+
+ // LHS.
+ // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125)
+ // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125)
+ // ...
+ // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127)
+ // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127)
+
+
+#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\
+ results[0].x += a_feat1.x * f1.x;\
+ results[1].x += a_feat1.x * f1.y;\
+ results[2].x += a_feat1.x * f2.x;\
+ results[3].x += a_feat1.x * f2.y;\
+ results[4].x += a_feat1.x * f3.x;\
+ results[5].x += a_feat1.x * f3.y;\
+ results[6].x += a_feat1.x * f4.x;\
+ results[7].x += a_feat1.x * f4.y;\
+\
+ results[0].y += a_feat1.y * f1.x;\
+ results[1].y += a_feat1.y * f1.y;\
+ results[2].y += a_feat1.y * f2.x;\
+ results[3].y += a_feat1.y * f2.y;\
+ results[4].y += a_feat1.y * f3.x;\
+ results[5].y += a_feat1.y * f3.y;\
+ results[6].y += a_feat1.y * f4.x;\
+ results[7].y += a_feat1.y * f4.y;\
+\
+ results[0].z += a_feat2.x * f1.x;\
+ results[1].z += a_feat2.x * f1.y;\
+ results[2].z += a_feat2.x * f2.x;\
+ results[3].z += a_feat2.x * f2.y;\
+ results[4].z += a_feat2.x * f3.x;\
+ results[5].z += a_feat2.x * f3.y;\
+ results[6].z += a_feat2.x * f4.x;\
+ results[7].z += a_feat2.x * f4.y;\
+\
+ results[0].w += a_feat2.y * f1.x;\
+ results[1].w += a_feat2.y * f1.y;\
+ results[2].w += a_feat2.y * f2.x;\
+ results[3].w += a_feat2.y * f2.y;\
+ results[4].w += a_feat2.y * f3.x;\
+ results[5].w += a_feat2.y * f3.y;\
+ results[6].w += a_feat2.y * f4.x;\
+ results[7].w += a_feat2.y * f4.y;\
+
+ lhs_shmem2[threadIdx.y/4][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y);
+ lhs_shmem2[threadIdx.y/4+8][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y);
+ lhs_shmem2[threadIdx.y/4+16][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.x, lhs_pf2.y);
+ lhs_shmem2[threadIdx.y/4+24][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y);
+
+ lhs_shmem2[threadIdx.y/4 + 32][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w);
+ lhs_shmem2[threadIdx.y/4 + 40][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w);
+ lhs_shmem2[threadIdx.y/4 + 48][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w);
+ lhs_shmem2[threadIdx.y/4 + 56][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w);
+
+ __syncthreads();
+
+ // Do the multiplies.
+ #pragma unroll
+ for (int koff = 0; koff < 32; koff ++) {
+ float2 a3 = lhs_shmem2[koff][threadIdx.x + (threadIdx.y % 4) * 8];
+ float2 a4 = lhs_shmem2[koff + 32][threadIdx.x + (threadIdx.y % 4) * 8];
+
+ // first feature is at (threadIdx.y/4) * 8 last is at start + 8.
+ int start_feature = (threadIdx.y / 4) * 8;
+
+ float2 br1 = rhs_shmem2[start_feature/2 + (koff % 4) * 32][koff/4];
+ float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4];
+ float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4];
+ float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4];
+
+ add_vals(a3, a4, br1, br2, br3, br4)
+ }
+ __syncthreads();
+ } // end loop over k
+
+ __syncthreads();
+ Index horiz_base = (threadIdx.y/4)*8+base_n;
+ if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
+ for (int i = 0; i < 8; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }
+ } else if (!CHECK_RHS_BOUNDARY) {
+ if (lhs_vert + 3 < m_size) {
+ for (int i = 0; i < 8; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }
+ } else if (lhs_vert + 2 < m_size) {
+ for (int i = 0; i < 8; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ }
+ } else if (lhs_vert + 1 < m_size) {
+ for (int i = 0; i < 8; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ }
+ } else if (lhs_vert < m_size) {
+ for (int i = 0; i < 8; i++) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ }
+ }
+ } else if (!CHECK_LHS_BOUNDARY) {
+ // CHECK BOUNDARY_B
+ for (int i = 0; i < 8; i++) {
+ if (horiz_base + i < n_size) {
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }
+ }
+ } else {
+ // CHECK both boundaries.
+ for (int i = 0; i < 8; i++) {
+ if (horiz_base + i < n_size) {
+ if (lhs_vert < m_size)
+ output(lhs_vert, horiz_base + i) = results[i].x;
+ if (lhs_vert + 1 < m_size)
+ output(lhs_vert + 1, horiz_base + i) = results[i].y;
+ if (lhs_vert + 2 < m_size)
+ output(lhs_vert + 2, horiz_base + i) = results[i].z;
+ if (lhs_vert + 3 < m_size)
+ output(lhs_vert + 3, horiz_base + i) = results[i].w;
+ }
+ }
+ }
+}
+
+
+template<typename Index, typename LhsMapper,
+ typename RhsMapper, typename OutputMapper>
+__global__ void
+#if defined(EIGEN_HIPCC)
+__launch_bounds__(256, 1)
+#else
+__launch_bounds__(256)
+#endif
+EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
+ const OutputMapper output,
+ const Index m_size, const Index n_size, const Index k_size) {
+ __shared__ float2 lhs_shmem[64*32];
+ __shared__ float2 rhs_shmem[128*8];
+
+ typedef float2 LHS_MEM[64][32];
+ typedef float2 RHS_MEM[128][8];
+
+ const Index m_block_idx = blockIdx.x;
+ const Index n_block_idx = blockIdx.y;
+
+ const Index base_m = 128 * m_block_idx;
+ const Index base_n = 64 * n_block_idx;
+
+ bool check_rhs = (base_n + 63) >= n_size;
+ bool check_lhs128 = (base_m + 127) >= m_size;
+
+ if (!check_rhs) {
+ if (!check_lhs128) {
+ // >= 128 rows left
+ EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(
+ lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+ } else {
+ EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(
+ lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+ }
+ } else {
+ if (!check_lhs128) {
+ // >= 128 rows left
+ EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(
+ lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+ } else {
+ EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(
+ lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+ }
+ }
+}
+
+template<typename Index, typename LhsMapper,
+ typename RhsMapper, typename OutputMapper>
+__global__ void
+#if defined(EIGEN_HIPCC)
+__launch_bounds__(256, 1)
+#else
+__launch_bounds__(256)
+#endif
+EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs,
+ const OutputMapper output,
+ const Index m_size, const Index n_size, const Index k_size) {
+ __shared__ float2 lhs_shmem[32][16];
+ __shared__ float2 rhs_shmem[64][8];
+
+ const Index m_block_idx = blockIdx.x;
+ const Index n_block_idx = blockIdx.y;
+
+ const Index base_m = 64 * m_block_idx;
+ const Index base_n = 64 * n_block_idx;
+
+ if (base_m + 63 < m_size) {
+ if (base_n + 63 < n_size) {
+ EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+ } else {
+ EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+ }
+ } else {
+ if (base_n + 63 < n_size) {
+ EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+ } else {
+ EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+ }
+ }
+}
+
+
+template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
+struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, GpuDevice> :
+ public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, GpuDevice> > {
+
+ typedef GpuDevice Device;
+
+ typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self;
+ typedef TensorContractionEvaluatorBase<Self> Base;
+
+ typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
+ typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+ typedef typename XprType::Index Index;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
+
+ enum {
+ Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+ };
+
+ // Most of the code is assuming that both input tensors are ColMajor. If the
+ // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
+ // If we want to compute A * B = C, where A is LHS and B is RHS, the code
+ // will pretend B is LHS and A is RHS.
+ typedef typename internal::conditional<
+ static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
+ typedef typename internal::conditional<
+ static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
+
+ static const int LDims =
+ internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
+ static const int RDims =
+ internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
+ static const int ContractDims = internal::array_size<Indices>::value;
+
+ typedef array<Index, LDims> left_dim_mapper_t;
+ typedef array<Index, RDims> right_dim_mapper_t;
+
+ typedef array<Index, ContractDims> contract_t;
+ typedef array<Index, LDims - ContractDims> left_nocontract_t;
+ typedef array<Index, RDims - ContractDims> right_nocontract_t;
+
+ static const int NumDims = LDims + RDims - 2 * ContractDims;
+
+ typedef DSizes<Index, NumDims> Dimensions;
+
+ // typedefs needed in evalTo
+ typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
+ typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
+
+ typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
+ typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
+
+ typedef typename LeftEvaluator::Dimensions LeftDimensions;
+ typedef typename RightEvaluator::Dimensions RightDimensions;
+
+ TensorEvaluator(const XprType& op, const Device& device) :
+ Base(op, device)
+ {
+ EIGEN_STATIC_ASSERT( (internal::is_same<OutputKernelType, const NoOpOutputKernel>::value),
+ GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS);
+ }
+
+ // We need to redefine this method to make nvcc happy
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+ this->m_leftImpl.evalSubExprsIfNeeded(NULL);
+ this->m_rightImpl.evalSubExprsIfNeeded(NULL);
+ if (data) {
+ evalTo(data);
+ return false;
+ } else {
+ this->m_result = static_cast<Scalar *>(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar)));
+ evalTo(this->m_result);
+ return true;
+ }
+ }
+
+ void evalTo(Scalar* buffer) const {
+ if (this->m_lhs_inner_dim_contiguous) {
+ if (this->m_rhs_inner_dim_contiguous) {
+ if (this->m_rhs_inner_dim_reordered) {
+ evalTyped<true, true, true, Unaligned>(buffer);
+ }
+ else {
+ evalTyped<true, true, false, Unaligned>(buffer);
+ }
+ }
+ else {
+ if (this->m_rhs_inner_dim_reordered) {
+ evalTyped<true, false, true, Unaligned>(buffer);
+ }
+ else {
+ evalTyped<true, false, false, Unaligned>(buffer);
+ }
+ }
+ }
+ else {
+ if (this->m_rhs_inner_dim_contiguous) {
+ if (this->m_rhs_inner_dim_reordered) {
+ evalTyped<false, true, true, Unaligned>(buffer);
+ }
+ else {
+ evalTyped<false, true, false, Unaligned>(buffer);
+ }
+ }
+ else {
+ if (this->m_rhs_inner_dim_reordered) {
+ evalTyped<false, false, true, Unaligned>(buffer);
+ }
+ else {
+ evalTyped<false, false, false, Unaligned>(buffer);
+ }
+ }
+ }
+ }
+
+ template <typename LhsScalar, typename RhsScalar, typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels {
+ static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
+ const Index m_blocks = (m + 63) / 64;
+ const Index n_blocks = (n + 63) / 64;
+ const dim3 num_blocks(m_blocks, n_blocks, 1);
+ const dim3 block_size(8, 8, 8);
+ LAUNCH_GPU_KERNEL((EigenContractionKernel<Scalar, Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
+ }
+ };
+
+ template <typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels<float, float, Index, LhsMapper, RhsMapper, OutputMapper> {
+ static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
+ if (m < 768 || n < 768) {
+ const Index m_blocks = (m + 63) / 64;
+ const Index n_blocks = (n + 63) / 64;
+ const dim3 num_blocks(m_blocks, n_blocks, 1);
+ const dim3 block_size(16, 16, 1);
+ LAUNCH_GPU_KERNEL((EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
+ } else {
+ const Index m_blocks = (m + 127) / 128;
+ const Index n_blocks = (n + 63) / 64;
+ const dim3 num_blocks(m_blocks, n_blocks, 1);
+ const dim3 block_size(8, 32, 1);
+ LAUNCH_GPU_KERNEL((EigenFloatContractionKernel<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
+ }
+ }
+ };
+
+ template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+ void evalTyped(Scalar* buffer) const {
+ // columns in left side, rows in right side
+ const Index k = this->m_k_size;
+ EIGEN_UNUSED_VARIABLE(k)
+
+ // rows in left side
+ const Index m = this->m_i_size;
+
+ // columns in right side
+ const Index n = this->m_j_size;
+
+ // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
+ this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
+
+ typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
+ LeftEvaluator, left_nocontract_t,
+ contract_t, 4,
+ lhs_inner_dim_contiguous,
+ false, Unaligned> LhsMapper;
+
+ typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
+ RightEvaluator, right_nocontract_t,
+ contract_t, 4,
+ rhs_inner_dim_contiguous,
+ rhs_inner_dim_reordered, Unaligned> RhsMapper;
+
+ typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+
+
+ // initialize data mappers
+ LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
+ this->m_left_contracting_strides, this->m_k_strides);
+
+ RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
+ this->m_right_contracting_strides, this->m_k_strides);
+
+ OutputMapper output(buffer, m);
+
+#if defined(EIGEN_USE_HIP)
+ setGpuSharedMemConfig(hipSharedMemBankSizeEightByte);
+#else
+ setGpuSharedMemConfig(cudaSharedMemBankSizeEightByte);
+#endif
+
+ LaunchKernels<LhsScalar, RhsScalar, Index, LhsMapper, RhsMapper, OutputMapper>::Run(lhs, rhs, output, m, n, k, this->m_device);
+ }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_USE_GPU and EIGEN_GPUCC
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionMapper.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionMapper.h
new file mode 100644
index 0000000..9ab900b
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionMapper.h
@@ -0,0 +1,575 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H
+
+namespace Eigen {
+
+namespace internal {
+
+enum {
+ Rhs = 0,
+ Lhs = 1
+};
+
+/*
+ * Implementation of the Eigen blas_data_mapper class for tensors.
+ */
+/// The make pointer class is used by sycl in order to build the mapper class on the device. For other platform the default make pointer is used which
+/// is scalar * for CoeffLoader.
+template <typename Tensor, bool HasRawAccess, template <class> class MakePointer_ = MakePointer>
+struct CoeffLoader;
+
+template <typename Scalar, typename Index, int side, typename Tensor,
+ typename nocontract_t, typename contract_t, int packet_size,
+ bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment,
+ template <class> class MakePointer_ = MakePointer>
+class BaseTensorContractionMapper;
+
+template <typename Tensor, bool HasRawAccess, template <class> class MakePointer_>
+struct CoeffLoader {
+ enum {
+ DirectOffsets = false
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_tensor(tensor) { }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index) {
+ eigen_assert(false && "unsupported");
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename MakePointer_<const typename Tensor::Scalar>::Type
+ data() const {
+ eigen_assert(false && "unsupported");
+ return NULL;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return m_tensor.coeff(index); }
+
+ template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ typename Tensor::PacketReturnType packet(typename Tensor::Index index) const
+ {
+ return m_tensor.template packet<LoadMode>(index);
+ }
+
+ #ifdef EIGEN_USE_SYCL
+ // The placeholder accessors require to be bound to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_tensor.bind(cgh);
+ }
+ #endif
+
+ private:
+ const Tensor m_tensor;
+};
+
+template <typename Tensor, template <class> class MakePointer_>
+struct CoeffLoader<Tensor, true, MakePointer_> {
+ enum {
+ DirectOffsets = true
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_data(tensor.data()) {}
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) {
+ m_data += offset;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename MakePointer_<const typename Tensor::Scalar>::Type
+ data() const {
+ return m_data;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return loadConstant(m_data+index); }
+
+ template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ typename Tensor::PacketReturnType packet(typename Tensor::Index index) const
+ {
+ return internal::ploadt_ro<typename Tensor::PacketReturnType, LoadMode>(m_data + index);
+ }
+
+ #ifdef EIGEN_USE_SYCL
+ // The placeholder accessors require to be bound to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_data.bind(cgh);
+ }
+ #endif
+ private:
+ typedef typename Tensor::Scalar Scalar;
+
+ typename MakePointer_<const Scalar>::Type m_data;
+};
+
+template<typename Scalar, typename Index, int side,
+ typename Tensor,
+ typename nocontract_t, typename contract_t,
+ int packet_size, bool inner_dim_contiguous, int Alignment, template <class> class MakePointer_ = MakePointer>
+class SimpleTensorContractionMapper {
+ public:
+ EIGEN_DEVICE_FUNC
+ SimpleTensorContractionMapper(const Tensor& tensor,
+ const nocontract_t& nocontract_strides,
+ const nocontract_t& ij_strides,
+ const contract_t& contract_strides,
+ const contract_t& k_strides) :
+ m_tensor(tensor),
+ m_nocontract_strides(nocontract_strides),
+ m_ij_strides(ij_strides),
+ m_contract_strides(contract_strides),
+ m_k_strides(k_strides) { }
+
+ enum {
+ DirectOffsets = CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>::DirectOffsets
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) {
+ m_tensor.offsetBuffer(offset);
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE void prefetch(Index /*i*/) { }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar operator()(Index row) const {
+ // column major assumption
+ return operator()(row, 0);
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar operator()(Index row, Index col) const {
+ return m_tensor.coeff(computeIndex(row, col));
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const {
+ const bool left = (side == Lhs);
+ EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963
+ Index nocontract_val = left ? row : col;
+ Index linidx = 0;
+ EIGEN_UNROLL_LOOP
+ for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
+ const Index idx = nocontract_val / m_ij_strides[i];
+ linidx += idx * m_nocontract_strides[i];
+ nocontract_val -= idx * m_ij_strides[i];
+ }
+ if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
+ if (side == Lhs && inner_dim_contiguous) {
+ eigen_assert(m_nocontract_strides[0] == 1);
+ linidx += nocontract_val;
+ } else {
+ linidx += nocontract_val * m_nocontract_strides[0];
+ }
+ }
+
+ Index contract_val = left ? col : row;
+ if(array_size<contract_t>::value > 0) {
+ EIGEN_UNROLL_LOOP
+ for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
+ const Index idx = contract_val / m_k_strides[i];
+ linidx += idx * m_contract_strides[i];
+ contract_val -= idx * m_k_strides[i];
+ }
+
+ if (side == Rhs && inner_dim_contiguous) {
+ eigen_assert(m_contract_strides[0] == 1);
+ linidx += contract_val;
+ } else {
+ linidx += contract_val * m_contract_strides[0];
+ }
+ }
+
+ return linidx;
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE IndexPair<Index> computeIndexPair(Index row, Index col, const Index distance) const {
+ const bool left = (side == Lhs);
+ EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963
+ Index nocontract_val[2] = {left ? row : col, left ? row + distance : col};
+ Index linidx[2] = {0, 0};
+ if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
+ EIGEN_UNROLL_LOOP
+ for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
+ const Index idx0 = nocontract_val[0] / m_ij_strides[i];
+ const Index idx1 = nocontract_val[1] / m_ij_strides[i];
+ linidx[0] += idx0 * m_nocontract_strides[i];
+ linidx[1] += idx1 * m_nocontract_strides[i];
+ nocontract_val[0] -= idx0 * m_ij_strides[i];
+ nocontract_val[1] -= idx1 * m_ij_strides[i];
+ }
+ if (side == Lhs && inner_dim_contiguous) {
+ eigen_assert(m_nocontract_strides[0] == 1);
+ linidx[0] += nocontract_val[0];
+ linidx[1] += nocontract_val[1];
+ } else {
+ linidx[0] += nocontract_val[0] * m_nocontract_strides[0];
+ linidx[1] += nocontract_val[1] * m_nocontract_strides[0];
+ }
+ }
+
+ Index contract_val[2] = {left ? col : row, left ? col : row + distance};
+ if (array_size<contract_t>::value> 0) {
+ EIGEN_UNROLL_LOOP
+ for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
+ const Index idx0 = contract_val[0] / m_k_strides[i];
+ const Index idx1 = contract_val[1] / m_k_strides[i];
+ linidx[0] += idx0 * m_contract_strides[i];
+ linidx[1] += idx1 * m_contract_strides[i];
+ contract_val[0] -= idx0 * m_k_strides[i];
+ contract_val[1] -= idx1 * m_k_strides[i];
+ }
+
+ if (side == Rhs && inner_dim_contiguous) {
+ eigen_assert(m_contract_strides[0] == 1);
+ linidx[0] += contract_val[0];
+ linidx[1] += contract_val[1];
+ } else {
+ linidx[0] += contract_val[0] * m_contract_strides[0];
+ linidx[1] += contract_val[1] * m_contract_strides[0];
+ }
+ }
+ return IndexPair<Index>(linidx[0], linidx[1]);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index firstAligned(Index size) const {
+ // Only claim alignment when we can compute the actual stride (ie when we're
+ // dealing with the lhs with inner_dim_contiguous. This is because the
+ // matrix-vector product relies on the stride when dealing with aligned inputs.
+ return (Alignment == Aligned) && (side == Lhs) && inner_dim_contiguous ? 0 : size;
+ }
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index stride() const {
+ return ((side == Lhs) && inner_dim_contiguous && array_size<contract_t>::value > 0) ? m_contract_strides[0] : 1;
+ }
+
+ #ifdef EIGEN_USE_SYCL
+ // The placeholder accessors require to be bound to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_tensor.bind(cgh);
+ }
+ #endif
+
+ const CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>& tensor() const {
+ return m_tensor;
+ }
+
+ const nocontract_t& nocontract_strides() const {
+ return m_nocontract_strides;
+ }
+ const nocontract_t& ij_strides() const { return m_ij_strides; }
+ const contract_t& contract_strides() const { return m_contract_strides; }
+ const contract_t& k_strides() const { return m_k_strides; }
+
+ protected:
+ CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_> m_tensor;
+ const nocontract_t m_nocontract_strides;
+ const nocontract_t m_ij_strides;
+ const contract_t m_contract_strides;
+ const contract_t m_k_strides;
+};
+
+template<typename Scalar, typename Index, int side,
+ typename Tensor,
+ typename nocontract_t, typename contract_t,
+ int packet_size, bool inner_dim_contiguous,
+ bool inner_dim_reordered, int Alignment, template <class> class MakePointer_>
+class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment, MakePointer_>
+{
+ public:
+ typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment, MakePointer_> ParentMapper;
+
+ EIGEN_DEVICE_FUNC
+ BaseTensorContractionMapper(const Tensor& tensor,
+ const nocontract_t& nocontract_strides,
+ const nocontract_t& ij_strides,
+ const contract_t& contract_strides,
+ const contract_t& k_strides) :
+ ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
+
+ template <typename PacketT,int AlignmentType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ typename internal::enable_if<internal::unpacket_traits<PacketT>::size==packet_size,PacketT>::type
+ load(Index i, Index j) const
+ {
+ // whole method makes column major assumption
+
+ // don't need to add offsets for now (because operator handles that)
+ // current code assumes packet size must be a multiple of 2
+ EIGEN_STATIC_ASSERT(packet_size % 2 == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+ if (Tensor::PacketAccess && inner_dim_contiguous && !inner_dim_reordered) {
+ const Index index = this->computeIndex(i, j);
+ eigen_assert(this->computeIndex(i+packet_size-1, j) == index + packet_size-1);
+ return this->m_tensor.template packet<AlignmentType>(index);
+ }
+
+ const IndexPair<Index> indexPair = this->computeIndexPair(i, j, packet_size - 1);
+ const Index first = indexPair.first;
+ const Index lastIdx = indexPair.second;
+
+ // We can always do optimized packet reads from left hand side right now, because
+ // the vertical matrix dimension on the left hand side is never contracting.
+ // On the right hand side we need to check if the contracting dimensions may have
+ // been shuffled first.
+ if (Tensor::PacketAccess &&
+ (side == Lhs || internal::array_size<contract_t>::value <= 1 || !inner_dim_reordered) &&
+ (lastIdx - first) == (packet_size - 1)) {
+
+ return this->m_tensor.template packet<AlignmentType>(first);
+ }
+
+ EIGEN_ALIGN_MAX Scalar data[packet_size];
+
+ data[0] = this->m_tensor.coeff(first);
+ EIGEN_UNROLL_LOOP
+ for (Index k = 1; k < packet_size - 1; k += 2) {
+ const IndexPair<Index> internal_pair = this->computeIndexPair(i + k, j, 1);
+ data[k] = this->m_tensor.coeff(internal_pair.first);
+ data[k + 1] = this->m_tensor.coeff(internal_pair.second);
+ }
+ data[packet_size - 1] = this->m_tensor.coeff(lastIdx);
+
+ return pload<PacketT>(data);
+ }
+
+ template <typename PacketT,int AlignmentType>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ typename internal::enable_if<internal::unpacket_traits<PacketT>::size!=packet_size,PacketT>::type
+ load(Index i, Index j) const
+ {
+ const Index requested_packet_size = internal::unpacket_traits<PacketT>::size;
+ EIGEN_ALIGN_MAX Scalar data[requested_packet_size];
+
+ const IndexPair<Index> indexPair = this->computeIndexPair(i, j, requested_packet_size - 1);
+ const Index first = indexPair.first;
+ const Index lastIdx = indexPair.second;
+
+ data[0] = this->m_tensor.coeff(first);
+ for (Index k = 1; k < requested_packet_size - 1; k += 2) {
+ const IndexPair<Index> internal_pair = this->computeIndexPair(i + k, j, 1);
+ data[k] = this->m_tensor.coeff(internal_pair.first);
+ data[k + 1] = this->m_tensor.coeff(internal_pair.second);
+ }
+ data[requested_packet_size - 1] = this->m_tensor.coeff(lastIdx);
+
+ return pload<PacketT>(data);
+ }
+
+ template <typename PacketT,int AlignmentType>
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const {
+ return this->load<PacketT,AlignmentType>(i,j);
+ }
+};
+
+
+template<typename Scalar, typename Index, int side,
+ typename Tensor,
+ typename nocontract_t, typename contract_t,
+ bool inner_dim_contiguous,
+ bool inner_dim_reordered, int Alignment, template <class> class MakePointer_>
+class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_>
+ : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment, MakePointer_>
+{
+ public:
+ typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment, MakePointer_> ParentMapper;
+
+ EIGEN_DEVICE_FUNC
+ BaseTensorContractionMapper(const Tensor& tensor,
+ const nocontract_t& nocontract_strides,
+ const nocontract_t& ij_strides,
+ const contract_t& contract_strides,
+ const contract_t& k_strides) :
+ ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
+
+ template <typename PacketT,int> EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const {
+ EIGEN_ALIGN_MAX Scalar data[1];
+ data[0] = this->m_tensor.coeff(this->computeIndex(i, j));
+ return pload<PacketT>(data);
+ }
+ template <typename PacketT,int> EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE PacketT load(Index i, Index j) const {
+ EIGEN_ALIGN_MAX Scalar data[1];
+ data[0] = this->m_tensor.coeff(this->computeIndex(i, j));
+ return pload<PacketT>(data);
+ }
+};
+
+
+template<typename Scalar, typename Index, int side,
+ typename Tensor,
+ typename nocontract_t, typename contract_t,
+ int packet_size,
+ bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, template <class> class MakePointer_=MakePointer>
+class TensorContractionSubMapper {
+ public:
+
+ typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> ParentMapper;
+ typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> Self;
+ typedef Self LinearMapper;
+
+ enum {
+ // We can use direct offsets iff the parent mapper supports then and we can compute the strides.
+ // TODO: we should also enable direct offsets for the Rhs case.
+ UseDirectOffsets = ParentMapper::DirectOffsets && (side == Lhs) && inner_dim_contiguous && (array_size<contract_t>::value > 0)
+ };
+
+ EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset)
+ : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) {
+ // Bake the offsets into the buffer used by the base mapper whenever possible. This avoids the need to recompute
+ // this offset every time we attempt to access a coefficient.
+ if (UseDirectOffsets) {
+ Index stride = m_base_mapper.stride();
+ m_base_mapper.offsetBuffer(vert_offset + horiz_offset * stride);
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const {
+ if (UseDirectOffsets) {
+ return m_base_mapper(i, 0);
+ }
+ return m_base_mapper(i + m_vert_offset, m_horiz_offset);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const {
+ if (UseDirectOffsets) {
+ return m_base_mapper(i, j);
+ }
+ return m_base_mapper(i + m_vert_offset, j + m_horiz_offset);
+ }
+
+ template <typename PacketT>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i) const {
+ if (UseDirectOffsets) {
+ return m_base_mapper.template loadPacket<PacketT,Alignment>(i, 0);
+ }
+ return m_base_mapper.template loadPacket<PacketT,Alignment>(i + m_vert_offset, m_horiz_offset);
+ }
+
+ template <typename PacketT>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i, Index j) const {
+ if (UseDirectOffsets) {
+ return m_base_mapper.template loadPacket<PacketT,Alignment>(i, j);
+ }
+ return m_base_mapper.template loadPacket<PacketT,Alignment>(i + m_vert_offset, j + m_horiz_offset);
+ }
+
+ template <typename PacketT, int AlignmentType>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i, Index j) const {
+ if (UseDirectOffsets) {
+ return m_base_mapper.template load<PacketT,AlignmentType>(i, j);
+ }
+ return m_base_mapper.template loadPacket<PacketT,AlignmentType>(i + m_vert_offset, j + m_horiz_offset);
+ }
+
+ template <typename PacketT>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketT& p) const {
+ if (UseDirectOffsets) {
+ m_base_mapper.storePacket(i, 0, p);
+ }
+ m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
+ if (UseDirectOffsets) {
+ return LinearMapper(m_base_mapper, i, j);
+ }
+ return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset);
+ }
+
+ template <typename PacketT, int AlignmentType>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const {
+ EIGEN_STATIC_ASSERT((internal::is_same<PacketT, PacketT>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ? Aligned : Unaligned;
+ if (UseDirectOffsets) {
+ return m_base_mapper.template loadPacket<PacketT,ActualAlignment>(i, 0);
+ }
+ return m_base_mapper.template loadPacket<PacketT,ActualAlignment>(i + m_vert_offset, m_horiz_offset);
+ }
+
+ template <typename PacketT>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool aligned(Index) const {
+ return false;
+ }
+
+ #ifdef EIGEN_USE_SYCL
+ // The placeholder accessors require to be bound to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_base_mapper.bind(cgh);
+ }
+ #endif
+
+ const ParentMapper& base_mapper() const { return m_base_mapper; }
+ Index vert_offset() const { return m_vert_offset; }
+ Index horiz_offset() const { return m_horiz_offset; }
+
+ private:
+ ParentMapper m_base_mapper;
+ const Index m_vert_offset;
+ const Index m_horiz_offset;
+};
+
+
+template<typename Scalar_, typename Index, int side,
+ typename Tensor,
+ typename nocontract_t, typename contract_t,
+ int packet_size,
+ bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, template <class> class MakePointer_=MakePointer>
+class TensorContractionInputMapper
+ : public BaseTensorContractionMapper<Scalar_, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> {
+
+ public:
+ typedef Scalar_ Scalar;
+ typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> Base;
+ typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> SubMapper;
+ typedef SubMapper VectorMapper;
+
+ EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor,
+ const nocontract_t& nocontract_strides,
+ const nocontract_t& ij_strides,
+ const contract_t& contract_strides,
+ const contract_t& k_strides)
+ : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const {
+ return SubMapper(*this, i, j);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const {
+ return VectorMapper(*this, i, j);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>& get_tensor() const {
+ return Base::m_tensor;
+ }
+};
+
+
+template <typename T> struct TensorContractionInputMapperTrait;
+
+template<typename Scalar_, typename Index_, int side_,
+ typename Tensor_,
+ typename nocontract_t_, typename contract_t_,
+ int packet_size_,
+ bool inner_dim_contiguous_, bool inner_dim_reordered_, int Alignment_, template <class> class MakePointer_>
+struct TensorContractionInputMapperTrait<TensorContractionInputMapper<Scalar_, Index_, side_, Tensor_,
+ nocontract_t_, contract_t_, packet_size_, inner_dim_contiguous_,
+ inner_dim_reordered_, Alignment_, MakePointer_> > {
+
+ typedef Tensor_ XprType;
+ static const bool inner_dim_contiguous = inner_dim_contiguous_;
+ static const bool inner_dim_reordered = inner_dim_reordered_;
+ };
+
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionSycl.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionSycl.h
new file mode 100755
index 0000000..473c228
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionSycl.h
@@ -0,0 +1,1650 @@
+// This file is part of Eigen, a lightweight C++ template library for linear algebra.
+//
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla Public License v. 2.0. If a copy of the MPL was not
+// distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TensorContractionSycl.h
+ *
+ * \brief:
+ * TensorContractionSycl.h, provides various tensor contraction kernel for SYCL backend
+ *
+ *****************************************************************/
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
+
+namespace Eigen {
+
+namespace TensorSycl {
+namespace internal {
+
+#ifndef EIGEN_SYCL_DISABLE_GEMV
+/*!
+ * \brief TVPanelSize, a template class used for setting the panel size required for launching General TensorVector
+ * contraction kernel on various hardware devices.
+ *
+ * \tparam Scalar: determines the element type of the tensor/vector
+ *
+ * \tparam StorageIndex determines the Index type.
+ *
+ * \tparam NCWindow: determines the number of non-contracting element to be process by each work-group
+ *
+ * \tparam CFactor: determines the number of contracting element to be process by each thread
+ *
+ * \tparam NCFactor: determines the number of non-contracting element to be process by each thread
+ */
+template <typename Scalar, typename StorageIndex, StorageIndex NCWindow, StorageIndex CFactor, StorageIndex NCFactor>
+struct TVPanelSize {
+ // LocalThreadSizeC: determines total number of thread per workgroup for the contracting dimension
+ static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeC = EIGEN_SYCL_LOCAL_THREAD_DIM0;
+ // LocalThreadSizeNC: determines total number of thread per workgroup for the non-contracting dimension
+ static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC = EIGEN_SYCL_LOCAL_THREAD_DIM1;
+ // TileSizeDimNC: determines the tile size for the non-contracting dimension
+ static EIGEN_CONSTEXPR StorageIndex TileSizeDimNC = NCWindow / NCFactor;
+ // TileSizeDimC: determines the tile size for the contracting dimension
+ static EIGEN_CONSTEXPR StorageIndex TileSizeDimC = CFactor * LocalThreadSizeNC * LocalThreadSizeC;
+ // WorkLoadPerThreadNC : determines workload per thread for loading the non-contracting dimension
+ static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC = TileSizeDimNC / LocalThreadSizeNC;
+ // WorkLoadPerThreadC: determines workload per thread for loading the non-contracting dimension
+ static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadC = TileSizeDimC / LocalThreadSizeC;
+ // BC : determines if supporting bank conflict is required
+ static EIGEN_CONSTEXPR bool BC = false;
+};
+#endif
+
+/*!
+ * \brief TTPanelSize, a template class used for setting the panel size required for launching General Tensor Tensor
+ contraction kernel on various hardware devices.
+ *
+ * \tparam Scalar: determines the element type of the tensor
+ *
+ * \tparam StorageIndex: determines the Index type.
+ *
+ * \tparam REG_SIZE_M: determines workload per thread for loading the M dimension This can be varied based on the
+ available register on a chosen device(can be controlled by EIGEN_SYCL_REG_M macro).
+ *
+ * \tparam REG_SIZE_N: determines workload per thread for loading the N dimension This can be varied based on the
+ available register on a chosen device(can be controlled by EIGEN_SYCL_REG_N macro).
+ *
+ * \tparam TSDK: determines Tile size for dimension K. The packet size is assumed to be considered
+ */
+
+template <typename Scalar, typename StorageIndex, StorageIndex REG_SIZE_M, StorageIndex REG_SIZE_N, StorageIndex TSDK>
+struct TTPanelSize {
+ // TileSizeDimK: determines Tile size for dimension K. The packet size is assumed to be considered
+ static EIGEN_CONSTEXPR StorageIndex TileSizeDimK = TSDK;
+ // WorkLoadPerThreadM : determines workload per thread for loading the M dimension This can be varied based on the
+ // available register on a chosen device(can be controlled by EIGEN_SYCL_REG_M macro//
+#ifndef EIGEN_SYCL_REG_M
+ static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = REG_SIZE_M;
+#else
+ static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = EIGEN_SYCL_REG_M;
+#endif
+// WorkLoadPerThreadN : determines workload per thread for loading the N dimension This can be varied based on the
+// available register on a chosen device(can be controlled by EIGEN_SYCL_REG_N macro
+#ifndef EIGEN_SYCL_REG_N
+ static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = REG_SIZE_N;
+#else
+ static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = EIGEN_SYCL_REG_N;
+#endif
+ // LocalThreadSizeM: determines total number of thread per workgroup for the m dimension
+ static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeM = EIGEN_SYCL_LOCAL_THREAD_DIM0;
+ // LocalThreadSizeN: determines total number of thread per workgroup for the n dimension
+ static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeN = EIGEN_SYCL_LOCAL_THREAD_DIM1;
+ // TileSizeDimM: determines the tile size for the m dimension
+ static EIGEN_CONSTEXPR StorageIndex TileSizeDimM = LocalThreadSizeM * WorkLoadPerThreadM;
+ // TileSizeDimN: determines the tile size for the n dimension
+ static EIGEN_CONSTEXPR StorageIndex TileSizeDimN = LocalThreadSizeN * WorkLoadPerThreadN;
+ // LoadPerThreadLhs: determines workload per thread for loading Lhs Tensor. This must be divisable by packetsize
+ static EIGEN_CONSTEXPR StorageIndex LoadPerThreadLhs =
+ ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimN));
+ // LoadPerThreadRhs: determines workload per thread for loading Rhs Tensor. This must be divisable by packetsize
+ static EIGEN_CONSTEXPR StorageIndex LoadPerThreadRhs =
+ ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimM));
+ // BC : determines if supporting bank conflict is required
+ static EIGEN_CONSTEXPR bool BC = true;
+ // DoubleBuffer: determines if double buffering technique should be used (This can be disabled by
+ // EIGEN_SYCL_DISABLE_DOUBLE_BUFFER macro when the device doesnot have sufficient local memory)
+ static EIGEN_CONSTEXPR bool DoubleBuffer =
+#ifdef EIGEN_SYCL_DISABLE_DOUBLE_BUFFER
+ false;
+#else
+ true;
+#endif
+};
+
+/* !
+ * \brief contraction_type: an enum class representing the Tensor Contraction implementation algorithm. This is used to
+ * specialize the contraction algorithm based on device support for dedicated local memory.
+ */
+enum class contraction_type { local, no_local };
+/* !
+ * \brief data_source an enum class determining the location of the data in a memory hierarchy (global, local, private).
+ */
+enum class data_source { global_mem, local_mem, private_mem };
+
+/*!
+ * \brief read, a template function used for loading the data from global
+ memory. This function is used to guarantee coalesced and vectorized load whenever possible
+ *
+ * \tparam PacketLoad: determines if the each element of this tensor block should be loaded in a packet mode
+ *
+ * \param is_coalesced_layout: determines whether or not the Tensor data in a memory can be access coalesced and
+ vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the
+ contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case
+ when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed.
+ *
+ * \tparam PacketType: determines the type of packet
+ *
+ * \tparam TensorMapper: determines the input tensor mapper type
+ *
+ * \tparam StorageIndex: determines the Index type
+
+ * \param tensorMapper: is the input tensor
+ *
+ * \param NCIndex: is the non-contracting dim index
+ *
+ * \param CIndex is the contracting dim index
+ *
+ * \param ld: is the leading dimension of the flattened tensor
+ */
+template <bool PacketLoad, bool is_coalesced_layout, bool, typename PacketType, typename TensorMapper,
+ typename StorageIndex>
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<PacketLoad, PacketType>::type read(
+ const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &ld) {
+ const StorageIndex row = (is_coalesced_layout) ? NCIndex : CIndex;
+ const StorageIndex col = (is_coalesced_layout) ? CIndex : NCIndex;
+ return tensorMapper.get_tensor().template packet<Unaligned>(row + (col * ld));
+}
+
+/*!
+ * \brief read, special overload of read function, when the read access is not vectorized
+ *
+ * \tparam PacketLoad: determines if the each element of this tensor block should be loaded in a packet mode
+ *
+ * \param is_coalesced_layout: determines whether or not the Tensor data in a memory can be access coalesced and
+ vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the
+ contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case
+ when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed.
+ *
+ * \tparam PacketType: determines the type of packet
+ *
+ * \tparam TensorMapper: determines the input tensor mapper type
+ *
+ * \tparam StorageIndex: determines the Index type
+
+ * \param tensorMapper: is the input tensor
+ *
+ * \param NCIndex: is the non-contracting dim index
+ *
+ * \param CIndex: is the contracting dim index
+ */
+template <bool PacketLoad, bool, bool IsRhs, typename PacketType, typename TensorMapper, typename StorageIndex>
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<!PacketLoad, PacketType>::type read(
+ const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &) {
+ const StorageIndex row = (IsRhs) ? CIndex : NCIndex;
+ const StorageIndex col = (IsRhs) ? NCIndex : CIndex;
+ return tensorMapper(row, col);
+}
+
+/*!
+ * \brief write, a template function used for storing the data to local memory. This function is used to guarantee
+ * coalesced and vectorized store whenever possible.
+ *
+ * \tparam StorageIndex: determines the Index type
+ *
+ * \param ld is the leading dimension of the local memory. ld is a compile time value for the local memory
+ *
+ * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy.
+ *
+ * \tparam PacketType: determines the type of packet
+ *
+ * \tparam DataScalar: determines the output data type
+ *
+ * \param packet_data: the data to be written in the local memory
+ *
+ * \param ptr: a pointer to the local memory
+ *
+ * \param CIndex is the contracting dim index
+ */
+
+template <typename StorageIndex, StorageIndex ld, data_source dt, typename PacketType, typename DataScalar>
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ typename ::Eigen::internal::enable_if<dt != data_source::global_mem, void>::type
+ write(PacketType &packet_data, DataScalar ptr) {
+ EIGEN_CONSTEXPR int PacketSize = Eigen::internal::unpacket_traits<PacketType>::size;
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < PacketSize; i++) {
+ *ptr = PacketWrapper<PacketType, PacketSize>::scalarize(i, packet_data);
+ ptr += ld;
+ }
+}
+
+/*!
+ * \brief Overloading the write function for storing the data to global memory, when vectorization enabled This function
+ * is used to guarantee coalesced and vectorized store whenever possible.
+ *
+ * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy.
+ *
+ * \tparam PacketType: determines the type of packet
+ *
+ * \tparam DataScalar: determines the output data type
+ *
+ * \param packet_data: the data to be written in the local memory
+ *
+ * \param ptr: a pointer to the local memory
+ */
+
+template <data_source dt, typename PacketType, typename DataScalar>
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<
+ Eigen::internal::unpacket_traits<PacketType>::size != 1 && dt == data_source::global_mem, void>::type
+write(PacketType &packet_data, DataScalar *ptr) {
+ ::Eigen::internal::pstoreu<DataScalar, PacketType>(ptr, packet_data);
+}
+
+/*!
+ * \brief Overloading the write function for storing the data to global memory, when vectorization is disabled.
+ *
+ * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy.
+ *
+ * \tparam PacketType: determines the type of packet
+ *
+ * \tparam DataScalar: determines the output data type
+ *
+ * \param packet_data: the data to be written in the local memory
+ *
+ * \param ptr: a pointer to the local memory
+ */
+template <data_source dt, typename PacketType, typename DataScalar>
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<
+ Eigen::internal::unpacket_traits<PacketType>::size == 1 && dt == data_source::global_mem, void>::type
+write(PacketType &packet_data, DataScalar *ptr) {
+ *ptr = packet_data;
+}
+
+/*!
+ * \brief check_boundary: is used to check the edge condition for non-internal blocks.
+ *
+ * \tparam is_internal: determines if the block is internal
+ */
+template <bool is_internal>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary(bool) {
+ return true;
+}
+
+/*!
+ * \brief check_boundary: specialization of the check_boundary for non-internal blocks.
+ *
+ * \param cond: true when the data is in range. Otherwise false
+ */
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary<false>(bool cond) {
+ return cond;
+}
+
+/*!
+ * \brief BlockProperties is a template class that provides different characteristic of a block of each Tensor processed
+ * by each workgroup.
+ *
+ * \tparam is_transposed: iff true, determines whether or not the block of the Tensor is transposed
+ *
+ * \tparam packet_load_: determines if the each element of this tensor block should be loaded in a packet mode
+ *
+ * \tparam PacketType: determines the type of packet
+ *
+ * \tparam OutType: determines the type of each element for this block of tensor. If packet load is true, it will be
+ * packetType; Otherwise it will be scalar Type
+ *
+ * \param elements_per_access determines the size of each element based on OutType
+ *
+ * \param is_coalesced_layout determines whether or not the Tensor data in a memory can be access coalesced and
+ * vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the
+ * contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case
+ * when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed.
+ *
+ * \param nc_stride determines the stride of non-contracting dimension to access the next adjustment element within the
+ * Tensor Block for each workgroup
+ *
+ * \param c_stride determines the stride of contracting dimension to access the next adjustment element within the
+ * Tensor Block for each workgroup
+ */
+template <bool is_transposed, bool is_rhs_, bool packet_load_, typename PacketType>
+struct BlockProperties {
+ static EIGEN_CONSTEXPR bool packet_load = packet_load_;
+ typedef typename Eigen::internal::unpacket_traits<PacketType>::type OutScalar;
+ static EIGEN_CONSTEXPR bool is_rhs = is_rhs_;
+ typedef typename Eigen::internal::conditional<packet_load, PacketType, OutScalar>::type OutType;
+ static EIGEN_CONSTEXPR int elements_per_access = Eigen::internal::unpacket_traits<OutType>::size;
+ static EIGEN_CONSTEXPR bool is_coalesced_layout = !(is_transposed ^ is_rhs);
+ static EIGEN_CONSTEXPR int nc_stride = (is_coalesced_layout ? elements_per_access : 1);
+ static EIGEN_CONSTEXPR int c_stride = (is_coalesced_layout ? 1 : elements_per_access);
+};
+
+/*!
+ * \brief ThreadProperties is a template class that provides each thread's properties within a workgroup. Please see
+ * the sycl-1.2.1 specification (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for the workgroup,
+ * work-items
+ *
+ * \tparam StorageIndex: determines the StorageIndex Type
+ *
+ * \param linearLocalThreadId: determines the linearized location of a thread within a work-group
+ *
+ * \param kGroupId: determines the logical group id in a k dimension of the flattened tensor. It will be > 1 when
+ * tall/skinny algorithm is used
+ *
+ * \param mGroupOffset: determines the logical start position of all thread within a workgroup for the m dimension of
+ * the flattened tensor.
+ *
+ * \param kGroupOffset determines the logical start position of all thread within a workgroup for the k dimension of the
+ * flattened tensor. It will be > 1 when tall/skinny algorithm is used.
+ *
+ * \param mLocalOffset: determines the logical start position of each thread within a workgroup for the m dimension of a
+ * flattened tensor. The position determines the distance of each thread within the workgroup from each other
+ * independent from their global position.
+ *
+ * \param nLocalOffset: determines the logical start position of each thread within a workgroup for the n dimension of a
+ * flattened tensor. The position determines the distance of each thread within the workgroup from each other
+ * independent from their global position.
+ *
+ * \param mGlobalOffset: determines the logical start position of each thread a thread for the m dimension on a
+ * flattened tensor
+ *
+ * \param nGlobalOffset: determines the logical start position of each thread a thread for the n dimension on a
+ * flattened tensor
+ *
+ * \param kSize : determine the number of the k elements of the flattened Tensor to be processed by each thread for the
+ * given tensor block. This is !=K dimension of Flattened Tensor when Tall/Skinny matrix is used.
+ *
+ * \param is_internal : this will determined if the thread within the work-group computes an internal block of tensor or
+ * the edge blocks. When it is internal, there is no need to check the boundaries and all the if stantement can be
+ * resolve by compiler.
+ */
+template <typename StorageIndex>
+struct ThreadProperties {
+ const StorageIndex linearLocalThreadId;
+ const StorageIndex kGroupId;
+ const StorageIndex mGroupOffset;
+ const StorageIndex nGroupOffset;
+ const StorageIndex kGroupOffset;
+ const StorageIndex mLocalOffset;
+ const StorageIndex nLocalOffset;
+ const StorageIndex mGlobalOffset;
+ const StorageIndex nGlobalOffset;
+ StorageIndex kSize;
+ const bool is_internal;
+ // this is used to adjust the last block
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ThreadProperties(
+ const StorageIndex linearLocalThreadId_, const StorageIndex kGroupId_, const StorageIndex mGroupOffset_,
+ const StorageIndex nGroupOffset_, const StorageIndex kGroupOffset_, const StorageIndex mLocalOffset_,
+ const StorageIndex nLocalOffset_, const StorageIndex mGlobalOffset_, const StorageIndex nGlobalOffset_,
+ StorageIndex kSize_, const bool is_internal_)
+ : linearLocalThreadId(linearLocalThreadId_),
+ kGroupId(kGroupId_),
+ mGroupOffset(mGroupOffset_),
+ nGroupOffset(nGroupOffset_),
+ kGroupOffset(kGroupOffset_),
+ mLocalOffset(mLocalOffset_),
+ nLocalOffset(nLocalOffset_),
+ mGlobalOffset(mGlobalOffset_),
+ nGlobalOffset(nGlobalOffset_),
+ kSize(kSize_),
+ is_internal(is_internal_) {}
+};
+
+/*!
+ * \brief TensorContractionKernel is a template class that provides Tensor -Tensor contraction operation.
+ *
+ * \tparam OutScalar: determines the output scalar type
+ *
+ * \tparam LhsScalar: determines the left-hand-side scalar type
+ *
+ * \tparam RhsScalar: determines the right-hand-side scalar type
+ *
+ * \tparam OutAccessor: determines the sycl accessor type for out put (please see the sycl-1.2.1 specification
+ (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition)
+ *
+ * \tparam LhsMapper determines the tensor contraction mapper type for left-hand-side matrix
+ *
+ * \tparam RhsMapper determines the tensor contraction mapper type for right-hand-side matrix
+ *
+ * \tparam StorageIndex: determines the StorageIndex Type
+ *
+ * \tparam Properties: determines the Contraction Panel properties
+ *
+ * \tparam TripleDim: determines the M, K, N dimensions for the flatten tensors in order to treat them as a matrix
+ *
+ * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression.
+ *
+ * \tparam input_mapper_properties : determine if the input tensors are matrix. If they are matrix, special memory
+ access is used to guarantee that always the memory access are coalesced.
+ *
+ * \tptaram IsFinal : determine if this is the final kernel. If so, the result will be written in a final output.
+ Otherwise, the result of contraction will be written iin a temporary buffer. This is the case when Tall/Skinny
+ contraction is used. So in this case, a final reduction step is required to compute final output.
+
+ * \tparam contraction_tp: it is an enum value representing whether the local memroy/no local memory implementation of
+ the algorithm to be used
+ *
+ * \param scratch: local memory containing tiles of LHS and RHS tensors for each work-group
+ *
+ * \param lhs: determines the left-hand-side flattened tensor (tensor mapper)
+ *
+ * \param rhs: determines the right-hand-side flattened tensor (tensor mapper)
+ *
+ * \param out_res: determines the output tensor containing the contraction result
+ *
+ * \param groupSizeM: a logical number determining the number of work-group for m dimension
+ *
+ * \param groupSizeN: a logical number determining the number of work-group for n dimension
+ *
+ * \param numTiles: determines total number of tiles on the k dimension
+ *
+ * \param TripleDim: determines the M, K, N dimensions for the flatten tensors in order to treat them as a matrix
+ */
+template <typename OutScalar, typename LhsScalar, typename RhsScalar, typename OutAccessor, typename LhsMapper,
+ typename RhsMapper, typename StorageIndex, typename Properties, typename TripleDim, bool Vectorizable,
+ typename input_mapper_properties, bool IsFinal, contraction_type contraction_tp>
+class TensorContractionKernel {
+ public:
+ typedef typename Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketReturnType
+ PacketReturnType;
+ static EIGEN_CONSTEXPR int PacketSize =
+ Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketSize;
+ static EIGEN_CONSTEXPR bool is_lhs_transposed =
+ !::Eigen::internal::TensorContractionInputMapperTrait<LhsMapper>::inner_dim_contiguous;
+ static EIGEN_CONSTEXPR bool is_rhs_transposed =
+ !::Eigen::internal::TensorContractionInputMapperTrait<RhsMapper>::inner_dim_contiguous;
+
+ typedef BlockProperties<is_lhs_transposed, false, input_mapper_properties::is_lhs_matrix && Vectorizable,
+ PacketReturnType>
+ LHSBlockProperties;
+
+ typedef BlockProperties<is_rhs_transposed, true, input_mapper_properties::is_rhs_matrix && Vectorizable,
+ PacketReturnType>
+ RHSBlockProperties;
+
+ static EIGEN_CONSTEXPR StorageIndex NStride =
+ contraction_tp == contraction_type::local ? Properties::WorkLoadPerThreadN : RHSBlockProperties::nc_stride;
+
+ typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
+ typedef cl::sycl::multi_ptr<OutScalar, cl::sycl::access::address_space::local_space> local_ptr;
+ typedef OutScalar * /*cl::sycl::multi_ptr<OutScalar, cl::sycl::access::address_space::private_space>*/ private_ptr;
+ typedef
+ typename ::Eigen::internal::conditional<contraction_tp == contraction_type::local, local_ptr, private_ptr>::type
+ tile_ptr;
+ static EIGEN_CONSTEXPR StorageIndex LSDL = contraction_tp == contraction_type::local
+ ? Properties::TileSizeDimM + Properties::BC
+ : Properties::WorkLoadPerThreadM;
+ static EIGEN_CONSTEXPR StorageIndex LSDR = contraction_tp == contraction_type::local
+ ? Properties::TileSizeDimN + Properties::BC
+ : Properties::WorkLoadPerThreadN;
+ static EIGEN_CONSTEXPR StorageIndex LocalOffset = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN;
+
+ /**
+ * \brief MemHolder this is a place holder struct for creating memory hierarchy in SYCL. Inside SYCL kernel it is not
+ * allowed to have dynamic memory allocation. While the local memory is created outside of the kernel and passed to
+ * the kernel as an accessor, the private memory can only allowed to be allocated statically. Since we are abstracting
+ * the TiledMemory for both local and private memory, the MemHolder structs is used as a helper to abstract out
+ * different type of memory needed when local/no_local memory computation is called.
+ *
+ * \tparam contraction_type: it is an enum value representing whether the local memroy/no local memory implementation
+ of the algorithm to be used
+ * \tparam the private memory size
+ * \param ptr the tile memory pointer type
+ */
+ template <contraction_type, StorageIndex>
+ struct MemHolder {
+ tile_ptr ptr;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MemHolder(local_ptr block_start_ptr) : ptr(block_start_ptr) {}
+ };
+ /**
+ * \brief specialization of memHolder class when no local memory kernel is used.
+ */
+ template <StorageIndex MemSize>
+ struct MemHolder<contraction_type::no_local, MemSize> {
+ OutScalar ptr[MemSize] = {OutScalar{0}};
+ };
+ /**
+ * \brief TiledMemory: contains required memory pointer for loading each tile of the TensorContraction panel from
+ * global memory to local/private memory when local/no_local algorithm used.
+ *
+ * \param lhs_scratch_extract : determines the LHS tile memory. It is either private or local memory based on the
+ * selected contraction_type.
+ *
+ * \param rhs_scratch_extract : determines the RHS tile memory. It is either private or local memory based on the
+ * selected contraction_type.
+ *
+ * \param lhs_extract_index: determins the position of each thread on a local memory for lhs input. When private
+ * memory is used this is set to zero as this is not applicable in case of private memory.
+ *
+ * \param rhs_extract_index: determins the position of each thread on a local memory for rhs input. When private
+ * memory is used this is set to zero as this is not applicable in case of private memory.
+ *
+ * \param lhs_scratch_compute : determines the location to load for computation for lhs_local memory. This is the
+ * same as lhs_scratch_extract for private memory.
+ *
+ * \param rhs_scratch_compute : determines the location to load for computation for rhs_local memory. This is the
+ * same as rhs_scratch_extract for private memory.
+ */
+ struct TiledMemory {
+ MemHolder<contraction_tp, Properties::WorkLoadPerThreadM * Properties::TileSizeDimK> lhs_scratch_extract;
+ MemHolder<contraction_tp, Properties::WorkLoadPerThreadN * Properties::TileSizeDimK> rhs_scratch_extract;
+ tile_ptr lhs_scratch_ptr_compute;
+ tile_ptr rhs_scratch_ptr_compute;
+ const std::pair<StorageIndex, StorageIndex> lhs_extract_index;
+ const std::pair<StorageIndex, StorageIndex> rhs_extract_index;
+ template <contraction_type tp = contraction_tp>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ TiledMemory(const ThreadProperties<StorageIndex> &, local_ptr,
+ typename ::Eigen::internal::enable_if<tp == contraction_type::no_local>::type * = 0)
+ : lhs_scratch_extract{},
+ rhs_scratch_extract{},
+ lhs_scratch_ptr_compute(lhs_scratch_extract.ptr),
+ rhs_scratch_ptr_compute(rhs_scratch_extract.ptr),
+ lhs_extract_index(std::pair<StorageIndex, StorageIndex>(StorageIndex{0}, StorageIndex{0})),
+ rhs_extract_index(std::pair<StorageIndex, StorageIndex>(StorageIndex{0}, StorageIndex{0})) {}
+
+ template <contraction_type tp = contraction_tp>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ TiledMemory(const ThreadProperties<StorageIndex> &thread_properties, local_ptr block_start_ptr,
+ typename ::Eigen::internal::enable_if<tp == contraction_type::local>::type * = 0)
+ : lhs_scratch_extract{block_start_ptr},
+ rhs_scratch_extract{lhs_scratch_extract.ptr +
+ ((Properties::DoubleBuffer + 1) * LSDL * Properties::TileSizeDimK)},
+ lhs_scratch_ptr_compute(lhs_scratch_extract.ptr + thread_properties.mLocalOffset),
+ rhs_scratch_ptr_compute(rhs_scratch_extract.ptr + thread_properties.nLocalOffset),
+ lhs_extract_index(
+ local_id_extract<LHSBlockProperties, Properties::TileSizeDimM>(thread_properties.linearLocalThreadId)),
+ rhs_extract_index(
+ local_id_extract<RHSBlockProperties, Properties::TileSizeDimN>(thread_properties.linearLocalThreadId)) {}
+ };
+
+ Scratch scratch;
+ const LhsMapper lhs;
+ const RhsMapper rhs;
+ OutAccessor out_res;
+ const StorageIndex groupSizeM;
+ const StorageIndex groupSizeN;
+ const StorageIndex numTiles;
+ const TripleDim triple_dim;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_,
+ const RhsMapper rhs_, OutAccessor out_res_,
+ const StorageIndex groupSizeM_,
+ const StorageIndex groupSizeN_,
+ const StorageIndex numTiles_,
+ const TripleDim triple_dim_)
+ : scratch(scratch_),
+ lhs(lhs_),
+ rhs(rhs_),
+ out_res(out_res_),
+ groupSizeM(groupSizeM_),
+ groupSizeN(groupSizeN_),
+ numTiles(numTiles_),
+ triple_dim(triple_dim_) {}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_,
+ const RhsMapper rhs_, OutAccessor out_res_,
+ const StorageIndex groupSizeM_,
+ const StorageIndex numTiles_,
+ const TripleDim triple_dim_)
+ : TensorContractionKernel(scratch_, lhs_, rhs_, out_res_, groupSizeM_, 1, numTiles_, triple_dim_) {}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+ const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
+ const StorageIndex nLocalThreadId = linearLocalThreadId / Properties::LocalThreadSizeM;
+ const StorageIndex mLocalThreadId = linearLocalThreadId % Properties::LocalThreadSizeM;
+ const StorageIndex mGroupId = itemID.get_group(0) % groupSizeM;
+ const StorageIndex tmp = itemID.get_group(0) / groupSizeM;
+ const StorageIndex nGroupId = IsFinal ? tmp : tmp % groupSizeN;
+ const StorageIndex kGroupId = IsFinal ? 0 : tmp / groupSizeN;
+ const StorageIndex mGroupOffset = mGroupId * Properties::TileSizeDimM;
+ const StorageIndex nGroupOffset = nGroupId * Properties::TileSizeDimN;
+ const StorageIndex mLocalOffset = PacketSize * mLocalThreadId;
+ const StorageIndex nLocalOffset = NStride * nLocalThreadId;
+ const StorageIndex mGlobalOffset = mGroupOffset + mLocalOffset;
+ const StorageIndex nGlobalOffset = nGroupOffset + nLocalOffset;
+
+ const StorageIndex kSizePerWG = IsFinal ? triple_dim.K : numTiles * Properties::TileSizeDimK;
+ StorageIndex kGroupOffset = kGroupId * kSizePerWG;
+ const bool is_internal = triple_dim.M - mGroupOffset >= Properties::TileSizeDimM &&
+ triple_dim.N - nGroupOffset >= Properties::TileSizeDimN &&
+ triple_dim.K - kGroupOffset >= kSizePerWG;
+ // this is used to adjust the last block
+ StorageIndex kSize = IsFinal ? triple_dim.K : std::min(kSizePerWG, triple_dim.K - kGroupOffset);
+ // This is used to find out the lats K offset so that kGroupOffset -kSize can compute the coffset for loading to
+ // tile
+ kGroupOffset += kSize;
+
+ auto thread_properties =
+ ThreadProperties<StorageIndex>(linearLocalThreadId, kGroupId, mGroupOffset, nGroupOffset, kGroupOffset,
+ mLocalOffset, nLocalOffset, mGlobalOffset, nGlobalOffset, kSize, is_internal);
+
+ auto out_ptr = out_res.get_pointer() + (IsFinal ? 0 : thread_properties.kGroupId * triple_dim.M * triple_dim.N);
+
+ (thread_properties.is_internal) ? compute_panel<true>(itemID, thread_properties, out_ptr)
+ : compute_panel<false>(itemID, thread_properties, out_ptr);
+ }
+ // The compute block computes the contraction operation private block for each thread and store the resutl in the
+ // privateRes memory of Each computation the compute block function is independent of local and no local concepts as
+ // it only compute the block on each thread's private memory space
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_block_per_tile(OutScalar *lhs_block_ptr, OutScalar *rhs_block_ptr,
+ PacketReturnType *privateRes) {
+ StorageIndex idx = 0;
+ EIGEN_CONSTEXPR StorageIndex lhs_stride =
+ contraction_tp == contraction_type::local ? (PacketSize * Properties::LocalThreadSizeM) : 1;
+ EIGEN_UNROLL_LOOP
+ for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN; wLPTN++) {
+ auto rhsPacket = PacketReturnType{*(rhs_block_ptr + wLPTN)};
+ StorageIndex lhs_index = 0;
+ EIGEN_UNROLL_LOOP
+ for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) {
+ PacketReturnType lhsPack{};
+ Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, PacketSize>::set_packet(lhsPack,
+ lhs_block_ptr + lhs_index);
+ privateRes[idx] = ::Eigen::internal::pmadd(lhsPack, rhsPacket, privateRes[idx]);
+
+ lhs_index += lhs_stride;
+ idx++;
+ }
+ }
+ }
+ // The store function write the computed contraction operation in the private memory of each thread to the global
+ // memory. The store function is independent of local and no local concepts s that it can be abstract out in the base
+ // class.
+ template <bool is_internal_block, StorageIndex PrivateNStride, typename OutPtr>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void store(OutPtr *out_ptr, PacketReturnType *privateRes,
+ StorageIndex mGlobalOffset, StorageIndex nGlobalOffset) {
+ auto chk_bound = [&](const StorageIndex &mIndex, const StorageIndex &nIndex) EIGEN_DEVICE_FUNC {
+ return (mIndex + PacketSize - 1 < triple_dim.M && nGlobalOffset + nIndex < triple_dim.N);
+ };
+ // when local memory is not used M and N are both accessed in a coalesced way. However, when local memory is
+ // available the k*N is transposed in the local to N*K therefore, each blocks operates on blockId*
+ // WorkLoadPerThreadN slice of N
+ EIGEN_CONSTEXPR StorageIndex GlobalNStride =
+ contraction_tp == contraction_type::local ? 1 : Properties::LocalThreadSizeN;
+ EIGEN_UNROLL_LOOP
+ for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN / PrivateNStride; wLPTN++) {
+ // output leading dimension
+ StorageIndex outputLD = 0;
+ // When local memory is used the PrivateNstride is always 1 because the coalesed access on N is loaded into Local
+ // memory and extracting from local to global is the same as no transposed version. However, when local memory is
+ // not used and RHS is transposed we packetize the load for RHS.
+ EIGEN_UNROLL_LOOP
+ for (StorageIndex nId = 0; nId < PrivateNStride; nId++) {
+ StorageIndex globalRow = mGlobalOffset;
+ EIGEN_UNROLL_LOOP
+ for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) {
+ PacketReturnType privetOut = privateRes[wLPTM];
+ if (check_boundary<is_internal_block>(chk_bound(globalRow, nId))) {
+ // Store the final results in C. The C matrix has always M as a first StorageIndex and N as a second
+ // StorageIndex Therefore it is always coalesced layout
+ write<data_source::global_mem>(privetOut, out_ptr + outputLD + globalRow);
+ } else {
+ EIGEN_UNROLL_LOOP
+ for (StorageIndex mId = 0; mId < PacketSize; mId++) {
+ StorageIndex mOffset = globalRow + mId;
+ if (mOffset < triple_dim.M && (nGlobalOffset + nId < triple_dim.N)) {
+ out_ptr[mOffset + outputLD] =
+ Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, PacketSize>::scalarize(mId, privetOut);
+ }
+ }
+ }
+ globalRow += (PacketSize * Properties::LocalThreadSizeM);
+ }
+ outputLD += triple_dim.M;
+ privateRes += Properties::WorkLoadPerThreadM / PacketSize;
+ }
+ out_ptr += (GlobalNStride * outputLD);
+
+ nGlobalOffset += (PrivateNStride * GlobalNStride);
+ }
+ }
+ // when no local memory is used the following extract_block will be enabled
+ template <typename InputBlockProperties, bool is_internal_block, typename Input, typename PrivateReg,
+ contraction_type contract_tp = contraction_tp>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ typename ::Eigen::internal::enable_if<contract_tp == contraction_type::no_local>::type
+ extract_block(const Input &inpt, PrivateReg private_ptr, const std::pair<StorageIndex, StorageIndex> &,
+ const StorageIndex &ncOffset, const StorageIndex cOffset) {
+ EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC =
+ InputBlockProperties::is_rhs ? Properties::LocalThreadSizeN : Properties::LocalThreadSizeM;
+ EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC =
+ InputBlockProperties::is_rhs ? Properties::WorkLoadPerThreadN : Properties::WorkLoadPerThreadM;
+ const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M;
+
+ auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC {
+ return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) &&
+ (NCIndex + InputBlockProperties::nc_stride - 1 < NC));
+ };
+ const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC : triple_dim.K;
+ StorageIndex cIndex = cOffset;
+
+ EIGEN_UNROLL_LOOP
+ for (StorageIndex cId = 0; cId < Properties::TileSizeDimK / InputBlockProperties::c_stride; cId++) {
+ StorageIndex ncIndex = ncOffset;
+ EIGEN_UNROLL_LOOP
+ for (StorageIndex ncId = 0; ncId < WorkLoadPerThreadNC / InputBlockProperties::nc_stride; ncId++) {
+ if (check_boundary<is_internal_block>(chk_bound(cIndex, ncIndex))) {
+ auto val =
+ read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout,
+ InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, ncIndex, cIndex, ld);
+
+ write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : WorkLoadPerThreadNC),
+ data_source::private_mem>(val, private_ptr);
+ } else {
+ EIGEN_UNROLL_LOOP
+ for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) {
+ const StorageIndex ncInd = ncIndex + (InputBlockProperties::is_coalesced_layout ? i : 0);
+ const StorageIndex cInd = cIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i);
+ OutScalar val =
+ (ncInd < NC && cInd < triple_dim.K)
+ ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>(
+ inpt, ncInd, cInd, ld)
+ : OutScalar(0);
+ write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : WorkLoadPerThreadNC),
+ data_source::private_mem>(
+ val, private_ptr + (InputBlockProperties::is_coalesced_layout ? i : 0) +
+ ((InputBlockProperties::is_coalesced_layout ? 0 : i) * WorkLoadPerThreadNC));
+ }
+ }
+
+ // if it is lhs we have to load it packetised when the packet size is > 1, because the output is coalesced. So
+ // even if M is not accessed in a coalesced mode, we have to load packet_size number of m per thread.
+ ncIndex = (!InputBlockProperties::is_rhs && InputBlockProperties::nc_stride == 1 && PacketSize != 1)
+ ? ncOffset + (ncId + 1) % PacketSize + ((ncId + 1) / PacketSize) * LocalThreadSizeNC
+ : (ncIndex + InputBlockProperties::nc_stride * LocalThreadSizeNC);
+ private_ptr += InputBlockProperties::nc_stride;
+ }
+ // the previous for loop ( private_ptr += (ncId * nc_stride)) has already moved ptr with one WorkLoadPerThreadNC
+ private_ptr += (InputBlockProperties::c_stride - 1) * WorkLoadPerThreadNC;
+ cIndex += InputBlockProperties::c_stride;
+ }
+ }
+ template <typename InputBlockProperties, StorageIndex TileSizeDimNC>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::pair<StorageIndex, StorageIndex> local_id_extract(
+ const StorageIndex &linearLocalThreadId) {
+ const StorageIndex localThreadNC =
+ (InputBlockProperties::is_coalesced_layout)
+ ? linearLocalThreadId % (TileSizeDimNC / InputBlockProperties::nc_stride)
+ : linearLocalThreadId / (Properties::TileSizeDimK / InputBlockProperties::c_stride);
+ const StorageIndex localThreadC =
+ (InputBlockProperties::is_coalesced_layout)
+ ? linearLocalThreadId / (TileSizeDimNC / InputBlockProperties::nc_stride)
+ : linearLocalThreadId % (Properties::TileSizeDimK / InputBlockProperties::c_stride);
+ return std::pair<StorageIndex, StorageIndex>(localThreadNC, localThreadC);
+ }
+
+ template <bool db = Properties::DoubleBuffer, contraction_type ctp = contraction_tp>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ typename ::Eigen::internal::enable_if<db && ctp == contraction_type::local>::type
+ sync_mem(const cl::sycl::nd_item<1> &, bool &db_offset) noexcept {
+ db_offset = !db_offset;
+ }
+
+ template <bool db = Properties::DoubleBuffer, contraction_type ctp = contraction_tp>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ typename ::Eigen::internal::enable_if<!db && ctp == contraction_type::local>::type
+ sync_mem(const cl::sycl::nd_item<1> &itemID, bool &) noexcept {
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+ }
+
+ template <contraction_type ctp = contraction_tp>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ typename ::Eigen::internal::enable_if<ctp == contraction_type::no_local>::type
+ sync_mem(const cl::sycl::nd_item<1> &, bool &) noexcept {
+ return;
+ }
+
+ template <bool need_sync, contraction_type ctp = contraction_tp>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ typename ::Eigen::internal::enable_if<need_sync && ctp == contraction_type::no_local>::type
+ sync_thread(const cl::sycl::nd_item<1> &
+#ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION
+ itemID
+#endif
+ ) noexcept {
+#ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION
+ itemID.barrier(cl::sycl::access::fence_spacce::local_space);
+#else
+ return;
+#endif
+ }
+ template <bool need_sync, contraction_type ctp = contraction_tp>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ typename ::Eigen::internal::enable_if<need_sync && ctp == contraction_type::local>::type
+ sync_thread(const cl::sycl::nd_item<1> &itemID) {
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+ }
+ template <bool need_sync>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<!need_sync>::type sync_thread(
+ const cl::sycl::nd_item<1> &) {
+ return;
+ }
+
+ template <bool is_internal_block>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_tile_per_panel(const cl::sycl::nd_item<1> &itemID,
+ ThreadProperties<StorageIndex> &thread_properties,
+ TiledMemory &tiled_input_block,
+ PacketReturnType *privateRes, bool &db_offset) {
+ // Tiling the Rhs block from global to local memory
+ extract_block<RHSBlockProperties, is_internal_block>(
+ rhs, tiled_input_block.rhs_scratch_extract.ptr + (db_offset * Properties::TileSizeDimK * LSDR),
+ tiled_input_block.rhs_extract_index,
+ contraction_tp == contraction_type::local ? thread_properties.nGroupOffset : thread_properties.nGlobalOffset,
+ thread_properties.kGroupOffset - thread_properties.kSize);
+
+ sync_thread<contraction_tp == contraction_type::no_local>(itemID);
+
+ // Tiling the Lhs block from global to local memory
+ extract_block<LHSBlockProperties, is_internal_block>(
+ lhs, tiled_input_block.lhs_scratch_extract.ptr + (db_offset * LSDL * Properties::TileSizeDimK),
+ tiled_input_block.lhs_extract_index,
+ contraction_tp == contraction_type::local ? thread_properties.mGroupOffset : thread_properties.mGlobalOffset,
+ thread_properties.kGroupOffset - thread_properties.kSize);
+
+ // itemID.barrier(cl::sycl::access::fence_space::local_space);
+ sync_thread<contraction_tp == contraction_type::local>(itemID);
+ // switch to compute mede
+ StorageIndex lhs_offset = (db_offset * LSDL * Properties::TileSizeDimK);
+ StorageIndex rhs_offset = (db_offset * Properties::TileSizeDimK * LSDR);
+ // Loop over the values of a single tile
+ for (StorageIndex k = 0; k < Properties::TileSizeDimK; k++) {
+ compute_block_per_tile(tiled_input_block.lhs_scratch_ptr_compute + lhs_offset,
+ tiled_input_block.rhs_scratch_ptr_compute + rhs_offset, privateRes);
+ lhs_offset += LSDL;
+ rhs_offset += LSDR;
+ }
+ // computing the K index for the next tile
+ thread_properties.kSize -= Properties::TileSizeDimK;
+ sync_mem(itemID, db_offset);
+ }
+
+ // when local memory is available the following compute_panel will be enabled
+ template <bool is_internal_block, typename OutPtr>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(const cl::sycl::nd_item<1> &itemID,
+ ThreadProperties<StorageIndex> &thread_properties,
+ OutPtr out_ptr) {
+ auto tiled_input_block = TiledMemory{thread_properties, scratch.get_pointer()};
+ // Allocate register space
+ PacketReturnType privateRes[Properties::WorkLoadPerThreadM * Properties::WorkLoadPerThreadN / PacketSize] = {
+ PacketReturnType{0}};
+ bool db_offset = 0;
+
+ while (thread_properties.kSize >= Properties::TileSizeDimK) {
+ compute_tile_per_panel<is_internal_block>(itemID, thread_properties, tiled_input_block, privateRes, db_offset);
+ }
+ if (thread_properties.kSize > 0) {
+ compute_tile_per_panel<false>(itemID, thread_properties, tiled_input_block, privateRes, db_offset);
+ }
+
+ // Storing the final results in the output
+ store<is_internal_block,
+ contraction_tp == contraction_type::local ? static_cast<StorageIndex>(1) : RHSBlockProperties::nc_stride>(
+ out_ptr + thread_properties.nGlobalOffset * triple_dim.M, privateRes, thread_properties.mGlobalOffset,
+ thread_properties.nGlobalOffset);
+ }
+ // When local memory is available the following extract_block will be enabled
+ template <typename InputBlockProperties, bool is_internal_block, typename Input, typename Local,
+ contraction_type contract_tp = contraction_tp>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ typename ::Eigen::internal::enable_if<contract_tp == contraction_type::local>::type
+ extract_block(const Input &inpt, Local local_ptr, const std::pair<StorageIndex, StorageIndex>& local_index,
+ const StorageIndex &ncOffset, const StorageIndex cOffset) {
+ EIGEN_CONSTEXPR StorageIndex TileSizeDimNC =
+ InputBlockProperties::is_rhs ? Properties::TileSizeDimN : Properties::TileSizeDimM;
+ EIGEN_CONSTEXPR StorageIndex LoadPerThread =
+ InputBlockProperties::is_rhs ? Properties::LoadPerThreadRhs : Properties::LoadPerThreadLhs;
+ EIGEN_CONSTEXPR StorageIndex LSD = InputBlockProperties::is_rhs ? LSDR : LSDL;
+ static_assert(((LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride) == 0) &&
+ (LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride) == 0)),
+ " LocalOffset must be divisable by stride");
+ const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M;
+ StorageIndex localThreadNC = local_index.first;
+ StorageIndex localThreadC = local_index.second;
+ auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC {
+ return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) &&
+ (NCIndex + InputBlockProperties::nc_stride - 1 < NC));
+ };
+ EIGEN_UNROLL_LOOP
+ for (StorageIndex lPT = 0; lPT < LoadPerThread / InputBlockProperties::elements_per_access; lPT++) {
+ const StorageIndex CIndex = cOffset + (InputBlockProperties::c_stride * localThreadC);
+ const StorageIndex NCIndex = ncOffset + (InputBlockProperties::nc_stride * localThreadNC);
+ const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC : triple_dim.K;
+ if (check_boundary<is_internal_block>(chk_bound(CIndex, NCIndex))) {
+ auto val =
+ read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout,
+ InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, NCIndex, CIndex, ld);
+ write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : LSD), data_source::local_mem>(
+ val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) +
+ (InputBlockProperties::c_stride * localThreadC * LSD));
+ } else {
+ EIGEN_UNROLL_LOOP
+ for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) {
+ const StorageIndex nCInd = NCIndex + (InputBlockProperties::is_coalesced_layout ? i : 0);
+ const StorageIndex cInd = CIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i);
+ OutScalar val =
+ (nCInd < NC && cInd < triple_dim.K)
+ ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>(
+ inpt, nCInd, cInd, ld)
+ : OutScalar(0);
+
+ write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : LSD), data_source::local_mem>(
+ val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) +
+ (InputBlockProperties::is_coalesced_layout ? i : 0) +
+ ((InputBlockProperties::c_stride * localThreadC +
+ (InputBlockProperties::is_coalesced_layout ? 0 : i)) *
+ LSD));
+ }
+ }
+ localThreadNC += (InputBlockProperties::is_coalesced_layout)
+ ? LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride)
+ : LocalOffset / (Properties::TileSizeDimK / InputBlockProperties::c_stride);
+ localThreadC += (InputBlockProperties::is_coalesced_layout)
+ ? LocalOffset / (TileSizeDimNC / InputBlockProperties::nc_stride)
+ : LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride);
+ }
+ }
+};
+
+#ifndef EIGEN_SYCL_DISABLE_GEMV
+
+/*!
+ * \brief GeneralVectorTensor is a template class that provides Tensor -vector contraction operation, which is a special
+ * case of Tensor Tensor contraction.
+ *
+ * \tparam OutScalar: determines the output scalar type
+ *
+ * \tparam OutAccessor: determines the sycl accessor type for out put (please see the sycl-1.2.1 specification
+ * (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition)
+ *
+ * \tparam VectorMapper: determines the tensor contraction mapper for the vector input (can be lhs or rhs)
+ *
+ * \tparam TensorMapper: determines the tensor contraction mapper for the tensor input (can be lhs or rhs)
+ *
+ * \tparam StorageIndex: determines the StorageIndex Type
+ *
+ * \tparam Properties: determines the Contraction Panel properties
+ *
+ * \tparam KFactor: determines the number of elements in K dimension in a Tile
+ *
+ * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression.
+ *
+ * \tparam is_lhs_vec: determines whether lhs is a vector or rhs is a vector
+ *
+ * \tparam IsFinal: determine if this is the final kernel. If so, the result will be written in a final output.
+ * Otherwise, the result of contraction will be written iin a temporary buffer.
+ *
+ * \param scratch: determines the local memory containing the vector block for each work-group
+ *
+ * \param vec: determines the vector input (tensor mapper)
+ *
+ * \param mat: determines the tensor input (tensor mapper)
+ *
+ * \param out_res: determines the output vector containing the contraction result
+ *
+ * \param nonContractGroupSize: a logical number determining the number of work-group for non-contracting dimension
+ *
+ * \param nonContractDim: determines the size of non contracting dimension for the flattened tensor
+ *
+ * \param contractDim: determines the size of non contracting dimension for the flattened tensor
+ *
+ */
+template <typename OutScalar, typename OutAccessor, typename VectorMapper, typename TensorMapper, typename StorageIndex,
+ typename Properties, StorageIndex KFactor, bool Vectorizable, bool is_lhs_vec, bool IsFinal>
+struct GeneralVectorTensor {
+ typedef typename Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketReturnType
+ PacketReturnType;
+ static EIGEN_CONSTEXPR int PacketSize =
+ Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketSize;
+ typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
+
+ static EIGEN_CONSTEXPR StorageIndex OutScratchOffset =
+ KFactor * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC;
+
+ // Since the access layout for a vector can always be coalesced, when LHS is a vector, we pass false and false to make
+ // sure that the !^ is true When RHS is a vector, we pass true and true to make sure that the !^ is true.
+ typedef BlockProperties<is_lhs_vec ? false : true, is_lhs_vec ? false : true, Vectorizable, PacketReturnType>
+ VecBlockProperties;
+
+ Scratch scratch;
+ const VectorMapper vec;
+ const TensorMapper mat;
+ OutAccessor out_res;
+ const StorageIndex nonContractGroupSize;
+ const StorageIndex nonContractDim;
+ const StorageIndex contractDim;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE GeneralVectorTensor(Scratch scratch_, const VectorMapper vec_,
+ const TensorMapper mat_, OutAccessor out_res_,
+ const StorageIndex nonContractGroupSize_,
+ const StorageIndex nonContractDim_,
+ const StorageIndex contractDim_)
+ : scratch(scratch_),
+ vec(vec_),
+ mat(mat_),
+ out_res(out_res_),
+ nonContractGroupSize(nonContractGroupSize_),
+ nonContractDim(nonContractDim_),
+ contractDim(contractDim_) {}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+ auto scratch_ptr = scratch.get_pointer();
+ const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
+ StorageIndex nonContractId = is_lhs_vec ? linearLocalThreadId / Properties::LocalThreadSizeC
+ : linearLocalThreadId % Properties::LocalThreadSizeNC;
+ StorageIndex contractId = is_lhs_vec ? linearLocalThreadId % Properties::LocalThreadSizeC
+ : linearLocalThreadId / Properties::LocalThreadSizeNC;
+ const StorageIndex cGroupSize = itemID.get_group_range(0) / nonContractGroupSize;
+ const StorageIndex nonContractGroupId =
+ is_lhs_vec ? itemID.get_group(0) / cGroupSize : itemID.get_group(0) % nonContractGroupSize;
+ const StorageIndex contractGroupId =
+ is_lhs_vec ? itemID.get_group(0) % cGroupSize : itemID.get_group(0) / nonContractGroupSize;
+ auto out_ptr = out_res.get_pointer() + (IsFinal ? 0 : contractGroupId * nonContractDim);
+
+ const StorageIndex nonContractGroupOffset = nonContractGroupId * Properties::TileSizeDimNC;
+ const StorageIndex contractGroupOffset = contractGroupId * Properties::TileSizeDimC;
+ auto outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC;
+ const StorageIndex globalNonContractDimOffset = nonContractGroupOffset + nonContractId;
+ const StorageIndex globalContractDimOffset = contractGroupOffset + contractId;
+ auto local_output = scratch_ptr + OutScratchOffset;
+ const bool is_internal = nonContractDim - nonContractGroupOffset >= Properties::TileSizeDimNC &&
+ contractDim - contractGroupOffset >= Properties::TileSizeDimC;
+ is_internal
+ ? compute_panel<true>(itemID, vec, mat, local_output, out_ptr,
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+ scratch_ptr, contractGroupOffset,
+#endif
+ nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId,
+ nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex)
+ : compute_panel<false>(itemID, vec, mat, local_output, out_ptr,
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+ scratch_ptr, contractGroupOffset,
+#endif
+ nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId,
+ nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex);
+ }
+ template <bool is_internal_block, typename OutPtr>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(
+ const cl::sycl::nd_item<1> &itemID, const VectorMapper &vec, const TensorMapper &mat, OutScalar *local_output,
+ OutPtr out_ptr,
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+ OutScalar *scratch_ptr, const StorageIndex contractGroupOffset,
+#endif
+ const StorageIndex nonContractGroupOffset, const StorageIndex linearLocalThreadId, StorageIndex contractDim,
+ StorageIndex nonContractDim, StorageIndex contractId, StorageIndex nonContractId,
+ StorageIndex globalContractDimOffset, StorageIndex globalNonContractDimOffset, StorageIndex outScratchIndex) {
+ OutScalar outScalar[Properties::WorkLoadPerThreadNC] = {OutScalar(0)};
+ // Reading the vector
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+ const StorageIndex vectorOffset = contractGroupOffset + linearLocalThreadId;
+ extract_block<VecBlockProperties, is_internal_block, KFactor,
+ Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC>(vec, scratch_ptr, linearLocalThreadId,
+ vectorOffset, contractDim);
+
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+ auto in_scratch_ptr = scratch_ptr + contractId;
+#endif
+
+ StorageIndex privateOffsetC = 0;
+ EIGEN_UNROLL_LOOP
+ for (StorageIndex i = 0; i < Properties::WorkLoadPerThreadC; i++) {
+ StorageIndex privateOffsetNC = 0;
+ bool contract_conds = ((globalContractDimOffset + privateOffsetC) < contractDim);
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+ auto vecScalar = *in_scratch_ptr;
+#else
+ auto vecScalar = (check_boundary<is_internal_block>(contract_conds))
+ ? vec(is_lhs_vec ? StorageIndex(0) : globalContractDimOffset + privateOffsetC,
+ is_lhs_vec ? globalContractDimOffset + privateOffsetC : StorageIndex(0))
+ : OutScalar(0);
+#endif
+ EIGEN_UNROLL_LOOP
+ for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
+ auto matScalar = (check_boundary<is_internal_block>(
+ contract_conds && ((globalNonContractDimOffset + privateOffsetNC) < nonContractDim)))
+ ? mat(is_lhs_vec ? globalContractDimOffset + privateOffsetC
+ : globalNonContractDimOffset + privateOffsetNC,
+ is_lhs_vec ? globalNonContractDimOffset + privateOffsetNC
+ : globalContractDimOffset + privateOffsetC)
+ : OutScalar(0);
+
+ outScalar[j] = cl::sycl::mad(matScalar, vecScalar, outScalar[j]);
+ privateOffsetNC += Properties::LocalThreadSizeNC;
+ }
+ privateOffsetC += Properties::LocalThreadSizeC;
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+ in_scratch_ptr += Properties::LocalThreadSizeC;
+#endif
+ }
+
+ auto out_scratch_ptr = local_output + outScratchIndex;
+ // Each block of 16*16 element in shared memory should reduce to 16*1
+ EIGEN_UNROLL_LOOP
+ for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
+ *out_scratch_ptr = outScalar[j];
+
+ out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
+ }
+ if (is_lhs_vec) {
+ nonContractId = linearLocalThreadId % Properties::LocalThreadSizeNC;
+ contractId = linearLocalThreadId / Properties::LocalThreadSizeNC;
+ outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC;
+ }
+
+ out_scratch_ptr = local_output + outScratchIndex;
+ EIGEN_UNROLL_LOOP
+ for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
+ EIGEN_UNROLL_LOOP
+ for (StorageIndex offset = Properties::LocalThreadSizeC >> 1; offset > 0; offset >>= 1) {
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+ if (contractId < offset) {
+ StorageIndex myNeigbourId = (Properties::LocalThreadSizeNC * offset);
+ *out_scratch_ptr += out_scratch_ptr[myNeigbourId];
+ }
+ }
+ // moving to next 16 by 16 block
+ out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
+ }
+
+ if (contractId == 0) {
+ out_scratch_ptr = local_output + nonContractId;
+ StorageIndex global_final_offset = nonContractGroupOffset + nonContractId;
+ out_ptr += global_final_offset;
+ EIGEN_UNROLL_LOOP
+ for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
+ if (check_boundary<is_internal_block>(global_final_offset < nonContractDim)) {
+ auto res = *out_scratch_ptr;
+
+ *out_ptr = res;
+ out_ptr += Properties::LocalThreadSizeNC;
+ }
+ // moving to next 16 by 16 block to ge the next 16 reduced elements
+ out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
+ if (!(is_internal_block)) global_final_offset += Properties::LocalThreadSizeNC;
+ }
+ }
+ }
+
+ template <typename InputBlockProperties, bool is_internal_block, int CFactor, int GroupSize, typename Input,
+ typename Local>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void extract_block(const Input &inpt, Local *local_ptr,
+ const StorageIndex &linearLocalThreadId,
+ const StorageIndex &cOffset, const StorageIndex &C) {
+ local_ptr += InputBlockProperties::c_stride * linearLocalThreadId;
+ StorageIndex cIndex = cOffset;
+ for (StorageIndex cId = 0; cId < CFactor / InputBlockProperties::c_stride; cId++) {
+ if (check_boundary<is_internal_block>(cIndex + InputBlockProperties::c_stride - 1 < C)) {
+ auto val = read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout,
+ InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, StorageIndex(0),
+ cIndex, StorageIndex(1));
+ write<StorageIndex, 1, data_source::local_mem>(val, local_ptr);
+ } else {
+ EIGEN_UNROLL_LOOP
+ for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) {
+ OutScalar val =
+ (cIndex + i < C)
+ ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>(
+ inpt, StorageIndex(0), cIndex + i, StorageIndex(1))
+ : OutScalar(0);
+ write<StorageIndex, 1, data_source::local_mem>(val, local_ptr + i);
+ }
+ }
+ local_ptr += InputBlockProperties::c_stride * GroupSize;
+ cIndex += InputBlockProperties::c_stride * GroupSize;
+ }
+ }
+};
+#endif
+
+#ifndef EIGEN_SYCL_DISABLE_SCALAR
+
+/*!
+ * \brief GeneralScalarContraction is a template class that provides the scalar value of Tensor -Tensor contraction
+ * operation, when all the dimensions are contracting dimensions. This Kernel reduces two tensors to an scalar
+ *
+ * \tparam OutScalar: determines the output scalar type
+ *
+ * \tparam LhsScalar: determines the left-hand-side scalar type
+ *
+ * \tparam RhsScalar: determines the right-hand-side scalar type
+ *
+ * \tparam OutAccessor: determines the sycl accessor type for out put (please see the sycl-1.2.1 specification
+ * (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition)
+ *
+ * \tparam LhsMapper: determines the tensor contraction mapper type for left-hand-side matrix
+ *
+ * \tparam RhsMapper: determines the tensor contraction mapper type for right-hand-side matrix
+ *
+ * \tparam StorageIndex: determines the StorageIndex Type
+ *
+ * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression.
+ *
+ * \param scratch: local memory containing tiles of LHS and RHS tensors for each work-group
+ *
+ * \param lhs: determines the left-hand-side flattened tensor (tensor mapper)
+ *
+ * \param rhs: determines the right-hand-side flattened tensor (tensor mapper)
+ *
+ * \param out_res: determines the output tensor containing the contraction result
+ *
+ * \param rng: determins the total input data size
+ */
+template <typename OutScalar, typename LhsScalar, typename RhsScalar, typename OutAccessor, typename LhsMapper,
+ typename RhsMapper, typename StorageIndex, bool Vectorizable>
+struct GeneralScalarContraction {
+ typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
+ Scratch scratch;
+ const LhsMapper lhs;
+ const RhsMapper rhs;
+ OutAccessor out_res;
+ const StorageIndex rng;
+
+ EIGEN_DEVICE_FUNC
+ GeneralScalarContraction(Scratch scratch_, const LhsMapper lhs_, const RhsMapper rhs_, OutAccessor out_res_,
+ const StorageIndex rng_)
+ : scratch(scratch_), lhs(lhs_), rhs(rhs_), out_res(out_res_), rng(rng_) {}
+
+ EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) {
+ auto out_ptr = out_res.get_pointer();
+ auto scratch_ptr = scratch.get_pointer().get();
+
+ StorageIndex globalid = itemID.get_global_id(0);
+ StorageIndex localid = itemID.get_local_id(0);
+ OutScalar accumulator = OutScalar(0);
+ for (StorageIndex i = globalid; i < rng; i += itemID.get_global_range(0)) {
+ accumulator = cl::sycl::mad(lhs(0, i), rhs(i, 0), accumulator);
+ }
+ auto out_scratch_ptr = scratch_ptr + localid;
+ *out_scratch_ptr = accumulator;
+ for (StorageIndex offset = itemID.get_local_range(0) >> 1; offset > 0; offset >>= 1) {
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+ if (localid < offset) {
+ *out_scratch_ptr = (accumulator += out_scratch_ptr[offset]);
+ }
+ }
+ if (localid == 0) {
+ out_ptr[itemID.get_group(0)] = accumulator;
+ }
+ }
+};
+#endif
+
+} // namespace internal
+} // namespace TensorSycl
+
+template <typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
+struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>,
+ Eigen::SyclDevice>
+ : public TensorContractionEvaluatorBase<TensorEvaluator<
+ const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Eigen::SyclDevice>> {
+ static_assert(std::is_same<OutputKernelType, const NoOpOutputKernel>::value,
+ "SYCL tensor contraction does not support output kernels.");
+
+ typedef Eigen::SyclDevice Device;
+
+ typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self;
+ typedef TensorContractionEvaluatorBase<Self> Base;
+ typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
+ typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+ typedef typename XprType::Index StorageIndex;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ typedef typename Base::Storage Storage;
+ typedef typename Base::EvaluatorPointerType EvaluatorPointerType;
+ struct TripleDim {
+ const StorageIndex M;
+ const StorageIndex N;
+ const StorageIndex K;
+ TripleDim(const StorageIndex M_, const StorageIndex N_, const StorageIndex K_) : M(M_), N(N_), K(K_) {}
+ };
+ enum {
+ Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+ PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
+ BlockAccess = false,
+ };
+
+ static EIGEN_CONSTEXPR int LDims = Base::LDims;
+ static EIGEN_CONSTEXPR int RDims = Base::RDims;
+ static EIGEN_CONSTEXPR int ContractDims = Base::ContractDims;
+
+ typedef array<StorageIndex, LDims> left_dim_mapper_t;
+ typedef array<StorageIndex, RDims> right_dim_mapper_t;
+
+ typedef array<StorageIndex, ContractDims> contract_t;
+ typedef array<StorageIndex, LDims - ContractDims> left_nocontract_t;
+ typedef array<StorageIndex, RDims - ContractDims> right_nocontract_t;
+
+ static const int NumDims = LDims + RDims - 2 * ContractDims;
+
+ typedef DSizes<StorageIndex, NumDims> Dimensions;
+
+ typedef TensorEvaluator<typename Base::EvalLeftArgType, Device> LeftEvaluator;
+ typedef TensorEvaluator<typename Base::EvalRightArgType, Device> RightEvaluator;
+ typedef typename Eigen::internal::remove_const<typename LeftEvaluator::CoeffReturnType>::type LhsScalar;
+ typedef typename Eigen::internal::remove_const<typename RightEvaluator::CoeffReturnType>::type RhsScalar;
+
+ typedef typename LeftEvaluator::Dimensions LeftDimensions;
+ typedef typename RightEvaluator::Dimensions RightDimensions;
+
+ template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered>
+ struct input_mapper_propertis {
+ static EIGEN_CONSTEXPR bool is_lhs_matrix = (LDims == 2 && ContractDims == 1) || lhs_inner_dim_contiguous;
+ static EIGEN_CONSTEXPR bool is_rhs_matrix =
+ (RDims == 2 && ContractDims == 1) || (rhs_inner_dim_contiguous && !rhs_inner_dim_reordered);
+ };
+
+ TensorEvaluator(const XprType &op, const Device &device) : Base(op, device) {}
+
+ // We need to redefine this method to make nvcc happy
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(typename Base::EvaluatorPointerType data) {
+ this->m_leftImpl.evalSubExprsIfNeeded(NULL);
+ this->m_rightImpl.evalSubExprsIfNeeded(NULL);
+ if (!data) {
+ this->m_result = this->m_device.get(
+ static_cast<Scalar *>(this->m_device.allocate_temp(this->dimensions().TotalSize() * sizeof(Scalar))));
+ data = this->m_result;
+ }
+ evalToSycl(data);
+ return (this->m_result != NULL);
+ }
+ const Eigen::SyclDevice &device() const { return this->m_device; }
+ void evalToSycl(typename Base::EvaluatorPointerType buffer) const {
+ if (this->m_lhs_inner_dim_contiguous) {
+ if (this->m_rhs_inner_dim_contiguous) {
+ if (this->m_rhs_inner_dim_reordered) {
+ evalTyped<true, true, true, Unaligned>(buffer);
+ } else {
+ evalTyped<true, true, false, Unaligned>(buffer);
+ }
+ } else {
+ if (this->m_rhs_inner_dim_reordered) {
+ evalTyped<true, false, true, Unaligned>(buffer);
+ } else {
+ evalTyped<true, false, false, Unaligned>(buffer);
+ }
+ }
+ } else {
+ if (this->m_rhs_inner_dim_contiguous) {
+ if (this->m_rhs_inner_dim_reordered) {
+ evalTyped<false, true, true, Unaligned>(buffer);
+ } else {
+ evalTyped<false, true, false, Unaligned>(buffer);
+ }
+ } else {
+ if (this->m_rhs_inner_dim_reordered) {
+ evalTyped<false, false, true, Unaligned>(buffer);
+ } else {
+ evalTyped<false, false, false, Unaligned>(buffer);
+ }
+ }
+ }
+ }
+
+ template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+ void evalTyped(typename Base::EvaluatorPointerType buffer) const {
+ const auto triple_dim = TripleDim{this->m_i_size, this->m_j_size, this->m_k_size};
+ typedef internal::TensorContractionInputMapper<
+ LhsScalar, StorageIndex, internal::Lhs, LeftEvaluator, left_nocontract_t, contract_t,
+ PacketType<CoeffReturnType, Device>::size, lhs_inner_dim_contiguous, false, Unaligned, MakeSYCLPointer>
+ LhsMapper;
+
+ typedef internal::TensorContractionInputMapper<RhsScalar, StorageIndex, internal::Rhs, RightEvaluator,
+ right_nocontract_t, contract_t,
+ PacketType<CoeffReturnType, Device>::size, rhs_inner_dim_contiguous,
+ rhs_inner_dim_reordered, Unaligned, MakeSYCLPointer>
+ RhsMapper;
+
+ // initialize data mappers
+ LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
+ this->m_left_contracting_strides, this->m_k_strides);
+
+ RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
+ this->m_right_contracting_strides, this->m_k_strides);
+
+#ifndef EIGEN_SYCL_DISABLE_SCALAR
+ if (triple_dim.M == 1 && triple_dim.N == 1) {
+ launchSC(buffer, lhs, rhs, triple_dim.K);
+ } else
+#endif
+#ifndef EIGEN_SYCL_DISABLE_GEMV
+ if (triple_dim.M != 1 && triple_dim.N == 1) {
+ LaunchVT<false>(buffer, rhs, lhs, triple_dim.M, triple_dim.K);
+ } else if (triple_dim.M == 1 && triple_dim.N != 1) {
+ LaunchVT<true>(buffer, lhs, rhs, triple_dim.N, triple_dim.K);
+ } else // This is equivalent of if (m!=1 && n!=1)
+#endif
+ {
+ typedef input_mapper_propertis<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered>
+ inpt_mapper_properties;
+#ifndef EIGEN_SYCL_DISABLE_SKINNY
+ bool skinny = false;
+ auto platform_name = this->device().getPlatformName();
+ // This is based on empirical calculation for AMD r9-nano and Fiji
+ if (platform_name.find("AMD") == 0) {
+ skinny = (triple_dim.M < triple_dim.K || triple_dim.N < triple_dim.K) &&
+ ((triple_dim.M < 1024 && triple_dim.N < 1024) ||
+ (uint64_t(triple_dim.M * triple_dim.N) < uint64_t(triple_dim.K)));
+ } else {
+ skinny = (((std::max(triple_dim.K, triple_dim.N) / std::min(triple_dim.K, triple_dim.N)) > 100) ||
+ ((std::max(triple_dim.K, triple_dim.M) / std::min(triple_dim.K, triple_dim.M)) > 100) ||
+ ((std::max(triple_dim.N, triple_dim.M) / std::min(triple_dim.N, triple_dim.M)) > 100));
+ }
+ if (skinny)
+ adjustTT<true, inpt_mapper_properties>(buffer, lhs, rhs, triple_dim);
+ else
+#endif // EIGEN_SYCL_DISABLE_SKINNY
+ adjustTT<false, inpt_mapper_properties>(buffer, lhs, rhs, triple_dim);
+ }
+ }
+
+ template <bool skinny, typename input_mapper_properties, typename LhsMapper, typename RhsMapper>
+ void EIGEN_ALWAYS_INLINE adjustTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs,
+ const TripleDim &triple_dim) const {
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+ if (device().has_local_memory()) {
+ typedef TensorSycl::internal::TTPanelSize<CoeffReturnType, StorageIndex, 4, 4, 16> PanelParameters;
+ launchTT<TensorSycl::internal::contraction_type::local, skinny, input_mapper_properties, PanelParameters>(
+ buffer, lhs, rhs, triple_dim);
+ }
+#endif
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF
+ if (!(device().has_local_memory())) {
+ typedef TensorSycl::internal::TTPanelSize<CoeffReturnType, StorageIndex, 4, 4, 4> PanelParameters;
+ launchTT<TensorSycl::internal::contraction_type::no_local, skinny, input_mapper_properties, PanelParameters>(
+ buffer, lhs, rhs, triple_dim);
+ }
+#endif
+ }
+
+ template <TensorSycl::internal::contraction_type ct, bool skinny, typename input_mapper_properties,
+ typename Properties, typename LhsMapper, typename RhsMapper>
+ void launchTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs,
+ const TripleDim &triple_dim) const {
+ const StorageIndex roundUpM = Eigen::TensorSycl::internal::roundUp(triple_dim.M, Properties::TileSizeDimM);
+ const StorageIndex roundUpN = Eigen::TensorSycl::internal::roundUp(triple_dim.N, Properties::TileSizeDimN);
+ const StorageIndex groupSizeM = roundUpM / Properties::TileSizeDimM;
+ const StorageIndex groupSizeN = roundUpN / Properties::TileSizeDimN;
+
+ const StorageIndex roundUpK = Eigen::TensorSycl::internal::roundUp(triple_dim.K, Properties::TileSizeDimK);
+ StorageIndex totalTilesK = roundUpK / Properties::TileSizeDimK;
+ StorageIndex groupSizeK =
+ skinny
+ ? std::max(std::min(totalTilesK,
+ (StorageIndex)(device().getPowerOfTwo(device().getNumSyclMultiProcessors(), true) * 4) /
+ (groupSizeM * groupSizeN)),
+ StorageIndex(1))
+ : StorageIndex(1);
+
+ const StorageIndex numTilesPerGroup = Eigen::TensorSycl::internal::roundUp(totalTilesK, groupSizeK) / groupSizeK;
+
+ const StorageIndex totalGroupSize = groupSizeM * groupSizeN * groupSizeK;
+
+ const StorageIndex localRange = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN;
+ const StorageIndex globalRange = totalGroupSize * localRange;
+
+ const StorageIndex scratchSize = (ct == TensorSycl::internal::contraction_type::local)
+ ? ((Properties::DoubleBuffer + 1) *
+ (Properties::TileSizeDimM + Properties::BC) * (Properties::TileSizeDimK)) +
+ ((Properties::DoubleBuffer + 1) * (Properties::TileSizeDimK) *
+ (Properties::TileSizeDimN + Properties::BC))
+ : StorageIndex(1);
+
+ auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
+ if (groupSizeK == 1) {
+ typedef TensorSycl::internal::TensorContractionKernel<CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType,
+ LhsMapper, RhsMapper, StorageIndex, Properties, TripleDim,
+ PacketAccess, input_mapper_properties, true, ct>
+ ContractKernelName;
+ device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
+ lhs, rhs, buffer, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, triple_dim);
+ } else {
+ typedef TensorSycl::internal::TensorContractionKernel<CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType,
+ LhsMapper, RhsMapper, StorageIndex, Properties, TripleDim,
+ PacketAccess, input_mapper_properties, false, ct>
+ ContractKernelName;
+ CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
+ device().allocate_temp(triple_dim.M * triple_dim.N * groupSizeK * sizeof(CoeffReturnType)));
+ EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
+
+ device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
+ lhs, rhs, tmp_global_accessor, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup,
+ triple_dim);
+
+ typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
+ auto op = Op();
+ typedef TensorSycl::internal::SecondStepPartialReduction<CoeffReturnType, StorageIndex, EvaluatorPointerType,
+ EvaluatorPointerType, Op>
+ ReductionKernel;
+
+ device().template unary_kernel_launcher<CoeffReturnType, ReductionKernel>(
+ tmp_global_accessor, buffer,
+ cl::sycl::nd_range<1>(cl::sycl::range<1>(StorageIndex(
+ Eigen::TensorSycl::internal::roundUp(triple_dim.M * triple_dim.N, localRange))),
+ cl::sycl::range<1>(localRange)),
+ StorageIndex(1), op, StorageIndex(triple_dim.M * triple_dim.N), groupSizeK);
+
+ device().deallocate_temp(temp_pointer);
+ }
+ }
+
+#ifndef EIGEN_SYCL_DISABLE_GEMV
+ template <bool is_lhs_vec, typename VectorMapper, typename TensorMapper, typename StorageIndex>
+ void EIGEN_ALWAYS_INLINE LaunchVT(EvaluatorPointerType buffer, const VectorMapper &vec, const TensorMapper &mat,
+ StorageIndex NC, StorageIndex C) const {
+ const StorageIndex nonContractDim = NC;
+ EIGEN_CONSTEXPR StorageIndex NCFactor = 1;
+ EIGEN_CONSTEXPR StorageIndex CFactor = 1;
+ EIGEN_CONSTEXPR StorageIndex NCWindow = 16;
+ typedef Eigen::TensorSycl::internal::TVPanelSize<CoeffReturnType, StorageIndex, NCWindow, CFactor, NCFactor>
+ Properties;
+ const StorageIndex roundUpC = Eigen::TensorSycl::internal::roundUp(C, Properties::TileSizeDimC);
+ const StorageIndex cNumGroups = roundUpC / (Properties::LocalThreadSizeC * Properties::WorkLoadPerThreadC);
+ const StorageIndex roundUpNC = Eigen::TensorSycl::internal::roundUp(nonContractDim, Properties::TileSizeDimNC);
+ const StorageIndex nCNumGroups = roundUpNC / (Properties::LocalThreadSizeNC * Properties::WorkLoadPerThreadNC);
+ const StorageIndex globalRange =
+ (roundUpNC / (Properties::WorkLoadPerThreadNC)) * (roundUpC / (Properties::WorkLoadPerThreadC));
+ const StorageIndex localRange = Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC;
+ const StorageIndex scratchSize =
+ (Properties::WorkLoadPerThreadNC + CFactor) * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC;
+ auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
+ if (cNumGroups > 1) {
+ typedef Eigen::TensorSycl::internal::GeneralVectorTensor<CoeffReturnType, EvaluatorPointerType, VectorMapper,
+ TensorMapper, StorageIndex, Properties, CFactor, false,
+ is_lhs_vec, false>
+ ContractKernelName;
+ CoeffReturnType *temp_pointer =
+ static_cast<CoeffReturnType *>(device().allocate_temp(nonContractDim * cNumGroups * sizeof(CoeffReturnType)));
+ EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
+
+ device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
+ vec, mat, tmp_global_accessor, thread_range, scratchSize, nCNumGroups, nonContractDim, C);
+
+ typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
+ typedef TensorSycl::internal::SecondStepPartialReduction<CoeffReturnType, StorageIndex, EvaluatorPointerType,
+ EvaluatorPointerType, Op>
+ ReductionKernel;
+
+ device().template unary_kernel_launcher<CoeffReturnType, ReductionKernel>(
+ tmp_global_accessor, buffer,
+ cl::sycl::nd_range<1>(cl::sycl::range<1>(Eigen::TensorSycl::internal::roundUp(nonContractDim, localRange)),
+ cl::sycl::range<1>(localRange)),
+ StorageIndex(1), Op(), nonContractDim, cNumGroups);
+
+ device().deallocate_temp(temp_pointer);
+ } else {
+ typedef Eigen::TensorSycl::internal::GeneralVectorTensor<CoeffReturnType, EvaluatorPointerType, VectorMapper,
+ TensorMapper, StorageIndex, Properties, CFactor, false,
+ is_lhs_vec, true>
+ ContractKernelName;
+ device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
+ vec, mat, buffer, thread_range, scratchSize, nCNumGroups, nonContractDim, C);
+ }
+ }
+#endif
+
+#ifndef EIGEN_SYCL_DISABLE_SCALAR
+ template <typename LhsMapper, typename RhsMapper>
+ EIGEN_ALWAYS_INLINE void launchSC(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs,
+ StorageIndex K) const {
+ EIGEN_STATIC_ASSERT(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
+ (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
+ "The Local thread size must be a power of 2 for the reduction "
+ "operation");
+ EIGEN_CONSTEXPR StorageIndex local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;
+
+ // Here we force the code not to be more than 2-step reduction: Our empirical research shows that if each thread
+ // reduces at least 512 elementss individually, we get better performance.
+ const StorageIndex num_work_group = ((K + (512 * local_range - 1)) / (512 * local_range) > 1 ? local_range : 1);
+ const StorageIndex global_range = num_work_group * local_range;
+
+ typedef Eigen::TensorSycl::internal::GeneralScalarContraction<
+ CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType, LhsMapper, RhsMapper, StorageIndex, false>
+ ContractKernelName;
+ auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
+ if (num_work_group > 1) {
+ CoeffReturnType *temp_pointer =
+ static_cast<CoeffReturnType *>(device().allocate_temp(num_work_group * sizeof(CoeffReturnType)));
+ EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
+ device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(lhs, rhs, tmp_global_accessor,
+ thread_range, local_range, K);
+ typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
+ typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
+ EvaluatorPointerType, StorageIndex, local_range>
+ GenericRKernel;
+ device().template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
+ tmp_global_accessor, buffer,
+ cl::sycl::nd_range<1>(cl::sycl::range<1>(local_range), cl::sycl::range<1>(local_range)), local_range, Op());
+
+ device().deallocate_temp(temp_pointer);
+ } else {
+ device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(lhs, rhs, buffer, thread_range,
+ local_range, K);
+ }
+ }
+#endif
+
+ EIGEN_STRONG_INLINE void cleanup() {
+ this->m_leftImpl.cleanup();
+ this->m_rightImpl.cleanup();
+
+ if (this->m_result) {
+ this->m_device.deallocate_temp(this->m_result);
+ this->m_result = NULL;
+ }
+ }
+ // The placeholder accessors must bound to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ this->m_leftImpl.bind(cgh);
+ this->m_rightImpl.bind(cgh);
+ this->m_result.bind(cgh);
+ }
+};
+} // namespace Eigen
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionThreadPool.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionThreadPool.h
new file mode 100644
index 0000000..21be6ea
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -0,0 +1,1679 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H
+
+// evaluator for thread pool device
+#ifdef EIGEN_USE_THREADS
+
+namespace Eigen {
+
+template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
+struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, ThreadPoolDevice> :
+ public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, ThreadPoolDevice> > {
+
+ typedef ThreadPoolDevice Device;
+
+ typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self;
+ typedef TensorContractionEvaluatorBase<Self> Base;
+
+ typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
+ typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+ typedef typename XprType::Index Index;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+
+ enum {
+ Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+ };
+
+ // Most of the code is assuming that both input tensors are ColMajor. If the
+ // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
+ // If we want to compute A * B = C, where A is LHS and B is RHS, the code
+ // will pretend B is LHS and A is RHS.
+ typedef typename internal::conditional<
+ static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
+ typedef typename internal::conditional<
+ static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
+
+ static const int LDims =
+ internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
+ static const int RDims =
+ internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
+ static const int ContractDims = internal::array_size<Indices>::value;
+
+ typedef array<Index, LDims> left_dim_mapper_t;
+ typedef array<Index, RDims> right_dim_mapper_t;
+
+ typedef array<Index, ContractDims> contract_t;
+ typedef array<Index, LDims - ContractDims> left_nocontract_t;
+ typedef array<Index, RDims - ContractDims> right_nocontract_t;
+
+ static const int NumDims = LDims + RDims - 2 * ContractDims;
+
+ typedef DSizes<Index, NumDims> Dimensions;
+
+ // typedefs needed in evalTo
+ typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
+ typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
+ typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
+
+ typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
+ typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
+
+ TensorEvaluator(const XprType& op, const Device& device) :
+ Base(op, device) {}
+
+ template <int Alignment>
+ void evalProduct(Scalar* buffer) const {
+ evalProductImpl<NoCallback, Alignment>(buffer, NoCallback());
+ }
+
+ template <typename EvalToCallback, int Alignment>
+ void evalProductAsync(Scalar* buffer, EvalToCallback done) const {
+ evalProductImpl<EvalToCallback, Alignment>(buffer, std::move(done));
+ }
+
+ template <typename DoneCallback, int Alignment>
+ void evalProductImpl(Scalar* buffer, DoneCallback done) const {
+ // This function computes a lot of heuristics in multiple steps, and it
+ // also has multiple exit points. To keep it sane, readable and all in one
+ // place, sync/async execution decision is made at runtime at the very end.
+ //
+ // (1) In sync mode we allocate Context on the stack, submit computations
+ // to the device thread pool, and block on a barrier until it is
+ // completed.
+ //
+ // (2) In async mode we allocate Context on the heap, and after all tasks
+ // are finished, we call provided the done callback, and delete a
+ // context from the heap.
+ //
+ // (*) EvalParallelContext & EvalShardedByInnerDimContext owns all the state
+ // and temporary buffers, requried for executing the tensor contraction.
+ // They are responsible for cleaning it up after contraction is done.
+ static const bool IsEvalInSyncMode =
+ std::is_same<DoneCallback, NoCallback>::value;
+
+ const Index m = this->m_i_size;
+ const Index n = this->m_j_size;
+ const Index k = this->m_k_size;
+ if (m == 0 || n == 0 || k == 0) return;
+
+ // Compute a set of algorithm parameters:
+ // - kernel block sizes (bm, bn, bk)
+ // - task grain sizes (number of kernels executed per task: gm, gn)
+ // - number of threads
+ // - sharding by row/column
+ // - parallel packing or first lhs then rhs
+ // and some derived parameters:
+ // - number of tasks (nm, nn, nk)
+ // - number of kernels (nm0, nn0)
+ // Unfortunately, all these parameters are tightly interdependent.
+ // So in some cases we first compute approximate values, then compute other
+ // values based on these approximations and then refine the approximations.
+
+ // There are lots of heuristics here. There is some reasoning behind them,
+ // but ultimately they are just tuned on contraction benchmarks for
+ // different input configurations, thread counts and instruction sets.
+ // So feel free to question any of them.
+
+ // Compute whether we want to shard by row or by column.
+ // This is a first approximation, it will be refined later. Since we don't
+ // know number of threads yet we use 2, because what's we are most
+ // interested in at this point is whether it makes sense to use
+ // parallelization at all or not.
+ bool shard_by_col = shardByCol(m, n, 2);
+
+ // First approximation of kernel blocking sizes.
+ // Again, we don't know number of threads yet, so we use 2.
+ Index bm, bn, bk;
+ if (shard_by_col) {
+ internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index,
+ internal::ShardByCol>
+ blocking(k, m, n, 2);
+ bm = blocking.mc();
+ bn = blocking.nc();
+ bk = blocking.kc();
+ } else {
+ internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index,
+ internal::ShardByRow>
+ blocking(k, m, n, 2);
+ bm = blocking.mc();
+ bn = blocking.nc();
+ bk = blocking.kc();
+ }
+
+ // Compute optimal number of threads.
+ // Note: we use bk instead of k here because we are interested in amount of
+ // _parallelizable_ computations, and computations are not parallelizable
+ // across k dimension.
+ const TensorOpCost cost =
+ contractionCost(m, n, bm, bn, bk, shard_by_col, false);
+ int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
+ static_cast<double>(n) * m, cost, this->m_device.numThreads());
+ int num_threads_by_k = numThreadsInnerDim(m, n, k);
+ if (shardByInnerDim(m, n, k, num_threads, num_threads_by_k)) {
+ // We are in the scenario where it is more effective to shard by the
+ // inner dimension.
+ if (IsEvalInSyncMode) {
+ EvalShardedByInnerDimContext<DoneCallback> ctx(
+ this, num_threads_by_k, buffer, m, n, k, std::move(done));
+ ctx.template run<Alignment>();
+ } else {
+ auto* ctx = new EvalShardedByInnerDimContext<DoneCallback>(
+ this, num_threads_by_k, buffer, m, n, k, std::move(done));
+ ctx->template runAsync<Alignment>();
+ }
+
+ return;
+ }
+
+ // TODO(dvyukov): this is a stop-gap to prevent regressions while the cost
+ // model is not tuned. Remove this when the cost model is tuned.
+ if (n == 1) num_threads = 1;
+
+ if (num_threads == 1) {
+ TENSOR_CONTRACTION_DISPATCH(this->template evalProductSequential,
+ Unaligned, (buffer));
+ if (!IsEvalInSyncMode) done();
+ return;
+ }
+
+ // Now that we know number of threads, recalculate sharding and blocking.
+ shard_by_col = shardByCol(m, n, num_threads);
+ if (shard_by_col) {
+ internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index,
+ internal::ShardByCol>
+ blocking(k, m, n, num_threads);
+ bm = blocking.mc();
+ bn = blocking.nc();
+ bk = blocking.kc();
+ } else {
+ internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index,
+ internal::ShardByRow>
+ blocking(k, m, n, num_threads);
+ bm = blocking.mc();
+ bn = blocking.nc();
+ bk = blocking.kc();
+ }
+
+ // Number of kernels for each dimension.
+ Index nm0 = divup(m, bm);
+ Index nn0 = divup(n, bn);
+ Index nk = divup(k, bk);
+
+ // Calculate task grain size (number of kernels executed per task).
+ // This task size coarsening serves two purposes:
+ // 1. It reduces per-task overheads including synchronization overheads.
+ // 2. It allows to use caches better (reuse the same packed rhs in several
+ // consecutive kernels).
+ Index gm = 1;
+ Index gn = 1;
+ // If we are sharding by column, then we prefer to reduce rows first.
+ if (shard_by_col) {
+ gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col);
+ gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col);
+ } else {
+ gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col);
+ gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col);
+ }
+ // Number of tasks in each dimension.
+ Index nm = divup(nm0, gm);
+ Index nn = divup(nn0, gn);
+
+ // If there is enough concurrency in the sharding dimension, we choose not
+ // to paralellize by the other dimension, and execute all kernels in sync
+ // mode. This reduces parallelism from the nm x nn down to nn
+ // (shard_by_col==true) or nm (shard_by_col==false).
+ const Index sharding_dim_tasks = shard_by_col ? nn : nm;
+ const int num_worker_threads = this->m_device.numThreadsInPool();
+
+ // With small number of threads we want to make sure that we do not reduce
+ // parallelism too much. With large number of threads we trade maximum
+ // parallelism for better memory locality.
+ const float oversharding_factor =
+ num_worker_threads <= 4 ? 8.0 :
+ num_worker_threads <= 8 ? 4.0 :
+ num_worker_threads <= 16 ? 2.0 :
+ num_worker_threads <= 32 ? 1.0 :
+ num_worker_threads <= 64 ? 0.8 : /* num_worker_threads > 64 */ 0.6;
+
+ const bool parallelize_by_sharding_dim_only =
+ sharding_dim_tasks >= oversharding_factor * num_worker_threads;
+
+ // Last by not least, decide whether we want to issue both lhs and rhs
+ // packing in parallel; or issue lhs packing first, and then issue rhs
+ // packing when lhs packing completes (for !shard_by_col lhs and rhs are
+ // swapped). Parallel packing allows more parallelism (for both packing and
+ // kernels), while sequential packing provides better locality (once
+ // a thread finishes rhs packing it proceed to kernels with that rhs).
+ // First, we are interested in parallel packing if there are few tasks.
+ bool parallel_pack = num_threads >= nm * nn;
+ // Also do parallel packing if all data fits into L2$.
+ if (m * bk * Index(sizeof(LhsScalar)) + n * bk * Index(sizeof(RhsScalar)) <=
+ l2CacheSize() * num_threads)
+ parallel_pack = true;
+ // But don't do it if we will use each rhs only once. Locality seems to be
+ // more important in this case.
+ if ((shard_by_col ? nm : nn) == 1) parallel_pack = false;
+ // Also don't get in the way of parallelize_by_sharding_dim_only
+ // optimization.
+ if (parallelize_by_sharding_dim_only) parallel_pack = false;
+
+ // TODO(ezhulnev): With if contexpr we don't need SyncEvalParallelContext.
+ if (IsEvalInSyncMode) {
+#define CONTEXT_ARGS \
+ (this, num_threads, buffer, m, n, k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, \
+ nn0, shard_by_col, parallel_pack, parallelize_by_sharding_dim_only, \
+ NoCallback()) \
+ .run()
+ TENSOR_CONTRACTION_DISPATCH(SyncEvalParallelContext, Alignment,
+ CONTEXT_ARGS);
+#undef CONTEXT_ARGS
+
+ } else {
+#define CONTEXT_ARGS \
+ (this, num_threads, buffer, m, n, k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, \
+ nn0, shard_by_col, parallel_pack, parallelize_by_sharding_dim_only, \
+ std::move(done))
+ TENSOR_CONTRACTION_ASYNC_DISPATCH(EvalParallelContext, DoneCallback,
+ Alignment, CONTEXT_ARGS, run());
+#undef CONTEXT_ARGS
+ }
+ }
+
+ // ------------------------------------------------------------------------ //
+
+ // Dummy struct to represent an empty DoneCallback.
+
+ struct NoCallback {
+ void operator()() {
+ eigen_assert(false && "NoCallback should never be called");
+ }
+ };
+
+ // ------------------------------------------------------------------------ //
+
+ template <typename DoneCallback, typename Context>
+ class EvalParallelNotification;
+
+ // Synchronous evaluation notification that blocks caller thread in Wait().
+ template <typename Context>
+ class EvalParallelNotification<NoCallback, Context> {
+ public:
+ EvalParallelNotification(Context*, NoCallback) {}
+ void Notify() { done_.Notify(); }
+ void Wait() { done_.Wait(); }
+ private:
+ Eigen::Notification done_;
+ };
+
+ // Asynchronous evaluation notification that does not block in Wait().
+ template <typename DoneCallback, typename Context>
+ class EvalParallelNotification {
+ public:
+ EvalParallelNotification(Context* ctx, DoneCallback done)
+ : ctx_(ctx), done_(std::move(done)) {}
+
+ void Notify() {
+ // Make a copy of done callback, because it will be destructed when we
+ // will delete context in the next line (EvalParallelNotification is a
+ // data member of EvalParallelContext class).
+ DoneCallback done_copy = std::move(done_);
+
+ // Delete parallel evaluation context.
+ delete ctx_;
+
+ // Now safely call the done callback.
+ done_copy();
+ }
+
+ void Wait() {}
+
+ private:
+ Context* ctx_;
+ DoneCallback done_;
+ };
+
+ // Context orchestrates sync/async parallel contraction evaluation. When it is
+ // executed in asynchronous mode, it owns all the shared state that might be
+ // accessible by block packing and kernel tasks.
+
+ template <typename DoneCallback, bool lhs_inner_dim_contiguous,
+ bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered,
+ int Alignment>
+ class EvalParallelContext {
+ public:
+ typedef internal::TensorContractionInputMapper<
+ LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t,
+ contract_t, internal::packet_traits<LhsScalar>::size,
+ lhs_inner_dim_contiguous, false, Unaligned>
+ LhsMapper;
+ typedef internal::TensorContractionInputMapper<
+ RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t,
+ contract_t, internal::packet_traits<RhsScalar>::size,
+ rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned>
+ RhsMapper;
+
+ typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+
+ typedef internal::TensorContractionKernel<
+ Scalar, LhsScalar, RhsScalar, Index, OutputMapper, LhsMapper, RhsMapper>
+ TensorContractionKernel;
+
+ typedef typename TensorContractionKernel::LhsBlock LhsBlock;
+ typedef typename TensorContractionKernel::RhsBlock RhsBlock;
+ typedef typename TensorContractionKernel::BlockMemHandle BlockMemHandle;
+
+ EvalParallelContext(const Self* self, int num_threads, Scalar* buffer,
+ Index tm, Index tn, Index tk, Index bm, Index bn,
+ Index bk, Index nm, Index nn, Index nk, Index gm,
+ Index gn, Index nm0, Index nn0, bool shard_by_col,
+ bool parallel_pack,
+ bool parallelize_by_sharding_dim_only,
+ DoneCallback done)
+ : created_by_thread_id_(std::this_thread::get_id()),
+ done_(this, std::move(done)),
+ device_(self->m_device),
+ lhs_(self->m_leftImpl, self->m_left_nocontract_strides,
+ self->m_i_strides, self->m_left_contracting_strides,
+ self->m_k_strides),
+ rhs_(self->m_rightImpl, self->m_right_nocontract_strides,
+ self->m_j_strides, self->m_right_contracting_strides,
+ self->m_k_strides),
+ buffer_(buffer),
+ output_(buffer, tm),
+ output_kernel_(self->m_output_kernel),
+ tensor_contraction_params_(self->m_tensor_contraction_params),
+ num_threads_(num_threads),
+ shard_by_col_(shard_by_col),
+ parallel_pack_(parallel_pack),
+ parallelize_by_sharding_dim_only_(parallelize_by_sharding_dim_only),
+ m_(tm),
+ n_(tn),
+ k_(tk),
+ bm_(bm),
+ bn_(bn),
+ bk_(bk),
+ nm_(nm),
+ nn_(nn),
+ nk_(nk),
+ gm_(gm),
+ gn_(gn),
+ nm0_(nm0),
+ nn0_(nn0),
+ kernel_(m_, k_, n_, bm_, bk_, bn_),
+ num_thread_local_allocations_(0),
+ // We reserve 2X more capacity for a thread local values, than the
+ // number of threads in the pool to efficiently handle task stealing
+ // by threads that are not managed by the pool.
+ thread_local_capacity(2 * (parallelize_by_sharding_dim_only_
+ ? device_.numThreadsInPool()
+ : 0)),
+ // We will use only one of the Lhs/Rhs thread local storage depending
+ // on the shard_by_col value and we parallelize by sharding dim ONLY.
+ lhs_thread_local_blocks_(shard_by_col_ ? 0 : thread_local_capacity,
+ {*this}, {*this}),
+ rhs_thread_local_blocks_(shard_by_col_ ? thread_local_capacity : 0,
+ {*this}, {*this}) {
+ // These two options are mutually exclusive.
+ eigen_assert(!(parallel_pack && parallelize_by_sharding_dim_only));
+
+ for (Index x = 0; x < P; x++) {
+ // Normal number of notifications for k slice switch is
+ // nm_ + nn_ + nm_ * nn_. However, first P - 1 slices will receive only
+ // nm_ + nn_ notifications, because they will not receive notifications
+ // from preceding kernels.
+ state_switch_[x] =
+ x == 0
+ ? 1
+ : (parallel_pack_ ? nn_ + nm_ : (shard_by_col_ ? nn_ : nm_)) +
+ (x == P - 1 ? nm_ * nn_ : 0);
+ state_packing_ready_[x] =
+ parallel_pack_ ? 0 : (shard_by_col_ ? nm_ : nn_);
+ state_kernel_[x] = new std::atomic<uint8_t>*[nm_];
+ for (Index m = 0; m < nm_; m++) {
+ state_kernel_[x][m] = new std::atomic<uint8_t>[nn_];
+ // Kernels generally receive 3 notifications (previous kernel + 2
+ // packing), but the first slice won't get notifications from previous
+ // kernels.
+ for (Index n = 0; n < nn_; n++)
+ state_kernel_[x][m][n].store(
+ (x == 0 ? 0 : 1) + (parallel_pack_ ? 2 : 1),
+ std::memory_order_relaxed);
+ }
+ }
+
+ // Allocate memory for packed rhs/lhs matrices.
+ packed_mem_ = kernel_.allocateSlices( //
+ device_, //
+ /*num_lhs=*/nm0_, //
+ /*num_rhs=*/nn0_, //
+ /*num_slices=*/std::min<Index>(nk_, P - 1), //
+ packed_lhs_, packed_rhs_);
+
+ if (parallelize_by_sharding_dim_only_) {
+ const int num_worker_threads = device_.numThreadsInPool();
+
+ if (shard_by_col) {
+ can_use_thread_local_packed_ = new std::atomic<bool>[nn_];
+ for (int i = 0; i < nn_; ++i)
+ can_use_thread_local_packed_[i].store(true,
+ std::memory_order_relaxed);
+
+ Index num_blocks = num_worker_threads * gn_;
+ thread_local_pre_alocated_mem_ = kernel_.allocateSlices( //
+ device_, //
+ /*num_lhs=*/0, //
+ /*num_rhs=*/num_blocks, //
+ /*num_slices=*/1, //
+ /*lhs_blocks=*/nullptr, &rhs_thread_local_pre_allocated_);
+
+ } else {
+ can_use_thread_local_packed_ = new std::atomic<bool>[nm_];
+ for (int i = 0; i < nm_; ++i)
+ can_use_thread_local_packed_[i].store(true,
+ std::memory_order_relaxed);
+
+ Index num_blocks = num_worker_threads * gm_;
+ thread_local_pre_alocated_mem_ = kernel_.allocateSlices( //
+ device_, //
+ /*num_lhs=*/num_blocks, //
+ /*num_rhs=*/0, //
+ /*num_slices=*/1, &lhs_thread_local_pre_allocated_, //
+ /*rhs_blocks=*/nullptr);
+ }
+ }
+ }
+
+ ~EvalParallelContext() {
+ for (Index x = 0; x < P; x++) {
+ for (Index m = 0; m < nm_; m++) delete[] state_kernel_[x][m];
+ delete[] state_kernel_[x];
+ }
+ kernel_.deallocate(device_, packed_mem_);
+ if (parallelize_by_sharding_dim_only_) {
+ kernel_.deallocate(device_, thread_local_pre_alocated_mem_);
+ delete[] can_use_thread_local_packed_;
+ }
+ }
+
+ void run() {
+ // Kick off packing of the first slice.
+ signal_switch(0, 1);
+
+ // Wait for overall completion.
+ //
+ // If parallel evaluation is executed in async mode, this is a no-op, and
+ // Wait() will return immediately. In synchronous mode it will block the
+ // caller thread until it will receive notification from last task.
+ //
+ // In async mode, last task when completed will call done callback from
+ // the same thread, and will delete this context.
+ //
+ // TODO(dvyukov): This wait can lead to deadlock if contraction is
+ // evaluated in synchronous mode. If nthreads contractions are
+ // concurrently submitted from worker threads, this wait will block all
+ // worker threads and the system will deadlock.
+ done_.Wait();
+ }
+
+ private:
+ std::thread::id created_by_thread_id_;
+
+ // This notification is specialized on the type of DoneCallback and can be
+ // blocking or non-blocking.
+ EvalParallelNotification<DoneCallback, EvalParallelContext> done_;
+
+ const Device& device_;
+ LhsMapper lhs_;
+ RhsMapper rhs_;
+ Scalar* const buffer_;
+ OutputMapper output_;
+ OutputKernelType output_kernel_;
+ TensorContractionParams tensor_contraction_params_;
+ const int num_threads_;
+ const bool shard_by_col_;
+ const bool parallel_pack_;
+ const bool parallelize_by_sharding_dim_only_;
+ // Matrix sizes.
+ const Index m_;
+ const Index n_;
+ const Index k_;
+ // Block sizes.
+ const Index bm_;
+ const Index bn_;
+ const Index bk_;
+ // Number of tasks.
+ const Index nm_;
+ const Index nn_;
+ const Index nk_;
+ // Task grain sizes (number of kernels executed per task).
+ const Index gm_;
+ const Index gn_;
+ // Number of blocks (this is different from ni_/nn_ because of task size
+ // coarsening).
+ const Index nm0_;
+ const Index nn0_;
+ // Tensor contraction kernel.
+ TensorContractionKernel kernel_;
+
+ // Parallelization strategy.
+ //
+ // Blocks related to the same k block can run in parallel because they write
+ // to different output blocks. So we parallelize within k slices, this
+ // gives us parallelism level of m x n. Before we can start any kernels
+ // related to k-th slice, we need to issue m lhs packing tasks and n rhs
+ // packing tasks.
+ //
+ // However, there is a bottleneck when we are finishing kernels for k-th
+ // slice (at the very end there is only 1 runnable kernel). To mitigate this
+ // bottleneck we allow kernels from k-th and k+1-th slices to run in
+ // parallel. Note that (m, n, k) and (m, n, k+1) kernels write to the same
+ // output block, so they must not run in parallel.
+ //
+ // This gives us the following dependency graph.
+ // On each k slice we have m x n kernel tasks, m lhs paking tasks and n rhs
+ // packing tasks.
+ // Kernel (m, n, k) can start when:
+ // - kernel (m, n, k-1) has finished
+ // - lhs packing (m, k) has finished
+ // - rhs packing (n, k) has finished
+ // Lhs/rhs packing can start when:
+ // - all k-1 packing has finished (artificially imposed to limit amount of
+ // parallel packing)
+ //
+ // On top of that we limit runnable tasks to two consecutive k slices.
+ // This is done to limit amount of memory we need for packed lhs/rhs
+ // (for each k slice we need m*bk + n*bk memory in packed_lhs_/packed_rhs_).
+ //
+ // state_switch_ tracks when we are ready to switch to the next k slice.
+ // state_kernel_[m][n] tracks when we are ready to kick off kernel (m, n).
+ // These variable are rolling over 3 consecutive k slices: first two we are
+ // actively executing + one to track completion of kernels in the second
+ // slice.
+ static const Index P = 3;
+
+ // Handle to the allocated temporary storage for Lhs/Rhs blocks.
+ BlockMemHandle packed_mem_;
+ std::vector<LhsBlock> packed_lhs_[P - 1];
+ std::vector<RhsBlock> packed_rhs_[P - 1];
+
+ // If we choose to parallelize only by the sharding dimension, each thread
+ // will have it's own "thead local" (not a c++ thread local storage) memory
+ // for packed_lhs or packed_rhs (shard_by_col = false of true). This memory
+ // can't be passed to a kernel that might execute on a different thread.
+ //
+ // In practice when we are ready to pack memory for the sharding dimension
+ // (rhs if shard_by_col==true) of the K-th slice, all kernels for K-1 slice
+ // already computed (99% of the time), and we can pack data into the thread
+ // local storage, and guarantee that all the kernels will be executed
+ // immediately in the same thread. This significantly increases L1 cache hit
+ // ratio and reduces pressure on the memory bus.
+ //
+ // It's still possible that kernel for the K-th slice will be ready before
+ // completion of the K-1 kernel, so we have to allocate "global" packed_lhs_
+ // and packed_rhs_ to allow kernels to be executed later on a thread
+ // different from the thread that was used for packing.
+
+ // Handle for pre-allocated thread local memory buffers.
+ BlockMemHandle thread_local_pre_alocated_mem_;
+
+ // Only one of these will be initialized depending on shard_by_col value
+ // (the size will be `num_worker_threads * num_grains_in_the_sharding_dim`).
+ std::vector<LhsBlock> lhs_thread_local_pre_allocated_;
+ std::vector<RhsBlock> rhs_thread_local_pre_allocated_;
+
+ // How many thread local blocks were already allocated.
+ std::atomic<int> num_thread_local_allocations_;
+ const int thread_local_capacity;
+
+ // We will use pre-allocated Lhs/Rhs blocks defined above, if the number of
+ // unique threads in a system is below or equal to the number of threads in
+ // a thread pool. We will fallback on dynamic memory allocation after that.
+
+ // ThreadLocalBlocks is a container for Lhs or Rhs thread local buffers. Its
+ // size is equal to the grain size in Lhs/Rhs sharding dimension.
+ template <typename BlockType>
+ class ThreadLocalBlocks {
+ public:
+ ThreadLocalBlocks() = default;
+
+ ThreadLocalBlocks(BlockType* base, size_t grain_size)
+ : is_pre_allocated_(true),
+ thread_local_pre_allocated_base_(base),
+ grain_size_(grain_size) {}
+
+ ThreadLocalBlocks(BlockMemHandle mem_handle,
+ std::vector<BlockType> blocks)
+ : is_pre_allocated_(false),
+ mem_handle_(std::move(mem_handle)),
+ blocks_(std::move(blocks)) {}
+
+ BlockType& block(int grain_index) {
+ eigen_assert(grain_index >= 0);
+ eigen_assert(static_cast<size_t>(grain_index) < size());
+ return is_pre_allocated_ ? thread_local_pre_allocated_base_[grain_index]
+ : blocks_[grain_index];
+ }
+
+ void Release(EvalParallelContext& ctx) const {
+ if (!is_pre_allocated_) {
+ ctx.kernel_.deallocate(ctx.device_, mem_handle_);
+ }
+ }
+
+ size_t size() const {
+ return is_pre_allocated_ ? grain_size_ : blocks_.size();
+ }
+
+ private:
+ bool is_pre_allocated_;
+
+ // Reuse pre-allocated thread local buffers.
+ BlockType* thread_local_pre_allocated_base_ = nullptr;
+ size_t grain_size_ = 0;
+
+ // These will be initialized only if `is_pre_allocated == false`.
+ BlockMemHandle mem_handle_{};
+ std::vector<BlockType> blocks_;
+ };
+
+ // ThreadLocalBlocksInitialize callable does custom thread local blocks
+ // initialization, and will reuse pre-allocated buffers if possible, or will
+ // dynamically allocate new memory.
+ //
+ // Lhs/Rhs blocks might be of the same type, so we have to pass explicitly
+ // for what side do we plan to do block allocation.
+ template <typename BlockType, bool is_rhs>
+ class ThreadLocalBlocksInitialize {
+ static constexpr bool kIsLhs =
+ !is_rhs && std::is_same<BlockType, LhsBlock>::value;
+ static const bool kIsRhs =
+ is_rhs && std::is_same<BlockType, RhsBlock>::value;
+ static_assert(kIsLhs || kIsRhs, "Unkown block type");
+
+ using Blocks = ThreadLocalBlocks<BlockType>;
+
+ public:
+ ThreadLocalBlocksInitialize(EvalParallelContext& ctx)
+ : ctx_(ctx),
+ num_worker_threads_(ctx_.device_.numThreadsInPool()) {}
+
+ void operator()(Blocks& blocks) {
+ const int n = ctx_.num_thread_local_allocations_.fetch_add(
+ 1, std::memory_order_relaxed);
+
+ if (n >= num_worker_threads_) {
+ ThreadLocalBlocksAllocator<is_rhs>::allocate(ctx_, blocks);
+ } else {
+ ThreadLocalBlocksAllocator<is_rhs>::reuse(ctx_, n, blocks);
+ }
+ }
+
+ private:
+ // NOTE(ezhulenev): Without 'if constexpr' we have to put calls to
+ // TensorContractionKernel::allocateSlices into template specializations.
+ // Also explicit specializations are not allowed at class scope in C++03,
+ // EvalCtx type parameter is just a workaround for that limitation.
+ template <bool pack_rhs, typename EvalCtx = EvalParallelContext>
+ struct ThreadLocalBlocksAllocator;
+
+ template <typename EvalCtx>
+ struct ThreadLocalBlocksAllocator</*pack_rhs=*/true, EvalCtx> {
+ static void allocate(EvalCtx& ctx, Blocks& blocks) {
+ std::vector<RhsBlock> rhs_blocks;
+ BlockMemHandle mem_handle = ctx.kernel_.allocateSlices(
+ ctx.device_,
+ /*num_lhs=*/0,
+ /*num_rhs=*/ctx.gn_,
+ /*num_slices=*/1,
+ /*lhs_blocks=*/nullptr, /*rhs_blocks=*/&rhs_blocks);
+
+ blocks = ThreadLocalBlocks<RhsBlock>(std::move(mem_handle),
+ std::move(rhs_blocks));
+ }
+
+ static void reuse(EvalCtx& ctx, int index, Blocks& blocks) {
+ RhsBlock* ptr = &ctx.rhs_thread_local_pre_allocated_[ctx.gn_ * index];
+ blocks = ThreadLocalBlocks<RhsBlock>(ptr, ctx.gn_);
+ }
+ };
+
+ template <typename EvalCtx>
+ struct ThreadLocalBlocksAllocator</*pack_rhs=*/false, EvalCtx> {
+ static void allocate(EvalCtx& ctx, Blocks& blocks) {
+ std::vector<LhsBlock> lhs_blocks;
+ BlockMemHandle mem_handle = ctx.kernel_.allocateSlices(
+ ctx.device_,
+ /*num_lhs=*/ctx.gm_,
+ /*num_rhs=*/0,
+ /*num_slices=*/1,
+ /*lhs_blocks=*/&lhs_blocks, /*rhs_blocks=*/nullptr);
+
+ blocks = ThreadLocalBlocks<LhsBlock>(std::move(mem_handle),
+ std::move(lhs_blocks));
+ }
+
+ static void reuse(EvalCtx& ctx, int index, Blocks& blocks) {
+ LhsBlock* ptr = &ctx.lhs_thread_local_pre_allocated_[ctx.gm_ * index];
+ blocks = ThreadLocalBlocks<LhsBlock>(ptr, ctx.gm_);
+ }
+ };
+
+ EvalParallelContext& ctx_;
+ const int num_worker_threads_;
+ };
+
+ template <typename BlockType>
+ class ThreadLocalBlocksRelease {
+ public:
+ using Blocks = ThreadLocalBlocks<BlockType>;
+ ThreadLocalBlocksRelease(EvalParallelContext& ctx) : ctx_(ctx) {}
+ void operator()(Blocks& blocks) { blocks.Release(ctx_); }
+
+ private:
+ EvalParallelContext& ctx_;
+ };
+
+ // ThreadLocalBlocks initialization callables.
+ using ThreadLocalLhsInit =
+ ThreadLocalBlocksInitialize<LhsBlock, /*is_rhs=*/false>;
+ using ThreadLocalRhsInit =
+ ThreadLocalBlocksInitialize<RhsBlock, /*is_rhs=*/true>;
+
+ // ThreadLocalBlocks release callables.
+ using ThreadLocalLhsRelease = ThreadLocalBlocksRelease<LhsBlock>;
+ using ThreadLocalRhsRelease = ThreadLocalBlocksRelease<RhsBlock>;
+
+ // Thread local containers for Lhs/Rhs block packs. In practice only one of
+ // them will be used, depending on the shard_by_col value.
+ Eigen::ThreadLocal<ThreadLocalBlocks<LhsBlock>, ThreadLocalLhsInit,
+ ThreadLocalLhsRelease>
+ lhs_thread_local_blocks_;
+ Eigen::ThreadLocal<ThreadLocalBlocks<RhsBlock>, ThreadLocalRhsInit,
+ ThreadLocalRhsRelease>
+ rhs_thread_local_blocks_;
+
+ // After a particular shard for Kth slice missed thread local execution
+ // opportunity (K-1 slice didn't complete kernels execution), we can no
+ // longer schedule K+1 and following slices in thread local mode, because
+ // there is no more guarantee that previous kernels were executed
+ // sequentially in the same thread (size is nn_ or nm_).
+ std::atomic<bool>* can_use_thread_local_packed_;
+
+ std::atomic<uint8_t>** state_kernel_[P];
+ // state_switch_ is frequently modified by worker threads, while other
+ // fields are read-only after constructor. Let's move it to a separate cache
+ // line to reduce cache-coherency traffic.
+ char pad_[128];
+ std::atomic<Index> state_packing_ready_[P];
+ std::atomic<Index> state_switch_[P];
+
+ LhsBlock& packed_lhs(Index m, Index k, Index m1, bool use_thread_local) {
+ if (use_thread_local) {
+ eigen_assert(!shard_by_col_);
+ ThreadLocalBlocks<LhsBlock>& blocks = lhs_thread_local_blocks_.local();
+
+ Index grain_index = m1 - m * gm_;
+ return blocks.block(internal::convert_index<int>(grain_index)); // FIXME better make ThreadLocalBlocks use Eigen::Index?
+ } else {
+ return packed_lhs_[k % (P - 1)][m1];
+ }
+ }
+
+ RhsBlock& packed_rhs(Index n, Index k, Index n1, bool use_thread_local) {
+ if (use_thread_local) {
+ eigen_assert(shard_by_col_);
+ ThreadLocalBlocks<RhsBlock>& blocks = rhs_thread_local_blocks_.local();
+
+ Index grain_index = n1 - n * gn_;
+ return blocks.block(internal::convert_index<int>(grain_index)); // FIXME better make ThreadLocalBlocks use Eigen::Index?
+ } else {
+ return packed_rhs_[k % (P - 1)][n1];
+ }
+ }
+
+ // In following two methods (pack_lhs and pack_rhs), if we know for sure
+ // that we'll be able to immediately call a kernel with packed data, and do
+ // not submit it to the thread pool, we can use thread local memory for
+ // packed data.
+ //
+ // We can only reliably check it if we are running all kernels in sync mode
+ // (parallelize only by sharding dim). If kernel for m==0 (n==0) is ready to
+ // run, it's guaranteed that all kernels with larger values of m (n) are
+ // also ready, because we execute them in the same order for all K slices.
+
+ void pack_lhs(Index m, Index k) {
+ bool use_thread_local = false;
+
+ if (parallelize_by_sharding_dim_only_ && !shard_by_col_ &&
+ can_use_thread_local_packed_[m].load(std::memory_order_relaxed)) {
+ if (state_kernel_[k % P][m][0].load(std::memory_order_relaxed) == 1) {
+ use_thread_local = true;
+ } else {
+ // If we can't guarantee that all kernels in `k` slice will be
+ // executed sequentially in current thread, it's no longer safe to use
+ // thread local memory in following slices along the k dimensions.
+ eigen_assert(k > 0);
+ can_use_thread_local_packed_[m].store(false,
+ std::memory_order_relaxed);
+ }
+ }
+
+ const Index mend = m * gm_ + gm(m);
+ for (Index m1 = m * gm_; m1 < mend; m1++)
+ kernel_.packLhs(&packed_lhs(m, k, m1, use_thread_local),
+ lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1));
+
+ if (!parallel_pack_ && shard_by_col_) {
+ assert(!use_thread_local);
+ signal_packing(k);
+ } else {
+ signal_switch(k + 1);
+ for (Index n = nn_ - 1; n >= 0; n--) {
+ bool sync = parallelize_by_sharding_dim_only_ || n == 0;
+ signal_kernel(m, n, k, sync, use_thread_local);
+ }
+ }
+ }
+
+ void pack_rhs(Index n, Index k) {
+ bool use_thread_local = false;
+
+ if (parallelize_by_sharding_dim_only_ && shard_by_col_ &&
+ can_use_thread_local_packed_[n].load(std::memory_order_relaxed)) {
+ if (state_kernel_[k % P][0][n].load(std::memory_order_relaxed) == 1) {
+ use_thread_local = true;
+ } else {
+ // If we can't guarantee that all kernels in `k` slice will be
+ // executed sequentially in current thread, it's no longer safe to use
+ // thread local memory in followig slices along the k dimensions.
+ eigen_assert(k > 0);
+ can_use_thread_local_packed_[n].store(false,
+ std::memory_order_relaxed);
+ }
+ }
+
+ const Index nend = n * gn_ + gn(n);
+ for (Index n1 = n * gn_; n1 < nend; n1++) {
+ if (!TensorContractionKernel::HasBeta && k == 0) {
+ // Zero the output memory in parallel, only if contraction kernel does
+ // not support `beta`. Otherwise we will pass beta 0.0 to the first
+ // call to the `TensorContractionKernel::invoke()`.
+ //
+ // On 10000x2x10000 mm zeroing can easily take half of time. Zero (bn
+ // x m) row. Safe to do here because all kernels that will write to
+ // this memory depend on completion of this task. Note: don't call
+ // device_.memset() here. device_.memset() blocks on thread pool
+ // worker thread, which can lead to underutilization and deadlocks.
+ memset(buffer_ + n1 * bn_ * m_, 0, bn(n1) * m_ * sizeof(Scalar));
+ }
+ kernel_.packRhs(&packed_rhs(n, k, n1, use_thread_local),
+ rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1));
+ }
+
+ if (parallel_pack_ || shard_by_col_) {
+ signal_switch(k + 1);
+ for (Index m = nm_ - 1; m >= 0; m--) {
+ bool sync = parallelize_by_sharding_dim_only_ || m == 0;
+ signal_kernel(m, n, k, sync, use_thread_local);
+ }
+ } else {
+ assert(!use_thread_local);
+ signal_packing(k);
+ }
+ }
+
+ void kernel(Index m, Index n, Index k, bool use_thread_local) {
+ // Note: order of iteration matters here. Iteration over m is innermost
+ // because we want to reuse the same packed rhs in consecutive tasks
+ // (rhs fits into L2$ while lhs only into L3$).
+ const Index nend = n * gn_ + gn(n);
+ const Index mend = m * gm_ + gm(m);
+
+ // NOTE: output = alpha * LHS * RHS + beta * output.
+ const Scalar alpha = Scalar(1);
+ const Scalar beta =
+ (TensorContractionKernel::HasBeta && k == 0) ? Scalar(0) : Scalar(1);
+
+ if (shard_by_col_) {
+ for (Index n1 = n * gn_; n1 < nend; n1++) {
+ for (Index m1 = m * gm_; m1 < mend; m1++) {
+ const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_);
+ kernel_.invoke(
+ output_mapper,
+ packed_lhs(m, k, m1, !shard_by_col_ && use_thread_local),
+ packed_rhs(n, k, n1, shard_by_col_ && use_thread_local), bm(m1),
+ bk(k), bn(n1), alpha, beta);
+
+ // We are done with the last task for the [m1, n1] block.
+ if (k + 1 == nk_) {
+ output_kernel_(output_mapper, tensor_contraction_params_,
+ m1 * bm_, n1 * bn_, bm(m1), bn(n1));
+ }
+ }
+ }
+ } else {
+ for (Index m1 = m * gm_; m1 < mend; m1++)
+ for (Index n1 = n * gn_; n1 < nend; n1++) {
+ const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_);
+ kernel_.invoke(
+ output_mapper,
+ packed_lhs(m, k, m1, !shard_by_col_ && use_thread_local),
+ packed_rhs(n, k, n1, shard_by_col_ && use_thread_local), bm(m1),
+ bk(k), bn(n1), alpha, beta);
+
+ // We are done with the last task for the [m1, n1] block.
+ if (k + 1 == nk_) {
+ output_kernel_(output_mapper, tensor_contraction_params_,
+ m1 * bm_, n1 * bn_, bm(m1), bn(n1));
+ }
+ }
+ }
+ signal_kernel(m, n, k + 1, /*sync=*/false, /*use_thread_local=*/false);
+ signal_switch(k + 2);
+ }
+
+ void signal_packing(Index k) {
+ eigen_assert(!parallel_pack_);
+ Index s = state_packing_ready_[k % P].fetch_sub(1);
+ eigen_assert(s > 0);
+ if (s != 1) return;
+ state_packing_ready_[k % P] = shard_by_col_ ? nm_ : nn_;
+ enqueue_packing(k, shard_by_col_);
+ }
+
+ void signal_kernel(Index m, Index n, Index k, bool sync,
+ bool use_thread_local) {
+ std::atomic<uint8_t>* state = &state_kernel_[k % P][m][n];
+ Index s = state->load();
+ eigen_assert(s > 0);
+ if (s != 1 && state->fetch_sub(1) != 1) {
+ eigen_assert(!use_thread_local);
+ return;
+ }
+ state->store(parallel_pack_ ? 3 : 2, std::memory_order_relaxed);
+ if (sync) {
+ kernel(m, n, k, use_thread_local);
+ } else {
+ eigen_assert(!use_thread_local);
+ device_.enqueueNoNotification(
+ [=]() { kernel(m, n, k, use_thread_local); });
+ }
+ }
+
+ void signal_switch(Index k, Index v = 1) {
+ Index s = state_switch_[k % P].fetch_sub(v);
+ eigen_assert(s >= v);
+ if (s != v) return;
+
+ // Ready to switch to the next k slice.
+ // Reset counter for the next iteration.
+ state_switch_[k % P] =
+ (parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_)) +
+ nm_ * nn_;
+ if (k < nk_) {
+ // Issue lhs/rhs packing. Their completion will in turn kick off
+ // kernels.
+ if (parallel_pack_) {
+ enqueue_packing(k, !shard_by_col_);
+ enqueue_packing(k, shard_by_col_);
+ } else if (shard_by_col_) {
+ enqueue_packing(k, false);
+ } else {
+ enqueue_packing(k, true);
+ }
+
+ // Termination handling.
+ // Because kernel completion signals k + 2 switch, we need to finish nk
+ // + 2 slices without issuing any tasks on nk + 1 slice. So here we
+ // pretend that all nk + 1 packing tasks just finish instantly; so that
+ // nk + 2 switch only waits for completion of nk kernels.
+ } else if (k == nk_) {
+ signal_switch(k + 1,
+ parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_));
+ } else {
+ done_.Notify();
+ }
+ }
+
+ // Enqueue all rhs/lhs packing for k-th slice.
+ void enqueue_packing(Index k, bool rhs) {
+ enqueue_packing_helper(0, rhs ? nn_ : nm_, k, rhs);
+ }
+
+ void enqueue_packing_helper(Index start, Index end, Index k, bool rhs) {
+ if (end - start == 1) {
+ if (rhs)
+ pack_rhs(start, k);
+ else
+ pack_lhs(start, k);
+ } else {
+ while (end - start > 1) {
+ Index mid = (start + end) / 2;
+ device_.enqueueNoNotification(
+ [=]() { enqueue_packing_helper(mid, end, k, rhs); });
+ end = mid;
+ }
+
+ // Decide if we want to run first packing task (start == 0) in
+ // async mode if we parallelize only by sharding dim:
+ // (1) pack_lhs and pack_rhs call signal_switch before completing
+ // all calls to signal_kernel, which in sync mode might lead
+ // to the execution of the first kernel of the k+1 slice, before
+ // completing a call to the last kernel of the k slice.
+ // (2) all pack tasks for sharded dim must be executed in a thread
+ // pool to get pre-allocated thead local buffers.
+ bool pack_async =
+ (start == 0) &&
+ (parallelize_by_sharding_dim_only_&& shard_by_col_ == rhs) &&
+ (k > 0 || std::this_thread::get_id() == created_by_thread_id_);
+
+ if (pack_async) {
+ device_.enqueueNoNotification(
+ [=]() { enqueue_packing_helper(start, end, k, rhs); });
+ } else {
+ enqueue_packing_helper(start, end, k, rhs);
+ }
+ }
+ }
+
+ // Block sizes with accounting for potentially incomplete last block.
+ Index bm(Index m) const { return m + 1 < nm0_ ? bm_ : m_ + bm_ - bm_ * nm0_; }
+ Index bn(Index n) const { return n + 1 < nn0_ ? bn_ : n_ + bn_ - bn_ * nn0_; }
+ Index bk(Index k) const { return k + 1 < nk_ ? bk_ : k_ + bk_ - bk_ * nk_; }
+ // Task grain sizes accounting for potentially incomplete last task.
+ Index gm(Index m) const { return m + 1 < nm_ ? gm_ : nm0_ + gm_ - gm_ * nm_; }
+ Index gn(Index n) const { return n + 1 < nn_ ? gn_ : nn0_ + gn_ - gn_ * nn_; }
+
+ EvalParallelContext(const EvalParallelContext&) = delete;
+ void operator=(const EvalParallelContext&) = delete;
+ };
+
+ template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
+ bool rhs_inner_dim_reordered, int Alignment>
+ using SyncEvalParallelContext =
+ EvalParallelContext<NoCallback, lhs_inner_dim_contiguous,
+ rhs_inner_dim_contiguous, rhs_inner_dim_reordered,
+ Alignment>;
+
+ // ------------------------------------------------------------------------ //
+
+ // EvalShardedByInnerDimContext orchestrates sync/async contraction
+ // evaluation, when we shard by inner dimension. When it is executed in
+ // asynchronous mode, it owns all the shared state that might be accessible by
+ // block processing tasks.
+
+ template <typename DoneCallback>
+ struct EvalShardedByInnerDimContext {
+ EvalShardedByInnerDimContext(const Self* self, int num_threads,
+ Scalar* result_buffer,
+ Index m_size, Index n_size, Index k_size,
+ DoneCallback done_callback)
+ : evaluator(self),
+ m_lhs_inner_dim_contiguous(evaluator->m_lhs_inner_dim_contiguous),
+ m_rhs_inner_dim_contiguous(evaluator->m_rhs_inner_dim_contiguous),
+ m_rhs_inner_dim_reordered(evaluator->m_rhs_inner_dim_reordered),
+ result(result_buffer),
+ m(m_size),
+ n(n_size),
+ k(k_size),
+ done(std::move(done_callback)),
+ buffer_size_bytes(m * n * sizeof(Scalar)),
+ block_size(blockSize(k, num_threads)),
+ num_blocks(divup<Index>(k, block_size)),
+ num_pending_blocks(internal::convert_index<int>(num_blocks)),
+ l0_ranges(divup<Index>(num_blocks, l0_size)),
+ l0_state(l0_ranges),
+ block_buffers(num_blocks) {
+ // Keep count of pending gemm tasks for each l0 range.
+ for (int i = 0; i < l0_ranges; ++i) {
+ const Index num_pending_tasks = actualRangeSize(l0_ranges, l0_size, i);
+ l0_state.emplace_back(internal::convert_index<int>(num_pending_tasks));
+ }
+
+ // Allocate temporary buffers for each block.
+ for (Index block_idx = 0; block_idx < num_blocks; ++block_idx) {
+ Scalar* buf = block_idx == 0
+ ? result
+ : static_cast<Scalar*>(evaluator->m_device.allocate(
+ buffer_size_bytes));
+ block_buffers.emplace_back(buf);
+ }
+ }
+
+ ~EvalShardedByInnerDimContext() {
+ for (Index i = 1; i < num_blocks; ++i) {
+ evaluator->m_device.deallocate(block_buffers[i]);
+ }
+ }
+
+ template <int Alignment>
+ void run() {
+ Barrier barrier(internal::convert_index<int>(num_blocks));
+ eval<Alignment>(barrier, 0, num_blocks);
+ barrier.Wait();
+
+ // Aggregate partial sums from l0 ranges.
+ aggregateL0Blocks<Alignment>();
+
+ // Apply output kernel.
+ applyOutputKernel();
+ }
+
+ template <int Alignment>
+ void runAsync() {
+ evalAsync<Alignment>(0, num_blocks);
+ }
+
+ private:
+ // The underlying GEMM kernel assumes that k is a multiple of
+ // the packet size and subtle breakage occurs if this is violated.
+ static const Index packet_size = internal::packet_traits<RhsScalar>::size;
+
+ const Self* evaluator; // TensorContraction evaluator
+
+ // These fields required fromTENSOR_CONTRACTION_DISPATCH macro.
+ bool m_lhs_inner_dim_contiguous;
+ bool m_rhs_inner_dim_contiguous;
+ bool m_rhs_inner_dim_reordered;
+
+ Scalar* result;
+
+ Index m;
+ Index n;
+ Index k;
+
+ DoneCallback done;
+
+ // ----------------------------------------------------------------------//
+ // Algorithm parameters.
+
+ // We will compute partial results into the buffers of this size.
+ Index buffer_size_bytes;
+
+ Index block_size;
+ Index num_blocks;
+
+ // Keep track of pending tasks when evaluate in async mode.
+ std::atomic<int> num_pending_blocks;
+
+ // We compute partial gemm results in parallel, and to get the final result
+ // we need to add them all together. For the large number of threads (>= 48)
+ // this adds a very expensive sequential step at the end.
+ //
+ // We split the [0, num_blocks) into small ranges, and when a task for the
+ // block finishes its partial gemm computation, it checks if it was the last
+ // gemm in the range, and if so, it will add all blocks of the range.
+ //
+ // After all tasks done, we need to add only these pre-aggregated blocks.
+
+ // For now we use just a single level of ranges to compute pre-aggregated
+ // partial sums, but in general we can use more layers to compute tree
+ // aggregation in parallel and reduce the size of the sequential step.
+ //
+ // TODO(ezhulenev): Add multilevel tree aggregation? Probably will make
+ // sense only if number of threads >= ~128?
+ static const Index l0_size = 4;
+ Index l0_ranges;
+
+ // Keep count of pending gemm tasks for each l0 range.
+ MaxSizeVector<std::atomic<int>> l0_state; // [0, l0_ranges)
+
+ // Buffers allocated for each temporary block computation.
+ MaxSizeVector<Scalar*> block_buffers; // [0, num_blocks)
+
+ template <int Alignment>
+ void processBlock(Index block_idx, Index begin, Index end) {
+ Scalar* buf = block_buffers[block_idx];
+
+ TENSOR_CONTRACTION_DISPATCH(
+ evaluator->template evalGemmPartialWithoutOutputKernel, Alignment,
+ (buf, begin, end,
+ /*num_threads=*/internal::convert_index<int>(num_blocks)));
+
+ // Check if it was the last task in l0 range.
+ const Index l0_index = block_idx / l0_size;
+ const int v = l0_state[l0_index].fetch_sub(1);
+ eigen_assert(v >= 1);
+
+ // If we processed the last block of the range, we can aggregate all
+ // partial results into the first block of the range.
+ if (v == 1) {
+ const Index rng_size = actualRangeSize(l0_ranges, l0_size, l0_index);
+ const Index dst_block_idx = l0_index * l0_size;
+
+ if (rng_size == l0_size) {
+ addAllToBuffer<Alignment>(
+ m * n,
+ /*src_buf0=*/block_buffers[dst_block_idx + 1],
+ /*src_buf1=*/block_buffers[dst_block_idx + 2],
+ /*src_buf2=*/block_buffers[dst_block_idx + 3],
+ /*dst_buf= */ block_buffers[dst_block_idx]);
+ } else {
+ // Aggregate blocks of potentially incomplete last range.
+ for (int i = 1; i < rng_size; ++i) {
+ addToBuffer<Alignment>(m * n,
+ /*src_buf=*/block_buffers[dst_block_idx + i],
+ /*dst_buf=*/block_buffers[dst_block_idx]);
+ }
+ }
+ }
+ }
+
+ // Aggregate partial sums from l0 ranges.
+ template <int Alignment>
+ void aggregateL0Blocks() const {
+ Index l0_index = 1;
+
+ for (; l0_index + 2 < l0_ranges; l0_index += 3) {
+ addAllToBuffer<Alignment>(
+ m * n,
+ /*src_buf0=*/block_buffers[(l0_index + 0) * l0_size],
+ /*src_buf1=*/block_buffers[(l0_index + 1) * l0_size],
+ /*src_buf2=*/block_buffers[(l0_index + 2) * l0_size],
+ /*dst_buf= */ block_buffers[0]);
+ }
+
+ for (; l0_index < l0_ranges; ++l0_index) {
+ addToBuffer<Alignment>(m * n, block_buffers[l0_index * l0_size],
+ block_buffers[0]);
+ }
+ }
+
+ void applyOutputKernel() const {
+ typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+ evaluator->m_output_kernel(
+ OutputMapper(result, m), evaluator->m_tensor_contraction_params,
+ static_cast<Eigen::Index>(0), static_cast<Eigen::Index>(0), m, n);
+ }
+
+ // Compute block size with accounting for potentially incomplete last block.
+ Index actualBlockSize(Index block_idx) const {
+ return block_idx + 1 < num_blocks
+ ? block_size
+ : k + block_size - block_size * num_blocks;
+ };
+
+ // Compute range size with accounting for potentially incomplete last range.
+ Index actualRangeSize(Index num_ranges, Index range_size,
+ Index range_idx) const {
+ eigen_assert(range_idx < num_ranges);
+ return range_idx + 1 < num_ranges
+ ? range_size
+ : num_blocks + range_size - range_size * num_ranges;
+ };
+
+ template <int Alignment>
+ EIGEN_STRONG_INLINE static void addToBuffer(size_t n, const Scalar* src_buf,
+ Scalar* tgt_buf) {
+ const int output_packet_size =
+ internal::unpacket_traits<PacketReturnType>::size;
+ size_t i = 0;
+ const size_t num_packets = n / output_packet_size;
+ for (; i < output_packet_size * num_packets; i += output_packet_size) {
+ const PacketReturnType src_val =
+ internal::pload<PacketReturnType>(src_buf + i);
+ const PacketReturnType tgt_val =
+ internal::ploadt<PacketReturnType, Alignment>(tgt_buf + i);
+ const PacketReturnType sum = internal::padd(src_val, tgt_val);
+ internal::pstoret<Scalar, PacketReturnType, Alignment>(tgt_buf + i,
+ sum);
+ }
+ for (; i < n; ++i) {
+ tgt_buf[i] += src_buf[i];
+ }
+ }
+
+ template <int Alignment>
+ EIGEN_STRONG_INLINE static void addAllToBuffer(size_t n,
+ const Scalar* src_buf0,
+ const Scalar* src_buf1,
+ const Scalar* src_buf2,
+ Scalar* dst_buf) {
+ using ::Eigen::internal::padd;
+ using ::Eigen::internal::pload;
+ using ::Eigen::internal::ploadt;
+ using ::Eigen::internal::pstoret;
+
+ const int output_packet_size =
+ internal::unpacket_traits<PacketReturnType>::size;
+
+ size_t i = 0;
+ const size_t num_packets = n / output_packet_size;
+ for (; i < output_packet_size * num_packets; i += output_packet_size) {
+ const auto src_val0 = pload<PacketReturnType>(src_buf0 + i);
+ const auto src_val1 = pload<PacketReturnType>(src_buf1 + i);
+ const auto src_val2 = pload<PacketReturnType>(src_buf2 + i);
+
+ const auto dst_val = ploadt<PacketReturnType, Alignment>(dst_buf + i);
+ const auto sum =
+ padd(padd(dst_val, src_val0), padd(src_val1, src_val2));
+
+ pstoret<Scalar, PacketReturnType, Alignment>(dst_buf + i, sum);
+ }
+ for (; i < n; ++i) {
+ dst_buf[i] += src_buf0[i] + src_buf1[i] + src_buf2[i];
+ }
+ }
+
+ template <int Alignment>
+ void eval(Barrier& barrier, Index start_block_idx, Index end_block_idx) {
+ while (end_block_idx - start_block_idx > 1) {
+ Index mid_block_idx = (start_block_idx + end_block_idx) / 2;
+ evaluator->m_device.enqueueNoNotification(
+ [this, &barrier, mid_block_idx, end_block_idx]() {
+ eval<Alignment>(barrier, mid_block_idx, end_block_idx);
+ });
+ end_block_idx = mid_block_idx;
+ }
+
+ Index block_idx = start_block_idx;
+ Index block_start = block_idx * block_size;
+ Index block_end = block_start + actualBlockSize(block_idx);
+
+ processBlock<Alignment>(block_idx, block_start, block_end);
+ barrier.Notify();
+ }
+
+ template <int Alignment>
+ void evalAsync(Index start_block_idx, Index end_block_idx) {
+ while (end_block_idx - start_block_idx > 1) {
+ Index mid_block_idx = (start_block_idx + end_block_idx) / 2;
+ evaluator->m_device.enqueueNoNotification(
+ [this, mid_block_idx, end_block_idx]() {
+ evalAsync<Alignment>(mid_block_idx, end_block_idx);
+ });
+ end_block_idx = mid_block_idx;
+ }
+
+ Index block_idx = start_block_idx;
+
+ Index block_start = block_idx * block_size;
+ Index block_end = block_start + actualBlockSize(block_idx);
+
+ processBlock<Alignment>(block_idx, block_start, block_end);
+
+ int v = num_pending_blocks.fetch_sub(1);
+ eigen_assert(v >= 1);
+
+ if (v == 1) {
+ // Aggregate partial sums from l0 ranges.
+ aggregateL0Blocks<Alignment>();
+
+ // Apply output kernel.
+ applyOutputKernel();
+
+ // NOTE: If we call `done` callback before deleting this (context),
+ // it might deallocate Self* pointer captured by context, and we'll
+ // fail in destructor trying to deallocate temporary buffers.
+
+ // Move done call back from context before it will be destructed.
+ DoneCallback done_copy = std::move(done);
+
+ // We are confident that we are the last one who touches context.
+ delete this;
+
+ // Now safely call the done callback.
+ done_copy();
+ }
+ }
+
+ // Cost model doesn't capture well the cost associated with constructing
+ // tensor contraction mappers and computing loop bounds in gemm_pack_lhs
+ // and gemm_pack_rhs, so we specify minimum desired block size.
+ static Index blockSize(Index k, int num_threads) {
+ const auto round_up = [=](Index index) -> Index {
+ const Index kmultiple = packet_size <= 8 ? 8 : packet_size;
+ return divup<Index>(index, kmultiple) * kmultiple;
+ };
+
+ const Index target_block_size = round_up(divup<Index>(k, num_threads));
+ const Index desired_min_block_size = 12 * packet_size;
+
+ return numext::mini<Index>(
+ k, numext::maxi<Index>(desired_min_block_size, target_block_size));
+ }
+
+ EvalShardedByInnerDimContext(const EvalShardedByInnerDimContext&) = delete;
+ void operator=(const EvalShardedByInnerDimContext&) = delete;
+ };
+
+ // ------------------------------------------------------------------------ //
+
+ // Below are the function used by evalProductImpl heuristics, trying to select
+ // optimcal parameters for parallelization algorithm.
+
+ // Decide whether we want to shard m x n contraction by columns or by rows.
+ static bool shardByCol(Index m, Index n, Index num_threads) {
+ // Note: we are comparing both n and m against Traits::nr, it is not
+ // a mistake. We are trying to figure out how both n and m will fit into
+ // the main sharding dimension.
+
+ // Sharding by column is the default
+ // ... unless there is enough data for vectorization over rows
+ if (m / num_threads >= Traits::nr &&
+ // and not enough data for vectorization over columns
+ (n / num_threads < Traits::nr ||
+ // ... or barely enough data for vectorization over columns,
+ // but it is not evenly dividable across threads
+ (n / num_threads < 4 * Traits::nr &&
+ (n % (num_threads * Traits::nr)) != 0 &&
+ // ... and it is evenly dividable across threads for rows
+ ((m % (num_threads * Traits::nr)) == 0 ||
+ // .. or it is not evenly dividable for both dimensions but
+ // there is much more data over rows so that corner effects are
+ // mitigated.
+ (m / n >= 6)))))
+ return false;
+ // Wait, or if matrices are just substantially prolonged over the other
+ // dimension.
+ if (n / num_threads < 16 * Traits::nr && m > n * 32) return false;
+ return true;
+ }
+
+ Index coarsenM(Index m, Index n, Index bm, Index bn, Index bk, Index gn,
+ int num_threads, bool shard_by_col) const {
+ Index gm = 1;
+ Index gm1 = 1;
+ Index nm0 = divup(m, bm);
+ Index nm1 = nm0;
+ for (;;) {
+ // Find the next candidate for m grain size. It needs to result in
+ // different number of blocks. E.g. if we have 10 kernels, we want to try
+ // 5 and 10, but not 6, 7, 8 and 9.
+ while (gm1 <= nm0 && nm1 == divup(nm0, gm1)) gm1++;
+ if (gm1 > nm0) break;
+ // Check the candidate.
+ int res = checkGrain(m, n, bm, bn, bk, gm1, gn, gm, gn, num_threads,
+ shard_by_col);
+ if (res < 0) break;
+ nm1 = divup(nm0, gm1);
+ if (res == 0) continue;
+ // Commit new grain size.
+ gm = gm1;
+ }
+ return gm;
+ }
+
+ Index coarsenN(Index m, Index n, Index bm, Index bn, Index bk, Index gm,
+ int num_threads, bool shard_by_col) const {
+ Index gn = 1;
+ Index gn1 = 1;
+ Index nn0 = divup(n, bn);
+ Index nn1 = nn0;
+ for (;;) {
+ while (gn1 <= nn0 && nn1 == divup(nn0, gn1)) gn1++;
+ if (gn1 > nn0) break;
+ int res = checkGrain(m, n, bm, bn, bk, gm, gn1, gm, gn, num_threads,
+ shard_by_col);
+ if (res < 0) break;
+ nn1 = divup(nn0, gn1);
+ if (res == 0) continue;
+ gn = gn1;
+ }
+ return gn;
+ }
+
+ // checkGrain checks whether grain (gm, gn) is suitable and is better than
+ // (oldgm, oldgn).
+ int checkGrain(Index m, Index n, Index bm, Index bn, Index bk, Index gm,
+ Index gn, Index oldgm, Index oldgn, int num_threads,
+ bool shard_by_col) const {
+ const TensorOpCost cost =
+ contractionCost(bm * gm, bn * gn, bm, bn, bk, shard_by_col, true);
+ double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(
+ static_cast<double>(bm) * gm * bn * gn, cost);
+ // If the task is too small, then we agree on it regardless of anything
+ // else. Otherwise synchronization overheads will dominate.
+ if (taskSize < 1) return 1;
+ // If it is too large, then we reject it and all larger tasks.
+ if (taskSize > 2) return -1;
+ // Now we are in presumably good task size range.
+ // The main deciding factor here is parallelism. Consider that we have 12
+ // kernels and 4 threads. Grains of 2, 3 and 4 all yield good task sizes.
+ // But 2/4 yield 6/3 tasks, which gives us parallelism of 0.75 (at most 3/4
+ // of cores will be busy). While grain size 3 gives us 4 tasks, which gives
+ // us parallelism of 1 (we can load all cores).
+ Index nm0 = divup(m, bm);
+ Index nn0 = divup(n, bn);
+ Index new_tasks = divup(nm0, gm) * divup(nn0, gn);
+ double new_parallelism = static_cast<double>(new_tasks) /
+ (divup<int>(new_tasks, num_threads) * num_threads);
+ Index old_tasks = divup(nm0, oldgm) * divup(nn0, oldgn);
+ double old_parallelism = static_cast<double>(old_tasks) /
+ (divup<int>(old_tasks, num_threads) * num_threads);
+ if (new_parallelism > old_parallelism || new_parallelism == 1) return 1;
+ return 0;
+ }
+
+ TensorOpCost contractionCost(Index m, Index n, Index bm, Index bn, Index bk,
+ bool shard_by_col, bool prepacked) const {
+ const int packed_size = std::min<int>(PacketType<LhsScalar, Device>::size,
+ PacketType<RhsScalar, Device>::size);
+ const int output_packet_size = internal::unpacket_traits<PacketReturnType>::size;
+ const double kd = static_cast<double>(bk);
+ double compute_bandwidth = computeBandwidth(false, bm, bn, bk);
+ // Computations.
+ TensorOpCost cost = TensorOpCost(0, 0, kd * compute_bandwidth, true, packed_size);
+ // Output stores.
+ cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size);
+ if (prepacked) {
+ // Packing and kernels are executed in different tasks. When we calculate
+ // task grain size we look only at kernel cost assuming that kernel
+ // is more expensive than packing.
+ return cost;
+ }
+ // Lhs/rhs loads + computations.
+ TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * (kd / n);
+ TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * (kd / m);
+ // Lhs packing memory cost does not contribute considerably to overall
+ // execution time because lhs is prefetched early and accessed sequentially.
+ if (shard_by_col)
+ lhsCost.dropMemoryCost();
+ else
+ rhsCost.dropMemoryCost();
+ return cost + lhsCost + rhsCost;
+ }
+
+ // Decide whether we want to shard m x k x n contraction over the inner
+ // (contraction) dimension (k).
+ static bool shardByInnerDim(Index m, Index n, Index k, int num_threads,
+ int num_threads_by_k) {
+ std::ptrdiff_t bufsize = m * n * sizeof(Scalar);
+ bool shard_by_k = false;
+ if (n == 1 || // If mat*vec or...
+ num_threads_by_k < 2 || // running single threaded or...
+ num_threads_by_k <
+ num_threads || // sharding by k gives less parallelism or...
+ bufsize > l3CacheSize() / num_threads_by_k || // need more buffer space
+ // than L3 cache or...
+ k / num_threads_by_k < 2 * Traits::nr) { // k per thread is tiny.
+ shard_by_k = false;
+ } else if (numext::maxi(m, n) / num_threads <
+ Traits::nr || // both other dimensions are tiny or...
+ // k per thread is not small and...
+ (k / num_threads_by_k > 8 * Traits::nr &&
+ // one of the outer dimensions is tiny or sharding by k offers
+ // more parallelism.
+ (numext::mini(m, n) < 2 * Traits::nr ||
+ num_threads_by_k > num_threads))) {
+ shard_by_k = true;
+ }
+ return shard_by_k;
+ }
+
+ TensorOpCost contractionCostPerInnerDim(Index m, Index n, Index k) const {
+ // Compute cost.
+ const int output_packet_size = internal::unpacket_traits<PacketReturnType>::size;
+ TensorOpCost cost(0, 0, (computeBandwidth(true, m, n, k) * m) * n, true, output_packet_size);
+ // Output stores.
+ cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size);
+ TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * m;
+ TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * n;
+ // Since the inner gemm kernel is always sharded by column, the lhs
+ // load cost is negligible.
+ lhsCost.dropMemoryCost();
+ return cost + lhsCost + rhsCost;
+ }
+
+ int numThreadsInnerDim(Index m, Index n, Index k) const {
+ const int output_packet_size = internal::unpacket_traits<PacketReturnType>::size;
+ TensorOpCost cost = contractionCostPerInnerDim(m, n, k);
+ double total_parallel_cost =
+ TensorCostModel<ThreadPoolDevice>::totalCost(k, cost);
+ // Cost of reduction step accumulating the m*n per-thread buffers into the
+ // result.
+ double reduction_cost = TensorCostModel<ThreadPoolDevice>::totalCost(
+ m * n, TensorOpCost(2, 1, 1, true, output_packet_size));
+ int num_threads = 1;
+ double min_cost = total_parallel_cost;
+ double kPerThreadOverHead = 3000;
+ double kFixedOverHead = 100000;
+ for (int nt = 2; nt <= this->m_device.numThreads(); nt += 2) {
+ double sequential_cost =
+ kFixedOverHead + nt * (reduction_cost + kPerThreadOverHead);
+ double parallel_cost = total_parallel_cost / nt + sequential_cost;
+ if (parallel_cost < min_cost) {
+ num_threads = nt;
+ min_cost = parallel_cost;
+ }
+ }
+ return num_threads;
+ }
+
+ double computeBandwidth(bool shard_by_col, Index bm, Index bn,
+ Index bk) const {
+ // Peak VFMA bandwidth is 0.5. However if we have not enough data for
+ // vectorization bandwidth drops. The 4.0 and 2.0 bandwidth is determined
+ // experimentally.
+ double computeBandwidth =
+ bk == 1 ? 4.0
+ : (shard_by_col ? bn : bm) < Traits::nr ||
+ (shard_by_col ? bm : bn) < Traits::mr
+ ? 2.0
+ : 0.5;
+#ifndef EIGEN_VECTORIZE_FMA
+ // Bandwidth of all of VFMA/MULPS/ADDPS is 0.5 on latest Intel processors.
+ // However for MULPS/ADDPS we have dependent sequence of 2 such
+ // instructions,
+ // so overall bandwidth is 1.0.
+ if (computeBandwidth == 0.5) computeBandwidth = 1.0;
+#endif
+ return computeBandwidth;
+ }
+
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_USE_THREADS
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorConversion.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorConversion.h
new file mode 100644
index 0000000..09d2da9
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorConversion.h
@@ -0,0 +1,456 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H
+
+namespace Eigen {
+
+/** \class TensorConversionOp
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor conversion class. This class makes it possible to vectorize
+ * type casting operations when the number of scalars per packet in the source
+ * and the destination type differ
+ */
+namespace internal {
+template<typename TargetType, typename XprType>
+struct traits<TensorConversionOp<TargetType, XprType> >
+{
+ // Type promotion to handle the case where the types of the lhs and the rhs are different.
+ typedef TargetType Scalar;
+ typedef typename traits<XprType>::StorageKind StorageKind;
+ typedef typename traits<XprType>::Index Index;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = traits<XprType>::NumDimensions;
+ static const int Layout = traits<XprType>::Layout;
+ enum { Flags = 0 };
+ typedef typename TypeConversion<Scalar, typename traits<XprType>::PointerType>::type PointerType;
+};
+
+template<typename TargetType, typename XprType>
+struct eval<TensorConversionOp<TargetType, XprType>, Eigen::Dense>
+{
+ typedef const TensorConversionOp<TargetType, XprType>& type;
+};
+
+template<typename TargetType, typename XprType>
+struct nested<TensorConversionOp<TargetType, XprType>, 1, typename eval<TensorConversionOp<TargetType, XprType> >::type>
+{
+ typedef TensorConversionOp<TargetType, XprType> type;
+};
+
+} // end namespace internal
+
+
+template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket, int SrcCoeffRatio, int TgtCoeffRatio>
+struct PacketConverter;
+
+template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket>
+struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 1, 1> {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ PacketConverter(const TensorEvaluator& impl)
+ : m_impl(impl) {}
+
+ template<int LoadMode, typename Index>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const {
+ return internal::pcast<SrcPacket, TgtPacket>(m_impl.template packet<LoadMode>(index));
+ }
+
+ private:
+ const TensorEvaluator& m_impl;
+};
+
+
+template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket>
+struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 2, 1> {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ PacketConverter(const TensorEvaluator& impl)
+ : m_impl(impl) {}
+
+ template<int LoadMode, typename Index>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const {
+ const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
+
+ SrcPacket src1 = m_impl.template packet<LoadMode>(index);
+ SrcPacket src2 = m_impl.template packet<LoadMode>(index + SrcPacketSize);
+ TgtPacket result = internal::pcast<SrcPacket, TgtPacket>(src1, src2);
+ return result;
+ }
+
+ private:
+ const TensorEvaluator& m_impl;
+};
+
+template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket>
+struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 4, 1> {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ PacketConverter(const TensorEvaluator& impl)
+ : m_impl(impl) {}
+
+ template<int LoadMode, typename Index>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const {
+ const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
+
+ SrcPacket src1 = m_impl.template packet<LoadMode>(index);
+ SrcPacket src2 = m_impl.template packet<LoadMode>(index + SrcPacketSize);
+ SrcPacket src3 = m_impl.template packet<LoadMode>(index + 2 * SrcPacketSize);
+ SrcPacket src4 = m_impl.template packet<LoadMode>(index + 3 * SrcPacketSize);
+ TgtPacket result = internal::pcast<SrcPacket, TgtPacket>(src1, src2, src3, src4);
+ return result;
+ }
+
+ private:
+ const TensorEvaluator& m_impl;
+};
+
+template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket>
+struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 8, 1> {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ PacketConverter(const TensorEvaluator& impl)
+ : m_impl(impl) {}
+
+ template<int LoadMode, typename Index>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const {
+ const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
+
+ SrcPacket src1 = m_impl.template packet<LoadMode>(index);
+ SrcPacket src2 = m_impl.template packet<LoadMode>(index + 1 * SrcPacketSize);
+ SrcPacket src3 = m_impl.template packet<LoadMode>(index + 2 * SrcPacketSize);
+ SrcPacket src4 = m_impl.template packet<LoadMode>(index + 3 * SrcPacketSize);
+ SrcPacket src5 = m_impl.template packet<LoadMode>(index + 4 * SrcPacketSize);
+ SrcPacket src6 = m_impl.template packet<LoadMode>(index + 5 * SrcPacketSize);
+ SrcPacket src7 = m_impl.template packet<LoadMode>(index + 6 * SrcPacketSize);
+ SrcPacket src8 = m_impl.template packet<LoadMode>(index + 7 * SrcPacketSize);
+ TgtPacket result = internal::pcast<SrcPacket, TgtPacket>(src1, src2, src3, src4, src5, src6, src7, src8);
+ return result;
+ }
+
+ private:
+ const TensorEvaluator& m_impl;
+};
+
+template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket, int TgtCoeffRatio>
+struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 1, TgtCoeffRatio> {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ PacketConverter(const TensorEvaluator& impl)
+ : m_impl(impl), m_maxIndex(impl.dimensions().TotalSize()) {}
+
+ template<int LoadMode, typename Index>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const {
+ const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
+ // Only call m_impl.packet() when we have direct access to the underlying data. This
+ // ensures that we don't compute the subexpression twice. We may however load some
+ // coefficients twice, but in practice this doesn't negatively impact performance.
+ if (m_impl.data() && (index + SrcPacketSize < m_maxIndex)) {
+ // Force unaligned memory loads since we can't ensure alignment anymore
+ return internal::pcast<SrcPacket, TgtPacket>(m_impl.template packet<Unaligned>(index));
+ } else {
+ const int TgtPacketSize = internal::unpacket_traits<TgtPacket>::size;
+ typedef typename internal::unpacket_traits<SrcPacket>::type SrcType;
+ typedef typename internal::unpacket_traits<TgtPacket>::type TgtType;
+ internal::scalar_cast_op<SrcType, TgtType> converter;
+ EIGEN_ALIGN_MAX typename internal::unpacket_traits<TgtPacket>::type values[TgtPacketSize];
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < TgtPacketSize; ++i) {
+ values[i] = converter(m_impl.coeff(index+i));
+ }
+ TgtPacket rslt = internal::pload<TgtPacket>(values);
+ return rslt;
+ }
+ }
+
+ private:
+ const TensorEvaluator& m_impl;
+ const typename TensorEvaluator::Index m_maxIndex;
+};
+
+template<typename TargetType, typename XprType>
+class TensorConversionOp : public TensorBase<TensorConversionOp<TargetType, XprType>, ReadOnlyAccessors>
+{
+ public:
+ typedef typename internal::traits<TensorConversionOp>::Scalar Scalar;
+ typedef typename internal::traits<TensorConversionOp>::StorageKind StorageKind;
+ typedef typename internal::traits<TensorConversionOp>::Index Index;
+ typedef typename internal::nested<TensorConversionOp>::type Nested;
+ typedef Scalar CoeffReturnType;
+ typedef typename NumTraits<Scalar>::Real RealScalar;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConversionOp(const XprType& xpr)
+ : m_xpr(xpr) {}
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ expression() const { return m_xpr; }
+
+ protected:
+ typename XprType::Nested m_xpr;
+};
+
+template <bool SameType, typename Eval, typename EvalPointerType> struct ConversionSubExprEval {
+ static EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType) {
+ impl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+};
+
+template <typename Eval, typename EvalPointerType> struct ConversionSubExprEval<true, Eval, EvalPointerType> {
+ static EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType data) {
+ return impl.evalSubExprsIfNeeded(data);
+ }
+};
+
+#ifdef EIGEN_USE_THREADS
+template <bool SameType, typename Eval, typename EvalPointerType,
+ typename EvalSubExprsCallback>
+struct ConversionSubExprEvalAsync {
+ static EIGEN_STRONG_INLINE void run(Eval& impl, EvalPointerType, EvalSubExprsCallback done) {
+ impl.evalSubExprsIfNeededAsync(nullptr, std::move(done));
+ }
+};
+
+template <typename Eval, typename EvalPointerType,
+ typename EvalSubExprsCallback>
+struct ConversionSubExprEvalAsync<true, Eval, EvalPointerType,
+ EvalSubExprsCallback> {
+ static EIGEN_STRONG_INLINE void run(Eval& impl, EvalPointerType data, EvalSubExprsCallback done) {
+ impl.evalSubExprsIfNeededAsync(data, std::move(done));
+ }
+};
+#endif
+
+namespace internal {
+
+template <typename SrcType, typename TargetType, bool IsSameT>
+struct CoeffConv {
+ template <typename ArgType, typename Device>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetType run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+ internal::scalar_cast_op<SrcType, TargetType> converter;
+ return converter(impl.coeff(index));
+ }
+};
+
+template <typename SrcType, typename TargetType>
+struct CoeffConv<SrcType, TargetType, true> {
+ template <typename ArgType, typename Device>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetType run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+ return impl.coeff(index);
+ }
+};
+
+template <typename SrcPacket, typename TargetPacket, int LoadMode, bool ActuallyVectorize, bool IsSameT>
+struct PacketConv {
+ typedef typename internal::unpacket_traits<SrcPacket>::type SrcType;
+ typedef typename internal::unpacket_traits<TargetPacket>::type TargetType;
+
+ static const int PacketSize = internal::unpacket_traits<TargetPacket>::size;
+
+ template <typename ArgType, typename Device>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+ internal::scalar_cast_op<SrcType, TargetType> converter;
+ EIGEN_ALIGN_MAX typename internal::remove_const<TargetType>::type values[PacketSize];
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < PacketSize; ++i) {
+ values[i] = converter(impl.coeff(index+i));
+ }
+ TargetPacket rslt = internal::pload<TargetPacket>(values);
+ return rslt;
+ }
+};
+
+template <typename SrcPacket, typename TargetPacket, int LoadMode, bool IsSameT>
+struct PacketConv<SrcPacket, TargetPacket, LoadMode, true, IsSameT> {
+ typedef typename internal::unpacket_traits<SrcPacket>::type SrcType;
+ typedef typename internal::unpacket_traits<TargetPacket>::type TargetType;
+
+ template <typename ArgType, typename Device>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+ const int SrcCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio;
+ const int TgtCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio;
+ PacketConverter<TensorEvaluator<ArgType, Device>, SrcPacket, TargetPacket,
+ SrcCoeffRatio, TgtCoeffRatio> converter(impl);
+ return converter.template packet<LoadMode>(index);
+ }
+};
+
+template <typename SrcPacket, typename TargetPacket, int LoadMode>
+struct PacketConv<SrcPacket, TargetPacket, LoadMode, /*ActuallyVectorize=*/false, /*IsSameT=*/true> {
+ typedef typename internal::unpacket_traits<TargetPacket>::type TargetType;
+ static const int PacketSize = internal::unpacket_traits<TargetPacket>::size;
+
+ template <typename ArgType, typename Device>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+ EIGEN_ALIGN_MAX typename internal::remove_const<TargetType>::type values[PacketSize];
+ for (int i = 0; i < PacketSize; ++i) values[i] = impl.coeff(index+i);
+ return internal::pload<TargetPacket>(values);
+ }
+};
+
+template <typename SrcPacket, typename TargetPacket, int LoadMode>
+struct PacketConv<SrcPacket, TargetPacket, LoadMode, /*ActuallyVectorize=*/true, /*IsSameT=*/true> {
+ template <typename ArgType, typename Device>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+ return impl.template packet<LoadMode>(index);
+ }
+};
+
+} // namespace internal
+
+// Eval as rvalue
+template<typename TargetType, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
+{
+ typedef TensorConversionOp<TargetType, ArgType> XprType;
+ typedef typename XprType::Index Index;
+ typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
+ typedef TargetType Scalar;
+ typedef TargetType CoeffReturnType;
+ typedef typename internal::remove_all<typename internal::traits<ArgType>::Scalar>::type SrcType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ typedef typename PacketType<SrcType, Device>::type PacketSourceType;
+ static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+ static const bool IsSameType = internal::is_same<TargetType, SrcType>::value;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+
+ enum {
+ IsAligned = false,
+ PacketAccess =
+ #ifndef EIGEN_USE_SYCL
+ true,
+ #else
+ TensorEvaluator<ArgType, Device>::PacketAccess &
+ internal::type_casting_traits<SrcType, TargetType>::VectorizedCast,
+ #endif
+ BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
+ PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ RawAccess = false
+ };
+
+ static const int NumDims = internal::array_size<Dimensions>::value;
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+ typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
+ ArgTensorBlock;
+
+ struct TensorConversionOpBlockFactory {
+ template <typename ArgXprType>
+ struct XprType {
+ typedef TensorConversionOp<TargetType, const ArgXprType> type;
+ };
+
+ template <typename ArgXprType>
+ typename XprType<ArgXprType>::type expr(const ArgXprType& expr) const {
+ return typename XprType<ArgXprType>::type(expr);
+ }
+ };
+
+ typedef internal::TensorUnaryExprBlock<TensorConversionOpBlockFactory,
+ ArgTensorBlock>
+ TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_impl(op.expression(), device)
+ {
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data)
+ {
+ return ConversionSubExprEval<IsSameType, TensorEvaluator<ArgType, Device>, EvaluatorPointerType>::run(m_impl, data);
+ }
+
+#ifdef EIGEN_USE_THREADS
+ template <typename EvalSubExprsCallback>
+ EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+ EvaluatorPointerType data, EvalSubExprsCallback done) {
+ ConversionSubExprEvalAsync<IsSameType, TensorEvaluator<ArgType, Device>,
+ EvaluatorPointerType,
+ EvalSubExprsCallback>::run(m_impl, data, std::move(done));
+ }
+#endif
+
+ EIGEN_STRONG_INLINE void cleanup()
+ {
+ m_impl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ return internal::CoeffConv<SrcType, TargetType, IsSameType>::run(m_impl,index);
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType
+ packet(Index index) const {
+ // If we are not going to do the cast, we just need to check that base
+ // TensorEvaluator has packet access. Otherwise we also need to make sure,
+ // that we have an implementation of vectorized cast.
+ const bool Vectorizable =
+ IsSameType
+ ? TensorEvaluator<ArgType, Device>::PacketAccess
+ : int(TensorEvaluator<ArgType, Device>::PacketAccess) &
+ int(internal::type_casting_traits<SrcType, TargetType>::VectorizedCast);
+
+ return internal::PacketConv<PacketSourceType, PacketReturnType, LoadMode,
+ Vectorizable, IsSameType>::run(m_impl, index);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ const double cast_cost = TensorOpCost::CastCost<SrcType, TargetType>();
+ if (vectorized) {
+ const double SrcCoeffRatio =
+ internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio;
+ const double TgtCoeffRatio =
+ internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio;
+ return m_impl.costPerCoeff(vectorized) * (SrcCoeffRatio / PacketSize) +
+ TensorOpCost(0, 0, TgtCoeffRatio * (cast_cost / PacketSize));
+ } else {
+ return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, cast_cost);
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ internal::TensorBlockResourceRequirements getResourceRequirements() const {
+ return m_impl.getResourceRequirements();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+ block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+ bool /*root_of_expr_ast*/ = false) const {
+ return TensorBlock(m_impl.block(desc, scratch),
+ TensorConversionOpBlockFactory());
+ }
+
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+ /// required by sycl in order to extract the sycl accessor
+ const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_impl.bind(cgh);
+ }
+#endif
+
+ protected:
+ TensorEvaluator<ArgType, Device> m_impl;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorConvolution.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorConvolution.h
new file mode 100644
index 0000000..b20f80b
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorConvolution.h
@@ -0,0 +1,1132 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
+
+namespace Eigen {
+
+/** \class TensorConvolution
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor convolution class.
+ *
+ *
+ */
+namespace internal {
+
+template <typename Index, typename InputDims, int NumKernelDims, int Layout>
+class IndexMapper {
+ public:
+ IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
+ const array<Index, NumKernelDims>& indices) {
+
+ array<Index, NumDims> dimensions = input_dims;
+ for (int i = 0; i < NumKernelDims; ++i) {
+ const Index index = indices[i];
+ const Index input_dim = input_dims[index];
+ const Index kernel_dim = kernel_dims[i];
+ const Index result_dim = input_dim - kernel_dim + 1;
+ dimensions[index] = result_dim;
+ }
+
+ array<Index, NumDims> inputStrides;
+ array<Index, NumDims> outputStrides;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ inputStrides[0] = 1;
+ outputStrides[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ inputStrides[i] = inputStrides[i-1] * input_dims[i-1];
+ outputStrides[i] = outputStrides[i-1] * dimensions[i-1];
+ }
+ } else {
+ inputStrides[NumDims - 1] = 1;
+ outputStrides[NumDims - 1] = 1;
+ for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
+ inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
+ outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];
+ }
+ }
+
+ array<Index, NumDims> gpuInputDimensions;
+ array<Index, NumDims> gpuOutputDimensions;
+ array<Index, NumDims> tmp = dimensions;
+ array<Index, NumDims> ordering;
+ const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? 0
+ : NumDims - NumKernelDims;
+ for (int i = 0; i < NumKernelDims; ++i) {
+ const Index index = i + offset;
+ ordering[index] = indices[i];
+ tmp[indices[i]] = -1;
+ gpuInputDimensions[index] = input_dims[indices[i]];
+ gpuOutputDimensions[index] = dimensions[indices[i]];
+ }
+
+ int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? NumKernelDims
+ : 0;
+ for (int i = 0; i < NumDims; ++i) {
+ if (tmp[i] >= 0) {
+ ordering[written] = i;
+ gpuInputDimensions[written] = input_dims[i];
+ gpuOutputDimensions[written] = dimensions[i];
+ ++written;
+ }
+ }
+
+ for (int i = 0; i < NumDims; ++i) {
+ m_inputStrides[i] = inputStrides[ordering[i]];
+ m_outputStrides[i] = outputStrides[ordering[i]];
+ }
+
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = 0; i < NumDims; ++i) {
+ if (i > NumKernelDims) {
+ m_gpuInputStrides[i] =
+ m_gpuInputStrides[i - 1] * gpuInputDimensions[i - 1];
+ m_gpuOutputStrides[i] =
+ m_gpuOutputStrides[i - 1] * gpuOutputDimensions[i - 1];
+ } else {
+ m_gpuInputStrides[i] = 1;
+ m_gpuOutputStrides[i] = 1;
+ }
+ }
+ } else {
+ for (int i = NumDims - 1; i >= 0; --i) {
+ if (static_cast<size_t>(i + 1) < offset) {
+ m_gpuInputStrides[i] =
+ m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1];
+ m_gpuOutputStrides[i] =
+ m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1];
+ } else {
+ m_gpuInputStrides[i] = 1;
+ m_gpuOutputStrides[i] = 1;
+ }
+ }
+ }
+ }
+
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const {
+ Index inputIndex = 0;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int d = NumDims - 1; d > NumKernelDims; --d) {
+ const Index idx = p / m_gpuInputStrides[d];
+ inputIndex += idx * m_inputStrides[d];
+ p -= idx * m_gpuInputStrides[d];
+ }
+ inputIndex += p * m_inputStrides[NumKernelDims];
+ } else {
+ std::ptrdiff_t limit = 0;
+ if (NumKernelDims < NumDims) {
+ limit = NumDims - NumKernelDims - 1;
+ }
+ for (int d = 0; d < limit; ++d) {
+ const Index idx = p / m_gpuInputStrides[d];
+ inputIndex += idx * m_inputStrides[d];
+ p -= idx * m_gpuInputStrides[d];
+ }
+ inputIndex += p * m_inputStrides[limit];
+ }
+ return inputIndex;
+ }
+
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const {
+ Index outputIndex = 0;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int d = NumDims - 1; d > NumKernelDims; --d) {
+ const Index idx = p / m_gpuOutputStrides[d];
+ outputIndex += idx * m_outputStrides[d];
+ p -= idx * m_gpuOutputStrides[d];
+ }
+ outputIndex += p * m_outputStrides[NumKernelDims];
+ } else {
+ std::ptrdiff_t limit = 0;
+ if (NumKernelDims < NumDims) {
+ limit = NumDims - NumKernelDims - 1;
+ }
+ for (int d = 0; d < limit; ++d) {
+ const Index idx = p / m_gpuOutputStrides[d];
+ outputIndex += idx * m_outputStrides[d];
+ p -= idx * m_gpuOutputStrides[d];
+ }
+ outputIndex += p * m_outputStrides[limit];
+ }
+ return outputIndex;
+ }
+
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const {
+ const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? 0
+ : NumDims - NumKernelDims;
+ return i * m_inputStrides[offset];
+ }
+
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const {
+ const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? 0
+ : NumDims - NumKernelDims;
+ return i * m_outputStrides[offset];
+ }
+
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const {
+ const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? 0
+ : NumDims - NumKernelDims;
+ return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
+ }
+
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const {
+ const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? 0
+ : NumDims - NumKernelDims;
+ return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
+ }
+
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
+ const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? 0
+ : NumDims - NumKernelDims;
+ return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] +
+ k * m_inputStrides[offset + 2];
+ }
+
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
+ const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? 0
+ : NumDims - NumKernelDims;
+ return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] +
+ k * m_outputStrides[offset + 2];
+ }
+
+ private:
+ static const int NumDims = internal::array_size<InputDims>::value;
+ array<Index, NumDims> m_inputStrides;
+ array<Index, NumDims> m_outputStrides;
+ array<Index, NumDims> m_gpuInputStrides;
+ array<Index, NumDims> m_gpuOutputStrides;
+};
+
+
+
+template<typename Dimensions, typename InputXprType, typename KernelXprType>
+struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
+{
+ // Type promotion to handle the case where the types of the lhs and the rhs are different.
+ typedef typename promote_storage_type<typename InputXprType::Scalar,
+ typename KernelXprType::Scalar>::ret Scalar;
+ typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,
+ typename traits<KernelXprType>::StorageKind>::ret StorageKind;
+ typedef typename promote_index_type<typename traits<InputXprType>::Index,
+ typename traits<KernelXprType>::Index>::type Index;
+ typedef typename InputXprType::Nested LhsNested;
+ typedef typename KernelXprType::Nested RhsNested;
+ typedef typename remove_reference<LhsNested>::type _LhsNested;
+ typedef typename remove_reference<RhsNested>::type _RhsNested;
+ static const int NumDimensions = traits<InputXprType>::NumDimensions;
+ static const int Layout = traits<InputXprType>::Layout;
+ typedef typename conditional<Pointer_type_promotion<typename InputXprType::Scalar, Scalar>::val,
+ typename traits<InputXprType>::PointerType, typename traits<KernelXprType>::PointerType>::type PointerType;
+
+ enum {
+ Flags = 0
+ };
+};
+
+template<typename Dimensions, typename InputXprType, typename KernelXprType>
+struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense>
+{
+ typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type;
+};
+
+template<typename Dimensions, typename InputXprType, typename KernelXprType>
+struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type>
+{
+ typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type;
+};
+
+} // end namespace internal
+
+
+
+template<typename Indices, typename InputXprType, typename KernelXprType>
+class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors>
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
+ typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
+ typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims)
+ : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const Indices& indices() const { return m_indices; }
+
+ /** \returns the nested expressions */
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const typename internal::remove_all<typename InputXprType::Nested>::type&
+ inputExpression() const { return m_input_xpr; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const typename internal::remove_all<typename KernelXprType::Nested>::type&
+ kernelExpression() const { return m_kernel_xpr; }
+
+ protected:
+ typename InputXprType::Nested m_input_xpr;
+ typename KernelXprType::Nested m_kernel_xpr;
+ const Indices m_indices;
+};
+
+
+template<typename Indices, typename InputArgType, typename KernelArgType, typename Device>
+struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
+{
+ typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
+
+ static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
+ static const int NumKernelDims = internal::array_size<Indices>::value;
+ typedef typename XprType::Index Index;
+ typedef DSizes<Index, NumDims> Dimensions;
+
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+ typedef StorageMemory<Scalar, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+
+ enum {
+ IsAligned = int(TensorEvaluator<InputArgType, Device>::IsAligned) & int(TensorEvaluator<KernelArgType, Device>::IsAligned),
+ PacketAccess = int(TensorEvaluator<InputArgType, Device>::PacketAccess) & int(TensorEvaluator<KernelArgType, Device>::PacketAccess),
+ BlockAccess = false,
+ PreferBlockAccess = false,
+ Layout = TensorEvaluator<InputArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
+ };
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
+ {
+ EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+ const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
+ const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
+
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ m_inputStride[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
+ }
+ } else {
+ m_inputStride[NumDims - 1] = 1;
+ for (int i = NumDims - 2; i >= 0; --i) {
+ m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
+ }
+ }
+
+ m_dimensions = m_inputImpl.dimensions();
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = 0; i < NumKernelDims; ++i) {
+ const Index index = op.indices()[i];
+ const Index input_dim = input_dims[index];
+ const Index kernel_dim = kernel_dims[i];
+ const Index result_dim = input_dim - kernel_dim + 1;
+ m_dimensions[index] = result_dim;
+ if (i > 0) {
+ m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
+ } else {
+ m_kernelStride[0] = 1;
+ }
+ m_indexStride[i] = m_inputStride[index];
+ }
+
+ m_outputStride[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
+ }
+ } else {
+ for (int i = NumKernelDims - 1; i >= 0; --i) {
+ const Index index = op.indices()[i];
+ const Index input_dim = input_dims[index];
+ const Index kernel_dim = kernel_dims[i];
+ const Index result_dim = input_dim - kernel_dim + 1;
+ m_dimensions[index] = result_dim;
+ if (i < NumKernelDims - 1) {
+ m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
+ } else {
+ m_kernelStride[NumKernelDims - 1] = 1;
+ }
+ m_indexStride[i] = m_inputStride[index];
+ }
+
+ m_outputStride[NumDims - 1] = 1;
+ for (int i = NumDims - 2; i >= 0; --i) {
+ m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
+ }
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+ m_inputImpl.evalSubExprsIfNeeded(NULL);
+ preloadKernel();
+ return true;
+ }
+ EIGEN_STRONG_INLINE void cleanup() {
+ m_inputImpl.cleanup();
+ if (m_local_kernel) {
+ m_device.deallocate((void*)m_kernel);
+ m_local_kernel = false;
+ }
+ m_kernel = NULL;
+ }
+
+ void evalTo(typename XprType::Scalar* buffer) {
+ evalSubExprsIfNeeded(NULL);
+ for (int i = 0; i < dimensions().TotalSize(); ++i) {
+ buffer[i] += coeff(i);
+ }
+ cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ CoeffReturnType result = CoeffReturnType(0);
+ convolve(firstInput(index), 0, NumKernelDims-1, result);
+ return result;
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
+ {
+ Index indices[2] = {index, index+PacketSize-1};
+ Index startInputs[2] = {0, 0};
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = NumDims - 1; i > 0; --i) {
+ const Index idx0 = indices[0] / m_outputStride[i];
+ const Index idx1 = indices[1] / m_outputStride[i];
+ startInputs[0] += idx0 * m_inputStride[i];
+ startInputs[1] += idx1 * m_inputStride[i];
+ indices[0] -= idx0 * m_outputStride[i];
+ indices[1] -= idx1 * m_outputStride[i];
+ }
+ } else {
+ for (int i = 0; i < NumDims - 1; ++i) {
+ const Index idx0 = indices[0] / m_outputStride[i];
+ const Index idx1 = indices[1] / m_outputStride[i];
+ startInputs[0] += idx0 * m_inputStride[i];
+ startInputs[1] += idx1 * m_inputStride[i];
+ indices[0] -= idx0 * m_outputStride[i];
+ indices[1] -= idx1 * m_outputStride[i];
+ }
+ }
+ startInputs[0] += indices[0];
+ startInputs[1] += indices[1];
+
+ if (startInputs[1]-startInputs[0] == PacketSize-1) {
+ PacketReturnType result = internal::pset1<PacketReturnType>(0);
+ convolvePacket(startInputs[0], 0, NumKernelDims-1, result);
+ return result;
+ } else {
+ EIGEN_ALIGN_MAX Scalar data[PacketSize];
+ data[0] = Scalar(0);
+ convolve(startInputs[0], 0, NumKernelDims-1, data[0]);
+ for (int i = 1; i < PacketSize-1; ++i) {
+ data[i] = Scalar(0);
+ convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]);
+ }
+ data[PacketSize-1] = Scalar(0);
+ convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]);
+ return internal::pload<PacketReturnType>(data);
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ const double kernel_size = m_kernelImpl.dimensions().TotalSize();
+ // We ignore the use of fused multiply-add.
+ const double convolve_compute_cost =
+ TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
+ const double firstIndex_compute_cost =
+ NumDims *
+ (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
+ TensorOpCost::DivCost<Index>());
+ return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
+ kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
+ m_kernelImpl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, convolve_compute_cost, vectorized,
+ PacketSize));
+ }
+
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+ private:
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
+ Index startInput = 0;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = NumDims - 1; i > 0; --i) {
+ const Index idx = index / m_outputStride[i];
+ startInput += idx * m_inputStride[i];
+ index -= idx * m_outputStride[i];
+ }
+ } else {
+ for (int i = 0; i < NumDims - 1; ++i) {
+ const Index idx = index / m_outputStride[i];
+ startInput += idx * m_inputStride[i];
+ index -= idx * m_outputStride[i];
+ }
+ }
+ startInput += index;
+ return startInput;
+ }
+
+ EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
+ for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
+ const Index input = firstIndex + j * m_indexStride[DimIndex];
+ const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
+ if (DimIndex > 0) {
+ convolve(input, kernel, DimIndex-1, accum);
+ } else {
+ accum += m_inputImpl.coeff(input) * m_kernel[kernel];
+ }
+ }
+ }
+
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
+ for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
+ const Index input = firstIndex + j * m_indexStride[DimIndex];
+ const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
+ if (DimIndex > 0) {
+ convolvePacket(input, kernel, DimIndex-1, accum);
+ } else {
+ accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum);
+ }
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
+ // Don't make a local copy of the kernel unless we have to (i.e. it's an
+ // expression that needs to be evaluated)
+ const Scalar* in_place = m_kernelImpl.data();
+ if (in_place) {
+ m_kernel = in_place;
+ m_local_kernel = false;
+ } else {
+ size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
+ Scalar* local = (Scalar*)m_device.allocate_temp(kernel_sz);
+ typedef TensorEvalToOp<const KernelArgType> EvalTo;
+ EvalTo evalToTmp(local, m_kernelArg);
+ const bool Vectorize = internal::IsVectorizable<Device, KernelArgType>::value;
+ internal::TensorExecutor<const EvalTo, Device, Vectorize>::run(evalToTmp, m_device);
+
+ m_kernel = local;
+ m_local_kernel = true;
+ }
+ }
+
+ array<Index, NumDims> m_inputStride;
+ array<Index, NumDims> m_outputStride;
+
+ array<Index, NumKernelDims> m_indexStride;
+ array<Index, NumKernelDims> m_kernelStride;
+ TensorEvaluator<InputArgType, Device> m_inputImpl;
+ TensorEvaluator<KernelArgType, Device> m_kernelImpl;
+ Dimensions m_dimensions;
+
+ KernelArgType m_kernelArg;
+ const Scalar* m_kernel;
+ bool m_local_kernel;
+ const Device EIGEN_DEVICE_REF m_device;
+};
+
+
+
+
+// Use an optimized implementation of the evaluation code for GPUs whenever possible.
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
+
+template <int StaticKernelSize>
+struct GetKernelSize {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const {
+ return StaticKernelSize;
+ }
+};
+template <>
+struct GetKernelSize<Dynamic> {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const {
+ return kernelSize;
+ }
+};
+
+template <typename InputEvaluator, typename Index, typename InputDims,
+ int StaticKernelSize>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D(
+ InputEvaluator eval,
+ const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout>
+ indexMapper,
+ const float* __restrict kernel, const int numPlanes, const int numX,
+ const int maxX, const int kernelSize, float* buffer) {
+#if defined(EIGEN_HIPCC)
+ HIP_DYNAMIC_SHARED(float, s)
+#else
+ extern __shared__ float s[];
+#endif
+
+ const int first_x = blockIdx.x * maxX;
+ const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
+ const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
+ const int num_x_output = last_x - first_x + 1;
+
+ const int first_plane = blockIdx.y * blockDim.y;
+ const int plane_stride = blockDim.y * gridDim.y;
+
+ for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
+ // Load inputs to shared memory
+ const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
+ const int plane_kernel_offset = threadIdx.y * num_x_input;
+ #pragma unroll
+ for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
+ const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x);
+ s[i + plane_kernel_offset] = eval.coeff(tensor_index);
+ }
+
+ __syncthreads();
+
+ // Compute the convolution
+ const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
+
+ #pragma unroll
+ for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
+ const int kernel_offset = plane_kernel_offset + i;
+ float result = 0.0f;
+ #pragma unroll
+ for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
+ result += s[k + kernel_offset] * kernel[k];
+ }
+ const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x);
+ buffer[tensor_index] = result;
+ }
+ __syncthreads();
+ }
+};
+
+template <typename InputEvaluator, typename Index, typename InputDims,
+ int StaticKernelSizeX, int StaticKernelSizeY>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D(
+ InputEvaluator eval,
+ const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout>
+ indexMapper,
+ const float* __restrict kernel, const int numPlanes, const int numX,
+ const int maxX, const int numY, const int maxY, const int kernelSizeX,
+ const int kernelSizeY, float* buffer) {
+#if defined(EIGEN_HIPCC)
+ HIP_DYNAMIC_SHARED(float, s)
+#else
+ extern __shared__ float s[];
+#endif
+
+ const int first_x = blockIdx.x * maxX;
+ const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
+ const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
+ const int num_x_output = last_x - first_x + 1;
+
+ const int first_y = blockIdx.y * maxY;
+ const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
+ const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
+ const int num_y_output = last_y - first_y + 1;
+
+ const int first_plane = blockIdx.z * blockDim.z;
+ const int plane_stride = blockDim.z * gridDim.z;
+
+ for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
+
+ const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
+ const int plane_kernel_offset = threadIdx.z * num_y_input;
+
+ // Load inputs to shared memory
+ #pragma unroll
+ for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
+ const int input_offset = num_x_input * (j + plane_kernel_offset);
+ #pragma unroll
+ for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
+ const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y);
+ s[i + input_offset] = eval.coeff(tensor_index);
+ }
+ }
+
+ __syncthreads();
+
+ // Convolution
+ const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
+
+ #pragma unroll
+ for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
+ #pragma unroll
+ for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
+ float result = 0.0f;
+ #pragma unroll
+ for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
+ const int kernel_offset = kernelSizeX * l;
+ const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
+ #pragma unroll
+ for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
+ result += s[k + input_offset] * kernel[k + kernel_offset];
+ }
+ }
+ const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
+ buffer[tensor_index] = result;
+ }
+ }
+
+ __syncthreads();
+ }
+};
+
+template <typename InputEvaluator, typename Index, typename InputDims>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
+ InputEvaluator eval,
+ const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout>
+ indexMapper,
+ const float* __restrict kernel, const size_t numPlanes, const size_t numX,
+ const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
+ const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
+ const size_t kernelSizeZ, float* buffer) {
+#if defined(EIGEN_HIPCC)
+ HIP_DYNAMIC_SHARED(float, s)
+#else
+ extern __shared__ float s[];
+#endif
+
+ // Load inputs to shared memory
+ const int first_x = blockIdx.x * maxX;
+ const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
+ const int num_x_input = last_x - first_x + kernelSizeX;
+
+ const int first_y = blockIdx.y * maxY;
+ const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
+ const int num_y_input = last_y - first_y + kernelSizeY;
+
+ const int first_z = blockIdx.z * maxZ;
+ const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
+ const int num_z_input = last_z - first_z + kernelSizeZ;
+
+ for (int p = 0; p < numPlanes; ++p) {
+
+ const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
+ const int plane_kernel_offset = 0;
+
+ for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
+ for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
+ for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
+ const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
+ s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
+ }
+ }
+ }
+
+ __syncthreads();
+
+ // Convolution
+ const int num_z_output = last_z - first_z + 1;
+ const int num_y_output = last_y - first_y + 1;
+ const int num_x_output = last_x - first_x + 1;
+ const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
+
+ for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
+ for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
+ for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
+ float result = 0.0f;
+ for (int n = 0; n < kernelSizeZ; ++n) {
+ for (int m = 0; m < kernelSizeY; ++m) {
+ for (int l = 0; l < kernelSizeX; ++l) {
+ result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
+ }
+ }
+ }
+ const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
+ buffer[tensor_index] = result;
+ }
+ }
+ }
+ __syncthreads();
+ }
+};
+
+
+
+template<typename Indices, typename InputArgType, typename KernelArgType>
+struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice>
+{
+ typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
+
+ static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;
+ static const int NumKernelDims = internal::array_size<Indices>::value;
+ typedef typename XprType::Index Index;
+ typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions;
+
+ enum {
+ IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
+ PacketAccess = false,
+ BlockAccess = false,
+ PreferBlockAccess = false,
+ Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
+ };
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ TensorEvaluator(const XprType& op, const GpuDevice& device)
+ : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
+ {
+ EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+ const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
+ const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
+
+ m_dimensions = m_inputImpl.dimensions();
+ for (int i = 0; i < NumKernelDims; ++i) {
+ const Index index = op.indices()[i];
+ const Index input_dim = input_dims[index];
+ const Index kernel_dim = kernel_dims[i];
+ const Index result_dim = input_dim - kernel_dim + 1;
+ m_dimensions[index] = result_dim;
+ }
+ }
+
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
+ typedef typename InputArgType::Scalar Scalar;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+
+ EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+ preloadKernel();
+ m_inputImpl.evalSubExprsIfNeeded(NULL);
+ if (data) {
+ executeEval(data);
+ return false;
+ } else {
+ m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
+ executeEval(m_buf);
+ return true;
+ }
+ }
+
+ EIGEN_STRONG_INLINE void cleanup() {
+ m_inputImpl.cleanup();
+ if (m_buf) {
+ m_device.deallocate(m_buf);
+ m_buf = NULL;
+ }
+ if (m_local_kernel) {
+ m_device.deallocate((void*)m_kernel);
+ m_local_kernel = false;
+ }
+ m_kernel = NULL;
+ }
+
+ EIGEN_STRONG_INLINE void preloadKernel() {
+ // Don't make a local copy of the kernel unless we have to (i.e. it's an
+ // expression that needs to be evaluated)
+ const Scalar* in_place = m_kernelImpl.data();
+ if (in_place) {
+ m_kernel = in_place;
+ m_local_kernel = false;
+ } else {
+ size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
+ Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
+ typedef TensorEvalToOp<const KernelArgType> EvalTo;
+ EvalTo evalToTmp(local, m_kernelArg);
+ const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value;
+ internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device);
+
+ m_kernel = local;
+ m_local_kernel = true;
+ }
+ }
+
+ static unsigned int ceil(unsigned int num, unsigned int denom) {
+ const unsigned int rounded_toward_zero = num / denom;
+ if (num > rounded_toward_zero * denom) {
+ return rounded_toward_zero + 1;
+ }
+ return rounded_toward_zero;
+ }
+
+ void executeEval(Scalar* data) const {
+ typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;
+
+ const int maxSharedMem = m_device.sharedMemPerBlock();
+ const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock();
+ const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock;
+ const int numMultiProcessors = m_device.getNumGpuMultiProcessors();
+ const int warpSize = 32;
+
+ switch (NumKernelDims) {
+ case 1: {
+ const int kernel_size = m_kernelImpl.dimensions().TotalSize();
+
+ const int numX = dimensions()[m_indices[0]];
+ const int numP = dimensions().TotalSize() / numX;
+ int maxX;
+ dim3 block_size;
+
+ const int single_stride_dim =
+ static_cast<int>(Layout) == static_cast<int>(ColMajor)
+ ? 0
+ : m_inputImpl.dimensions().rank() - 1;
+ if (m_indices[0] == single_stride_dim) {
+ // Maximum the reuse
+ const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
+ maxX = numext::mini<int>(inner_dim, numX);
+ const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
+ block_size.x = numext::mini(maxThreadsPerBlock, maxX);
+ block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
+ }
+ else {
+ // Read as much as possible alongside the inner most dimension, that is the plane
+ const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
+ const int maxP = numext::mini<int>(inner_dim, numP);
+ maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);
+
+ block_size.x = numext::mini(warpSize, maxX);
+ block_size.y = numext::mini<int>(maxThreadsPerBlock/block_size.x, maxP);
+ }
+
+ const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
+ gpu_assert(shared_mem <= maxSharedMem);
+
+ const int num_x_blocks = ceil(numX, maxX);
+ const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
+ const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);
+
+ dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));
+
+
+ //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
+
+ const array<Index, 1> indices(m_indices[0]);
+ const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]);
+ internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(
+ m_inputImpl.dimensions(), kernel_dims, indices);
+ switch(kernel_size) {
+ case 4: {
+ LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
+ break;
+ }
+ case 7: {
+ LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
+ break;
+ }
+ default: {
+ LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
+ }
+ }
+ break;
+ }
+
+ case 2: {
+ const int idxX =
+ static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
+ const int idxY =
+ static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
+ const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
+ const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
+
+ const int numX = dimensions()[m_indices[idxX]];
+ const int numY = dimensions()[m_indices[idxY]];
+ const int numP = dimensions().TotalSize() / (numX*numY);
+
+ const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));
+
+ // Snap maxX to warp size
+ int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
+ const int maxX = numext::mini<int>(inner_dim, numX);
+ const int maxY = numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
+ const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);
+
+ dim3 block_size;
+ block_size.x = numext::mini(1024, maxX);
+ block_size.y = numext::mini<int>(1024/block_size.x, maxY);
+ block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP);
+
+ const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
+ gpu_assert(shared_mem <= maxSharedMem);
+
+ const int num_x_blocks = ceil(numX, maxX);
+ const int num_y_blocks = ceil(numY, maxY);
+ const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
+ const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);
+
+ dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));
+
+
+ //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
+
+ const array<Index, 2> indices(m_indices[idxX], m_indices[idxY]);
+ const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX],
+ m_kernelImpl.dimensions()[idxY]);
+ internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(
+ m_inputImpl.dimensions(), kernel_dims, indices);
+ switch (kernel_size_x) {
+ case 4: {
+ switch (kernel_size_y) {
+ case 7: {
+ LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
+ break;
+ }
+ default: {
+ LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
+ break;
+ }
+ }
+ break;
+ }
+ case 7: {
+ switch (kernel_size_y) {
+ case 4: {
+ LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
+ break;
+ }
+ default: {
+ LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
+ break;
+ }
+ }
+ break;
+ }
+ default: {
+ LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
+ break;
+ }
+ }
+ break;
+ }
+
+ case 3: {
+ const int idxX =
+ static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
+ const int idxY =
+ static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
+ const int idxZ =
+ static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;
+
+ const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
+ const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
+ const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];
+
+ const int numX = dimensions()[m_indices[idxX]];
+ const int numY = dimensions()[m_indices[idxY]];
+ const int numZ = dimensions()[m_indices[idxZ]];
+ const int numP = dimensions().TotalSize() / (numX*numY*numZ);
+
+ const int maxX = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
+ const int maxY = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
+ const int maxZ = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));
+
+ dim3 block_size;
+ block_size.x = numext::mini(32, maxX);
+ block_size.y = numext::mini(32, maxY);
+ block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxZ);
+ dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));
+
+ const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
+ gpu_assert(shared_mem <= maxSharedMem);
+
+ //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
+ const array<Index, 3> indices(m_indices[idxX], m_indices[idxY],
+ m_indices[idxZ]);
+ const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX],
+ m_kernelImpl.dimensions()[idxY],
+ m_kernelImpl.dimensions()[idxZ]);
+ internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
+ m_inputImpl.dimensions(), kernel_dims, indices);
+
+ LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
+ break;
+ }
+
+ default: {
+ EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
+ }
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ eigen_assert(m_buf);
+ eigen_assert(index < m_dimensions.TotalSize());
+ return m_buf[index];
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
+ {
+ eigen_assert(m_buf);
+ eigen_assert(index < m_dimensions.TotalSize());
+ return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
+ // model.
+ const double kernel_size = m_kernelImpl.dimensions().TotalSize();
+ // We ignore the use of fused multiply-add.
+ const double convolve_compute_cost =
+ TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
+ const double firstIndex_compute_cost =
+ NumDims *
+ (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
+ TensorOpCost::DivCost<Index>());
+ return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
+ kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
+ m_kernelImpl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, convolve_compute_cost, vectorized,
+ PacketSize));
+ }
+
+ private:
+ // No assignment (copies are needed by the kernels)
+ TensorEvaluator& operator = (const TensorEvaluator&);
+
+ TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
+ TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
+ KernelArgType m_kernelArg;
+ Indices m_indices;
+ Dimensions m_dimensions;
+ Scalar* m_buf;
+ const Scalar* m_kernel;
+ bool m_local_kernel;
+
+ const GpuDevice& m_device;
+};
+#endif
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorConvolutionSycl.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorConvolutionSycl.h
new file mode 100644
index 0000000..033318f
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorConvolutionSycl.h
@@ -0,0 +1,544 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
+
+namespace Eigen {
+
+/** \class TensorConvolution
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor convolution class.
+ *
+ *
+ */
+
+enum class convolution_type { CONV1D, CONV2D, CONV3D };
+template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
+ typename Kernel_accessor, typename Buffer_accessor, convolution_type Conv_Dim>
+struct EigenConvolutionKernel;
+template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
+ typename Kernel_accessor, typename Buffer_accessor>
+struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
+ Buffer_accessor, convolution_type::CONV1D> {
+ typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+ Local_accessor;
+ Local_accessor local_acc;
+ Evaluator device_evaluator;
+ Kernel_accessor kernel_filter;
+ Buffer_accessor buffer_acc;
+ internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper;
+ const size_t kernelSize;
+ const cl::sycl::range<2> input_range;
+ EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
+ Buffer_accessor buffer_acc_,
+ internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper_,
+ const size_t kernelSize_, const cl::sycl::range<2> input_range_)
+ : local_acc(local_acc_),
+ device_evaluator(device_evaluator_),
+ kernel_filter(kernel_filter_),
+ buffer_acc(buffer_acc_),
+ indexMapper(indexMapper_),
+ kernelSize(kernelSize_),
+ input_range(input_range_) {}
+
+ template <typename BooleanDim2>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) {
+ return (boolean_check[0] && boolean_check[1]);
+ }
+ void operator()(cl::sycl::nd_item<2> itemID) {
+ auto buffer_ptr = buffer_acc.get_pointer();
+ auto kernel_ptr = kernel_filter.get_pointer();
+ // the required row to be calculated for the for each plane in shered memory
+ const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1);
+ const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input;
+ const size_t input_offset = itemID.get_group(0) * itemID.get_local_range()[0];
+ const size_t plane_tensor_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(1));
+ /// fill the shared memory
+ for (size_t i = itemID.get_local_id(0); i < num_input; i += itemID.get_local_range()[0]) {
+ const size_t local_index = i + plane_kernel_offset;
+ const size_t tensor_index =
+ plane_tensor_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + input_offset);
+
+ local_acc[local_index] =
+ (((i + input_offset) < (input_range[0] + kernelSize - 1)) && itemID.get_global_id(1) < input_range[1])
+ ? device_evaluator.coeff(tensor_index)
+ : CoeffReturnType(0);
+ }
+
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+
+ // calculate the convolution // output start x
+ const size_t first_output_start = itemID.get_group(0) * (itemID.get_local_range()[0]);
+ if (boundary_check(itemID.get_global_id() < input_range)) {
+ CoeffReturnType result = static_cast<CoeffReturnType>(0);
+ const size_t index = plane_kernel_offset + itemID.get_local_id(0);
+ for (size_t k = 0; k < kernelSize; ++k) {
+ result += (local_acc[k + index] * kernel_ptr[k]);
+ }
+ const size_t tensor_index =
+ indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(1)) +
+ indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + first_output_start);
+ buffer_ptr[tensor_index] = result;
+ }
+ }
+};
+
+template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
+ typename Kernel_accessor, typename Buffer_accessor>
+struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
+ Buffer_accessor, convolution_type::CONV2D> {
+ typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+ Local_accessor;
+ Local_accessor local_acc;
+ Evaluator device_evaluator;
+ Kernel_accessor kernel_filter;
+ Buffer_accessor buffer_acc;
+ internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper;
+ const cl::sycl::range<2> kernel_size;
+ const cl::sycl::range<3> input_range;
+ EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
+ Buffer_accessor buffer_acc_,
+ internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper_,
+ const cl::sycl::range<2> kernel_size_, const cl::sycl::range<3> input_range_)
+ : local_acc(local_acc_),
+ device_evaluator(device_evaluator_),
+ kernel_filter(kernel_filter_),
+ buffer_acc(buffer_acc_),
+ indexMapper(indexMapper_),
+ kernel_size(kernel_size_),
+ input_range(input_range_) {}
+ template <typename BooleanDim3>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
+ return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
+ }
+
+ void operator()(cl::sycl::nd_item<3> itemID) {
+ auto buffer_ptr = buffer_acc.get_pointer();
+ auto kernel_ptr = kernel_filter.get_pointer();
+ // the required row to be calculated for the for each plane in shered memory
+ const auto num_input = cl::sycl::range<2>{
+ (cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)};
+
+ const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(2));
+ const size_t plane_kernel_offset = itemID.get_local_id(2) * num_input[1];
+
+ const auto input_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
+ itemID.get_group(1) * itemID.get_local_range()[1]};
+
+ // fill the local memory
+ bool in_range_dim2 = itemID.get_global_id(2) < input_range[2];
+ for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
+ const size_t local_input_offset = num_input[0] * (j + plane_kernel_offset);
+ bool in_range_dim1 = ((j + input_offset[1]) < (input_range[1] + kernel_size[1] - 1));
+ for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
+ const size_t local_index = i + local_input_offset;
+ const size_t tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
+ i + input_offset[0], j + input_offset[1]);
+ local_acc[local_index] = (((i + input_offset[0]) < (input_range[0] + kernel_size[0] - 1)) &&
+ in_range_dim1 && in_range_dim2)
+ ? device_evaluator.coeff(tensor_index)
+ : CoeffReturnType(0);
+ }
+ }
+
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+
+ // output offset start for each thread
+ const auto output_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
+ itemID.get_group(1) * itemID.get_local_range()[1]};
+
+ if (boundary_check(itemID.get_global_id() < input_range)) {
+ CoeffReturnType result = static_cast<CoeffReturnType>(0);
+
+ for (size_t j = 0; j < kernel_size[1]; j++) {
+ size_t kernel_offset = kernel_size[0] * j;
+ const size_t index =
+ (num_input[0] * (plane_kernel_offset + j + itemID.get_local_id(1))) + itemID.get_local_id(0);
+ for (size_t i = 0; i < kernel_size[0]; i++) {
+ result += (local_acc[i + index] * kernel_ptr[i + kernel_offset]);
+ }
+ }
+ const size_t tensor_index =
+ indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(2)) +
+ indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + output_offset[0],
+ itemID.get_local_id(1) + output_offset[1]);
+
+ buffer_ptr[tensor_index] = result;
+ }
+ }
+};
+
+template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
+ typename Kernel_accessor, typename Buffer_accessor>
+struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
+ Buffer_accessor, convolution_type::CONV3D> {
+ typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+ Local_accessor;
+ Local_accessor local_acc;
+ Evaluator device_evaluator;
+ Kernel_accessor kernel_filter;
+ Buffer_accessor buffer_acc;
+ internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper;
+ const cl::sycl::range<3> kernel_size;
+ const cl::sycl::range<3> input_range;
+ const size_t numP;
+
+ EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
+ Buffer_accessor buffer_acc_,
+ internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper_,
+ const cl::sycl::range<3> kernel_size_, const cl::sycl::range<3> input_range_,
+ const size_t numP_)
+ : local_acc(local_acc_),
+ device_evaluator(device_evaluator_),
+ kernel_filter(kernel_filter_),
+ buffer_acc(buffer_acc_),
+ indexMapper(indexMapper_),
+ kernel_size(kernel_size_),
+ input_range(input_range_),
+ numP(numP_) {}
+ template <typename BooleanDim3>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
+ return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
+ }
+ void operator()(cl::sycl::nd_item<3> itemID) {
+ auto buffer_ptr = buffer_acc.get_pointer();
+ auto kernel_ptr = kernel_filter.get_pointer();
+ const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1};
+
+ const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()};
+
+ const auto output_offset =
+ cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range() + itemID.get_local_id()};
+
+ for (size_t p = 0; p < numP; p++) {
+ /// fill the shared memory
+ const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
+ for (size_t k = itemID.get_local_id(2); k < num_input[2]; k += itemID.get_local_range()[2]) {
+ size_t local_index_dim2 = num_input[0] * num_input[1] * k;
+ bool cond_k_dim = (k + input_offset[2] < (input_range[2] + kernel_size[2] - 1));
+ for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
+ bool cond_j_dim = cond_k_dim && (j + input_offset[1] < (input_range[1] + kernel_size[1] - 1));
+ size_t local_index_dim1 = (num_input[0] * j) + local_index_dim2;
+ for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
+ bool conds = cond_j_dim && (i + input_offset[0] < (input_range[0] + kernel_size[0] - 1));
+ const size_t local_index = local_index_dim1 + i;
+ const size_t tensor_index =
+ plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
+ i + input_offset[0], j + input_offset[1], k + input_offset[2]);
+ local_acc[local_index] = conds ? device_evaluator.coeff(tensor_index) : CoeffReturnType(0);
+ }
+ }
+ }
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+
+ // calculate the convolution
+
+ if (boundary_check(itemID.get_global_id() < input_range)) {
+ CoeffReturnType result = static_cast<CoeffReturnType>(0);
+ for (size_t k = 0; k < kernel_size[2]; k++) {
+ for (size_t j = 0; j < kernel_size[1]; j++) {
+ for (size_t i = 0; i < kernel_size[0]; i++) {
+ const size_t kernel_index = i + kernel_size[0] * (j + kernel_size[1] * k);
+ const size_t local_index =
+ ((i + itemID.get_local_id(0)) +
+ num_input[0] * ((j + itemID.get_local_id(1)) + num_input[1] * (k + itemID.get_local_id(2))));
+
+ result += (local_acc[local_index] * kernel_ptr[kernel_index]);
+ }
+ }
+ }
+ const size_t tensor_index =
+ indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p) +
+ indexMapper.mapGpuOutputKernelToTensorOutputOffset(output_offset[0], output_offset[1], output_offset[2]);
+ buffer_ptr[tensor_index] = result;
+ }
+
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+ }
+ }
+};
+
+template <typename Indices, typename InputArgType, typename KernelArgType>
+struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Eigen::SyclDevice> {
+ typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
+
+ static const int NumDims =
+ internal::array_size<typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions>::value;
+ static const int NumKernelDims = internal::array_size<Indices>::value;
+ typedef typename XprType::Index Index;
+ typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions KernelDimensions;
+ typedef const Eigen::SyclDevice Device;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Eigen::SyclDevice>::type PacketReturnType;
+ typedef typename InputArgType::Scalar Scalar;
+ static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+ typedef StorageMemory<CoeffReturnType, Eigen::SyclDevice> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+ typedef StorageMemory<const CoeffReturnType, Eigen::SyclDevice> KernelStorage;
+
+ enum {
+ IsAligned = TensorEvaluator<InputArgType, Eigen::SyclDevice>::IsAligned &
+ TensorEvaluator<KernelArgType, Eigen::SyclDevice>::IsAligned,
+ PacketAccess = false,
+ BlockAccess = false,
+ PreferBlockAccess = false,
+ Layout = TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
+ };
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ TensorEvaluator(const XprType &op, const Eigen::SyclDevice &device)
+ : m_inputImpl(op.inputExpression(), device),
+ m_kernelArg(op.kernelExpression()),
+ m_kernelImpl(op.kernelExpression(), device),
+ m_indices(op.indices()),
+ m_buf(NULL),
+ m_kernel(NULL),
+ m_local_kernel(false),
+ m_device(device) {
+ EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout) ==
+ static_cast<int>(TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Layout)),
+ YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+ const typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions &input_dims = m_inputImpl.dimensions();
+ const typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions &kernel_dims =
+ m_kernelImpl.dimensions();
+
+ m_dimensions = m_inputImpl.dimensions();
+ for (int i = 0; i < NumKernelDims; ++i) {
+ const Index index = op.indices()[i];
+ const Index input_dim = input_dims[index];
+ const Index kernel_dim = kernel_dims[i];
+ const Index result_dim = input_dim - kernel_dim + 1;
+ m_dimensions[index] = result_dim;
+ }
+ }
+
+ EIGEN_DEVICE_FUNC const Dimensions &dimensions() const { return m_dimensions; }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+ preloadKernel();
+ m_inputImpl.evalSubExprsIfNeeded(NULL);
+ if (data) {
+ executeEval(data);
+ return false;
+ } else {
+ m_buf = (EvaluatorPointerType)m_device.get(
+ (Scalar *)m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)));
+ executeEval(m_buf);
+ return true;
+ }
+ }
+
+ EIGEN_STRONG_INLINE void cleanup() {
+ m_inputImpl.cleanup();
+ if (m_buf) {
+ m_device.deallocate_temp(m_buf);
+ m_buf = NULL;
+ }
+ if (m_local_kernel) {
+ m_device.deallocate_temp(m_kernel);
+ m_local_kernel = false;
+ }
+ m_kernel = NULL;
+ }
+ /// used by sycl in order to build the sycl buffer
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device &device() const { return m_device; }
+ /// used by sycl in order to build the sycl buffer
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_buf; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
+ // Don't make a local copy of the kernel unless we have to (i.e. it's an
+ // expression that needs to be evaluated)
+ typename KernelStorage::Type in_place = m_kernelImpl.data();
+ if (in_place) {
+ m_kernel = in_place;
+ m_local_kernel = false;
+ } else {
+ ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
+ EvaluatorPointerType local = (EvaluatorPointerType)m_device.get((Scalar *)m_device.allocate_temp(kernel_sz));
+ typedef TensorEvalToOp<const KernelArgType> EvalTo;
+ EvalTo evalToTmp(m_device.get(local), m_kernelArg);
+ const bool PacketAccess = internal::IsVectorizable<Eigen::SyclDevice, KernelArgType>::value;
+ internal::TensorExecutor<const EvalTo, Eigen::SyclDevice, PacketAccess>::run(evalToTmp, m_device);
+ m_kernel = local;
+ m_local_kernel = true;
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(EvaluatorPointerType data) const {
+ typedef TensorEvaluator<InputArgType, Eigen::SyclDevice> InputEvaluator;
+ typedef typename InputEvaluator::Dimensions InputDims;
+ switch (NumKernelDims) {
+ case 1: {
+ const size_t numX = dimensions()[m_indices[0]];
+ const size_t numP = dimensions().TotalSize() / numX;
+ const auto input_dim = std::array<size_t, 2>{numX, numP};
+ auto global_range = cl::sycl::range<2>{};
+ auto local_range = cl::sycl::range<2>{};
+ const size_t kernel_size = m_kernelImpl.dimensions().TotalSize();
+
+ m_device.parallel_for_setup(input_dim, global_range, local_range);
+ const size_t local_memory_size = (local_range[0] + kernel_size - 1) * (local_range[1]);
+ gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
+ const array<Index, 1> indices{{m_indices[0]}};
+ const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}};
+ internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
+
+ typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
+ typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV1D>
+ ConvKernel;
+
+ m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
+ m_inputImpl, m_kernel, data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size,
+ indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1]));
+ break;
+ }
+
+ case 2: {
+ auto kernel_index = std::array<size_t, 2>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1,
+ static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0};
+ auto kernel_size = cl::sycl::range<2>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
+ (size_t)m_kernelImpl.dimensions()[kernel_index[1]]};
+ const size_t numX = dimensions()[m_indices[kernel_index[0]]];
+ const size_t numY = dimensions()[m_indices[kernel_index[1]]];
+ const size_t numP = dimensions().TotalSize() / (numX * numY);
+ auto input_dim = std::array<size_t, 3>{numX, numY, numP};
+
+ auto global_range = cl::sycl::range<3>{};
+ auto local_range = cl::sycl::range<3>{};
+
+ m_device.parallel_for_setup(input_dim, global_range, local_range);
+
+ const size_t local_memory_size =
+ (local_range[0] + kernel_size[0] - 1) * (local_range[1] + kernel_size[1] - 1) * local_range[2];
+ gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
+ const array<Index, 2> indices{{m_indices[kernel_index[0]], m_indices[kernel_index[1]]}};
+ const array<Index, 2> kernel_dims{
+ {m_kernelImpl.dimensions()[kernel_index[0]], m_kernelImpl.dimensions()[kernel_index[1]]}};
+ internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
+ typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
+ typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV2D>
+ ConvKernel;
+ m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
+ m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
+ indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]});
+ break;
+ }
+
+ case 3: {
+ auto kernel_index = std::array<size_t, 3>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2,
+ static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1,
+ static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0};
+
+ auto kernel_size = cl::sycl::range<3>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
+ (size_t)m_kernelImpl.dimensions()[kernel_index[1]],
+ (size_t)m_kernelImpl.dimensions()[kernel_index[2]]};
+
+ const size_t numX = dimensions()[m_indices[kernel_index[0]]];
+ const size_t numY = dimensions()[m_indices[kernel_index[1]]];
+ const size_t numZ = dimensions()[m_indices[kernel_index[2]]];
+ auto input_dim = std::array<size_t, 3>{numX, numY, numZ};
+ const size_t numP = dimensions().TotalSize() / (numX * numY * numZ);
+
+ const array<Index, 3> indices{
+ {m_indices[kernel_index[0]], m_indices[kernel_index[1]], m_indices[kernel_index[2]]}};
+ const array<Index, 3> kernel_dims{{m_kernelImpl.dimensions()[kernel_index[0]],
+ m_kernelImpl.dimensions()[kernel_index[1]],
+ m_kernelImpl.dimensions()[kernel_index[2]]}};
+
+ internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
+
+ auto global_range = cl::sycl::range<3>{};
+ auto local_range = cl::sycl::range<3>{};
+
+ m_device.parallel_for_setup(input_dim, global_range, local_range);
+ auto local_memory_range = (local_range + kernel_size - 1);
+ const size_t local_memory_size = local_memory_range[0] * local_memory_range[1] * local_memory_range[2];
+
+ gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
+ typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
+ typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV3D>
+ ConvKernel;
+ m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
+ m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
+ indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP);
+ break;
+ }
+
+ default: {
+ EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3),
+ THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
+ }
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+ eigen_assert(m_buf != NULL);
+ eigen_assert(index < m_dimensions.TotalSize());
+ return m_buf[index];
+ }
+
+ template <int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const {
+ eigen_assert(m_buf != NULL);
+ eigen_assert(index < m_dimensions.TotalSize());
+ return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
+ // model.
+ const double kernel_size = m_kernelImpl.dimensions().TotalSize();
+ // We ignore the use of fused multiply-add.
+ const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
+ const double firstIndex_compute_cost =
+ NumDims *
+ (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
+ return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
+ kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
+ }
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_kernelImpl.bind(cgh);
+ m_inputImpl.bind(cgh);
+ m_buf.bind(cgh);
+ m_kernel.bind(cgh);
+ }
+
+ private:
+ // No assignment (copies are needed by the kernels)
+ TensorEvaluator &operator=(const TensorEvaluator &);
+ TensorEvaluator<InputArgType, Eigen::SyclDevice> m_inputImpl;
+ KernelArgType m_kernelArg;
+ TensorEvaluator<KernelArgType, Eigen::SyclDevice> m_kernelImpl;
+ Indices m_indices;
+ Dimensions m_dimensions;
+ EvaluatorPointerType m_buf;
+ typename KernelStorage::Type m_kernel;
+ bool m_local_kernel;
+ const Eigen::SyclDevice EIGEN_DEVICE_REF m_device;
+}; // namespace Eigen
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorCostModel.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorCostModel.h
new file mode 100644
index 0000000..195267c
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorCostModel.h
@@ -0,0 +1,214 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Rasmus Munk Larsen <rmlarsen@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
+#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
+
+namespace Eigen {
+
+/** \class TensorEvaluator
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief A cost model used to limit the number of threads used for evaluating
+ * tensor expression.
+ *
+ */
+
+// Class storing the cost of evaluating a tensor expression in terms of the
+// estimated number of operand bytes loads, bytes stored, and compute cycles.
+class TensorOpCost {
+ public:
+ // TODO(rmlarsen): Fix the scalar op costs in Eigen proper. Even a simple
+ // model based on minimal reciprocal throughput numbers from Intel or
+ // Agner Fog's tables would be better than what is there now.
+ template <typename ArgType>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int MulCost() {
+ return internal::functor_traits<
+ internal::scalar_product_op<ArgType, ArgType> >::Cost;
+ }
+ template <typename ArgType>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int AddCost() {
+ return internal::functor_traits<internal::scalar_sum_op<ArgType> >::Cost;
+ }
+ template <typename ArgType>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int DivCost() {
+ return internal::functor_traits<
+ internal::scalar_quotient_op<ArgType, ArgType> >::Cost;
+ }
+ template <typename ArgType>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int ModCost() {
+ return internal::functor_traits<internal::scalar_mod_op<ArgType> >::Cost;
+ }
+ template <typename SrcType, typename TargetType>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int CastCost() {
+ return internal::functor_traits<
+ internal::scalar_cast_op<SrcType, TargetType> >::Cost;
+ }
+
+ EIGEN_DEVICE_FUNC
+ TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {}
+ EIGEN_DEVICE_FUNC
+ TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
+ : bytes_loaded_(bytes_loaded),
+ bytes_stored_(bytes_stored),
+ compute_cycles_(compute_cycles) {}
+
+ EIGEN_DEVICE_FUNC
+ TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles,
+ bool vectorized, double packet_size)
+ : bytes_loaded_(bytes_loaded),
+ bytes_stored_(bytes_stored),
+ compute_cycles_(vectorized ? compute_cycles / packet_size
+ : compute_cycles) {
+ eigen_assert(bytes_loaded >= 0 && (numext::isfinite)(bytes_loaded));
+ eigen_assert(bytes_stored >= 0 && (numext::isfinite)(bytes_stored));
+ eigen_assert(compute_cycles >= 0 && (numext::isfinite)(compute_cycles));
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const {
+ return bytes_loaded_;
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const {
+ return bytes_stored_;
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const {
+ return compute_cycles_;
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost(
+ double load_cost, double store_cost, double compute_cost) const {
+ return load_cost * bytes_loaded_ + store_cost * bytes_stored_ +
+ compute_cost * compute_cycles_;
+ }
+
+ // Drop memory access component. Intended for cases when memory accesses are
+ // sequential or are completely masked by computations.
+ EIGEN_DEVICE_FUNC void dropMemoryCost() {
+ bytes_loaded_ = 0;
+ bytes_stored_ = 0;
+ }
+
+ // TODO(rmlarsen): Define min in terms of total cost, not elementwise.
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin(
+ const TensorOpCost& rhs) const {
+ double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded());
+ double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored());
+ double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles());
+ return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
+ }
+
+ // TODO(rmlarsen): Define max in terms of total cost, not elementwise.
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax(
+ const TensorOpCost& rhs) const {
+ double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
+ double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored());
+ double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles());
+ return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(
+ const TensorOpCost& rhs) {
+ bytes_loaded_ += rhs.bytes_loaded();
+ bytes_stored_ += rhs.bytes_stored();
+ compute_cycles_ += rhs.compute_cycles();
+ return *this;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(double rhs) {
+ bytes_loaded_ *= rhs;
+ bytes_stored_ *= rhs;
+ compute_cycles_ *= rhs;
+ return *this;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+(
+ TensorOpCost lhs, const TensorOpCost& rhs) {
+ lhs += rhs;
+ return lhs;
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
+ TensorOpCost lhs, double rhs) {
+ lhs *= rhs;
+ return lhs;
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
+ double lhs, TensorOpCost rhs) {
+ rhs *= lhs;
+ return rhs;
+ }
+
+ friend std::ostream& operator<<(std::ostream& os, const TensorOpCost& tc) {
+ return os << "[bytes_loaded = " << tc.bytes_loaded()
+ << ", bytes_stored = " << tc.bytes_stored()
+ << ", compute_cycles = " << tc.compute_cycles() << "]";
+ }
+
+ private:
+ double bytes_loaded_;
+ double bytes_stored_;
+ double compute_cycles_;
+};
+
+// TODO(rmlarsen): Implement a policy that chooses an "optimal" number of theads
+// in [1:max_threads] instead of just switching multi-threading off for small
+// work units.
+template <typename Device>
+class TensorCostModel {
+ public:
+ // Scaling from Eigen compute cost to device cycles.
+ static const int kDeviceCyclesPerComputeCycle = 1;
+
+ // Costs in device cycles.
+ static const int kStartupCycles = 100000;
+ static const int kPerThreadCycles = 100000;
+ static const int kTaskSize = 40000;
+
+ // Returns the number of threads in [1:max_threads] to use for
+ // evaluating an expression with the given output size and cost per
+ // coefficient.
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(
+ double output_size, const TensorOpCost& cost_per_coeff, int max_threads) {
+ double cost = totalCost(output_size, cost_per_coeff);
+ double threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
+ // Make sure we don't invoke undefined behavior when we convert to an int.
+ threads = numext::mini<double>(threads, GenericNumTraits<int>::highest());
+ return numext::mini(max_threads,
+ numext::maxi<int>(1, static_cast<int>(threads)));
+ }
+
+ // taskSize assesses parallel task size.
+ // Value of 1.0 means ideal parallel task size. Values < 1.0 mean that task
+ // granularity needs to be increased to mitigate parallelization overheads.
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize(
+ double output_size, const TensorOpCost& cost_per_coeff) {
+ return totalCost(output_size, cost_per_coeff) / kTaskSize;
+ }
+
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(
+ double output_size, const TensorOpCost& cost_per_coeff) {
+ // Cost of memory fetches from L2 cache. 64 is typical cache line size.
+ // 11 is L2 cache latency on Haswell.
+ // We don't know whether data is in L1, L2 or L3. But we are most interested
+ // in single-threaded computational time around 100us-10ms (smaller time
+ // is too small for parallelization, larger time is not interesting
+ // either because we are probably using all available threads already).
+ // And for the target time range, L2 seems to be what matters. Data set
+ // fitting into L1 is too small to take noticeable time. Data set fitting
+ // only into L3 presumably will take more than 10ms to load and process.
+ const double kLoadCycles = 1.0 / 64 * 11;
+ const double kStoreCycles = 1.0 / 64 * 11;
+ // Scaling from Eigen compute cost to device cycles.
+ return output_size *
+ cost_per_coeff.total_cost(kLoadCycles, kStoreCycles,
+ kDeviceCyclesPerComputeCycle);
+ }
+};
+
+} // namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorCustomOp.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorCustomOp.h
new file mode 100644
index 0000000..95a8a84
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorCustomOp.h
@@ -0,0 +1,347 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H
+
+namespace Eigen {
+
+/** \class TensorCustomUnaryOp
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor custom class.
+ *
+ *
+ */
+namespace internal {
+template<typename CustomUnaryFunc, typename XprType>
+struct traits<TensorCustomUnaryOp<CustomUnaryFunc, XprType> >
+{
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::StorageKind StorageKind;
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = traits<XprType>::NumDimensions;
+ static const int Layout = traits<XprType>::Layout;
+ typedef typename traits<XprType>::PointerType PointerType;
+};
+
+template<typename CustomUnaryFunc, typename XprType>
+struct eval<TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Eigen::Dense>
+{
+ typedef const TensorCustomUnaryOp<CustomUnaryFunc, XprType>EIGEN_DEVICE_REF type;
+};
+
+template<typename CustomUnaryFunc, typename XprType>
+struct nested<TensorCustomUnaryOp<CustomUnaryFunc, XprType> >
+{
+ typedef TensorCustomUnaryOp<CustomUnaryFunc, XprType> type;
+};
+
+} // end namespace internal
+
+
+
+template<typename CustomUnaryFunc, typename XprType>
+class TensorCustomUnaryOp : public TensorBase<TensorCustomUnaryOp<CustomUnaryFunc, XprType>, ReadOnlyAccessors>
+{
+ public:
+ typedef typename internal::traits<TensorCustomUnaryOp>::Scalar Scalar;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename internal::nested<TensorCustomUnaryOp>::type Nested;
+ typedef typename internal::traits<TensorCustomUnaryOp>::StorageKind StorageKind;
+ typedef typename internal::traits<TensorCustomUnaryOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCustomUnaryOp(const XprType& expr, const CustomUnaryFunc& func)
+ : m_expr(expr), m_func(func) {}
+
+ EIGEN_DEVICE_FUNC
+ const CustomUnaryFunc& func() const { return m_func; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ expression() const { return m_expr; }
+
+ protected:
+ typename XprType::Nested m_expr;
+ const CustomUnaryFunc m_func;
+};
+
+
+// Eval as rvalue
+template<typename CustomUnaryFunc, typename XprType, typename Device>
+struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Device>
+{
+ typedef TensorCustomUnaryOp<CustomUnaryFunc, XprType> ArgType;
+ typedef typename internal::traits<ArgType>::Index Index;
+ static const int NumDims = internal::traits<ArgType>::NumDimensions;
+ typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename internal::remove_const<typename ArgType::Scalar>::type Scalar;
+ typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+ typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+
+ enum {
+ IsAligned = false,
+ PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
+ BlockAccess = false,
+ PreferBlockAccess = false,
+ Layout = TensorEvaluator<XprType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
+ };
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device)
+ : m_op(op), m_device(device), m_result(NULL)
+ {
+ m_dimensions = op.func().dimensions(op.expression());
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+ if (data) {
+ evalTo(data);
+ return false;
+ } else {
+ m_result = static_cast<EvaluatorPointerType>(m_device.get( (CoeffReturnType*)
+ m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar))));
+ evalTo(m_result);
+ return true;
+ }
+ }
+
+ EIGEN_STRONG_INLINE void cleanup() {
+ if (m_result) {
+ m_device.deallocate_temp(m_result);
+ m_result = NULL;
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+ return m_result[index];
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
+ return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ // TODO(rmlarsen): Extend CustomOp API to return its cost estimate.
+ return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
+ }
+
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
+
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_result.bind(cgh);
+ }
+#endif
+
+ protected:
+ void evalTo(EvaluatorPointerType data) {
+ TensorMap<Tensor<CoeffReturnType, NumDims, Layout, Index> > result(m_device.get(data), m_dimensions);
+ m_op.func().eval(m_op.expression(), result, m_device);
+ }
+
+ Dimensions m_dimensions;
+ const ArgType m_op;
+ const Device EIGEN_DEVICE_REF m_device;
+ EvaluatorPointerType m_result;
+};
+
+
+
+/** \class TensorCustomBinaryOp
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor custom class.
+ *
+ *
+ */
+namespace internal {
+template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>
+struct traits<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> >
+{
+ typedef typename internal::promote_storage_type<typename LhsXprType::Scalar,
+ typename RhsXprType::Scalar>::ret Scalar;
+ typedef typename internal::promote_storage_type<typename LhsXprType::CoeffReturnType,
+ typename RhsXprType::CoeffReturnType>::ret CoeffReturnType;
+ typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind,
+ typename traits<RhsXprType>::StorageKind>::ret StorageKind;
+ typedef typename promote_index_type<typename traits<LhsXprType>::Index,
+ typename traits<RhsXprType>::Index>::type Index;
+ typedef typename LhsXprType::Nested LhsNested;
+ typedef typename RhsXprType::Nested RhsNested;
+ typedef typename remove_reference<LhsNested>::type _LhsNested;
+ typedef typename remove_reference<RhsNested>::type _RhsNested;
+ static const int NumDimensions = traits<LhsXprType>::NumDimensions;
+ static const int Layout = traits<LhsXprType>::Layout;
+ typedef typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val,
+ typename traits<LhsXprType>::PointerType, typename traits<RhsXprType>::PointerType>::type PointerType;
+};
+
+template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>
+struct eval<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Eigen::Dense>
+{
+ typedef const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>& type;
+};
+
+template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>
+struct nested<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> >
+{
+ typedef TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> type;
+};
+
+} // end namespace internal
+
+
+
+template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>
+class TensorCustomBinaryOp : public TensorBase<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, ReadOnlyAccessors>
+{
+ public:
+ typedef typename internal::traits<TensorCustomBinaryOp>::Scalar Scalar;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename internal::traits<TensorCustomBinaryOp>::CoeffReturnType CoeffReturnType;
+ typedef typename internal::nested<TensorCustomBinaryOp>::type Nested;
+ typedef typename internal::traits<TensorCustomBinaryOp>::StorageKind StorageKind;
+ typedef typename internal::traits<TensorCustomBinaryOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCustomBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const CustomBinaryFunc& func)
+
+ : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_func(func) {}
+
+ EIGEN_DEVICE_FUNC
+ const CustomBinaryFunc& func() const { return m_func; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename LhsXprType::Nested>::type&
+ lhsExpression() const { return m_lhs_xpr; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename RhsXprType::Nested>::type&
+ rhsExpression() const { return m_rhs_xpr; }
+
+ protected:
+ typename LhsXprType::Nested m_lhs_xpr;
+ typename RhsXprType::Nested m_rhs_xpr;
+ const CustomBinaryFunc m_func;
+};
+
+
+// Eval as rvalue
+template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType, typename Device>
+struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Device>
+{
+ typedef TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> XprType;
+ typedef typename internal::traits<XprType>::Index Index;
+ static const int NumDims = internal::traits<XprType>::NumDimensions;
+ typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+
+ typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+
+ enum {
+ IsAligned = false,
+ PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
+ BlockAccess = false,
+ PreferBlockAccess = false,
+ Layout = TensorEvaluator<LhsXprType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
+ };
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_op(op), m_device(device), m_result(NULL)
+ {
+ m_dimensions = op.func().dimensions(op.lhsExpression(), op.rhsExpression());
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+ if (data) {
+ evalTo(data);
+ return false;
+ } else {
+ m_result = static_cast<EvaluatorPointerType>(m_device.get( (CoeffReturnType*)
+ m_device.allocate_temp(dimensions().TotalSize() * sizeof(CoeffReturnType))));
+ evalTo(m_result);
+ return true;
+ }
+ }
+
+ EIGEN_STRONG_INLINE void cleanup() {
+ if (m_result != NULL) {
+ m_device.deallocate_temp(m_result);
+ m_result = NULL;
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+ return m_result[index];
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
+ return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ // TODO(rmlarsen): Extend CustomOp API to return its cost estimate.
+ return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
+ }
+
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
+
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_result.bind(cgh);
+ }
+#endif
+
+ protected:
+ void evalTo(EvaluatorPointerType data) {
+ TensorMap<Tensor<CoeffReturnType, NumDims, Layout> > result(m_device.get(data), m_dimensions);
+ m_op.func().eval(m_op.lhsExpression(), m_op.rhsExpression(), result, m_device);
+ }
+
+ Dimensions m_dimensions;
+ const XprType m_op;
+ const Device EIGEN_DEVICE_REF m_device;
+ EvaluatorPointerType m_result;
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorDevice.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorDevice.h
new file mode 100644
index 0000000..96fa46c
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorDevice.h
@@ -0,0 +1,137 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H
+#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H
+
+namespace Eigen {
+
+/** \class TensorDevice
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Pseudo expression providing an operator = that will evaluate its argument
+ * on the specified computing 'device' (GPU, thread pool, ...)
+ *
+ * Example:
+ * C.device(EIGEN_GPU) = A + B;
+ *
+ * Todo: operator *= and /=.
+ */
+
+template <typename ExpressionType, typename DeviceType> class TensorDevice {
+ public:
+ TensorDevice(const DeviceType& device, ExpressionType& expression) : m_device(device), m_expression(expression) {}
+
+ EIGEN_DEFAULT_COPY_CONSTRUCTOR(TensorDevice)
+
+ template<typename OtherDerived>
+ EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) {
+ typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
+ Assign assign(m_expression, other);
+ internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device);
+ return *this;
+ }
+
+ template<typename OtherDerived>
+ EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) {
+ typedef typename OtherDerived::Scalar Scalar;
+ typedef TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ExpressionType, const OtherDerived> Sum;
+ Sum sum(m_expression, other);
+ typedef TensorAssignOp<ExpressionType, const Sum> Assign;
+ Assign assign(m_expression, sum);
+ internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device);
+ return *this;
+ }
+
+ template<typename OtherDerived>
+ EIGEN_STRONG_INLINE TensorDevice& operator-=(const OtherDerived& other) {
+ typedef typename OtherDerived::Scalar Scalar;
+ typedef TensorCwiseBinaryOp<internal::scalar_difference_op<Scalar>, const ExpressionType, const OtherDerived> Difference;
+ Difference difference(m_expression, other);
+ typedef TensorAssignOp<ExpressionType, const Difference> Assign;
+ Assign assign(m_expression, difference);
+ internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device);
+ return *this;
+ }
+
+ protected:
+ const DeviceType& m_device;
+ ExpressionType& m_expression;
+};
+
+/** \class TensorAsyncDevice
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Pseudo expression providing an operator = that will evaluate its
+ * argument asynchronously on the specified device. Currently only
+ * ThreadPoolDevice implements proper asynchronous execution, while the default
+ * and GPU devices just run the expression synchronously and call m_done() on
+ * completion..
+ *
+ * Example:
+ * auto done = []() { ... expression evaluation done ... };
+ * C.device(thread_pool_device, std::move(done)) = A + B;
+ */
+
+template <typename ExpressionType, typename DeviceType, typename DoneCallback>
+class TensorAsyncDevice {
+ public:
+ TensorAsyncDevice(const DeviceType& device, ExpressionType& expression,
+ DoneCallback done)
+ : m_device(device), m_expression(expression), m_done(std::move(done)) {}
+
+ template <typename OtherDerived>
+ EIGEN_STRONG_INLINE TensorAsyncDevice& operator=(const OtherDerived& other) {
+ typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
+ typedef internal::TensorExecutor<const Assign, DeviceType> Executor;
+
+ Assign assign(m_expression, other);
+ Executor::run(assign, m_device);
+ m_done();
+
+ return *this;
+ }
+
+ protected:
+ const DeviceType& m_device;
+ ExpressionType& m_expression;
+ DoneCallback m_done;
+};
+
+
+#ifdef EIGEN_USE_THREADS
+template <typename ExpressionType, typename DoneCallback>
+class TensorAsyncDevice<ExpressionType, ThreadPoolDevice, DoneCallback> {
+ public:
+ TensorAsyncDevice(const ThreadPoolDevice& device, ExpressionType& expression,
+ DoneCallback done)
+ : m_device(device), m_expression(expression), m_done(std::move(done)) {}
+
+ template <typename OtherDerived>
+ EIGEN_STRONG_INLINE TensorAsyncDevice& operator=(const OtherDerived& other) {
+ typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
+ typedef internal::TensorAsyncExecutor<const Assign, ThreadPoolDevice, DoneCallback> Executor;
+
+ // WARNING: After assignment 'm_done' callback will be in undefined state.
+ Assign assign(m_expression, other);
+ Executor::runAsync(assign, m_device, std::move(m_done));
+
+ return *this;
+ }
+
+ protected:
+ const ThreadPoolDevice& m_device;
+ ExpressionType& m_expression;
+ DoneCallback m_done;
+};
+#endif
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceCuda.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceCuda.h
new file mode 100644
index 0000000..f779239
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -0,0 +1,6 @@
+
+#if defined(__clang__) || defined(__GNUC__)
+#warning "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorDeviceGpu.h file"
+#endif
+
+#include "TensorDeviceGpu.h"
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceDefault.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceDefault.h
new file mode 100644
index 0000000..46b9d3a
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceDefault.h
@@ -0,0 +1,104 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H
+#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H
+
+
+namespace Eigen {
+
+// Default device for the machine (typically a single cpu core)
+struct DefaultDevice {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
+ return internal::aligned_malloc(num_bytes);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
+ internal::aligned_free(buffer);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const {
+ return allocate(num_bytes);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const {
+ deallocate(buffer);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+ ::memcpy(dst, src, n);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
+ memcpy(dst, src, n);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
+ memcpy(dst, src, n);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
+ ::memset(buffer, c, n);
+ }
+ template<typename Type>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const {
+ return data;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
+#if !defined(EIGEN_GPU_COMPILE_PHASE)
+ // Running on the host CPU
+ return 1;
+#elif defined(EIGEN_HIP_DEVICE_COMPILE)
+ // Running on a HIP device
+ return 64;
+#else
+ // Running on a CUDA device
+ return 32;
+#endif
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
+#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY)
+ // Running on the host CPU
+ return l1CacheSize();
+#elif defined(EIGEN_HIP_DEVICE_COMPILE)
+ // Running on a HIP device
+ return 48*1024; // FIXME : update this number for HIP
+#else
+ // Running on a CUDA device, return the amount of shared memory available.
+ return 48*1024;
+#endif
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY)
+ // Running single threaded on the host CPU
+ return l3CacheSize();
+#elif defined(EIGEN_HIP_DEVICE_COMPILE)
+ // Running on a HIP device
+ return firstLevelCacheSize(); // FIXME : update this number for HIP
+#else
+ // Running on a CUDA device
+ return firstLevelCacheSize();
+#endif
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
+#if !defined(EIGEN_GPU_COMPILE_PHASE)
+ // Running single threaded on the host CPU
+ // Should return an enum that encodes the ISA supported by the CPU
+ return 1;
+#elif defined(EIGEN_HIP_DEVICE_COMPILE)
+ // Running on a HIP device
+ // return 1 as major for HIP
+ return 1;
+#else
+ // Running on a CUDA device
+ return EIGEN_CUDA_ARCH / 100;
+#endif
+ }
+};
+
+} // namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceGpu.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceGpu.h
new file mode 100644
index 0000000..ec2e3cb
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceGpu.h
@@ -0,0 +1,389 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H)
+#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H
+
+// This header file container defines fo gpu* macros which will resolve to
+// their equivalent hip* or cuda* versions depending on the compiler in use
+// A separate header (included at the end of this file) will undefine all
+#include "TensorGpuHipCudaDefines.h"
+
+namespace Eigen {
+
+static const int kGpuScratchSize = 1024;
+
+// This defines an interface that GPUDevice can take to use
+// HIP / CUDA streams underneath.
+class StreamInterface {
+ public:
+ virtual ~StreamInterface() {}
+
+ virtual const gpuStream_t& stream() const = 0;
+ virtual const gpuDeviceProp_t& deviceProperties() const = 0;
+
+ // Allocate memory on the actual device where the computation will run
+ virtual void* allocate(size_t num_bytes) const = 0;
+ virtual void deallocate(void* buffer) const = 0;
+
+ // Return a scratchpad buffer of size 1k
+ virtual void* scratchpad() const = 0;
+
+ // Return a semaphore. The semaphore is initially initialized to 0, and
+ // each kernel using it is responsible for resetting to 0 upon completion
+ // to maintain the invariant that the semaphore is always equal to 0 upon
+ // each kernel start.
+ virtual unsigned int* semaphore() const = 0;
+};
+
+class GpuDeviceProperties {
+ public:
+ GpuDeviceProperties() :
+ initialized_(false), first_(true), device_properties_(nullptr) {}
+
+ ~GpuDeviceProperties() {
+ if (device_properties_) {
+ delete[] device_properties_;
+ }
+ }
+
+ EIGEN_STRONG_INLINE const gpuDeviceProp_t& get(int device) const {
+ return device_properties_[device];
+ }
+
+ EIGEN_STRONG_INLINE bool isInitialized() const {
+ return initialized_;
+ }
+
+ void initialize() {
+ if (!initialized_) {
+ // Attempts to ensure proper behavior in the case of multiple threads
+ // calling this function simultaneously. This would be trivial to
+ // implement if we could use std::mutex, but unfortunately mutex don't
+ // compile with nvcc, so we resort to atomics and thread fences instead.
+ // Note that if the caller uses a compiler that doesn't support c++11 we
+ // can't ensure that the initialization is thread safe.
+ if (first_.exchange(false)) {
+ // We're the first thread to reach this point.
+ int num_devices;
+ gpuError_t status = gpuGetDeviceCount(&num_devices);
+ if (status != gpuSuccess) {
+ std::cerr << "Failed to get the number of GPU devices: "
+ << gpuGetErrorString(status)
+ << std::endl;
+ gpu_assert(status == gpuSuccess);
+ }
+ device_properties_ = new gpuDeviceProp_t[num_devices];
+ for (int i = 0; i < num_devices; ++i) {
+ status = gpuGetDeviceProperties(&device_properties_[i], i);
+ if (status != gpuSuccess) {
+ std::cerr << "Failed to initialize GPU device #"
+ << i
+ << ": "
+ << gpuGetErrorString(status)
+ << std::endl;
+ gpu_assert(status == gpuSuccess);
+ }
+ }
+
+ std::atomic_thread_fence(std::memory_order_release);
+ initialized_ = true;
+ } else {
+ // Wait for the other thread to inititialize the properties.
+ while (!initialized_) {
+ std::atomic_thread_fence(std::memory_order_acquire);
+ std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+ }
+ }
+ }
+ }
+
+ private:
+ volatile bool initialized_;
+ std::atomic<bool> first_;
+ gpuDeviceProp_t* device_properties_;
+};
+
+EIGEN_ALWAYS_INLINE const GpuDeviceProperties& GetGpuDeviceProperties() {
+ static GpuDeviceProperties* deviceProperties = new GpuDeviceProperties();
+ if (!deviceProperties->isInitialized()) {
+ deviceProperties->initialize();
+ }
+ return *deviceProperties;
+}
+
+EIGEN_ALWAYS_INLINE const gpuDeviceProp_t& GetGpuDeviceProperties(int device) {
+ return GetGpuDeviceProperties().get(device);
+}
+
+static const gpuStream_t default_stream = gpuStreamDefault;
+
+class GpuStreamDevice : public StreamInterface {
+ public:
+ // Use the default stream on the current device
+ GpuStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
+ gpuGetDevice(&device_);
+ }
+ // Use the default stream on the specified device
+ GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {}
+ // Use the specified stream. Note that it's the
+ // caller responsibility to ensure that the stream can run on
+ // the specified device. If no device is specified the code
+ // assumes that the stream is associated to the current gpu device.
+ GpuStreamDevice(const gpuStream_t* stream, int device = -1)
+ : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
+ if (device < 0) {
+ gpuGetDevice(&device_);
+ } else {
+ int num_devices;
+ gpuError_t err = gpuGetDeviceCount(&num_devices);
+ EIGEN_UNUSED_VARIABLE(err)
+ gpu_assert(err == gpuSuccess);
+ gpu_assert(device < num_devices);
+ device_ = device;
+ }
+ }
+
+ virtual ~GpuStreamDevice() {
+ if (scratch_) {
+ deallocate(scratch_);
+ }
+ }
+
+ const gpuStream_t& stream() const { return *stream_; }
+ const gpuDeviceProp_t& deviceProperties() const {
+ return GetGpuDeviceProperties(device_);
+ }
+ virtual void* allocate(size_t num_bytes) const {
+ gpuError_t err = gpuSetDevice(device_);
+ EIGEN_UNUSED_VARIABLE(err)
+ gpu_assert(err == gpuSuccess);
+ void* result;
+ err = gpuMalloc(&result, num_bytes);
+ gpu_assert(err == gpuSuccess);
+ gpu_assert(result != NULL);
+ return result;
+ }
+ virtual void deallocate(void* buffer) const {
+ gpuError_t err = gpuSetDevice(device_);
+ EIGEN_UNUSED_VARIABLE(err)
+ gpu_assert(err == gpuSuccess);
+ gpu_assert(buffer != NULL);
+ err = gpuFree(buffer);
+ gpu_assert(err == gpuSuccess);
+ }
+
+ virtual void* scratchpad() const {
+ if (scratch_ == NULL) {
+ scratch_ = allocate(kGpuScratchSize + sizeof(unsigned int));
+ }
+ return scratch_;
+ }
+
+ virtual unsigned int* semaphore() const {
+ if (semaphore_ == NULL) {
+ char* scratch = static_cast<char*>(scratchpad()) + kGpuScratchSize;
+ semaphore_ = reinterpret_cast<unsigned int*>(scratch);
+ gpuError_t err = gpuMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
+ EIGEN_UNUSED_VARIABLE(err)
+ gpu_assert(err == gpuSuccess);
+ }
+ return semaphore_;
+ }
+
+ private:
+ const gpuStream_t* stream_;
+ int device_;
+ mutable void* scratch_;
+ mutable unsigned int* semaphore_;
+};
+
+struct GpuDevice {
+ // The StreamInterface is not owned: the caller is
+ // responsible for its initialization and eventual destruction.
+ explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
+ eigen_assert(stream);
+ }
+ explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
+ eigen_assert(stream);
+ }
+ // TODO(bsteiner): This is an internal API, we should not expose it.
+ EIGEN_STRONG_INLINE const gpuStream_t& stream() const {
+ return stream_->stream();
+ }
+
+ EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
+ return stream_->allocate(num_bytes);
+ }
+
+ EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
+ stream_->deallocate(buffer);
+ }
+
+ EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const {
+ return stream_->allocate(num_bytes);
+ }
+
+ EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const {
+ stream_->deallocate(buffer);
+ }
+
+ template<typename Type>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const {
+ return data;
+ }
+
+ EIGEN_STRONG_INLINE void* scratchpad() const {
+ return stream_->scratchpad();
+ }
+
+ EIGEN_STRONG_INLINE unsigned int* semaphore() const {
+ return stream_->semaphore();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+#ifndef EIGEN_GPU_COMPILE_PHASE
+ gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToDevice,
+ stream_->stream());
+ EIGEN_UNUSED_VARIABLE(err)
+ gpu_assert(err == gpuSuccess);
+#else
+ EIGEN_UNUSED_VARIABLE(dst);
+ EIGEN_UNUSED_VARIABLE(src);
+ EIGEN_UNUSED_VARIABLE(n);
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+ }
+
+ EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
+ gpuError_t err =
+ gpuMemcpyAsync(dst, src, n, gpuMemcpyHostToDevice, stream_->stream());
+ EIGEN_UNUSED_VARIABLE(err)
+ gpu_assert(err == gpuSuccess);
+ }
+
+ EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
+ gpuError_t err =
+ gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToHost, stream_->stream());
+ EIGEN_UNUSED_VARIABLE(err)
+ gpu_assert(err == gpuSuccess);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
+#ifndef EIGEN_GPU_COMPILE_PHASE
+ gpuError_t err = gpuMemsetAsync(buffer, c, n, stream_->stream());
+ EIGEN_UNUSED_VARIABLE(err)
+ gpu_assert(err == gpuSuccess);
+#else
+ eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+ }
+
+ EIGEN_STRONG_INLINE size_t numThreads() const {
+ // FIXME
+ return 32;
+ }
+
+ EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
+ // FIXME
+ return 48*1024;
+ }
+
+ EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+ // We won't try to take advantage of the l2 cache for the time being, and
+ // there is no l3 cache on hip/cuda devices.
+ return firstLevelCacheSize();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
+#ifndef EIGEN_GPU_COMPILE_PHASE
+ gpuError_t err = gpuStreamSynchronize(stream_->stream());
+ if (err != gpuSuccess) {
+ std::cerr << "Error detected in GPU stream: "
+ << gpuGetErrorString(err)
+ << std::endl;
+ gpu_assert(err == gpuSuccess);
+ }
+#else
+ gpu_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+ }
+
+ EIGEN_STRONG_INLINE int getNumGpuMultiProcessors() const {
+ return stream_->deviceProperties().multiProcessorCount;
+ }
+ EIGEN_STRONG_INLINE int maxGpuThreadsPerBlock() const {
+ return stream_->deviceProperties().maxThreadsPerBlock;
+ }
+ EIGEN_STRONG_INLINE int maxGpuThreadsPerMultiProcessor() const {
+ return stream_->deviceProperties().maxThreadsPerMultiProcessor;
+ }
+ EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
+ return stream_->deviceProperties().sharedMemPerBlock;
+ }
+ EIGEN_STRONG_INLINE int majorDeviceVersion() const {
+ return stream_->deviceProperties().major;
+ }
+ EIGEN_STRONG_INLINE int minorDeviceVersion() const {
+ return stream_->deviceProperties().minor;
+ }
+
+ EIGEN_STRONG_INLINE int maxBlocks() const {
+ return max_blocks_;
+ }
+
+ // This function checks if the GPU runtime recorded an error for the
+ // underlying stream device.
+ inline bool ok() const {
+#ifdef EIGEN_GPUCC
+ gpuError_t error = gpuStreamQuery(stream_->stream());
+ return (error == gpuSuccess) || (error == gpuErrorNotReady);
+#else
+ return false;
+#endif
+ }
+
+ private:
+ const StreamInterface* stream_;
+ int max_blocks_;
+};
+
+#if defined(EIGEN_HIPCC)
+
+#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
+ hipLaunchKernelGGL(kernel, dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), __VA_ARGS__); \
+ gpu_assert(hipGetLastError() == hipSuccess);
+
+#else
+
+#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
+ (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
+ gpu_assert(cudaGetLastError() == cudaSuccess);
+
+#endif
+
+// FIXME: Should be device and kernel specific.
+#ifdef EIGEN_GPUCC
+static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig config) {
+#ifndef EIGEN_GPU_COMPILE_PHASE
+ gpuError_t status = gpuDeviceSetSharedMemConfig(config);
+ EIGEN_UNUSED_VARIABLE(status)
+ gpu_assert(status == gpuSuccess);
+#else
+ EIGEN_UNUSED_VARIABLE(config)
+#endif
+}
+#endif
+
+} // end namespace Eigen
+
+// undefine all the gpu* macros we defined at the beginning of the file
+#include "TensorGpuHipCudaUndefines.h"
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceSycl.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceSycl.h
new file mode 100644
index 0000000..df591c2
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceSycl.h
@@ -0,0 +1,1048 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H)
+#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
+#include <unordered_set>
+
+namespace Eigen {
+
+namespace TensorSycl {
+namespace internal {
+
+/// Cache all the device information needed
+struct SyclDeviceInfo {
+ SyclDeviceInfo(cl::sycl::queue queue)
+ : local_mem_type(
+ queue.get_device()
+ .template get_info<cl::sycl::info::device::local_mem_type>()),
+ max_work_item_sizes(
+ queue.get_device()
+ .template get_info<
+ cl::sycl::info::device::max_work_item_sizes>()),
+ max_mem_alloc_size(
+ queue.get_device()
+ .template get_info<
+ cl::sycl::info::device::max_mem_alloc_size>()),
+ max_compute_units(queue.get_device()
+ .template get_info<
+ cl::sycl::info::device::max_compute_units>()),
+ max_work_group_size(
+ queue.get_device()
+ .template get_info<
+ cl::sycl::info::device::max_work_group_size>()),
+ local_mem_size(
+ queue.get_device()
+ .template get_info<cl::sycl::info::device::local_mem_size>()),
+ platform_name(queue.get_device()
+ .get_platform()
+ .template get_info<cl::sycl::info::platform::name>()),
+ device_name(queue.get_device()
+ .template get_info<cl::sycl::info::device::name>()),
+ device_vendor(
+ queue.get_device()
+ .template get_info<cl::sycl::info::device::vendor>()) {}
+
+ cl::sycl::info::local_mem_type local_mem_type;
+ cl::sycl::id<3> max_work_item_sizes;
+ unsigned long max_mem_alloc_size;
+ unsigned long max_compute_units;
+ unsigned long max_work_group_size;
+ size_t local_mem_size;
+ std::string platform_name;
+ std::string device_name;
+ std::string device_vendor;
+};
+
+} // end namespace internal
+} // end namespace TensorSycl
+
+typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t;
+// All devices (even AMD CPU with intel OpenCL runtime) that support OpenCL and
+// can consume SPIR or SPIRV can use the Eigen SYCL backend and consequently
+// TensorFlow via the Eigen SYCL Backend.
+EIGEN_STRONG_INLINE auto get_sycl_supported_devices()
+ -> decltype(cl::sycl::device::get_devices()) {
+#ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR
+ return {cl::sycl::device(cl::sycl::default_selector())};
+#else
+ std::vector<cl::sycl::device> supported_devices;
+ auto platform_list = cl::sycl::platform::get_platforms();
+ for (const auto &platform : platform_list) {
+ auto device_list = platform.get_devices();
+ auto platform_name =
+ platform.template get_info<cl::sycl::info::platform::name>();
+ std::transform(platform_name.begin(), platform_name.end(),
+ platform_name.begin(), ::tolower);
+ for (const auto &device : device_list) {
+ auto vendor = device.template get_info<cl::sycl::info::device::vendor>();
+ std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower);
+ bool unsupported_condition =
+ (device.is_cpu() && platform_name.find("amd") != std::string::npos &&
+ vendor.find("apu") == std::string::npos) ||
+ (platform_name.find("experimental") != std::string::npos) ||
+ device.is_host();
+ if (!unsupported_condition) {
+ supported_devices.push_back(device);
+ }
+ }
+ }
+ return supported_devices;
+#endif
+}
+
+class QueueInterface {
+ public:
+ /// Creating device by using cl::sycl::selector or cl::sycl::device.
+ template <typename DeviceOrSelector>
+ explicit QueueInterface(
+ const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler,
+ unsigned num_threads = std::thread::hardware_concurrency())
+ : m_queue(dev_or_sel, handler),
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+ m_prog(m_queue.get_context(), get_sycl_supported_devices()),
+#endif
+ m_thread_pool(num_threads),
+ m_device_info(m_queue) {
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+ m_prog.build_with_kernel_type<DeviceOrSelector>();
+ auto f = [&](cl::sycl::handler &cgh) {
+ cgh.single_task<DeviceOrSelector>(m_prog.get_kernel<DeviceOrSelector>(),
+ [=]() {})
+ };
+ EIGEN_SYCL_TRY_CATCH(m_queue.submit(f));
+#endif
+ }
+
+ template <typename DeviceOrSelector>
+ explicit QueueInterface(
+ const DeviceOrSelector &dev_or_sel,
+ unsigned num_threads = std::thread::hardware_concurrency())
+ : QueueInterface(dev_or_sel,
+ [this](cl::sycl::exception_list l) {
+ this->exception_caught_ = this->sycl_async_handler(l);
+ },
+ num_threads) {}
+
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+ EIGEN_STRONG_INLINE cl::sycl::program &program() const { return m_prog; }
+#endif
+
+ /// Attach an existing buffer to the pointer map, Eigen will not reuse it
+ EIGEN_STRONG_INLINE void *attach_buffer(
+ cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
+ std::lock_guard<std::mutex> lock(pmapper_mutex_);
+ return static_cast<void *>(pMapper.add_pointer(buf));
+ }
+
+ /// Detach previously attached buffer
+ EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
+ std::lock_guard<std::mutex> lock(pmapper_mutex_);
+ TensorSycl::internal::SYCLfree<false>(p, pMapper);
+ }
+
+ /// Allocating device pointer. This pointer is actually an 8 bytes host
+ /// pointer used as key to access the sycl device buffer. The reason is that
+ /// we cannot use device buffer as a pointer as a m_data in Eigen leafNode
+ /// expressions. So we create a key pointer to be used in Eigen expression
+ /// construction. When we convert the Eigen construction into the sycl
+ /// construction we use this pointer as a key in our buffer_map and we make
+ /// sure that we dedicate only one buffer only for this pointer. The device
+ /// pointer would be deleted by calling deallocate function.
+ EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
+#if EIGEN_MAX_ALIGN_BYTES > 0
+ size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
+ if (align > 0) {
+ num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
+ }
+#endif
+ std::lock_guard<std::mutex> lock(pmapper_mutex_);
+ return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
+ }
+
+ EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
+#if EIGEN_MAX_ALIGN_BYTES > 0
+ size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
+ if (align > 0) {
+ num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
+ }
+#endif
+ std::lock_guard<std::mutex> lock(pmapper_mutex_);
+#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
+ if (scratch_buffers.empty()) {
+ return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
+ ;
+ } else {
+ for (auto it = scratch_buffers.begin(); it != scratch_buffers.end();) {
+ auto buff = pMapper.get_buffer(*it);
+ if (buff.get_size() >= num_bytes) {
+ auto ptr = *it;
+ scratch_buffers.erase(it);
+ return ptr;
+ } else {
+ ++it;
+ }
+ }
+ return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
+ }
+#else
+ return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
+#endif
+ }
+ template <typename data_t>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
+ cl::sycl::access::mode::read_write, data_t>
+ get(data_t *data) const {
+ return get_range_accessor<cl::sycl::access::mode::read_write, data_t>(data);
+ }
+ template <typename data_t>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(
+ TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
+ data_t>
+ data) const {
+ return static_cast<data_t *>(data.get_virtual_pointer());
+ }
+
+ EIGEN_STRONG_INLINE void deallocate_temp(void *p) const {
+ std::lock_guard<std::mutex> lock(pmapper_mutex_);
+#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
+ scratch_buffers.insert(p);
+#else
+ TensorSycl::internal::SYCLfree(p, pMapper);
+#endif
+ }
+ template <cl::sycl::access::mode AcMd, typename T>
+ EIGEN_STRONG_INLINE void deallocate_temp(
+ const TensorSycl::internal::RangeAccess<AcMd, T> &p) const {
+ deallocate_temp(p.get_virtual_pointer());
+ }
+
+ /// This is used to deallocate the device pointer. p is used as a key inside
+ /// the map to find the device buffer and delete it.
+ EIGEN_STRONG_INLINE void deallocate(void *p) const {
+ std::lock_guard<std::mutex> lock(pmapper_mutex_);
+ TensorSycl::internal::SYCLfree(p, pMapper);
+ }
+
+ EIGEN_STRONG_INLINE void deallocate_all() const {
+ std::lock_guard<std::mutex> lock(pmapper_mutex_);
+ TensorSycl::internal::SYCLfreeAll(pMapper);
+#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
+ scratch_buffers.clear();
+#endif
+ }
+
+ /// The memcpyHostToDevice is used to copy the data from host to device
+ /// The destination pointer could be deleted before the copy happend which is
+ /// why a callback function is needed. By default if none is provided, the
+ /// function is blocking.
+ EIGEN_STRONG_INLINE void memcpyHostToDevice(
+ void *dst, const void *src, size_t n,
+ std::function<void()> callback) const {
+ static const auto write_mode = cl::sycl::access::mode::discard_write;
+ static const auto global_access = cl::sycl::access::target::global_buffer;
+ typedef cl::sycl::accessor<buffer_scalar_t, 1, write_mode, global_access>
+ write_accessor;
+ if (n == 0) {
+ if (callback) callback();
+ return;
+ }
+ n /= sizeof(buffer_scalar_t);
+ auto f = [&](cl::sycl::handler &cgh) {
+ write_accessor dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
+ buffer_scalar_t const *ptr = static_cast<buffer_scalar_t const *>(src);
+ auto non_deleter = [](buffer_scalar_t const *) {};
+ std::shared_ptr<const buffer_scalar_t> s_ptr(ptr, non_deleter);
+ cgh.copy(s_ptr, dst_acc);
+ };
+ cl::sycl::event e;
+ EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
+ synchronize_and_callback(e, callback);
+ }
+
+ /// The memcpyDeviceToHost is used to copy the data from device to host.
+ /// The source pointer could be deleted before the copy happend which is
+ /// why a callback function is needed. By default if none is provided, the
+ /// function is blocking.
+ EIGEN_STRONG_INLINE void memcpyDeviceToHost(
+ void *dst, const void *src, size_t n,
+ std::function<void()> callback) const {
+ static const auto read_mode = cl::sycl::access::mode::read;
+ static const auto global_access = cl::sycl::access::target::global_buffer;
+ typedef cl::sycl::accessor<buffer_scalar_t, 1, read_mode, global_access>
+ read_accessor;
+ if (n == 0) {
+ if (callback) callback();
+ return;
+ }
+ n /= sizeof(buffer_scalar_t);
+ auto f = [&](cl::sycl::handler &cgh) {
+ read_accessor src_acc = get_range_accessor<read_mode>(cgh, src, n);
+ buffer_scalar_t *ptr = static_cast<buffer_scalar_t *>(dst);
+ auto non_deleter = [](buffer_scalar_t *) {};
+ std::shared_ptr<buffer_scalar_t> s_ptr(ptr, non_deleter);
+ cgh.copy(src_acc, s_ptr);
+ };
+ cl::sycl::event e;
+ EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
+ synchronize_and_callback(e, callback);
+ }
+
+ /// The memcpy function.
+ /// No callback is required here as both arguments are on the device
+ /// and SYCL can handle the dependency.
+ EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const {
+ static const auto read_mode = cl::sycl::access::mode::read;
+ static const auto write_mode = cl::sycl::access::mode::discard_write;
+ if (n == 0) {
+ return;
+ }
+ n /= sizeof(buffer_scalar_t);
+ auto f = [&](cl::sycl::handler &cgh) {
+ auto src_acc = get_range_accessor<read_mode>(cgh, src, n);
+ auto dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
+ cgh.copy(src_acc, dst_acc);
+ };
+ cl::sycl::event e;
+ EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
+ async_synchronize(e);
+ }
+
+ /// the memset function.
+ /// No callback is required here as both arguments are on the device
+ /// and SYCL can handle the dependency.
+ EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
+ static const auto write_mode = cl::sycl::access::mode::discard_write;
+ if (n == 0) {
+ return;
+ }
+ n /= sizeof(buffer_scalar_t);
+ auto f = [&](cl::sycl::handler &cgh) {
+ auto dst_acc = get_range_accessor<write_mode>(cgh, data, n);
+ // The cast to uint8_t is here to match the behaviour of the standard
+ // memset. The cast to buffer_scalar_t is needed to match the type of the
+ // accessor (in case buffer_scalar_t is not uint8_t)
+ cgh.fill(dst_acc, static_cast<buffer_scalar_t>(static_cast<uint8_t>(c)));
+ };
+ cl::sycl::event e;
+ EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
+ async_synchronize(e);
+ }
+
+ /// Get a range accessor to the virtual pointer's device memory. This range
+ /// accessor will allow access to the memory from the pointer to the end of
+ /// the buffer.
+ ///
+ /// NOTE: Inside a kernel the range accessor will always be indexed from the
+ /// start of the buffer, so the offset in the accessor is only used by
+ /// methods like handler::copy and will not be available inside a kernel.
+ template <cl::sycl::access::mode AcMd, typename T>
+ EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
+ get_range_accessor(const void *ptr) const {
+ static const auto global_access = cl::sycl::access::target::global_buffer;
+ static const auto is_place_holder = cl::sycl::access::placeholder::true_t;
+ typedef TensorSycl::internal::RangeAccess<AcMd, T> ret_type;
+ typedef const TensorSycl::internal::buffer_data_type_t *internal_ptr_t;
+
+ std::lock_guard<std::mutex> lock(pmapper_mutex_);
+
+ auto original_buffer = pMapper.get_buffer(ptr);
+ const ptrdiff_t offset = pMapper.get_offset(ptr);
+ const ptrdiff_t typed_offset = offset / sizeof(T);
+ eigen_assert(typed_offset >= 0);
+ const auto typed_size = original_buffer.get_size() / sizeof(T);
+ auto buffer = original_buffer.template reinterpret<
+ typename Eigen::internal::remove_const<T>::type>(
+ cl::sycl::range<1>(typed_size));
+ const ptrdiff_t size = buffer.get_count() - typed_offset;
+ eigen_assert(size >= 0);
+ typedef cl::sycl::accessor<typename Eigen::internal::remove_const<T>::type,
+ 1, AcMd, global_access, is_place_holder>
+ placeholder_accessor_t;
+ const auto start_ptr = static_cast<internal_ptr_t>(ptr) - offset;
+ return ret_type(placeholder_accessor_t(buffer, cl::sycl::range<1>(size),
+ cl::sycl::id<1>(typed_offset)),
+ static_cast<size_t>(typed_offset),
+ reinterpret_cast<std::intptr_t>(start_ptr));
+ }
+
+ /// Get a range accessor to the virtual pointer's device memory with a
+ /// specified size.
+ template <cl::sycl::access::mode AcMd, typename Index>
+ EIGEN_STRONG_INLINE cl::sycl::accessor<
+ buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
+ get_range_accessor(cl::sycl::handler &cgh, const void *ptr,
+ const Index n_bytes) const {
+ static const auto global_access = cl::sycl::access::target::global_buffer;
+ eigen_assert(n_bytes >= 0);
+ std::lock_guard<std::mutex> lock(pmapper_mutex_);
+ auto buffer = pMapper.get_buffer(ptr);
+ const ptrdiff_t offset = pMapper.get_offset(ptr);
+ eigen_assert(offset >= 0);
+ eigen_assert(offset + n_bytes <= buffer.get_size());
+ return buffer.template get_access<AcMd, global_access>(
+ cgh, cl::sycl::range<1>(n_bytes), cl::sycl::id<1>(offset));
+ }
+
+ /// Creation of sycl accessor for a buffer. This function first tries to find
+ /// the buffer in the buffer_map. If found it gets the accessor from it, if
+ /// not, the function then adds an entry by creating a sycl buffer for that
+ /// particular pointer.
+ template <cl::sycl::access::mode AcMd>
+ EIGEN_STRONG_INLINE cl::sycl::accessor<
+ buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
+ get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
+ std::lock_guard<std::mutex> lock(pmapper_mutex_);
+ return pMapper.get_buffer(ptr)
+ .template get_access<AcMd, cl::sycl::access::target::global_buffer>(
+ cgh);
+ }
+
+ EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
+ const void *ptr) const {
+ std::lock_guard<std::mutex> lock(pmapper_mutex_);
+ return pMapper.get_buffer(ptr);
+ }
+
+ EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
+ std::lock_guard<std::mutex> lock(pmapper_mutex_);
+ return pMapper.get_offset(ptr);
+ }
+
+ template <typename OutScalar, typename sycl_kernel, typename Lhs,
+ typename Rhs, typename OutPtr, typename Range, typename Index,
+ typename... T>
+ EIGEN_ALWAYS_INLINE void binary_kernel_launcher(const Lhs &lhs,
+ const Rhs &rhs, OutPtr outptr,
+ Range thread_range,
+ Index scratchSize,
+ T... var) const {
+ auto kernel_functor = [=](cl::sycl::handler &cgh) {
+ // binding the placeholder accessors to a commandgroup handler
+ lhs.bind(cgh);
+ rhs.bind(cgh);
+ outptr.bind(cgh);
+ typedef cl::sycl::accessor<OutScalar, 1,
+ cl::sycl::access::mode::read_write,
+ cl::sycl::access::target::local>
+ LocalAccessor;
+
+ LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
+ cgh.parallel_for(
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+ program().template get_kernel<sycl_kernel>(),
+#endif
+ thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...));
+ };
+ cl::sycl::event e;
+ EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
+ async_synchronize(e);
+ }
+
+ template <typename OutScalar, typename sycl_kernel, typename InPtr,
+ typename OutPtr, typename Range, typename Index, typename... T>
+ EIGEN_ALWAYS_INLINE void unary_kernel_launcher(const InPtr &inptr,
+ OutPtr &outptr,
+ Range thread_range,
+ Index scratchSize,
+ T... var) const {
+ auto kernel_functor = [=](cl::sycl::handler &cgh) {
+ // binding the placeholder accessors to a commandgroup handler
+ inptr.bind(cgh);
+ outptr.bind(cgh);
+ typedef cl::sycl::accessor<OutScalar, 1,
+ cl::sycl::access::mode::read_write,
+ cl::sycl::access::target::local>
+ LocalAccessor;
+
+ LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
+ cgh.parallel_for(
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+ program().template get_kernel<sycl_kernel>(),
+#endif
+ thread_range, sycl_kernel(scratch, inptr, outptr, var...));
+ };
+ cl::sycl::event e;
+ EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
+ async_synchronize(e);
+ }
+
+ template <typename OutScalar, typename sycl_kernel, typename InPtr,
+ typename Range, typename Index, typename... T>
+ EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(const InPtr &inptr,
+ Range thread_range,
+ Index scratchSize,
+ T... var) const {
+ auto kernel_functor = [=](cl::sycl::handler &cgh) {
+ // binding the placeholder accessors to a commandgroup handler
+ inptr.bind(cgh);
+ typedef cl::sycl::accessor<OutScalar, 1,
+ cl::sycl::access::mode::read_write,
+ cl::sycl::access::target::local>
+ LocalAccessor;
+
+ LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
+ cgh.parallel_for(
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+ program().template get_kernel<sycl_kernel>(),
+#endif
+ thread_range, sycl_kernel(scratch, inptr, var...));
+ };
+ cl::sycl::event e;
+ EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
+ async_synchronize(e);
+ }
+
+
+ EIGEN_STRONG_INLINE void synchronize() const {
+#ifdef EIGEN_EXCEPTIONS
+ m_queue.wait_and_throw();
+#else
+ m_queue.wait();
+#endif
+ }
+
+
+ EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const {
+ set_latest_event(e);
+#ifndef EIGEN_SYCL_ASYNC_EXECUTION
+ synchronize();
+#endif
+ }
+
+ template <typename Index>
+ EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
+ Index &rng, Index &GRange) const {
+ tileSize = static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
+ tileSize = std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
+ EIGEN_SYCL_LOCAL_THREAD_DIM1),
+ static_cast<Index>(tileSize));
+ rng = n;
+ if (rng == 0) rng = static_cast<Index>(1);
+ GRange = rng;
+ if (tileSize > GRange)
+ tileSize = GRange;
+ else if (GRange > tileSize) {
+ Index xMode = static_cast<Index>(GRange % tileSize);
+ if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
+ }
+ }
+
+ /// This is used to prepare the number of threads and also the number of
+ /// threads per block for sycl kernels
+ template <typename Index>
+ EIGEN_STRONG_INLINE void parallel_for_setup(
+ const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
+ cl::sycl::range<2> &local_range) const {
+ std::array<Index, 2> input_range = input_dim;
+ Index max_workgroup_Size =
+ static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
+ max_workgroup_Size =
+ std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
+ EIGEN_SYCL_LOCAL_THREAD_DIM1),
+ static_cast<Index>(max_workgroup_Size));
+ Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
+ local_range[1] =
+ static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
+ input_range[1] = input_dim[1];
+ if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
+ global_range[1] = input_range[1];
+ if (local_range[1] > global_range[1])
+ local_range[1] = global_range[1];
+ else if (global_range[1] > local_range[1]) {
+ Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
+ if (xMode != 0)
+ global_range[1] += static_cast<Index>(local_range[1] - xMode);
+ }
+ local_range[0] = static_cast<Index>(max_workgroup_Size / local_range[1]);
+ input_range[0] = input_dim[0];
+ if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
+ global_range[0] = input_range[0];
+ if (local_range[0] > global_range[0])
+ local_range[0] = global_range[0];
+ else if (global_range[0] > local_range[0]) {
+ Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
+ if (xMode != 0)
+ global_range[0] += static_cast<Index>(local_range[0] - xMode);
+ }
+ }
+
+ /// This is used to prepare the number of threads and also the number of
+ /// threads per block for sycl kernels
+ template <typename Index>
+ EIGEN_STRONG_INLINE void parallel_for_setup(
+ const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
+ cl::sycl::range<3> &local_range) const {
+ std::array<Index, 3> input_range = input_dim;
+ Index max_workgroup_Size =
+ static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
+ max_workgroup_Size =
+ std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
+ EIGEN_SYCL_LOCAL_THREAD_DIM1),
+ static_cast<Index>(max_workgroup_Size));
+ Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
+ local_range[2] =
+ static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 3)));
+ input_range[2] = input_dim[2];
+ if (input_range[2] == 0) input_range[1] = static_cast<Index>(1);
+ global_range[2] = input_range[2];
+ if (local_range[2] > global_range[2])
+ local_range[2] = global_range[2];
+ else if (global_range[2] > local_range[2]) {
+ Index xMode = static_cast<Index>(global_range[2] % local_range[2]);
+ if (xMode != 0)
+ global_range[2] += static_cast<Index>(local_range[2] - xMode);
+ }
+ pow_of_2 = static_cast<Index>(
+ std::log2(static_cast<Index>(max_workgroup_Size / local_range[2])));
+ local_range[1] =
+ static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
+ input_range[1] = input_dim[1];
+ if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
+ global_range[1] = input_range[1];
+ if (local_range[1] > global_range[1])
+ local_range[1] = global_range[1];
+ else if (global_range[1] > local_range[1]) {
+ Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
+ if (xMode != 0)
+ global_range[1] += static_cast<Index>(local_range[1] - xMode);
+ }
+ local_range[0] = static_cast<Index>(max_workgroup_Size /
+ (local_range[1] * local_range[2]));
+ input_range[0] = input_dim[0];
+ if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
+ global_range[0] = input_range[0];
+ if (local_range[0] > global_range[0])
+ local_range[0] = global_range[0];
+ else if (global_range[0] > local_range[0]) {
+ Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
+ if (xMode != 0)
+ global_range[0] += static_cast<Index>(local_range[0] - xMode);
+ }
+ }
+
+ EIGEN_STRONG_INLINE bool has_local_memory() const {
+#if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
+ return false;
+#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
+ return true;
+#else
+ return m_device_info.local_mem_type ==
+ cl::sycl::info::local_mem_type::local;
+#endif
+ }
+
+ EIGEN_STRONG_INLINE unsigned long max_buffer_size() const {
+ return m_device_info.max_mem_alloc_size;
+ }
+
+ EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
+ return m_device_info.max_compute_units;
+ }
+
+ EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
+ return m_device_info.max_work_group_size;
+ }
+
+ EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
+ return m_device_info.max_work_item_sizes;
+ }
+
+ /// No need for sycl it should act the same as CPU version
+ EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; }
+
+ EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
+ // OpenCL doesnot have such concept
+ return 2;
+ }
+
+ EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
+ return m_device_info.local_mem_size;
+ }
+
+ // This function returns the nearest power of 2 Work-group size which is <=
+ // maximum device workgroup size.
+ EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
+ return getPowerOfTwo(m_device_info.max_work_group_size, false);
+ }
+
+ EIGEN_STRONG_INLINE std::string getPlatformName() const {
+ return m_device_info.platform_name;
+ }
+
+ EIGEN_STRONG_INLINE std::string getDeviceName() const {
+ return m_device_info.device_name;
+ }
+
+ EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
+ return m_device_info.device_vendor;
+ }
+
+ // This function returns the nearest power of 2
+ // if roundup is true returns result>=wgsize
+ // else it return result <= wgsize
+ EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t wGSize, bool roundUp) const {
+ if (roundUp) --wGSize;
+ wGSize |= (wGSize >> 1);
+ wGSize |= (wGSize >> 2);
+ wGSize |= (wGSize >> 4);
+ wGSize |= (wGSize >> 8);
+ wGSize |= (wGSize >> 16);
+#if EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64 || EIGEN_OS_WIN64
+ wGSize |= (wGSize >> 32);
+#endif
+ return ((!roundUp) ? (wGSize - (wGSize >> 1)) : ++wGSize);
+ }
+
+ EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { return m_queue; }
+
+ // This function checks if the runtime recorded an error for the
+ // underlying stream device.
+ EIGEN_STRONG_INLINE bool ok() const {
+ if (!exception_caught_) {
+ synchronize();
+ }
+ return !exception_caught_;
+ }
+
+ EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
+#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
+ std::lock_guard<std::mutex> lock(event_mutex_);
+ return latest_events_[std::this_thread::get_id()];
+#else
+ eigen_assert(false);
+ return cl::sycl::event();
+#endif
+ }
+
+ // destructor
+ ~QueueInterface() {
+ pMapper.clear();
+#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
+ scratch_buffers.clear();
+#endif
+ }
+
+ protected:
+ EIGEN_STRONG_INLINE void set_latest_event(cl::sycl::event e) const {
+#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
+ std::lock_guard<std::mutex> lock(event_mutex_);
+ latest_events_[std::this_thread::get_id()] = e;
+#else
+ EIGEN_UNUSED_VARIABLE(e);
+#endif
+ }
+
+ void synchronize_and_callback(cl::sycl::event e,
+ const std::function<void()> &callback) const {
+ set_latest_event(e);
+ if (callback) {
+ auto callback_ = [=]() {
+#ifdef EIGEN_EXCEPTIONS
+ cl::sycl::event(e).wait_and_throw();
+#else
+ cl::sycl::event(e).wait();
+#endif
+ callback();
+ };
+ m_thread_pool.Schedule(std::move(callback_));
+ } else {
+#ifdef EIGEN_EXCEPTIONS
+ m_queue.wait_and_throw();
+#else
+ m_queue.wait();
+#endif
+ }
+ }
+
+ bool sycl_async_handler(cl::sycl::exception_list exceptions) const {
+ bool exception_caught = false;
+ for (const auto &e : exceptions) {
+ if (e) {
+ exception_caught = true;
+ EIGEN_THROW_X(e);
+ }
+ }
+ return exception_caught;
+ }
+
+ /// class members:
+ bool exception_caught_ = false;
+
+ mutable std::mutex pmapper_mutex_;
+
+#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
+ mutable std::mutex event_mutex_;
+ mutable std::unordered_map<std::thread::id, cl::sycl::event> latest_events_;
+#endif
+
+ /// std::map is the container used to make sure that we create only one buffer
+ /// per pointer. The lifespan of the buffer now depends on the lifespan of
+ /// SyclDevice. If a non-read-only pointer is needed to be accessed on the
+ /// host we should manually deallocate it.
+ mutable TensorSycl::internal::PointerMapper pMapper;
+#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
+ mutable std::unordered_set<void *> scratch_buffers;
+#endif
+ /// sycl queue
+ mutable cl::sycl::queue m_queue;
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+ mutable cl::sycl::program m_prog;
+#endif
+
+ /// The thread pool is used to wait on events and call callbacks
+ /// asynchronously
+ mutable Eigen::ThreadPool m_thread_pool;
+
+ const TensorSycl::internal::SyclDeviceInfo m_device_info;
+};
+
+struct SyclDeviceBase {
+ /// QueueInterface is not owned. it is the caller's responsibility to destroy
+ /// it
+ const QueueInterface *m_queue_stream;
+ explicit SyclDeviceBase(const QueueInterface *queue_stream)
+ : m_queue_stream(queue_stream) {}
+ EIGEN_STRONG_INLINE const QueueInterface *queue_stream() const {
+ return m_queue_stream;
+ }
+};
+
+// Here is a sycl device struct which accept the sycl queue interface
+// as an input
+struct SyclDevice : public SyclDeviceBase {
+ explicit SyclDevice(const QueueInterface *queue_stream)
+ : SyclDeviceBase(queue_stream) {}
+
+ // this is the accessor used to construct the evaluator
+ template <cl::sycl::access::mode AcMd, typename T>
+ EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
+ get_range_accessor(const void *ptr) const {
+ return queue_stream()->template get_range_accessor<AcMd, T>(ptr);
+ }
+
+ // get sycl accessor
+ template <cl::sycl::access::mode AcMd>
+ EIGEN_STRONG_INLINE cl::sycl::accessor<
+ buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
+ get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
+ return queue_stream()->template get_sycl_accessor<AcMd>(cgh, ptr);
+ }
+
+ /// Accessing the created sycl device buffer for the device pointer
+ EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
+ const void *ptr) const {
+ return queue_stream()->get_sycl_buffer(ptr);
+ }
+
+ /// This is used to prepare the number of threads and also the number of
+ /// threads per block for sycl kernels
+ template <typename Index>
+ EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
+ Index &rng, Index &GRange) const {
+ queue_stream()->parallel_for_setup(n, tileSize, rng, GRange);
+ }
+
+ /// This is used to prepare the number of threads and also the number of
+ /// threads per block for sycl kernels
+ template <typename Index>
+ EIGEN_STRONG_INLINE void parallel_for_setup(
+ const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
+ cl::sycl::range<2> &local_range) const {
+ queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
+ }
+
+ /// This is used to prepare the number of threads and also the number of
+ /// threads per block for sycl kernels
+ template <typename Index>
+ EIGEN_STRONG_INLINE void parallel_for_setup(
+ const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
+ cl::sycl::range<3> &local_range) const {
+ queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
+ }
+
+ /// allocate device memory
+ EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
+ return queue_stream()->allocate(num_bytes);
+ }
+
+ EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
+ return queue_stream()->allocate_temp(num_bytes);
+ }
+
+ /// deallocate device memory
+ EIGEN_STRONG_INLINE void deallocate(void *p) const {
+ queue_stream()->deallocate(p);
+ }
+
+ EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const {
+ queue_stream()->deallocate_temp(buffer);
+ }
+ template <cl::sycl::access::mode AcMd, typename T>
+ EIGEN_STRONG_INLINE void deallocate_temp(
+ const TensorSycl::internal::RangeAccess<AcMd, T> &buffer) const {
+ queue_stream()->deallocate_temp(buffer);
+ }
+ EIGEN_STRONG_INLINE void deallocate_all() const {
+ queue_stream()->deallocate_all();
+ }
+
+ template <typename data_t>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
+ cl::sycl::access::mode::read_write, data_t>
+ get(data_t *data) const {
+ return queue_stream()->get(data);
+ }
+ template <typename data_t>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(
+ TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
+ data_t>
+ data) const {
+ return queue_stream()->get(data);
+ }
+
+ /// attach existing buffer
+ EIGEN_STRONG_INLINE void *attach_buffer(
+ cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
+ return queue_stream()->attach_buffer(buf);
+ }
+ /// detach buffer
+ EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
+ queue_stream()->detach_buffer(p);
+ }
+ EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
+ return queue_stream()->get_offset(ptr);
+ }
+
+ // some runtime conditions that can be applied here
+ EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; }
+
+ /// memcpyHostToDevice
+ template <typename Index>
+ EIGEN_STRONG_INLINE void memcpyHostToDevice(
+ Index *dst, const Index *src, size_t n,
+ std::function<void()> callback = {}) const {
+ queue_stream()->memcpyHostToDevice(dst, src, n, callback);
+ }
+ /// memcpyDeviceToHost
+ template <typename Index>
+ EIGEN_STRONG_INLINE void memcpyDeviceToHost(
+ void *dst, const Index *src, size_t n,
+ std::function<void()> callback = {}) const {
+ queue_stream()->memcpyDeviceToHost(dst, src, n, callback);
+ }
+ /// the memcpy function
+ template <typename Index>
+ EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
+ queue_stream()->memcpy(dst, src, n);
+ }
+ /// the memset function
+ EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
+ queue_stream()->memset(data, c, n);
+ }
+ /// returning the sycl queue
+ EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const {
+ return queue_stream()->sycl_queue();
+ }
+#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
+ EIGEN_STRONG_INLINE cl::sycl::program &program() const {
+ return queue_stream()->program();
+ }
+#endif
+
+ EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return 48 * 1024; }
+
+ EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+ // We won't try to take advantage of the l2 cache for the time being, and
+ // there is no l3 cache on sycl devices.
+ return firstLevelCacheSize();
+ }
+ EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
+ return queue_stream()->getNumSyclMultiProcessors();
+ }
+ EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
+ return queue_stream()->maxSyclThreadsPerBlock();
+ }
+ EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
+ return queue_stream()->maxWorkItemSizes();
+ }
+ EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
+ // OpenCL doesnot have such concept
+ return queue_stream()->maxSyclThreadsPerMultiProcessor();
+ }
+ EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
+ return queue_stream()->sharedMemPerBlock();
+ }
+ EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
+ return queue_stream()->getNearestPowerOfTwoWorkGroupSize();
+ }
+
+ EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t val, bool roundUp) const {
+ return queue_stream()->getPowerOfTwo(val, roundUp);
+ }
+ /// No need for sycl it should act the same as CPU version
+ EIGEN_STRONG_INLINE int majorDeviceVersion() const {
+ return queue_stream()->majorDeviceVersion();
+ }
+
+ EIGEN_STRONG_INLINE void synchronize() const {
+ queue_stream()->synchronize();
+ }
+ EIGEN_STRONG_INLINE void async_synchronize(
+ cl::sycl::event e = cl::sycl::event()) const {
+ queue_stream()->async_synchronize(e);
+ }
+ EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
+ return queue_stream()->get_latest_event();
+ }
+
+ // This function checks if the runtime recorded an error for the
+ // underlying stream device.
+ EIGEN_STRONG_INLINE bool ok() const { return queue_stream()->ok(); }
+
+ EIGEN_STRONG_INLINE bool has_local_memory() const {
+ return queue_stream()->has_local_memory();
+ }
+ EIGEN_STRONG_INLINE long max_buffer_size() const {
+ return queue_stream()->max_buffer_size();
+ }
+ EIGEN_STRONG_INLINE std::string getPlatformName() const {
+ return queue_stream()->getPlatformName();
+ }
+ EIGEN_STRONG_INLINE std::string getDeviceName() const {
+ return queue_stream()->getDeviceName();
+ }
+ EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
+ return queue_stream()->getDeviceVendor();
+ }
+ template <typename OutScalar, typename KernelType, typename... T>
+ EIGEN_ALWAYS_INLINE void binary_kernel_launcher(T... var) const {
+ queue_stream()->template binary_kernel_launcher<OutScalar, KernelType>(
+ var...);
+ }
+ template <typename OutScalar, typename KernelType, typename... T>
+ EIGEN_ALWAYS_INLINE void unary_kernel_launcher(T... var) const {
+ queue_stream()->template unary_kernel_launcher<OutScalar, KernelType>(
+ var...);
+ }
+
+ template <typename OutScalar, typename KernelType, typename... T>
+ EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(T... var) const {
+ queue_stream()->template nullary_kernel_launcher<OutScalar, KernelType>(
+ var...);
+ }
+};
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceThreadPool.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceThreadPool.h
new file mode 100644
index 0000000..e524b53
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -0,0 +1,409 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#if defined(EIGEN_USE_THREADS) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H)
+#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H
+
+namespace Eigen {
+
+// Runs an arbitrary function and then calls Notify() on the passed in
+// Notification.
+template <typename Function, typename... Args> struct FunctionWrapperWithNotification
+{
+ static void run(Notification* n, Function f, Args... args) {
+ f(args...);
+ if (n) {
+ n->Notify();
+ }
+ }
+};
+
+template <typename Function, typename... Args> struct FunctionWrapperWithBarrier
+{
+ static void run(Barrier* b, Function f, Args... args) {
+ f(args...);
+ if (b) {
+ b->Notify();
+ }
+ }
+};
+
+template <typename SyncType>
+static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) {
+ if (n) {
+ n->Wait();
+ }
+}
+
+// An abstract interface to a device specific memory allocator.
+class Allocator {
+ public:
+ virtual ~Allocator() {}
+ virtual void* allocate(size_t num_bytes) const = 0;
+ virtual void deallocate(void* buffer) const = 0;
+};
+
+// Build a thread pool device on top the an existing pool of threads.
+struct ThreadPoolDevice {
+ // The ownership of the thread pool remains with the caller.
+ ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores, Allocator* allocator = nullptr)
+ : pool_(pool), num_threads_(num_cores), allocator_(allocator) { }
+
+ EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
+ return allocator_ ? allocator_->allocate(num_bytes)
+ : internal::aligned_malloc(num_bytes);
+ }
+
+ EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
+ if (allocator_) {
+ allocator_->deallocate(buffer);
+ } else {
+ internal::aligned_free(buffer);
+ }
+ }
+
+ EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const {
+ return allocate(num_bytes);
+ }
+
+ EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const {
+ deallocate(buffer);
+ }
+
+ template<typename Type>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const {
+ return data;
+ }
+
+ EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+#ifdef __ANDROID__
+ ::memcpy(dst, src, n);
+#else
+ // TODO(rmlarsen): Align blocks on cache lines.
+ // We have observed that going beyond 4 threads usually just wastes
+ // CPU cycles due to the threads competing for memory bandwidth, so we
+ // statically schedule at most 4 block copies here.
+ const size_t kMinBlockSize = 32768;
+ const size_t num_threads = CostModel::numThreads(n, TensorOpCost(1.0, 1.0, 0), 4);
+ if (n <= kMinBlockSize || num_threads < 2) {
+ ::memcpy(dst, src, n);
+ } else {
+ const char* src_ptr = static_cast<const char*>(src);
+ char* dst_ptr = static_cast<char*>(dst);
+ const size_t blocksize = (n + (num_threads - 1)) / num_threads;
+ Barrier barrier(static_cast<int>(num_threads - 1));
+ // Launch the last 3 blocks on worker threads.
+ for (size_t i = 1; i < num_threads; ++i) {
+ enqueue_with_barrier(&barrier, [n, i, src_ptr, dst_ptr, blocksize] {
+ ::memcpy(dst_ptr + i * blocksize, src_ptr + i * blocksize,
+ numext::mini(blocksize, n - (i * blocksize)));
+ });
+ }
+ // Launch the first block on the main thread.
+ ::memcpy(dst_ptr, src_ptr, blocksize);
+ barrier.Wait();
+ }
+#endif
+ }
+ EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
+ memcpy(dst, src, n);
+ }
+ EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
+ memcpy(dst, src, n);
+ }
+
+ EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
+ ::memset(buffer, c, n);
+ }
+
+ EIGEN_STRONG_INLINE int numThreads() const {
+ return num_threads_;
+ }
+
+ // Number of theads available in the underlying thread pool. This number can
+ // be different from the value returned by numThreads().
+ EIGEN_STRONG_INLINE int numThreadsInPool() const {
+ return pool_->NumThreads();
+ }
+
+ EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
+ return l1CacheSize();
+ }
+
+ EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+ // The l3 cache size is shared between all the cores.
+ return l3CacheSize() / num_threads_;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
+ // Should return an enum that encodes the ISA supported by the CPU
+ return 1;
+ }
+
+ template <class Function, class... Args>
+ EIGEN_STRONG_INLINE Notification* enqueue(Function&& f,
+ Args&&... args) const {
+ Notification* n = new Notification();
+ pool_->Schedule(
+ std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n,
+ std::move(f), args...));
+ return n;
+ }
+
+ template <class Function, class... Args>
+ EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b, Function&& f,
+ Args&&... args) const {
+ pool_->Schedule(
+ std::bind(&FunctionWrapperWithBarrier<Function, Args...>::run, b,
+ std::move(f), args...));
+ }
+
+ template <class Function, class... Args>
+ EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f,
+ Args&&... args) const {
+ if (sizeof...(args) > 0) {
+ pool_->Schedule(std::bind(std::move(f), args...));
+ } else {
+ pool_->Schedule(std::move(f));
+ }
+ }
+
+ // Returns a logical thread index between 0 and pool_->NumThreads() - 1 if
+ // called from one of the threads in pool_. Returns -1 otherwise.
+ EIGEN_STRONG_INLINE int currentThreadId() const {
+ return pool_->CurrentThreadId();
+ }
+
+ // WARNING: This function is synchronous and will block the calling thread.
+ //
+ // Synchronous parallelFor executes f with [0, n) arguments in parallel and
+ // waits for completion. F accepts a half-open interval [first, last). Block
+ // size is chosen based on the iteration cost and resulting parallel
+ // efficiency. If block_align is not nullptr, it is called to round up the
+ // block size.
+ void parallelFor(Index n, const TensorOpCost& cost,
+ std::function<Index(Index)> block_align,
+ std::function<void(Index, Index)> f) const {
+ if (EIGEN_PREDICT_FALSE(n <= 0)){
+ return;
+ // Compute small problems directly in the caller thread.
+ } else if (n == 1 || numThreads() == 1 ||
+ CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) {
+ f(0, n);
+ return;
+ }
+
+ // Compute block size and total count of blocks.
+ ParallelForBlock block = CalculateParallelForBlock(n, cost, block_align);
+
+ // Recursively divide size into halves until we reach block_size.
+ // Division code rounds mid to block_size, so we are guaranteed to get
+ // block_count leaves that do actual computations.
+ Barrier barrier(static_cast<unsigned int>(block.count));
+ std::function<void(Index, Index)> handleRange;
+ handleRange = [=, &handleRange, &barrier, &f](Index firstIdx,
+ Index lastIdx) {
+ while (lastIdx - firstIdx > block.size) {
+ // Split into halves and schedule the second half on a different thread.
+ const Index midIdx = firstIdx + divup((lastIdx - firstIdx) / 2, block.size) * block.size;
+ pool_->Schedule([=, &handleRange]() { handleRange(midIdx, lastIdx); });
+ lastIdx = midIdx;
+ }
+ // Single block or less, execute directly.
+ f(firstIdx, lastIdx);
+ barrier.Notify();
+ };
+
+ if (block.count <= numThreads()) {
+ // Avoid a thread hop by running the root of the tree and one block on the
+ // main thread.
+ handleRange(0, n);
+ } else {
+ // Execute the root in the thread pool to avoid running work on more than
+ // numThreads() threads.
+ pool_->Schedule([=, &handleRange]() { handleRange(0, n); });
+ }
+
+ barrier.Wait();
+ }
+
+ // Convenience wrapper for parallelFor that does not align blocks.
+ void parallelFor(Index n, const TensorOpCost& cost,
+ std::function<void(Index, Index)> f) const {
+ parallelFor(n, cost, nullptr, std::move(f));
+ }
+
+ // WARNING: This function is asynchronous and will not block the calling thread.
+ //
+ // Asynchronous parallelFor executes f with [0, n) arguments in parallel
+ // without waiting for completion. When the last block finished, it will call
+ // 'done' callback. F accepts a half-open interval [first, last). Block size
+ // is chosen based on the iteration cost and resulting parallel efficiency. If
+ // block_align is not nullptr, it is called to round up the block size.
+ void parallelForAsync(Index n, const TensorOpCost& cost,
+ std::function<Index(Index)> block_align,
+ std::function<void(Index, Index)> f,
+ std::function<void()> done) const {
+ // Compute small problems directly in the caller thread.
+ if (n <= 1 || numThreads() == 1 ||
+ CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) {
+ f(0, n);
+ done();
+ return;
+ }
+
+ // Compute block size and total count of blocks.
+ ParallelForBlock block = CalculateParallelForBlock(n, cost, block_align);
+
+ ParallelForAsyncContext* const ctx =
+ new ParallelForAsyncContext(block.count, std::move(f), std::move(done));
+
+ // Recursively divide size into halves until we reach block_size.
+ // Division code rounds mid to block_size, so we are guaranteed to get
+ // block_count leaves that do actual computations.
+ ctx->handle_range = [this, ctx, block](Index firstIdx, Index lastIdx) {
+ while (lastIdx - firstIdx > block.size) {
+ // Split into halves and schedule the second half on a different thread.
+ const Index midIdx = firstIdx + divup((lastIdx - firstIdx) / 2, block.size) * block.size;
+ pool_->Schedule(
+ [ctx, midIdx, lastIdx]() { ctx->handle_range(midIdx, lastIdx); });
+ lastIdx = midIdx;
+ }
+
+ // Single block or less, execute directly.
+ ctx->f(firstIdx, lastIdx);
+
+ // Delete async context if it was the last block.
+ if (ctx->count.fetch_sub(1) == 1) delete ctx;
+ };
+
+ if (block.count <= numThreads()) {
+ // Avoid a thread hop by running the root of the tree and one block on the
+ // main thread.
+ ctx->handle_range(0, n);
+ } else {
+ // Execute the root in the thread pool to avoid running work on more than
+ // numThreads() threads.
+ pool_->Schedule([ctx, n]() { ctx->handle_range(0, n); });
+ }
+ }
+
+ // Convenience wrapper for parallelForAsync that does not align blocks.
+ void parallelForAsync(Index n, const TensorOpCost& cost,
+ std::function<void(Index, Index)> f,
+ std::function<void()> done) const {
+ parallelForAsync(n, cost, nullptr, std::move(f), std::move(done));
+ }
+
+ // Thread pool accessor.
+ ThreadPoolInterface* getPool() const { return pool_; }
+
+ // Allocator accessor.
+ Allocator* allocator() const { return allocator_; }
+
+ private:
+ typedef TensorCostModel<ThreadPoolDevice> CostModel;
+
+ // For parallelForAsync we must keep passed in closures on the heap, and
+ // delete them only after `done` callback finished.
+ struct ParallelForAsyncContext {
+ ParallelForAsyncContext(Index block_count,
+ std::function<void(Index, Index)> block_f,
+ std::function<void()> done_callback)
+ : count(block_count),
+ f(std::move(block_f)),
+ done(std::move(done_callback)) {}
+ ~ParallelForAsyncContext() { done(); }
+
+ std::atomic<Index> count;
+ std::function<void(Index, Index)> f;
+ std::function<void()> done;
+
+ std::function<void(Index, Index)> handle_range;
+ };
+
+ struct ParallelForBlock {
+ Index size; // block size
+ Index count; // number of blocks
+ };
+
+ // Calculates block size based on (1) the iteration cost and (2) parallel
+ // efficiency. We want blocks to be not too small to mitigate parallelization
+ // overheads; not too large to mitigate tail effect and potential load
+ // imbalance and we also want number of blocks to be evenly dividable across
+ // threads.
+ ParallelForBlock CalculateParallelForBlock(
+ const Index n, const TensorOpCost& cost,
+ std::function<Index(Index)> block_align) const {
+ const double block_size_f = 1.0 / CostModel::taskSize(1, cost);
+ const Index max_oversharding_factor = 4;
+ Index block_size = numext::mini(
+ n, numext::maxi<Index>(
+ divup<Index>(n, max_oversharding_factor * numThreads()),
+ block_size_f));
+ const Index max_block_size = numext::mini(n, 2 * block_size);
+
+ if (block_align) {
+ Index new_block_size = block_align(block_size);
+ eigen_assert(new_block_size >= block_size);
+ block_size = numext::mini(n, new_block_size);
+ }
+
+ Index block_count = divup(n, block_size);
+
+ // Calculate parallel efficiency as fraction of total CPU time used for
+ // computations:
+ double max_efficiency =
+ static_cast<double>(block_count) /
+ (divup<int>(block_count, numThreads()) * numThreads());
+
+ // Now try to increase block size up to max_block_size as long as it
+ // doesn't decrease parallel efficiency.
+ for (Index prev_block_count = block_count;
+ max_efficiency < 1.0 && prev_block_count > 1;) {
+ // This is the next block size that divides size into a smaller number
+ // of blocks than the current block_size.
+ Index coarser_block_size = divup(n, prev_block_count - 1);
+ if (block_align) {
+ Index new_block_size = block_align(coarser_block_size);
+ eigen_assert(new_block_size >= coarser_block_size);
+ coarser_block_size = numext::mini(n, new_block_size);
+ }
+ if (coarser_block_size > max_block_size) {
+ break; // Reached max block size. Stop.
+ }
+ // Recalculate parallel efficiency.
+ const Index coarser_block_count = divup(n, coarser_block_size);
+ eigen_assert(coarser_block_count < prev_block_count);
+ prev_block_count = coarser_block_count;
+ const double coarser_efficiency =
+ static_cast<double>(coarser_block_count) /
+ (divup<int>(coarser_block_count, numThreads()) * numThreads());
+ if (coarser_efficiency + 0.01 >= max_efficiency) {
+ // Taking it.
+ block_size = coarser_block_size;
+ block_count = coarser_block_count;
+ if (max_efficiency < coarser_efficiency) {
+ max_efficiency = coarser_efficiency;
+ }
+ }
+ }
+
+ return {block_size, block_count};
+ }
+
+ ThreadPoolInterface* pool_;
+ int num_threads_;
+ Allocator* allocator_;
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorDimensionList.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorDimensionList.h
new file mode 100644
index 0000000..1a30e45
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorDimensionList.h
@@ -0,0 +1,236 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H
+#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H
+
+namespace Eigen {
+
+/** \internal
+ *
+ * \class TensorDimensionList
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Special case of tensor index list used to list all the dimensions of a tensor of rank n.
+ *
+ * \sa Tensor
+ */
+
+template <typename Index, std::size_t Rank> struct DimensionList {
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+ const Index operator[] (const Index i) const { return i; }
+};
+
+namespace internal {
+
+template<typename Index, std::size_t Rank> struct array_size<DimensionList<Index, Rank> > {
+ static const size_t value = Rank;
+};
+template<typename Index, std::size_t Rank> struct array_size<const DimensionList<Index, Rank> > {
+ static const size_t value = Rank;
+};
+
+template<DenseIndex n, typename Index, std::size_t Rank> const Index array_get(DimensionList<Index, Rank>&) {
+ return n;
+}
+template<DenseIndex n, typename Index, std::size_t Rank> const Index array_get(const DimensionList<Index, Rank>&) {
+ return n;
+}
+
+
+#if EIGEN_HAS_CONSTEXPR
+template <typename Index, std::size_t Rank>
+struct index_known_statically_impl<DimensionList<Index, Rank> > {
+ EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) {
+ return true;
+ }
+};
+template <typename Index, std::size_t Rank>
+struct index_known_statically_impl<const DimensionList<Index, Rank> > {
+ EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) {
+ return true;
+ }
+};
+
+template <typename Index, std::size_t Rank>
+struct all_indices_known_statically_impl<DimensionList<Index, Rank> > {
+ EIGEN_DEVICE_FUNC static constexpr bool run() {
+ return true;
+ }
+};
+template <typename Index, std::size_t Rank>
+struct all_indices_known_statically_impl<const DimensionList<Index, Rank> > {
+ EIGEN_DEVICE_FUNC static constexpr bool run() {
+ return true;
+ }
+};
+
+template <typename Index, std::size_t Rank>
+struct indices_statically_known_to_increase_impl<DimensionList<Index, Rank> > {
+ EIGEN_DEVICE_FUNC static constexpr bool run() {
+ return true;
+ }
+};
+template <typename Index, std::size_t Rank>
+struct indices_statically_known_to_increase_impl<const DimensionList<Index, Rank> > {
+ EIGEN_DEVICE_FUNC static constexpr bool run() {
+ return true;
+ }
+};
+
+template <typename Index, std::size_t Rank>
+struct index_statically_eq_impl<DimensionList<Index, Rank> > {
+ static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+ return i == value;
+ }
+};
+template <typename Index, std::size_t Rank>
+struct index_statically_eq_impl<const DimensionList<Index, Rank> > {
+ EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+ return i == value;
+ }
+};
+
+template <typename Index, std::size_t Rank>
+struct index_statically_ne_impl<DimensionList<Index, Rank> > {
+ EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+ return i != value;
+ }
+};
+template <typename Index, std::size_t Rank>
+struct index_statically_ne_impl<const DimensionList<Index, Rank> > {
+ static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+ return i != value;
+ }
+};
+
+template <typename Index, std::size_t Rank>
+struct index_statically_gt_impl<DimensionList<Index, Rank> > {
+ EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+ return i > value;
+ }
+};
+template <typename Index, std::size_t Rank>
+struct index_statically_gt_impl<const DimensionList<Index, Rank> > {
+ EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+ return i > value;
+ }
+};
+
+template <typename Index, std::size_t Rank>
+struct index_statically_lt_impl<DimensionList<Index, Rank> > {
+ EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+ return i < value;
+ }
+};
+template <typename Index, std::size_t Rank>
+struct index_statically_lt_impl<const DimensionList<Index, Rank> > {
+ EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
+ return i < value;
+ }
+};
+
+#else
+template <typename Index, std::size_t Rank>
+struct index_known_statically_impl<DimensionList<Index, Rank> > {
+ EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run(const DenseIndex) {
+ return true;
+ }
+};
+template <typename Index, std::size_t Rank>
+struct index_known_statically_impl<const DimensionList<Index, Rank> > {
+ EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run(const DenseIndex) {
+ return true;
+ }
+};
+
+template <typename Index, std::size_t Rank>
+struct all_indices_known_statically_impl<DimensionList<Index, Rank> > {
+ EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run() {
+ return true;
+ }
+};
+template <typename Index, std::size_t Rank>
+struct all_indices_known_statically_impl<const DimensionList<Index, Rank> > {
+ EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run() {
+ return true;
+ }
+};
+
+template <typename Index, std::size_t Rank>
+struct indices_statically_known_to_increase_impl<DimensionList<Index, Rank> > {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() {
+ return true;
+ }
+};
+template <typename Index, std::size_t Rank>
+struct indices_statically_known_to_increase_impl<const DimensionList<Index, Rank> > {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() {
+ return true;
+ }
+};
+
+template <typename Index, std::size_t Rank>
+struct index_statically_eq_impl<DimensionList<Index, Rank> > {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
+ return false;
+ }
+};
+template <typename Index, std::size_t Rank>
+struct index_statically_eq_impl<const DimensionList<Index, Rank> > {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
+ return false;
+ }
+};
+
+template <typename Index, std::size_t Rank>
+struct index_statically_ne_impl<DimensionList<Index, Rank> > {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex){
+ return false;
+ }
+};
+template <typename Index, std::size_t Rank>
+struct index_statically_ne_impl<const DimensionList<Index, Rank> > {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
+ return false;
+ }
+};
+
+template <typename Index, std::size_t Rank>
+struct index_statically_gt_impl<DimensionList<Index, Rank> > {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
+ return false;
+ }
+};
+template <typename Index, std::size_t Rank>
+struct index_statically_gt_impl<const DimensionList<Index, Rank> > {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
+ return false;
+ }
+};
+
+template <typename Index, std::size_t Rank>
+struct index_statically_lt_impl<DimensionList<Index, Rank> > {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
+ return false;
+ }
+};
+template <typename Index, std::size_t Rank>
+struct index_statically_lt_impl<const DimensionList<Index, Rank> > {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
+ return false;
+ }
+};
+#endif
+
+} // end namespace internal
+} // end namespace Eigen
+
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorDimensions.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorDimensions.h
new file mode 100644
index 0000000..f0f1e83
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorDimensions.h
@@ -0,0 +1,490 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H
+#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H
+
+
+namespace Eigen {
+
+/** \internal
+ *
+ * \class TensorDimensions
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Set of classes used to encode and store the dimensions of a Tensor.
+ *
+ * The Sizes class encodes as part of the type the number of dimensions and the
+ * sizes corresponding to each dimension. It uses no storage space since it is
+ * entirely known at compile time.
+ * The DSizes class is its dynamic sibling: the number of dimensions is known
+ * at compile time but the sizes are set during execution.
+ *
+ * \sa Tensor
+ */
+
+// Boilerplate code
+namespace internal {
+
+template<std::ptrdiff_t n, typename Dimension> struct dget {
+ static const std::ptrdiff_t value = get<n, Dimension>::value;
+};
+
+
+template<typename Index, std::ptrdiff_t NumIndices, std::ptrdiff_t n, bool RowMajor>
+struct fixed_size_tensor_index_linearization_helper
+{
+ template <typename Dimensions> EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Index run(array<Index, NumIndices> const& indices,
+ const Dimensions& dimensions)
+ {
+ return array_get<RowMajor ? n - 1 : (NumIndices - n)>(indices) +
+ dget<RowMajor ? n - 1 : (NumIndices - n), Dimensions>::value *
+ fixed_size_tensor_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(indices, dimensions);
+ }
+};
+
+template<typename Index, std::ptrdiff_t NumIndices, bool RowMajor>
+struct fixed_size_tensor_index_linearization_helper<Index, NumIndices, 0, RowMajor>
+{
+ template <typename Dimensions> EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Index run(array<Index, NumIndices> const&, const Dimensions&)
+ {
+ return 0;
+ }
+};
+
+template<typename Index, std::ptrdiff_t n>
+struct fixed_size_tensor_index_extraction_helper
+{
+ template <typename Dimensions> EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Index run(const Index index,
+ const Dimensions& dimensions)
+ {
+ const Index mult = (index == n-1) ? 1 : 0;
+ return array_get<n-1>(dimensions) * mult +
+ fixed_size_tensor_index_extraction_helper<Index, n - 1>::run(index, dimensions);
+ }
+};
+
+template<typename Index>
+struct fixed_size_tensor_index_extraction_helper<Index, 0>
+{
+ template <typename Dimensions> EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Index run(const Index,
+ const Dimensions&)
+ {
+ return 0;
+ }
+ };
+
+} // end namespace internal
+
+
+// Fixed size
+#ifndef EIGEN_EMULATE_CXX11_META_H
+template <typename std::ptrdiff_t... Indices>
+struct Sizes {
+ typedef internal::numeric_list<std::ptrdiff_t, Indices...> Base;
+ const Base t = Base();
+ static const std::ptrdiff_t total_size = internal::arg_prod(Indices...);
+ static const ptrdiff_t count = Base::count;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t rank() const {
+ return Base::count;
+ }
+
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t TotalSize() {
+ return internal::arg_prod(Indices...);
+ }
+
+ EIGEN_DEVICE_FUNC Sizes() { }
+ template <typename DenseIndex>
+ explicit EIGEN_DEVICE_FUNC Sizes(const array<DenseIndex, Base::count>& /*indices*/) {
+ // todo: add assertion
+ }
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+ template <typename... DenseIndex> EIGEN_DEVICE_FUNC Sizes(DenseIndex...) { }
+ explicit EIGEN_DEVICE_FUNC Sizes(std::initializer_list<std::ptrdiff_t> /*l*/) {
+ // todo: add assertion
+ }
+#endif
+
+ template <typename T> Sizes& operator = (const T& /*other*/) {
+ // add assertion failure if the size of other is different
+ return *this;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[] (const std::ptrdiff_t index) const {
+ return internal::fixed_size_tensor_index_extraction_helper<std::ptrdiff_t, Base::count>::run(index, t);
+ }
+
+ template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ ptrdiff_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
+ return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, false>::run(indices, t);
+ }
+ template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ ptrdiff_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
+ return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, true>::run(indices, t);
+ }
+};
+
+namespace internal {
+template <typename std::ptrdiff_t... Indices>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes<Indices...>&) {
+ return Sizes<Indices...>::total_size;
+}
+}
+
+#else
+
+template <std::ptrdiff_t n>
+struct non_zero_size {
+ typedef internal::type2val<std::ptrdiff_t, n> type;
+};
+template <>
+struct non_zero_size<0> {
+ typedef internal::null_type type;
+};
+
+template <std::ptrdiff_t V1=0, std::ptrdiff_t V2=0, std::ptrdiff_t V3=0, std::ptrdiff_t V4=0, std::ptrdiff_t V5=0> struct Sizes {
+ typedef typename internal::make_type_list<typename non_zero_size<V1>::type, typename non_zero_size<V2>::type, typename non_zero_size<V3>::type, typename non_zero_size<V4>::type, typename non_zero_size<V5>::type >::type Base;
+ static const std::ptrdiff_t count = Base::count;
+ static const std::ptrdiff_t total_size = internal::arg_prod<Base>::value;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptrdiff_t rank() const {
+ return count;
+ }
+
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptrdiff_t TotalSize() {
+ return internal::arg_prod<Base>::value;
+ }
+
+ Sizes() { }
+ template <typename DenseIndex>
+ explicit Sizes(const array<DenseIndex, Base::count>& /*indices*/) {
+ // todo: add assertion
+ }
+ template <typename T> Sizes& operator = (const T& /*other*/) {
+ // add assertion failure if the size of other is different
+ return *this;
+ }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+ template <typename... DenseIndex> Sizes(DenseIndex... /*indices*/) { }
+ explicit Sizes(std::initializer_list<std::ptrdiff_t>) {
+ // todo: add assertion
+ }
+#else
+ EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex) {
+ }
+ EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex) {
+ }
+ EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex) {
+ }
+ EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) {
+ }
+ EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) {
+ }
+#endif
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index operator[] (const Index index) const {
+ switch (index) {
+ case 0:
+ return internal::get<0, Base>::value;
+ case 1:
+ return internal::get<1, Base>::value;
+ case 2:
+ return internal::get<2, Base>::value;
+ case 3:
+ return internal::get<3, Base>::value;
+ case 4:
+ return internal::get<4, Base>::value;
+ default:
+ eigen_assert(false && "index overflow");
+ return static_cast<Index>(-1);
+ }
+ }
+
+ template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ ptrdiff_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
+ return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, false>::run(indices, *reinterpret_cast<const Base*>(this));
+ }
+ template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ ptrdiff_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
+ return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, true>::run(indices, *reinterpret_cast<const Base*>(this));
+ }
+};
+
+namespace internal {
+template <std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes<V1, V2, V3, V4, V5>&) {
+ return Sizes<V1, V2, V3, V4, V5>::total_size;
+}
+}
+
+#endif
+
+// Boilerplate
+namespace internal {
+template<typename Index, std::ptrdiff_t NumIndices, std::ptrdiff_t n, bool RowMajor>
+struct tensor_index_linearization_helper
+{
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Index run(array<Index, NumIndices> const& indices, array<Index, NumIndices> const& dimensions)
+ {
+ return array_get<RowMajor ? n : (NumIndices - n - 1)>(indices) +
+ array_get<RowMajor ? n : (NumIndices - n - 1)>(dimensions) *
+ tensor_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(indices, dimensions);
+ }
+};
+
+template<typename Index, std::ptrdiff_t NumIndices, bool RowMajor>
+struct tensor_index_linearization_helper<Index, NumIndices, 0, RowMajor>
+{
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Index run(array<Index, NumIndices> const& indices, array<Index, NumIndices> const&)
+ {
+ return array_get<RowMajor ? 0 : NumIndices - 1>(indices);
+ }
+};
+} // end namespace internal
+
+
+
+// Dynamic size
+template <typename DenseIndex, int NumDims>
+struct DSizes : array<DenseIndex, NumDims> {
+ typedef array<DenseIndex, NumDims> Base;
+ static const int count = NumDims;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const {
+ return NumDims;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex TotalSize() const {
+ return (NumDims == 0) ? 1 : internal::array_prod(*static_cast<const Base*>(this));
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DSizes() {
+ for (int i = 0 ; i < NumDims; ++i) {
+ (*this)[i] = 0;
+ }
+ }
+ EIGEN_DEVICE_FUNC explicit DSizes(const array<DenseIndex, NumDims>& a) : Base(a) { }
+
+ EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0) {
+ eigen_assert(NumDims == 1);
+ (*this)[0] = i0;
+ }
+
+ EIGEN_DEVICE_FUNC DSizes(const DimensionList<DenseIndex, NumDims>& a) {
+ for (int i = 0 ; i < NumDims; ++i) {
+ (*this)[i] = a[i];
+ }
+ }
+
+ // Enable DSizes index type promotion only if we are promoting to the
+ // larger type, e.g. allow to promote dimensions of type int to long.
+ template<typename OtherIndex>
+ EIGEN_DEVICE_FUNC
+ explicit DSizes(const array<OtherIndex, NumDims>& other,
+ // Default template parameters require c++11.
+ typename internal::enable_if<
+ internal::is_same<
+ DenseIndex,
+ typename internal::promote_index_type<
+ DenseIndex,
+ OtherIndex
+ >::type
+ >::value, void*>::type = 0) {
+ for (int i = 0; i < NumDims; ++i) {
+ (*this)[i] = static_cast<DenseIndex>(other[i]);
+ }
+ }
+
+#ifdef EIGEN_HAS_INDEX_LIST
+ template <typename FirstType, typename... OtherTypes>
+ EIGEN_DEVICE_FUNC
+ explicit DSizes(const Eigen::IndexList<FirstType, OtherTypes...>& dimensions) {
+ for (int i = 0; i < dimensions.count; ++i) {
+ (*this)[i] = dimensions[i];
+ }
+ }
+#endif
+
+#ifndef EIGEN_EMULATE_CXX11_META_H
+ template <typename std::ptrdiff_t... Indices>
+ EIGEN_DEVICE_FUNC DSizes(const Sizes<Indices...>& a) {
+ for (int i = 0 ; i < NumDims; ++i) {
+ (*this)[i] = a[i];
+ }
+ }
+#else
+ template <std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5>
+ EIGEN_DEVICE_FUNC DSizes(const Sizes<V1, V2, V3, V4, V5>& a) {
+ for (int i = 0 ; i < NumDims; ++i) {
+ (*this)[i] = a[i];
+ }
+ }
+#endif
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+ template<typename... IndexTypes> EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, DenseIndex secondDimension, IndexTypes... otherDimensions) : Base({{firstDimension, secondDimension, otherDimensions...}}) {
+ EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 2 == NumDims, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ }
+#else
+ EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1) {
+ eigen_assert(NumDims == 2);
+ (*this)[0] = i0;
+ (*this)[1] = i1;
+ }
+ EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) {
+ eigen_assert(NumDims == 3);
+ (*this)[0] = i0;
+ (*this)[1] = i1;
+ (*this)[2] = i2;
+ }
+ EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) {
+ eigen_assert(NumDims == 4);
+ (*this)[0] = i0;
+ (*this)[1] = i1;
+ (*this)[2] = i2;
+ (*this)[3] = i3;
+ }
+ EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) {
+ eigen_assert(NumDims == 5);
+ (*this)[0] = i0;
+ (*this)[1] = i1;
+ (*this)[2] = i2;
+ (*this)[3] = i3;
+ (*this)[4] = i4;
+ }
+#endif
+
+ EIGEN_DEVICE_FUNC DSizes& operator = (const array<DenseIndex, NumDims>& other) {
+ *static_cast<Base*>(this) = other;
+ return *this;
+ }
+
+ // A constexpr would be so much better here
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex IndexOfColMajor(const array<DenseIndex, NumDims>& indices) const {
+ return internal::tensor_index_linearization_helper<DenseIndex, NumDims, NumDims - 1, false>::run(indices, *static_cast<const Base*>(this));
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex IndexOfRowMajor(const array<DenseIndex, NumDims>& indices) const {
+ return internal::tensor_index_linearization_helper<DenseIndex, NumDims, NumDims - 1, true>::run(indices, *static_cast<const Base*>(this));
+ }
+};
+
+template <typename IndexType, int NumDims>
+std::ostream& operator<<(std::ostream& os,
+ const DSizes<IndexType, NumDims>& dims) {
+ os << "[";
+ for (int i = 0; i < NumDims; ++i) {
+ if (i > 0) os << ", ";
+ os << dims[i];
+ }
+ os << "]";
+ return os;
+}
+
+// Boilerplate
+namespace internal {
+template<typename Index, std::ptrdiff_t NumIndices, std::ptrdiff_t n, bool RowMajor>
+struct tensor_vsize_index_linearization_helper
+{
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Index run(array<Index, NumIndices> const& indices, std::vector<DenseIndex> const& dimensions)
+ {
+ return array_get<RowMajor ? n : (NumIndices - n - 1)>(indices) +
+ array_get<RowMajor ? n : (NumIndices - n - 1)>(dimensions) *
+ tensor_vsize_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(indices, dimensions);
+ }
+};
+
+template<typename Index, std::ptrdiff_t NumIndices, bool RowMajor>
+struct tensor_vsize_index_linearization_helper<Index, NumIndices, 0, RowMajor>
+{
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Index run(array<Index, NumIndices> const& indices, std::vector<DenseIndex> const&)
+ {
+ return array_get<RowMajor ? 0 : NumIndices - 1>(indices);
+ }
+};
+} // end namespace internal
+
+
+namespace internal {
+
+template <typename DenseIndex, int NumDims> struct array_size<const DSizes<DenseIndex, NumDims> > {
+ static const ptrdiff_t value = NumDims;
+};
+template <typename DenseIndex, int NumDims> struct array_size<DSizes<DenseIndex, NumDims> > {
+ static const ptrdiff_t value = NumDims;
+};
+#ifndef EIGEN_EMULATE_CXX11_META_H
+template <typename std::ptrdiff_t... Indices> struct array_size<const Sizes<Indices...> > {
+static const std::ptrdiff_t value = Sizes<Indices...>::count;
+};
+template <typename std::ptrdiff_t... Indices> struct array_size<Sizes<Indices...> > {
+static const std::ptrdiff_t value = Sizes<Indices...>::count;
+};
+template <std::ptrdiff_t n, typename std::ptrdiff_t... Indices> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<Indices...>&) {
+ return get<n, internal::numeric_list<std::ptrdiff_t, Indices...> >::value;
+}
+template <std::ptrdiff_t n> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<>&) {
+ eigen_assert(false && "should never be called");
+ return -1;
+}
+#else
+template <std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5> struct array_size<const Sizes<V1,V2,V3,V4,V5> > {
+ static const ptrdiff_t value = Sizes<V1,V2,V3,V4,V5>::count;
+};
+template <std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5> struct array_size<Sizes<V1,V2,V3,V4,V5> > {
+ static const ptrdiff_t value = Sizes<V1,V2,V3,V4,V5>::count;
+};
+template <std::ptrdiff_t n, std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<V1,V2,V3,V4,V5>&) {
+ return get<n, typename Sizes<V1,V2,V3,V4,V5>::Base>::value;
+}
+
+#endif
+
+
+template <typename Dims1, typename Dims2, ptrdiff_t n, ptrdiff_t m>
+struct sizes_match_below_dim {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1&, Dims2&) {
+ return false;
+ }
+};
+template <typename Dims1, typename Dims2, ptrdiff_t n>
+struct sizes_match_below_dim<Dims1, Dims2, n, n> {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1& dims1, Dims2& dims2) {
+ return (array_get<n-1>(dims1) == array_get<n-1>(dims2)) &&
+ sizes_match_below_dim<Dims1, Dims2, n-1, n-1>::run(dims1, dims2);
+ }
+};
+template <typename Dims1, typename Dims2>
+struct sizes_match_below_dim<Dims1, Dims2, 0, 0> {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1&, Dims2&) {
+ return true;
+ }
+};
+
+} // end namespace internal
+
+
+template <typename Dims1, typename Dims2>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool dimensions_match(Dims1 dims1, Dims2 dims2) {
+ return internal::sizes_match_below_dim<Dims1, Dims2, internal::array_size<Dims1>::value, internal::array_size<Dims2>::value>::run(dims1, dims2);
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorEvalTo.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorEvalTo.h
new file mode 100644
index 0000000..a48d035
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorEvalTo.h
@@ -0,0 +1,236 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H
+#define EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H
+
+namespace Eigen {
+
+/** \class TensorForcedEval
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor reshaping class.
+ *
+ *
+ */
+namespace internal {
+template<typename XprType, template <class> class MakePointer_>
+struct traits<TensorEvalToOp<XprType, MakePointer_> >
+{
+ // Type promotion to handle the case where the types of the lhs and the rhs are different.
+ typedef typename XprType::Scalar Scalar;
+ typedef traits<XprType> XprTraits;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprTraits::Index Index;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = XprTraits::NumDimensions;
+ static const int Layout = XprTraits::Layout;
+ typedef typename MakePointer_<Scalar>::Type PointerType;
+
+ enum {
+ Flags = 0
+ };
+ template <class T>
+ struct MakePointer {
+ // Intermediate typedef to workaround MSVC issue.
+ typedef MakePointer_<T> MakePointerT;
+ typedef typename MakePointerT::Type Type;
+
+
+ };
+};
+
+template<typename XprType, template <class> class MakePointer_>
+struct eval<TensorEvalToOp<XprType, MakePointer_>, Eigen::Dense>
+{
+ typedef const TensorEvalToOp<XprType, MakePointer_>& type;
+};
+
+template<typename XprType, template <class> class MakePointer_>
+struct nested<TensorEvalToOp<XprType, MakePointer_>, 1, typename eval<TensorEvalToOp<XprType, MakePointer_> >::type>
+{
+ typedef TensorEvalToOp<XprType, MakePointer_> type;
+};
+
+} // end namespace internal
+
+
+
+
+template<typename XprType, template <class> class MakePointer_>
+class TensorEvalToOp : public TensorBase<TensorEvalToOp<XprType, MakePointer_>, ReadOnlyAccessors>
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorEvalToOp>::Scalar Scalar;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+ typedef typename MakePointer_<CoeffReturnType>::Type PointerType;
+ typedef typename Eigen::internal::nested<TensorEvalToOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorEvalToOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorEvalToOp>::Index Index;
+
+ static const int NumDims = Eigen::internal::traits<TensorEvalToOp>::NumDimensions;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(PointerType buffer, const XprType& expr)
+ : m_xpr(expr), m_buffer(buffer) {}
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ expression() const { return m_xpr; }
+
+ EIGEN_DEVICE_FUNC PointerType buffer() const { return m_buffer; }
+
+ protected:
+ typename XprType::Nested m_xpr;
+ PointerType m_buffer;
+};
+
+
+
+template<typename ArgType, typename Device, template <class> class MakePointer_>
+struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
+{
+ typedef TensorEvalToOp<ArgType, MakePointer_> XprType;
+ typedef typename ArgType::Scalar Scalar;
+ typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
+ typedef typename XprType::Index Index;
+ typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+ typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+ enum {
+ IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = true,
+ PreferBlockAccess = false,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = true
+ };
+
+ static const int NumDims = internal::traits<ArgType>::NumDimensions;
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+ typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
+ ArgTensorBlock;
+
+ typedef internal::TensorBlockAssignment<
+ CoeffReturnType, NumDims, typename ArgTensorBlock::XprType, Index>
+ TensorBlockAssignment;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_impl(op.expression(), device), m_buffer(device.get(op.buffer())), m_expression(op.expression()){}
+
+
+ EIGEN_STRONG_INLINE ~TensorEvaluator() {
+ }
+
+
+ EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType scalar) {
+ EIGEN_UNUSED_VARIABLE(scalar);
+ eigen_assert(scalar == NULL);
+ return m_impl.evalSubExprsIfNeeded(m_buffer);
+ }
+
+#ifdef EIGEN_USE_THREADS
+ template <typename EvalSubExprsCallback>
+ EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+ EvaluatorPointerType scalar, EvalSubExprsCallback done) {
+ EIGEN_UNUSED_VARIABLE(scalar);
+ eigen_assert(scalar == NULL);
+ m_impl.evalSubExprsIfNeededAsync(m_buffer, std::move(done));
+ }
+#endif
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) {
+ m_buffer[i] = m_impl.coeff(i);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) {
+ internal::pstoret<CoeffReturnType, PacketReturnType, Aligned>(m_buffer + i, m_impl.template packet<TensorEvaluator<ArgType, Device>::IsAligned ? Aligned : Unaligned>(i));
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ internal::TensorBlockResourceRequirements getResourceRequirements() const {
+ return m_impl.getResourceRequirements();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(
+ TensorBlockDesc& desc, TensorBlockScratch& scratch) {
+ // Add `m_buffer` as destination buffer to the block descriptor.
+ desc.template AddDestinationBuffer<Layout>(
+ /*dst_base=*/m_buffer + desc.offset(),
+ /*dst_strides=*/internal::strides<Layout>(m_impl.dimensions()));
+
+ ArgTensorBlock block =
+ m_impl.block(desc, scratch, /*root_of_expr_ast=*/true);
+
+ // If block was evaluated into a destination buffer, there is no need to do
+ // an assignment.
+ if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) {
+ TensorBlockAssignment::Run(
+ TensorBlockAssignment::target(
+ desc.dimensions(), internal::strides<Layout>(m_impl.dimensions()),
+ m_buffer, desc.offset()),
+ block.expr());
+ }
+ block.cleanup();
+ }
+
+ EIGEN_STRONG_INLINE void cleanup() {
+ m_impl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ return m_buffer[index];
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ {
+ return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ // We assume that evalPacket or evalScalar is called to perform the
+ // assignment and account for the cost of the write here.
+ return m_impl.costPerCoeff(vectorized) +
+ TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize);
+ }
+
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_buffer; }
+ ArgType expression() const { return m_expression; }
+ #ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_impl.bind(cgh);
+ m_buffer.bind(cgh);
+ }
+ #endif
+
+
+ private:
+ TensorEvaluator<ArgType, Device> m_impl;
+ EvaluatorPointerType m_buffer;
+ const ArgType m_expression;
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorEvaluator.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorEvaluator.h
new file mode 100644
index 0000000..3aff7fa
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorEvaluator.h
@@ -0,0 +1,983 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H
+#define EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H
+
+namespace Eigen {
+
+/** \class TensorEvaluator
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief The tensor evaluator classes.
+ *
+ * These classes are responsible for the evaluation of the tensor expression.
+ *
+ * TODO: add support for more types of expressions, in particular expressions
+ * leading to lvalues (slicing, reshaping, etc...)
+ */
+
+// Generic evaluator
+template<typename Derived, typename Device>
+struct TensorEvaluator
+{
+ typedef typename Derived::Index Index;
+ typedef typename Derived::Scalar Scalar;
+ typedef typename Derived::Scalar CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ typedef typename Derived::Dimensions Dimensions;
+ typedef Derived XprType;
+ static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+ typedef typename internal::traits<Derived>::template MakePointer<Scalar>::Type TensorPointerType;
+ typedef StorageMemory<Scalar, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+
+ // NumDimensions is -1 for variable dim tensors
+ static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
+ internal::traits<Derived>::NumDimensions : 0;
+
+ enum {
+ IsAligned = Derived::IsAligned,
+ PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
+ BlockAccess = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
+ PreferBlockAccess = false,
+ Layout = Derived::Layout,
+ CoordAccess = NumCoords > 0,
+ RawAccess = true
+ };
+
+ typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+ typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumCoords,
+ Layout, Index>
+ TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
+ : m_data(device.get((const_cast<TensorPointerType>(m.data())))),
+ m_dims(m.dimensions()),
+ m_device(device)
+ { }
+
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType dest) {
+ if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization && dest) {
+ m_device.memcpy((void*)(m_device.get(dest)), m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar));
+ return false;
+ }
+ return true;
+ }
+
+#ifdef EIGEN_USE_THREADS
+ template <typename EvalSubExprsCallback>
+ EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+ EvaluatorPointerType dest, EvalSubExprsCallback done) {
+ // TODO(ezhulenev): ThreadPoolDevice memcpy is blockign operation.
+ done(evalSubExprsIfNeeded(dest));
+ }
+#endif // EIGEN_USE_THREADS
+
+ EIGEN_STRONG_INLINE void cleanup() {}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+ eigen_assert(m_data != NULL);
+ return m_data[index];
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) {
+ eigen_assert(m_data != NULL);
+ return m_data[index];
+ }
+
+ template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ PacketReturnType packet(Index index) const
+ {
+ return internal::ploadt<PacketReturnType, LoadMode>(m_data + index);
+ }
+
+ // Return a packet starting at `index` where `umask` specifies which elements
+ // have to be loaded. Type/size of mask depends on PacketReturnType, e.g. for
+ // Packet16f, `umask` is of type uint16_t and if a bit is 1, corresponding
+ // float element will be loaded, otherwise 0 will be loaded.
+ // Function has been templatized to enable Sfinae.
+ template <typename PacketReturnTypeT> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ typename internal::enable_if<internal::unpacket_traits<PacketReturnTypeT>::masked_load_available, PacketReturnTypeT>::type
+ partialPacket(Index index, typename internal::unpacket_traits<PacketReturnTypeT>::mask_t umask) const
+ {
+ return internal::ploadu<PacketReturnTypeT>(m_data + index, umask);
+ }
+
+ template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ void writePacket(Index index, const PacketReturnType& x)
+ {
+ return internal::pstoret<Scalar, PacketReturnType, StoreMode>(m_data + index, x);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<DenseIndex, NumCoords>& coords) const {
+ eigen_assert(m_data != NULL);
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ return m_data[m_dims.IndexOfColMajor(coords)];
+ } else {
+ return m_data[m_dims.IndexOfRowMajor(coords)];
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType&
+ coeffRef(const array<DenseIndex, NumCoords>& coords) {
+ eigen_assert(m_data != NULL);
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ return m_data[m_dims.IndexOfColMajor(coords)];
+ } else {
+ return m_data[m_dims.IndexOfRowMajor(coords)];
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
+ PacketType<CoeffReturnType, Device>::size);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ internal::TensorBlockResourceRequirements getResourceRequirements() const {
+ return internal::TensorBlockResourceRequirements::any();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+ block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+ bool /*root_of_expr_ast*/ = false) const {
+ assert(m_data != NULL);
+ return TensorBlock::materialize(m_data, m_dims, desc, scratch);
+ }
+
+ template<typename TensorBlock>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
+ const TensorBlockDesc& desc, const TensorBlock& block) {
+ assert(m_data != NULL);
+
+ typedef typename TensorBlock::XprType TensorBlockExpr;
+ typedef internal::TensorBlockAssignment<Scalar, NumCoords, TensorBlockExpr,
+ Index>
+ TensorBlockAssign;
+
+ TensorBlockAssign::Run(
+ TensorBlockAssign::target(desc.dimensions(),
+ internal::strides<Layout>(m_dims), m_data,
+ desc.offset()),
+ block.expr());
+ }
+
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
+
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_data.bind(cgh);
+ }
+#endif
+ protected:
+ EvaluatorPointerType m_data;
+ Dimensions m_dims;
+ const Device EIGEN_DEVICE_REF m_device;
+};
+
+namespace {
+template <typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T loadConstant(const T* address) {
+ return *address;
+}
+// Use the texture cache on CUDA devices whenever possible
+#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
+template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+float loadConstant(const float* address) {
+ return __ldg(address);
+}
+template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+double loadConstant(const double* address) {
+ return __ldg(address);
+}
+template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+Eigen::half loadConstant(const Eigen::half* address) {
+ return Eigen::half(half_impl::raw_uint16_to_half(__ldg(&address->x)));
+}
+#endif
+#ifdef EIGEN_USE_SYCL
+// overload of load constant should be implemented here based on range access
+template <cl::sycl::access::mode AcMd, typename T>
+T &loadConstant(const Eigen::TensorSycl::internal::RangeAccess<AcMd, T> &address) {
+ return *address;
+}
+#endif
+}
+
+
+// Default evaluator for rvalues
+template<typename Derived, typename Device>
+struct TensorEvaluator<const Derived, Device>
+{
+ typedef typename Derived::Index Index;
+ typedef typename Derived::Scalar Scalar;
+ typedef typename Derived::Scalar CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ typedef typename Derived::Dimensions Dimensions;
+ typedef const Derived XprType;
+ typedef typename internal::traits<Derived>::template MakePointer<const Scalar>::Type TensorPointerType;
+ typedef StorageMemory<const Scalar, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+
+ typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+ // NumDimensions is -1 for variable dim tensors
+ static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
+ internal::traits<Derived>::NumDimensions : 0;
+ static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+
+ enum {
+ IsAligned = Derived::IsAligned,
+ PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
+ BlockAccess = internal::is_arithmetic<ScalarNoConst>::value,
+ PreferBlockAccess = false,
+ Layout = Derived::Layout,
+ CoordAccess = NumCoords > 0,
+ RawAccess = true
+ };
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+ typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumCoords,
+ Layout, Index>
+ TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
+ : m_data(device.get(m.data())), m_dims(m.dimensions()), m_device(device)
+ { }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+ if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization && data) {
+ m_device.memcpy((void*)(m_device.get(data)),m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar));
+ return false;
+ }
+ return true;
+ }
+
+#ifdef EIGEN_USE_THREADS
+ template <typename EvalSubExprsCallback>
+ EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+ EvaluatorPointerType dest, EvalSubExprsCallback done) {
+ // TODO(ezhulenev): ThreadPoolDevice memcpy is a blockign operation.
+ done(evalSubExprsIfNeeded(dest));
+ }
+#endif // EIGEN_USE_THREADS
+
+ EIGEN_STRONG_INLINE void cleanup() { }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+ eigen_assert(m_data != NULL);
+ return loadConstant(m_data+index);
+ }
+
+ template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ PacketReturnType packet(Index index) const
+ {
+ return internal::ploadt_ro<PacketReturnType, LoadMode>(m_data + index);
+ }
+
+ // Return a packet starting at `index` where `umask` specifies which elements
+ // have to be loaded. Type/size of mask depends on PacketReturnType, e.g. for
+ // Packet16f, `umask` is of type uint16_t and if a bit is 1, corresponding
+ // float element will be loaded, otherwise 0 will be loaded.
+ // Function has been templatized to enable Sfinae.
+ template <typename PacketReturnTypeT> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ typename internal::enable_if<internal::unpacket_traits<PacketReturnTypeT>::masked_load_available, PacketReturnTypeT>::type
+ partialPacket(Index index, typename internal::unpacket_traits<PacketReturnTypeT>::mask_t umask) const
+ {
+ return internal::ploadu<PacketReturnTypeT>(m_data + index, umask);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<DenseIndex, NumCoords>& coords) const {
+ eigen_assert(m_data != NULL);
+ const Index index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_dims.IndexOfColMajor(coords)
+ : m_dims.IndexOfRowMajor(coords);
+ return loadConstant(m_data+index);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
+ PacketType<CoeffReturnType, Device>::size);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ internal::TensorBlockResourceRequirements getResourceRequirements() const {
+ return internal::TensorBlockResourceRequirements::any();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+ block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+ bool /*root_of_expr_ast*/ = false) const {
+ assert(m_data != NULL);
+ return TensorBlock::materialize(m_data, m_dims, desc, scratch);
+ }
+
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_data.bind(cgh);
+ }
+#endif
+ protected:
+ EvaluatorPointerType m_data;
+ Dimensions m_dims;
+ const Device EIGEN_DEVICE_REF m_device;
+};
+
+
+
+
+// -------------------- CwiseNullaryOp --------------------
+
+template<typename NullaryOp, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
+{
+ typedef TensorCwiseNullaryOp<NullaryOp, ArgType> XprType;
+
+ TensorEvaluator(const XprType& op, const Device& device)
+ : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper()
+ { }
+
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+ typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+
+ enum {
+ IsAligned = true,
+ PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess
+ #ifdef EIGEN_USE_SYCL
+ && (PacketType<CoeffReturnType, Device>::size >1)
+ #endif
+ ,
+ BlockAccess = false,
+ PreferBlockAccess = false,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
+ };
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { return true; }
+
+#ifdef EIGEN_USE_THREADS
+ template <typename EvalSubExprsCallback>
+ EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+ EvaluatorPointerType, EvalSubExprsCallback done) {
+ done(true);
+ }
+#endif // EIGEN_USE_THREADS
+
+ EIGEN_STRONG_INLINE void cleanup() { }
+
+ EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+ {
+ return m_wrapper(m_functor, index);
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ {
+ return m_wrapper.template packetOp<PacketReturnType, Index>(m_functor, index);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
+ PacketType<CoeffReturnType, Device>::size);
+ }
+
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_argImpl.bind(cgh);
+ }
+#endif
+
+ private:
+ const NullaryOp m_functor;
+ TensorEvaluator<ArgType, Device> m_argImpl;
+ const internal::nullary_wrapper<CoeffReturnType,NullaryOp> m_wrapper;
+};
+
+
+
+// -------------------- CwiseUnaryOp --------------------
+
+template<typename UnaryOp, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
+{
+ typedef TensorCwiseUnaryOp<UnaryOp, ArgType> XprType;
+
+ enum {
+ IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
+ PacketAccess = int(TensorEvaluator<ArgType, Device>::PacketAccess) &
+ int(internal::functor_traits<UnaryOp>::PacketAccess),
+ BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
+ PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
+ };
+
+ TensorEvaluator(const XprType& op, const Device& device)
+ : m_device(device),
+ m_functor(op.functor()),
+ m_argImpl(op.nestedExpression(), device)
+ { }
+
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+ typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+ typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+ static const int NumDims = internal::array_size<Dimensions>::value;
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+ typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
+ ArgTensorBlock;
+
+ typedef internal::TensorCwiseUnaryBlock<UnaryOp, ArgTensorBlock>
+ TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
+ m_argImpl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+
+#ifdef EIGEN_USE_THREADS
+ template <typename EvalSubExprsCallback>
+ EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+ EvaluatorPointerType, EvalSubExprsCallback done) {
+ m_argImpl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
+ }
+#endif // EIGEN_USE_THREADS
+
+ EIGEN_STRONG_INLINE void cleanup() {
+ m_argImpl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+ {
+ return m_functor(m_argImpl.coeff(index));
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ {
+ return m_functor.packetOp(m_argImpl.template packet<LoadMode>(index));
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ const double functor_cost = internal::functor_traits<UnaryOp>::Cost;
+ return m_argImpl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ internal::TensorBlockResourceRequirements getResourceRequirements() const {
+ static const double functor_cost = internal::functor_traits<UnaryOp>::Cost;
+ return m_argImpl.getResourceRequirements().addCostPerCoeff(
+ {0, 0, functor_cost / PacketSize});
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+ block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+ bool /*root_of_expr_ast*/ = false) const {
+ return TensorBlock(m_argImpl.block(desc, scratch), m_functor);
+ }
+
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const{
+ m_argImpl.bind(cgh);
+ }
+#endif
+
+
+ private:
+ const Device EIGEN_DEVICE_REF m_device;
+ const UnaryOp m_functor;
+ TensorEvaluator<ArgType, Device> m_argImpl;
+};
+
+
+// -------------------- CwiseBinaryOp --------------------
+
+template<typename BinaryOp, typename LeftArgType, typename RightArgType, typename Device>
+struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType>, Device>
+{
+ typedef TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType> XprType;
+
+ enum {
+ IsAligned = int(TensorEvaluator<LeftArgType, Device>::IsAligned) &
+ int(TensorEvaluator<RightArgType, Device>::IsAligned),
+ PacketAccess = int(TensorEvaluator<LeftArgType, Device>::PacketAccess) &
+ int(TensorEvaluator<RightArgType, Device>::PacketAccess) &
+ int(internal::functor_traits<BinaryOp>::PacketAccess),
+ BlockAccess = int(TensorEvaluator<LeftArgType, Device>::BlockAccess) &
+ int(TensorEvaluator<RightArgType, Device>::BlockAccess),
+ PreferBlockAccess = int(TensorEvaluator<LeftArgType, Device>::PreferBlockAccess) |
+ int(TensorEvaluator<RightArgType, Device>::PreferBlockAccess),
+ Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
+ };
+
+ TensorEvaluator(const XprType& op, const Device& device)
+ : m_device(device),
+ m_functor(op.functor()),
+ m_leftImpl(op.lhsExpression(), device),
+ m_rightImpl(op.rhsExpression(), device)
+ {
+ EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout) || internal::traits<XprType>::NumDimensions <= 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions()));
+ }
+
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+ typedef typename TensorEvaluator<LeftArgType, Device>::Dimensions Dimensions;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+
+ static const int NumDims = internal::array_size<
+ typename TensorEvaluator<LeftArgType, Device>::Dimensions>::value;
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+ typedef typename TensorEvaluator<const LeftArgType, Device>::TensorBlock
+ LeftTensorBlock;
+ typedef typename TensorEvaluator<const RightArgType, Device>::TensorBlock
+ RightTensorBlock;
+
+ typedef internal::TensorCwiseBinaryBlock<BinaryOp, LeftTensorBlock,
+ RightTensorBlock>
+ TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
+ {
+ // TODO: use right impl instead if right impl dimensions are known at compile time.
+ return m_leftImpl.dimensions();
+ }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
+ m_leftImpl.evalSubExprsIfNeeded(NULL);
+ m_rightImpl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+
+#ifdef EIGEN_USE_THREADS
+ template <typename EvalSubExprsCallback>
+ EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+ EvaluatorPointerType, EvalSubExprsCallback done) {
+ // TODO(ezhulenev): Evaluate two expression in parallel?
+ m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) {
+ m_rightImpl.evalSubExprsIfNeededAsync(nullptr,
+ [done](bool) { done(true); });
+ });
+ }
+#endif // EIGEN_USE_THREADS
+
+ EIGEN_STRONG_INLINE void cleanup() {
+ m_leftImpl.cleanup();
+ m_rightImpl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+ {
+ return m_functor(m_leftImpl.coeff(index), m_rightImpl.coeff(index));
+ }
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ {
+ return m_functor.packetOp(m_leftImpl.template packet<LoadMode>(index), m_rightImpl.template packet<LoadMode>(index));
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ const double functor_cost = internal::functor_traits<BinaryOp>::Cost;
+ return m_leftImpl.costPerCoeff(vectorized) +
+ m_rightImpl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ internal::TensorBlockResourceRequirements getResourceRequirements() const {
+ static const double functor_cost = internal::functor_traits<BinaryOp>::Cost;
+ return internal::TensorBlockResourceRequirements::merge(
+ m_leftImpl.getResourceRequirements(),
+ m_rightImpl.getResourceRequirements())
+ .addCostPerCoeff({0, 0, functor_cost / PacketSize});
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+ block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+ bool /*root_of_expr_ast*/ = false) const {
+ desc.DropDestinationBuffer();
+ return TensorBlock(m_leftImpl.block(desc, scratch),
+ m_rightImpl.block(desc, scratch), m_functor);
+ }
+
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+ #ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_leftImpl.bind(cgh);
+ m_rightImpl.bind(cgh);
+ }
+ #endif
+ private:
+ const Device EIGEN_DEVICE_REF m_device;
+ const BinaryOp m_functor;
+ TensorEvaluator<LeftArgType, Device> m_leftImpl;
+ TensorEvaluator<RightArgType, Device> m_rightImpl;
+};
+
+// -------------------- CwiseTernaryOp --------------------
+
+template<typename TernaryOp, typename Arg1Type, typename Arg2Type, typename Arg3Type, typename Device>
+struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, Arg3Type>, Device>
+{
+ typedef TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, Arg3Type> XprType;
+
+ enum {
+ IsAligned = TensorEvaluator<Arg1Type, Device>::IsAligned & TensorEvaluator<Arg2Type, Device>::IsAligned & TensorEvaluator<Arg3Type, Device>::IsAligned,
+ PacketAccess = TensorEvaluator<Arg1Type, Device>::PacketAccess &&
+ TensorEvaluator<Arg2Type, Device>::PacketAccess &&
+ TensorEvaluator<Arg3Type, Device>::PacketAccess &&
+ internal::functor_traits<TernaryOp>::PacketAccess,
+ BlockAccess = false,
+ PreferBlockAccess = TensorEvaluator<Arg1Type, Device>::PreferBlockAccess ||
+ TensorEvaluator<Arg2Type, Device>::PreferBlockAccess ||
+ TensorEvaluator<Arg3Type, Device>::PreferBlockAccess,
+ Layout = TensorEvaluator<Arg1Type, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
+ };
+
+ TensorEvaluator(const XprType& op, const Device& device)
+ : m_functor(op.functor()),
+ m_arg1Impl(op.arg1Expression(), device),
+ m_arg2Impl(op.arg2Expression(), device),
+ m_arg3Impl(op.arg3Expression(), device)
+ {
+ EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<Arg1Type, Device>::Layout) == static_cast<int>(TensorEvaluator<Arg3Type, Device>::Layout) || internal::traits<XprType>::NumDimensions <= 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+ EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::StorageKind,
+ typename internal::traits<Arg2Type>::StorageKind>::value),
+ STORAGE_KIND_MUST_MATCH)
+ EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::StorageKind,
+ typename internal::traits<Arg3Type>::StorageKind>::value),
+ STORAGE_KIND_MUST_MATCH)
+ EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::Index,
+ typename internal::traits<Arg2Type>::Index>::value),
+ STORAGE_INDEX_MUST_MATCH)
+ EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::Index,
+ typename internal::traits<Arg3Type>::Index>::value),
+ STORAGE_INDEX_MUST_MATCH)
+
+ eigen_assert(dimensions_match(m_arg1Impl.dimensions(), m_arg2Impl.dimensions()) && dimensions_match(m_arg1Impl.dimensions(), m_arg3Impl.dimensions()));
+ }
+
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+ typedef typename TensorEvaluator<Arg1Type, Device>::Dimensions Dimensions;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
+ {
+ // TODO: use arg2 or arg3 dimensions if they are known at compile time.
+ return m_arg1Impl.dimensions();
+ }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
+ m_arg1Impl.evalSubExprsIfNeeded(NULL);
+ m_arg2Impl.evalSubExprsIfNeeded(NULL);
+ m_arg3Impl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+ EIGEN_STRONG_INLINE void cleanup() {
+ m_arg1Impl.cleanup();
+ m_arg2Impl.cleanup();
+ m_arg3Impl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+ {
+ return m_functor(m_arg1Impl.coeff(index), m_arg2Impl.coeff(index), m_arg3Impl.coeff(index));
+ }
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ {
+ return m_functor.packetOp(m_arg1Impl.template packet<LoadMode>(index),
+ m_arg2Impl.template packet<LoadMode>(index),
+ m_arg3Impl.template packet<LoadMode>(index));
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ const double functor_cost = internal::functor_traits<TernaryOp>::Cost;
+ return m_arg1Impl.costPerCoeff(vectorized) +
+ m_arg2Impl.costPerCoeff(vectorized) +
+ m_arg3Impl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
+ }
+
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_arg1Impl.bind(cgh);
+ m_arg2Impl.bind(cgh);
+ m_arg3Impl.bind(cgh);
+ }
+#endif
+
+ private:
+ const TernaryOp m_functor;
+ TensorEvaluator<Arg1Type, Device> m_arg1Impl;
+ TensorEvaluator<Arg2Type, Device> m_arg2Impl;
+ TensorEvaluator<Arg3Type, Device> m_arg3Impl;
+};
+
+
+// -------------------- SelectOp --------------------
+
+template<typename IfArgType, typename ThenArgType, typename ElseArgType, typename Device>
+struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>, Device>
+{
+ typedef TensorSelectOp<IfArgType, ThenArgType, ElseArgType> XprType;
+ typedef typename XprType::Scalar Scalar;
+
+ enum {
+ IsAligned = TensorEvaluator<ThenArgType, Device>::IsAligned &
+ TensorEvaluator<ElseArgType, Device>::IsAligned,
+ PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess &
+ TensorEvaluator<ElseArgType, Device>::PacketAccess &
+ PacketType<Scalar, Device>::HasBlend,
+ BlockAccess = TensorEvaluator<IfArgType, Device>::BlockAccess &&
+ TensorEvaluator<ThenArgType, Device>::BlockAccess &&
+ TensorEvaluator<ElseArgType, Device>::BlockAccess,
+ PreferBlockAccess = TensorEvaluator<IfArgType, Device>::PreferBlockAccess ||
+ TensorEvaluator<ThenArgType, Device>::PreferBlockAccess ||
+ TensorEvaluator<ElseArgType, Device>::PreferBlockAccess,
+ Layout = TensorEvaluator<IfArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
+ };
+
+ TensorEvaluator(const XprType& op, const Device& device)
+ : m_condImpl(op.ifExpression(), device),
+ m_thenImpl(op.thenExpression(), device),
+ m_elseImpl(op.elseExpression(), device)
+ {
+ EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<IfArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<ThenArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<IfArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<ElseArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ eigen_assert(dimensions_match(m_condImpl.dimensions(), m_thenImpl.dimensions()));
+ eigen_assert(dimensions_match(m_thenImpl.dimensions(), m_elseImpl.dimensions()));
+ }
+
+ typedef typename XprType::Index Index;
+ typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+ typedef typename TensorEvaluator<IfArgType, Device>::Dimensions Dimensions;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+
+ static const int NumDims = internal::array_size<Dimensions>::value;
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+ typedef typename TensorEvaluator<const IfArgType, Device>::TensorBlock
+ IfArgTensorBlock;
+ typedef typename TensorEvaluator<const ThenArgType, Device>::TensorBlock
+ ThenArgTensorBlock;
+ typedef typename TensorEvaluator<const ElseArgType, Device>::TensorBlock
+ ElseArgTensorBlock;
+
+ struct TensorSelectOpBlockFactory {
+ template <typename IfArgXprType, typename ThenArgXprType, typename ElseArgXprType>
+ struct XprType {
+ typedef TensorSelectOp<const IfArgXprType, const ThenArgXprType, const ElseArgXprType> type;
+ };
+
+ template <typename IfArgXprType, typename ThenArgXprType, typename ElseArgXprType>
+ typename XprType<IfArgXprType, ThenArgXprType, ElseArgXprType>::type expr(
+ const IfArgXprType& if_expr, const ThenArgXprType& then_expr, const ElseArgXprType& else_expr) const {
+ return typename XprType<IfArgXprType, ThenArgXprType, ElseArgXprType>::type(if_expr, then_expr, else_expr);
+ }
+ };
+
+ typedef internal::TensorTernaryExprBlock<TensorSelectOpBlockFactory,
+ IfArgTensorBlock, ThenArgTensorBlock,
+ ElseArgTensorBlock>
+ TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
+ {
+ // TODO: use then or else impl instead if they happen to be known at compile time.
+ return m_condImpl.dimensions();
+ }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
+ m_condImpl.evalSubExprsIfNeeded(NULL);
+ m_thenImpl.evalSubExprsIfNeeded(NULL);
+ m_elseImpl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+
+#ifdef EIGEN_USE_THREADS
+ template <typename EvalSubExprsCallback>
+ EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+ EvaluatorPointerType, EvalSubExprsCallback done) {
+ m_condImpl.evalSubExprsIfNeeded(nullptr, [this, done](bool) {
+ m_thenImpl.evalSubExprsIfNeeded(nullptr, [this, done](bool) {
+ m_elseImpl.evalSubExprsIfNeeded(nullptr, [done](bool) { done(true); });
+ });
+ });
+ }
+#endif // EIGEN_USE_THREADS
+
+ EIGEN_STRONG_INLINE void cleanup() {
+ m_condImpl.cleanup();
+ m_thenImpl.cleanup();
+ m_elseImpl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+ {
+ return m_condImpl.coeff(index) ? m_thenImpl.coeff(index) : m_elseImpl.coeff(index);
+ }
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
+ {
+ internal::Selector<PacketSize> select;
+ EIGEN_UNROLL_LOOP
+ for (Index i = 0; i < PacketSize; ++i) {
+ select.select[i] = m_condImpl.coeff(index+i);
+ }
+ return internal::pblend(select,
+ m_thenImpl.template packet<LoadMode>(index),
+ m_elseImpl.template packet<LoadMode>(index));
+
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ return m_condImpl.costPerCoeff(vectorized) +
+ m_thenImpl.costPerCoeff(vectorized)
+ .cwiseMax(m_elseImpl.costPerCoeff(vectorized));
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ internal::TensorBlockResourceRequirements getResourceRequirements() const {
+ auto then_req = m_thenImpl.getResourceRequirements();
+ auto else_req = m_elseImpl.getResourceRequirements();
+
+ auto merged_req =
+ internal::TensorBlockResourceRequirements::merge(then_req, else_req);
+ merged_req.cost_per_coeff =
+ then_req.cost_per_coeff.cwiseMax(else_req.cost_per_coeff);
+
+ return internal::TensorBlockResourceRequirements::merge(
+ m_condImpl.getResourceRequirements(), merged_req);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+ block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+ bool /*root_of_expr_ast*/ = false) const {
+ // It's unsafe to pass destination buffer to underlying expressions, because
+ // output might be aliased with one of the inputs.
+ desc.DropDestinationBuffer();
+
+ return TensorBlock(
+ m_condImpl.block(desc, scratch), m_thenImpl.block(desc, scratch),
+ m_elseImpl.block(desc, scratch), TensorSelectOpBlockFactory());
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_condImpl.bind(cgh);
+ m_thenImpl.bind(cgh);
+ m_elseImpl.bind(cgh);
+ }
+#endif
+ private:
+ TensorEvaluator<IfArgType, Device> m_condImpl;
+ TensorEvaluator<ThenArgType, Device> m_thenImpl;
+ TensorEvaluator<ElseArgType, Device> m_elseImpl;
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorExecutor.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorExecutor.h
new file mode 100644
index 0000000..c52fb77
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorExecutor.h
@@ -0,0 +1,703 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
+#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
+
+namespace Eigen {
+
+/**
+ * \class TensorExecutor
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief The tensor executor class.
+ *
+ * This class is responsible for launch the evaluation of the expression on
+ * the specified computing device.
+ *
+ * @tparam Vectorizable can use packet math (SSE/AVX/etc... registers and
+ * instructions)
+ * @tparam Tiling can use block based tensor evaluation
+ * (see TensorBlock.h)
+ */
+namespace internal {
+
+/**
+ * Evaluating TensorBroadcastingOp via coefficient of packet path is extremely
+ * expensive. If expression has at least one broadcast op in it, and it supports
+ * block based evaluation, we always prefer it, even for the small tensors. For
+ * all other tileable ops, block evaluation overhead for small tensors (fits
+ * into L1) is too large, and we fallback on vectorized evaluation.
+ */
+
+// TODO(ezhulenev): Add specializations for all other types of Tensor ops.
+
+template<typename Expression>
+struct ExpressionHasTensorBroadcastingOp {
+ enum { value = false };
+};
+
+template<typename LhsXprType, typename RhsXprType>
+struct ExpressionHasTensorBroadcastingOp<
+ const TensorAssignOp<LhsXprType, RhsXprType> > {
+ enum { value = ExpressionHasTensorBroadcastingOp<RhsXprType>::value };
+};
+
+template<typename UnaryOp, typename XprType>
+struct ExpressionHasTensorBroadcastingOp<
+ const TensorCwiseUnaryOp<UnaryOp, XprType> > {
+ enum { value = ExpressionHasTensorBroadcastingOp<XprType>::value };
+};
+
+template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
+struct ExpressionHasTensorBroadcastingOp<
+ const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> > {
+ enum {
+ value = ExpressionHasTensorBroadcastingOp<LhsXprType>::value ||
+ ExpressionHasTensorBroadcastingOp<RhsXprType>::value
+ };
+};
+
+template<typename Broadcast, typename XprType>
+struct ExpressionHasTensorBroadcastingOp<
+ const TensorBroadcastingOp<Broadcast, XprType> > {
+ enum { value = true };
+};
+
+// -------------------------------------------------------------------------- //
+
+/**
+ * Default strategy: the expression is evaluated sequentially with a single cpu
+ * thread, without vectorization and block evaluation.
+ */
+template <typename Expression, typename Device, bool Vectorizable,
+ TiledEvaluation Tiling>
+class TensorExecutor {
+ public:
+ typedef typename Expression::Index StorageIndex;
+
+ // Including `unsupported/Eigen/CXX11/Tensor` in different translation units
+ // with/without `EIGEN_USE_THREADS` or `EIGEN_USE_GPU` is a potential ODR
+ // violation. If this template is instantiated with a non-default device, it
+ // means that this header file was included without defining
+ // `EIGEN_USE_THREADS`, `EIGEN_USE_GPU` or `EIGEN_USE_SYCL`.
+ static_assert(std::is_same<Device, DefaultDevice>::value,
+ "Default executor instantiated with non-default device. "
+ "You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or "
+ "EIGEN_USE_SYCL before including Eigen headers.");
+
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE void run(const Expression& expr,
+ const Device& device = Device()) {
+ TensorEvaluator<Expression, Device> evaluator(expr, device);
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+ if (needs_assign) {
+ const StorageIndex size = array_prod(evaluator.dimensions());
+ for (StorageIndex i = 0; i < size; ++i) {
+ evaluator.evalScalar(i);
+ }
+ }
+ evaluator.cleanup();
+ }
+};
+
+/**
+ * Default async execution strategy is not implemented. Currently it's only
+ * available for ThreadPoolDevice (see definition below).
+ */
+template <typename Expression, typename Device, typename DoneCallback,
+ bool Vectorizable, TiledEvaluation Tiling>
+class TensorAsyncExecutor {};
+
+/**
+ * Process all the data with a single cpu thread, using vectorized instructions.
+ */
+template <typename Expression>
+class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
+ /*Tiling=*/TiledEvaluation::Off> {
+ public:
+ typedef typename Expression::Index StorageIndex;
+
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE void run(
+ const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
+ TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+ if (needs_assign) {
+ const StorageIndex size = array_prod(evaluator.dimensions());
+ const int PacketSize = unpacket_traits<typename TensorEvaluator<
+ Expression, DefaultDevice>::PacketReturnType>::size;
+
+ // Give compiler a strong possibility to unroll the loop. But don't insist
+ // on unrolling, because if the function is expensive compiler should not
+ // unroll the loop at the expense of inlining.
+ const StorageIndex UnrolledSize =
+ (size / (4 * PacketSize)) * 4 * PacketSize;
+ for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) {
+ for (StorageIndex j = 0; j < 4; j++) {
+ evaluator.evalPacket(i + j * PacketSize);
+ }
+ }
+ const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize;
+ for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
+ evaluator.evalPacket(i);
+ }
+ for (StorageIndex i = VectorizedSize; i < size; ++i) {
+ evaluator.evalScalar(i);
+ }
+ }
+ evaluator.cleanup();
+ }
+};
+
+/**
+ * Process all the data with a single cpu thread, using blocks of data. By
+ * sizing a block to fit L1 cache we get better cache performance.
+ */
+template <typename Expression, bool Vectorizable>
+class TensorExecutor<Expression, DefaultDevice, Vectorizable,
+ /*Tiling=*/TiledEvaluation::On> {
+ public:
+ typedef typename traits<Expression>::Scalar Scalar;
+ typedef typename remove_const<Scalar>::type ScalarNoConst;
+
+ typedef TensorEvaluator<Expression, DefaultDevice> Evaluator;
+ typedef typename traits<Expression>::Index StorageIndex;
+
+ static const int NumDims = traits<Expression>::NumDimensions;
+
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE void run(const Expression& expr,
+ const DefaultDevice& device = DefaultDevice()) {
+ typedef TensorBlockMapper<NumDims, Evaluator::Layout, StorageIndex>
+ TensorBlockMapper;
+
+ typedef internal::TensorBlockDescriptor<NumDims, StorageIndex>
+ TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<DefaultDevice>
+ TensorBlockScratch;
+
+ Evaluator evaluator(expr, device);
+
+ // TODO(ezhulenev): Do not use tiling for small tensors?
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+
+ if (needs_assign) {
+ // Query expression tree for desired block size/shape.
+ const TensorBlockResourceRequirements requirements =
+ evaluator.getResourceRequirements();
+
+ const TensorBlockMapper block_mapper(
+ typename TensorBlockDesc::Dimensions(evaluator.dimensions()),
+ requirements);
+
+ // Share scratch memory allocator between all blocks.
+ TensorBlockScratch scratch(device);
+
+ const StorageIndex total_block_count = block_mapper.blockCount();
+ for (StorageIndex i = 0; i < total_block_count; ++i) {
+ TensorBlockDesc desc = block_mapper.blockDescriptor(i);
+ evaluator.evalBlock(desc, scratch);
+ scratch.reset();
+ }
+ }
+ evaluator.cleanup();
+ }
+};
+
+/**
+ * Multicore strategy: the index space is partitioned and each partition is
+ * executed on a single core.
+ *
+ * (1) TensorExecutor will submit work to the ThreadPoolDevice managed thread
+ * pool, and will block the caller thread until all tasks are finished.
+ *
+ * (2) TensorAsyncExecutor is a non-blocking version, that will submit work to
+ * the ThreadPoolDevice managed thread pool, and will return immediately.
+ * It will call 'done' callback after all tasks are finished.
+ */
+#ifdef EIGEN_USE_THREADS
+
+template <typename TensorBlockMapper>
+struct TensorExecutorTilingContext {
+ TensorExecutorTilingContext() = default;
+ TensorExecutorTilingContext(const TensorBlockMapper& b_mapper,
+ const TensorOpCost& b_cost, size_t b_aligned_size)
+ : block_mapper(b_mapper),
+ cost(b_cost),
+ aligned_blocksize(b_aligned_size) {}
+
+ TensorBlockMapper block_mapper; // navigate through blocks
+ TensorOpCost cost; // cost of computing a single block
+ size_t aligned_blocksize; // block size after memory alignment
+};
+
+// Computes a block evaluation parameters, and allocates temporary memory buffer
+// for blocks. See TensorExecutor/TensorAsyncExecutor (Tiling=On) below.
+template <typename Evaluator, typename TensorBlockMapper, bool Vectorizable>
+TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
+ const Evaluator& evaluator) {
+ // Query expression tree for desired block size/shape.
+ TensorBlockResourceRequirements requirements =
+ evaluator.getResourceRequirements();
+
+ // Update target block size based on cost model.
+ double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(
+ 1, requirements.cost_per_coeff);
+ requirements.size = static_cast<size_t>(1.0 / taskSize);
+
+ TensorBlockMapper block_mapper(
+ typename TensorBlockMapper::Dimensions(evaluator.dimensions()),
+ requirements);
+
+ size_t block_size = block_mapper.blockTotalSize();
+ const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
+ const size_t aligned_blocksize =
+ align *
+ divup<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);
+
+ return {block_mapper, requirements.cost_per_coeff * block_size,
+ aligned_blocksize};
+}
+
+template <typename Evaluator, typename StorageIndex, bool Vectorizable>
+struct EvalRange {
+ static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
+ const StorageIndex lastIdx) {
+ Evaluator evaluator = *evaluator_in;
+ eigen_assert(lastIdx >= firstIdx);
+ for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
+ evaluator.evalScalar(i);
+ }
+ }
+
+ static StorageIndex alignBlockSize(StorageIndex size) { return size; }
+};
+
+template <typename Evaluator, typename StorageIndex>
+struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> {
+ static const int PacketSize =
+ unpacket_traits<typename Evaluator::PacketReturnType>::size;
+
+ static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
+ const StorageIndex lastIdx) {
+ Evaluator evaluator = *evaluator_in;
+ eigen_assert(lastIdx >= firstIdx);
+ StorageIndex i = firstIdx;
+ if (lastIdx - firstIdx >= PacketSize) {
+ eigen_assert(firstIdx % PacketSize == 0);
+ StorageIndex last_chunk_offset = lastIdx - 4 * PacketSize;
+ // Give compiler a strong possibility to unroll the loop. But don't insist
+ // on unrolling, because if the function is expensive compiler should not
+ // unroll the loop at the expense of inlining.
+ for (; i <= last_chunk_offset; i += 4 * PacketSize) {
+ for (StorageIndex j = 0; j < 4; j++) {
+ evaluator.evalPacket(i + j * PacketSize);
+ }
+ }
+ last_chunk_offset = lastIdx - PacketSize;
+ for (; i <= last_chunk_offset; i += PacketSize) {
+ evaluator.evalPacket(i);
+ }
+ }
+ for (; i < lastIdx; ++i) {
+ evaluator.evalScalar(i);
+ }
+ }
+
+ static StorageIndex alignBlockSize(StorageIndex size) {
+ // Align block size to packet size and account for unrolling in run above.
+ if (size >= 16 * PacketSize) {
+ return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
+ }
+ // Aligning to 4 * PacketSize would increase block size by more than 25%.
+ return (size + PacketSize - 1) & ~(PacketSize - 1);
+ }
+};
+
+template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tiling> {
+ public:
+ typedef typename Expression::Index StorageIndex;
+
+ static EIGEN_STRONG_INLINE void run(const Expression& expr,
+ const ThreadPoolDevice& device) {
+ typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
+ typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
+
+ Evaluator evaluator(expr, device);
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
+ if (needs_assign) {
+ const StorageIndex size = array_prod(evaluator.dimensions());
+ device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
+ EvalRange::alignBlockSize,
+ [&evaluator](StorageIndex firstIdx, StorageIndex lastIdx) {
+ EvalRange::run(&evaluator, firstIdx, lastIdx);
+ });
+ }
+ evaluator.cleanup();
+ }
+};
+
+template <typename Expression, bool Vectorizable>
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
+ /*Tiling=*/TiledEvaluation::On> {
+ public:
+ typedef typename traits<Expression>::Index IndexType;
+ typedef typename traits<Expression>::Scalar Scalar;
+ typedef typename remove_const<Scalar>::type ScalarNoConst;
+
+ static const int NumDims = traits<Expression>::NumDimensions;
+
+ typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
+ typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
+ typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
+
+ typedef internal::TensorBlockDescriptor<NumDims, IndexType>
+ TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
+ TensorBlockScratch;
+
+ static EIGEN_STRONG_INLINE void run(const Expression& expr,
+ const ThreadPoolDevice& device) {
+ Evaluator evaluator(expr, device);
+
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
+ if (needs_assign) {
+ const TilingContext tiling =
+ internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper,
+ Vectorizable>(evaluator);
+
+ auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx,
+ IndexType lastBlockIdx) {
+ TensorBlockScratch scratch(device);
+
+ for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
+ ++block_idx) {
+ TensorBlockDesc desc = tiling.block_mapper.blockDescriptor(block_idx);
+ evaluator.evalBlock(desc, scratch);
+ scratch.reset();
+ }
+ };
+
+ // Evaluate small expressions directly as a single block.
+ if (tiling.block_mapper.blockCount() == 1) {
+ TensorBlockScratch scratch(device);
+ TensorBlockDesc desc(0, tiling.block_mapper.blockDimensions());
+ evaluator.evalBlock(desc, scratch);
+ } else {
+ device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost,
+ eval_block);
+ }
+ }
+ evaluator.cleanup();
+ }
+};
+
+template <typename Expression, typename DoneCallback, bool Vectorizable,
+ TiledEvaluation Tiling>
+class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
+ Vectorizable, Tiling> {
+ public:
+ typedef typename Expression::Index StorageIndex;
+ typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
+
+ static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
+ const ThreadPoolDevice& device,
+ DoneCallback done) {
+ TensorAsyncExecutorContext* const ctx =
+ new TensorAsyncExecutorContext(expr, device, std::move(done));
+
+ const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void {
+ if (!need_assign) {
+ delete ctx;
+ return;
+ }
+
+ typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
+ const StorageIndex size = array_prod(ctx->evaluator.dimensions());
+ device.parallelForAsync(
+ size, ctx->evaluator.costPerCoeff(Vectorizable),
+ EvalRange::alignBlockSize,
+ [ctx](StorageIndex firstIdx, StorageIndex lastIdx) {
+ EvalRange::run(&ctx->evaluator, firstIdx, lastIdx);
+ },
+ [ctx]() { delete ctx; });
+ };
+
+ ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
+ }
+
+ private:
+ struct TensorAsyncExecutorContext {
+ TensorAsyncExecutorContext(const Expression& expr,
+ const ThreadPoolDevice& thread_pool,
+ DoneCallback done)
+ : evaluator(expr, thread_pool), on_done(std::move(done)) {}
+
+ ~TensorAsyncExecutorContext() {
+ evaluator.cleanup();
+ on_done();
+ }
+
+ Evaluator evaluator;
+
+ private:
+ DoneCallback on_done;
+ };
+};
+
+template <typename Expression, typename DoneCallback, bool Vectorizable>
+class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
+ Vectorizable, /*Tileable*/ TiledEvaluation::On> {
+ public:
+ typedef typename traits<Expression>::Index IndexType;
+ typedef typename traits<Expression>::Scalar Scalar;
+ typedef typename remove_const<Scalar>::type ScalarNoConst;
+
+ static const int NumDims = traits<Expression>::NumDimensions;
+
+ typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
+ typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
+ typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
+
+ typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
+ TensorBlockScratch;
+
+ static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
+ const ThreadPoolDevice& device,
+ DoneCallback done) {
+
+ TensorAsyncExecutorContext* const ctx =
+ new TensorAsyncExecutorContext(expr, device, std::move(done));
+
+ const auto on_eval_subexprs = [ctx](bool need_assign) -> void {
+ if (!need_assign) {
+ delete ctx;
+ return;
+ }
+
+ ctx->tiling = internal::GetTensorExecutorTilingContext<
+ Evaluator, BlockMapper, Vectorizable>(ctx->evaluator);
+
+ auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) {
+ TensorBlockScratch scratch(ctx->device);
+
+ for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
+ ++block_idx) {
+ TensorBlockDesc desc =
+ ctx->tiling.block_mapper.blockDescriptor(block_idx);
+ ctx->evaluator.evalBlock(desc, scratch);
+ scratch.reset();
+ }
+ };
+
+ // Evaluate small expressions directly as a single block.
+ if (ctx->tiling.block_mapper.blockCount() == 1) {
+ TensorBlockScratch scratch(ctx->device);
+ TensorBlockDesc desc(0, ctx->tiling.block_mapper.blockDimensions());
+ ctx->evaluator.evalBlock(desc, scratch);
+ delete ctx;
+ } else {
+ ctx->device.parallelForAsync(ctx->tiling.block_mapper.blockCount(),
+ ctx->tiling.cost, eval_block,
+ [ctx]() { delete ctx; });
+ }
+ };
+
+ ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
+ }
+
+ private:
+ struct TensorAsyncExecutorContext {
+ TensorAsyncExecutorContext(const Expression& expr,
+ const ThreadPoolDevice& thread_pool,
+ DoneCallback done)
+ : device(thread_pool),
+ evaluator(expr, thread_pool),
+ on_done(std::move(done)) {}
+
+ ~TensorAsyncExecutorContext() {
+ evaluator.cleanup();
+ on_done();
+ }
+
+ const ThreadPoolDevice& device;
+ Evaluator evaluator;
+ TilingContext tiling;
+
+ private:
+ DoneCallback on_done;
+ };
+};
+
+#endif // EIGEN_USE_THREADS
+
+// GPU: the evaluation of the expression is offloaded to a GPU.
+#if defined(EIGEN_USE_GPU)
+
+template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
+class TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling> {
+ public:
+ typedef typename Expression::Index StorageIndex;
+ static void run(const Expression& expr, const GpuDevice& device);
+};
+
+#if defined(EIGEN_GPUCC)
+template <typename Evaluator, typename StorageIndex, bool Vectorizable>
+struct EigenMetaKernelEval {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+ void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
+ for (StorageIndex i = firstIdx; i < lastIdx; i += step_size) {
+ eval.evalScalar(i);
+ }
+ }
+};
+
+template <typename Evaluator, typename StorageIndex>
+struct EigenMetaKernelEval<Evaluator, StorageIndex, true> {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+ void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
+ const StorageIndex PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+ const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize;
+ const StorageIndex vectorized_step_size = step_size * PacketSize;
+
+ // Use the vector path
+ for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size;
+ i += vectorized_step_size) {
+ eval.evalPacket(i);
+ }
+ for (StorageIndex i = vectorized_size + firstIdx; i < lastIdx; i += step_size) {
+ eval.evalScalar(i);
+ }
+ }
+};
+
+template <typename Evaluator, typename StorageIndex>
+__global__ void
+__launch_bounds__(1024)
+EigenMetaKernel(Evaluator eval, StorageIndex size) {
+
+ const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x;
+ const StorageIndex step_size = blockDim.x * gridDim.x;
+
+ const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
+ EigenMetaKernelEval<Evaluator, StorageIndex, vectorizable>::run(eval, first_index, size, step_size);
+}
+
+/*static*/
+template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
+EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling>::run(
+ const Expression& expr, const GpuDevice& device) {
+ TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
+ if (needs_assign) {
+
+ const int block_size = device.maxGpuThreadsPerBlock();
+ const int max_blocks = device.getNumGpuMultiProcessors() *
+ device.maxGpuThreadsPerMultiProcessor() / block_size;
+ const StorageIndex size = array_prod(evaluator.dimensions());
+ // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
+ const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
+
+ LAUNCH_GPU_KERNEL(
+ (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),
+ num_blocks, block_size, 0, device, evaluator, size);
+ }
+ evaluator.cleanup();
+}
+
+#endif // EIGEN_GPUCC
+#endif // EIGEN_USE_GPU
+
+// SYCL Executor policy
+#ifdef EIGEN_USE_SYCL
+
+template <typename Evaluator>
+struct ExecExprFunctorKernel {
+ typedef typename Evaluator::Index Index;
+ Evaluator evaluator;
+ const Index range;
+ template <typename Scratch>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel(
+ const Scratch, Evaluator evaluator_, const Index range_)
+ : evaluator(evaluator_), range(range_) {}
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()(
+ cl::sycl::nd_item<1> itemID) {
+ compute(itemID);
+ }
+ template <bool is_vec = Evaluator::PacketAccess>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<!is_vec>::type
+ compute(const cl::sycl::nd_item<1>& itemID) {
+ Index gId = static_cast<Index>(itemID.get_global_linear_id());
+ Index total_threads = itemID.get_global_range(0);
+
+ for (Index i = gId; i < range; i += total_threads) {
+ evaluator.evalScalar(i);
+ }
+ }
+ template <bool is_vec = Evaluator::PacketAccess>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<is_vec>::type
+ compute(const cl::sycl::nd_item<1>& itemID) {
+ const Index vectorizedRange =
+ (range / Evaluator::PacketSize) * Evaluator::PacketSize;
+ Index gId = static_cast<Index>(itemID.get_global_linear_id());
+ const Index step = Evaluator::PacketSize * itemID.get_global_range(0);
+ const Index start = Evaluator::PacketSize * gId;
+ for (Index i = start; i < vectorizedRange; i += step) {
+ evaluator.evalPacket(i);
+ }
+ gId += vectorizedRange;
+ for (Index i = gId; i < range; i += itemID.get_global_range(0)) {
+ evaluator.evalScalar(i);
+ }
+ }
+};
+
+template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
+class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
+ public:
+ typedef typename Expression::Index Index;
+ static EIGEN_STRONG_INLINE void run(const Expression& expr,
+ const Eigen::SyclDevice& dev) {
+ typedef Eigen::TensorEvaluator<Expression, Eigen::SyclDevice> Evaluator;
+ Evaluator evaluator(expr, dev);
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+ if (needs_assign) {
+ Index range, GRange, tileSize;
+ Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions());
+ total_size = (total_size == 0) ? 1 : total_size;
+ const int PacketSize =
+ Eigen::PacketType<typename Evaluator::CoeffReturnType,
+ Eigen::SyclDevice>::size;
+ Index vectorizable_threads = static_cast<Index>(total_size / PacketSize);
+ dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange);
+ range = total_size;
+
+ dev.template nullary_kernel_launcher<
+ typename Evaluator::CoeffReturnType,
+ ExecExprFunctorKernel<Evaluator> >(
+ evaluator,
+ cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
+ cl::sycl::range<1>(tileSize)),
+ Index(1), range);
+ }
+ evaluator.cleanup();
+ }
+};
+
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorExpr.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorExpr.h
new file mode 100644
index 0000000..c9bccfc
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorExpr.h
@@ -0,0 +1,388 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXPR_H
+#define EIGEN_CXX11_TENSOR_TENSOR_EXPR_H
+
+namespace Eigen {
+
+/** \class TensorExpr
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor expression classes.
+ *
+ * The TensorCwiseNullaryOp class applies a nullary operators to an expression.
+ * This is typically used to generate constants.
+ *
+ * The TensorCwiseUnaryOp class represents an expression where a unary operator
+ * (e.g. cwiseSqrt) is applied to an expression.
+ *
+ * The TensorCwiseBinaryOp class represents an expression where a binary
+ * operator (e.g. addition) is applied to a lhs and a rhs expression.
+ *
+ */
+namespace internal {
+template<typename NullaryOp, typename XprType>
+struct traits<TensorCwiseNullaryOp<NullaryOp, XprType> >
+ : traits<XprType>
+{
+ typedef traits<XprType> XprTraits;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::Nested XprTypeNested;
+ typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
+ static const int NumDimensions = XprTraits::NumDimensions;
+ static const int Layout = XprTraits::Layout;
+ typedef typename XprTraits::PointerType PointerType;
+ enum {
+ Flags = 0
+ };
+};
+
+} // end namespace internal
+
+
+
+template<typename NullaryOp, typename XprType>
+class TensorCwiseNullaryOp : public TensorBase<TensorCwiseNullaryOp<NullaryOp, XprType>, ReadOnlyAccessors>
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::Scalar Scalar;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef TensorCwiseNullaryOp<NullaryOp, XprType> Nested;
+ typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseNullaryOp(const XprType& xpr, const NullaryOp& func = NullaryOp())
+ : m_xpr(xpr), m_functor(func) {}
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ nestedExpression() const { return m_xpr; }
+
+ EIGEN_DEVICE_FUNC
+ const NullaryOp& functor() const { return m_functor; }
+
+ protected:
+ typename XprType::Nested m_xpr;
+ const NullaryOp m_functor;
+};
+
+
+
+namespace internal {
+template<typename UnaryOp, typename XprType>
+struct traits<TensorCwiseUnaryOp<UnaryOp, XprType> >
+ : traits<XprType>
+{
+ // TODO(phli): Add InputScalar, InputPacket. Check references to
+ // current Scalar/Packet to see if the intent is Input or Output.
+ typedef typename result_of<UnaryOp(typename XprType::Scalar)>::type Scalar;
+ typedef traits<XprType> XprTraits;
+ typedef typename XprType::Nested XprTypeNested;
+ typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
+ static const int NumDimensions = XprTraits::NumDimensions;
+ static const int Layout = XprTraits::Layout;
+ typedef typename TypeConversion<Scalar,
+ typename XprTraits::PointerType
+ >::type
+ PointerType;
+};
+
+template<typename UnaryOp, typename XprType>
+struct eval<TensorCwiseUnaryOp<UnaryOp, XprType>, Eigen::Dense>
+{
+ typedef const TensorCwiseUnaryOp<UnaryOp, XprType>& type;
+};
+
+template<typename UnaryOp, typename XprType>
+struct nested<TensorCwiseUnaryOp<UnaryOp, XprType>, 1, typename eval<TensorCwiseUnaryOp<UnaryOp, XprType> >::type>
+{
+ typedef TensorCwiseUnaryOp<UnaryOp, XprType> type;
+};
+
+} // end namespace internal
+
+
+
+template<typename UnaryOp, typename XprType>
+class TensorCwiseUnaryOp : public TensorBase<TensorCwiseUnaryOp<UnaryOp, XprType>, ReadOnlyAccessors>
+{
+ public:
+ // TODO(phli): Add InputScalar, InputPacket. Check references to
+ // current Scalar/Packet to see if the intent is Input or Output.
+ typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::Scalar Scalar;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef Scalar CoeffReturnType;
+ typedef typename Eigen::internal::nested<TensorCwiseUnaryOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp())
+ : m_xpr(xpr), m_functor(func) {}
+
+ EIGEN_DEVICE_FUNC
+ const UnaryOp& functor() const { return m_functor; }
+
+ /** \returns the nested expression */
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ nestedExpression() const { return m_xpr; }
+
+ protected:
+ typename XprType::Nested m_xpr;
+ const UnaryOp m_functor;
+};
+
+
+namespace internal {
+template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
+struct traits<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> >
+{
+ // Type promotion to handle the case where the types of the lhs and the rhs
+ // are different.
+ // TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. Check references to
+ // current Scalar/Packet to see if the intent is Inputs or Output.
+ typedef typename result_of<
+ BinaryOp(typename LhsXprType::Scalar,
+ typename RhsXprType::Scalar)>::type Scalar;
+ typedef traits<LhsXprType> XprTraits;
+ typedef typename promote_storage_type<
+ typename traits<LhsXprType>::StorageKind,
+ typename traits<RhsXprType>::StorageKind>::ret StorageKind;
+ typedef typename promote_index_type<
+ typename traits<LhsXprType>::Index,
+ typename traits<RhsXprType>::Index>::type Index;
+ typedef typename LhsXprType::Nested LhsNested;
+ typedef typename RhsXprType::Nested RhsNested;
+ typedef typename remove_reference<LhsNested>::type _LhsNested;
+ typedef typename remove_reference<RhsNested>::type _RhsNested;
+ static const int NumDimensions = XprTraits::NumDimensions;
+ static const int Layout = XprTraits::Layout;
+ typedef typename TypeConversion<Scalar,
+ typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val,
+ typename traits<LhsXprType>::PointerType,
+ typename traits<RhsXprType>::PointerType>::type
+ >::type
+ PointerType;
+ enum {
+ Flags = 0
+ };
+};
+
+template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
+struct eval<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>, Eigen::Dense>
+{
+ typedef const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>& type;
+};
+
+template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
+struct nested<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>, 1, typename eval<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> >::type>
+{
+ typedef TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> type;
+};
+
+} // end namespace internal
+
+
+
+template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
+class TensorCwiseBinaryOp : public TensorBase<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>, ReadOnlyAccessors>
+{
+ public:
+ // TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. Check references to
+ // current Scalar/Packet to see if the intent is Inputs or Output.
+ typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::Scalar Scalar;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef Scalar CoeffReturnType;
+ typedef typename Eigen::internal::nested<TensorCwiseBinaryOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const BinaryOp& func = BinaryOp())
+ : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_functor(func) {}
+
+ EIGEN_DEVICE_FUNC
+ const BinaryOp& functor() const { return m_functor; }
+
+ /** \returns the nested expressions */
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename LhsXprType::Nested>::type&
+ lhsExpression() const { return m_lhs_xpr; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename RhsXprType::Nested>::type&
+ rhsExpression() const { return m_rhs_xpr; }
+
+ protected:
+ typename LhsXprType::Nested m_lhs_xpr;
+ typename RhsXprType::Nested m_rhs_xpr;
+ const BinaryOp m_functor;
+};
+
+
+namespace internal {
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
+struct traits<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> >
+{
+ // Type promotion to handle the case where the types of the args are different.
+ typedef typename result_of<
+ TernaryOp(typename Arg1XprType::Scalar,
+ typename Arg2XprType::Scalar,
+ typename Arg3XprType::Scalar)>::type Scalar;
+ typedef traits<Arg1XprType> XprTraits;
+ typedef typename traits<Arg1XprType>::StorageKind StorageKind;
+ typedef typename traits<Arg1XprType>::Index Index;
+ typedef typename Arg1XprType::Nested Arg1Nested;
+ typedef typename Arg2XprType::Nested Arg2Nested;
+ typedef typename Arg3XprType::Nested Arg3Nested;
+ typedef typename remove_reference<Arg1Nested>::type _Arg1Nested;
+ typedef typename remove_reference<Arg2Nested>::type _Arg2Nested;
+ typedef typename remove_reference<Arg3Nested>::type _Arg3Nested;
+ static const int NumDimensions = XprTraits::NumDimensions;
+ static const int Layout = XprTraits::Layout;
+ typedef typename TypeConversion<Scalar,
+ typename conditional<Pointer_type_promotion<typename Arg2XprType::Scalar, Scalar>::val,
+ typename traits<Arg2XprType>::PointerType,
+ typename traits<Arg3XprType>::PointerType>::type
+ >::type
+ PointerType;
+ enum {
+ Flags = 0
+ };
+};
+
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
+struct eval<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, Eigen::Dense>
+{
+ typedef const TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>& type;
+};
+
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
+struct nested<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, 1, typename eval<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> >::type>
+{
+ typedef TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> type;
+};
+
+} // end namespace internal
+
+
+
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
+class TensorCwiseTernaryOp : public TensorBase<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, ReadOnlyAccessors>
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::Scalar Scalar;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef Scalar CoeffReturnType;
+ typedef typename Eigen::internal::nested<TensorCwiseTernaryOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseTernaryOp(const Arg1XprType& arg1, const Arg2XprType& arg2, const Arg3XprType& arg3, const TernaryOp& func = TernaryOp())
+ : m_arg1_xpr(arg1), m_arg2_xpr(arg2), m_arg3_xpr(arg3), m_functor(func) {}
+
+ EIGEN_DEVICE_FUNC
+ const TernaryOp& functor() const { return m_functor; }
+
+ /** \returns the nested expressions */
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename Arg1XprType::Nested>::type&
+ arg1Expression() const { return m_arg1_xpr; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename Arg2XprType::Nested>::type&
+ arg2Expression() const { return m_arg2_xpr; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename Arg3XprType::Nested>::type&
+ arg3Expression() const { return m_arg3_xpr; }
+
+ protected:
+ typename Arg1XprType::Nested m_arg1_xpr;
+ typename Arg2XprType::Nested m_arg2_xpr;
+ typename Arg3XprType::Nested m_arg3_xpr;
+ const TernaryOp m_functor;
+};
+
+
+namespace internal {
+template<typename IfXprType, typename ThenXprType, typename ElseXprType>
+struct traits<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> >
+ : traits<ThenXprType>
+{
+ typedef typename traits<ThenXprType>::Scalar Scalar;
+ typedef traits<ThenXprType> XprTraits;
+ typedef typename promote_storage_type<typename traits<ThenXprType>::StorageKind,
+ typename traits<ElseXprType>::StorageKind>::ret StorageKind;
+ typedef typename promote_index_type<typename traits<ElseXprType>::Index,
+ typename traits<ThenXprType>::Index>::type Index;
+ typedef typename IfXprType::Nested IfNested;
+ typedef typename ThenXprType::Nested ThenNested;
+ typedef typename ElseXprType::Nested ElseNested;
+ static const int NumDimensions = XprTraits::NumDimensions;
+ static const int Layout = XprTraits::Layout;
+ typedef typename conditional<Pointer_type_promotion<typename ThenXprType::Scalar, Scalar>::val,
+ typename traits<ThenXprType>::PointerType,
+ typename traits<ElseXprType>::PointerType>::type PointerType;
+};
+
+template<typename IfXprType, typename ThenXprType, typename ElseXprType>
+struct eval<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, Eigen::Dense>
+{
+ typedef const TensorSelectOp<IfXprType, ThenXprType, ElseXprType>& type;
+};
+
+template<typename IfXprType, typename ThenXprType, typename ElseXprType>
+struct nested<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, 1, typename eval<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> >::type>
+{
+ typedef TensorSelectOp<IfXprType, ThenXprType, ElseXprType> type;
+};
+
+} // end namespace internal
+
+
+template<typename IfXprType, typename ThenXprType, typename ElseXprType>
+class TensorSelectOp : public TensorBase<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, ReadOnlyAccessors>
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorSelectOp>::Scalar Scalar;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename internal::promote_storage_type<typename ThenXprType::CoeffReturnType,
+ typename ElseXprType::CoeffReturnType>::ret CoeffReturnType;
+ typedef typename Eigen::internal::nested<TensorSelectOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorSelectOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorSelectOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC
+ TensorSelectOp(const IfXprType& a_condition,
+ const ThenXprType& a_then,
+ const ElseXprType& a_else)
+ : m_condition(a_condition), m_then(a_then), m_else(a_else)
+ { }
+
+ EIGEN_DEVICE_FUNC
+ const IfXprType& ifExpression() const { return m_condition; }
+
+ EIGEN_DEVICE_FUNC
+ const ThenXprType& thenExpression() const { return m_then; }
+
+ EIGEN_DEVICE_FUNC
+ const ElseXprType& elseExpression() const { return m_else; }
+
+ protected:
+ typename IfXprType::Nested m_condition;
+ typename ThenXprType::Nested m_then;
+ typename ElseXprType::Nested m_else;
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_EXPR_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorFFT.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorFFT.h
new file mode 100644
index 0000000..4a1a068
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorFFT.h
@@ -0,0 +1,669 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Jianwei Cui <thucjw@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_FFT_H
+#define EIGEN_CXX11_TENSOR_TENSOR_FFT_H
+
+namespace Eigen {
+
+/** \class TensorFFT
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor FFT class.
+ *
+ * TODO:
+ * Vectorize the Cooley Tukey and the Bluestein algorithm
+ * Add support for multithreaded evaluation
+ * Improve the performance on GPU
+ */
+
+template <bool NeedUprade> struct MakeComplex {
+ template <typename T>
+ EIGEN_DEVICE_FUNC
+ T operator() (const T& val) const { return val; }
+};
+
+template <> struct MakeComplex<true> {
+ template <typename T>
+ EIGEN_DEVICE_FUNC
+ std::complex<T> operator() (const T& val) const { return std::complex<T>(val, 0); }
+};
+
+template <> struct MakeComplex<false> {
+ template <typename T>
+ EIGEN_DEVICE_FUNC
+ std::complex<T> operator() (const std::complex<T>& val) const { return val; }
+};
+
+template <int ResultType> struct PartOf {
+ template <typename T> T operator() (const T& val) const { return val; }
+};
+
+template <> struct PartOf<RealPart> {
+ template <typename T> T operator() (const std::complex<T>& val) const { return val.real(); }
+};
+
+template <> struct PartOf<ImagPart> {
+ template <typename T> T operator() (const std::complex<T>& val) const { return val.imag(); }
+};
+
+namespace internal {
+template <typename FFT, typename XprType, int FFTResultType, int FFTDir>
+struct traits<TensorFFTOp<FFT, XprType, FFTResultType, FFTDir> > : public traits<XprType> {
+ typedef traits<XprType> XprTraits;
+ typedef typename NumTraits<typename XprTraits::Scalar>::Real RealScalar;
+ typedef typename std::complex<RealScalar> ComplexScalar;
+ typedef typename XprTraits::Scalar InputScalar;
+ typedef typename conditional<FFTResultType == RealPart || FFTResultType == ImagPart, RealScalar, ComplexScalar>::type OutputScalar;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprTraits::Index Index;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = XprTraits::NumDimensions;
+ static const int Layout = XprTraits::Layout;
+ typedef typename traits<XprType>::PointerType PointerType;
+};
+
+template <typename FFT, typename XprType, int FFTResultType, int FFTDirection>
+struct eval<TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection>, Eigen::Dense> {
+ typedef const TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection>& type;
+};
+
+template <typename FFT, typename XprType, int FFTResultType, int FFTDirection>
+struct nested<TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection>, 1, typename eval<TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection> >::type> {
+ typedef TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection> type;
+};
+
+} // end namespace internal
+
+template <typename FFT, typename XprType, int FFTResultType, int FFTDir>
+class TensorFFTOp : public TensorBase<TensorFFTOp<FFT, XprType, FFTResultType, FFTDir>, ReadOnlyAccessors> {
+ public:
+ typedef typename Eigen::internal::traits<TensorFFTOp>::Scalar Scalar;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename std::complex<RealScalar> ComplexScalar;
+ typedef typename internal::conditional<FFTResultType == RealPart || FFTResultType == ImagPart, RealScalar, ComplexScalar>::type OutputScalar;
+ typedef OutputScalar CoeffReturnType;
+ typedef typename Eigen::internal::nested<TensorFFTOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorFFTOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorFFTOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFFTOp(const XprType& expr, const FFT& fft)
+ : m_xpr(expr), m_fft(fft) {}
+
+ EIGEN_DEVICE_FUNC
+ const FFT& fft() const { return m_fft; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type& expression() const {
+ return m_xpr;
+ }
+
+ protected:
+ typename XprType::Nested m_xpr;
+ const FFT m_fft;
+};
+
+// Eval as rvalue
+template <typename FFT, typename ArgType, typename Device, int FFTResultType, int FFTDir>
+struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, Device> {
+ typedef TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir> XprType;
+ typedef typename XprType::Index Index;
+ static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+ typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename std::complex<RealScalar> ComplexScalar;
+ typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
+ typedef internal::traits<XprType> XprTraits;
+ typedef typename XprTraits::Scalar InputScalar;
+ typedef typename internal::conditional<FFTResultType == RealPart || FFTResultType == ImagPart, RealScalar, ComplexScalar>::type OutputScalar;
+ typedef OutputScalar CoeffReturnType;
+ typedef typename PacketType<OutputScalar, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+
+ enum {
+ IsAligned = false,
+ PacketAccess = true,
+ BlockAccess = false,
+ PreferBlockAccess = false,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false,
+ RawAccess = false
+ };
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) {
+ const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+ for (int i = 0; i < NumDims; ++i) {
+ eigen_assert(input_dims[i] > 0);
+ m_dimensions[i] = input_dims[i];
+ }
+
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ m_strides[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1];
+ }
+ } else {
+ m_strides[NumDims - 1] = 1;
+ for (int i = NumDims - 2; i >= 0; --i) {
+ m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1];
+ }
+ }
+ m_size = m_dimensions.TotalSize();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
+ return m_dimensions;
+ }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+ m_impl.evalSubExprsIfNeeded(NULL);
+ if (data) {
+ evalToBuf(data);
+ return false;
+ } else {
+ m_data = (EvaluatorPointerType)m_device.get((CoeffReturnType*)(m_device.allocate_temp(sizeof(CoeffReturnType) * m_size)));
+ evalToBuf(m_data);
+ return true;
+ }
+ }
+
+ EIGEN_STRONG_INLINE void cleanup() {
+ if (m_data) {
+ m_device.deallocate(m_data);
+ m_data = NULL;
+ }
+ m_impl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const {
+ return m_data[index];
+ }
+
+ template <int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType
+ packet(Index index) const {
+ return internal::ploadt<PacketReturnType, LoadMode>(m_data + index);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
+ }
+
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_data.bind(cgh);
+ }
+#endif
+
+ private:
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(EvaluatorPointerType data) {
+ const bool write_to_out = internal::is_same<OutputScalar, ComplexScalar>::value;
+ ComplexScalar* buf = write_to_out ? (ComplexScalar*)data : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * m_size);
+
+ for (Index i = 0; i < m_size; ++i) {
+ buf[i] = MakeComplex<internal::is_same<InputScalar, RealScalar>::value>()(m_impl.coeff(i));
+ }
+
+ for (size_t i = 0; i < m_fft.size(); ++i) {
+ Index dim = m_fft[i];
+ eigen_assert(dim >= 0 && dim < NumDims);
+ Index line_len = m_dimensions[dim];
+ eigen_assert(line_len >= 1);
+ ComplexScalar* line_buf = (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * line_len);
+ const bool is_power_of_two = isPowerOfTwo(line_len);
+ const Index good_composite = is_power_of_two ? 0 : findGoodComposite(line_len);
+ const Index log_len = is_power_of_two ? getLog2(line_len) : getLog2(good_composite);
+
+ ComplexScalar* a = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * good_composite);
+ ComplexScalar* b = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * good_composite);
+ ComplexScalar* pos_j_base_powered = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * (line_len + 1));
+ if (!is_power_of_two) {
+ // Compute twiddle factors
+ // t_n = exp(sqrt(-1) * pi * n^2 / line_len)
+ // for n = 0, 1,..., line_len-1.
+ // For n > 2 we use the recurrence t_n = t_{n-1}^2 / t_{n-2} * t_1^2
+
+ // The recurrence is correct in exact arithmetic, but causes
+ // numerical issues for large transforms, especially in
+ // single-precision floating point.
+ //
+ // pos_j_base_powered[0] = ComplexScalar(1, 0);
+ // if (line_len > 1) {
+ // const ComplexScalar pos_j_base = ComplexScalar(
+ // numext::cos(M_PI / line_len), numext::sin(M_PI / line_len));
+ // pos_j_base_powered[1] = pos_j_base;
+ // if (line_len > 2) {
+ // const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base;
+ // for (int i = 2; i < line_len + 1; ++i) {
+ // pos_j_base_powered[i] = pos_j_base_powered[i - 1] *
+ // pos_j_base_powered[i - 1] /
+ // pos_j_base_powered[i - 2] *
+ // pos_j_base_sq;
+ // }
+ // }
+ // }
+ // TODO(rmlarsen): Find a way to use Eigen's vectorized sin
+ // and cosine functions here.
+ for (int j = 0; j < line_len + 1; ++j) {
+ double arg = ((EIGEN_PI * j) * j) / line_len;
+ std::complex<double> tmp(numext::cos(arg), numext::sin(arg));
+ pos_j_base_powered[j] = static_cast<ComplexScalar>(tmp);
+ }
+ }
+
+ for (Index partial_index = 0; partial_index < m_size / line_len; ++partial_index) {
+ const Index base_offset = getBaseOffsetFromIndex(partial_index, dim);
+
+ // get data into line_buf
+ const Index stride = m_strides[dim];
+ if (stride == 1) {
+ m_device.memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar));
+ } else {
+ Index offset = base_offset;
+ for (int j = 0; j < line_len; ++j, offset += stride) {
+ line_buf[j] = buf[offset];
+ }
+ }
+
+ // process the line
+ if (is_power_of_two) {
+ processDataLineCooleyTukey(line_buf, line_len, log_len);
+ }
+ else {
+ processDataLineBluestein(line_buf, line_len, good_composite, log_len, a, b, pos_j_base_powered);
+ }
+
+ // write back
+ if (FFTDir == FFT_FORWARD && stride == 1) {
+ m_device.memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar));
+ } else {
+ Index offset = base_offset;
+ const ComplexScalar div_factor = ComplexScalar(1.0 / line_len, 0);
+ for (int j = 0; j < line_len; ++j, offset += stride) {
+ buf[offset] = (FFTDir == FFT_FORWARD) ? line_buf[j] : line_buf[j] * div_factor;
+ }
+ }
+ }
+ m_device.deallocate(line_buf);
+ if (!is_power_of_two) {
+ m_device.deallocate(a);
+ m_device.deallocate(b);
+ m_device.deallocate(pos_j_base_powered);
+ }
+ }
+
+ if(!write_to_out) {
+ for (Index i = 0; i < m_size; ++i) {
+ data[i] = PartOf<FFTResultType>()(buf[i]);
+ }
+ m_device.deallocate(buf);
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static bool isPowerOfTwo(Index x) {
+ eigen_assert(x > 0);
+ return !(x & (x - 1));
+ }
+
+ // The composite number for padding, used in Bluestein's FFT algorithm
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Index findGoodComposite(Index n) {
+ Index i = 2;
+ while (i < 2 * n - 1) i *= 2;
+ return i;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Index getLog2(Index m) {
+ Index log2m = 0;
+ while (m >>= 1) log2m++;
+ return log2m;
+ }
+
+ // Call Cooley Tukey algorithm directly, data length must be power of 2
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void processDataLineCooleyTukey(ComplexScalar* line_buf, Index line_len, Index log_len) {
+ eigen_assert(isPowerOfTwo(line_len));
+ scramble_FFT(line_buf, line_len);
+ compute_1D_Butterfly<FFTDir>(line_buf, line_len, log_len);
+ }
+
+ // Call Bluestein's FFT algorithm, m is a good composite number greater than (2 * n - 1), used as the padding length
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void processDataLineBluestein(ComplexScalar* line_buf, Index line_len, Index good_composite, Index log_len, ComplexScalar* a, ComplexScalar* b, const ComplexScalar* pos_j_base_powered) {
+ Index n = line_len;
+ Index m = good_composite;
+ ComplexScalar* data = line_buf;
+
+ for (Index i = 0; i < n; ++i) {
+ if(FFTDir == FFT_FORWARD) {
+ a[i] = data[i] * numext::conj(pos_j_base_powered[i]);
+ }
+ else {
+ a[i] = data[i] * pos_j_base_powered[i];
+ }
+ }
+ for (Index i = n; i < m; ++i) {
+ a[i] = ComplexScalar(0, 0);
+ }
+
+ for (Index i = 0; i < n; ++i) {
+ if(FFTDir == FFT_FORWARD) {
+ b[i] = pos_j_base_powered[i];
+ }
+ else {
+ b[i] = numext::conj(pos_j_base_powered[i]);
+ }
+ }
+ for (Index i = n; i < m - n; ++i) {
+ b[i] = ComplexScalar(0, 0);
+ }
+ for (Index i = m - n; i < m; ++i) {
+ if(FFTDir == FFT_FORWARD) {
+ b[i] = pos_j_base_powered[m-i];
+ }
+ else {
+ b[i] = numext::conj(pos_j_base_powered[m-i]);
+ }
+ }
+
+ scramble_FFT(a, m);
+ compute_1D_Butterfly<FFT_FORWARD>(a, m, log_len);
+
+ scramble_FFT(b, m);
+ compute_1D_Butterfly<FFT_FORWARD>(b, m, log_len);
+
+ for (Index i = 0; i < m; ++i) {
+ a[i] *= b[i];
+ }
+
+ scramble_FFT(a, m);
+ compute_1D_Butterfly<FFT_REVERSE>(a, m, log_len);
+
+ //Do the scaling after ifft
+ for (Index i = 0; i < m; ++i) {
+ a[i] /= m;
+ }
+
+ for (Index i = 0; i < n; ++i) {
+ if(FFTDir == FFT_FORWARD) {
+ data[i] = a[i] * numext::conj(pos_j_base_powered[i]);
+ }
+ else {
+ data[i] = a[i] * pos_j_base_powered[i];
+ }
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void scramble_FFT(ComplexScalar* data, Index n) {
+ eigen_assert(isPowerOfTwo(n));
+ Index j = 1;
+ for (Index i = 1; i < n; ++i){
+ if (j > i) {
+ std::swap(data[j-1], data[i-1]);
+ }
+ Index m = n >> 1;
+ while (m >= 2 && j > m) {
+ j -= m;
+ m >>= 1;
+ }
+ j += m;
+ }
+ }
+
+ template <int Dir>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_2(ComplexScalar* data) {
+ ComplexScalar tmp = data[1];
+ data[1] = data[0] - data[1];
+ data[0] += tmp;
+ }
+
+ template <int Dir>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_4(ComplexScalar* data) {
+ ComplexScalar tmp[4];
+ tmp[0] = data[0] + data[1];
+ tmp[1] = data[0] - data[1];
+ tmp[2] = data[2] + data[3];
+ if (Dir == FFT_FORWARD) {
+ tmp[3] = ComplexScalar(0.0, -1.0) * (data[2] - data[3]);
+ } else {
+ tmp[3] = ComplexScalar(0.0, 1.0) * (data[2] - data[3]);
+ }
+ data[0] = tmp[0] + tmp[2];
+ data[1] = tmp[1] + tmp[3];
+ data[2] = tmp[0] - tmp[2];
+ data[3] = tmp[1] - tmp[3];
+ }
+
+ template <int Dir>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_8(ComplexScalar* data) {
+ ComplexScalar tmp_1[8];
+ ComplexScalar tmp_2[8];
+
+ tmp_1[0] = data[0] + data[1];
+ tmp_1[1] = data[0] - data[1];
+ tmp_1[2] = data[2] + data[3];
+ if (Dir == FFT_FORWARD) {
+ tmp_1[3] = (data[2] - data[3]) * ComplexScalar(0, -1);
+ } else {
+ tmp_1[3] = (data[2] - data[3]) * ComplexScalar(0, 1);
+ }
+ tmp_1[4] = data[4] + data[5];
+ tmp_1[5] = data[4] - data[5];
+ tmp_1[6] = data[6] + data[7];
+ if (Dir == FFT_FORWARD) {
+ tmp_1[7] = (data[6] - data[7]) * ComplexScalar(0, -1);
+ } else {
+ tmp_1[7] = (data[6] - data[7]) * ComplexScalar(0, 1);
+ }
+ tmp_2[0] = tmp_1[0] + tmp_1[2];
+ tmp_2[1] = tmp_1[1] + tmp_1[3];
+ tmp_2[2] = tmp_1[0] - tmp_1[2];
+ tmp_2[3] = tmp_1[1] - tmp_1[3];
+ tmp_2[4] = tmp_1[4] + tmp_1[6];
+// SQRT2DIV2 = sqrt(2)/2
+#define SQRT2DIV2 0.7071067811865476
+ if (Dir == FFT_FORWARD) {
+ tmp_2[5] = (tmp_1[5] + tmp_1[7]) * ComplexScalar(SQRT2DIV2, -SQRT2DIV2);
+ tmp_2[6] = (tmp_1[4] - tmp_1[6]) * ComplexScalar(0, -1);
+ tmp_2[7] = (tmp_1[5] - tmp_1[7]) * ComplexScalar(-SQRT2DIV2, -SQRT2DIV2);
+ } else {
+ tmp_2[5] = (tmp_1[5] + tmp_1[7]) * ComplexScalar(SQRT2DIV2, SQRT2DIV2);
+ tmp_2[6] = (tmp_1[4] - tmp_1[6]) * ComplexScalar(0, 1);
+ tmp_2[7] = (tmp_1[5] - tmp_1[7]) * ComplexScalar(-SQRT2DIV2, SQRT2DIV2);
+ }
+ data[0] = tmp_2[0] + tmp_2[4];
+ data[1] = tmp_2[1] + tmp_2[5];
+ data[2] = tmp_2[2] + tmp_2[6];
+ data[3] = tmp_2[3] + tmp_2[7];
+ data[4] = tmp_2[0] - tmp_2[4];
+ data[5] = tmp_2[1] - tmp_2[5];
+ data[6] = tmp_2[2] - tmp_2[6];
+ data[7] = tmp_2[3] - tmp_2[7];
+ }
+
+ template <int Dir>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_1D_merge(
+ ComplexScalar* data, Index n, Index n_power_of_2) {
+ // Original code:
+ // RealScalar wtemp = std::sin(M_PI/n);
+ // RealScalar wpi = -std::sin(2 * M_PI/n);
+ const RealScalar wtemp = m_sin_PI_div_n_LUT[n_power_of_2];
+ const RealScalar wpi = (Dir == FFT_FORWARD)
+ ? m_minus_sin_2_PI_div_n_LUT[n_power_of_2]
+ : -m_minus_sin_2_PI_div_n_LUT[n_power_of_2];
+
+ const ComplexScalar wp(wtemp, wpi);
+ const ComplexScalar wp_one = wp + ComplexScalar(1, 0);
+ const ComplexScalar wp_one_2 = wp_one * wp_one;
+ const ComplexScalar wp_one_3 = wp_one_2 * wp_one;
+ const ComplexScalar wp_one_4 = wp_one_3 * wp_one;
+ const Index n2 = n / 2;
+ ComplexScalar w(1.0, 0.0);
+ for (Index i = 0; i < n2; i += 4) {
+ ComplexScalar temp0(data[i + n2] * w);
+ ComplexScalar temp1(data[i + 1 + n2] * w * wp_one);
+ ComplexScalar temp2(data[i + 2 + n2] * w * wp_one_2);
+ ComplexScalar temp3(data[i + 3 + n2] * w * wp_one_3);
+ w = w * wp_one_4;
+
+ data[i + n2] = data[i] - temp0;
+ data[i] += temp0;
+
+ data[i + 1 + n2] = data[i + 1] - temp1;
+ data[i + 1] += temp1;
+
+ data[i + 2 + n2] = data[i + 2] - temp2;
+ data[i + 2] += temp2;
+
+ data[i + 3 + n2] = data[i + 3] - temp3;
+ data[i + 3] += temp3;
+ }
+ }
+
+ template <int Dir>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_1D_Butterfly(
+ ComplexScalar* data, Index n, Index n_power_of_2) {
+ eigen_assert(isPowerOfTwo(n));
+ if (n > 8) {
+ compute_1D_Butterfly<Dir>(data, n / 2, n_power_of_2 - 1);
+ compute_1D_Butterfly<Dir>(data + n / 2, n / 2, n_power_of_2 - 1);
+ butterfly_1D_merge<Dir>(data, n, n_power_of_2);
+ } else if (n == 8) {
+ butterfly_8<Dir>(data);
+ } else if (n == 4) {
+ butterfly_4<Dir>(data);
+ } else if (n == 2) {
+ butterfly_2<Dir>(data);
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getBaseOffsetFromIndex(Index index, Index omitted_dim) const {
+ Index result = 0;
+
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = NumDims - 1; i > omitted_dim; --i) {
+ const Index partial_m_stride = m_strides[i] / m_dimensions[omitted_dim];
+ const Index idx = index / partial_m_stride;
+ index -= idx * partial_m_stride;
+ result += idx * m_strides[i];
+ }
+ result += index;
+ }
+ else {
+ for (Index i = 0; i < omitted_dim; ++i) {
+ const Index partial_m_stride = m_strides[i] / m_dimensions[omitted_dim];
+ const Index idx = index / partial_m_stride;
+ index -= idx * partial_m_stride;
+ result += idx * m_strides[i];
+ }
+ result += index;
+ }
+ // Value of index_coords[omitted_dim] is not determined to this step
+ return result;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getIndexFromOffset(Index base, Index omitted_dim, Index offset) const {
+ Index result = base + offset * m_strides[omitted_dim] ;
+ return result;
+ }
+
+ protected:
+ Index m_size;
+ const FFT EIGEN_DEVICE_REF m_fft;
+ Dimensions m_dimensions;
+ array<Index, NumDims> m_strides;
+ TensorEvaluator<ArgType, Device> m_impl;
+ EvaluatorPointerType m_data;
+ const Device EIGEN_DEVICE_REF m_device;
+
+ // This will support a maximum FFT size of 2^32 for each dimension
+ // m_sin_PI_div_n_LUT[i] = (-2) * std::sin(M_PI / std::pow(2,i)) ^ 2;
+ const RealScalar m_sin_PI_div_n_LUT[32] = {
+ RealScalar(0.0),
+ RealScalar(-2),
+ RealScalar(-0.999999999999999),
+ RealScalar(-0.292893218813453),
+ RealScalar(-0.0761204674887130),
+ RealScalar(-0.0192147195967696),
+ RealScalar(-0.00481527332780311),
+ RealScalar(-0.00120454379482761),
+ RealScalar(-3.01181303795779e-04),
+ RealScalar(-7.52981608554592e-05),
+ RealScalar(-1.88247173988574e-05),
+ RealScalar(-4.70619042382852e-06),
+ RealScalar(-1.17654829809007e-06),
+ RealScalar(-2.94137117780840e-07),
+ RealScalar(-7.35342821488550e-08),
+ RealScalar(-1.83835707061916e-08),
+ RealScalar(-4.59589268710903e-09),
+ RealScalar(-1.14897317243732e-09),
+ RealScalar(-2.87243293150586e-10),
+ RealScalar( -7.18108232902250e-11),
+ RealScalar(-1.79527058227174e-11),
+ RealScalar(-4.48817645568941e-12),
+ RealScalar(-1.12204411392298e-12),
+ RealScalar(-2.80511028480785e-13),
+ RealScalar(-7.01277571201985e-14),
+ RealScalar(-1.75319392800498e-14),
+ RealScalar(-4.38298482001247e-15),
+ RealScalar(-1.09574620500312e-15),
+ RealScalar(-2.73936551250781e-16),
+ RealScalar(-6.84841378126949e-17),
+ RealScalar(-1.71210344531737e-17),
+ RealScalar(-4.28025861329343e-18)
+ };
+
+ // m_minus_sin_2_PI_div_n_LUT[i] = -std::sin(2 * M_PI / std::pow(2,i));
+ const RealScalar m_minus_sin_2_PI_div_n_LUT[32] = {
+ RealScalar(0.0),
+ RealScalar(0.0),
+ RealScalar(-1.00000000000000e+00),
+ RealScalar(-7.07106781186547e-01),
+ RealScalar(-3.82683432365090e-01),
+ RealScalar(-1.95090322016128e-01),
+ RealScalar(-9.80171403295606e-02),
+ RealScalar(-4.90676743274180e-02),
+ RealScalar(-2.45412285229123e-02),
+ RealScalar(-1.22715382857199e-02),
+ RealScalar(-6.13588464915448e-03),
+ RealScalar(-3.06795676296598e-03),
+ RealScalar(-1.53398018628477e-03),
+ RealScalar(-7.66990318742704e-04),
+ RealScalar(-3.83495187571396e-04),
+ RealScalar(-1.91747597310703e-04),
+ RealScalar(-9.58737990959773e-05),
+ RealScalar(-4.79368996030669e-05),
+ RealScalar(-2.39684498084182e-05),
+ RealScalar(-1.19842249050697e-05),
+ RealScalar(-5.99211245264243e-06),
+ RealScalar(-2.99605622633466e-06),
+ RealScalar(-1.49802811316901e-06),
+ RealScalar(-7.49014056584716e-07),
+ RealScalar(-3.74507028292384e-07),
+ RealScalar(-1.87253514146195e-07),
+ RealScalar(-9.36267570730981e-08),
+ RealScalar(-4.68133785365491e-08),
+ RealScalar(-2.34066892682746e-08),
+ RealScalar(-1.17033446341373e-08),
+ RealScalar(-5.85167231706864e-09),
+ RealScalar(-2.92583615853432e-09)
+ };
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_FFT_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorFixedSize.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorFixedSize.h
new file mode 100644
index 0000000..ca39bb8
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorFixedSize.h
@@ -0,0 +1,379 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H
+#define EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H
+
+namespace Eigen {
+
+/** \class TensorFixedSize
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief The fixed sized version of the tensor class.
+ *
+ * The fixed sized equivalent of
+ * Eigen::Tensor<float, 3> t(3, 5, 7);
+ * is
+ * Eigen::TensorFixedSize<float, Sizes<3,5,7>> t;
+ */
+
+template<typename Scalar_, typename Dimensions_, int Options_, typename IndexType>
+class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_, Options_, IndexType> >
+{
+ public:
+ typedef TensorFixedSize<Scalar_, Dimensions_, Options_, IndexType> Self;
+ typedef TensorBase<TensorFixedSize<Scalar_, Dimensions_, Options_, IndexType> > Base;
+ typedef typename Eigen::internal::nested<Self>::type Nested;
+ typedef typename internal::traits<Self>::StorageKind StorageKind;
+ typedef typename internal::traits<Self>::Index Index;
+ typedef Scalar_ Scalar;
+ typedef typename NumTraits<Scalar>::Real RealScalar;
+ typedef typename Base::CoeffReturnType CoeffReturnType;
+
+ static const int Options = Options_;
+
+ enum {
+ IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0),
+ PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+ BlockAccess = false,
+ PreferBlockAccess = false,
+ Layout = Options_ & RowMajor ? RowMajor : ColMajor,
+ CoordAccess = true,
+ RawAccess = true
+ };
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ typedef Dimensions_ Dimensions;
+ static const std::size_t NumIndices = Dimensions::count;
+
+ protected:
+ TensorStorage<Scalar, Dimensions, Options> m_storage;
+
+ public:
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); }
+
+ // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+ // work, because that uses base().coeffRef() - and we don't yet
+ // implement a similar class hierarchy
+ inline Self& base() { return *this; }
+ inline const Self& base() const { return *this; }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+ template<typename... IndexTypes>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index firstIndex, IndexTypes... otherIndices) const
+ {
+ // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ return coeff(array<Index, NumIndices>{{firstIndex, otherIndices...}});
+ }
+#endif
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar& coeff(const array<Index, NumIndices>& indices) const
+ {
+ eigen_internal_assert(checkIndexRange(indices));
+ return m_storage.data()[linearizedIndex(indices)];
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const
+ {
+ eigen_internal_assert(index >= 0 && index < size());
+ return m_storage.data()[index];
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar& coeff() const
+ {
+ EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ return m_storage.data()[0];
+ }
+
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+ template<typename... IndexTypes>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices)
+ {
+ // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ return coeffRef(array<Index, NumIndices>{{firstIndex, otherIndices...}});
+ }
+#endif
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices)
+ {
+ eigen_internal_assert(checkIndexRange(indices));
+ return m_storage.data()[linearizedIndex(indices)];
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
+ {
+ eigen_internal_assert(index >= 0 && index < size());
+ return m_storage.data()[index];
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& coeffRef()
+ {
+ EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ return m_storage.data()[0];
+ }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+ template<typename... IndexTypes>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) const
+ {
+ // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ return this->operator()(array<Index, NumIndices>{{firstIndex, otherIndices...}});
+ }
+#else
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const
+ {
+ if (Options&RowMajor) {
+ const Index index = i1 + i0 * m_storage.dimensions()[1];
+ return m_storage.data()[index];
+ } else {
+ const Index index = i0 + i1 * m_storage.dimensions()[0];
+ return m_storage.data()[index];
+ }
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const
+ {
+ if (Options&RowMajor) {
+ const Index index = i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0);
+ return m_storage.data()[index];
+ } else {
+ const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * i2);
+ return m_storage.data()[index];
+ }
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const
+ {
+ if (Options&RowMajor) {
+ const Index index = i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0));
+ return m_storage.data()[index];
+ } else {
+ const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * i3));
+ return m_storage.data()[index];
+ }
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const
+ {
+ if (Options&RowMajor) {
+ const Index index = i4 + m_storage.dimensions()[4] * (i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0)));
+ return m_storage.data()[index];
+ } else {
+ const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * (i3 + m_storage.dimensions()[3] * i4)));
+ return m_storage.data()[index];
+ }
+ }
+#endif
+
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const
+ {
+ eigen_assert(checkIndexRange(indices));
+ return coeff(indices);
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const
+ {
+ eigen_internal_assert(index >= 0 && index < size());
+ return coeff(index);
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar& operator()() const
+ {
+ EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ return coeff();
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const
+ {
+ // The bracket operator is only for vectors, use the parenthesis operator instead.
+ EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ return coeff(index);
+ }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+ template<typename... IndexTypes>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes... otherIndices)
+ {
+ // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ return operator()(array<Index, NumIndices>{{firstIndex, otherIndices...}});
+ }
+#else
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1)
+ {
+ if (Options&RowMajor) {
+ const Index index = i1 + i0 * m_storage.dimensions()[1];
+ return m_storage.data()[index];
+ } else {
+ const Index index = i0 + i1 * m_storage.dimensions()[0];
+ return m_storage.data()[index];
+ }
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2)
+ {
+ if (Options&RowMajor) {
+ const Index index = i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0);
+ return m_storage.data()[index];
+ } else {
+ const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * i2);
+ return m_storage.data()[index];
+ }
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3)
+ {
+ if (Options&RowMajor) {
+ const Index index = i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0));
+ return m_storage.data()[index];
+ } else {
+ const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * i3));
+ return m_storage.data()[index];
+ }
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4)
+ {
+ if (Options&RowMajor) {
+ const Index index = i4 + m_storage.dimensions()[4] * (i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0)));
+ return m_storage.data()[index];
+ } else {
+ const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * (i3 + m_storage.dimensions()[3] * i4)));
+ return m_storage.data()[index];
+ }
+ }
+#endif
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices)
+ {
+ eigen_assert(checkIndexRange(indices));
+ return coeffRef(indices);
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& operator()(Index index)
+ {
+ eigen_assert(index >= 0 && index < size());
+ return coeffRef(index);
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& operator()()
+ {
+ EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ return coeffRef();
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& operator[](Index index)
+ {
+ // The bracket operator is only for vectors, use the parenthesis operator instead
+ EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ return coeffRef(index);
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE TensorFixedSize()
+ : m_storage()
+ {
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE TensorFixedSize(const Self& other)
+ : m_storage(other.m_storage)
+ {
+ }
+
+#if EIGEN_HAS_RVALUE_REFERENCES
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize(Self&& other)
+ : m_storage(other.m_storage)
+ {
+ }
+#endif
+
+ template<typename OtherDerived>
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE TensorFixedSize(const TensorBase<OtherDerived, ReadOnlyAccessors>& other)
+ {
+ typedef TensorAssignOp<TensorFixedSize, const OtherDerived> Assign;
+ Assign assign(*this, other.derived());
+ internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+ }
+ template<typename OtherDerived>
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE TensorFixedSize(const TensorBase<OtherDerived, WriteAccessors>& other)
+ {
+ typedef TensorAssignOp<TensorFixedSize, const OtherDerived> Assign;
+ Assign assign(*this, other.derived());
+ internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+ }
+
+ // FIXME: check that the dimensions of other match the dimensions of *this.
+ // Unfortunately this isn't possible yet when the rhs is an expression.
+ EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(TensorFixedSize)
+
+
+ protected:
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE bool checkIndexRange(const array<Index, NumIndices>& /*indices*/) const
+ {
+ using internal::array_apply_and_reduce;
+ using internal::array_zip_and_reduce;
+ using internal::greater_equal_zero_op;
+ using internal::logical_and_op;
+ using internal::lesser_op;
+
+ return true;
+ // check whether the indices are all >= 0
+ /* array_apply_and_reduce<logical_and_op, greater_equal_zero_op>(indices) &&
+ // check whether the indices fit in the dimensions
+ array_zip_and_reduce<logical_and_op, lesser_op>(indices, m_storage.dimensions());*/
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Index linearizedIndex(const array<Index, NumIndices>& indices) const
+ {
+ if (Options&RowMajor) {
+ return m_storage.dimensions().IndexOfRowMajor(indices);
+ } else {
+ return m_storage.dimensions().IndexOfColMajor(indices);
+ }
+ }
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorForcedEval.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorForcedEval.h
new file mode 100644
index 0000000..e800ded
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorForcedEval.h
@@ -0,0 +1,237 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H
+#define EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H
+
+namespace Eigen {
+
+/** \class TensorForcedEval
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor reshaping class.
+ *
+ *
+ */
+namespace internal {
+template<typename XprType>
+struct traits<TensorForcedEvalOp<XprType> >
+{
+ // Type promotion to handle the case where the types of the lhs and the rhs are different.
+ typedef typename XprType::Scalar Scalar;
+ typedef traits<XprType> XprTraits;
+ typedef typename traits<XprType>::StorageKind StorageKind;
+ typedef typename traits<XprType>::Index Index;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = XprTraits::NumDimensions;
+ static const int Layout = XprTraits::Layout;
+ typedef typename XprTraits::PointerType PointerType;
+
+ enum {
+ Flags = 0
+ };
+};
+
+template<typename XprType>
+struct eval<TensorForcedEvalOp<XprType>, Eigen::Dense>
+{
+ typedef const TensorForcedEvalOp<XprType>& type;
+};
+
+template<typename XprType>
+struct nested<TensorForcedEvalOp<XprType>, 1, typename eval<TensorForcedEvalOp<XprType> >::type>
+{
+ typedef TensorForcedEvalOp<XprType> type;
+};
+
+} // end namespace internal
+
+
+
+template<typename XprType>
+class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType>, ReadOnlyAccessors>
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Scalar Scalar;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+ typedef typename Eigen::internal::nested<TensorForcedEvalOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorForcedEvalOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorForcedEvalOp(const XprType& expr)
+ : m_xpr(expr) {}
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ expression() const { return m_xpr; }
+
+ protected:
+ typename XprType::Nested m_xpr;
+};
+
+namespace internal {
+template <typename Device, typename CoeffReturnType>
+struct non_integral_type_placement_new{
+ template <typename StorageType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index numValues, StorageType m_buffer) {
+ // Initialize non-trivially constructible types.
+ if (!internal::is_arithmetic<CoeffReturnType>::value) {
+ for (Index i = 0; i < numValues; ++i) new (m_buffer + i) CoeffReturnType();
+ }
+}
+};
+
+// SYCL does not support non-integral types
+// having new (m_buffer + i) CoeffReturnType() causes the following compiler error for SYCL Devices
+// no matching function for call to 'operator new'
+template <typename CoeffReturnType>
+struct non_integral_type_placement_new<Eigen::SyclDevice, CoeffReturnType> {
+ template <typename StorageType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index, StorageType) {
+}
+};
+} // end namespace internal
+
+template<typename ArgType_, typename Device>
+struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
+{
+ typedef const typename internal::remove_all<ArgType_>::type ArgType;
+ typedef TensorForcedEvalOp<ArgType> XprType;
+ typedef typename ArgType::Scalar Scalar;
+ typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
+ typedef typename XprType::Index Index;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+ typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+
+ enum {
+ IsAligned = true,
+ PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
+ BlockAccess = internal::is_arithmetic<CoeffReturnType>::value,
+ PreferBlockAccess = false,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ RawAccess = true
+ };
+
+ static const int NumDims = internal::traits<ArgType>::NumDimensions;
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+ typedef typename internal::TensorMaterializedBlock<CoeffReturnType, NumDims,
+ Layout, Index>
+ TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ TensorEvaluator(const XprType& op, const Device& device)
+ : m_impl(op.expression(), device), m_op(op.expression()),
+ m_device(device), m_buffer(NULL)
+ { }
+
+ EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
+ const Index numValues = internal::array_prod(m_impl.dimensions());
+ m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp(numValues * sizeof(CoeffReturnType)));
+
+ internal::non_integral_type_placement_new<Device, CoeffReturnType>()(numValues, m_buffer);
+
+ typedef TensorEvalToOp< const typename internal::remove_const<ArgType>::type > EvalTo;
+ EvalTo evalToTmp(m_device.get(m_buffer), m_op);
+
+ internal::TensorExecutor<
+ const EvalTo, typename internal::remove_const<Device>::type,
+ /*Vectorizable=*/internal::IsVectorizable<Device, const ArgType>::value,
+ /*Tiling=*/internal::IsTileable<Device, const ArgType>::value>::
+ run(evalToTmp, m_device);
+
+ return true;
+ }
+
+#ifdef EIGEN_USE_THREADS
+ template <typename EvalSubExprsCallback>
+ EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+ EvaluatorPointerType, EvalSubExprsCallback done) {
+ const Index numValues = internal::array_prod(m_impl.dimensions());
+ m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp(
+ numValues * sizeof(CoeffReturnType)));
+ typedef TensorEvalToOp<const typename internal::remove_const<ArgType>::type>
+ EvalTo;
+ EvalTo evalToTmp(m_device.get(m_buffer), m_op);
+
+ auto on_done = std::bind([](EvalSubExprsCallback done_) { done_(true); },
+ std::move(done));
+ internal::TensorAsyncExecutor<
+ const EvalTo, typename internal::remove_const<Device>::type,
+ decltype(on_done),
+ /*Vectorizable=*/internal::IsVectorizable<Device, const ArgType>::value,
+ /*Tiling=*/internal::IsTileable<Device, const ArgType>::value>::
+ runAsync(evalToTmp, m_device, std::move(on_done));
+ }
+#endif
+
+ EIGEN_STRONG_INLINE void cleanup() {
+ m_device.deallocate_temp(m_buffer);
+ m_buffer = NULL;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ return m_buffer[index];
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ {
+ return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ internal::TensorBlockResourceRequirements getResourceRequirements() const {
+ return internal::TensorBlockResourceRequirements::any();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+ block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+ bool /*root_of_expr_ast*/ = false) const {
+ assert(m_buffer != NULL);
+ return TensorBlock::materialize(m_buffer, m_impl.dimensions(), desc, scratch);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ EvaluatorPointerType data() const { return m_buffer; }
+
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_buffer.bind(cgh);
+ m_impl.bind(cgh);
+ }
+#endif
+ private:
+ TensorEvaluator<ArgType, Device> m_impl;
+ const ArgType m_op;
+ const Device EIGEN_DEVICE_REF m_device;
+ EvaluatorPointerType m_buffer;
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorForwardDeclarations.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorForwardDeclarations.h
new file mode 100644
index 0000000..246ebe4
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorForwardDeclarations.h
@@ -0,0 +1,191 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H
+#define EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H
+
+namespace Eigen {
+
+// MakePointer class is used as a container of the address space of the pointer
+// on the host and on the device. From the host side it generates the T* pointer
+// and when EIGEN_USE_SYCL is used it construct a buffer with a map_allocator to
+// T* m_data on the host. It is always called on the device.
+// Specialisation of MakePointer class for creating the sycl buffer with
+// map_allocator.
+template<typename T> struct MakePointer {
+ typedef T* Type;
+ typedef const T* ConstType;
+};
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T* constCast(const T* data) {
+ return const_cast<T*>(data);
+}
+
+// The StorageMemory class is a container of the device specific pointer
+// used for refering to a Pointer on TensorEvaluator class. While the TensorExpression
+// is a device-agnostic type and need MakePointer class for type conversion,
+// the TensorEvaluator class can be specialized for a device, hence it is possible
+// to construct different types of temproray storage memory in TensorEvaluator
+// for different devices by specializing the following StorageMemory class.
+template<typename T, typename device> struct StorageMemory: MakePointer <T> {};
+
+namespace internal{
+template<typename A, typename B> struct Pointer_type_promotion {
+ static const bool val=false;
+};
+template<typename A> struct Pointer_type_promotion<A, A> {
+ static const bool val = true;
+};
+template<typename A, typename B> struct TypeConversion {
+ typedef A* type;
+};
+}
+
+
+template<typename PlainObjectType, int Options_ = Unaligned, template <class> class MakePointer_ = MakePointer> class TensorMap;
+template<typename Scalar_, int NumIndices_, int Options_ = 0, typename IndexType = DenseIndex> class Tensor;
+template<typename Scalar_, typename Dimensions, int Options_ = 0, typename IndexType = DenseIndex> class TensorFixedSize;
+template<typename PlainObjectType> class TensorRef;
+template<typename Derived, int AccessLevel> class TensorBase;
+
+template<typename NullaryOp, typename PlainObjectType> class TensorCwiseNullaryOp;
+template<typename UnaryOp, typename XprType> class TensorCwiseUnaryOp;
+template<typename BinaryOp, typename LeftXprType, typename RightXprType> class TensorCwiseBinaryOp;
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType> class TensorCwiseTernaryOp;
+template<typename IfXprType, typename ThenXprType, typename ElseXprType> class TensorSelectOp;
+template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_ = MakePointer > class TensorReductionOp;
+template<typename XprType> class TensorIndexTupleOp;
+template<typename ReduceOp, typename Dims, typename XprType> class TensorTupleReducerOp;
+template<typename Axis, typename LeftXprType, typename RightXprType> class TensorConcatenationOp;
+template<typename Dimensions, typename LeftXprType, typename RightXprType, typename OutputKernelType> class TensorContractionOp;
+template<typename TargetType, typename XprType> class TensorConversionOp;
+template<typename Dimensions, typename InputXprType, typename KernelXprType> class TensorConvolutionOp;
+template<typename FFT, typename XprType, int FFTDataType, int FFTDirection> class TensorFFTOp;
+template<typename PatchDim, typename XprType> class TensorPatchOp;
+template<DenseIndex Rows, DenseIndex Cols, typename XprType> class TensorImagePatchOp;
+template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType> class TensorVolumePatchOp;
+template<typename Broadcast, typename XprType> class TensorBroadcastingOp;
+template<DenseIndex DimId, typename XprType> class TensorChippingOp;
+template<typename NewDimensions, typename XprType> class TensorReshapingOp;
+template<typename XprType> class TensorLayoutSwapOp;
+template<typename StartIndices, typename Sizes, typename XprType> class TensorSlicingOp;
+template<typename ReverseDimensions, typename XprType> class TensorReverseOp;
+template<typename PaddingDimensions, typename XprType> class TensorPaddingOp;
+template<typename Shuffle, typename XprType> class TensorShufflingOp;
+template<typename Strides, typename XprType> class TensorStridingOp;
+template<typename StartIndices, typename StopIndices, typename Strides, typename XprType> class TensorStridingSlicingOp;
+template<typename Strides, typename XprType> class TensorInflationOp;
+template<typename Generator, typename XprType> class TensorGeneratorOp;
+template<typename LeftXprType, typename RightXprType> class TensorAssignOp;
+template<typename Op, typename XprType> class TensorScanOp;
+template<typename Dims, typename XprType> class TensorTraceOp;
+
+template<typename CustomUnaryFunc, typename XprType> class TensorCustomUnaryOp;
+template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> class TensorCustomBinaryOp;
+
+template<typename XprType, template <class> class MakePointer_ = MakePointer> class TensorEvalToOp;
+template<typename XprType> class TensorForcedEvalOp;
+
+template<typename ExpressionType, typename DeviceType> class TensorDevice;
+template<typename ExpressionType, typename DeviceType, typename DoneCallback> class TensorAsyncDevice;
+template<typename Derived, typename Device> struct TensorEvaluator;
+
+struct NoOpOutputKernel;
+
+struct DefaultDevice;
+struct ThreadPoolDevice;
+struct GpuDevice;
+struct SyclDevice;
+
+#ifdef EIGEN_USE_SYCL
+
+template <typename T> struct MakeSYCLPointer {
+ typedef Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T> Type;
+};
+
+template <typename T>
+EIGEN_STRONG_INLINE const Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T>&
+constCast(const Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T>& data) {
+ return data;
+}
+
+template <typename T>
+struct StorageMemory<T, SyclDevice> : MakeSYCLPointer<T> {};
+template <typename T>
+struct StorageMemory<T, const SyclDevice> : StorageMemory<T, SyclDevice> {};
+
+namespace TensorSycl {
+namespace internal{
+template <typename Evaluator, typename Op> class GenericNondeterministicReducer;
+}
+}
+#endif
+
+
+enum FFTResultType {
+ RealPart = 0,
+ ImagPart = 1,
+ BothParts = 2
+};
+
+enum FFTDirection {
+ FFT_FORWARD = 0,
+ FFT_REVERSE = 1
+};
+
+
+namespace internal {
+
+template <typename Device, typename Expression>
+struct IsVectorizable {
+ static const bool value = TensorEvaluator<Expression, Device>::PacketAccess;
+};
+
+template <typename Expression>
+struct IsVectorizable<GpuDevice, Expression> {
+ static const bool value = TensorEvaluator<Expression, GpuDevice>::PacketAccess &&
+ TensorEvaluator<Expression, GpuDevice>::IsAligned;
+};
+
+// Tiled evaluation strategy.
+enum TiledEvaluation {
+ Off = 0, // tiled evaluation is not supported
+ On = 1, // still work in progress (see TensorBlock.h)
+};
+
+template <typename Device, typename Expression>
+struct IsTileable {
+ // Check that block evaluation is supported and it's a preferred option (at
+ // least one sub-expression has much faster block evaluation, e.g.
+ // broadcasting).
+ static const bool BlockAccess =
+ TensorEvaluator<Expression, Device>::BlockAccess &&
+ TensorEvaluator<Expression, Device>::PreferBlockAccess;
+
+ static const TiledEvaluation value =
+ BlockAccess ? TiledEvaluation::On : TiledEvaluation::Off;
+};
+
+template <typename Expression, typename Device,
+ bool Vectorizable = IsVectorizable<Device, Expression>::value,
+ TiledEvaluation Tiling = IsTileable<Device, Expression>::value>
+class TensorExecutor;
+
+template <typename Expression, typename Device, typename DoneCallback,
+ bool Vectorizable = IsVectorizable<Device, Expression>::value,
+ TiledEvaluation Tiling = IsTileable<Device, Expression>::value>
+class TensorAsyncExecutor;
+
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorFunctors.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorFunctors.h
new file mode 100644
index 0000000..d963032
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorFunctors.h
@@ -0,0 +1,488 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H
+#define EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H
+
+namespace Eigen {
+namespace internal {
+
+
+/** \internal
+ * \brief Template functor to compute the modulo between an array and a scalar.
+ */
+template <typename Scalar>
+struct scalar_mod_op {
+ EIGEN_DEVICE_FUNC scalar_mod_op(const Scalar& divisor) : m_divisor(divisor) {}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a % m_divisor; }
+ const Scalar m_divisor;
+};
+template <typename Scalar>
+struct functor_traits<scalar_mod_op<Scalar> >
+{ enum { Cost = scalar_div_cost<Scalar,false>::value, PacketAccess = false }; };
+
+
+/** \internal
+ * \brief Template functor to compute the modulo between 2 arrays.
+ */
+template <typename Scalar>
+struct scalar_mod2_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; }
+};
+template <typename Scalar>
+struct functor_traits<scalar_mod2_op<Scalar> >
+{ enum { Cost = scalar_div_cost<Scalar,false>::value, PacketAccess = false }; };
+
+template <typename Scalar>
+struct scalar_fmod_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar
+ operator()(const Scalar& a, const Scalar& b) const {
+ return numext::fmod(a, b);
+ }
+};
+template <typename Scalar>
+struct functor_traits<scalar_fmod_op<Scalar> > {
+ enum { Cost = 13, // Reciprocal throughput of FPREM on Haswell.
+ PacketAccess = false };
+};
+
+template<typename Reducer, typename Device>
+struct reducer_traits {
+ enum {
+ Cost = 1,
+ PacketAccess = false,
+ IsStateful = false,
+ IsExactlyAssociative = true
+ };
+};
+
+// Standard reduction functors
+template <typename T> struct SumReducer
+{
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
+ internal::scalar_sum_op<T> sum_op;
+ *accum = sum_op(*accum, t);
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
+ (*accum) = padd<Packet>(*accum, p);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
+ internal::scalar_cast_op<int, T> conv;
+ return conv(0);
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
+ return pset1<Packet>(initialize());
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
+ return accum;
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
+ return vaccum;
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
+ internal::scalar_sum_op<T> sum_op;
+ return sum_op(saccum, predux(vaccum));
+ }
+};
+
+template <typename T, typename Device>
+struct reducer_traits<SumReducer<T>, Device> {
+ enum {
+ Cost = NumTraits<T>::AddCost,
+ PacketAccess = PacketType<T, Device>::HasAdd,
+ IsStateful = false,
+ IsExactlyAssociative = NumTraits<T>::IsInteger
+ };
+};
+
+template <typename T> struct MeanReducer
+{
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ MeanReducer() : scalarCount_(0), packetCount_(0) { }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) {
+ internal::scalar_sum_op<T> sum_op;
+ *accum = sum_op(*accum, t);
+ scalarCount_++;
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) {
+ (*accum) = padd<Packet>(*accum, p);
+ packetCount_++;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
+ internal::scalar_cast_op<int, T> conv;
+ return conv(0);
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
+ return pset1<Packet>(initialize());
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
+ internal::scalar_quotient_op<T> quotient_op;
+ return quotient_op(accum, T(scalarCount_));
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
+ return pdiv(vaccum, pset1<Packet>(T(packetCount_)));
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
+ internal::scalar_sum_op<T> sum_op;
+ internal::scalar_quotient_op<T> quotient_op;
+ return quotient_op(
+ sum_op(saccum, predux(vaccum)),
+ T(scalarCount_ + packetCount_ * unpacket_traits<Packet>::size));
+ }
+
+ protected:
+ DenseIndex scalarCount_;
+ DenseIndex packetCount_;
+};
+
+template <typename T, typename Device>
+struct reducer_traits<MeanReducer<T>, Device> {
+ enum {
+ Cost = NumTraits<T>::AddCost,
+ PacketAccess = PacketType<T, Device>::HasAdd &&
+ PacketType<T, Device>::HasDiv && !NumTraits<T>::IsInteger,
+ IsStateful = true,
+ IsExactlyAssociative = NumTraits<T>::IsInteger
+ };
+};
+
+
+template <typename T, bool IsMax = true, bool IsInteger = true>
+struct MinMaxBottomValue {
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
+ return Eigen::NumTraits<T>::lowest();
+ }
+};
+template <typename T>
+struct MinMaxBottomValue<T, true, false> {
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
+ return -Eigen::NumTraits<T>::infinity();
+ }
+};
+template <typename T>
+struct MinMaxBottomValue<T, false, true> {
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
+ return Eigen::NumTraits<T>::highest();
+ }
+};
+template <typename T>
+struct MinMaxBottomValue<T, false, false> {
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
+ return Eigen::NumTraits<T>::infinity();
+ }
+};
+
+
+template <typename T, int NaNPropagation=PropagateFast> struct MaxReducer
+{
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
+ scalar_max_op<T, T, NaNPropagation> op;
+ *accum = op(t, *accum);
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
+ scalar_max_op<T, T, NaNPropagation> op;
+ (*accum) = op.packetOp(*accum, p);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
+ return MinMaxBottomValue<T, /*IsMax=*/true, Eigen::NumTraits<T>::IsInteger>::bottom_value();
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
+ return pset1<Packet>(initialize());
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
+ return accum;
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
+ return vaccum;
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
+ scalar_max_op<T, T, NaNPropagation> op;
+ return op(saccum, op.predux(vaccum));
+ }
+};
+
+template <typename T, typename Device, int NaNPropagation>
+ struct reducer_traits<MaxReducer<T, NaNPropagation>, Device> {
+ enum {
+ Cost = NumTraits<T>::AddCost,
+ PacketAccess = PacketType<T, Device>::HasMax,
+ IsStateful = false,
+ IsExactlyAssociative = (NaNPropagation!=PropagateFast)
+ };
+};
+
+template <typename T, int NaNPropagation=PropagateFast> struct MinReducer
+{
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
+ scalar_min_op<T, T, NaNPropagation> op;
+ *accum = op(t, *accum);
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
+ scalar_min_op<T, T, NaNPropagation> op;
+ (*accum) = op.packetOp(*accum, p);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
+ return MinMaxBottomValue<T, /*IsMax=*/false, Eigen::NumTraits<T>::IsInteger>::bottom_value();
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
+ return pset1<Packet>(initialize());
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
+ return accum;
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
+ return vaccum;
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
+ scalar_min_op<T, T, NaNPropagation> op;
+ return op(saccum, op.predux(vaccum));
+ }
+};
+
+template <typename T, typename Device, int NaNPropagation>
+ struct reducer_traits<MinReducer<T, NaNPropagation>, Device> {
+ enum {
+ Cost = NumTraits<T>::AddCost,
+ PacketAccess = PacketType<T, Device>::HasMin,
+ IsStateful = false,
+ IsExactlyAssociative = (NaNPropagation!=PropagateFast)
+ };
+};
+
+template <typename T> struct ProdReducer
+{
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
+ internal::scalar_product_op<T> prod_op;
+ (*accum) = prod_op(*accum, t);
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
+ (*accum) = pmul<Packet>(*accum, p);
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
+ internal::scalar_cast_op<int, T> conv;
+ return conv(1);
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
+ return pset1<Packet>(initialize());
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
+ return accum;
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
+ return vaccum;
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
+ internal::scalar_product_op<T> prod_op;
+ return prod_op(saccum, predux_mul(vaccum));
+ }
+};
+
+template <typename T, typename Device>
+struct reducer_traits<ProdReducer<T>, Device> {
+ enum {
+ Cost = NumTraits<T>::MulCost,
+ PacketAccess = PacketType<T, Device>::HasMul,
+ IsStateful = false,
+ IsExactlyAssociative = true
+ };
+};
+
+
+struct AndReducer
+{
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const {
+ *accum = *accum && t;
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool initialize() const {
+ return true;
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool finalize(bool accum) const {
+ return accum;
+ }
+};
+
+template <typename Device>
+struct reducer_traits<AndReducer, Device> {
+ enum {
+ Cost = 1,
+ PacketAccess = false,
+ IsStateful = false,
+ IsExactlyAssociative = true
+ };
+};
+
+
+struct OrReducer {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const {
+ *accum = *accum || t;
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool initialize() const {
+ return false;
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool finalize(bool accum) const {
+ return accum;
+ }
+};
+
+template <typename Device>
+struct reducer_traits<OrReducer, Device> {
+ enum {
+ Cost = 1,
+ PacketAccess = false,
+ IsStateful = false,
+ IsExactlyAssociative = true
+ };
+};
+
+// Argmin/Argmax reducers. Returns the first occurrence if multiple locations
+// contain the same min/max value.
+template <typename T> struct ArgMaxTupleReducer
+{
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
+ if (t.second < accum->second) {
+ return;
+ } else if (t.second > accum->second || accum->first > t.first ) {
+ *accum = t;
+ }
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
+ return T(0, NumTraits<typename T::second_type>::lowest());
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T& accum) const {
+ return accum;
+ }
+};
+
+template <typename T, typename Device>
+struct reducer_traits<ArgMaxTupleReducer<T>, Device> {
+ enum {
+ Cost = NumTraits<T>::AddCost,
+ PacketAccess = false,
+ IsStateful = false,
+ IsExactlyAssociative = true
+ };
+};
+
+
+template <typename T> struct ArgMinTupleReducer
+{
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T& t, T* accum) const {
+ if (t.second > accum->second) {
+ return;
+ } else if (t.second < accum->second || accum->first > t.first) {
+ *accum = t;
+ }
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
+ return T(0, NumTraits<typename T::second_type>::highest());
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T& accum) const {
+ return accum;
+ }
+};
+
+template <typename T, typename Device>
+struct reducer_traits<ArgMinTupleReducer<T>, Device> {
+ enum {
+ Cost = NumTraits<T>::AddCost,
+ PacketAccess = false,
+ IsStateful = false,
+ IsExactlyAssociative = true
+ };
+};
+
+
+template <typename T, typename Index, size_t NumDims>
+class GaussianGenerator {
+ public:
+ static const bool PacketAccess = false;
+
+ EIGEN_DEVICE_FUNC GaussianGenerator(const array<T, NumDims>& means,
+ const array<T, NumDims>& std_devs)
+ : m_means(means)
+ {
+ EIGEN_UNROLL_LOOP
+ for (size_t i = 0; i < NumDims; ++i) {
+ m_two_sigmas[i] = std_devs[i] * std_devs[i] * 2;
+ }
+ }
+
+ EIGEN_DEVICE_FUNC T operator()(const array<Index, NumDims>& coordinates) const {
+ T tmp = T(0);
+ EIGEN_UNROLL_LOOP
+ for (size_t i = 0; i < NumDims; ++i) {
+ T offset = coordinates[i] - m_means[i];
+ tmp += offset * offset / m_two_sigmas[i];
+ }
+ return numext::exp(-tmp);
+ }
+
+ private:
+ array<T, NumDims> m_means;
+ array<T, NumDims> m_two_sigmas;
+};
+
+template <typename T, typename Index, size_t NumDims>
+struct functor_traits<GaussianGenerator<T, Index, NumDims> > {
+ enum {
+ Cost = NumDims * (2 * NumTraits<T>::AddCost + NumTraits<T>::MulCost +
+ functor_traits<scalar_quotient_op<T, T> >::Cost) +
+ functor_traits<scalar_exp_op<T> >::Cost,
+ PacketAccess = GaussianGenerator<T, Index, NumDims>::PacketAccess
+ };
+};
+
+template <typename Scalar>
+struct scalar_clamp_op {
+ EIGEN_DEVICE_FUNC inline scalar_clamp_op(const Scalar& _min, const Scalar& _max) : m_min(_min), m_max(_max) {}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar
+ operator()(const Scalar& x) const {
+ return numext::mini(numext::maxi(x, m_min), m_max);
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet
+ packetOp(const Packet& x) const {
+ return internal::pmin(internal::pmax(x, pset1<Packet>(m_min)), pset1<Packet>(m_max));
+ }
+ const Scalar m_min;
+ const Scalar m_max;
+};
+template<typename Scalar>
+struct functor_traits<scalar_clamp_op<Scalar> >
+{ enum { Cost = 2 * NumTraits<Scalar>::AddCost, PacketAccess = (packet_traits<Scalar>::HasMin && packet_traits<Scalar>::HasMax)}; };
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorGenerator.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorGenerator.h
new file mode 100644
index 0000000..174bf06
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorGenerator.h
@@ -0,0 +1,302 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H
+#define EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H
+
+namespace Eigen {
+
+/** \class TensorGeneratorOp
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor generator class.
+ *
+ *
+ */
+namespace internal {
+template<typename Generator, typename XprType>
+struct traits<TensorGeneratorOp<Generator, XprType> > : public traits<XprType>
+{
+ typedef typename XprType::Scalar Scalar;
+ typedef traits<XprType> XprTraits;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprTraits::Index Index;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = XprTraits::NumDimensions;
+ static const int Layout = XprTraits::Layout;
+ typedef typename XprTraits::PointerType PointerType;
+};
+
+template<typename Generator, typename XprType>
+struct eval<TensorGeneratorOp<Generator, XprType>, Eigen::Dense>
+{
+ typedef const TensorGeneratorOp<Generator, XprType>& type;
+};
+
+template<typename Generator, typename XprType>
+struct nested<TensorGeneratorOp<Generator, XprType>, 1, typename eval<TensorGeneratorOp<Generator, XprType> >::type>
+{
+ typedef TensorGeneratorOp<Generator, XprType> type;
+};
+
+} // end namespace internal
+
+
+
+template<typename Generator, typename XprType>
+class TensorGeneratorOp : public TensorBase<TensorGeneratorOp<Generator, XprType>, ReadOnlyAccessors>
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorGeneratorOp>::Scalar Scalar;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename Eigen::internal::nested<TensorGeneratorOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorGeneratorOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorGeneratorOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorGeneratorOp(const XprType& expr, const Generator& generator)
+ : m_xpr(expr), m_generator(generator) {}
+
+ EIGEN_DEVICE_FUNC
+ const Generator& generator() const { return m_generator; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ expression() const { return m_xpr; }
+
+ protected:
+ typename XprType::Nested m_xpr;
+ const Generator m_generator;
+};
+
+
+// Eval as rvalue
+template<typename Generator, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
+{
+ typedef TensorGeneratorOp<Generator, ArgType> XprType;
+ typedef typename XprType::Index Index;
+ typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
+ static const int NumDims = internal::array_size<Dimensions>::value;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+ enum {
+ IsAligned = false,
+ PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
+ BlockAccess = true,
+ PreferBlockAccess = true,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
+ };
+
+ typedef internal::TensorIntDivisor<Index> IndexDivisor;
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+ typedef typename internal::TensorMaterializedBlock<CoeffReturnType, NumDims,
+ Layout, Index>
+ TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_device(device), m_generator(op.generator())
+ {
+ TensorEvaluator<ArgType, Device> argImpl(op.expression(), device);
+ m_dimensions = argImpl.dimensions();
+
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ m_strides[0] = 1;
+ EIGEN_UNROLL_LOOP
+ for (int i = 1; i < NumDims; ++i) {
+ m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1];
+ if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]);
+ }
+ } else {
+ m_strides[NumDims - 1] = 1;
+ EIGEN_UNROLL_LOOP
+ for (int i = NumDims - 2; i >= 0; --i) {
+ m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1];
+ if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]);
+ }
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
+ return true;
+ }
+ EIGEN_STRONG_INLINE void cleanup() {
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ array<Index, NumDims> coords;
+ extract_coordinates(index, coords);
+ return m_generator(coords);
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ {
+ const int packetSize = PacketType<CoeffReturnType, Device>::size;
+ EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+packetSize-1 < dimensions().TotalSize());
+
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+ for (int i = 0; i < packetSize; ++i) {
+ values[i] = coeff(index+i);
+ }
+ PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+ return rslt;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ internal::TensorBlockResourceRequirements getResourceRequirements() const {
+ const size_t target_size = m_device.firstLevelCacheSize();
+ // TODO(ezhulenev): Generator should have a cost.
+ return internal::TensorBlockResourceRequirements::skewed<Scalar>(
+ target_size);
+ }
+
+ struct BlockIteratorState {
+ Index stride;
+ Index span;
+ Index size;
+ Index count;
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+ block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+ bool /*root_of_expr_ast*/ = false) const {
+ static const bool is_col_major =
+ static_cast<int>(Layout) == static_cast<int>(ColMajor);
+
+ // Compute spatial coordinates for the first block element.
+ array<Index, NumDims> coords;
+ extract_coordinates(desc.offset(), coords);
+ array<Index, NumDims> initial_coords = coords;
+
+ // Offset in the output block buffer.
+ Index offset = 0;
+
+ // Initialize output block iterator state. Dimension in this array are
+ // always in inner_most -> outer_most order (col major layout).
+ array<BlockIteratorState, NumDims> it;
+ for (int i = 0; i < NumDims; ++i) {
+ const int dim = is_col_major ? i : NumDims - 1 - i;
+ it[i].size = desc.dimension(dim);
+ it[i].stride = i == 0 ? 1 : (it[i - 1].size * it[i - 1].stride);
+ it[i].span = it[i].stride * (it[i].size - 1);
+ it[i].count = 0;
+ }
+ eigen_assert(it[0].stride == 1);
+
+ // Prepare storage for the materialized generator result.
+ const typename TensorBlock::Storage block_storage =
+ TensorBlock::prepareStorage(desc, scratch);
+
+ CoeffReturnType* block_buffer = block_storage.data();
+
+ static const int packet_size = PacketType<CoeffReturnType, Device>::size;
+
+ static const int inner_dim = is_col_major ? 0 : NumDims - 1;
+ const Index inner_dim_size = it[0].size;
+ const Index inner_dim_vectorized = inner_dim_size - packet_size;
+
+ while (it[NumDims - 1].count < it[NumDims - 1].size) {
+ Index i = 0;
+ // Generate data for the vectorized part of the inner-most dimension.
+ for (; i <= inner_dim_vectorized; i += packet_size) {
+ for (Index j = 0; j < packet_size; ++j) {
+ array<Index, NumDims> j_coords = coords; // Break loop dependence.
+ j_coords[inner_dim] += j;
+ *(block_buffer + offset + i + j) = m_generator(j_coords);
+ }
+ coords[inner_dim] += packet_size;
+ }
+ // Finalize non-vectorized part of the inner-most dimension.
+ for (; i < inner_dim_size; ++i) {
+ *(block_buffer + offset + i) = m_generator(coords);
+ coords[inner_dim]++;
+ }
+ coords[inner_dim] = initial_coords[inner_dim];
+
+ // For the 1d tensor we need to generate only one inner-most dimension.
+ if (NumDims == 1) break;
+
+ // Update offset.
+ for (i = 1; i < NumDims; ++i) {
+ if (++it[i].count < it[i].size) {
+ offset += it[i].stride;
+ coords[is_col_major ? i : NumDims - 1 - i]++;
+ break;
+ }
+ if (i != NumDims - 1) it[i].count = 0;
+ coords[is_col_major ? i : NumDims - 1 - i] =
+ initial_coords[is_col_major ? i : NumDims - 1 - i];
+ offset -= it[i].span;
+ }
+ }
+
+ return block_storage.AsTensorMaterializedBlock();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool) const {
+ // TODO(rmlarsen): This is just a placeholder. Define interface to make
+ // generators return their cost.
+ return TensorOpCost(0, 0, TensorOpCost::AddCost<Scalar>() +
+ TensorOpCost::MulCost<Scalar>());
+ }
+
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler&) const {}
+#endif
+
+ protected:
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ void extract_coordinates(Index index, array<Index, NumDims>& coords) const {
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = NumDims - 1; i > 0; --i) {
+ const Index idx = index / m_fast_strides[i];
+ index -= idx * m_strides[i];
+ coords[i] = idx;
+ }
+ coords[0] = index;
+ } else {
+ for (int i = 0; i < NumDims - 1; ++i) {
+ const Index idx = index / m_fast_strides[i];
+ index -= idx * m_strides[i];
+ coords[i] = idx;
+ }
+ coords[NumDims-1] = index;
+ }
+ }
+
+ const Device EIGEN_DEVICE_REF m_device;
+ Dimensions m_dimensions;
+ array<Index, NumDims> m_strides;
+ array<IndexDivisor, NumDims> m_fast_strides;
+ Generator m_generator;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorGlobalFunctions.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorGlobalFunctions.h
new file mode 100644
index 0000000..665b861
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorGlobalFunctions.h
@@ -0,0 +1,33 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H
+#define EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H
+
+namespace Eigen {
+
+/** \cpp11 \returns an expression of the coefficient-wise betainc(\a x, \a a, \a b) to the given tensors.
+ *
+ * This function computes the regularized incomplete beta function (integral).
+ *
+ */
+template <typename ADerived, typename BDerived, typename XDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const
+ TensorCwiseTernaryOp<internal::scalar_betainc_op<typename XDerived::Scalar>,
+ const ADerived, const BDerived, const XDerived>
+ betainc(const ADerived& a, const BDerived& b, const XDerived& x) {
+ return TensorCwiseTernaryOp<
+ internal::scalar_betainc_op<typename XDerived::Scalar>, const ADerived,
+ const BDerived, const XDerived>(
+ a, b, x, internal::scalar_betainc_op<typename XDerived::Scalar>());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorGpuHipCudaDefines.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
new file mode 100644
index 0000000..cb53ce2
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
@@ -0,0 +1,99 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2018 Deven Desai <deven.desai.amd@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H)
+#define EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
+
+// Note that we are using EIGEN_USE_HIP here instead of EIGEN_HIPCC...this is by design
+// There is code in the Tensorflow codebase that will define EIGEN_USE_GPU, but
+// for some reason gets sent to the gcc/host compiler instead of the gpu/nvcc/hipcc compiler
+// When compiling such files, gcc will end up trying to pick up the CUDA headers by
+// default (see the code within "unsupported/Eigen/CXX11/Tensor" that is guarded by EIGEN_USE_GPU)
+// This will obviously not work when trying to compile tensorflow on a system with no CUDA
+// To work around this issue for HIP systems (and leave the default behaviour intact), the
+// HIP tensorflow build defines EIGEN_USE_HIP when compiling all source files, and
+// "unsupported/Eigen/CXX11/Tensor" has been updated to use HIP header when EIGEN_USE_HIP is
+// defined. In continuation of that requirement, the guard here needs to be EIGEN_USE_HIP as well
+
+#if defined(EIGEN_USE_HIP)
+
+#define gpuStream_t hipStream_t
+#define gpuDeviceProp_t hipDeviceProp_t
+#define gpuError_t hipError_t
+#define gpuSuccess hipSuccess
+#define gpuErrorNotReady hipErrorNotReady
+#define gpuGetDeviceCount hipGetDeviceCount
+#define gpuGetLastError hipGetLastError
+#define gpuPeekAtLastError hipPeekAtLastError
+#define gpuGetErrorName hipGetErrorName
+#define gpuGetErrorString hipGetErrorString
+#define gpuGetDeviceProperties hipGetDeviceProperties
+#define gpuStreamDefault hipStreamDefault
+#define gpuGetDevice hipGetDevice
+#define gpuSetDevice hipSetDevice
+#define gpuMalloc hipMalloc
+#define gpuFree hipFree
+#define gpuMemsetAsync hipMemsetAsync
+#define gpuMemcpyAsync hipMemcpyAsync
+#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+#define gpuStreamQuery hipStreamQuery
+#define gpuSharedMemConfig hipSharedMemConfig
+#define gpuDeviceSetSharedMemConfig hipDeviceSetSharedMemConfig
+#define gpuStreamSynchronize hipStreamSynchronize
+#define gpuDeviceSynchronize hipDeviceSynchronize
+#define gpuMemcpy hipMemcpy
+
+#else
+
+#define gpuStream_t cudaStream_t
+#define gpuDeviceProp_t cudaDeviceProp
+#define gpuError_t cudaError_t
+#define gpuSuccess cudaSuccess
+#define gpuErrorNotReady cudaErrorNotReady
+#define gpuGetDeviceCount cudaGetDeviceCount
+#define gpuGetLastError cudaGetLastError
+#define gpuPeekAtLastError cudaPeekAtLastError
+#define gpuGetErrorName cudaGetErrorName
+#define gpuGetErrorString cudaGetErrorString
+#define gpuGetDeviceProperties cudaGetDeviceProperties
+#define gpuStreamDefault cudaStreamDefault
+#define gpuGetDevice cudaGetDevice
+#define gpuSetDevice cudaSetDevice
+#define gpuMalloc cudaMalloc
+#define gpuFree cudaFree
+#define gpuMemsetAsync cudaMemsetAsync
+#define gpuMemcpyAsync cudaMemcpyAsync
+#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
+#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
+#define gpuStreamQuery cudaStreamQuery
+#define gpuSharedMemConfig cudaSharedMemConfig
+#define gpuDeviceSetSharedMemConfig cudaDeviceSetSharedMemConfig
+#define gpuStreamSynchronize cudaStreamSynchronize
+#define gpuDeviceSynchronize cudaDeviceSynchronize
+#define gpuMemcpy cudaMemcpy
+
+#endif
+
+// gpu_assert can be overridden
+#ifndef gpu_assert
+
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+// HIPCC do not support the use of assert on the GPU side.
+#define gpu_assert(COND)
+#else
+#define gpu_assert(COND) assert(COND)
+#endif
+
+#endif // gpu_assert
+
+#endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h
new file mode 100644
index 0000000..1d142f2
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h
@@ -0,0 +1,44 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2018 Deven Desai <deven.desai.amd@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#if defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H)
+
+#ifndef EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES
+
+#undef gpuStream_t
+#undef gpuDeviceProp_t
+#undef gpuError_t
+#undef gpuSuccess
+#undef gpuErrorNotReady
+#undef gpuGetDeviceCount
+#undef gpuGetErrorString
+#undef gpuGetDeviceProperties
+#undef gpuStreamDefault
+#undef gpuGetDevice
+#undef gpuSetDevice
+#undef gpuMalloc
+#undef gpuFree
+#undef gpuMemsetAsync
+#undef gpuMemcpyAsync
+#undef gpuMemcpyDeviceToDevice
+#undef gpuMemcpyDeviceToHost
+#undef gpuMemcpyHostToDevice
+#undef gpuStreamQuery
+#undef gpuSharedMemConfig
+#undef gpuDeviceSetSharedMemConfig
+#undef gpuStreamSynchronize
+#undef gpuDeviceSynchronize
+#undef gpuMemcpy
+
+#endif // EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES
+
+#undef EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
+
+#endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorIO.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorIO.h
new file mode 100644
index 0000000..a901c5d
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorIO.h
@@ -0,0 +1,79 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_IO_H
+#define EIGEN_CXX11_TENSOR_TENSOR_IO_H
+
+namespace Eigen {
+
+namespace internal {
+
+// Print the tensor as a 2d matrix
+template <typename Tensor, int Rank>
+struct TensorPrinter {
+ static void run (std::ostream& os, const Tensor& tensor) {
+ typedef typename internal::remove_const<typename Tensor::Scalar>::type Scalar;
+ typedef typename Tensor::Index Index;
+ const Index total_size = internal::array_prod(tensor.dimensions());
+ if (total_size > 0) {
+ const Index first_dim = Eigen::internal::array_get<0>(tensor.dimensions());
+ static const int layout = Tensor::Layout;
+ Map<const Array<Scalar, Dynamic, Dynamic, layout> > matrix(const_cast<Scalar*>(tensor.data()), first_dim, total_size/first_dim);
+ os << matrix;
+ }
+ }
+};
+
+
+// Print the tensor as a vector
+template <typename Tensor>
+struct TensorPrinter<Tensor, 1> {
+ static void run (std::ostream& os, const Tensor& tensor) {
+ typedef typename internal::remove_const<typename Tensor::Scalar>::type Scalar;
+ typedef typename Tensor::Index Index;
+ const Index total_size = internal::array_prod(tensor.dimensions());
+ if (total_size > 0) {
+ Map<const Array<Scalar, Dynamic, 1> > array(const_cast<Scalar*>(tensor.data()), total_size);
+ os << array;
+ }
+ }
+};
+
+
+// Print the tensor as a scalar
+template <typename Tensor>
+struct TensorPrinter<Tensor, 0> {
+ static void run (std::ostream& os, const Tensor& tensor) {
+ os << tensor.coeff(0);
+ }
+};
+}
+
+template <typename T>
+std::ostream& operator << (std::ostream& os, const TensorBase<T, ReadOnlyAccessors>& expr) {
+ typedef TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice> Evaluator;
+ typedef typename Evaluator::Dimensions Dimensions;
+
+ // Evaluate the expression if needed
+ TensorForcedEvalOp<const T> eval = expr.eval();
+ Evaluator tensor(eval, DefaultDevice());
+ tensor.evalSubExprsIfNeeded(NULL);
+
+ // Print the result
+ static const int rank = internal::array_size<Dimensions>::value;
+ internal::TensorPrinter<Evaluator, rank>::run(os, tensor);
+
+ // Cleanup.
+ tensor.cleanup();
+ return os;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_IO_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorImagePatch.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorImagePatch.h
new file mode 100644
index 0000000..dd51850
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorImagePatch.h
@@ -0,0 +1,603 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H
+#define EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H
+
+namespace Eigen {
+
+/** \class TensorImagePatch
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Patch extraction specialized for image processing.
+ * This assumes that the input has a least 3 dimensions ordered as follow:
+ * 1st dimension: channels (of size d)
+ * 2nd dimension: rows (of size r)
+ * 3rd dimension: columns (of size c)
+ * There can be additional dimensions such as time (for video) or batch (for
+ * bulk processing after the first 3.
+ * Calling the image patch code with patch_rows and patch_cols is equivalent
+ * to calling the regular patch extraction code with parameters d, patch_rows,
+ * patch_cols, and 1 for all the additional dimensions.
+ */
+namespace internal {
+
+template<DenseIndex Rows, DenseIndex Cols, typename XprType>
+struct traits<TensorImagePatchOp<Rows, Cols, XprType> > : public traits<XprType>
+{
+ typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+ typedef traits<XprType> XprTraits;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprTraits::Index Index;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = XprTraits::NumDimensions + 1;
+ static const int Layout = XprTraits::Layout;
+ typedef typename XprTraits::PointerType PointerType;
+};
+
+template<DenseIndex Rows, DenseIndex Cols, typename XprType>
+struct eval<TensorImagePatchOp<Rows, Cols, XprType>, Eigen::Dense>
+{
+ typedef const TensorImagePatchOp<Rows, Cols, XprType>& type;
+};
+
+template<DenseIndex Rows, DenseIndex Cols, typename XprType>
+struct nested<TensorImagePatchOp<Rows, Cols, XprType>, 1, typename eval<TensorImagePatchOp<Rows, Cols, XprType> >::type>
+{
+ typedef TensorImagePatchOp<Rows, Cols, XprType> type;
+};
+
+template <typename Self, bool Vectorizable>
+struct ImagePatchCopyOp {
+ typedef typename Self::Index Index;
+ typedef typename Self::Scalar Scalar;
+ typedef typename Self::Impl Impl;
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+ const Self& self, const Index num_coeff_to_copy, const Index dst_index,
+ Scalar* dst_data, const Index src_index) {
+ const Impl& impl = self.impl();
+ for (Index i = 0; i < num_coeff_to_copy; ++i) {
+ dst_data[dst_index + i] = impl.coeff(src_index + i);
+ }
+ }
+};
+
+template <typename Self>
+struct ImagePatchCopyOp<Self, true> {
+ typedef typename Self::Index Index;
+ typedef typename Self::Scalar Scalar;
+ typedef typename Self::Impl Impl;
+ typedef typename packet_traits<Scalar>::type Packet;
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+ const Self& self, const Index num_coeff_to_copy, const Index dst_index,
+ Scalar* dst_data, const Index src_index) {
+ const Impl& impl = self.impl();
+ const Index packet_size = internal::unpacket_traits<Packet>::size;
+ const Index vectorized_size =
+ (num_coeff_to_copy / packet_size) * packet_size;
+ for (Index i = 0; i < vectorized_size; i += packet_size) {
+ Packet p = impl.template packet<Unaligned>(src_index + i);
+ internal::pstoret<Scalar, Packet, Unaligned>(dst_data + dst_index + i, p);
+ }
+ for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
+ dst_data[dst_index + i] = impl.coeff(src_index + i);
+ }
+ }
+};
+
+template <typename Self>
+struct ImagePatchPaddingOp {
+ typedef typename Self::Index Index;
+ typedef typename Self::Scalar Scalar;
+ typedef typename packet_traits<Scalar>::type Packet;
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
+ const Index num_coeff_to_pad, const Scalar padding_value,
+ const Index dst_index, Scalar* dst_data) {
+ const Index packet_size = internal::unpacket_traits<Packet>::size;
+ const Packet padded_packet = internal::pset1<Packet>(padding_value);
+ const Index vectorized_size =
+ (num_coeff_to_pad / packet_size) * packet_size;
+ for (Index i = 0; i < vectorized_size; i += packet_size) {
+ internal::pstoret<Scalar, Packet, Unaligned>(dst_data + dst_index + i,
+ padded_packet);
+ }
+ for (Index i = vectorized_size; i < num_coeff_to_pad; ++i) {
+ dst_data[dst_index + i] = padding_value;
+ }
+ }
+};
+
+} // end namespace internal
+
+template<DenseIndex Rows, DenseIndex Cols, typename XprType>
+class TensorImagePatchOp : public TensorBase<TensorImagePatchOp<Rows, Cols, XprType>, ReadOnlyAccessors>
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorImagePatchOp>::Scalar Scalar;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename Eigen::internal::nested<TensorImagePatchOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorImagePatchOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorImagePatchOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols,
+ DenseIndex row_strides, DenseIndex col_strides,
+ DenseIndex in_row_strides, DenseIndex in_col_strides,
+ DenseIndex row_inflate_strides, DenseIndex col_inflate_strides,
+ PaddingType padding_type, Scalar padding_value)
+ : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
+ m_row_strides(row_strides), m_col_strides(col_strides),
+ m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
+ m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
+ m_padding_explicit(false), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0),
+ m_padding_type(padding_type), m_padding_value(padding_value) {}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols,
+ DenseIndex row_strides, DenseIndex col_strides,
+ DenseIndex in_row_strides, DenseIndex in_col_strides,
+ DenseIndex row_inflate_strides, DenseIndex col_inflate_strides,
+ DenseIndex padding_top, DenseIndex padding_bottom,
+ DenseIndex padding_left, DenseIndex padding_right,
+ Scalar padding_value)
+ : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
+ m_row_strides(row_strides), m_col_strides(col_strides),
+ m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
+ m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
+ m_padding_explicit(true), m_padding_top(padding_top), m_padding_bottom(padding_bottom),
+ m_padding_left(padding_left), m_padding_right(padding_right),
+ m_padding_type(PADDING_VALID), m_padding_value(padding_value) {}
+
+
+ EIGEN_DEVICE_FUNC
+ DenseIndex patch_rows() const { return m_patch_rows; }
+ EIGEN_DEVICE_FUNC
+ DenseIndex patch_cols() const { return m_patch_cols; }
+ EIGEN_DEVICE_FUNC
+ DenseIndex row_strides() const { return m_row_strides; }
+ EIGEN_DEVICE_FUNC
+ DenseIndex col_strides() const { return m_col_strides; }
+ EIGEN_DEVICE_FUNC
+ DenseIndex in_row_strides() const { return m_in_row_strides; }
+ EIGEN_DEVICE_FUNC
+ DenseIndex in_col_strides() const { return m_in_col_strides; }
+ EIGEN_DEVICE_FUNC
+ DenseIndex row_inflate_strides() const { return m_row_inflate_strides; }
+ EIGEN_DEVICE_FUNC
+ DenseIndex col_inflate_strides() const { return m_col_inflate_strides; }
+ EIGEN_DEVICE_FUNC
+ bool padding_explicit() const { return m_padding_explicit; }
+ EIGEN_DEVICE_FUNC
+ DenseIndex padding_top() const { return m_padding_top; }
+ EIGEN_DEVICE_FUNC
+ DenseIndex padding_bottom() const { return m_padding_bottom; }
+ EIGEN_DEVICE_FUNC
+ DenseIndex padding_left() const { return m_padding_left; }
+ EIGEN_DEVICE_FUNC
+ DenseIndex padding_right() const { return m_padding_right; }
+ EIGEN_DEVICE_FUNC
+ PaddingType padding_type() const { return m_padding_type; }
+ EIGEN_DEVICE_FUNC
+ Scalar padding_value() const { return m_padding_value; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ expression() const { return m_xpr; }
+
+ protected:
+ typename XprType::Nested m_xpr;
+ const DenseIndex m_patch_rows;
+ const DenseIndex m_patch_cols;
+ const DenseIndex m_row_strides;
+ const DenseIndex m_col_strides;
+ const DenseIndex m_in_row_strides;
+ const DenseIndex m_in_col_strides;
+ const DenseIndex m_row_inflate_strides;
+ const DenseIndex m_col_inflate_strides;
+ const bool m_padding_explicit;
+ const DenseIndex m_padding_top;
+ const DenseIndex m_padding_bottom;
+ const DenseIndex m_padding_left;
+ const DenseIndex m_padding_right;
+ const PaddingType m_padding_type;
+ const Scalar m_padding_value;
+};
+
+// Eval as rvalue
+template<DenseIndex Rows, DenseIndex Cols, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
+{
+ typedef TensorImagePatchOp<Rows, Cols, ArgType> XprType;
+ typedef typename XprType::Index Index;
+ static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+ static const int NumDims = NumInputDims + 1;
+ typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+ typedef TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>,
+ Device> Self;
+ typedef TensorEvaluator<ArgType, Device> Impl;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+
+ enum {
+ IsAligned = false,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
+ PreferBlockAccess = true,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false,
+ RawAccess = false
+ };
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device)
+ : m_device(device), m_impl(op.expression(), device)
+ {
+ EIGEN_STATIC_ASSERT((NumDims >= 4), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+ m_paddingValue = op.padding_value();
+
+ const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+
+ // Caches a few variables.
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ m_inputDepth = input_dims[0];
+ m_inputRows = input_dims[1];
+ m_inputCols = input_dims[2];
+ } else {
+ m_inputDepth = input_dims[NumInputDims-1];
+ m_inputRows = input_dims[NumInputDims-2];
+ m_inputCols = input_dims[NumInputDims-3];
+ }
+
+ m_row_strides = op.row_strides();
+ m_col_strides = op.col_strides();
+
+ // Input strides and effective input/patch size
+ m_in_row_strides = op.in_row_strides();
+ m_in_col_strides = op.in_col_strides();
+ m_row_inflate_strides = op.row_inflate_strides();
+ m_col_inflate_strides = op.col_inflate_strides();
+ // The "effective" input rows and input cols are the input rows and cols
+ // after inflating them with zeros.
+ // For examples, a 2x3 matrix with row_inflate_strides and
+ // col_inflate_strides of 2 comes from:
+ // A B C
+ // D E F
+ //
+ // to a matrix is 3 x 5:
+ //
+ // A . B . C
+ // . . . . .
+ // D . E . F
+
+ m_input_rows_eff = (m_inputRows - 1) * m_row_inflate_strides + 1;
+ m_input_cols_eff = (m_inputCols - 1) * m_col_inflate_strides + 1;
+ m_patch_rows_eff = op.patch_rows() + (op.patch_rows() - 1) * (m_in_row_strides - 1);
+ m_patch_cols_eff = op.patch_cols() + (op.patch_cols() - 1) * (m_in_col_strides - 1);
+
+ if (op.padding_explicit()) {
+ m_outputRows = numext::ceil((m_input_rows_eff + op.padding_top() + op.padding_bottom() - m_patch_rows_eff + 1.f) / static_cast<float>(m_row_strides));
+ m_outputCols = numext::ceil((m_input_cols_eff + op.padding_left() + op.padding_right() - m_patch_cols_eff + 1.f) / static_cast<float>(m_col_strides));
+ m_rowPaddingTop = op.padding_top();
+ m_colPaddingLeft = op.padding_left();
+ } else {
+ // Computing padding from the type
+ switch (op.padding_type()) {
+ case PADDING_VALID:
+ m_outputRows = numext::ceil((m_input_rows_eff - m_patch_rows_eff + 1.f) / static_cast<float>(m_row_strides));
+ m_outputCols = numext::ceil((m_input_cols_eff - m_patch_cols_eff + 1.f) / static_cast<float>(m_col_strides));
+ // Calculate the padding
+ m_rowPaddingTop = numext::maxi<Index>(0, ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2);
+ m_colPaddingLeft = numext::maxi<Index>(0, ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2);
+ break;
+ case PADDING_SAME:
+ m_outputRows = numext::ceil(m_input_rows_eff / static_cast<float>(m_row_strides));
+ m_outputCols = numext::ceil(m_input_cols_eff / static_cast<float>(m_col_strides));
+ // Calculate the padding
+ m_rowPaddingTop = ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2;
+ m_colPaddingLeft = ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2;
+ // The padding size calculation for PADDING_SAME has been updated to
+ // be consistent with how TensorFlow extracts its paddings.
+ m_rowPaddingTop = numext::maxi<Index>(0, m_rowPaddingTop);
+ m_colPaddingLeft = numext::maxi<Index>(0, m_colPaddingLeft);
+ break;
+ default:
+ eigen_assert(false && "unexpected padding");
+ m_outputCols=0; // silence the uninitialised warning;
+ m_outputRows=0; //// silence the uninitialised warning;
+ }
+ }
+ eigen_assert(m_outputRows > 0);
+ eigen_assert(m_outputCols > 0);
+
+ // Dimensions for result of extraction.
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ // ColMajor
+ // 0: depth
+ // 1: patch_rows
+ // 2: patch_cols
+ // 3: number of patches
+ // 4 and beyond: anything else (such as batch).
+ m_dimensions[0] = input_dims[0];
+ m_dimensions[1] = op.patch_rows();
+ m_dimensions[2] = op.patch_cols();
+ m_dimensions[3] = m_outputRows * m_outputCols;
+ for (int i = 4; i < NumDims; ++i) {
+ m_dimensions[i] = input_dims[i-1];
+ }
+ } else {
+ // RowMajor
+ // NumDims-1: depth
+ // NumDims-2: patch_rows
+ // NumDims-3: patch_cols
+ // NumDims-4: number of patches
+ // NumDims-5 and beyond: anything else (such as batch).
+ m_dimensions[NumDims-1] = input_dims[NumInputDims-1];
+ m_dimensions[NumDims-2] = op.patch_rows();
+ m_dimensions[NumDims-3] = op.patch_cols();
+ m_dimensions[NumDims-4] = m_outputRows * m_outputCols;
+ for (int i = NumDims-5; i >= 0; --i) {
+ m_dimensions[i] = input_dims[i];
+ }
+ }
+
+ // Strides for moving the patch in various dimensions.
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ m_colStride = m_dimensions[1];
+ m_patchStride = m_colStride * m_dimensions[2] * m_dimensions[0];
+ m_otherStride = m_patchStride * m_dimensions[3];
+ } else {
+ m_colStride = m_dimensions[NumDims-2];
+ m_patchStride = m_colStride * m_dimensions[NumDims-3] * m_dimensions[NumDims-1];
+ m_otherStride = m_patchStride * m_dimensions[NumDims-4];
+ }
+
+ // Strides for navigating through the input tensor.
+ m_rowInputStride = m_inputDepth;
+ m_colInputStride = m_inputDepth * m_inputRows;
+ m_patchInputStride = m_inputDepth * m_inputRows * m_inputCols;
+
+ // Fast representations of different variables.
+ m_fastOtherStride = internal::TensorIntDivisor<Index>(m_otherStride);
+ m_fastPatchStride = internal::TensorIntDivisor<Index>(m_patchStride);
+ m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride);
+ m_fastInflateRowStride = internal::TensorIntDivisor<Index>(m_row_inflate_strides);
+ m_fastInflateColStride = internal::TensorIntDivisor<Index>(m_col_inflate_strides);
+ m_fastInputColsEff = internal::TensorIntDivisor<Index>(m_input_cols_eff);
+
+ // Number of patches in the width dimension.
+ m_fastOutputRows = internal::TensorIntDivisor<Index>(m_outputRows);
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[0]);
+ } else {
+ m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[NumDims-1]);
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
+ m_impl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+
+#ifdef EIGEN_USE_THREADS
+ template <typename EvalSubExprsCallback>
+ EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+ EvaluatorPointerType, EvalSubExprsCallback done) {
+ m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
+ }
+#endif // EIGEN_USE_THREADS
+
+ EIGEN_STRONG_INLINE void cleanup() {
+ m_impl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ // Patch index corresponding to the passed in index.
+ const Index patchIndex = index / m_fastPatchStride;
+ // Find the offset of the element wrt the location of the first element.
+ const Index patchOffset = (index - patchIndex * m_patchStride) / m_fastOutputDepth;
+
+ // Other ways to index this element.
+ const Index otherIndex = (NumDims == 4) ? 0 : index / m_fastOtherStride;
+ const Index patch2DIndex = (NumDims == 4) ? patchIndex : (index - otherIndex * m_otherStride) / m_fastPatchStride;
+
+ // Calculate col index in the input original tensor.
+ const Index colIndex = patch2DIndex / m_fastOutputRows;
+ const Index colOffset = patchOffset / m_fastColStride;
+ const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft;
+ const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInflateColStride) : 0);
+ if (inputCol < 0 || inputCol >= m_input_cols_eff ||
+ ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) {
+ return Scalar(m_paddingValue);
+ }
+
+ // Calculate row index in the original input tensor.
+ const Index rowIndex = patch2DIndex - colIndex * m_outputRows;
+ const Index rowOffset = patchOffset - colOffset * m_colStride;
+ const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop;
+ const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInflateRowStride) : 0);
+ if (inputRow < 0 || inputRow >= m_input_rows_eff ||
+ ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) {
+ return Scalar(m_paddingValue);
+ }
+
+ const int depth_index = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - 1;
+ const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index];
+
+ const Index inputIndex = depth + origInputRow * m_rowInputStride + origInputCol * m_colInputStride + otherIndex * m_patchInputStride;
+ return m_impl.coeff(inputIndex);
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ {
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+ if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1) {
+ return packetWithPossibleZero(index);
+ }
+
+ const Index indices[2] = {index, index + PacketSize - 1};
+ const Index patchIndex = indices[0] / m_fastPatchStride;
+ if (patchIndex != indices[1] / m_fastPatchStride) {
+ return packetWithPossibleZero(index);
+ }
+ const Index otherIndex = (NumDims == 4) ? 0 : indices[0] / m_fastOtherStride;
+ eigen_assert(otherIndex == indices[1] / m_fastOtherStride);
+
+ // Find the offset of the element wrt the location of the first element.
+ const Index patchOffsets[2] = {(indices[0] - patchIndex * m_patchStride) / m_fastOutputDepth,
+ (indices[1] - patchIndex * m_patchStride) / m_fastOutputDepth};
+
+ const Index patch2DIndex = (NumDims == 4) ? patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride;
+ eigen_assert(patch2DIndex == (indices[1] - otherIndex * m_otherStride) / m_fastPatchStride);
+
+ const Index colIndex = patch2DIndex / m_fastOutputRows;
+ const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride, patchOffsets[1] / m_fastColStride};
+
+ // Calculate col indices in the original input tensor.
+ const Index inputCols[2] = {colIndex * m_col_strides + colOffsets[0] -
+ m_colPaddingLeft, colIndex * m_col_strides + colOffsets[1] - m_colPaddingLeft};
+ if (inputCols[1] < 0 || inputCols[0] >= m_inputCols) {
+ return internal::pset1<PacketReturnType>(Scalar(m_paddingValue));
+ }
+
+ if (inputCols[0] == inputCols[1]) {
+ const Index rowIndex = patch2DIndex - colIndex * m_outputRows;
+ const Index rowOffsets[2] = {patchOffsets[0] - colOffsets[0]*m_colStride, patchOffsets[1] - colOffsets[1]*m_colStride};
+ eigen_assert(rowOffsets[0] <= rowOffsets[1]);
+ // Calculate col indices in the original input tensor.
+ const Index inputRows[2] = {rowIndex * m_row_strides + rowOffsets[0] -
+ m_rowPaddingTop, rowIndex * m_row_strides + rowOffsets[1] - m_rowPaddingTop};
+
+ if (inputRows[1] < 0 || inputRows[0] >= m_inputRows) {
+ return internal::pset1<PacketReturnType>(Scalar(m_paddingValue));
+ }
+
+ if (inputRows[0] >= 0 && inputRows[1] < m_inputRows) {
+ // no padding
+ const int depth_index = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - 1;
+ const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index];
+ const Index inputIndex = depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex * m_patchInputStride;
+ return m_impl.template packet<Unaligned>(inputIndex);
+ }
+ }
+
+ return packetWithPossibleZero(index);
+ }
+
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_impl.bind(cgh);
+ }
+#endif
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowPaddingTop() const { return m_rowPaddingTop; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colPaddingLeft() const { return m_colPaddingLeft; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputRows() const { return m_outputRows; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputCols() const { return m_outputCols; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userRowStride() const { return m_row_strides; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userColStride() const { return m_col_strides; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInRowStride() const { return m_in_row_strides; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInColStride() const { return m_in_col_strides; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowInflateStride() const { return m_row_inflate_strides; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colInflateStride() const { return m_col_inflate_strides; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ // We conservatively estimate the cost for the code path where the computed
+ // index is inside the original image and
+ // TensorEvaluator<ArgType, Device>::CoordAccess is false.
+ const double compute_cost = 3 * TensorOpCost::DivCost<Index>() +
+ 6 * TensorOpCost::MulCost<Index>() +
+ 8 * TensorOpCost::MulCost<Index>();
+ return m_impl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
+ }
+
+ protected:
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
+ {
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < PacketSize; ++i) {
+ values[i] = coeff(index+i);
+ }
+ PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+ return rslt;
+ }
+
+ Dimensions m_dimensions;
+
+ Index m_otherStride;
+ Index m_patchStride;
+ Index m_colStride;
+ Index m_row_strides;
+ Index m_col_strides;
+
+ Index m_in_row_strides;
+ Index m_in_col_strides;
+ Index m_row_inflate_strides;
+ Index m_col_inflate_strides;
+
+ Index m_input_rows_eff;
+ Index m_input_cols_eff;
+ Index m_patch_rows_eff;
+ Index m_patch_cols_eff;
+
+ internal::TensorIntDivisor<Index> m_fastOtherStride;
+ internal::TensorIntDivisor<Index> m_fastPatchStride;
+ internal::TensorIntDivisor<Index> m_fastColStride;
+ internal::TensorIntDivisor<Index> m_fastInflateRowStride;
+ internal::TensorIntDivisor<Index> m_fastInflateColStride;
+ internal::TensorIntDivisor<Index> m_fastInputColsEff;
+
+ Index m_rowInputStride;
+ Index m_colInputStride;
+ Index m_patchInputStride;
+
+ Index m_inputDepth;
+ Index m_inputRows;
+ Index m_inputCols;
+
+ Index m_outputRows;
+ Index m_outputCols;
+
+ Index m_rowPaddingTop;
+ Index m_colPaddingLeft;
+
+ internal::TensorIntDivisor<Index> m_fastOutputRows;
+ internal::TensorIntDivisor<Index> m_fastOutputDepth;
+
+ Scalar m_paddingValue;
+
+ const Device EIGEN_DEVICE_REF m_device;
+ TensorEvaluator<ArgType, Device> m_impl;
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorIndexList.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorIndexList.h
new file mode 100644
index 0000000..2d8c7b9
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorIndexList.h
@@ -0,0 +1,738 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H
+#define EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H
+
+
+#if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES
+
+#define EIGEN_HAS_INDEX_LIST
+
+namespace Eigen {
+
+/** \internal
+ *
+ * \class TensorIndexList
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Set of classes used to encode a set of Tensor dimensions/indices.
+ *
+ * The indices in the list can be known at compile time or at runtime. A mix
+ * of static and dynamic indices can also be provided if needed. The tensor
+ * code will attempt to take advantage of the indices that are known at
+ * compile time to optimize the code it generates.
+ *
+ * This functionality requires a c++11 compliant compiler. If your compiler
+ * is older you need to use arrays of indices instead.
+ *
+ * Several examples are provided in the cxx11_tensor_index_list.cpp file.
+ *
+ * \sa Tensor
+ */
+
+template <Index n>
+struct type2index {
+ static const Index value = n;
+ EIGEN_DEVICE_FUNC constexpr operator Index() const { return n; }
+ EIGEN_DEVICE_FUNC void set(Index val) {
+ eigen_assert(val == n);
+ }
+};
+
+// This can be used with IndexPairList to get compile-time constant pairs,
+// such as IndexPairList<type2indexpair<1,2>, type2indexpair<3,4>>().
+template <Index f, Index s>
+struct type2indexpair {
+ static const Index first = f;
+ static const Index second = s;
+
+ constexpr EIGEN_DEVICE_FUNC operator IndexPair<Index>() const {
+ return IndexPair<Index>(f, s);
+ }
+
+ EIGEN_DEVICE_FUNC void set(const IndexPair<Index>& val) {
+ eigen_assert(val.first == f);
+ eigen_assert(val.second == s);
+ }
+};
+
+
+template<Index n> struct NumTraits<type2index<n> >
+{
+ typedef Index Real;
+ enum {
+ IsComplex = 0,
+ RequireInitialization = false,
+ ReadCost = 1,
+ AddCost = 1,
+ MulCost = 1
+ };
+
+ EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real epsilon() { return 0; }
+ EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real dummy_precision() { return 0; }
+ EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real highest() { return n; }
+ EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real lowest() { return n; }
+};
+
+namespace internal {
+template <typename T>
+EIGEN_DEVICE_FUNC void update_value(T& val, Index new_val) {
+ val = internal::convert_index<T>(new_val);
+}
+template <Index n>
+EIGEN_DEVICE_FUNC void update_value(type2index<n>& val, Index new_val) {
+ val.set(new_val);
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC void update_value(T& val, IndexPair<Index> new_val) {
+ val = new_val;
+}
+template <Index f, Index s>
+EIGEN_DEVICE_FUNC void update_value(type2indexpair<f, s>& val, IndexPair<Index> new_val) {
+ val.set(new_val);
+}
+
+
+template <typename T>
+struct is_compile_time_constant {
+ static constexpr bool value = false;
+};
+
+template <Index idx>
+struct is_compile_time_constant<type2index<idx> > {
+ static constexpr bool value = true;
+};
+template <Index idx>
+struct is_compile_time_constant<const type2index<idx> > {
+ static constexpr bool value = true;
+};
+template <Index idx>
+struct is_compile_time_constant<type2index<idx>& > {
+ static constexpr bool value = true;
+};
+template <Index idx>
+struct is_compile_time_constant<const type2index<idx>& > {
+ static constexpr bool value = true;
+};
+
+template <Index f, Index s>
+struct is_compile_time_constant<type2indexpair<f, s> > {
+ static constexpr bool value = true;
+};
+template <Index f, Index s>
+struct is_compile_time_constant<const type2indexpair<f, s> > {
+ static constexpr bool value = true;
+};
+template <Index f, Index s>
+struct is_compile_time_constant<type2indexpair<f, s>& > {
+ static constexpr bool value = true;
+};
+template <Index f, Index s>
+struct is_compile_time_constant<const type2indexpair<f, s>& > {
+ static constexpr bool value = true;
+};
+
+
+template<typename... T>
+struct IndexTuple;
+
+template<typename T, typename... O>
+struct IndexTuple<T, O...> {
+ EIGEN_DEVICE_FUNC constexpr IndexTuple() : head(), others() { }
+ EIGEN_DEVICE_FUNC constexpr IndexTuple(const T& v, const O... o) : head(v), others(o...) { }
+
+ constexpr static int count = 1 + sizeof...(O);
+ T head;
+ IndexTuple<O...> others;
+ typedef T Head;
+ typedef IndexTuple<O...> Other;
+};
+
+template<typename T>
+ struct IndexTuple<T> {
+ EIGEN_DEVICE_FUNC constexpr IndexTuple() : head() { }
+ EIGEN_DEVICE_FUNC constexpr IndexTuple(const T& v) : head(v) { }
+
+ constexpr static int count = 1;
+ T head;
+ typedef T Head;
+};
+
+
+template<int N, typename... T>
+struct IndexTupleExtractor;
+
+template<int N, typename T, typename... O>
+struct IndexTupleExtractor<N, T, O...> {
+
+ typedef typename IndexTupleExtractor<N-1, O...>::ValType ValType;
+
+ EIGEN_DEVICE_FUNC static constexpr ValType& get_val(IndexTuple<T, O...>& val) {
+ return IndexTupleExtractor<N-1, O...>::get_val(val.others);
+ }
+
+ EIGEN_DEVICE_FUNC static constexpr const ValType& get_val(const IndexTuple<T, O...>& val) {
+ return IndexTupleExtractor<N-1, O...>::get_val(val.others);
+ }
+ template <typename V>
+ EIGEN_DEVICE_FUNC static void set_val(IndexTuple<T, O...>& val, V& new_val) {
+ IndexTupleExtractor<N-1, O...>::set_val(val.others, new_val);
+ }
+
+};
+
+template<typename T, typename... O>
+ struct IndexTupleExtractor<0, T, O...> {
+
+ typedef T ValType;
+
+ EIGEN_DEVICE_FUNC static constexpr ValType& get_val(IndexTuple<T, O...>& val) {
+ return val.head;
+ }
+ EIGEN_DEVICE_FUNC static constexpr const ValType& get_val(const IndexTuple<T, O...>& val) {
+ return val.head;
+ }
+ template <typename V>
+ EIGEN_DEVICE_FUNC static void set_val(IndexTuple<T, O...>& val, V& new_val) {
+ val.head = new_val;
+ }
+};
+
+
+
+template <int N, typename T, typename... O>
+EIGEN_DEVICE_FUNC constexpr typename IndexTupleExtractor<N, T, O...>::ValType& array_get(IndexTuple<T, O...>& tuple) {
+ return IndexTupleExtractor<N, T, O...>::get_val(tuple);
+}
+template <int N, typename T, typename... O>
+EIGEN_DEVICE_FUNC constexpr const typename IndexTupleExtractor<N, T, O...>::ValType& array_get(const IndexTuple<T, O...>& tuple) {
+ return IndexTupleExtractor<N, T, O...>::get_val(tuple);
+}
+template <typename T, typename... O>
+ struct array_size<IndexTuple<T, O...> > {
+ static const size_t value = IndexTuple<T, O...>::count;
+};
+template <typename T, typename... O>
+ struct array_size<const IndexTuple<T, O...> > {
+ static const size_t value = IndexTuple<T, O...>::count;
+};
+
+
+
+
+template <Index Idx, typename ValueT>
+struct tuple_coeff {
+ template <typename... T>
+ EIGEN_DEVICE_FUNC static constexpr ValueT get(const Index i, const IndexTuple<T...>& t) {
+ // return array_get<Idx>(t) * (i == Idx) + tuple_coeff<Idx-1>::get(i, t) * (i != Idx);
+ return (i == Idx ? array_get<Idx>(t) : tuple_coeff<Idx-1, ValueT>::get(i, t));
+ }
+ template <typename... T>
+ EIGEN_DEVICE_FUNC static void set(const Index i, IndexTuple<T...>& t, const ValueT& value) {
+ if (i == Idx) {
+ update_value(array_get<Idx>(t), value);
+ } else {
+ tuple_coeff<Idx-1, ValueT>::set(i, t, value);
+ }
+ }
+
+ template <typename... T>
+ EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const Index i, const IndexTuple<T...>& t) {
+ return ((i == Idx) & is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value) ||
+ tuple_coeff<Idx-1, ValueT>::value_known_statically(i, t);
+ }
+
+ template <typename... T>
+ EIGEN_DEVICE_FUNC static constexpr bool values_up_to_known_statically(const IndexTuple<T...>& t) {
+ return is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value &&
+ tuple_coeff<Idx-1, ValueT>::values_up_to_known_statically(t);
+ }
+
+ template <typename... T>
+ EIGEN_DEVICE_FUNC static constexpr bool values_up_to_statically_known_to_increase(const IndexTuple<T...>& t) {
+ return is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value &&
+ is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value &&
+ array_get<Idx>(t) > array_get<Idx-1>(t) &&
+ tuple_coeff<Idx-1, ValueT>::values_up_to_statically_known_to_increase(t);
+ }
+};
+
+template <typename ValueT>
+struct tuple_coeff<0, ValueT> {
+ template <typename... T>
+ EIGEN_DEVICE_FUNC static constexpr ValueT get(const Index /*i*/, const IndexTuple<T...>& t) {
+ // eigen_assert (i == 0); // gcc fails to compile assertions in constexpr
+ return array_get<0>(t)/* * (i == 0)*/;
+ }
+ template <typename... T>
+ EIGEN_DEVICE_FUNC static void set(const Index i, IndexTuple<T...>& t, const ValueT value) {
+ eigen_assert (i == 0);
+ update_value(array_get<0>(t), value);
+ }
+ template <typename... T>
+ EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const Index i, const IndexTuple<T...>&) {
+ return is_compile_time_constant<typename IndexTupleExtractor<0, T...>::ValType>::value && (i == 0);
+ }
+
+ template <typename... T>
+ EIGEN_DEVICE_FUNC static constexpr bool values_up_to_known_statically(const IndexTuple<T...>&) {
+ return is_compile_time_constant<typename IndexTupleExtractor<0, T...>::ValType>::value;
+ }
+
+ template <typename... T>
+ EIGEN_DEVICE_FUNC static constexpr bool values_up_to_statically_known_to_increase(const IndexTuple<T...>&) {
+ return true;
+ }
+};
+} // namespace internal
+
+
+
+template<typename FirstType, typename... OtherTypes>
+struct IndexList : internal::IndexTuple<FirstType, OtherTypes...> {
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Index operator[] (const Index i) const {
+ return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::get(i, *this);
+ }
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Index get(const Index i) const {
+ return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::get(i, *this);
+ }
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const Index i, const Index value) {
+ return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::set(i, *this, value);
+ }
+
+ EIGEN_DEVICE_FUNC constexpr IndexList(const internal::IndexTuple<FirstType, OtherTypes...>& other) : internal::IndexTuple<FirstType, OtherTypes...>(other) { }
+ EIGEN_DEVICE_FUNC constexpr IndexList(FirstType& first, OtherTypes... other) : internal::IndexTuple<FirstType, OtherTypes...>(first, other...) { }
+ EIGEN_DEVICE_FUNC constexpr IndexList() : internal::IndexTuple<FirstType, OtherTypes...>() { }
+
+ EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const Index i) const {
+ return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::value_known_statically(i, *this);
+ }
+ EIGEN_DEVICE_FUNC constexpr bool all_values_known_statically() const {
+ return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::values_up_to_known_statically(*this);
+ }
+
+ EIGEN_DEVICE_FUNC constexpr bool values_statically_known_to_increase() const {
+ return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::values_up_to_statically_known_to_increase(*this);
+ }
+};
+
+template <typename FirstType, typename... OtherTypes>
+std::ostream& operator<<(std::ostream& os,
+ const IndexList<FirstType, OtherTypes...>& dims) {
+ os << "[";
+ for (size_t i = 0; i < 1 + sizeof...(OtherTypes); ++i) {
+ if (i > 0) os << ", ";
+ os << dims[i];
+ }
+ os << "]";
+ return os;
+}
+
+template<typename FirstType, typename... OtherTypes>
+constexpr IndexList<FirstType, OtherTypes...> make_index_list(FirstType val1, OtherTypes... other_vals) {
+ return IndexList<FirstType, OtherTypes...>(val1, other_vals...);
+}
+
+
+template<typename FirstType, typename... OtherTypes>
+struct IndexPairList : internal::IndexTuple<FirstType, OtherTypes...> {
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr IndexPair<Index> operator[] (const Index i) const {
+ return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, IndexPair<Index>>::get(i, *this);
+ }
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const Index i, const IndexPair<Index> value) {
+ return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...>>::value-1, IndexPair<Index> >::set(i, *this, value);
+ }
+
+ EIGEN_DEVICE_FUNC constexpr IndexPairList(const internal::IndexTuple<FirstType, OtherTypes...>& other) : internal::IndexTuple<FirstType, OtherTypes...>(other) { }
+ EIGEN_DEVICE_FUNC constexpr IndexPairList() : internal::IndexTuple<FirstType, OtherTypes...>() { }
+
+ EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const Index i) const {
+ return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::value_known_statically(i, *this);
+ }
+};
+
+namespace internal {
+
+template<typename FirstType, typename... OtherTypes>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index array_prod(const IndexList<FirstType, OtherTypes...>& sizes) {
+ Index result = 1;
+ EIGEN_UNROLL_LOOP
+ for (size_t i = 0; i < array_size<IndexList<FirstType, OtherTypes...> >::value; ++i) {
+ result *= sizes[i];
+ }
+ return result;
+}
+
+template<typename FirstType, typename... OtherTypes> struct array_size<IndexList<FirstType, OtherTypes...> > {
+ static const size_t value = array_size<IndexTuple<FirstType, OtherTypes...> >::value;
+};
+template<typename FirstType, typename... OtherTypes> struct array_size<const IndexList<FirstType, OtherTypes...> > {
+ static const size_t value = array_size<IndexTuple<FirstType, OtherTypes...> >::value;
+};
+
+template<typename FirstType, typename... OtherTypes> struct array_size<IndexPairList<FirstType, OtherTypes...> > {
+ static const size_t value = std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value;
+};
+template<typename FirstType, typename... OtherTypes> struct array_size<const IndexPairList<FirstType, OtherTypes...> > {
+ static const size_t value = std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value;
+};
+
+template<Index N, typename FirstType, typename... OtherTypes> EIGEN_DEVICE_FUNC constexpr Index array_get(IndexList<FirstType, OtherTypes...>& a) {
+ return IndexTupleExtractor<N, FirstType, OtherTypes...>::get_val(a);
+}
+template<Index N, typename FirstType, typename... OtherTypes> EIGEN_DEVICE_FUNC constexpr Index array_get(const IndexList<FirstType, OtherTypes...>& a) {
+ return IndexTupleExtractor<N, FirstType, OtherTypes...>::get_val(a);
+}
+
+template <typename T>
+struct index_known_statically_impl {
+ EIGEN_DEVICE_FUNC static constexpr bool run(const Index) {
+ return false;
+ }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_known_statically_impl<IndexList<FirstType, OtherTypes...> > {
+ EIGEN_DEVICE_FUNC static constexpr bool run(const Index i) {
+ return IndexList<FirstType, OtherTypes...>().value_known_statically(i);
+ }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_known_statically_impl<const IndexList<FirstType, OtherTypes...> > {
+ EIGEN_DEVICE_FUNC static constexpr bool run(const Index i) {
+ return IndexList<FirstType, OtherTypes...>().value_known_statically(i);
+ }
+};
+
+
+template <typename T>
+struct all_indices_known_statically_impl {
+ static constexpr bool run() {
+ return false;
+ }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct all_indices_known_statically_impl<IndexList<FirstType, OtherTypes...> > {
+ EIGEN_DEVICE_FUNC static constexpr bool run() {
+ return IndexList<FirstType, OtherTypes...>().all_values_known_statically();
+ }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct all_indices_known_statically_impl<const IndexList<FirstType, OtherTypes...> > {
+ EIGEN_DEVICE_FUNC static constexpr bool run() {
+ return IndexList<FirstType, OtherTypes...>().all_values_known_statically();
+ }
+};
+
+
+template <typename T>
+struct indices_statically_known_to_increase_impl {
+ EIGEN_DEVICE_FUNC static constexpr bool run() {
+ return false;
+ }
+};
+
+template <typename FirstType, typename... OtherTypes>
+ struct indices_statically_known_to_increase_impl<IndexList<FirstType, OtherTypes...> > {
+ EIGEN_DEVICE_FUNC static constexpr bool run() {
+ return Eigen::IndexList<FirstType, OtherTypes...>().values_statically_known_to_increase();
+ }
+};
+
+template <typename FirstType, typename... OtherTypes>
+ struct indices_statically_known_to_increase_impl<const IndexList<FirstType, OtherTypes...> > {
+ EIGEN_DEVICE_FUNC static constexpr bool run() {
+ return Eigen::IndexList<FirstType, OtherTypes...>().values_statically_known_to_increase();
+ }
+};
+
+
+template <typename Tx>
+struct index_statically_eq_impl {
+ EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
+ return false;
+ }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_statically_eq_impl<IndexList<FirstType, OtherTypes...> > {
+ EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+ return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
+ (IndexList<FirstType, OtherTypes...>().get(i) == value);
+ }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_statically_eq_impl<const IndexList<FirstType, OtherTypes...> > {
+ EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+ return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
+ (IndexList<FirstType, OtherTypes...>().get(i) == value);
+ }
+};
+
+
+template <typename T>
+struct index_statically_ne_impl {
+ EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
+ return false;
+ }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_statically_ne_impl<IndexList<FirstType, OtherTypes...> > {
+ EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+ return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
+ (IndexList<FirstType, OtherTypes...>().get(i) != value);
+ }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_statically_ne_impl<const IndexList<FirstType, OtherTypes...> > {
+ EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+ return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
+ (IndexList<FirstType, OtherTypes...>().get(i) != value);
+ }
+};
+
+
+template <typename T>
+struct index_statically_gt_impl {
+ EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
+ return false;
+ }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_statically_gt_impl<IndexList<FirstType, OtherTypes...> > {
+ EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+ return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
+ (IndexList<FirstType, OtherTypes...>().get(i) > value);
+ }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_statically_gt_impl<const IndexList<FirstType, OtherTypes...> > {
+ EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+ return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
+ (IndexList<FirstType, OtherTypes...>().get(i) > value);
+ }
+};
+
+
+
+template <typename T>
+struct index_statically_lt_impl {
+ EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
+ return false;
+ }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_statically_lt_impl<IndexList<FirstType, OtherTypes...> > {
+ EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+ return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
+ (IndexList<FirstType, OtherTypes...>().get(i) < value);
+ }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_statically_lt_impl<const IndexList<FirstType, OtherTypes...> > {
+ EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+ return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
+ (IndexList<FirstType, OtherTypes...>().get(i) < value);
+ }
+};
+
+
+
+template <typename Tx>
+struct index_pair_first_statically_eq_impl {
+ EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
+ return false;
+ }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_pair_first_statically_eq_impl<IndexPairList<FirstType, OtherTypes...> > {
+ EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+ return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
+ (IndexPairList<FirstType, OtherTypes...>().operator[](i).first == value);
+ }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_pair_first_statically_eq_impl<const IndexPairList<FirstType, OtherTypes...> > {
+ EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+ return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
+ (IndexPairList<FirstType, OtherTypes...>().operator[](i).first == value);
+ }
+};
+
+
+
+template <typename Tx>
+struct index_pair_second_statically_eq_impl {
+ EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
+ return false;
+ }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_pair_second_statically_eq_impl<IndexPairList<FirstType, OtherTypes...> > {
+ EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+ return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
+ (IndexPairList<FirstType, OtherTypes...>().operator[](i).second == value);
+ }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_pair_second_statically_eq_impl<const IndexPairList<FirstType, OtherTypes...> > {
+ EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+ return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
+ (IndexPairList<FirstType, OtherTypes...>().operator[](i).second == value);
+ }
+};
+
+
+} // end namespace internal
+} // end namespace Eigen
+
+#else
+
+namespace Eigen {
+namespace internal {
+
+template <typename T>
+struct index_known_statically_impl {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const Index) {
+ return false;
+ }
+};
+
+template <typename T>
+struct all_indices_known_statically_impl {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() {
+ return false;
+ }
+};
+
+template <typename T>
+struct indices_statically_known_to_increase_impl {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() {
+ return false;
+ }
+};
+
+template <typename T>
+struct index_statically_eq_impl {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
+ return false;
+ }
+};
+
+template <typename T>
+struct index_statically_ne_impl {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
+ return false;
+ }
+};
+
+template <typename T>
+struct index_statically_gt_impl {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
+ return false;
+ }
+};
+
+template <typename T>
+struct index_statically_lt_impl {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
+ return false;
+ }
+};
+
+template <typename Tx>
+struct index_pair_first_statically_eq_impl {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
+ return false;
+ }
+};
+
+template <typename Tx>
+struct index_pair_second_statically_eq_impl {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
+ return false;
+ }
+};
+
+
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif
+
+
+namespace Eigen {
+namespace internal {
+template <typename T>
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_known_statically(Index i) {
+ return index_known_statically_impl<T>::run(i);
+}
+
+template <typename T>
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool all_indices_known_statically() {
+ return all_indices_known_statically_impl<T>::run();
+}
+
+template <typename T>
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool indices_statically_known_to_increase() {
+ return indices_statically_known_to_increase_impl<T>::run();
+}
+
+template <typename T>
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_eq(Index i, Index value) {
+ return index_statically_eq_impl<T>::run(i, value);
+}
+
+template <typename T>
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_ne(Index i, Index value) {
+ return index_statically_ne_impl<T>::run(i, value);
+}
+
+template <typename T>
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_gt(Index i, Index value) {
+ return index_statically_gt_impl<T>::run(i, value);
+}
+
+template <typename T>
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_lt(Index i, Index value) {
+ return index_statically_lt_impl<T>::run(i, value);
+}
+
+template <typename T>
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_first_statically_eq(Index i, Index value) {
+ return index_pair_first_statically_eq_impl<T>::run(i, value);
+}
+
+template <typename T>
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_second_statically_eq(Index i, Index value) {
+ return index_pair_second_statically_eq_impl<T>::run(i, value);
+}
+
+} // end namespace internal
+} // end namespace Eigen
+
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorInflation.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorInflation.h
new file mode 100644
index 0000000..c5cb61a
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorInflation.h
@@ -0,0 +1,247 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Ke Yang <yangke@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H
+#define EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H
+
+namespace Eigen {
+
+/** \class TensorInflation
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor inflation class.
+ *
+ *
+ */
+namespace internal {
+template<typename Strides, typename XprType>
+struct traits<TensorInflationOp<Strides, XprType> > : public traits<XprType>
+{
+ typedef typename XprType::Scalar Scalar;
+ typedef traits<XprType> XprTraits;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprTraits::Index Index;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = XprTraits::NumDimensions;
+ static const int Layout = XprTraits::Layout;
+ typedef typename XprTraits::PointerType PointerType;
+};
+
+template<typename Strides, typename XprType>
+struct eval<TensorInflationOp<Strides, XprType>, Eigen::Dense>
+{
+ typedef const TensorInflationOp<Strides, XprType>& type;
+};
+
+template<typename Strides, typename XprType>
+struct nested<TensorInflationOp<Strides, XprType>, 1, typename eval<TensorInflationOp<Strides, XprType> >::type>
+{
+ typedef TensorInflationOp<Strides, XprType> type;
+};
+
+} // end namespace internal
+
+template<typename Strides, typename XprType>
+class TensorInflationOp : public TensorBase<TensorInflationOp<Strides, XprType>, ReadOnlyAccessors>
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorInflationOp>::Scalar Scalar;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename Eigen::internal::nested<TensorInflationOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorInflationOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorInflationOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorInflationOp(const XprType& expr, const Strides& strides)
+ : m_xpr(expr), m_strides(strides) {}
+
+ EIGEN_DEVICE_FUNC
+ const Strides& strides() const { return m_strides; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ expression() const { return m_xpr; }
+
+ protected:
+ typename XprType::Nested m_xpr;
+ const Strides m_strides;
+};
+
+// Eval as rvalue
+template<typename Strides, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
+{
+ typedef TensorInflationOp<Strides, ArgType> XprType;
+ typedef typename XprType::Index Index;
+ static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+ typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+
+ enum {
+ IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
+ PreferBlockAccess = false,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
+ };
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_impl(op.expression(), device), m_strides(op.strides())
+ {
+ m_dimensions = m_impl.dimensions();
+ // Expand each dimension to the inflated dimension.
+ for (int i = 0; i < NumDims; ++i) {
+ m_dimensions[i] = (m_dimensions[i] - 1) * op.strides()[i] + 1;
+ }
+
+ // Remember the strides for fast division.
+ for (int i = 0; i < NumDims; ++i) {
+ m_fastStrides[i] = internal::TensorIntDivisor<Index>(m_strides[i]);
+ }
+
+ const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ m_outputStrides[0] = 1;
+ m_inputStrides[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
+ m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
+ }
+ } else { // RowMajor
+ m_outputStrides[NumDims-1] = 1;
+ m_inputStrides[NumDims-1] = 1;
+ for (int i = NumDims - 2; i >= 0; --i) {
+ m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
+ m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
+ }
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
+ m_impl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+ EIGEN_STRONG_INLINE void cleanup() {
+ m_impl.cleanup();
+ }
+
+ // Computes the input index given the output index. Returns true if the output
+ // index doesn't fall into a hole.
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool getInputIndex(Index index, Index* inputIndex) const
+ {
+ eigen_assert(index < dimensions().TotalSize());
+ *inputIndex = 0;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ EIGEN_UNROLL_LOOP
+ for (int i = NumDims - 1; i > 0; --i) {
+ const Index idx = index / m_outputStrides[i];
+ if (idx != idx / m_fastStrides[i] * m_strides[i]) {
+ return false;
+ }
+ *inputIndex += idx / m_strides[i] * m_inputStrides[i];
+ index -= idx * m_outputStrides[i];
+ }
+ if (index != index / m_fastStrides[0] * m_strides[0]) {
+ return false;
+ }
+ *inputIndex += index / m_strides[0];
+ return true;
+ } else {
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < NumDims - 1; ++i) {
+ const Index idx = index / m_outputStrides[i];
+ if (idx != idx / m_fastStrides[i] * m_strides[i]) {
+ return false;
+ }
+ *inputIndex += idx / m_strides[i] * m_inputStrides[i];
+ index -= idx * m_outputStrides[i];
+ }
+ if (index != index / m_fastStrides[NumDims-1] * m_strides[NumDims-1]) {
+ return false;
+ }
+ *inputIndex += index / m_strides[NumDims - 1];
+ }
+ return true;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ Index inputIndex = 0;
+ if (getInputIndex(index, &inputIndex)) {
+ return m_impl.coeff(inputIndex);
+ } else {
+ return Scalar(0);
+ }
+ }
+
+ // TODO(yangke): optimize this function so that we can detect and produce
+ // all-zero packets
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ {
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < PacketSize; ++i) {
+ values[i] = coeff(index+i);
+ }
+ PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+ return rslt;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ const double compute_cost = NumDims * (3 * TensorOpCost::DivCost<Index>() +
+ 3 * TensorOpCost::MulCost<Index>() +
+ 2 * TensorOpCost::AddCost<Index>());
+ const double input_size = m_impl.dimensions().TotalSize();
+ const double output_size = m_dimensions.TotalSize();
+ if (output_size == 0)
+ return TensorOpCost();
+ return m_impl.costPerCoeff(vectorized) +
+ TensorOpCost(sizeof(CoeffReturnType) * input_size / output_size, 0,
+ compute_cost, vectorized, PacketSize);
+ }
+
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_impl.bind(cgh);
+ }
+#endif
+
+ protected:
+ Dimensions m_dimensions;
+ array<Index, NumDims> m_outputStrides;
+ array<Index, NumDims> m_inputStrides;
+ TensorEvaluator<ArgType, Device> m_impl;
+ const Strides m_strides;
+ array<internal::TensorIntDivisor<Index>, NumDims> m_fastStrides;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorInitializer.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorInitializer.h
new file mode 100644
index 0000000..26a3818
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorInitializer.h
@@ -0,0 +1,82 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H
+#define EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+
+#include <initializer_list>
+
+namespace Eigen {
+
+/** \class TensorInitializer
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Helper template to initialize Tensors from std::initializer_lists.
+ */
+namespace internal {
+
+template <typename Derived, int N>
+struct Initializer {
+ typedef std::initializer_list<
+ typename Initializer<Derived, N - 1>::InitList> InitList;
+
+ static void run(TensorEvaluator<Derived, DefaultDevice>& tensor,
+ Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions>* indices,
+ const InitList& vals) {
+ int i = 0;
+ for (const auto& v : vals) {
+ (*indices)[traits<Derived>::NumDimensions - N] = i++;
+ Initializer<Derived, N - 1>::run(tensor, indices, v);
+ }
+ }
+};
+
+template <typename Derived>
+struct Initializer<Derived, 1> {
+ typedef std::initializer_list<typename traits<Derived>::Scalar> InitList;
+
+ static void run(TensorEvaluator<Derived, DefaultDevice>& tensor,
+ Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions>* indices,
+ const InitList& vals) {
+ int i = 0;
+ // There is likely a faster way to do that than iterating.
+ for (const auto& v : vals) {
+ (*indices)[traits<Derived>::NumDimensions - 1] = i++;
+ tensor.coeffRef(*indices) = v;
+ }
+ }
+};
+
+template <typename Derived>
+struct Initializer<Derived, 0> {
+ typedef typename traits<Derived>::Scalar InitList;
+
+ static void run(TensorEvaluator<Derived, DefaultDevice>& tensor,
+ Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions>*,
+ const InitList& v) {
+ tensor.coeffRef(0) = v;
+ }
+};
+
+
+template <typename Derived, int N>
+void initialize_tensor(TensorEvaluator<Derived, DefaultDevice>& tensor,
+ const typename Initializer<Derived, traits<Derived>::NumDimensions>::InitList& vals) {
+ Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions> indices;
+ Initializer<Derived, traits<Derived>::NumDimensions>::run(tensor, &indices, vals);
+}
+
+} // namespace internal
+} // namespace Eigen
+
+#endif // EIGEN_HAS_VARIADIC_TEMPLATES
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorIntDiv.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorIntDiv.h
new file mode 100644
index 0000000..6d5cce4
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorIntDiv.h
@@ -0,0 +1,263 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H
+#define EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H
+
+
+namespace Eigen {
+
+/** \internal
+ *
+ * \class TensorIntDiv
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Fast integer division by a constant.
+ *
+ * See the paper from Granlund and Montgomery for explanation.
+ * (at https://doi.org/10.1145/773473.178249)
+ *
+ * \sa Tensor
+ */
+
+namespace internal {
+
+namespace {
+
+ // Note: result is undefined if val == 0
+ template <typename T>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+ typename internal::enable_if<sizeof(T)==4,int>::type count_leading_zeros(const T val)
+ {
+#ifdef EIGEN_GPU_COMPILE_PHASE
+ return __clz(val);
+#elif defined(SYCL_DEVICE_ONLY)
+ return cl::sycl::clz(val);
+#elif EIGEN_COMP_MSVC
+ unsigned long index;
+ _BitScanReverse(&index, val);
+ return 31 - index;
+#else
+ EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ return __builtin_clz(static_cast<uint32_t>(val));
+#endif
+ }
+
+ template <typename T>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+ typename internal::enable_if<sizeof(T)==8,int>::type count_leading_zeros(const T val)
+ {
+#ifdef EIGEN_GPU_COMPILE_PHASE
+ return __clzll(val);
+#elif defined(SYCL_DEVICE_ONLY)
+ return static_cast<int>(cl::sycl::clz(val));
+#elif EIGEN_COMP_MSVC && EIGEN_ARCH_x86_64
+ unsigned long index;
+ _BitScanReverse64(&index, val);
+ return 63 - index;
+#elif EIGEN_COMP_MSVC
+ // MSVC's _BitScanReverse64 is not available for 32bits builds.
+ unsigned int lo = (unsigned int)(val&0xffffffff);
+ unsigned int hi = (unsigned int)((val>>32)&0xffffffff);
+ int n;
+ if(hi==0)
+ n = 32 + count_leading_zeros<unsigned int>(lo);
+ else
+ n = count_leading_zeros<unsigned int>(hi);
+ return n;
+#else
+ EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ return __builtin_clzll(static_cast<uint64_t>(val));
+#endif
+ }
+
+ template <typename T>
+ struct UnsignedTraits {
+ typedef typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type type;
+ };
+
+ template <typename T>
+ struct DividerTraits {
+ typedef typename UnsignedTraits<T>::type type;
+ static const int N = sizeof(T) * 8;
+ };
+
+ template <typename T>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) {
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+ return __umulhi(a, b);
+#elif defined(SYCL_DEVICE_ONLY)
+ return cl::sycl::mul_hi(a, static_cast<uint32_t>(b));
+#else
+ return (static_cast<uint64_t>(a) * b) >> 32;
+#endif
+ }
+
+ template <typename T>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+ return __umul64hi(a, b);
+#elif defined(SYCL_DEVICE_ONLY)
+ return cl::sycl::mul_hi(a, static_cast<uint64_t>(b));
+#elif EIGEN_HAS_BUILTIN_INT128
+ __uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b);
+ return static_cast<uint64_t>(v >> 64);
+#else
+ return (TensorUInt128<static_val<0>, uint64_t>(a) * TensorUInt128<static_val<0>, uint64_t>(b)).upper();
+#endif
+ }
+
+ template <int N, typename T>
+ struct DividerHelper {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t computeMultiplier(const int log_div, const T divider) {
+ EIGEN_STATIC_ASSERT(N == 32, YOU_MADE_A_PROGRAMMING_MISTAKE);
+ return static_cast<uint32_t>((static_cast<uint64_t>(1) << (N+log_div)) / divider - (static_cast<uint64_t>(1) << N) + 1);
+ }
+ };
+
+ template <typename T>
+ struct DividerHelper<64, T> {
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) {
+#if EIGEN_HAS_BUILTIN_INT128 && !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY)
+ return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1);
+#else
+ const uint64_t shift = 1ULL << log_div;
+ TensorUInt128<uint64_t, uint64_t> result = TensorUInt128<uint64_t, static_val<0> >(shift, 0) / TensorUInt128<static_val<0>, uint64_t>(divider)
+ - TensorUInt128<static_val<1>, static_val<0> >(1, 0)
+ + TensorUInt128<static_val<0>, static_val<1> >(1);
+ return static_cast<uint64_t>(result);
+#endif
+ }
+ };
+}
+
+
+template <typename T, bool div_gt_one = false>
+struct TensorIntDivisor {
+ public:
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() {
+ multiplier = 0;
+ shift1 = 0;
+ shift2 = 0;
+ }
+
+ // Must have 0 < divider < 2^31. This is relaxed to
+ // 0 < divider < 2^63 when using 64-bit indices on platforms that support
+ // the __uint128_t type.
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor(const T divider) {
+ const int N = DividerTraits<T>::N;
+ eigen_assert(static_cast<typename UnsignedTraits<T>::type>(divider) < NumTraits<UnsignedType>::highest()/2);
+ eigen_assert(divider > 0);
+
+ // fast ln2
+ const int leading_zeros = count_leading_zeros(static_cast<UnsignedType>(divider));
+ int log_div = N - leading_zeros;
+ // if divider is a power of two then log_div is 1 more than it should be.
+ if ((static_cast<typename UnsignedTraits<T>::type>(1) << (log_div-1)) == static_cast<typename UnsignedTraits<T>::type>(divider))
+ log_div--;
+
+ multiplier = DividerHelper<N, T>::computeMultiplier(log_div, divider);
+ shift1 = log_div > 1 ? 1 : log_div;
+ shift2 = log_div > 1 ? log_div-1 : 0;
+ }
+
+ // Must have 0 <= numerator. On platforms that don't support the __uint128_t
+ // type numerator should also be less than 2^32-1.
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T divide(const T numerator) const {
+ eigen_assert(static_cast<typename UnsignedTraits<T>::type>(numerator) < NumTraits<UnsignedType>::highest()/2);
+ //eigen_assert(numerator >= 0); // this is implicitly asserted by the line above
+
+ UnsignedType t1 = muluh(multiplier, numerator);
+ UnsignedType t = (static_cast<UnsignedType>(numerator) - t1) >> shift1;
+ return (t1 + t) >> shift2;
+ }
+
+ private:
+ typedef typename DividerTraits<T>::type UnsignedType;
+ UnsignedType multiplier;
+ int32_t shift1;
+ int32_t shift2;
+};
+
+
+// Optimized version for signed 32 bit integers.
+// Derived from Hacker's Delight.
+// Only works for divisors strictly greater than one
+template <>
+class TensorIntDivisor<int32_t, true> {
+ public:
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() {
+ magic = 0;
+ shift = 0;
+ }
+ // Must have 2 <= divider
+ EIGEN_DEVICE_FUNC TensorIntDivisor(int32_t divider) {
+ eigen_assert(divider >= 2);
+ calcMagic(divider);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const {
+#ifdef EIGEN_GPU_COMPILE_PHASE
+ return (__umulhi(magic, n) >> shift);
+#elif defined(SYCL_DEVICE_ONLY)
+ return (cl::sycl::mul_hi(magic, static_cast<uint32_t>(n)) >> shift);
+#else
+ uint64_t v = static_cast<uint64_t>(magic) * static_cast<uint64_t>(n);
+ return (static_cast<uint32_t>(v >> 32) >> shift);
+#endif
+ }
+
+private:
+ // Compute the magic numbers. See Hacker's Delight section 10 for an in
+ // depth explanation.
+ EIGEN_DEVICE_FUNC void calcMagic(int32_t d) {
+ const unsigned two31 = 0x80000000; // 2**31.
+ unsigned ad = d;
+ unsigned t = two31 + (ad >> 31);
+ unsigned anc = t - 1 - t%ad; // Absolute value of nc.
+ int p = 31; // Init. p.
+ unsigned q1 = two31/anc; // Init. q1 = 2**p/|nc|.
+ unsigned r1 = two31 - q1*anc; // Init. r1 = rem(2**p, |nc|).
+ unsigned q2 = two31/ad; // Init. q2 = 2**p/|d|.
+ unsigned r2 = two31 - q2*ad; // Init. r2 = rem(2**p, |d|).
+ unsigned delta = 0;
+ do {
+ p = p + 1;
+ q1 = 2*q1; // Update q1 = 2**p/|nc|.
+ r1 = 2*r1; // Update r1 = rem(2**p, |nc|).
+ if (r1 >= anc) { // (Must be an unsigned
+ q1 = q1 + 1; // comparison here).
+ r1 = r1 - anc;}
+ q2 = 2*q2; // Update q2 = 2**p/|d|.
+ r2 = 2*r2; // Update r2 = rem(2**p, |d|).
+ if (r2 >= ad) { // (Must be an unsigned
+ q2 = q2 + 1; // comparison here).
+ r2 = r2 - ad;}
+ delta = ad - r2;
+ } while (q1 < delta || (q1 == delta && r1 == 0));
+
+ magic = (unsigned)(q2 + 1);
+ shift = p - 32;
+ }
+
+ uint32_t magic;
+ int32_t shift;
+};
+
+
+template <typename T, bool div_gt_one>
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator / (const T& numerator, const TensorIntDivisor<T, div_gt_one>& divisor) {
+ return divisor.divide(numerator);
+}
+
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorLayoutSwap.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorLayoutSwap.h
new file mode 100644
index 0000000..80106c1
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorLayoutSwap.h
@@ -0,0 +1,216 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H
+#define EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H
+
+namespace Eigen {
+
+/** \class TensorLayoutSwap
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Swap the layout from col-major to row-major, or row-major
+ * to col-major, and invert the order of the dimensions.
+ *
+ * Beware: the dimensions are reversed by this operation. If you want to
+ * preserve the ordering of the dimensions, you need to combine this
+ * operation with a shuffle.
+ *
+ * \example:
+ * Tensor<float, 2, ColMajor> input(2, 4);
+ * Tensor<float, 2, RowMajor> output = input.swap_layout();
+ * eigen_assert(output.dimension(0) == 4);
+ * eigen_assert(output.dimension(1) == 2);
+ *
+ * array<int, 2> shuffle(1, 0);
+ * output = input.swap_layout().shuffle(shuffle);
+ * eigen_assert(output.dimension(0) == 2);
+ * eigen_assert(output.dimension(1) == 4);
+ *
+ */
+namespace internal {
+template<typename XprType>
+struct traits<TensorLayoutSwapOp<XprType> > : public traits<XprType>
+{
+ typedef typename XprType::Scalar Scalar;
+ typedef traits<XprType> XprTraits;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprTraits::Index Index;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = traits<XprType>::NumDimensions;
+ static const int Layout = (traits<XprType>::Layout == ColMajor) ? RowMajor : ColMajor;
+ typedef typename XprTraits::PointerType PointerType;
+};
+
+template<typename XprType>
+struct eval<TensorLayoutSwapOp<XprType>, Eigen::Dense>
+{
+ typedef const TensorLayoutSwapOp<XprType>& type;
+};
+
+template<typename XprType>
+struct nested<TensorLayoutSwapOp<XprType>, 1, typename eval<TensorLayoutSwapOp<XprType> >::type>
+{
+ typedef TensorLayoutSwapOp<XprType> type;
+};
+
+} // end namespace internal
+
+
+
+template<typename XprType>
+class TensorLayoutSwapOp : public TensorBase<TensorLayoutSwapOp<XprType>, WriteAccessors>
+{
+ public:
+ typedef TensorBase<TensorLayoutSwapOp<XprType>, WriteAccessors> Base;
+ typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Scalar Scalar;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+ typedef typename Eigen::internal::nested<TensorLayoutSwapOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorLayoutSwapOp(const XprType& expr)
+ : m_xpr(expr) {}
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ expression() const { return m_xpr; }
+
+ EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorLayoutSwapOp)
+ protected:
+ typename XprType::Nested m_xpr;
+};
+
+
+// Eval as rvalue
+template<typename ArgType, typename Device>
+struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
+{
+ typedef TensorLayoutSwapOp<ArgType> XprType;
+ typedef typename XprType::Index Index;
+ static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+ typedef DSizes<Index, NumDims> Dimensions;
+
+ enum {
+ IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
+ PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+ Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
+ CoordAccess = false, // to be implemented
+ RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
+ };
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_impl(op.expression(), device)
+ {
+ for(int i = 0; i < NumDims; ++i) {
+ m_dimensions[i] = m_impl.dimensions()[NumDims-1-i];
+ }
+ }
+
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_impl.bind(cgh);
+ }
+#endif
+
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+ return m_impl.evalSubExprsIfNeeded(data);
+ }
+ EIGEN_STRONG_INLINE void cleanup() {
+ m_impl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ return m_impl.coeff(index);
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ {
+ return m_impl.template packet<LoadMode>(index);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ return m_impl.costPerCoeff(vectorized);
+ }
+
+ EIGEN_DEVICE_FUNC typename Storage::Type data() const {
+ return constCast(m_impl.data());
+ }
+
+ const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+
+ protected:
+ TensorEvaluator<ArgType, Device> m_impl;
+ Dimensions m_dimensions;
+};
+
+
+// Eval as lvalue
+template<typename ArgType, typename Device>
+ struct TensorEvaluator<TensorLayoutSwapOp<ArgType>, Device>
+ : public TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
+{
+ typedef TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device> Base;
+ typedef TensorLayoutSwapOp<ArgType> XprType;
+
+ enum {
+ IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
+ PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+ Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
+ CoordAccess = false // to be implemented
+ };
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : Base(op, device)
+ { }
+
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
+ {
+ return this->m_impl.coeffRef(index);
+ }
+ template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ void writePacket(Index index, const PacketReturnType& x)
+ {
+ this->m_impl.template writePacket<StoreMode>(index, x);
+ }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorMacros.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorMacros.h
new file mode 100644
index 0000000..73ff3d2
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorMacros.h
@@ -0,0 +1,98 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_META_MACROS_H
+#define EIGEN_CXX11_TENSOR_TENSOR_META_MACROS_H
+
+
+/** use this macro in sfinae selection in templated functions
+ *
+ * template<typename T,
+ * typename std::enable_if< isBanana<T>::value , int >::type = 0
+ * >
+ * void foo(){}
+ *
+ * becomes =>
+ *
+ * template<typename TopoType,
+ * SFINAE_ENABLE_IF( isBanana<T>::value )
+ * >
+ * void foo(){}
+ */
+
+// SFINAE requires variadic templates
+#if !defined(EIGEN_GPUCC)
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+ // SFINAE doesn't work for gcc <= 4.7
+ #ifdef EIGEN_COMP_GNUC
+ #if EIGEN_GNUC_AT_LEAST(4,8)
+ #define EIGEN_HAS_SFINAE
+ #endif
+ #else
+ #define EIGEN_HAS_SFINAE
+ #endif
+#endif
+#endif
+
+#define EIGEN_SFINAE_ENABLE_IF( __condition__ ) \
+ typename internal::enable_if< ( __condition__ ) , int >::type = 0
+
+// Define a macro to use a reference on the host but a value on the device
+#if defined(SYCL_DEVICE_ONLY)
+ #define EIGEN_DEVICE_REF
+#else
+ #define EIGEN_DEVICE_REF &
+#endif
+
+// Define a macro for catching SYCL exceptions if exceptions are enabled
+#define EIGEN_SYCL_TRY_CATCH(X) \
+ do { \
+ EIGEN_TRY {X;} \
+ EIGEN_CATCH(const cl::sycl::exception& e) { \
+ EIGEN_THROW_X(std::runtime_error("SYCL exception at " + \
+ std::string(__FILE__) + ":" + \
+ std::to_string(__LINE__) + "\n" + \
+ e.what())); \
+ } \
+ } while (false)
+
+// Define a macro if local memory flags are unset or one of them is set
+// Setting both flags is the same as unsetting them
+#if (!defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)) || \
+ (defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM))
+ #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON 1
+ #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF 1
+#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
+ #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON 1
+#elif !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
+ #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF 1
+#endif
+
+#if EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653)
+ #define EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
+ using Base::operator =; \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) { Base::operator=(other); return *this; } \
+ template <typename OtherDerived> \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const OtherDerived& other) { Base::operator=(other); return *this; }
+#else
+ #define EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
+ EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived)
+#endif
+
+/** \internal
+ * \brief Macro to manually inherit assignment operators.
+ * This is necessary, because the implicitly defined assignment operator gets deleted when a custom operator= is defined.
+ * This also inherits template<OtherDerived> operator=(const OtherDerived&) assignments.
+ * With C++11 or later this also default-implements the copy-constructor
+ */
+#define EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(Derived) \
+ EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
+ EIGEN_DEFAULT_COPY_CONSTRUCTOR(Derived)
+
+#endif
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorMap.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorMap.h
new file mode 100644
index 0000000..6834c97
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorMap.h
@@ -0,0 +1,327 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_MAP_H
+#define EIGEN_CXX11_TENSOR_TENSOR_MAP_H
+
+namespace Eigen {
+
+// FIXME use proper doxygen documentation (e.g. \tparam MakePointer_)
+
+/** \class TensorMap
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief A tensor expression mapping an existing array of data.
+ *
+ */
+/// `template <class> class MakePointer_` is added to convert the host pointer to the device pointer.
+/// It is added due to the fact that for our device compiler `T*` is not allowed.
+/// If we wanted to use the same Evaluator functions we have to convert that type to our pointer `T`.
+/// This is done through our `MakePointer_` class. By default the Type in the `MakePointer_<T>` is `T*` .
+/// Therefore, by adding the default value, we managed to convert the type and it does not break any
+/// existing code as its default value is `T*`.
+template<typename PlainObjectType, int Options_, template <class> class MakePointer_> class TensorMap : public TensorBase<TensorMap<PlainObjectType, Options_, MakePointer_> >
+{
+ public:
+ typedef TensorMap<PlainObjectType, Options_, MakePointer_> Self;
+ typedef TensorBase<TensorMap<PlainObjectType, Options_, MakePointer_> > Base;
+ #ifdef EIGEN_USE_SYCL
+ typedef typename Eigen::internal::remove_reference<typename Eigen::internal::nested<Self>::type>::type Nested;
+ #else
+ typedef typename Eigen::internal::nested<Self>::type Nested;
+ #endif
+ typedef typename internal::traits<PlainObjectType>::StorageKind StorageKind;
+ typedef typename internal::traits<PlainObjectType>::Index Index;
+ typedef typename internal::traits<PlainObjectType>::Scalar Scalar;
+ typedef typename NumTraits<Scalar>::Real RealScalar;
+ typedef typename PlainObjectType::Base::CoeffReturnType CoeffReturnType;
+
+ typedef typename MakePointer_<Scalar>::Type PointerType;
+ typedef typename MakePointer_<Scalar>::ConstType PointerConstType;
+
+ // WARN: PointerType still can be a pointer to const (const Scalar*), for
+ // example in TensorMap<Tensor<const Scalar, ...>> expression. This type of
+ // expression should be illegal, but adding this restriction is not possible
+ // in practice (see https://bitbucket.org/eigen/eigen/pull-requests/488).
+ typedef typename internal::conditional<
+ bool(internal::is_lvalue<PlainObjectType>::value),
+ PointerType, // use simple pointer in lvalue expressions
+ PointerConstType // use const pointer in rvalue expressions
+ >::type StoragePointerType;
+
+ // If TensorMap was constructed over rvalue expression (e.g. const Tensor),
+ // we should return a reference to const from operator() (and others), even
+ // if TensorMap itself is not const.
+ typedef typename internal::conditional<
+ bool(internal::is_lvalue<PlainObjectType>::value),
+ Scalar&,
+ const Scalar&
+ >::type StorageRefType;
+
+ static const int Options = Options_;
+
+ static const Index NumIndices = PlainObjectType::NumIndices;
+ typedef typename PlainObjectType::Dimensions Dimensions;
+
+ enum {
+ IsAligned = ((int(Options_)&Aligned)==Aligned),
+ Layout = PlainObjectType::Layout,
+ CoordAccess = true,
+ RawAccess = true
+ };
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr) : m_data(dataPtr), m_dimensions() {
+ // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
+ EIGEN_STATIC_ASSERT((0 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE)
+ }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+ template<typename... IndexTypes> EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) {
+ // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
+ EIGEN_STATIC_ASSERT((sizeof...(otherDimensions) + 1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE)
+ }
+#else
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) {
+ // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
+ EIGEN_STATIC_ASSERT((1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE)
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2) : m_data(dataPtr), m_dimensions(dim1, dim2) {
+ EIGEN_STATIC_ASSERT(2 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3) {
+ EIGEN_STATIC_ASSERT(3 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4) {
+ EIGEN_STATIC_ASSERT(4 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4, dim5) {
+ EIGEN_STATIC_ASSERT(5 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ }
+#endif
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, const array<Index, NumIndices>& dimensions)
+ : m_data(dataPtr), m_dimensions(dimensions)
+ { }
+
+ template <typename Dimensions>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, const Dimensions& dimensions)
+ : m_data(dataPtr), m_dimensions(dimensions)
+ { }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PlainObjectType& tensor)
+ : m_data(tensor.data()), m_dimensions(tensor.dimensions())
+ { }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Index rank() const { return m_dimensions.rank(); }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_dimensions[n]; }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE StoragePointerType data() { return m_data; }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE StoragePointerType data() const { return m_data; }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE StorageRefType operator()(const array<Index, NumIndices>& indices) const
+ {
+ // eigen_assert(checkIndexRange(indices));
+ if (PlainObjectType::Options&RowMajor) {
+ const Index index = m_dimensions.IndexOfRowMajor(indices);
+ return m_data[index];
+ } else {
+ const Index index = m_dimensions.IndexOfColMajor(indices);
+ return m_data[index];
+ }
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE StorageRefType operator()() const
+ {
+ EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ return m_data[0];
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE StorageRefType operator()(Index index) const
+ {
+ eigen_internal_assert(index >= 0 && index < size());
+ return m_data[index];
+ }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+ template<typename... IndexTypes> EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE StorageRefType operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
+ {
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(internal::all((Eigen::NumTraits<Index>::highest() >= otherIndices)...));
+ if (PlainObjectType::Options&RowMajor) {
+ const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
+ return m_data[index];
+ } else {
+ const Index index = m_dimensions.IndexOfColMajor(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
+ return m_data[index];
+ }
+ }
+#else
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1) const
+ {
+ if (PlainObjectType::Options&RowMajor) {
+ const Index index = i1 + i0 * m_dimensions[1];
+ return m_data[index];
+ } else {
+ const Index index = i0 + i1 * m_dimensions[0];
+ return m_data[index];
+ }
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2) const
+ {
+ if (PlainObjectType::Options&RowMajor) {
+ const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0);
+ return m_data[index];
+ } else {
+ const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2);
+ return m_data[index];
+ }
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3) const
+ {
+ if (PlainObjectType::Options&RowMajor) {
+ const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0));
+ return m_data[index];
+ } else {
+ const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * i3));
+ return m_data[index];
+ }
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const
+ {
+ if (PlainObjectType::Options&RowMajor) {
+ const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)));
+ return m_data[index];
+ } else {
+ const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * (i3 + m_dimensions[3] * i4)));
+ return m_data[index];
+ }
+ }
+#endif
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE StorageRefType operator()(const array<Index, NumIndices>& indices)
+ {
+ // eigen_assert(checkIndexRange(indices));
+ if (PlainObjectType::Options&RowMajor) {
+ const Index index = m_dimensions.IndexOfRowMajor(indices);
+ return m_data[index];
+ } else {
+ const Index index = m_dimensions.IndexOfColMajor(indices);
+ return m_data[index];
+ }
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE StorageRefType operator()()
+ {
+ EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ return m_data[0];
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE StorageRefType operator()(Index index)
+ {
+ eigen_internal_assert(index >= 0 && index < size());
+ return m_data[index];
+ }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+ template<typename... IndexTypes> EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE StorageRefType operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices)
+ {
+ static_assert(sizeof...(otherIndices) + 2 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
+ eigen_assert(internal::all((Eigen::NumTraits<Index>::highest() >= otherIndices)...));
+ const std::size_t NumDims = sizeof...(otherIndices) + 2;
+ if (PlainObjectType::Options&RowMajor) {
+ const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumDims>{{firstIndex, secondIndex, otherIndices...}});
+ return m_data[index];
+ } else {
+ const Index index = m_dimensions.IndexOfColMajor(array<Index, NumDims>{{firstIndex, secondIndex, otherIndices...}});
+ return m_data[index];
+ }
+ }
+#else
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1)
+ {
+ if (PlainObjectType::Options&RowMajor) {
+ const Index index = i1 + i0 * m_dimensions[1];
+ return m_data[index];
+ } else {
+ const Index index = i0 + i1 * m_dimensions[0];
+ return m_data[index];
+ }
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2)
+ {
+ if (PlainObjectType::Options&RowMajor) {
+ const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0);
+ return m_data[index];
+ } else {
+ const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2);
+ return m_data[index];
+ }
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3)
+ {
+ if (PlainObjectType::Options&RowMajor) {
+ const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0));
+ return m_data[index];
+ } else {
+ const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * i3));
+ return m_data[index];
+ }
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3, Index i4)
+ {
+ if (PlainObjectType::Options&RowMajor) {
+ const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)));
+ return m_data[index];
+ } else {
+ const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * (i3 + m_dimensions[3] * i4)));
+ return m_data[index];
+ }
+ }
+#endif
+
+ EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorMap)
+
+ private:
+ StoragePointerType m_data;
+ Dimensions m_dimensions;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_MAP_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorMeta.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorMeta.h
new file mode 100644
index 0000000..a6181d3
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorMeta.h
@@ -0,0 +1,311 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_META_H
+#define EIGEN_CXX11_TENSOR_TENSOR_META_H
+
+namespace Eigen {
+
+template<bool cond> struct Cond {};
+
+template<typename T1, typename T2> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+const T1& choose(Cond<true>, const T1& first, const T2&) {
+ return first;
+}
+
+template<typename T1, typename T2> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+const T2& choose(Cond<false>, const T1&, const T2& second) {
+ return second;
+}
+
+
+template <typename T, typename X, typename Y>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T divup(const X x, const Y y) {
+ return static_cast<T>((x + y - 1) / y);
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T divup(const T x, const T y) {
+ return static_cast<T>((x + y - 1) / y);
+}
+
+template <size_t n> struct max_n_1 {
+ static const size_t size = n;
+};
+template <> struct max_n_1<0> {
+ static const size_t size = 1;
+};
+
+
+// Default packet types
+template <typename Scalar, typename Device>
+struct PacketType : internal::packet_traits<Scalar> {
+ typedef typename internal::packet_traits<Scalar>::type type;
+};
+
+// For CUDA packet types when using a GpuDevice
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_HAS_GPU_FP16)
+
+typedef ulonglong2 Packet4h2;
+template<>
+struct PacketType<half, GpuDevice> {
+ typedef Packet4h2 type;
+ static const int size = 8;
+ enum {
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasNegate = 1,
+ HasAbs = 1,
+ HasArg = 0,
+ HasAbs2 = 0,
+ HasMin = 1,
+ HasMax = 1,
+ HasConj = 0,
+ HasSetLinear = 0,
+ HasBlend = 0,
+
+ HasDiv = 1,
+ HasSqrt = 1,
+ HasRsqrt = 1,
+ HasExp = 1,
+ HasExpm1 = 0,
+ HasLog = 1,
+ HasLog1p = 0,
+ HasLog10 = 0,
+ HasPow = 1,
+ };
+};
+#endif
+
+#if defined(EIGEN_USE_SYCL)
+
+namespace TensorSycl {
+namespace internal {
+
+template <typename Index, Index A, Index B> struct PlusOp {
+ static constexpr Index Value = A + B;
+};
+
+template <typename Index, Index A, Index B> struct DivOp {
+ static constexpr Index Value = A / B;
+};
+
+template <typename Index, Index start, Index end, Index step,
+ template <class Indx, Indx...> class StepOp>
+struct static_for {
+ template <typename UnaryOperator>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void loop(UnaryOperator op) {
+ op(start);
+ static_for<Index, StepOp<Index, start, step>::Value, end, step,
+ StepOp>::loop(op);
+ }
+};
+template <typename Index, Index end, Index step,
+ template <class Indx, Indx...> class StepOp>
+struct static_for<Index, end, end, step, StepOp> {
+ template <typename UnaryOperator>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void loop(UnaryOperator) {}
+};
+
+template <typename OutScalar, typename Device, bool Vectorizable>
+struct Vectorise {
+ static const int PacketSize = 1;
+ typedef OutScalar PacketReturnType;
+};
+
+template <typename OutScalar, typename Device>
+struct Vectorise<OutScalar, Device, true> {
+ static const int PacketSize = Eigen::PacketType<OutScalar, Device>::size;
+ typedef typename Eigen::PacketType<OutScalar, Device>::type PacketReturnType;
+};
+
+static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index roundUp(Index x, Index y) {
+ return ((((x) + (y)-1) / (y)) * (y));
+}
+
+} // namespace internal
+} // namespace TensorSycl
+
+template <>
+ struct PacketType<half, SyclDevice> {
+ typedef half type;
+ static const int size = 1;
+ enum {
+ HasAdd = 0,
+ HasSub = 0,
+ HasMul = 0,
+ HasNegate = 0,
+ HasAbs = 0,
+ HasArg = 0,
+ HasAbs2 = 0,
+ HasMin = 0,
+ HasMax = 0,
+ HasConj = 0,
+ HasSetLinear = 0,
+ HasBlend = 0
+ };
+};
+template <typename Scalar>
+struct PacketType<Scalar, SyclDevice> : internal::default_packet_traits {
+ typedef Scalar type;
+ typedef Scalar half;
+ enum {
+ Vectorizable = 0,
+ size = 1,
+ AlignedOnScalar = 0,
+ HasHalfPacket = 0
+ };
+ enum {
+ HasAdd = 0,
+ HasSub = 0,
+ HasMul = 0,
+ HasNegate = 0,
+ HasAbs = 0,
+ HasAbs2 = 0,
+ HasMin = 0,
+ HasMax = 0,
+ HasConj = 0,
+ HasSetLinear = 0
+ };
+
+};
+
+template <typename Scalar>
+struct PacketType<Scalar, const SyclDevice> : PacketType<Scalar, SyclDevice>{};
+
+#ifndef EIGEN_DONT_VECTORIZE_SYCL
+#define PACKET_TYPE(CVQual, Type, val, lengths, DEV)\
+template<> struct PacketType<CVQual Type, DEV> : internal::sycl_packet_traits<val, lengths> \
+{\
+ typedef typename internal::packet_traits<Type>::type type;\
+ typedef typename internal::packet_traits<Type>::half half;\
+};
+
+
+PACKET_TYPE(const, float, 1, 4, SyclDevice)
+PACKET_TYPE(, float, 1, 4, SyclDevice)
+PACKET_TYPE(const, float, 1, 4, const SyclDevice)
+PACKET_TYPE(, float, 1, 4, const SyclDevice)
+
+PACKET_TYPE(const, double, 0, 2, SyclDevice)
+PACKET_TYPE(, double, 0, 2, SyclDevice)
+PACKET_TYPE(const, double, 0, 2, const SyclDevice)
+PACKET_TYPE(, double, 0, 2, const SyclDevice)
+#undef PACKET_TYPE
+
+template<> struct PacketType<half, const SyclDevice>: PacketType<half, SyclDevice>{};
+template<> struct PacketType<const half, const SyclDevice>: PacketType<half, SyclDevice>{};
+#endif
+#endif
+
+// Tuple mimics std::pair but works on e.g. nvcc.
+template <typename U, typename V> struct Tuple {
+ public:
+ U first;
+ V second;
+
+ typedef U first_type;
+ typedef V second_type;
+
+ EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Tuple() : first(), second() {}
+
+ EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Tuple(const U& f, const V& s) : first(f), second(s) {}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ void swap(Tuple& rhs) {
+ using numext::swap;
+ swap(first, rhs.first);
+ swap(second, rhs.second);
+ }
+};
+
+template <typename U, typename V>
+EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+bool operator==(const Tuple<U, V>& x, const Tuple<U, V>& y) {
+ return (x.first == y.first && x.second == y.second);
+}
+
+template <typename U, typename V>
+EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+bool operator!=(const Tuple<U, V>& x, const Tuple<U, V>& y) {
+ return !(x == y);
+}
+
+
+// Can't use std::pairs on cuda devices
+template <typename Idx> struct IndexPair {
+ EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) {}
+ EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Idx f, Idx s) : first(f), second(s) {}
+
+ EIGEN_DEVICE_FUNC void set(IndexPair<Idx> val) {
+ first = val.first;
+ second = val.second;
+ }
+
+ Idx first;
+ Idx second;
+};
+
+
+#ifdef EIGEN_HAS_SFINAE
+namespace internal {
+
+ template<typename IndexType, typename Index, Index... Is>
+ EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ array<Index, sizeof...(Is)> customIndices2Array(IndexType& idx, numeric_list<Index, Is...>) {
+ return { idx[Is]... };
+ }
+ template<typename IndexType, typename Index>
+ EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ array<Index, 0> customIndices2Array(IndexType&, numeric_list<Index>) {
+ return array<Index, 0>();
+ }
+
+ /** Make an array (for index/dimensions) out of a custom index */
+ template<typename Index, std::size_t NumIndices, typename IndexType>
+ EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ array<Index, NumIndices> customIndices2Array(IndexType& idx) {
+ return customIndices2Array(idx, typename gen_numeric_list<Index, NumIndices>::type{});
+ }
+
+
+ template <typename B, typename D>
+ struct is_base_of
+ {
+
+ typedef char (&yes)[1];
+ typedef char (&no)[2];
+
+ template <typename BB, typename DD>
+ struct Host
+ {
+ operator BB*() const;
+ operator DD*();
+ };
+
+ template<typename T>
+ static yes check(D*, T);
+ static no check(B*, int);
+
+ static const bool value = sizeof(check(Host<B,D>(), int())) == sizeof(yes);
+ };
+
+}
+#endif
+
+
+
+} // namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_META_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorMorphing.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorMorphing.h
new file mode 100644
index 0000000..b3f00f7
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorMorphing.h
@@ -0,0 +1,1102 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H
+#define EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H
+
+namespace Eigen {
+
+/** \class TensorReshaping
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor reshaping class.
+ *
+ *
+ */
+namespace internal {
+template<typename NewDimensions, typename XprType>
+struct traits<TensorReshapingOp<NewDimensions, XprType> > : public traits<XprType>
+{
+ typedef typename XprType::Scalar Scalar;
+ typedef traits<XprType> XprTraits;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprTraits::Index Index;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = array_size<NewDimensions>::value;
+ static const int Layout = XprTraits::Layout;
+ typedef typename XprTraits::PointerType PointerType;
+};
+
+template<typename NewDimensions, typename XprType>
+struct eval<TensorReshapingOp<NewDimensions, XprType>, Eigen::Dense>
+{
+ typedef const TensorReshapingOp<NewDimensions, XprType>EIGEN_DEVICE_REF type;
+};
+
+template<typename NewDimensions, typename XprType>
+struct nested<TensorReshapingOp<NewDimensions, XprType>, 1, typename eval<TensorReshapingOp<NewDimensions, XprType> >::type>
+{
+ typedef TensorReshapingOp<NewDimensions, XprType> type;
+};
+
+} // end namespace internal
+
+
+
+template<typename NewDimensions, typename XprType>
+class TensorReshapingOp : public TensorBase<TensorReshapingOp<NewDimensions, XprType>, WriteAccessors>
+{
+ public:
+ typedef TensorBase<TensorReshapingOp<NewDimensions, XprType>, WriteAccessors> Base;
+ typedef typename Eigen::internal::traits<TensorReshapingOp>::Scalar Scalar;
+ typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+ typedef typename Eigen::internal::nested<TensorReshapingOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorReshapingOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorReshapingOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReshapingOp(const XprType& expr, const NewDimensions& dims)
+ : m_xpr(expr), m_dims(dims) {}
+
+ EIGEN_DEVICE_FUNC
+ const NewDimensions& dimensions() const { return m_dims; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ expression() const { return m_xpr; }
+
+ EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorReshapingOp)
+
+ protected:
+ typename XprType::Nested m_xpr;
+ const NewDimensions m_dims;
+};
+
+
+// Eval as rvalue
+template<typename NewDimensions, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
+{
+ typedef TensorReshapingOp<NewDimensions, ArgType> XprType;
+ typedef NewDimensions Dimensions;
+
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+ typedef StorageMemory<typename internal::remove_const<CoeffReturnType>::type, Device> ConstCastStorage;
+
+ static const int NumOutputDims = internal::array_size<Dimensions>::value;
+ static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+
+ enum ReshapingKind {
+ // We do not use layout information to determine reshaping kind.
+ // Depending on the layout `N` can be inner or outer dimension.
+ OneByN = 0, // expr.reshape(1, N)
+ NByOne = 1, // expr.reshape(N, 1)
+ Runtime = 2 // Reshape dimensions are dynamic (specified at runtime).
+ };
+
+ // clang-format off
+ static const ReshapingKind kind =
+#if defined(EIGEN_HAS_INDEX_LIST)
+ (NumOutputDims == 2 && internal::index_statically_eq<NewDimensions>(/*index=*/0, /*value=*/1)) ? OneByN
+ : (NumOutputDims == 2 && internal::index_statically_eq<NewDimensions>(/*index=*/1, /*value=*/1)) ? NByOne
+ : Runtime;
+#else
+ Runtime;
+#endif
+ // clang-format on
+
+ enum {
+ IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ // For trivial reshapes with raw access to underlying data we will provide
+ // zero overhead block access.
+ // TODO(ezhulenev): Consider adding block access without raw access?
+ BlockAccess = TensorEvaluator<ArgType, Device>::RawAccess &&
+ NumInputDims > 0 && NumOutputDims > 0,
+ PreferBlockAccess = false,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
+ };
+
+ typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockDescriptor<NumOutputDims, Index> TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+ typedef
+ typename internal::TensorMaterializedBlock<ScalarNoConst, NumOutputDims,
+ Layout, Index>
+ TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_impl(op.expression(), device), m_dimensions(op.dimensions())
+ {
+ // The total size of the reshaped tensor must be equal to the total size
+ // of the input tensor.
+ eigen_assert(internal::array_prod(m_impl.dimensions()) == internal::array_prod(op.dimensions()));
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+#ifdef EIGEN_USE_THREADS
+ template <typename EvalSubExprsCallback>
+ EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+ EvaluatorPointerType data, EvalSubExprsCallback done) {
+ m_impl.evalSubExprsIfNeededAsync(data, std::move(done));
+ }
+#endif
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+ return m_impl.evalSubExprsIfNeeded(data);
+ }
+ EIGEN_STRONG_INLINE void cleanup() {
+ m_impl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ return m_impl.coeff(index);
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ {
+ return m_impl.template packet<LoadMode>(index);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ return m_impl.costPerCoeff(vectorized);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ internal::TensorBlockResourceRequirements getResourceRequirements() const {
+ return internal::TensorBlockResourceRequirements::any();
+ }
+
+ // required in block(OutputTensorBlock* output_block) const
+ // For C++03 compatibility this must be defined outside the method
+ struct BlockIteratorState {
+ Index stride;
+ Index span;
+ Index size;
+ Index count;
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+ block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+ bool /*root_of_expr_ast*/ = false) const {
+ eigen_assert(m_impl.data() != NULL);
+ eigen_assert((kind == Runtime) ||
+ (kind == OneByN && desc.dimensions()[0] == 1) ||
+ (kind == NByOne && desc.dimensions()[1] == 1));
+
+ if (kind == OneByN || kind == NByOne) {
+ // We can guarantee at compile time that block is just a contiguous slice
+ // of the underlying expression memory buffer.
+ return TensorBlock(internal::TensorBlockKind::kView,
+ m_impl.data() + desc.offset(), desc.dimensions());
+ } else {
+ // This will do additional runtime checks, and in the end it might be also
+ // a view, or it might be a block materialized in the temporary buffer.
+ return TensorBlock::materialize(m_impl.data(), m_dimensions, desc,
+ scratch);
+ }
+ }
+
+ EIGEN_DEVICE_FUNC typename Storage::Type data() const {
+ return constCast(m_impl.data());
+ }
+
+ EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+
+ #ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_impl.bind(cgh);
+ }
+ #endif
+ protected:
+ TensorEvaluator<ArgType, Device> m_impl;
+ NewDimensions m_dimensions;
+};
+
+
+// Eval as lvalue
+template<typename NewDimensions, typename ArgType, typename Device>
+ struct TensorEvaluator<TensorReshapingOp<NewDimensions, ArgType>, Device>
+ : public TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
+
+{
+ typedef TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> Base;
+ typedef TensorReshapingOp<NewDimensions, ArgType> XprType;
+ typedef NewDimensions Dimensions;
+
+ enum {
+ IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = TensorEvaluator<ArgType, Device>::RawAccess,
+ PreferBlockAccess = false,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
+ };
+
+ EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : Base(op, device)
+ { }
+
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockDescriptor<TensorEvaluator::NumOutputDims, Index>
+ TensorBlockDesc;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
+ {
+ return this->m_impl.coeffRef(index);
+ }
+
+ template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ void writePacket(Index index, const PacketReturnType& x)
+ {
+ this->m_impl.template writePacket<StoreMode>(index, x);
+ }
+
+ template <typename TensorBlock>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
+ const TensorBlockDesc& desc, const TensorBlock& block) {
+ assert(this->m_impl.data() != NULL);
+
+ typedef typename TensorBlock::XprType TensorBlockExpr;
+ typedef internal::TensorBlockAssignment<
+ Scalar, TensorEvaluator::NumOutputDims, TensorBlockExpr, Index>
+ TensorBlockAssign;
+
+ TensorBlockAssign::Run(
+ TensorBlockAssign::target(desc.dimensions(),
+ internal::strides<Layout>(this->dimensions()),
+ this->m_impl.data(), desc.offset()),
+ block.expr());
+ }
+};
+
+
+/** \class TensorSlicing
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor slicing class.
+ *
+ *
+ */
+namespace internal {
+template<typename StartIndices, typename Sizes, typename XprType>
+struct traits<TensorSlicingOp<StartIndices, Sizes, XprType> > : public traits<XprType>
+{
+ typedef typename XprType::Scalar Scalar;
+ typedef traits<XprType> XprTraits;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprTraits::Index Index;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = array_size<StartIndices>::value;
+ static const int Layout = XprTraits::Layout;
+ typedef typename XprTraits::PointerType PointerType;
+};
+
+template<typename StartIndices, typename Sizes, typename XprType>
+struct eval<TensorSlicingOp<StartIndices, Sizes, XprType>, Eigen::Dense>
+{
+ typedef const TensorSlicingOp<StartIndices, Sizes, XprType>EIGEN_DEVICE_REF type;
+};
+
+template<typename StartIndices, typename Sizes, typename XprType>
+struct nested<TensorSlicingOp<StartIndices, Sizes, XprType>, 1, typename eval<TensorSlicingOp<StartIndices, Sizes, XprType> >::type>
+{
+ typedef TensorSlicingOp<StartIndices, Sizes, XprType> type;
+};
+
+} // end namespace internal
+
+
+
+template<typename StartIndices, typename Sizes, typename XprType>
+class TensorSlicingOp : public TensorBase<TensorSlicingOp<StartIndices, Sizes, XprType> >
+{
+ public:
+ typedef TensorBase<TensorSlicingOp<StartIndices, Sizes, XprType> > Base;
+ typedef typename Eigen::internal::traits<TensorSlicingOp>::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename Eigen::internal::nested<TensorSlicingOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorSlicingOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorSlicingOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSlicingOp(const XprType& expr, const StartIndices& indices, const Sizes& sizes)
+ : m_xpr(expr), m_indices(indices), m_sizes(sizes) {}
+
+ EIGEN_DEVICE_FUNC
+ const StartIndices& startIndices() const { return m_indices; }
+ EIGEN_DEVICE_FUNC
+ const Sizes& sizes() const { return m_sizes; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ expression() const { return m_xpr; }
+
+ EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorSlicingOp)
+
+ protected:
+ typename XprType::Nested m_xpr;
+ const StartIndices m_indices;
+ const Sizes m_sizes;
+};
+
+
+// Fixme: figure out the exact threshold
+namespace {
+template <typename Index, typename Device, bool BlockAccess> struct MemcpyTriggerForSlicing {
+ EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const Device& device) : threshold_(2 * device.numThreads()) { }
+ EIGEN_DEVICE_FUNC bool operator ()(Index total, Index contiguous) const {
+ const bool prefer_block_evaluation = BlockAccess && total > 32*1024;
+ return !prefer_block_evaluation && contiguous > threshold_;
+ }
+
+ private:
+ Index threshold_;
+};
+
+// It is very expensive to start the memcpy kernel on GPU: we therefore only
+// use it for large copies.
+#ifdef EIGEN_USE_GPU
+template <typename Index, bool BlockAccess> struct MemcpyTriggerForSlicing<Index, GpuDevice, BlockAccess> {
+ EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const GpuDevice&) { }
+ EIGEN_DEVICE_FUNC bool operator ()(Index, Index contiguous) const { return contiguous > 4*1024*1024; }
+};
+#endif
+
+// It is very expensive to start the memcpy kernel on GPU: we therefore only
+// use it for large copies.
+#ifdef EIGEN_USE_SYCL
+template <typename Index, bool BlockAccess> struct MemcpyTriggerForSlicing<Index, Eigen::SyclDevice, BlockAccess> {
+ EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const SyclDevice&) { }
+ EIGEN_DEVICE_FUNC bool operator ()(Index, Index contiguous) const { return contiguous > 4*1024*1024; }
+};
+#endif
+
+}
+
+// Eval as rvalue
+template<typename StartIndices, typename Sizes, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
+{
+ typedef TensorSlicingOp<StartIndices, Sizes, ArgType> XprType;
+ static const int NumDims = internal::array_size<Sizes>::value;
+
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ typedef Sizes Dimensions;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef StorageMemory<typename internal::remove_const<CoeffReturnType>::type, Device> ConstCastStorage;
+ typedef typename Storage::Type EvaluatorPointerType;
+
+ enum {
+ // Alignment can't be guaranteed at compile time since it depends on the
+ // slice offsets and sizes.
+ IsAligned = false,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess &&
+ // FIXME: Temporary workaround for bug in slicing of bool tensors.
+ !internal::is_same<typename internal::remove_const<Scalar>::type, bool>::value,
+ PreferBlockAccess = true,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false,
+ RawAccess = false
+ };
+
+ typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+ // Tensor slicing does not change the block type.
+ typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
+ TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices())
+ {
+ m_is_identity = true;
+ for (int i = 0; i < internal::array_size<Dimensions>::value; ++i) {
+ eigen_assert(m_impl.dimensions()[i] >=
+ op.sizes()[i] + op.startIndices()[i]);
+ if (m_impl.dimensions()[i] != op.sizes()[i] ||
+ op.startIndices()[i] != 0) {
+ m_is_identity = false;
+ }
+ }
+
+ // No strides for scalars.
+ if (NumDims == 0) return;
+
+ const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+ const Sizes& output_dims = op.sizes();
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ m_inputStrides[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
+ }
+
+ // Don't initialize m_fastOutputStrides[0] since it won't ever be accessed.
+ m_outputStrides[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1];
+ m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1);
+ }
+ } else {
+ m_inputStrides[NumDims-1] = 1;
+ for (int i = NumDims - 2; i >= 0; --i) {
+ m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
+ }
+
+ // Don't initialize m_fastOutputStrides[NumDims-1] since it won't ever be accessed.
+ m_outputStrides[NumDims-1] = 1;
+ for (int i = NumDims - 2; i >= 0; --i) {
+ m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1];
+ m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1);
+ }
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+ m_impl.evalSubExprsIfNeeded(NULL);
+ if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization
+ && data && m_impl.data()) {
+ Index contiguous_values = 1;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = 0; i < NumDims; ++i) {
+ contiguous_values *= dimensions()[i];
+ if (dimensions()[i] != m_impl.dimensions()[i]) {
+ break;
+ }
+ }
+ } else {
+ for (int i = NumDims-1; i >= 0; --i) {
+ contiguous_values *= dimensions()[i];
+ if (dimensions()[i] != m_impl.dimensions()[i]) {
+ break;
+ }
+ }
+ }
+ // Use memcpy if it's going to be faster than using the regular evaluation.
+ const MemcpyTriggerForSlicing<Index, Device, BlockAccess> trigger(m_device);
+ if (trigger(internal::array_prod(dimensions()), contiguous_values)) {
+ EvaluatorPointerType src = (EvaluatorPointerType)m_impl.data();
+ for (Index i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) {
+ Index offset = srcCoeff(i);
+ m_device.memcpy((void*)(m_device.get(data + i)), m_device.get(src+offset), contiguous_values * sizeof(Scalar));
+ }
+ return false;
+ }
+ }
+ return true;
+ }
+
+#ifdef EIGEN_USE_THREADS
+ template <typename EvalSubExprsCallback>
+ EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+ EvaluatorPointerType /*data*/, EvalSubExprsCallback done) {
+ m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
+ }
+#endif // EIGEN_USE_THREADS
+
+ EIGEN_STRONG_INLINE void cleanup() {
+ m_impl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ if (m_is_identity) {
+ return m_impl.coeff(index);
+ } else {
+ return m_impl.coeff(srcCoeff(index));
+ }
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ {
+ const int packetSize = PacketType<CoeffReturnType, Device>::size;
+ EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+packetSize-1 < internal::array_prod(dimensions()));
+
+ if (m_is_identity) {
+ return m_impl.template packet<LoadMode>(index);
+ }
+
+ Index inputIndices[] = {0, 0};
+ Index indices[] = {index, index + packetSize - 1};
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ EIGEN_UNROLL_LOOP
+ for (int i = NumDims - 1; i > 0; --i) {
+ const Index idx0 = indices[0] / m_fastOutputStrides[i];
+ const Index idx1 = indices[1] / m_fastOutputStrides[i];
+ inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i];
+ inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i];
+ indices[0] -= idx0 * m_outputStrides[i];
+ indices[1] -= idx1 * m_outputStrides[i];
+ }
+ inputIndices[0] += (indices[0] + m_offsets[0]);
+ inputIndices[1] += (indices[1] + m_offsets[0]);
+ } else {
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < NumDims - 1; ++i) {
+ const Index idx0 = indices[0] / m_fastOutputStrides[i];
+ const Index idx1 = indices[1] / m_fastOutputStrides[i];
+ inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i];
+ inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i];
+ indices[0] -= idx0 * m_outputStrides[i];
+ indices[1] -= idx1 * m_outputStrides[i];
+ }
+ inputIndices[0] += (indices[0] + m_offsets[NumDims-1]);
+ inputIndices[1] += (indices[1] + m_offsets[NumDims-1]);
+ }
+ if (inputIndices[1] - inputIndices[0] == packetSize - 1) {
+ PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]);
+ return rslt;
+ }
+ else {
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
+ values[0] = m_impl.coeff(inputIndices[0]);
+ values[packetSize-1] = m_impl.coeff(inputIndices[1]);
+ EIGEN_UNROLL_LOOP
+ for (int i = 1; i < packetSize-1; ++i) {
+ values[i] = coeff(index+i);
+ }
+ PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+ return rslt;
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, m_is_identity ? 1 : NumDims);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ internal::TensorBlockResourceRequirements getResourceRequirements() const {
+ const size_t target_size = m_device.lastLevelCacheSize();
+ return internal::TensorBlockResourceRequirements::merge(
+ internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size),
+ m_impl.getResourceRequirements());
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+ block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+ bool /*root_of_expr_ast*/ = false) const {
+ TensorBlockDesc arg_desc = desc.WithOffset(srcCoeff(desc.offset()));
+ TensorBlock block = m_impl.block(arg_desc, scratch);
+ if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer();
+ return block;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const {
+ typename Storage::Type result = constCast(m_impl.data());
+ if (result) {
+ Index offset = 0;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = 0; i < NumDims; ++i) {
+ if (m_dimensions[i] != m_impl.dimensions()[i]) {
+ offset += m_offsets[i] * m_inputStrides[i];
+ for (int j = i+1; j < NumDims; ++j) {
+ if (m_dimensions[j] > 1) {
+ return NULL;
+ }
+ offset += m_offsets[j] * m_inputStrides[j];
+ }
+ break;
+ }
+ }
+ } else {
+ for (int i = NumDims - 1; i >= 0; --i) {
+ if (m_dimensions[i] != m_impl.dimensions()[i]) {
+ offset += m_offsets[i] * m_inputStrides[i];
+ for (int j = i-1; j >= 0; --j) {
+ if (m_dimensions[j] > 1) {
+ return NULL;
+ }
+ offset += m_offsets[j] * m_inputStrides[j];
+ }
+ break;
+ }
+ }
+ }
+ return result + offset;
+ }
+ return NULL;
+ }
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_impl.bind(cgh);
+ }
+#endif
+
+ protected:
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
+ {
+ Index inputIndex = 0;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ EIGEN_UNROLL_LOOP
+ for (int i = NumDims - 1; i > 0; --i) {
+ const Index idx = index / m_fastOutputStrides[i];
+ inputIndex += (idx + m_offsets[i]) * m_inputStrides[i];
+ index -= idx * m_outputStrides[i];
+ }
+ inputIndex += (index + m_offsets[0]);
+ } else {
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < NumDims - 1; ++i) {
+ const Index idx = index / m_fastOutputStrides[i];
+ inputIndex += (idx + m_offsets[i]) * m_inputStrides[i];
+ index -= idx * m_outputStrides[i];
+ }
+ inputIndex += (index + m_offsets[NumDims-1]);
+ }
+ return inputIndex;
+ }
+
+ array<Index, NumDims> m_outputStrides;
+ array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
+ array<Index, NumDims> m_inputStrides;
+ TensorEvaluator<ArgType, Device> m_impl;
+ const Device EIGEN_DEVICE_REF m_device;
+ Dimensions m_dimensions;
+ bool m_is_identity;
+ const StartIndices m_offsets;
+};
+
+
+// Eval as lvalue
+template<typename StartIndices, typename Sizes, typename ArgType, typename Device>
+struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
+ : public TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
+{
+ typedef TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device> Base;
+ typedef TensorSlicingOp<StartIndices, Sizes, ArgType> XprType;
+ static const int NumDims = internal::array_size<Sizes>::value;
+
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ typedef Sizes Dimensions;
+
+ enum {
+ IsAligned = false,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
+ PreferBlockAccess = true,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false,
+ RawAccess = (NumDims == 1) & TensorEvaluator<ArgType, Device>::RawAccess
+ };
+
+ typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : Base(op, device)
+ { }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
+ {
+ if (this->m_is_identity) {
+ return this->m_impl.coeffRef(index);
+ } else {
+ return this->m_impl.coeffRef(this->srcCoeff(index));
+ }
+ }
+
+ template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ void writePacket(Index index, const PacketReturnType& x)
+ {
+ if (this->m_is_identity) {
+ this->m_impl.template writePacket<StoreMode>(index, x);
+ return;
+ }
+
+ const int packetSize = PacketType<CoeffReturnType, Device>::size;
+ Index inputIndices[] = {0, 0};
+ Index indices[] = {index, index + packetSize - 1};
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ EIGEN_UNROLL_LOOP
+ for (int i = NumDims - 1; i > 0; --i) {
+ const Index idx0 = indices[0] / this->m_fastOutputStrides[i];
+ const Index idx1 = indices[1] / this->m_fastOutputStrides[i];
+ inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i];
+ inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i];
+ indices[0] -= idx0 * this->m_outputStrides[i];
+ indices[1] -= idx1 * this->m_outputStrides[i];
+ }
+ inputIndices[0] += (indices[0] + this->m_offsets[0]);
+ inputIndices[1] += (indices[1] + this->m_offsets[0]);
+ } else {
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < NumDims - 1; ++i) {
+ const Index idx0 = indices[0] / this->m_fastOutputStrides[i];
+ const Index idx1 = indices[1] / this->m_fastOutputStrides[i];
+ inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i];
+ inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i];
+ indices[0] -= idx0 * this->m_outputStrides[i];
+ indices[1] -= idx1 * this->m_outputStrides[i];
+ }
+ inputIndices[0] += (indices[0] + this->m_offsets[NumDims-1]);
+ inputIndices[1] += (indices[1] + this->m_offsets[NumDims-1]);
+ }
+ if (inputIndices[1] - inputIndices[0] == packetSize - 1) {
+ this->m_impl.template writePacket<StoreMode>(inputIndices[0], x);
+ }
+ else {
+ EIGEN_ALIGN_MAX CoeffReturnType values[packetSize];
+ internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
+ this->m_impl.coeffRef(inputIndices[0]) = values[0];
+ this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1];
+ EIGEN_UNROLL_LOOP
+ for (int i = 1; i < packetSize-1; ++i) {
+ this->coeffRef(index+i) = values[i];
+ }
+ }
+ }
+
+ template<typename TensorBlock>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
+ const TensorBlockDesc& desc, const TensorBlock& block) {
+ TensorBlockDesc arg_desc = desc.WithOffset(this->srcCoeff(desc.offset()));
+ this->m_impl.writeBlock(arg_desc, block);
+ }
+};
+
+namespace internal {
+template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
+struct traits<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> > : public traits<XprType>
+{
+ typedef typename XprType::Scalar Scalar;
+ typedef traits<XprType> XprTraits;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprTraits::Index Index;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = array_size<StartIndices>::value;
+ static const int Layout = XprTraits::Layout;
+ typedef typename XprTraits::PointerType PointerType;
+};
+
+template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
+struct eval<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Eigen::Dense>
+{
+ typedef const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>EIGEN_DEVICE_REF type;
+};
+
+template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
+struct nested<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, 1, typename eval<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >::type>
+{
+ typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> type;
+};
+
+} // end namespace internal
+
+
+template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
+class TensorStridingSlicingOp : public TensorBase<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >
+{
+ public:
+ typedef TensorBase<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> > Base;
+ typedef typename internal::traits<TensorStridingSlicingOp>::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename internal::nested<TensorStridingSlicingOp>::type Nested;
+ typedef typename internal::traits<TensorStridingSlicingOp>::StorageKind StorageKind;
+ typedef typename internal::traits<TensorStridingSlicingOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingSlicingOp(
+ const XprType& expr, const StartIndices& startIndices,
+ const StopIndices& stopIndices, const Strides& strides)
+ : m_xpr(expr), m_startIndices(startIndices), m_stopIndices(stopIndices),
+ m_strides(strides) {}
+
+ EIGEN_DEVICE_FUNC
+ const StartIndices& startIndices() const { return m_startIndices; }
+ EIGEN_DEVICE_FUNC
+ const StartIndices& stopIndices() const { return m_stopIndices; }
+ EIGEN_DEVICE_FUNC
+ const StartIndices& strides() const { return m_strides; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ expression() const { return m_xpr; }
+
+ EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorStridingSlicingOp)
+
+ protected:
+ typename XprType::Nested m_xpr;
+ const StartIndices m_startIndices;
+ const StopIndices m_stopIndices;
+ const Strides m_strides;
+};
+
+// Eval as rvalue
+template<typename StartIndices, typename StopIndices, typename Strides, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device>
+{
+ typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType> XprType;
+ static const int NumDims = internal::array_size<Strides>::value;
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+ typedef Strides Dimensions;
+
+ enum {
+ // Alignment can't be guaranteed at compile time since it depends on the
+ // slice offsets and sizes.
+ IsAligned = false,
+ PacketAccess = false,
+ BlockAccess = false,
+ PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ RawAccess = false
+ };
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_impl(op.expression(), device),
+ m_device(device),
+ m_strides(op.strides())
+ {
+ // Handle degenerate intervals by gracefully clamping and allowing m_dimensions to be zero
+ DSizes<Index, NumDims> startIndicesClamped, stopIndicesClamped;
+ for (ptrdiff_t i = 0; i < internal::array_size<Dimensions>::value; ++i) {
+ eigen_assert(m_strides[i] != 0 && "0 stride is invalid");
+ if (m_strides[i] > 0) {
+ startIndicesClamped[i] =
+ clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]);
+ stopIndicesClamped[i] =
+ clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]);
+ } else {
+ /* implies m_strides[i] < 0 by assert */
+ startIndicesClamped[i] =
+ clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1);
+ stopIndicesClamped[i] =
+ clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1);
+ }
+ m_startIndices[i] = startIndicesClamped[i];
+ }
+
+ typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
+ const InputDimensions& input_dims = m_impl.dimensions();
+
+ // compute output tensor shape
+ m_is_identity = true;
+ for (int i = 0; i < NumDims; i++) {
+ Index interval = stopIndicesClamped[i] - startIndicesClamped[i];
+ if (interval == 0 || ((interval < 0) != (m_strides[i] < 0))) {
+ m_dimensions[i] = 0;
+ } else {
+ m_dimensions[i] =
+ (interval / m_strides[i]) + (interval % m_strides[i] != 0 ? 1 : 0);
+ eigen_assert(m_dimensions[i] >= 0);
+ }
+ if (m_strides[i] != 1 || interval != m_impl.dimensions()[i]) {
+ m_is_identity = false;
+ }
+ }
+
+ Strides output_dims = m_dimensions;
+
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ m_inputStrides[0] = m_strides[0];
+ m_offsets[0] = startIndicesClamped[0];
+ Index previousDimProduct = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ previousDimProduct *= input_dims[i-1];
+ m_inputStrides[i] = previousDimProduct * m_strides[i];
+ m_offsets[i] = startIndicesClamped[i] * previousDimProduct;
+ }
+
+ // Don't initialize m_fastOutputStrides[0] since it won't ever be accessed.
+ m_outputStrides[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1];
+ m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1);
+ }
+ } else {
+ m_inputStrides[NumDims-1] = m_strides[NumDims-1];
+ m_offsets[NumDims-1] = startIndicesClamped[NumDims-1];
+ Index previousDimProduct = 1;
+ for (int i = NumDims - 2; i >= 0; --i) {
+ previousDimProduct *= input_dims[i+1];
+ m_inputStrides[i] = previousDimProduct * m_strides[i];
+ m_offsets[i] = startIndicesClamped[i] * previousDimProduct;
+ }
+
+ m_outputStrides[NumDims-1] = 1;
+ for (int i = NumDims - 2; i >= 0; --i) {
+ m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1];
+ m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1);
+ }
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
+ m_impl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+
+ EIGEN_STRONG_INLINE void cleanup() {
+ m_impl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ if (m_is_identity) {
+ return m_impl.coeff(index);
+ } else {
+ return m_impl.coeff(srcCoeff(index));
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, m_is_identity ? 1 : NumDims);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const {
+ return NULL;
+ }
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_impl.bind(cgh);
+ }
+#endif
+ protected:
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
+ {
+ Index inputIndex = 0;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ EIGEN_UNROLL_LOOP
+ for (int i = NumDims - 1; i >= 0; --i) {
+ const Index idx = index / m_fastOutputStrides[i];
+ inputIndex += idx * m_inputStrides[i] + m_offsets[i];
+ index -= idx * m_outputStrides[i];
+ }
+ } else {
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < NumDims; ++i) {
+ const Index idx = index / m_fastOutputStrides[i];
+ inputIndex += idx * m_inputStrides[i] + m_offsets[i];
+ index -= idx * m_outputStrides[i];
+ }
+ }
+ return inputIndex;
+ }
+
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) {
+#ifndef SYCL_DEVICE_ONLY
+ return numext::maxi(min, numext::mini(max,value));
+#else
+ return cl::sycl::clamp(value, min, max);
+#endif
+ }
+
+ array<Index, NumDims> m_outputStrides;
+ array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
+ array<Index, NumDims> m_inputStrides;
+ bool m_is_identity;
+ TensorEvaluator<ArgType, Device> m_impl;
+ const Device EIGEN_DEVICE_REF m_device;
+ DSizes<Index, NumDims> m_startIndices; // clamped startIndices
+ DSizes<Index, NumDims> m_dimensions;
+ DSizes<Index, NumDims> m_offsets; // offset in a flattened shape
+ const Strides m_strides;
+};
+
+// Eval as lvalue
+template<typename StartIndices, typename StopIndices, typename Strides, typename ArgType, typename Device>
+struct TensorEvaluator<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device>
+ : public TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device>
+{
+ typedef TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device> Base;
+ typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType> XprType;
+ static const int NumDims = internal::array_size<Strides>::value;
+
+ enum {
+ IsAligned = false,
+ PacketAccess = false,
+ BlockAccess = false,
+ PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess,
+ RawAccess = false
+ };
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : Base(op, device)
+ { }
+
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ typedef Strides Dimensions;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
+ {
+ if (this->m_is_identity) {
+ return this->m_impl.coeffRef(index);
+ } else {
+ return this->m_impl.coeffRef(this->srcCoeff(index));
+ }
+ }
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorPadding.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorPadding.h
new file mode 100644
index 0000000..ee44382
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorPadding.h
@@ -0,0 +1,708 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_PADDING_H
+#define EIGEN_CXX11_TENSOR_TENSOR_PADDING_H
+
+namespace Eigen {
+
+/** \class TensorPadding
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor padding class.
+ * At the moment only padding with a constant value is supported.
+ *
+ */
+namespace internal {
+template<typename PaddingDimensions, typename XprType>
+struct traits<TensorPaddingOp<PaddingDimensions, XprType> > : public traits<XprType>
+{
+ typedef typename XprType::Scalar Scalar;
+ typedef traits<XprType> XprTraits;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprTraits::Index Index;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = XprTraits::NumDimensions;
+ static const int Layout = XprTraits::Layout;
+ typedef typename XprTraits::PointerType PointerType;
+};
+
+template<typename PaddingDimensions, typename XprType>
+struct eval<TensorPaddingOp<PaddingDimensions, XprType>, Eigen::Dense>
+{
+ typedef const TensorPaddingOp<PaddingDimensions, XprType>& type;
+};
+
+template<typename PaddingDimensions, typename XprType>
+struct nested<TensorPaddingOp<PaddingDimensions, XprType>, 1, typename eval<TensorPaddingOp<PaddingDimensions, XprType> >::type>
+{
+ typedef TensorPaddingOp<PaddingDimensions, XprType> type;
+};
+
+} // end namespace internal
+
+
+
+template<typename PaddingDimensions, typename XprType>
+class TensorPaddingOp : public TensorBase<TensorPaddingOp<PaddingDimensions, XprType>, ReadOnlyAccessors>
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorPaddingOp>::Scalar Scalar;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename Eigen::internal::nested<TensorPaddingOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorPaddingOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorPaddingOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPaddingOp(const XprType& expr, const PaddingDimensions& padding_dims, const Scalar padding_value)
+ : m_xpr(expr), m_padding_dims(padding_dims), m_padding_value(padding_value) {}
+
+ EIGEN_DEVICE_FUNC
+ const PaddingDimensions& padding() const { return m_padding_dims; }
+ EIGEN_DEVICE_FUNC
+ Scalar padding_value() const { return m_padding_value; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ expression() const { return m_xpr; }
+
+ protected:
+ typename XprType::Nested m_xpr;
+ const PaddingDimensions m_padding_dims;
+ const Scalar m_padding_value;
+};
+
+
+// Eval as rvalue
+template<typename PaddingDimensions, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device>
+{
+ typedef TensorPaddingOp<PaddingDimensions, ArgType> XprType;
+ typedef typename XprType::Index Index;
+ static const int NumDims = internal::array_size<PaddingDimensions>::value;
+ typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+
+ enum {
+ IsAligned = true,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = TensorEvaluator<ArgType, Device>::RawAccess,
+ PreferBlockAccess = true,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = true,
+ RawAccess = false
+ };
+
+ typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+ typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
+ Layout, Index>
+ TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value()), m_device(device)
+ {
+ // The padding op doesn't change the rank of the tensor. Directly padding a scalar would lead
+ // to a vector, which doesn't make sense. Instead one should reshape the scalar into a vector
+ // of 1 element first and then pad.
+ EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+ // Compute dimensions
+ m_dimensions = m_impl.dimensions();
+ for (int i = 0; i < NumDims; ++i) {
+ m_dimensions[i] += m_padding[i].first + m_padding[i].second;
+ }
+ const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ m_inputStrides[0] = 1;
+ m_outputStrides[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
+ m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
+ }
+ m_outputStrides[NumDims] = m_outputStrides[NumDims-1] * m_dimensions[NumDims-1];
+ } else {
+ m_inputStrides[NumDims - 1] = 1;
+ m_outputStrides[NumDims] = 1;
+ for (int i = NumDims - 2; i >= 0; --i) {
+ m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
+ m_outputStrides[i+1] = m_outputStrides[i+2] * m_dimensions[i+1];
+ }
+ m_outputStrides[0] = m_outputStrides[1] * m_dimensions[0];
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
+ m_impl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+
+#ifdef EIGEN_USE_THREADS
+ template <typename EvalSubExprsCallback>
+ EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+ EvaluatorPointerType, EvalSubExprsCallback done) {
+ m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
+ }
+#endif // EIGEN_USE_THREADS
+
+ EIGEN_STRONG_INLINE void cleanup() {
+ m_impl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ eigen_assert(index < dimensions().TotalSize());
+ Index inputIndex = 0;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ EIGEN_UNROLL_LOOP
+ for (int i = NumDims - 1; i > 0; --i) {
+ const Index idx = index / m_outputStrides[i];
+ if (isPaddingAtIndexForDim(idx, i)) {
+ return m_paddingValue;
+ }
+ inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
+ index -= idx * m_outputStrides[i];
+ }
+ if (isPaddingAtIndexForDim(index, 0)) {
+ return m_paddingValue;
+ }
+ inputIndex += (index - m_padding[0].first);
+ } else {
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < NumDims - 1; ++i) {
+ const Index idx = index / m_outputStrides[i+1];
+ if (isPaddingAtIndexForDim(idx, i)) {
+ return m_paddingValue;
+ }
+ inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
+ index -= idx * m_outputStrides[i+1];
+ }
+ if (isPaddingAtIndexForDim(index, NumDims-1)) {
+ return m_paddingValue;
+ }
+ inputIndex += (index - m_padding[NumDims-1].first);
+ }
+ return m_impl.coeff(inputIndex);
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ {
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ return packetColMajor(index);
+ }
+ return packetRowMajor(index);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ TensorOpCost cost = m_impl.costPerCoeff(vectorized);
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < NumDims; ++i)
+ updateCostPerDimension(cost, i, i == 0);
+ } else {
+ EIGEN_UNROLL_LOOP
+ for (int i = NumDims - 1; i >= 0; --i)
+ updateCostPerDimension(cost, i, i == NumDims - 1);
+ }
+ return cost;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ internal::TensorBlockResourceRequirements getResourceRequirements() const {
+ const size_t target_size = m_device.lastLevelCacheSize();
+ return internal::TensorBlockResourceRequirements::merge(
+ internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size),
+ m_impl.getResourceRequirements());
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+ block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+ bool /*root_of_expr_ast*/ = false) const {
+ // If one of the dimensions is zero, return empty block view.
+ if (desc.size() == 0) {
+ return TensorBlock(internal::TensorBlockKind::kView, NULL,
+ desc.dimensions());
+ }
+
+ static const bool IsColMajor = Layout == static_cast<int>(ColMajor);
+ const int inner_dim_idx = IsColMajor ? 0 : NumDims - 1;
+
+ Index offset = desc.offset();
+
+ // Compute offsets in the output tensor corresponding to the desc.offset().
+ DSizes<Index, NumDims> output_offsets;
+ for (int i = NumDims - 1; i > 0; --i) {
+ const int dim = IsColMajor ? i : NumDims - i - 1;
+ const int stride_dim = IsColMajor ? dim : dim + 1;
+ output_offsets[dim] = offset / m_outputStrides[stride_dim];
+ offset -= output_offsets[dim] * m_outputStrides[stride_dim];
+ }
+ output_offsets[inner_dim_idx] = offset;
+
+ // Offsets in the input corresponding to output offsets.
+ DSizes<Index, NumDims> input_offsets = output_offsets;
+ for (int i = 0; i < NumDims; ++i) {
+ const int dim = IsColMajor ? i : NumDims - i - 1;
+ input_offsets[dim] = input_offsets[dim] - m_padding[dim].first;
+ }
+
+ // Compute offset in the input buffer (at this point it might be illegal and
+ // point outside of the input buffer, because we don't check for negative
+ // offsets, it will be autocorrected in the block iteration loop below).
+ Index input_offset = 0;
+ for (int i = 0; i < NumDims; ++i) {
+ const int dim = IsColMajor ? i : NumDims - i - 1;
+ input_offset += input_offsets[dim] * m_inputStrides[dim];
+ }
+
+ // Destination buffer and scratch buffer both indexed from 0 and have the
+ // same dimensions as the requested block (for destination buffer this
+ // property is guaranteed by `desc.destination()`).
+ Index output_offset = 0;
+ const DSizes<Index, NumDims> output_strides =
+ internal::strides<Layout>(desc.dimensions());
+
+ // NOTE(ezhulenev): We initialize bock iteration state for `NumDims - 1`
+ // dimensions, skipping innermost dimension. In theory it should be possible
+ // to squeeze matching innermost dimensions, however in practice that did
+ // not show any improvements in benchmarks. Also in practice first outer
+ // dimension usually has padding, and will prevent squeezing.
+
+ // Initialize output block iterator state. Dimension in this array are
+ // always in inner_most -> outer_most order (col major layout).
+ array<BlockIteratorState, NumDims - 1> it;
+ for (int i = 0; i < NumDims - 1; ++i) {
+ const int dim = IsColMajor ? i + 1 : NumDims - i - 2;
+ it[i].count = 0;
+ it[i].size = desc.dimension(dim);
+
+ it[i].input_stride = m_inputStrides[dim];
+ it[i].input_span = it[i].input_stride * (it[i].size - 1);
+
+ it[i].output_stride = output_strides[dim];
+ it[i].output_span = it[i].output_stride * (it[i].size - 1);
+ }
+
+ const Index input_inner_dim_size =
+ static_cast<Index>(m_impl.dimensions()[inner_dim_idx]);
+
+ // Total output size.
+ const Index output_size = desc.size();
+
+ // We will fill inner dimension of this size in the output. It might be
+ // larger than the inner dimension in the input, so we might have to pad
+ // before/after we copy values from the input inner dimension.
+ const Index output_inner_dim_size = desc.dimension(inner_dim_idx);
+
+ // How many values to fill with padding BEFORE reading from the input inner
+ // dimension.
+ const Index output_inner_pad_before_size =
+ input_offsets[inner_dim_idx] < 0
+ ? numext::mini(numext::abs(input_offsets[inner_dim_idx]),
+ output_inner_dim_size)
+ : 0;
+
+ // How many values we can actually copy from the input inner dimension.
+ const Index output_inner_copy_size = numext::mini(
+ // Want to copy from input.
+ (output_inner_dim_size - output_inner_pad_before_size),
+ // Can copy from input.
+ numext::maxi(input_inner_dim_size - (input_offsets[inner_dim_idx] +
+ output_inner_pad_before_size),
+ Index(0)));
+
+ eigen_assert(output_inner_copy_size >= 0);
+
+ // How many values to fill with padding AFTER reading from the input inner
+ // dimension.
+ const Index output_inner_pad_after_size =
+ (output_inner_dim_size - output_inner_copy_size -
+ output_inner_pad_before_size);
+
+ // Sanity check, sum of all sizes must be equal to the output size.
+ eigen_assert(output_inner_dim_size ==
+ (output_inner_pad_before_size + output_inner_copy_size +
+ output_inner_pad_after_size));
+
+ // Keep track of current coordinates and padding in the output.
+ DSizes<Index, NumDims> output_coord = output_offsets;
+ DSizes<Index, NumDims> output_padded;
+ for (int i = 0; i < NumDims; ++i) {
+ const int dim = IsColMajor ? i : NumDims - i - 1;
+ output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
+ }
+
+ typedef internal::StridedLinearBufferCopy<ScalarNoConst, Index> LinCopy;
+
+ // Prepare storage for the materialized padding result.
+ const typename TensorBlock::Storage block_storage =
+ TensorBlock::prepareStorage(desc, scratch);
+
+ // TODO(ezhulenev): Squeeze multiple non-padded inner dimensions into a
+ // single logical inner dimension.
+
+ // When possible we squeeze writes for the innermost (only if non-padded)
+ // dimension with the first padded dimension. This allows to reduce the
+ // number of calls to LinCopy and better utilize vector instructions.
+ const bool squeeze_writes =
+ NumDims > 1 &&
+ // inner dimension is not padded
+ (input_inner_dim_size == m_dimensions[inner_dim_idx]) &&
+ // and equal to the block inner dimension
+ (input_inner_dim_size == output_inner_dim_size);
+
+ const int squeeze_dim = IsColMajor ? inner_dim_idx + 1 : inner_dim_idx - 1;
+
+ // Maximum coordinate on a squeeze dimension that we can write to.
+ const Index squeeze_max_coord =
+ squeeze_writes ? numext::mini(
+ // max non-padded element in the input
+ static_cast<Index>(m_dimensions[squeeze_dim] -
+ m_padding[squeeze_dim].second),
+ // max element in the output buffer
+ static_cast<Index>(output_offsets[squeeze_dim] +
+ desc.dimension(squeeze_dim)))
+ : static_cast<Index>(0);
+
+ // Iterate copying data from `m_impl.data()` to the output buffer.
+ for (Index size = 0; size < output_size;) {
+ // Detect if we are in the padded region (exclude innermost dimension).
+ bool is_padded = false;
+ for (int j = 1; j < NumDims; ++j) {
+ const int dim = IsColMajor ? j : NumDims - j - 1;
+ is_padded = output_padded[dim];
+ if (is_padded) break;
+ }
+
+ if (is_padded) {
+ // Fill single innermost dimension with padding value.
+ size += output_inner_dim_size;
+
+ LinCopy::template Run<LinCopy::Kind::FillLinear>(
+ typename LinCopy::Dst(output_offset, 1, block_storage.data()),
+ typename LinCopy::Src(0, 0, &m_paddingValue),
+ output_inner_dim_size);
+
+
+ } else if (squeeze_writes) {
+ // Squeeze multiple reads from innermost dimensions.
+ const Index squeeze_num = squeeze_max_coord - output_coord[squeeze_dim];
+ size += output_inner_dim_size * squeeze_num;
+
+ // Copy `squeeze_num` inner dimensions from input to output.
+ LinCopy::template Run<LinCopy::Kind::Linear>(
+ typename LinCopy::Dst(output_offset, 1, block_storage.data()),
+ typename LinCopy::Src(input_offset, 1, m_impl.data()),
+ output_inner_dim_size * squeeze_num);
+
+ // Update iteration state for only `squeeze_num - 1` processed inner
+ // dimensions, because we have another iteration state update at the end
+ // of the loop that will update iteration state for the last inner
+ // processed dimension.
+ it[0].count += (squeeze_num - 1);
+ input_offset += it[0].input_stride * (squeeze_num - 1);
+ output_offset += it[0].output_stride * (squeeze_num - 1);
+ output_coord[squeeze_dim] += (squeeze_num - 1);
+
+ } else {
+ // Single read from innermost dimension.
+ size += output_inner_dim_size;
+
+ { // Fill with padding before copying from input inner dimension.
+ const Index out = output_offset;
+
+ LinCopy::template Run<LinCopy::Kind::FillLinear>(
+ typename LinCopy::Dst(out, 1, block_storage.data()),
+ typename LinCopy::Src(0, 0, &m_paddingValue),
+ output_inner_pad_before_size);
+ }
+
+ { // Copy data from input inner dimension.
+ const Index out = output_offset + output_inner_pad_before_size;
+ const Index in = input_offset + output_inner_pad_before_size;
+
+ eigen_assert(output_inner_copy_size == 0 || m_impl.data() != NULL);
+
+ LinCopy::template Run<LinCopy::Kind::Linear>(
+ typename LinCopy::Dst(out, 1, block_storage.data()),
+ typename LinCopy::Src(in, 1, m_impl.data()),
+ output_inner_copy_size);
+ }
+
+ { // Fill with padding after copying from input inner dimension.
+ const Index out = output_offset + output_inner_pad_before_size +
+ output_inner_copy_size;
+
+ LinCopy::template Run<LinCopy::Kind::FillLinear>(
+ typename LinCopy::Dst(out, 1, block_storage.data()),
+ typename LinCopy::Src(0, 0, &m_paddingValue),
+ output_inner_pad_after_size);
+ }
+ }
+
+ for (int j = 0; j < NumDims - 1; ++j) {
+ const int dim = IsColMajor ? j + 1 : NumDims - j - 2;
+
+ if (++it[j].count < it[j].size) {
+ input_offset += it[j].input_stride;
+ output_offset += it[j].output_stride;
+ output_coord[dim] += 1;
+ output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
+ break;
+ }
+ it[j].count = 0;
+ input_offset -= it[j].input_span;
+ output_offset -= it[j].output_span;
+ output_coord[dim] -= it[j].size - 1;
+ output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
+ }
+ }
+
+ return block_storage.AsTensorMaterializedBlock();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_impl.bind(cgh);
+ }
+#endif
+
+ private:
+ struct BlockIteratorState {
+ BlockIteratorState()
+ : count(0),
+ size(0),
+ input_stride(0),
+ input_span(0),
+ output_stride(0),
+ output_span(0) {}
+
+ Index count;
+ Index size;
+ Index input_stride;
+ Index input_span;
+ Index output_stride;
+ Index output_span;
+ };
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim(
+ Index index, int dim_index) const {
+#if defined(EIGEN_HAS_INDEX_LIST)
+ return (!internal::index_pair_first_statically_eq<PaddingDimensions>(dim_index, 0) &&
+ index < m_padding[dim_index].first) ||
+ (!internal::index_pair_second_statically_eq<PaddingDimensions>(dim_index, 0) &&
+ index >= m_dimensions[dim_index] - m_padding[dim_index].second);
+#else
+ return (index < m_padding[dim_index].first) ||
+ (index >= m_dimensions[dim_index] - m_padding[dim_index].second);
+#endif
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isLeftPaddingCompileTimeZero(
+ int dim_index) const {
+#if defined(EIGEN_HAS_INDEX_LIST)
+ return internal::index_pair_first_statically_eq<PaddingDimensions>(dim_index, 0);
+#else
+ EIGEN_UNUSED_VARIABLE(dim_index);
+ return false;
+#endif
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isRightPaddingCompileTimeZero(
+ int dim_index) const {
+#if defined(EIGEN_HAS_INDEX_LIST)
+ return internal::index_pair_second_statically_eq<PaddingDimensions>(dim_index, 0);
+#else
+ EIGEN_UNUSED_VARIABLE(dim_index);
+ return false;
+#endif
+ }
+
+
+ void updateCostPerDimension(TensorOpCost& cost, int i, bool first) const {
+ const double in = static_cast<double>(m_impl.dimensions()[i]);
+ const double out = in + m_padding[i].first + m_padding[i].second;
+ if (out == 0)
+ return;
+ const double reduction = in / out;
+ cost *= reduction;
+ if (first) {
+ cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost<Index>() +
+ reduction * (1 * TensorOpCost::AddCost<Index>()));
+ } else {
+ cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost<Index>() +
+ 2 * TensorOpCost::MulCost<Index>() +
+ reduction * (2 * TensorOpCost::MulCost<Index>() +
+ 1 * TensorOpCost::DivCost<Index>()));
+ }
+ }
+
+ protected:
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
+ {
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+ const Index initialIndex = index;
+ Index inputIndex = 0;
+ EIGEN_UNROLL_LOOP
+ for (int i = NumDims - 1; i > 0; --i) {
+ const Index firstIdx = index;
+ const Index lastIdx = index + PacketSize - 1;
+ const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i];
+ const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i];
+ const Index lastPaddedRight = m_outputStrides[i+1];
+
+ if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) {
+ // all the coefficient are in the padding zone.
+ return internal::pset1<PacketReturnType>(m_paddingValue);
+ }
+ else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
+ // all the coefficient are in the padding zone.
+ return internal::pset1<PacketReturnType>(m_paddingValue);
+ }
+ else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
+ // all the coefficient are between the 2 padding zones.
+ const Index idx = index / m_outputStrides[i];
+ inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
+ index -= idx * m_outputStrides[i];
+ }
+ else {
+ // Every other case
+ return packetWithPossibleZero(initialIndex);
+ }
+ }
+
+ const Index lastIdx = index + PacketSize - 1;
+ const Index firstIdx = index;
+ const Index lastPaddedLeft = m_padding[0].first;
+ const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second);
+ const Index lastPaddedRight = m_outputStrides[1];
+
+ if (!isLeftPaddingCompileTimeZero(0) && lastIdx < lastPaddedLeft) {
+ // all the coefficient are in the padding zone.
+ return internal::pset1<PacketReturnType>(m_paddingValue);
+ }
+ else if (!isRightPaddingCompileTimeZero(0) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
+ // all the coefficient are in the padding zone.
+ return internal::pset1<PacketReturnType>(m_paddingValue);
+ }
+ else if ((isLeftPaddingCompileTimeZero(0) && isRightPaddingCompileTimeZero(0)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
+ // all the coefficient are between the 2 padding zones.
+ inputIndex += (index - m_padding[0].first);
+ return m_impl.template packet<Unaligned>(inputIndex);
+ }
+ // Every other case
+ return packetWithPossibleZero(initialIndex);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
+ {
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+ const Index initialIndex = index;
+ Index inputIndex = 0;
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < NumDims - 1; ++i) {
+ const Index firstIdx = index;
+ const Index lastIdx = index + PacketSize - 1;
+ const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i+1];
+ const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1];
+ const Index lastPaddedRight = m_outputStrides[i];
+
+ if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) {
+ // all the coefficient are in the padding zone.
+ return internal::pset1<PacketReturnType>(m_paddingValue);
+ }
+ else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
+ // all the coefficient are in the padding zone.
+ return internal::pset1<PacketReturnType>(m_paddingValue);
+ }
+ else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
+ // all the coefficient are between the 2 padding zones.
+ const Index idx = index / m_outputStrides[i+1];
+ inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
+ index -= idx * m_outputStrides[i+1];
+ }
+ else {
+ // Every other case
+ return packetWithPossibleZero(initialIndex);
+ }
+ }
+
+ const Index lastIdx = index + PacketSize - 1;
+ const Index firstIdx = index;
+ const Index lastPaddedLeft = m_padding[NumDims-1].first;
+ const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second);
+ const Index lastPaddedRight = m_outputStrides[NumDims-1];
+
+ if (!isLeftPaddingCompileTimeZero(NumDims-1) && lastIdx < lastPaddedLeft) {
+ // all the coefficient are in the padding zone.
+ return internal::pset1<PacketReturnType>(m_paddingValue);
+ }
+ else if (!isRightPaddingCompileTimeZero(NumDims-1) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
+ // all the coefficient are in the padding zone.
+ return internal::pset1<PacketReturnType>(m_paddingValue);
+ }
+ else if ((isLeftPaddingCompileTimeZero(NumDims-1) && isRightPaddingCompileTimeZero(NumDims-1)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
+ // all the coefficient are between the 2 padding zones.
+ inputIndex += (index - m_padding[NumDims-1].first);
+ return m_impl.template packet<Unaligned>(inputIndex);
+ }
+ // Every other case
+ return packetWithPossibleZero(initialIndex);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
+ {
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < PacketSize; ++i) {
+ values[i] = coeff(index+i);
+ }
+ PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+ return rslt;
+ }
+
+ Dimensions m_dimensions;
+ array<Index, NumDims+1> m_outputStrides;
+ array<Index, NumDims> m_inputStrides;
+ TensorEvaluator<ArgType, Device> m_impl;
+ PaddingDimensions m_padding;
+
+ Scalar m_paddingValue;
+
+ const Device EIGEN_DEVICE_REF m_device;
+};
+
+
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_PADDING_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorPatch.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorPatch.h
new file mode 100644
index 0000000..413d25d
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorPatch.h
@@ -0,0 +1,291 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_PATCH_H
+#define EIGEN_CXX11_TENSOR_TENSOR_PATCH_H
+
+namespace Eigen {
+
+/** \class TensorPatch
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor patch class.
+ *
+ *
+ */
+namespace internal {
+template<typename PatchDim, typename XprType>
+struct traits<TensorPatchOp<PatchDim, XprType> > : public traits<XprType>
+{
+ typedef typename XprType::Scalar Scalar;
+ typedef traits<XprType> XprTraits;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprTraits::Index Index;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = XprTraits::NumDimensions + 1;
+ static const int Layout = XprTraits::Layout;
+ typedef typename XprTraits::PointerType PointerType;
+};
+
+template<typename PatchDim, typename XprType>
+struct eval<TensorPatchOp<PatchDim, XprType>, Eigen::Dense>
+{
+ typedef const TensorPatchOp<PatchDim, XprType>& type;
+};
+
+template<typename PatchDim, typename XprType>
+struct nested<TensorPatchOp<PatchDim, XprType>, 1, typename eval<TensorPatchOp<PatchDim, XprType> >::type>
+{
+ typedef TensorPatchOp<PatchDim, XprType> type;
+};
+
+} // end namespace internal
+
+
+
+template<typename PatchDim, typename XprType>
+class TensorPatchOp : public TensorBase<TensorPatchOp<PatchDim, XprType>, ReadOnlyAccessors>
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorPatchOp>::Scalar Scalar;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename Eigen::internal::nested<TensorPatchOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorPatchOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorPatchOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPatchOp(const XprType& expr, const PatchDim& patch_dims)
+ : m_xpr(expr), m_patch_dims(patch_dims) {}
+
+ EIGEN_DEVICE_FUNC
+ const PatchDim& patch_dims() const { return m_patch_dims; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ expression() const { return m_xpr; }
+
+ protected:
+ typename XprType::Nested m_xpr;
+ const PatchDim m_patch_dims;
+};
+
+
+// Eval as rvalue
+template<typename PatchDim, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
+{
+ typedef TensorPatchOp<PatchDim, ArgType> XprType;
+ typedef typename XprType::Index Index;
+ static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value + 1;
+ typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+
+
+ enum {
+ IsAligned = false,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
+ PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false,
+ RawAccess = false
+ };
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_impl(op.expression(), device)
+ {
+ Index num_patches = 1;
+ const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+ const PatchDim& patch_dims = op.patch_dims();
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = 0; i < NumDims-1; ++i) {
+ m_dimensions[i] = patch_dims[i];
+ num_patches *= (input_dims[i] - patch_dims[i] + 1);
+ }
+ m_dimensions[NumDims-1] = num_patches;
+
+ m_inputStrides[0] = 1;
+ m_patchStrides[0] = 1;
+ for (int i = 1; i < NumDims-1; ++i) {
+ m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
+ m_patchStrides[i] = m_patchStrides[i-1] * (input_dims[i-1] - patch_dims[i-1] + 1);
+ }
+ m_outputStrides[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
+ }
+ } else {
+ for (int i = 0; i < NumDims-1; ++i) {
+ m_dimensions[i+1] = patch_dims[i];
+ num_patches *= (input_dims[i] - patch_dims[i] + 1);
+ }
+ m_dimensions[0] = num_patches;
+
+ m_inputStrides[NumDims-2] = 1;
+ m_patchStrides[NumDims-2] = 1;
+ for (int i = NumDims-3; i >= 0; --i) {
+ m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
+ m_patchStrides[i] = m_patchStrides[i+1] * (input_dims[i+1] - patch_dims[i+1] + 1);
+ }
+ m_outputStrides[NumDims-1] = 1;
+ for (int i = NumDims-2; i >= 0; --i) {
+ m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
+ }
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
+ m_impl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+
+ EIGEN_STRONG_INLINE void cleanup() {
+ m_impl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ Index output_stride_index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? NumDims - 1 : 0;
+ // Find the location of the first element of the patch.
+ Index patchIndex = index / m_outputStrides[output_stride_index];
+ // Find the offset of the element wrt the location of the first element.
+ Index patchOffset = index - patchIndex * m_outputStrides[output_stride_index];
+ Index inputIndex = 0;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ EIGEN_UNROLL_LOOP
+ for (int i = NumDims - 2; i > 0; --i) {
+ const Index patchIdx = patchIndex / m_patchStrides[i];
+ patchIndex -= patchIdx * m_patchStrides[i];
+ const Index offsetIdx = patchOffset / m_outputStrides[i];
+ patchOffset -= offsetIdx * m_outputStrides[i];
+ inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i];
+ }
+ } else {
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < NumDims - 2; ++i) {
+ const Index patchIdx = patchIndex / m_patchStrides[i];
+ patchIndex -= patchIdx * m_patchStrides[i];
+ const Index offsetIdx = patchOffset / m_outputStrides[i+1];
+ patchOffset -= offsetIdx * m_outputStrides[i+1];
+ inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i];
+ }
+ }
+ inputIndex += (patchIndex + patchOffset);
+ return m_impl.coeff(inputIndex);
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ {
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+ Index output_stride_index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? NumDims - 1 : 0;
+ Index indices[2] = {index, index + PacketSize - 1};
+ Index patchIndices[2] = {indices[0] / m_outputStrides[output_stride_index],
+ indices[1] / m_outputStrides[output_stride_index]};
+ Index patchOffsets[2] = {indices[0] - patchIndices[0] * m_outputStrides[output_stride_index],
+ indices[1] - patchIndices[1] * m_outputStrides[output_stride_index]};
+
+ Index inputIndices[2] = {0, 0};
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ EIGEN_UNROLL_LOOP
+ for (int i = NumDims - 2; i > 0; --i) {
+ const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i],
+ patchIndices[1] / m_patchStrides[i]};
+ patchIndices[0] -= patchIdx[0] * m_patchStrides[i];
+ patchIndices[1] -= patchIdx[1] * m_patchStrides[i];
+
+ const Index offsetIdx[2] = {patchOffsets[0] / m_outputStrides[i],
+ patchOffsets[1] / m_outputStrides[i]};
+ patchOffsets[0] -= offsetIdx[0] * m_outputStrides[i];
+ patchOffsets[1] -= offsetIdx[1] * m_outputStrides[i];
+
+ inputIndices[0] += (patchIdx[0] + offsetIdx[0]) * m_inputStrides[i];
+ inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i];
+ }
+ } else {
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < NumDims - 2; ++i) {
+ const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i],
+ patchIndices[1] / m_patchStrides[i]};
+ patchIndices[0] -= patchIdx[0] * m_patchStrides[i];
+ patchIndices[1] -= patchIdx[1] * m_patchStrides[i];
+
+ const Index offsetIdx[2] = {patchOffsets[0] / m_outputStrides[i+1],
+ patchOffsets[1] / m_outputStrides[i+1]};
+ patchOffsets[0] -= offsetIdx[0] * m_outputStrides[i+1];
+ patchOffsets[1] -= offsetIdx[1] * m_outputStrides[i+1];
+
+ inputIndices[0] += (patchIdx[0] + offsetIdx[0]) * m_inputStrides[i];
+ inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i];
+ }
+ }
+ inputIndices[0] += (patchIndices[0] + patchOffsets[0]);
+ inputIndices[1] += (patchIndices[1] + patchOffsets[1]);
+
+ if (inputIndices[1] - inputIndices[0] == PacketSize - 1) {
+ PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]);
+ return rslt;
+ }
+ else {
+ EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize];
+ values[0] = m_impl.coeff(inputIndices[0]);
+ values[PacketSize-1] = m_impl.coeff(inputIndices[1]);
+ EIGEN_UNROLL_LOOP
+ for (int i = 1; i < PacketSize-1; ++i) {
+ values[i] = coeff(index+i);
+ }
+ PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+ return rslt;
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ const double compute_cost = NumDims * (TensorOpCost::DivCost<Index>() +
+ TensorOpCost::MulCost<Index>() +
+ 2 * TensorOpCost::AddCost<Index>());
+ return m_impl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
+ }
+
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_impl.bind(cgh);
+ }
+#endif
+
+ protected:
+ Dimensions m_dimensions;
+ array<Index, NumDims> m_outputStrides;
+ array<Index, NumDims-1> m_inputStrides;
+ array<Index, NumDims-1> m_patchStrides;
+
+ TensorEvaluator<ArgType, Device> m_impl;
+
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_PATCH_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorRandom.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorRandom.h
new file mode 100644
index 0000000..37c1d1c
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorRandom.h
@@ -0,0 +1,322 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2018 Mehdi Goli <eigen@codeplay.com> Codeplay Software Ltd.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
+#define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
+
+namespace Eigen {
+namespace internal {
+
+namespace {
+
+EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+ // We don't support 3d kernels since we currently only use 1 and
+ // 2d kernels.
+ gpu_assert(threadIdx.z == 0);
+ return blockIdx.x * blockDim.x + threadIdx.x
+ + gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);
+#else
+ // Rely on Eigen's random implementation.
+ return random<uint64_t>();
+#endif
+}
+
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state, uint64_t stream) {
+ // TODO: Unify with the implementation in the non blocking thread pool.
+ uint64_t current = *state;
+ // Update the internal state
+ *state = current * 6364136223846793005ULL + (stream << 1 | 1);
+ // Generate the random output (using the PCG-XSH-RS scheme)
+ return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
+}
+
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) {
+ seed = seed ? seed : get_random_seed();
+ return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
+}
+
+} // namespace
+
+
+template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+T RandomToTypeUniform(uint64_t* state, uint64_t stream) {
+ unsigned rnd = PCG_XSH_RS_generator(state, stream);
+ return static_cast<T>(rnd);
+}
+
+
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state, uint64_t stream) {
+ // Generate 10 random bits for the mantissa, merge with exponent.
+ unsigned rnd = PCG_XSH_RS_generator(state, stream);
+ const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x3ffu) | (static_cast<uint16_t>(15) << 10);
+ Eigen::half result = Eigen::numext::bit_cast<Eigen::half>(half_bits);
+ // Return the final result
+ return result - Eigen::half(1.0f);
+}
+
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+Eigen::bfloat16 RandomToTypeUniform<Eigen::bfloat16>(uint64_t* state, uint64_t stream) {
+
+ // Generate 7 random bits for the mantissa, merge with exponent.
+ unsigned rnd = PCG_XSH_RS_generator(state, stream);
+ const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x7fu) | (static_cast<uint16_t>(127) << 7);
+ Eigen::bfloat16 result = Eigen::numext::bit_cast<Eigen::bfloat16>(half_bits);
+ // Return the final result
+ return result - Eigen::bfloat16(1.0f);
+}
+
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float RandomToTypeUniform<float>(uint64_t* state, uint64_t stream) {
+ typedef union {
+ uint32_t raw;
+ float fp;
+ } internal;
+ internal result;
+ // Generate 23 random bits for the mantissa mantissa
+ const unsigned rnd = PCG_XSH_RS_generator(state, stream);
+ result.raw = rnd & 0x7fffffu;
+ // Set the exponent
+ result.raw |= (static_cast<uint32_t>(127) << 23);
+ // Return the final result
+ return result.fp - 1.0f;
+}
+
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double RandomToTypeUniform<double>(uint64_t* state, uint64_t stream) {
+ typedef union {
+ uint64_t raw;
+ double dp;
+ } internal;
+ internal result;
+ result.raw = 0;
+ // Generate 52 random bits for the mantissa
+ // First generate the upper 20 bits
+ unsigned rnd1 = PCG_XSH_RS_generator(state, stream) & 0xfffffu;
+ // The generate the lower 32 bits
+ unsigned rnd2 = PCG_XSH_RS_generator(state, stream);
+ result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2;
+ // Set the exponent
+ result.raw |= (static_cast<uint64_t>(1023) << 52);
+ // Return the final result
+ return result.dp - 1.0;
+}
+
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state, uint64_t stream) {
+ return std::complex<float>(RandomToTypeUniform<float>(state, stream),
+ RandomToTypeUniform<float>(state, stream));
+}
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state, uint64_t stream) {
+ return std::complex<double>(RandomToTypeUniform<double>(state, stream),
+ RandomToTypeUniform<double>(state, stream));
+}
+
+template <typename T> class UniformRandomGenerator {
+ public:
+ static const bool PacketAccess = true;
+
+ // Uses the given "seed" if non-zero, otherwise uses a random seed.
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
+ uint64_t seed = 0) {
+ m_state = PCG_XSH_RS_state(seed);
+ #ifdef EIGEN_USE_SYCL
+ // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
+ // Therefor, we need two step to initializate the m_state.
+ // IN SYCL, the constructor of the functor is s called on the CPU
+ // and we get the clock seed here from the CPU. However, This seed is
+ //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
+ // and only available on the Operator() function (which is called on the GPU).
+ // Thus for CUDA (((CLOCK + global_thread_id)* 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread
+ // but for SYCL ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread and each thread adds
+ // the (global_thread_id* 6364136223846793005ULL) for itself only once, in order to complete the construction
+ // similar to CUDA Therefore, the thread Id injection is not available at this stage.
+ //However when the operator() is called the thread ID will be avilable. So inside the opeator,
+ // we add the thrreadID, BlockId,... (which is equivalent of i)
+ //to the seed and construct the unique m_state per thead similar to cuda.
+ m_exec_once =false;
+ #endif
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
+ const UniformRandomGenerator& other) {
+ m_state = other.m_state;
+ #ifdef EIGEN_USE_SYCL
+ m_exec_once =other.m_exec_once;
+ #endif
+ }
+
+ template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ T operator()(Index i) const {
+ #ifdef EIGEN_USE_SYCL
+ if(!m_exec_once) {
+ // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
+ // The (i * 6364136223846793005ULL) is the remaining part of the PCG_XSH_RS_state on the GPU side
+ m_state += (i * 6364136223846793005ULL);
+ m_exec_once =true;
+ }
+ #endif
+ T result = RandomToTypeUniform<T>(&m_state, i);
+ return result;
+ }
+
+ template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Packet packetOp(Index i) const {
+ const int packetSize = internal::unpacket_traits<Packet>::size;
+ EIGEN_ALIGN_MAX T values[packetSize];
+ #ifdef EIGEN_USE_SYCL
+ if(!m_exec_once) {
+ // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
+ m_state += (i * 6364136223846793005ULL);
+ m_exec_once =true;
+ }
+ #endif
+ EIGEN_UNROLL_LOOP
+ for (int j = 0; j < packetSize; ++j) {
+ values[j] = RandomToTypeUniform<T>(&m_state, i);
+ }
+ return internal::pload<Packet>(values);
+ }
+
+ private:
+ mutable uint64_t m_state;
+ #ifdef EIGEN_USE_SYCL
+ mutable bool m_exec_once;
+ #endif
+};
+
+template <typename Scalar>
+struct functor_traits<UniformRandomGenerator<Scalar> > {
+ enum {
+ // Rough estimate for floating point, multiplied by ceil(sizeof(T) / sizeof(float)).
+ Cost = 12 * NumTraits<Scalar>::AddCost *
+ ((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)),
+ PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess
+ };
+};
+
+
+
+template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+T RandomToTypeNormal(uint64_t* state, uint64_t stream) {
+ // Use the ratio of uniform method to generate numbers following a normal
+ // distribution. See for example Numerical Recipes chapter 7.3.9 for the
+ // details.
+ T u, v, q;
+ do {
+ u = RandomToTypeUniform<T>(state, stream);
+ v = T(1.7156) * (RandomToTypeUniform<T>(state, stream) - T(0.5));
+ const T x = u - T(0.449871);
+ const T y = numext::abs(v) + T(0.386595);
+ q = x*x + y * (T(0.196)*y - T(0.25472)*x);
+ } while (q > T(0.27597) &&
+ (q > T(0.27846) || v*v > T(-4) * numext::log(u) * u*u));
+
+ return v/u;
+}
+
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state, uint64_t stream) {
+ return std::complex<float>(RandomToTypeNormal<float>(state, stream),
+ RandomToTypeNormal<float>(state, stream));
+}
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state, uint64_t stream) {
+ return std::complex<double>(RandomToTypeNormal<double>(state, stream),
+ RandomToTypeNormal<double>(state, stream));
+}
+
+
+template <typename T> class NormalRandomGenerator {
+ public:
+ static const bool PacketAccess = true;
+
+ // Uses the given "seed" if non-zero, otherwise uses a random seed.
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) {
+ m_state = PCG_XSH_RS_state(seed);
+ #ifdef EIGEN_USE_SYCL
+ // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
+ // Therefor, we need two steps to initializate the m_state.
+ // IN SYCL, the constructor of the functor is s called on the CPU
+ // and we get the clock seed here from the CPU. However, This seed is
+ //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
+ // and only available on the Operator() function (which is called on the GPU).
+ // Therefore, the thread Id injection is not available at this stage. However when the operator()
+ //is called the thread ID will be avilable. So inside the opeator,
+ // we add the thrreadID, BlockId,... (which is equivalent of i)
+ //to the seed and construct the unique m_state per thead similar to cuda.
+ m_exec_once =false;
+ #endif
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(
+ const NormalRandomGenerator& other) {
+ m_state = other.m_state;
+#ifdef EIGEN_USE_SYCL
+ m_exec_once=other.m_exec_once;
+#endif
+ }
+
+ template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ T operator()(Index i) const {
+ #ifdef EIGEN_USE_SYCL
+ if(!m_exec_once) {
+ // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
+ m_state += (i * 6364136223846793005ULL);
+ m_exec_once =true;
+ }
+ #endif
+ T result = RandomToTypeNormal<T>(&m_state, i);
+ return result;
+ }
+
+ template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ Packet packetOp(Index i) const {
+ const int packetSize = internal::unpacket_traits<Packet>::size;
+ EIGEN_ALIGN_MAX T values[packetSize];
+ #ifdef EIGEN_USE_SYCL
+ if(!m_exec_once) {
+ // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
+ m_state += (i * 6364136223846793005ULL);
+ m_exec_once =true;
+ }
+ #endif
+ EIGEN_UNROLL_LOOP
+ for (int j = 0; j < packetSize; ++j) {
+ values[j] = RandomToTypeNormal<T>(&m_state, i);
+ }
+ return internal::pload<Packet>(values);
+ }
+
+ private:
+ mutable uint64_t m_state;
+ #ifdef EIGEN_USE_SYCL
+ mutable bool m_exec_once;
+ #endif
+};
+
+
+template <typename Scalar>
+struct functor_traits<NormalRandomGenerator<Scalar> > {
+ enum {
+ // On average, we need to generate about 3 random numbers
+ // 15 mul, 8 add, 1.5 logs
+ Cost = 3 * functor_traits<UniformRandomGenerator<Scalar> >::Cost +
+ 15 * NumTraits<Scalar>::AddCost + 8 * NumTraits<Scalar>::AddCost +
+ 3 * functor_traits<scalar_log_op<Scalar> >::Cost / 2,
+ PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess
+ };
+};
+
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorReduction.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorReduction.h
new file mode 100644
index 0000000..583f462
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorReduction.h
@@ -0,0 +1,998 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2016 Mehdi Goli, Codeplay Software Ltd <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
+#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
+
+// clang is incompatible with the CUDA syntax wrt making a kernel a class friend,
+// so we'll use a macro to make clang happy.
+#ifndef KERNEL_FRIEND
+#if defined(__clang__) && (defined(__CUDA__) || defined(__HIP__))
+#define KERNEL_FRIEND friend __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024
+#else
+#define KERNEL_FRIEND friend
+#endif
+#endif
+
+
+namespace Eigen {
+
+
+/** \class TensorReduction
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor reduction class.
+ *
+ */
+
+namespace internal {
+ template<typename Op, typename Dims, typename XprType,template <class> class MakePointer_ >
+ struct traits<TensorReductionOp<Op, Dims, XprType, MakePointer_> >
+ : traits<XprType>
+{
+ typedef traits<XprType> XprTraits;
+ typedef typename XprTraits::Scalar Scalar;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprTraits::Index Index;
+ typedef typename XprType::Nested Nested;
+ static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
+ static const int Layout = XprTraits::Layout;
+ typedef typename XprTraits::PointerType PointerType;
+
+ template <class T> struct MakePointer {
+ // Intermediate typedef to workaround MSVC issue.
+ typedef MakePointer_<T> MakePointerT;
+ typedef typename MakePointerT::Type Type;
+ };
+};
+
+template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
+struct eval<TensorReductionOp<Op, Dims, XprType, MakePointer_>, Eigen::Dense>
+{
+ typedef const TensorReductionOp<Op, Dims, XprType, MakePointer_>& type;
+};
+
+template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
+struct nested<TensorReductionOp<Op, Dims, XprType, MakePointer_>, 1, typename eval<TensorReductionOp<Op, Dims, XprType, MakePointer_> >::type>
+{
+ typedef TensorReductionOp<Op, Dims, XprType, MakePointer_> type;
+};
+
+
+template <typename OutputDims> struct DimInitializer {
+ template <typename InputDims, typename ReducedDims> EIGEN_DEVICE_FUNC
+ static void run(const InputDims& input_dims,
+ const array<bool, internal::array_size<InputDims>::value>& reduced,
+ OutputDims* output_dims, ReducedDims* reduced_dims) {
+ const int NumInputDims = internal::array_size<InputDims>::value;
+ int outputIndex = 0;
+ int reduceIndex = 0;
+ for (int i = 0; i < NumInputDims; ++i) {
+ if (reduced[i]) {
+ (*reduced_dims)[reduceIndex] = input_dims[i];
+ ++reduceIndex;
+ } else {
+ (*output_dims)[outputIndex] = input_dims[i];
+ ++outputIndex;
+ }
+ }
+ }
+};
+
+template <> struct DimInitializer<Sizes<> > {
+ template <typename InputDims, typename Index, size_t Rank> EIGEN_DEVICE_FUNC
+ static void run(const InputDims& input_dims, const array<bool, Rank>&,
+ Sizes<>*, array<Index, Rank>* reduced_dims) {
+ const int NumInputDims = internal::array_size<InputDims>::value;
+ for (int i = 0; i < NumInputDims; ++i) {
+ (*reduced_dims)[i] = input_dims[i];
+ }
+ }
+};
+
+
+template <typename ReducedDims, int NumTensorDims, int Layout>
+struct are_inner_most_dims {
+ static const bool value = false;
+};
+template <typename ReducedDims, int NumTensorDims, int Layout>
+struct preserve_inner_most_dims {
+ static const bool value = false;
+};
+
+#if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES
+template <typename ReducedDims, int NumTensorDims>
+struct are_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
+ static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
+ static const bool tmp2 = index_statically_eq<ReducedDims>(0, 0);
+ static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value-1, array_size<ReducedDims>::value-1);
+ static const bool value = tmp1 & tmp2 & tmp3;
+};
+template <typename ReducedDims, int NumTensorDims>
+struct are_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
+ static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
+ static const bool tmp2 = index_statically_eq<ReducedDims>(0, NumTensorDims - array_size<ReducedDims>::value);
+ static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
+ static const bool value = tmp1 & tmp2 & tmp3;
+
+};
+template <typename ReducedDims, int NumTensorDims>
+struct preserve_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
+ static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
+ static const bool tmp2 = index_statically_gt<ReducedDims>(0, 0);
+ static const bool value = tmp1 & tmp2;
+
+};
+template <typename ReducedDims, int NumTensorDims>
+struct preserve_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
+ static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
+ static const bool tmp2 = index_statically_lt<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
+ static const bool value = tmp1 & tmp2;
+};
+#endif
+
+
+template <int DimIndex, typename Self, typename Op>
+struct GenericDimReducer {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
+ EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
+ const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
+ GenericDimReducer<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
+ }
+ }
+};
+template <typename Self, typename Op>
+struct GenericDimReducer<0, Self, Op> {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
+ for (int j = 0; j < self.m_reducedDims[0]; ++j) {
+ const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
+ reducer.reduce(self.m_impl.coeff(input), accum);
+ }
+ }
+};
+template <typename Self, typename Op>
+struct GenericDimReducer<-1, Self, Op> {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index index, Op& reducer, typename Self::CoeffReturnType* accum) {
+ reducer.reduce(self.m_impl.coeff(index), accum);
+ }
+};
+
+template <typename Self, typename Op, bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess),
+ bool UseTreeReduction = (!Self::ReducerTraits::IsStateful &&
+ !Self::ReducerTraits::IsExactlyAssociative)>
+struct InnerMostDimReducer {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
+ typename Self::CoeffReturnType accum = reducer.initialize();
+ for (typename Self::Index j = 0; j < numValuesToReduce; ++j) {
+ reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
+ }
+ return reducer.finalize(accum);
+ }
+};
+
+template <typename Self, typename Op>
+struct InnerMostDimReducer<Self, Op, true, false> {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
+ const typename Self::Index packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
+ const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize;
+ typename Self::PacketReturnType paccum = reducer.template initializePacket<typename Self::PacketReturnType>();
+ for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) {
+ reducer.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum);
+ }
+ typename Self::CoeffReturnType accum = reducer.initialize();
+ for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) {
+ reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
+ }
+ return reducer.finalizeBoth(accum, paccum);
+ }
+};
+
+#if !defined(EIGEN_HIPCC)
+static const int kLeafSize = 1024;
+
+template <typename Self, typename Op>
+struct InnerMostDimReducer<Self, Op, false, true> {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType
+ reduce(const Self& self, typename Self::Index firstIndex,
+ typename Self::Index numValuesToReduce, Op& reducer) {
+ typename Self::CoeffReturnType accum = reducer.initialize();
+ if (numValuesToReduce > kLeafSize) {
+ const typename Self::Index half = numValuesToReduce / 2;
+ reducer.reduce(reduce(self, firstIndex, half, reducer), &accum);
+ reducer.reduce(
+ reduce(self, firstIndex + half, numValuesToReduce - half, reducer),
+ &accum);
+ } else {
+ for (typename Self::Index j = 0; j < numValuesToReduce; ++j) {
+ reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
+ }
+ }
+ return reducer.finalize(accum);
+ }
+};
+
+template <typename Self, typename Op>
+struct InnerMostDimReducer<Self, Op, true, true> {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType
+ reduce(const Self& self, typename Self::Index firstIndex,
+ typename Self::Index numValuesToReduce, Op& reducer) {
+ const typename Self::Index packetSize =
+ internal::unpacket_traits<typename Self::PacketReturnType>::size;
+ typename Self::CoeffReturnType accum = reducer.initialize();
+ if (numValuesToReduce > packetSize * kLeafSize) {
+ // Make sure the split point is aligned on a packet boundary.
+ const typename Self::Index split =
+ packetSize *
+ divup(firstIndex + divup(numValuesToReduce, typename Self::Index(2)),
+ packetSize);
+ const typename Self::Index num_left =
+ numext::mini(split - firstIndex, numValuesToReduce);
+ reducer.reduce(reduce(self, firstIndex, num_left, reducer), &accum);
+ if (num_left < numValuesToReduce) {
+ reducer.reduce(
+ reduce(self, split, numValuesToReduce - num_left, reducer), &accum);
+ }
+ return reducer.finalize(accum);
+ } else {
+ const typename Self::Index UnrollSize =
+ (numValuesToReduce / (2*packetSize)) * 2*packetSize;
+ const typename Self::Index VectorizedSize =
+ (numValuesToReduce / packetSize) * packetSize;
+ typename Self::PacketReturnType paccum =
+ reducer.template initializePacket<typename Self::PacketReturnType>();
+ typename Self::PacketReturnType paccum2 =
+ reducer.template initializePacket<typename Self::PacketReturnType>();
+ for (typename Self::Index j = 0; j < UnrollSize; j += packetSize * 2) {
+ reducer.reducePacket(
+ self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum);
+ reducer.reducePacket(
+ self.m_impl.template packet<Unaligned>(firstIndex + j + packetSize),
+ &paccum2);
+ }
+ for (typename Self::Index j = UnrollSize; j < VectorizedSize; j+= packetSize) {
+ reducer.reducePacket(self.m_impl.template packet<Unaligned>(
+ firstIndex + j), &paccum);
+ }
+ reducer.reducePacket(paccum2, &paccum);
+ for (typename Self::Index j = VectorizedSize; j < numValuesToReduce;
+ ++j) {
+ reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
+ }
+ return reducer.finalizeBoth(accum, paccum);
+ }
+ }
+};
+#endif
+
+template <int DimIndex, typename Self, typename Op, bool vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
+struct InnerMostDimPreserver {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
+ eigen_assert(false && "should never be called");
+ }
+};
+
+template <int DimIndex, typename Self, typename Op>
+struct InnerMostDimPreserver<DimIndex, Self, Op, true> {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
+ EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
+ const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
+ InnerMostDimPreserver<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
+ }
+ }
+};
+
+template <typename Self, typename Op>
+struct InnerMostDimPreserver<0, Self, Op, true> {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
+ for (typename Self::Index j = 0; j < self.m_reducedDims[0]; ++j) {
+ const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
+ reducer.reducePacket(self.m_impl.template packet<Unaligned>(input), accum);
+ }
+ }
+};
+template <typename Self, typename Op>
+struct InnerMostDimPreserver<-1, Self, Op, true> {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
+ eigen_assert(false && "should never be called");
+ }
+};
+
+// Default full reducer
+template <typename Self, typename Op, typename Device, bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
+struct FullReducer {
+ static const bool HasOptimizedImplementation = false;
+
+ static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::EvaluatorPointerType output) {
+ const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions());
+ *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
+ }
+};
+
+
+#ifdef EIGEN_USE_THREADS
+// Multithreaded full reducers
+template <typename Self, typename Op,
+ bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
+struct FullReducerShard {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex,
+ typename Self::Index numValuesToReduce, Op& reducer,
+ typename Self::CoeffReturnType* output) {
+ *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(
+ self, firstIndex, numValuesToReduce, reducer);
+ }
+};
+
+// Multithreaded full reducer
+template <typename Self, typename Op, bool Vectorizable>
+struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
+ static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful;
+ static const Index PacketSize =
+ unpacket_traits<typename Self::PacketReturnType>::size;
+
+ // launch one reducer per thread and accumulate the result.
+ static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device,
+ typename Self::CoeffReturnType* output) {
+ typedef typename Self::Index Index;
+ const Index num_coeffs = array_prod(self.m_impl.dimensions());
+ if (num_coeffs == 0) {
+ *output = reducer.finalize(reducer.initialize());
+ return;
+ }
+ const TensorOpCost cost =
+ self.m_impl.costPerCoeff(Vectorizable) +
+ TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable,
+ PacketSize);
+ const int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
+ num_coeffs, cost, device.numThreads());
+ if (num_threads == 1) {
+ *output =
+ InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
+ return;
+ }
+ const Index blocksize =
+ std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
+ const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
+ eigen_assert(num_coeffs >= numblocks * blocksize);
+
+ Barrier barrier(internal::convert_index<unsigned int>(numblocks));
+ MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
+ for (Index i = 0; i < numblocks; ++i) {
+ device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, Vectorizable>::run,
+ self, i * blocksize, blocksize, reducer,
+ &shards[i]);
+ }
+ typename Self::CoeffReturnType finalShard;
+ if (numblocks * blocksize < num_coeffs) {
+ finalShard = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(
+ self, numblocks * blocksize, num_coeffs - numblocks * blocksize,
+ reducer);
+ } else {
+ finalShard = reducer.initialize();
+ }
+ barrier.Wait();
+
+ for (Index i = 0; i < numblocks; ++i) {
+ reducer.reduce(shards[i], &finalShard);
+ }
+ *output = reducer.finalize(finalShard);
+ }
+};
+
+#endif
+
+
+// Default inner reducer
+template <typename Self, typename Op, typename Device>
+struct InnerReducer {
+ static const bool HasOptimizedImplementation = false;
+
+ EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
+ eigen_assert(false && "Not implemented");
+ return true;
+ }
+};
+
+// Default outer reducer
+template <typename Self, typename Op, typename Device>
+struct OuterReducer {
+ static const bool HasOptimizedImplementation = false;
+
+ EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
+ eigen_assert(false && "Not implemented");
+ return true;
+ }
+};
+
+#ifdef EIGEN_USE_SYCL
+// Default Generic reducer
+template <typename Self, typename Op, typename Device>
+struct GenericReducer {
+ static const bool HasOptimizedImplementation = false;
+
+ EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
+ eigen_assert(false && "Not implemented");
+ return true;
+ }
+};
+#endif
+
+#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
+template <int B, int N, typename S, typename R, typename I_>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
+
+
+#if defined(EIGEN_HAS_GPU_FP16)
+template <typename S, typename R, typename I_>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits<half>::type*);
+template <int B, int N, typename S, typename R, typename I_>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits<half>::type*);
+template <int NPT, typename S, typename R, typename I_>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
+
+#endif
+
+template <int NPT, typename S, typename R, typename I_>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
+
+template <int NPT, typename S, typename R, typename I_>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
+#endif
+
+/**
+ * For SYCL, the return type of the reduction is deduced from the initialize method of the given Op.
+ * This allows the reduction to have a different type for the accumulator than the input data type.
+ * If this is the case, the functor needs to have two reduce method: one for reducing an element of the input
+ * with the accumulator and the other for reducing two accumulators.
+ * Such a reducer can be useful for instance when the accumulator is a boolean or a bitset that checks for
+ * some properties of the input.
+ */
+template <typename Op, typename CoeffReturnType>
+struct ReductionReturnType {
+#if defined(EIGEN_USE_SYCL)
+ typedef typename remove_const<decltype(std::declval<Op>().initialize())>::type type;
+#else
+ typedef typename remove_const<CoeffReturnType>::type type;
+#endif
+};
+
+} // end namespace internal
+
+
+template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
+class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType, MakePointer_>, ReadOnlyAccessors> {
+ public:
+ typedef typename Eigen::internal::traits<TensorReductionOp>::Scalar Scalar;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
+ typedef typename Eigen::internal::nested<TensorReductionOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorReductionOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorReductionOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ TensorReductionOp(const XprType& expr, const Dims& dims) : m_expr(expr), m_dims(dims)
+ { }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer) : m_expr(expr), m_dims(dims), m_reducer(reducer)
+ { }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const XprType& expression() const { return m_expr; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const Dims& dims() const { return m_dims; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const Op& reducer() const { return m_reducer; }
+
+ protected:
+ typename XprType::Nested m_expr;
+ const Dims m_dims;
+ const Op m_reducer;
+};
+
+template<typename ArgType, typename Device>
+struct TensorReductionEvaluatorBase;
+
+// Eval as rvalue
+template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
+struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
+{
+ typedef internal::reducer_traits<Op, Device> ReducerTraits;
+ typedef Dims ReducedDims;
+ typedef TensorReductionOp<Op, Dims, ArgType, MakePointer_> XprType;
+ typedef typename XprType::Index Index;
+ typedef ArgType ChildType;
+ typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
+ static const int NumInputDims = internal::array_size<InputDimensions>::value;
+ static const int NumReducedDims = internal::array_size<Dims>::value;
+ static const int NumOutputDims = NumInputDims - NumReducedDims;
+ typedef typename internal::conditional<NumOutputDims==0, Sizes<>, DSizes<Index, NumOutputDims> >::type Dimensions;
+ typedef typename XprType::Scalar Scalar;
+ typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Self;
+ static const bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess;
+ typedef typename internal::ReductionReturnType<Op, typename XprType::CoeffReturnType>::type CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const Index PacketSize = PacketType<CoeffReturnType, Device>::size;
+
+ typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+
+ // Subset of strides of the input tensor for the non-reduced dimensions.
+ // Indexed by output dimensions.
+ static const int NumPreservedStrides = max_n_1<NumOutputDims>::size;
+
+ enum {
+ IsAligned = false,
+ PacketAccess = Self::InputPacketAccess && ReducerTraits::PacketAccess,
+ BlockAccess = false,
+ PreferBlockAccess = true,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
+ };
+
+ typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ static const bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value;
+ static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value;
+ static const bool RunningFullReduction = (NumOutputDims==0);
+
+ EIGEN_STRONG_INLINE TensorReductionEvaluatorBase(const XprType& op, const Device& device)
+ : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device)
+ {
+ EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
+ YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+ // Build the bitmap indicating if an input dimension is reduced or not.
+ for (int i = 0; i < NumInputDims; ++i) {
+ m_reduced[i] = false;
+ }
+ for (int i = 0; i < NumReducedDims; ++i) {
+ eigen_assert(op.dims()[i] >= 0);
+ eigen_assert(op.dims()[i] < NumInputDims);
+ m_reduced[op.dims()[i]] = true;
+ }
+
+ const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+ internal::DimInitializer<Dimensions>::run(input_dims, m_reduced, &m_dimensions, &m_reducedDims);
+
+ // Precompute output strides.
+ if (NumOutputDims > 0) {
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ m_outputStrides[0] = 1;
+ for (int i = 1; i < NumOutputDims; ++i) {
+ m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
+ m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
+ }
+ } else {
+ m_outputStrides[NumOutputDims - 1] = 1;
+ for (int i = NumOutputDims - 2; i >= 0; --i) {
+ m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
+ m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
+ }
+ }
+ }
+
+ // Precompute input strides.
+ if (NumInputDims > 0) {
+ array<Index, NumInputDims> input_strides;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ input_strides[0] = 1;
+ for (int i = 1; i < NumInputDims; ++i) {
+ input_strides[i] = input_strides[i-1] * input_dims[i-1];
+ }
+ } else {
+ input_strides.back() = 1;
+ for (int i = NumInputDims - 2; i >= 0; --i) {
+ input_strides[i] = input_strides[i + 1] * input_dims[i + 1];
+ }
+ }
+
+ int outputIndex = 0;
+ int reduceIndex = 0;
+ for (int i = 0; i < NumInputDims; ++i) {
+ if (m_reduced[i]) {
+ m_reducedStrides[reduceIndex] = input_strides[i];
+ ++reduceIndex;
+ } else {
+ m_preservedStrides[outputIndex] = input_strides[i];
+ m_output_to_input_dim_map[outputIndex] = i;
+ ++outputIndex;
+ }
+ }
+ }
+
+ // Special case for full reductions
+ if (NumOutputDims == 0) {
+ m_preservedStrides[0] = internal::array_prod(input_dims);
+ }
+
+ m_numValuesToReduce =
+ NumOutputDims == 0
+ ? internal::array_prod(input_dims)
+ : (static_cast<int>(Layout) == static_cast<int>(ColMajor))
+ ? m_preservedStrides[0]
+ : m_preservedStrides[NumOutputDims - 1];
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+ EIGEN_STRONG_INLINE
+ bool evalSubExprsIfNeededCommon(EvaluatorPointerType data) {
+ // Use the FullReducer if possible.
+ if ((RunningFullReduction && RunningOnSycl) ||(RunningFullReduction &&
+ internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
+ ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) ||
+ !RunningOnGPU))) {
+ bool need_assign = false;
+ if (!data) {
+ m_result = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType))));
+ data = m_result;
+ need_assign = true;
+ }
+ Op reducer(m_reducer);
+ internal::FullReducer<Self, Op, Device>::run(*this, reducer, m_device, data);
+ return need_assign;
+ }
+
+ // Attempt to use an optimized reduction.
+ else if ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || (RunningOnSycl)) {
+ bool reducing_inner_dims = true;
+ for (int i = 0; i < NumReducedDims; ++i) {
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ reducing_inner_dims &= m_reduced[i];
+ } else {
+ reducing_inner_dims &= m_reduced[NumInputDims - 1 - i];
+ }
+ }
+ if (internal::InnerReducer<Self, Op, Device>::HasOptimizedImplementation &&
+ (reducing_inner_dims || ReducingInnerMostDims)) {
+ const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
+ const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
+ if (!data) {
+ if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) || (RunningOnSycl)) {
+ data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
+ m_result = data;
+ }
+ else {
+ return true;
+ }
+ }
+ Op reducer(m_reducer);
+ // For SYCL this if always return false
+ if (internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
+ if (m_result) {
+ m_device.deallocate_temp(m_result);
+ m_result = NULL;
+ }
+ return true;
+ } else {
+ return (m_result != NULL);
+ }
+ }
+
+ bool preserving_inner_dims = true;
+ for (int i = 0; i < NumReducedDims; ++i) {
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ preserving_inner_dims &= m_reduced[NumInputDims - 1 - i];
+ } else {
+ preserving_inner_dims &= m_reduced[i];
+ }
+ }
+ if (internal::OuterReducer<Self, Op, Device>::HasOptimizedImplementation &&
+ preserving_inner_dims) {
+ const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
+ const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
+ if (!data) {
+ if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) || (RunningOnSycl)) {
+ data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
+ m_result = data;
+ }
+ else {
+ return true;
+ }
+ }
+ Op reducer(m_reducer);
+ // For SYCL this if always return false
+ if (internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
+ if (m_result) {
+ m_device.deallocate_temp(m_result);
+ m_result = NULL;
+ }
+ return true;
+ } else {
+ return (m_result != NULL);
+ }
+ }
+ #if defined(EIGEN_USE_SYCL)
+ // If there is no Optimised version for SYCL, the reduction expression
+ // must break into two subexpression and use the SYCL generic Reducer on the device.
+ if(RunningOnSycl) {
+ const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
+ const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
+ if (!data) {
+ data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
+ m_result = data;
+ }
+ Op reducer(m_reducer);
+ internal::GenericReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
+ return (m_result != NULL);
+ }
+ #endif
+ }
+ return true;
+ }
+
+#ifdef EIGEN_USE_THREADS
+ template <typename EvalSubExprsCallback>
+ EIGEN_STRONG_INLINE
+ void
+ evalSubExprsIfNeededAsync(EvaluatorPointerType data,
+ EvalSubExprsCallback done) {
+ m_impl.evalSubExprsIfNeededAsync(NULL, [this, data, done](bool) {
+ done(evalSubExprsIfNeededCommon(data));
+ });
+ }
+#endif
+
+ EIGEN_STRONG_INLINE
+ bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+ m_impl.evalSubExprsIfNeeded(NULL);
+ return evalSubExprsIfNeededCommon(data);
+ }
+
+ EIGEN_STRONG_INLINE void cleanup() {
+ m_impl.cleanup();
+ if (m_result) {
+ m_device.deallocate_temp(m_result);
+ m_result = NULL;
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ if (( RunningFullReduction || RunningOnGPU) && m_result ) {
+ return *(m_result + index);
+ }
+ Op reducer(m_reducer);
+ if (ReducingInnerMostDims || RunningFullReduction) {
+ const Index num_values_to_reduce =
+ (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1];
+ return internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstInput(index),
+ num_values_to_reduce, reducer);
+ } else {
+ typename Self::CoeffReturnType accum = reducer.initialize();
+ internal::GenericDimReducer<NumReducedDims-1, Self, Op>::reduce(*this, firstInput(index), reducer, &accum);
+ return reducer.finalize(accum);
+ }
+ }
+
+ // TODO(bsteiner): provide a more efficient implementation.
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ {
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index + PacketSize - 1 < Index(internal::array_prod(dimensions())));
+
+ if (RunningOnGPU && m_result) {
+ return internal::pload<PacketReturnType>(m_result + index);
+ }
+
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ if (ReducingInnerMostDims) {
+ const Index num_values_to_reduce =
+ (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1];
+ const Index firstIndex = firstInput(index);
+ for (Index i = 0; i < PacketSize; ++i) {
+ Op reducer(m_reducer);
+ values[i] = internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstIndex + i * num_values_to_reduce,
+ num_values_to_reduce, reducer);
+ }
+ } else if (PreservingInnerMostDims) {
+ const Index firstIndex = firstInput(index);
+ const int innermost_dim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : NumOutputDims - 1;
+ // TBD: extend this the the n innermost dimensions that we preserve.
+ if (((firstIndex % m_dimensions[innermost_dim]) + PacketSize - 1) < m_dimensions[innermost_dim]) {
+ Op reducer(m_reducer);
+ typename Self::PacketReturnType accum = reducer.template initializePacket<typename Self::PacketReturnType>();
+ internal::InnerMostDimPreserver<NumReducedDims-1, Self, Op>::reduce(*this, firstIndex, reducer, &accum);
+ return reducer.finalizePacket(accum);
+ } else {
+ for (int i = 0; i < PacketSize; ++i) {
+ values[i] = coeff(index + i);
+ }
+ }
+ } else {
+ for (int i = 0; i < PacketSize; ++i) {
+ values[i] = coeff(index + i);
+ }
+ }
+ PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+ return rslt;
+ }
+
+ // Must be called after evalSubExprsIfNeeded().
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ if (RunningFullReduction && m_result) {
+ return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
+ } else {
+ const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
+ const double compute_cost = num_values_to_reduce * internal::functor_traits<Op>::Cost;
+ return m_impl.costPerCoeff(vectorized) * num_values_to_reduce +
+ TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
+ EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+ EIGEN_DEVICE_FUNC const Device& device() const { return m_device; }
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_impl.bind(cgh);
+ m_result.bind(cgh);
+ }
+#endif
+
+ private:
+ template <int, typename, typename> friend struct internal::GenericDimReducer;
+ template <typename, typename, bool, bool> friend struct internal::InnerMostDimReducer;
+ template <int, typename, typename, bool> friend struct internal::InnerMostDimPreserver;
+ template <typename S, typename O, typename D, bool V> friend struct internal::FullReducer;
+#ifdef EIGEN_USE_THREADS
+ template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
+#endif
+#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
+ template <int B, int N, typename S, typename R, typename I_> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
+#if defined(EIGEN_HAS_GPU_FP16)
+ template <typename S, typename R, typename I_> KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits<Eigen::half>::type*);
+ template <int B, int N, typename S, typename R, typename I_> KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits<Eigen::half>::type*);
+ template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
+#endif
+ template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
+
+ template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
+#endif
+
+#if defined(EIGEN_USE_SYCL)
+ template < typename Evaluator_, typename Op__> friend class TensorSycl::internal::GenericNondeterministicReducer;
+ // SYCL need the Generic reducer for the case the recution algorithm is neither inner, outer, and full reducer
+ template <typename, typename, typename> friend struct internal::GenericReducer;
+#endif
+
+
+ template <typename S, typename O, typename D> friend struct internal::InnerReducer;
+
+ struct BlockIteratorState {
+ Index input_dim;
+ Index output_size;
+ Index output_count;
+ };
+
+ // Returns the Index in the input tensor of the first value that needs to be
+ // used to compute the reduction at output index "index".
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
+ if (ReducingInnerMostDims) {
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ return index * m_preservedStrides[0];
+ } else {
+ return index * m_preservedStrides[NumPreservedStrides - 1];
+ }
+ }
+ // TBD: optimize the case where we preserve the innermost dimensions.
+ Index startInput = 0;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = NumOutputDims - 1; i > 0; --i) {
+ // This is index_i in the output tensor.
+ const Index idx = index / m_outputStrides[i];
+ startInput += idx * m_preservedStrides[i];
+ index -= idx * m_outputStrides[i];
+ }
+ if (PreservingInnerMostDims) {
+ eigen_assert(m_preservedStrides[0] == 1);
+ startInput += index;
+ } else {
+ startInput += index * m_preservedStrides[0];
+ }
+ } else {
+ for (int i = 0; i < NumOutputDims - 1; ++i) {
+ // This is index_i in the output tensor.
+ const Index idx = index / m_outputStrides[i];
+ startInput += idx * m_preservedStrides[i];
+ index -= idx * m_outputStrides[i];
+ }
+ if (PreservingInnerMostDims) {
+ eigen_assert(m_preservedStrides[NumPreservedStrides - 1] == 1);
+ startInput += index;
+ } else {
+ startInput += index * m_preservedStrides[NumPreservedStrides - 1];
+ }
+ }
+ return startInput;
+ }
+
+ // Bitmap indicating if an input dimension is reduced or not.
+ array<bool, NumInputDims> m_reduced;
+ // Dimensions of the output of the operation.
+ Dimensions m_dimensions;
+ // Precomputed strides for the output tensor.
+ array<Index, NumOutputDims> m_outputStrides;
+ array<internal::TensorIntDivisor<Index>, NumOutputDims> m_fastOutputStrides;
+ array<Index, NumPreservedStrides> m_preservedStrides;
+ // Map from output to input dimension index.
+ array<Index, NumOutputDims> m_output_to_input_dim_map;
+ // How many values go into each reduction
+ Index m_numValuesToReduce;
+
+ // Subset of strides of the input tensor for the reduced dimensions.
+ // Indexed by reduced dimensions.
+ array<Index, NumReducedDims> m_reducedStrides;
+ // Size of the input dimensions that are reduced.
+ // Indexed by reduced dimensions.
+ array<Index, NumReducedDims> m_reducedDims;
+
+ // Evaluator for the input expression.
+ TensorEvaluator<ArgType, Device> m_impl;
+
+ // Operation to apply for computing the reduction.
+ Op m_reducer;
+
+ // For full reductions
+#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
+ static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
+ static const bool RunningOnSycl = false;
+#elif defined(EIGEN_USE_SYCL)
+static const bool RunningOnSycl = internal::is_same<typename internal::remove_all<Device>::type, Eigen::SyclDevice>::value;
+static const bool RunningOnGPU = false;
+#else
+ static const bool RunningOnGPU = false;
+ static const bool RunningOnSycl = false;
+#endif
+ EvaluatorPointerType m_result;
+
+ const Device EIGEN_DEVICE_REF m_device;
+};
+
+template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
+struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
+: public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> {
+ typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Base;
+ EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Device& device) : Base(op, device){}
+};
+
+
+template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_>
+struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice>
+: public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice> {
+
+ typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice> Base;
+ EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Eigen::SyclDevice& device) : Base(op, device){}
+ // The coeff function in the base the recursive method which is not an standard layout and cannot be used in the SYCL kernel
+ //Therefore the coeff function should be overridden by for SYCL kernel
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::CoeffReturnType coeff(typename Base::Index index) const {
+ return *(this->data() + index);
+ }
+ // The packet function in the base the recursive method which is not an standard layout and cannot be used in the SYCL kernel
+ //Therefore the packet function should be overridden by for SYCL kernel
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::PacketReturnType packet(typename Base::Index index) const {
+ return internal::pload<typename Base::PacketReturnType>(this->data() + index);
+ }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorReductionCuda.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorReductionCuda.h
new file mode 100644
index 0000000..68780cd
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorReductionCuda.h
@@ -0,0 +1,6 @@
+
+#if defined(__clang__) || defined(__GNUC__)
+#warning "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorReductionGpu.h file"
+#endif
+
+#include "TensorReductionGpu.h"
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorReductionGpu.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorReductionGpu.h
new file mode 100644
index 0000000..db4e8d8
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorReductionGpu.h
@@ -0,0 +1,966 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
+#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
+
+namespace Eigen {
+namespace internal {
+
+
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
+// Full reducers for GPU, don't vectorize for now
+
+// Reducer function that enables multiple gpu thread to safely accumulate at the same
+// output address. It basically reads the current value of the output variable, and
+// attempts to update it with the new value. If in the meantime another gpu thread
+// updated the content of the output address it will try again.
+template <typename T, typename R>
+__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
+#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
+ if (sizeof(T) == 4)
+ {
+ unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
+ unsigned int newval = oldval;
+ reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+ if (newval == oldval) {
+ return;
+ }
+ unsigned int readback;
+ while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
+ oldval = readback;
+ newval = oldval;
+ reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+ if (newval == oldval) {
+ return;
+ }
+ }
+ }
+ else if (sizeof(T) == 8) {
+ unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
+ unsigned long long newval = oldval;
+ reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+ if (newval == oldval) {
+ return;
+ }
+ unsigned long long readback;
+ while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
+ oldval = readback;
+ newval = oldval;
+ reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+ if (newval == oldval) {
+ return;
+ }
+ }
+ }
+ else {
+ gpu_assert(0 && "Wordsize not supported");
+ }
+#else // EIGEN_CUDA_ARCH >= 300
+ gpu_assert(0 && "Shouldn't be called on unsupported device");
+#endif // EIGEN_CUDA_ARCH >= 300
+}
+
+// We extend atomicExch to support extra data types
+template <typename Type>
+__device__ inline Type atomicExchCustom(Type* address, Type val) {
+ return atomicExch(address, val);
+}
+
+template <>
+__device__ inline double atomicExchCustom(double* address, double val) {
+ unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
+ return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
+}
+
+#ifdef EIGEN_HAS_GPU_FP16
+template <typename R>
+__device__ inline void atomicReduce(half2* output, half2 accum, R& reducer) {
+ unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
+ unsigned int newval = oldval;
+ reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
+ if (newval == oldval) {
+ return;
+ }
+ unsigned int readback;
+ while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
+ oldval = readback;
+ newval = oldval;
+ reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
+ if (newval == oldval) {
+ return;
+ }
+ }
+}
+// reduction should be associative since reduction is not atomic in wide vector but atomic in half2 operations
+template <typename R>
+__device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reducer) {
+ half2* houtput=reinterpret_cast<half2*>(output);
+ half2* haccum=reinterpret_cast<half2*>(&accum);
+ for(int i=0;i<4;++i){
+ atomicReduce(houtput+i,*(haccum+i),reducer);
+ }
+}
+#endif // EIGEN_HAS_GPU_FP16
+
+template <>
+__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
+#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
+ atomicAdd(output, accum);
+#else // EIGEN_CUDA_ARCH >= 300
+ gpu_assert(0 && "Shouldn't be called on unsupported device");
+#endif // EIGEN_CUDA_ARCH >= 300
+}
+
+
+template <typename CoeffType, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
+ const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+ const Index num_threads = blockDim.x * gridDim.x;
+ for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
+ output[i] = val;
+ }
+}
+
+
+template <int BlockSize, int NumPerThread, typename Self,
+ typename Reducer, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
+ typename Self::CoeffReturnType* output, unsigned int* semaphore) {
+#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
+ // Initialize the output value
+ const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
+ if (gridDim.x == 1) {
+ if (first_index == 0) {
+ *output = reducer.initialize();
+ }
+ }
+ else {
+ if (threadIdx.x == 0) {
+ unsigned int block = atomicCAS(semaphore, 0u, 1u);
+ if (block == 0) {
+ // We're the first block to run, initialize the output value
+ atomicExchCustom(output, reducer.initialize());
+ __threadfence();
+ atomicExch(semaphore, 2u);
+ }
+ else {
+ // Wait for the first block to initialize the output value.
+ // Use atomicCAS here to ensure that the reads aren't cached
+ unsigned int val;
+ do {
+ val = atomicCAS(semaphore, 2u, 2u);
+ }
+ while (val < 2u);
+ }
+ }
+ }
+
+ __syncthreads();
+
+ eigen_assert(gridDim.x == 1 || *semaphore >= 2u);
+
+ typename Self::CoeffReturnType accum = reducer.initialize();
+ Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
+ for (Index i = 0; i < max_iter; i+=BlockSize) {
+ const Index index = first_index + i;
+ eigen_assert(index < num_coeffs);
+ typename Self::CoeffReturnType val = input.m_impl.coeff(index);
+ reducer.reduce(val, &accum);
+ }
+
+#pragma unroll
+ for (int offset = warpSize/2; offset > 0; offset /= 2) {
+ #if defined(EIGEN_HIPCC)
+ // use std::is_floating_point to determine the type of reduced_val
+ // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambguous" error
+ // and list the float and int versions of __shfl_down as the candidate functions.
+ if (std::is_floating_point<typename Self::CoeffReturnType>::value) {
+ reducer.reduce(__shfl_down(static_cast<float>(accum), offset, warpSize), &accum);
+ } else {
+ reducer.reduce(__shfl_down(static_cast<int>(accum), offset, warpSize), &accum);
+ }
+ #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
+ reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
+ #else
+ reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
+ #endif
+ }
+
+ if ((threadIdx.x & (warpSize - 1)) == 0) {
+ atomicReduce(output, accum, reducer);
+ }
+
+ if (gridDim.x > 1 && threadIdx.x == 0) {
+ // Let the last block reset the semaphore
+ atomicInc(semaphore, gridDim.x + 1);
+#if defined(EIGEN_HIPCC)
+ __threadfence_system();
+#endif
+ }
+#else // EIGEN_CUDA_ARCH >= 300
+ gpu_assert(0 && "Shouldn't be called on unsupported device");
+#endif // EIGEN_CUDA_ARCH >= 300
+}
+
+
+#ifdef EIGEN_HAS_GPU_FP16
+template <typename Self,
+ typename Reducer, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
+ packet_traits<Eigen::half>::type* scratch) {
+ eigen_assert(blockDim.x == 1);
+ eigen_assert(gridDim.x == 1);
+ typedef packet_traits<Eigen::half>::type packet_type;
+ Index packet_remainder =
+ num_coeffs % Index(unpacket_traits<packet_type>::size);
+ if (packet_remainder != 0) {
+ half2* h2scratch = reinterpret_cast<half2*>(scratch);
+ for (Index i = num_coeffs - packet_remainder; i + 2 <= num_coeffs; i += 2) {
+ *h2scratch =
+ __halves2half2(input.m_impl.coeff(i), input.m_impl.coeff(i + 1));
+ h2scratch++;
+ }
+ if ((num_coeffs & 1) != 0) {
+ half lastCoeff = input.m_impl.coeff(num_coeffs - 1);
+ *h2scratch = __halves2half2(lastCoeff, reducer.initialize());
+ }
+ } else {
+ *scratch = reducer.template initializePacket<packet_type>();
+ }
+}
+
+template <typename Self,
+ typename Reducer, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
+ const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+ const Index num_threads = blockDim.x * gridDim.x;
+ typedef typename packet_traits<Eigen::half>::type PacketType;
+
+ const Index num_packets =
+ num_coeffs / Index(unpacket_traits<PacketType>::size);
+ PacketType* p_output = reinterpret_cast<PacketType*>(output);
+ for (Index i = thread_id; i < num_packets; i += num_threads) {
+ p_output[i] = reducer.template initializePacket<PacketType>();
+ }
+ Index packet_remainder =
+ num_coeffs % Index(unpacket_traits<PacketType>::size);
+ if (thread_id < packet_remainder) {
+ output[num_coeffs - packet_remainder + thread_id] = reducer.initialize();
+ }
+}
+
+template <int BlockSize, int NumPerThread, typename Self,
+ typename Reducer, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
+ half* output, packet_traits<Eigen::half>::type* scratch) {
+ typedef typename packet_traits<Eigen::half>::type PacketType;
+ const int packet_width = unpacket_traits<PacketType>::size;
+ eigen_assert(NumPerThread % packet_width == 0);
+ const Index first_index =
+ blockIdx.x * BlockSize * NumPerThread + packet_width * threadIdx.x;
+
+ // Initialize the output value if it wasn't initialized by the ReductionInitKernel
+
+ if (gridDim.x == 1) {
+ if (first_index == 0) {
+ int rem = num_coeffs % packet_width;
+ if (rem != 0) {
+ half2* p_scratch = reinterpret_cast<half2*>(scratch);
+ *scratch = reducer.template initializePacket<PacketType>();
+ for (int i = 0; i < rem / 2; i++) {
+ *p_scratch = __halves2half2(
+ input.m_impl.coeff(num_coeffs - packet_width + 2 * i),
+ input.m_impl.coeff(num_coeffs - packet_width + 2 * i + 1));
+ p_scratch++;
+ }
+ if ((num_coeffs & 1) != 0) {
+ half last = input.m_impl.coeff(num_coeffs - 1);
+ *p_scratch = __halves2half2(last, reducer.initialize());
+ }
+ } else {
+ *scratch = reducer.template initializePacket<PacketType>();
+ }
+ }
+ __syncthreads();
+ }
+
+ PacketType accum = reducer.template initializePacket<PacketType>();
+ const Index max_iter =
+ numext::mini<Index>((num_coeffs - first_index) / packet_width,
+ NumPerThread * BlockSize / packet_width);
+ for (Index i = 0; i < max_iter; i += BlockSize) {
+ const Index index = first_index + packet_width * i;
+ eigen_assert(index + packet_width < num_coeffs);
+ PacketType val = input.m_impl.template packet<Unaligned>(index);
+ reducer.reducePacket(val, &accum);
+ }
+
+#pragma unroll
+ for (int offset = warpSize/2; offset > 0; offset /= 2) {
+ #if defined(EIGEN_HIPCC)
+ PacketType r1;
+ half2* hr = reinterpret_cast<half2*>(&r1);
+ half2* hacc = reinterpret_cast<half2*>(&accum);
+ for (int i = 0; i < packet_width / 2; i++) {
+ // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
+ union { int i; half2 h; } wka_in, wka_out;
+ wka_in.h = hacc[i];
+ wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
+ hr[i] = wka_out.h;
+ }
+ reducer.reducePacket(r1, &accum);
+ #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
+ PacketType r1;
+ half2* hr = reinterpret_cast<half2*>(&r1);
+ half2* hacc = reinterpret_cast<half2*>(&accum);
+ for (int i = 0; i < packet_width / 2; i++) {
+ hr[i] = __shfl_down(hacc[i], offset, warpSize);
+ }
+ reducer.reducePacket(r1, &accum);
+ #else
+ PacketType r1;
+ half2* hr = reinterpret_cast<half2*>(&r1);
+ half2* hacc = reinterpret_cast<half2*>(&accum);
+ for (int i = 0; i < packet_width / 2; i++) {
+ hr[i] = __shfl_down_sync(0xFFFFFFFF, hacc[i], (unsigned)offset, warpSize);
+ }
+ reducer.reducePacket(r1, &accum);
+
+ #endif
+ }
+
+ if ((threadIdx.x & (warpSize - 1)) == 0) {
+ atomicReduce(scratch, accum, reducer);
+ }
+
+ __syncthreads();
+ half2* rv1 = reinterpret_cast<half2*>(scratch);
+ if (packet_width > 2) {
+ reducer.reducePacket(rv1[2], rv1);
+ reducer.reducePacket(rv1[3], rv1 + 1);
+ reducer.reducePacket(rv1[1], rv1);
+ }
+ if (gridDim.x == 1) {
+ if (first_index == 0) {
+ half tmp = __low2half(*rv1);
+ reducer.reduce(__high2half(*rv1), &tmp);
+ *output = tmp;
+ }
+ }
+}
+
+template <typename Op>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, packet_traits<Eigen::half>::type* scratch) {
+ eigen_assert(threadIdx.x == 1);
+ half2* pscratch = reinterpret_cast<half2*>(scratch);
+ half tmp = __float2half(0.f);
+ typedef packet_traits<Eigen::half>::type packet_type;
+ for (int i = 0; i < unpacket_traits<packet_type>::size; i += 2) {
+ reducer.reduce(__low2half(*pscratch), &tmp);
+ reducer.reduce(__high2half(*pscratch), &tmp);
+ pscratch++;
+ }
+ *output = tmp;
+}
+
+#endif // EIGEN_HAS_GPU_FP16
+
+template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
+struct FullReductionLauncher {
+ static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
+ gpu_assert(false && "Should only be called on doubles, floats and half floats");
+ }
+};
+
+// Specialization for float and double
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+struct FullReductionLauncher<
+ Self, Op, OutputType, PacketAccess,
+ typename internal::enable_if<
+ internal::is_same<float, OutputType>::value ||
+ internal::is_same<double, OutputType>::value,
+ void>::type> {
+ static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
+
+ typedef typename Self::Index Index;
+ const int block_size = 256;
+ const int num_per_thread = 128;
+ const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+
+ unsigned int* semaphore = NULL;
+ if (num_blocks > 1) {
+ semaphore = device.semaphore();
+ }
+
+ LAUNCH_GPU_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
+ num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
+ }
+};
+
+#ifdef EIGEN_HAS_GPU_FP16
+template <typename Self, typename Op>
+struct FullReductionLauncher<Self, Op, Eigen::half, false> {
+ static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
+ gpu_assert(false && "Should not be called since there is no packet accessor");
+ }
+};
+
+template <typename Self, typename Op>
+struct FullReductionLauncher<Self, Op, Eigen::half, true> {
+ static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
+ typedef typename Self::Index Index;
+ typedef typename packet_traits<Eigen::half>::type PacketType;
+
+ const int block_size = 256;
+ const int num_per_thread = 128;
+ const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+ PacketType* scratch = static_cast<PacketType*>(device.scratchpad());
+ // half2* scratch = static_cast<half2*>(device.scratchpad());
+
+ if (num_blocks > 1) {
+ // We initialize the output and the scrathpad outside the reduction kernel when we can't be sure that there
+ // won't be a race conditions between multiple thread blocks.
+ LAUNCH_GPU_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
+ 1, 1, 0, device, reducer, self, num_coeffs, scratch);
+ }
+
+ LAUNCH_GPU_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
+ num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);
+
+ if (num_blocks > 1) {
+ LAUNCH_GPU_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
+ 1, 1, 0, device, reducer, output, scratch);
+ }
+ }
+};
+#endif // EIGEN_HAS_GPU_FP16
+
+
+template <typename Self, typename Op, bool Vectorizable>
+struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
+ // Unfortunately nvidia doesn't support well exotic types such as complex,
+ // so reduce the scope of the optimized version of the code to the simple cases
+ // of doubles, floats and half floats
+#ifdef EIGEN_HAS_GPU_FP16
+ static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value ||
+ (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
+#else // EIGEN_HAS_GPU_FP16
+ static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value);
+#endif // EIGEN_HAS_GPU_FP16
+
+ template <typename OutputType>
+ static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
+ gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
+ const Index num_coeffs = array_prod(self.m_impl.dimensions());
+ // Don't crash when we're called with an input tensor of size 0.
+ if (num_coeffs == 0) {
+ return;
+ }
+
+ FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
+ }
+};
+
+
+template <int NumPerThread, typename Self,
+ typename Reducer, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+ typename Self::CoeffReturnType* output) {
+#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
+ typedef typename Self::CoeffReturnType Type;
+ eigen_assert(blockDim.y == 1);
+ eigen_assert(blockDim.z == 1);
+ eigen_assert(gridDim.y == 1);
+ eigen_assert(gridDim.z == 1);
+
+ const int unroll_times = 16;
+ eigen_assert(NumPerThread % unroll_times == 0);
+
+ const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
+ const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
+
+ const Index num_threads = blockDim.x * gridDim.x;
+ const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+ // Initialize the output values if they weren't initialized by the ReductionInitKernel
+ if (gridDim.x == 1) {
+ for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
+ output[i] = reducer.initialize();
+ }
+ __syncthreads();
+ }
+
+ for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
+ const Index row = i / input_col_blocks;
+
+ if (row < num_preserved_coeffs) {
+ const Index col_block = i % input_col_blocks;
+ const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
+
+ Type reduced_val = reducer.initialize();
+
+ for (Index j = 0; j < NumPerThread; j += unroll_times) {
+ const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
+ if (last_col >= num_coeffs_to_reduce) {
+ for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
+ const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
+ reducer.reduce(val, &reduced_val);
+ }
+ break;
+ } else {
+ // Faster version of the loop with no branches after unrolling.
+#pragma unroll
+ for (int k = 0; k < unroll_times; ++k) {
+ const Index col = col_begin + blockDim.x * (j + k);
+ reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
+ }
+ }
+ }
+
+#pragma unroll
+ for (int offset = warpSize/2; offset > 0; offset /= 2) {
+ #if defined(EIGEN_HIPCC)
+ // use std::is_floating_point to determine the type of reduced_val
+ // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambguous" error
+ // and list the float and int versions of __shfl_down as the candidate functions.
+ if (std::is_floating_point<Type>::value) {
+ reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
+ } else {
+ reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
+ }
+ #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
+ reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
+ #else
+ reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
+ #endif
+ }
+
+ if ((threadIdx.x & (warpSize - 1)) == 0) {
+ atomicReduce(&(output[row]), reduced_val, reducer);
+ }
+ }
+ }
+#else // EIGEN_CUDA_ARCH >= 300
+ gpu_assert(0 && "Shouldn't be called on unsupported device");
+#endif // EIGEN_CUDA_ARCH >= 300
+}
+
+#ifdef EIGEN_HAS_GPU_FP16
+
+template <int NumPerThread, typename Self,
+ typename Reducer, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+ half* output) {
+ eigen_assert(blockDim.y == 1);
+ eigen_assert(blockDim.z == 1);
+ eigen_assert(gridDim.y == 1);
+ eigen_assert(gridDim.z == 1);
+
+ typedef typename packet_traits<Eigen::half>::type PacketType;
+ const int packet_width = unpacket_traits<PacketType>::size;
+ const int unroll_times = 16 / packet_width;
+ eigen_assert(NumPerThread % unroll_times == 0);
+ eigen_assert(unroll_times % 2 == 0);
+
+ const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
+ const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);
+
+ const Index num_threads = blockDim.x * gridDim.x;
+ const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+ // Initialize the output values if they weren't initialized by the ReductionInitKernel
+ if (gridDim.x == 1) {
+ Index i = packet_width * thread_id;
+ for (; i + packet_width <= num_preserved_coeffs;
+ i += packet_width * num_threads) {
+ PacketType* poutput = reinterpret_cast<PacketType*>(output + i);
+ *poutput = reducer.template initializePacket<PacketType>();
+ }
+ if (i < num_preserved_coeffs) {
+ output[i] = reducer.initialize();
+ }
+ __syncthreads();
+ }
+
+ for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
+ const Index row = 2 * (i / input_col_blocks); // everybody takes 2 rows
+
+ if (row + 1 < num_preserved_coeffs) {
+ const Index col_block = i % input_col_blocks;
+ const Index col_begin =
+ packet_width * (col_block * blockDim.x * NumPerThread + threadIdx.x);
+
+ PacketType reduced_val1 = reducer.template initializePacket<PacketType>();
+ PacketType reduced_val2 = reducer.template initializePacket<PacketType>();
+
+ for (Index j = 0; j < NumPerThread; j += unroll_times) {
+ const Index last_col =
+ col_begin + blockDim.x * (j + unroll_times - 1) * packet_width;
+ if (last_col >= num_coeffs_to_reduce) {
+ Index col = col_begin + blockDim.x * j;
+ for (; col + packet_width <= num_coeffs_to_reduce;
+ col += blockDim.x) {
+ const PacketType val1 = input.m_impl.template packet<Unaligned>(
+ row * num_coeffs_to_reduce + col);
+ reducer.reducePacket(val1, &reduced_val1);
+ const PacketType val2 = input.m_impl.template packet<Unaligned>(
+ (row + 1) * num_coeffs_to_reduce + col);
+ reducer.reducePacket(val2, &reduced_val2);
+ }
+ if (col < num_coeffs_to_reduce) {
+ PacketType r1 = reducer.template initializePacket<PacketType>();
+ PacketType r2 = reducer.template initializePacket<PacketType>();
+ half2* hr1 = reinterpret_cast<half2*>(&r1);
+ half2* hr2 = reinterpret_cast<half2*>(&r2);
+ while (col + 1 < num_coeffs_to_reduce) {
+ *hr1 = __halves2half2(
+ input.m_impl.coeff(row * num_coeffs_to_reduce + col),
+ input.m_impl.coeff(row * num_coeffs_to_reduce + col + 1));
+ *hr2 = __halves2half2(
+ input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col),
+ input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col +
+ 1));
+ hr1++;
+ hr2++;
+ col += 2;
+ }
+ if (col < num_coeffs_to_reduce) {
+ // Peel;
+ const half last1 =
+ input.m_impl.coeff(row * num_coeffs_to_reduce + col);
+ *hr1 = __halves2half2(last1, reducer.initialize());
+ const half last2 =
+ input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col);
+ *hr2 = __halves2half2(last2, reducer.initialize());
+ }
+ reducer.reducePacket(r1, &reduced_val1);
+ reducer.reducePacket(r2, &reduced_val2);
+ }
+ break;
+ } else {
+ // Faster version of the loop with no branches after unrolling.
+#pragma unroll
+ for (int k = 0; k < unroll_times; ++k) {
+ const Index col = col_begin + blockDim.x * (j + k) * packet_width;
+ reducer.reducePacket(input.m_impl.template packet<Unaligned>(
+ row * num_coeffs_to_reduce + col),
+ &reduced_val1);
+ reducer.reducePacket(input.m_impl.template packet<Unaligned>(
+ (row + 1) * num_coeffs_to_reduce + col),
+ &reduced_val2);
+ }
+ }
+ }
+
+#pragma unroll
+ for (int offset = warpSize/2; offset > 0; offset /= 2) {
+ #if defined(EIGEN_HIPCC)
+ PacketType r1;
+ PacketType r2;
+ half2* hr1 = reinterpret_cast<half2*>(&r1);
+ half2* hr2 = reinterpret_cast<half2*>(&r2);
+ half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
+ half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
+ for (int i = 0; i < packet_width / 2; i++) {
+ // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
+ union { int i; half2 h; } wka_in1, wka_out1;
+ wka_in1.h = rv1[i];
+ wka_out1.i = __shfl_down(wka_in1.i, offset, warpSize);
+ hr1[i] = wka_out1.h;
+
+ union { int i; half2 h; } wka_in2, wka_out2;
+ wka_in2.h = rv2[i];
+ wka_out2.i = __shfl_down(wka_in2.i, offset, warpSize);
+ hr2[i] = wka_out2.h;
+ }
+ reducer.reducePacket(r1, &reduced_val1);
+ reducer.reducePacket(r2, &reduced_val2);
+ #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
+ PacketType r1;
+ PacketType r2;
+ half2* hr1 = reinterpret_cast<half2*>(&r1);
+ half2* hr2 = reinterpret_cast<half2*>(&r2);
+ half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
+ half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
+ for (int i = 0; i < packet_width / 2; i++) {
+ hr1[i] = __shfl_down(rv1[i], offset, warpSize);
+ hr2[i] = __shfl_down(rv2[i], offset, warpSize);
+ }
+ reducer.reducePacket(r1, &reduced_val1);
+ reducer.reducePacket(r2, &reduced_val2);
+ #else
+ PacketType r1;
+ PacketType r2;
+ half2* hr1 = reinterpret_cast<half2*>(&r1);
+ half2* hr2 = reinterpret_cast<half2*>(&r2);
+ half2* rr1 = reinterpret_cast<half2*>(&reduced_val1);
+ half2* rr2 = reinterpret_cast<half2*>(&reduced_val2);
+ for (int i = 0; i < packet_width / 2; i++) {
+ hr1[i] =
+ __shfl_down_sync(0xFFFFFFFF, rr1[i], (unsigned)offset, warpSize);
+ hr2[i] =
+ __shfl_down_sync(0xFFFFFFFF, rr2[i], (unsigned)offset, warpSize);
+ }
+ reducer.reducePacket(r1, &reduced_val1);
+ reducer.reducePacket(r2, &reduced_val2);
+
+ #endif
+ }
+ half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
+ half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
+ half2 val;
+ if (packet_width > 2) {
+ reducer.reducePacket(rv1[2], rv1);
+ reducer.reducePacket(rv1[3], rv1 + 1);
+ reducer.reducePacket(rv1[1], rv1);
+ reducer.reducePacket(rv2[2], rv2);
+ reducer.reducePacket(rv2[3], rv2 + 1);
+ reducer.reducePacket(rv2[1], rv2);
+ }
+ half val1 = __low2half(*rv1);
+ reducer.reduce(__high2half(*rv1), &val1);
+ half val2 = __low2half(*rv2);
+ reducer.reduce(__high2half(*rv2), &val2);
+ val = __halves2half2(val1, val2);
+ if ((threadIdx.x & (warpSize - 1)) == 0) {
+ half* loc = output + row;
+ atomicReduce((half2*)loc, val, reducer);
+ }
+ }
+ }
+}
+
+#endif // EIGEN_HAS_GPU_FP16
+
+template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
+struct InnerReductionLauncher {
+ static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
+ gpu_assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
+ return true;
+ }
+};
+
+// Specialization for float and double
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+struct InnerReductionLauncher<
+ Self, Op, OutputType, PacketAccess,
+ typename internal::enable_if<
+ internal::is_same<float, OutputType>::value ||
+ internal::is_same<double, OutputType>::value,
+ void>::type> {
+ static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+ typedef typename Self::Index Index;
+
+ const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
+ const int block_size = 256;
+ const int num_per_thread = 128;
+ const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+ const int max_blocks = device.getNumGpuMultiProcessors() *
+ device.maxGpuThreadsPerMultiProcessor() / block_size;
+ const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+
+ if (num_blocks > 1) {
+ // We initialize the outputs outside the reduction kernel when we can't be sure that there
+ // won't be a race conditions between multiple thread blocks.
+ const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
+ const int max_blocks = device.getNumGpuMultiProcessors() *
+ device.maxGpuThreadsPerMultiProcessor() / 1024;
+ const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+ LAUNCH_GPU_KERNEL((ReductionInitKernel<OutputType, Index>),
+ num_blocks, 1024, 0, device, reducer.initialize(),
+ num_preserved_vals, output);
+ }
+
+ LAUNCH_GPU_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
+ num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
+
+ return false;
+ }
+};
+
+#ifdef EIGEN_HAS_GPU_FP16
+template <typename Self, typename Op>
+struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
+ static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
+ gpu_assert(false && "Should not be called since there is no packet accessor");
+ return true;
+ }
+};
+
+template <typename Self, typename Op>
+struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
+ static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+ typedef typename Self::Index Index;
+
+ if (num_preserved_vals % 2 != 0) {
+ // Not supported yet, revert to the slower code path
+ return true;
+ }
+
+ const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
+ const int block_size = /*256*/128;
+ const int num_per_thread = /*128*/64;
+ const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+ const int max_blocks = device.getNumGpuMultiProcessors() *
+ device.maxGpuThreadsPerMultiProcessor() / block_size;
+ const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+
+ if (num_blocks > 1) {
+ // We initialize the outputs outside the reduction kernel when we can't be sure that there
+ // won't be a race conditions between multiple thread blocks.
+ LAUNCH_GPU_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
+ 1, 1, 0, device, reducer, self, num_preserved_vals, output);
+ }
+
+ LAUNCH_GPU_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
+ num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
+
+ return false;
+ }
+};
+#endif // EIGEN_HAS_GPU_FP16
+
+
+template <typename Self, typename Op>
+struct InnerReducer<Self, Op, GpuDevice> {
+ // Unfortunately nvidia doesn't support well exotic types such as complex,
+ // so reduce the scope of the optimized version of the code to the simple case
+ // of floats and half floats.
+#ifdef EIGEN_HAS_GPU_FP16
+ static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value ||
+ (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
+#else // EIGEN_HAS_GPU_FP16
+ static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value);
+#endif // EIGEN_HAS_GPU_FP16
+
+ template <typename OutputType>
+ static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+ gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
+ const Index num_coeffs = array_prod(self.m_impl.dimensions());
+ // Don't crash when we're called with an input tensor of size 0.
+ if (num_coeffs == 0) {
+ return true;
+ }
+ // It's faster to use the usual code.
+ if (num_coeffs_to_reduce <= 128) {
+ return true;
+ }
+
+ return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
+ }
+};
+
+template <int NumPerThread, typename Self,
+ typename Reducer, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+ typename Self::CoeffReturnType* output) {
+ const Index num_threads = blockDim.x * gridDim.x;
+ const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+ // Initialize the output values if they weren't initialized by the ReductionInitKernel
+ if (gridDim.x == 1) {
+ for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
+ output[i] = reducer.initialize();
+ }
+ __syncthreads();
+ }
+
+ // Do the reduction.
+ const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
+ for (Index i = thread_id; i < max_iter; i += num_threads) {
+ const Index input_col = i % num_preserved_coeffs;
+ const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
+ typename Self::CoeffReturnType reduced_val = reducer.initialize();
+ const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
+ for (Index j = input_row; j < max_row; j++) {
+ typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
+ reducer.reduce(val, &reduced_val);
+ }
+ atomicReduce(&(output[input_col]), reduced_val, reducer);
+ }
+}
+
+
+template <typename Self, typename Op>
+struct OuterReducer<Self, Op, GpuDevice> {
+ // Unfortunately nvidia doesn't support well exotic types such as complex,
+ // so reduce the scope of the optimized version of the code to the simple case
+ // of floats.
+ static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
+ (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+ internal::is_same<typename Self::CoeffReturnType, double>::value);
+ template <typename Device, typename OutputType>
+ static
+ #if !defined(EIGEN_HIPCC)
+ // FIXME : leaving this EIGEN_DEVICE_FUNC in, results in the following runtime error
+ // (in the cxx11_tensor_reduction_gpu test)
+ //
+ // terminate called after throwing an instance of 'std::runtime_error'
+ // what(): No device code available for function: _ZN5Eigen8internal20OuterReductionKernelIL...
+ //
+ // don't know why this happens (and why is it a runtime error instead of a compile time error)
+ //
+ // this will be fixed by HIP PR#457
+ EIGEN_DEVICE_FUNC
+ #endif
+ bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
+ gpu_assert(false && "Should only be called to reduce doubles or floats on a gpu device");
+ return true;
+ }
+
+ static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+ typedef typename Self::Index Index;
+
+ // It's faster to use the usual code.
+ if (num_coeffs_to_reduce <= 32) {
+ return true;
+ }
+
+ const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
+ const int block_size = 256;
+ const int num_per_thread = 16;
+ const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+ const int max_blocks = device.getNumGpuMultiProcessors() *
+ device.maxGpuThreadsPerMultiProcessor() / block_size;
+ const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+
+ if (num_blocks > 1) {
+ // We initialize the outputs in the reduction kernel itself when we don't have to worry
+ // about race conditions between multiple thread blocks.
+ const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
+ const int max_blocks = device.getNumGpuMultiProcessors() *
+ device.maxGpuThreadsPerMultiProcessor() / 1024;
+ const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+ LAUNCH_GPU_KERNEL((ReductionInitKernel<float, Index>),
+ num_blocks, 1024, 0, device, reducer.initialize(),
+ num_preserved_vals, output);
+ }
+
+ LAUNCH_GPU_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
+ num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
+
+ return false;
+ }
+};
+
+#endif // defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
+
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorReductionSycl.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorReductionSycl.h
new file mode 100644
index 0000000..474eba0
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorReductionSycl.h
@@ -0,0 +1,582 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TensorReductionSycl.h
+ *
+ * \brief:
+ * This is the specialization of the reduction operation. Two phase reduction approach
+ * is used since the GPU does not have Global Synchronization for global memory among
+ * different work-group/thread block. To solve the problem, we need to create two kernels
+ * to reduce the data, where the first kernel reduce the data locally and each local
+ * workgroup/thread-block save the input data into global memory. In the second phase (global reduction)
+ * one work-group uses one work-group/thread-block to reduces the intermediate data into one single element.
+ * Here is an NVIDIA presentation explaining the optimized two phase reduction algorithm on GPU:
+ * https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf
+ *
+ *****************************************************************/
+
+#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
+#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
+namespace Eigen {
+namespace TensorSycl {
+namespace internal {
+
+template <typename Op, typename CoeffReturnType, typename Index, bool Vectorizable>
+struct OpDefiner {
+ typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, Vectorizable>::PacketReturnType PacketReturnType;
+ typedef Op type;
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Op &op) { return op; }
+
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
+ const Index &) {
+ return accumulator;
+ }
+};
+
+template <typename CoeffReturnType, typename Index>
+struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, false> {
+ typedef Eigen::internal::SumReducer<CoeffReturnType> type;
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
+ return type();
+ }
+
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType finalise_op(const CoeffReturnType &accumulator,
+ const Index &scale) {
+ ::Eigen::internal::scalar_quotient_op<CoeffReturnType> quotient_op;
+ return quotient_op(accumulator, CoeffReturnType(scale));
+ }
+};
+
+template <typename CoeffReturnType, typename Index>
+struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, true> {
+ typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, true>::PacketReturnType PacketReturnType;
+ typedef Eigen::internal::SumReducer<CoeffReturnType> type;
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
+ return type();
+ }
+
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
+ const Index &scale) {
+ return ::Eigen::internal::pdiv(accumulator, ::Eigen::internal::pset1<PacketReturnType>(CoeffReturnType(scale)));
+ }
+};
+
+template <typename CoeffReturnType, typename OpType, typename InputAccessor, typename OutputAccessor, typename Index,
+ Index local_range>
+struct SecondStepFullReducer {
+ typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+ LocalAccessor;
+ typedef OpDefiner<OpType, CoeffReturnType, Index, true> OpDef;
+ typedef typename OpDef::type Op;
+ LocalAccessor scratch;
+ InputAccessor aI;
+ OutputAccessor outAcc;
+ Op op;
+ SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
+ : scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {}
+
+ void operator()(cl::sycl::nd_item<1> itemID) {
+ // Our empirical research shows that the best performance will be achieved
+ // when there is only one element per thread to reduce in the second step.
+ // in this step the second step reduction time is almost negligible.
+ // Hence, in the second step of reduction the input size is fixed to the
+ // local size, thus, there is only one element read per thread. The
+ // algorithm must be changed if the number of reduce per thread in the
+ // second step is greater than 1. Otherwise, the result will be wrong.
+ const Index localid = itemID.get_local_id(0);
+ auto aInPtr = aI.get_pointer() + localid;
+ auto aOutPtr = outAcc.get_pointer();
+ CoeffReturnType *scratchptr = scratch.get_pointer();
+ CoeffReturnType accumulator = *aInPtr;
+
+ scratchptr[localid] = op.finalize(accumulator);
+ for (Index offset = itemID.get_local_range(0) / 2; offset > 0; offset /= 2) {
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+ if (localid < offset) {
+ op.reduce(scratchptr[localid + offset], &accumulator);
+ scratchptr[localid] = op.finalize(accumulator);
+ }
+ }
+ if (localid == 0) *aOutPtr = op.finalize(accumulator);
+ }
+};
+
+// Full reduction first phase. In this version the vectorization is true and the reduction accept
+// any generic reducerOp e.g( max, min, sum, mean, iamax, iamin, etc ).
+template <typename Evaluator, typename OpType, typename Evaluator::Index local_range>
+class FullReductionKernelFunctor {
+ public:
+ typedef typename Evaluator::CoeffReturnType CoeffReturnType;
+ typedef typename Evaluator::Index Index;
+ typedef OpDefiner<OpType, typename Evaluator::CoeffReturnType, Index,
+ (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
+ OpDef;
+
+ typedef typename OpDef::type Op;
+ typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
+ typedef typename Evaluator::PacketReturnType PacketReturnType;
+ typedef
+ typename ::Eigen::internal::conditional<(Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess),
+ PacketReturnType, CoeffReturnType>::type OutType;
+ typedef cl::sycl::accessor<OutType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+ LocalAccessor;
+ LocalAccessor scratch;
+ Evaluator evaluator;
+ EvaluatorPointerType final_output;
+ Index rng;
+ Op op;
+
+ FullReductionKernelFunctor(LocalAccessor scratch_, Evaluator evaluator_, EvaluatorPointerType final_output_,
+ Index rng_, OpType op_)
+ : scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {}
+
+ void operator()(cl::sycl::nd_item<1> itemID) { compute_reduction(itemID); }
+
+ template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<Vect>::type compute_reduction(
+ const cl::sycl::nd_item<1> &itemID) {
+ auto output_ptr = final_output.get_pointer();
+ Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize;
+ Index globalid = itemID.get_global_id(0);
+ Index localid = itemID.get_local_id(0);
+ Index step = Evaluator::PacketSize * itemID.get_global_range(0);
+ Index start = Evaluator::PacketSize * globalid;
+ // vectorizable parts
+ PacketReturnType packetAccumulator = op.template initializePacket<PacketReturnType>();
+ for (Index i = start; i < VectorizedRange; i += step) {
+ op.template reducePacket<PacketReturnType>(evaluator.impl().template packet<Unaligned>(i), &packetAccumulator);
+ }
+ globalid += VectorizedRange;
+ // non vectorizable parts
+ for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
+ op.template reducePacket<PacketReturnType>(
+ ::Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, Evaluator::PacketSize>::convert_to_packet_type(
+ evaluator.impl().coeff(i), op.initialize()),
+ &packetAccumulator);
+ }
+ scratch[localid] = packetAccumulator =
+ OpDef::finalise_op(op.template finalizePacket<PacketReturnType>(packetAccumulator), rng);
+ // reduction parts // Local size is always power of 2
+ EIGEN_UNROLL_LOOP
+ for (Index offset = local_range / 2; offset > 0; offset /= 2) {
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+ if (localid < offset) {
+ op.template reducePacket<PacketReturnType>(scratch[localid + offset], &packetAccumulator);
+ scratch[localid] = op.template finalizePacket<PacketReturnType>(packetAccumulator);
+ }
+ }
+ if (localid == 0) {
+ output_ptr[itemID.get_group(0)] =
+ op.finalizeBoth(op.initialize(), op.template finalizePacket<PacketReturnType>(packetAccumulator));
+ }
+ }
+
+ template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<!Vect>::type compute_reduction(
+ const cl::sycl::nd_item<1> &itemID) {
+ auto output_ptr = final_output.get_pointer();
+ Index globalid = itemID.get_global_id(0);
+ Index localid = itemID.get_local_id(0);
+ // vectorizable parts
+ CoeffReturnType accumulator = op.initialize();
+ // non vectorizable parts
+ for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
+ op.reduce(evaluator.impl().coeff(i), &accumulator);
+ }
+ scratch[localid] = accumulator = OpDef::finalise_op(op.finalize(accumulator), rng);
+
+ // reduction parts. the local size is always power of 2
+ EIGEN_UNROLL_LOOP
+ for (Index offset = local_range / 2; offset > 0; offset /= 2) {
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+ if (localid < offset) {
+ op.reduce(scratch[localid + offset], &accumulator);
+ scratch[localid] = op.finalize(accumulator);
+ }
+ }
+ if (localid == 0) {
+ output_ptr[itemID.get_group(0)] = op.finalize(accumulator);
+ }
+ }
+};
+
+template <typename Evaluator, typename OpType>
+class GenericNondeterministicReducer {
+ public:
+ typedef typename Evaluator::CoeffReturnType CoeffReturnType;
+ typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
+ typedef typename Evaluator::Index Index;
+ typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
+ typedef typename OpDef::type Op;
+ template <typename Scratch>
+ GenericNondeterministicReducer(Scratch, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType functor_,
+ Index range_, Index num_values_to_reduce_)
+ : evaluator(evaluator_),
+ output_accessor(output_accessor_),
+ functor(OpDef::get_op(functor_)),
+ range(range_),
+ num_values_to_reduce(num_values_to_reduce_) {}
+
+ void operator()(cl::sycl::nd_item<1> itemID) {
+ auto output_accessor_ptr = output_accessor.get_pointer();
+ /// const cast added as a naive solution to solve the qualifier drop error
+ Index globalid = static_cast<Index>(itemID.get_global_linear_id());
+ if (globalid < range) {
+ CoeffReturnType accum = functor.initialize();
+ Eigen::internal::GenericDimReducer<Evaluator::NumReducedDims - 1, Evaluator, Op>::reduce(
+ evaluator, evaluator.firstInput(globalid), functor, &accum);
+ output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce);
+ }
+ }
+
+ private:
+ Evaluator evaluator;
+ EvaluatorPointerType output_accessor;
+ Op functor;
+ Index range;
+ Index num_values_to_reduce;
+};
+
+enum class reduction_dim { inner_most, outer_most };
+// default is preserver
+template <typename Evaluator, typename OpType, typename PannelParameters, reduction_dim rt>
+struct PartialReductionKernel {
+ typedef typename Evaluator::CoeffReturnType CoeffReturnType;
+ typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
+ typedef typename Evaluator::Index Index;
+ typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
+ typedef typename OpDef::type Op;
+ typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+ ScratchAcc;
+ ScratchAcc scratch;
+ Evaluator evaluator;
+ EvaluatorPointerType output_accessor;
+ Op op;
+ const Index preserve_elements_num_groups;
+ const Index reduce_elements_num_groups;
+ const Index num_coeffs_to_preserve;
+ const Index num_coeffs_to_reduce;
+
+ PartialReductionKernel(ScratchAcc scratch_, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType op_,
+ const Index preserve_elements_num_groups_, const Index reduce_elements_num_groups_,
+ const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_)
+ : scratch(scratch_),
+ evaluator(evaluator_),
+ output_accessor(output_accessor_),
+ op(OpDef::get_op(op_)),
+ preserve_elements_num_groups(preserve_elements_num_groups_),
+ reduce_elements_num_groups(reduce_elements_num_groups_),
+ num_coeffs_to_preserve(num_coeffs_to_preserve_),
+ num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId,
+ CoeffReturnType &accumulator) {
+ if (globalPId >= num_coeffs_to_preserve) {
+ return;
+ }
+ Index global_offset = rt == reduction_dim::outer_most ? globalPId + (globalRId * num_coeffs_to_preserve)
+ : globalRId + (globalPId * num_coeffs_to_reduce);
+ Index localOffset = globalRId;
+
+ const Index per_thread_local_stride = PannelParameters::LocalThreadSizeR * reduce_elements_num_groups;
+ const Index per_thread_global_stride =
+ rt == reduction_dim::outer_most ? num_coeffs_to_preserve * per_thread_local_stride : per_thread_local_stride;
+ for (Index i = globalRId; i < num_coeffs_to_reduce; i += per_thread_local_stride) {
+ op.reduce(evaluator.impl().coeff(global_offset), &accumulator);
+ localOffset += per_thread_local_stride;
+ global_offset += per_thread_global_stride;
+ }
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+ const Index linearLocalThreadId = itemID.get_local_id(0);
+ Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP
+ : linearLocalThreadId / PannelParameters::LocalThreadSizeR;
+ Index rLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId / PannelParameters::LocalThreadSizeP
+ : linearLocalThreadId % PannelParameters::LocalThreadSizeR;
+ const Index pGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) % preserve_elements_num_groups
+ : itemID.get_group(0) / reduce_elements_num_groups;
+ const Index rGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) / preserve_elements_num_groups
+ : itemID.get_group(0) % reduce_elements_num_groups;
+
+ Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
+ const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId;
+ auto scratchPtr = scratch.get_pointer().get();
+ auto outPtr =
+ output_accessor.get_pointer() + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0);
+ CoeffReturnType accumulator = op.initialize();
+
+ element_wise_reduce(globalRId, globalPId, accumulator);
+
+ accumulator = OpDef::finalise_op(op.finalize(accumulator), num_coeffs_to_reduce);
+ scratchPtr[pLocalThreadId + rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)] =
+ accumulator;
+ if (rt == reduction_dim::inner_most) {
+ pLocalThreadId = linearLocalThreadId % PannelParameters::LocalThreadSizeP;
+ rLocalThreadId = linearLocalThreadId / PannelParameters::LocalThreadSizeP;
+ globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
+ }
+
+ /* Apply the reduction operation between the current local
+ * id and the one on the other half of the vector. */
+ auto out_scratch_ptr =
+ scratchPtr + (pLocalThreadId + (rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)));
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+ if (rt == reduction_dim::inner_most) {
+ accumulator = *out_scratch_ptr;
+ }
+ // The Local LocalThreadSizeR is always power of 2
+ EIGEN_UNROLL_LOOP
+ for (Index offset = PannelParameters::LocalThreadSizeR >> 1; offset > 0; offset >>= 1) {
+ if (rLocalThreadId < offset) {
+ op.reduce(out_scratch_ptr[(PannelParameters::LocalThreadSizeP + PannelParameters::BC) * offset], &accumulator);
+ // The result has already been divided for mean reducer in the
+ // previous reduction so no need to divide furthermore
+ *out_scratch_ptr = op.finalize(accumulator);
+ }
+ /* All threads collectively read from global memory into local.
+ * The barrier ensures all threads' IO is resolved before
+ * execution continues (strictly speaking, all threads within
+ * a single work-group - there is no co-ordination between
+ * work-groups, only work-items). */
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+ }
+
+ if (rLocalThreadId == 0 && (globalPId < num_coeffs_to_preserve)) {
+ outPtr[globalPId] = op.finalize(accumulator);
+ }
+ }
+};
+
+template <typename OutScalar, typename Index, typename InputAccessor, typename OutputAccessor, typename OpType>
+struct SecondStepPartialReduction {
+ typedef OpDefiner<OpType, OutScalar, Index, false> OpDef;
+ typedef typename OpDef::type Op;
+ typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+ ScratchAccessor;
+ InputAccessor input_accessor;
+ OutputAccessor output_accessor;
+ Op op;
+ const Index num_coeffs_to_preserve;
+ const Index num_coeffs_to_reduce;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SecondStepPartialReduction(ScratchAccessor, InputAccessor input_accessor_,
+ OutputAccessor output_accessor_, OpType op_,
+ const Index num_coeffs_to_preserve_,
+ const Index num_coeffs_to_reduce_)
+ : input_accessor(input_accessor_),
+ output_accessor(output_accessor_),
+ op(OpDef::get_op(op_)),
+ num_coeffs_to_preserve(num_coeffs_to_preserve_),
+ num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+ const Index globalId = itemID.get_global_id(0);
+
+ if (globalId >= num_coeffs_to_preserve) return;
+
+ auto in_ptr = input_accessor.get_pointer() + globalId;
+
+ OutScalar accumulator = op.initialize();
+// num_coeffs_to_reduce is not bigger that 256
+ for (Index i = 0; i < num_coeffs_to_reduce; i++) {
+ op.reduce(*in_ptr, &accumulator);
+ in_ptr += num_coeffs_to_preserve;
+ }
+ output_accessor.get_pointer()[globalId] = op.finalize(accumulator);
+ }
+}; // namespace internal
+
+template <typename Index, Index LTP, Index LTR, bool BC_>
+struct ReductionPannel {
+ static EIGEN_CONSTEXPR Index LocalThreadSizeP = LTP;
+ static EIGEN_CONSTEXPR Index LocalThreadSizeR = LTR;
+ static EIGEN_CONSTEXPR bool BC = BC_;
+};
+
+template <typename Self, typename Op, TensorSycl::internal::reduction_dim rt>
+struct PartialReducerLauncher {
+ typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
+ typedef typename Self::CoeffReturnType CoeffReturnType;
+ typedef typename Self::Storage Storage;
+ typedef typename Self::Index Index;
+ typedef ReductionPannel<typename Self::Index, EIGEN_SYCL_LOCAL_THREAD_DIM0, EIGEN_SYCL_LOCAL_THREAD_DIM1, true>
+ PannelParameters;
+
+ typedef PartialReductionKernel<Self, Op, PannelParameters, rt> SyclReducerKerneType;
+
+ static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output,
+ Index num_coeffs_to_reduce, Index num_coeffs_to_preserve) {
+ Index roundUpP = roundUp(num_coeffs_to_preserve, PannelParameters::LocalThreadSizeP);
+
+ // getPowerOfTwo makes sure local range is power of 2 and <=
+ // maxSyclThreadPerBlock this will help us to avoid extra check on the
+ // kernel
+ static_assert(!((PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR) &
+ (PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR - 1)),
+ "The Local thread size must be a power of 2 for the reduction "
+ "operation");
+
+ EIGEN_CONSTEXPR Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR;
+ // In this step, we force the code not to be more than 2-step reduction:
+ // Our empirical research shows that if each thread reduces at least 64
+ // elemnts individually, we get better performance. However, this can change
+ // on different platforms. In this step we force the code not to be
+ // morthan step reduction: Our empirical research shows that for inner_most
+ // dim reducer, it is better to have 8 group in a reduce dimension for sizes
+ // > 1024 to achieve the best performance.
+ const Index reductionPerThread = 64;
+ Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(), true);
+ const Index pNumGroups = roundUpP / PannelParameters::LocalThreadSizeP;
+ Index rGroups = (cu + pNumGroups - 1) / pNumGroups;
+ const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1;
+ const Index globalRange = pNumGroups * rNumGroups * localRange;
+
+ EIGEN_CONSTEXPR Index scratchSize =
+ PannelParameters::LocalThreadSizeR * (PannelParameters::LocalThreadSizeP + PannelParameters::BC);
+ auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
+ if (rNumGroups > 1) {
+ CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
+ dev.allocate_temp(num_coeffs_to_preserve * rNumGroups * sizeof(CoeffReturnType)));
+ EvaluatorPointerType temp_accessor = dev.get(temp_pointer);
+ dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
+ self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
+ num_coeffs_to_reduce);
+
+ typedef SecondStepPartialReduction<CoeffReturnType, Index, EvaluatorPointerType, EvaluatorPointerType, Op>
+ SecondStepPartialReductionKernel;
+
+ dev.template unary_kernel_launcher<CoeffReturnType, SecondStepPartialReductionKernel>(
+ temp_accessor, output,
+ cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)), Index(1),
+ reducer, num_coeffs_to_preserve, rNumGroups);
+
+ self.device().deallocate_temp(temp_pointer);
+ } else {
+ dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
+ self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
+ num_coeffs_to_reduce);
+ }
+ return false;
+ }
+};
+} // namespace internal
+} // namespace TensorSycl
+
+namespace internal {
+
+template <typename Self, typename Op, bool Vectorizable>
+struct FullReducer<Self, Op, Eigen::SyclDevice, Vectorizable> {
+ typedef typename Self::CoeffReturnType CoeffReturnType;
+ typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
+ static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
+ static EIGEN_CONSTEXPR int PacketSize = Self::PacketAccess ? Self::PacketSize : 1;
+ static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data) {
+ typedef typename conditional<Self::PacketAccess, typename Self::PacketReturnType, CoeffReturnType>::type OutType;
+ static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
+ (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
+ "The Local thread size must be a power of 2 for the reduction "
+ "operation");
+ EIGEN_CONSTEXPR Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;
+
+ typename Self::Index inputSize = self.impl().dimensions().TotalSize();
+ // In this step we force the code not to be more than 2-step reduction:
+ // Our empirical research shows that if each thread reduces at least 512
+ // elemnts individually, we get better performance.
+ const Index reductionPerThread = 2048;
+ // const Index num_work_group =
+ Index reductionGroup = dev.getPowerOfTwo(
+ (inputSize + (reductionPerThread * local_range - 1)) / (reductionPerThread * local_range), true);
+ const Index num_work_group = std::min(reductionGroup, local_range);
+ // 1
+ // ? local_range
+ // : 1);
+ const Index global_range = num_work_group * local_range;
+
+ auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
+ typedef TensorSycl::internal::FullReductionKernelFunctor<Self, Op, local_range> reduction_kernel_t;
+ if (num_work_group > 1) {
+ CoeffReturnType *temp_pointer =
+ static_cast<CoeffReturnType *>(dev.allocate_temp(num_work_group * sizeof(CoeffReturnType)));
+ typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
+ dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, tmp_global_accessor, thread_range,
+ local_range, inputSize, reducer);
+
+ typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
+ EvaluatorPointerType, Index, local_range>
+ GenericRKernel;
+ dev.template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
+ tmp_global_accessor, data,
+ cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)), num_work_group,
+ reducer);
+
+ dev.deallocate_temp(temp_pointer);
+ } else {
+ dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, data, thread_range, local_range, inputSize,
+ reducer);
+ }
+ }
+};
+// vectorizable inner_most most dim preserver
+// col reduction
+template <typename Self, typename Op>
+struct OuterReducer<Self, Op, Eigen::SyclDevice> {
+ static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
+
+ static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
+ typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
+ typename Self::Index num_coeffs_to_preserve) {
+ return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
+ Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::outer_most>::run(self, reducer, dev, output,
+ num_coeffs_to_reduce,
+ num_coeffs_to_preserve);
+ }
+};
+// row reduction
+template <typename Self, typename Op>
+struct InnerReducer<Self, Op, Eigen::SyclDevice> {
+ static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
+
+ static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
+ typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
+ typename Self::Index num_coeffs_to_preserve) {
+ return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
+ Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::inner_most>::run(self, reducer, dev, output,
+ num_coeffs_to_reduce,
+ num_coeffs_to_preserve);
+ }
+};
+
+// ArmgMax uses this kernel for partial reduction//
+// TODO(@mehdi.goli) come up with a better kernel
+// generic partial reduction
+template <typename Self, typename Op>
+struct GenericReducer<Self, Op, Eigen::SyclDevice> {
+ static EIGEN_CONSTEXPR bool HasOptimizedImplementation = false;
+ static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
+ typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce,
+ typename Self::Index num_coeffs_to_preserve) {
+ typename Self::Index range, GRange, tileSize;
+ dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);
+
+ dev.template unary_kernel_launcher<typename Self::CoeffReturnType,
+ TensorSycl::internal::GenericNondeterministicReducer<Self, Op>>(
+ self, output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1),
+ reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast<Index>(1));
+ return false;
+ }
+};
+
+} // namespace internal
+} // namespace Eigen
+
+#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorRef.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorRef.h
new file mode 100644
index 0000000..a27d364
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorRef.h
@@ -0,0 +1,454 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_REF_H
+#define EIGEN_CXX11_TENSOR_TENSOR_REF_H
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename Dimensions, typename Scalar>
+class TensorLazyBaseEvaluator {
+ public:
+ TensorLazyBaseEvaluator() : m_refcount(0) { }
+ virtual ~TensorLazyBaseEvaluator() { }
+
+ EIGEN_DEVICE_FUNC virtual const Dimensions& dimensions() const = 0;
+ EIGEN_DEVICE_FUNC virtual const Scalar* data() const = 0;
+
+ EIGEN_DEVICE_FUNC virtual const Scalar coeff(DenseIndex index) const = 0;
+ EIGEN_DEVICE_FUNC virtual Scalar& coeffRef(DenseIndex index) = 0;
+
+ void incrRefCount() { ++m_refcount; }
+ void decrRefCount() { --m_refcount; }
+ int refCount() const { return m_refcount; }
+
+ private:
+ // No copy, no assignment;
+ TensorLazyBaseEvaluator(const TensorLazyBaseEvaluator& other);
+ TensorLazyBaseEvaluator& operator = (const TensorLazyBaseEvaluator& other);
+
+ int m_refcount;
+};
+
+
+template <typename Dimensions, typename Expr, typename Device>
+class TensorLazyEvaluatorReadOnly : public TensorLazyBaseEvaluator<Dimensions, typename TensorEvaluator<Expr, Device>::Scalar> {
+ public:
+ // typedef typename TensorEvaluator<Expr, Device>::Dimensions Dimensions;
+ typedef typename TensorEvaluator<Expr, Device>::Scalar Scalar;
+ typedef StorageMemory<Scalar, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+ typedef TensorEvaluator<Expr, Device> EvalType;
+
+ TensorLazyEvaluatorReadOnly(const Expr& expr, const Device& device) : m_impl(expr, device), m_dummy(Scalar(0)) {
+ m_dims = m_impl.dimensions();
+ m_impl.evalSubExprsIfNeeded(NULL);
+ }
+ virtual ~TensorLazyEvaluatorReadOnly() {
+ m_impl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC virtual const Dimensions& dimensions() const {
+ return m_dims;
+ }
+ EIGEN_DEVICE_FUNC virtual const Scalar* data() const {
+ return m_impl.data();
+ }
+
+ EIGEN_DEVICE_FUNC virtual const Scalar coeff(DenseIndex index) const {
+ return m_impl.coeff(index);
+ }
+ EIGEN_DEVICE_FUNC virtual Scalar& coeffRef(DenseIndex /*index*/) {
+ eigen_assert(false && "can't reference the coefficient of a rvalue");
+ return m_dummy;
+ };
+
+ protected:
+ TensorEvaluator<Expr, Device> m_impl;
+ Dimensions m_dims;
+ Scalar m_dummy;
+};
+
+template <typename Dimensions, typename Expr, typename Device>
+class TensorLazyEvaluatorWritable : public TensorLazyEvaluatorReadOnly<Dimensions, Expr, Device> {
+ public:
+ typedef TensorLazyEvaluatorReadOnly<Dimensions, Expr, Device> Base;
+ typedef typename Base::Scalar Scalar;
+ typedef StorageMemory<Scalar, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+
+ TensorLazyEvaluatorWritable(const Expr& expr, const Device& device) : Base(expr, device) {
+ }
+ virtual ~TensorLazyEvaluatorWritable() {
+ }
+
+ EIGEN_DEVICE_FUNC virtual Scalar& coeffRef(DenseIndex index) {
+ return this->m_impl.coeffRef(index);
+ }
+};
+
+template <typename Dimensions, typename Expr, typename Device>
+class TensorLazyEvaluator : public internal::conditional<bool(internal::is_lvalue<Expr>::value),
+ TensorLazyEvaluatorWritable<Dimensions, Expr, Device>,
+ TensorLazyEvaluatorReadOnly<Dimensions, const Expr, Device> >::type {
+ public:
+ typedef typename internal::conditional<bool(internal::is_lvalue<Expr>::value),
+ TensorLazyEvaluatorWritable<Dimensions, Expr, Device>,
+ TensorLazyEvaluatorReadOnly<Dimensions, const Expr, Device> >::type Base;
+ typedef typename Base::Scalar Scalar;
+
+ TensorLazyEvaluator(const Expr& expr, const Device& device) : Base(expr, device) {
+ }
+ virtual ~TensorLazyEvaluator() {
+ }
+};
+
+} // namespace internal
+
+
+/** \class TensorRef
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief A reference to a tensor expression
+ * The expression will be evaluated lazily (as much as possible).
+ *
+ */
+template<typename PlainObjectType> class TensorRef : public TensorBase<TensorRef<PlainObjectType> >
+{
+ public:
+ typedef TensorRef<PlainObjectType> Self;
+ typedef typename PlainObjectType::Base Base;
+ typedef typename Eigen::internal::nested<Self>::type Nested;
+ typedef typename internal::traits<PlainObjectType>::StorageKind StorageKind;
+ typedef typename internal::traits<PlainObjectType>::Index Index;
+ typedef typename internal::traits<PlainObjectType>::Scalar Scalar;
+ typedef typename NumTraits<Scalar>::Real RealScalar;
+ typedef typename Base::CoeffReturnType CoeffReturnType;
+ typedef Scalar* PointerType;
+ typedef PointerType PointerArgType;
+
+ static const Index NumIndices = PlainObjectType::NumIndices;
+ typedef typename PlainObjectType::Dimensions Dimensions;
+
+ enum {
+ IsAligned = false,
+ PacketAccess = false,
+ BlockAccess = false,
+ PreferBlockAccess = false,
+ Layout = PlainObjectType::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
+ };
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -----------===//
+ typedef internal::TensorBlockNotImplemented TensorBlock;
+ //===------------------------------------------------------------------===//
+
+ EIGEN_STRONG_INLINE TensorRef() : m_evaluator(NULL) {
+ }
+
+ template <typename Expression>
+ EIGEN_STRONG_INLINE TensorRef(const Expression& expr) : m_evaluator(new internal::TensorLazyEvaluator<Dimensions, Expression, DefaultDevice>(expr, DefaultDevice())) {
+ m_evaluator->incrRefCount();
+ }
+
+ template <typename Expression>
+ EIGEN_STRONG_INLINE TensorRef& operator = (const Expression& expr) {
+ unrefEvaluator();
+ m_evaluator = new internal::TensorLazyEvaluator<Dimensions, Expression, DefaultDevice>(expr, DefaultDevice());
+ m_evaluator->incrRefCount();
+ return *this;
+ }
+
+ ~TensorRef() {
+ unrefEvaluator();
+ }
+
+ TensorRef(const TensorRef& other) : m_evaluator(other.m_evaluator) {
+ eigen_assert(m_evaluator->refCount() > 0);
+ m_evaluator->incrRefCount();
+ }
+
+ TensorRef& operator = (const TensorRef& other) {
+ if (this != &other) {
+ unrefEvaluator();
+ m_evaluator = other.m_evaluator;
+ eigen_assert(m_evaluator->refCount() > 0);
+ m_evaluator->incrRefCount();
+ }
+ return *this;
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Index rank() const { return m_evaluator->dimensions().size(); }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_evaluator->dimensions()[n]; }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_evaluator->dimensions(); }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Index size() const { return m_evaluator->dimensions().TotalSize(); }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar* data() const { return m_evaluator->data(); }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar operator()(Index index) const
+ {
+ return m_evaluator->coeff(index);
+ }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+ template<typename... IndexTypes> EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar operator()(Index firstIndex, IndexTypes... otherIndices) const
+ {
+ const std::size_t num_indices = (sizeof...(otherIndices) + 1);
+ const array<Index, num_indices> indices{{firstIndex, otherIndices...}};
+ return coeff(indices);
+ }
+ template<typename... IndexTypes> EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices)
+ {
+ const std::size_t num_indices = (sizeof...(otherIndices) + 1);
+ const array<Index, num_indices> indices{{firstIndex, otherIndices...}};
+ return coeffRef(indices);
+ }
+#else
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1) const
+ {
+ array<Index, 2> indices;
+ indices[0] = i0;
+ indices[1] = i1;
+ return coeff(indices);
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2) const
+ {
+ array<Index, 3> indices;
+ indices[0] = i0;
+ indices[1] = i1;
+ indices[2] = i2;
+ return coeff(indices);
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2, Index i3) const
+ {
+ array<Index, 4> indices;
+ indices[0] = i0;
+ indices[1] = i1;
+ indices[2] = i2;
+ indices[3] = i3;
+ return coeff(indices);
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const
+ {
+ array<Index, 5> indices;
+ indices[0] = i0;
+ indices[1] = i1;
+ indices[2] = i2;
+ indices[3] = i3;
+ indices[4] = i4;
+ return coeff(indices);
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1)
+ {
+ array<Index, 2> indices;
+ indices[0] = i0;
+ indices[1] = i1;
+ return coeffRef(indices);
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1, Index i2)
+ {
+ array<Index, 3> indices;
+ indices[0] = i0;
+ indices[1] = i1;
+ indices[2] = i2;
+ return coeffRef(indices);
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3)
+ {
+ array<Index, 4> indices;
+ indices[0] = i0;
+ indices[1] = i1;
+ indices[2] = i2;
+ indices[3] = i3;
+ return coeffRef(indices);
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1, Index i2, Index i3, Index i4)
+ {
+ array<Index, 5> indices;
+ indices[0] = i0;
+ indices[1] = i1;
+ indices[2] = i2;
+ indices[3] = i3;
+ indices[4] = i4;
+ return coeffRef(indices);
+ }
+#endif
+
+ template <std::size_t NumIndices> EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar coeff(const array<Index, NumIndices>& indices) const
+ {
+ const Dimensions& dims = this->dimensions();
+ Index index = 0;
+ if (PlainObjectType::Options & RowMajor) {
+ index += indices[0];
+ for (size_t i = 1; i < NumIndices; ++i) {
+ index = index * dims[i] + indices[i];
+ }
+ } else {
+ index += indices[NumIndices-1];
+ for (int i = NumIndices-2; i >= 0; --i) {
+ index = index * dims[i] + indices[i];
+ }
+ }
+ return m_evaluator->coeff(index);
+ }
+ template <std::size_t NumIndices> EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices)
+ {
+ const Dimensions& dims = this->dimensions();
+ Index index = 0;
+ if (PlainObjectType::Options & RowMajor) {
+ index += indices[0];
+ for (size_t i = 1; i < NumIndices; ++i) {
+ index = index * dims[i] + indices[i];
+ }
+ } else {
+ index += indices[NumIndices-1];
+ for (int i = NumIndices-2; i >= 0; --i) {
+ index = index * dims[i] + indices[i];
+ }
+ }
+ return m_evaluator->coeffRef(index);
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
+ {
+ return m_evaluator->coeff(index);
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
+ {
+ return m_evaluator->coeffRef(index);
+ }
+
+ private:
+ EIGEN_STRONG_INLINE void unrefEvaluator() {
+ if (m_evaluator) {
+ m_evaluator->decrRefCount();
+ if (m_evaluator->refCount() == 0) {
+ delete m_evaluator;
+ }
+ }
+ }
+
+ internal::TensorLazyBaseEvaluator<Dimensions, Scalar>* m_evaluator;
+};
+
+
+// evaluator for rvalues
+template<typename Derived, typename Device>
+struct TensorEvaluator<const TensorRef<Derived>, Device>
+{
+ typedef typename Derived::Index Index;
+ typedef typename Derived::Scalar Scalar;
+ typedef typename Derived::Scalar CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ typedef typename Derived::Dimensions Dimensions;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+
+ enum {
+ IsAligned = false,
+ PacketAccess = false,
+ BlockAccess = false,
+ PreferBlockAccess = false,
+ Layout = TensorRef<Derived>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
+ };
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef<Derived>& m, const Device&)
+ : m_ref(m)
+ { }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_ref.dimensions(); }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
+ return true;
+ }
+
+ EIGEN_STRONG_INLINE void cleanup() { }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+ return m_ref.coeff(index);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
+ return m_ref.coeffRef(index);
+ }
+
+ EIGEN_DEVICE_FUNC const Scalar* data() const { return m_ref.data(); }
+
+ protected:
+ TensorRef<Derived> m_ref;
+};
+
+
+// evaluator for lvalues
+template<typename Derived, typename Device>
+struct TensorEvaluator<TensorRef<Derived>, Device> : public TensorEvaluator<const TensorRef<Derived>, Device>
+{
+ typedef typename Derived::Index Index;
+ typedef typename Derived::Scalar Scalar;
+ typedef typename Derived::Scalar CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ typedef typename Derived::Dimensions Dimensions;
+
+ typedef TensorEvaluator<const TensorRef<Derived>, Device> Base;
+
+ enum {
+ IsAligned = false,
+ PacketAccess = false,
+ BlockAccess = false,
+ PreferBlockAccess = false,
+ RawAccess = false
+ };
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_STRONG_INLINE TensorEvaluator(TensorRef<Derived>& m, const Device& d) : Base(m, d)
+ { }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
+ return this->m_ref.coeffRef(index);
+ }
+};
+
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_REF_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorReverse.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorReverse.h
new file mode 100644
index 0000000..586ce68
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorReverse.h
@@ -0,0 +1,465 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Navdeep Jaitly <ndjaitly@google.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H
+#define EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H
+namespace Eigen {
+
+/** \class TensorReverse
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor reverse elements class.
+ *
+ */
+namespace internal {
+template<typename ReverseDimensions, typename XprType>
+struct traits<TensorReverseOp<ReverseDimensions,
+ XprType> > : public traits<XprType>
+{
+ typedef typename XprType::Scalar Scalar;
+ typedef traits<XprType> XprTraits;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprTraits::Index Index;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = XprTraits::NumDimensions;
+ static const int Layout = XprTraits::Layout;
+ typedef typename XprTraits::PointerType PointerType;
+};
+
+template<typename ReverseDimensions, typename XprType>
+struct eval<TensorReverseOp<ReverseDimensions, XprType>, Eigen::Dense>
+{
+ typedef const TensorReverseOp<ReverseDimensions, XprType>& type;
+};
+
+template<typename ReverseDimensions, typename XprType>
+struct nested<TensorReverseOp<ReverseDimensions, XprType>, 1,
+ typename eval<TensorReverseOp<ReverseDimensions, XprType> >::type>
+{
+ typedef TensorReverseOp<ReverseDimensions, XprType> type;
+};
+
+} // end namespace internal
+
+template<typename ReverseDimensions, typename XprType>
+class TensorReverseOp : public TensorBase<TensorReverseOp<ReverseDimensions,
+ XprType>, WriteAccessors>
+{
+ public:
+ typedef TensorBase<TensorReverseOp<ReverseDimensions, XprType>, WriteAccessors>Base;
+ typedef typename Eigen::internal::traits<TensorReverseOp>::Scalar Scalar;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename Eigen::internal::nested<TensorReverseOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorReverseOp>::StorageKind
+ StorageKind;
+ typedef typename Eigen::internal::traits<TensorReverseOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReverseOp(
+ const XprType& expr, const ReverseDimensions& reverse_dims)
+ : m_xpr(expr), m_reverse_dims(reverse_dims) { }
+
+ EIGEN_DEVICE_FUNC
+ const ReverseDimensions& reverse() const { return m_reverse_dims; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ expression() const { return m_xpr; }
+
+ EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorReverseOp)
+
+
+ protected:
+ typename XprType::Nested m_xpr;
+ const ReverseDimensions m_reverse_dims;
+};
+
+// Eval as rvalue
+template<typename ReverseDimensions, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device>
+{
+ typedef TensorReverseOp<ReverseDimensions, ArgType> XprType;
+ typedef typename XprType::Index Index;
+ static const int NumDims = internal::array_size<ReverseDimensions>::value;
+ typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+
+ enum {
+ IsAligned = false,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = NumDims > 0,
+ PreferBlockAccess = true,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
+ };
+
+ typedef internal::TensorIntDivisor<Index> IndexDivisor;
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+ typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
+ ArgTensorBlock;
+
+ typedef typename internal::TensorMaterializedBlock<CoeffReturnType, NumDims,
+ Layout, Index>
+ TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_impl(op.expression(), device),
+ m_reverse(op.reverse()),
+ m_device(device)
+ {
+ // Reversing a scalar isn't supported yet. It would be a no-op anyway.
+ EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+ // Compute strides
+ m_dimensions = m_impl.dimensions();
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ m_strides[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ m_strides[i] = m_strides[i-1] * m_dimensions[i-1];
+ if (m_strides[i] > 0) m_fastStrides[i] = IndexDivisor(m_strides[i]);
+ }
+ } else {
+ m_strides[NumDims-1] = 1;
+ for (int i = NumDims - 2; i >= 0; --i) {
+ m_strides[i] = m_strides[i+1] * m_dimensions[i+1];
+ if (m_strides[i] > 0) m_fastStrides[i] = IndexDivisor(m_strides[i]);
+ }
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const Dimensions& dimensions() const { return m_dimensions; }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
+ m_impl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+
+#ifdef EIGEN_USE_THREADS
+ template <typename EvalSubExprsCallback>
+ EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+ EvaluatorPointerType, EvalSubExprsCallback done) {
+ m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
+ }
+#endif // EIGEN_USE_THREADS
+
+ EIGEN_STRONG_INLINE void cleanup() {
+ m_impl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index reverseIndex(
+ Index index) const {
+ eigen_assert(index < dimensions().TotalSize());
+ Index inputIndex = 0;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ EIGEN_UNROLL_LOOP
+ for (int i = NumDims - 1; i > 0; --i) {
+ Index idx = index / m_fastStrides[i];
+ index -= idx * m_strides[i];
+ if (m_reverse[i]) {
+ idx = m_dimensions[i] - idx - 1;
+ }
+ inputIndex += idx * m_strides[i] ;
+ }
+ if (m_reverse[0]) {
+ inputIndex += (m_dimensions[0] - index - 1);
+ } else {
+ inputIndex += index;
+ }
+ } else {
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < NumDims - 1; ++i) {
+ Index idx = index / m_fastStrides[i];
+ index -= idx * m_strides[i];
+ if (m_reverse[i]) {
+ idx = m_dimensions[i] - idx - 1;
+ }
+ inputIndex += idx * m_strides[i] ;
+ }
+ if (m_reverse[NumDims-1]) {
+ inputIndex += (m_dimensions[NumDims-1] - index - 1);
+ } else {
+ inputIndex += index;
+ }
+ }
+ return inputIndex;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(
+ Index index) const {
+ return m_impl.coeff(reverseIndex(index));
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ PacketReturnType packet(Index index) const
+ {
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+ // TODO(ndjaitly): write a better packing routine that uses
+ // local structure.
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type
+ values[PacketSize];
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < PacketSize; ++i) {
+ values[i] = coeff(index+i);
+ }
+ PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+ return rslt;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ internal::TensorBlockResourceRequirements getResourceRequirements() const {
+ const size_t target_size = m_device.lastLevelCacheSize();
+ // Block evaluation reads underlying memory in reverse order, and default
+ // cost model does not properly catch this in bytes stored/loaded.
+ return internal::TensorBlockResourceRequirements::skewed<Scalar>(
+ target_size)
+ .addCostPerCoeff({0, 0, 24});
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+ block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+ bool /*root_of_expr_ast*/ = false) const {
+ // TODO(ezhulenev): If underlying tensor expression supports and prefers
+ // block evaluation we must use it. Currently we use coeff and packet
+ // access into the underlying tensor expression.
+ // static const bool useBlockAccessForArgType =
+ // TensorEvaluator<ArgType, Device>::BlockAccess &&
+ // TensorEvaluator<ArgType, Device>::PreferBlockAccess;
+
+ static const bool isColMajor =
+ static_cast<int>(Layout) == static_cast<int>(ColMajor);
+
+ static const Index inner_dim_idx = isColMajor ? 0 : NumDims - 1;
+ const bool inner_dim_reversed = m_reverse[inner_dim_idx];
+
+ // Offset in the output block.
+ Index block_offset = 0;
+
+ // Offset in the input Tensor.
+ Index input_offset = reverseIndex(desc.offset());
+
+ // Initialize output block iterator state. Dimension in this array are
+ // always in inner_most -> outer_most order (col major layout).
+ array<BlockIteratorState, NumDims> it;
+ for (int i = 0; i < NumDims; ++i) {
+ const int dim = isColMajor ? i : NumDims - 1 - i;
+ it[i].size = desc.dimension(dim);
+ it[i].count = 0;
+ it[i].reverse = m_reverse[dim];
+
+ it[i].block_stride =
+ i == 0 ? 1 : (it[i - 1].size * it[i - 1].block_stride);
+ it[i].block_span = it[i].block_stride * (it[i].size - 1);
+
+ it[i].input_stride = m_strides[dim];
+ it[i].input_span = it[i].input_stride * (it[i].size - 1);
+
+ if (it[i].reverse) {
+ it[i].input_stride = -1 * it[i].input_stride;
+ it[i].input_span = -1 * it[i].input_span;
+ }
+ }
+
+ // If multiple inner dimensions have the same reverse flag, check if we can
+ // merge them into a single virtual inner dimension.
+ int effective_inner_dim = 0;
+ for (int i = 1; i < NumDims; ++i) {
+ if (it[i].reverse != it[effective_inner_dim].reverse) break;
+ if (it[i].block_stride != it[effective_inner_dim].size) break;
+ if (it[i].block_stride != numext::abs(it[i].input_stride)) break;
+
+ it[i].size = it[effective_inner_dim].size * it[i].size;
+
+ it[i].block_stride = 1;
+ it[i].input_stride = (inner_dim_reversed ? -1 : 1);
+
+ it[i].block_span = it[i].block_stride * (it[i].size - 1);
+ it[i].input_span = it[i].input_stride * (it[i].size - 1);
+
+ effective_inner_dim = i;
+ }
+
+ eigen_assert(it[effective_inner_dim].block_stride == 1);
+ eigen_assert(it[effective_inner_dim].input_stride ==
+ (inner_dim_reversed ? -1 : 1));
+
+ const Index inner_dim_size = it[effective_inner_dim].size;
+
+ // Prepare storage for the materialized reverse result.
+ const typename TensorBlock::Storage block_storage =
+ TensorBlock::prepareStorage(desc, scratch);
+ CoeffReturnType* block_buffer = block_storage.data();
+
+ while (it[NumDims - 1].count < it[NumDims - 1].size) {
+ // Copy inner-most dimension data from reversed location in input.
+ Index dst = block_offset;
+ Index src = input_offset;
+
+ // NOTE(ezhulenev): Adding vectorized path with internal::preverse showed
+ // worse results in benchmarks than a simple coefficient loop.
+ if (inner_dim_reversed) {
+ for (Index i = 0; i < inner_dim_size; ++i) {
+ block_buffer[dst] = m_impl.coeff(src);
+ ++dst;
+ --src;
+ }
+ } else {
+ for (Index i = 0; i < inner_dim_size; ++i) {
+ block_buffer[dst] = m_impl.coeff(src);
+ ++dst;
+ ++src;
+ }
+ }
+
+ // For the 1d tensor we need to generate only one inner-most dimension.
+ if ((NumDims - effective_inner_dim) == 1) break;
+
+ // Update offset.
+ for (Index i = effective_inner_dim + 1; i < NumDims; ++i) {
+ if (++it[i].count < it[i].size) {
+ block_offset += it[i].block_stride;
+ input_offset += it[i].input_stride;
+ break;
+ }
+ if (i != NumDims - 1) it[i].count = 0;
+ block_offset -= it[i].block_span;
+ input_offset -= it[i].input_span;
+ }
+ }
+
+ return block_storage.AsTensorMaterializedBlock();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
+ 2 * TensorOpCost::MulCost<Index>() +
+ TensorOpCost::DivCost<Index>());
+ for (int i = 0; i < NumDims; ++i) {
+ if (m_reverse[i]) {
+ compute_cost += 2 * TensorOpCost::AddCost<Index>();
+ }
+ }
+ return m_impl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize);
+ }
+
+ EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_impl.bind(cgh);
+ }
+#endif
+
+ protected:
+ Dimensions m_dimensions;
+ array<Index, NumDims> m_strides;
+ array<IndexDivisor, NumDims> m_fastStrides;
+ TensorEvaluator<ArgType, Device> m_impl;
+ ReverseDimensions m_reverse;
+ const Device EIGEN_DEVICE_REF m_device;
+
+ private:
+ struct BlockIteratorState {
+ BlockIteratorState()
+ : size(0),
+ count(0),
+ reverse(false),
+ block_stride(0),
+ block_span(0),
+ input_stride(0),
+ input_span(0) {}
+
+ Index size;
+ Index count;
+ bool reverse;
+ Index block_stride;
+ Index block_span;
+ Index input_stride;
+ Index input_span;
+ };
+};
+
+// Eval as lvalue
+
+template <typename ReverseDimensions, typename ArgType, typename Device>
+struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
+ : public TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>,
+ Device> {
+ typedef TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>,
+ Device> Base;
+ typedef TensorReverseOp<ReverseDimensions, ArgType> XprType;
+ typedef typename XprType::Index Index;
+ static const int NumDims = internal::array_size<ReverseDimensions>::value;
+ typedef DSizes<Index, NumDims> Dimensions;
+
+ enum {
+ IsAligned = false,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
+ PreferBlockAccess = false,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
+ };
+ EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : Base(op, device) {}
+
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const Dimensions& dimensions() const { return this->m_dimensions; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
+ return this->m_impl.coeffRef(this->reverseIndex(index));
+ }
+
+ template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ void writePacket(Index index, const PacketReturnType& x) {
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+ // This code is pilfered from TensorMorphing.h
+ EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize];
+ internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < PacketSize; ++i) {
+ this->coeffRef(index+i) = values[i];
+ }
+ }
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorScan.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorScan.h
new file mode 100644
index 0000000..beae854
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorScan.h
@@ -0,0 +1,528 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Igor Babuschkin <igor@babuschk.in>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
+#define EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename Op, typename XprType>
+struct traits<TensorScanOp<Op, XprType> >
+ : public traits<XprType> {
+ typedef typename XprType::Scalar Scalar;
+ typedef traits<XprType> XprTraits;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = XprTraits::NumDimensions;
+ static const int Layout = XprTraits::Layout;
+ typedef typename XprTraits::PointerType PointerType;
+};
+
+template<typename Op, typename XprType>
+struct eval<TensorScanOp<Op, XprType>, Eigen::Dense>
+{
+ typedef const TensorScanOp<Op, XprType>& type;
+};
+
+template<typename Op, typename XprType>
+struct nested<TensorScanOp<Op, XprType>, 1,
+ typename eval<TensorScanOp<Op, XprType> >::type>
+{
+ typedef TensorScanOp<Op, XprType> type;
+};
+} // end namespace internal
+
+/** \class TensorScan
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor scan class.
+ */
+template <typename Op, typename XprType>
+class TensorScanOp
+ : public TensorBase<TensorScanOp<Op, XprType>, ReadOnlyAccessors> {
+public:
+ typedef typename Eigen::internal::traits<TensorScanOp>::Scalar Scalar;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename Eigen::internal::nested<TensorScanOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorScanOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorScanOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorScanOp(
+ const XprType& expr, const Index& axis, bool exclusive = false, const Op& op = Op())
+ : m_expr(expr), m_axis(axis), m_accumulator(op), m_exclusive(exclusive) {}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const Index axis() const { return m_axis; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const XprType& expression() const { return m_expr; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const Op accumulator() const { return m_accumulator; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ bool exclusive() const { return m_exclusive; }
+
+protected:
+ typename XprType::Nested m_expr;
+ const Index m_axis;
+ const Op m_accumulator;
+ const bool m_exclusive;
+};
+
+
+namespace internal {
+
+template <typename Self>
+EIGEN_STRONG_INLINE void ReduceScalar(Self& self, Index offset,
+ typename Self::CoeffReturnType* data) {
+ // Compute the scan along the axis, starting at the given offset
+ typename Self::CoeffReturnType accum = self.accumulator().initialize();
+ if (self.stride() == 1) {
+ if (self.exclusive()) {
+ for (Index curr = offset; curr < offset + self.size(); ++curr) {
+ data[curr] = self.accumulator().finalize(accum);
+ self.accumulator().reduce(self.inner().coeff(curr), &accum);
+ }
+ } else {
+ for (Index curr = offset; curr < offset + self.size(); ++curr) {
+ self.accumulator().reduce(self.inner().coeff(curr), &accum);
+ data[curr] = self.accumulator().finalize(accum);
+ }
+ }
+ } else {
+ if (self.exclusive()) {
+ for (Index idx3 = 0; idx3 < self.size(); idx3++) {
+ Index curr = offset + idx3 * self.stride();
+ data[curr] = self.accumulator().finalize(accum);
+ self.accumulator().reduce(self.inner().coeff(curr), &accum);
+ }
+ } else {
+ for (Index idx3 = 0; idx3 < self.size(); idx3++) {
+ Index curr = offset + idx3 * self.stride();
+ self.accumulator().reduce(self.inner().coeff(curr), &accum);
+ data[curr] = self.accumulator().finalize(accum);
+ }
+ }
+ }
+}
+
+template <typename Self>
+EIGEN_STRONG_INLINE void ReducePacket(Self& self, Index offset,
+ typename Self::CoeffReturnType* data) {
+ using Scalar = typename Self::CoeffReturnType;
+ using Packet = typename Self::PacketReturnType;
+ // Compute the scan along the axis, starting at the calculated offset
+ Packet accum = self.accumulator().template initializePacket<Packet>();
+ if (self.stride() == 1) {
+ if (self.exclusive()) {
+ for (Index curr = offset; curr < offset + self.size(); ++curr) {
+ internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
+ self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
+ }
+ } else {
+ for (Index curr = offset; curr < offset + self.size(); ++curr) {
+ self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
+ internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
+ }
+ }
+ } else {
+ if (self.exclusive()) {
+ for (Index idx3 = 0; idx3 < self.size(); idx3++) {
+ const Index curr = offset + idx3 * self.stride();
+ internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
+ self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
+ }
+ } else {
+ for (Index idx3 = 0; idx3 < self.size(); idx3++) {
+ const Index curr = offset + idx3 * self.stride();
+ self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
+ internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
+ }
+ }
+ }
+}
+
+template <typename Self, bool Vectorize, bool Parallel>
+struct ReduceBlock {
+ EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
+ typename Self::CoeffReturnType* data) {
+ for (Index idx2 = 0; idx2 < self.stride(); idx2++) {
+ // Calculate the starting offset for the scan
+ Index offset = idx1 + idx2;
+ ReduceScalar(self, offset, data);
+ }
+ }
+};
+
+// Specialization for vectorized reduction.
+template <typename Self>
+struct ReduceBlock<Self, /*Vectorize=*/true, /*Parallel=*/false> {
+ EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
+ typename Self::CoeffReturnType* data) {
+ using Packet = typename Self::PacketReturnType;
+ const int PacketSize = internal::unpacket_traits<Packet>::size;
+ Index idx2 = 0;
+ for (; idx2 + PacketSize <= self.stride(); idx2 += PacketSize) {
+ // Calculate the starting offset for the packet scan
+ Index offset = idx1 + idx2;
+ ReducePacket(self, offset, data);
+ }
+ for (; idx2 < self.stride(); idx2++) {
+ // Calculate the starting offset for the scan
+ Index offset = idx1 + idx2;
+ ReduceScalar(self, offset, data);
+ }
+ }
+};
+
+// Single-threaded CPU implementation of scan
+template <typename Self, typename Reducer, typename Device,
+ bool Vectorize =
+ (TensorEvaluator<typename Self::ChildTypeNoConst, Device>::PacketAccess &&
+ internal::reducer_traits<Reducer, Device>::PacketAccess)>
+struct ScanLauncher {
+ void operator()(Self& self, typename Self::CoeffReturnType* data) {
+ Index total_size = internal::array_prod(self.dimensions());
+
+ // We fix the index along the scan axis to 0 and perform a
+ // scan per remaining entry. The iteration is split into two nested
+ // loops to avoid an integer division by keeping track of each idx1 and
+ // idx2.
+ for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) {
+ ReduceBlock<Self, Vectorize, /*Parallel=*/false> block_reducer;
+ block_reducer(self, idx1, data);
+ }
+ }
+};
+
+#ifdef EIGEN_USE_THREADS
+
+// Adjust block_size to avoid false sharing of cachelines among
+// threads. Currently set to twice the cache line size on Intel and ARM
+// processors.
+EIGEN_STRONG_INLINE Index AdjustBlockSize(Index item_size, Index block_size) {
+ EIGEN_CONSTEXPR Index kBlockAlignment = 128;
+ const Index items_per_cacheline =
+ numext::maxi<Index>(1, kBlockAlignment / item_size);
+ return items_per_cacheline * divup(block_size, items_per_cacheline);
+}
+
+template <typename Self>
+struct ReduceBlock<Self, /*Vectorize=*/true, /*Parallel=*/true> {
+ EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
+ typename Self::CoeffReturnType* data) {
+ using Scalar = typename Self::CoeffReturnType;
+ using Packet = typename Self::PacketReturnType;
+ const int PacketSize = internal::unpacket_traits<Packet>::size;
+ Index num_scalars = self.stride();
+ Index num_packets = 0;
+ if (self.stride() >= PacketSize) {
+ num_packets = self.stride() / PacketSize;
+ self.device().parallelFor(
+ num_packets,
+ TensorOpCost(PacketSize * self.size(), PacketSize * self.size(),
+ 16 * PacketSize * self.size(), true, PacketSize),
+ // Make the shard size large enough that two neighboring threads
+ // won't write to the same cacheline of `data`.
+ [=](Index blk_size) {
+ return AdjustBlockSize(PacketSize * sizeof(Scalar), blk_size);
+ },
+ [&](Index first, Index last) {
+ for (Index packet = first; packet < last; ++packet) {
+ const Index idx2 = packet * PacketSize;
+ ReducePacket(self, idx1 + idx2, data);
+ }
+ });
+ num_scalars -= num_packets * PacketSize;
+ }
+ self.device().parallelFor(
+ num_scalars, TensorOpCost(self.size(), self.size(), 16 * self.size()),
+ // Make the shard size large enough that two neighboring threads
+ // won't write to the same cacheline of `data`.
+ [=](Index blk_size) {
+ return AdjustBlockSize(sizeof(Scalar), blk_size);
+ },
+ [&](Index first, Index last) {
+ for (Index scalar = first; scalar < last; ++scalar) {
+ const Index idx2 = num_packets * PacketSize + scalar;
+ ReduceScalar(self, idx1 + idx2, data);
+ }
+ });
+ }
+};
+
+template <typename Self>
+struct ReduceBlock<Self, /*Vectorize=*/false, /*Parallel=*/true> {
+ EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
+ typename Self::CoeffReturnType* data) {
+ using Scalar = typename Self::CoeffReturnType;
+ self.device().parallelFor(
+ self.stride(), TensorOpCost(self.size(), self.size(), 16 * self.size()),
+ // Make the shard size large enough that two neighboring threads
+ // won't write to the same cacheline of `data`.
+ [=](Index blk_size) {
+ return AdjustBlockSize(sizeof(Scalar), blk_size);
+ },
+ [&](Index first, Index last) {
+ for (Index idx2 = first; idx2 < last; ++idx2) {
+ ReduceScalar(self, idx1 + idx2, data);
+ }
+ });
+ }
+};
+
+// Specialization for multi-threaded execution.
+template <typename Self, typename Reducer, bool Vectorize>
+struct ScanLauncher<Self, Reducer, ThreadPoolDevice, Vectorize> {
+ void operator()(Self& self, typename Self::CoeffReturnType* data) {
+ using Scalar = typename Self::CoeffReturnType;
+ using Packet = typename Self::PacketReturnType;
+ const int PacketSize = internal::unpacket_traits<Packet>::size;
+ const Index total_size = internal::array_prod(self.dimensions());
+ const Index inner_block_size = self.stride() * self.size();
+ bool parallelize_by_outer_blocks = (total_size >= (self.stride() * inner_block_size));
+
+ if ((parallelize_by_outer_blocks && total_size <= 4096) ||
+ (!parallelize_by_outer_blocks && self.stride() < PacketSize)) {
+ ScanLauncher<Self, Reducer, DefaultDevice, Vectorize> launcher;
+ launcher(self, data);
+ return;
+ }
+
+ if (parallelize_by_outer_blocks) {
+ // Parallelize over outer blocks.
+ const Index num_outer_blocks = total_size / inner_block_size;
+ self.device().parallelFor(
+ num_outer_blocks,
+ TensorOpCost(inner_block_size, inner_block_size,
+ 16 * PacketSize * inner_block_size, Vectorize,
+ PacketSize),
+ [=](Index blk_size) {
+ return AdjustBlockSize(inner_block_size * sizeof(Scalar), blk_size);
+ },
+ [&](Index first, Index last) {
+ for (Index idx1 = first; idx1 < last; ++idx1) {
+ ReduceBlock<Self, Vectorize, /*Parallelize=*/false> block_reducer;
+ block_reducer(self, idx1 * inner_block_size, data);
+ }
+ });
+ } else {
+ // Parallelize over inner packets/scalars dimensions when the reduction
+ // axis is not an inner dimension.
+ ReduceBlock<Self, Vectorize, /*Parallelize=*/true> block_reducer;
+ for (Index idx1 = 0; idx1 < total_size;
+ idx1 += self.stride() * self.size()) {
+ block_reducer(self, idx1, data);
+ }
+ }
+ }
+};
+#endif // EIGEN_USE_THREADS
+
+#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
+
+// GPU implementation of scan
+// TODO(ibab) This placeholder implementation performs multiple scans in
+// parallel, but it would be better to use a parallel scan algorithm and
+// optimize memory access.
+template <typename Self, typename Reducer>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) {
+ // Compute offset as in the CPU version
+ Index val = threadIdx.x + blockIdx.x * blockDim.x;
+ Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride();
+
+ if (offset + (self.size() - 1) * self.stride() < total_size) {
+ // Compute the scan along the axis, starting at the calculated offset
+ typename Self::CoeffReturnType accum = self.accumulator().initialize();
+ for (Index idx = 0; idx < self.size(); idx++) {
+ Index curr = offset + idx * self.stride();
+ if (self.exclusive()) {
+ data[curr] = self.accumulator().finalize(accum);
+ self.accumulator().reduce(self.inner().coeff(curr), &accum);
+ } else {
+ self.accumulator().reduce(self.inner().coeff(curr), &accum);
+ data[curr] = self.accumulator().finalize(accum);
+ }
+ }
+ }
+ __syncthreads();
+
+}
+
+template <typename Self, typename Reducer, bool Vectorize>
+struct ScanLauncher<Self, Reducer, GpuDevice, Vectorize> {
+ void operator()(const Self& self, typename Self::CoeffReturnType* data) {
+ Index total_size = internal::array_prod(self.dimensions());
+ Index num_blocks = (total_size / self.size() + 63) / 64;
+ Index block_size = 64;
+
+ LAUNCH_GPU_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
+ }
+};
+#endif // EIGEN_USE_GPU && (EIGEN_GPUCC)
+
+} // namespace internal
+
+// Eval as rvalue
+template <typename Op, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
+
+ typedef TensorScanOp<Op, ArgType> XprType;
+ typedef typename XprType::Index Index;
+ typedef const ArgType ChildTypeNoConst;
+ typedef const ArgType ChildType;
+ static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+ typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ typedef TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> Self;
+ typedef StorageMemory<Scalar, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+
+ enum {
+ IsAligned = false,
+ PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
+ BlockAccess = false,
+ PreferBlockAccess = false,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false,
+ RawAccess = true
+ };
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_impl(op.expression(), device),
+ m_device(device),
+ m_exclusive(op.exclusive()),
+ m_accumulator(op.accumulator()),
+ m_size(m_impl.dimensions()[op.axis()]),
+ m_stride(1), m_consume_dim(op.axis()),
+ m_output(NULL) {
+
+ // Accumulating a scalar isn't supported.
+ EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ eigen_assert(op.axis() >= 0 && op.axis() < NumDims);
+
+ // Compute stride of scan axis
+ const Dimensions& dims = m_impl.dimensions();
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = 0; i < op.axis(); ++i) {
+ m_stride = m_stride * dims[i];
+ }
+ } else {
+ // dims can only be indexed through unsigned integers,
+ // so let's use an unsigned type to let the compiler knows.
+ // This prevents stupid warnings: ""'*((void*)(& evaluator)+64)[18446744073709551615]' may be used uninitialized in this function"
+ unsigned int axis = internal::convert_index<unsigned int>(op.axis());
+ for (unsigned int i = NumDims - 1; i > axis; --i) {
+ m_stride = m_stride * dims[i];
+ }
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
+ return m_impl.dimensions();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride() const {
+ return m_stride;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& consume_dim() const {
+ return m_consume_dim;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const {
+ return m_size;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& accumulator() const {
+ return m_accumulator;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const {
+ return m_exclusive;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& inner() const {
+ return m_impl;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const {
+ return m_device;
+ }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+ m_impl.evalSubExprsIfNeeded(NULL);
+ internal::ScanLauncher<Self, Op, Device> launcher;
+ if (data) {
+ launcher(*this, data);
+ return false;
+ }
+
+ const Index total_size = internal::array_prod(dimensions());
+ m_output = static_cast<EvaluatorPointerType>(m_device.get((Scalar*) m_device.allocate_temp(total_size * sizeof(Scalar))));
+ launcher(*this, m_output);
+ return true;
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
+ return internal::ploadt<PacketReturnType, LoadMode>(m_output + index);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const
+ {
+ return m_output;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ return m_output[index];
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
+ return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
+ }
+
+ EIGEN_STRONG_INLINE void cleanup() {
+ if (m_output) {
+ m_device.deallocate_temp(m_output);
+ m_output = NULL;
+ }
+ m_impl.cleanup();
+ }
+
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_impl.bind(cgh);
+ m_output.bind(cgh);
+ }
+#endif
+protected:
+ TensorEvaluator<ArgType, Device> m_impl;
+ const Device EIGEN_DEVICE_REF m_device;
+ const bool m_exclusive;
+ Op m_accumulator;
+ const Index m_size;
+ Index m_stride;
+ Index m_consume_dim;
+ EvaluatorPointerType m_output;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorScanSycl.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorScanSycl.h
new file mode 100644
index 0000000..7f68ecb
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorScanSycl.h
@@ -0,0 +1,513 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TensorScanSycl.h
+ *
+ * \brief:
+ * Tensor Scan Sycl implement the extend version of
+ * "Efficient parallel scan algorithms for GPUs." .for Tensor operations.
+ * The algorithm requires up to 3 stage (consequently 3 kernels) depending on
+ * the size of the tensor. In the first kernel (ScanKernelFunctor), each
+ * threads within the work-group individually reduces the allocated elements per
+ * thread in order to reduces the total number of blocks. In the next step all
+ * thread within the work-group will reduce the associated blocks into the
+ * temporary buffers. In the next kernel(ScanBlockKernelFunctor), the temporary
+ * buffer is given as an input and all the threads within a work-group scan and
+ * reduces the boundaries between the blocks (generated from the previous
+ * kernel). and write the data on the temporary buffer. If the second kernel is
+ * required, the third and final kerenl (ScanAdjustmentKernelFunctor) will
+ * adjust the final result into the output buffer.
+ * The original algorithm for the parallel prefix sum can be found here:
+ *
+ * Sengupta, Shubhabrata, Mark Harris, and Michael Garland. "Efficient parallel
+ * scan algorithms for GPUs." NVIDIA, Santa Clara, CA, Tech. Rep. NVR-2008-003
+ *1, no. 1 (2008): 1-17.
+ *****************************************************************/
+
+#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
+#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
+
+namespace Eigen {
+namespace TensorSycl {
+namespace internal {
+
+#ifndef EIGEN_SYCL_MAX_GLOBAL_RANGE
+#define EIGEN_SYCL_MAX_GLOBAL_RANGE (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 * 4)
+#endif
+
+template <typename index_t>
+struct ScanParameters {
+ // must be power of 2
+ static EIGEN_CONSTEXPR index_t ScanPerThread = 8;
+ const index_t total_size;
+ const index_t non_scan_size;
+ const index_t scan_size;
+ const index_t non_scan_stride;
+ const index_t scan_stride;
+ const index_t panel_threads;
+ const index_t group_threads;
+ const index_t block_threads;
+ const index_t elements_per_group;
+ const index_t elements_per_block;
+ const index_t loop_range;
+
+ ScanParameters(index_t total_size_, index_t non_scan_size_, index_t scan_size_, index_t non_scan_stride_,
+ index_t scan_stride_, index_t panel_threads_, index_t group_threads_, index_t block_threads_,
+ index_t elements_per_group_, index_t elements_per_block_, index_t loop_range_)
+ : total_size(total_size_),
+ non_scan_size(non_scan_size_),
+ scan_size(scan_size_),
+ non_scan_stride(non_scan_stride_),
+ scan_stride(scan_stride_),
+ panel_threads(panel_threads_),
+ group_threads(group_threads_),
+ block_threads(block_threads_),
+ elements_per_group(elements_per_group_),
+ elements_per_block(elements_per_block_),
+ loop_range(loop_range_) {}
+};
+
+enum class scan_step { first, second };
+template <typename Evaluator, typename CoeffReturnType, typename OutAccessor, typename Op, typename Index,
+ scan_step stp>
+struct ScanKernelFunctor {
+ typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+ LocalAccessor;
+ static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2;
+
+ LocalAccessor scratch;
+ Evaluator dev_eval;
+ OutAccessor out_accessor;
+ OutAccessor temp_accessor;
+ const ScanParameters<Index> scanParameters;
+ Op accumulator;
+ const bool inclusive;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanKernelFunctor(LocalAccessor scratch_, const Evaluator dev_eval_,
+ OutAccessor out_accessor_, OutAccessor temp_accessor_,
+ const ScanParameters<Index> scanParameters_, Op accumulator_,
+ const bool inclusive_)
+ : scratch(scratch_),
+ dev_eval(dev_eval_),
+ out_accessor(out_accessor_),
+ temp_accessor(temp_accessor_),
+ scanParameters(scanParameters_),
+ accumulator(accumulator_),
+ inclusive(inclusive_) {}
+
+ template <scan_step sst = stp, typename Input>
+ typename ::Eigen::internal::enable_if<sst == scan_step::first, CoeffReturnType>::type EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE
+ read(const Input &inpt, Index global_id) {
+ return inpt.coeff(global_id);
+ }
+
+ template <scan_step sst = stp, typename Input>
+ typename ::Eigen::internal::enable_if<sst != scan_step::first, CoeffReturnType>::type EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE
+ read(const Input &inpt, Index global_id) {
+ return inpt[global_id];
+ }
+
+ template <scan_step sst = stp, typename InclusiveOp>
+ typename ::Eigen::internal::enable_if<sst == scan_step::first>::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ first_step_inclusive_Operation(InclusiveOp inclusive_op) {
+ inclusive_op();
+ }
+
+ template <scan_step sst = stp, typename InclusiveOp>
+ typename ::Eigen::internal::enable_if<sst != scan_step::first>::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ first_step_inclusive_Operation(InclusiveOp) {}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+ auto out_ptr = out_accessor.get_pointer();
+ auto tmp_ptr = temp_accessor.get_pointer();
+ auto scratch_ptr = scratch.get_pointer().get();
+
+ for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
+ Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
+ Index tmp = data_offset % scanParameters.panel_threads;
+ const Index panel_id = data_offset / scanParameters.panel_threads;
+ const Index group_id = tmp / scanParameters.group_threads;
+ tmp = tmp % scanParameters.group_threads;
+ const Index block_id = tmp / scanParameters.block_threads;
+ const Index local_id = tmp % scanParameters.block_threads;
+ // we put one element per packet in scratch_mem
+ const Index scratch_stride = scanParameters.elements_per_block / PacketSize;
+ const Index scratch_offset = (itemID.get_local_id(0) / scanParameters.block_threads) * scratch_stride;
+ CoeffReturnType private_scan[ScanParameters<Index>::ScanPerThread];
+ CoeffReturnType inclusive_scan;
+ // the actual panel size is scan_size * non_scan_size.
+ // elements_per_panel is roundup to power of 2 for binary tree
+ const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size;
+ const Index group_offset = group_id * scanParameters.non_scan_stride;
+ // This will be effective when the size is bigger than elements_per_block
+ const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride;
+ const Index thread_offset = (ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride);
+ const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
+ Index next_elements = 0;
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
+ Index global_id = global_offset + next_elements;
+ private_scan[i] = ((((block_id * scanParameters.elements_per_block) +
+ (ScanParameters<Index>::ScanPerThread * local_id) + i) < scanParameters.scan_size) &&
+ (global_id < scanParameters.total_size))
+ ? read(dev_eval, global_id)
+ : accumulator.initialize();
+ next_elements += scanParameters.scan_stride;
+ }
+ first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC {
+ if (inclusive) {
+ inclusive_scan = private_scan[ScanParameters<Index>::ScanPerThread - 1];
+ }
+ });
+ // This for loop must be 2
+ EIGEN_UNROLL_LOOP
+ for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
+ Index private_offset = 1;
+ // build sum in place up the tree
+ EIGEN_UNROLL_LOOP
+ for (Index d = PacketSize >> 1; d > 0; d >>= 1) {
+ EIGEN_UNROLL_LOOP
+ for (Index l = 0; l < d; l++) {
+ Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
+ Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
+ CoeffReturnType accum = accumulator.initialize();
+ accumulator.reduce(private_scan[ai], &accum);
+ accumulator.reduce(private_scan[bi], &accum);
+ private_scan[bi] = accumulator.finalize(accum);
+ }
+ private_offset *= 2;
+ }
+ scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset] =
+ private_scan[PacketSize - 1 + packetIndex];
+ private_scan[PacketSize - 1 + packetIndex] = accumulator.initialize();
+ // traverse down tree & build scan
+ EIGEN_UNROLL_LOOP
+ for (Index d = 1; d < PacketSize; d *= 2) {
+ private_offset >>= 1;
+ EIGEN_UNROLL_LOOP
+ for (Index l = 0; l < d; l++) {
+ Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
+ Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
+ CoeffReturnType accum = accumulator.initialize();
+ accumulator.reduce(private_scan[ai], &accum);
+ accumulator.reduce(private_scan[bi], &accum);
+ private_scan[ai] = private_scan[bi];
+ private_scan[bi] = accumulator.finalize(accum);
+ }
+ }
+ }
+
+ Index offset = 1;
+ // build sum in place up the tree
+ for (Index d = scratch_stride >> 1; d > 0; d >>= 1) {
+ // Synchronise
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+ if (local_id < d) {
+ Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
+ Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
+ CoeffReturnType accum = accumulator.initialize();
+ accumulator.reduce(scratch_ptr[ai], &accum);
+ accumulator.reduce(scratch_ptr[bi], &accum);
+ scratch_ptr[bi] = accumulator.finalize(accum);
+ }
+ offset *= 2;
+ }
+ // Synchronise
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+ // next step optimisation
+ if (local_id == 0) {
+ if (((scanParameters.elements_per_group / scanParameters.elements_per_block) > 1)) {
+ const Index temp_id = panel_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) *
+ scanParameters.non_scan_size +
+ group_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) +
+ block_id;
+ tmp_ptr[temp_id] = scratch_ptr[scratch_stride - 1 + scratch_offset];
+ }
+ // clear the last element
+ scratch_ptr[scratch_stride - 1 + scratch_offset] = accumulator.initialize();
+ }
+ // traverse down tree & build scan
+ for (Index d = 1; d < scratch_stride; d *= 2) {
+ offset >>= 1;
+ // Synchronise
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+ if (local_id < d) {
+ Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
+ Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
+ CoeffReturnType accum = accumulator.initialize();
+ accumulator.reduce(scratch_ptr[ai], &accum);
+ accumulator.reduce(scratch_ptr[bi], &accum);
+ scratch_ptr[ai] = scratch_ptr[bi];
+ scratch_ptr[bi] = accumulator.finalize(accum);
+ }
+ }
+ // Synchronise
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+ // This for loop must be 2
+ EIGEN_UNROLL_LOOP
+ for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
+ EIGEN_UNROLL_LOOP
+ for (Index i = 0; i < PacketSize; i++) {
+ CoeffReturnType accum = private_scan[packetIndex + i];
+ accumulator.reduce(scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset], &accum);
+ private_scan[packetIndex + i] = accumulator.finalize(accum);
+ }
+ }
+ first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC {
+ if (inclusive) {
+ accumulator.reduce(private_scan[ScanParameters<Index>::ScanPerThread - 1], &inclusive_scan);
+ private_scan[0] = accumulator.finalize(inclusive_scan);
+ }
+ });
+ next_elements = 0;
+ // right the first set of private param
+ EIGEN_UNROLL_LOOP
+ for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
+ Index global_id = global_offset + next_elements;
+ if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) <
+ scanParameters.scan_size) &&
+ (global_id < scanParameters.total_size)) {
+ Index private_id = (i * !inclusive) + (((i + 1) % ScanParameters<Index>::ScanPerThread) * (inclusive));
+ out_ptr[global_id] = private_scan[private_id];
+ }
+ next_elements += scanParameters.scan_stride;
+ }
+ } // end for loop
+ }
+};
+
+template <typename CoeffReturnType, typename InAccessor, typename OutAccessor, typename Op, typename Index>
+struct ScanAdjustmentKernelFunctor {
+ typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+ LocalAccessor;
+ static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2;
+ InAccessor in_accessor;
+ OutAccessor out_accessor;
+ const ScanParameters<Index> scanParameters;
+ Op accumulator;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanAdjustmentKernelFunctor(LocalAccessor, InAccessor in_accessor_,
+ OutAccessor out_accessor_,
+ const ScanParameters<Index> scanParameters_,
+ Op accumulator_)
+ : in_accessor(in_accessor_),
+ out_accessor(out_accessor_),
+ scanParameters(scanParameters_),
+ accumulator(accumulator_) {}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+ auto in_ptr = in_accessor.get_pointer();
+ auto out_ptr = out_accessor.get_pointer();
+
+ for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
+ Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
+ Index tmp = data_offset % scanParameters.panel_threads;
+ const Index panel_id = data_offset / scanParameters.panel_threads;
+ const Index group_id = tmp / scanParameters.group_threads;
+ tmp = tmp % scanParameters.group_threads;
+ const Index block_id = tmp / scanParameters.block_threads;
+ const Index local_id = tmp % scanParameters.block_threads;
+
+ // the actual panel size is scan_size * non_scan_size.
+ // elements_per_panel is roundup to power of 2 for binary tree
+ const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size;
+ const Index group_offset = group_id * scanParameters.non_scan_stride;
+ // This will be effective when the size is bigger than elements_per_block
+ const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride;
+ const Index thread_offset = ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride;
+
+ const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
+ const Index block_size = scanParameters.elements_per_group / scanParameters.elements_per_block;
+ const Index in_id = (panel_id * block_size * scanParameters.non_scan_size) + (group_id * block_size) + block_id;
+ CoeffReturnType adjust_val = in_ptr[in_id];
+
+ Index next_elements = 0;
+ EIGEN_UNROLL_LOOP
+ for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
+ Index global_id = global_offset + next_elements;
+ if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) <
+ scanParameters.scan_size) &&
+ (global_id < scanParameters.total_size)) {
+ CoeffReturnType accum = adjust_val;
+ accumulator.reduce(out_ptr[global_id], &accum);
+ out_ptr[global_id] = accumulator.finalize(accum);
+ }
+ next_elements += scanParameters.scan_stride;
+ }
+ }
+ }
+};
+
+template <typename Index>
+struct ScanInfo {
+ const Index &total_size;
+ const Index &scan_size;
+ const Index &panel_size;
+ const Index &non_scan_size;
+ const Index &scan_stride;
+ const Index &non_scan_stride;
+
+ Index max_elements_per_block;
+ Index block_size;
+ Index panel_threads;
+ Index group_threads;
+ Index block_threads;
+ Index elements_per_group;
+ Index elements_per_block;
+ Index loop_range;
+ Index global_range;
+ Index local_range;
+ const Eigen::SyclDevice &dev;
+ EIGEN_STRONG_INLINE ScanInfo(const Index &total_size_, const Index &scan_size_, const Index &panel_size_,
+ const Index &non_scan_size_, const Index &scan_stride_, const Index &non_scan_stride_,
+ const Eigen::SyclDevice &dev_)
+ : total_size(total_size_),
+ scan_size(scan_size_),
+ panel_size(panel_size_),
+ non_scan_size(non_scan_size_),
+ scan_stride(scan_stride_),
+ non_scan_stride(non_scan_stride_),
+ dev(dev_) {
+ // must be power of 2
+ local_range = std::min(Index(dev.getNearestPowerOfTwoWorkGroupSize()),
+ Index(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1));
+
+ max_elements_per_block = local_range * ScanParameters<Index>::ScanPerThread;
+
+ elements_per_group =
+ dev.getPowerOfTwo(Index(roundUp(Index(scan_size), ScanParameters<Index>::ScanPerThread)), true);
+ const Index elements_per_panel = elements_per_group * non_scan_size;
+ elements_per_block = std::min(Index(elements_per_group), Index(max_elements_per_block));
+ panel_threads = elements_per_panel / ScanParameters<Index>::ScanPerThread;
+ group_threads = elements_per_group / ScanParameters<Index>::ScanPerThread;
+ block_threads = elements_per_block / ScanParameters<Index>::ScanPerThread;
+ block_size = elements_per_group / elements_per_block;
+#ifdef EIGEN_SYCL_MAX_GLOBAL_RANGE
+ const Index max_threads = std::min(Index(panel_threads * panel_size), Index(EIGEN_SYCL_MAX_GLOBAL_RANGE));
+#else
+ const Index max_threads = panel_threads * panel_size;
+#endif
+ global_range = roundUp(max_threads, local_range);
+ loop_range = Index(
+ std::ceil(double(elements_per_panel * panel_size) / (global_range * ScanParameters<Index>::ScanPerThread)));
+ }
+ inline ScanParameters<Index> get_scan_parameter() {
+ return ScanParameters<Index>(total_size, non_scan_size, scan_size, non_scan_stride, scan_stride, panel_threads,
+ group_threads, block_threads, elements_per_group, elements_per_block, loop_range);
+ }
+ inline cl::sycl::nd_range<1> get_thread_range() {
+ return cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
+ }
+};
+
+template <typename EvaluatorPointerType, typename CoeffReturnType, typename Reducer, typename Index>
+struct SYCLAdjustBlockOffset {
+ EIGEN_STRONG_INLINE static void adjust_scan_block_offset(EvaluatorPointerType in_ptr, EvaluatorPointerType out_ptr,
+ Reducer &accumulator, const Index total_size,
+ const Index scan_size, const Index panel_size,
+ const Index non_scan_size, const Index scan_stride,
+ const Index non_scan_stride, const Eigen::SyclDevice &dev) {
+ auto scan_info =
+ ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);
+
+ typedef ScanAdjustmentKernelFunctor<CoeffReturnType, EvaluatorPointerType, EvaluatorPointerType, Reducer, Index>
+ AdjustFuctor;
+ dev.template unary_kernel_launcher<CoeffReturnType, AdjustFuctor>(in_ptr, out_ptr, scan_info.get_thread_range(),
+ scan_info.max_elements_per_block,
+ scan_info.get_scan_parameter(), accumulator);
+ }
+};
+
+template <typename CoeffReturnType, scan_step stp>
+struct ScanLauncher_impl {
+ template <typename Input, typename EvaluatorPointerType, typename Reducer, typename Index>
+ EIGEN_STRONG_INLINE static void scan_block(Input in_ptr, EvaluatorPointerType out_ptr, Reducer &accumulator,
+ const Index total_size, const Index scan_size, const Index panel_size,
+ const Index non_scan_size, const Index scan_stride,
+ const Index non_scan_stride, const bool inclusive,
+ const Eigen::SyclDevice &dev) {
+ auto scan_info =
+ ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);
+ const Index temp_pointer_size = scan_info.block_size * non_scan_size * panel_size;
+ const Index scratch_size = scan_info.max_elements_per_block / (ScanParameters<Index>::ScanPerThread / 2);
+ CoeffReturnType *temp_pointer =
+ static_cast<CoeffReturnType *>(dev.allocate_temp(temp_pointer_size * sizeof(CoeffReturnType)));
+ EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
+
+ typedef ScanKernelFunctor<Input, CoeffReturnType, EvaluatorPointerType, Reducer, Index, stp> ScanFunctor;
+ dev.template binary_kernel_launcher<CoeffReturnType, ScanFunctor>(
+ in_ptr, out_ptr, tmp_global_accessor, scan_info.get_thread_range(), scratch_size,
+ scan_info.get_scan_parameter(), accumulator, inclusive);
+
+ if (scan_info.block_size > 1) {
+ ScanLauncher_impl<CoeffReturnType, scan_step::second>::scan_block(
+ tmp_global_accessor, tmp_global_accessor, accumulator, temp_pointer_size, scan_info.block_size, panel_size,
+ non_scan_size, Index(1), scan_info.block_size, false, dev);
+
+ SYCLAdjustBlockOffset<EvaluatorPointerType, CoeffReturnType, Reducer, Index>::adjust_scan_block_offset(
+ tmp_global_accessor, out_ptr, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride,
+ non_scan_stride, dev);
+ }
+ dev.deallocate_temp(temp_pointer);
+ }
+};
+
+} // namespace internal
+} // namespace TensorSycl
+namespace internal {
+template <typename Self, typename Reducer, bool vectorize>
+struct ScanLauncher<Self, Reducer, Eigen::SyclDevice, vectorize> {
+ typedef typename Self::Index Index;
+ typedef typename Self::CoeffReturnType CoeffReturnType;
+ typedef typename Self::Storage Storage;
+ typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
+ void operator()(Self &self, EvaluatorPointerType data) {
+ const Index total_size = internal::array_prod(self.dimensions());
+ const Index scan_size = self.size();
+ const Index scan_stride = self.stride();
+ // this is the scan op (can be sum or ...)
+ auto accumulator = self.accumulator();
+ auto inclusive = !self.exclusive();
+ auto consume_dim = self.consume_dim();
+ auto dev = self.device();
+
+ auto dims = self.inner().dimensions();
+
+ Index non_scan_size = 1;
+ Index panel_size = 1;
+ if (static_cast<int>(Self::Layout) == static_cast<int>(ColMajor)) {
+ for (int i = 0; i < consume_dim; i++) {
+ non_scan_size *= dims[i];
+ }
+ for (int i = consume_dim + 1; i < Self::NumDims; i++) {
+ panel_size *= dims[i];
+ }
+ } else {
+ for (int i = Self::NumDims - 1; i > consume_dim; i--) {
+ non_scan_size *= dims[i];
+ }
+ for (int i = consume_dim - 1; i >= 0; i--) {
+ panel_size *= dims[i];
+ }
+ }
+ const Index non_scan_stride = (scan_stride > 1) ? 1 : scan_size;
+ auto eval_impl = self.inner();
+ TensorSycl::internal::ScanLauncher_impl<CoeffReturnType, TensorSycl::internal::scan_step::first>::scan_block(
+ eval_impl, data, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride,
+ inclusive, dev);
+ }
+};
+} // namespace internal
+} // namespace Eigen
+
+#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorShuffling.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorShuffling.h
new file mode 100644
index 0000000..e5e5efd
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorShuffling.h
@@ -0,0 +1,471 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H
+#define EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H
+
+namespace Eigen {
+
+/** \class TensorShuffling
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor shuffling class.
+ *
+ *
+ */
+namespace internal {
+template<typename Shuffle, typename XprType>
+struct traits<TensorShufflingOp<Shuffle, XprType> > : public traits<XprType>
+{
+ typedef typename XprType::Scalar Scalar;
+ typedef traits<XprType> XprTraits;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprTraits::Index Index;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = XprTraits::NumDimensions;
+ static const int Layout = XprTraits::Layout;
+ typedef typename XprTraits::PointerType PointerType;
+};
+
+template<typename Shuffle, typename XprType>
+struct eval<TensorShufflingOp<Shuffle, XprType>, Eigen::Dense>
+{
+ typedef const TensorShufflingOp<Shuffle, XprType>& type;
+};
+
+template<typename Shuffle, typename XprType>
+struct nested<TensorShufflingOp<Shuffle, XprType>, 1, typename eval<TensorShufflingOp<Shuffle, XprType> >::type>
+{
+ typedef TensorShufflingOp<Shuffle, XprType> type;
+};
+
+} // end namespace internal
+
+
+
+template<typename Shuffle, typename XprType>
+class TensorShufflingOp : public TensorBase<TensorShufflingOp<Shuffle, XprType> >
+{
+ public:
+ typedef TensorBase<TensorShufflingOp<Shuffle, XprType> > Base;
+ typedef typename Eigen::internal::traits<TensorShufflingOp>::Scalar Scalar;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename Eigen::internal::nested<TensorShufflingOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorShufflingOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorShufflingOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp(const XprType& expr, const Shuffle& shfl)
+ : m_xpr(expr), m_shuffle(shfl) {}
+
+ EIGEN_DEVICE_FUNC
+ const Shuffle& shufflePermutation() const { return m_shuffle; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ expression() const { return m_xpr; }
+
+ EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorShufflingOp)
+
+
+ protected:
+ typename XprType::Nested m_xpr;
+ const Shuffle m_shuffle;
+};
+
+
+// Eval as rvalue
+template<typename Shuffle, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
+{
+ typedef TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> Self;
+ typedef TensorShufflingOp<Shuffle, ArgType> XprType;
+ typedef typename XprType::Index Index;
+ static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+ typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+
+ enum {
+ IsAligned = false,
+ PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
+ BlockAccess = TensorEvaluator<ArgType, Device>::RawAccess,
+ PreferBlockAccess = true,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
+ };
+
+ typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+ typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
+ Layout, Index>
+ TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_device(device),
+ m_impl(op.expression(), device)
+ {
+ const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+ const Shuffle& shuffle = op.shufflePermutation();
+ m_is_identity = true;
+ for (int i = 0; i < NumDims; ++i) {
+ m_shuffle[i] = static_cast<int>(shuffle[i]);
+ m_dimensions[i] = input_dims[shuffle[i]];
+ m_inverseShuffle[shuffle[i]] = i;
+ if (m_is_identity && shuffle[i] != i) {
+ m_is_identity = false;
+ }
+ }
+
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ m_unshuffledInputStrides[0] = 1;
+ m_outputStrides[0] = 1;
+
+ for (int i = 1; i < NumDims; ++i) {
+ m_unshuffledInputStrides[i] =
+ m_unshuffledInputStrides[i - 1] * input_dims[i - 1];
+ m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
+ m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(
+ m_outputStrides[i] > 0 ? m_outputStrides[i] : Index(1));
+ }
+ } else {
+ m_unshuffledInputStrides[NumDims - 1] = 1;
+ m_outputStrides[NumDims - 1] = 1;
+ for (int i = NumDims - 2; i >= 0; --i) {
+ m_unshuffledInputStrides[i] =
+ m_unshuffledInputStrides[i + 1] * input_dims[i + 1];
+ m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
+ m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(
+ m_outputStrides[i] > 0 ? m_outputStrides[i] : Index(1));
+ }
+ }
+
+ for (int i = 0; i < NumDims; ++i) {
+ m_inputStrides[i] = m_unshuffledInputStrides[shuffle[i]];
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
+ m_impl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+
+#ifdef EIGEN_USE_THREADS
+ template <typename EvalSubExprsCallback>
+ EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
+ EvaluatorPointerType, EvalSubExprsCallback done) {
+ m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
+ }
+#endif // EIGEN_USE_THREADS
+
+ EIGEN_STRONG_INLINE void cleanup() {
+ m_impl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ if (m_is_identity) {
+ return m_impl.coeff(index);
+ } else {
+ return m_impl.coeff(srcCoeff(index));
+ }
+ }
+
+ template <int LoadMode, typename Self, bool ImplPacketAccess>
+ struct PacketLoader {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ static PacketReturnType Run(const Self& self, Index index) {
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < PacketSize; ++i) {
+ values[i] = self.coeff(index + i);
+ }
+ PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+ return rslt;
+ }
+ };
+
+ template<int LoadMode, typename Self>
+ struct PacketLoader<LoadMode, Self, true> {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ static PacketReturnType Run(const Self& self, Index index) {
+ if (self.m_is_identity) {
+ return self.m_impl.template packet<LoadMode>(index);
+ } else {
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < PacketSize; ++i) {
+ values[i] = self.coeff(index + i);
+ }
+ PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+ return rslt;
+ }
+ }
+ };
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ {
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+ return PacketLoader<LoadMode, Self, TensorEvaluator<ArgType, Device>::PacketAccess>::Run(*this, index);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ internal::TensorBlockResourceRequirements getResourceRequirements() const {
+ static const int inner_dim =
+ Layout == static_cast<int>(ColMajor) ? 0 : NumDims - 1;
+
+ const size_t target_size = m_device.firstLevelCacheSize();
+ const bool inner_dim_shuffled = m_shuffle[inner_dim] != inner_dim;
+
+ // Shuffled inner dimensions leads to a random memory access, which is not
+ // captured by default cost model bytes loaded/stored. We add this cost
+ // explicitly. The number of cycles picked based on the benchmarks.
+ // TODO(ezhulenev): This number was picked based on a very questionable
+ // benchmarks, add benchmarks that are representative of real workloads.
+ using BlockRequirements = internal::TensorBlockResourceRequirements;
+ if (inner_dim_shuffled) {
+ return BlockRequirements::uniform<Scalar>(target_size)
+ .addCostPerCoeff({0, 0, NumDims * 28});
+ } else {
+ return BlockRequirements::skewed<Scalar>(target_size);
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
+ block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+ bool root_of_expr_ast = false) const {
+ assert(m_impl.data() != NULL);
+
+ typedef internal::TensorBlockIO<ScalarNoConst, Index, NumDims, Layout>
+ TensorBlockIO;
+ typedef typename TensorBlockIO::Dst TensorBlockIODst;
+ typedef typename TensorBlockIO::Src TensorBlockIOSrc;
+
+ const typename TensorBlock::Storage block_storage =
+ TensorBlock::prepareStorage(
+ desc, scratch, /*allow_strided_storage=*/root_of_expr_ast);
+
+ typename TensorBlockIO::Dimensions input_strides(m_unshuffledInputStrides);
+ TensorBlockIOSrc src(input_strides, m_impl.data(), srcCoeff(desc.offset()));
+
+ TensorBlockIODst dst(block_storage.dimensions(), block_storage.strides(),
+ block_storage.data());
+
+ typename TensorBlockIO::DimensionsMap dst_to_src_dim_map(m_shuffle);
+ TensorBlockIO::Copy(dst, src, dst_to_src_dim_map);
+
+ return block_storage.AsTensorMaterializedBlock();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ const double compute_cost = m_is_identity ? TensorOpCost::AddCost<Index>() :
+ NumDims * (2 * TensorOpCost::AddCost<Index>() +
+ 2 * TensorOpCost::MulCost<Index>() +
+ TensorOpCost::DivCost<Index>());
+ return m_impl.costPerCoeff(vectorized) +
+ TensorOpCost(0, 0, compute_cost, m_is_identity /* vectorized */, PacketSize);
+ }
+
+ EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_impl.bind(cgh);
+ }
+#endif
+ protected:
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index GetBlockOutputIndex(
+ Index input_index,
+ const DSizes<Index, NumDims>& input_block_strides,
+ const DSizes<Index, NumDims>& output_block_strides,
+ const DSizes<internal::TensorIntDivisor<Index>, NumDims>& fast_input_block_strides) const {
+ Index output_index = 0;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = NumDims - 1; i > 0; --i) {
+ const Index idx = input_index / fast_input_block_strides[i];
+ output_index += idx * output_block_strides[m_inverseShuffle[i]];
+ input_index -= idx * input_block_strides[i];
+ }
+ return output_index + input_index *
+ output_block_strides[m_inverseShuffle[0]];
+ } else {
+ for (int i = 0; i < NumDims - 1; ++i) {
+ const Index idx = input_index / fast_input_block_strides[i];
+ output_index += idx * output_block_strides[m_inverseShuffle[i]];
+ input_index -= idx * input_block_strides[i];
+ }
+ return output_index + input_index *
+ output_block_strides[m_inverseShuffle[NumDims - 1]];
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const {
+ Index inputIndex = 0;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = NumDims - 1; i > 0; --i) {
+ const Index idx = index / m_fastOutputStrides[i];
+ inputIndex += idx * m_inputStrides[i];
+ index -= idx * m_outputStrides[i];
+ }
+ return inputIndex + index * m_inputStrides[0];
+ } else {
+ for (int i = 0; i < NumDims - 1; ++i) {
+ const Index idx = index / m_fastOutputStrides[i];
+ inputIndex += idx * m_inputStrides[i];
+ index -= idx * m_outputStrides[i];
+ }
+ return inputIndex + index * m_inputStrides[NumDims - 1];
+ }
+ }
+
+ Dimensions m_dimensions;
+ bool m_is_identity;
+ array<int, NumDims> m_shuffle;
+ array<Index, NumDims> m_inverseShuffle; // TODO(ezhulenev): Make it int type.
+ array<Index, NumDims> m_outputStrides;
+ array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
+ array<Index, NumDims> m_inputStrides;
+ array<Index, NumDims> m_unshuffledInputStrides;
+
+ const Device EIGEN_DEVICE_REF m_device;
+ TensorEvaluator<ArgType, Device> m_impl;
+};
+
+
+// Eval as lvalue
+template<typename Shuffle, typename ArgType, typename Device>
+struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
+ : public TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
+{
+ typedef TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> Base;
+
+ typedef TensorShufflingOp<Shuffle, ArgType> XprType;
+ typedef typename XprType::Index Index;
+ static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+ typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+
+ enum {
+ IsAligned = false,
+ PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
+ BlockAccess = TensorEvaluator<ArgType, Device>::RawAccess,
+ PreferBlockAccess = true,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ RawAccess = false
+ };
+
+ typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : Base(op, device)
+ { }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
+ {
+ return this->m_impl.coeffRef(this->srcCoeff(index));
+ }
+
+ template <int StoreMode> EIGEN_STRONG_INLINE
+ void writePacket(Index index, const PacketReturnType& x)
+ {
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < PacketSize; ++i) {
+ this->coeffRef(index+i) = values[i];
+ }
+ }
+
+ template <typename TensorBlock>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
+ const TensorBlockDesc& desc, const TensorBlock& block) {
+ eigen_assert(this->m_impl.data() != NULL);
+
+ typedef internal::TensorBlockIO<ScalarNoConst, Index, NumDims, Layout>
+ TensorBlockIO;
+ typedef typename TensorBlockIO::Dst TensorBlockIODst;
+ typedef typename TensorBlockIO::Src TensorBlockIOSrc;
+
+ const Scalar* block_buffer = block.data();
+
+ // TODO(ezhulenev): TensorBlockIO should be able to read from any Eigen
+ // expression with coefficient and packet access as `src`.
+ void* mem = NULL;
+ if (block_buffer == NULL) {
+ mem = this->m_device.allocate(desc.size() * sizeof(Scalar));
+ ScalarNoConst* buf = static_cast<ScalarNoConst*>(mem);
+
+ typedef internal::TensorBlockAssignment<
+ ScalarNoConst, NumDims, typename TensorBlock::XprType, Index>
+ TensorBlockAssignment;
+
+ TensorBlockAssignment::Run(
+ TensorBlockAssignment::target(
+ desc.dimensions(), internal::strides<Layout>(desc.dimensions()),
+ buf),
+ block.expr());
+
+ block_buffer = buf;
+ }
+
+ // Read from block.
+ TensorBlockIOSrc src(internal::strides<Layout>(desc.dimensions()),
+ block_buffer);
+
+ // Write to the output buffer.
+ typename TensorBlockIO::Dimensions output_strides(
+ this->m_unshuffledInputStrides);
+ typename TensorBlockIO::Dimensions output_dimensions;
+ for (int i = 0; i < NumDims; ++i) {
+ output_dimensions[this->m_shuffle[i]] = desc.dimension(i);
+ }
+ TensorBlockIODst dst(output_dimensions, output_strides, this->m_impl.data(),
+ this->srcCoeff(desc.offset()));
+
+ // Reorder dimensions according to the shuffle.
+ typename TensorBlockIO::DimensionsMap dst_to_src_dim_map;
+ for (int i = 0; i < NumDims; ++i) {
+ dst_to_src_dim_map[i] = static_cast<int>(this->m_inverseShuffle[i]);
+ }
+ TensorBlockIO::Copy(dst, src, dst_to_src_dim_map);
+
+ // Deallocate temporary buffer used for the block materialization.
+ if (mem != NULL) this->m_device.deallocate(mem);
+ }
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorStorage.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorStorage.h
new file mode 100644
index 0000000..5ff0880
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorStorage.h
@@ -0,0 +1,161 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+// Copyright (C) 2014-2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSORSTORAGE_H
+#define EIGEN_CXX11_TENSOR_TENSORSTORAGE_H
+
+#ifdef EIGEN_TENSOR_STORAGE_CTOR_PLUGIN
+ #define EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN EIGEN_TENSOR_STORAGE_CTOR_PLUGIN;
+#else
+ #define EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN
+#endif
+
+namespace Eigen {
+
+/** \internal
+ *
+ * \class TensorStorage
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Stores the data of a tensor
+ *
+ * This class stores the data of fixed-size, dynamic-size or mixed tensors
+ * in a way as compact as possible.
+ *
+ * \sa Tensor
+ */
+template<typename T, typename Dimensions, int Options> class TensorStorage;
+
+
+// Pure fixed-size storage
+template<typename T, typename FixedDimensions, int Options_>
+class TensorStorage
+{
+ private:
+ static const std::size_t Size = FixedDimensions::total_size;
+
+ // Allocate an array of size at least one to prevent compiler warnings.
+ static const std::size_t MinSize = max_n_1<Size>::size;
+ EIGEN_ALIGN_MAX T m_data[MinSize];
+
+ public:
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE TensorStorage() {
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE T *data() { return m_data; }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const T *data() const { return m_data; }
+
+ static EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const FixedDimensions& dimensions()
+ {
+ static const FixedDimensions* singleton_dimensions = new FixedDimensions();
+ return *singleton_dimensions;
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE DenseIndex size() const { return Size; }
+};
+
+// pure dynamic
+template<typename T, typename IndexType, int NumIndices_, int Options_>
+class TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_>
+{
+ public:
+ typedef IndexType Index;
+ typedef DSizes<IndexType, NumIndices_> Dimensions;
+ typedef TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_> Self;
+
+ EIGEN_DEVICE_FUNC TensorStorage() : m_data(0), m_dimensions() {
+ if (NumIndices_ == 0) {
+ m_data = internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(1);
+ }
+ }
+ EIGEN_DEVICE_FUNC TensorStorage(internal::constructor_without_unaligned_array_assert)
+ : m_data(0), m_dimensions(internal::template repeat<NumIndices_, Index>(0)) {}
+ EIGEN_DEVICE_FUNC TensorStorage(Index size, const array<Index, NumIndices_>& dimensions)
+ : m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(size)), m_dimensions(dimensions)
+ { EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+ template <typename... DenseIndex>
+ EIGEN_DEVICE_FUNC TensorStorage(DenseIndex... indices) : m_dimensions(indices...) {
+ m_data = internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(internal::array_prod(m_dimensions));
+ }
+#endif
+
+ EIGEN_DEVICE_FUNC TensorStorage(const Self& other)
+ : m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(internal::array_prod(other.m_dimensions)))
+ , m_dimensions(other.m_dimensions)
+ {
+ internal::smart_copy(other.m_data, other.m_data+internal::array_prod(other.m_dimensions), m_data);
+ }
+ EIGEN_DEVICE_FUNC Self& operator=(const Self& other)
+ {
+ if (this != &other) {
+ Self tmp(other);
+ this->swap(tmp);
+ }
+ return *this;
+ }
+
+#if EIGEN_HAS_RVALUE_REFERENCES
+ EIGEN_DEVICE_FUNC TensorStorage(Self&& other) : TensorStorage()
+ {
+ *this = std::move(other);
+ }
+
+ EIGEN_DEVICE_FUNC Self& operator=(Self&& other)
+ {
+ numext::swap(m_data, other.m_data);
+ numext::swap(m_dimensions, other.m_dimensions);
+ return *this;
+ }
+#endif
+
+ EIGEN_DEVICE_FUNC ~TensorStorage() { internal::conditional_aligned_delete_auto<T,(Options_&DontAlign)==0>(m_data, internal::array_prod(m_dimensions)); }
+ EIGEN_DEVICE_FUNC void swap(Self& other)
+ { numext::swap(m_data,other.m_data); numext::swap(m_dimensions,other.m_dimensions); }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {return m_dimensions;}
+
+ EIGEN_DEVICE_FUNC void resize(Index size, const array<Index, NumIndices_>& nbDimensions)
+ {
+ const Index currentSz = internal::array_prod(m_dimensions);
+ if(size != currentSz)
+ {
+ internal::conditional_aligned_delete_auto<T,(Options_&DontAlign)==0>(m_data, currentSz);
+ if (size)
+ m_data = internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(size);
+ else if (NumIndices_ == 0) {
+ m_data = internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(1);
+ }
+ else
+ m_data = 0;
+ EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
+ }
+ m_dimensions = nbDimensions;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T *data() { return m_data; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T *data() const { return m_data; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); }
+
+ private:
+ T *m_data;
+ Dimensions m_dimensions;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSORSTORAGE_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorStriding.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorStriding.h
new file mode 100644
index 0000000..2f62a66
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorStriding.h
@@ -0,0 +1,346 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H
+#define EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H
+
+namespace Eigen {
+
+/** \class TensorStriding
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor striding class.
+ *
+ *
+ */
+namespace internal {
+template<typename Strides, typename XprType>
+struct traits<TensorStridingOp<Strides, XprType> > : public traits<XprType>
+{
+ typedef typename XprType::Scalar Scalar;
+ typedef traits<XprType> XprTraits;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprTraits::Index Index;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = XprTraits::NumDimensions;
+ static const int Layout = XprTraits::Layout;
+ typedef typename XprTraits::PointerType PointerType;
+};
+
+template<typename Strides, typename XprType>
+struct eval<TensorStridingOp<Strides, XprType>, Eigen::Dense>
+{
+ typedef const TensorStridingOp<Strides, XprType>EIGEN_DEVICE_REF type;
+};
+
+template<typename Strides, typename XprType>
+struct nested<TensorStridingOp<Strides, XprType>, 1, typename eval<TensorStridingOp<Strides, XprType> >::type>
+{
+ typedef TensorStridingOp<Strides, XprType> type;
+};
+
+} // end namespace internal
+
+
+
+template<typename Strides, typename XprType>
+class TensorStridingOp : public TensorBase<TensorStridingOp<Strides, XprType> >
+{
+ public:
+ typedef TensorBase<TensorStridingOp<Strides, XprType> > Base;
+ typedef typename Eigen::internal::traits<TensorStridingOp>::Scalar Scalar;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename Eigen::internal::nested<TensorStridingOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorStridingOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorStridingOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingOp(const XprType& expr, const Strides& dims)
+ : m_xpr(expr), m_dims(dims) {}
+
+ EIGEN_DEVICE_FUNC
+ const Strides& strides() const { return m_dims; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ expression() const { return m_xpr; }
+
+ EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorStridingOp)
+
+ protected:
+ typename XprType::Nested m_xpr;
+ const Strides m_dims;
+};
+
+
+// Eval as rvalue
+template<typename Strides, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
+{
+ typedef TensorStridingOp<Strides, ArgType> XprType;
+ typedef typename XprType::Index Index;
+ static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+ typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+
+ enum {
+ IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
+ PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
+ };
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_impl(op.expression(), device)
+ {
+ m_dimensions = m_impl.dimensions();
+ for (int i = 0; i < NumDims; ++i) {
+ m_dimensions[i] =Eigen::numext::ceil(static_cast<float>(m_dimensions[i]) / op.strides()[i]);
+ }
+
+ const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ m_outputStrides[0] = 1;
+ m_inputStrides[0] = 1;
+ for (int i = 1; i < NumDims; ++i) {
+ m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
+ m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
+ m_inputStrides[i-1] *= op.strides()[i-1];
+ }
+ m_inputStrides[NumDims-1] *= op.strides()[NumDims-1];
+ } else { // RowMajor
+ m_outputStrides[NumDims-1] = 1;
+ m_inputStrides[NumDims-1] = 1;
+ for (int i = NumDims - 2; i >= 0; --i) {
+ m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
+ m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
+ m_inputStrides[i+1] *= op.strides()[i+1];
+ }
+ m_inputStrides[0] *= op.strides()[0];
+ }
+ }
+
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType/*data*/) {
+ m_impl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+ EIGEN_STRONG_INLINE void cleanup() {
+ m_impl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ return m_impl.coeff(srcCoeff(index));
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ {
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+ Index inputIndices[] = {0, 0};
+ Index indices[] = {index, index + PacketSize - 1};
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ EIGEN_UNROLL_LOOP
+ for (int i = NumDims - 1; i > 0; --i) {
+ const Index idx0 = indices[0] / m_outputStrides[i];
+ const Index idx1 = indices[1] / m_outputStrides[i];
+ inputIndices[0] += idx0 * m_inputStrides[i];
+ inputIndices[1] += idx1 * m_inputStrides[i];
+ indices[0] -= idx0 * m_outputStrides[i];
+ indices[1] -= idx1 * m_outputStrides[i];
+ }
+ inputIndices[0] += indices[0] * m_inputStrides[0];
+ inputIndices[1] += indices[1] * m_inputStrides[0];
+ } else { // RowMajor
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < NumDims - 1; ++i) {
+ const Index idx0 = indices[0] / m_outputStrides[i];
+ const Index idx1 = indices[1] / m_outputStrides[i];
+ inputIndices[0] += idx0 * m_inputStrides[i];
+ inputIndices[1] += idx1 * m_inputStrides[i];
+ indices[0] -= idx0 * m_outputStrides[i];
+ indices[1] -= idx1 * m_outputStrides[i];
+ }
+ inputIndices[0] += indices[0] * m_inputStrides[NumDims-1];
+ inputIndices[1] += indices[1] * m_inputStrides[NumDims-1];
+ }
+ if (inputIndices[1] - inputIndices[0] == PacketSize - 1) {
+ PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]);
+ return rslt;
+ }
+ else {
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ values[0] = m_impl.coeff(inputIndices[0]);
+ values[PacketSize-1] = m_impl.coeff(inputIndices[1]);
+ EIGEN_UNROLL_LOOP
+ for (int i = 1; i < PacketSize-1; ++i) {
+ values[i] = coeff(index+i);
+ }
+ PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+ return rslt;
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+ double compute_cost = (NumDims - 1) * (TensorOpCost::AddCost<Index>() +
+ TensorOpCost::MulCost<Index>() +
+ TensorOpCost::DivCost<Index>()) +
+ TensorOpCost::MulCost<Index>();
+ if (vectorized) {
+ compute_cost *= 2; // packet() computes two indices
+ }
+ const int innerDim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : (NumDims - 1);
+ return m_impl.costPerCoeff(vectorized && m_inputStrides[innerDim] == 1) +
+ // Computation is not vectorized per se, but it is done once per packet.
+ TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
+ }
+
+ EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_impl.bind(cgh);
+ }
+#endif
+ protected:
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
+ {
+ Index inputIndex = 0;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ EIGEN_UNROLL_LOOP
+ for (int i = NumDims - 1; i > 0; --i) {
+ const Index idx = index / m_outputStrides[i];
+ inputIndex += idx * m_inputStrides[i];
+ index -= idx * m_outputStrides[i];
+ }
+ inputIndex += index * m_inputStrides[0];
+ } else { // RowMajor
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < NumDims - 1; ++i) {
+ const Index idx = index / m_outputStrides[i];
+ inputIndex += idx * m_inputStrides[i];
+ index -= idx * m_outputStrides[i];
+ }
+ inputIndex += index * m_inputStrides[NumDims-1];
+ }
+ return inputIndex;
+ }
+
+ Dimensions m_dimensions;
+ array<Index, NumDims> m_outputStrides;
+ array<Index, NumDims> m_inputStrides;
+ TensorEvaluator<ArgType, Device> m_impl;
+};
+
+// Eval as lvalue
+template<typename Strides, typename ArgType, typename Device>
+struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
+ : public TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
+{
+ typedef TensorStridingOp<Strides, ArgType> XprType;
+ typedef TensorEvaluator<const XprType, Device> Base;
+ // typedef typename XprType::Index Index;
+ static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+ // typedef DSizes<Index, NumDims> Dimensions;
+
+ enum {
+ IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ PreferBlockAccess = false,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false, // to be implemented
+ RawAccess = false
+ };
+
+ EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : Base(op, device) { }
+
+ typedef typename XprType::Index Index;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
+ {
+ return this->m_impl.coeffRef(this->srcCoeff(index));
+ }
+
+ template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ void writePacket(Index index, const PacketReturnType& x)
+ {
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < this->dimensions().TotalSize());
+
+ Index inputIndices[] = {0, 0};
+ Index indices[] = {index, index + PacketSize - 1};
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ EIGEN_UNROLL_LOOP
+ for (int i = NumDims - 1; i > 0; --i) {
+ const Index idx0 = indices[0] / this->m_outputStrides[i];
+ const Index idx1 = indices[1] / this->m_outputStrides[i];
+ inputIndices[0] += idx0 * this->m_inputStrides[i];
+ inputIndices[1] += idx1 * this->m_inputStrides[i];
+ indices[0] -= idx0 * this->m_outputStrides[i];
+ indices[1] -= idx1 * this->m_outputStrides[i];
+ }
+ inputIndices[0] += indices[0] * this->m_inputStrides[0];
+ inputIndices[1] += indices[1] * this->m_inputStrides[0];
+ } else { // RowMajor
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < NumDims - 1; ++i) {
+ const Index idx0 = indices[0] / this->m_outputStrides[i];
+ const Index idx1 = indices[1] / this->m_outputStrides[i];
+ inputIndices[0] += idx0 * this->m_inputStrides[i];
+ inputIndices[1] += idx1 * this->m_inputStrides[i];
+ indices[0] -= idx0 * this->m_outputStrides[i];
+ indices[1] -= idx1 * this->m_outputStrides[i];
+ }
+ inputIndices[0] += indices[0] * this->m_inputStrides[NumDims-1];
+ inputIndices[1] += indices[1] * this->m_inputStrides[NumDims-1];
+ }
+ if (inputIndices[1] - inputIndices[0] == PacketSize - 1) {
+ this->m_impl.template writePacket<Unaligned>(inputIndices[0], x);
+ }
+ else {
+ EIGEN_ALIGN_MAX Scalar values[PacketSize];
+ internal::pstore<Scalar, PacketReturnType>(values, x);
+ this->m_impl.coeffRef(inputIndices[0]) = values[0];
+ this->m_impl.coeffRef(inputIndices[1]) = values[PacketSize-1];
+ EIGEN_UNROLL_LOOP
+ for (int i = 1; i < PacketSize-1; ++i) {
+ this->coeffRef(index+i) = values[i];
+ }
+ }
+ }
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorTrace.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorTrace.h
new file mode 100644
index 0000000..926ecdd
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorTrace.h
@@ -0,0 +1,303 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gagan Goel <gagan.nith@gmail.com>
+// Copyright (C) 2017 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_TRACE_H
+#define EIGEN_CXX11_TENSOR_TENSOR_TRACE_H
+
+namespace Eigen {
+
+/** \class TensorTrace
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor Trace class.
+ *
+ *
+ */
+
+namespace internal {
+template<typename Dims, typename XprType>
+struct traits<TensorTraceOp<Dims, XprType> > : public traits<XprType>
+{
+ typedef typename XprType::Scalar Scalar;
+ typedef traits<XprType> XprTraits;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprTraits::Index Index;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
+ static const int Layout = XprTraits::Layout;
+};
+
+template<typename Dims, typename XprType>
+struct eval<TensorTraceOp<Dims, XprType>, Eigen::Dense>
+{
+ typedef const TensorTraceOp<Dims, XprType>& type;
+};
+
+template<typename Dims, typename XprType>
+struct nested<TensorTraceOp<Dims, XprType>, 1, typename eval<TensorTraceOp<Dims, XprType> >::type>
+{
+ typedef TensorTraceOp<Dims, XprType> type;
+};
+
+} // end namespace internal
+
+
+template<typename Dims, typename XprType>
+class TensorTraceOp : public TensorBase<TensorTraceOp<Dims, XprType> >
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorTraceOp>::Scalar Scalar;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename Eigen::internal::nested<TensorTraceOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorTraceOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorTraceOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTraceOp(const XprType& expr, const Dims& dims)
+ : m_xpr(expr), m_dims(dims) {
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const Dims& dims() const { return m_dims; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const typename internal::remove_all<typename XprType::Nested>::type& expression() const { return m_xpr; }
+
+ protected:
+ typename XprType::Nested m_xpr;
+ const Dims m_dims;
+};
+
+
+// Eval as rvalue
+template<typename Dims, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
+{
+ typedef TensorTraceOp<Dims, ArgType> XprType;
+ static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+ static const int NumReducedDims = internal::array_size<Dims>::value;
+ static const int NumOutputDims = NumInputDims - NumReducedDims;
+ typedef typename XprType::Index Index;
+ typedef DSizes<Index, NumOutputDims> Dimensions;
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+
+ enum {
+ IsAligned = false,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
+ PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false,
+ RawAccess = false
+ };
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+ : m_impl(op.expression(), device), m_traceDim(1), m_device(device)
+ {
+
+ EIGEN_STATIC_ASSERT((NumOutputDims >= 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ EIGEN_STATIC_ASSERT((NumReducedDims >= 2) || ((NumReducedDims == 0) && (NumInputDims == 0)), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+ for (int i = 0; i < NumInputDims; ++i) {
+ m_reduced[i] = false;
+ }
+
+ const Dims& op_dims = op.dims();
+ for (int i = 0; i < NumReducedDims; ++i) {
+ eigen_assert(op_dims[i] >= 0);
+ eigen_assert(op_dims[i] < NumInputDims);
+ m_reduced[op_dims[i]] = true;
+ }
+
+ // All the dimensions should be distinct to compute the trace
+ int num_distinct_reduce_dims = 0;
+ for (int i = 0; i < NumInputDims; ++i) {
+ if (m_reduced[i]) {
+ ++num_distinct_reduce_dims;
+ }
+ }
+
+ eigen_assert(num_distinct_reduce_dims == NumReducedDims);
+
+ // Compute the dimensions of the result.
+ const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+
+ int output_index = 0;
+ int reduced_index = 0;
+ for (int i = 0; i < NumInputDims; ++i) {
+ if (m_reduced[i]) {
+ m_reducedDims[reduced_index] = input_dims[i];
+ if (reduced_index > 0) {
+ // All the trace dimensions must have the same size
+ eigen_assert(m_reducedDims[0] == m_reducedDims[reduced_index]);
+ }
+ ++reduced_index;
+ }
+ else {
+ m_dimensions[output_index] = input_dims[i];
+ ++output_index;
+ }
+ }
+
+ if (NumReducedDims != 0) {
+ m_traceDim = m_reducedDims[0];
+ }
+
+ // Compute the output strides
+ if (NumOutputDims > 0) {
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ m_outputStrides[0] = 1;
+ for (int i = 1; i < NumOutputDims; ++i) {
+ m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
+ }
+ }
+ else {
+ m_outputStrides.back() = 1;
+ for (int i = NumOutputDims - 2; i >= 0; --i) {
+ m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
+ }
+ }
+ }
+
+ // Compute the input strides
+ if (NumInputDims > 0) {
+ array<Index, NumInputDims> input_strides;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ input_strides[0] = 1;
+ for (int i = 1; i < NumInputDims; ++i) {
+ input_strides[i] = input_strides[i - 1] * input_dims[i - 1];
+ }
+ }
+ else {
+ input_strides.back() = 1;
+ for (int i = NumInputDims - 2; i >= 0; --i) {
+ input_strides[i] = input_strides[i + 1] * input_dims[i + 1];
+ }
+ }
+
+ output_index = 0;
+ reduced_index = 0;
+ for (int i = 0; i < NumInputDims; ++i) {
+ if(m_reduced[i]) {
+ m_reducedStrides[reduced_index] = input_strides[i];
+ ++reduced_index;
+ }
+ else {
+ m_preservedStrides[output_index] = input_strides[i];
+ ++output_index;
+ }
+ }
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
+ return m_dimensions;
+ }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
+ m_impl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+
+ EIGEN_STRONG_INLINE void cleanup() {
+ m_impl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ // Initialize the result
+ CoeffReturnType result = internal::cast<int, CoeffReturnType>(0);
+ Index index_stride = 0;
+ for (int i = 0; i < NumReducedDims; ++i) {
+ index_stride += m_reducedStrides[i];
+ }
+
+ // If trace is requested along all dimensions, starting index would be 0
+ Index cur_index = 0;
+ if (NumOutputDims != 0)
+ cur_index = firstInput(index);
+ for (Index i = 0; i < m_traceDim; ++i) {
+ result += m_impl.coeff(cur_index);
+ cur_index += index_stride;
+ }
+
+ return result;
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ for (int i = 0; i < PacketSize; ++i) {
+ values[i] = coeff(index + i);
+ }
+ PacketReturnType result = internal::ploadt<PacketReturnType, LoadMode>(values);
+ return result;
+ }
+
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_impl.bind(cgh);
+ }
+#endif
+
+ protected:
+ // Given the output index, finds the first index in the input tensor used to compute the trace
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
+ Index startInput = 0;
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ for (int i = NumOutputDims - 1; i > 0; --i) {
+ const Index idx = index / m_outputStrides[i];
+ startInput += idx * m_preservedStrides[i];
+ index -= idx * m_outputStrides[i];
+ }
+ startInput += index * m_preservedStrides[0];
+ }
+ else {
+ for (int i = 0; i < NumOutputDims - 1; ++i) {
+ const Index idx = index / m_outputStrides[i];
+ startInput += idx * m_preservedStrides[i];
+ index -= idx * m_outputStrides[i];
+ }
+ startInput += index * m_preservedStrides[NumOutputDims - 1];
+ }
+ return startInput;
+ }
+
+ Dimensions m_dimensions;
+ TensorEvaluator<ArgType, Device> m_impl;
+ // Initialize the size of the trace dimension
+ Index m_traceDim;
+ const Device EIGEN_DEVICE_REF m_device;
+ array<bool, NumInputDims> m_reduced;
+ array<Index, NumReducedDims> m_reducedDims;
+ array<Index, NumOutputDims> m_outputStrides;
+ array<Index, NumReducedDims> m_reducedStrides;
+ array<Index, NumOutputDims> m_preservedStrides;
+};
+
+
+} // End namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_TRACE_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorTraits.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorTraits.h
new file mode 100644
index 0000000..4f7fd34
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorTraits.h
@@ -0,0 +1,264 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H
+#define EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H
+
+namespace Eigen {
+namespace internal {
+
+
+template<typename Scalar, int Options>
+class compute_tensor_flags
+{
+ enum {
+ is_dynamic_size_storage = 1,
+
+ is_aligned =
+ (
+ ((Options&DontAlign)==0) && (
+#if EIGEN_MAX_STATIC_ALIGN_BYTES>0
+ (!is_dynamic_size_storage)
+#else
+ 0
+#endif
+ |
+#if EIGEN_MAX_ALIGN_BYTES>0
+ is_dynamic_size_storage
+#else
+ 0
+#endif
+ )
+ ),
+ packet_access_bit = packet_traits<Scalar>::Vectorizable && is_aligned ? PacketAccessBit : 0
+ };
+
+ public:
+ enum { ret = packet_access_bit };
+};
+
+
+template<typename Scalar_, int NumIndices_, int Options_, typename IndexType_>
+struct traits<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
+{
+ typedef Scalar_ Scalar;
+ typedef Dense StorageKind;
+ typedef IndexType_ Index;
+ static const int NumDimensions = NumIndices_;
+ static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor;
+ enum {
+ Options = Options_,
+ Flags = compute_tensor_flags<Scalar_, Options_>::ret | (is_const<Scalar_>::value ? 0 : LvalueBit)
+ };
+ template <typename T> struct MakePointer {
+ typedef T* Type;
+ };
+ typedef typename MakePointer<Scalar>::Type PointerType;
+};
+
+
+template<typename Scalar_, typename Dimensions, int Options_, typename IndexType_>
+struct traits<TensorFixedSize<Scalar_, Dimensions, Options_, IndexType_> >
+{
+ typedef Scalar_ Scalar;
+ typedef Dense StorageKind;
+ typedef IndexType_ Index;
+ static const int NumDimensions = array_size<Dimensions>::value;
+ static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor;
+ enum {
+ Options = Options_,
+ Flags = compute_tensor_flags<Scalar_, Options_>::ret | (is_const<Scalar_>::value ? 0: LvalueBit)
+ };
+ template <typename T> struct MakePointer {
+ typedef T* Type;
+ };
+ typedef typename MakePointer<Scalar>::Type PointerType;
+};
+
+
+template<typename PlainObjectType, int Options_, template <class> class MakePointer_>
+struct traits<TensorMap<PlainObjectType, Options_, MakePointer_> >
+ : public traits<PlainObjectType>
+{
+ typedef traits<PlainObjectType> BaseTraits;
+ typedef typename BaseTraits::Scalar Scalar;
+ typedef typename BaseTraits::StorageKind StorageKind;
+ typedef typename BaseTraits::Index Index;
+ static const int NumDimensions = BaseTraits::NumDimensions;
+ static const int Layout = BaseTraits::Layout;
+ enum {
+ Options = Options_,
+ Flags = BaseTraits::Flags
+ };
+ template <class T> struct MakePointer {
+ // Intermediate typedef to workaround MSVC issue.
+ typedef MakePointer_<T> MakePointerT;
+ typedef typename MakePointerT::Type Type;
+ };
+ typedef typename MakePointer<Scalar>::Type PointerType;
+};
+
+template<typename PlainObjectType>
+struct traits<TensorRef<PlainObjectType> >
+ : public traits<PlainObjectType>
+{
+ typedef traits<PlainObjectType> BaseTraits;
+ typedef typename BaseTraits::Scalar Scalar;
+ typedef typename BaseTraits::StorageKind StorageKind;
+ typedef typename BaseTraits::Index Index;
+ static const int NumDimensions = BaseTraits::NumDimensions;
+ static const int Layout = BaseTraits::Layout;
+ enum {
+ Options = BaseTraits::Options,
+ Flags = BaseTraits::Flags
+ };
+ typedef typename BaseTraits::PointerType PointerType;
+};
+
+
+template<typename _Scalar, int NumIndices_, int Options, typename IndexType_>
+struct eval<Tensor<_Scalar, NumIndices_, Options, IndexType_>, Eigen::Dense>
+{
+ typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>EIGEN_DEVICE_REF type;
+};
+
+template<typename _Scalar, int NumIndices_, int Options, typename IndexType_>
+struct eval<const Tensor<_Scalar, NumIndices_, Options, IndexType_>, Eigen::Dense>
+{
+ typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>EIGEN_DEVICE_REF type;
+};
+
+template<typename Scalar_, typename Dimensions, int Options, typename IndexType_>
+struct eval<TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>, Eigen::Dense>
+{
+ typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type;
+};
+
+template<typename Scalar_, typename Dimensions, int Options, typename IndexType_>
+struct eval<const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>, Eigen::Dense>
+{
+ typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type;
+};
+
+template<typename PlainObjectType, int Options, template <class> class MakePointer>
+struct eval<TensorMap<PlainObjectType, Options, MakePointer>, Eigen::Dense>
+{
+ typedef const TensorMap<PlainObjectType, Options, MakePointer>EIGEN_DEVICE_REF type;
+};
+
+template<typename PlainObjectType, int Options, template <class> class MakePointer>
+struct eval<const TensorMap<PlainObjectType, Options, MakePointer>, Eigen::Dense>
+{
+ typedef const TensorMap<PlainObjectType, Options, MakePointer>EIGEN_DEVICE_REF type;
+};
+
+template<typename PlainObjectType>
+struct eval<TensorRef<PlainObjectType>, Eigen::Dense>
+{
+ typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type;
+};
+
+template<typename PlainObjectType>
+struct eval<const TensorRef<PlainObjectType>, Eigen::Dense>
+{
+ typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type;
+};
+
+// TODO nested<> does not exist anymore in Eigen/Core, and it thus has to be removed in favor of ref_selector.
+template<typename T, int n=1, typename PlainObject = void> struct nested
+{
+ typedef typename ref_selector<T>::type type;
+};
+
+template <typename Scalar_, int NumIndices_, int Options_, typename IndexType_>
+struct nested<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
+{
+ typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>EIGEN_DEVICE_REF type;
+};
+
+template <typename Scalar_, int NumIndices_, int Options_, typename IndexType_>
+struct nested<const Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
+{
+ typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>EIGEN_DEVICE_REF type;
+};
+
+template <typename Scalar_, typename Dimensions, int Options, typename IndexType_>
+struct nested<TensorFixedSize<Scalar_, Dimensions, Options, IndexType_> >
+{
+ typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type;
+};
+
+template <typename Scalar_, typename Dimensions, int Options, typename IndexType_>
+struct nested<const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_> >
+{
+ typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type;
+};
+
+
+template <typename PlainObjectType>
+struct nested<TensorRef<PlainObjectType> >
+{
+ typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type;
+};
+
+template <typename PlainObjectType>
+struct nested<const TensorRef<PlainObjectType> >
+{
+ typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type;
+};
+
+} // end namespace internal
+
+// Convolutional layers take in an input tensor of shape (D, R, C, B), or (D, C,
+// R, B), and convolve it with a set of filters, which can also be presented as
+// a tensor (D, K, K, M), where M is the number of filters, K is the filter
+// size, and each 3-dimensional tensor of size (D, K, K) is a filter. For
+// simplicity we assume that we always use square filters (which is usually the
+// case in images), hence the two Ks in the tensor dimension. It also takes in
+// a few additional parameters:
+// Stride (S): The convolution stride is the offset between locations where we
+// apply the filters. A larger stride means that the output will be
+// spatially smaller.
+// Padding (P): The padding we apply to the input tensor along the R and C
+// dimensions. This is usually used to make sure that the spatial
+// dimensions of the output matches our intention.
+//
+// Two types of padding are often used:
+// SAME: The pad value is computed so that the output will have size
+// R/S and C/S.
+// VALID: no padding is carried out.
+// When we do padding, the padded values at the padded locations are usually
+// zero.
+//
+// The output dimensions for convolution, when given all the parameters above,
+// are as follows:
+// When Padding = SAME: the output size is (B, R', C', M), where
+// R' = ceil(float(R) / float(S))
+// C' = ceil(float(C) / float(S))
+// where ceil is the ceiling function. The input tensor is padded with 0 as
+// needed. The number of padded rows and columns are computed as:
+// Pr = ((R' - 1) * S + K - R) / 2
+// Pc = ((C' - 1) * S + K - C) / 2
+// when the stride is 1, we have the simplified case R'=R, C'=C, Pr=Pc=(K-1)/2.
+// This is where SAME comes from - the output has the same size as the input has.
+// When Padding = VALID: the output size is computed as
+// R' = ceil(float(R - K + 1) / float(S))
+// C' = ceil(float(C - K + 1) / float(S))
+// and the number of padded rows and columns are computed in the same way as in
+// the SAME case.
+// When the stride is 1, we have the simplified case R'=R-K+1, C'=C-K+1, Pr=0,
+// Pc=0.
+typedef enum {
+ PADDING_VALID = 1,
+ PADDING_SAME = 2
+} PaddingType;
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorUInt128.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorUInt128.h
new file mode 100644
index 0000000..d23f2e4
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorUInt128.h
@@ -0,0 +1,249 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_UINT128_H
+#define EIGEN_CXX11_TENSOR_TENSOR_UINT128_H
+
+namespace Eigen {
+namespace internal {
+
+
+template <uint64_t n>
+struct static_val {
+ static const uint64_t value = n;
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE operator uint64_t() const { return n; }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val() { }
+
+ template <typename T>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val(const T& v) {
+ EIGEN_UNUSED_VARIABLE(v);
+ eigen_assert(v == n);
+ }
+};
+
+
+template <typename HIGH = uint64_t, typename LOW = uint64_t>
+struct TensorUInt128
+{
+ HIGH high;
+ LOW low;
+
+ template<typename OTHER_HIGH, typename OTHER_LOW>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+ TensorUInt128(const TensorUInt128<OTHER_HIGH, OTHER_LOW>& other) : high(other.high), low(other.low) {
+ EIGEN_STATIC_ASSERT(sizeof(OTHER_HIGH) <= sizeof(HIGH), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ EIGEN_STATIC_ASSERT(sizeof(OTHER_LOW) <= sizeof(LOW), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ }
+
+ template<typename OTHER_HIGH, typename OTHER_LOW>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+ TensorUInt128& operator = (const TensorUInt128<OTHER_HIGH, OTHER_LOW>& other) {
+ EIGEN_STATIC_ASSERT(sizeof(OTHER_HIGH) <= sizeof(HIGH), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ EIGEN_STATIC_ASSERT(sizeof(OTHER_LOW) <= sizeof(LOW), YOU_MADE_A_PROGRAMMING_MISTAKE);
+ high = other.high;
+ low = other.low;
+ return *this;
+ }
+
+ template<typename T>
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+ explicit TensorUInt128(const T& x) : high(0), low(x) {
+ eigen_assert((static_cast<typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type>(x) <= NumTraits<uint64_t>::highest()));
+ eigen_assert(x >= 0);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+ TensorUInt128(HIGH y, LOW x) : high(y), low(x) { }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE operator LOW() const {
+ return low;
+ }
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LOW lower() const {
+ return low;
+ }
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HIGH upper() const {
+ return high;
+ }
+};
+
+
+template <typename HL, typename LL, typename HR, typename LR>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+bool operator == (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+{
+ return (lhs.high == rhs.high) & (lhs.low == rhs.low);
+}
+
+template <typename HL, typename LL, typename HR, typename LR>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+bool operator != (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+{
+ return (lhs.high != rhs.high) | (lhs.low != rhs.low);
+}
+
+template <typename HL, typename LL, typename HR, typename LR>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+bool operator >= (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+{
+ if (lhs.high != rhs.high) {
+ return lhs.high > rhs.high;
+ }
+ return lhs.low >= rhs.low;
+}
+
+template <typename HL, typename LL, typename HR, typename LR>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+bool operator < (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+{
+ if (lhs.high != rhs.high) {
+ return lhs.high < rhs.high;
+ }
+ return lhs.low < rhs.low;
+}
+
+template <typename HL, typename LL, typename HR, typename LR>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+TensorUInt128<uint64_t, uint64_t> operator + (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+{
+ TensorUInt128<uint64_t, uint64_t> result(lhs.high + rhs.high, lhs.low + rhs.low);
+ if (result.low < rhs.low) {
+ result.high += 1;
+ }
+ return result;
+}
+
+template <typename HL, typename LL, typename HR, typename LR>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+TensorUInt128<uint64_t, uint64_t> operator - (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+{
+ TensorUInt128<uint64_t, uint64_t> result(lhs.high - rhs.high, lhs.low - rhs.low);
+ if (result.low > lhs.low) {
+ result.high -= 1;
+ }
+ return result;
+}
+
+
+template <typename HL, typename LL, typename HR, typename LR>
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+TensorUInt128<uint64_t, uint64_t> operator * (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+{
+ // Split each 128-bit integer into 4 32-bit integers, and then do the
+ // multiplications by hand as follow:
+ // lhs a b c d
+ // rhs e f g h
+ // -----------
+ // ah bh ch dh
+ // bg cg dg
+ // cf df
+ // de
+ // The result is stored in 2 64bit integers, high and low.
+
+ const uint64_t LOW = 0x00000000FFFFFFFFLL;
+ const uint64_t HIGH = 0xFFFFFFFF00000000LL;
+
+ uint64_t d = lhs.low & LOW;
+ uint64_t c = (lhs.low & HIGH) >> 32LL;
+ uint64_t b = lhs.high & LOW;
+ uint64_t a = (lhs.high & HIGH) >> 32LL;
+
+ uint64_t h = rhs.low & LOW;
+ uint64_t g = (rhs.low & HIGH) >> 32LL;
+ uint64_t f = rhs.high & LOW;
+ uint64_t e = (rhs.high & HIGH) >> 32LL;
+
+ // Compute the low 32 bits of low
+ uint64_t acc = d * h;
+ uint64_t low = acc & LOW;
+ // Compute the high 32 bits of low. Add a carry every time we wrap around
+ acc >>= 32LL;
+ uint64_t carry = 0;
+ uint64_t acc2 = acc + c * h;
+ if (acc2 < acc) {
+ carry++;
+ }
+ acc = acc2 + d * g;
+ if (acc < acc2) {
+ carry++;
+ }
+ low |= (acc << 32LL);
+
+ // Carry forward the high bits of acc to initiate the computation of the
+ // low 32 bits of high
+ acc2 = (acc >> 32LL) | (carry << 32LL);
+ carry = 0;
+
+ acc = acc2 + b * h;
+ if (acc < acc2) {
+ carry++;
+ }
+ acc2 = acc + c * g;
+ if (acc2 < acc) {
+ carry++;
+ }
+ acc = acc2 + d * f;
+ if (acc < acc2) {
+ carry++;
+ }
+ uint64_t high = acc & LOW;
+
+ // Start to compute the high 32 bits of high.
+ acc2 = (acc >> 32LL) | (carry << 32LL);
+
+ acc = acc2 + a * h;
+ acc2 = acc + b * g;
+ acc = acc2 + c * f;
+ acc2 = acc + d * e;
+ high |= (acc2 << 32LL);
+
+ return TensorUInt128<uint64_t, uint64_t>(high, low);
+}
+
+template <typename HL, typename LL, typename HR, typename LR>
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+TensorUInt128<uint64_t, uint64_t> operator / (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+{
+ if (rhs == TensorUInt128<static_val<0>, static_val<1> >(1)) {
+ return TensorUInt128<uint64_t, uint64_t>(lhs.high, lhs.low);
+ } else if (lhs < rhs) {
+ return TensorUInt128<uint64_t, uint64_t>(0);
+ } else {
+ // calculate the biggest power of 2 times rhs that's less than or equal to lhs
+ TensorUInt128<uint64_t, uint64_t> power2(1);
+ TensorUInt128<uint64_t, uint64_t> d(rhs);
+ TensorUInt128<uint64_t, uint64_t> tmp(lhs - d);
+ while (lhs >= d) {
+ tmp = tmp - d;
+ d = d + d;
+ power2 = power2 + power2;
+ }
+
+ tmp = TensorUInt128<uint64_t, uint64_t>(lhs.high, lhs.low);
+ TensorUInt128<uint64_t, uint64_t> result(0);
+ while (power2 != TensorUInt128<static_val<0>, static_val<0> >(0)) {
+ if (tmp >= d) {
+ tmp = tmp - d;
+ result = result + power2;
+ }
+ // Shift right
+ power2 = TensorUInt128<uint64_t, uint64_t>(power2.high >> 1, (power2.low >> 1) | (power2.high << 63));
+ d = TensorUInt128<uint64_t, uint64_t>(d.high >> 1, (d.low >> 1) | (d.high << 63));
+ }
+
+ return result;
+ }
+}
+
+
+} // namespace internal
+} // namespace Eigen
+
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_UINT128_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorVolumePatch.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorVolumePatch.h
new file mode 100644
index 0000000..0beb9ff
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorVolumePatch.h
@@ -0,0 +1,629 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H
+#define EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H
+
+namespace Eigen {
+
+/** \class TensorVolumePatch
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Patch extraction specialized for processing of volumetric data.
+ * This assumes that the input has a least 4 dimensions ordered as follows:
+ * - channels
+ * - planes
+ * - rows
+ * - columns
+ * - (optional) additional dimensions such as time or batch size.
+ * Calling the volume patch code with patch_planes, patch_rows, and patch_cols
+ * is equivalent to calling the regular patch extraction code with parameters
+ * d, patch_planes, patch_rows, patch_cols, and 1 for all the additional
+ * dimensions.
+ */
+namespace internal {
+
+template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>
+struct traits<TensorVolumePatchOp<Planes, Rows, Cols, XprType> > : public traits<XprType>
+{
+ typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+ typedef traits<XprType> XprTraits;
+ typedef typename XprTraits::StorageKind StorageKind;
+ typedef typename XprTraits::Index Index;
+ typedef typename XprType::Nested Nested;
+ typedef typename remove_reference<Nested>::type _Nested;
+ static const int NumDimensions = XprTraits::NumDimensions + 1;
+ static const int Layout = XprTraits::Layout;
+ typedef typename XprTraits::PointerType PointerType;
+
+};
+
+template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>
+struct eval<TensorVolumePatchOp<Planes, Rows, Cols, XprType>, Eigen::Dense>
+{
+ typedef const TensorVolumePatchOp<Planes, Rows, Cols, XprType>& type;
+};
+
+template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>
+struct nested<TensorVolumePatchOp<Planes, Rows, Cols, XprType>, 1, typename eval<TensorVolumePatchOp<Planes, Rows, Cols, XprType> >::type>
+{
+ typedef TensorVolumePatchOp<Planes, Rows, Cols, XprType> type;
+};
+
+} // end namespace internal
+
+template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>
+class TensorVolumePatchOp : public TensorBase<TensorVolumePatchOp<Planes, Rows, Cols, XprType>, ReadOnlyAccessors>
+{
+ public:
+ typedef typename Eigen::internal::traits<TensorVolumePatchOp>::Scalar Scalar;
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename Eigen::internal::nested<TensorVolumePatchOp>::type Nested;
+ typedef typename Eigen::internal::traits<TensorVolumePatchOp>::StorageKind StorageKind;
+ typedef typename Eigen::internal::traits<TensorVolumePatchOp>::Index Index;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorVolumePatchOp(const XprType& expr, DenseIndex patch_planes, DenseIndex patch_rows, DenseIndex patch_cols,
+ DenseIndex plane_strides, DenseIndex row_strides, DenseIndex col_strides,
+ DenseIndex in_plane_strides, DenseIndex in_row_strides, DenseIndex in_col_strides,
+ DenseIndex plane_inflate_strides, DenseIndex row_inflate_strides, DenseIndex col_inflate_strides,
+ PaddingType padding_type, Scalar padding_value)
+ : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
+ m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides),
+ m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
+ m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
+ m_padding_explicit(false), m_padding_top_z(0), m_padding_bottom_z(0), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0),
+ m_padding_type(padding_type), m_padding_value(padding_value) {}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorVolumePatchOp(const XprType& expr, DenseIndex patch_planes, DenseIndex patch_rows, DenseIndex patch_cols,
+ DenseIndex plane_strides, DenseIndex row_strides, DenseIndex col_strides,
+ DenseIndex in_plane_strides, DenseIndex in_row_strides, DenseIndex in_col_strides,
+ DenseIndex plane_inflate_strides, DenseIndex row_inflate_strides, DenseIndex col_inflate_strides,
+ DenseIndex padding_top_z, DenseIndex padding_bottom_z,
+ DenseIndex padding_top, DenseIndex padding_bottom,
+ DenseIndex padding_left, DenseIndex padding_right,
+ Scalar padding_value)
+ : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
+ m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides),
+ m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
+ m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
+ m_padding_explicit(true), m_padding_top_z(padding_top_z), m_padding_bottom_z(padding_bottom_z), m_padding_top(padding_top), m_padding_bottom(padding_bottom),
+ m_padding_left(padding_left), m_padding_right(padding_right),
+ m_padding_type(PADDING_VALID), m_padding_value(padding_value) {}
+
+ EIGEN_DEVICE_FUNC
+ DenseIndex patch_planes() const { return m_patch_planes; }
+ EIGEN_DEVICE_FUNC
+ DenseIndex patch_rows() const { return m_patch_rows; }
+ EIGEN_DEVICE_FUNC
+ DenseIndex patch_cols() const { return m_patch_cols; }
+ EIGEN_DEVICE_FUNC
+ DenseIndex plane_strides() const { return m_plane_strides; }
+ EIGEN_DEVICE_FUNC
+ DenseIndex row_strides() const { return m_row_strides; }
+ EIGEN_DEVICE_FUNC
+ DenseIndex col_strides() const { return m_col_strides; }
+ EIGEN_DEVICE_FUNC
+ DenseIndex in_plane_strides() const { return m_in_plane_strides; }
+ EIGEN_DEVICE_FUNC
+ DenseIndex in_row_strides() const { return m_in_row_strides; }
+ EIGEN_DEVICE_FUNC
+ DenseIndex in_col_strides() const { return m_in_col_strides; }
+ EIGEN_DEVICE_FUNC
+ DenseIndex plane_inflate_strides() const { return m_plane_inflate_strides; }
+ EIGEN_DEVICE_FUNC
+ DenseIndex row_inflate_strides() const { return m_row_inflate_strides; }
+ EIGEN_DEVICE_FUNC
+ DenseIndex col_inflate_strides() const { return m_col_inflate_strides; }
+ EIGEN_DEVICE_FUNC
+ bool padding_explicit() const { return m_padding_explicit; }
+ EIGEN_DEVICE_FUNC
+ DenseIndex padding_top_z() const { return m_padding_top_z; }
+ EIGEN_DEVICE_FUNC
+ DenseIndex padding_bottom_z() const { return m_padding_bottom_z; }
+ EIGEN_DEVICE_FUNC
+ DenseIndex padding_top() const { return m_padding_top; }
+ EIGEN_DEVICE_FUNC
+ DenseIndex padding_bottom() const { return m_padding_bottom; }
+ EIGEN_DEVICE_FUNC
+ DenseIndex padding_left() const { return m_padding_left; }
+ EIGEN_DEVICE_FUNC
+ DenseIndex padding_right() const { return m_padding_right; }
+ EIGEN_DEVICE_FUNC
+ PaddingType padding_type() const { return m_padding_type; }
+ EIGEN_DEVICE_FUNC
+ Scalar padding_value() const { return m_padding_value; }
+
+ EIGEN_DEVICE_FUNC
+ const typename internal::remove_all<typename XprType::Nested>::type&
+ expression() const { return m_xpr; }
+
+ protected:
+ typename XprType::Nested m_xpr;
+ const DenseIndex m_patch_planes;
+ const DenseIndex m_patch_rows;
+ const DenseIndex m_patch_cols;
+ const DenseIndex m_plane_strides;
+ const DenseIndex m_row_strides;
+ const DenseIndex m_col_strides;
+ const DenseIndex m_in_plane_strides;
+ const DenseIndex m_in_row_strides;
+ const DenseIndex m_in_col_strides;
+ const DenseIndex m_plane_inflate_strides;
+ const DenseIndex m_row_inflate_strides;
+ const DenseIndex m_col_inflate_strides;
+ const bool m_padding_explicit;
+ const DenseIndex m_padding_top_z;
+ const DenseIndex m_padding_bottom_z;
+ const DenseIndex m_padding_top;
+ const DenseIndex m_padding_bottom;
+ const DenseIndex m_padding_left;
+ const DenseIndex m_padding_right;
+ const PaddingType m_padding_type;
+ const Scalar m_padding_value;
+};
+
+
+// Eval as rvalue
+template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, Device>
+{
+ typedef TensorVolumePatchOp<Planes, Rows, Cols, ArgType> XprType;
+ typedef typename XprType::Index Index;
+ static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+ static const int NumDims = NumInputDims + 1;
+ typedef DSizes<Index, NumDims> Dimensions;
+ typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+
+ enum {
+ IsAligned = false,
+ PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+ BlockAccess = false,
+ PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+ Layout = TensorEvaluator<ArgType, Device>::Layout,
+ CoordAccess = false,
+ RawAccess = false
+ };
+
+ //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+ typedef internal::TensorBlockNotImplemented TensorBlock;
+ //===--------------------------------------------------------------------===//
+
+ EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) :
+ m_impl(op.expression(), device)
+ {
+ EIGEN_STATIC_ASSERT((NumDims >= 5), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+ m_paddingValue = op.padding_value();
+
+ const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+
+ // Cache a few variables.
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ m_inputDepth = input_dims[0];
+ m_inputPlanes = input_dims[1];
+ m_inputRows = input_dims[2];
+ m_inputCols = input_dims[3];
+ } else {
+ m_inputDepth = input_dims[NumInputDims-1];
+ m_inputPlanes = input_dims[NumInputDims-2];
+ m_inputRows = input_dims[NumInputDims-3];
+ m_inputCols = input_dims[NumInputDims-4];
+ }
+
+ m_plane_strides = op.plane_strides();
+ m_row_strides = op.row_strides();
+ m_col_strides = op.col_strides();
+
+ // Input strides and effective input/patch size
+ m_in_plane_strides = op.in_plane_strides();
+ m_in_row_strides = op.in_row_strides();
+ m_in_col_strides = op.in_col_strides();
+ m_plane_inflate_strides = op.plane_inflate_strides();
+ m_row_inflate_strides = op.row_inflate_strides();
+ m_col_inflate_strides = op.col_inflate_strides();
+
+ // The "effective" spatial size after inflating data with zeros.
+ m_input_planes_eff = (m_inputPlanes - 1) * m_plane_inflate_strides + 1;
+ m_input_rows_eff = (m_inputRows - 1) * m_row_inflate_strides + 1;
+ m_input_cols_eff = (m_inputCols - 1) * m_col_inflate_strides + 1;
+ m_patch_planes_eff = op.patch_planes() + (op.patch_planes() - 1) * (m_in_plane_strides - 1);
+ m_patch_rows_eff = op.patch_rows() + (op.patch_rows() - 1) * (m_in_row_strides - 1);
+ m_patch_cols_eff = op.patch_cols() + (op.patch_cols() - 1) * (m_in_col_strides - 1);
+
+ if (op.padding_explicit()) {
+ m_outputPlanes = numext::ceil((m_input_planes_eff + op.padding_top_z() + op.padding_bottom_z() - m_patch_planes_eff + 1.f) / static_cast<float>(m_plane_strides));
+ m_outputRows = numext::ceil((m_input_rows_eff + op.padding_top() + op.padding_bottom() - m_patch_rows_eff + 1.f) / static_cast<float>(m_row_strides));
+ m_outputCols = numext::ceil((m_input_cols_eff + op.padding_left() + op.padding_right() - m_patch_cols_eff + 1.f) / static_cast<float>(m_col_strides));
+ m_planePaddingTop = op.padding_top_z();
+ m_rowPaddingTop = op.padding_top();
+ m_colPaddingLeft = op.padding_left();
+ } else {
+ // Computing padding from the type
+ switch (op.padding_type()) {
+ case PADDING_VALID:
+ m_outputPlanes = numext::ceil((m_input_planes_eff - m_patch_planes_eff + 1.f) / static_cast<float>(m_plane_strides));
+ m_outputRows = numext::ceil((m_input_rows_eff - m_patch_rows_eff + 1.f) / static_cast<float>(m_row_strides));
+ m_outputCols = numext::ceil((m_input_cols_eff - m_patch_cols_eff + 1.f) / static_cast<float>(m_col_strides));
+ m_planePaddingTop = 0;
+ m_rowPaddingTop = 0;
+ m_colPaddingLeft = 0;
+ break;
+ case PADDING_SAME: {
+ m_outputPlanes = numext::ceil(m_input_planes_eff / static_cast<float>(m_plane_strides));
+ m_outputRows = numext::ceil(m_input_rows_eff / static_cast<float>(m_row_strides));
+ m_outputCols = numext::ceil(m_input_cols_eff / static_cast<float>(m_col_strides));
+ const Index dz = (m_outputPlanes - 1) * m_plane_strides + m_patch_planes_eff - m_input_planes_eff;
+ const Index dy = (m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff;
+ const Index dx = (m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff;
+ m_planePaddingTop = dz / 2;
+ m_rowPaddingTop = dy / 2;
+ m_colPaddingLeft = dx / 2;
+ break;
+ }
+ default:
+ eigen_assert(false && "unexpected padding");
+ }
+ }
+ eigen_assert(m_outputRows > 0);
+ eigen_assert(m_outputCols > 0);
+ eigen_assert(m_outputPlanes > 0);
+
+ // Dimensions for result of extraction.
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ // ColMajor
+ // 0: depth
+ // 1: patch_planes
+ // 2: patch_rows
+ // 3: patch_cols
+ // 4: number of patches
+ // 5 and beyond: anything else (such as batch).
+ m_dimensions[0] = input_dims[0];
+ m_dimensions[1] = op.patch_planes();
+ m_dimensions[2] = op.patch_rows();
+ m_dimensions[3] = op.patch_cols();
+ m_dimensions[4] = m_outputPlanes * m_outputRows * m_outputCols;
+ for (int i = 5; i < NumDims; ++i) {
+ m_dimensions[i] = input_dims[i-1];
+ }
+ } else {
+ // RowMajor
+ // NumDims-1: depth
+ // NumDims-2: patch_planes
+ // NumDims-3: patch_rows
+ // NumDims-4: patch_cols
+ // NumDims-5: number of patches
+ // NumDims-6 and beyond: anything else (such as batch).
+ m_dimensions[NumDims-1] = input_dims[NumInputDims-1];
+ m_dimensions[NumDims-2] = op.patch_planes();
+ m_dimensions[NumDims-3] = op.patch_rows();
+ m_dimensions[NumDims-4] = op.patch_cols();
+ m_dimensions[NumDims-5] = m_outputPlanes * m_outputRows * m_outputCols;
+ for (int i = NumDims-6; i >= 0; --i) {
+ m_dimensions[i] = input_dims[i];
+ }
+ }
+
+ // Strides for the output tensor.
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ m_rowStride = m_dimensions[1];
+ m_colStride = m_dimensions[2] * m_rowStride;
+ m_patchStride = m_colStride * m_dimensions[3] * m_dimensions[0];
+ m_otherStride = m_patchStride * m_dimensions[4];
+ } else {
+ m_rowStride = m_dimensions[NumDims-2];
+ m_colStride = m_dimensions[NumDims-3] * m_rowStride;
+ m_patchStride = m_colStride * m_dimensions[NumDims-4] * m_dimensions[NumDims-1];
+ m_otherStride = m_patchStride * m_dimensions[NumDims-5];
+ }
+
+ // Strides for navigating through the input tensor.
+ m_planeInputStride = m_inputDepth;
+ m_rowInputStride = m_inputDepth * m_inputPlanes;
+ m_colInputStride = m_inputDepth * m_inputRows * m_inputPlanes;
+ m_otherInputStride = m_inputDepth * m_inputRows * m_inputCols * m_inputPlanes;
+
+ m_outputPlanesRows = m_outputPlanes * m_outputRows;
+
+ // Fast representations of different variables.
+ m_fastOtherStride = internal::TensorIntDivisor<Index>(m_otherStride);
+
+ m_fastPatchStride = internal::TensorIntDivisor<Index>(m_patchStride);
+ m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride);
+ m_fastRowStride = internal::TensorIntDivisor<Index>(m_rowStride);
+ m_fastInputRowStride = internal::TensorIntDivisor<Index>(m_row_inflate_strides);
+ m_fastInputColStride = internal::TensorIntDivisor<Index>(m_col_inflate_strides);
+ m_fastInputPlaneStride = internal::TensorIntDivisor<Index>(m_plane_inflate_strides);
+ m_fastInputColsEff = internal::TensorIntDivisor<Index>(m_input_cols_eff);
+ m_fastOutputPlanes = internal::TensorIntDivisor<Index>(m_outputPlanes);
+ m_fastOutputPlanesRows = internal::TensorIntDivisor<Index>(m_outputPlanesRows);
+
+ if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+ m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[0]);
+ } else {
+ m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[NumDims-1]);
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+ EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
+ m_impl.evalSubExprsIfNeeded(NULL);
+ return true;
+ }
+
+ EIGEN_STRONG_INLINE void cleanup() {
+ m_impl.cleanup();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+ {
+ // Patch index corresponding to the passed in index.
+ const Index patchIndex = index / m_fastPatchStride;
+
+ // Spatial offset within the patch. This has to be translated into 3D
+ // coordinates within the patch.
+ const Index patchOffset = (index - patchIndex * m_patchStride) / m_fastOutputDepth;
+
+ // Batch, etc.
+ const Index otherIndex = (NumDims == 5) ? 0 : index / m_fastOtherStride;
+ const Index patch3DIndex = (NumDims == 5) ? patchIndex : (index - otherIndex * m_otherStride) / m_fastPatchStride;
+
+ // Calculate column index in the input original tensor.
+ const Index colIndex = patch3DIndex / m_fastOutputPlanesRows;
+ const Index colOffset = patchOffset / m_fastColStride;
+ const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft;
+ const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0);
+ if (inputCol < 0 || inputCol >= m_input_cols_eff ||
+ ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) {
+ return Scalar(m_paddingValue);
+ }
+
+ // Calculate row index in the original input tensor.
+ const Index rowIndex = (patch3DIndex - colIndex * m_outputPlanesRows) / m_fastOutputPlanes;
+ const Index rowOffset = (patchOffset - colOffset * m_colStride) / m_fastRowStride;
+ const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop;
+ const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0);
+ if (inputRow < 0 || inputRow >= m_input_rows_eff ||
+ ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) {
+ return Scalar(m_paddingValue);
+ }
+
+ // Calculate plane index in the original input tensor.
+ const Index planeIndex = (patch3DIndex - m_outputPlanes * (colIndex * m_outputRows + rowIndex));
+ const Index planeOffset = patchOffset - colOffset * m_colStride - rowOffset * m_rowStride;
+ const Index inputPlane = planeIndex * m_plane_strides + planeOffset * m_in_plane_strides - m_planePaddingTop;
+ const Index origInputPlane = (m_plane_inflate_strides == 1) ? inputPlane : ((inputPlane >= 0) ? (inputPlane / m_fastInputPlaneStride) : 0);
+ if (inputPlane < 0 || inputPlane >= m_input_planes_eff ||
+ ((m_plane_inflate_strides != 1) && (inputPlane != origInputPlane * m_plane_inflate_strides))) {
+ return Scalar(m_paddingValue);
+ }
+
+ const int depth_index = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - 1;
+ const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index];
+
+ const Index inputIndex = depth +
+ origInputRow * m_rowInputStride +
+ origInputCol * m_colInputStride +
+ origInputPlane * m_planeInputStride +
+ otherIndex * m_otherInputStride;
+
+ return m_impl.coeff(inputIndex);
+ }
+
+ template<int LoadMode>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+ {
+ EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+ eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+ if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1 ||
+ m_in_plane_strides != 1 || m_plane_inflate_strides != 1) {
+ return packetWithPossibleZero(index);
+ }
+
+ const Index indices[2] = {index, index + PacketSize - 1};
+ const Index patchIndex = indices[0] / m_fastPatchStride;
+ if (patchIndex != indices[1] / m_fastPatchStride) {
+ return packetWithPossibleZero(index);
+ }
+ const Index otherIndex = (NumDims == 5) ? 0 : indices[0] / m_fastOtherStride;
+ eigen_assert(otherIndex == indices[1] / m_fastOtherStride);
+
+ // Find the offset of the element wrt the location of the first element.
+ const Index patchOffsets[2] = {(indices[0] - patchIndex * m_patchStride) / m_fastOutputDepth,
+ (indices[1] - patchIndex * m_patchStride) / m_fastOutputDepth};
+
+ const Index patch3DIndex = (NumDims == 5) ? patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride;
+ eigen_assert(patch3DIndex == (indices[1] - otherIndex * m_otherStride) / m_fastPatchStride);
+
+ const Index colIndex = patch3DIndex / m_fastOutputPlanesRows;
+ const Index colOffsets[2] = {
+ patchOffsets[0] / m_fastColStride,
+ patchOffsets[1] / m_fastColStride};
+
+ // Calculate col indices in the original input tensor.
+ const Index inputCols[2] = {
+ colIndex * m_col_strides + colOffsets[0] - m_colPaddingLeft,
+ colIndex * m_col_strides + colOffsets[1] - m_colPaddingLeft};
+ if (inputCols[1] < 0 || inputCols[0] >= m_inputCols) {
+ return internal::pset1<PacketReturnType>(Scalar(m_paddingValue));
+ }
+
+ if (inputCols[0] != inputCols[1]) {
+ return packetWithPossibleZero(index);
+ }
+
+ const Index rowIndex = (patch3DIndex - colIndex * m_outputPlanesRows) / m_fastOutputPlanes;
+ const Index rowOffsets[2] = {
+ (patchOffsets[0] - colOffsets[0] * m_colStride) / m_fastRowStride,
+ (patchOffsets[1] - colOffsets[1] * m_colStride) / m_fastRowStride};
+ eigen_assert(rowOffsets[0] <= rowOffsets[1]);
+ // Calculate col indices in the original input tensor.
+ const Index inputRows[2] = {
+ rowIndex * m_row_strides + rowOffsets[0] - m_rowPaddingTop,
+ rowIndex * m_row_strides + rowOffsets[1] - m_rowPaddingTop};
+
+ if (inputRows[1] < 0 || inputRows[0] >= m_inputRows) {
+ return internal::pset1<PacketReturnType>(Scalar(m_paddingValue));
+ }
+
+ if (inputRows[0] != inputRows[1]) {
+ return packetWithPossibleZero(index);
+ }
+
+ const Index planeIndex = (patch3DIndex - m_outputPlanes * (colIndex * m_outputRows + rowIndex));
+ const Index planeOffsets[2] = {
+ patchOffsets[0] - colOffsets[0] * m_colStride - rowOffsets[0] * m_rowStride,
+ patchOffsets[1] - colOffsets[1] * m_colStride - rowOffsets[1] * m_rowStride};
+ eigen_assert(planeOffsets[0] <= planeOffsets[1]);
+ const Index inputPlanes[2] = {
+ planeIndex * m_plane_strides + planeOffsets[0] - m_planePaddingTop,
+ planeIndex * m_plane_strides + planeOffsets[1] - m_planePaddingTop};
+
+ if (inputPlanes[1] < 0 || inputPlanes[0] >= m_inputPlanes) {
+ return internal::pset1<PacketReturnType>(Scalar(m_paddingValue));
+ }
+
+ if (inputPlanes[0] >= 0 && inputPlanes[1] < m_inputPlanes) {
+ // no padding
+ const int depth_index = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - 1;
+ const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index];
+ const Index inputIndex = depth +
+ inputRows[0] * m_rowInputStride +
+ inputCols[0] * m_colInputStride +
+ m_planeInputStride * inputPlanes[0] +
+ otherIndex * m_otherInputStride;
+ return m_impl.template packet<Unaligned>(inputIndex);
+ }
+
+ return packetWithPossibleZero(index);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+ costPerCoeff(bool vectorized) const {
+ const double compute_cost =
+ 10 * TensorOpCost::DivCost<Index>() + 21 * TensorOpCost::MulCost<Index>() +
+ 8 * TensorOpCost::AddCost<Index>();
+ return TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
+ }
+
+ EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+ const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index planePaddingTop() const { return m_planePaddingTop; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowPaddingTop() const { return m_rowPaddingTop; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colPaddingLeft() const { return m_colPaddingLeft; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputPlanes() const { return m_outputPlanes; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputRows() const { return m_outputRows; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputCols() const { return m_outputCols; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userPlaneStride() const { return m_plane_strides; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userRowStride() const { return m_row_strides; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userColStride() const { return m_col_strides; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInPlaneStride() const { return m_in_plane_strides; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInRowStride() const { return m_in_row_strides; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInColStride() const { return m_in_col_strides; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index planeInflateStride() const { return m_plane_inflate_strides; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowInflateStride() const { return m_row_inflate_strides; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colInflateStride() const { return m_col_inflate_strides; }
+
+#ifdef EIGEN_USE_SYCL
+ // binding placeholder accessors to a command group handler for SYCL
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
+ m_impl.bind(cgh);
+ }
+#endif
+ protected:
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
+ {
+ EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+ EIGEN_UNROLL_LOOP
+ for (int i = 0; i < PacketSize; ++i) {
+ values[i] = coeff(index+i);
+ }
+ PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+ return rslt;
+ }
+
+ Dimensions m_dimensions;
+
+ // Parameters passed to the constructor.
+ Index m_plane_strides;
+ Index m_row_strides;
+ Index m_col_strides;
+
+ Index m_outputPlanes;
+ Index m_outputRows;
+ Index m_outputCols;
+
+ Index m_planePaddingTop;
+ Index m_rowPaddingTop;
+ Index m_colPaddingLeft;
+
+ Index m_in_plane_strides;
+ Index m_in_row_strides;
+ Index m_in_col_strides;
+
+ Index m_plane_inflate_strides;
+ Index m_row_inflate_strides;
+ Index m_col_inflate_strides;
+
+ // Cached input size.
+ Index m_inputDepth;
+ Index m_inputPlanes;
+ Index m_inputRows;
+ Index m_inputCols;
+
+ // Other cached variables.
+ Index m_outputPlanesRows;
+
+ // Effective input/patch post-inflation size.
+ Index m_input_planes_eff;
+ Index m_input_rows_eff;
+ Index m_input_cols_eff;
+ Index m_patch_planes_eff;
+ Index m_patch_rows_eff;
+ Index m_patch_cols_eff;
+
+ // Strides for the output tensor.
+ Index m_otherStride;
+ Index m_patchStride;
+ Index m_rowStride;
+ Index m_colStride;
+
+ // Strides for the input tensor.
+ Index m_planeInputStride;
+ Index m_rowInputStride;
+ Index m_colInputStride;
+ Index m_otherInputStride;
+
+ internal::TensorIntDivisor<Index> m_fastOtherStride;
+ internal::TensorIntDivisor<Index> m_fastPatchStride;
+ internal::TensorIntDivisor<Index> m_fastColStride;
+ internal::TensorIntDivisor<Index> m_fastRowStride;
+ internal::TensorIntDivisor<Index> m_fastInputPlaneStride;
+ internal::TensorIntDivisor<Index> m_fastInputRowStride;
+ internal::TensorIntDivisor<Index> m_fastInputColStride;
+ internal::TensorIntDivisor<Index> m_fastInputColsEff;
+ internal::TensorIntDivisor<Index> m_fastOutputPlanesRows;
+ internal::TensorIntDivisor<Index> m_fastOutputPlanes;
+ internal::TensorIntDivisor<Index> m_fastOutputDepth;
+
+ Scalar m_paddingValue;
+
+ TensorEvaluator<ArgType, Device> m_impl;
+
+
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H
diff --git a/src/EigenUnsupported/CXX11/src/TensorSymmetry/DynamicSymmetry.h b/src/EigenUnsupported/CXX11/src/TensorSymmetry/DynamicSymmetry.h
new file mode 100644
index 0000000..bc4f202
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/TensorSymmetry/DynamicSymmetry.h
@@ -0,0 +1,293 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSORSYMMETRY_DYNAMICSYMMETRY_H
+#define EIGEN_CXX11_TENSORSYMMETRY_DYNAMICSYMMETRY_H
+
+namespace Eigen {
+
+class DynamicSGroup
+{
+ public:
+ inline explicit DynamicSGroup() : m_numIndices(1), m_elements(), m_generators(), m_globalFlags(0) { m_elements.push_back(ge(Generator(0, 0, 0))); }
+ inline DynamicSGroup(const DynamicSGroup& o) : m_numIndices(o.m_numIndices), m_elements(o.m_elements), m_generators(o.m_generators), m_globalFlags(o.m_globalFlags) { }
+ inline DynamicSGroup(DynamicSGroup&& o) : m_numIndices(o.m_numIndices), m_elements(), m_generators(o.m_generators), m_globalFlags(o.m_globalFlags) { std::swap(m_elements, o.m_elements); }
+ inline DynamicSGroup& operator=(const DynamicSGroup& o) { m_numIndices = o.m_numIndices; m_elements = o.m_elements; m_generators = o.m_generators; m_globalFlags = o.m_globalFlags; return *this; }
+ inline DynamicSGroup& operator=(DynamicSGroup&& o) { m_numIndices = o.m_numIndices; std::swap(m_elements, o.m_elements); m_generators = o.m_generators; m_globalFlags = o.m_globalFlags; return *this; }
+
+ void add(int one, int two, int flags = 0);
+
+ template<typename Gen_>
+ inline void add(Gen_) { add(Gen_::One, Gen_::Two, Gen_::Flags); }
+ inline void addSymmetry(int one, int two) { add(one, two, 0); }
+ inline void addAntiSymmetry(int one, int two) { add(one, two, NegationFlag); }
+ inline void addHermiticity(int one, int two) { add(one, two, ConjugationFlag); }
+ inline void addAntiHermiticity(int one, int two) { add(one, two, NegationFlag | ConjugationFlag); }
+
+ template<typename Op, typename RV, typename Index, std::size_t N, typename... Args>
+ inline RV apply(const std::array<Index, N>& idx, RV initial, Args&&... args) const
+ {
+ eigen_assert(N >= m_numIndices && "Can only apply symmetry group to objects that have at least the required amount of indices.");
+ for (std::size_t i = 0; i < size(); i++)
+ initial = Op::run(h_permute(i, idx, typename internal::gen_numeric_list<int, N>::type()), m_elements[i].flags, initial, std::forward<Args>(args)...);
+ return initial;
+ }
+
+ template<typename Op, typename RV, typename Index, typename... Args>
+ inline RV apply(const std::vector<Index>& idx, RV initial, Args&&... args) const
+ {
+ eigen_assert(idx.size() >= m_numIndices && "Can only apply symmetry group to objects that have at least the required amount of indices.");
+ for (std::size_t i = 0; i < size(); i++)
+ initial = Op::run(h_permute(i, idx), m_elements[i].flags, initial, std::forward<Args>(args)...);
+ return initial;
+ }
+
+ inline int globalFlags() const { return m_globalFlags; }
+ inline std::size_t size() const { return m_elements.size(); }
+
+ template<typename Tensor_, typename... IndexTypes>
+ inline internal::tensor_symmetry_value_setter<Tensor_, DynamicSGroup> operator()(Tensor_& tensor, typename Tensor_::Index firstIndex, IndexTypes... otherIndices) const
+ {
+ static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
+ return operator()(tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices>{{firstIndex, otherIndices...}});
+ }
+
+ template<typename Tensor_>
+ inline internal::tensor_symmetry_value_setter<Tensor_, DynamicSGroup> operator()(Tensor_& tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices> const& indices) const
+ {
+ return internal::tensor_symmetry_value_setter<Tensor_, DynamicSGroup>(tensor, *this, indices);
+ }
+ private:
+ struct GroupElement {
+ std::vector<int> representation;
+ int flags;
+ bool isId() const
+ {
+ for (std::size_t i = 0; i < representation.size(); i++)
+ if (i != (size_t)representation[i])
+ return false;
+ return true;
+ }
+ };
+ struct Generator {
+ int one;
+ int two;
+ int flags;
+ constexpr inline Generator(int one_, int two_, int flags_) : one(one_), two(two_), flags(flags_) {}
+ };
+
+ std::size_t m_numIndices;
+ std::vector<GroupElement> m_elements;
+ std::vector<Generator> m_generators;
+ int m_globalFlags;
+
+ template<typename Index, std::size_t N, int... n>
+ inline std::array<Index, N> h_permute(std::size_t which, const std::array<Index, N>& idx, internal::numeric_list<int, n...>) const
+ {
+ return std::array<Index, N>{{ idx[n >= m_numIndices ? n : m_elements[which].representation[n]]... }};
+ }
+
+ template<typename Index>
+ inline std::vector<Index> h_permute(std::size_t which, std::vector<Index> idx) const
+ {
+ std::vector<Index> result;
+ result.reserve(idx.size());
+ for (auto k : m_elements[which].representation)
+ result.push_back(idx[k]);
+ for (std::size_t i = m_numIndices; i < idx.size(); i++)
+ result.push_back(idx[i]);
+ return result;
+ }
+
+ inline GroupElement ge(Generator const& g) const
+ {
+ GroupElement result;
+ result.representation.reserve(m_numIndices);
+ result.flags = g.flags;
+ for (std::size_t k = 0; k < m_numIndices; k++) {
+ if (k == (std::size_t)g.one)
+ result.representation.push_back(g.two);
+ else if (k == (std::size_t)g.two)
+ result.representation.push_back(g.one);
+ else
+ result.representation.push_back(int(k));
+ }
+ return result;
+ }
+
+ GroupElement mul(GroupElement, GroupElement) const;
+ inline GroupElement mul(Generator g1, GroupElement g2) const
+ {
+ return mul(ge(g1), g2);
+ }
+
+ inline GroupElement mul(GroupElement g1, Generator g2) const
+ {
+ return mul(g1, ge(g2));
+ }
+
+ inline GroupElement mul(Generator g1, Generator g2) const
+ {
+ return mul(ge(g1), ge(g2));
+ }
+
+ inline int findElement(GroupElement e) const
+ {
+ for (auto ee : m_elements) {
+ if (ee.representation == e.representation)
+ return ee.flags ^ e.flags;
+ }
+ return -1;
+ }
+
+ void updateGlobalFlags(int flagDiffOfSameGenerator);
+};
+
+// dynamic symmetry group that auto-adds the template parameters in the constructor
+template<typename... Gen>
+class DynamicSGroupFromTemplateArgs : public DynamicSGroup
+{
+ public:
+ inline DynamicSGroupFromTemplateArgs() : DynamicSGroup()
+ {
+ add_all(internal::type_list<Gen...>());
+ }
+ inline DynamicSGroupFromTemplateArgs(DynamicSGroupFromTemplateArgs const& other) : DynamicSGroup(other) { }
+ inline DynamicSGroupFromTemplateArgs(DynamicSGroupFromTemplateArgs&& other) : DynamicSGroup(other) { }
+ inline DynamicSGroupFromTemplateArgs<Gen...>& operator=(const DynamicSGroupFromTemplateArgs<Gen...>& o) { DynamicSGroup::operator=(o); return *this; }
+ inline DynamicSGroupFromTemplateArgs<Gen...>& operator=(DynamicSGroupFromTemplateArgs<Gen...>&& o) { DynamicSGroup::operator=(o); return *this; }
+
+ private:
+ template<typename Gen1, typename... GenNext>
+ inline void add_all(internal::type_list<Gen1, GenNext...>)
+ {
+ add(Gen1());
+ add_all(internal::type_list<GenNext...>());
+ }
+
+ inline void add_all(internal::type_list<>)
+ {
+ }
+};
+
+inline DynamicSGroup::GroupElement DynamicSGroup::mul(GroupElement g1, GroupElement g2) const
+{
+ eigen_internal_assert(g1.representation.size() == m_numIndices);
+ eigen_internal_assert(g2.representation.size() == m_numIndices);
+
+ GroupElement result;
+ result.representation.reserve(m_numIndices);
+ for (std::size_t i = 0; i < m_numIndices; i++) {
+ int v = g2.representation[g1.representation[i]];
+ eigen_assert(v >= 0);
+ result.representation.push_back(v);
+ }
+ result.flags = g1.flags ^ g2.flags;
+ return result;
+}
+
+inline void DynamicSGroup::add(int one, int two, int flags)
+{
+ eigen_assert(one >= 0);
+ eigen_assert(two >= 0);
+ eigen_assert(one != two);
+
+ if ((std::size_t)one >= m_numIndices || (std::size_t)two >= m_numIndices) {
+ std::size_t newNumIndices = (one > two) ? one : two + 1;
+ for (auto& gelem : m_elements) {
+ gelem.representation.reserve(newNumIndices);
+ for (std::size_t i = m_numIndices; i < newNumIndices; i++)
+ gelem.representation.push_back(i);
+ }
+ m_numIndices = newNumIndices;
+ }
+
+ Generator g{one, two, flags};
+ GroupElement e = ge(g);
+
+ /* special case for first generator */
+ if (m_elements.size() == 1) {
+ while (!e.isId()) {
+ m_elements.push_back(e);
+ e = mul(e, g);
+ }
+
+ if (e.flags > 0)
+ updateGlobalFlags(e.flags);
+
+ // only add in case we didn't have identity
+ if (m_elements.size() > 1)
+ m_generators.push_back(g);
+ return;
+ }
+
+ int p = findElement(e);
+ if (p >= 0) {
+ updateGlobalFlags(p);
+ return;
+ }
+
+ std::size_t coset_order = m_elements.size();
+ m_elements.push_back(e);
+ for (std::size_t i = 1; i < coset_order; i++)
+ m_elements.push_back(mul(m_elements[i], e));
+ m_generators.push_back(g);
+
+ std::size_t coset_rep = coset_order;
+ do {
+ for (auto g : m_generators) {
+ e = mul(m_elements[coset_rep], g);
+ p = findElement(e);
+ if (p < 0) {
+ // element not yet in group
+ m_elements.push_back(e);
+ for (std::size_t i = 1; i < coset_order; i++)
+ m_elements.push_back(mul(m_elements[i], e));
+ } else if (p > 0) {
+ updateGlobalFlags(p);
+ }
+ }
+ coset_rep += coset_order;
+ } while (coset_rep < m_elements.size());
+}
+
+inline void DynamicSGroup::updateGlobalFlags(int flagDiffOfSameGenerator)
+{
+ switch (flagDiffOfSameGenerator) {
+ case 0:
+ default:
+ // nothing happened
+ break;
+ case NegationFlag:
+ // every element is it's own negative => whole tensor is zero
+ m_globalFlags |= GlobalZeroFlag;
+ break;
+ case ConjugationFlag:
+ // every element is it's own conjugate => whole tensor is real
+ m_globalFlags |= GlobalRealFlag;
+ break;
+ case (NegationFlag | ConjugationFlag):
+ // every element is it's own negative conjugate => whole tensor is imaginary
+ m_globalFlags |= GlobalImagFlag;
+ break;
+ /* NOTE:
+ * since GlobalZeroFlag == GlobalRealFlag | GlobalImagFlag, if one generator
+ * causes the tensor to be real and the next one to be imaginary, this will
+ * trivially give the correct result
+ */
+ }
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSORSYMMETRY_DYNAMICSYMMETRY_H
+
+/*
+ * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
+ */
diff --git a/src/EigenUnsupported/CXX11/src/TensorSymmetry/StaticSymmetry.h b/src/EigenUnsupported/CXX11/src/TensorSymmetry/StaticSymmetry.h
new file mode 100644
index 0000000..942293b
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/TensorSymmetry/StaticSymmetry.h
@@ -0,0 +1,236 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSORSYMMETRY_STATICSYMMETRY_H
+#define EIGEN_CXX11_TENSORSYMMETRY_STATICSYMMETRY_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename list> struct tensor_static_symgroup_permutate;
+
+template<int... nn>
+struct tensor_static_symgroup_permutate<numeric_list<int, nn...>>
+{
+ constexpr static std::size_t N = sizeof...(nn);
+
+ template<typename T>
+ constexpr static inline std::array<T, N> run(const std::array<T, N>& indices)
+ {
+ return {{indices[nn]...}};
+ }
+};
+
+template<typename indices_, int flags_>
+struct tensor_static_symgroup_element
+{
+ typedef indices_ indices;
+ constexpr static int flags = flags_;
+};
+
+template<typename Gen, int N>
+struct tensor_static_symgroup_element_ctor
+{
+ typedef tensor_static_symgroup_element<
+ typename gen_numeric_list_swapped_pair<int, N, Gen::One, Gen::Two>::type,
+ Gen::Flags
+ > type;
+};
+
+template<int N>
+struct tensor_static_symgroup_identity_ctor
+{
+ typedef tensor_static_symgroup_element<
+ typename gen_numeric_list<int, N>::type,
+ 0
+ > type;
+};
+
+template<typename iib>
+struct tensor_static_symgroup_multiply_helper
+{
+ template<int... iia>
+ constexpr static inline numeric_list<int, get<iia, iib>::value...> helper(numeric_list<int, iia...>) {
+ return numeric_list<int, get<iia, iib>::value...>();
+ }
+};
+
+template<typename A, typename B>
+struct tensor_static_symgroup_multiply
+{
+ private:
+ typedef typename A::indices iia;
+ typedef typename B::indices iib;
+ constexpr static int ffa = A::flags;
+ constexpr static int ffb = B::flags;
+
+ public:
+ static_assert(iia::count == iib::count, "Cannot multiply symmetry elements with different number of indices.");
+
+ typedef tensor_static_symgroup_element<
+ decltype(tensor_static_symgroup_multiply_helper<iib>::helper(iia())),
+ ffa ^ ffb
+ > type;
+};
+
+template<typename A, typename B>
+struct tensor_static_symgroup_equality
+{
+ typedef typename A::indices iia;
+ typedef typename B::indices iib;
+ constexpr static int ffa = A::flags;
+ constexpr static int ffb = B::flags;
+ static_assert(iia::count == iib::count, "Cannot compare symmetry elements with different number of indices.");
+
+ constexpr static bool value = is_same<iia, iib>::value;
+
+ private:
+ /* this should be zero if they are identical, or else the tensor
+ * will be forced to be pure real, pure imaginary or even pure zero
+ */
+ constexpr static int flags_cmp_ = ffa ^ ffb;
+
+ /* either they are not equal, then we don't care whether the flags
+ * match, or they are equal, and then we have to check
+ */
+ constexpr static bool is_zero = value && flags_cmp_ == NegationFlag;
+ constexpr static bool is_real = value && flags_cmp_ == ConjugationFlag;
+ constexpr static bool is_imag = value && flags_cmp_ == (NegationFlag | ConjugationFlag);
+
+ public:
+ constexpr static int global_flags =
+ (is_real ? GlobalRealFlag : 0) |
+ (is_imag ? GlobalImagFlag : 0) |
+ (is_zero ? GlobalZeroFlag : 0);
+};
+
+template<std::size_t NumIndices, typename... Gen>
+struct tensor_static_symgroup
+{
+ typedef StaticSGroup<Gen...> type;
+ constexpr static std::size_t size = type::static_size;
+};
+
+template<typename Index, std::size_t N, int... ii, int... jj>
+constexpr static inline std::array<Index, N> tensor_static_symgroup_index_permute(std::array<Index, N> idx, internal::numeric_list<int, ii...>, internal::numeric_list<int, jj...>)
+{
+ return {{ idx[ii]..., idx[jj]... }};
+}
+
+template<typename Index, int... ii>
+static inline std::vector<Index> tensor_static_symgroup_index_permute(std::vector<Index> idx, internal::numeric_list<int, ii...>)
+{
+ std::vector<Index> result{{ idx[ii]... }};
+ std::size_t target_size = idx.size();
+ for (std::size_t i = result.size(); i < target_size; i++)
+ result.push_back(idx[i]);
+ return result;
+}
+
+template<typename T> struct tensor_static_symgroup_do_apply;
+
+template<typename first, typename... next>
+struct tensor_static_symgroup_do_apply<internal::type_list<first, next...>>
+{
+ template<typename Op, typename RV, std::size_t SGNumIndices, typename Index, std::size_t NumIndices, typename... Args>
+ static inline RV run(const std::array<Index, NumIndices>& idx, RV initial, Args&&... args)
+ {
+ static_assert(NumIndices >= SGNumIndices, "Can only apply symmetry group to objects that have at least the required amount of indices.");
+ typedef typename internal::gen_numeric_list<int, NumIndices - SGNumIndices, SGNumIndices>::type remaining_indices;
+ initial = Op::run(tensor_static_symgroup_index_permute(idx, typename first::indices(), remaining_indices()), first::flags, initial, std::forward<Args>(args)...);
+ return tensor_static_symgroup_do_apply<internal::type_list<next...>>::template run<Op, RV, SGNumIndices>(idx, initial, args...);
+ }
+
+ template<typename Op, typename RV, std::size_t SGNumIndices, typename Index, typename... Args>
+ static inline RV run(const std::vector<Index>& idx, RV initial, Args&&... args)
+ {
+ eigen_assert(idx.size() >= SGNumIndices && "Can only apply symmetry group to objects that have at least the required amount of indices.");
+ initial = Op::run(tensor_static_symgroup_index_permute(idx, typename first::indices()), first::flags, initial, std::forward<Args>(args)...);
+ return tensor_static_symgroup_do_apply<internal::type_list<next...>>::template run<Op, RV, SGNumIndices>(idx, initial, args...);
+ }
+};
+
+template<EIGEN_TPL_PP_SPEC_HACK_DEF(typename, empty)>
+struct tensor_static_symgroup_do_apply<internal::type_list<EIGEN_TPL_PP_SPEC_HACK_USE(empty)>>
+{
+ template<typename Op, typename RV, std::size_t SGNumIndices, typename Index, std::size_t NumIndices, typename... Args>
+ static inline RV run(const std::array<Index, NumIndices>&, RV initial, Args&&...)
+ {
+ // do nothing
+ return initial;
+ }
+
+ template<typename Op, typename RV, std::size_t SGNumIndices, typename Index, typename... Args>
+ static inline RV run(const std::vector<Index>&, RV initial, Args&&...)
+ {
+ // do nothing
+ return initial;
+ }
+};
+
+} // end namespace internal
+
+template<typename... Gen>
+class StaticSGroup
+{
+ constexpr static std::size_t NumIndices = internal::tensor_symmetry_num_indices<Gen...>::value;
+ typedef internal::group_theory::enumerate_group_elements<
+ internal::tensor_static_symgroup_multiply,
+ internal::tensor_static_symgroup_equality,
+ typename internal::tensor_static_symgroup_identity_ctor<NumIndices>::type,
+ internal::type_list<typename internal::tensor_static_symgroup_element_ctor<Gen, NumIndices>::type...>
+ > group_elements;
+ typedef typename group_elements::type ge;
+ public:
+ constexpr inline StaticSGroup() {}
+ constexpr inline StaticSGroup(const StaticSGroup<Gen...>&) {}
+ constexpr inline StaticSGroup(StaticSGroup<Gen...>&&) {}
+
+ template<typename Op, typename RV, typename Index, std::size_t N, typename... Args>
+ static inline RV apply(const std::array<Index, N>& idx, RV initial, Args&&... args)
+ {
+ return internal::tensor_static_symgroup_do_apply<ge>::template run<Op, RV, NumIndices>(idx, initial, args...);
+ }
+
+ template<typename Op, typename RV, typename Index, typename... Args>
+ static inline RV apply(const std::vector<Index>& idx, RV initial, Args&&... args)
+ {
+ eigen_assert(idx.size() == NumIndices);
+ return internal::tensor_static_symgroup_do_apply<ge>::template run<Op, RV, NumIndices>(idx, initial, args...);
+ }
+
+ constexpr static std::size_t static_size = ge::count;
+
+ constexpr static inline std::size_t size() {
+ return ge::count;
+ }
+ constexpr static inline int globalFlags() { return group_elements::global_flags; }
+
+ template<typename Tensor_, typename... IndexTypes>
+ inline internal::tensor_symmetry_value_setter<Tensor_, StaticSGroup<Gen...>> operator()(Tensor_& tensor, typename Tensor_::Index firstIndex, IndexTypes... otherIndices) const
+ {
+ static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
+ return operator()(tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices>{{firstIndex, otherIndices...}});
+ }
+
+ template<typename Tensor_>
+ inline internal::tensor_symmetry_value_setter<Tensor_, StaticSGroup<Gen...>> operator()(Tensor_& tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices> const& indices) const
+ {
+ return internal::tensor_symmetry_value_setter<Tensor_, StaticSGroup<Gen...>>(tensor, *this, indices);
+ }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSORSYMMETRY_STATICSYMMETRY_H
+
+/*
+ * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
+ */
diff --git a/src/EigenUnsupported/CXX11/src/TensorSymmetry/Symmetry.h b/src/EigenUnsupported/CXX11/src/TensorSymmetry/Symmetry.h
new file mode 100644
index 0000000..879d6cd
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/TensorSymmetry/Symmetry.h
@@ -0,0 +1,338 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSORSYMMETRY_SYMMETRY_H
+#define EIGEN_CXX11_TENSORSYMMETRY_SYMMETRY_H
+
+namespace Eigen {
+
+enum {
+ NegationFlag = 0x01,
+ ConjugationFlag = 0x02
+};
+
+enum {
+ GlobalRealFlag = 0x01,
+ GlobalImagFlag = 0x02,
+ GlobalZeroFlag = 0x03
+};
+
+namespace internal {
+
+template<std::size_t NumIndices, typename... Sym> struct tensor_symmetry_pre_analysis;
+template<std::size_t NumIndices, typename... Sym> struct tensor_static_symgroup;
+template<bool instantiate, std::size_t NumIndices, typename... Sym> struct tensor_static_symgroup_if;
+template<typename Tensor_> struct tensor_symmetry_calculate_flags;
+template<typename Tensor_> struct tensor_symmetry_assign_value;
+template<typename... Sym> struct tensor_symmetry_num_indices;
+
+} // end namespace internal
+
+template<int One_, int Two_>
+struct Symmetry
+{
+ static_assert(One_ != Two_, "Symmetries must cover distinct indices.");
+ constexpr static int One = One_;
+ constexpr static int Two = Two_;
+ constexpr static int Flags = 0;
+};
+
+template<int One_, int Two_>
+struct AntiSymmetry
+{
+ static_assert(One_ != Two_, "Symmetries must cover distinct indices.");
+ constexpr static int One = One_;
+ constexpr static int Two = Two_;
+ constexpr static int Flags = NegationFlag;
+};
+
+template<int One_, int Two_>
+struct Hermiticity
+{
+ static_assert(One_ != Two_, "Symmetries must cover distinct indices.");
+ constexpr static int One = One_;
+ constexpr static int Two = Two_;
+ constexpr static int Flags = ConjugationFlag;
+};
+
+template<int One_, int Two_>
+struct AntiHermiticity
+{
+ static_assert(One_ != Two_, "Symmetries must cover distinct indices.");
+ constexpr static int One = One_;
+ constexpr static int Two = Two_;
+ constexpr static int Flags = ConjugationFlag | NegationFlag;
+};
+
+/** \class DynamicSGroup
+ * \ingroup TensorSymmetry_Module
+ *
+ * \brief Dynamic symmetry group
+ *
+ * The %DynamicSGroup class represents a symmetry group that need not be known at
+ * compile time. It is useful if one wants to support arbitrary run-time defineable
+ * symmetries for tensors, but it is also instantiated if a symmetry group is defined
+ * at compile time that would be either too large for the compiler to reasonably
+ * generate (using templates to calculate this at compile time is very inefficient)
+ * or that the compiler could generate the group but that it wouldn't make sense to
+ * unroll the loop for setting coefficients anymore.
+ */
+class DynamicSGroup;
+
+/** \internal
+ *
+ * \class DynamicSGroupFromTemplateArgs
+ * \ingroup TensorSymmetry_Module
+ *
+ * \brief Dynamic symmetry group, initialized from template arguments
+ *
+ * This class is a child class of DynamicSGroup. It uses the template arguments
+ * specified to initialize itself.
+ */
+template<typename... Gen>
+class DynamicSGroupFromTemplateArgs;
+
+/** \class StaticSGroup
+ * \ingroup TensorSymmetry_Module
+ *
+ * \brief Static symmetry group
+ *
+ * This class represents a symmetry group that is known and resolved completely
+ * at compile time. Ideally, no run-time penalty is incurred compared to the
+ * manual unrolling of the symmetry.
+ *
+ * <b><i>CAUTION:</i></b>
+ *
+ * Do not use this class directly for large symmetry groups. The compiler
+ * may run into a limit, or segfault or in the very least will take a very,
+ * very, very long time to compile the code. Use the SGroup class instead
+ * if you want a static group. That class contains logic that will
+ * automatically select the DynamicSGroup class instead if the symmetry
+ * group becomes too large. (In that case, unrolling may not even be
+ * beneficial.)
+ */
+template<typename... Gen>
+class StaticSGroup;
+
+/** \class SGroup
+ * \ingroup TensorSymmetry_Module
+ *
+ * \brief Symmetry group, initialized from template arguments
+ *
+ * This class represents a symmetry group whose generators are already
+ * known at compile time. It may or may not be resolved at compile time,
+ * depending on the estimated size of the group.
+ *
+ * \sa StaticSGroup
+ * \sa DynamicSGroup
+ */
+template<typename... Gen>
+class SGroup : public internal::tensor_symmetry_pre_analysis<internal::tensor_symmetry_num_indices<Gen...>::value, Gen...>::root_type
+{
+ public:
+ constexpr static std::size_t NumIndices = internal::tensor_symmetry_num_indices<Gen...>::value;
+ typedef typename internal::tensor_symmetry_pre_analysis<NumIndices, Gen...>::root_type Base;
+
+ // make standard constructors + assignment operators public
+ inline SGroup() : Base() { }
+ inline SGroup(const SGroup<Gen...>& other) : Base(other) { }
+ inline SGroup(SGroup<Gen...>&& other) : Base(other) { }
+ inline SGroup<Gen...>& operator=(const SGroup<Gen...>& other) { Base::operator=(other); return *this; }
+ inline SGroup<Gen...>& operator=(SGroup<Gen...>&& other) { Base::operator=(other); return *this; }
+
+ // all else is defined in the base class
+};
+
+namespace internal {
+
+template<typename... Sym> struct tensor_symmetry_num_indices
+{
+ constexpr static std::size_t value = 1;
+};
+
+template<int One_, int Two_, typename... Sym> struct tensor_symmetry_num_indices<Symmetry<One_, Two_>, Sym...>
+{
+private:
+ constexpr static std::size_t One = static_cast<std::size_t>(One_);
+ constexpr static std::size_t Two = static_cast<std::size_t>(Two_);
+ constexpr static std::size_t Three = tensor_symmetry_num_indices<Sym...>::value;
+
+ // don't use std::max, since it's not constexpr until C++14...
+ constexpr static std::size_t maxOneTwoPlusOne = ((One > Two) ? One : Two) + 1;
+public:
+ constexpr static std::size_t value = (maxOneTwoPlusOne > Three) ? maxOneTwoPlusOne : Three;
+};
+
+template<int One_, int Two_, typename... Sym> struct tensor_symmetry_num_indices<AntiSymmetry<One_, Two_>, Sym...>
+ : public tensor_symmetry_num_indices<Symmetry<One_, Two_>, Sym...> {};
+template<int One_, int Two_, typename... Sym> struct tensor_symmetry_num_indices<Hermiticity<One_, Two_>, Sym...>
+ : public tensor_symmetry_num_indices<Symmetry<One_, Two_>, Sym...> {};
+template<int One_, int Two_, typename... Sym> struct tensor_symmetry_num_indices<AntiHermiticity<One_, Two_>, Sym...>
+ : public tensor_symmetry_num_indices<Symmetry<One_, Two_>, Sym...> {};
+
+/** \internal
+ *
+ * \class tensor_symmetry_pre_analysis
+ * \ingroup TensorSymmetry_Module
+ *
+ * \brief Pre-select whether to use a static or dynamic symmetry group
+ *
+ * When a symmetry group could in principle be determined at compile time,
+ * this template implements the logic whether to actually do that or whether
+ * to rather defer that to runtime.
+ *
+ * The logic is as follows:
+ * <dl>
+ * <dt><b>No generators (trivial symmetry):</b></dt>
+ * <dd>Use a trivial static group. Ideally, this has no performance impact
+ * compared to not using symmetry at all. In practice, this might not
+ * be the case.</dd>
+ * <dt><b>More than 4 generators:</b></dt>
+ * <dd>Calculate the group at run time, it is likely far too large for the
+ * compiler to be able to properly generate it in a realistic time.</dd>
+ * <dt><b>Up to and including 4 generators:</b></dt>
+ * <dd>Actually enumerate all group elements, but then check how many there
+ * are. If there are more than 16, it is unlikely that unrolling the
+ * loop (as is done in the static compile-time case) is sensible, so
+ * use a dynamic group instead. If there are at most 16 elements, actually
+ * use that static group. Note that the largest group with 4 generators
+ * still compiles with reasonable resources.</dd>
+ * </dl>
+ *
+ * Note: Example compile time performance with g++-4.6 on an Intenl Core i5-3470
+ * with 16 GiB RAM (all generators non-redundant and the subgroups don't
+ * factorize):
+ *
+ * # Generators -O0 -ggdb -O2
+ * -------------------------------------------------------------------
+ * 1 0.5 s / 250 MiB 0.45s / 230 MiB
+ * 2 0.5 s / 260 MiB 0.5 s / 250 MiB
+ * 3 0.65s / 310 MiB 0.62s / 310 MiB
+ * 4 2.2 s / 860 MiB 1.7 s / 770 MiB
+ * 5 130 s / 13000 MiB 120 s / 11000 MiB
+ *
+ * It is clear that everything is still very efficient up to 4 generators, then
+ * the memory and CPU requirements become unreasonable. Thus we only instantiate
+ * the template group theory logic if the number of generators supplied is 4 or
+ * lower, otherwise this will be forced to be done during runtime, where the
+ * algorithm is reasonably fast.
+ */
+template<std::size_t NumIndices>
+struct tensor_symmetry_pre_analysis<NumIndices>
+{
+ typedef StaticSGroup<> root_type;
+};
+
+template<std::size_t NumIndices, typename Gen_, typename... Gens_>
+struct tensor_symmetry_pre_analysis<NumIndices, Gen_, Gens_...>
+{
+ constexpr static std::size_t max_static_generators = 4;
+ constexpr static std::size_t max_static_elements = 16;
+ typedef tensor_static_symgroup_if<(sizeof...(Gens_) + 1 <= max_static_generators), NumIndices, Gen_, Gens_...> helper;
+ constexpr static std::size_t possible_size = helper::size;
+
+ typedef typename conditional<
+ possible_size == 0 || possible_size >= max_static_elements,
+ DynamicSGroupFromTemplateArgs<Gen_, Gens_...>,
+ typename helper::type
+ >::type root_type;
+};
+
+template<bool instantiate, std::size_t NumIndices, typename... Gens>
+struct tensor_static_symgroup_if
+{
+ constexpr static std::size_t size = 0;
+ typedef void type;
+};
+
+template<std::size_t NumIndices, typename... Gens>
+struct tensor_static_symgroup_if<true, NumIndices, Gens...> : tensor_static_symgroup<NumIndices, Gens...> {};
+
+template<typename Tensor_>
+struct tensor_symmetry_assign_value
+{
+ typedef typename Tensor_::Index Index;
+ typedef typename Tensor_::Scalar Scalar;
+ constexpr static std::size_t NumIndices = Tensor_::NumIndices;
+
+ static inline int run(const std::array<Index, NumIndices>& transformed_indices, int transformation_flags, int dummy, Tensor_& tensor, const Scalar& value_)
+ {
+ Scalar value(value_);
+ if (transformation_flags & ConjugationFlag)
+ value = numext::conj(value);
+ if (transformation_flags & NegationFlag)
+ value = -value;
+ tensor.coeffRef(transformed_indices) = value;
+ return dummy;
+ }
+};
+
+template<typename Tensor_>
+struct tensor_symmetry_calculate_flags
+{
+ typedef typename Tensor_::Index Index;
+ constexpr static std::size_t NumIndices = Tensor_::NumIndices;
+
+ static inline int run(const std::array<Index, NumIndices>& transformed_indices, int transform_flags, int current_flags, const std::array<Index, NumIndices>& orig_indices)
+ {
+ if (transformed_indices == orig_indices) {
+ if (transform_flags & (ConjugationFlag | NegationFlag))
+ return current_flags | GlobalImagFlag; // anti-hermitian diagonal
+ else if (transform_flags & ConjugationFlag)
+ return current_flags | GlobalRealFlag; // hermitian diagonal
+ else if (transform_flags & NegationFlag)
+ return current_flags | GlobalZeroFlag; // anti-symmetric diagonal
+ }
+ return current_flags;
+ }
+};
+
+template<typename Tensor_, typename Symmetry_, int Flags = 0>
+class tensor_symmetry_value_setter
+{
+ public:
+ typedef typename Tensor_::Index Index;
+ typedef typename Tensor_::Scalar Scalar;
+ constexpr static std::size_t NumIndices = Tensor_::NumIndices;
+
+ inline tensor_symmetry_value_setter(Tensor_& tensor, Symmetry_ const& symmetry, std::array<Index, NumIndices> const& indices)
+ : m_tensor(tensor), m_symmetry(symmetry), m_indices(indices) { }
+
+ inline tensor_symmetry_value_setter<Tensor_, Symmetry_, Flags>& operator=(Scalar const& value)
+ {
+ doAssign(value);
+ return *this;
+ }
+ private:
+ Tensor_& m_tensor;
+ Symmetry_ m_symmetry;
+ std::array<Index, NumIndices> m_indices;
+
+ inline void doAssign(Scalar const& value)
+ {
+ #ifdef EIGEN_TENSOR_SYMMETRY_CHECK_VALUES
+ int value_flags = m_symmetry.template apply<internal::tensor_symmetry_calculate_flags<Tensor_>, int>(m_indices, m_symmetry.globalFlags(), m_indices);
+ if (value_flags & GlobalRealFlag)
+ eigen_assert(numext::imag(value) == 0);
+ if (value_flags & GlobalImagFlag)
+ eigen_assert(numext::real(value) == 0);
+ #endif
+ m_symmetry.template apply<internal::tensor_symmetry_assign_value<Tensor_>, int>(m_indices, 0, m_tensor, value);
+ }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSORSYMMETRY_SYMMETRY_H
+
+/*
+ * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
+ */
diff --git a/src/EigenUnsupported/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h b/src/EigenUnsupported/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h
new file mode 100644
index 0000000..54bf9db
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h
@@ -0,0 +1,669 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSORSYMMETRY_TEMPLATEGROUPTHEORY_H
+#define EIGEN_CXX11_TENSORSYMMETRY_TEMPLATEGROUPTHEORY_H
+
+namespace Eigen {
+
+namespace internal {
+
+namespace group_theory {
+
+/** \internal
+ * \file CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h
+ * This file contains C++ templates that implement group theory algorithms.
+ *
+ * The algorithms allow for a compile-time analysis of finite groups.
+ *
+ * Currently only Dimino's algorithm is implemented, which returns a list
+ * of all elements in a group given a set of (possibly redundant) generators.
+ * (One could also do that with the so-called orbital algorithm, but that
+ * is much more expensive and usually has no advantages.)
+ */
+
+/**********************************************************************
+ * "Ok kid, here is where it gets complicated."
+ * - Amelia Pond in the "Doctor Who" episode
+ * "The Big Bang"
+ *
+ * Dimino's algorithm
+ * ==================
+ *
+ * The following is Dimino's algorithm in sequential form:
+ *
+ * Input: identity element, list of generators, equality check,
+ * multiplication operation
+ * Output: list of group elements
+ *
+ * 1. add identity element
+ * 2. remove identities from list of generators
+ * 3. add all powers of first generator that aren't the
+ * identity element
+ * 4. go through all remaining generators:
+ * a. if generator is already in the list of elements
+ * -> do nothing
+ * b. otherwise
+ * i. remember current # of elements
+ * (i.e. the size of the current subgroup)
+ * ii. add all current elements (which includes
+ * the identity) each multiplied from right
+ * with the current generator to the group
+ * iii. add all remaining cosets that are generated
+ * by products of the new generator with itself
+ * and all other generators seen so far
+ *
+ * In functional form, this is implemented as a long set of recursive
+ * templates that have a complicated relationship.
+ *
+ * The main interface for Dimino's algorithm is the template
+ * enumerate_group_elements. All lists are implemented as variadic
+ * type_list<typename...> and numeric_list<typename = int, int...>
+ * templates.
+ *
+ * 'Calling' templates is usually done via typedefs.
+ *
+ * This algorithm is an extended version of the basic version. The
+ * extension consists in the fact that each group element has a set
+ * of flags associated with it. Multiplication of two group elements
+ * with each other results in a group element whose flags are the
+ * XOR of the flags of the previous elements. Each time the algorithm
+ * notices that a group element it just calculated is already in the
+ * list of current elements, the flags of both will be compared and
+ * added to the so-called 'global flags' of the group.
+ *
+ * The rationale behind this extension is that this allows not only
+ * for the description of symmetries between tensor indices, but
+ * also allows for the description of hermiticity, antisymmetry and
+ * antihermiticity. Negation and conjugation each are specific bit
+ * in the flags value and if two different ways to reach a group
+ * element lead to two different flags, this poses a constraint on
+ * the allowed values of the resulting tensor. For example, if a
+ * group element is reach both with and without the conjugation
+ * flags, it is clear that the resulting tensor has to be real.
+ *
+ * Note that this flag mechanism is quite generic and may have other
+ * uses beyond tensor properties.
+ *
+ * IMPORTANT:
+ * This algorithm assumes the group to be finite. If you try to
+ * run it with a group that's infinite, the algorithm will only
+ * terminate once you hit a compiler limit (max template depth).
+ * Also note that trying to use this implementation to create a
+ * very large group will probably either make you hit the same
+ * limit, cause the compiler to segfault or at the very least
+ * take a *really* long time (hours, days, weeks - sic!) to
+ * compile. It is not recommended to plug in more than 4
+ * generators, unless they are independent of each other.
+ */
+
+/** \internal
+ *
+ * \class strip_identities
+ * \ingroup CXX11_TensorSymmetry_Module
+ *
+ * \brief Cleanse a list of group elements of the identity element
+ *
+ * This template is used to make a first pass through all initial
+ * generators of Dimino's algorithm and remove the identity
+ * elements.
+ *
+ * \sa enumerate_group_elements
+ */
+template<template<typename, typename> class Equality, typename id, typename L> struct strip_identities;
+
+template<
+ template<typename, typename> class Equality,
+ typename id,
+ typename t,
+ typename... ts
+>
+struct strip_identities<Equality, id, type_list<t, ts...>>
+{
+ typedef typename conditional<
+ Equality<id, t>::value,
+ typename strip_identities<Equality, id, type_list<ts...>>::type,
+ typename concat<type_list<t>, typename strip_identities<Equality, id, type_list<ts...>>::type>::type
+ >::type type;
+ constexpr static int global_flags = Equality<id, t>::global_flags | strip_identities<Equality, id, type_list<ts...>>::global_flags;
+};
+
+template<
+ template<typename, typename> class Equality,
+ typename id
+ EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, ts)
+>
+struct strip_identities<Equality, id, type_list<EIGEN_TPL_PP_SPEC_HACK_USE(ts)>>
+{
+ typedef type_list<> type;
+ constexpr static int global_flags = 0;
+};
+
+/** \internal
+ *
+ * \class dimino_first_step_elements_helper
+ * \ingroup CXX11_TensorSymmetry_Module
+ *
+ * \brief Recursive template that adds powers of the first generator to the list of group elements
+ *
+ * This template calls itself recursively to add powers of the first
+ * generator to the list of group elements. It stops if it reaches
+ * the identity element again.
+ *
+ * \sa enumerate_group_elements, dimino_first_step_elements
+ */
+template<
+ template<typename, typename> class Multiply,
+ template<typename, typename> class Equality,
+ typename id,
+ typename g,
+ typename current_element,
+ typename elements,
+ bool dont_add_current_element // = false
+>
+struct dimino_first_step_elements_helper
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+ : // recursive inheritance is too difficult for Doxygen
+ public dimino_first_step_elements_helper<
+ Multiply,
+ Equality,
+ id,
+ g,
+ typename Multiply<current_element, g>::type,
+ typename concat<elements, type_list<current_element>>::type,
+ Equality<typename Multiply<current_element, g>::type, id>::value
+ > {};
+
+template<
+ template<typename, typename> class Multiply,
+ template<typename, typename> class Equality,
+ typename id,
+ typename g,
+ typename current_element,
+ typename elements
+>
+struct dimino_first_step_elements_helper<Multiply, Equality, id, g, current_element, elements, true>
+#endif // EIGEN_PARSED_BY_DOXYGEN
+{
+ typedef elements type;
+ constexpr static int global_flags = Equality<current_element, id>::global_flags;
+};
+
+/** \internal
+ *
+ * \class dimino_first_step_elements
+ * \ingroup CXX11_TensorSymmetry_Module
+ *
+ * \brief Add all powers of the first generator to the list of group elements
+ *
+ * This template takes the first non-identity generator and generates the initial
+ * list of elements which consists of all powers of that generator. For a group
+ * with just one generated, it would be enumerated after this.
+ *
+ * \sa enumerate_group_elements
+ */
+template<
+ template<typename, typename> class Multiply,
+ template<typename, typename> class Equality,
+ typename id,
+ typename generators
+>
+struct dimino_first_step_elements
+{
+ typedef typename get<0, generators>::type first_generator;
+ typedef typename skip<1, generators>::type next_generators;
+ typedef type_list<first_generator> generators_done;
+
+ typedef dimino_first_step_elements_helper<
+ Multiply,
+ Equality,
+ id,
+ first_generator,
+ first_generator,
+ type_list<id>,
+ false
+ > helper;
+ typedef typename helper::type type;
+ constexpr static int global_flags = helper::global_flags;
+};
+
+/** \internal
+ *
+ * \class dimino_get_coset_elements
+ * \ingroup CXX11_TensorSymmetry_Module
+ *
+ * \brief Generate all elements of a specific coset
+ *
+ * This template generates all the elements of a specific coset by
+ * multiplying all elements in the given subgroup with the new
+ * coset representative. Note that the first element of the
+ * subgroup is always the identity element, so the first element of
+ * the result of this template is going to be the coset
+ * representative itself.
+ *
+ * Note that this template accepts an additional boolean parameter
+ * that specifies whether to actually generate the coset (true) or
+ * just return an empty list (false).
+ *
+ * \sa enumerate_group_elements, dimino_add_cosets_for_rep
+ */
+template<
+ template<typename, typename> class Multiply,
+ typename sub_group_elements,
+ typename new_coset_rep,
+ bool generate_coset // = true
+>
+struct dimino_get_coset_elements
+{
+ typedef typename apply_op_from_right<Multiply, new_coset_rep, sub_group_elements>::type type;
+};
+
+template<
+ template<typename, typename> class Multiply,
+ typename sub_group_elements,
+ typename new_coset_rep
+>
+struct dimino_get_coset_elements<Multiply, sub_group_elements, new_coset_rep, false>
+{
+ typedef type_list<> type;
+};
+
+/** \internal
+ *
+ * \class dimino_add_cosets_for_rep
+ * \ingroup CXX11_TensorSymmetry_Module
+ *
+ * \brief Recursive template for adding coset spaces
+ *
+ * This template multiplies the coset representative with a generator
+ * from the list of previous generators. If the new element is not in
+ * the group already, it adds the corresponding coset. Finally it
+ * proceeds to call itself with the next generator from the list.
+ *
+ * \sa enumerate_group_elements, dimino_add_all_coset_spaces
+ */
+template<
+ template<typename, typename> class Multiply,
+ template<typename, typename> class Equality,
+ typename id,
+ typename sub_group_elements,
+ typename elements,
+ typename generators,
+ typename rep_element,
+ int sub_group_size
+>
+struct dimino_add_cosets_for_rep;
+
+template<
+ template<typename, typename> class Multiply,
+ template<typename, typename> class Equality,
+ typename id,
+ typename sub_group_elements,
+ typename elements,
+ typename g,
+ typename... gs,
+ typename rep_element,
+ int sub_group_size
+>
+struct dimino_add_cosets_for_rep<Multiply, Equality, id, sub_group_elements, elements, type_list<g, gs...>, rep_element, sub_group_size>
+{
+ typedef typename Multiply<rep_element, g>::type new_coset_rep;
+ typedef contained_in_list_gf<Equality, new_coset_rep, elements> _cil;
+ constexpr static bool add_coset = !_cil::value;
+
+ typedef typename dimino_get_coset_elements<
+ Multiply,
+ sub_group_elements,
+ new_coset_rep,
+ add_coset
+ >::type coset_elements;
+
+ typedef dimino_add_cosets_for_rep<
+ Multiply,
+ Equality,
+ id,
+ sub_group_elements,
+ typename concat<elements, coset_elements>::type,
+ type_list<gs...>,
+ rep_element,
+ sub_group_size
+ > _helper;
+
+ typedef typename _helper::type type;
+ constexpr static int global_flags = _cil::global_flags | _helper::global_flags;
+
+ /* Note that we don't have to update global flags here, since
+ * we will only add these elements if they are not part of
+ * the group already. But that only happens if the coset rep
+ * is not already in the group, so the check for the coset rep
+ * will catch this.
+ */
+};
+
+template<
+ template<typename, typename> class Multiply,
+ template<typename, typename> class Equality,
+ typename id,
+ typename sub_group_elements,
+ typename elements
+ EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, empty),
+ typename rep_element,
+ int sub_group_size
+>
+struct dimino_add_cosets_for_rep<Multiply, Equality, id, sub_group_elements, elements, type_list<EIGEN_TPL_PP_SPEC_HACK_USE(empty)>, rep_element, sub_group_size>
+{
+ typedef elements type;
+ constexpr static int global_flags = 0;
+};
+
+/** \internal
+ *
+ * \class dimino_add_all_coset_spaces
+ * \ingroup CXX11_TensorSymmetry_Module
+ *
+ * \brief Recursive template for adding all coset spaces for a new generator
+ *
+ * This template tries to go through the list of generators (with
+ * the help of the dimino_add_cosets_for_rep template) as long as
+ * it still finds elements that are not part of the group and add
+ * the corresponding cosets.
+ *
+ * \sa enumerate_group_elements, dimino_add_cosets_for_rep
+ */
+template<
+ template<typename, typename> class Multiply,
+ template<typename, typename> class Equality,
+ typename id,
+ typename sub_group_elements,
+ typename elements,
+ typename generators,
+ int sub_group_size,
+ int rep_pos,
+ bool stop_condition // = false
+>
+struct dimino_add_all_coset_spaces
+{
+ typedef typename get<rep_pos, elements>::type rep_element;
+ typedef dimino_add_cosets_for_rep<
+ Multiply,
+ Equality,
+ id,
+ sub_group_elements,
+ elements,
+ generators,
+ rep_element,
+ sub_group_elements::count
+ > _ac4r;
+ typedef typename _ac4r::type new_elements;
+
+ constexpr static int new_rep_pos = rep_pos + sub_group_elements::count;
+ constexpr static bool new_stop_condition = new_rep_pos >= new_elements::count;
+
+ typedef dimino_add_all_coset_spaces<
+ Multiply,
+ Equality,
+ id,
+ sub_group_elements,
+ new_elements,
+ generators,
+ sub_group_size,
+ new_rep_pos,
+ new_stop_condition
+ > _helper;
+
+ typedef typename _helper::type type;
+ constexpr static int global_flags = _helper::global_flags | _ac4r::global_flags;
+};
+
+template<
+ template<typename, typename> class Multiply,
+ template<typename, typename> class Equality,
+ typename id,
+ typename sub_group_elements,
+ typename elements,
+ typename generators,
+ int sub_group_size,
+ int rep_pos
+>
+struct dimino_add_all_coset_spaces<Multiply, Equality, id, sub_group_elements, elements, generators, sub_group_size, rep_pos, true>
+{
+ typedef elements type;
+ constexpr static int global_flags = 0;
+};
+
+/** \internal
+ *
+ * \class dimino_add_generator
+ * \ingroup CXX11_TensorSymmetry_Module
+ *
+ * \brief Enlarge the group by adding a new generator.
+ *
+ * It accepts a boolean parameter that determines if the generator is redundant,
+ * i.e. was already seen in the group. In that case, it reduces to a no-op.
+ *
+ * \sa enumerate_group_elements, dimino_add_all_coset_spaces
+ */
+template<
+ template<typename, typename> class Multiply,
+ template<typename, typename> class Equality,
+ typename id,
+ typename elements,
+ typename generators_done,
+ typename current_generator,
+ bool redundant // = false
+>
+struct dimino_add_generator
+{
+ /* this template is only called if the generator is not redundant
+ * => all elements of the group multiplied with the new generator
+ * are going to be new elements of the most trivial coset space
+ */
+ typedef typename apply_op_from_right<Multiply, current_generator, elements>::type multiplied_elements;
+ typedef typename concat<elements, multiplied_elements>::type new_elements;
+
+ constexpr static int rep_pos = elements::count;
+
+ typedef dimino_add_all_coset_spaces<
+ Multiply,
+ Equality,
+ id,
+ elements, // elements of previous subgroup
+ new_elements,
+ typename concat<generators_done, type_list<current_generator>>::type,
+ elements::count, // size of previous subgroup
+ rep_pos,
+ false // don't stop (because rep_pos >= new_elements::count is always false at this point)
+ > _helper;
+ typedef typename _helper::type type;
+ constexpr static int global_flags = _helper::global_flags;
+};
+
+template<
+ template<typename, typename> class Multiply,
+ template<typename, typename> class Equality,
+ typename id,
+ typename elements,
+ typename generators_done,
+ typename current_generator
+>
+struct dimino_add_generator<Multiply, Equality, id, elements, generators_done, current_generator, true>
+{
+ // redundant case
+ typedef elements type;
+ constexpr static int global_flags = 0;
+};
+
+/** \internal
+ *
+ * \class dimino_add_remaining_generators
+ * \ingroup CXX11_TensorSymmetry_Module
+ *
+ * \brief Recursive template that adds all remaining generators to a group
+ *
+ * Loop through the list of generators that remain and successively
+ * add them to the group.
+ *
+ * \sa enumerate_group_elements, dimino_add_generator
+ */
+template<
+ template<typename, typename> class Multiply,
+ template<typename, typename> class Equality,
+ typename id,
+ typename generators_done,
+ typename remaining_generators,
+ typename elements
+>
+struct dimino_add_remaining_generators
+{
+ typedef typename get<0, remaining_generators>::type first_generator;
+ typedef typename skip<1, remaining_generators>::type next_generators;
+
+ typedef contained_in_list_gf<Equality, first_generator, elements> _cil;
+
+ typedef dimino_add_generator<
+ Multiply,
+ Equality,
+ id,
+ elements,
+ generators_done,
+ first_generator,
+ _cil::value
+ > _helper;
+
+ typedef typename _helper::type new_elements;
+
+ typedef dimino_add_remaining_generators<
+ Multiply,
+ Equality,
+ id,
+ typename concat<generators_done, type_list<first_generator>>::type,
+ next_generators,
+ new_elements
+ > _next_iter;
+
+ typedef typename _next_iter::type type;
+ constexpr static int global_flags =
+ _cil::global_flags |
+ _helper::global_flags |
+ _next_iter::global_flags;
+};
+
+template<
+ template<typename, typename> class Multiply,
+ template<typename, typename> class Equality,
+ typename id,
+ typename generators_done,
+ typename elements
+>
+struct dimino_add_remaining_generators<Multiply, Equality, id, generators_done, type_list<>, elements>
+{
+ typedef elements type;
+ constexpr static int global_flags = 0;
+};
+
+/** \internal
+ *
+ * \class enumerate_group_elements_noid
+ * \ingroup CXX11_TensorSymmetry_Module
+ *
+ * \brief Helper template that implements group element enumeration
+ *
+ * This is a helper template that implements the actual enumeration
+ * of group elements. This has been split so that the list of
+ * generators can be cleansed of the identity element before
+ * performing the actual operation.
+ *
+ * \sa enumerate_group_elements
+ */
+template<
+ template<typename, typename> class Multiply,
+ template<typename, typename> class Equality,
+ typename id,
+ typename generators,
+ int initial_global_flags = 0
+>
+struct enumerate_group_elements_noid
+{
+ typedef dimino_first_step_elements<Multiply, Equality, id, generators> first_step;
+ typedef typename first_step::type first_step_elements;
+
+ typedef dimino_add_remaining_generators<
+ Multiply,
+ Equality,
+ id,
+ typename first_step::generators_done,
+ typename first_step::next_generators, // remaining_generators
+ typename first_step::type // first_step elements
+ > _helper;
+
+ typedef typename _helper::type type;
+ constexpr static int global_flags =
+ initial_global_flags |
+ first_step::global_flags |
+ _helper::global_flags;
+};
+
+// in case when no generators are specified
+template<
+ template<typename, typename> class Multiply,
+ template<typename, typename> class Equality,
+ typename id,
+ int initial_global_flags
+>
+struct enumerate_group_elements_noid<Multiply, Equality, id, type_list<>, initial_global_flags>
+{
+ typedef type_list<id> type;
+ constexpr static int global_flags = initial_global_flags;
+};
+
+/** \internal
+ *
+ * \class enumerate_group_elements
+ * \ingroup CXX11_TensorSymmetry_Module
+ *
+ * \brief Enumerate all elements in a finite group
+ *
+ * This template enumerates all elements in a finite group. It accepts
+ * the following template parameters:
+ *
+ * \tparam Multiply The multiplication operation that multiplies two group elements
+ * with each other.
+ * \tparam Equality The equality check operation that checks if two group elements
+ * are equal to another.
+ * \tparam id The identity element
+ * \tparam _generators A list of (possibly redundant) generators of the group
+ */
+template<
+ template<typename, typename> class Multiply,
+ template<typename, typename> class Equality,
+ typename id,
+ typename _generators
+>
+struct enumerate_group_elements
+ : public enumerate_group_elements_noid<
+ Multiply,
+ Equality,
+ id,
+ typename strip_identities<Equality, id, _generators>::type,
+ strip_identities<Equality, id, _generators>::global_flags
+ >
+{
+};
+
+} // end namespace group_theory
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSORSYMMETRY_TEMPLATEGROUPTHEORY_H
+
+/*
+ * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
+ */
diff --git a/src/EigenUnsupported/CXX11/src/ThreadPool/Barrier.h b/src/EigenUnsupported/CXX11/src/ThreadPool/Barrier.h
new file mode 100644
index 0000000..e4c59dc
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/ThreadPool/Barrier.h
@@ -0,0 +1,67 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Rasmus Munk Larsen <rmlarsen@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// Barrier is an object that allows one or more threads to wait until
+// Notify has been called a specified number of times.
+
+#ifndef EIGEN_CXX11_THREADPOOL_BARRIER_H
+#define EIGEN_CXX11_THREADPOOL_BARRIER_H
+
+namespace Eigen {
+
+class Barrier {
+ public:
+ Barrier(unsigned int count) : state_(count << 1), notified_(false) {
+ eigen_plain_assert(((count << 1) >> 1) == count);
+ }
+ ~Barrier() { eigen_plain_assert((state_ >> 1) == 0); }
+
+ void Notify() {
+ unsigned int v = state_.fetch_sub(2, std::memory_order_acq_rel) - 2;
+ if (v != 1) {
+ // Clear the lowest bit (waiter flag) and check that the original state
+ // value was not zero. If it was zero, it means that notify was called
+ // more times than the original count.
+ eigen_plain_assert(((v + 2) & ~1) != 0);
+ return; // either count has not dropped to 0, or waiter is not waiting
+ }
+ std::unique_lock<std::mutex> l(mu_);
+ eigen_plain_assert(!notified_);
+ notified_ = true;
+ cv_.notify_all();
+ }
+
+ void Wait() {
+ unsigned int v = state_.fetch_or(1, std::memory_order_acq_rel);
+ if ((v >> 1) == 0) return;
+ std::unique_lock<std::mutex> l(mu_);
+ while (!notified_) {
+ cv_.wait(l);
+ }
+ }
+
+ private:
+ std::mutex mu_;
+ std::condition_variable cv_;
+ std::atomic<unsigned int> state_; // low bit is waiter flag
+ bool notified_;
+};
+
+// Notification is an object that allows a user to to wait for another
+// thread to signal a notification that an event has occurred.
+//
+// Multiple threads can wait on the same Notification object,
+// but only one caller must call Notify() on the object.
+struct Notification : Barrier {
+ Notification() : Barrier(1){};
+};
+
+} // namespace Eigen
+
+#endif // EIGEN_CXX11_THREADPOOL_BARRIER_H
diff --git a/src/EigenUnsupported/CXX11/src/ThreadPool/EventCount.h b/src/EigenUnsupported/CXX11/src/ThreadPool/EventCount.h
new file mode 100644
index 0000000..4549aa0
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/ThreadPool/EventCount.h
@@ -0,0 +1,249 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H_
+#define EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H_
+
+namespace Eigen {
+
+// EventCount allows to wait for arbitrary predicates in non-blocking
+// algorithms. Think of condition variable, but wait predicate does not need to
+// be protected by a mutex. Usage:
+// Waiting thread does:
+//
+// if (predicate)
+// return act();
+// EventCount::Waiter& w = waiters[my_index];
+// ec.Prewait(&w);
+// if (predicate) {
+// ec.CancelWait(&w);
+// return act();
+// }
+// ec.CommitWait(&w);
+//
+// Notifying thread does:
+//
+// predicate = true;
+// ec.Notify(true);
+//
+// Notify is cheap if there are no waiting threads. Prewait/CommitWait are not
+// cheap, but they are executed only if the preceding predicate check has
+// failed.
+//
+// Algorithm outline:
+// There are two main variables: predicate (managed by user) and state_.
+// Operation closely resembles Dekker mutual algorithm:
+// https://en.wikipedia.org/wiki/Dekker%27s_algorithm
+// Waiting thread sets state_ then checks predicate, Notifying thread sets
+// predicate then checks state_. Due to seq_cst fences in between these
+// operations it is guaranteed than either waiter will see predicate change
+// and won't block, or notifying thread will see state_ change and will unblock
+// the waiter, or both. But it can't happen that both threads don't see each
+// other changes, which would lead to deadlock.
+class EventCount {
+ public:
+ class Waiter;
+
+ EventCount(MaxSizeVector<Waiter>& waiters)
+ : state_(kStackMask), waiters_(waiters) {
+ eigen_plain_assert(waiters.size() < (1 << kWaiterBits) - 1);
+ }
+
+ ~EventCount() {
+ // Ensure there are no waiters.
+ eigen_plain_assert(state_.load() == kStackMask);
+ }
+
+ // Prewait prepares for waiting.
+ // After calling Prewait, the thread must re-check the wait predicate
+ // and then call either CancelWait or CommitWait.
+ void Prewait() {
+ uint64_t state = state_.load(std::memory_order_relaxed);
+ for (;;) {
+ CheckState(state);
+ uint64_t newstate = state + kWaiterInc;
+ CheckState(newstate);
+ if (state_.compare_exchange_weak(state, newstate,
+ std::memory_order_seq_cst))
+ return;
+ }
+ }
+
+ // CommitWait commits waiting after Prewait.
+ void CommitWait(Waiter* w) {
+ eigen_plain_assert((w->epoch & ~kEpochMask) == 0);
+ w->state = Waiter::kNotSignaled;
+ const uint64_t me = (w - &waiters_[0]) | w->epoch;
+ uint64_t state = state_.load(std::memory_order_seq_cst);
+ for (;;) {
+ CheckState(state, true);
+ uint64_t newstate;
+ if ((state & kSignalMask) != 0) {
+ // Consume the signal and return immidiately.
+ newstate = state - kWaiterInc - kSignalInc;
+ } else {
+ // Remove this thread from pre-wait counter and add to the waiter stack.
+ newstate = ((state & kWaiterMask) - kWaiterInc) | me;
+ w->next.store(state & (kStackMask | kEpochMask),
+ std::memory_order_relaxed);
+ }
+ CheckState(newstate);
+ if (state_.compare_exchange_weak(state, newstate,
+ std::memory_order_acq_rel)) {
+ if ((state & kSignalMask) == 0) {
+ w->epoch += kEpochInc;
+ Park(w);
+ }
+ return;
+ }
+ }
+ }
+
+ // CancelWait cancels effects of the previous Prewait call.
+ void CancelWait() {
+ uint64_t state = state_.load(std::memory_order_relaxed);
+ for (;;) {
+ CheckState(state, true);
+ uint64_t newstate = state - kWaiterInc;
+ // We don't know if the thread was also notified or not,
+ // so we should not consume a signal unconditionaly.
+ // Only if number of waiters is equal to number of signals,
+ // we know that the thread was notified and we must take away the signal.
+ if (((state & kWaiterMask) >> kWaiterShift) ==
+ ((state & kSignalMask) >> kSignalShift))
+ newstate -= kSignalInc;
+ CheckState(newstate);
+ if (state_.compare_exchange_weak(state, newstate,
+ std::memory_order_acq_rel))
+ return;
+ }
+ }
+
+ // Notify wakes one or all waiting threads.
+ // Must be called after changing the associated wait predicate.
+ void Notify(bool notifyAll) {
+ std::atomic_thread_fence(std::memory_order_seq_cst);
+ uint64_t state = state_.load(std::memory_order_acquire);
+ for (;;) {
+ CheckState(state);
+ const uint64_t waiters = (state & kWaiterMask) >> kWaiterShift;
+ const uint64_t signals = (state & kSignalMask) >> kSignalShift;
+ // Easy case: no waiters.
+ if ((state & kStackMask) == kStackMask && waiters == signals) return;
+ uint64_t newstate;
+ if (notifyAll) {
+ // Empty wait stack and set signal to number of pre-wait threads.
+ newstate =
+ (state & kWaiterMask) | (waiters << kSignalShift) | kStackMask;
+ } else if (signals < waiters) {
+ // There is a thread in pre-wait state, unblock it.
+ newstate = state + kSignalInc;
+ } else {
+ // Pop a waiter from list and unpark it.
+ Waiter* w = &waiters_[state & kStackMask];
+ uint64_t next = w->next.load(std::memory_order_relaxed);
+ newstate = (state & (kWaiterMask | kSignalMask)) | next;
+ }
+ CheckState(newstate);
+ if (state_.compare_exchange_weak(state, newstate,
+ std::memory_order_acq_rel)) {
+ if (!notifyAll && (signals < waiters))
+ return; // unblocked pre-wait thread
+ if ((state & kStackMask) == kStackMask) return;
+ Waiter* w = &waiters_[state & kStackMask];
+ if (!notifyAll) w->next.store(kStackMask, std::memory_order_relaxed);
+ Unpark(w);
+ return;
+ }
+ }
+ }
+
+ class Waiter {
+ friend class EventCount;
+ // Align to 128 byte boundary to prevent false sharing with other Waiter
+ // objects in the same vector.
+ EIGEN_ALIGN_TO_BOUNDARY(128) std::atomic<uint64_t> next;
+ std::mutex mu;
+ std::condition_variable cv;
+ uint64_t epoch = 0;
+ unsigned state = kNotSignaled;
+ enum {
+ kNotSignaled,
+ kWaiting,
+ kSignaled,
+ };
+ };
+
+ private:
+ // State_ layout:
+ // - low kWaiterBits is a stack of waiters committed wait
+ // (indexes in waiters_ array are used as stack elements,
+ // kStackMask means empty stack).
+ // - next kWaiterBits is count of waiters in prewait state.
+ // - next kWaiterBits is count of pending signals.
+ // - remaining bits are ABA counter for the stack.
+ // (stored in Waiter node and incremented on push).
+ static const uint64_t kWaiterBits = 14;
+ static const uint64_t kStackMask = (1ull << kWaiterBits) - 1;
+ static const uint64_t kWaiterShift = kWaiterBits;
+ static const uint64_t kWaiterMask = ((1ull << kWaiterBits) - 1)
+ << kWaiterShift;
+ static const uint64_t kWaiterInc = 1ull << kWaiterShift;
+ static const uint64_t kSignalShift = 2 * kWaiterBits;
+ static const uint64_t kSignalMask = ((1ull << kWaiterBits) - 1)
+ << kSignalShift;
+ static const uint64_t kSignalInc = 1ull << kSignalShift;
+ static const uint64_t kEpochShift = 3 * kWaiterBits;
+ static const uint64_t kEpochBits = 64 - kEpochShift;
+ static const uint64_t kEpochMask = ((1ull << kEpochBits) - 1) << kEpochShift;
+ static const uint64_t kEpochInc = 1ull << kEpochShift;
+ std::atomic<uint64_t> state_;
+ MaxSizeVector<Waiter>& waiters_;
+
+ static void CheckState(uint64_t state, bool waiter = false) {
+ static_assert(kEpochBits >= 20, "not enough bits to prevent ABA problem");
+ const uint64_t waiters = (state & kWaiterMask) >> kWaiterShift;
+ const uint64_t signals = (state & kSignalMask) >> kSignalShift;
+ eigen_plain_assert(waiters >= signals);
+ eigen_plain_assert(waiters < (1 << kWaiterBits) - 1);
+ eigen_plain_assert(!waiter || waiters > 0);
+ (void)waiters;
+ (void)signals;
+ }
+
+ void Park(Waiter* w) {
+ std::unique_lock<std::mutex> lock(w->mu);
+ while (w->state != Waiter::kSignaled) {
+ w->state = Waiter::kWaiting;
+ w->cv.wait(lock);
+ }
+ }
+
+ void Unpark(Waiter* w) {
+ for (Waiter* next; w; w = next) {
+ uint64_t wnext = w->next.load(std::memory_order_relaxed) & kStackMask;
+ next = wnext == kStackMask ? nullptr : &waiters_[wnext];
+ unsigned state;
+ {
+ std::unique_lock<std::mutex> lock(w->mu);
+ state = w->state;
+ w->state = Waiter::kSignaled;
+ }
+ // Avoid notifying if it wasn't waiting.
+ if (state == Waiter::kWaiting) w->cv.notify_one();
+ }
+ }
+
+ EventCount(const EventCount&) = delete;
+ void operator=(const EventCount&) = delete;
+};
+
+} // namespace Eigen
+
+#endif // EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H_
diff --git a/src/EigenUnsupported/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/src/EigenUnsupported/CXX11/src/ThreadPool/NonBlockingThreadPool.h
new file mode 100644
index 0000000..23a2b54
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/ThreadPool/NonBlockingThreadPool.h
@@ -0,0 +1,486 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
+#define EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
+
+namespace Eigen {
+
+template <typename Environment>
+class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
+ public:
+ typedef typename Environment::Task Task;
+ typedef RunQueue<Task, 1024> Queue;
+
+ ThreadPoolTempl(int num_threads, Environment env = Environment())
+ : ThreadPoolTempl(num_threads, true, env) {}
+
+ ThreadPoolTempl(int num_threads, bool allow_spinning,
+ Environment env = Environment())
+ : env_(env),
+ num_threads_(num_threads),
+ allow_spinning_(allow_spinning),
+ thread_data_(num_threads),
+ all_coprimes_(num_threads),
+ waiters_(num_threads),
+ global_steal_partition_(EncodePartition(0, num_threads_)),
+ blocked_(0),
+ spinning_(0),
+ done_(false),
+ cancelled_(false),
+ ec_(waiters_) {
+ waiters_.resize(num_threads_);
+ // Calculate coprimes of all numbers [1, num_threads].
+ // Coprimes are used for random walks over all threads in Steal
+ // and NonEmptyQueueIndex. Iteration is based on the fact that if we take
+ // a random starting thread index t and calculate num_threads - 1 subsequent
+ // indices as (t + coprime) % num_threads, we will cover all threads without
+ // repetitions (effectively getting a presudo-random permutation of thread
+ // indices).
+ eigen_plain_assert(num_threads_ < kMaxThreads);
+ for (int i = 1; i <= num_threads_; ++i) {
+ all_coprimes_.emplace_back(i);
+ ComputeCoprimes(i, &all_coprimes_.back());
+ }
+#ifndef EIGEN_THREAD_LOCAL
+ init_barrier_.reset(new Barrier(num_threads_));
+#endif
+ thread_data_.resize(num_threads_);
+ for (int i = 0; i < num_threads_; i++) {
+ SetStealPartition(i, EncodePartition(0, num_threads_));
+ thread_data_[i].thread.reset(
+ env_.CreateThread([this, i]() { WorkerLoop(i); }));
+ }
+#ifndef EIGEN_THREAD_LOCAL
+ // Wait for workers to initialize per_thread_map_. Otherwise we might race
+ // with them in Schedule or CurrentThreadId.
+ init_barrier_->Wait();
+#endif
+ }
+
+ ~ThreadPoolTempl() {
+ done_ = true;
+
+ // Now if all threads block without work, they will start exiting.
+ // But note that threads can continue to work arbitrary long,
+ // block, submit new work, unblock and otherwise live full life.
+ if (!cancelled_) {
+ ec_.Notify(true);
+ } else {
+ // Since we were cancelled, there might be entries in the queues.
+ // Empty them to prevent their destructor from asserting.
+ for (size_t i = 0; i < thread_data_.size(); i++) {
+ thread_data_[i].queue.Flush();
+ }
+ }
+ // Join threads explicitly (by destroying) to avoid destruction order within
+ // this class.
+ for (size_t i = 0; i < thread_data_.size(); ++i)
+ thread_data_[i].thread.reset();
+ }
+
+ void SetStealPartitions(const std::vector<std::pair<unsigned, unsigned>>& partitions) {
+ eigen_plain_assert(partitions.size() == static_cast<std::size_t>(num_threads_));
+
+ // Pass this information to each thread queue.
+ for (int i = 0; i < num_threads_; i++) {
+ const auto& pair = partitions[i];
+ unsigned start = pair.first, end = pair.second;
+ AssertBounds(start, end);
+ unsigned val = EncodePartition(start, end);
+ SetStealPartition(i, val);
+ }
+ }
+
+ void Schedule(std::function<void()> fn) EIGEN_OVERRIDE {
+ ScheduleWithHint(std::move(fn), 0, num_threads_);
+ }
+
+ void ScheduleWithHint(std::function<void()> fn, int start,
+ int limit) override {
+ Task t = env_.CreateTask(std::move(fn));
+ PerThread* pt = GetPerThread();
+ if (pt->pool == this) {
+ // Worker thread of this pool, push onto the thread's queue.
+ Queue& q = thread_data_[pt->thread_id].queue;
+ t = q.PushFront(std::move(t));
+ } else {
+ // A free-standing thread (or worker of another pool), push onto a random
+ // queue.
+ eigen_plain_assert(start < limit);
+ eigen_plain_assert(limit <= num_threads_);
+ int num_queues = limit - start;
+ int rnd = Rand(&pt->rand) % num_queues;
+ eigen_plain_assert(start + rnd < limit);
+ Queue& q = thread_data_[start + rnd].queue;
+ t = q.PushBack(std::move(t));
+ }
+ // Note: below we touch this after making w available to worker threads.
+ // Strictly speaking, this can lead to a racy-use-after-free. Consider that
+ // Schedule is called from a thread that is neither main thread nor a worker
+ // thread of this pool. Then, execution of w directly or indirectly
+ // completes overall computations, which in turn leads to destruction of
+ // this. We expect that such scenario is prevented by program, that is,
+ // this is kept alive while any threads can potentially be in Schedule.
+ if (!t.f) {
+ ec_.Notify(false);
+ } else {
+ env_.ExecuteTask(t); // Push failed, execute directly.
+ }
+ }
+
+ void Cancel() EIGEN_OVERRIDE {
+ cancelled_ = true;
+ done_ = true;
+
+ // Let each thread know it's been cancelled.
+#ifdef EIGEN_THREAD_ENV_SUPPORTS_CANCELLATION
+ for (size_t i = 0; i < thread_data_.size(); i++) {
+ thread_data_[i].thread->OnCancel();
+ }
+#endif
+
+ // Wake up the threads without work to let them exit on their own.
+ ec_.Notify(true);
+ }
+
+ int NumThreads() const EIGEN_FINAL { return num_threads_; }
+
+ int CurrentThreadId() const EIGEN_FINAL {
+ const PerThread* pt = const_cast<ThreadPoolTempl*>(this)->GetPerThread();
+ if (pt->pool == this) {
+ return pt->thread_id;
+ } else {
+ return -1;
+ }
+ }
+
+ private:
+ // Create a single atomic<int> that encodes start and limit information for
+ // each thread.
+ // We expect num_threads_ < 65536, so we can store them in a single
+ // std::atomic<unsigned>.
+ // Exposed publicly as static functions so that external callers can reuse
+ // this encode/decode logic for maintaining their own thread-safe copies of
+ // scheduling and steal domain(s).
+ static const int kMaxPartitionBits = 16;
+ static const int kMaxThreads = 1 << kMaxPartitionBits;
+
+ inline unsigned EncodePartition(unsigned start, unsigned limit) {
+ return (start << kMaxPartitionBits) | limit;
+ }
+
+ inline void DecodePartition(unsigned val, unsigned* start, unsigned* limit) {
+ *limit = val & (kMaxThreads - 1);
+ val >>= kMaxPartitionBits;
+ *start = val;
+ }
+
+ void AssertBounds(int start, int end) {
+ eigen_plain_assert(start >= 0);
+ eigen_plain_assert(start < end); // non-zero sized partition
+ eigen_plain_assert(end <= num_threads_);
+ }
+
+ inline void SetStealPartition(size_t i, unsigned val) {
+ thread_data_[i].steal_partition.store(val, std::memory_order_relaxed);
+ }
+
+ inline unsigned GetStealPartition(int i) {
+ return thread_data_[i].steal_partition.load(std::memory_order_relaxed);
+ }
+
+ void ComputeCoprimes(int N, MaxSizeVector<unsigned>* coprimes) {
+ for (int i = 1; i <= N; i++) {
+ unsigned a = i;
+ unsigned b = N;
+ // If GCD(a, b) == 1, then a and b are coprimes.
+ while (b != 0) {
+ unsigned tmp = a;
+ a = b;
+ b = tmp % b;
+ }
+ if (a == 1) {
+ coprimes->push_back(i);
+ }
+ }
+ }
+
+ typedef typename Environment::EnvThread Thread;
+
+ struct PerThread {
+ constexpr PerThread() : pool(NULL), rand(0), thread_id(-1) {}
+ ThreadPoolTempl* pool; // Parent pool, or null for normal threads.
+ uint64_t rand; // Random generator state.
+ int thread_id; // Worker thread index in pool.
+#ifndef EIGEN_THREAD_LOCAL
+ // Prevent false sharing.
+ char pad_[128];
+#endif
+ };
+
+ struct ThreadData {
+ constexpr ThreadData() : thread(), steal_partition(0), queue() {}
+ std::unique_ptr<Thread> thread;
+ std::atomic<unsigned> steal_partition;
+ Queue queue;
+ };
+
+ Environment env_;
+ const int num_threads_;
+ const bool allow_spinning_;
+ MaxSizeVector<ThreadData> thread_data_;
+ MaxSizeVector<MaxSizeVector<unsigned>> all_coprimes_;
+ MaxSizeVector<EventCount::Waiter> waiters_;
+ unsigned global_steal_partition_;
+ std::atomic<unsigned> blocked_;
+ std::atomic<bool> spinning_;
+ std::atomic<bool> done_;
+ std::atomic<bool> cancelled_;
+ EventCount ec_;
+#ifndef EIGEN_THREAD_LOCAL
+ std::unique_ptr<Barrier> init_barrier_;
+ std::mutex per_thread_map_mutex_; // Protects per_thread_map_.
+ std::unordered_map<uint64_t, std::unique_ptr<PerThread>> per_thread_map_;
+#endif
+
+ // Main worker thread loop.
+ void WorkerLoop(int thread_id) {
+#ifndef EIGEN_THREAD_LOCAL
+ std::unique_ptr<PerThread> new_pt(new PerThread());
+ per_thread_map_mutex_.lock();
+ bool insertOK = per_thread_map_.emplace(GlobalThreadIdHash(), std::move(new_pt)).second;
+ eigen_plain_assert(insertOK);
+ EIGEN_UNUSED_VARIABLE(insertOK);
+ per_thread_map_mutex_.unlock();
+ init_barrier_->Notify();
+ init_barrier_->Wait();
+#endif
+ PerThread* pt = GetPerThread();
+ pt->pool = this;
+ pt->rand = GlobalThreadIdHash();
+ pt->thread_id = thread_id;
+ Queue& q = thread_data_[thread_id].queue;
+ EventCount::Waiter* waiter = &waiters_[thread_id];
+ // TODO(dvyukov,rmlarsen): The time spent in NonEmptyQueueIndex() is
+ // proportional to num_threads_ and we assume that new work is scheduled at
+ // a constant rate, so we set spin_count to 5000 / num_threads_. The
+ // constant was picked based on a fair dice roll, tune it.
+ const int spin_count =
+ allow_spinning_ && num_threads_ > 0 ? 5000 / num_threads_ : 0;
+ if (num_threads_ == 1) {
+ // For num_threads_ == 1 there is no point in going through the expensive
+ // steal loop. Moreover, since NonEmptyQueueIndex() calls PopBack() on the
+ // victim queues it might reverse the order in which ops are executed
+ // compared to the order in which they are scheduled, which tends to be
+ // counter-productive for the types of I/O workloads the single thread
+ // pools tend to be used for.
+ while (!cancelled_) {
+ Task t = q.PopFront();
+ for (int i = 0; i < spin_count && !t.f; i++) {
+ if (!cancelled_.load(std::memory_order_relaxed)) {
+ t = q.PopFront();
+ }
+ }
+ if (!t.f) {
+ if (!WaitForWork(waiter, &t)) {
+ return;
+ }
+ }
+ if (t.f) {
+ env_.ExecuteTask(t);
+ }
+ }
+ } else {
+ while (!cancelled_) {
+ Task t = q.PopFront();
+ if (!t.f) {
+ t = LocalSteal();
+ if (!t.f) {
+ t = GlobalSteal();
+ if (!t.f) {
+ // Leave one thread spinning. This reduces latency.
+ if (allow_spinning_ && !spinning_ && !spinning_.exchange(true)) {
+ for (int i = 0; i < spin_count && !t.f; i++) {
+ if (!cancelled_.load(std::memory_order_relaxed)) {
+ t = GlobalSteal();
+ } else {
+ return;
+ }
+ }
+ spinning_ = false;
+ }
+ if (!t.f) {
+ if (!WaitForWork(waiter, &t)) {
+ return;
+ }
+ }
+ }
+ }
+ }
+ if (t.f) {
+ env_.ExecuteTask(t);
+ }
+ }
+ }
+ }
+
+ // Steal tries to steal work from other worker threads in the range [start,
+ // limit) in best-effort manner.
+ Task Steal(unsigned start, unsigned limit) {
+ PerThread* pt = GetPerThread();
+ const size_t size = limit - start;
+ unsigned r = Rand(&pt->rand);
+ // Reduce r into [0, size) range, this utilizes trick from
+ // https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
+ eigen_plain_assert(all_coprimes_[size - 1].size() < (1<<30));
+ unsigned victim = ((uint64_t)r * (uint64_t)size) >> 32;
+ unsigned index = ((uint64_t) all_coprimes_[size - 1].size() * (uint64_t)r) >> 32;
+ unsigned inc = all_coprimes_[size - 1][index];
+
+ for (unsigned i = 0; i < size; i++) {
+ eigen_plain_assert(start + victim < limit);
+ Task t = thread_data_[start + victim].queue.PopBack();
+ if (t.f) {
+ return t;
+ }
+ victim += inc;
+ if (victim >= size) {
+ victim -= size;
+ }
+ }
+ return Task();
+ }
+
+ // Steals work within threads belonging to the partition.
+ Task LocalSteal() {
+ PerThread* pt = GetPerThread();
+ unsigned partition = GetStealPartition(pt->thread_id);
+ // If thread steal partition is the same as global partition, there is no
+ // need to go through the steal loop twice.
+ if (global_steal_partition_ == partition) return Task();
+ unsigned start, limit;
+ DecodePartition(partition, &start, &limit);
+ AssertBounds(start, limit);
+
+ return Steal(start, limit);
+ }
+
+ // Steals work from any other thread in the pool.
+ Task GlobalSteal() {
+ return Steal(0, num_threads_);
+ }
+
+
+ // WaitForWork blocks until new work is available (returns true), or if it is
+ // time to exit (returns false). Can optionally return a task to execute in t
+ // (in such case t.f != nullptr on return).
+ bool WaitForWork(EventCount::Waiter* waiter, Task* t) {
+ eigen_plain_assert(!t->f);
+ // We already did best-effort emptiness check in Steal, so prepare for
+ // blocking.
+ ec_.Prewait();
+ // Now do a reliable emptiness check.
+ int victim = NonEmptyQueueIndex();
+ if (victim != -1) {
+ ec_.CancelWait();
+ if (cancelled_) {
+ return false;
+ } else {
+ *t = thread_data_[victim].queue.PopBack();
+ return true;
+ }
+ }
+ // Number of blocked threads is used as termination condition.
+ // If we are shutting down and all worker threads blocked without work,
+ // that's we are done.
+ blocked_++;
+ // TODO is blocked_ required to be unsigned?
+ if (done_ && blocked_ == static_cast<unsigned>(num_threads_)) {
+ ec_.CancelWait();
+ // Almost done, but need to re-check queues.
+ // Consider that all queues are empty and all worker threads are preempted
+ // right after incrementing blocked_ above. Now a free-standing thread
+ // submits work and calls destructor (which sets done_). If we don't
+ // re-check queues, we will exit leaving the work unexecuted.
+ if (NonEmptyQueueIndex() != -1) {
+ // Note: we must not pop from queues before we decrement blocked_,
+ // otherwise the following scenario is possible. Consider that instead
+ // of checking for emptiness we popped the only element from queues.
+ // Now other worker threads can start exiting, which is bad if the
+ // work item submits other work. So we just check emptiness here,
+ // which ensures that all worker threads exit at the same time.
+ blocked_--;
+ return true;
+ }
+ // Reached stable termination state.
+ ec_.Notify(true);
+ return false;
+ }
+ ec_.CommitWait(waiter);
+ blocked_--;
+ return true;
+ }
+
+ int NonEmptyQueueIndex() {
+ PerThread* pt = GetPerThread();
+ // We intentionally design NonEmptyQueueIndex to steal work from
+ // anywhere in the queue so threads don't block in WaitForWork() forever
+ // when all threads in their partition go to sleep. Steal is still local.
+ const size_t size = thread_data_.size();
+ unsigned r = Rand(&pt->rand);
+ unsigned inc = all_coprimes_[size - 1][r % all_coprimes_[size - 1].size()];
+ unsigned victim = r % size;
+ for (unsigned i = 0; i < size; i++) {
+ if (!thread_data_[victim].queue.Empty()) {
+ return victim;
+ }
+ victim += inc;
+ if (victim >= size) {
+ victim -= size;
+ }
+ }
+ return -1;
+ }
+
+ static EIGEN_STRONG_INLINE uint64_t GlobalThreadIdHash() {
+ return std::hash<std::thread::id>()(std::this_thread::get_id());
+ }
+
+ EIGEN_STRONG_INLINE PerThread* GetPerThread() {
+#ifndef EIGEN_THREAD_LOCAL
+ static PerThread dummy;
+ auto it = per_thread_map_.find(GlobalThreadIdHash());
+ if (it == per_thread_map_.end()) {
+ return &dummy;
+ } else {
+ return it->second.get();
+ }
+#else
+ EIGEN_THREAD_LOCAL PerThread per_thread_;
+ PerThread* pt = &per_thread_;
+ return pt;
+#endif
+ }
+
+ static EIGEN_STRONG_INLINE unsigned Rand(uint64_t* state) {
+ uint64_t current = *state;
+ // Update the internal state
+ *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
+ // Generate the random output (using the PCG-XSH-RS scheme)
+ return static_cast<unsigned>((current ^ (current >> 22)) >>
+ (22 + (current >> 61)));
+ }
+};
+
+typedef ThreadPoolTempl<StlThreadEnvironment> ThreadPool;
+
+} // namespace Eigen
+
+#endif // EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
diff --git a/src/EigenUnsupported/CXX11/src/ThreadPool/RunQueue.h b/src/EigenUnsupported/CXX11/src/ThreadPool/RunQueue.h
new file mode 100644
index 0000000..b572ebc
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/ThreadPool/RunQueue.h
@@ -0,0 +1,236 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_
+#define EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_
+
+namespace Eigen {
+
+// RunQueue is a fixed-size, partially non-blocking deque or Work items.
+// Operations on front of the queue must be done by a single thread (owner),
+// operations on back of the queue can be done by multiple threads concurrently.
+//
+// Algorithm outline:
+// All remote threads operating on the queue back are serialized by a mutex.
+// This ensures that at most two threads access state: owner and one remote
+// thread (Size aside). The algorithm ensures that the occupied region of the
+// underlying array is logically continuous (can wraparound, but no stray
+// occupied elements). Owner operates on one end of this region, remote thread
+// operates on the other end. Synchronization between these threads
+// (potential consumption of the last element and take up of the last empty
+// element) happens by means of state variable in each element. States are:
+// empty, busy (in process of insertion of removal) and ready. Threads claim
+// elements (empty->busy and ready->busy transitions) by means of a CAS
+// operation. The finishing transition (busy->empty and busy->ready) are done
+// with plain store as the element is exclusively owned by the current thread.
+//
+// Note: we could permit only pointers as elements, then we would not need
+// separate state variable as null/non-null pointer value would serve as state,
+// but that would require malloc/free per operation for large, complex values
+// (and this is designed to store std::function<()>).
+template <typename Work, unsigned kSize>
+class RunQueue {
+ public:
+ RunQueue() : front_(0), back_(0) {
+ // require power-of-two for fast masking
+ eigen_plain_assert((kSize & (kSize - 1)) == 0);
+ eigen_plain_assert(kSize > 2); // why would you do this?
+ eigen_plain_assert(kSize <= (64 << 10)); // leave enough space for counter
+ for (unsigned i = 0; i < kSize; i++)
+ array_[i].state.store(kEmpty, std::memory_order_relaxed);
+ }
+
+ ~RunQueue() { eigen_plain_assert(Size() == 0); }
+
+ // PushFront inserts w at the beginning of the queue.
+ // If queue is full returns w, otherwise returns default-constructed Work.
+ Work PushFront(Work w) {
+ unsigned front = front_.load(std::memory_order_relaxed);
+ Elem* e = &array_[front & kMask];
+ uint8_t s = e->state.load(std::memory_order_relaxed);
+ if (s != kEmpty ||
+ !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
+ return w;
+ front_.store(front + 1 + (kSize << 1), std::memory_order_relaxed);
+ e->w = std::move(w);
+ e->state.store(kReady, std::memory_order_release);
+ return Work();
+ }
+
+ // PopFront removes and returns the first element in the queue.
+ // If the queue was empty returns default-constructed Work.
+ Work PopFront() {
+ unsigned front = front_.load(std::memory_order_relaxed);
+ Elem* e = &array_[(front - 1) & kMask];
+ uint8_t s = e->state.load(std::memory_order_relaxed);
+ if (s != kReady ||
+ !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
+ return Work();
+ Work w = std::move(e->w);
+ e->state.store(kEmpty, std::memory_order_release);
+ front = ((front - 1) & kMask2) | (front & ~kMask2);
+ front_.store(front, std::memory_order_relaxed);
+ return w;
+ }
+
+ // PushBack adds w at the end of the queue.
+ // If queue is full returns w, otherwise returns default-constructed Work.
+ Work PushBack(Work w) {
+ std::unique_lock<std::mutex> lock(mutex_);
+ unsigned back = back_.load(std::memory_order_relaxed);
+ Elem* e = &array_[(back - 1) & kMask];
+ uint8_t s = e->state.load(std::memory_order_relaxed);
+ if (s != kEmpty ||
+ !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
+ return w;
+ back = ((back - 1) & kMask2) | (back & ~kMask2);
+ back_.store(back, std::memory_order_relaxed);
+ e->w = std::move(w);
+ e->state.store(kReady, std::memory_order_release);
+ return Work();
+ }
+
+ // PopBack removes and returns the last elements in the queue.
+ Work PopBack() {
+ if (Empty()) return Work();
+ std::unique_lock<std::mutex> lock(mutex_);
+ unsigned back = back_.load(std::memory_order_relaxed);
+ Elem* e = &array_[back & kMask];
+ uint8_t s = e->state.load(std::memory_order_relaxed);
+ if (s != kReady ||
+ !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
+ return Work();
+ Work w = std::move(e->w);
+ e->state.store(kEmpty, std::memory_order_release);
+ back_.store(back + 1 + (kSize << 1), std::memory_order_relaxed);
+ return w;
+ }
+
+ // PopBackHalf removes and returns half last elements in the queue.
+ // Returns number of elements removed.
+ unsigned PopBackHalf(std::vector<Work>* result) {
+ if (Empty()) return 0;
+ std::unique_lock<std::mutex> lock(mutex_);
+ unsigned back = back_.load(std::memory_order_relaxed);
+ unsigned size = Size();
+ unsigned mid = back;
+ if (size > 1) mid = back + (size - 1) / 2;
+ unsigned n = 0;
+ unsigned start = 0;
+ for (; static_cast<int>(mid - back) >= 0; mid--) {
+ Elem* e = &array_[mid & kMask];
+ uint8_t s = e->state.load(std::memory_order_relaxed);
+ if (n == 0) {
+ if (s != kReady || !e->state.compare_exchange_strong(
+ s, kBusy, std::memory_order_acquire))
+ continue;
+ start = mid;
+ } else {
+ // Note: no need to store temporal kBusy, we exclusively own these
+ // elements.
+ eigen_plain_assert(s == kReady);
+ }
+ result->push_back(std::move(e->w));
+ e->state.store(kEmpty, std::memory_order_release);
+ n++;
+ }
+ if (n != 0)
+ back_.store(start + 1 + (kSize << 1), std::memory_order_relaxed);
+ return n;
+ }
+
+ // Size returns current queue size.
+ // Can be called by any thread at any time.
+ unsigned Size() const { return SizeOrNotEmpty<true>(); }
+
+ // Empty tests whether container is empty.
+ // Can be called by any thread at any time.
+ bool Empty() const { return SizeOrNotEmpty<false>() == 0; }
+
+ // Delete all the elements from the queue.
+ void Flush() {
+ while (!Empty()) {
+ PopFront();
+ }
+ }
+
+ private:
+ static const unsigned kMask = kSize - 1;
+ static const unsigned kMask2 = (kSize << 1) - 1;
+ struct Elem {
+ std::atomic<uint8_t> state;
+ Work w;
+ };
+ enum {
+ kEmpty,
+ kBusy,
+ kReady,
+ };
+ std::mutex mutex_;
+ // Low log(kSize) + 1 bits in front_ and back_ contain rolling index of
+ // front/back, respectively. The remaining bits contain modification counters
+ // that are incremented on Push operations. This allows us to (1) distinguish
+ // between empty and full conditions (if we would use log(kSize) bits for
+ // position, these conditions would be indistinguishable); (2) obtain
+ // consistent snapshot of front_/back_ for Size operation using the
+ // modification counters.
+ std::atomic<unsigned> front_;
+ std::atomic<unsigned> back_;
+ Elem array_[kSize];
+
+ // SizeOrNotEmpty returns current queue size; if NeedSizeEstimate is false,
+ // only whether the size is 0 is guaranteed to be correct.
+ // Can be called by any thread at any time.
+ template<bool NeedSizeEstimate>
+ unsigned SizeOrNotEmpty() const {
+ // Emptiness plays critical role in thread pool blocking. So we go to great
+ // effort to not produce false positives (claim non-empty queue as empty).
+ unsigned front = front_.load(std::memory_order_acquire);
+ for (;;) {
+ // Capture a consistent snapshot of front/tail.
+ unsigned back = back_.load(std::memory_order_acquire);
+ unsigned front1 = front_.load(std::memory_order_relaxed);
+ if (front != front1) {
+ front = front1;
+ std::atomic_thread_fence(std::memory_order_acquire);
+ continue;
+ }
+ if (NeedSizeEstimate) {
+ return CalculateSize(front, back);
+ } else {
+ // This value will be 0 if the queue is empty, and undefined otherwise.
+ unsigned maybe_zero = ((front ^ back) & kMask2);
+ // Queue size estimate must agree with maybe zero check on the queue
+ // empty/non-empty state.
+ eigen_assert((CalculateSize(front, back) == 0) == (maybe_zero == 0));
+ return maybe_zero;
+ }
+ }
+ }
+
+ EIGEN_ALWAYS_INLINE
+ unsigned CalculateSize(unsigned front, unsigned back) const {
+ int size = (front & kMask2) - (back & kMask2);
+ // Fix overflow.
+ if (size < 0) size += 2 * kSize;
+ // Order of modification in push/pop is crafted to make the queue look
+ // larger than it is during concurrent modifications. E.g. push can
+ // increment size before the corresponding pop has decremented it.
+ // So the computed size can be up to kSize + 1, fix it.
+ if (size > static_cast<int>(kSize)) size = kSize;
+ return static_cast<unsigned>(size);
+ }
+
+ RunQueue(const RunQueue&) = delete;
+ void operator=(const RunQueue&) = delete;
+};
+
+} // namespace Eigen
+
+#endif // EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_
diff --git a/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadCancel.h b/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadCancel.h
new file mode 100644
index 0000000..a05685f
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadCancel.h
@@ -0,0 +1,23 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H
+#define EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H
+
+// Try to come up with a portable way to cancel a thread
+#if EIGEN_OS_GNULINUX
+ #define EIGEN_THREAD_CANCEL(t) \
+ pthread_cancel(t.native_handle());
+ #define EIGEN_SUPPORTS_THREAD_CANCELLATION 1
+#else
+#define EIGEN_THREAD_CANCEL(t)
+#endif
+
+
+#endif // EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H
diff --git a/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadEnvironment.h b/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadEnvironment.h
new file mode 100644
index 0000000..d94a064
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadEnvironment.h
@@ -0,0 +1,40 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H
+#define EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H
+
+namespace Eigen {
+
+struct StlThreadEnvironment {
+ struct Task {
+ std::function<void()> f;
+ };
+
+ // EnvThread constructor must start the thread,
+ // destructor must join the thread.
+ class EnvThread {
+ public:
+ EnvThread(std::function<void()> f) : thr_(std::move(f)) {}
+ ~EnvThread() { thr_.join(); }
+ // This function is called when the threadpool is cancelled.
+ void OnCancel() { }
+
+ private:
+ std::thread thr_;
+ };
+
+ EnvThread* CreateThread(std::function<void()> f) { return new EnvThread(std::move(f)); }
+ Task CreateTask(std::function<void()> f) { return Task{std::move(f)}; }
+ void ExecuteTask(const Task& t) { t.f(); }
+};
+
+} // namespace Eigen
+
+#endif // EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H
diff --git a/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadLocal.h b/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadLocal.h
new file mode 100644
index 0000000..4e68474
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadLocal.h
@@ -0,0 +1,301 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
+#define EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
+
+#ifdef EIGEN_AVOID_THREAD_LOCAL
+
+#ifdef EIGEN_THREAD_LOCAL
+#undef EIGEN_THREAD_LOCAL
+#endif
+
+#else
+
+#if EIGEN_MAX_CPP_VER >= 11 && \
+ ((EIGEN_COMP_GNUC && EIGEN_GNUC_AT_LEAST(4, 8)) || \
+ __has_feature(cxx_thread_local) || \
+ (EIGEN_COMP_MSVC >= 1900) )
+#define EIGEN_THREAD_LOCAL static thread_local
+#endif
+
+// Disable TLS for Apple and Android builds with older toolchains.
+#if defined(__APPLE__)
+// Included for TARGET_OS_IPHONE, __IPHONE_OS_VERSION_MIN_REQUIRED,
+// __IPHONE_8_0.
+#include <Availability.h>
+#include <TargetConditionals.h>
+#endif
+// Checks whether C++11's `thread_local` storage duration specifier is
+// supported.
+#if defined(__apple_build_version__) && \
+ ((__apple_build_version__ < 8000042) || \
+ (TARGET_OS_IPHONE && __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_9_0))
+// Notes: Xcode's clang did not support `thread_local` until version
+// 8, and even then not for all iOS < 9.0.
+#undef EIGEN_THREAD_LOCAL
+
+#elif defined(__ANDROID__) && EIGEN_COMP_CLANG
+// There are platforms for which TLS should not be used even though the compiler
+// makes it seem like it's supported (Android NDK < r12b for example).
+// This is primarily because of linker problems and toolchain misconfiguration:
+// TLS isn't supported until NDK r12b per
+// https://developer.android.com/ndk/downloads/revision_history.html
+// Since NDK r16, `__NDK_MAJOR__` and `__NDK_MINOR__` are defined in
+// <android/ndk-version.h>. For NDK < r16, users should define these macros,
+// e.g. `-D__NDK_MAJOR__=11 -D__NKD_MINOR__=0` for NDK r11.
+#if __has_include(<android/ndk-version.h>)
+#include <android/ndk-version.h>
+#endif // __has_include(<android/ndk-version.h>)
+#if defined(__ANDROID__) && defined(__clang__) && defined(__NDK_MAJOR__) && \
+ defined(__NDK_MINOR__) && \
+ ((__NDK_MAJOR__ < 12) || ((__NDK_MAJOR__ == 12) && (__NDK_MINOR__ < 1)))
+#undef EIGEN_THREAD_LOCAL
+#endif
+#endif // defined(__ANDROID__) && defined(__clang__)
+
+#endif // EIGEN_AVOID_THREAD_LOCAL
+
+namespace Eigen {
+
+namespace internal {
+template <typename T>
+struct ThreadLocalNoOpInitialize {
+ void operator()(T&) const {}
+};
+
+template <typename T>
+struct ThreadLocalNoOpRelease {
+ void operator()(T&) const {}
+};
+
+} // namespace internal
+
+// Thread local container for elements of type T, that does not use thread local
+// storage. As long as the number of unique threads accessing this storage
+// is smaller than `capacity_`, it is lock-free and wait-free. Otherwise it will
+// use a mutex for synchronization.
+//
+// Type `T` has to be default constructible, and by default each thread will get
+// a default constructed value. It is possible to specify custom `initialize`
+// callable, that will be called lazily from each thread accessing this object,
+// and will be passed a default initialized object of type `T`. Also it's
+// possible to pass a custom `release` callable, that will be invoked before
+// calling ~T().
+//
+// Example:
+//
+// struct Counter {
+// int value = 0;
+// }
+//
+// Eigen::ThreadLocal<Counter> counter(10);
+//
+// // Each thread will have access to it's own counter object.
+// Counter& cnt = counter.local();
+// cnt++;
+//
+// WARNING: Eigen::ThreadLocal uses the OS-specific value returned by
+// std::this_thread::get_id() to identify threads. This value is not guaranteed
+// to be unique except for the life of the thread. A newly created thread may
+// get an OS-specific ID equal to that of an already destroyed thread.
+//
+// Somewhat similar to TBB thread local storage, with similar restrictions:
+// https://www.threadingbuildingblocks.org/docs/help/reference/thread_local_storage/enumerable_thread_specific_cls.html
+//
+template <typename T,
+ typename Initialize = internal::ThreadLocalNoOpInitialize<T>,
+ typename Release = internal::ThreadLocalNoOpRelease<T>>
+class ThreadLocal {
+ // We preallocate default constructed elements in MaxSizedVector.
+ static_assert(std::is_default_constructible<T>::value,
+ "ThreadLocal data type must be default constructible");
+
+ public:
+ explicit ThreadLocal(int capacity)
+ : ThreadLocal(capacity, internal::ThreadLocalNoOpInitialize<T>(),
+ internal::ThreadLocalNoOpRelease<T>()) {}
+
+ ThreadLocal(int capacity, Initialize initialize)
+ : ThreadLocal(capacity, std::move(initialize),
+ internal::ThreadLocalNoOpRelease<T>()) {}
+
+ ThreadLocal(int capacity, Initialize initialize, Release release)
+ : initialize_(std::move(initialize)),
+ release_(std::move(release)),
+ capacity_(capacity),
+ data_(capacity_),
+ ptr_(capacity_),
+ filled_records_(0) {
+ eigen_assert(capacity_ >= 0);
+ data_.resize(capacity_);
+ for (int i = 0; i < capacity_; ++i) {
+ ptr_.emplace_back(nullptr);
+ }
+ }
+
+ T& local() {
+ std::thread::id this_thread = std::this_thread::get_id();
+ if (capacity_ == 0) return SpilledLocal(this_thread);
+
+ std::size_t h = std::hash<std::thread::id>()(this_thread);
+ const int start_idx = h % capacity_;
+
+ // NOTE: From the definition of `std::this_thread::get_id()` it is
+ // guaranteed that we never can have concurrent insertions with the same key
+ // to our hash-map like data structure. If we didn't find an element during
+ // the initial traversal, it's guaranteed that no one else could have
+ // inserted it while we are in this function. This allows to massively
+ // simplify out lock-free insert-only hash map.
+
+ // Check if we already have an element for `this_thread`.
+ int idx = start_idx;
+ while (ptr_[idx].load() != nullptr) {
+ ThreadIdAndValue& record = *(ptr_[idx].load());
+ if (record.thread_id == this_thread) return record.value;
+
+ idx += 1;
+ if (idx >= capacity_) idx -= capacity_;
+ if (idx == start_idx) break;
+ }
+
+ // If we are here, it means that we found an insertion point in lookup
+ // table at `idx`, or we did a full traversal and table is full.
+
+ // If lock-free storage is full, fallback on mutex.
+ if (filled_records_.load() >= capacity_) return SpilledLocal(this_thread);
+
+ // We double check that we still have space to insert an element into a lock
+ // free storage. If old value in `filled_records_` is larger than the
+ // records capacity, it means that some other thread added an element while
+ // we were traversing lookup table.
+ int insertion_index =
+ filled_records_.fetch_add(1, std::memory_order_relaxed);
+ if (insertion_index >= capacity_) return SpilledLocal(this_thread);
+
+ // At this point it's guaranteed that we can access to
+ // data_[insertion_index_] without a data race.
+ data_[insertion_index].thread_id = this_thread;
+ initialize_(data_[insertion_index].value);
+
+ // That's the pointer we'll put into the lookup table.
+ ThreadIdAndValue* inserted = &data_[insertion_index];
+
+ // We'll use nullptr pointer to ThreadIdAndValue in a compare-and-swap loop.
+ ThreadIdAndValue* empty = nullptr;
+
+ // Now we have to find an insertion point into the lookup table. We start
+ // from the `idx` that was identified as an insertion point above, it's
+ // guaranteed that we will have an empty record somewhere in a lookup table
+ // (because we created a record in the `data_`).
+ const int insertion_idx = idx;
+
+ do {
+ // Always start search from the original insertion candidate.
+ idx = insertion_idx;
+ while (ptr_[idx].load() != nullptr) {
+ idx += 1;
+ if (idx >= capacity_) idx -= capacity_;
+ // If we did a full loop, it means that we don't have any free entries
+ // in the lookup table, and this means that something is terribly wrong.
+ eigen_assert(idx != insertion_idx);
+ }
+ // Atomic CAS of the pointer guarantees that any other thread, that will
+ // follow this pointer will see all the mutations in the `data_`.
+ } while (!ptr_[idx].compare_exchange_weak(empty, inserted));
+
+ return inserted->value;
+ }
+
+ // WARN: It's not thread safe to call it concurrently with `local()`.
+ void ForEach(std::function<void(std::thread::id, T&)> f) {
+ // Reading directly from `data_` is unsafe, because only CAS to the
+ // record in `ptr_` makes all changes visible to other threads.
+ for (auto& ptr : ptr_) {
+ ThreadIdAndValue* record = ptr.load();
+ if (record == nullptr) continue;
+ f(record->thread_id, record->value);
+ }
+
+ // We did not spill into the map based storage.
+ if (filled_records_.load(std::memory_order_relaxed) < capacity_) return;
+
+ // Adds a happens before edge from the last call to SpilledLocal().
+ std::unique_lock<std::mutex> lock(mu_);
+ for (auto& kv : per_thread_map_) {
+ f(kv.first, kv.second);
+ }
+ }
+
+ // WARN: It's not thread safe to call it concurrently with `local()`.
+ ~ThreadLocal() {
+ // Reading directly from `data_` is unsafe, because only CAS to the record
+ // in `ptr_` makes all changes visible to other threads.
+ for (auto& ptr : ptr_) {
+ ThreadIdAndValue* record = ptr.load();
+ if (record == nullptr) continue;
+ release_(record->value);
+ }
+
+ // We did not spill into the map based storage.
+ if (filled_records_.load(std::memory_order_relaxed) < capacity_) return;
+
+ // Adds a happens before edge from the last call to SpilledLocal().
+ std::unique_lock<std::mutex> lock(mu_);
+ for (auto& kv : per_thread_map_) {
+ release_(kv.second);
+ }
+ }
+
+ private:
+ struct ThreadIdAndValue {
+ std::thread::id thread_id;
+ T value;
+ };
+
+ // Use unordered map guarded by a mutex when lock free storage is full.
+ T& SpilledLocal(std::thread::id this_thread) {
+ std::unique_lock<std::mutex> lock(mu_);
+
+ auto it = per_thread_map_.find(this_thread);
+ if (it == per_thread_map_.end()) {
+ auto result = per_thread_map_.emplace(this_thread, T());
+ eigen_assert(result.second);
+ initialize_((*result.first).second);
+ return (*result.first).second;
+ } else {
+ return it->second;
+ }
+ }
+
+ Initialize initialize_;
+ Release release_;
+ const int capacity_;
+
+ // Storage that backs lock-free lookup table `ptr_`. Records stored in this
+ // storage contiguously starting from index 0.
+ MaxSizeVector<ThreadIdAndValue> data_;
+
+ // Atomic pointers to the data stored in `data_`. Used as a lookup table for
+ // linear probing hash map (https://en.wikipedia.org/wiki/Linear_probing).
+ MaxSizeVector<std::atomic<ThreadIdAndValue*>> ptr_;
+
+ // Number of records stored in the `data_`.
+ std::atomic<int> filled_records_;
+
+ // We fallback on per thread map if lock-free storage is full. In practice
+ // this should never happen, if `capacity_` is a reasonable estimate of the
+ // number of threads running in a system.
+ std::mutex mu_; // Protects per_thread_map_.
+ std::unordered_map<std::thread::id, T> per_thread_map_;
+};
+
+} // namespace Eigen
+
+#endif // EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
diff --git a/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadPoolInterface.h b/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadPoolInterface.h
new file mode 100644
index 0000000..25030dc
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadPoolInterface.h
@@ -0,0 +1,48 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H
+#define EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H
+
+namespace Eigen {
+
+// This defines an interface that ThreadPoolDevice can take to use
+// custom thread pools underneath.
+class ThreadPoolInterface {
+ public:
+ // Submits a closure to be run by a thread in the pool.
+ virtual void Schedule(std::function<void()> fn) = 0;
+
+ // Submits a closure to be run by threads in the range [start, end) in the
+ // pool.
+ virtual void ScheduleWithHint(std::function<void()> fn, int /*start*/,
+ int /*end*/) {
+ // Just defer to Schedule in case sub-classes aren't interested in
+ // overriding this functionality.
+ Schedule(fn);
+ }
+
+ // If implemented, stop processing the closures that have been enqueued.
+ // Currently running closures may still be processed.
+ // If not implemented, does nothing.
+ virtual void Cancel() {}
+
+ // Returns the number of threads in the pool.
+ virtual int NumThreads() const = 0;
+
+ // Returns a logical thread index between 0 and NumThreads() - 1 if called
+ // from one of the threads in the pool. Returns -1 otherwise.
+ virtual int CurrentThreadId() const = 0;
+
+ virtual ~ThreadPoolInterface() {}
+};
+
+} // namespace Eigen
+
+#endif // EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H
diff --git a/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadYield.h b/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadYield.h
new file mode 100644
index 0000000..a859c7b
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadYield.h
@@ -0,0 +1,20 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H
+#define EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H
+
+// Try to come up with a portable way to yield
+#if EIGEN_COMP_GNUC && EIGEN_GNUC_AT_MOST(4, 7)
+#define EIGEN_THREAD_YIELD() sched_yield()
+#else
+#define EIGEN_THREAD_YIELD() std::this_thread::yield()
+#endif
+
+#endif // EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H
diff --git a/src/EigenUnsupported/CXX11/src/util/CXX11Meta.h b/src/EigenUnsupported/CXX11/src/util/CXX11Meta.h
new file mode 100644
index 0000000..149ceaf
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/util/CXX11Meta.h
@@ -0,0 +1,537 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11META_H
+#define EIGEN_CXX11META_H
+
+#include <vector>
+#include "EmulateArray.h"
+
+#include "CXX11Workarounds.h"
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal
+ * \file CXX11/util/CXX11Meta.h
+ * This file contains generic metaprogramming classes which are not specifically related to Eigen.
+ * This file expands upon Core/util/Meta.h and adds support for C++11 specific features.
+ */
+
+template<typename... tt>
+struct type_list { constexpr static int count = sizeof...(tt); };
+
+template<typename t, typename... tt>
+struct type_list<t, tt...> { constexpr static int count = sizeof...(tt) + 1; typedef t first_type; };
+
+template<typename T, T... nn>
+struct numeric_list { constexpr static std::size_t count = sizeof...(nn); };
+
+template<typename T, T n, T... nn>
+struct numeric_list<T, n, nn...> { static const std::size_t count = sizeof...(nn) + 1; const static T first_value = n; };
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+/* numeric list constructors
+ *
+ * equivalencies:
+ * constructor result
+ * typename gen_numeric_list<int, 5>::type numeric_list<int, 0,1,2,3,4>
+ * typename gen_numeric_list_reversed<int, 5>::type numeric_list<int, 4,3,2,1,0>
+ * typename gen_numeric_list_swapped_pair<int, 5,1,2>::type numeric_list<int, 0,2,1,3,4>
+ * typename gen_numeric_list_repeated<int, 0, 5>::type numeric_list<int, 0,0,0,0,0>
+ */
+
+template<typename T, std::size_t n, T start = 0, T... ii> struct gen_numeric_list : gen_numeric_list<T, n-1, start, start + n-1, ii...> {};
+template<typename T, T start, T... ii> struct gen_numeric_list<T, 0, start, ii...> { typedef numeric_list<T, ii...> type; };
+
+template<typename T, std::size_t n, T start = 0, T... ii> struct gen_numeric_list_reversed : gen_numeric_list_reversed<T, n-1, start, ii..., start + n-1> {};
+template<typename T, T start, T... ii> struct gen_numeric_list_reversed<T, 0, start, ii...> { typedef numeric_list<T, ii...> type; };
+
+template<typename T, std::size_t n, T a, T b, T start = 0, T... ii> struct gen_numeric_list_swapped_pair : gen_numeric_list_swapped_pair<T, n-1, a, b, start, (start + n-1) == a ? b : ((start + n-1) == b ? a : (start + n-1)), ii...> {};
+template<typename T, T a, T b, T start, T... ii> struct gen_numeric_list_swapped_pair<T, 0, a, b, start, ii...> { typedef numeric_list<T, ii...> type; };
+
+template<typename T, std::size_t n, T V, T... nn> struct gen_numeric_list_repeated : gen_numeric_list_repeated<T, n-1, V, V, nn...> {};
+template<typename T, T V, T... nn> struct gen_numeric_list_repeated<T, 0, V, nn...> { typedef numeric_list<T, nn...> type; };
+
+/* list manipulation: concatenate */
+
+template<class a, class b> struct concat;
+
+template<typename... as, typename... bs> struct concat<type_list<as...>, type_list<bs...>> { typedef type_list<as..., bs...> type; };
+template<typename T, T... as, T... bs> struct concat<numeric_list<T, as...>, numeric_list<T, bs...> > { typedef numeric_list<T, as..., bs...> type; };
+
+template<typename... p> struct mconcat;
+template<typename a> struct mconcat<a> { typedef a type; };
+template<typename a, typename b> struct mconcat<a, b> : concat<a, b> {};
+template<typename a, typename b, typename... cs> struct mconcat<a, b, cs...> : concat<a, typename mconcat<b, cs...>::type> {};
+
+/* list manipulation: extract slices */
+
+template<int n, typename x> struct take;
+template<int n, typename a, typename... as> struct take<n, type_list<a, as...>> : concat<type_list<a>, typename take<n-1, type_list<as...>>::type> {};
+template<int n> struct take<n, type_list<>> { typedef type_list<> type; };
+template<typename a, typename... as> struct take<0, type_list<a, as...>> { typedef type_list<> type; };
+template<> struct take<0, type_list<>> { typedef type_list<> type; };
+
+template<typename T, int n, T a, T... as> struct take<n, numeric_list<T, a, as...>> : concat<numeric_list<T, a>, typename take<n-1, numeric_list<T, as...>>::type> {};
+template<typename T, int n> struct take<n, numeric_list<T>> { typedef numeric_list<T> type; };
+template<typename T, T a, T... as> struct take<0, numeric_list<T, a, as...>> { typedef numeric_list<T> type; };
+template<typename T> struct take<0, numeric_list<T>> { typedef numeric_list<T> type; };
+
+template<typename T, int n, T... ii> struct h_skip_helper_numeric;
+template<typename T, int n, T i, T... ii> struct h_skip_helper_numeric<T, n, i, ii...> : h_skip_helper_numeric<T, n-1, ii...> {};
+template<typename T, T i, T... ii> struct h_skip_helper_numeric<T, 0, i, ii...> { typedef numeric_list<T, i, ii...> type; };
+template<typename T, int n> struct h_skip_helper_numeric<T, n> { typedef numeric_list<T> type; };
+template<typename T> struct h_skip_helper_numeric<T, 0> { typedef numeric_list<T> type; };
+
+template<int n, typename... tt> struct h_skip_helper_type;
+template<int n, typename t, typename... tt> struct h_skip_helper_type<n, t, tt...> : h_skip_helper_type<n-1, tt...> {};
+template<typename t, typename... tt> struct h_skip_helper_type<0, t, tt...> { typedef type_list<t, tt...> type; };
+template<int n> struct h_skip_helper_type<n> { typedef type_list<> type; };
+template<> struct h_skip_helper_type<0> { typedef type_list<> type; };
+#endif //not EIGEN_PARSED_BY_DOXYGEN
+
+template<int n>
+struct h_skip {
+ template<typename T, T... ii>
+ constexpr static EIGEN_STRONG_INLINE typename h_skip_helper_numeric<T, n, ii...>::type helper(numeric_list<T, ii...>) { return typename h_skip_helper_numeric<T, n, ii...>::type(); }
+ template<typename... tt>
+ constexpr static EIGEN_STRONG_INLINE typename h_skip_helper_type<n, tt...>::type helper(type_list<tt...>) { return typename h_skip_helper_type<n, tt...>::type(); }
+};
+
+template<int n, typename a> struct skip { typedef decltype(h_skip<n>::helper(a())) type; };
+
+template<int start, int count, typename a> struct slice : take<count, typename skip<start, a>::type> {};
+
+/* list manipulation: retrieve single element from list */
+
+template<int n, typename x> struct get;
+
+template<int n, typename a, typename... as> struct get<n, type_list<a, as...>> : get<n-1, type_list<as...>> {};
+template<typename a, typename... as> struct get<0, type_list<a, as...>> { typedef a type; };
+
+template<typename T, int n, T a, T... as> struct get<n, numeric_list<T, a, as...>> : get<n-1, numeric_list<T, as...>> {};
+template<typename T, T a, T... as> struct get<0, numeric_list<T, a, as...>> { constexpr static T value = a; };
+
+template<std::size_t n, typename T, T a, T... as> constexpr T array_get(const numeric_list<T, a, as...>&) {
+ return get<(int)n, numeric_list<T, a, as...>>::value;
+}
+
+/* always get type, regardless of dummy; good for parameter pack expansion */
+
+template<typename T, T dummy, typename t> struct id_numeric { typedef t type; };
+template<typename dummy, typename t> struct id_type { typedef t type; };
+
+/* equality checking, flagged version */
+
+template<typename a, typename b> struct is_same_gf : is_same<a, b> { constexpr static int global_flags = 0; };
+
+/* apply_op to list */
+
+template<
+ bool from_left, // false
+ template<typename, typename> class op,
+ typename additional_param,
+ typename... values
+>
+struct h_apply_op_helper { typedef type_list<typename op<values, additional_param>::type...> type; };
+template<
+ template<typename, typename> class op,
+ typename additional_param,
+ typename... values
+>
+struct h_apply_op_helper<true, op, additional_param, values...> { typedef type_list<typename op<additional_param, values>::type...> type; };
+
+template<
+ bool from_left,
+ template<typename, typename> class op,
+ typename additional_param
+>
+struct h_apply_op
+{
+ template<typename... values>
+ constexpr static typename h_apply_op_helper<from_left, op, additional_param, values...>::type helper(type_list<values...>)
+ { return typename h_apply_op_helper<from_left, op, additional_param, values...>::type(); }
+};
+
+template<
+ template<typename, typename> class op,
+ typename additional_param,
+ typename a
+>
+struct apply_op_from_left { typedef decltype(h_apply_op<true, op, additional_param>::helper(a())) type; };
+
+template<
+ template<typename, typename> class op,
+ typename additional_param,
+ typename a
+>
+struct apply_op_from_right { typedef decltype(h_apply_op<false, op, additional_param>::helper(a())) type; };
+
+/* see if an element is in a list */
+
+template<
+ template<typename, typename> class test,
+ typename check_against,
+ typename h_list,
+ bool last_check_positive = false
+>
+struct contained_in_list;
+
+template<
+ template<typename, typename> class test,
+ typename check_against,
+ typename h_list
+>
+struct contained_in_list<test, check_against, h_list, true>
+{
+ constexpr static bool value = true;
+};
+
+template<
+ template<typename, typename> class test,
+ typename check_against,
+ typename a,
+ typename... as
+>
+struct contained_in_list<test, check_against, type_list<a, as...>, false> : contained_in_list<test, check_against, type_list<as...>, test<check_against, a>::value> {};
+
+template<
+ template<typename, typename> class test,
+ typename check_against
+ EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, empty)
+>
+struct contained_in_list<test, check_against, type_list<EIGEN_TPL_PP_SPEC_HACK_USE(empty)>, false> { constexpr static bool value = false; };
+
+/* see if an element is in a list and check for global flags */
+
+template<
+ template<typename, typename> class test,
+ typename check_against,
+ typename h_list,
+ int default_flags = 0,
+ bool last_check_positive = false,
+ int last_check_flags = default_flags
+>
+struct contained_in_list_gf;
+
+template<
+ template<typename, typename> class test,
+ typename check_against,
+ typename h_list,
+ int default_flags,
+ int last_check_flags
+>
+struct contained_in_list_gf<test, check_against, h_list, default_flags, true, last_check_flags>
+{
+ constexpr static bool value = true;
+ constexpr static int global_flags = last_check_flags;
+};
+
+template<
+ template<typename, typename> class test,
+ typename check_against,
+ typename a,
+ typename... as,
+ int default_flags,
+ int last_check_flags
+>
+struct contained_in_list_gf<test, check_against, type_list<a, as...>, default_flags, false, last_check_flags> : contained_in_list_gf<test, check_against, type_list<as...>, default_flags, test<check_against, a>::value, test<check_against, a>::global_flags> {};
+
+template<
+ template<typename, typename> class test,
+ typename check_against
+ EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, empty),
+ int default_flags,
+ int last_check_flags
+>
+struct contained_in_list_gf<test, check_against, type_list<EIGEN_TPL_PP_SPEC_HACK_USE(empty)>, default_flags, false, last_check_flags> { constexpr static bool value = false; constexpr static int global_flags = default_flags; };
+
+/* generic reductions */
+
+template<
+ typename Reducer,
+ typename... Ts
+> struct reduce;
+
+template<
+ typename Reducer
+> struct reduce<Reducer>
+{
+ EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE int run() { return Reducer::Identity; }
+};
+
+template<
+ typename Reducer,
+ typename A
+> struct reduce<Reducer, A>
+{
+ EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE A run(A a) { return a; }
+};
+
+template<
+ typename Reducer,
+ typename A,
+ typename... Ts
+> struct reduce<Reducer, A, Ts...>
+{
+ EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, Ts... ts) -> decltype(Reducer::run(a, reduce<Reducer, Ts...>::run(ts...))) {
+ return Reducer::run(a, reduce<Reducer, Ts...>::run(ts...));
+ }
+};
+
+/* generic binary operations */
+
+struct sum_op {
+ template<typename A, typename B> EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a + b) { return a + b; }
+ static constexpr int Identity = 0;
+};
+struct product_op {
+ template<typename A, typename B> EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a * b) { return a * b; }
+ static constexpr int Identity = 1;
+};
+
+struct logical_and_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a && b) { return a && b; } };
+struct logical_or_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a || b) { return a || b; } };
+
+struct equal_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a == b) { return a == b; } };
+struct not_equal_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a != b) { return a != b; } };
+struct lesser_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a < b) { return a < b; } };
+struct lesser_equal_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a <= b) { return a <= b; } };
+struct greater_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a > b) { return a > b; } };
+struct greater_equal_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a >= b) { return a >= b; } };
+
+/* generic unary operations */
+
+struct not_op { template<typename A> constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(!a) { return !a; } };
+struct negation_op { template<typename A> constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(-a) { return -a; } };
+struct greater_equal_zero_op { template<typename A> constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(a >= 0) { return a >= 0; } };
+
+
+/* reductions for lists */
+
+// using auto -> return value spec makes ICC 13.0 and 13.1 crash here, so we have to hack it
+// together in front... (13.0 doesn't work with array_prod/array_reduce/... anyway, but 13.1
+// does...
+template<typename... Ts>
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE decltype(reduce<product_op, Ts...>::run((*((Ts*)0))...)) arg_prod(Ts... ts)
+{
+ return reduce<product_op, Ts...>::run(ts...);
+}
+
+template<typename... Ts>
+constexpr EIGEN_STRONG_INLINE decltype(reduce<sum_op, Ts...>::run((*((Ts*)0))...)) arg_sum(Ts... ts)
+{
+ return reduce<sum_op, Ts...>::run(ts...);
+}
+
+/* reverse arrays */
+
+template<typename Array, int... n>
+constexpr EIGEN_STRONG_INLINE Array h_array_reverse(Array arr, numeric_list<int, n...>)
+{
+ return {{array_get<sizeof...(n) - n - 1>(arr)...}};
+}
+
+template<typename T, std::size_t N>
+constexpr EIGEN_STRONG_INLINE array<T, N> array_reverse(array<T, N> arr)
+{
+ return h_array_reverse(arr, typename gen_numeric_list<int, N>::type());
+}
+
+
+/* generic array reductions */
+
+// can't reuse standard reduce() interface above because Intel's Compiler
+// *really* doesn't like it, so we just reimplement the stuff
+// (start from N - 1 and work down to 0 because specialization for
+// n == N - 1 also doesn't work in Intel's compiler, so it goes into
+// an infinite loop)
+template<typename Reducer, typename T, std::size_t N, std::size_t n = N - 1>
+struct h_array_reduce {
+ EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(array<T, N> arr, T identity) -> decltype(Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr, identity), array_get<n>(arr)))
+ {
+ return Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr, identity), array_get<n>(arr));
+ }
+};
+
+template<typename Reducer, typename T, std::size_t N>
+struct h_array_reduce<Reducer, T, N, 0>
+{
+ EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE T run(const array<T, N>& arr, T)
+ {
+ return array_get<0>(arr);
+ }
+};
+
+template<typename Reducer, typename T>
+struct h_array_reduce<Reducer, T, 0>
+{
+ EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE T run(const array<T, 0>&, T identity)
+ {
+ return identity;
+ }
+};
+
+template<typename Reducer, typename T, std::size_t N>
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_reduce(const array<T, N>& arr, T identity) -> decltype(h_array_reduce<Reducer, T, N>::run(arr, identity))
+{
+ return h_array_reduce<Reducer, T, N>::run(arr, identity);
+}
+
+/* standard array reductions */
+
+template<typename T, std::size_t N>
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_sum(const array<T, N>& arr) -> decltype(array_reduce<sum_op, T, N>(arr, static_cast<T>(0)))
+{
+ return array_reduce<sum_op, T, N>(arr, static_cast<T>(0));
+}
+
+template<typename T, std::size_t N>
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_prod(const array<T, N>& arr) -> decltype(array_reduce<product_op, T, N>(arr, static_cast<T>(1)))
+{
+ return array_reduce<product_op, T, N>(arr, static_cast<T>(1));
+}
+
+template<typename t>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector<t>& a) {
+ eigen_assert(a.size() > 0);
+ t prod = 1;
+ for (size_t i = 0; i < a.size(); ++i) { prod *= a[i]; }
+ return prod;
+}
+
+/* zip an array */
+
+template<typename Op, typename A, typename B, std::size_t N, int... n>
+constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A(), B())),N> h_array_zip(array<A, N> a, array<B, N> b, numeric_list<int, n...>)
+{
+ return array<decltype(Op::run(A(), B())),N>{{ Op::run(array_get<n>(a), array_get<n>(b))... }};
+}
+
+template<typename Op, typename A, typename B, std::size_t N>
+constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A(), B())),N> array_zip(array<A, N> a, array<B, N> b)
+{
+ return h_array_zip<Op>(a, b, typename gen_numeric_list<int, N>::type());
+}
+
+/* zip an array and reduce the result */
+
+template<typename Reducer, typename Op, typename A, typename B, std::size_t N, int... n>
+constexpr EIGEN_STRONG_INLINE auto h_array_zip_and_reduce(array<A, N> a, array<B, N> b, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(array_get<n>(a), array_get<n>(b))...))
+{
+ return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(array_get<n>(a), array_get<n>(b))...);
+}
+
+template<typename Reducer, typename Op, typename A, typename B, std::size_t N>
+constexpr EIGEN_STRONG_INLINE auto array_zip_and_reduce(array<A, N> a, array<B, N> b) -> decltype(h_array_zip_and_reduce<Reducer, Op, A, B, N>(a, b, typename gen_numeric_list<int, N>::type()))
+{
+ return h_array_zip_and_reduce<Reducer, Op, A, B, N>(a, b, typename gen_numeric_list<int, N>::type());
+}
+
+/* apply stuff to an array */
+
+template<typename Op, typename A, std::size_t N, int... n>
+constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A())),N> h_array_apply(array<A, N> a, numeric_list<int, n...>)
+{
+ return array<decltype(Op::run(A())),N>{{ Op::run(array_get<n>(a))... }};
+}
+
+template<typename Op, typename A, std::size_t N>
+constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A())),N> array_apply(array<A, N> a)
+{
+ return h_array_apply<Op>(a, typename gen_numeric_list<int, N>::type());
+}
+
+/* apply stuff to an array and reduce */
+
+template<typename Reducer, typename Op, typename A, std::size_t N, int... n>
+constexpr EIGEN_STRONG_INLINE auto h_array_apply_and_reduce(array<A, N> arr, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(array_get<n>(arr))...))
+{
+ return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(array_get<n>(arr))...);
+}
+
+template<typename Reducer, typename Op, typename A, std::size_t N>
+constexpr EIGEN_STRONG_INLINE auto array_apply_and_reduce(array<A, N> a) -> decltype(h_array_apply_and_reduce<Reducer, Op, A, N>(a, typename gen_numeric_list<int, N>::type()))
+{
+ return h_array_apply_and_reduce<Reducer, Op, A, N>(a, typename gen_numeric_list<int, N>::type());
+}
+
+/* repeat a value n times (and make an array out of it
+ * usage:
+ * array<int, 16> = repeat<16>(42);
+ */
+
+template<int n>
+struct h_repeat
+{
+ template<typename t, int... ii>
+ constexpr static EIGEN_STRONG_INLINE array<t, n> run(t v, numeric_list<int, ii...>)
+ {
+ return {{ typename id_numeric<int, ii, t>::type(v)... }};
+ }
+};
+
+template<int n, typename t>
+constexpr array<t, n> repeat(t v) { return h_repeat<n>::run(v, typename gen_numeric_list<int, n>::type()); }
+
+/* instantiate a class by a C-style array */
+template<class InstType, typename ArrType, std::size_t N, bool Reverse, typename... Ps>
+struct h_instantiate_by_c_array;
+
+template<class InstType, typename ArrType, std::size_t N, typename... Ps>
+struct h_instantiate_by_c_array<InstType, ArrType, N, false, Ps...>
+{
+ static InstType run(ArrType* arr, Ps... args)
+ {
+ return h_instantiate_by_c_array<InstType, ArrType, N - 1, false, Ps..., ArrType>::run(arr + 1, args..., arr[0]);
+ }
+};
+
+template<class InstType, typename ArrType, std::size_t N, typename... Ps>
+struct h_instantiate_by_c_array<InstType, ArrType, N, true, Ps...>
+{
+ static InstType run(ArrType* arr, Ps... args)
+ {
+ return h_instantiate_by_c_array<InstType, ArrType, N - 1, false, ArrType, Ps...>::run(arr + 1, arr[0], args...);
+ }
+};
+
+template<class InstType, typename ArrType, typename... Ps>
+struct h_instantiate_by_c_array<InstType, ArrType, 0, false, Ps...>
+{
+ static InstType run(ArrType* arr, Ps... args)
+ {
+ (void)arr;
+ return InstType(args...);
+ }
+};
+
+template<class InstType, typename ArrType, typename... Ps>
+struct h_instantiate_by_c_array<InstType, ArrType, 0, true, Ps...>
+{
+ static InstType run(ArrType* arr, Ps... args)
+ {
+ (void)arr;
+ return InstType(args...);
+ }
+};
+
+template<class InstType, typename ArrType, std::size_t N, bool Reverse = false>
+InstType instantiate_by_c_array(ArrType* arr)
+{
+ return h_instantiate_by_c_array<InstType, ArrType, N, Reverse>::run(arr);
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11META_H
diff --git a/src/EigenUnsupported/CXX11/src/util/CXX11Workarounds.h b/src/EigenUnsupported/CXX11/src/util/CXX11Workarounds.h
new file mode 100644
index 0000000..056736c
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/util/CXX11Workarounds.h
@@ -0,0 +1,88 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11WORKAROUNDS_H
+#define EIGEN_CXX11WORKAROUNDS_H
+
+/* COMPATIBILITY CHECKS
+ * (so users of compilers that are too old get some realistic error messages)
+ */
+#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1310)
+#error Intel Compiler only supports required C++ features since version 13.1.
+// note that most stuff in principle works with 13.0 but when combining
+// some features, at some point 13.0 will just fail with an internal assertion
+#elif defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6))
+// G++ < 4.6 by default will continue processing the source files - even if we use #error to make
+// it error out. For this reason, we use the pragma to make sure G++ aborts at the first error
+// it sees. Unfortunately, that is still not our #error directive, but at least the output is
+// short enough the user has a chance to see that the compiler version is not sufficient for
+// the funky template mojo we use.
+#pragma GCC diagnostic error "-Wfatal-errors"
+#error GNU C++ Compiler (g++) only supports required C++ features since version 4.6.
+#endif
+
+/* Check that the compiler at least claims to support C++11. It might not be sufficient
+ * because the compiler may not implement it correctly, but at least we'll know.
+ * On the other hand, visual studio still doesn't claim to support C++11 although it's
+ * compliant enugh for our purpose.
+ */
+#if (EIGEN_COMP_CXXVER < 11)
+#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)
+#pragma GCC diagnostic error "-Wfatal-errors"
+#endif
+#error This library needs at least a C++11 compliant compiler. If you use g++/clang, please enable the -std=c++11 compiler flag. (-std=c++0x on older versions.)
+#endif
+
+namespace Eigen {
+
+namespace internal {
+
+/* std::get is only constexpr in C++14, not yet in C++11
+ */
+
+
+template<std::size_t I_, class T> constexpr inline T& array_get(std::vector<T>& a) { return a[I_]; }
+template<std::size_t I_, class T> constexpr inline T&& array_get(std::vector<T>&& a) { return a[I_]; }
+template<std::size_t I_, class T> constexpr inline T const& array_get(std::vector<T> const& a) { return a[I_]; }
+
+/* Suppose you have a template of the form
+ * template<typename T> struct X;
+ * And you want to specialize it in such a way:
+ * template<typename S1, typename... SN> struct X<Foo<S1, SN...>> { ::: };
+ * template<> struct X<Foo<>> { ::: };
+ * This will work in Intel's compiler 13.0, but only to some extent in g++ 4.6, since
+ * g++ can only match templates called with parameter packs if the number of template
+ * arguments is not a fixed size (so inside the first specialization, referencing
+ * X<Foo<Sn...>> will fail in g++). On the other hand, g++ will accept the following:
+ * template<typename S...> struct X<Foo<S...>> { ::: }:
+ * as an additional (!) specialization, which will then only match the empty case.
+ * But Intel's compiler 13.0 won't accept that, it will only accept the empty syntax,
+ * so we have to create a workaround for this.
+ */
+#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
+#define EIGEN_TPL_PP_SPEC_HACK_DEF(mt, n) mt... n
+#define EIGEN_TPL_PP_SPEC_HACK_DEFC(mt, n) , EIGEN_TPL_PP_SPEC_HACK_DEF(mt, n)
+#define EIGEN_TPL_PP_SPEC_HACK_USE(n) n...
+#define EIGEN_TPL_PP_SPEC_HACK_USEC(n) , n...
+#else
+#define EIGEN_TPL_PP_SPEC_HACK_DEF(mt, n)
+#define EIGEN_TPL_PP_SPEC_HACK_DEFC(mt, n)
+#define EIGEN_TPL_PP_SPEC_HACK_USE(n)
+#define EIGEN_TPL_PP_SPEC_HACK_USEC(n)
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11WORKAROUNDS_H
+
+/*
+ * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
+ */
diff --git a/src/EigenUnsupported/CXX11/src/util/EmulateArray.h b/src/EigenUnsupported/CXX11/src/util/EmulateArray.h
new file mode 100644
index 0000000..834b20b
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/util/EmulateArray.h
@@ -0,0 +1,261 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_EMULATE_ARRAY_H
+#define EIGEN_EMULATE_ARRAY_H
+
+
+
+// The array class is only available starting with cxx11. Emulate our own here
+// if needed. Beware, msvc still doesn't advertise itself as a c++11 compiler!
+// Moreover, CUDA doesn't support the STL containers, so we use our own instead.
+#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(EIGEN_GPUCC) || defined(EIGEN_AVOID_STL_ARRAY)
+
+namespace Eigen {
+template <typename T, size_t n> class array {
+ public:
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE T& operator[] (size_t index) { eigen_internal_assert(index < size()); return values[index]; }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { eigen_internal_assert(index < size()); return values[index]; }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE T& at(size_t index) { eigen_assert(index < size()); return values[index]; }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const T& at(size_t index) const { eigen_assert(index < size()); return values[index]; }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE T& front() { return values[0]; }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const T& front() const { return values[0]; }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE T& back() { return values[n-1]; }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const T& back() const { return values[n-1]; }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+ static std::size_t size() { return n; }
+
+ T values[n];
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE array() { }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE array(const T& v) {
+ EIGEN_STATIC_ASSERT(n==1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ values[0] = v;
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE array(const T& v1, const T& v2) {
+ EIGEN_STATIC_ASSERT(n==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ values[0] = v1;
+ values[1] = v2;
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3) {
+ EIGEN_STATIC_ASSERT(n==3, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ values[0] = v1;
+ values[1] = v2;
+ values[2] = v3;
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3,
+ const T& v4) {
+ EIGEN_STATIC_ASSERT(n==4, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ values[0] = v1;
+ values[1] = v2;
+ values[2] = v3;
+ values[3] = v4;
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4,
+ const T& v5) {
+ EIGEN_STATIC_ASSERT(n==5, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ values[0] = v1;
+ values[1] = v2;
+ values[2] = v3;
+ values[3] = v4;
+ values[4] = v5;
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4,
+ const T& v5, const T& v6) {
+ EIGEN_STATIC_ASSERT(n==6, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ values[0] = v1;
+ values[1] = v2;
+ values[2] = v3;
+ values[3] = v4;
+ values[4] = v5;
+ values[5] = v6;
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4,
+ const T& v5, const T& v6, const T& v7) {
+ EIGEN_STATIC_ASSERT(n==7, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ values[0] = v1;
+ values[1] = v2;
+ values[2] = v3;
+ values[3] = v4;
+ values[4] = v5;
+ values[5] = v6;
+ values[6] = v7;
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE array(
+ const T& v1, const T& v2, const T& v3, const T& v4,
+ const T& v5, const T& v6, const T& v7, const T& v8) {
+ EIGEN_STATIC_ASSERT(n==8, YOU_MADE_A_PROGRAMMING_MISTAKE)
+ values[0] = v1;
+ values[1] = v2;
+ values[2] = v3;
+ values[3] = v4;
+ values[4] = v5;
+ values[5] = v6;
+ values[6] = v7;
+ values[7] = v8;
+ }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE array(std::initializer_list<T> l) {
+ eigen_assert(l.size() == n);
+ internal::smart_copy(l.begin(), l.end(), values);
+ }
+#endif
+};
+
+
+// Specialize array for zero size
+template <typename T> class array<T, 0> {
+ public:
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE T& operator[] (size_t) {
+ eigen_assert(false && "Can't index a zero size array");
+ return dummy;
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const T& operator[] (size_t) const {
+ eigen_assert(false && "Can't index a zero size array");
+ return dummy;
+ }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE T& front() {
+ eigen_assert(false && "Can't index a zero size array");
+ return dummy;
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const T& front() const {
+ eigen_assert(false && "Can't index a zero size array");
+ return dummy;
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE T& back() {
+ eigen_assert(false && "Can't index a zero size array");
+ return dummy;
+ }
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE const T& back() const {
+ eigen_assert(false && "Can't index a zero size array");
+ return dummy;
+ }
+
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::size_t size() { return 0; }
+
+ EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE array() : dummy() { }
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+ EIGEN_DEVICE_FUNC array(std::initializer_list<T> l) : dummy() {
+ EIGEN_UNUSED_VARIABLE(l);
+ eigen_assert(l.size() == 0);
+ }
+#endif
+
+ private:
+ T dummy;
+};
+
+// Comparison operator
+// Todo: implement !=, <, <=, >, and >=
+template<class T, std::size_t N>
+EIGEN_DEVICE_FUNC bool operator==(const array<T,N>& lhs, const array<T,N>& rhs) {
+ for (std::size_t i = 0; i < N; ++i) {
+ if (lhs[i] != rhs[i]) {
+ return false;
+ }
+ }
+ return true;
+}
+
+
+namespace internal {
+template<std::size_t I_, class T, std::size_t N>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(array<T,N>& a) {
+ return a[I_];
+}
+template<std::size_t I_, class T, std::size_t N>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const array<T,N>& a) {
+ return a[I_];
+}
+
+template<class T, std::size_t N> struct array_size<array<T,N> > {
+ enum { value = N };
+};
+template<class T, std::size_t N> struct array_size<array<T,N>& > {
+ enum { value = N };
+};
+template<class T, std::size_t N> struct array_size<const array<T,N> > {
+ enum { value = N };
+};
+template<class T, std::size_t N> struct array_size<const array<T,N>& > {
+ enum { value = N };
+};
+
+} // end namespace internal
+} // end namespace Eigen
+
+#else
+
+// The compiler supports c++11, and we're not targeting cuda: use std::array as Eigen::array
+#include <array>
+namespace Eigen {
+
+template <typename T, std::size_t N> using array = std::array<T, N>;
+
+namespace internal {
+/* std::get is only constexpr in C++14, not yet in C++11
+ * - libstdc++ from version 4.7 onwards has it nevertheless,
+ * so use that
+ * - libstdc++ older versions: use _M_instance directly
+ * - libc++ all versions so far: use __elems_ directly
+ * - all other libs: use std::get to be portable, but
+ * this may not be constexpr
+ */
+#if defined(__GLIBCXX__) && __GLIBCXX__ < 20120322
+#define STD_GET_ARR_HACK a._M_instance[I_]
+#elif defined(_LIBCPP_VERSION)
+#define STD_GET_ARR_HACK a.__elems_[I_]
+#else
+#define STD_GET_ARR_HACK std::template get<I_, T, N>(a)
+#endif
+
+template<std::size_t I_, class T, std::size_t N> constexpr inline T& array_get(std::array<T,N>& a) { return (T&) STD_GET_ARR_HACK; }
+template<std::size_t I_, class T, std::size_t N> constexpr inline T&& array_get(std::array<T,N>&& a) { return (T&&) STD_GET_ARR_HACK; }
+template<std::size_t I_, class T, std::size_t N> constexpr inline T const& array_get(std::array<T,N> const& a) { return (T const&) STD_GET_ARR_HACK; }
+
+#undef STD_GET_ARR_HACK
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif
+
+#endif // EIGEN_EMULATE_ARRAY_H
diff --git a/src/EigenUnsupported/CXX11/src/util/MaxSizeVector.h b/src/EigenUnsupported/CXX11/src/util/MaxSizeVector.h
new file mode 100644
index 0000000..277ab14
--- /dev/null
+++ b/src/EigenUnsupported/CXX11/src/util/MaxSizeVector.h
@@ -0,0 +1,158 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_FIXEDSIZEVECTOR_H
+#define EIGEN_FIXEDSIZEVECTOR_H
+
+namespace Eigen {
+
+/** \class MaxSizeVector
+ * \ingroup Core
+ *
+ * \brief The MaxSizeVector class.
+ *
+ * The %MaxSizeVector provides a subset of std::vector functionality.
+ *
+ * The goal is to provide basic std::vector operations when using
+ * std::vector is not an option (e.g. on GPU or when compiling using
+ * FMA/AVX, as this can cause either compilation failures or illegal
+ * instruction failures).
+ *
+ * Beware: The constructors are not API compatible with these of
+ * std::vector.
+ */
+template <typename T>
+class MaxSizeVector {
+ static const size_t alignment = EIGEN_PLAIN_ENUM_MAX(EIGEN_ALIGNOF(T), sizeof(void*));
+ public:
+ // Construct a new MaxSizeVector, reserve n elements.
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ explicit MaxSizeVector(size_t n)
+ : reserve_(n), size_(0),
+ data_(static_cast<T*>(internal::handmade_aligned_malloc(n * sizeof(T), alignment))) {
+ }
+
+ // Construct a new MaxSizeVector, reserve and resize to n.
+ // Copy the init value to all elements.
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ MaxSizeVector(size_t n, const T& init)
+ : reserve_(n), size_(n),
+ data_(static_cast<T*>(internal::handmade_aligned_malloc(n * sizeof(T), alignment))) {
+ size_t i = 0;
+ EIGEN_TRY
+ {
+ for(; i < size_; ++i) { new (&data_[i]) T(init); }
+ }
+ EIGEN_CATCH(...)
+ {
+ // Construction failed, destruct in reverse order:
+ for(; (i+1) > 0; --i) { data_[i-1].~T(); }
+ internal::handmade_aligned_free(data_);
+ EIGEN_THROW;
+ }
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ ~MaxSizeVector() {
+ for (size_t i = size_; i > 0; --i) {
+ data_[i-1].~T();
+ }
+ internal::handmade_aligned_free(data_);
+ }
+
+ void resize(size_t n) {
+ eigen_assert(n <= reserve_);
+ for (; size_ < n; ++size_) {
+ new (&data_[size_]) T;
+ }
+ for (; size_ > n; --size_) {
+ data_[size_-1].~T();
+ }
+ eigen_assert(size_ == n);
+ }
+
+ // Append new elements (up to reserved size).
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ void push_back(const T& t) {
+ eigen_assert(size_ < reserve_);
+ new (&data_[size_++]) T(t);
+ }
+
+ // For C++03 compatibility this only takes one argument
+ template<class X>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ void emplace_back(const X& x) {
+ eigen_assert(size_ < reserve_);
+ new (&data_[size_++]) T(x);
+ }
+
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const T& operator[] (size_t i) const {
+ eigen_assert(i < size_);
+ return data_[i];
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ T& operator[] (size_t i) {
+ eigen_assert(i < size_);
+ return data_[i];
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ T& back() {
+ eigen_assert(size_ > 0);
+ return data_[size_ - 1];
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const T& back() const {
+ eigen_assert(size_ > 0);
+ return data_[size_ - 1];
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ void pop_back() {
+ eigen_assert(size_ > 0);
+ data_[--size_].~T();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ size_t size() const { return size_; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ bool empty() const { return size_ == 0; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ T* data() { return data_; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const T* data() const { return data_; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ T* begin() { return data_; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ T* end() { return data_ + size_; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const T* begin() const { return data_; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+ const T* end() const { return data_ + size_; }
+
+ private:
+ size_t reserve_;
+ size_t size_;
+ T* data_;
+};
+
+} // namespace Eigen
+
+#endif // EIGEN_FIXEDSIZEVECTOR_H
diff --git a/src/EigenUnsupported/EulerAngles b/src/EigenUnsupported/EulerAngles
new file mode 100644
index 0000000..f8f1c5d
--- /dev/null
+++ b/src/EigenUnsupported/EulerAngles
@@ -0,0 +1,43 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Tal Hadad <tal_hd@hotmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_EULERANGLES_MODULE_H
+#define EIGEN_EULERANGLES_MODULE_H
+
+
+#include "../../Eigen/Core"
+#include "../../Eigen/Geometry"
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+namespace Eigen {
+
+/**
+ * \defgroup EulerAngles_Module EulerAngles module
+ * \brief This module provides generic euler angles rotation.
+ *
+ * Euler angles are a way to represent 3D rotation.
+ *
+ * In order to use this module in your code, include this header:
+ * \code
+ * #include <unsupported/Eigen/EulerAngles>
+ * \endcode
+ *
+ * See \ref EulerAngles for more information.
+ *
+ */
+
+}
+
+#include "src/EulerAngles/EulerSystem.h"
+#include "src/EulerAngles/EulerAngles.h"
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_EULERANGLES_MODULE_H
diff --git a/src/EigenUnsupported/FFT b/src/EigenUnsupported/FFT
new file mode 100644
index 0000000..c8c311a
--- /dev/null
+++ b/src/EigenUnsupported/FFT
@@ -0,0 +1,419 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Mark Borgerding mark a borgerding net
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_FFT_H
+#define EIGEN_FFT_H
+
+#include <complex>
+#include <vector>
+#include <map>
+#include "../../Eigen/Core"
+
+
+/**
+ * \defgroup FFT_Module Fast Fourier Transform module
+ *
+ * \code
+ * #include <unsupported/Eigen/FFT>
+ * \endcode
+ *
+ * This module provides Fast Fourier transformation, with a configurable backend
+ * implementation.
+ *
+ * The default implementation is based on kissfft. It is a small, free, and
+ * reasonably efficient default.
+ *
+ * There are currently two implementation backend:
+ *
+ * - fftw (http://www.fftw.org) : faster, GPL -- incompatible with Eigen in LGPL form, bigger code size.
+ * - MKL (http://en.wikipedia.org/wiki/Math_Kernel_Library) : fastest, commercial -- may be incompatible with Eigen in GPL form.
+ *
+ * \section FFTDesign Design
+ *
+ * The following design decisions were made concerning scaling and
+ * half-spectrum for real FFT.
+ *
+ * The intent is to facilitate generic programming and ease migrating code
+ * from Matlab/octave.
+ * We think the default behavior of Eigen/FFT should favor correctness and
+ * generality over speed. Of course, the caller should be able to "opt-out" from this
+ * behavior and get the speed increase if they want it.
+ *
+ * 1) %Scaling:
+ * Other libraries (FFTW,IMKL,KISSFFT) do not perform scaling, so there
+ * is a constant gain incurred after the forward&inverse transforms , so
+ * IFFT(FFT(x)) = Kx; this is done to avoid a vector-by-value multiply.
+ * The downside is that algorithms that worked correctly in Matlab/octave
+ * don't behave the same way once implemented in C++.
+ *
+ * How Eigen/FFT differs: invertible scaling is performed so IFFT( FFT(x) ) = x.
+ *
+ * 2) Real FFT half-spectrum
+ * Other libraries use only half the frequency spectrum (plus one extra
+ * sample for the Nyquist bin) for a real FFT, the other half is the
+ * conjugate-symmetric of the first half. This saves them a copy and some
+ * memory. The downside is the caller needs to have special logic for the
+ * number of bins in complex vs real.
+ *
+ * How Eigen/FFT differs: The full spectrum is returned from the forward
+ * transform. This facilitates generic template programming by obviating
+ * separate specializations for real vs complex. On the inverse
+ * transform, only half the spectrum is actually used if the output type is real.
+ */
+
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+#ifdef EIGEN_FFTW_DEFAULT
+// FFTW: faster, GPL -- incompatible with Eigen in LGPL form, bigger code size
+# include <fftw3.h>
+# include "src/FFT/ei_fftw_impl.h"
+ namespace Eigen {
+ //template <typename T> typedef struct internal::fftw_impl default_fft_impl; this does not work
+ template <typename T> struct default_fft_impl : public internal::fftw_impl<T> {};
+ }
+#elif defined EIGEN_MKL_DEFAULT
+// TODO
+// intel Math Kernel Library: fastest, commercial -- may be incompatible with Eigen in GPL form
+# include "src/FFT/ei_imklfft_impl.h"
+ namespace Eigen {
+ template <typename T> struct default_fft_impl : public internal::imklfft_impl {};
+ }
+#else
+// internal::kissfft_impl: small, free, reasonably efficient default, derived from kissfft
+//
+# include "src/FFT/ei_kissfft_impl.h"
+ namespace Eigen {
+ template <typename T>
+ struct default_fft_impl : public internal::kissfft_impl<T> {};
+ }
+#endif
+
+namespace Eigen {
+
+
+//
+template<typename T_SrcMat,typename T_FftIfc> struct fft_fwd_proxy;
+template<typename T_SrcMat,typename T_FftIfc> struct fft_inv_proxy;
+
+namespace internal {
+template<typename T_SrcMat,typename T_FftIfc>
+struct traits< fft_fwd_proxy<T_SrcMat,T_FftIfc> >
+{
+ typedef typename T_SrcMat::PlainObject ReturnType;
+};
+template<typename T_SrcMat,typename T_FftIfc>
+struct traits< fft_inv_proxy<T_SrcMat,T_FftIfc> >
+{
+ typedef typename T_SrcMat::PlainObject ReturnType;
+};
+}
+
+template<typename T_SrcMat,typename T_FftIfc>
+struct fft_fwd_proxy
+ : public ReturnByValue<fft_fwd_proxy<T_SrcMat,T_FftIfc> >
+{
+ typedef DenseIndex Index;
+
+ fft_fwd_proxy(const T_SrcMat& src,T_FftIfc & fft, Index nfft) : m_src(src),m_ifc(fft), m_nfft(nfft) {}
+
+ template<typename T_DestMat> void evalTo(T_DestMat& dst) const;
+
+ Index rows() const { return m_src.rows(); }
+ Index cols() const { return m_src.cols(); }
+protected:
+ const T_SrcMat & m_src;
+ T_FftIfc & m_ifc;
+ Index m_nfft;
+};
+
+template<typename T_SrcMat,typename T_FftIfc>
+struct fft_inv_proxy
+ : public ReturnByValue<fft_inv_proxy<T_SrcMat,T_FftIfc> >
+{
+ typedef DenseIndex Index;
+
+ fft_inv_proxy(const T_SrcMat& src,T_FftIfc & fft, Index nfft) : m_src(src),m_ifc(fft), m_nfft(nfft) {}
+
+ template<typename T_DestMat> void evalTo(T_DestMat& dst) const;
+
+ Index rows() const { return m_src.rows(); }
+ Index cols() const { return m_src.cols(); }
+protected:
+ const T_SrcMat & m_src;
+ T_FftIfc & m_ifc;
+ Index m_nfft;
+};
+
+
+template <typename T_Scalar,
+ typename T_Impl=default_fft_impl<T_Scalar> >
+class FFT
+{
+ public:
+ typedef T_Impl impl_type;
+ typedef DenseIndex Index;
+ typedef typename impl_type::Scalar Scalar;
+ typedef typename impl_type::Complex Complex;
+
+ enum Flag {
+ Default=0, // goof proof
+ Unscaled=1,
+ HalfSpectrum=2,
+ // SomeOtherSpeedOptimization=4
+ Speedy=32767
+ };
+
+ FFT( const impl_type & impl=impl_type() , Flag flags=Default ) :m_impl(impl),m_flag(flags) { }
+
+ inline
+ bool HasFlag(Flag f) const { return (m_flag & (int)f) == f;}
+
+ inline
+ void SetFlag(Flag f) { m_flag |= (int)f;}
+
+ inline
+ void ClearFlag(Flag f) { m_flag &= (~(int)f);}
+
+ inline
+ void fwd( Complex * dst, const Scalar * src, Index nfft)
+ {
+ m_impl.fwd(dst,src,static_cast<int>(nfft));
+ if ( HasFlag(HalfSpectrum) == false)
+ ReflectSpectrum(dst,nfft);
+ }
+
+ inline
+ void fwd( Complex * dst, const Complex * src, Index nfft)
+ {
+ m_impl.fwd(dst,src,static_cast<int>(nfft));
+ }
+
+ /*
+ inline
+ void fwd2(Complex * dst, const Complex * src, int n0,int n1)
+ {
+ m_impl.fwd2(dst,src,n0,n1);
+ }
+ */
+
+ template <typename _Input>
+ inline
+ void fwd( std::vector<Complex> & dst, const std::vector<_Input> & src)
+ {
+ if ( NumTraits<_Input>::IsComplex == 0 && HasFlag(HalfSpectrum) )
+ dst.resize( (src.size()>>1)+1); // half the bins + Nyquist bin
+ else
+ dst.resize(src.size());
+ fwd(&dst[0],&src[0],src.size());
+ }
+
+ template<typename InputDerived, typename ComplexDerived>
+ inline
+ void fwd( MatrixBase<ComplexDerived> & dst, const MatrixBase<InputDerived> & src, Index nfft=-1)
+ {
+ typedef typename ComplexDerived::Scalar dst_type;
+ typedef typename InputDerived::Scalar src_type;
+ EIGEN_STATIC_ASSERT_VECTOR_ONLY(InputDerived)
+ EIGEN_STATIC_ASSERT_VECTOR_ONLY(ComplexDerived)
+ EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(ComplexDerived,InputDerived) // size at compile-time
+ EIGEN_STATIC_ASSERT((internal::is_same<dst_type, Complex>::value),
+ YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+ EIGEN_STATIC_ASSERT(int(InputDerived::Flags)&int(ComplexDerived::Flags)&DirectAccessBit,
+ THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES)
+
+ if (nfft<1)
+ nfft = src.size();
+
+ if ( NumTraits< src_type >::IsComplex == 0 && HasFlag(HalfSpectrum) )
+ dst.derived().resize( (nfft>>1)+1);
+ else
+ dst.derived().resize(nfft);
+
+ if ( src.innerStride() != 1 || src.size() < nfft ) {
+ Matrix<src_type,1,Dynamic> tmp;
+ if (src.size()<nfft) {
+ tmp.setZero(nfft);
+ tmp.block(0,0,src.size(),1 ) = src;
+ }else{
+ tmp = src;
+ }
+ fwd( &dst[0],&tmp[0],nfft );
+ }else{
+ fwd( &dst[0],&src[0],nfft );
+ }
+ }
+
+ template<typename InputDerived>
+ inline
+ fft_fwd_proxy< MatrixBase<InputDerived>, FFT<T_Scalar,T_Impl> >
+ fwd( const MatrixBase<InputDerived> & src, Index nfft=-1)
+ {
+ return fft_fwd_proxy< MatrixBase<InputDerived> ,FFT<T_Scalar,T_Impl> >( src, *this,nfft );
+ }
+
+ template<typename InputDerived>
+ inline
+ fft_inv_proxy< MatrixBase<InputDerived>, FFT<T_Scalar,T_Impl> >
+ inv( const MatrixBase<InputDerived> & src, Index nfft=-1)
+ {
+ return fft_inv_proxy< MatrixBase<InputDerived> ,FFT<T_Scalar,T_Impl> >( src, *this,nfft );
+ }
+
+ inline
+ void inv( Complex * dst, const Complex * src, Index nfft)
+ {
+ m_impl.inv( dst,src,static_cast<int>(nfft) );
+ if ( HasFlag( Unscaled ) == false)
+ scale(dst,Scalar(1./nfft),nfft); // scale the time series
+ }
+
+ inline
+ void inv( Scalar * dst, const Complex * src, Index nfft)
+ {
+ m_impl.inv( dst,src,static_cast<int>(nfft) );
+ if ( HasFlag( Unscaled ) == false)
+ scale(dst,Scalar(1./nfft),nfft); // scale the time series
+ }
+
+ template<typename OutputDerived, typename ComplexDerived>
+ inline
+ void inv( MatrixBase<OutputDerived> & dst, const MatrixBase<ComplexDerived> & src, Index nfft=-1)
+ {
+ typedef typename ComplexDerived::Scalar src_type;
+ typedef typename ComplexDerived::RealScalar real_type;
+ typedef typename OutputDerived::Scalar dst_type;
+ const bool realfft= (NumTraits<dst_type>::IsComplex == 0);
+ EIGEN_STATIC_ASSERT_VECTOR_ONLY(OutputDerived)
+ EIGEN_STATIC_ASSERT_VECTOR_ONLY(ComplexDerived)
+ EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(ComplexDerived,OutputDerived) // size at compile-time
+ EIGEN_STATIC_ASSERT((internal::is_same<src_type, Complex>::value),
+ YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+ EIGEN_STATIC_ASSERT(int(OutputDerived::Flags)&int(ComplexDerived::Flags)&DirectAccessBit,
+ THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES)
+
+ if (nfft<1) { //automatic FFT size determination
+ if ( realfft && HasFlag(HalfSpectrum) )
+ nfft = 2*(src.size()-1); //assume even fft size
+ else
+ nfft = src.size();
+ }
+ dst.derived().resize( nfft );
+
+ // check for nfft that does not fit the input data size
+ Index resize_input= ( realfft && HasFlag(HalfSpectrum) )
+ ? ( (nfft/2+1) - src.size() )
+ : ( nfft - src.size() );
+
+ if ( src.innerStride() != 1 || resize_input ) {
+ // if the vector is strided, then we need to copy it to a packed temporary
+ Matrix<src_type,1,Dynamic> tmp;
+ if ( resize_input ) {
+ size_t ncopy = (std::min)(src.size(),src.size() + resize_input);
+ tmp.setZero(src.size() + resize_input);
+ if ( realfft && HasFlag(HalfSpectrum) ) {
+ // pad at the Nyquist bin
+ tmp.head(ncopy) = src.head(ncopy);
+ tmp(ncopy-1) = real(tmp(ncopy-1)); // enforce real-only Nyquist bin
+ }else{
+ size_t nhead,ntail;
+ nhead = 1+ncopy/2-1; // range [0:pi)
+ ntail = ncopy/2-1; // range (-pi:0)
+ tmp.head(nhead) = src.head(nhead);
+ tmp.tail(ntail) = src.tail(ntail);
+ if (resize_input<0) { //shrinking -- create the Nyquist bin as the average of the two bins that fold into it
+ tmp(nhead) = ( src(nfft/2) + src( src.size() - nfft/2 ) )*real_type(.5);
+ }else{ // expanding -- split the old Nyquist bin into two halves
+ tmp(nhead) = src(nhead) * real_type(.5);
+ tmp(tmp.size()-nhead) = tmp(nhead);
+ }
+ }
+ }else{
+ tmp = src;
+ }
+ inv( &dst[0],&tmp[0], nfft);
+ }else{
+ inv( &dst[0],&src[0], nfft);
+ }
+ }
+
+ template <typename _Output>
+ inline
+ void inv( std::vector<_Output> & dst, const std::vector<Complex> & src,Index nfft=-1)
+ {
+ if (nfft<1)
+ nfft = ( NumTraits<_Output>::IsComplex == 0 && HasFlag(HalfSpectrum) ) ? 2*(src.size()-1) : src.size();
+ dst.resize( nfft );
+ inv( &dst[0],&src[0],nfft);
+ }
+
+
+ /*
+ // TODO: multi-dimensional FFTs
+ inline
+ void inv2(Complex * dst, const Complex * src, int n0,int n1)
+ {
+ m_impl.inv2(dst,src,n0,n1);
+ if ( HasFlag( Unscaled ) == false)
+ scale(dst,1./(n0*n1),n0*n1);
+ }
+ */
+
+ inline
+ impl_type & impl() {return m_impl;}
+ private:
+
+ template <typename T_Data>
+ inline
+ void scale(T_Data * x,Scalar s,Index nx)
+ {
+#if 1
+ for (int k=0;k<nx;++k)
+ *x++ *= s;
+#else
+ if ( ((ptrdiff_t)x) & 15 )
+ Matrix<T_Data, Dynamic, 1>::Map(x,nx) *= s;
+ else
+ Matrix<T_Data, Dynamic, 1>::MapAligned(x,nx) *= s;
+ //Matrix<T_Data, Dynamic, Dynamic>::Map(x,nx) * s;
+#endif
+ }
+
+ inline
+ void ReflectSpectrum(Complex * freq, Index nfft)
+ {
+ // create the implicit right-half spectrum (conjugate-mirror of the left-half)
+ Index nhbins=(nfft>>1)+1;
+ for (Index k=nhbins;k < nfft; ++k )
+ freq[k] = conj(freq[nfft-k]);
+ }
+
+ impl_type m_impl;
+ int m_flag;
+};
+
+template<typename T_SrcMat,typename T_FftIfc>
+template<typename T_DestMat> inline
+void fft_fwd_proxy<T_SrcMat,T_FftIfc>::evalTo(T_DestMat& dst) const
+{
+ m_ifc.fwd( dst, m_src, m_nfft);
+}
+
+template<typename T_SrcMat,typename T_FftIfc>
+template<typename T_DestMat> inline
+void fft_inv_proxy<T_SrcMat,T_FftIfc>::evalTo(T_DestMat& dst) const
+{
+ m_ifc.inv( dst, m_src, m_nfft);
+}
+
+}
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+#endif
diff --git a/src/EigenUnsupported/IterativeSolvers b/src/EigenUnsupported/IterativeSolvers
new file mode 100644
index 0000000..a3f58d6
--- /dev/null
+++ b/src/EigenUnsupported/IterativeSolvers
@@ -0,0 +1,51 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <g.gael@free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ITERATIVE_SOLVERS_MODULE_H
+#define EIGEN_ITERATIVE_SOLVERS_MODULE_H
+
+#include "../../Eigen/Sparse"
+#include "../../Eigen/Jacobi"
+#include "../../Eigen/Householder"
+
+
+/**
+ * \defgroup IterativeLinearSolvers_Module Iterative solvers module
+ * This module aims to provide various iterative linear and non linear solver algorithms.
+ * It currently provides:
+ * - a constrained conjugate gradient
+ * - a Householder GMRES implementation
+ * - an IDR(s) implementation
+ * - a DGMRES implementation
+ * - a MINRES implementation
+ *
+ * \code
+ * #include <unsupported/Eigen/IterativeSolvers>
+ * \endcode
+ */
+
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+#ifndef EIGEN_MPL2_ONLY
+#include "src/IterativeSolvers/IterationController.h"
+#include "src/IterativeSolvers/ConstrainedConjGrad.h"
+#endif
+
+#include "src/IterativeSolvers/IncompleteLU.h"
+#include "src/IterativeSolvers/GMRES.h"
+#include "src/IterativeSolvers/DGMRES.h"
+//#include "src/IterativeSolvers/SSORPreconditioner.h"
+#include "src/IterativeSolvers/MINRES.h"
+#include "src/IterativeSolvers/IDRS.h"
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+
+#endif // EIGEN_ITERATIVE_SOLVERS_MODULE_H
diff --git a/src/EigenUnsupported/KroneckerProduct b/src/EigenUnsupported/KroneckerProduct
new file mode 100644
index 0000000..5f5afb8
--- /dev/null
+++ b/src/EigenUnsupported/KroneckerProduct
@@ -0,0 +1,36 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_KRONECKER_PRODUCT_MODULE_H
+#define EIGEN_KRONECKER_PRODUCT_MODULE_H
+
+#include "../../Eigen/Core"
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+#include "../../Eigen/src/SparseCore/SparseUtil.h"
+
+namespace Eigen {
+
+/**
+ * \defgroup KroneckerProduct_Module KroneckerProduct module
+ *
+ * This module contains an experimental Kronecker product implementation.
+ *
+ * \code
+ * #include <Eigen/KroneckerProduct>
+ * \endcode
+ */
+
+} // namespace Eigen
+
+#include "src/KroneckerProduct/KroneckerTensorProduct.h"
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_KRONECKER_PRODUCT_MODULE_H
diff --git a/src/EigenUnsupported/LevenbergMarquardt b/src/EigenUnsupported/LevenbergMarquardt
new file mode 100644
index 0000000..1090505
--- /dev/null
+++ b/src/EigenUnsupported/LevenbergMarquardt
@@ -0,0 +1,49 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Thomas Capricelli <orzel@freehackers.org>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_LEVENBERGMARQUARDT_MODULE
+#define EIGEN_LEVENBERGMARQUARDT_MODULE
+
+// #include <vector>
+
+#include "../../Eigen/Core"
+#include "../../Eigen/Jacobi"
+#include "../../Eigen/QR"
+#include "NumericalDiff"
+
+#include "../../Eigen/SparseQR"
+
+/**
+ * \defgroup LevenbergMarquardt_Module Levenberg-Marquardt module
+ *
+ * \code
+ * #include </Eigen/LevenbergMarquardt>
+ * \endcode
+ *
+ *
+ */
+
+#include "../../Eigen/SparseCore"
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+
+#include "src/LevenbergMarquardt/LMqrsolv.h"
+#include "src/LevenbergMarquardt/LMcovar.h"
+#include "src/LevenbergMarquardt/LMpar.h"
+
+#endif
+
+#include "src/LevenbergMarquardt/LevenbergMarquardt.h"
+#include "src/LevenbergMarquardt/LMonestep.h"
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_LEVENBERGMARQUARDT_MODULE
diff --git a/src/EigenUnsupported/MPRealSupport b/src/EigenUnsupported/MPRealSupport
new file mode 100644
index 0000000..c4ea4ec
--- /dev/null
+++ b/src/EigenUnsupported/MPRealSupport
@@ -0,0 +1,213 @@
+// This file is part of a joint effort between Eigen, a lightweight C++ template library
+// for linear algebra, and MPFR C++, a C++ interface to MPFR library (http://www.holoborodko.com/pavel/)
+//
+// Copyright (C) 2010-2012 Pavel Holoborodko <pavel@holoborodko.com>
+// Copyright (C) 2010 Konstantin Holoborodko <konstantin@holoborodko.com>
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MPREALSUPPORT_MODULE_H
+#define EIGEN_MPREALSUPPORT_MODULE_H
+
+#include "../../Eigen/Core"
+#include <mpreal.h>
+
+namespace Eigen {
+
+/**
+ * \defgroup MPRealSupport_Module MPFRC++ Support module
+ * \code
+ * #include <Eigen/MPRealSupport>
+ * \endcode
+ *
+ * This module provides support for multi precision floating point numbers
+ * via the <a href="http://www.holoborodko.com/pavel/mpfr">MPFR C++</a>
+ * library which itself is built upon <a href="http://www.mpfr.org/">MPFR</a>/<a href="http://gmplib.org/">GMP</a>.
+ *
+ * \warning MPFR C++ is licensed under the GPL.
+ *
+ * You can find a copy of MPFR C++ that is known to be compatible in the unsupported/test/mpreal folder.
+ *
+ * Here is an example:
+ *
+\code
+#include <iostream>
+#include <Eigen/MPRealSupport>
+#include <Eigen/LU>
+using namespace mpfr;
+using namespace Eigen;
+int main()
+{
+ // set precision to 256 bits (double has only 53 bits)
+ mpreal::set_default_prec(256);
+ // Declare matrix and vector types with multi-precision scalar type
+ typedef Matrix<mpreal,Dynamic,Dynamic> MatrixXmp;
+ typedef Matrix<mpreal,Dynamic,1> VectorXmp;
+
+ MatrixXmp A = MatrixXmp::Random(100,100);
+ VectorXmp b = VectorXmp::Random(100);
+
+ // Solve Ax=b using LU
+ VectorXmp x = A.lu().solve(b);
+ std::cout << "relative error: " << (A*x - b).norm() / b.norm() << std::endl;
+ return 0;
+}
+\endcode
+ *
+ */
+
+ template<> struct NumTraits<mpfr::mpreal>
+ : GenericNumTraits<mpfr::mpreal>
+ {
+ enum {
+ IsInteger = 0,
+ IsSigned = 1,
+ IsComplex = 0,
+ RequireInitialization = 1,
+ ReadCost = HugeCost,
+ AddCost = HugeCost,
+ MulCost = HugeCost
+ };
+
+ typedef mpfr::mpreal Real;
+ typedef mpfr::mpreal NonInteger;
+
+ static inline Real highest (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::maxval(Precision); }
+ static inline Real lowest (long Precision = mpfr::mpreal::get_default_prec()) { return -mpfr::maxval(Precision); }
+
+ // Constants
+ static inline Real Pi (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_pi(Precision); }
+ static inline Real Euler (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_euler(Precision); }
+ static inline Real Log2 (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_log2(Precision); }
+ static inline Real Catalan (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_catalan(Precision); }
+
+ static inline Real epsilon (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::machine_epsilon(Precision); }
+ static inline Real epsilon (const Real& x) { return mpfr::machine_epsilon(x); }
+
+#ifdef MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS
+ static inline int digits10 (long Precision = mpfr::mpreal::get_default_prec()) { return std::numeric_limits<Real>::digits10(Precision); }
+ static inline int digits10 (const Real& x) { return std::numeric_limits<Real>::digits10(x); }
+
+ static inline int digits () { return std::numeric_limits<Real>::digits(); }
+ static inline int digits (const Real& x) { return std::numeric_limits<Real>::digits(x); }
+#endif
+
+ static inline Real dummy_precision()
+ {
+ mpfr_prec_t weak_prec = ((mpfr::mpreal::get_default_prec()-1) * 90) / 100;
+ return mpfr::machine_epsilon(weak_prec);
+ }
+ };
+
+ namespace internal {
+
+ template<> inline mpfr::mpreal random<mpfr::mpreal>()
+ {
+ return mpfr::random();
+ }
+
+ template<> inline mpfr::mpreal random<mpfr::mpreal>(const mpfr::mpreal& a, const mpfr::mpreal& b)
+ {
+ return a + (b-a) * random<mpfr::mpreal>();
+ }
+
+ inline bool isMuchSmallerThan(const mpfr::mpreal& a, const mpfr::mpreal& b, const mpfr::mpreal& eps)
+ {
+ return mpfr::abs(a) <= mpfr::abs(b) * eps;
+ }
+
+ inline bool isApprox(const mpfr::mpreal& a, const mpfr::mpreal& b, const mpfr::mpreal& eps)
+ {
+ return mpfr::isEqualFuzzy(a,b,eps);
+ }
+
+ inline bool isApproxOrLessThan(const mpfr::mpreal& a, const mpfr::mpreal& b, const mpfr::mpreal& eps)
+ {
+ return a <= b || mpfr::isEqualFuzzy(a,b,eps);
+ }
+
+ template<> inline long double cast<mpfr::mpreal,long double>(const mpfr::mpreal& x)
+ { return x.toLDouble(); }
+
+ template<> inline double cast<mpfr::mpreal,double>(const mpfr::mpreal& x)
+ { return x.toDouble(); }
+
+ template<> inline long cast<mpfr::mpreal,long>(const mpfr::mpreal& x)
+ { return x.toLong(); }
+
+ template<> inline int cast<mpfr::mpreal,int>(const mpfr::mpreal& x)
+ { return int(x.toLong()); }
+
+ // Specialize GEBP kernel and traits for mpreal (no need for peeling, nor complicated stuff)
+ // This also permits to directly call mpfr's routines and avoid many temporaries produced by mpreal
+ template<>
+ class gebp_traits<mpfr::mpreal, mpfr::mpreal, false, false>
+ {
+ public:
+ typedef mpfr::mpreal ResScalar;
+ enum {
+ Vectorizable = false,
+ LhsPacketSize = 1,
+ RhsPacketSize = 1,
+ ResPacketSize = 1,
+ NumberOfRegisters = 1,
+ nr = 1,
+ mr = 1,
+ LhsProgress = 1,
+ RhsProgress = 1
+ };
+ typedef ResScalar LhsPacket;
+ typedef ResScalar RhsPacket;
+ typedef ResScalar ResPacket;
+ typedef LhsPacket LhsPacket4Packing;
+
+ };
+
+
+
+ template<typename Index, typename DataMapper, bool ConjugateLhs, bool ConjugateRhs>
+ struct gebp_kernel<mpfr::mpreal,mpfr::mpreal,Index,DataMapper,1,1,ConjugateLhs,ConjugateRhs>
+ {
+ typedef mpfr::mpreal mpreal;
+
+ EIGEN_DONT_INLINE
+ void operator()(const DataMapper& res, const mpreal* blockA, const mpreal* blockB,
+ Index rows, Index depth, Index cols, const mpreal& alpha,
+ Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0)
+ {
+ if(rows==0 || cols==0 || depth==0)
+ return;
+
+ mpreal acc1(0,mpfr_get_prec(blockA[0].mpfr_srcptr())),
+ tmp (0,mpfr_get_prec(blockA[0].mpfr_srcptr()));
+
+ if(strideA==-1) strideA = depth;
+ if(strideB==-1) strideB = depth;
+
+ for(Index i=0; i<rows; ++i)
+ {
+ for(Index j=0; j<cols; ++j)
+ {
+ const mpreal *A = blockA + i*strideA + offsetA;
+ const mpreal *B = blockB + j*strideB + offsetB;
+
+ acc1 = 0;
+ for(Index k=0; k<depth; k++)
+ {
+ mpfr_mul(tmp.mpfr_ptr(), A[k].mpfr_srcptr(), B[k].mpfr_srcptr(), mpreal::get_default_rnd());
+ mpfr_add(acc1.mpfr_ptr(), acc1.mpfr_ptr(), tmp.mpfr_ptr(), mpreal::get_default_rnd());
+ }
+
+ mpfr_mul(acc1.mpfr_ptr(), acc1.mpfr_srcptr(), alpha.mpfr_srcptr(), mpreal::get_default_rnd());
+ mpfr_add(res(i,j).mpfr_ptr(), res(i,j).mpfr_srcptr(), acc1.mpfr_srcptr(), mpreal::get_default_rnd());
+ }
+ }
+ }
+ };
+ } // end namespace internal
+}
+
+#endif // EIGEN_MPREALSUPPORT_MODULE_H
diff --git a/src/EigenUnsupported/MatrixFunctions b/src/EigenUnsupported/MatrixFunctions
new file mode 100644
index 0000000..20c23d1
--- /dev/null
+++ b/src/EigenUnsupported/MatrixFunctions
@@ -0,0 +1,504 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Jitse Niesen <jitse@maths.leeds.ac.uk>
+// Copyright (C) 2012 Chen-Pang He <jdh8@ms63.hinet.net>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATRIX_FUNCTIONS
+#define EIGEN_MATRIX_FUNCTIONS
+
+#include <cfloat>
+#include <list>
+
+#include "../../Eigen/Core"
+#include "../../Eigen/LU"
+#include "../../Eigen/Eigenvalues"
+
+/**
+ * \defgroup MatrixFunctions_Module Matrix functions module
+ * \brief This module aims to provide various methods for the computation of
+ * matrix functions.
+ *
+ * To use this module, add
+ * \code
+ * #include <unsupported/Eigen/MatrixFunctions>
+ * \endcode
+ * at the start of your source file.
+ *
+ * This module defines the following MatrixBase methods.
+ * - \ref matrixbase_cos "MatrixBase::cos()", for computing the matrix cosine
+ * - \ref matrixbase_cosh "MatrixBase::cosh()", for computing the matrix hyperbolic cosine
+ * - \ref matrixbase_exp "MatrixBase::exp()", for computing the matrix exponential
+ * - \ref matrixbase_log "MatrixBase::log()", for computing the matrix logarithm
+ * - \ref matrixbase_pow "MatrixBase::pow()", for computing the matrix power
+ * - \ref matrixbase_matrixfunction "MatrixBase::matrixFunction()", for computing general matrix functions
+ * - \ref matrixbase_sin "MatrixBase::sin()", for computing the matrix sine
+ * - \ref matrixbase_sinh "MatrixBase::sinh()", for computing the matrix hyperbolic sine
+ * - \ref matrixbase_sqrt "MatrixBase::sqrt()", for computing the matrix square root
+ *
+ * These methods are the main entry points to this module.
+ *
+ * %Matrix functions are defined as follows. Suppose that \f$ f \f$
+ * is an entire function (that is, a function on the complex plane
+ * that is everywhere complex differentiable). Then its Taylor
+ * series
+ * \f[ f(0) + f'(0) x + \frac{f''(0)}{2} x^2 + \frac{f'''(0)}{3!} x^3 + \cdots \f]
+ * converges to \f$ f(x) \f$. In this case, we can define the matrix
+ * function by the same series:
+ * \f[ f(M) = f(0) + f'(0) M + \frac{f''(0)}{2} M^2 + \frac{f'''(0)}{3!} M^3 + \cdots \f]
+ *
+ */
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+#include "src/MatrixFunctions/MatrixExponential.h"
+#include "src/MatrixFunctions/MatrixFunction.h"
+#include "src/MatrixFunctions/MatrixSquareRoot.h"
+#include "src/MatrixFunctions/MatrixLogarithm.h"
+#include "src/MatrixFunctions/MatrixPower.h"
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+
+/**
+\page matrixbaseextra_page
+\ingroup MatrixFunctions_Module
+
+\section matrixbaseextra MatrixBase methods defined in the MatrixFunctions module
+
+The remainder of the page documents the following MatrixBase methods
+which are defined in the MatrixFunctions module.
+
+
+
+\subsection matrixbase_cos MatrixBase::cos()
+
+Compute the matrix cosine.
+
+\code
+const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::cos() const
+\endcode
+
+\param[in] M a square matrix.
+\returns expression representing \f$ \cos(M) \f$.
+
+This function computes the matrix cosine. Use ArrayBase::cos() for computing the entry-wise cosine.
+
+The implementation calls \ref matrixbase_matrixfunction "matrixFunction()" with StdStemFunctions::cos().
+
+\sa \ref matrixbase_sin "sin()" for an example.
+
+
+
+\subsection matrixbase_cosh MatrixBase::cosh()
+
+Compute the matrix hyberbolic cosine.
+
+\code
+const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::cosh() const
+\endcode
+
+\param[in] M a square matrix.
+\returns expression representing \f$ \cosh(M) \f$
+
+This function calls \ref matrixbase_matrixfunction "matrixFunction()" with StdStemFunctions::cosh().
+
+\sa \ref matrixbase_sinh "sinh()" for an example.
+
+
+
+\subsection matrixbase_exp MatrixBase::exp()
+
+Compute the matrix exponential.
+
+\code
+const MatrixExponentialReturnValue<Derived> MatrixBase<Derived>::exp() const
+\endcode
+
+\param[in] M matrix whose exponential is to be computed.
+\returns expression representing the matrix exponential of \p M.
+
+The matrix exponential of \f$ M \f$ is defined by
+\f[ \exp(M) = \sum_{k=0}^\infty \frac{M^k}{k!}. \f]
+The matrix exponential can be used to solve linear ordinary
+differential equations: the solution of \f$ y' = My \f$ with the
+initial condition \f$ y(0) = y_0 \f$ is given by
+\f$ y(t) = \exp(M) y_0 \f$.
+
+The matrix exponential is different from applying the exp function to all the entries in the matrix.
+Use ArrayBase::exp() if you want to do the latter.
+
+The cost of the computation is approximately \f$ 20 n^3 \f$ for
+matrices of size \f$ n \f$. The number 20 depends weakly on the
+norm of the matrix.
+
+The matrix exponential is computed using the scaling-and-squaring
+method combined with Pad&eacute; approximation. The matrix is first
+rescaled, then the exponential of the reduced matrix is computed
+approximant, and then the rescaling is undone by repeated
+squaring. The degree of the Pad&eacute; approximant is chosen such
+that the approximation error is less than the round-off
+error. However, errors may accumulate during the squaring phase.
+
+Details of the algorithm can be found in: Nicholas J. Higham, "The
+scaling and squaring method for the matrix exponential revisited,"
+<em>SIAM J. %Matrix Anal. Applic.</em>, <b>26</b>:1179&ndash;1193,
+2005.
+
+Example: The following program checks that
+\f[ \exp \left[ \begin{array}{ccc}
+ 0 & \frac14\pi & 0 \\
+ -\frac14\pi & 0 & 0 \\
+ 0 & 0 & 0
+ \end{array} \right] = \left[ \begin{array}{ccc}
+ \frac12\sqrt2 & -\frac12\sqrt2 & 0 \\
+ \frac12\sqrt2 & \frac12\sqrt2 & 0 \\
+ 0 & 0 & 1
+ \end{array} \right]. \f]
+This corresponds to a rotation of \f$ \frac14\pi \f$ radians around
+the z-axis.
+
+\include MatrixExponential.cpp
+Output: \verbinclude MatrixExponential.out
+
+\note \p M has to be a matrix of \c float, \c double, `long double`
+\c complex<float>, \c complex<double>, or `complex<long double>` .
+
+
+\subsection matrixbase_log MatrixBase::log()
+
+Compute the matrix logarithm.
+
+\code
+const MatrixLogarithmReturnValue<Derived> MatrixBase<Derived>::log() const
+\endcode
+
+\param[in] M invertible matrix whose logarithm is to be computed.
+\returns expression representing the matrix logarithm root of \p M.
+
+The matrix logarithm of \f$ M \f$ is a matrix \f$ X \f$ such that
+\f$ \exp(X) = M \f$ where exp denotes the matrix exponential. As for
+the scalar logarithm, the equation \f$ \exp(X) = M \f$ may have
+multiple solutions; this function returns a matrix whose eigenvalues
+have imaginary part in the interval \f$ (-\pi,\pi] \f$.
+
+The matrix logarithm is different from applying the log function to all the entries in the matrix.
+Use ArrayBase::log() if you want to do the latter.
+
+In the real case, the matrix \f$ M \f$ should be invertible and
+it should have no eigenvalues which are real and negative (pairs of
+complex conjugate eigenvalues are allowed). In the complex case, it
+only needs to be invertible.
+
+This function computes the matrix logarithm using the Schur-Parlett
+algorithm as implemented by MatrixBase::matrixFunction(). The
+logarithm of an atomic block is computed by MatrixLogarithmAtomic,
+which uses direct computation for 1-by-1 and 2-by-2 blocks and an
+inverse scaling-and-squaring algorithm for bigger blocks, with the
+square roots computed by MatrixBase::sqrt().
+
+Details of the algorithm can be found in Section 11.6.2 of:
+Nicholas J. Higham,
+<em>Functions of Matrices: Theory and Computation</em>,
+SIAM 2008. ISBN 978-0-898716-46-7.
+
+Example: The following program checks that
+\f[ \log \left[ \begin{array}{ccc}
+ \frac12\sqrt2 & -\frac12\sqrt2 & 0 \\
+ \frac12\sqrt2 & \frac12\sqrt2 & 0 \\
+ 0 & 0 & 1
+ \end{array} \right] = \left[ \begin{array}{ccc}
+ 0 & \frac14\pi & 0 \\
+ -\frac14\pi & 0 & 0 \\
+ 0 & 0 & 0
+ \end{array} \right]. \f]
+This corresponds to a rotation of \f$ \frac14\pi \f$ radians around
+the z-axis. This is the inverse of the example used in the
+documentation of \ref matrixbase_exp "exp()".
+
+\include MatrixLogarithm.cpp
+Output: \verbinclude MatrixLogarithm.out
+
+\note \p M has to be a matrix of \c float, \c double, `long
+double`, \c complex<float>, \c complex<double>, or `complex<long double>`.
+
+\sa MatrixBase::exp(), MatrixBase::matrixFunction(),
+ class MatrixLogarithmAtomic, MatrixBase::sqrt().
+
+
+\subsection matrixbase_pow MatrixBase::pow()
+
+Compute the matrix raised to arbitrary real power.
+
+\code
+const MatrixPowerReturnValue<Derived> MatrixBase<Derived>::pow(RealScalar p) const
+\endcode
+
+\param[in] M base of the matrix power, should be a square matrix.
+\param[in] p exponent of the matrix power.
+
+The matrix power \f$ M^p \f$ is defined as \f$ \exp(p \log(M)) \f$,
+where exp denotes the matrix exponential, and log denotes the matrix
+logarithm. This is different from raising all the entries in the matrix
+to the p-th power. Use ArrayBase::pow() if you want to do the latter.
+
+If \p p is complex, the scalar type of \p M should be the type of \p
+p . \f$ M^p \f$ simply evaluates into \f$ \exp(p \log(M)) \f$.
+Therefore, the matrix \f$ M \f$ should meet the conditions to be an
+argument of matrix logarithm.
+
+If \p p is real, it is casted into the real scalar type of \p M. Then
+this function computes the matrix power using the Schur-Pad&eacute;
+algorithm as implemented by class MatrixPower. The exponent is split
+into integral part and fractional part, where the fractional part is
+in the interval \f$ (-1, 1) \f$. The main diagonal and the first
+super-diagonal is directly computed.
+
+If \p M is singular with a semisimple zero eigenvalue and \p p is
+positive, the Schur factor \f$ T \f$ is reordered with Givens
+rotations, i.e.
+
+\f[ T = \left[ \begin{array}{cc}
+ T_1 & T_2 \\
+ 0 & 0
+ \end{array} \right] \f]
+
+where \f$ T_1 \f$ is invertible. Then \f$ T^p \f$ is given by
+
+\f[ T^p = \left[ \begin{array}{cc}
+ T_1^p & T_1^{-1} T_1^p T_2 \\
+ 0 & 0
+ \end{array}. \right] \f]
+
+\warning Fractional power of a matrix with a non-semisimple zero
+eigenvalue is not well-defined. We introduce an assertion failure
+against inaccurate result, e.g. \code
+#include <unsupported/Eigen/MatrixFunctions>
+#include <iostream>
+
+int main()
+{
+ Eigen::Matrix4d A;
+ A << 0, 0, 2, 3,
+ 0, 0, 4, 5,
+ 0, 0, 6, 7,
+ 0, 0, 8, 9;
+ std::cout << A.pow(0.37) << std::endl;
+
+ // The 1 makes eigenvalue 0 non-semisimple.
+ A.coeffRef(0, 1) = 1;
+
+ // This fails if EIGEN_NO_DEBUG is undefined.
+ std::cout << A.pow(0.37) << std::endl;
+
+ return 0;
+}
+\endcode
+
+Details of the algorithm can be found in: Nicholas J. Higham and
+Lijing Lin, "A Schur-Pad&eacute; algorithm for fractional powers of a
+matrix," <em>SIAM J. %Matrix Anal. Applic.</em>,
+<b>32(3)</b>:1056&ndash;1078, 2011.
+
+Example: The following program checks that
+\f[ \left[ \begin{array}{ccc}
+ \cos1 & -\sin1 & 0 \\
+ \sin1 & \cos1 & 0 \\
+ 0 & 0 & 1
+ \end{array} \right]^{\frac14\pi} = \left[ \begin{array}{ccc}
+ \frac12\sqrt2 & -\frac12\sqrt2 & 0 \\
+ \frac12\sqrt2 & \frac12\sqrt2 & 0 \\
+ 0 & 0 & 1
+ \end{array} \right]. \f]
+This corresponds to \f$ \frac14\pi \f$ rotations of 1 radian around
+the z-axis.
+
+\include MatrixPower.cpp
+Output: \verbinclude MatrixPower.out
+
+MatrixBase::pow() is user-friendly. However, there are some
+circumstances under which you should use class MatrixPower directly.
+MatrixPower can save the result of Schur decomposition, so it's
+better for computing various powers for the same matrix.
+
+Example:
+\include MatrixPower_optimal.cpp
+Output: \verbinclude MatrixPower_optimal.out
+
+\note \p M has to be a matrix of \c float, \c double, `long
+double`, \c complex<float>, \c complex<double>, or
+\c complex<long double> .
+
+\sa MatrixBase::exp(), MatrixBase::log(), class MatrixPower.
+
+
+\subsection matrixbase_matrixfunction MatrixBase::matrixFunction()
+
+Compute a matrix function.
+
+\code
+const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::matrixFunction(typename internal::stem_function<typename internal::traits<Derived>::Scalar>::type f) const
+\endcode
+
+\param[in] M argument of matrix function, should be a square matrix.
+\param[in] f an entire function; \c f(x,n) should compute the n-th
+derivative of f at x.
+\returns expression representing \p f applied to \p M.
+
+Suppose that \p M is a matrix whose entries have type \c Scalar.
+Then, the second argument, \p f, should be a function with prototype
+\code
+ComplexScalar f(ComplexScalar, int)
+\endcode
+where \c ComplexScalar = \c std::complex<Scalar> if \c Scalar is
+real (e.g., \c float or \c double) and \c ComplexScalar =
+\c Scalar if \c Scalar is complex. The return value of \c f(x,n)
+should be \f$ f^{(n)}(x) \f$, the n-th derivative of f at x.
+
+This routine uses the algorithm described in:
+Philip Davies and Nicholas J. Higham,
+"A Schur-Parlett algorithm for computing matrix functions",
+<em>SIAM J. %Matrix Anal. Applic.</em>, <b>25</b>:464&ndash;485, 2003.
+
+The actual work is done by the MatrixFunction class.
+
+Example: The following program checks that
+\f[ \exp \left[ \begin{array}{ccc}
+ 0 & \frac14\pi & 0 \\
+ -\frac14\pi & 0 & 0 \\
+ 0 & 0 & 0
+ \end{array} \right] = \left[ \begin{array}{ccc}
+ \frac12\sqrt2 & -\frac12\sqrt2 & 0 \\
+ \frac12\sqrt2 & \frac12\sqrt2 & 0 \\
+ 0 & 0 & 1
+ \end{array} \right]. \f]
+This corresponds to a rotation of \f$ \frac14\pi \f$ radians around
+the z-axis. This is the same example as used in the documentation
+of \ref matrixbase_exp "exp()".
+
+\include MatrixFunction.cpp
+Output: \verbinclude MatrixFunction.out
+
+Note that the function \c expfn is defined for complex numbers
+\c x, even though the matrix \c A is over the reals. Instead of
+\c expfn, we could also have used StdStemFunctions::exp:
+\code
+A.matrixFunction(StdStemFunctions<std::complex<double> >::exp, &B);
+\endcode
+
+
+
+\subsection matrixbase_sin MatrixBase::sin()
+
+Compute the matrix sine.
+
+\code
+const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::sin() const
+\endcode
+
+\param[in] M a square matrix.
+\returns expression representing \f$ \sin(M) \f$.
+
+This function computes the matrix sine. Use ArrayBase::sin() for computing the entry-wise sine.
+
+The implementation calls \ref matrixbase_matrixfunction "matrixFunction()" with StdStemFunctions::sin().
+
+Example: \include MatrixSine.cpp
+Output: \verbinclude MatrixSine.out
+
+
+
+\subsection matrixbase_sinh MatrixBase::sinh()
+
+Compute the matrix hyperbolic sine.
+
+\code
+MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::sinh() const
+\endcode
+
+\param[in] M a square matrix.
+\returns expression representing \f$ \sinh(M) \f$
+
+This function calls \ref matrixbase_matrixfunction "matrixFunction()" with StdStemFunctions::sinh().
+
+Example: \include MatrixSinh.cpp
+Output: \verbinclude MatrixSinh.out
+
+
+\subsection matrixbase_sqrt MatrixBase::sqrt()
+
+Compute the matrix square root.
+
+\code
+const MatrixSquareRootReturnValue<Derived> MatrixBase<Derived>::sqrt() const
+\endcode
+
+\param[in] M invertible matrix whose square root is to be computed.
+\returns expression representing the matrix square root of \p M.
+
+The matrix square root of \f$ M \f$ is the matrix \f$ M^{1/2} \f$
+whose square is the original matrix; so if \f$ S = M^{1/2} \f$ then
+\f$ S^2 = M \f$. This is different from taking the square root of all
+the entries in the matrix; use ArrayBase::sqrt() if you want to do the
+latter.
+
+In the <b>real case</b>, the matrix \f$ M \f$ should be invertible and
+it should have no eigenvalues which are real and negative (pairs of
+complex conjugate eigenvalues are allowed). In that case, the matrix
+has a square root which is also real, and this is the square root
+computed by this function.
+
+The matrix square root is computed by first reducing the matrix to
+quasi-triangular form with the real Schur decomposition. The square
+root of the quasi-triangular matrix can then be computed directly. The
+cost is approximately \f$ 25 n^3 \f$ real flops for the real Schur
+decomposition and \f$ 3\frac13 n^3 \f$ real flops for the remainder
+(though the computation time in practice is likely more than this
+indicates).
+
+Details of the algorithm can be found in: Nicholas J. Highan,
+"Computing real square roots of a real matrix", <em>Linear Algebra
+Appl.</em>, 88/89:405&ndash;430, 1987.
+
+If the matrix is <b>positive-definite symmetric</b>, then the square
+root is also positive-definite symmetric. In this case, it is best to
+use SelfAdjointEigenSolver::operatorSqrt() to compute it.
+
+In the <b>complex case</b>, the matrix \f$ M \f$ should be invertible;
+this is a restriction of the algorithm. The square root computed by
+this algorithm is the one whose eigenvalues have an argument in the
+interval \f$ (-\frac12\pi, \frac12\pi] \f$. This is the usual branch
+cut.
+
+The computation is the same as in the real case, except that the
+complex Schur decomposition is used to reduce the matrix to a
+triangular matrix. The theoretical cost is the same. Details are in:
+&Aring;ke Bj&ouml;rck and Sven Hammarling, "A Schur method for the
+square root of a matrix", <em>Linear Algebra Appl.</em>,
+52/53:127&ndash;140, 1983.
+
+Example: The following program checks that the square root of
+\f[ \left[ \begin{array}{cc}
+ \cos(\frac13\pi) & -\sin(\frac13\pi) \\
+ \sin(\frac13\pi) & \cos(\frac13\pi)
+ \end{array} \right], \f]
+corresponding to a rotation over 60 degrees, is a rotation over 30 degrees:
+\f[ \left[ \begin{array}{cc}
+ \cos(\frac16\pi) & -\sin(\frac16\pi) \\
+ \sin(\frac16\pi) & \cos(\frac16\pi)
+ \end{array} \right]. \f]
+
+\include MatrixSquareRoot.cpp
+Output: \verbinclude MatrixSquareRoot.out
+
+\sa class RealSchur, class ComplexSchur, class MatrixSquareRoot,
+ SelfAdjointEigenSolver::operatorSqrt().
+
+*/
+
+#endif // EIGEN_MATRIX_FUNCTIONS
+
diff --git a/src/EigenUnsupported/MoreVectorization b/src/EigenUnsupported/MoreVectorization
new file mode 100644
index 0000000..7662b47
--- /dev/null
+++ b/src/EigenUnsupported/MoreVectorization
@@ -0,0 +1,24 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MOREVECTORIZATION_MODULE_H
+#define EIGEN_MOREVECTORIZATION_MODULE_H
+
+#include "../../Eigen/Core"
+
+namespace Eigen {
+
+/**
+ * \defgroup MoreVectorization More vectorization module
+ */
+
+}
+
+#include "src/MoreVectorization/MathFunctions.h"
+
+#endif // EIGEN_MOREVECTORIZATION_MODULE_H
diff --git a/src/EigenUnsupported/NonLinearOptimization b/src/EigenUnsupported/NonLinearOptimization
new file mode 100644
index 0000000..961f192
--- /dev/null
+++ b/src/EigenUnsupported/NonLinearOptimization
@@ -0,0 +1,140 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Thomas Capricelli <orzel@freehackers.org>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_NONLINEAROPTIMIZATION_MODULE
+#define EIGEN_NONLINEAROPTIMIZATION_MODULE
+
+#include <vector>
+
+#include "../../Eigen/Core"
+#include "../../Eigen/Jacobi"
+#include "../../Eigen/QR"
+#include "NumericalDiff"
+
+/**
+ * \defgroup NonLinearOptimization_Module Non linear optimization module
+ *
+ * \code
+ * #include <unsupported/Eigen/NonLinearOptimization>
+ * \endcode
+ *
+ * This module provides implementation of two important algorithms in non linear
+ * optimization. In both cases, we consider a system of non linear functions. Of
+ * course, this should work, and even work very well if those functions are
+ * actually linear. But if this is so, you should probably better use other
+ * methods more fitted to this special case.
+ *
+ * One algorithm allows to find a least-squares solution of such a system
+ * (Levenberg-Marquardt algorithm) and the second one is used to find
+ * a zero for the system (Powell hybrid "dogleg" method).
+ *
+ * This code is a port of minpack (http://en.wikipedia.org/wiki/MINPACK).
+ * Minpack is a very famous, old, robust and well renowned package, written in
+ * fortran. Those implementations have been carefully tuned, tested, and used
+ * for several decades.
+ *
+ * The original fortran code was automatically translated using f2c (http://en.wikipedia.org/wiki/F2c) in C,
+ * then c++, and then cleaned by several different authors.
+ * The last one of those cleanings being our starting point :
+ * http://devernay.free.fr/hacks/cminpack.html
+ *
+ * Finally, we ported this code to Eigen, creating classes and API
+ * coherent with Eigen. When possible, we switched to Eigen
+ * implementation, such as most linear algebra (vectors, matrices, stable norms).
+ *
+ * Doing so, we were very careful to check the tests we setup at the very
+ * beginning, which ensure that the same results are found.
+ *
+ * \section Tests Tests
+ *
+ * The tests are placed in the file unsupported/test/NonLinear.cpp.
+ *
+ * There are two kinds of tests : those that come from examples bundled with cminpack.
+ * They guaranty we get the same results as the original algorithms (value for 'x',
+ * for the number of evaluations of the function, and for the number of evaluations
+ * of the Jacobian if ever).
+ *
+ * Other tests were added by myself at the very beginning of the
+ * process and check the results for Levenberg-Marquardt using the reference data
+ * on http://www.itl.nist.gov/div898/strd/nls/nls_main.shtml. Since then i've
+ * carefully checked that the same results were obtained when modifying the
+ * code. Please note that we do not always get the exact same decimals as they do,
+ * but this is ok : they use 128bits float, and we do the tests using the C type 'double',
+ * which is 64 bits on most platforms (x86 and amd64, at least).
+ * I've performed those tests on several other implementations of Levenberg-Marquardt, and
+ * (c)minpack performs VERY well compared to those, both in accuracy and speed.
+ *
+ * The documentation for running the tests is on the wiki
+ * http://eigen.tuxfamily.org/index.php?title=Tests
+ *
+ * \section API API: overview of methods
+ *
+ * Both algorithms needs a functor computing the Jacobian. It can be computed by
+ * hand, using auto-differentiation (see \ref AutoDiff_Module), or using numerical
+ * differences (see \ref NumericalDiff_Module). For instance:
+ *\code
+ * MyFunc func;
+ * NumericalDiff<MyFunc> func_with_num_diff(func);
+ * LevenbergMarquardt<NumericalDiff<MyFunc> > lm(func_with_num_diff);
+ * \endcode
+ * For HybridNonLinearSolver, the method solveNumericalDiff() does the above wrapping for
+ * you.
+ *
+ * The methods LevenbergMarquardt.lmder1()/lmdif1()/lmstr1() and
+ * HybridNonLinearSolver.hybrj1()/hybrd1() are specific methods from the original
+ * minpack package that you probably should NOT use until you are porting a code that
+ * was previously using minpack. They just define a 'simple' API with default values
+ * for some parameters.
+ *
+ * All algorithms are provided using two APIs :
+ * - one where the user inits the algorithm, and uses '*OneStep()' as much as he wants :
+ * this way the caller have control over the steps
+ * - one where the user just calls a method (optimize() or solve()) which will
+ * handle the loop: init + loop until a stop condition is met. Those are provided for
+ * convenience.
+ *
+ * As an example, the method LevenbergMarquardt::minimize() is
+ * implemented as follow:
+ * \code
+ * Status LevenbergMarquardt<FunctorType,Scalar>::minimize(FVectorType &x, const int mode)
+ * {
+ * Status status = minimizeInit(x, mode);
+ * do {
+ * status = minimizeOneStep(x, mode);
+ * } while (status==Running);
+ * return status;
+ * }
+ * \endcode
+ *
+ * \section examples Examples
+ *
+ * The easiest way to understand how to use this module is by looking at the many examples in the file
+ * unsupported/test/NonLinearOptimization.cpp.
+ */
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+
+#include "src/NonLinearOptimization/qrsolv.h"
+#include "src/NonLinearOptimization/r1updt.h"
+#include "src/NonLinearOptimization/r1mpyq.h"
+#include "src/NonLinearOptimization/rwupdt.h"
+#include "src/NonLinearOptimization/fdjac1.h"
+#include "src/NonLinearOptimization/lmpar.h"
+#include "src/NonLinearOptimization/dogleg.h"
+#include "src/NonLinearOptimization/covar.h"
+
+#include "src/NonLinearOptimization/chkder.h"
+
+#endif
+
+#include "src/NonLinearOptimization/HybridNonLinearSolver.h"
+#include "src/NonLinearOptimization/LevenbergMarquardt.h"
+
+
+#endif // EIGEN_NONLINEAROPTIMIZATION_MODULE
diff --git a/src/EigenUnsupported/NumericalDiff b/src/EigenUnsupported/NumericalDiff
new file mode 100644
index 0000000..0668f96
--- /dev/null
+++ b/src/EigenUnsupported/NumericalDiff
@@ -0,0 +1,56 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Thomas Capricelli <orzel@freehackers.org>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_NUMERICALDIFF_MODULE
+#define EIGEN_NUMERICALDIFF_MODULE
+
+#include "../../Eigen/Core"
+
+namespace Eigen {
+
+/**
+ * \defgroup NumericalDiff_Module Numerical differentiation module
+ *
+ * \code
+ * #include <unsupported/Eigen/NumericalDiff>
+ * \endcode
+ *
+ * See http://en.wikipedia.org/wiki/Numerical_differentiation
+ *
+ * Warning : this should NOT be confused with automatic differentiation, which
+ * is a different method and has its own module in Eigen : \ref
+ * AutoDiff_Module.
+ *
+ * Currently only "Forward" and "Central" schemes are implemented. Those
+ * are basic methods, and there exist some more elaborated way of
+ * computing such approximates. They are implemented using both
+ * proprietary and free software, and usually requires linking to an
+ * external library. It is very easy for you to write a functor
+ * using such software, and the purpose is quite orthogonal to what we
+ * want to achieve with Eigen.
+ *
+ * This is why we will not provide wrappers for every great numerical
+ * differentiation software that exist, but should rather stick with those
+ * basic ones, that still are useful for testing.
+ *
+ * Also, the \ref NonLinearOptimization_Module needs this in order to
+ * provide full features compatibility with the original (c)minpack
+ * package.
+ *
+ */
+}
+
+//@{
+
+#include "src/NumericalDiff/NumericalDiff.h"
+
+//@}
+
+
+#endif // EIGEN_NUMERICALDIFF_MODULE
diff --git a/src/EigenUnsupported/OpenGLSupport b/src/EigenUnsupported/OpenGLSupport
new file mode 100644
index 0000000..f8c2130
--- /dev/null
+++ b/src/EigenUnsupported/OpenGLSupport
@@ -0,0 +1,322 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_OPENGL_MODULE
+#define EIGEN_OPENGL_MODULE
+
+#include "../../Eigen/Geometry"
+
+#if defined(__APPLE_CC__)
+ #include <OpenGL/gl.h>
+#else
+ #include <GL/gl.h>
+#endif
+
+namespace Eigen {
+
+/**
+ * \defgroup OpenGLSUpport_Module OpenGL Support module
+ *
+ * This module provides wrapper functions for a couple of OpenGL functions
+ * which simplify the way to pass Eigen's object to openGL.
+ * Here is an example:
+ *
+ * \code
+ * // You need to add path_to_eigen/unsupported to your include path.
+ * #include <Eigen/OpenGLSupport>
+ * // ...
+ * Vector3f x, y;
+ * Matrix3f rot;
+ *
+ * glVertex(y + x * rot);
+ *
+ * Quaternion q;
+ * glRotate(q);
+ *
+ * // ...
+ * \endcode
+ *
+ */
+//@{
+
+#define EIGEN_GL_FUNC_DECLARATION(FUNC) \
+namespace internal { \
+ template< typename XprType, \
+ typename Scalar = typename XprType::Scalar, \
+ int Rows = XprType::RowsAtCompileTime, \
+ int Cols = XprType::ColsAtCompileTime, \
+ bool IsGLCompatible = bool(internal::evaluator<XprType>::Flags&LinearAccessBit) \
+ && bool(XprType::Flags&DirectAccessBit) \
+ && (XprType::IsVectorAtCompileTime || (XprType::Flags&RowMajorBit)==0)> \
+ struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl); \
+ \
+ template<typename XprType, typename Scalar, int Rows, int Cols> \
+ struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)<XprType,Scalar,Rows,Cols,false> { \
+ inline static void run(const XprType& p) { \
+ EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)<typename plain_matrix_type_column_major<XprType>::type>::run(p); } \
+ }; \
+} \
+ \
+template<typename Derived> inline void FUNC(const Eigen::DenseBase<Derived>& p) { \
+ EIGEN_CAT(EIGEN_CAT(internal::gl_,FUNC),_impl)<Derived>::run(p.derived()); \
+}
+
+
+#define EIGEN_GL_FUNC_SPECIALIZATION_MAT(FUNC,SCALAR,ROWS,COLS,SUFFIX) \
+namespace internal { \
+ template< typename XprType> struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)<XprType, SCALAR, ROWS, COLS, true> { \
+ inline static void run(const XprType& p) { FUNC##SUFFIX(p.data()); } \
+ }; \
+}
+
+
+#define EIGEN_GL_FUNC_SPECIALIZATION_VEC(FUNC,SCALAR,SIZE,SUFFIX) \
+namespace internal { \
+ template< typename XprType> struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)<XprType, SCALAR, SIZE, 1, true> { \
+ inline static void run(const XprType& p) { FUNC##SUFFIX(p.data()); } \
+ }; \
+ template< typename XprType> struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)<XprType, SCALAR, 1, SIZE, true> { \
+ inline static void run(const XprType& p) { FUNC##SUFFIX(p.data()); } \
+ }; \
+}
+
+
+EIGEN_GL_FUNC_DECLARATION (glVertex)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,int, 2,2iv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,short, 2,2sv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,float, 2,2fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,double, 2,2dv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,int, 3,3iv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,short, 3,3sv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,float, 3,3fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,double, 3,3dv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,int, 4,4iv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,short, 4,4sv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,float, 4,4fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,double, 4,4dv)
+
+EIGEN_GL_FUNC_DECLARATION (glTexCoord)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,int, 2,2iv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,short, 2,2sv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,float, 2,2fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,double, 2,2dv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,int, 3,3iv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,short, 3,3sv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,float, 3,3fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,double, 3,3dv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,int, 4,4iv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,short, 4,4sv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,float, 4,4fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,double, 4,4dv)
+
+EIGEN_GL_FUNC_DECLARATION (glColor)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,int, 2,2iv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,short, 2,2sv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,float, 2,2fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,double, 2,2dv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,int, 3,3iv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,short, 3,3sv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,float, 3,3fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,double, 3,3dv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,int, 4,4iv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,short, 4,4sv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,float, 4,4fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,double, 4,4dv)
+
+EIGEN_GL_FUNC_DECLARATION (glNormal)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glNormal,int, 3,3iv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glNormal,short, 3,3sv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glNormal,float, 3,3fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glNormal,double, 3,3dv)
+
+inline void glScale2fv(const float* v) { glScalef(v[0], v[1], 1.f); }
+inline void glScale2dv(const double* v) { glScaled(v[0], v[1], 1.0); }
+inline void glScale3fv(const float* v) { glScalef(v[0], v[1], v[2]); }
+inline void glScale3dv(const double* v) { glScaled(v[0], v[1], v[2]); }
+
+EIGEN_GL_FUNC_DECLARATION (glScale)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glScale,float, 2,2fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glScale,double, 2,2dv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glScale,float, 3,3fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glScale,double, 3,3dv)
+
+template<typename Scalar> void glScale(const UniformScaling<Scalar>& s) { glScale(Matrix<Scalar,3,1>::Constant(s.factor())); }
+
+inline void glTranslate2fv(const float* v) { glTranslatef(v[0], v[1], 0.f); }
+inline void glTranslate2dv(const double* v) { glTranslated(v[0], v[1], 0.0); }
+inline void glTranslate3fv(const float* v) { glTranslatef(v[0], v[1], v[2]); }
+inline void glTranslate3dv(const double* v) { glTranslated(v[0], v[1], v[2]); }
+
+EIGEN_GL_FUNC_DECLARATION (glTranslate)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTranslate,float, 2,2fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTranslate,double, 2,2dv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTranslate,float, 3,3fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTranslate,double, 3,3dv)
+
+template<typename Scalar> void glTranslate(const Translation<Scalar,2>& t) { glTranslate(t.vector()); }
+template<typename Scalar> void glTranslate(const Translation<Scalar,3>& t) { glTranslate(t.vector()); }
+
+EIGEN_GL_FUNC_DECLARATION (glMultMatrix)
+EIGEN_GL_FUNC_SPECIALIZATION_MAT(glMultMatrix,float, 4,4,f)
+EIGEN_GL_FUNC_SPECIALIZATION_MAT(glMultMatrix,double, 4,4,d)
+
+template<typename Scalar> void glMultMatrix(const Transform<Scalar,3,Affine>& t) { glMultMatrix(t.matrix()); }
+template<typename Scalar> void glMultMatrix(const Transform<Scalar,3,Projective>& t) { glMultMatrix(t.matrix()); }
+template<typename Scalar> void glMultMatrix(const Transform<Scalar,3,AffineCompact>& t) { glMultMatrix(Transform<Scalar,3,Affine>(t).matrix()); }
+
+EIGEN_GL_FUNC_DECLARATION (glLoadMatrix)
+EIGEN_GL_FUNC_SPECIALIZATION_MAT(glLoadMatrix,float, 4,4,f)
+EIGEN_GL_FUNC_SPECIALIZATION_MAT(glLoadMatrix,double, 4,4,d)
+
+template<typename Scalar> void glLoadMatrix(const Transform<Scalar,3,Affine>& t) { glLoadMatrix(t.matrix()); }
+template<typename Scalar> void glLoadMatrix(const Transform<Scalar,3,Projective>& t) { glLoadMatrix(t.matrix()); }
+template<typename Scalar> void glLoadMatrix(const Transform<Scalar,3,AffineCompact>& t) { glLoadMatrix(Transform<Scalar,3,Affine>(t).matrix()); }
+
+inline void glRotate(const Rotation2D<float>& rot)
+{
+ glRotatef(rot.angle()*180.f/float(EIGEN_PI), 0.f, 0.f, 1.f);
+}
+inline void glRotate(const Rotation2D<double>& rot)
+{
+ glRotated(rot.angle()*180.0/double(EIGEN_PI), 0.0, 0.0, 1.0);
+}
+
+template<typename Derived> void glRotate(const RotationBase<Derived,3>& rot)
+{
+ Transform<typename Derived::Scalar,3,Projective> tr(rot);
+ glMultMatrix(tr.matrix());
+}
+
+#define EIGEN_GL_MAKE_CONST_const const
+#define EIGEN_GL_MAKE_CONST__
+#define EIGEN_GL_EVAL(X) X
+
+#define EIGEN_GL_FUNC1_DECLARATION(FUNC,ARG1,CONST) \
+namespace internal { \
+ template< typename XprType, \
+ typename Scalar = typename XprType::Scalar, \
+ int Rows = XprType::RowsAtCompileTime, \
+ int Cols = XprType::ColsAtCompileTime, \
+ bool IsGLCompatible = bool(internal::evaluator<XprType>::Flags&LinearAccessBit) \
+ && bool(XprType::Flags&DirectAccessBit) \
+ && (XprType::IsVectorAtCompileTime || (XprType::Flags&RowMajorBit)==0)> \
+ struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl); \
+ \
+ template<typename XprType, typename Scalar, int Rows, int Cols> \
+ struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)<XprType,Scalar,Rows,Cols,false> { \
+ inline static void run(ARG1 a,EIGEN_GL_EVAL(EIGEN_GL_MAKE_CONST_##CONST) XprType& p) { \
+ EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)<typename plain_matrix_type_column_major<XprType>::type>::run(a,p); } \
+ }; \
+} \
+ \
+template<typename Derived> inline void FUNC(ARG1 a,EIGEN_GL_EVAL(EIGEN_GL_MAKE_CONST_##CONST) Eigen::DenseBase<Derived>& p) { \
+ EIGEN_CAT(EIGEN_CAT(internal::gl_,FUNC),_impl)<Derived>::run(a,p.derived()); \
+}
+
+
+#define EIGEN_GL_FUNC1_SPECIALIZATION_MAT(FUNC,ARG1,CONST,SCALAR,ROWS,COLS,SUFFIX) \
+namespace internal { \
+ template< typename XprType> struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)<XprType, SCALAR, ROWS, COLS, true> { \
+ inline static void run(ARG1 a, EIGEN_GL_EVAL(EIGEN_GL_MAKE_CONST_##CONST) XprType& p) { FUNC##SUFFIX(a,p.data()); } \
+ }; \
+}
+
+
+#define EIGEN_GL_FUNC1_SPECIALIZATION_VEC(FUNC,ARG1,CONST,SCALAR,SIZE,SUFFIX) \
+namespace internal { \
+ template< typename XprType> struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)<XprType, SCALAR, SIZE, 1, true> { \
+ inline static void run(ARG1 a, EIGEN_GL_EVAL(EIGEN_GL_MAKE_CONST_##CONST) XprType& p) { FUNC##SUFFIX(a,p.data()); } \
+ }; \
+ template< typename XprType> struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)<XprType, SCALAR, 1, SIZE, true> { \
+ inline static void run(ARG1 a, EIGEN_GL_EVAL(EIGEN_GL_MAKE_CONST_##CONST) XprType& p) { FUNC##SUFFIX(a,p.data()); } \
+ }; \
+}
+
+EIGEN_GL_FUNC1_DECLARATION (glGet,GLenum,_)
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glGet,GLenum,_,float, 4,4,Floatv)
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glGet,GLenum,_,double, 4,4,Doublev)
+
+// glUniform API
+
+#ifdef GL_VERSION_2_0
+
+inline void glUniform2fv_ei (GLint loc, const float* v) { glUniform2fv(loc,1,v); }
+inline void glUniform2iv_ei (GLint loc, const int* v) { glUniform2iv(loc,1,v); }
+
+inline void glUniform3fv_ei (GLint loc, const float* v) { glUniform3fv(loc,1,v); }
+inline void glUniform3iv_ei (GLint loc, const int* v) { glUniform3iv(loc,1,v); }
+
+inline void glUniform4fv_ei (GLint loc, const float* v) { glUniform4fv(loc,1,v); }
+inline void glUniform4iv_ei (GLint loc, const int* v) { glUniform4iv(loc,1,v); }
+
+inline void glUniformMatrix2fv_ei (GLint loc, const float* v) { glUniformMatrix2fv(loc,1,false,v); }
+inline void glUniformMatrix3fv_ei (GLint loc, const float* v) { glUniformMatrix3fv(loc,1,false,v); }
+inline void glUniformMatrix4fv_ei (GLint loc, const float* v) { glUniformMatrix4fv(loc,1,false,v); }
+
+
+EIGEN_GL_FUNC1_DECLARATION (glUniform,GLint,const)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,float, 2,2fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,int, 2,2iv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,float, 3,3fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,int, 3,3iv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,float, 4,4fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,int, 4,4iv_ei)
+
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float, 2,2,Matrix2fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float, 3,3,Matrix3fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float, 4,4,Matrix4fv_ei)
+
+#endif
+
+#ifdef GL_VERSION_2_1
+
+inline void glUniformMatrix2x3fv_ei(GLint loc, const float* v) { glUniformMatrix2x3fv(loc,1,false,v); }
+inline void glUniformMatrix3x2fv_ei(GLint loc, const float* v) { glUniformMatrix3x2fv(loc,1,false,v); }
+inline void glUniformMatrix2x4fv_ei(GLint loc, const float* v) { glUniformMatrix2x4fv(loc,1,false,v); }
+inline void glUniformMatrix4x2fv_ei(GLint loc, const float* v) { glUniformMatrix4x2fv(loc,1,false,v); }
+inline void glUniformMatrix3x4fv_ei(GLint loc, const float* v) { glUniformMatrix3x4fv(loc,1,false,v); }
+inline void glUniformMatrix4x3fv_ei(GLint loc, const float* v) { glUniformMatrix4x3fv(loc,1,false,v); }
+
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float, 2,3,Matrix2x3fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float, 3,2,Matrix3x2fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float, 2,4,Matrix2x4fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float, 4,2,Matrix4x2fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float, 3,4,Matrix3x4fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float, 4,3,Matrix4x3fv_ei)
+
+#endif
+
+#ifdef GL_VERSION_3_0
+
+inline void glUniform2uiv_ei (GLint loc, const unsigned int* v) { glUniform2uiv(loc,1,v); }
+inline void glUniform3uiv_ei (GLint loc, const unsigned int* v) { glUniform3uiv(loc,1,v); }
+inline void glUniform4uiv_ei (GLint loc, const unsigned int* v) { glUniform4uiv(loc,1,v); }
+
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,unsigned int, 2,2uiv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,unsigned int, 3,3uiv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,unsigned int, 4,4uiv_ei)
+
+#endif
+
+#ifdef GL_ARB_gpu_shader_fp64
+inline void glUniform2dv_ei (GLint loc, const double* v) { glUniform2dv(loc,1,v); }
+inline void glUniform3dv_ei (GLint loc, const double* v) { glUniform3dv(loc,1,v); }
+inline void glUniform4dv_ei (GLint loc, const double* v) { glUniform4dv(loc,1,v); }
+
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,double, 2,2dv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,double, 3,3dv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,double, 4,4dv_ei)
+#endif
+
+
+//@}
+
+}
+
+#endif // EIGEN_OPENGL_MODULE
diff --git a/src/EigenUnsupported/Polynomials b/src/EigenUnsupported/Polynomials
new file mode 100644
index 0000000..32ce2a2
--- /dev/null
+++ b/src/EigenUnsupported/Polynomials
@@ -0,0 +1,137 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_POLYNOMIALS_MODULE_H
+#define EIGEN_POLYNOMIALS_MODULE_H
+
+#include "../../Eigen/Core"
+
+#include "../../Eigen/Eigenvalues"
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+// Note that EIGEN_HIDE_HEAVY_CODE has to be defined per module
+#if (defined EIGEN_EXTERN_INSTANTIATIONS) && (EIGEN_EXTERN_INSTANTIATIONS>=2)
+ #ifndef EIGEN_HIDE_HEAVY_CODE
+ #define EIGEN_HIDE_HEAVY_CODE
+ #endif
+#elif defined EIGEN_HIDE_HEAVY_CODE
+ #undef EIGEN_HIDE_HEAVY_CODE
+#endif
+
+/**
+ * \defgroup Polynomials_Module Polynomials module
+ * \brief This module provides a QR based polynomial solver.
+ *
+ * To use this module, add
+ * \code
+ * #include <unsupported/Eigen/Polynomials>
+ * \endcode
+ * at the start of your source file.
+ */
+
+#include "src/Polynomials/PolynomialUtils.h"
+#include "src/Polynomials/Companion.h"
+#include "src/Polynomials/PolynomialSolver.h"
+
+/**
+ \page polynomials Polynomials defines functions for dealing with polynomials
+ and a QR based polynomial solver.
+ \ingroup Polynomials_Module
+
+ The remainder of the page documents first the functions for evaluating, computing
+ polynomials, computing estimates about polynomials and next the QR based polynomial
+ solver.
+
+ \section polynomialUtils convenient functions to deal with polynomials
+ \subsection roots_to_monicPolynomial
+ The function
+ \code
+ void roots_to_monicPolynomial( const RootVector& rv, Polynomial& poly )
+ \endcode
+ computes the coefficients \f$ a_i \f$ of
+
+ \f$ p(x) = a_0 + a_{1}x + ... + a_{n-1}x^{n-1} + x^n \f$
+
+ where \f$ p \f$ is known through its roots i.e. \f$ p(x) = (x-r_1)(x-r_2)...(x-r_n) \f$.
+
+ \subsection poly_eval
+ The function
+ \code
+ T poly_eval( const Polynomials& poly, const T& x )
+ \endcode
+ evaluates a polynomial at a given point using stabilized H&ouml;rner method.
+
+ The following code: first computes the coefficients in the monomial basis of the monic polynomial that has the provided roots;
+ then, it evaluates the computed polynomial, using a stabilized H&ouml;rner method.
+
+ \include PolynomialUtils1.cpp
+ Output: \verbinclude PolynomialUtils1.out
+
+ \subsection Cauchy bounds
+ The function
+ \code
+ Real cauchy_max_bound( const Polynomial& poly )
+ \endcode
+ provides a maximum bound (the Cauchy one: \f$C(p)\f$) for the absolute value of a root of the given polynomial i.e.
+ \f$ \forall r_i \f$ root of \f$ p(x) = \sum_{k=0}^d a_k x^k \f$,
+ \f$ |r_i| \le C(p) = \sum_{k=0}^{d} \left | \frac{a_k}{a_d} \right | \f$
+ The leading coefficient \f$ p \f$: should be non zero \f$a_d \neq 0\f$.
+
+
+ The function
+ \code
+ Real cauchy_min_bound( const Polynomial& poly )
+ \endcode
+ provides a minimum bound (the Cauchy one: \f$c(p)\f$) for the absolute value of a non zero root of the given polynomial i.e.
+ \f$ \forall r_i \neq 0 \f$ root of \f$ p(x) = \sum_{k=0}^d a_k x^k \f$,
+ \f$ |r_i| \ge c(p) = \left( \sum_{k=0}^{d} \left | \frac{a_k}{a_0} \right | \right)^{-1} \f$
+
+
+
+
+ \section QR polynomial solver class
+ Computes the complex roots of a polynomial by computing the eigenvalues of the associated companion matrix with the QR algorithm.
+
+ The roots of \f$ p(x) = a_0 + a_1 x + a_2 x^2 + a_{3} x^3 + x^4 \f$ are the eigenvalues of
+ \f$
+ \left [
+ \begin{array}{cccc}
+ 0 & 0 & 0 & a_0 \\
+ 1 & 0 & 0 & a_1 \\
+ 0 & 1 & 0 & a_2 \\
+ 0 & 0 & 1 & a_3
+ \end{array} \right ]
+ \f$
+
+ However, the QR algorithm is not guaranteed to converge when there are several eigenvalues with same modulus.
+
+ Therefore the current polynomial solver is guaranteed to provide a correct result only when the complex roots \f$r_1,r_2,...,r_d\f$ have distinct moduli i.e.
+
+ \f$ \forall i,j \in [1;d],~ \| r_i \| \neq \| r_j \| \f$.
+
+ With 32bit (float) floating types this problem shows up frequently.
+ However, almost always, correct accuracy is reached even in these cases for 64bit
+ (double) floating types and small polynomial degree (<20).
+
+ \include PolynomialSolver1.cpp
+
+ In the above example:
+
+ -# a simple use of the polynomial solver is shown;
+ -# the accuracy problem with the QR algorithm is presented: a polynomial with almost conjugate roots is provided to the solver.
+ Those roots have almost same module therefore the QR algorithm failed to converge: the accuracy
+ of the last root is bad;
+ -# a simple way to circumvent the problem is shown: use doubles instead of floats.
+
+ Output: \verbinclude PolynomialSolver1.out
+*/
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_POLYNOMIALS_MODULE_H
diff --git a/src/EigenUnsupported/Skyline b/src/EigenUnsupported/Skyline
new file mode 100644
index 0000000..ebdf143
--- /dev/null
+++ b/src/EigenUnsupported/Skyline
@@ -0,0 +1,39 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SKYLINE_MODULE_H
+#define EIGEN_SKYLINE_MODULE_H
+
+
+#include "../../Eigen/Core"
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+#include <map>
+#include <cstdlib>
+#include <cstring>
+#include <algorithm>
+
+/**
+ * \defgroup Skyline_Module Skyline module
+ *
+ *
+ *
+ *
+ */
+
+#include "src/Skyline/SkylineUtil.h"
+#include "src/Skyline/SkylineMatrixBase.h"
+#include "src/Skyline/SkylineStorage.h"
+#include "src/Skyline/SkylineMatrix.h"
+#include "src/Skyline/SkylineInplaceLU.h"
+#include "src/Skyline/SkylineProduct.h"
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_SKYLINE_MODULE_H
diff --git a/src/EigenUnsupported/SparseExtra b/src/EigenUnsupported/SparseExtra
new file mode 100644
index 0000000..ba5cbd6
--- /dev/null
+++ b/src/EigenUnsupported/SparseExtra
@@ -0,0 +1,54 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <g.gael@free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSE_EXTRA_MODULE_H
+#define EIGEN_SPARSE_EXTRA_MODULE_H
+
+#include "../../Eigen/Sparse"
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+#include <vector>
+#include <map>
+#include <cstdlib>
+#include <cstring>
+#include <algorithm>
+#include <fstream>
+#include <sstream>
+
+#ifdef EIGEN_GOOGLEHASH_SUPPORT
+ #include <google/dense_hash_map>
+ #include <google/sparse_hash_map>
+#endif
+
+/**
+ * \defgroup SparseExtra_Module SparseExtra module
+ *
+ * This module contains some experimental features extending the sparse module.
+ *
+ * \code
+ * #include <Eigen/SparseExtra>
+ * \endcode
+ */
+
+
+#include "src/SparseExtra/DynamicSparseMatrix.h"
+#include "src/SparseExtra/BlockOfDynamicSparseMatrix.h"
+#include "src/SparseExtra/RandomSetter.h"
+
+#include "src/SparseExtra/MarketIO.h"
+
+#if !defined(_WIN32)
+#include <dirent.h>
+#include "src/SparseExtra/MatrixMarketIterator.h"
+#endif
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_SPARSE_EXTRA_MODULE_H
diff --git a/src/EigenUnsupported/SpecialFunctions b/src/EigenUnsupported/SpecialFunctions
new file mode 100644
index 0000000..f6a2460
--- /dev/null
+++ b/src/EigenUnsupported/SpecialFunctions
@@ -0,0 +1,103 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <g.gael@free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPECIALFUNCTIONS_MODULE
+#define EIGEN_SPECIALFUNCTIONS_MODULE
+
+#include <math.h>
+
+#include "../../Eigen/Core"
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+namespace Eigen {
+
+/**
+ * \defgroup SpecialFunctions_Module Special math functions module
+ *
+ * This module features additional coefficient-wise math functions available
+ * within the numext:: namespace for the scalar version, and as method and/or free
+ * functions of Array. Those include:
+ *
+ * - erf
+ * - erfc
+ * - lgamma
+ * - igamma
+ * - igamma_der_a
+ * - gamma_sample_der_alpha
+ * - igammac
+ * - digamma
+ * - ndtri
+ * - polygamma
+ * - zeta
+ * - betainc
+ *
+ * Bessel Functions
+ * - bessel_i0
+ * - bessel_i0e
+ * - bessel_i1
+ * - bessel_i1e
+ * - bessel_j0
+ * - bessel_j1
+ * - bessel_k0
+ * - bessel_k0e
+ * - bessel_k1
+ * - bessel_k1e
+ * - bessel_y0
+ * - bessel_y1
+ *
+ * \code
+ * #include <unsupported/Eigen/SpecialFunctions>
+ * \endcode
+ */
+//@{
+
+}
+
+#include "src/SpecialFunctions/BesselFunctionsImpl.h"
+#include "src/SpecialFunctions/BesselFunctionsBFloat16.h"
+#include "src/SpecialFunctions/BesselFunctionsHalf.h"
+#include "src/SpecialFunctions/BesselFunctionsPacketMath.h"
+#include "src/SpecialFunctions/BesselFunctionsFunctors.h"
+#include "src/SpecialFunctions/BesselFunctionsArrayAPI.h"
+#include "src/SpecialFunctions/SpecialFunctionsImpl.h"
+#if defined(EIGEN_HIPCC)
+#include "src/SpecialFunctions/HipVectorCompatibility.h"
+#endif
+#include "src/SpecialFunctions/SpecialFunctionsBFloat16.h"
+#include "src/SpecialFunctions/SpecialFunctionsHalf.h"
+#include "src/SpecialFunctions/SpecialFunctionsPacketMath.h"
+#include "src/SpecialFunctions/SpecialFunctionsFunctors.h"
+#include "src/SpecialFunctions/SpecialFunctionsArrayAPI.h"
+
+#if defined EIGEN_VECTORIZE_AVX512
+ #include "src/SpecialFunctions/arch/AVX/BesselFunctions.h"
+ #include "src/SpecialFunctions/arch/AVX/SpecialFunctions.h"
+ #include "src/SpecialFunctions/arch/AVX512/BesselFunctions.h"
+ #include "src/SpecialFunctions/arch/AVX512/SpecialFunctions.h"
+#elif defined EIGEN_VECTORIZE_AVX
+ #include "src/SpecialFunctions/arch/AVX/BesselFunctions.h"
+ #include "src/SpecialFunctions/arch/AVX/SpecialFunctions.h"
+#elif defined EIGEN_VECTORIZE_NEON
+ #include "src/SpecialFunctions/arch/NEON/BesselFunctions.h"
+ #include "src/SpecialFunctions/arch/NEON/SpecialFunctions.h"
+#endif
+
+#if defined EIGEN_VECTORIZE_GPU
+ #include "src/SpecialFunctions/arch/GPU/SpecialFunctions.h"
+#endif
+
+namespace Eigen {
+//@}
+}
+
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_SPECIALFUNCTIONS_MODULE
diff --git a/src/EigenUnsupported/Splines b/src/EigenUnsupported/Splines
new file mode 100644
index 0000000..2ca5813
--- /dev/null
+++ b/src/EigenUnsupported/Splines
@@ -0,0 +1,35 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 20010-2011 Hauke Heibel <hauke.heibel@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPLINES_MODULE_H
+#define EIGEN_SPLINES_MODULE_H
+
+namespace Eigen
+{
+/**
+ * \defgroup Splines_Module Spline and spline fitting module
+ *
+ * This module provides a simple multi-dimensional spline class while
+ * offering most basic functionality to fit a spline to point sets.
+ *
+ * \code
+ * #include <unsupported/Eigen/Splines>
+ * \endcode
+ */
+}
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+#include "src/Splines/SplineFwd.h"
+#include "src/Splines/Spline.h"
+#include "src/Splines/SplineFitting.h"
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_SPLINES_MODULE_H
diff --git a/src/EigenUnsupported/src/AutoDiff/AutoDiffJacobian.h b/src/EigenUnsupported/src/AutoDiff/AutoDiffJacobian.h
new file mode 100644
index 0000000..33b6c39
--- /dev/null
+++ b/src/EigenUnsupported/src/AutoDiff/AutoDiffJacobian.h
@@ -0,0 +1,108 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_AUTODIFF_JACOBIAN_H
+#define EIGEN_AUTODIFF_JACOBIAN_H
+
+namespace Eigen
+{
+
+template<typename Functor> class AutoDiffJacobian : public Functor
+{
+public:
+ AutoDiffJacobian() : Functor() {}
+ AutoDiffJacobian(const Functor& f) : Functor(f) {}
+
+ // forward constructors
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+ template<typename... T>
+ AutoDiffJacobian(const T& ...Values) : Functor(Values...) {}
+#else
+ template<typename T0>
+ AutoDiffJacobian(const T0& a0) : Functor(a0) {}
+ template<typename T0, typename T1>
+ AutoDiffJacobian(const T0& a0, const T1& a1) : Functor(a0, a1) {}
+ template<typename T0, typename T1, typename T2>
+ AutoDiffJacobian(const T0& a0, const T1& a1, const T2& a2) : Functor(a0, a1, a2) {}
+#endif
+
+ typedef typename Functor::InputType InputType;
+ typedef typename Functor::ValueType ValueType;
+ typedef typename ValueType::Scalar Scalar;
+
+ enum {
+ InputsAtCompileTime = InputType::RowsAtCompileTime,
+ ValuesAtCompileTime = ValueType::RowsAtCompileTime
+ };
+
+ typedef Matrix<Scalar, ValuesAtCompileTime, InputsAtCompileTime> JacobianType;
+ typedef typename JacobianType::Index Index;
+
+ typedef Matrix<Scalar, InputsAtCompileTime, 1> DerivativeType;
+ typedef AutoDiffScalar<DerivativeType> ActiveScalar;
+
+ typedef Matrix<ActiveScalar, InputsAtCompileTime, 1> ActiveInput;
+ typedef Matrix<ActiveScalar, ValuesAtCompileTime, 1> ActiveValue;
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+ // Some compilers don't accept variadic parameters after a default parameter,
+ // i.e., we can't just write _jac=0 but we need to overload operator():
+ EIGEN_STRONG_INLINE
+ void operator() (const InputType& x, ValueType* v) const
+ {
+ this->operator()(x, v, 0);
+ }
+ template<typename... ParamsType>
+ void operator() (const InputType& x, ValueType* v, JacobianType* _jac,
+ const ParamsType&... Params) const
+#else
+ void operator() (const InputType& x, ValueType* v, JacobianType* _jac=0) const
+#endif
+ {
+ eigen_assert(v!=0);
+
+ if (!_jac)
+ {
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+ Functor::operator()(x, v, Params...);
+#else
+ Functor::operator()(x, v);
+#endif
+ return;
+ }
+
+ JacobianType& jac = *_jac;
+
+ ActiveInput ax = x.template cast<ActiveScalar>();
+ ActiveValue av(jac.rows());
+
+ if(InputsAtCompileTime==Dynamic)
+ for (Index j=0; j<jac.rows(); j++)
+ av[j].derivatives().resize(x.rows());
+
+ for (Index i=0; i<jac.cols(); i++)
+ ax[i].derivatives() = DerivativeType::Unit(x.rows(),i);
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+ Functor::operator()(ax, &av, Params...);
+#else
+ Functor::operator()(ax, &av);
+#endif
+
+ for (Index i=0; i<jac.rows(); i++)
+ {
+ (*v)[i] = av[i].value();
+ jac.row(i) = av[i].derivatives();
+ }
+ }
+};
+
+}
+
+#endif // EIGEN_AUTODIFF_JACOBIAN_H
diff --git a/src/EigenUnsupported/src/AutoDiff/AutoDiffScalar.h b/src/EigenUnsupported/src/AutoDiff/AutoDiffScalar.h
new file mode 100755
index 0000000..0f166e3
--- /dev/null
+++ b/src/EigenUnsupported/src/AutoDiff/AutoDiffScalar.h
@@ -0,0 +1,730 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_AUTODIFF_SCALAR_H
+#define EIGEN_AUTODIFF_SCALAR_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename A, typename B>
+struct make_coherent_impl {
+ static void run(A&, B&) {}
+};
+
+// resize a to match b is a.size()==0, and conversely.
+template<typename A, typename B>
+void make_coherent(const A& a, const B&b)
+{
+ make_coherent_impl<A,B>::run(a.const_cast_derived(), b.const_cast_derived());
+}
+
+template<typename DerivativeType, bool Enable> struct auto_diff_special_op;
+
+} // end namespace internal
+
+template<typename DerivativeType> class AutoDiffScalar;
+
+template<typename NewDerType>
+inline AutoDiffScalar<NewDerType> MakeAutoDiffScalar(const typename NewDerType::Scalar& value, const NewDerType &der) {
+ return AutoDiffScalar<NewDerType>(value,der);
+}
+
+/** \class AutoDiffScalar
+ * \brief A scalar type replacement with automatic differentiation capability
+ *
+ * \param DerivativeType the vector type used to store/represent the derivatives. The base scalar type
+ * as well as the number of derivatives to compute are determined from this type.
+ * Typical choices include, e.g., \c Vector4f for 4 derivatives, or \c VectorXf
+ * if the number of derivatives is not known at compile time, and/or, the number
+ * of derivatives is large.
+ * Note that DerivativeType can also be a reference (e.g., \c VectorXf&) to wrap a
+ * existing vector into an AutoDiffScalar.
+ * Finally, DerivativeType can also be any Eigen compatible expression.
+ *
+ * This class represents a scalar value while tracking its respective derivatives using Eigen's expression
+ * template mechanism.
+ *
+ * It supports the following list of global math function:
+ * - std::abs, std::sqrt, std::pow, std::exp, std::log, std::sin, std::cos,
+ * - internal::abs, internal::sqrt, numext::pow, internal::exp, internal::log, internal::sin, internal::cos,
+ * - internal::conj, internal::real, internal::imag, numext::abs2.
+ *
+ * AutoDiffScalar can be used as the scalar type of an Eigen::Matrix object. However,
+ * in that case, the expression template mechanism only occurs at the top Matrix level,
+ * while derivatives are computed right away.
+ *
+ */
+
+template<typename DerivativeType>
+class AutoDiffScalar
+ : public internal::auto_diff_special_op
+ <DerivativeType, !internal::is_same<typename internal::traits<typename internal::remove_all<DerivativeType>::type>::Scalar,
+ typename NumTraits<typename internal::traits<typename internal::remove_all<DerivativeType>::type>::Scalar>::Real>::value>
+{
+ public:
+ typedef internal::auto_diff_special_op
+ <DerivativeType, !internal::is_same<typename internal::traits<typename internal::remove_all<DerivativeType>::type>::Scalar,
+ typename NumTraits<typename internal::traits<typename internal::remove_all<DerivativeType>::type>::Scalar>::Real>::value> Base;
+ typedef typename internal::remove_all<DerivativeType>::type DerType;
+ typedef typename internal::traits<DerType>::Scalar Scalar;
+ typedef typename NumTraits<Scalar>::Real Real;
+
+ using Base::operator+;
+ using Base::operator*;
+
+ /** Default constructor without any initialization. */
+ AutoDiffScalar() {}
+
+ /** Constructs an active scalar from its \a value,
+ and initializes the \a nbDer derivatives such that it corresponds to the \a derNumber -th variable */
+ AutoDiffScalar(const Scalar& value, int nbDer, int derNumber)
+ : m_value(value), m_derivatives(DerType::Zero(nbDer))
+ {
+ m_derivatives.coeffRef(derNumber) = Scalar(1);
+ }
+
+ /** Conversion from a scalar constant to an active scalar.
+ * The derivatives are set to zero. */
+ /*explicit*/ AutoDiffScalar(const Real& value)
+ : m_value(value)
+ {
+ if(m_derivatives.size()>0)
+ m_derivatives.setZero();
+ }
+
+ /** Constructs an active scalar from its \a value and derivatives \a der */
+ AutoDiffScalar(const Scalar& value, const DerType& der)
+ : m_value(value), m_derivatives(der)
+ {}
+
+ template<typename OtherDerType>
+ AutoDiffScalar(const AutoDiffScalar<OtherDerType>& other
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+ , typename internal::enable_if<
+ internal::is_same<Scalar, typename internal::traits<typename internal::remove_all<OtherDerType>::type>::Scalar>::value
+ && internal::is_convertible<OtherDerType,DerType>::value , void*>::type = 0
+#endif
+ )
+ : m_value(other.value()), m_derivatives(other.derivatives())
+ {}
+
+ friend std::ostream & operator << (std::ostream & s, const AutoDiffScalar& a)
+ {
+ return s << a.value();
+ }
+
+ AutoDiffScalar(const AutoDiffScalar& other)
+ : m_value(other.value()), m_derivatives(other.derivatives())
+ {}
+
+ template<typename OtherDerType>
+ inline AutoDiffScalar& operator=(const AutoDiffScalar<OtherDerType>& other)
+ {
+ m_value = other.value();
+ m_derivatives = other.derivatives();
+ return *this;
+ }
+
+ inline AutoDiffScalar& operator=(const AutoDiffScalar& other)
+ {
+ m_value = other.value();
+ m_derivatives = other.derivatives();
+ return *this;
+ }
+
+ inline AutoDiffScalar& operator=(const Scalar& other)
+ {
+ m_value = other;
+ if(m_derivatives.size()>0)
+ m_derivatives.setZero();
+ return *this;
+ }
+
+// inline operator const Scalar& () const { return m_value; }
+// inline operator Scalar& () { return m_value; }
+
+ inline const Scalar& value() const { return m_value; }
+ inline Scalar& value() { return m_value; }
+
+ inline const DerType& derivatives() const { return m_derivatives; }
+ inline DerType& derivatives() { return m_derivatives; }
+
+ inline bool operator< (const Scalar& other) const { return m_value < other; }
+ inline bool operator<=(const Scalar& other) const { return m_value <= other; }
+ inline bool operator> (const Scalar& other) const { return m_value > other; }
+ inline bool operator>=(const Scalar& other) const { return m_value >= other; }
+ inline bool operator==(const Scalar& other) const { return m_value == other; }
+ inline bool operator!=(const Scalar& other) const { return m_value != other; }
+
+ friend inline bool operator< (const Scalar& a, const AutoDiffScalar& b) { return a < b.value(); }
+ friend inline bool operator<=(const Scalar& a, const AutoDiffScalar& b) { return a <= b.value(); }
+ friend inline bool operator> (const Scalar& a, const AutoDiffScalar& b) { return a > b.value(); }
+ friend inline bool operator>=(const Scalar& a, const AutoDiffScalar& b) { return a >= b.value(); }
+ friend inline bool operator==(const Scalar& a, const AutoDiffScalar& b) { return a == b.value(); }
+ friend inline bool operator!=(const Scalar& a, const AutoDiffScalar& b) { return a != b.value(); }
+
+ template<typename OtherDerType> inline bool operator< (const AutoDiffScalar<OtherDerType>& b) const { return m_value < b.value(); }
+ template<typename OtherDerType> inline bool operator<=(const AutoDiffScalar<OtherDerType>& b) const { return m_value <= b.value(); }
+ template<typename OtherDerType> inline bool operator> (const AutoDiffScalar<OtherDerType>& b) const { return m_value > b.value(); }
+ template<typename OtherDerType> inline bool operator>=(const AutoDiffScalar<OtherDerType>& b) const { return m_value >= b.value(); }
+ template<typename OtherDerType> inline bool operator==(const AutoDiffScalar<OtherDerType>& b) const { return m_value == b.value(); }
+ template<typename OtherDerType> inline bool operator!=(const AutoDiffScalar<OtherDerType>& b) const { return m_value != b.value(); }
+
+ inline const AutoDiffScalar<DerType&> operator+(const Scalar& other) const
+ {
+ return AutoDiffScalar<DerType&>(m_value + other, m_derivatives);
+ }
+
+ friend inline const AutoDiffScalar<DerType&> operator+(const Scalar& a, const AutoDiffScalar& b)
+ {
+ return AutoDiffScalar<DerType&>(a + b.value(), b.derivatives());
+ }
+
+// inline const AutoDiffScalar<DerType&> operator+(const Real& other) const
+// {
+// return AutoDiffScalar<DerType&>(m_value + other, m_derivatives);
+// }
+
+// friend inline const AutoDiffScalar<DerType&> operator+(const Real& a, const AutoDiffScalar& b)
+// {
+// return AutoDiffScalar<DerType&>(a + b.value(), b.derivatives());
+// }
+
+ inline AutoDiffScalar& operator+=(const Scalar& other)
+ {
+ value() += other;
+ return *this;
+ }
+
+ template<typename OtherDerType>
+ inline const AutoDiffScalar<CwiseBinaryOp<internal::scalar_sum_op<Scalar>,const DerType,const typename internal::remove_all<OtherDerType>::type> >
+ operator+(const AutoDiffScalar<OtherDerType>& other) const
+ {
+ internal::make_coherent(m_derivatives, other.derivatives());
+ return AutoDiffScalar<CwiseBinaryOp<internal::scalar_sum_op<Scalar>,const DerType,const typename internal::remove_all<OtherDerType>::type> >(
+ m_value + other.value(),
+ m_derivatives + other.derivatives());
+ }
+
+ template<typename OtherDerType>
+ inline AutoDiffScalar&
+ operator+=(const AutoDiffScalar<OtherDerType>& other)
+ {
+ (*this) = (*this) + other;
+ return *this;
+ }
+
+ inline const AutoDiffScalar<DerType&> operator-(const Scalar& b) const
+ {
+ return AutoDiffScalar<DerType&>(m_value - b, m_derivatives);
+ }
+
+ friend inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const DerType> >
+ operator-(const Scalar& a, const AutoDiffScalar& b)
+ {
+ return AutoDiffScalar<CwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const DerType> >
+ (a - b.value(), -b.derivatives());
+ }
+
+ inline AutoDiffScalar& operator-=(const Scalar& other)
+ {
+ value() -= other;
+ return *this;
+ }
+
+ template<typename OtherDerType>
+ inline const AutoDiffScalar<CwiseBinaryOp<internal::scalar_difference_op<Scalar>, const DerType,const typename internal::remove_all<OtherDerType>::type> >
+ operator-(const AutoDiffScalar<OtherDerType>& other) const
+ {
+ internal::make_coherent(m_derivatives, other.derivatives());
+ return AutoDiffScalar<CwiseBinaryOp<internal::scalar_difference_op<Scalar>, const DerType,const typename internal::remove_all<OtherDerType>::type> >(
+ m_value - other.value(),
+ m_derivatives - other.derivatives());
+ }
+
+ template<typename OtherDerType>
+ inline AutoDiffScalar&
+ operator-=(const AutoDiffScalar<OtherDerType>& other)
+ {
+ *this = *this - other;
+ return *this;
+ }
+
+ inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const DerType> >
+ operator-() const
+ {
+ return AutoDiffScalar<CwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const DerType> >(
+ -m_value,
+ -m_derivatives);
+ }
+
+ inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) >
+ operator*(const Scalar& other) const
+ {
+ return MakeAutoDiffScalar(m_value * other, m_derivatives * other);
+ }
+
+ friend inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) >
+ operator*(const Scalar& other, const AutoDiffScalar& a)
+ {
+ return MakeAutoDiffScalar(a.value() * other, a.derivatives() * other);
+ }
+
+// inline const AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >
+// operator*(const Real& other) const
+// {
+// return AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >(
+// m_value * other,
+// (m_derivatives * other));
+// }
+//
+// friend inline const AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >
+// operator*(const Real& other, const AutoDiffScalar& a)
+// {
+// return AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >(
+// a.value() * other,
+// a.derivatives() * other);
+// }
+
+ inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) >
+ operator/(const Scalar& other) const
+ {
+ return MakeAutoDiffScalar(m_value / other, (m_derivatives * (Scalar(1)/other)));
+ }
+
+ friend inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) >
+ operator/(const Scalar& other, const AutoDiffScalar& a)
+ {
+ return MakeAutoDiffScalar(other / a.value(), a.derivatives() * (Scalar(-other) / (a.value()*a.value())));
+ }
+
+// inline const AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >
+// operator/(const Real& other) const
+// {
+// return AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >(
+// m_value / other,
+// (m_derivatives * (Real(1)/other)));
+// }
+//
+// friend inline const AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >
+// operator/(const Real& other, const AutoDiffScalar& a)
+// {
+// return AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >(
+// other / a.value(),
+// a.derivatives() * (-Real(1)/other));
+// }
+
+ template<typename OtherDerType>
+ inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(
+ CwiseBinaryOp<internal::scalar_difference_op<Scalar> EIGEN_COMMA
+ const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) EIGEN_COMMA
+ const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename internal::remove_all<OtherDerType>::type,Scalar,product) >,Scalar,product) >
+ operator/(const AutoDiffScalar<OtherDerType>& other) const
+ {
+ internal::make_coherent(m_derivatives, other.derivatives());
+ return MakeAutoDiffScalar(
+ m_value / other.value(),
+ ((m_derivatives * other.value()) - (other.derivatives() * m_value))
+ * (Scalar(1)/(other.value()*other.value())));
+ }
+
+ template<typename OtherDerType>
+ inline const AutoDiffScalar<CwiseBinaryOp<internal::scalar_sum_op<Scalar>,
+ const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product),
+ const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename internal::remove_all<OtherDerType>::type,Scalar,product) > >
+ operator*(const AutoDiffScalar<OtherDerType>& other) const
+ {
+ internal::make_coherent(m_derivatives, other.derivatives());
+ return MakeAutoDiffScalar(
+ m_value * other.value(),
+ (m_derivatives * other.value()) + (other.derivatives() * m_value));
+ }
+
+ inline AutoDiffScalar& operator*=(const Scalar& other)
+ {
+ *this = *this * other;
+ return *this;
+ }
+
+ template<typename OtherDerType>
+ inline AutoDiffScalar& operator*=(const AutoDiffScalar<OtherDerType>& other)
+ {
+ *this = *this * other;
+ return *this;
+ }
+
+ inline AutoDiffScalar& operator/=(const Scalar& other)
+ {
+ *this = *this / other;
+ return *this;
+ }
+
+ template<typename OtherDerType>
+ inline AutoDiffScalar& operator/=(const AutoDiffScalar<OtherDerType>& other)
+ {
+ *this = *this / other;
+ return *this;
+ }
+
+ protected:
+ Scalar m_value;
+ DerType m_derivatives;
+
+};
+
+namespace internal {
+
+template<typename DerivativeType>
+struct auto_diff_special_op<DerivativeType, true>
+// : auto_diff_scalar_op<DerivativeType, typename NumTraits<Scalar>::Real,
+// is_same<Scalar,typename NumTraits<Scalar>::Real>::value>
+{
+ typedef typename remove_all<DerivativeType>::type DerType;
+ typedef typename traits<DerType>::Scalar Scalar;
+ typedef typename NumTraits<Scalar>::Real Real;
+
+// typedef auto_diff_scalar_op<DerivativeType, typename NumTraits<Scalar>::Real,
+// is_same<Scalar,typename NumTraits<Scalar>::Real>::value> Base;
+
+// using Base::operator+;
+// using Base::operator+=;
+// using Base::operator-;
+// using Base::operator-=;
+// using Base::operator*;
+// using Base::operator*=;
+
+ const AutoDiffScalar<DerivativeType>& derived() const { return *static_cast<const AutoDiffScalar<DerivativeType>*>(this); }
+ AutoDiffScalar<DerivativeType>& derived() { return *static_cast<AutoDiffScalar<DerivativeType>*>(this); }
+
+
+ inline const AutoDiffScalar<DerType&> operator+(const Real& other) const
+ {
+ return AutoDiffScalar<DerType&>(derived().value() + other, derived().derivatives());
+ }
+
+ friend inline const AutoDiffScalar<DerType&> operator+(const Real& a, const AutoDiffScalar<DerivativeType>& b)
+ {
+ return AutoDiffScalar<DerType&>(a + b.value(), b.derivatives());
+ }
+
+ inline AutoDiffScalar<DerivativeType>& operator+=(const Real& other)
+ {
+ derived().value() += other;
+ return derived();
+ }
+
+
+ inline const AutoDiffScalar<typename CwiseUnaryOp<bind2nd_op<scalar_product_op<Scalar,Real> >, DerType>::Type >
+ operator*(const Real& other) const
+ {
+ return AutoDiffScalar<typename CwiseUnaryOp<bind2nd_op<scalar_product_op<Scalar,Real> >, DerType>::Type >(
+ derived().value() * other,
+ derived().derivatives() * other);
+ }
+
+ friend inline const AutoDiffScalar<typename CwiseUnaryOp<bind1st_op<scalar_product_op<Real,Scalar> >, DerType>::Type >
+ operator*(const Real& other, const AutoDiffScalar<DerivativeType>& a)
+ {
+ return AutoDiffScalar<typename CwiseUnaryOp<bind1st_op<scalar_product_op<Real,Scalar> >, DerType>::Type >(
+ a.value() * other,
+ a.derivatives() * other);
+ }
+
+ inline AutoDiffScalar<DerivativeType>& operator*=(const Scalar& other)
+ {
+ *this = *this * other;
+ return derived();
+ }
+};
+
+template<typename DerivativeType>
+struct auto_diff_special_op<DerivativeType, false>
+{
+ void operator*() const;
+ void operator-() const;
+ void operator+() const;
+};
+
+template<typename BinOp, typename A, typename B, typename RefType>
+void make_coherent_expression(CwiseBinaryOp<BinOp,A,B> xpr, const RefType &ref)
+{
+ make_coherent(xpr.const_cast_derived().lhs(), ref);
+ make_coherent(xpr.const_cast_derived().rhs(), ref);
+}
+
+template<typename UnaryOp, typename A, typename RefType>
+void make_coherent_expression(const CwiseUnaryOp<UnaryOp,A> &xpr, const RefType &ref)
+{
+ make_coherent(xpr.nestedExpression().const_cast_derived(), ref);
+}
+
+// needed for compilation only
+template<typename UnaryOp, typename A, typename RefType>
+void make_coherent_expression(const CwiseNullaryOp<UnaryOp,A> &, const RefType &)
+{}
+
+template<typename A_Scalar, int A_Rows, int A_Cols, int A_Options, int A_MaxRows, int A_MaxCols, typename B>
+struct make_coherent_impl<Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols>, B> {
+ typedef Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols> A;
+ static void run(A& a, B& b) {
+ if((A_Rows==Dynamic || A_Cols==Dynamic) && (a.size()==0))
+ {
+ a.resize(b.size());
+ a.setZero();
+ }
+ else if (B::SizeAtCompileTime==Dynamic && a.size()!=0 && b.size()==0)
+ {
+ make_coherent_expression(b,a);
+ }
+ }
+};
+
+template<typename A, typename B_Scalar, int B_Rows, int B_Cols, int B_Options, int B_MaxRows, int B_MaxCols>
+struct make_coherent_impl<A, Matrix<B_Scalar, B_Rows, B_Cols, B_Options, B_MaxRows, B_MaxCols> > {
+ typedef Matrix<B_Scalar, B_Rows, B_Cols, B_Options, B_MaxRows, B_MaxCols> B;
+ static void run(A& a, B& b) {
+ if((B_Rows==Dynamic || B_Cols==Dynamic) && (b.size()==0))
+ {
+ b.resize(a.size());
+ b.setZero();
+ }
+ else if (A::SizeAtCompileTime==Dynamic && b.size()!=0 && a.size()==0)
+ {
+ make_coherent_expression(a,b);
+ }
+ }
+};
+
+template<typename A_Scalar, int A_Rows, int A_Cols, int A_Options, int A_MaxRows, int A_MaxCols,
+ typename B_Scalar, int B_Rows, int B_Cols, int B_Options, int B_MaxRows, int B_MaxCols>
+struct make_coherent_impl<Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols>,
+ Matrix<B_Scalar, B_Rows, B_Cols, B_Options, B_MaxRows, B_MaxCols> > {
+ typedef Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols> A;
+ typedef Matrix<B_Scalar, B_Rows, B_Cols, B_Options, B_MaxRows, B_MaxCols> B;
+ static void run(A& a, B& b) {
+ if((A_Rows==Dynamic || A_Cols==Dynamic) && (a.size()==0))
+ {
+ a.resize(b.size());
+ a.setZero();
+ }
+ else if((B_Rows==Dynamic || B_Cols==Dynamic) && (b.size()==0))
+ {
+ b.resize(a.size());
+ b.setZero();
+ }
+ }
+};
+
+} // end namespace internal
+
+template<typename DerType, typename BinOp>
+struct ScalarBinaryOpTraits<AutoDiffScalar<DerType>,typename DerType::Scalar,BinOp>
+{
+ typedef AutoDiffScalar<DerType> ReturnType;
+};
+
+template<typename DerType, typename BinOp>
+struct ScalarBinaryOpTraits<typename DerType::Scalar,AutoDiffScalar<DerType>, BinOp>
+{
+ typedef AutoDiffScalar<DerType> ReturnType;
+};
+
+
+// The following is an attempt to let Eigen's known about expression template, but that's more tricky!
+
+// template<typename DerType, typename BinOp>
+// struct ScalarBinaryOpTraits<AutoDiffScalar<DerType>,AutoDiffScalar<DerType>, BinOp>
+// {
+// enum { Defined = 1 };
+// typedef AutoDiffScalar<typename DerType::PlainObject> ReturnType;
+// };
+//
+// template<typename DerType1,typename DerType2, typename BinOp>
+// struct ScalarBinaryOpTraits<AutoDiffScalar<DerType1>,AutoDiffScalar<DerType2>, BinOp>
+// {
+// enum { Defined = 1 };//internal::is_same<typename DerType1::Scalar,typename DerType2::Scalar>::value };
+// typedef AutoDiffScalar<typename DerType1::PlainObject> ReturnType;
+// };
+
+#define EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(FUNC,CODE) \
+ template<typename DerType> \
+ inline const Eigen::AutoDiffScalar< \
+ EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename Eigen::internal::remove_all<DerType>::type, typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar, product) > \
+ FUNC(const Eigen::AutoDiffScalar<DerType>& x) { \
+ using namespace Eigen; \
+ typedef typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar Scalar; \
+ EIGEN_UNUSED_VARIABLE(sizeof(Scalar)); \
+ CODE; \
+ }
+
+template<typename DerType>
+struct CleanedUpDerType {
+ typedef AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> type;
+};
+
+template<typename DerType>
+inline const AutoDiffScalar<DerType>& conj(const AutoDiffScalar<DerType>& x) { return x; }
+template<typename DerType>
+inline const AutoDiffScalar<DerType>& real(const AutoDiffScalar<DerType>& x) { return x; }
+template<typename DerType>
+inline typename DerType::Scalar imag(const AutoDiffScalar<DerType>&) { return 0.; }
+template<typename DerType, typename T>
+inline typename CleanedUpDerType<DerType>::type (min)(const AutoDiffScalar<DerType>& x, const T& y) {
+ typedef typename CleanedUpDerType<DerType>::type ADS;
+ return (x <= y ? ADS(x) : ADS(y));
+}
+template<typename DerType, typename T>
+inline typename CleanedUpDerType<DerType>::type (max)(const AutoDiffScalar<DerType>& x, const T& y) {
+ typedef typename CleanedUpDerType<DerType>::type ADS;
+ return (x >= y ? ADS(x) : ADS(y));
+}
+template<typename DerType, typename T>
+inline typename CleanedUpDerType<DerType>::type (min)(const T& x, const AutoDiffScalar<DerType>& y) {
+ typedef typename CleanedUpDerType<DerType>::type ADS;
+ return (x < y ? ADS(x) : ADS(y));
+}
+template<typename DerType, typename T>
+inline typename CleanedUpDerType<DerType>::type (max)(const T& x, const AutoDiffScalar<DerType>& y) {
+ typedef typename CleanedUpDerType<DerType>::type ADS;
+ return (x > y ? ADS(x) : ADS(y));
+}
+template<typename DerType>
+inline typename CleanedUpDerType<DerType>::type (min)(const AutoDiffScalar<DerType>& x, const AutoDiffScalar<DerType>& y) {
+ return (x.value() < y.value() ? x : y);
+}
+template<typename DerType>
+inline typename CleanedUpDerType<DerType>::type (max)(const AutoDiffScalar<DerType>& x, const AutoDiffScalar<DerType>& y) {
+ return (x.value() >= y.value() ? x : y);
+}
+
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(abs,
+ using std::abs;
+ return Eigen::MakeAutoDiffScalar(abs(x.value()), x.derivatives() * (x.value()<0 ? -1 : 1) );)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(abs2,
+ using numext::abs2;
+ return Eigen::MakeAutoDiffScalar(abs2(x.value()), x.derivatives() * (Scalar(2)*x.value()));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(sqrt,
+ using std::sqrt;
+ Scalar sqrtx = sqrt(x.value());
+ return Eigen::MakeAutoDiffScalar(sqrtx,x.derivatives() * (Scalar(0.5) / sqrtx));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(cos,
+ using std::cos;
+ using std::sin;
+ return Eigen::MakeAutoDiffScalar(cos(x.value()), x.derivatives() * (-sin(x.value())));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(sin,
+ using std::sin;
+ using std::cos;
+ return Eigen::MakeAutoDiffScalar(sin(x.value()),x.derivatives() * cos(x.value()));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(exp,
+ using std::exp;
+ Scalar expx = exp(x.value());
+ return Eigen::MakeAutoDiffScalar(expx,x.derivatives() * expx);)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(log,
+ using std::log;
+ return Eigen::MakeAutoDiffScalar(log(x.value()),x.derivatives() * (Scalar(1)/x.value()));)
+
+template<typename DerType>
+inline const Eigen::AutoDiffScalar<
+EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename internal::remove_all<DerType>::type,typename internal::traits<typename internal::remove_all<DerType>::type>::Scalar,product) >
+pow(const Eigen::AutoDiffScalar<DerType> &x, const typename internal::traits<typename internal::remove_all<DerType>::type>::Scalar &y)
+{
+ using namespace Eigen;
+ using std::pow;
+ return Eigen::MakeAutoDiffScalar(pow(x.value(),y), x.derivatives() * (y * pow(x.value(),y-1)));
+}
+
+
+template<typename DerTypeA,typename DerTypeB>
+inline const AutoDiffScalar<Matrix<typename internal::traits<typename internal::remove_all<DerTypeA>::type>::Scalar,Dynamic,1> >
+atan2(const AutoDiffScalar<DerTypeA>& a, const AutoDiffScalar<DerTypeB>& b)
+{
+ using std::atan2;
+ typedef typename internal::traits<typename internal::remove_all<DerTypeA>::type>::Scalar Scalar;
+ typedef AutoDiffScalar<Matrix<Scalar,Dynamic,1> > PlainADS;
+ PlainADS ret;
+ ret.value() = atan2(a.value(), b.value());
+
+ Scalar squared_hypot = a.value() * a.value() + b.value() * b.value();
+
+ // if (squared_hypot==0) the derivation is undefined and the following results in a NaN:
+ ret.derivatives() = (a.derivatives() * b.value() - a.value() * b.derivatives()) / squared_hypot;
+
+ return ret;
+}
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(tan,
+ using std::tan;
+ using std::cos;
+ return Eigen::MakeAutoDiffScalar(tan(x.value()),x.derivatives() * (Scalar(1)/numext::abs2(cos(x.value()))));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(asin,
+ using std::sqrt;
+ using std::asin;
+ return Eigen::MakeAutoDiffScalar(asin(x.value()),x.derivatives() * (Scalar(1)/sqrt(1-numext::abs2(x.value()))));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(acos,
+ using std::sqrt;
+ using std::acos;
+ return Eigen::MakeAutoDiffScalar(acos(x.value()),x.derivatives() * (Scalar(-1)/sqrt(1-numext::abs2(x.value()))));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(tanh,
+ using std::cosh;
+ using std::tanh;
+ return Eigen::MakeAutoDiffScalar(tanh(x.value()),x.derivatives() * (Scalar(1)/numext::abs2(cosh(x.value()))));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(sinh,
+ using std::sinh;
+ using std::cosh;
+ return Eigen::MakeAutoDiffScalar(sinh(x.value()),x.derivatives() * cosh(x.value()));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(cosh,
+ using std::sinh;
+ using std::cosh;
+ return Eigen::MakeAutoDiffScalar(cosh(x.value()),x.derivatives() * sinh(x.value()));)
+
+#undef EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY
+
+template<typename DerType> struct NumTraits<AutoDiffScalar<DerType> >
+ : NumTraits< typename NumTraits<typename internal::remove_all<DerType>::type::Scalar>::Real >
+{
+ typedef typename internal::remove_all<DerType>::type DerTypeCleaned;
+ typedef AutoDiffScalar<Matrix<typename NumTraits<typename DerTypeCleaned::Scalar>::Real,DerTypeCleaned::RowsAtCompileTime,DerTypeCleaned::ColsAtCompileTime,
+ 0, DerTypeCleaned::MaxRowsAtCompileTime, DerTypeCleaned::MaxColsAtCompileTime> > Real;
+ typedef AutoDiffScalar<DerType> NonInteger;
+ typedef AutoDiffScalar<DerType> Nested;
+ typedef typename NumTraits<typename DerTypeCleaned::Scalar>::Literal Literal;
+ enum{
+ RequireInitialization = 1
+ };
+};
+
+}
+
+namespace std {
+
+template <typename T>
+class numeric_limits<Eigen::AutoDiffScalar<T> >
+ : public numeric_limits<typename T::Scalar> {};
+
+template <typename T>
+class numeric_limits<Eigen::AutoDiffScalar<T&> >
+ : public numeric_limits<typename T::Scalar> {};
+
+} // namespace std
+
+#endif // EIGEN_AUTODIFF_SCALAR_H
diff --git a/src/EigenUnsupported/src/AutoDiff/AutoDiffVector.h b/src/EigenUnsupported/src/AutoDiff/AutoDiffVector.h
new file mode 100644
index 0000000..8c2d048
--- /dev/null
+++ b/src/EigenUnsupported/src/AutoDiff/AutoDiffVector.h
@@ -0,0 +1,220 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_AUTODIFF_VECTOR_H
+#define EIGEN_AUTODIFF_VECTOR_H
+
+namespace Eigen {
+
+/* \class AutoDiffScalar
+ * \brief A scalar type replacement with automatic differentation capability
+ *
+ * \param DerType the vector type used to store/represent the derivatives (e.g. Vector3f)
+ *
+ * This class represents a scalar value while tracking its respective derivatives.
+ *
+ * It supports the following list of global math function:
+ * - std::abs, std::sqrt, std::pow, std::exp, std::log, std::sin, std::cos,
+ * - internal::abs, internal::sqrt, numext::pow, internal::exp, internal::log, internal::sin, internal::cos,
+ * - internal::conj, internal::real, internal::imag, numext::abs2.
+ *
+ * AutoDiffScalar can be used as the scalar type of an Eigen::Matrix object. However,
+ * in that case, the expression template mechanism only occurs at the top Matrix level,
+ * while derivatives are computed right away.
+ *
+ */
+template<typename ValueType, typename JacobianType>
+class AutoDiffVector
+{
+ public:
+ //typedef typename internal::traits<ValueType>::Scalar Scalar;
+ typedef typename internal::traits<ValueType>::Scalar BaseScalar;
+ typedef AutoDiffScalar<Matrix<BaseScalar,JacobianType::RowsAtCompileTime,1> > ActiveScalar;
+ typedef ActiveScalar Scalar;
+ typedef AutoDiffScalar<typename JacobianType::ColXpr> CoeffType;
+ typedef typename JacobianType::Index Index;
+
+ inline AutoDiffVector() {}
+
+ inline AutoDiffVector(const ValueType& values)
+ : m_values(values)
+ {
+ m_jacobian.setZero();
+ }
+
+
+ CoeffType operator[] (Index i) { return CoeffType(m_values[i], m_jacobian.col(i)); }
+ const CoeffType operator[] (Index i) const { return CoeffType(m_values[i], m_jacobian.col(i)); }
+
+ CoeffType operator() (Index i) { return CoeffType(m_values[i], m_jacobian.col(i)); }
+ const CoeffType operator() (Index i) const { return CoeffType(m_values[i], m_jacobian.col(i)); }
+
+ CoeffType coeffRef(Index i) { return CoeffType(m_values[i], m_jacobian.col(i)); }
+ const CoeffType coeffRef(Index i) const { return CoeffType(m_values[i], m_jacobian.col(i)); }
+
+ Index size() const { return m_values.size(); }
+
+ // FIXME here we could return an expression of the sum
+ Scalar sum() const { /*std::cerr << "sum \n\n";*/ /*std::cerr << m_jacobian.rowwise().sum() << "\n\n";*/ return Scalar(m_values.sum(), m_jacobian.rowwise().sum()); }
+
+
+ inline AutoDiffVector(const ValueType& values, const JacobianType& jac)
+ : m_values(values), m_jacobian(jac)
+ {}
+
+ template<typename OtherValueType, typename OtherJacobianType>
+ inline AutoDiffVector(const AutoDiffVector<OtherValueType, OtherJacobianType>& other)
+ : m_values(other.values()), m_jacobian(other.jacobian())
+ {}
+
+ inline AutoDiffVector(const AutoDiffVector& other)
+ : m_values(other.values()), m_jacobian(other.jacobian())
+ {}
+
+ template<typename OtherValueType, typename OtherJacobianType>
+ inline AutoDiffVector& operator=(const AutoDiffVector<OtherValueType, OtherJacobianType>& other)
+ {
+ m_values = other.values();
+ m_jacobian = other.jacobian();
+ return *this;
+ }
+
+ inline AutoDiffVector& operator=(const AutoDiffVector& other)
+ {
+ m_values = other.values();
+ m_jacobian = other.jacobian();
+ return *this;
+ }
+
+ inline const ValueType& values() const { return m_values; }
+ inline ValueType& values() { return m_values; }
+
+ inline const JacobianType& jacobian() const { return m_jacobian; }
+ inline JacobianType& jacobian() { return m_jacobian; }
+
+ template<typename OtherValueType,typename OtherJacobianType>
+ inline const AutoDiffVector<
+ typename MakeCwiseBinaryOp<internal::scalar_sum_op<BaseScalar>,ValueType,OtherValueType>::Type,
+ typename MakeCwiseBinaryOp<internal::scalar_sum_op<BaseScalar>,JacobianType,OtherJacobianType>::Type >
+ operator+(const AutoDiffVector<OtherValueType,OtherJacobianType>& other) const
+ {
+ return AutoDiffVector<
+ typename MakeCwiseBinaryOp<internal::scalar_sum_op<BaseScalar>,ValueType,OtherValueType>::Type,
+ typename MakeCwiseBinaryOp<internal::scalar_sum_op<BaseScalar>,JacobianType,OtherJacobianType>::Type >(
+ m_values + other.values(),
+ m_jacobian + other.jacobian());
+ }
+
+ template<typename OtherValueType, typename OtherJacobianType>
+ inline AutoDiffVector&
+ operator+=(const AutoDiffVector<OtherValueType,OtherJacobianType>& other)
+ {
+ m_values += other.values();
+ m_jacobian += other.jacobian();
+ return *this;
+ }
+
+ template<typename OtherValueType,typename OtherJacobianType>
+ inline const AutoDiffVector<
+ typename MakeCwiseBinaryOp<internal::scalar_difference_op<Scalar>,ValueType,OtherValueType>::Type,
+ typename MakeCwiseBinaryOp<internal::scalar_difference_op<Scalar>,JacobianType,OtherJacobianType>::Type >
+ operator-(const AutoDiffVector<OtherValueType,OtherJacobianType>& other) const
+ {
+ return AutoDiffVector<
+ typename MakeCwiseBinaryOp<internal::scalar_difference_op<Scalar>,ValueType,OtherValueType>::Type,
+ typename MakeCwiseBinaryOp<internal::scalar_difference_op<Scalar>,JacobianType,OtherJacobianType>::Type >(
+ m_values - other.values(),
+ m_jacobian - other.jacobian());
+ }
+
+ template<typename OtherValueType, typename OtherJacobianType>
+ inline AutoDiffVector&
+ operator-=(const AutoDiffVector<OtherValueType,OtherJacobianType>& other)
+ {
+ m_values -= other.values();
+ m_jacobian -= other.jacobian();
+ return *this;
+ }
+
+ inline const AutoDiffVector<
+ typename MakeCwiseUnaryOp<internal::scalar_opposite_op<Scalar>, ValueType>::Type,
+ typename MakeCwiseUnaryOp<internal::scalar_opposite_op<Scalar>, JacobianType>::Type >
+ operator-() const
+ {
+ return AutoDiffVector<
+ typename MakeCwiseUnaryOp<internal::scalar_opposite_op<Scalar>, ValueType>::Type,
+ typename MakeCwiseUnaryOp<internal::scalar_opposite_op<Scalar>, JacobianType>::Type >(
+ -m_values,
+ -m_jacobian);
+ }
+
+ inline const AutoDiffVector<
+ typename MakeCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, ValueType>::Type,
+ typename MakeCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, JacobianType>::Type>
+ operator*(const BaseScalar& other) const
+ {
+ return AutoDiffVector<
+ typename MakeCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, ValueType>::Type,
+ typename MakeCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, JacobianType>::Type >(
+ m_values * other,
+ m_jacobian * other);
+ }
+
+ friend inline const AutoDiffVector<
+ typename MakeCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, ValueType>::Type,
+ typename MakeCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, JacobianType>::Type >
+ operator*(const Scalar& other, const AutoDiffVector& v)
+ {
+ return AutoDiffVector<
+ typename MakeCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, ValueType>::Type,
+ typename MakeCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, JacobianType>::Type >(
+ v.values() * other,
+ v.jacobian() * other);
+ }
+
+// template<typename OtherValueType,typename OtherJacobianType>
+// inline const AutoDiffVector<
+// CwiseBinaryOp<internal::scalar_multiple_op<Scalar>, ValueType, OtherValueType>
+// CwiseBinaryOp<internal::scalar_sum_op<Scalar>,
+// CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, JacobianType>,
+// CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, OtherJacobianType> > >
+// operator*(const AutoDiffVector<OtherValueType,OtherJacobianType>& other) const
+// {
+// return AutoDiffVector<
+// CwiseBinaryOp<internal::scalar_multiple_op<Scalar>, ValueType, OtherValueType>
+// CwiseBinaryOp<internal::scalar_sum_op<Scalar>,
+// CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, JacobianType>,
+// CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, OtherJacobianType> > >(
+// m_values.cwise() * other.values(),
+// (m_jacobian * other.values()) + (m_values * other.jacobian()));
+// }
+
+ inline AutoDiffVector& operator*=(const Scalar& other)
+ {
+ m_values *= other;
+ m_jacobian *= other;
+ return *this;
+ }
+
+ template<typename OtherValueType,typename OtherJacobianType>
+ inline AutoDiffVector& operator*=(const AutoDiffVector<OtherValueType,OtherJacobianType>& other)
+ {
+ *this = *this * other;
+ return *this;
+ }
+
+ protected:
+ ValueType m_values;
+ JacobianType m_jacobian;
+
+};
+
+}
+
+#endif // EIGEN_AUTODIFF_VECTOR_H
diff --git a/src/EigenUnsupported/src/BVH/BVAlgorithms.h b/src/EigenUnsupported/src/BVH/BVAlgorithms.h
new file mode 100644
index 0000000..994c8af
--- /dev/null
+++ b/src/EigenUnsupported/src/BVH/BVAlgorithms.h
@@ -0,0 +1,293 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Ilya Baran <ibaran@mit.edu>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BVALGORITHMS_H
+#define EIGEN_BVALGORITHMS_H
+
+namespace Eigen {
+
+namespace internal {
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template<typename BVH, typename Intersector>
+bool intersect_helper(const BVH &tree, Intersector &intersector, typename BVH::Index root)
+{
+ typedef typename BVH::Index Index;
+ typedef typename BVH::VolumeIterator VolIter;
+ typedef typename BVH::ObjectIterator ObjIter;
+
+ VolIter vBegin = VolIter(), vEnd = VolIter();
+ ObjIter oBegin = ObjIter(), oEnd = ObjIter();
+
+ std::vector<Index> todo(1, root);
+
+ while(!todo.empty()) {
+ tree.getChildren(todo.back(), vBegin, vEnd, oBegin, oEnd);
+ todo.pop_back();
+
+ for(; vBegin != vEnd; ++vBegin) //go through child volumes
+ if(intersector.intersectVolume(tree.getVolume(*vBegin)))
+ todo.push_back(*vBegin);
+
+ for(; oBegin != oEnd; ++oBegin) //go through child objects
+ if(intersector.intersectObject(*oBegin))
+ return true; //intersector said to stop query
+ }
+ return false;
+}
+#endif //not EIGEN_PARSED_BY_DOXYGEN
+
+template<typename Volume1, typename Object1, typename Object2, typename Intersector>
+struct intersector_helper1
+{
+ intersector_helper1(const Object2 &inStored, Intersector &in) : stored(inStored), intersector(in) {}
+ bool intersectVolume(const Volume1 &vol) { return intersector.intersectVolumeObject(vol, stored); }
+ bool intersectObject(const Object1 &obj) { return intersector.intersectObjectObject(obj, stored); }
+ Object2 stored;
+ Intersector &intersector;
+private:
+ intersector_helper1& operator=(const intersector_helper1&);
+};
+
+template<typename Volume2, typename Object2, typename Object1, typename Intersector>
+struct intersector_helper2
+{
+ intersector_helper2(const Object1 &inStored, Intersector &in) : stored(inStored), intersector(in) {}
+ bool intersectVolume(const Volume2 &vol) { return intersector.intersectObjectVolume(stored, vol); }
+ bool intersectObject(const Object2 &obj) { return intersector.intersectObjectObject(stored, obj); }
+ Object1 stored;
+ Intersector &intersector;
+private:
+ intersector_helper2& operator=(const intersector_helper2&);
+};
+
+} // end namespace internal
+
+/** Given a BVH, runs the query encapsulated by \a intersector.
+ * The Intersector type must provide the following members: \code
+ bool intersectVolume(const BVH::Volume &volume) //returns true if volume intersects the query
+ bool intersectObject(const BVH::Object &object) //returns true if the search should terminate immediately
+ \endcode
+ */
+template<typename BVH, typename Intersector>
+void BVIntersect(const BVH &tree, Intersector &intersector)
+{
+ internal::intersect_helper(tree, intersector, tree.getRootIndex());
+}
+
+/** Given two BVH's, runs the query on their Cartesian product encapsulated by \a intersector.
+ * The Intersector type must provide the following members: \code
+ bool intersectVolumeVolume(const BVH1::Volume &v1, const BVH2::Volume &v2) //returns true if product of volumes intersects the query
+ bool intersectVolumeObject(const BVH1::Volume &v1, const BVH2::Object &o2) //returns true if the volume-object product intersects the query
+ bool intersectObjectVolume(const BVH1::Object &o1, const BVH2::Volume &v2) //returns true if the volume-object product intersects the query
+ bool intersectObjectObject(const BVH1::Object &o1, const BVH2::Object &o2) //returns true if the search should terminate immediately
+ \endcode
+ */
+template<typename BVH1, typename BVH2, typename Intersector>
+void BVIntersect(const BVH1 &tree1, const BVH2 &tree2, Intersector &intersector) //TODO: tandem descent when it makes sense
+{
+ typedef typename BVH1::Index Index1;
+ typedef typename BVH2::Index Index2;
+ typedef internal::intersector_helper1<typename BVH1::Volume, typename BVH1::Object, typename BVH2::Object, Intersector> Helper1;
+ typedef internal::intersector_helper2<typename BVH2::Volume, typename BVH2::Object, typename BVH1::Object, Intersector> Helper2;
+ typedef typename BVH1::VolumeIterator VolIter1;
+ typedef typename BVH1::ObjectIterator ObjIter1;
+ typedef typename BVH2::VolumeIterator VolIter2;
+ typedef typename BVH2::ObjectIterator ObjIter2;
+
+ VolIter1 vBegin1 = VolIter1(), vEnd1 = VolIter1();
+ ObjIter1 oBegin1 = ObjIter1(), oEnd1 = ObjIter1();
+ VolIter2 vBegin2 = VolIter2(), vEnd2 = VolIter2(), vCur2 = VolIter2();
+ ObjIter2 oBegin2 = ObjIter2(), oEnd2 = ObjIter2(), oCur2 = ObjIter2();
+
+ std::vector<std::pair<Index1, Index2> > todo(1, std::make_pair(tree1.getRootIndex(), tree2.getRootIndex()));
+
+ while(!todo.empty()) {
+ tree1.getChildren(todo.back().first, vBegin1, vEnd1, oBegin1, oEnd1);
+ tree2.getChildren(todo.back().second, vBegin2, vEnd2, oBegin2, oEnd2);
+ todo.pop_back();
+
+ for(; vBegin1 != vEnd1; ++vBegin1) { //go through child volumes of first tree
+ const typename BVH1::Volume &vol1 = tree1.getVolume(*vBegin1);
+ for(vCur2 = vBegin2; vCur2 != vEnd2; ++vCur2) { //go through child volumes of second tree
+ if(intersector.intersectVolumeVolume(vol1, tree2.getVolume(*vCur2)))
+ todo.push_back(std::make_pair(*vBegin1, *vCur2));
+ }
+
+ for(oCur2 = oBegin2; oCur2 != oEnd2; ++oCur2) {//go through child objects of second tree
+ Helper1 helper(*oCur2, intersector);
+ if(internal::intersect_helper(tree1, helper, *vBegin1))
+ return; //intersector said to stop query
+ }
+ }
+
+ for(; oBegin1 != oEnd1; ++oBegin1) { //go through child objects of first tree
+ for(vCur2 = vBegin2; vCur2 != vEnd2; ++vCur2) { //go through child volumes of second tree
+ Helper2 helper(*oBegin1, intersector);
+ if(internal::intersect_helper(tree2, helper, *vCur2))
+ return; //intersector said to stop query
+ }
+
+ for(oCur2 = oBegin2; oCur2 != oEnd2; ++oCur2) {//go through child objects of second tree
+ if(intersector.intersectObjectObject(*oBegin1, *oCur2))
+ return; //intersector said to stop query
+ }
+ }
+ }
+}
+
+namespace internal {
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template<typename BVH, typename Minimizer>
+typename Minimizer::Scalar minimize_helper(const BVH &tree, Minimizer &minimizer, typename BVH::Index root, typename Minimizer::Scalar minimum)
+{
+ typedef typename Minimizer::Scalar Scalar;
+ typedef typename BVH::Index Index;
+ typedef std::pair<Scalar, Index> QueueElement; //first element is priority
+ typedef typename BVH::VolumeIterator VolIter;
+ typedef typename BVH::ObjectIterator ObjIter;
+
+ VolIter vBegin = VolIter(), vEnd = VolIter();
+ ObjIter oBegin = ObjIter(), oEnd = ObjIter();
+ std::priority_queue<QueueElement, std::vector<QueueElement>, std::greater<QueueElement> > todo; //smallest is at the top
+
+ todo.push(std::make_pair(Scalar(), root));
+
+ while(!todo.empty()) {
+ tree.getChildren(todo.top().second, vBegin, vEnd, oBegin, oEnd);
+ todo.pop();
+
+ for(; oBegin != oEnd; ++oBegin) //go through child objects
+ minimum = (std::min)(minimum, minimizer.minimumOnObject(*oBegin));
+
+ for(; vBegin != vEnd; ++vBegin) { //go through child volumes
+ Scalar val = minimizer.minimumOnVolume(tree.getVolume(*vBegin));
+ if(val < minimum)
+ todo.push(std::make_pair(val, *vBegin));
+ }
+ }
+
+ return minimum;
+}
+#endif //not EIGEN_PARSED_BY_DOXYGEN
+
+
+template<typename Volume1, typename Object1, typename Object2, typename Minimizer>
+struct minimizer_helper1
+{
+ typedef typename Minimizer::Scalar Scalar;
+ minimizer_helper1(const Object2 &inStored, Minimizer &m) : stored(inStored), minimizer(m) {}
+ Scalar minimumOnVolume(const Volume1 &vol) { return minimizer.minimumOnVolumeObject(vol, stored); }
+ Scalar minimumOnObject(const Object1 &obj) { return minimizer.minimumOnObjectObject(obj, stored); }
+ Object2 stored;
+ Minimizer &minimizer;
+private:
+ minimizer_helper1& operator=(const minimizer_helper1&);
+};
+
+template<typename Volume2, typename Object2, typename Object1, typename Minimizer>
+struct minimizer_helper2
+{
+ typedef typename Minimizer::Scalar Scalar;
+ minimizer_helper2(const Object1 &inStored, Minimizer &m) : stored(inStored), minimizer(m) {}
+ Scalar minimumOnVolume(const Volume2 &vol) { return minimizer.minimumOnObjectVolume(stored, vol); }
+ Scalar minimumOnObject(const Object2 &obj) { return minimizer.minimumOnObjectObject(stored, obj); }
+ Object1 stored;
+ Minimizer &minimizer;
+private:
+ minimizer_helper2& operator=(const minimizer_helper2&);
+};
+
+} // end namespace internal
+
+/** Given a BVH, runs the query encapsulated by \a minimizer.
+ * \returns the minimum value.
+ * The Minimizer type must provide the following members: \code
+ typedef Scalar //the numeric type of what is being minimized--not necessarily the Scalar type of the BVH (if it has one)
+ Scalar minimumOnVolume(const BVH::Volume &volume)
+ Scalar minimumOnObject(const BVH::Object &object)
+ \endcode
+ */
+template<typename BVH, typename Minimizer>
+typename Minimizer::Scalar BVMinimize(const BVH &tree, Minimizer &minimizer)
+{
+ return internal::minimize_helper(tree, minimizer, tree.getRootIndex(), (std::numeric_limits<typename Minimizer::Scalar>::max)());
+}
+
+/** Given two BVH's, runs the query on their cartesian product encapsulated by \a minimizer.
+ * \returns the minimum value.
+ * The Minimizer type must provide the following members: \code
+ typedef Scalar //the numeric type of what is being minimized--not necessarily the Scalar type of the BVH (if it has one)
+ Scalar minimumOnVolumeVolume(const BVH1::Volume &v1, const BVH2::Volume &v2)
+ Scalar minimumOnVolumeObject(const BVH1::Volume &v1, const BVH2::Object &o2)
+ Scalar minimumOnObjectVolume(const BVH1::Object &o1, const BVH2::Volume &v2)
+ Scalar minimumOnObjectObject(const BVH1::Object &o1, const BVH2::Object &o2)
+ \endcode
+ */
+template<typename BVH1, typename BVH2, typename Minimizer>
+typename Minimizer::Scalar BVMinimize(const BVH1 &tree1, const BVH2 &tree2, Minimizer &minimizer)
+{
+ typedef typename Minimizer::Scalar Scalar;
+ typedef typename BVH1::Index Index1;
+ typedef typename BVH2::Index Index2;
+ typedef internal::minimizer_helper1<typename BVH1::Volume, typename BVH1::Object, typename BVH2::Object, Minimizer> Helper1;
+ typedef internal::minimizer_helper2<typename BVH2::Volume, typename BVH2::Object, typename BVH1::Object, Minimizer> Helper2;
+ typedef std::pair<Scalar, std::pair<Index1, Index2> > QueueElement; //first element is priority
+ typedef typename BVH1::VolumeIterator VolIter1;
+ typedef typename BVH1::ObjectIterator ObjIter1;
+ typedef typename BVH2::VolumeIterator VolIter2;
+ typedef typename BVH2::ObjectIterator ObjIter2;
+
+ VolIter1 vBegin1 = VolIter1(), vEnd1 = VolIter1();
+ ObjIter1 oBegin1 = ObjIter1(), oEnd1 = ObjIter1();
+ VolIter2 vBegin2 = VolIter2(), vEnd2 = VolIter2(), vCur2 = VolIter2();
+ ObjIter2 oBegin2 = ObjIter2(), oEnd2 = ObjIter2(), oCur2 = ObjIter2();
+ std::priority_queue<QueueElement, std::vector<QueueElement>, std::greater<QueueElement> > todo; //smallest is at the top
+
+ Scalar minimum = (std::numeric_limits<Scalar>::max)();
+ todo.push(std::make_pair(Scalar(), std::make_pair(tree1.getRootIndex(), tree2.getRootIndex())));
+
+ while(!todo.empty()) {
+ tree1.getChildren(todo.top().second.first, vBegin1, vEnd1, oBegin1, oEnd1);
+ tree2.getChildren(todo.top().second.second, vBegin2, vEnd2, oBegin2, oEnd2);
+ todo.pop();
+
+ for(; oBegin1 != oEnd1; ++oBegin1) { //go through child objects of first tree
+ for(oCur2 = oBegin2; oCur2 != oEnd2; ++oCur2) {//go through child objects of second tree
+ minimum = (std::min)(minimum, minimizer.minimumOnObjectObject(*oBegin1, *oCur2));
+ }
+
+ for(vCur2 = vBegin2; vCur2 != vEnd2; ++vCur2) { //go through child volumes of second tree
+ Helper2 helper(*oBegin1, minimizer);
+ minimum = (std::min)(minimum, internal::minimize_helper(tree2, helper, *vCur2, minimum));
+ }
+ }
+
+ for(; vBegin1 != vEnd1; ++vBegin1) { //go through child volumes of first tree
+ const typename BVH1::Volume &vol1 = tree1.getVolume(*vBegin1);
+
+ for(oCur2 = oBegin2; oCur2 != oEnd2; ++oCur2) {//go through child objects of second tree
+ Helper1 helper(*oCur2, minimizer);
+ minimum = (std::min)(minimum, internal::minimize_helper(tree1, helper, *vBegin1, minimum));
+ }
+
+ for(vCur2 = vBegin2; vCur2 != vEnd2; ++vCur2) { //go through child volumes of second tree
+ Scalar val = minimizer.minimumOnVolumeVolume(vol1, tree2.getVolume(*vCur2));
+ if(val < minimum)
+ todo.push(std::make_pair(val, std::make_pair(*vBegin1, *vCur2)));
+ }
+ }
+ }
+ return minimum;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_BVALGORITHMS_H
diff --git a/src/EigenUnsupported/src/BVH/KdBVH.h b/src/EigenUnsupported/src/BVH/KdBVH.h
new file mode 100644
index 0000000..2d5b76a
--- /dev/null
+++ b/src/EigenUnsupported/src/BVH/KdBVH.h
@@ -0,0 +1,223 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Ilya Baran <ibaran@mit.edu>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef KDBVH_H_INCLUDED
+#define KDBVH_H_INCLUDED
+
+namespace Eigen {
+
+namespace internal {
+
+//internal pair class for the BVH--used instead of std::pair because of alignment
+template<typename Scalar, int Dim>
+struct vector_int_pair
+{
+EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar, Dim)
+ typedef Matrix<Scalar, Dim, 1> VectorType;
+
+ vector_int_pair(const VectorType &v, int i) : first(v), second(i) {}
+
+ VectorType first;
+ int second;
+};
+
+//these templates help the tree initializer get the bounding boxes either from a provided
+//iterator range or using bounding_box in a unified way
+template<typename ObjectList, typename VolumeList, typename BoxIter>
+struct get_boxes_helper {
+ void operator()(const ObjectList &objects, BoxIter boxBegin, BoxIter boxEnd, VolumeList &outBoxes)
+ {
+ outBoxes.insert(outBoxes.end(), boxBegin, boxEnd);
+ eigen_assert(outBoxes.size() == objects.size());
+ EIGEN_ONLY_USED_FOR_DEBUG(objects);
+ }
+};
+
+template<typename ObjectList, typename VolumeList>
+struct get_boxes_helper<ObjectList, VolumeList, int> {
+ void operator()(const ObjectList &objects, int, int, VolumeList &outBoxes)
+ {
+ outBoxes.reserve(objects.size());
+ for(int i = 0; i < (int)objects.size(); ++i)
+ outBoxes.push_back(bounding_box(objects[i]));
+ }
+};
+
+} // end namespace internal
+
+
+/** \class KdBVH
+ * \brief A simple bounding volume hierarchy based on AlignedBox
+ *
+ * \param _Scalar The underlying scalar type of the bounding boxes
+ * \param _Dim The dimension of the space in which the hierarchy lives
+ * \param _Object The object type that lives in the hierarchy. It must have value semantics. Either bounding_box(_Object) must
+ * be defined and return an AlignedBox<_Scalar, _Dim> or bounding boxes must be provided to the tree initializer.
+ *
+ * This class provides a simple (as opposed to optimized) implementation of a bounding volume hierarchy analogous to a Kd-tree.
+ * Given a sequence of objects, it computes their bounding boxes, constructs a Kd-tree of their centers
+ * and builds a BVH with the structure of that Kd-tree. When the elements of the tree are too expensive to be copied around,
+ * it is useful for _Object to be a pointer.
+ */
+template<typename _Scalar, int _Dim, typename _Object> class KdBVH
+{
+public:
+ enum { Dim = _Dim };
+ typedef _Object Object;
+ typedef std::vector<Object, aligned_allocator<Object> > ObjectList;
+ typedef _Scalar Scalar;
+ typedef AlignedBox<Scalar, Dim> Volume;
+ typedef std::vector<Volume, aligned_allocator<Volume> > VolumeList;
+ typedef int Index;
+ typedef const int *VolumeIterator; //the iterators are just pointers into the tree's vectors
+ typedef const Object *ObjectIterator;
+
+ KdBVH() {}
+
+ /** Given an iterator range over \a Object references, constructs the BVH. Requires that bounding_box(Object) return a Volume. */
+ template<typename Iter> KdBVH(Iter begin, Iter end) { init(begin, end, 0, 0); } //int is recognized by init as not being an iterator type
+
+ /** Given an iterator range over \a Object references and an iterator range over their bounding boxes, constructs the BVH */
+ template<typename OIter, typename BIter> KdBVH(OIter begin, OIter end, BIter boxBegin, BIter boxEnd) { init(begin, end, boxBegin, boxEnd); }
+
+ /** Given an iterator range over \a Object references, constructs the BVH, overwriting whatever is in there currently.
+ * Requires that bounding_box(Object) return a Volume. */
+ template<typename Iter> void init(Iter begin, Iter end) { init(begin, end, 0, 0); }
+
+ /** Given an iterator range over \a Object references and an iterator range over their bounding boxes,
+ * constructs the BVH, overwriting whatever is in there currently. */
+ template<typename OIter, typename BIter> void init(OIter begin, OIter end, BIter boxBegin, BIter boxEnd)
+ {
+ objects.clear();
+ boxes.clear();
+ children.clear();
+
+ objects.insert(objects.end(), begin, end);
+ int n = static_cast<int>(objects.size());
+
+ if(n < 2)
+ return; //if we have at most one object, we don't need any internal nodes
+
+ VolumeList objBoxes;
+ VIPairList objCenters;
+
+ //compute the bounding boxes depending on BIter type
+ internal::get_boxes_helper<ObjectList, VolumeList, BIter>()(objects, boxBegin, boxEnd, objBoxes);
+
+ objCenters.reserve(n);
+ boxes.reserve(n - 1);
+ children.reserve(2 * n - 2);
+
+ for(int i = 0; i < n; ++i)
+ objCenters.push_back(VIPair(objBoxes[i].center(), i));
+
+ build(objCenters, 0, n, objBoxes, 0); //the recursive part of the algorithm
+
+ ObjectList tmp(n);
+ tmp.swap(objects);
+ for(int i = 0; i < n; ++i)
+ objects[i] = tmp[objCenters[i].second];
+ }
+
+ /** \returns the index of the root of the hierarchy */
+ inline Index getRootIndex() const { return (int)boxes.size() - 1; }
+
+ /** Given an \a index of a node, on exit, \a outVBegin and \a outVEnd range over the indices of the volume children of the node
+ * and \a outOBegin and \a outOEnd range over the object children of the node */
+ EIGEN_STRONG_INLINE void getChildren(Index index, VolumeIterator &outVBegin, VolumeIterator &outVEnd,
+ ObjectIterator &outOBegin, ObjectIterator &outOEnd) const
+ { //inlining this function should open lots of optimization opportunities to the compiler
+ if(index < 0) {
+ outVBegin = outVEnd;
+ if(!objects.empty())
+ outOBegin = &(objects[0]);
+ outOEnd = outOBegin + objects.size(); //output all objects--necessary when the tree has only one object
+ return;
+ }
+
+ int numBoxes = static_cast<int>(boxes.size());
+
+ int idx = index * 2;
+ if(children[idx + 1] < numBoxes) { //second index is always bigger
+ outVBegin = &(children[idx]);
+ outVEnd = outVBegin + 2;
+ outOBegin = outOEnd;
+ }
+ else if(children[idx] >= numBoxes) { //if both children are objects
+ outVBegin = outVEnd;
+ outOBegin = &(objects[children[idx] - numBoxes]);
+ outOEnd = outOBegin + 2;
+ } else { //if the first child is a volume and the second is an object
+ outVBegin = &(children[idx]);
+ outVEnd = outVBegin + 1;
+ outOBegin = &(objects[children[idx + 1] - numBoxes]);
+ outOEnd = outOBegin + 1;
+ }
+ }
+
+ /** \returns the bounding box of the node at \a index */
+ inline const Volume &getVolume(Index index) const
+ {
+ return boxes[index];
+ }
+
+private:
+ typedef internal::vector_int_pair<Scalar, Dim> VIPair;
+ typedef std::vector<VIPair, aligned_allocator<VIPair> > VIPairList;
+ typedef Matrix<Scalar, Dim, 1> VectorType;
+ struct VectorComparator //compares vectors, or more specifically, VIPairs along a particular dimension
+ {
+ VectorComparator(int inDim) : dim(inDim) {}
+ inline bool operator()(const VIPair &v1, const VIPair &v2) const { return v1.first[dim] < v2.first[dim]; }
+ int dim;
+ };
+
+ //Build the part of the tree between objects[from] and objects[to] (not including objects[to]).
+ //This routine partitions the objCenters in [from, to) along the dimension dim, recursively constructs
+ //the two halves, and adds their parent node. TODO: a cache-friendlier layout
+ void build(VIPairList &objCenters, int from, int to, const VolumeList &objBoxes, int dim)
+ {
+ eigen_assert(to - from > 1);
+ if(to - from == 2) {
+ boxes.push_back(objBoxes[objCenters[from].second].merged(objBoxes[objCenters[from + 1].second]));
+ children.push_back(from + (int)objects.size() - 1); //there are objects.size() - 1 tree nodes
+ children.push_back(from + (int)objects.size());
+ }
+ else if(to - from == 3) {
+ int mid = from + 2;
+ std::nth_element(objCenters.begin() + from, objCenters.begin() + mid,
+ objCenters.begin() + to, VectorComparator(dim)); //partition
+ build(objCenters, from, mid, objBoxes, (dim + 1) % Dim);
+ int idx1 = (int)boxes.size() - 1;
+ boxes.push_back(boxes[idx1].merged(objBoxes[objCenters[mid].second]));
+ children.push_back(idx1);
+ children.push_back(mid + (int)objects.size() - 1);
+ }
+ else {
+ int mid = from + (to - from) / 2;
+ nth_element(objCenters.begin() + from, objCenters.begin() + mid,
+ objCenters.begin() + to, VectorComparator(dim)); //partition
+ build(objCenters, from, mid, objBoxes, (dim + 1) % Dim);
+ int idx1 = (int)boxes.size() - 1;
+ build(objCenters, mid, to, objBoxes, (dim + 1) % Dim);
+ int idx2 = (int)boxes.size() - 1;
+ boxes.push_back(boxes[idx1].merged(boxes[idx2]));
+ children.push_back(idx1);
+ children.push_back(idx2);
+ }
+ }
+
+ std::vector<int> children; //children of x are children[2x] and children[2x+1], indices bigger than boxes.size() index into objects.
+ VolumeList boxes;
+ ObjectList objects;
+};
+
+} // end namespace Eigen
+
+#endif //KDBVH_H_INCLUDED
diff --git a/src/EigenUnsupported/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h b/src/EigenUnsupported/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h
new file mode 100644
index 0000000..0fbd847
--- /dev/null
+++ b/src/EigenUnsupported/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h
@@ -0,0 +1,790 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 David Harmon <dharmon@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ARPACKGENERALIZEDSELFADJOINTEIGENSOLVER_H
+#define EIGEN_ARPACKGENERALIZEDSELFADJOINTEIGENSOLVER_H
+
+#include "../../../../Eigen/Dense"
+
+namespace Eigen {
+
+namespace internal {
+ template<typename Scalar, typename RealScalar> struct arpack_wrapper;
+ template<typename MatrixSolver, typename MatrixType, typename Scalar, bool BisSPD> struct OP;
+}
+
+
+
+template<typename MatrixType, typename MatrixSolver=SimplicialLLT<MatrixType>, bool BisSPD=false>
+class ArpackGeneralizedSelfAdjointEigenSolver
+{
+public:
+ //typedef typename MatrixSolver::MatrixType MatrixType;
+
+ /** \brief Scalar type for matrices of type \p MatrixType. */
+ typedef typename MatrixType::Scalar Scalar;
+ typedef typename MatrixType::Index Index;
+
+ /** \brief Real scalar type for \p MatrixType.
+ *
+ * This is just \c Scalar if #Scalar is real (e.g., \c float or
+ * \c Scalar), and the type of the real part of \c Scalar if #Scalar is
+ * complex.
+ */
+ typedef typename NumTraits<Scalar>::Real RealScalar;
+
+ /** \brief Type for vector of eigenvalues as returned by eigenvalues().
+ *
+ * This is a column vector with entries of type #RealScalar.
+ * The length of the vector is the size of \p nbrEigenvalues.
+ */
+ typedef typename internal::plain_col_type<MatrixType, RealScalar>::type RealVectorType;
+
+ /** \brief Default constructor.
+ *
+ * The default constructor is for cases in which the user intends to
+ * perform decompositions via compute().
+ *
+ */
+ ArpackGeneralizedSelfAdjointEigenSolver()
+ : m_eivec(),
+ m_eivalues(),
+ m_isInitialized(false),
+ m_eigenvectorsOk(false),
+ m_nbrConverged(0),
+ m_nbrIterations(0)
+ { }
+
+ /** \brief Constructor; computes generalized eigenvalues of given matrix with respect to another matrix.
+ *
+ * \param[in] A Self-adjoint matrix whose eigenvalues / eigenvectors will
+ * computed. By default, the upper triangular part is used, but can be changed
+ * through the template parameter.
+ * \param[in] B Self-adjoint matrix for the generalized eigenvalue problem.
+ * \param[in] nbrEigenvalues The number of eigenvalues / eigenvectors to compute.
+ * Must be less than the size of the input matrix, or an error is returned.
+ * \param[in] eigs_sigma String containing either "LM", "SM", "LA", or "SA", with
+ * respective meanings to find the largest magnitude , smallest magnitude,
+ * largest algebraic, or smallest algebraic eigenvalues. Alternatively, this
+ * value can contain floating point value in string form, in which case the
+ * eigenvalues closest to this value will be found.
+ * \param[in] options Can be #ComputeEigenvectors (default) or #EigenvaluesOnly.
+ * \param[in] tol What tolerance to find the eigenvalues to. Default is 0, which
+ * means machine precision.
+ *
+ * This constructor calls compute(const MatrixType&, const MatrixType&, Index, string, int, RealScalar)
+ * to compute the eigenvalues of the matrix \p A with respect to \p B. The eigenvectors are computed if
+ * \p options equals #ComputeEigenvectors.
+ *
+ */
+ ArpackGeneralizedSelfAdjointEigenSolver(const MatrixType& A, const MatrixType& B,
+ Index nbrEigenvalues, std::string eigs_sigma="LM",
+ int options=ComputeEigenvectors, RealScalar tol=0.0)
+ : m_eivec(),
+ m_eivalues(),
+ m_isInitialized(false),
+ m_eigenvectorsOk(false),
+ m_nbrConverged(0),
+ m_nbrIterations(0)
+ {
+ compute(A, B, nbrEigenvalues, eigs_sigma, options, tol);
+ }
+
+ /** \brief Constructor; computes eigenvalues of given matrix.
+ *
+ * \param[in] A Self-adjoint matrix whose eigenvalues / eigenvectors will
+ * computed. By default, the upper triangular part is used, but can be changed
+ * through the template parameter.
+ * \param[in] nbrEigenvalues The number of eigenvalues / eigenvectors to compute.
+ * Must be less than the size of the input matrix, or an error is returned.
+ * \param[in] eigs_sigma String containing either "LM", "SM", "LA", or "SA", with
+ * respective meanings to find the largest magnitude , smallest magnitude,
+ * largest algebraic, or smallest algebraic eigenvalues. Alternatively, this
+ * value can contain floating point value in string form, in which case the
+ * eigenvalues closest to this value will be found.
+ * \param[in] options Can be #ComputeEigenvectors (default) or #EigenvaluesOnly.
+ * \param[in] tol What tolerance to find the eigenvalues to. Default is 0, which
+ * means machine precision.
+ *
+ * This constructor calls compute(const MatrixType&, Index, string, int, RealScalar)
+ * to compute the eigenvalues of the matrix \p A. The eigenvectors are computed if
+ * \p options equals #ComputeEigenvectors.
+ *
+ */
+
+ ArpackGeneralizedSelfAdjointEigenSolver(const MatrixType& A,
+ Index nbrEigenvalues, std::string eigs_sigma="LM",
+ int options=ComputeEigenvectors, RealScalar tol=0.0)
+ : m_eivec(),
+ m_eivalues(),
+ m_isInitialized(false),
+ m_eigenvectorsOk(false),
+ m_nbrConverged(0),
+ m_nbrIterations(0)
+ {
+ compute(A, nbrEigenvalues, eigs_sigma, options, tol);
+ }
+
+
+ /** \brief Computes generalized eigenvalues / eigenvectors of given matrix using the external ARPACK library.
+ *
+ * \param[in] A Selfadjoint matrix whose eigendecomposition is to be computed.
+ * \param[in] B Selfadjoint matrix for generalized eigenvalues.
+ * \param[in] nbrEigenvalues The number of eigenvalues / eigenvectors to compute.
+ * Must be less than the size of the input matrix, or an error is returned.
+ * \param[in] eigs_sigma String containing either "LM", "SM", "LA", or "SA", with
+ * respective meanings to find the largest magnitude , smallest magnitude,
+ * largest algebraic, or smallest algebraic eigenvalues. Alternatively, this
+ * value can contain floating point value in string form, in which case the
+ * eigenvalues closest to this value will be found.
+ * \param[in] options Can be #ComputeEigenvectors (default) or #EigenvaluesOnly.
+ * \param[in] tol What tolerance to find the eigenvalues to. Default is 0, which
+ * means machine precision.
+ *
+ * \returns Reference to \c *this
+ *
+ * This function computes the generalized eigenvalues of \p A with respect to \p B using ARPACK. The eigenvalues()
+ * function can be used to retrieve them. If \p options equals #ComputeEigenvectors,
+ * then the eigenvectors are also computed and can be retrieved by
+ * calling eigenvectors().
+ *
+ */
+ ArpackGeneralizedSelfAdjointEigenSolver& compute(const MatrixType& A, const MatrixType& B,
+ Index nbrEigenvalues, std::string eigs_sigma="LM",
+ int options=ComputeEigenvectors, RealScalar tol=0.0);
+
+ /** \brief Computes eigenvalues / eigenvectors of given matrix using the external ARPACK library.
+ *
+ * \param[in] A Selfadjoint matrix whose eigendecomposition is to be computed.
+ * \param[in] nbrEigenvalues The number of eigenvalues / eigenvectors to compute.
+ * Must be less than the size of the input matrix, or an error is returned.
+ * \param[in] eigs_sigma String containing either "LM", "SM", "LA", or "SA", with
+ * respective meanings to find the largest magnitude , smallest magnitude,
+ * largest algebraic, or smallest algebraic eigenvalues. Alternatively, this
+ * value can contain floating point value in string form, in which case the
+ * eigenvalues closest to this value will be found.
+ * \param[in] options Can be #ComputeEigenvectors (default) or #EigenvaluesOnly.
+ * \param[in] tol What tolerance to find the eigenvalues to. Default is 0, which
+ * means machine precision.
+ *
+ * \returns Reference to \c *this
+ *
+ * This function computes the eigenvalues of \p A using ARPACK. The eigenvalues()
+ * function can be used to retrieve them. If \p options equals #ComputeEigenvectors,
+ * then the eigenvectors are also computed and can be retrieved by
+ * calling eigenvectors().
+ *
+ */
+ ArpackGeneralizedSelfAdjointEigenSolver& compute(const MatrixType& A,
+ Index nbrEigenvalues, std::string eigs_sigma="LM",
+ int options=ComputeEigenvectors, RealScalar tol=0.0);
+
+
+ /** \brief Returns the eigenvectors of given matrix.
+ *
+ * \returns A const reference to the matrix whose columns are the eigenvectors.
+ *
+ * \pre The eigenvectors have been computed before.
+ *
+ * Column \f$ k \f$ of the returned matrix is an eigenvector corresponding
+ * to eigenvalue number \f$ k \f$ as returned by eigenvalues(). The
+ * eigenvectors are normalized to have (Euclidean) norm equal to one. If
+ * this object was used to solve the eigenproblem for the selfadjoint
+ * matrix \f$ A \f$, then the matrix returned by this function is the
+ * matrix \f$ V \f$ in the eigendecomposition \f$ A V = D V \f$.
+ * For the generalized eigenproblem, the matrix returned is the solution \f$ A V = D B V \f$
+ *
+ * Example: \include SelfAdjointEigenSolver_eigenvectors.cpp
+ * Output: \verbinclude SelfAdjointEigenSolver_eigenvectors.out
+ *
+ * \sa eigenvalues()
+ */
+ const Matrix<Scalar, Dynamic, Dynamic>& eigenvectors() const
+ {
+ eigen_assert(m_isInitialized && "ArpackGeneralizedSelfAdjointEigenSolver is not initialized.");
+ eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues.");
+ return m_eivec;
+ }
+
+ /** \brief Returns the eigenvalues of given matrix.
+ *
+ * \returns A const reference to the column vector containing the eigenvalues.
+ *
+ * \pre The eigenvalues have been computed before.
+ *
+ * The eigenvalues are repeated according to their algebraic multiplicity,
+ * so there are as many eigenvalues as rows in the matrix. The eigenvalues
+ * are sorted in increasing order.
+ *
+ * Example: \include SelfAdjointEigenSolver_eigenvalues.cpp
+ * Output: \verbinclude SelfAdjointEigenSolver_eigenvalues.out
+ *
+ * \sa eigenvectors(), MatrixBase::eigenvalues()
+ */
+ const Matrix<Scalar, Dynamic, 1>& eigenvalues() const
+ {
+ eigen_assert(m_isInitialized && "ArpackGeneralizedSelfAdjointEigenSolver is not initialized.");
+ return m_eivalues;
+ }
+
+ /** \brief Computes the positive-definite square root of the matrix.
+ *
+ * \returns the positive-definite square root of the matrix
+ *
+ * \pre The eigenvalues and eigenvectors of a positive-definite matrix
+ * have been computed before.
+ *
+ * The square root of a positive-definite matrix \f$ A \f$ is the
+ * positive-definite matrix whose square equals \f$ A \f$. This function
+ * uses the eigendecomposition \f$ A = V D V^{-1} \f$ to compute the
+ * square root as \f$ A^{1/2} = V D^{1/2} V^{-1} \f$.
+ *
+ * Example: \include SelfAdjointEigenSolver_operatorSqrt.cpp
+ * Output: \verbinclude SelfAdjointEigenSolver_operatorSqrt.out
+ *
+ * \sa operatorInverseSqrt(),
+ * \ref MatrixFunctions_Module "MatrixFunctions Module"
+ */
+ Matrix<Scalar, Dynamic, Dynamic> operatorSqrt() const
+ {
+ eigen_assert(m_isInitialized && "SelfAdjointEigenSolver is not initialized.");
+ eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues.");
+ return m_eivec * m_eivalues.cwiseSqrt().asDiagonal() * m_eivec.adjoint();
+ }
+
+ /** \brief Computes the inverse square root of the matrix.
+ *
+ * \returns the inverse positive-definite square root of the matrix
+ *
+ * \pre The eigenvalues and eigenvectors of a positive-definite matrix
+ * have been computed before.
+ *
+ * This function uses the eigendecomposition \f$ A = V D V^{-1} \f$ to
+ * compute the inverse square root as \f$ V D^{-1/2} V^{-1} \f$. This is
+ * cheaper than first computing the square root with operatorSqrt() and
+ * then its inverse with MatrixBase::inverse().
+ *
+ * Example: \include SelfAdjointEigenSolver_operatorInverseSqrt.cpp
+ * Output: \verbinclude SelfAdjointEigenSolver_operatorInverseSqrt.out
+ *
+ * \sa operatorSqrt(), MatrixBase::inverse(),
+ * \ref MatrixFunctions_Module "MatrixFunctions Module"
+ */
+ Matrix<Scalar, Dynamic, Dynamic> operatorInverseSqrt() const
+ {
+ eigen_assert(m_isInitialized && "SelfAdjointEigenSolver is not initialized.");
+ eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues.");
+ return m_eivec * m_eivalues.cwiseInverse().cwiseSqrt().asDiagonal() * m_eivec.adjoint();
+ }
+
+ /** \brief Reports whether previous computation was successful.
+ *
+ * \returns \c Success if computation was successful, \c NoConvergence otherwise.
+ */
+ ComputationInfo info() const
+ {
+ eigen_assert(m_isInitialized && "ArpackGeneralizedSelfAdjointEigenSolver is not initialized.");
+ return m_info;
+ }
+
+ size_t getNbrConvergedEigenValues() const
+ { return m_nbrConverged; }
+
+ size_t getNbrIterations() const
+ { return m_nbrIterations; }
+
+protected:
+ Matrix<Scalar, Dynamic, Dynamic> m_eivec;
+ Matrix<Scalar, Dynamic, 1> m_eivalues;
+ ComputationInfo m_info;
+ bool m_isInitialized;
+ bool m_eigenvectorsOk;
+
+ size_t m_nbrConverged;
+ size_t m_nbrIterations;
+};
+
+
+
+
+
+template<typename MatrixType, typename MatrixSolver, bool BisSPD>
+ArpackGeneralizedSelfAdjointEigenSolver<MatrixType, MatrixSolver, BisSPD>&
+ ArpackGeneralizedSelfAdjointEigenSolver<MatrixType, MatrixSolver, BisSPD>
+::compute(const MatrixType& A, Index nbrEigenvalues,
+ std::string eigs_sigma, int options, RealScalar tol)
+{
+ MatrixType B(0,0);
+ compute(A, B, nbrEigenvalues, eigs_sigma, options, tol);
+
+ return *this;
+}
+
+
+template<typename MatrixType, typename MatrixSolver, bool BisSPD>
+ArpackGeneralizedSelfAdjointEigenSolver<MatrixType, MatrixSolver, BisSPD>&
+ ArpackGeneralizedSelfAdjointEigenSolver<MatrixType, MatrixSolver, BisSPD>
+::compute(const MatrixType& A, const MatrixType& B, Index nbrEigenvalues,
+ std::string eigs_sigma, int options, RealScalar tol)
+{
+ eigen_assert(A.cols() == A.rows());
+ eigen_assert(B.cols() == B.rows());
+ eigen_assert(B.rows() == 0 || A.cols() == B.rows());
+ eigen_assert((options &~ (EigVecMask | GenEigMask)) == 0
+ && (options & EigVecMask) != EigVecMask
+ && "invalid option parameter");
+
+ bool isBempty = (B.rows() == 0) || (B.cols() == 0);
+
+ // For clarity, all parameters match their ARPACK name
+ //
+ // Always 0 on the first call
+ //
+ int ido = 0;
+
+ int n = (int)A.cols();
+
+ // User options: "LA", "SA", "SM", "LM", "BE"
+ //
+ char whch[3] = "LM";
+
+ // Specifies the shift if iparam[6] = { 3, 4, 5 }, not used if iparam[6] = { 1, 2 }
+ //
+ RealScalar sigma = 0.0;
+
+ if (eigs_sigma.length() >= 2 && isalpha(eigs_sigma[0]) && isalpha(eigs_sigma[1]))
+ {
+ eigs_sigma[0] = toupper(eigs_sigma[0]);
+ eigs_sigma[1] = toupper(eigs_sigma[1]);
+
+ // In the following special case we're going to invert the problem, since solving
+ // for larger magnitude is much much faster
+ // i.e., if 'SM' is specified, we're going to really use 'LM', the default
+ //
+ if (eigs_sigma.substr(0,2) != "SM")
+ {
+ whch[0] = eigs_sigma[0];
+ whch[1] = eigs_sigma[1];
+ }
+ }
+ else
+ {
+ eigen_assert(false && "Specifying clustered eigenvalues is not yet supported!");
+
+ // If it's not scalar values, then the user may be explicitly
+ // specifying the sigma value to cluster the evs around
+ //
+ sigma = atof(eigs_sigma.c_str());
+
+ // If atof fails, it returns 0.0, which is a fine default
+ //
+ }
+
+ // "I" means normal eigenvalue problem, "G" means generalized
+ //
+ char bmat[2] = "I";
+ if (eigs_sigma.substr(0,2) == "SM" || !(isalpha(eigs_sigma[0]) && isalpha(eigs_sigma[1])) || (!isBempty && !BisSPD))
+ bmat[0] = 'G';
+
+ // Now we determine the mode to use
+ //
+ int mode = (bmat[0] == 'G') + 1;
+ if (eigs_sigma.substr(0,2) == "SM" || !(isalpha(eigs_sigma[0]) && isalpha(eigs_sigma[1])))
+ {
+ // We're going to use shift-and-invert mode, and basically find
+ // the largest eigenvalues of the inverse operator
+ //
+ mode = 3;
+ }
+
+ // The user-specified number of eigenvalues/vectors to compute
+ //
+ int nev = (int)nbrEigenvalues;
+
+ // Allocate space for ARPACK to store the residual
+ //
+ Scalar *resid = new Scalar[n];
+
+ // Number of Lanczos vectors, must satisfy nev < ncv <= n
+ // Note that this indicates that nev != n, and we cannot compute
+ // all eigenvalues of a mtrix
+ //
+ int ncv = std::min(std::max(2*nev, 20), n);
+
+ // The working n x ncv matrix, also store the final eigenvectors (if computed)
+ //
+ Scalar *v = new Scalar[n*ncv];
+ int ldv = n;
+
+ // Working space
+ //
+ Scalar *workd = new Scalar[3*n];
+ int lworkl = ncv*ncv+8*ncv; // Must be at least this length
+ Scalar *workl = new Scalar[lworkl];
+
+ int *iparam= new int[11];
+ iparam[0] = 1; // 1 means we let ARPACK perform the shifts, 0 means we'd have to do it
+ iparam[2] = std::max(300, (int)std::ceil(2*n/std::max(ncv,1)));
+ iparam[6] = mode; // The mode, 1 is standard ev problem, 2 for generalized ev, 3 for shift-and-invert
+
+ // Used during reverse communicate to notify where arrays start
+ //
+ int *ipntr = new int[11];
+
+ // Error codes are returned in here, initial value of 0 indicates a random initial
+ // residual vector is used, any other values means resid contains the initial residual
+ // vector, possibly from a previous run
+ //
+ int info = 0;
+
+ Scalar scale = 1.0;
+ //if (!isBempty)
+ //{
+ //Scalar scale = B.norm() / std::sqrt(n);
+ //scale = std::pow(2, std::floor(std::log(scale+1)));
+ ////M /= scale;
+ //for (size_t i=0; i<(size_t)B.outerSize(); i++)
+ // for (typename MatrixType::InnerIterator it(B, i); it; ++it)
+ // it.valueRef() /= scale;
+ //}
+
+ MatrixSolver OP;
+ if (mode == 1 || mode == 2)
+ {
+ if (!isBempty)
+ OP.compute(B);
+ }
+ else if (mode == 3)
+ {
+ if (sigma == 0.0)
+ {
+ OP.compute(A);
+ }
+ else
+ {
+ // Note: We will never enter here because sigma must be 0.0
+ //
+ if (isBempty)
+ {
+ MatrixType AminusSigmaB(A);
+ for (Index i=0; i<A.rows(); ++i)
+ AminusSigmaB.coeffRef(i,i) -= sigma;
+
+ OP.compute(AminusSigmaB);
+ }
+ else
+ {
+ MatrixType AminusSigmaB = A - sigma * B;
+ OP.compute(AminusSigmaB);
+ }
+ }
+ }
+
+ if (!(mode == 1 && isBempty) && !(mode == 2 && isBempty) && OP.info() != Success)
+ std::cout << "Error factoring matrix" << std::endl;
+
+ do
+ {
+ internal::arpack_wrapper<Scalar, RealScalar>::saupd(&ido, bmat, &n, whch, &nev, &tol, resid,
+ &ncv, v, &ldv, iparam, ipntr, workd, workl,
+ &lworkl, &info);
+
+ if (ido == -1 || ido == 1)
+ {
+ Scalar *in = workd + ipntr[0] - 1;
+ Scalar *out = workd + ipntr[1] - 1;
+
+ if (ido == 1 && mode != 2)
+ {
+ Scalar *out2 = workd + ipntr[2] - 1;
+ if (isBempty || mode == 1)
+ Matrix<Scalar, Dynamic, 1>::Map(out2, n) = Matrix<Scalar, Dynamic, 1>::Map(in, n);
+ else
+ Matrix<Scalar, Dynamic, 1>::Map(out2, n) = B * Matrix<Scalar, Dynamic, 1>::Map(in, n);
+
+ in = workd + ipntr[2] - 1;
+ }
+
+ if (mode == 1)
+ {
+ if (isBempty)
+ {
+ // OP = A
+ //
+ Matrix<Scalar, Dynamic, 1>::Map(out, n) = A * Matrix<Scalar, Dynamic, 1>::Map(in, n);
+ }
+ else
+ {
+ // OP = L^{-1}AL^{-T}
+ //
+ internal::OP<MatrixSolver, MatrixType, Scalar, BisSPD>::applyOP(OP, A, n, in, out);
+ }
+ }
+ else if (mode == 2)
+ {
+ if (ido == 1)
+ Matrix<Scalar, Dynamic, 1>::Map(in, n) = A * Matrix<Scalar, Dynamic, 1>::Map(in, n);
+
+ // OP = B^{-1} A
+ //
+ Matrix<Scalar, Dynamic, 1>::Map(out, n) = OP.solve(Matrix<Scalar, Dynamic, 1>::Map(in, n));
+ }
+ else if (mode == 3)
+ {
+ // OP = (A-\sigmaB)B (\sigma could be 0, and B could be I)
+ // The B * in is already computed and stored at in if ido == 1
+ //
+ if (ido == 1 || isBempty)
+ Matrix<Scalar, Dynamic, 1>::Map(out, n) = OP.solve(Matrix<Scalar, Dynamic, 1>::Map(in, n));
+ else
+ Matrix<Scalar, Dynamic, 1>::Map(out, n) = OP.solve(B * Matrix<Scalar, Dynamic, 1>::Map(in, n));
+ }
+ }
+ else if (ido == 2)
+ {
+ Scalar *in = workd + ipntr[0] - 1;
+ Scalar *out = workd + ipntr[1] - 1;
+
+ if (isBempty || mode == 1)
+ Matrix<Scalar, Dynamic, 1>::Map(out, n) = Matrix<Scalar, Dynamic, 1>::Map(in, n);
+ else
+ Matrix<Scalar, Dynamic, 1>::Map(out, n) = B * Matrix<Scalar, Dynamic, 1>::Map(in, n);
+ }
+ } while (ido != 99);
+
+ if (info == 1)
+ m_info = NoConvergence;
+ else if (info == 3)
+ m_info = NumericalIssue;
+ else if (info < 0)
+ m_info = InvalidInput;
+ else if (info != 0)
+ eigen_assert(false && "Unknown ARPACK return value!");
+ else
+ {
+ // Do we compute eigenvectors or not?
+ //
+ int rvec = (options & ComputeEigenvectors) == ComputeEigenvectors;
+
+ // "A" means "All", use "S" to choose specific eigenvalues (not yet supported in ARPACK))
+ //
+ char howmny[2] = "A";
+
+ // if howmny == "S", specifies the eigenvalues to compute (not implemented in ARPACK)
+ //
+ int *select = new int[ncv];
+
+ // Final eigenvalues
+ //
+ m_eivalues.resize(nev, 1);
+
+ internal::arpack_wrapper<Scalar, RealScalar>::seupd(&rvec, howmny, select, m_eivalues.data(), v, &ldv,
+ &sigma, bmat, &n, whch, &nev, &tol, resid, &ncv,
+ v, &ldv, iparam, ipntr, workd, workl, &lworkl, &info);
+
+ if (info == -14)
+ m_info = NoConvergence;
+ else if (info != 0)
+ m_info = InvalidInput;
+ else
+ {
+ if (rvec)
+ {
+ m_eivec.resize(A.rows(), nev);
+ for (int i=0; i<nev; i++)
+ for (int j=0; j<n; j++)
+ m_eivec(j,i) = v[i*n+j] / scale;
+
+ if (mode == 1 && !isBempty && BisSPD)
+ internal::OP<MatrixSolver, MatrixType, Scalar, BisSPD>::project(OP, n, nev, m_eivec.data());
+
+ m_eigenvectorsOk = true;
+ }
+
+ m_nbrIterations = iparam[2];
+ m_nbrConverged = iparam[4];
+
+ m_info = Success;
+ }
+
+ delete[] select;
+ }
+
+ delete[] v;
+ delete[] iparam;
+ delete[] ipntr;
+ delete[] workd;
+ delete[] workl;
+ delete[] resid;
+
+ m_isInitialized = true;
+
+ return *this;
+}
+
+
+// Single precision
+//
+extern "C" void ssaupd_(int *ido, char *bmat, int *n, char *which,
+ int *nev, float *tol, float *resid, int *ncv,
+ float *v, int *ldv, int *iparam, int *ipntr,
+ float *workd, float *workl, int *lworkl,
+ int *info);
+
+extern "C" void sseupd_(int *rvec, char *All, int *select, float *d,
+ float *z, int *ldz, float *sigma,
+ char *bmat, int *n, char *which, int *nev,
+ float *tol, float *resid, int *ncv, float *v,
+ int *ldv, int *iparam, int *ipntr, float *workd,
+ float *workl, int *lworkl, int *ierr);
+
+// Double precision
+//
+extern "C" void dsaupd_(int *ido, char *bmat, int *n, char *which,
+ int *nev, double *tol, double *resid, int *ncv,
+ double *v, int *ldv, int *iparam, int *ipntr,
+ double *workd, double *workl, int *lworkl,
+ int *info);
+
+extern "C" void dseupd_(int *rvec, char *All, int *select, double *d,
+ double *z, int *ldz, double *sigma,
+ char *bmat, int *n, char *which, int *nev,
+ double *tol, double *resid, int *ncv, double *v,
+ int *ldv, int *iparam, int *ipntr, double *workd,
+ double *workl, int *lworkl, int *ierr);
+
+
+namespace internal {
+
+template<typename Scalar, typename RealScalar> struct arpack_wrapper
+{
+ static inline void saupd(int *ido, char *bmat, int *n, char *which,
+ int *nev, RealScalar *tol, Scalar *resid, int *ncv,
+ Scalar *v, int *ldv, int *iparam, int *ipntr,
+ Scalar *workd, Scalar *workl, int *lworkl, int *info)
+ {
+ EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsComplex, NUMERIC_TYPE_MUST_BE_REAL)
+ }
+
+ static inline void seupd(int *rvec, char *All, int *select, Scalar *d,
+ Scalar *z, int *ldz, RealScalar *sigma,
+ char *bmat, int *n, char *which, int *nev,
+ RealScalar *tol, Scalar *resid, int *ncv, Scalar *v,
+ int *ldv, int *iparam, int *ipntr, Scalar *workd,
+ Scalar *workl, int *lworkl, int *ierr)
+ {
+ EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsComplex, NUMERIC_TYPE_MUST_BE_REAL)
+ }
+};
+
+template <> struct arpack_wrapper<float, float>
+{
+ static inline void saupd(int *ido, char *bmat, int *n, char *which,
+ int *nev, float *tol, float *resid, int *ncv,
+ float *v, int *ldv, int *iparam, int *ipntr,
+ float *workd, float *workl, int *lworkl, int *info)
+ {
+ ssaupd_(ido, bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, ipntr, workd, workl, lworkl, info);
+ }
+
+ static inline void seupd(int *rvec, char *All, int *select, float *d,
+ float *z, int *ldz, float *sigma,
+ char *bmat, int *n, char *which, int *nev,
+ float *tol, float *resid, int *ncv, float *v,
+ int *ldv, int *iparam, int *ipntr, float *workd,
+ float *workl, int *lworkl, int *ierr)
+ {
+ sseupd_(rvec, All, select, d, z, ldz, sigma, bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, ipntr,
+ workd, workl, lworkl, ierr);
+ }
+};
+
+template <> struct arpack_wrapper<double, double>
+{
+ static inline void saupd(int *ido, char *bmat, int *n, char *which,
+ int *nev, double *tol, double *resid, int *ncv,
+ double *v, int *ldv, int *iparam, int *ipntr,
+ double *workd, double *workl, int *lworkl, int *info)
+ {
+ dsaupd_(ido, bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, ipntr, workd, workl, lworkl, info);
+ }
+
+ static inline void seupd(int *rvec, char *All, int *select, double *d,
+ double *z, int *ldz, double *sigma,
+ char *bmat, int *n, char *which, int *nev,
+ double *tol, double *resid, int *ncv, double *v,
+ int *ldv, int *iparam, int *ipntr, double *workd,
+ double *workl, int *lworkl, int *ierr)
+ {
+ dseupd_(rvec, All, select, d, v, ldv, sigma, bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, ipntr,
+ workd, workl, lworkl, ierr);
+ }
+};
+
+
+template<typename MatrixSolver, typename MatrixType, typename Scalar, bool BisSPD>
+struct OP
+{
+ static inline void applyOP(MatrixSolver &OP, const MatrixType &A, int n, Scalar *in, Scalar *out);
+ static inline void project(MatrixSolver &OP, int n, int k, Scalar *vecs);
+};
+
+template<typename MatrixSolver, typename MatrixType, typename Scalar>
+struct OP<MatrixSolver, MatrixType, Scalar, true>
+{
+ static inline void applyOP(MatrixSolver &OP, const MatrixType &A, int n, Scalar *in, Scalar *out)
+{
+ // OP = L^{-1} A L^{-T} (B = LL^T)
+ //
+ // First solve L^T out = in
+ //
+ Matrix<Scalar, Dynamic, 1>::Map(out, n) = OP.matrixU().solve(Matrix<Scalar, Dynamic, 1>::Map(in, n));
+ Matrix<Scalar, Dynamic, 1>::Map(out, n) = OP.permutationPinv() * Matrix<Scalar, Dynamic, 1>::Map(out, n);
+
+ // Then compute out = A out
+ //
+ Matrix<Scalar, Dynamic, 1>::Map(out, n) = A * Matrix<Scalar, Dynamic, 1>::Map(out, n);
+
+ // Then solve L out = out
+ //
+ Matrix<Scalar, Dynamic, 1>::Map(out, n) = OP.permutationP() * Matrix<Scalar, Dynamic, 1>::Map(out, n);
+ Matrix<Scalar, Dynamic, 1>::Map(out, n) = OP.matrixL().solve(Matrix<Scalar, Dynamic, 1>::Map(out, n));
+}
+
+ static inline void project(MatrixSolver &OP, int n, int k, Scalar *vecs)
+{
+ // Solve L^T out = in
+ //
+ Matrix<Scalar, Dynamic, Dynamic>::Map(vecs, n, k) = OP.matrixU().solve(Matrix<Scalar, Dynamic, Dynamic>::Map(vecs, n, k));
+ Matrix<Scalar, Dynamic, Dynamic>::Map(vecs, n, k) = OP.permutationPinv() * Matrix<Scalar, Dynamic, Dynamic>::Map(vecs, n, k);
+}
+
+};
+
+template<typename MatrixSolver, typename MatrixType, typename Scalar>
+struct OP<MatrixSolver, MatrixType, Scalar, false>
+{
+ static inline void applyOP(MatrixSolver &OP, const MatrixType &A, int n, Scalar *in, Scalar *out)
+{
+ eigen_assert(false && "Should never be in here...");
+}
+
+ static inline void project(MatrixSolver &OP, int n, int k, Scalar *vecs)
+{
+ eigen_assert(false && "Should never be in here...");
+}
+
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_ARPACKSELFADJOINTEIGENSOLVER_H
+
diff --git a/src/EigenUnsupported/src/EulerAngles/CMakeLists.txt b/src/EigenUnsupported/src/EulerAngles/CMakeLists.txt
new file mode 100644
index 0000000..22088eb
--- /dev/null
+++ b/src/EigenUnsupported/src/EulerAngles/CMakeLists.txt
@@ -0,0 +1,6 @@
+file(GLOB Eigen_EulerAngles_SRCS "*.h")
+
+install(FILES
+ ${Eigen_EulerAngles_SRCS}
+ DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/EulerAngles COMPONENT Devel
+ )
diff --git a/src/EigenUnsupported/src/EulerAngles/EulerAngles.h b/src/EigenUnsupported/src/EulerAngles/EulerAngles.h
new file mode 100644
index 0000000..e43cdb7
--- /dev/null
+++ b/src/EigenUnsupported/src/EulerAngles/EulerAngles.h
@@ -0,0 +1,355 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Tal Hadad <tal_hd@hotmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_EULERANGLESCLASS_H// TODO: Fix previous "EIGEN_EULERANGLES_H" definition?
+#define EIGEN_EULERANGLESCLASS_H
+
+namespace Eigen
+{
+ /** \class EulerAngles
+ *
+ * \ingroup EulerAngles_Module
+ *
+ * \brief Represents a rotation in a 3 dimensional space as three Euler angles.
+ *
+ * Euler rotation is a set of three rotation of three angles over three fixed axes, defined by the EulerSystem given as a template parameter.
+ *
+ * Here is how intrinsic Euler angles works:
+ * - first, rotate the axes system over the alpha axis in angle alpha
+ * - then, rotate the axes system over the beta axis(which was rotated in the first stage) in angle beta
+ * - then, rotate the axes system over the gamma axis(which was rotated in the two stages above) in angle gamma
+ *
+ * \note This class support only intrinsic Euler angles for simplicity,
+ * see EulerSystem how to easily overcome this for extrinsic systems.
+ *
+ * ### Rotation representation and conversions ###
+ *
+ * It has been proved(see Wikipedia link below) that every rotation can be represented
+ * by Euler angles, but there is no single representation (e.g. unlike rotation matrices).
+ * Therefore, you can convert from Eigen rotation and to them
+ * (including rotation matrices, which is not called "rotations" by Eigen design).
+ *
+ * Euler angles usually used for:
+ * - convenient human representation of rotation, especially in interactive GUI.
+ * - gimbal systems and robotics
+ * - efficient encoding(i.e. 3 floats only) of rotation for network protocols.
+ *
+ * However, Euler angles are slow comparing to quaternion or matrices,
+ * because their unnatural math definition, although it's simple for human.
+ * To overcome this, this class provide easy movement from the math friendly representation
+ * to the human friendly representation, and vise-versa.
+ *
+ * All the user need to do is a safe simple C++ type conversion,
+ * and this class take care for the math.
+ * Additionally, some axes related computation is done in compile time.
+ *
+ * #### Euler angles ranges in conversions ####
+ * Rotations representation as EulerAngles are not single (unlike matrices),
+ * and even have infinite EulerAngles representations.<BR>
+ * For example, add or subtract 2*PI from either angle of EulerAngles
+ * and you'll get the same rotation.
+ * This is the general reason for infinite representation,
+ * but it's not the only general reason for not having a single representation.
+ *
+ * When converting rotation to EulerAngles, this class convert it to specific ranges
+ * When converting some rotation to EulerAngles, the rules for ranges are as follow:
+ * - If the rotation we converting from is an EulerAngles
+ * (even when it represented as RotationBase explicitly), angles ranges are __undefined__.
+ * - otherwise, alpha and gamma angles will be in the range [-PI, PI].<BR>
+ * As for Beta angle:
+ * - If the system is Tait-Bryan, the beta angle will be in the range [-PI/2, PI/2].
+ * - otherwise:
+ * - If the beta axis is positive, the beta angle will be in the range [0, PI]
+ * - If the beta axis is negative, the beta angle will be in the range [-PI, 0]
+ *
+ * \sa EulerAngles(const MatrixBase<Derived>&)
+ * \sa EulerAngles(const RotationBase<Derived, 3>&)
+ *
+ * ### Convenient user typedefs ###
+ *
+ * Convenient typedefs for EulerAngles exist for float and double scalar,
+ * in a form of EulerAngles{A}{B}{C}{scalar},
+ * e.g. \ref EulerAnglesXYZd, \ref EulerAnglesZYZf.
+ *
+ * Only for positive axes{+x,+y,+z} Euler systems are have convenient typedef.
+ * If you need negative axes{-x,-y,-z}, it is recommended to create you own typedef with
+ * a word that represent what you need.
+ *
+ * ### Example ###
+ *
+ * \include EulerAngles.cpp
+ * Output: \verbinclude EulerAngles.out
+ *
+ * ### Additional reading ###
+ *
+ * If you're want to get more idea about how Euler system work in Eigen see EulerSystem.
+ *
+ * More information about Euler angles: https://en.wikipedia.org/wiki/Euler_angles
+ *
+ * \tparam _Scalar the scalar type, i.e. the type of the angles.
+ *
+ * \tparam _System the EulerSystem to use, which represents the axes of rotation.
+ */
+ template <typename _Scalar, class _System>
+ class EulerAngles : public RotationBase<EulerAngles<_Scalar, _System>, 3>
+ {
+ public:
+ typedef RotationBase<EulerAngles<_Scalar, _System>, 3> Base;
+
+ /** the scalar type of the angles */
+ typedef _Scalar Scalar;
+ typedef typename NumTraits<Scalar>::Real RealScalar;
+
+ /** the EulerSystem to use, which represents the axes of rotation. */
+ typedef _System System;
+
+ typedef Matrix<Scalar,3,3> Matrix3; /*!< the equivalent rotation matrix type */
+ typedef Matrix<Scalar,3,1> Vector3; /*!< the equivalent 3 dimension vector type */
+ typedef Quaternion<Scalar> QuaternionType; /*!< the equivalent quaternion type */
+ typedef AngleAxis<Scalar> AngleAxisType; /*!< the equivalent angle-axis type */
+
+ /** \returns the axis vector of the first (alpha) rotation */
+ static Vector3 AlphaAxisVector() {
+ const Vector3& u = Vector3::Unit(System::AlphaAxisAbs - 1);
+ return System::IsAlphaOpposite ? -u : u;
+ }
+
+ /** \returns the axis vector of the second (beta) rotation */
+ static Vector3 BetaAxisVector() {
+ const Vector3& u = Vector3::Unit(System::BetaAxisAbs - 1);
+ return System::IsBetaOpposite ? -u : u;
+ }
+
+ /** \returns the axis vector of the third (gamma) rotation */
+ static Vector3 GammaAxisVector() {
+ const Vector3& u = Vector3::Unit(System::GammaAxisAbs - 1);
+ return System::IsGammaOpposite ? -u : u;
+ }
+
+ private:
+ Vector3 m_angles;
+
+ public:
+ /** Default constructor without initialization. */
+ EulerAngles() {}
+ /** Constructs and initialize an EulerAngles (\p alpha, \p beta, \p gamma). */
+ EulerAngles(const Scalar& alpha, const Scalar& beta, const Scalar& gamma) :
+ m_angles(alpha, beta, gamma) {}
+
+ // TODO: Test this constructor
+ /** Constructs and initialize an EulerAngles from the array data {alpha, beta, gamma} */
+ explicit EulerAngles(const Scalar* data) : m_angles(data) {}
+
+ /** Constructs and initializes an EulerAngles from either:
+ * - a 3x3 rotation matrix expression(i.e. pure orthogonal matrix with determinant of +1),
+ * - a 3D vector expression representing Euler angles.
+ *
+ * \note If \p other is a 3x3 rotation matrix, the angles range rules will be as follow:<BR>
+ * Alpha and gamma angles will be in the range [-PI, PI].<BR>
+ * As for Beta angle:
+ * - If the system is Tait-Bryan, the beta angle will be in the range [-PI/2, PI/2].
+ * - otherwise:
+ * - If the beta axis is positive, the beta angle will be in the range [0, PI]
+ * - If the beta axis is negative, the beta angle will be in the range [-PI, 0]
+ */
+ template<typename Derived>
+ explicit EulerAngles(const MatrixBase<Derived>& other) { *this = other; }
+
+ /** Constructs and initialize Euler angles from a rotation \p rot.
+ *
+ * \note If \p rot is an EulerAngles (even when it represented as RotationBase explicitly),
+ * angles ranges are __undefined__.
+ * Otherwise, alpha and gamma angles will be in the range [-PI, PI].<BR>
+ * As for Beta angle:
+ * - If the system is Tait-Bryan, the beta angle will be in the range [-PI/2, PI/2].
+ * - otherwise:
+ * - If the beta axis is positive, the beta angle will be in the range [0, PI]
+ * - If the beta axis is negative, the beta angle will be in the range [-PI, 0]
+ */
+ template<typename Derived>
+ EulerAngles(const RotationBase<Derived, 3>& rot) { System::CalcEulerAngles(*this, rot.toRotationMatrix()); }
+
+ /*EulerAngles(const QuaternionType& q)
+ {
+ // TODO: Implement it in a faster way for quaternions
+ // According to http://www.euclideanspace.com/maths/geometry/rotations/conversions/quaternionToEuler/
+ // we can compute only the needed matrix cells and then convert to euler angles. (see ZYX example below)
+ // Currently we compute all matrix cells from quaternion.
+
+ // Special case only for ZYX
+ //Scalar y2 = q.y() * q.y();
+ //m_angles[0] = std::atan2(2*(q.w()*q.z() + q.x()*q.y()), (1 - 2*(y2 + q.z()*q.z())));
+ //m_angles[1] = std::asin( 2*(q.w()*q.y() - q.z()*q.x()));
+ //m_angles[2] = std::atan2(2*(q.w()*q.x() + q.y()*q.z()), (1 - 2*(q.x()*q.x() + y2)));
+ }*/
+
+ /** \returns The angle values stored in a vector (alpha, beta, gamma). */
+ const Vector3& angles() const { return m_angles; }
+ /** \returns A read-write reference to the angle values stored in a vector (alpha, beta, gamma). */
+ Vector3& angles() { return m_angles; }
+
+ /** \returns The value of the first angle. */
+ Scalar alpha() const { return m_angles[0]; }
+ /** \returns A read-write reference to the angle of the first angle. */
+ Scalar& alpha() { return m_angles[0]; }
+
+ /** \returns The value of the second angle. */
+ Scalar beta() const { return m_angles[1]; }
+ /** \returns A read-write reference to the angle of the second angle. */
+ Scalar& beta() { return m_angles[1]; }
+
+ /** \returns The value of the third angle. */
+ Scalar gamma() const { return m_angles[2]; }
+ /** \returns A read-write reference to the angle of the third angle. */
+ Scalar& gamma() { return m_angles[2]; }
+
+ /** \returns The Euler angles rotation inverse (which is as same as the negative),
+ * (-alpha, -beta, -gamma).
+ */
+ EulerAngles inverse() const
+ {
+ EulerAngles res;
+ res.m_angles = -m_angles;
+ return res;
+ }
+
+ /** \returns The Euler angles rotation negative (which is as same as the inverse),
+ * (-alpha, -beta, -gamma).
+ */
+ EulerAngles operator -() const
+ {
+ return inverse();
+ }
+
+ /** Set \c *this from either:
+ * - a 3x3 rotation matrix expression(i.e. pure orthogonal matrix with determinant of +1),
+ * - a 3D vector expression representing Euler angles.
+ *
+ * See EulerAngles(const MatrixBase<Derived, 3>&) for more information about
+ * angles ranges output.
+ */
+ template<class Derived>
+ EulerAngles& operator=(const MatrixBase<Derived>& other)
+ {
+ EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename Derived::Scalar>::value),
+ YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+
+ internal::eulerangles_assign_impl<System, Derived>::run(*this, other.derived());
+ return *this;
+ }
+
+ // TODO: Assign and construct from another EulerAngles (with different system)
+
+ /** Set \c *this from a rotation.
+ *
+ * See EulerAngles(const RotationBase<Derived, 3>&) for more information about
+ * angles ranges output.
+ */
+ template<typename Derived>
+ EulerAngles& operator=(const RotationBase<Derived, 3>& rot) {
+ System::CalcEulerAngles(*this, rot.toRotationMatrix());
+ return *this;
+ }
+
+ /** \returns \c true if \c *this is approximately equal to \a other, within the precision
+ * determined by \a prec.
+ *
+ * \sa MatrixBase::isApprox() */
+ bool isApprox(const EulerAngles& other,
+ const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const
+ { return angles().isApprox(other.angles(), prec); }
+
+ /** \returns an equivalent 3x3 rotation matrix. */
+ Matrix3 toRotationMatrix() const
+ {
+ // TODO: Calc it faster
+ return static_cast<QuaternionType>(*this).toRotationMatrix();
+ }
+
+ /** Convert the Euler angles to quaternion. */
+ operator QuaternionType() const
+ {
+ return
+ AngleAxisType(alpha(), AlphaAxisVector()) *
+ AngleAxisType(beta(), BetaAxisVector()) *
+ AngleAxisType(gamma(), GammaAxisVector());
+ }
+
+ friend std::ostream& operator<<(std::ostream& s, const EulerAngles<Scalar, System>& eulerAngles)
+ {
+ s << eulerAngles.angles().transpose();
+ return s;
+ }
+
+ /** \returns \c *this with scalar type casted to \a NewScalarType */
+ template <typename NewScalarType>
+ EulerAngles<NewScalarType, System> cast() const
+ {
+ EulerAngles<NewScalarType, System> e;
+ e.angles() = angles().template cast<NewScalarType>();
+ return e;
+ }
+ };
+
+#define EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(AXES, SCALAR_TYPE, SCALAR_POSTFIX) \
+ /** \ingroup EulerAngles_Module */ \
+ typedef EulerAngles<SCALAR_TYPE, EulerSystem##AXES> EulerAngles##AXES##SCALAR_POSTFIX;
+
+#define EIGEN_EULER_ANGLES_TYPEDEFS(SCALAR_TYPE, SCALAR_POSTFIX) \
+ EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(XYZ, SCALAR_TYPE, SCALAR_POSTFIX) \
+ EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(XYX, SCALAR_TYPE, SCALAR_POSTFIX) \
+ EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(XZY, SCALAR_TYPE, SCALAR_POSTFIX) \
+ EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(XZX, SCALAR_TYPE, SCALAR_POSTFIX) \
+ \
+ EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(YZX, SCALAR_TYPE, SCALAR_POSTFIX) \
+ EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(YZY, SCALAR_TYPE, SCALAR_POSTFIX) \
+ EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(YXZ, SCALAR_TYPE, SCALAR_POSTFIX) \
+ EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(YXY, SCALAR_TYPE, SCALAR_POSTFIX) \
+ \
+ EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(ZXY, SCALAR_TYPE, SCALAR_POSTFIX) \
+ EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(ZXZ, SCALAR_TYPE, SCALAR_POSTFIX) \
+ EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(ZYX, SCALAR_TYPE, SCALAR_POSTFIX) \
+ EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(ZYZ, SCALAR_TYPE, SCALAR_POSTFIX)
+
+EIGEN_EULER_ANGLES_TYPEDEFS(float, f)
+EIGEN_EULER_ANGLES_TYPEDEFS(double, d)
+
+ namespace internal
+ {
+ template<typename _Scalar, class _System>
+ struct traits<EulerAngles<_Scalar, _System> >
+ {
+ typedef _Scalar Scalar;
+ };
+
+ // set from a rotation matrix
+ template<class System, class Other>
+ struct eulerangles_assign_impl<System,Other,3,3>
+ {
+ typedef typename Other::Scalar Scalar;
+ static void run(EulerAngles<Scalar, System>& e, const Other& m)
+ {
+ System::CalcEulerAngles(e, m);
+ }
+ };
+
+ // set from a vector of Euler angles
+ template<class System, class Other>
+ struct eulerangles_assign_impl<System,Other,3,1>
+ {
+ typedef typename Other::Scalar Scalar;
+ static void run(EulerAngles<Scalar, System>& e, const Other& vec)
+ {
+ e.angles() = vec;
+ }
+ };
+ }
+}
+
+#endif // EIGEN_EULERANGLESCLASS_H
diff --git a/src/EigenUnsupported/src/EulerAngles/EulerSystem.h b/src/EigenUnsupported/src/EulerAngles/EulerSystem.h
new file mode 100644
index 0000000..2a833b0
--- /dev/null
+++ b/src/EigenUnsupported/src/EulerAngles/EulerSystem.h
@@ -0,0 +1,305 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Tal Hadad <tal_hd@hotmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_EULERSYSTEM_H
+#define EIGEN_EULERSYSTEM_H
+
+namespace Eigen
+{
+ // Forward declarations
+ template <typename _Scalar, class _System>
+ class EulerAngles;
+
+ namespace internal
+ {
+ // TODO: Add this trait to the Eigen internal API?
+ template <int Num, bool IsPositive = (Num > 0)>
+ struct Abs
+ {
+ enum { value = Num };
+ };
+
+ template <int Num>
+ struct Abs<Num, false>
+ {
+ enum { value = -Num };
+ };
+
+ template <int Axis>
+ struct IsValidAxis
+ {
+ enum { value = Axis != 0 && Abs<Axis>::value <= 3 };
+ };
+
+ template<typename System,
+ typename Other,
+ int OtherRows=Other::RowsAtCompileTime,
+ int OtherCols=Other::ColsAtCompileTime>
+ struct eulerangles_assign_impl;
+ }
+
+ #define EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT(COND,MSG) typedef char static_assertion_##MSG[(COND)?1:-1]
+
+ /** \brief Representation of a fixed signed rotation axis for EulerSystem.
+ *
+ * \ingroup EulerAngles_Module
+ *
+ * Values here represent:
+ * - The axis of the rotation: X, Y or Z.
+ * - The sign (i.e. direction of the rotation along the axis): positive(+) or negative(-)
+ *
+ * Therefore, this could express all the axes {+X,+Y,+Z,-X,-Y,-Z}
+ *
+ * For positive axis, use +EULER_{axis}, and for negative axis use -EULER_{axis}.
+ */
+ enum EulerAxis
+ {
+ EULER_X = 1, /*!< the X axis */
+ EULER_Y = 2, /*!< the Y axis */
+ EULER_Z = 3 /*!< the Z axis */
+ };
+
+ /** \class EulerSystem
+ *
+ * \ingroup EulerAngles_Module
+ *
+ * \brief Represents a fixed Euler rotation system.
+ *
+ * This meta-class goal is to represent the Euler system in compilation time, for EulerAngles.
+ *
+ * You can use this class to get two things:
+ * - Build an Euler system, and then pass it as a template parameter to EulerAngles.
+ * - Query some compile time data about an Euler system. (e.g. Whether it's Tait-Bryan)
+ *
+ * Euler rotation is a set of three rotation on fixed axes. (see \ref EulerAngles)
+ * This meta-class store constantly those signed axes. (see \ref EulerAxis)
+ *
+ * ### Types of Euler systems ###
+ *
+ * All and only valid 3 dimension Euler rotation over standard
+ * signed axes{+X,+Y,+Z,-X,-Y,-Z} are supported:
+ * - all axes X, Y, Z in each valid order (see below what order is valid)
+ * - rotation over the axis is supported both over the positive and negative directions.
+ * - both Tait-Bryan and proper/classic Euler angles (i.e. the opposite).
+ *
+ * Since EulerSystem support both positive and negative directions,
+ * you may call this rotation distinction in other names:
+ * - _right handed_ or _left handed_
+ * - _counterclockwise_ or _clockwise_
+ *
+ * Notice all axed combination are valid, and would trigger a static assertion.
+ * Same unsigned axes can't be neighbors, e.g. {X,X,Y} is invalid.
+ * This yield two and only two classes:
+ * - _Tait-Bryan_ - all unsigned axes are distinct, e.g. {X,Y,Z}
+ * - _proper/classic Euler angles_ - The first and the third unsigned axes is equal,
+ * and the second is different, e.g. {X,Y,X}
+ *
+ * ### Intrinsic vs extrinsic Euler systems ###
+ *
+ * Only intrinsic Euler systems are supported for simplicity.
+ * If you want to use extrinsic Euler systems,
+ * just use the equal intrinsic opposite order for axes and angles.
+ * I.e axes (A,B,C) becomes (C,B,A), and angles (a,b,c) becomes (c,b,a).
+ *
+ * ### Convenient user typedefs ###
+ *
+ * Convenient typedefs for EulerSystem exist (only for positive axes Euler systems),
+ * in a form of EulerSystem{A}{B}{C}, e.g. \ref EulerSystemXYZ.
+ *
+ * ### Additional reading ###
+ *
+ * More information about Euler angles: https://en.wikipedia.org/wiki/Euler_angles
+ *
+ * \tparam _AlphaAxis the first fixed EulerAxis
+ *
+ * \tparam _BetaAxis the second fixed EulerAxis
+ *
+ * \tparam _GammaAxis the third fixed EulerAxis
+ */
+ template <int _AlphaAxis, int _BetaAxis, int _GammaAxis>
+ class EulerSystem
+ {
+ public:
+ // It's defined this way and not as enum, because I think
+ // that enum is not guerantee to support negative numbers
+
+ /** The first rotation axis */
+ static const int AlphaAxis = _AlphaAxis;
+
+ /** The second rotation axis */
+ static const int BetaAxis = _BetaAxis;
+
+ /** The third rotation axis */
+ static const int GammaAxis = _GammaAxis;
+
+ enum
+ {
+ AlphaAxisAbs = internal::Abs<AlphaAxis>::value, /*!< the first rotation axis unsigned */
+ BetaAxisAbs = internal::Abs<BetaAxis>::value, /*!< the second rotation axis unsigned */
+ GammaAxisAbs = internal::Abs<GammaAxis>::value, /*!< the third rotation axis unsigned */
+
+ IsAlphaOpposite = (AlphaAxis < 0) ? 1 : 0, /*!< whether alpha axis is negative */
+ IsBetaOpposite = (BetaAxis < 0) ? 1 : 0, /*!< whether beta axis is negative */
+ IsGammaOpposite = (GammaAxis < 0) ? 1 : 0, /*!< whether gamma axis is negative */
+
+ // Parity is even if alpha axis X is followed by beta axis Y, or Y is followed
+ // by Z, or Z is followed by X; otherwise it is odd.
+ IsOdd = ((AlphaAxisAbs)%3 == (BetaAxisAbs - 1)%3) ? 0 : 1, /*!< whether the Euler system is odd */
+ IsEven = IsOdd ? 0 : 1, /*!< whether the Euler system is even */
+
+ IsTaitBryan = ((unsigned)AlphaAxisAbs != (unsigned)GammaAxisAbs) ? 1 : 0 /*!< whether the Euler system is Tait-Bryan */
+ };
+
+ private:
+
+ EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT(internal::IsValidAxis<AlphaAxis>::value,
+ ALPHA_AXIS_IS_INVALID);
+
+ EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT(internal::IsValidAxis<BetaAxis>::value,
+ BETA_AXIS_IS_INVALID);
+
+ EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT(internal::IsValidAxis<GammaAxis>::value,
+ GAMMA_AXIS_IS_INVALID);
+
+ EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT((unsigned)AlphaAxisAbs != (unsigned)BetaAxisAbs,
+ ALPHA_AXIS_CANT_BE_EQUAL_TO_BETA_AXIS);
+
+ EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT((unsigned)BetaAxisAbs != (unsigned)GammaAxisAbs,
+ BETA_AXIS_CANT_BE_EQUAL_TO_GAMMA_AXIS);
+
+ static const int
+ // I, J, K are the pivot indexes permutation for the rotation matrix, that match this Euler system.
+ // They are used in this class converters.
+ // They are always different from each other, and their possible values are: 0, 1, or 2.
+ I_ = AlphaAxisAbs - 1,
+ J_ = (AlphaAxisAbs - 1 + 1 + IsOdd)%3,
+ K_ = (AlphaAxisAbs - 1 + 2 - IsOdd)%3
+ ;
+
+ // TODO: Get @mat parameter in form that avoids double evaluation.
+ template <typename Derived>
+ static void CalcEulerAngles_imp(Matrix<typename MatrixBase<Derived>::Scalar, 3, 1>& res, const MatrixBase<Derived>& mat, internal::true_type /*isTaitBryan*/)
+ {
+ using std::atan2;
+ using std::sqrt;
+
+ typedef typename Derived::Scalar Scalar;
+
+ const Scalar plusMinus = IsEven? 1 : -1;
+ const Scalar minusPlus = IsOdd? 1 : -1;
+
+ const Scalar Rsum = sqrt((mat(I_,I_) * mat(I_,I_) + mat(I_,J_) * mat(I_,J_) + mat(J_,K_) * mat(J_,K_) + mat(K_,K_) * mat(K_,K_))/2);
+ res[1] = atan2(plusMinus * mat(I_,K_), Rsum);
+
+ // There is a singularity when cos(beta) == 0
+ if(Rsum > 4 * NumTraits<Scalar>::epsilon()) {// cos(beta) != 0
+ res[0] = atan2(minusPlus * mat(J_, K_), mat(K_, K_));
+ res[2] = atan2(minusPlus * mat(I_, J_), mat(I_, I_));
+ }
+ else if(plusMinus * mat(I_, K_) > 0) {// cos(beta) == 0 and sin(beta) == 1
+ Scalar spos = mat(J_, I_) + plusMinus * mat(K_, J_); // 2*sin(alpha + plusMinus * gamma
+ Scalar cpos = mat(J_, J_) + minusPlus * mat(K_, I_); // 2*cos(alpha + plusMinus * gamma)
+ Scalar alphaPlusMinusGamma = atan2(spos, cpos);
+ res[0] = alphaPlusMinusGamma;
+ res[2] = 0;
+ }
+ else {// cos(beta) == 0 and sin(beta) == -1
+ Scalar sneg = plusMinus * (mat(K_, J_) + minusPlus * mat(J_, I_)); // 2*sin(alpha + minusPlus*gamma)
+ Scalar cneg = mat(J_, J_) + plusMinus * mat(K_, I_); // 2*cos(alpha + minusPlus*gamma)
+ Scalar alphaMinusPlusBeta = atan2(sneg, cneg);
+ res[0] = alphaMinusPlusBeta;
+ res[2] = 0;
+ }
+ }
+
+ template <typename Derived>
+ static void CalcEulerAngles_imp(Matrix<typename MatrixBase<Derived>::Scalar,3,1>& res,
+ const MatrixBase<Derived>& mat, internal::false_type /*isTaitBryan*/)
+ {
+ using std::atan2;
+ using std::sqrt;
+
+ typedef typename Derived::Scalar Scalar;
+
+ const Scalar plusMinus = IsEven? 1 : -1;
+ const Scalar minusPlus = IsOdd? 1 : -1;
+
+ const Scalar Rsum = sqrt((mat(I_, J_) * mat(I_, J_) + mat(I_, K_) * mat(I_, K_) + mat(J_, I_) * mat(J_, I_) + mat(K_, I_) * mat(K_, I_)) / 2);
+
+ res[1] = atan2(Rsum, mat(I_, I_));
+
+ // There is a singularity when sin(beta) == 0
+ if(Rsum > 4 * NumTraits<Scalar>::epsilon()) {// sin(beta) != 0
+ res[0] = atan2(mat(J_, I_), minusPlus * mat(K_, I_));
+ res[2] = atan2(mat(I_, J_), plusMinus * mat(I_, K_));
+ }
+ else if(mat(I_, I_) > 0) {// sin(beta) == 0 and cos(beta) == 1
+ Scalar spos = plusMinus * mat(K_, J_) + minusPlus * mat(J_, K_); // 2*sin(alpha + gamma)
+ Scalar cpos = mat(J_, J_) + mat(K_, K_); // 2*cos(alpha + gamma)
+ res[0] = atan2(spos, cpos);
+ res[2] = 0;
+ }
+ else {// sin(beta) == 0 and cos(beta) == -1
+ Scalar sneg = plusMinus * mat(K_, J_) + plusMinus * mat(J_, K_); // 2*sin(alpha - gamma)
+ Scalar cneg = mat(J_, J_) - mat(K_, K_); // 2*cos(alpha - gamma)
+ res[0] = atan2(sneg, cneg);
+ res[2] = 0;
+ }
+ }
+
+ template<typename Scalar>
+ static void CalcEulerAngles(
+ EulerAngles<Scalar, EulerSystem>& res,
+ const typename EulerAngles<Scalar, EulerSystem>::Matrix3& mat)
+ {
+ CalcEulerAngles_imp(
+ res.angles(), mat,
+ typename internal::conditional<IsTaitBryan, internal::true_type, internal::false_type>::type());
+
+ if (IsAlphaOpposite)
+ res.alpha() = -res.alpha();
+
+ if (IsBetaOpposite)
+ res.beta() = -res.beta();
+
+ if (IsGammaOpposite)
+ res.gamma() = -res.gamma();
+ }
+
+ template <typename _Scalar, class _System>
+ friend class Eigen::EulerAngles;
+
+ template<typename System,
+ typename Other,
+ int OtherRows,
+ int OtherCols>
+ friend struct internal::eulerangles_assign_impl;
+ };
+
+#define EIGEN_EULER_SYSTEM_TYPEDEF(A, B, C) \
+ /** \ingroup EulerAngles_Module */ \
+ typedef EulerSystem<EULER_##A, EULER_##B, EULER_##C> EulerSystem##A##B##C;
+
+ EIGEN_EULER_SYSTEM_TYPEDEF(X,Y,Z)
+ EIGEN_EULER_SYSTEM_TYPEDEF(X,Y,X)
+ EIGEN_EULER_SYSTEM_TYPEDEF(X,Z,Y)
+ EIGEN_EULER_SYSTEM_TYPEDEF(X,Z,X)
+
+ EIGEN_EULER_SYSTEM_TYPEDEF(Y,Z,X)
+ EIGEN_EULER_SYSTEM_TYPEDEF(Y,Z,Y)
+ EIGEN_EULER_SYSTEM_TYPEDEF(Y,X,Z)
+ EIGEN_EULER_SYSTEM_TYPEDEF(Y,X,Y)
+
+ EIGEN_EULER_SYSTEM_TYPEDEF(Z,X,Y)
+ EIGEN_EULER_SYSTEM_TYPEDEF(Z,X,Z)
+ EIGEN_EULER_SYSTEM_TYPEDEF(Z,Y,X)
+ EIGEN_EULER_SYSTEM_TYPEDEF(Z,Y,Z)
+}
+
+#endif // EIGEN_EULERSYSTEM_H
diff --git a/src/EigenUnsupported/src/FFT/ei_fftw_impl.h b/src/EigenUnsupported/src/FFT/ei_fftw_impl.h
new file mode 100644
index 0000000..1c2cd24
--- /dev/null
+++ b/src/EigenUnsupported/src/FFT/ei_fftw_impl.h
@@ -0,0 +1,261 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Mark Borgerding mark a borgerding net
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+namespace Eigen {
+
+namespace internal {
+
+ // FFTW uses non-const arguments
+ // so we must use ugly const_cast calls for all the args it uses
+ //
+ // This should be safe as long as
+ // 1. we use FFTW_ESTIMATE for all our planning
+ // see the FFTW docs section 4.3.2 "Planner Flags"
+ // 2. fftw_complex is compatible with std::complex
+ // This assumes std::complex<T> layout is array of size 2 with real,imag
+ template <typename T>
+ inline
+ T * fftw_cast(const T* p)
+ {
+ return const_cast<T*>( p);
+ }
+
+ inline
+ fftw_complex * fftw_cast( const std::complex<double> * p)
+ {
+ return const_cast<fftw_complex*>( reinterpret_cast<const fftw_complex*>(p) );
+ }
+
+ inline
+ fftwf_complex * fftw_cast( const std::complex<float> * p)
+ {
+ return const_cast<fftwf_complex*>( reinterpret_cast<const fftwf_complex*>(p) );
+ }
+
+ inline
+ fftwl_complex * fftw_cast( const std::complex<long double> * p)
+ {
+ return const_cast<fftwl_complex*>( reinterpret_cast<const fftwl_complex*>(p) );
+ }
+
+ template <typename T>
+ struct fftw_plan {};
+
+ template <>
+ struct fftw_plan<float>
+ {
+ typedef float scalar_type;
+ typedef fftwf_complex complex_type;
+ fftwf_plan m_plan;
+ fftw_plan() :m_plan(NULL) {}
+ ~fftw_plan() {if (m_plan) fftwf_destroy_plan(m_plan);}
+
+ inline
+ void fwd(complex_type * dst,complex_type * src,int nfft) {
+ if (m_plan==NULL) m_plan = fftwf_plan_dft_1d(nfft,src,dst, FFTW_FORWARD, FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
+ fftwf_execute_dft( m_plan, src,dst);
+ }
+ inline
+ void inv(complex_type * dst,complex_type * src,int nfft) {
+ if (m_plan==NULL) m_plan = fftwf_plan_dft_1d(nfft,src,dst, FFTW_BACKWARD , FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
+ fftwf_execute_dft( m_plan, src,dst);
+ }
+ inline
+ void fwd(complex_type * dst,scalar_type * src,int nfft) {
+ if (m_plan==NULL) m_plan = fftwf_plan_dft_r2c_1d(nfft,src,dst,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
+ fftwf_execute_dft_r2c( m_plan,src,dst);
+ }
+ inline
+ void inv(scalar_type * dst,complex_type * src,int nfft) {
+ if (m_plan==NULL)
+ m_plan = fftwf_plan_dft_c2r_1d(nfft,src,dst,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
+ fftwf_execute_dft_c2r( m_plan, src,dst);
+ }
+
+ inline
+ void fwd2( complex_type * dst,complex_type * src,int n0,int n1) {
+ if (m_plan==NULL) m_plan = fftwf_plan_dft_2d(n0,n1,src,dst,FFTW_FORWARD,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
+ fftwf_execute_dft( m_plan, src,dst);
+ }
+ inline
+ void inv2( complex_type * dst,complex_type * src,int n0,int n1) {
+ if (m_plan==NULL) m_plan = fftwf_plan_dft_2d(n0,n1,src,dst,FFTW_BACKWARD,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
+ fftwf_execute_dft( m_plan, src,dst);
+ }
+
+ };
+ template <>
+ struct fftw_plan<double>
+ {
+ typedef double scalar_type;
+ typedef fftw_complex complex_type;
+ ::fftw_plan m_plan;
+ fftw_plan() :m_plan(NULL) {}
+ ~fftw_plan() {if (m_plan) fftw_destroy_plan(m_plan);}
+
+ inline
+ void fwd(complex_type * dst,complex_type * src,int nfft) {
+ if (m_plan==NULL) m_plan = fftw_plan_dft_1d(nfft,src,dst, FFTW_FORWARD, FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
+ fftw_execute_dft( m_plan, src,dst);
+ }
+ inline
+ void inv(complex_type * dst,complex_type * src,int nfft) {
+ if (m_plan==NULL) m_plan = fftw_plan_dft_1d(nfft,src,dst, FFTW_BACKWARD , FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
+ fftw_execute_dft( m_plan, src,dst);
+ }
+ inline
+ void fwd(complex_type * dst,scalar_type * src,int nfft) {
+ if (m_plan==NULL) m_plan = fftw_plan_dft_r2c_1d(nfft,src,dst,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
+ fftw_execute_dft_r2c( m_plan,src,dst);
+ }
+ inline
+ void inv(scalar_type * dst,complex_type * src,int nfft) {
+ if (m_plan==NULL)
+ m_plan = fftw_plan_dft_c2r_1d(nfft,src,dst,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
+ fftw_execute_dft_c2r( m_plan, src,dst);
+ }
+ inline
+ void fwd2( complex_type * dst,complex_type * src,int n0,int n1) {
+ if (m_plan==NULL) m_plan = fftw_plan_dft_2d(n0,n1,src,dst,FFTW_FORWARD,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
+ fftw_execute_dft( m_plan, src,dst);
+ }
+ inline
+ void inv2( complex_type * dst,complex_type * src,int n0,int n1) {
+ if (m_plan==NULL) m_plan = fftw_plan_dft_2d(n0,n1,src,dst,FFTW_BACKWARD,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
+ fftw_execute_dft( m_plan, src,dst);
+ }
+ };
+ template <>
+ struct fftw_plan<long double>
+ {
+ typedef long double scalar_type;
+ typedef fftwl_complex complex_type;
+ fftwl_plan m_plan;
+ fftw_plan() :m_plan(NULL) {}
+ ~fftw_plan() {if (m_plan) fftwl_destroy_plan(m_plan);}
+
+ inline
+ void fwd(complex_type * dst,complex_type * src,int nfft) {
+ if (m_plan==NULL) m_plan = fftwl_plan_dft_1d(nfft,src,dst, FFTW_FORWARD, FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
+ fftwl_execute_dft( m_plan, src,dst);
+ }
+ inline
+ void inv(complex_type * dst,complex_type * src,int nfft) {
+ if (m_plan==NULL) m_plan = fftwl_plan_dft_1d(nfft,src,dst, FFTW_BACKWARD , FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
+ fftwl_execute_dft( m_plan, src,dst);
+ }
+ inline
+ void fwd(complex_type * dst,scalar_type * src,int nfft) {
+ if (m_plan==NULL) m_plan = fftwl_plan_dft_r2c_1d(nfft,src,dst,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
+ fftwl_execute_dft_r2c( m_plan,src,dst);
+ }
+ inline
+ void inv(scalar_type * dst,complex_type * src,int nfft) {
+ if (m_plan==NULL)
+ m_plan = fftwl_plan_dft_c2r_1d(nfft,src,dst,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
+ fftwl_execute_dft_c2r( m_plan, src,dst);
+ }
+ inline
+ void fwd2( complex_type * dst,complex_type * src,int n0,int n1) {
+ if (m_plan==NULL) m_plan = fftwl_plan_dft_2d(n0,n1,src,dst,FFTW_FORWARD,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
+ fftwl_execute_dft( m_plan, src,dst);
+ }
+ inline
+ void inv2( complex_type * dst,complex_type * src,int n0,int n1) {
+ if (m_plan==NULL) m_plan = fftwl_plan_dft_2d(n0,n1,src,dst,FFTW_BACKWARD,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
+ fftwl_execute_dft( m_plan, src,dst);
+ }
+ };
+
+ template <typename _Scalar>
+ struct fftw_impl
+ {
+ typedef _Scalar Scalar;
+ typedef std::complex<Scalar> Complex;
+
+ inline
+ void clear()
+ {
+ m_plans.clear();
+ }
+
+ // complex-to-complex forward FFT
+ inline
+ void fwd( Complex * dst,const Complex *src,int nfft)
+ {
+ get_plan(nfft,false,dst,src).fwd(fftw_cast(dst), fftw_cast(src),nfft );
+ }
+
+ // real-to-complex forward FFT
+ inline
+ void fwd( Complex * dst,const Scalar * src,int nfft)
+ {
+ get_plan(nfft,false,dst,src).fwd(fftw_cast(dst), fftw_cast(src) ,nfft);
+ }
+
+ // 2-d complex-to-complex
+ inline
+ void fwd2(Complex * dst, const Complex * src, int n0,int n1)
+ {
+ get_plan(n0,n1,false,dst,src).fwd2(fftw_cast(dst), fftw_cast(src) ,n0,n1);
+ }
+
+ // inverse complex-to-complex
+ inline
+ void inv(Complex * dst,const Complex *src,int nfft)
+ {
+ get_plan(nfft,true,dst,src).inv(fftw_cast(dst), fftw_cast(src),nfft );
+ }
+
+ // half-complex to scalar
+ inline
+ void inv( Scalar * dst,const Complex * src,int nfft)
+ {
+ get_plan(nfft,true,dst,src).inv(fftw_cast(dst), fftw_cast(src),nfft );
+ }
+
+ // 2-d complex-to-complex
+ inline
+ void inv2(Complex * dst, const Complex * src, int n0,int n1)
+ {
+ get_plan(n0,n1,true,dst,src).inv2(fftw_cast(dst), fftw_cast(src) ,n0,n1);
+ }
+
+
+ protected:
+ typedef fftw_plan<Scalar> PlanData;
+
+ typedef Eigen::numext::int64_t int64_t;
+
+ typedef std::map<int64_t,PlanData> PlanMap;
+
+ PlanMap m_plans;
+
+ inline
+ PlanData & get_plan(int nfft,bool inverse,void * dst,const void * src)
+ {
+ bool inplace = (dst==src);
+ bool aligned = ( (reinterpret_cast<size_t>(src)&15) | (reinterpret_cast<size_t>(dst)&15) ) == 0;
+ int64_t key = ( (nfft<<3 ) | (inverse<<2) | (inplace<<1) | aligned ) << 1;
+ return m_plans[key];
+ }
+
+ inline
+ PlanData & get_plan(int n0,int n1,bool inverse,void * dst,const void * src)
+ {
+ bool inplace = (dst==src);
+ bool aligned = ( (reinterpret_cast<size_t>(src)&15) | (reinterpret_cast<size_t>(dst)&15) ) == 0;
+ int64_t key = ( ( (((int64_t)n0) << 30)|(n1<<3 ) | (inverse<<2) | (inplace<<1) | aligned ) << 1 ) + 1;
+ return m_plans[key];
+ }
+ };
+
+} // end namespace internal
+
+} // end namespace Eigen
diff --git a/src/EigenUnsupported/src/FFT/ei_kissfft_impl.h b/src/EigenUnsupported/src/FFT/ei_kissfft_impl.h
new file mode 100644
index 0000000..430953a
--- /dev/null
+++ b/src/EigenUnsupported/src/FFT/ei_kissfft_impl.h
@@ -0,0 +1,449 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Mark Borgerding mark a borgerding net
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+namespace Eigen {
+
+namespace internal {
+
+ // This FFT implementation was derived from kissfft http:sourceforge.net/projects/kissfft
+ // Copyright 2003-2009 Mark Borgerding
+
+template <typename _Scalar>
+struct kiss_cpx_fft
+{
+ typedef _Scalar Scalar;
+ typedef std::complex<Scalar> Complex;
+ std::vector<Complex> m_twiddles;
+ std::vector<int> m_stageRadix;
+ std::vector<int> m_stageRemainder;
+ std::vector<Complex> m_scratchBuf;
+ bool m_inverse;
+
+ inline void make_twiddles(int nfft, bool inverse)
+ {
+ using numext::sin;
+ using numext::cos;
+ m_inverse = inverse;
+ m_twiddles.resize(nfft);
+ double phinc = 0.25 * double(EIGEN_PI) / nfft;
+ Scalar flip = inverse ? Scalar(1) : Scalar(-1);
+ m_twiddles[0] = Complex(Scalar(1), Scalar(0));
+ if ((nfft&1)==0)
+ m_twiddles[nfft/2] = Complex(Scalar(-1), Scalar(0));
+ int i=1;
+ for (;i*8<nfft;++i)
+ {
+ Scalar c = Scalar(cos(i*8*phinc));
+ Scalar s = Scalar(sin(i*8*phinc));
+ m_twiddles[i] = Complex(c, s*flip);
+ m_twiddles[nfft-i] = Complex(c, -s*flip);
+ }
+ for (;i*4<nfft;++i)
+ {
+ Scalar c = Scalar(cos((2*nfft-8*i)*phinc));
+ Scalar s = Scalar(sin((2*nfft-8*i)*phinc));
+ m_twiddles[i] = Complex(s, c*flip);
+ m_twiddles[nfft-i] = Complex(s, -c*flip);
+ }
+ for (;i*8<3*nfft;++i)
+ {
+ Scalar c = Scalar(cos((8*i-2*nfft)*phinc));
+ Scalar s = Scalar(sin((8*i-2*nfft)*phinc));
+ m_twiddles[i] = Complex(-s, c*flip);
+ m_twiddles[nfft-i] = Complex(-s, -c*flip);
+ }
+ for (;i*2<nfft;++i)
+ {
+ Scalar c = Scalar(cos((4*nfft-8*i)*phinc));
+ Scalar s = Scalar(sin((4*nfft-8*i)*phinc));
+ m_twiddles[i] = Complex(-c, s*flip);
+ m_twiddles[nfft-i] = Complex(-c, -s*flip);
+ }
+ }
+
+ void factorize(int nfft)
+ {
+ //start factoring out 4's, then 2's, then 3,5,7,9,...
+ int n= nfft;
+ int p=4;
+ do {
+ while (n % p) {
+ switch (p) {
+ case 4: p = 2; break;
+ case 2: p = 3; break;
+ default: p += 2; break;
+ }
+ if (p*p>n)
+ p=n;// impossible to have a factor > sqrt(n)
+ }
+ n /= p;
+ m_stageRadix.push_back(p);
+ m_stageRemainder.push_back(n);
+ if ( p > 5 )
+ m_scratchBuf.resize(p); // scratchbuf will be needed in bfly_generic
+ }while(n>1);
+ }
+
+ template <typename _Src>
+ inline
+ void work( int stage,Complex * xout, const _Src * xin, size_t fstride,size_t in_stride)
+ {
+ int p = m_stageRadix[stage];
+ int m = m_stageRemainder[stage];
+ Complex * Fout_beg = xout;
+ Complex * Fout_end = xout + p*m;
+
+ if (m>1) {
+ do{
+ // recursive call:
+ // DFT of size m*p performed by doing
+ // p instances of smaller DFTs of size m,
+ // each one takes a decimated version of the input
+ work(stage+1, xout , xin, fstride*p,in_stride);
+ xin += fstride*in_stride;
+ }while( (xout += m) != Fout_end );
+ }else{
+ do{
+ *xout = *xin;
+ xin += fstride*in_stride;
+ }while(++xout != Fout_end );
+ }
+ xout=Fout_beg;
+
+ // recombine the p smaller DFTs
+ switch (p) {
+ case 2: bfly2(xout,fstride,m); break;
+ case 3: bfly3(xout,fstride,m); break;
+ case 4: bfly4(xout,fstride,m); break;
+ case 5: bfly5(xout,fstride,m); break;
+ default: bfly_generic(xout,fstride,m,p); break;
+ }
+ }
+
+ inline
+ void bfly2( Complex * Fout, const size_t fstride, int m)
+ {
+ for (int k=0;k<m;++k) {
+ Complex t = Fout[m+k] * m_twiddles[k*fstride];
+ Fout[m+k] = Fout[k] - t;
+ Fout[k] += t;
+ }
+ }
+
+ inline
+ void bfly4( Complex * Fout, const size_t fstride, const size_t m)
+ {
+ Complex scratch[6];
+ int negative_if_inverse = m_inverse * -2 +1;
+ for (size_t k=0;k<m;++k) {
+ scratch[0] = Fout[k+m] * m_twiddles[k*fstride];
+ scratch[1] = Fout[k+2*m] * m_twiddles[k*fstride*2];
+ scratch[2] = Fout[k+3*m] * m_twiddles[k*fstride*3];
+ scratch[5] = Fout[k] - scratch[1];
+
+ Fout[k] += scratch[1];
+ scratch[3] = scratch[0] + scratch[2];
+ scratch[4] = scratch[0] - scratch[2];
+ scratch[4] = Complex( scratch[4].imag()*negative_if_inverse , -scratch[4].real()* negative_if_inverse );
+
+ Fout[k+2*m] = Fout[k] - scratch[3];
+ Fout[k] += scratch[3];
+ Fout[k+m] = scratch[5] + scratch[4];
+ Fout[k+3*m] = scratch[5] - scratch[4];
+ }
+ }
+
+ inline
+ void bfly3( Complex * Fout, const size_t fstride, const size_t m)
+ {
+ size_t k=m;
+ const size_t m2 = 2*m;
+ Complex *tw1,*tw2;
+ Complex scratch[5];
+ Complex epi3;
+ epi3 = m_twiddles[fstride*m];
+
+ tw1=tw2=&m_twiddles[0];
+
+ do{
+ scratch[1]=Fout[m] * *tw1;
+ scratch[2]=Fout[m2] * *tw2;
+
+ scratch[3]=scratch[1]+scratch[2];
+ scratch[0]=scratch[1]-scratch[2];
+ tw1 += fstride;
+ tw2 += fstride*2;
+ Fout[m] = Complex( Fout->real() - Scalar(.5)*scratch[3].real() , Fout->imag() - Scalar(.5)*scratch[3].imag() );
+ scratch[0] *= epi3.imag();
+ *Fout += scratch[3];
+ Fout[m2] = Complex( Fout[m].real() + scratch[0].imag() , Fout[m].imag() - scratch[0].real() );
+ Fout[m] += Complex( -scratch[0].imag(),scratch[0].real() );
+ ++Fout;
+ }while(--k);
+ }
+
+ inline
+ void bfly5( Complex * Fout, const size_t fstride, const size_t m)
+ {
+ Complex *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
+ size_t u;
+ Complex scratch[13];
+ Complex * twiddles = &m_twiddles[0];
+ Complex *tw;
+ Complex ya,yb;
+ ya = twiddles[fstride*m];
+ yb = twiddles[fstride*2*m];
+
+ Fout0=Fout;
+ Fout1=Fout0+m;
+ Fout2=Fout0+2*m;
+ Fout3=Fout0+3*m;
+ Fout4=Fout0+4*m;
+
+ tw=twiddles;
+ for ( u=0; u<m; ++u ) {
+ scratch[0] = *Fout0;
+
+ scratch[1] = *Fout1 * tw[u*fstride];
+ scratch[2] = *Fout2 * tw[2*u*fstride];
+ scratch[3] = *Fout3 * tw[3*u*fstride];
+ scratch[4] = *Fout4 * tw[4*u*fstride];
+
+ scratch[7] = scratch[1] + scratch[4];
+ scratch[10] = scratch[1] - scratch[4];
+ scratch[8] = scratch[2] + scratch[3];
+ scratch[9] = scratch[2] - scratch[3];
+
+ *Fout0 += scratch[7];
+ *Fout0 += scratch[8];
+
+ scratch[5] = scratch[0] + Complex(
+ (scratch[7].real()*ya.real() ) + (scratch[8].real() *yb.real() ),
+ (scratch[7].imag()*ya.real()) + (scratch[8].imag()*yb.real())
+ );
+
+ scratch[6] = Complex(
+ (scratch[10].imag()*ya.imag()) + (scratch[9].imag()*yb.imag()),
+ -(scratch[10].real()*ya.imag()) - (scratch[9].real()*yb.imag())
+ );
+
+ *Fout1 = scratch[5] - scratch[6];
+ *Fout4 = scratch[5] + scratch[6];
+
+ scratch[11] = scratch[0] +
+ Complex(
+ (scratch[7].real()*yb.real()) + (scratch[8].real()*ya.real()),
+ (scratch[7].imag()*yb.real()) + (scratch[8].imag()*ya.real())
+ );
+
+ scratch[12] = Complex(
+ -(scratch[10].imag()*yb.imag()) + (scratch[9].imag()*ya.imag()),
+ (scratch[10].real()*yb.imag()) - (scratch[9].real()*ya.imag())
+ );
+
+ *Fout2=scratch[11]+scratch[12];
+ *Fout3=scratch[11]-scratch[12];
+
+ ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
+ }
+ }
+
+ /* perform the butterfly for one stage of a mixed radix FFT */
+ inline
+ void bfly_generic(
+ Complex * Fout,
+ const size_t fstride,
+ int m,
+ int p
+ )
+ {
+ int u,k,q1,q;
+ Complex * twiddles = &m_twiddles[0];
+ Complex t;
+ int Norig = static_cast<int>(m_twiddles.size());
+ Complex * scratchbuf = &m_scratchBuf[0];
+
+ for ( u=0; u<m; ++u ) {
+ k=u;
+ for ( q1=0 ; q1<p ; ++q1 ) {
+ scratchbuf[q1] = Fout[ k ];
+ k += m;
+ }
+
+ k=u;
+ for ( q1=0 ; q1<p ; ++q1 ) {
+ int twidx=0;
+ Fout[ k ] = scratchbuf[0];
+ for (q=1;q<p;++q ) {
+ twidx += static_cast<int>(fstride) * k;
+ if (twidx>=Norig) twidx-=Norig;
+ t=scratchbuf[q] * twiddles[twidx];
+ Fout[ k ] += t;
+ }
+ k += m;
+ }
+ }
+ }
+};
+
+template <typename _Scalar>
+struct kissfft_impl
+{
+ typedef _Scalar Scalar;
+ typedef std::complex<Scalar> Complex;
+
+ void clear()
+ {
+ m_plans.clear();
+ m_realTwiddles.clear();
+ }
+
+ inline
+ void fwd( Complex * dst,const Complex *src,int nfft)
+ {
+ get_plan(nfft,false).work(0, dst, src, 1,1);
+ }
+
+ inline
+ void fwd2( Complex * dst,const Complex *src,int n0,int n1)
+ {
+ EIGEN_UNUSED_VARIABLE(dst);
+ EIGEN_UNUSED_VARIABLE(src);
+ EIGEN_UNUSED_VARIABLE(n0);
+ EIGEN_UNUSED_VARIABLE(n1);
+ }
+
+ inline
+ void inv2( Complex * dst,const Complex *src,int n0,int n1)
+ {
+ EIGEN_UNUSED_VARIABLE(dst);
+ EIGEN_UNUSED_VARIABLE(src);
+ EIGEN_UNUSED_VARIABLE(n0);
+ EIGEN_UNUSED_VARIABLE(n1);
+ }
+
+ // real-to-complex forward FFT
+ // perform two FFTs of src even and src odd
+ // then twiddle to recombine them into the half-spectrum format
+ // then fill in the conjugate symmetric half
+ inline
+ void fwd( Complex * dst,const Scalar * src,int nfft)
+ {
+ if ( nfft&3 ) {
+ // use generic mode for odd
+ m_tmpBuf1.resize(nfft);
+ get_plan(nfft,false).work(0, &m_tmpBuf1[0], src, 1,1);
+ std::copy(m_tmpBuf1.begin(),m_tmpBuf1.begin()+(nfft>>1)+1,dst );
+ }else{
+ int ncfft = nfft>>1;
+ int ncfft2 = nfft>>2;
+ Complex * rtw = real_twiddles(ncfft2);
+
+ // use optimized mode for even real
+ fwd( dst, reinterpret_cast<const Complex*> (src), ncfft);
+ Complex dc(dst[0].real() + dst[0].imag());
+ Complex nyquist(dst[0].real() - dst[0].imag());
+ int k;
+ for ( k=1;k <= ncfft2 ; ++k ) {
+ Complex fpk = dst[k];
+ Complex fpnk = conj(dst[ncfft-k]);
+ Complex f1k = fpk + fpnk;
+ Complex f2k = fpk - fpnk;
+ Complex tw= f2k * rtw[k-1];
+ dst[k] = (f1k + tw) * Scalar(.5);
+ dst[ncfft-k] = conj(f1k -tw)*Scalar(.5);
+ }
+ dst[0] = dc;
+ dst[ncfft] = nyquist;
+ }
+ }
+
+ // inverse complex-to-complex
+ inline
+ void inv(Complex * dst,const Complex *src,int nfft)
+ {
+ get_plan(nfft,true).work(0, dst, src, 1,1);
+ }
+
+ // half-complex to scalar
+ inline
+ void inv( Scalar * dst,const Complex * src,int nfft)
+ {
+ if (nfft&3) {
+ m_tmpBuf1.resize(nfft);
+ m_tmpBuf2.resize(nfft);
+ std::copy(src,src+(nfft>>1)+1,m_tmpBuf1.begin() );
+ for (int k=1;k<(nfft>>1)+1;++k)
+ m_tmpBuf1[nfft-k] = conj(m_tmpBuf1[k]);
+ inv(&m_tmpBuf2[0],&m_tmpBuf1[0],nfft);
+ for (int k=0;k<nfft;++k)
+ dst[k] = m_tmpBuf2[k].real();
+ }else{
+ // optimized version for multiple of 4
+ int ncfft = nfft>>1;
+ int ncfft2 = nfft>>2;
+ Complex * rtw = real_twiddles(ncfft2);
+ m_tmpBuf1.resize(ncfft);
+ m_tmpBuf1[0] = Complex( src[0].real() + src[ncfft].real(), src[0].real() - src[ncfft].real() );
+ for (int k = 1; k <= ncfft / 2; ++k) {
+ Complex fk = src[k];
+ Complex fnkc = conj(src[ncfft-k]);
+ Complex fek = fk + fnkc;
+ Complex tmp = fk - fnkc;
+ Complex fok = tmp * conj(rtw[k-1]);
+ m_tmpBuf1[k] = fek + fok;
+ m_tmpBuf1[ncfft-k] = conj(fek - fok);
+ }
+ get_plan(ncfft,true).work(0, reinterpret_cast<Complex*>(dst), &m_tmpBuf1[0], 1,1);
+ }
+ }
+
+ protected:
+ typedef kiss_cpx_fft<Scalar> PlanData;
+ typedef std::map<int,PlanData> PlanMap;
+
+ PlanMap m_plans;
+ std::map<int, std::vector<Complex> > m_realTwiddles;
+ std::vector<Complex> m_tmpBuf1;
+ std::vector<Complex> m_tmpBuf2;
+
+ inline
+ int PlanKey(int nfft, bool isinverse) const { return (nfft<<1) | int(isinverse); }
+
+ inline
+ PlanData & get_plan(int nfft, bool inverse)
+ {
+ // TODO look for PlanKey(nfft, ! inverse) and conjugate the twiddles
+ PlanData & pd = m_plans[ PlanKey(nfft,inverse) ];
+ if ( pd.m_twiddles.size() == 0 ) {
+ pd.make_twiddles(nfft,inverse);
+ pd.factorize(nfft);
+ }
+ return pd;
+ }
+
+ inline
+ Complex * real_twiddles(int ncfft2)
+ {
+ using std::acos;
+ std::vector<Complex> & twidref = m_realTwiddles[ncfft2];// creates new if not there
+ if ( (int)twidref.size() != ncfft2 ) {
+ twidref.resize(ncfft2);
+ int ncfft= ncfft2<<1;
+ Scalar pi = acos( Scalar(-1) );
+ for (int k=1;k<=ncfft2;++k)
+ twidref[k-1] = exp( Complex(0,-pi * (Scalar(k) / ncfft + Scalar(.5)) ) );
+ }
+ return &twidref[0];
+ }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
diff --git a/src/EigenUnsupported/src/IterativeSolvers/ConstrainedConjGrad.h b/src/EigenUnsupported/src/IterativeSolvers/ConstrainedConjGrad.h
new file mode 100644
index 0000000..e7d70f3
--- /dev/null
+++ b/src/EigenUnsupported/src/IterativeSolvers/ConstrainedConjGrad.h
@@ -0,0 +1,187 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+
+/* NOTE The functions of this file have been adapted from the GMM++ library */
+
+//========================================================================
+//
+// Copyright (C) 2002-2007 Yves Renard
+//
+// This file is a part of GETFEM++
+//
+// Getfem++ is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; version 2.1 of the License.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Lesser General Public License for more details.
+// You should have received a copy of the GNU Lesser General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301,
+// USA.
+//
+//========================================================================
+
+#include "../../../../Eigen/src/Core/util/NonMPL2.h"
+
+#ifndef EIGEN_CONSTRAINEDCG_H
+#define EIGEN_CONSTRAINEDCG_H
+
+#include "../../../../Eigen/Core"
+
+namespace Eigen {
+
+namespace internal {
+
+/** \ingroup IterativeLinearSolvers_Module
+ * Compute the pseudo inverse of the non-square matrix C such that
+ * \f$ CINV = (C * C^T)^{-1} * C \f$ based on a conjugate gradient method.
+ *
+ * This function is internally used by constrained_cg.
+ */
+template <typename CMatrix, typename CINVMatrix>
+void pseudo_inverse(const CMatrix &C, CINVMatrix &CINV)
+{
+ // optimisable : copie de la ligne, precalcul de C * trans(C).
+ typedef typename CMatrix::Scalar Scalar;
+ typedef typename CMatrix::Index Index;
+ // FIXME use sparse vectors ?
+ typedef Matrix<Scalar,Dynamic,1> TmpVec;
+
+ Index rows = C.rows(), cols = C.cols();
+
+ TmpVec d(rows), e(rows), l(cols), p(rows), q(rows), r(rows);
+ Scalar rho, rho_1, alpha;
+ d.setZero();
+
+ typedef Triplet<double> T;
+ std::vector<T> tripletList;
+
+ for (Index i = 0; i < rows; ++i)
+ {
+ d[i] = 1.0;
+ rho = 1.0;
+ e.setZero();
+ r = d;
+ p = d;
+
+ while (rho >= 1e-38)
+ { /* conjugate gradient to compute e */
+ /* which is the i-th row of inv(C * trans(C)) */
+ l = C.transpose() * p;
+ q = C * l;
+ alpha = rho / p.dot(q);
+ e += alpha * p;
+ r += -alpha * q;
+ rho_1 = rho;
+ rho = r.dot(r);
+ p = (rho/rho_1) * p + r;
+ }
+
+ l = C.transpose() * e; // l is the i-th row of CINV
+ // FIXME add a generic "prune/filter" expression for both dense and sparse object to sparse
+ for (Index j=0; j<l.size(); ++j)
+ if (l[j]<1e-15)
+ tripletList.push_back(T(i,j,l(j)));
+
+
+ d[i] = 0.0;
+ }
+ CINV.setFromTriplets(tripletList.begin(), tripletList.end());
+}
+
+
+
+/** \ingroup IterativeLinearSolvers_Module
+ * Constrained conjugate gradient
+ *
+ * Computes the minimum of \f$ 1/2((Ax).x) - bx \f$ under the constraint \f$ Cx \le f \f$
+ */
+template<typename TMatrix, typename CMatrix,
+ typename VectorX, typename VectorB, typename VectorF>
+void constrained_cg(const TMatrix& A, const CMatrix& C, VectorX& x,
+ const VectorB& b, const VectorF& f, IterationController &iter)
+{
+ using std::sqrt;
+ typedef typename TMatrix::Scalar Scalar;
+ typedef typename TMatrix::Index Index;
+ typedef Matrix<Scalar,Dynamic,1> TmpVec;
+
+ Scalar rho = 1.0, rho_1, lambda, gamma;
+ Index xSize = x.size();
+ TmpVec p(xSize), q(xSize), q2(xSize),
+ r(xSize), old_z(xSize), z(xSize),
+ memox(xSize);
+ std::vector<bool> satured(C.rows());
+ p.setZero();
+ iter.setRhsNorm(sqrt(b.dot(b))); // gael vect_sp(PS, b, b)
+ if (iter.rhsNorm() == 0.0) iter.setRhsNorm(1.0);
+
+ SparseMatrix<Scalar,RowMajor> CINV(C.rows(), C.cols());
+ pseudo_inverse(C, CINV);
+
+ while(true)
+ {
+ // computation of residual
+ old_z = z;
+ memox = x;
+ r = b;
+ r += A * -x;
+ z = r;
+ bool transition = false;
+ for (Index i = 0; i < C.rows(); ++i)
+ {
+ Scalar al = C.row(i).dot(x) - f.coeff(i);
+ if (al >= -1.0E-15)
+ {
+ if (!satured[i])
+ {
+ satured[i] = true;
+ transition = true;
+ }
+ Scalar bb = CINV.row(i).dot(z);
+ if (bb > 0.0)
+ // FIXME: we should allow that: z += -bb * C.row(i);
+ for (typename CMatrix::InnerIterator it(C,i); it; ++it)
+ z.coeffRef(it.index()) -= bb*it.value();
+ }
+ else
+ satured[i] = false;
+ }
+
+ // descent direction
+ rho_1 = rho;
+ rho = r.dot(z);
+
+ if (iter.finished(rho)) break;
+ if (transition || iter.first()) gamma = 0.0;
+ else gamma = (std::max)(0.0, (rho - old_z.dot(z)) / rho_1);
+ p = z + gamma*p;
+
+ ++iter;
+ // one dimensionnal optimization
+ q = A * p;
+ lambda = rho / q.dot(p);
+ for (Index i = 0; i < C.rows(); ++i)
+ {
+ if (!satured[i])
+ {
+ Scalar bb = C.row(i).dot(p) - f[i];
+ if (bb > 0.0)
+ lambda = (std::min)(lambda, (f.coeff(i)-C.row(i).dot(x)) / bb);
+ }
+ }
+ x += lambda * p;
+ memox -= x;
+ }
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_CONSTRAINEDCG_H
diff --git a/src/EigenUnsupported/src/IterativeSolvers/DGMRES.h b/src/EigenUnsupported/src/IterativeSolvers/DGMRES.h
new file mode 100644
index 0000000..5ae011b
--- /dev/null
+++ b/src/EigenUnsupported/src/IterativeSolvers/DGMRES.h
@@ -0,0 +1,511 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_DGMRES_H
+#define EIGEN_DGMRES_H
+
+#include "../../../../Eigen/Eigenvalues"
+
+namespace Eigen {
+
+template< typename _MatrixType,
+ typename _Preconditioner = DiagonalPreconditioner<typename _MatrixType::Scalar> >
+class DGMRES;
+
+namespace internal {
+
+template< typename _MatrixType, typename _Preconditioner>
+struct traits<DGMRES<_MatrixType,_Preconditioner> >
+{
+ typedef _MatrixType MatrixType;
+ typedef _Preconditioner Preconditioner;
+};
+
+/** \brief Computes a permutation vector to have a sorted sequence
+ * \param vec The vector to reorder.
+ * \param perm gives the sorted sequence on output. Must be initialized with 0..n-1
+ * \param ncut Put the ncut smallest elements at the end of the vector
+ * WARNING This is an expensive sort, so should be used only
+ * for small size vectors
+ * TODO Use modified QuickSplit or std::nth_element to get the smallest values
+ */
+template <typename VectorType, typename IndexType>
+void sortWithPermutation (VectorType& vec, IndexType& perm, typename IndexType::Scalar& ncut)
+{
+ eigen_assert(vec.size() == perm.size());
+ bool flag;
+ for (Index k = 0; k < ncut; k++)
+ {
+ flag = false;
+ for (Index j = 0; j < vec.size()-1; j++)
+ {
+ if ( vec(perm(j)) < vec(perm(j+1)) )
+ {
+ std::swap(perm(j),perm(j+1));
+ flag = true;
+ }
+ if (!flag) break; // The vector is in sorted order
+ }
+ }
+}
+
+}
+/**
+ * \ingroup IterativeLinearSolvers_Module
+ * \brief A Restarted GMRES with deflation.
+ * This class implements a modification of the GMRES solver for
+ * sparse linear systems. The basis is built with modified
+ * Gram-Schmidt. At each restart, a few approximated eigenvectors
+ * corresponding to the smallest eigenvalues are used to build a
+ * preconditioner for the next cycle. This preconditioner
+ * for deflation can be combined with any other preconditioner,
+ * the IncompleteLUT for instance. The preconditioner is applied
+ * at right of the matrix and the combination is multiplicative.
+ *
+ * \tparam _MatrixType the type of the sparse matrix A, can be a dense or a sparse matrix.
+ * \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner
+ * Typical usage :
+ * \code
+ * SparseMatrix<double> A;
+ * VectorXd x, b;
+ * //Fill A and b ...
+ * DGMRES<SparseMatrix<double> > solver;
+ * solver.set_restart(30); // Set restarting value
+ * solver.setEigenv(1); // Set the number of eigenvalues to deflate
+ * solver.compute(A);
+ * x = solver.solve(b);
+ * \endcode
+ *
+ * DGMRES can also be used in a matrix-free context, see the following \link MatrixfreeSolverExample example \endlink.
+ *
+ * References :
+ * [1] D. NUENTSA WAKAM and F. PACULL, Memory Efficient Hybrid
+ * Algebraic Solvers for Linear Systems Arising from Compressible
+ * Flows, Computers and Fluids, In Press,
+ * https://doi.org/10.1016/j.compfluid.2012.03.023
+ * [2] K. Burrage and J. Erhel, On the performance of various
+ * adaptive preconditioned GMRES strategies, 5(1998), 101-121.
+ * [3] J. Erhel, K. Burrage and B. Pohl, Restarted GMRES
+ * preconditioned by deflation,J. Computational and Applied
+ * Mathematics, 69(1996), 303-318.
+
+ *
+ */
+template< typename _MatrixType, typename _Preconditioner>
+class DGMRES : public IterativeSolverBase<DGMRES<_MatrixType,_Preconditioner> >
+{
+ typedef IterativeSolverBase<DGMRES> Base;
+ using Base::matrix;
+ using Base::m_error;
+ using Base::m_iterations;
+ using Base::m_info;
+ using Base::m_isInitialized;
+ using Base::m_tolerance;
+ public:
+ using Base::_solve_impl;
+ using Base::_solve_with_guess_impl;
+ typedef _MatrixType MatrixType;
+ typedef typename MatrixType::Scalar Scalar;
+ typedef typename MatrixType::StorageIndex StorageIndex;
+ typedef typename MatrixType::RealScalar RealScalar;
+ typedef _Preconditioner Preconditioner;
+ typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
+ typedef Matrix<RealScalar,Dynamic,Dynamic> DenseRealMatrix;
+ typedef Matrix<Scalar,Dynamic,1> DenseVector;
+ typedef Matrix<RealScalar,Dynamic,1> DenseRealVector;
+ typedef Matrix<std::complex<RealScalar>, Dynamic, 1> ComplexVector;
+
+
+ /** Default constructor. */
+ DGMRES() : Base(),m_restart(30),m_neig(0),m_r(0),m_maxNeig(5),m_isDeflAllocated(false),m_isDeflInitialized(false) {}
+
+ /** Initialize the solver with matrix \a A for further \c Ax=b solving.
+ *
+ * This constructor is a shortcut for the default constructor followed
+ * by a call to compute().
+ *
+ * \warning this class stores a reference to the matrix A as well as some
+ * precomputed values that depend on it. Therefore, if \a A is changed
+ * this class becomes invalid. Call compute() to update it with the new
+ * matrix A, or modify a copy of A.
+ */
+ template<typename MatrixDerived>
+ explicit DGMRES(const EigenBase<MatrixDerived>& A) : Base(A.derived()), m_restart(30),m_neig(0),m_r(0),m_maxNeig(5),m_isDeflAllocated(false),m_isDeflInitialized(false) {}
+
+ ~DGMRES() {}
+
+ /** \internal */
+ template<typename Rhs,typename Dest>
+ void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const
+ {
+ EIGEN_STATIC_ASSERT(Rhs::ColsAtCompileTime==1 || Dest::ColsAtCompileTime==1, YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX);
+
+ m_iterations = Base::maxIterations();
+ m_error = Base::m_tolerance;
+
+ dgmres(matrix(), b, x, Base::m_preconditioner);
+ }
+
+ /**
+ * Get the restart value
+ */
+ Index restart() { return m_restart; }
+
+ /**
+ * Set the restart value (default is 30)
+ */
+ void set_restart(const Index restart) { m_restart=restart; }
+
+ /**
+ * Set the number of eigenvalues to deflate at each restart
+ */
+ void setEigenv(const Index neig)
+ {
+ m_neig = neig;
+ if (neig+1 > m_maxNeig) m_maxNeig = neig+1; // To allow for complex conjugates
+ }
+
+ /**
+ * Get the size of the deflation subspace size
+ */
+ Index deflSize() {return m_r; }
+
+ /**
+ * Set the maximum size of the deflation subspace
+ */
+ void setMaxEigenv(const Index maxNeig) { m_maxNeig = maxNeig; }
+
+ protected:
+ // DGMRES algorithm
+ template<typename Rhs, typename Dest>
+ void dgmres(const MatrixType& mat,const Rhs& rhs, Dest& x, const Preconditioner& precond) const;
+ // Perform one cycle of GMRES
+ template<typename Dest>
+ Index dgmresCycle(const MatrixType& mat, const Preconditioner& precond, Dest& x, DenseVector& r0, RealScalar& beta, const RealScalar& normRhs, Index& nbIts) const;
+ // Compute data to use for deflation
+ Index dgmresComputeDeflationData(const MatrixType& mat, const Preconditioner& precond, const Index& it, StorageIndex& neig) const;
+ // Apply deflation to a vector
+ template<typename RhsType, typename DestType>
+ Index dgmresApplyDeflation(const RhsType& In, DestType& Out) const;
+ ComplexVector schurValues(const ComplexSchur<DenseMatrix>& schurofH) const;
+ ComplexVector schurValues(const RealSchur<DenseMatrix>& schurofH) const;
+ // Init data for deflation
+ void dgmresInitDeflation(Index& rows) const;
+ mutable DenseMatrix m_V; // Krylov basis vectors
+ mutable DenseMatrix m_H; // Hessenberg matrix
+ mutable DenseMatrix m_Hes; // Initial hessenberg matrix without Givens rotations applied
+ mutable Index m_restart; // Maximum size of the Krylov subspace
+ mutable DenseMatrix m_U; // Vectors that form the basis of the invariant subspace
+ mutable DenseMatrix m_MU; // matrix operator applied to m_U (for next cycles)
+ mutable DenseMatrix m_T; /* T=U^T*M^{-1}*A*U */
+ mutable PartialPivLU<DenseMatrix> m_luT; // LU factorization of m_T
+ mutable StorageIndex m_neig; //Number of eigenvalues to extract at each restart
+ mutable Index m_r; // Current number of deflated eigenvalues, size of m_U
+ mutable Index m_maxNeig; // Maximum number of eigenvalues to deflate
+ mutable RealScalar m_lambdaN; //Modulus of the largest eigenvalue of A
+ mutable bool m_isDeflAllocated;
+ mutable bool m_isDeflInitialized;
+
+ //Adaptive strategy
+ mutable RealScalar m_smv; // Smaller multiple of the remaining number of steps allowed
+ mutable bool m_force; // Force the use of deflation at each restart
+
+};
+/**
+ * \brief Perform several cycles of restarted GMRES with modified Gram Schmidt,
+ *
+ * A right preconditioner is used combined with deflation.
+ *
+ */
+template< typename _MatrixType, typename _Preconditioner>
+template<typename Rhs, typename Dest>
+void DGMRES<_MatrixType, _Preconditioner>::dgmres(const MatrixType& mat,const Rhs& rhs, Dest& x,
+ const Preconditioner& precond) const
+{
+ const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
+
+ RealScalar normRhs = rhs.norm();
+ if(normRhs <= considerAsZero)
+ {
+ x.setZero();
+ m_error = 0;
+ return;
+ }
+
+ //Initialization
+ m_isDeflInitialized = false;
+ Index n = mat.rows();
+ DenseVector r0(n);
+ Index nbIts = 0;
+ m_H.resize(m_restart+1, m_restart);
+ m_Hes.resize(m_restart, m_restart);
+ m_V.resize(n,m_restart+1);
+ //Initial residual vector and initial norm
+ if(x.squaredNorm()==0)
+ x = precond.solve(rhs);
+ r0 = rhs - mat * x;
+ RealScalar beta = r0.norm();
+
+ m_error = beta/normRhs;
+ if(m_error < m_tolerance)
+ m_info = Success;
+ else
+ m_info = NoConvergence;
+
+ // Iterative process
+ while (nbIts < m_iterations && m_info == NoConvergence)
+ {
+ dgmresCycle(mat, precond, x, r0, beta, normRhs, nbIts);
+
+ // Compute the new residual vector for the restart
+ if (nbIts < m_iterations && m_info == NoConvergence) {
+ r0 = rhs - mat * x;
+ beta = r0.norm();
+ }
+ }
+}
+
+/**
+ * \brief Perform one restart cycle of DGMRES
+ * \param mat The coefficient matrix
+ * \param precond The preconditioner
+ * \param x the new approximated solution
+ * \param r0 The initial residual vector
+ * \param beta The norm of the residual computed so far
+ * \param normRhs The norm of the right hand side vector
+ * \param nbIts The number of iterations
+ */
+template< typename _MatrixType, typename _Preconditioner>
+template<typename Dest>
+Index DGMRES<_MatrixType, _Preconditioner>::dgmresCycle(const MatrixType& mat, const Preconditioner& precond, Dest& x, DenseVector& r0, RealScalar& beta, const RealScalar& normRhs, Index& nbIts) const
+{
+ //Initialization
+ DenseVector g(m_restart+1); // Right hand side of the least square problem
+ g.setZero();
+ g(0) = Scalar(beta);
+ m_V.col(0) = r0/beta;
+ m_info = NoConvergence;
+ std::vector<JacobiRotation<Scalar> >gr(m_restart); // Givens rotations
+ Index it = 0; // Number of inner iterations
+ Index n = mat.rows();
+ DenseVector tv1(n), tv2(n); //Temporary vectors
+ while (m_info == NoConvergence && it < m_restart && nbIts < m_iterations)
+ {
+ // Apply preconditioner(s) at right
+ if (m_isDeflInitialized )
+ {
+ dgmresApplyDeflation(m_V.col(it), tv1); // Deflation
+ tv2 = precond.solve(tv1);
+ }
+ else
+ {
+ tv2 = precond.solve(m_V.col(it)); // User's selected preconditioner
+ }
+ tv1 = mat * tv2;
+
+ // Orthogonalize it with the previous basis in the basis using modified Gram-Schmidt
+ Scalar coef;
+ for (Index i = 0; i <= it; ++i)
+ {
+ coef = tv1.dot(m_V.col(i));
+ tv1 = tv1 - coef * m_V.col(i);
+ m_H(i,it) = coef;
+ m_Hes(i,it) = coef;
+ }
+ // Normalize the vector
+ coef = tv1.norm();
+ m_V.col(it+1) = tv1/coef;
+ m_H(it+1, it) = coef;
+// m_Hes(it+1,it) = coef;
+
+ // FIXME Check for happy breakdown
+
+ // Update Hessenberg matrix with Givens rotations
+ for (Index i = 1; i <= it; ++i)
+ {
+ m_H.col(it).applyOnTheLeft(i-1,i,gr[i-1].adjoint());
+ }
+ // Compute the new plane rotation
+ gr[it].makeGivens(m_H(it, it), m_H(it+1,it));
+ // Apply the new rotation
+ m_H.col(it).applyOnTheLeft(it,it+1,gr[it].adjoint());
+ g.applyOnTheLeft(it,it+1, gr[it].adjoint());
+
+ beta = std::abs(g(it+1));
+ m_error = beta/normRhs;
+ // std::cerr << nbIts << " Relative Residual Norm " << m_error << std::endl;
+ it++; nbIts++;
+
+ if (m_error < m_tolerance)
+ {
+ // The method has converged
+ m_info = Success;
+ break;
+ }
+ }
+
+ // Compute the new coefficients by solving the least square problem
+// it++;
+ //FIXME Check first if the matrix is singular ... zero diagonal
+ DenseVector nrs(m_restart);
+ nrs = m_H.topLeftCorner(it,it).template triangularView<Upper>().solve(g.head(it));
+
+ // Form the new solution
+ if (m_isDeflInitialized)
+ {
+ tv1 = m_V.leftCols(it) * nrs;
+ dgmresApplyDeflation(tv1, tv2);
+ x = x + precond.solve(tv2);
+ }
+ else
+ x = x + precond.solve(m_V.leftCols(it) * nrs);
+
+ // Go for a new cycle and compute data for deflation
+ if(nbIts < m_iterations && m_info == NoConvergence && m_neig > 0 && (m_r+m_neig) < m_maxNeig)
+ dgmresComputeDeflationData(mat, precond, it, m_neig);
+ return 0;
+
+}
+
+
+template< typename _MatrixType, typename _Preconditioner>
+void DGMRES<_MatrixType, _Preconditioner>::dgmresInitDeflation(Index& rows) const
+{
+ m_U.resize(rows, m_maxNeig);
+ m_MU.resize(rows, m_maxNeig);
+ m_T.resize(m_maxNeig, m_maxNeig);
+ m_lambdaN = 0.0;
+ m_isDeflAllocated = true;
+}
+
+template< typename _MatrixType, typename _Preconditioner>
+inline typename DGMRES<_MatrixType, _Preconditioner>::ComplexVector DGMRES<_MatrixType, _Preconditioner>::schurValues(const ComplexSchur<DenseMatrix>& schurofH) const
+{
+ return schurofH.matrixT().diagonal();
+}
+
+template< typename _MatrixType, typename _Preconditioner>
+inline typename DGMRES<_MatrixType, _Preconditioner>::ComplexVector DGMRES<_MatrixType, _Preconditioner>::schurValues(const RealSchur<DenseMatrix>& schurofH) const
+{
+ const DenseMatrix& T = schurofH.matrixT();
+ Index it = T.rows();
+ ComplexVector eig(it);
+ Index j = 0;
+ while (j < it-1)
+ {
+ if (T(j+1,j) ==Scalar(0))
+ {
+ eig(j) = std::complex<RealScalar>(T(j,j),RealScalar(0));
+ j++;
+ }
+ else
+ {
+ eig(j) = std::complex<RealScalar>(T(j,j),T(j+1,j));
+ eig(j+1) = std::complex<RealScalar>(T(j,j+1),T(j+1,j+1));
+ j++;
+ }
+ }
+ if (j < it-1) eig(j) = std::complex<RealScalar>(T(j,j),RealScalar(0));
+ return eig;
+}
+
+template< typename _MatrixType, typename _Preconditioner>
+Index DGMRES<_MatrixType, _Preconditioner>::dgmresComputeDeflationData(const MatrixType& mat, const Preconditioner& precond, const Index& it, StorageIndex& neig) const
+{
+ // First, find the Schur form of the Hessenberg matrix H
+ typename internal::conditional<NumTraits<Scalar>::IsComplex, ComplexSchur<DenseMatrix>, RealSchur<DenseMatrix> >::type schurofH;
+ bool computeU = true;
+ DenseMatrix matrixQ(it,it);
+ matrixQ.setIdentity();
+ schurofH.computeFromHessenberg(m_Hes.topLeftCorner(it,it), matrixQ, computeU);
+
+ ComplexVector eig(it);
+ Matrix<StorageIndex,Dynamic,1>perm(it);
+ eig = this->schurValues(schurofH);
+
+ // Reorder the absolute values of Schur values
+ DenseRealVector modulEig(it);
+ for (Index j=0; j<it; ++j) modulEig(j) = std::abs(eig(j));
+ perm.setLinSpaced(it,0,internal::convert_index<StorageIndex>(it-1));
+ internal::sortWithPermutation(modulEig, perm, neig);
+
+ if (!m_lambdaN)
+ {
+ m_lambdaN = (std::max)(modulEig.maxCoeff(), m_lambdaN);
+ }
+ //Count the real number of extracted eigenvalues (with complex conjugates)
+ Index nbrEig = 0;
+ while (nbrEig < neig)
+ {
+ if(eig(perm(it-nbrEig-1)).imag() == RealScalar(0)) nbrEig++;
+ else nbrEig += 2;
+ }
+ // Extract the Schur vectors corresponding to the smallest Ritz values
+ DenseMatrix Sr(it, nbrEig);
+ Sr.setZero();
+ for (Index j = 0; j < nbrEig; j++)
+ {
+ Sr.col(j) = schurofH.matrixU().col(perm(it-j-1));
+ }
+
+ // Form the Schur vectors of the initial matrix using the Krylov basis
+ DenseMatrix X;
+ X = m_V.leftCols(it) * Sr;
+ if (m_r)
+ {
+ // Orthogonalize X against m_U using modified Gram-Schmidt
+ for (Index j = 0; j < nbrEig; j++)
+ for (Index k =0; k < m_r; k++)
+ X.col(j) = X.col(j) - (m_U.col(k).dot(X.col(j)))*m_U.col(k);
+ }
+
+ // Compute m_MX = A * M^-1 * X
+ Index m = m_V.rows();
+ if (!m_isDeflAllocated)
+ dgmresInitDeflation(m);
+ DenseMatrix MX(m, nbrEig);
+ DenseVector tv1(m);
+ for (Index j = 0; j < nbrEig; j++)
+ {
+ tv1 = mat * X.col(j);
+ MX.col(j) = precond.solve(tv1);
+ }
+
+ //Update m_T = [U'MU U'MX; X'MU X'MX]
+ m_T.block(m_r, m_r, nbrEig, nbrEig) = X.transpose() * MX;
+ if(m_r)
+ {
+ m_T.block(0, m_r, m_r, nbrEig) = m_U.leftCols(m_r).transpose() * MX;
+ m_T.block(m_r, 0, nbrEig, m_r) = X.transpose() * m_MU.leftCols(m_r);
+ }
+
+ // Save X into m_U and m_MX in m_MU
+ for (Index j = 0; j < nbrEig; j++) m_U.col(m_r+j) = X.col(j);
+ for (Index j = 0; j < nbrEig; j++) m_MU.col(m_r+j) = MX.col(j);
+ // Increase the size of the invariant subspace
+ m_r += nbrEig;
+
+ // Factorize m_T into m_luT
+ m_luT.compute(m_T.topLeftCorner(m_r, m_r));
+
+ //FIXME CHeck if the factorization was correctly done (nonsingular matrix)
+ m_isDeflInitialized = true;
+ return 0;
+}
+template<typename _MatrixType, typename _Preconditioner>
+template<typename RhsType, typename DestType>
+Index DGMRES<_MatrixType, _Preconditioner>::dgmresApplyDeflation(const RhsType &x, DestType &y) const
+{
+ DenseVector x1 = m_U.leftCols(m_r).transpose() * x;
+ y = x + m_U.leftCols(m_r) * ( m_lambdaN * m_luT.solve(x1) - x1);
+ return 0;
+}
+
+} // end namespace Eigen
+#endif
diff --git a/src/EigenUnsupported/src/IterativeSolvers/GMRES.h b/src/EigenUnsupported/src/IterativeSolvers/GMRES.h
new file mode 100644
index 0000000..ff91209
--- /dev/null
+++ b/src/EigenUnsupported/src/IterativeSolvers/GMRES.h
@@ -0,0 +1,335 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2012, 2014 Kolja Brix <brix@igpm.rwth-aaachen.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GMRES_H
+#define EIGEN_GMRES_H
+
+namespace Eigen {
+
+namespace internal {
+
+/**
+* Generalized Minimal Residual Algorithm based on the
+* Arnoldi algorithm implemented with Householder reflections.
+*
+* Parameters:
+* \param mat matrix of linear system of equations
+* \param rhs right hand side vector of linear system of equations
+* \param x on input: initial guess, on output: solution
+* \param precond preconditioner used
+* \param iters on input: maximum number of iterations to perform
+* on output: number of iterations performed
+* \param restart number of iterations for a restart
+* \param tol_error on input: relative residual tolerance
+* on output: residuum achieved
+*
+* \sa IterativeMethods::bicgstab()
+*
+*
+* For references, please see:
+*
+* Saad, Y. and Schultz, M. H.
+* GMRES: A Generalized Minimal Residual Algorithm for Solving Nonsymmetric Linear Systems.
+* SIAM J.Sci.Stat.Comp. 7, 1986, pp. 856 - 869.
+*
+* Saad, Y.
+* Iterative Methods for Sparse Linear Systems.
+* Society for Industrial and Applied Mathematics, Philadelphia, 2003.
+*
+* Walker, H. F.
+* Implementations of the GMRES method.
+* Comput.Phys.Comm. 53, 1989, pp. 311 - 320.
+*
+* Walker, H. F.
+* Implementation of the GMRES Method using Householder Transformations.
+* SIAM J.Sci.Stat.Comp. 9, 1988, pp. 152 - 163.
+*
+*/
+template<typename MatrixType, typename Rhs, typename Dest, typename Preconditioner>
+bool gmres(const MatrixType & mat, const Rhs & rhs, Dest & x, const Preconditioner & precond,
+ Index &iters, const Index &restart, typename Dest::RealScalar & tol_error) {
+
+ using std::sqrt;
+ using std::abs;
+
+ typedef typename Dest::RealScalar RealScalar;
+ typedef typename Dest::Scalar Scalar;
+ typedef Matrix < Scalar, Dynamic, 1 > VectorType;
+ typedef Matrix < Scalar, Dynamic, Dynamic, ColMajor> FMatrixType;
+
+ const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
+
+ if(rhs.norm() <= considerAsZero)
+ {
+ x.setZero();
+ tol_error = 0;
+ return true;
+ }
+
+ RealScalar tol = tol_error;
+ const Index maxIters = iters;
+ iters = 0;
+
+ const Index m = mat.rows();
+
+ // residual and preconditioned residual
+ VectorType p0 = rhs - mat*x;
+ VectorType r0 = precond.solve(p0);
+
+ const RealScalar r0Norm = r0.norm();
+
+ // is initial guess already good enough?
+ if(r0Norm == 0)
+ {
+ tol_error = 0;
+ return true;
+ }
+
+ // storage for Hessenberg matrix and Householder data
+ FMatrixType H = FMatrixType::Zero(m, restart + 1);
+ VectorType w = VectorType::Zero(restart + 1);
+ VectorType tau = VectorType::Zero(restart + 1);
+
+ // storage for Jacobi rotations
+ std::vector < JacobiRotation < Scalar > > G(restart);
+
+ // storage for temporaries
+ VectorType t(m), v(m), workspace(m), x_new(m);
+
+ // generate first Householder vector
+ Ref<VectorType> H0_tail = H.col(0).tail(m - 1);
+ RealScalar beta;
+ r0.makeHouseholder(H0_tail, tau.coeffRef(0), beta);
+ w(0) = Scalar(beta);
+
+ for (Index k = 1; k <= restart; ++k)
+ {
+ ++iters;
+
+ v = VectorType::Unit(m, k - 1);
+
+ // apply Householder reflections H_{1} ... H_{k-1} to v
+ // TODO: use a HouseholderSequence
+ for (Index i = k - 1; i >= 0; --i) {
+ v.tail(m - i).applyHouseholderOnTheLeft(H.col(i).tail(m - i - 1), tau.coeffRef(i), workspace.data());
+ }
+
+ // apply matrix M to v: v = mat * v;
+ t.noalias() = mat * v;
+ v = precond.solve(t);
+
+ // apply Householder reflections H_{k-1} ... H_{1} to v
+ // TODO: use a HouseholderSequence
+ for (Index i = 0; i < k; ++i) {
+ v.tail(m - i).applyHouseholderOnTheLeft(H.col(i).tail(m - i - 1), tau.coeffRef(i), workspace.data());
+ }
+
+ if (v.tail(m - k).norm() != 0.0)
+ {
+ if (k <= restart)
+ {
+ // generate new Householder vector
+ Ref<VectorType> Hk_tail = H.col(k).tail(m - k - 1);
+ v.tail(m - k).makeHouseholder(Hk_tail, tau.coeffRef(k), beta);
+
+ // apply Householder reflection H_{k} to v
+ v.tail(m - k).applyHouseholderOnTheLeft(Hk_tail, tau.coeffRef(k), workspace.data());
+ }
+ }
+
+ if (k > 1)
+ {
+ for (Index i = 0; i < k - 1; ++i)
+ {
+ // apply old Givens rotations to v
+ v.applyOnTheLeft(i, i + 1, G[i].adjoint());
+ }
+ }
+
+ if (k<m && v(k) != (Scalar) 0)
+ {
+ // determine next Givens rotation
+ G[k - 1].makeGivens(v(k - 1), v(k));
+
+ // apply Givens rotation to v and w
+ v.applyOnTheLeft(k - 1, k, G[k - 1].adjoint());
+ w.applyOnTheLeft(k - 1, k, G[k - 1].adjoint());
+ }
+
+ // insert coefficients into upper matrix triangle
+ H.col(k-1).head(k) = v.head(k);
+
+ tol_error = abs(w(k)) / r0Norm;
+ bool stop = (k==m || tol_error < tol || iters == maxIters);
+
+ if (stop || k == restart)
+ {
+ // solve upper triangular system
+ Ref<VectorType> y = w.head(k);
+ H.topLeftCorner(k, k).template triangularView <Upper>().solveInPlace(y);
+
+ // use Horner-like scheme to calculate solution vector
+ x_new.setZero();
+ for (Index i = k - 1; i >= 0; --i)
+ {
+ x_new(i) += y(i);
+ // apply Householder reflection H_{i} to x_new
+ x_new.tail(m - i).applyHouseholderOnTheLeft(H.col(i).tail(m - i - 1), tau.coeffRef(i), workspace.data());
+ }
+
+ x += x_new;
+
+ if(stop)
+ {
+ return true;
+ }
+ else
+ {
+ k=0;
+
+ // reset data for restart
+ p0.noalias() = rhs - mat*x;
+ r0 = precond.solve(p0);
+
+ // clear Hessenberg matrix and Householder data
+ H.setZero();
+ w.setZero();
+ tau.setZero();
+
+ // generate first Householder vector
+ r0.makeHouseholder(H0_tail, tau.coeffRef(0), beta);
+ w(0) = Scalar(beta);
+ }
+ }
+ }
+
+ return false;
+
+}
+
+}
+
+template< typename _MatrixType,
+ typename _Preconditioner = DiagonalPreconditioner<typename _MatrixType::Scalar> >
+class GMRES;
+
+namespace internal {
+
+template< typename _MatrixType, typename _Preconditioner>
+struct traits<GMRES<_MatrixType,_Preconditioner> >
+{
+ typedef _MatrixType MatrixType;
+ typedef _Preconditioner Preconditioner;
+};
+
+}
+
+/** \ingroup IterativeLinearSolvers_Module
+ * \brief A GMRES solver for sparse square problems
+ *
+ * This class allows to solve for A.x = b sparse linear problems using a generalized minimal
+ * residual method. The vectors x and b can be either dense or sparse.
+ *
+ * \tparam _MatrixType the type of the sparse matrix A, can be a dense or a sparse matrix.
+ * \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner
+ *
+ * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations()
+ * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations
+ * and NumTraits<Scalar>::epsilon() for the tolerance.
+ *
+ * This class can be used as the direct solver classes. Here is a typical usage example:
+ * \code
+ * int n = 10000;
+ * VectorXd x(n), b(n);
+ * SparseMatrix<double> A(n,n);
+ * // fill A and b
+ * GMRES<SparseMatrix<double> > solver(A);
+ * x = solver.solve(b);
+ * std::cout << "#iterations: " << solver.iterations() << std::endl;
+ * std::cout << "estimated error: " << solver.error() << std::endl;
+ * // update b, and solve again
+ * x = solver.solve(b);
+ * \endcode
+ *
+ * By default the iterations start with x=0 as an initial guess of the solution.
+ * One can control the start using the solveWithGuess() method.
+ *
+ * GMRES can also be used in a matrix-free context, see the following \link MatrixfreeSolverExample example \endlink.
+ *
+ * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
+ */
+template< typename _MatrixType, typename _Preconditioner>
+class GMRES : public IterativeSolverBase<GMRES<_MatrixType,_Preconditioner> >
+{
+ typedef IterativeSolverBase<GMRES> Base;
+ using Base::matrix;
+ using Base::m_error;
+ using Base::m_iterations;
+ using Base::m_info;
+ using Base::m_isInitialized;
+
+private:
+ Index m_restart;
+
+public:
+ using Base::_solve_impl;
+ typedef _MatrixType MatrixType;
+ typedef typename MatrixType::Scalar Scalar;
+ typedef typename MatrixType::RealScalar RealScalar;
+ typedef _Preconditioner Preconditioner;
+
+public:
+
+ /** Default constructor. */
+ GMRES() : Base(), m_restart(30) {}
+
+ /** Initialize the solver with matrix \a A for further \c Ax=b solving.
+ *
+ * This constructor is a shortcut for the default constructor followed
+ * by a call to compute().
+ *
+ * \warning this class stores a reference to the matrix A as well as some
+ * precomputed values that depend on it. Therefore, if \a A is changed
+ * this class becomes invalid. Call compute() to update it with the new
+ * matrix A, or modify a copy of A.
+ */
+ template<typename MatrixDerived>
+ explicit GMRES(const EigenBase<MatrixDerived>& A) : Base(A.derived()), m_restart(30) {}
+
+ ~GMRES() {}
+
+ /** Get the number of iterations after that a restart is performed.
+ */
+ Index get_restart() { return m_restart; }
+
+ /** Set the number of iterations after that a restart is performed.
+ * \param restart number of iterations for a restarti, default is 30.
+ */
+ void set_restart(const Index restart) { m_restart=restart; }
+
+ /** \internal */
+ template<typename Rhs,typename Dest>
+ void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const
+ {
+ m_iterations = Base::maxIterations();
+ m_error = Base::m_tolerance;
+ bool ret = internal::gmres(matrix(), b, x, Base::m_preconditioner, m_iterations, m_restart, m_error);
+ m_info = (!ret) ? NumericalIssue
+ : m_error <= Base::m_tolerance ? Success
+ : NoConvergence;
+ }
+
+protected:
+
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_GMRES_H
diff --git a/src/EigenUnsupported/src/IterativeSolvers/IDRS.h b/src/EigenUnsupported/src/IterativeSolvers/IDRS.h
new file mode 100755
index 0000000..90d20fa
--- /dev/null
+++ b/src/EigenUnsupported/src/IterativeSolvers/IDRS.h
@@ -0,0 +1,436 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2020 Chris Schoutrop <c.e.m.schoutrop@tue.nl>
+// Copyright (C) 2020 Jens Wehner <j.wehner@esciencecenter.nl>
+// Copyright (C) 2020 Jan van Dijk <j.v.dijk@tue.nl>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#ifndef EIGEN_IDRS_H
+#define EIGEN_IDRS_H
+
+namespace Eigen
+{
+
+ namespace internal
+ {
+ /** \internal Low-level Induced Dimension Reduction algoritm
+ \param A The matrix A
+ \param b The right hand side vector b
+ \param x On input and initial solution, on output the computed solution.
+ \param precond A preconditioner being able to efficiently solve for an
+ approximation of Ax=b (regardless of b)
+ \param iter On input the max number of iteration, on output the number of performed iterations.
+ \param relres On input the tolerance error, on output an estimation of the relative error.
+ \param S On input Number of the dimension of the shadow space.
+ \param smoothing switches residual smoothing on.
+ \param angle small omega lead to faster convergence at the expense of numerical stability
+ \param replacement switches on a residual replacement strategy to increase accuracy of residual at the expense of more Mat*vec products
+ \return false in the case of numerical issue, for example a break down of IDRS.
+ */
+ template<typename Vector, typename RealScalar>
+ typename Vector::Scalar omega(const Vector& t, const Vector& s, RealScalar angle)
+ {
+ using numext::abs;
+ typedef typename Vector::Scalar Scalar;
+ const RealScalar ns = s.norm();
+ const RealScalar nt = t.norm();
+ const Scalar ts = t.dot(s);
+ const RealScalar rho = abs(ts / (nt * ns));
+
+ if (rho < angle) {
+ if (ts == Scalar(0)) {
+ return Scalar(0);
+ }
+ // Original relation for om is given by
+ // om = om * angle / rho;
+ // To alleviate potential (near) division by zero this can be rewritten as
+ // om = angle * (ns / nt) * (ts / abs(ts)) = angle * (ns / nt) * sgn(ts)
+ return angle * (ns / nt) * (ts / abs(ts));
+ }
+ return ts / (nt * nt);
+ }
+
+ template <typename MatrixType, typename Rhs, typename Dest, typename Preconditioner>
+ bool idrs(const MatrixType& A, const Rhs& b, Dest& x, const Preconditioner& precond,
+ Index& iter,
+ typename Dest::RealScalar& relres, Index S, bool smoothing, typename Dest::RealScalar angle, bool replacement)
+ {
+ typedef typename Dest::RealScalar RealScalar;
+ typedef typename Dest::Scalar Scalar;
+ typedef Matrix<Scalar, Dynamic, 1> VectorType;
+ typedef Matrix<Scalar, Dynamic, Dynamic, ColMajor> DenseMatrixType;
+ const Index N = b.size();
+ S = S < x.rows() ? S : x.rows();
+ const RealScalar tol = relres;
+ const Index maxit = iter;
+
+ Index replacements = 0;
+ bool trueres = false;
+
+ FullPivLU<DenseMatrixType> lu_solver;
+
+ DenseMatrixType P;
+ {
+ HouseholderQR<DenseMatrixType> qr(DenseMatrixType::Random(N, S));
+ P = (qr.householderQ() * DenseMatrixType::Identity(N, S));
+ }
+
+ const RealScalar normb = b.norm();
+
+ if (internal::isApprox(normb, RealScalar(0)))
+ {
+ //Solution is the zero vector
+ x.setZero();
+ iter = 0;
+ relres = 0;
+ return true;
+ }
+ // from http://homepage.tudelft.nl/1w5b5/IDRS/manual.pdf
+ // A peak in the residual is considered dangerously high if‖ri‖/‖b‖> C(tol/epsilon).
+ // With epsilon the
+ // relative machine precision. The factor tol/epsilon corresponds to the size of a
+ // finite precision number that is so large that the absolute round-off error in
+ // this number, when propagated through the process, makes it impossible to
+ // achieve the required accuracy.The factor C accounts for the accumulation of
+ // round-off errors. This parameter has beenset to 10−3.
+ // mp is epsilon/C
+ // 10^3 * eps is very conservative, so normally no residual replacements will take place.
+ // It only happens if things go very wrong. Too many restarts may ruin the convergence.
+ const RealScalar mp = RealScalar(1e3) * NumTraits<Scalar>::epsilon();
+
+
+
+ //Compute initial residual
+ const RealScalar tolb = tol * normb; //Relative tolerance
+ VectorType r = b - A * x;
+
+ VectorType x_s, r_s;
+
+ if (smoothing)
+ {
+ x_s = x;
+ r_s = r;
+ }
+
+ RealScalar normr = r.norm();
+
+ if (normr <= tolb)
+ {
+ //Initial guess is a good enough solution
+ iter = 0;
+ relres = normr / normb;
+ return true;
+ }
+
+ DenseMatrixType G = DenseMatrixType::Zero(N, S);
+ DenseMatrixType U = DenseMatrixType::Zero(N, S);
+ DenseMatrixType M = DenseMatrixType::Identity(S, S);
+ VectorType t(N), v(N);
+ Scalar om = 1.;
+
+ //Main iteration loop, guild G-spaces:
+ iter = 0;
+
+ while (normr > tolb && iter < maxit)
+ {
+ //New right hand size for small system:
+ VectorType f = (r.adjoint() * P).adjoint();
+
+ for (Index k = 0; k < S; ++k)
+ {
+ //Solve small system and make v orthogonal to P:
+ //c = M(k:s,k:s)\f(k:s);
+ lu_solver.compute(M.block(k , k , S -k, S - k ));
+ VectorType c = lu_solver.solve(f.segment(k , S - k ));
+ //v = r - G(:,k:s)*c;
+ v = r - G.rightCols(S - k ) * c;
+ //Preconditioning
+ v = precond.solve(v);
+
+ //Compute new U(:,k) and G(:,k), G(:,k) is in space G_j
+ U.col(k) = U.rightCols(S - k ) * c + om * v;
+ G.col(k) = A * U.col(k );
+
+ //Bi-Orthogonalise the new basis vectors:
+ for (Index i = 0; i < k-1 ; ++i)
+ {
+ //alpha = ( P(:,i)'*G(:,k) )/M(i,i);
+ Scalar alpha = P.col(i ).dot(G.col(k )) / M(i, i );
+ G.col(k ) = G.col(k ) - alpha * G.col(i );
+ U.col(k ) = U.col(k ) - alpha * U.col(i );
+ }
+
+ //New column of M = P'*G (first k-1 entries are zero)
+ //M(k:s,k) = (G(:,k)'*P(:,k:s))';
+ M.block(k , k , S - k , 1) = (G.col(k ).adjoint() * P.rightCols(S - k )).adjoint();
+
+ if (internal::isApprox(M(k,k), Scalar(0)))
+ {
+ return false;
+ }
+
+ //Make r orthogonal to q_i, i = 0..k-1
+ Scalar beta = f(k ) / M(k , k );
+ r = r - beta * G.col(k );
+ x = x + beta * U.col(k );
+ normr = r.norm();
+
+ if (replacement && normr > tolb / mp)
+ {
+ trueres = true;
+ }
+
+ //Smoothing:
+ if (smoothing)
+ {
+ t = r_s - r;
+ //gamma is a Scalar, but the conversion is not allowed
+ Scalar gamma = t.dot(r_s) / t.norm();
+ r_s = r_s - gamma * t;
+ x_s = x_s - gamma * (x_s - x);
+ normr = r_s.norm();
+ }
+
+ if (normr < tolb || iter == maxit)
+ {
+ break;
+ }
+
+ //New f = P'*r (first k components are zero)
+ if (k < S-1)
+ {
+ f.segment(k + 1, S - (k + 1) ) = f.segment(k + 1 , S - (k + 1)) - beta * M.block(k + 1 , k , S - (k + 1), 1);
+ }
+ }//end for
+
+ if (normr < tolb || iter == maxit)
+ {
+ break;
+ }
+
+ //Now we have sufficient vectors in G_j to compute residual in G_j+1
+ //Note: r is already perpendicular to P so v = r
+ //Preconditioning
+ v = r;
+ v = precond.solve(v);
+
+ //Matrix-vector multiplication:
+ t = A * v;
+
+ //Computation of a new omega
+ om = internal::omega(t, r, angle);
+
+ if (om == RealScalar(0.0))
+ {
+ return false;
+ }
+
+ r = r - om * t;
+ x = x + om * v;
+ normr = r.norm();
+
+ if (replacement && normr > tolb / mp)
+ {
+ trueres = true;
+ }
+
+ //Residual replacement?
+ if (trueres && normr < normb)
+ {
+ r = b - A * x;
+ trueres = false;
+ replacements++;
+ }
+
+ //Smoothing:
+ if (smoothing)
+ {
+ t = r_s - r;
+ Scalar gamma = t.dot(r_s) /t.norm();
+ r_s = r_s - gamma * t;
+ x_s = x_s - gamma * (x_s - x);
+ normr = r_s.norm();
+ }
+
+ iter++;
+
+ }//end while
+
+ if (smoothing)
+ {
+ x = x_s;
+ }
+ relres=normr/normb;
+ return true;
+ }
+
+ } // namespace internal
+
+ template <typename _MatrixType, typename _Preconditioner = DiagonalPreconditioner<typename _MatrixType::Scalar> >
+ class IDRS;
+
+ namespace internal
+ {
+
+ template <typename _MatrixType, typename _Preconditioner>
+ struct traits<Eigen::IDRS<_MatrixType, _Preconditioner> >
+ {
+ typedef _MatrixType MatrixType;
+ typedef _Preconditioner Preconditioner;
+ };
+
+ } // namespace internal
+
+
+/** \ingroup IterativeLinearSolvers_Module
+ * \brief The Induced Dimension Reduction method (IDR(s)) is a short-recurrences Krylov method for sparse square problems.
+ *
+ * This class allows to solve for A.x = b sparse linear problems. The vectors x and b can be either dense or sparse.
+ * he Induced Dimension Reduction method, IDR(), is a robust and efficient short-recurrence Krylov subspace method for
+ * solving large nonsymmetric systems of linear equations.
+ *
+ * For indefinite systems IDR(S) outperforms both BiCGStab and BiCGStab(L). Additionally, IDR(S) can handle matrices
+ * with complex eigenvalues more efficiently than BiCGStab.
+ *
+ * Many problems that do not converge for BiCGSTAB converge for IDR(s) (for larger values of s). And if both methods
+ * converge the convergence for IDR(s) is typically much faster for difficult systems (for example indefinite problems).
+ *
+ * IDR(s) is a limited memory finite termination method. In exact arithmetic it converges in at most N+N/s iterations,
+ * with N the system size. It uses a fixed number of 4+3s vector. In comparison, BiCGSTAB terminates in 2N iterations
+ * and uses 7 vectors. GMRES terminates in at most N iterations, and uses I+3 vectors, with I the number of iterations.
+ * Restarting GMRES limits the memory consumption, but destroys the finite termination property.
+ *
+ * \tparam _MatrixType the type of the sparse matrix A, can be a dense or a sparse matrix.
+ * \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner
+ *
+ * \implsparsesolverconcept
+ *
+ * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations()
+ * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations
+ * and NumTraits<Scalar>::epsilon() for the tolerance.
+ *
+ * The tolerance corresponds to the relative residual error: |Ax-b|/|b|
+ *
+ * \b Performance: when using sparse matrices, best performance is achied for a row-major sparse matrix format.
+ * Moreover, in this case multi-threading can be exploited if the user code is compiled with OpenMP enabled.
+ * See \ref TopicMultiThreading for details.
+ *
+ * By default the iterations start with x=0 as an initial guess of the solution.
+ * One can control the start using the solveWithGuess() method.
+ *
+ * IDR(s) can also be used in a matrix-free context, see the following \link MatrixfreeSolverExample example \endlink.
+ *
+ * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
+ */
+ template <typename _MatrixType, typename _Preconditioner>
+ class IDRS : public IterativeSolverBase<IDRS<_MatrixType, _Preconditioner> >
+ {
+
+ public:
+ typedef _MatrixType MatrixType;
+ typedef typename MatrixType::Scalar Scalar;
+ typedef typename MatrixType::RealScalar RealScalar;
+ typedef _Preconditioner Preconditioner;
+
+ private:
+ typedef IterativeSolverBase<IDRS> Base;
+ using Base::m_error;
+ using Base::m_info;
+ using Base::m_isInitialized;
+ using Base::m_iterations;
+ using Base::matrix;
+ Index m_S;
+ bool m_smoothing;
+ RealScalar m_angle;
+ bool m_residual;
+
+ public:
+ /** Default constructor. */
+ IDRS(): m_S(4), m_smoothing(false), m_angle(RealScalar(0.7)), m_residual(false) {}
+
+ /** Initialize the solver with matrix \a A for further \c Ax=b solving.
+
+ This constructor is a shortcut for the default constructor followed
+ by a call to compute().
+
+ \warning this class stores a reference to the matrix A as well as some
+ precomputed values that depend on it. Therefore, if \a A is changed
+ this class becomes invalid. Call compute() to update it with the new
+ matrix A, or modify a copy of A.
+ */
+ template <typename MatrixDerived>
+ explicit IDRS(const EigenBase<MatrixDerived>& A) : Base(A.derived()), m_S(4), m_smoothing(false),
+ m_angle(RealScalar(0.7)), m_residual(false) {}
+
+
+ /** \internal */
+ /** Loops over the number of columns of b and does the following:
+ 1. sets the tolerence and maxIterations
+ 2. Calls the function that has the core solver routine
+ */
+ template <typename Rhs, typename Dest>
+ void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const
+ {
+ m_iterations = Base::maxIterations();
+ m_error = Base::m_tolerance;
+
+ bool ret = internal::idrs(matrix(), b, x, Base::m_preconditioner, m_iterations, m_error, m_S,m_smoothing,m_angle,m_residual);
+
+ m_info = (!ret) ? NumericalIssue : m_error <= Base::m_tolerance ? Success : NoConvergence;
+ }
+
+ /** Sets the parameter S, indicating the dimension of the shadow space. Default is 4*/
+ void setS(Index S)
+ {
+ if (S < 1)
+ {
+ S = 4;
+ }
+
+ m_S = S;
+ }
+
+ /** Switches off and on smoothing.
+ Residual smoothing results in monotonically decreasing residual norms at
+ the expense of two extra vectors of storage and a few extra vector
+ operations. Although monotonic decrease of the residual norms is a
+ desirable property, the rate of convergence of the unsmoothed process and
+ the smoothed process is basically the same. Default is off */
+ void setSmoothing(bool smoothing)
+ {
+ m_smoothing=smoothing;
+ }
+
+ /** The angle must be a real scalar. In IDR(s), a value for the
+ iteration parameter omega must be chosen in every s+1th step. The most
+ natural choice is to select a value to minimize the norm of the next residual.
+ This corresponds to the parameter omega = 0. In practice, this may lead to
+ values of omega that are so small that the other iteration parameters
+ cannot be computed with sufficient accuracy. In such cases it is better to
+ increase the value of omega sufficiently such that a compromise is reached
+ between accurate computations and reduction of the residual norm. The
+ parameter angle =0.7 (”maintaining the convergence strategy”)
+ results in such a compromise. */
+ void setAngle(RealScalar angle)
+ {
+ m_angle=angle;
+ }
+
+ /** The parameter replace is a logical that determines whether a
+ residual replacement strategy is employed to increase the accuracy of the
+ solution. */
+ void setResidualUpdate(bool update)
+ {
+ m_residual=update;
+ }
+
+ };
+
+} // namespace Eigen
+
+#endif /* EIGEN_IDRS_H */
diff --git a/src/EigenUnsupported/src/IterativeSolvers/IncompleteLU.h b/src/EigenUnsupported/src/IterativeSolvers/IncompleteLU.h
new file mode 100644
index 0000000..7d08c35
--- /dev/null
+++ b/src/EigenUnsupported/src/IterativeSolvers/IncompleteLU.h
@@ -0,0 +1,90 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_INCOMPLETE_LU_H
+#define EIGEN_INCOMPLETE_LU_H
+
+namespace Eigen {
+
+template <typename _Scalar>
+class IncompleteLU : public SparseSolverBase<IncompleteLU<_Scalar> >
+{
+ protected:
+ typedef SparseSolverBase<IncompleteLU<_Scalar> > Base;
+ using Base::m_isInitialized;
+
+ typedef _Scalar Scalar;
+ typedef Matrix<Scalar,Dynamic,1> Vector;
+ typedef typename Vector::Index Index;
+ typedef SparseMatrix<Scalar,RowMajor> FactorType;
+
+ public:
+ typedef Matrix<Scalar,Dynamic,Dynamic> MatrixType;
+
+ IncompleteLU() {}
+
+ template<typename MatrixType>
+ IncompleteLU(const MatrixType& mat)
+ {
+ compute(mat);
+ }
+
+ Index rows() const { return m_lu.rows(); }
+ Index cols() const { return m_lu.cols(); }
+
+ template<typename MatrixType>
+ IncompleteLU& compute(const MatrixType& mat)
+ {
+ m_lu = mat;
+ int size = mat.cols();
+ Vector diag(size);
+ for(int i=0; i<size; ++i)
+ {
+ typename FactorType::InnerIterator k_it(m_lu,i);
+ for(; k_it && k_it.index()<i; ++k_it)
+ {
+ int k = k_it.index();
+ k_it.valueRef() /= diag(k);
+
+ typename FactorType::InnerIterator j_it(k_it);
+ typename FactorType::InnerIterator kj_it(m_lu, k);
+ while(kj_it && kj_it.index()<=k) ++kj_it;
+ for(++j_it; j_it; )
+ {
+ if(kj_it.index()==j_it.index())
+ {
+ j_it.valueRef() -= k_it.value() * kj_it.value();
+ ++j_it;
+ ++kj_it;
+ }
+ else if(kj_it.index()<j_it.index()) ++kj_it;
+ else ++j_it;
+ }
+ }
+ if(k_it && k_it.index()==i) diag(i) = k_it.value();
+ else diag(i) = 1;
+ }
+ m_isInitialized = true;
+ return *this;
+ }
+
+ template<typename Rhs, typename Dest>
+ void _solve_impl(const Rhs& b, Dest& x) const
+ {
+ x = m_lu.template triangularView<UnitLower>().solve(b);
+ x = m_lu.template triangularView<Upper>().solve(x);
+ }
+
+ protected:
+ FactorType m_lu;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_INCOMPLETE_LU_H
diff --git a/src/EigenUnsupported/src/IterativeSolvers/IterationController.h b/src/EigenUnsupported/src/IterativeSolvers/IterationController.h
new file mode 100644
index 0000000..a116e09
--- /dev/null
+++ b/src/EigenUnsupported/src/IterativeSolvers/IterationController.h
@@ -0,0 +1,154 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+
+/* NOTE The class IterationController has been adapted from the iteration
+ * class of the GMM++ and ITL libraries.
+ */
+
+//=======================================================================
+// Copyright (C) 1997-2001
+// Authors: Andrew Lumsdaine <lums@osl.iu.edu>
+// Lie-Quan Lee <llee@osl.iu.edu>
+//
+// This file is part of the Iterative Template Library
+//
+// You should have received a copy of the License Agreement for the
+// Iterative Template Library along with the software; see the
+// file LICENSE.
+//
+// Permission to modify the code and to distribute modified code is
+// granted, provided the text of this NOTICE is retained, a notice that
+// the code was modified is included with the above COPYRIGHT NOTICE and
+// with the COPYRIGHT NOTICE in the LICENSE file, and that the LICENSE
+// file is distributed with the modified code.
+//
+// LICENSOR MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED.
+// By way of example, but not limitation, Licensor MAKES NO
+// REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY
+// PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE COMPONENTS
+// OR DOCUMENTATION WILL NOT INFRINGE ANY PATENTS, COPYRIGHTS, TRADEMARKS
+// OR OTHER RIGHTS.
+//=======================================================================
+
+//========================================================================
+//
+// Copyright (C) 2002-2007 Yves Renard
+//
+// This file is a part of GETFEM++
+//
+// Getfem++ is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as
+// published by the Free Software Foundation; version 2.1 of the License.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Lesser General Public License for more details.
+// You should have received a copy of the GNU Lesser General Public
+// License along with this program; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301,
+// USA.
+//
+//========================================================================
+
+#include "../../../../Eigen/src/Core/util/NonMPL2.h"
+
+#ifndef EIGEN_ITERATION_CONTROLLER_H
+#define EIGEN_ITERATION_CONTROLLER_H
+
+namespace Eigen {
+
+/** \ingroup IterativeLinearSolvers_Module
+ * \class IterationController
+ *
+ * \brief Controls the iterations of the iterative solvers
+ *
+ * This class has been adapted from the iteration class of GMM++ and ITL libraries.
+ *
+ */
+class IterationController
+{
+ protected :
+ double m_rhsn; ///< Right hand side norm
+ size_t m_maxiter; ///< Max. number of iterations
+ int m_noise; ///< if noise > 0 iterations are printed
+ double m_resmax; ///< maximum residual
+ double m_resminreach, m_resadd;
+ size_t m_nit; ///< iteration number
+ double m_res; ///< last computed residual
+ bool m_written;
+ void (*m_callback)(const IterationController&);
+ public :
+
+ void init()
+ {
+ m_nit = 0; m_res = 0.0; m_written = false;
+ m_resminreach = 1E50; m_resadd = 0.0;
+ m_callback = 0;
+ }
+
+ IterationController(double r = 1.0E-8, int noi = 0, size_t mit = size_t(-1))
+ : m_rhsn(1.0), m_maxiter(mit), m_noise(noi), m_resmax(r) { init(); }
+
+ void operator ++(int) { m_nit++; m_written = false; m_resadd += m_res; }
+ void operator ++() { (*this)++; }
+
+ bool first() { return m_nit == 0; }
+
+ /* get/set the "noisyness" (verbosity) of the solvers */
+ int noiseLevel() const { return m_noise; }
+ void setNoiseLevel(int n) { m_noise = n; }
+ void reduceNoiseLevel() { if (m_noise > 0) m_noise--; }
+
+ double maxResidual() const { return m_resmax; }
+ void setMaxResidual(double r) { m_resmax = r; }
+
+ double residual() const { return m_res; }
+
+ /* change the user-definable callback, called after each iteration */
+ void setCallback(void (*t)(const IterationController&))
+ {
+ m_callback = t;
+ }
+
+ size_t iteration() const { return m_nit; }
+ void setIteration(size_t i) { m_nit = i; }
+
+ size_t maxIterarions() const { return m_maxiter; }
+ void setMaxIterations(size_t i) { m_maxiter = i; }
+
+ double rhsNorm() const { return m_rhsn; }
+ void setRhsNorm(double r) { m_rhsn = r; }
+
+ bool converged() const { return m_res <= m_rhsn * m_resmax; }
+ bool converged(double nr)
+ {
+ using std::abs;
+ m_res = abs(nr);
+ m_resminreach = (std::min)(m_resminreach, m_res);
+ return converged();
+ }
+ template<typename VectorType> bool converged(const VectorType &v)
+ { return converged(v.squaredNorm()); }
+
+ bool finished(double nr)
+ {
+ if (m_callback) m_callback(*this);
+ if (m_noise > 0 && !m_written)
+ {
+ converged(nr);
+ m_written = true;
+ }
+ return (m_nit >= m_maxiter || converged(nr));
+ }
+ template <typename VectorType>
+ bool finished(const MatrixBase<VectorType> &v)
+ { return finished(double(v.squaredNorm())); }
+
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_ITERATION_CONTROLLER_H
diff --git a/src/EigenUnsupported/src/IterativeSolvers/MINRES.h b/src/EigenUnsupported/src/IterativeSolvers/MINRES.h
new file mode 100644
index 0000000..5db454d
--- /dev/null
+++ b/src/EigenUnsupported/src/IterativeSolvers/MINRES.h
@@ -0,0 +1,267 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Giacomo Po <gpo@ucla.edu>
+// Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2018 David Hyde <dabh@stanford.edu>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#ifndef EIGEN_MINRES_H_
+#define EIGEN_MINRES_H_
+
+
+namespace Eigen {
+
+ namespace internal {
+
+ /** \internal Low-level MINRES algorithm
+ * \param mat The matrix A
+ * \param rhs The right hand side vector b
+ * \param x On input and initial solution, on output the computed solution.
+ * \param precond A right preconditioner being able to efficiently solve for an
+ * approximation of Ax=b (regardless of b)
+ * \param iters On input the max number of iteration, on output the number of performed iterations.
+ * \param tol_error On input the tolerance error, on output an estimation of the relative error.
+ */
+ template<typename MatrixType, typename Rhs, typename Dest, typename Preconditioner>
+ EIGEN_DONT_INLINE
+ void minres(const MatrixType& mat, const Rhs& rhs, Dest& x,
+ const Preconditioner& precond, Index& iters,
+ typename Dest::RealScalar& tol_error)
+ {
+ using std::sqrt;
+ typedef typename Dest::RealScalar RealScalar;
+ typedef typename Dest::Scalar Scalar;
+ typedef Matrix<Scalar,Dynamic,1> VectorType;
+
+ // Check for zero rhs
+ const RealScalar rhsNorm2(rhs.squaredNorm());
+ if(rhsNorm2 == 0)
+ {
+ x.setZero();
+ iters = 0;
+ tol_error = 0;
+ return;
+ }
+
+ // initialize
+ const Index maxIters(iters); // initialize maxIters to iters
+ const Index N(mat.cols()); // the size of the matrix
+ const RealScalar threshold2(tol_error*tol_error*rhsNorm2); // convergence threshold (compared to residualNorm2)
+
+ // Initialize preconditioned Lanczos
+ VectorType v_old(N); // will be initialized inside loop
+ VectorType v( VectorType::Zero(N) ); //initialize v
+ VectorType v_new(rhs-mat*x); //initialize v_new
+ RealScalar residualNorm2(v_new.squaredNorm());
+ VectorType w(N); // will be initialized inside loop
+ VectorType w_new(precond.solve(v_new)); // initialize w_new
+// RealScalar beta; // will be initialized inside loop
+ RealScalar beta_new2(v_new.dot(w_new));
+ eigen_assert(beta_new2 >= 0.0 && "PRECONDITIONER IS NOT POSITIVE DEFINITE");
+ RealScalar beta_new(sqrt(beta_new2));
+ const RealScalar beta_one(beta_new);
+ // Initialize other variables
+ RealScalar c(1.0); // the cosine of the Givens rotation
+ RealScalar c_old(1.0);
+ RealScalar s(0.0); // the sine of the Givens rotation
+ RealScalar s_old(0.0); // the sine of the Givens rotation
+ VectorType p_oold(N); // will be initialized in loop
+ VectorType p_old(VectorType::Zero(N)); // initialize p_old=0
+ VectorType p(p_old); // initialize p=0
+ RealScalar eta(1.0);
+
+ iters = 0; // reset iters
+ while ( iters < maxIters )
+ {
+ // Preconditioned Lanczos
+ /* Note that there are 4 variants on the Lanczos algorithm. These are
+ * described in Paige, C. C. (1972). Computational variants of
+ * the Lanczos method for the eigenproblem. IMA Journal of Applied
+ * Mathematics, 10(3), 373-381. The current implementation corresponds
+ * to the case A(2,7) in the paper. It also corresponds to
+ * algorithm 6.14 in Y. Saad, Iterative Methods for Sparse Linear
+ * Systems, 2003 p.173. For the preconditioned version see
+ * A. Greenbaum, Iterative Methods for Solving Linear Systems, SIAM (1987).
+ */
+ const RealScalar beta(beta_new);
+ v_old = v; // update: at first time step, this makes v_old = 0 so value of beta doesn't matter
+ v_new /= beta_new; // overwrite v_new for next iteration
+ w_new /= beta_new; // overwrite w_new for next iteration
+ v = v_new; // update
+ w = w_new; // update
+ v_new.noalias() = mat*w - beta*v_old; // compute v_new
+ const RealScalar alpha = v_new.dot(w);
+ v_new -= alpha*v; // overwrite v_new
+ w_new = precond.solve(v_new); // overwrite w_new
+ beta_new2 = v_new.dot(w_new); // compute beta_new
+ eigen_assert(beta_new2 >= 0.0 && "PRECONDITIONER IS NOT POSITIVE DEFINITE");
+ beta_new = sqrt(beta_new2); // compute beta_new
+
+ // Givens rotation
+ const RealScalar r2 =s*alpha+c*c_old*beta; // s, s_old, c and c_old are still from previous iteration
+ const RealScalar r3 =s_old*beta; // s, s_old, c and c_old are still from previous iteration
+ const RealScalar r1_hat=c*alpha-c_old*s*beta;
+ const RealScalar r1 =sqrt( std::pow(r1_hat,2) + std::pow(beta_new,2) );
+ c_old = c; // store for next iteration
+ s_old = s; // store for next iteration
+ c=r1_hat/r1; // new cosine
+ s=beta_new/r1; // new sine
+
+ // Update solution
+ p_oold = p_old;
+ p_old = p;
+ p.noalias()=(w-r2*p_old-r3*p_oold) /r1; // IS NOALIAS REQUIRED?
+ x += beta_one*c*eta*p;
+
+ /* Update the squared residual. Note that this is the estimated residual.
+ The real residual |Ax-b|^2 may be slightly larger */
+ residualNorm2 *= s*s;
+
+ if ( residualNorm2 < threshold2)
+ {
+ break;
+ }
+
+ eta=-s*eta; // update eta
+ iters++; // increment iteration number (for output purposes)
+ }
+
+ /* Compute error. Note that this is the estimated error. The real
+ error |Ax-b|/|b| may be slightly larger */
+ tol_error = std::sqrt(residualNorm2 / rhsNorm2);
+ }
+
+ }
+
+ template< typename _MatrixType, int _UpLo=Lower,
+ typename _Preconditioner = IdentityPreconditioner>
+ class MINRES;
+
+ namespace internal {
+
+ template< typename _MatrixType, int _UpLo, typename _Preconditioner>
+ struct traits<MINRES<_MatrixType,_UpLo,_Preconditioner> >
+ {
+ typedef _MatrixType MatrixType;
+ typedef _Preconditioner Preconditioner;
+ };
+
+ }
+
+ /** \ingroup IterativeLinearSolvers_Module
+ * \brief A minimal residual solver for sparse symmetric problems
+ *
+ * This class allows to solve for A.x = b sparse linear problems using the MINRES algorithm
+ * of Paige and Saunders (1975). The sparse matrix A must be symmetric (possibly indefinite).
+ * The vectors x and b can be either dense or sparse.
+ *
+ * \tparam _MatrixType the type of the sparse matrix A, can be a dense or a sparse matrix.
+ * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower,
+ * Upper, or Lower|Upper in which the full matrix entries will be considered. Default is Lower.
+ * \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner
+ *
+ * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations()
+ * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations
+ * and NumTraits<Scalar>::epsilon() for the tolerance.
+ *
+ * This class can be used as the direct solver classes. Here is a typical usage example:
+ * \code
+ * int n = 10000;
+ * VectorXd x(n), b(n);
+ * SparseMatrix<double> A(n,n);
+ * // fill A and b
+ * MINRES<SparseMatrix<double> > mr;
+ * mr.compute(A);
+ * x = mr.solve(b);
+ * std::cout << "#iterations: " << mr.iterations() << std::endl;
+ * std::cout << "estimated error: " << mr.error() << std::endl;
+ * // update b, and solve again
+ * x = mr.solve(b);
+ * \endcode
+ *
+ * By default the iterations start with x=0 as an initial guess of the solution.
+ * One can control the start using the solveWithGuess() method.
+ *
+ * MINRES can also be used in a matrix-free context, see the following \link MatrixfreeSolverExample example \endlink.
+ *
+ * \sa class ConjugateGradient, BiCGSTAB, SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
+ */
+ template< typename _MatrixType, int _UpLo, typename _Preconditioner>
+ class MINRES : public IterativeSolverBase<MINRES<_MatrixType,_UpLo,_Preconditioner> >
+ {
+
+ typedef IterativeSolverBase<MINRES> Base;
+ using Base::matrix;
+ using Base::m_error;
+ using Base::m_iterations;
+ using Base::m_info;
+ using Base::m_isInitialized;
+ public:
+ using Base::_solve_impl;
+ typedef _MatrixType MatrixType;
+ typedef typename MatrixType::Scalar Scalar;
+ typedef typename MatrixType::RealScalar RealScalar;
+ typedef _Preconditioner Preconditioner;
+
+ enum {UpLo = _UpLo};
+
+ public:
+
+ /** Default constructor. */
+ MINRES() : Base() {}
+
+ /** Initialize the solver with matrix \a A for further \c Ax=b solving.
+ *
+ * This constructor is a shortcut for the default constructor followed
+ * by a call to compute().
+ *
+ * \warning this class stores a reference to the matrix A as well as some
+ * precomputed values that depend on it. Therefore, if \a A is changed
+ * this class becomes invalid. Call compute() to update it with the new
+ * matrix A, or modify a copy of A.
+ */
+ template<typename MatrixDerived>
+ explicit MINRES(const EigenBase<MatrixDerived>& A) : Base(A.derived()) {}
+
+ /** Destructor. */
+ ~MINRES(){}
+
+ /** \internal */
+ template<typename Rhs,typename Dest>
+ void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const
+ {
+ typedef typename Base::MatrixWrapper MatrixWrapper;
+ typedef typename Base::ActualMatrixType ActualMatrixType;
+ enum {
+ TransposeInput = (!MatrixWrapper::MatrixFree)
+ && (UpLo==(Lower|Upper))
+ && (!MatrixType::IsRowMajor)
+ && (!NumTraits<Scalar>::IsComplex)
+ };
+ typedef typename internal::conditional<TransposeInput,Transpose<const ActualMatrixType>, ActualMatrixType const&>::type RowMajorWrapper;
+ EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(MatrixWrapper::MatrixFree,UpLo==(Lower|Upper)),MATRIX_FREE_CONJUGATE_GRADIENT_IS_COMPATIBLE_WITH_UPPER_UNION_LOWER_MODE_ONLY);
+ typedef typename internal::conditional<UpLo==(Lower|Upper),
+ RowMajorWrapper,
+ typename MatrixWrapper::template ConstSelfAdjointViewReturnType<UpLo>::Type
+ >::type SelfAdjointWrapper;
+
+ m_iterations = Base::maxIterations();
+ m_error = Base::m_tolerance;
+ RowMajorWrapper row_mat(matrix());
+ internal::minres(SelfAdjointWrapper(row_mat), b, x,
+ Base::m_preconditioner, m_iterations, m_error);
+ m_info = m_error <= Base::m_tolerance ? Success : NoConvergence;
+ }
+
+ protected:
+
+ };
+
+} // end namespace Eigen
+
+#endif // EIGEN_MINRES_H
diff --git a/src/EigenUnsupported/src/IterativeSolvers/Scaling.h b/src/EigenUnsupported/src/IterativeSolvers/Scaling.h
new file mode 100644
index 0000000..9b3eb53
--- /dev/null
+++ b/src/EigenUnsupported/src/IterativeSolvers/Scaling.h
@@ -0,0 +1,193 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Desire NUENTSA WAKAM <desire.nuentsa_wakam@inria.fr
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ITERSCALING_H
+#define EIGEN_ITERSCALING_H
+
+namespace Eigen {
+
+/**
+ * \ingroup IterativeSolvers_Module
+ * \brief iterative scaling algorithm to equilibrate rows and column norms in matrices
+ *
+ * This class can be used as a preprocessing tool to accelerate the convergence of iterative methods
+ *
+ * This feature is useful to limit the pivoting amount during LU/ILU factorization
+ * The scaling strategy as presented here preserves the symmetry of the problem
+ * NOTE It is assumed that the matrix does not have empty row or column,
+ *
+ * Example with key steps
+ * \code
+ * VectorXd x(n), b(n);
+ * SparseMatrix<double> A;
+ * // fill A and b;
+ * IterScaling<SparseMatrix<double> > scal;
+ * // Compute the left and right scaling vectors. The matrix is equilibrated at output
+ * scal.computeRef(A);
+ * // Scale the right hand side
+ * b = scal.LeftScaling().cwiseProduct(b);
+ * // Now, solve the equilibrated linear system with any available solver
+ *
+ * // Scale back the computed solution
+ * x = scal.RightScaling().cwiseProduct(x);
+ * \endcode
+ *
+ * \tparam _MatrixType the type of the matrix. It should be a real square sparsematrix
+ *
+ * References : D. Ruiz and B. Ucar, A Symmetry Preserving Algorithm for Matrix Scaling, INRIA Research report RR-7552
+ *
+ * \sa \ref IncompleteLUT
+ */
+template<typename _MatrixType>
+class IterScaling
+{
+ public:
+ typedef _MatrixType MatrixType;
+ typedef typename MatrixType::Scalar Scalar;
+ typedef typename MatrixType::Index Index;
+
+ public:
+ IterScaling() { init(); }
+
+ IterScaling(const MatrixType& matrix)
+ {
+ init();
+ compute(matrix);
+ }
+
+ ~IterScaling() { }
+
+ /**
+ * Compute the left and right diagonal matrices to scale the input matrix @p mat
+ *
+ * FIXME This algorithm will be modified such that the diagonal elements are permuted on the diagonal.
+ *
+ * \sa LeftScaling() RightScaling()
+ */
+ void compute (const MatrixType& mat)
+ {
+ using std::abs;
+ int m = mat.rows();
+ int n = mat.cols();
+ eigen_assert((m>0 && m == n) && "Please give a non - empty matrix");
+ m_left.resize(m);
+ m_right.resize(n);
+ m_left.setOnes();
+ m_right.setOnes();
+ m_matrix = mat;
+ VectorXd Dr, Dc, DrRes, DcRes; // Temporary Left and right scaling vectors
+ Dr.resize(m); Dc.resize(n);
+ DrRes.resize(m); DcRes.resize(n);
+ double EpsRow = 1.0, EpsCol = 1.0;
+ int its = 0;
+ do
+ { // Iterate until the infinite norm of each row and column is approximately 1
+ // Get the maximum value in each row and column
+ Dr.setZero(); Dc.setZero();
+ for (int k=0; k<m_matrix.outerSize(); ++k)
+ {
+ for (typename MatrixType::InnerIterator it(m_matrix, k); it; ++it)
+ {
+ if ( Dr(it.row()) < abs(it.value()) )
+ Dr(it.row()) = abs(it.value());
+
+ if ( Dc(it.col()) < abs(it.value()) )
+ Dc(it.col()) = abs(it.value());
+ }
+ }
+ for (int i = 0; i < m; ++i)
+ {
+ Dr(i) = std::sqrt(Dr(i));
+ }
+ for (int i = 0; i < n; ++i)
+ {
+ Dc(i) = std::sqrt(Dc(i));
+ }
+ // Save the scaling factors
+ for (int i = 0; i < m; ++i)
+ {
+ m_left(i) /= Dr(i);
+ }
+ for (int i = 0; i < n; ++i)
+ {
+ m_right(i) /= Dc(i);
+ }
+ // Scale the rows and the columns of the matrix
+ DrRes.setZero(); DcRes.setZero();
+ for (int k=0; k<m_matrix.outerSize(); ++k)
+ {
+ for (typename MatrixType::InnerIterator it(m_matrix, k); it; ++it)
+ {
+ it.valueRef() = it.value()/( Dr(it.row()) * Dc(it.col()) );
+ // Accumulate the norms of the row and column vectors
+ if ( DrRes(it.row()) < abs(it.value()) )
+ DrRes(it.row()) = abs(it.value());
+
+ if ( DcRes(it.col()) < abs(it.value()) )
+ DcRes(it.col()) = abs(it.value());
+ }
+ }
+ DrRes.array() = (1-DrRes.array()).abs();
+ EpsRow = DrRes.maxCoeff();
+ DcRes.array() = (1-DcRes.array()).abs();
+ EpsCol = DcRes.maxCoeff();
+ its++;
+ }while ( (EpsRow >m_tol || EpsCol > m_tol) && (its < m_maxits) );
+ m_isInitialized = true;
+ }
+ /** Compute the left and right vectors to scale the vectors
+ * the input matrix is scaled with the computed vectors at output
+ *
+ * \sa compute()
+ */
+ void computeRef (MatrixType& mat)
+ {
+ compute (mat);
+ mat = m_matrix;
+ }
+ /** Get the vector to scale the rows of the matrix
+ */
+ VectorXd& LeftScaling()
+ {
+ return m_left;
+ }
+
+ /** Get the vector to scale the columns of the matrix
+ */
+ VectorXd& RightScaling()
+ {
+ return m_right;
+ }
+
+ /** Set the tolerance for the convergence of the iterative scaling algorithm
+ */
+ void setTolerance(double tol)
+ {
+ m_tol = tol;
+ }
+
+ protected:
+
+ void init()
+ {
+ m_tol = 1e-10;
+ m_maxits = 5;
+ m_isInitialized = false;
+ }
+
+ MatrixType m_matrix;
+ mutable ComputationInfo m_info;
+ bool m_isInitialized;
+ VectorXd m_left; // Left scaling vector
+ VectorXd m_right; // m_right scaling vector
+ double m_tol;
+ int m_maxits; // Maximum number of iterations allowed
+};
+}
+#endif
diff --git a/src/EigenUnsupported/src/KroneckerProduct/KroneckerTensorProduct.h b/src/EigenUnsupported/src/KroneckerProduct/KroneckerTensorProduct.h
new file mode 100644
index 0000000..6a9b0be
--- /dev/null
+++ b/src/EigenUnsupported/src/KroneckerProduct/KroneckerTensorProduct.h
@@ -0,0 +1,305 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Kolja Brix <brix@igpm.rwth-aachen.de>
+// Copyright (C) 2011 Andreas Platen <andiplaten@gmx.de>
+// Copyright (C) 2012 Chen-Pang He <jdh8@ms63.hinet.net>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef KRONECKER_TENSOR_PRODUCT_H
+#define KRONECKER_TENSOR_PRODUCT_H
+
+namespace Eigen {
+
+/*!
+ * \ingroup KroneckerProduct_Module
+ *
+ * \brief The base class of dense and sparse Kronecker product.
+ *
+ * \tparam Derived is the derived type.
+ */
+template<typename Derived>
+class KroneckerProductBase : public ReturnByValue<Derived>
+{
+ private:
+ typedef typename internal::traits<Derived> Traits;
+ typedef typename Traits::Scalar Scalar;
+
+ protected:
+ typedef typename Traits::Lhs Lhs;
+ typedef typename Traits::Rhs Rhs;
+
+ public:
+ /*! \brief Constructor. */
+ KroneckerProductBase(const Lhs& A, const Rhs& B)
+ : m_A(A), m_B(B)
+ {}
+
+ inline Index rows() const { return m_A.rows() * m_B.rows(); }
+ inline Index cols() const { return m_A.cols() * m_B.cols(); }
+
+ /*!
+ * This overrides ReturnByValue::coeff because this function is
+ * efficient enough.
+ */
+ Scalar coeff(Index row, Index col) const
+ {
+ return m_A.coeff(row / m_B.rows(), col / m_B.cols()) *
+ m_B.coeff(row % m_B.rows(), col % m_B.cols());
+ }
+
+ /*!
+ * This overrides ReturnByValue::coeff because this function is
+ * efficient enough.
+ */
+ Scalar coeff(Index i) const
+ {
+ EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+ return m_A.coeff(i / m_A.size()) * m_B.coeff(i % m_A.size());
+ }
+
+ protected:
+ typename Lhs::Nested m_A;
+ typename Rhs::Nested m_B;
+};
+
+/*!
+ * \ingroup KroneckerProduct_Module
+ *
+ * \brief Kronecker tensor product helper class for dense matrices
+ *
+ * This class is the return value of kroneckerProduct(MatrixBase,
+ * MatrixBase). Use the function rather than construct this class
+ * directly to avoid specifying template prarameters.
+ *
+ * \tparam Lhs Type of the left-hand side, a matrix expression.
+ * \tparam Rhs Type of the rignt-hand side, a matrix expression.
+ */
+template<typename Lhs, typename Rhs>
+class KroneckerProduct : public KroneckerProductBase<KroneckerProduct<Lhs,Rhs> >
+{
+ private:
+ typedef KroneckerProductBase<KroneckerProduct> Base;
+ using Base::m_A;
+ using Base::m_B;
+
+ public:
+ /*! \brief Constructor. */
+ KroneckerProduct(const Lhs& A, const Rhs& B)
+ : Base(A, B)
+ {}
+
+ /*! \brief Evaluate the Kronecker tensor product. */
+ template<typename Dest> void evalTo(Dest& dst) const;
+};
+
+/*!
+ * \ingroup KroneckerProduct_Module
+ *
+ * \brief Kronecker tensor product helper class for sparse matrices
+ *
+ * If at least one of the operands is a sparse matrix expression,
+ * then this class is returned and evaluates into a sparse matrix.
+ *
+ * This class is the return value of kroneckerProduct(EigenBase,
+ * EigenBase). Use the function rather than construct this class
+ * directly to avoid specifying template prarameters.
+ *
+ * \tparam Lhs Type of the left-hand side, a matrix expression.
+ * \tparam Rhs Type of the rignt-hand side, a matrix expression.
+ */
+template<typename Lhs, typename Rhs>
+class KroneckerProductSparse : public KroneckerProductBase<KroneckerProductSparse<Lhs,Rhs> >
+{
+ private:
+ typedef KroneckerProductBase<KroneckerProductSparse> Base;
+ using Base::m_A;
+ using Base::m_B;
+
+ public:
+ /*! \brief Constructor. */
+ KroneckerProductSparse(const Lhs& A, const Rhs& B)
+ : Base(A, B)
+ {}
+
+ /*! \brief Evaluate the Kronecker tensor product. */
+ template<typename Dest> void evalTo(Dest& dst) const;
+};
+
+template<typename Lhs, typename Rhs>
+template<typename Dest>
+void KroneckerProduct<Lhs,Rhs>::evalTo(Dest& dst) const
+{
+ const int BlockRows = Rhs::RowsAtCompileTime,
+ BlockCols = Rhs::ColsAtCompileTime;
+ const Index Br = m_B.rows(),
+ Bc = m_B.cols();
+ for (Index i=0; i < m_A.rows(); ++i)
+ for (Index j=0; j < m_A.cols(); ++j)
+ Block<Dest,BlockRows,BlockCols>(dst,i*Br,j*Bc,Br,Bc) = m_A.coeff(i,j) * m_B;
+}
+
+template<typename Lhs, typename Rhs>
+template<typename Dest>
+void KroneckerProductSparse<Lhs,Rhs>::evalTo(Dest& dst) const
+{
+ Index Br = m_B.rows(), Bc = m_B.cols();
+ dst.resize(this->rows(), this->cols());
+ dst.resizeNonZeros(0);
+
+ // 1 - evaluate the operands if needed:
+ typedef typename internal::nested_eval<Lhs,Dynamic>::type Lhs1;
+ typedef typename internal::remove_all<Lhs1>::type Lhs1Cleaned;
+ const Lhs1 lhs1(m_A);
+ typedef typename internal::nested_eval<Rhs,Dynamic>::type Rhs1;
+ typedef typename internal::remove_all<Rhs1>::type Rhs1Cleaned;
+ const Rhs1 rhs1(m_B);
+
+ // 2 - construct respective iterators
+ typedef Eigen::InnerIterator<Lhs1Cleaned> LhsInnerIterator;
+ typedef Eigen::InnerIterator<Rhs1Cleaned> RhsInnerIterator;
+
+ // compute number of non-zeros per innervectors of dst
+ {
+ // TODO VectorXi is not necessarily big enough!
+ VectorXi nnzA = VectorXi::Zero(Dest::IsRowMajor ? m_A.rows() : m_A.cols());
+ for (Index kA=0; kA < m_A.outerSize(); ++kA)
+ for (LhsInnerIterator itA(lhs1,kA); itA; ++itA)
+ nnzA(Dest::IsRowMajor ? itA.row() : itA.col())++;
+
+ VectorXi nnzB = VectorXi::Zero(Dest::IsRowMajor ? m_B.rows() : m_B.cols());
+ for (Index kB=0; kB < m_B.outerSize(); ++kB)
+ for (RhsInnerIterator itB(rhs1,kB); itB; ++itB)
+ nnzB(Dest::IsRowMajor ? itB.row() : itB.col())++;
+
+ Matrix<int,Dynamic,Dynamic,ColMajor> nnzAB = nnzB * nnzA.transpose();
+ dst.reserve(VectorXi::Map(nnzAB.data(), nnzAB.size()));
+ }
+
+ for (Index kA=0; kA < m_A.outerSize(); ++kA)
+ {
+ for (Index kB=0; kB < m_B.outerSize(); ++kB)
+ {
+ for (LhsInnerIterator itA(lhs1,kA); itA; ++itA)
+ {
+ for (RhsInnerIterator itB(rhs1,kB); itB; ++itB)
+ {
+ Index i = itA.row() * Br + itB.row(),
+ j = itA.col() * Bc + itB.col();
+ dst.insert(i,j) = itA.value() * itB.value();
+ }
+ }
+ }
+ }
+}
+
+namespace internal {
+
+template<typename _Lhs, typename _Rhs>
+struct traits<KroneckerProduct<_Lhs,_Rhs> >
+{
+ typedef typename remove_all<_Lhs>::type Lhs;
+ typedef typename remove_all<_Rhs>::type Rhs;
+ typedef typename ScalarBinaryOpTraits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar;
+ typedef typename promote_index_type<typename Lhs::StorageIndex, typename Rhs::StorageIndex>::type StorageIndex;
+
+ enum {
+ Rows = size_at_compile_time<traits<Lhs>::RowsAtCompileTime, traits<Rhs>::RowsAtCompileTime>::ret,
+ Cols = size_at_compile_time<traits<Lhs>::ColsAtCompileTime, traits<Rhs>::ColsAtCompileTime>::ret,
+ MaxRows = size_at_compile_time<traits<Lhs>::MaxRowsAtCompileTime, traits<Rhs>::MaxRowsAtCompileTime>::ret,
+ MaxCols = size_at_compile_time<traits<Lhs>::MaxColsAtCompileTime, traits<Rhs>::MaxColsAtCompileTime>::ret
+ };
+
+ typedef Matrix<Scalar,Rows,Cols> ReturnType;
+};
+
+template<typename _Lhs, typename _Rhs>
+struct traits<KroneckerProductSparse<_Lhs,_Rhs> >
+{
+ typedef MatrixXpr XprKind;
+ typedef typename remove_all<_Lhs>::type Lhs;
+ typedef typename remove_all<_Rhs>::type Rhs;
+ typedef typename ScalarBinaryOpTraits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar;
+ typedef typename cwise_promote_storage_type<typename traits<Lhs>::StorageKind, typename traits<Rhs>::StorageKind, scalar_product_op<typename Lhs::Scalar, typename Rhs::Scalar> >::ret StorageKind;
+ typedef typename promote_index_type<typename Lhs::StorageIndex, typename Rhs::StorageIndex>::type StorageIndex;
+
+ enum {
+ LhsFlags = Lhs::Flags,
+ RhsFlags = Rhs::Flags,
+
+ RowsAtCompileTime = size_at_compile_time<traits<Lhs>::RowsAtCompileTime, traits<Rhs>::RowsAtCompileTime>::ret,
+ ColsAtCompileTime = size_at_compile_time<traits<Lhs>::ColsAtCompileTime, traits<Rhs>::ColsAtCompileTime>::ret,
+ MaxRowsAtCompileTime = size_at_compile_time<traits<Lhs>::MaxRowsAtCompileTime, traits<Rhs>::MaxRowsAtCompileTime>::ret,
+ MaxColsAtCompileTime = size_at_compile_time<traits<Lhs>::MaxColsAtCompileTime, traits<Rhs>::MaxColsAtCompileTime>::ret,
+
+ EvalToRowMajor = (int(LhsFlags) & int(RhsFlags) & RowMajorBit),
+ RemovedBits = ~(EvalToRowMajor ? 0 : RowMajorBit),
+
+ Flags = ((int(LhsFlags) | int(RhsFlags)) & HereditaryBits & RemovedBits)
+ | EvalBeforeNestingBit,
+ CoeffReadCost = HugeCost
+ };
+
+ typedef SparseMatrix<Scalar, 0, StorageIndex> ReturnType;
+};
+
+} // end namespace internal
+
+/*!
+ * \ingroup KroneckerProduct_Module
+ *
+ * Computes Kronecker tensor product of two dense matrices
+ *
+ * \warning If you want to replace a matrix by its Kronecker product
+ * with some matrix, do \b NOT do this:
+ * \code
+ * A = kroneckerProduct(A,B); // bug!!! caused by aliasing effect
+ * \endcode
+ * instead, use eval() to work around this:
+ * \code
+ * A = kroneckerProduct(A,B).eval();
+ * \endcode
+ *
+ * \param a Dense matrix a
+ * \param b Dense matrix b
+ * \return Kronecker tensor product of a and b
+ */
+template<typename A, typename B>
+KroneckerProduct<A,B> kroneckerProduct(const MatrixBase<A>& a, const MatrixBase<B>& b)
+{
+ return KroneckerProduct<A, B>(a.derived(), b.derived());
+}
+
+/*!
+ * \ingroup KroneckerProduct_Module
+ *
+ * Computes Kronecker tensor product of two matrices, at least one of
+ * which is sparse
+ *
+ * \warning If you want to replace a matrix by its Kronecker product
+ * with some matrix, do \b NOT do this:
+ * \code
+ * A = kroneckerProduct(A,B); // bug!!! caused by aliasing effect
+ * \endcode
+ * instead, use eval() to work around this:
+ * \code
+ * A = kroneckerProduct(A,B).eval();
+ * \endcode
+ *
+ * \param a Dense/sparse matrix a
+ * \param b Dense/sparse matrix b
+ * \return Kronecker tensor product of a and b, stored in a sparse
+ * matrix
+ */
+template<typename A, typename B>
+KroneckerProductSparse<A,B> kroneckerProduct(const EigenBase<A>& a, const EigenBase<B>& b)
+{
+ return KroneckerProductSparse<A,B>(a.derived(), b.derived());
+}
+
+} // end namespace Eigen
+
+#endif // KRONECKER_TENSOR_PRODUCT_H
diff --git a/src/EigenUnsupported/src/LevenbergMarquardt/CopyrightMINPACK.txt b/src/EigenUnsupported/src/LevenbergMarquardt/CopyrightMINPACK.txt
new file mode 100644
index 0000000..ae7984d
--- /dev/null
+++ b/src/EigenUnsupported/src/LevenbergMarquardt/CopyrightMINPACK.txt
@@ -0,0 +1,52 @@
+Minpack Copyright Notice (1999) University of Chicago. All rights reserved
+
+Redistribution and use in source and binary forms, with or
+without modification, are permitted provided that the
+following conditions are met:
+
+1. Redistributions of source code must retain the above
+copyright notice, this list of conditions and the following
+disclaimer.
+
+2. Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following
+disclaimer in the documentation and/or other materials
+provided with the distribution.
+
+3. The end-user documentation included with the
+redistribution, if any, must include the following
+acknowledgment:
+
+ "This product includes software developed by the
+ University of Chicago, as Operator of Argonne National
+ Laboratory.
+
+Alternately, this acknowledgment may appear in the software
+itself, if and wherever such third-party acknowledgments
+normally appear.
+
+4. WARRANTY DISCLAIMER. THE SOFTWARE IS SUPPLIED "AS IS"
+WITHOUT WARRANTY OF ANY KIND. THE COPYRIGHT HOLDER, THE
+UNITED STATES, THE UNITED STATES DEPARTMENT OF ENERGY, AND
+THEIR EMPLOYEES: (1) DISCLAIM ANY WARRANTIES, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO ANY IMPLIED WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE
+OR NON-INFRINGEMENT, (2) DO NOT ASSUME ANY LEGAL LIABILITY
+OR RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR
+USEFULNESS OF THE SOFTWARE, (3) DO NOT REPRESENT THAT USE OF
+THE SOFTWARE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS, (4)
+DO NOT WARRANT THAT THE SOFTWARE WILL FUNCTION
+UNINTERRUPTED, THAT IT IS ERROR-FREE OR THAT ANY ERRORS WILL
+BE CORRECTED.
+
+5. LIMITATION OF LIABILITY. IN NO EVENT WILL THE COPYRIGHT
+HOLDER, THE UNITED STATES, THE UNITED STATES DEPARTMENT OF
+ENERGY, OR THEIR EMPLOYEES: BE LIABLE FOR ANY INDIRECT,
+INCIDENTAL, CONSEQUENTIAL, SPECIAL OR PUNITIVE DAMAGES OF
+ANY KIND OR NATURE, INCLUDING BUT NOT LIMITED TO LOSS OF
+PROFITS OR LOSS OF DATA, FOR ANY REASON WHATSOEVER, WHETHER
+SUCH LIABILITY IS ASSERTED ON THE BASIS OF CONTRACT, TORT
+(INCLUDING NEGLIGENCE OR STRICT LIABILITY), OR OTHERWISE,
+EVEN IF ANY OF SAID PARTIES HAS BEEN WARNED OF THE
+POSSIBILITY OF SUCH LOSS OR DAMAGES.
+
diff --git a/src/EigenUnsupported/src/LevenbergMarquardt/LMcovar.h b/src/EigenUnsupported/src/LevenbergMarquardt/LMcovar.h
new file mode 100644
index 0000000..b75bea2
--- /dev/null
+++ b/src/EigenUnsupported/src/LevenbergMarquardt/LMcovar.h
@@ -0,0 +1,84 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This code initially comes from MINPACK whose original authors are:
+// Copyright Jorge More - Argonne National Laboratory
+// Copyright Burt Garbow - Argonne National Laboratory
+// Copyright Ken Hillstrom - Argonne National Laboratory
+//
+// This Source Code Form is subject to the terms of the Minpack license
+// (a BSD-like license) described in the campaigned CopyrightMINPACK.txt file.
+
+#ifndef EIGEN_LMCOVAR_H
+#define EIGEN_LMCOVAR_H
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename Scalar>
+void covar(
+ Matrix< Scalar, Dynamic, Dynamic > &r,
+ const VectorXi& ipvt,
+ Scalar tol = std::sqrt(NumTraits<Scalar>::epsilon()) )
+{
+ using std::abs;
+ /* Local variables */
+ Index i, j, k, l, ii, jj;
+ bool sing;
+ Scalar temp;
+
+ /* Function Body */
+ const Index n = r.cols();
+ const Scalar tolr = tol * abs(r(0,0));
+ Matrix< Scalar, Dynamic, 1 > wa(n);
+ eigen_assert(ipvt.size()==n);
+
+ /* form the inverse of r in the full upper triangle of r. */
+ l = -1;
+ for (k = 0; k < n; ++k)
+ if (abs(r(k,k)) > tolr) {
+ r(k,k) = 1. / r(k,k);
+ for (j = 0; j <= k-1; ++j) {
+ temp = r(k,k) * r(j,k);
+ r(j,k) = 0.;
+ r.col(k).head(j+1) -= r.col(j).head(j+1) * temp;
+ }
+ l = k;
+ }
+
+ /* form the full upper triangle of the inverse of (r transpose)*r */
+ /* in the full upper triangle of r. */
+ for (k = 0; k <= l; ++k) {
+ for (j = 0; j <= k-1; ++j)
+ r.col(j).head(j+1) += r.col(k).head(j+1) * r(j,k);
+ r.col(k).head(k+1) *= r(k,k);
+ }
+
+ /* form the full lower triangle of the covariance matrix */
+ /* in the strict lower triangle of r and in wa. */
+ for (j = 0; j < n; ++j) {
+ jj = ipvt[j];
+ sing = j > l;
+ for (i = 0; i <= j; ++i) {
+ if (sing)
+ r(i,j) = 0.;
+ ii = ipvt[i];
+ if (ii > jj)
+ r(ii,jj) = r(i,j);
+ if (ii < jj)
+ r(jj,ii) = r(i,j);
+ }
+ wa[jj] = r(j,j);
+ }
+
+ /* symmetrize the covariance matrix in r. */
+ r.topLeftCorner(n,n).template triangularView<StrictlyUpper>() = r.topLeftCorner(n,n).transpose();
+ r.diagonal() = wa;
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_LMCOVAR_H
diff --git a/src/EigenUnsupported/src/LevenbergMarquardt/LMonestep.h b/src/EigenUnsupported/src/LevenbergMarquardt/LMonestep.h
new file mode 100644
index 0000000..25b32ec
--- /dev/null
+++ b/src/EigenUnsupported/src/LevenbergMarquardt/LMonestep.h
@@ -0,0 +1,202 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Thomas Capricelli <orzel@freehackers.org>
+//
+// This code initially comes from MINPACK whose original authors are:
+// Copyright Jorge More - Argonne National Laboratory
+// Copyright Burt Garbow - Argonne National Laboratory
+// Copyright Ken Hillstrom - Argonne National Laboratory
+//
+// This Source Code Form is subject to the terms of the Minpack license
+// (a BSD-like license) described in the campaigned CopyrightMINPACK.txt file.
+
+#ifndef EIGEN_LMONESTEP_H
+#define EIGEN_LMONESTEP_H
+
+namespace Eigen {
+
+template<typename FunctorType>
+LevenbergMarquardtSpace::Status
+LevenbergMarquardt<FunctorType>::minimizeOneStep(FVectorType &x)
+{
+ using std::abs;
+ using std::sqrt;
+ RealScalar temp, temp1,temp2;
+ RealScalar ratio;
+ RealScalar pnorm, xnorm, fnorm1, actred, dirder, prered;
+ eigen_assert(x.size()==n); // check the caller is not cheating us
+
+ temp = 0.0; xnorm = 0.0;
+ /* calculate the jacobian matrix. */
+ Index df_ret = m_functor.df(x, m_fjac);
+ if (df_ret<0)
+ return LevenbergMarquardtSpace::UserAsked;
+ if (df_ret>0)
+ // numerical diff, we evaluated the function df_ret times
+ m_nfev += df_ret;
+ else m_njev++;
+
+ /* compute the qr factorization of the jacobian. */
+ for (int j = 0; j < x.size(); ++j)
+ m_wa2(j) = m_fjac.col(j).blueNorm();
+ QRSolver qrfac(m_fjac);
+ if(qrfac.info() != Success) {
+ m_info = NumericalIssue;
+ return LevenbergMarquardtSpace::ImproperInputParameters;
+ }
+ // Make a copy of the first factor with the associated permutation
+ m_rfactor = qrfac.matrixR();
+ m_permutation = (qrfac.colsPermutation());
+
+ /* on the first iteration and if external scaling is not used, scale according */
+ /* to the norms of the columns of the initial jacobian. */
+ if (m_iter == 1) {
+ if (!m_useExternalScaling)
+ for (Index j = 0; j < n; ++j)
+ m_diag[j] = (m_wa2[j]==0.)? 1. : m_wa2[j];
+
+ /* on the first iteration, calculate the norm of the scaled x */
+ /* and initialize the step bound m_delta. */
+ xnorm = m_diag.cwiseProduct(x).stableNorm();
+ m_delta = m_factor * xnorm;
+ if (m_delta == 0.)
+ m_delta = m_factor;
+ }
+
+ /* form (q transpose)*m_fvec and store the first n components in */
+ /* m_qtf. */
+ m_wa4 = m_fvec;
+ m_wa4 = qrfac.matrixQ().adjoint() * m_fvec;
+ m_qtf = m_wa4.head(n);
+
+ /* compute the norm of the scaled gradient. */
+ m_gnorm = 0.;
+ if (m_fnorm != 0.)
+ for (Index j = 0; j < n; ++j)
+ if (m_wa2[m_permutation.indices()[j]] != 0.)
+ m_gnorm = (std::max)(m_gnorm, abs( m_rfactor.col(j).head(j+1).dot(m_qtf.head(j+1)/m_fnorm) / m_wa2[m_permutation.indices()[j]]));
+
+ /* test for convergence of the gradient norm. */
+ if (m_gnorm <= m_gtol) {
+ m_info = Success;
+ return LevenbergMarquardtSpace::CosinusTooSmall;
+ }
+
+ /* rescale if necessary. */
+ if (!m_useExternalScaling)
+ m_diag = m_diag.cwiseMax(m_wa2);
+
+ do {
+ /* determine the levenberg-marquardt parameter. */
+ internal::lmpar2(qrfac, m_diag, m_qtf, m_delta, m_par, m_wa1);
+
+ /* store the direction p and x + p. calculate the norm of p. */
+ m_wa1 = -m_wa1;
+ m_wa2 = x + m_wa1;
+ pnorm = m_diag.cwiseProduct(m_wa1).stableNorm();
+
+ /* on the first iteration, adjust the initial step bound. */
+ if (m_iter == 1)
+ m_delta = (std::min)(m_delta,pnorm);
+
+ /* evaluate the function at x + p and calculate its norm. */
+ if ( m_functor(m_wa2, m_wa4) < 0)
+ return LevenbergMarquardtSpace::UserAsked;
+ ++m_nfev;
+ fnorm1 = m_wa4.stableNorm();
+
+ /* compute the scaled actual reduction. */
+ actred = -1.;
+ if (Scalar(.1) * fnorm1 < m_fnorm)
+ actred = 1. - numext::abs2(fnorm1 / m_fnorm);
+
+ /* compute the scaled predicted reduction and */
+ /* the scaled directional derivative. */
+ m_wa3 = m_rfactor.template triangularView<Upper>() * (m_permutation.inverse() *m_wa1);
+ temp1 = numext::abs2(m_wa3.stableNorm() / m_fnorm);
+ temp2 = numext::abs2(sqrt(m_par) * pnorm / m_fnorm);
+ prered = temp1 + temp2 / Scalar(.5);
+ dirder = -(temp1 + temp2);
+
+ /* compute the ratio of the actual to the predicted */
+ /* reduction. */
+ ratio = 0.;
+ if (prered != 0.)
+ ratio = actred / prered;
+
+ /* update the step bound. */
+ if (ratio <= Scalar(.25)) {
+ if (actred >= 0.)
+ temp = RealScalar(.5);
+ if (actred < 0.)
+ temp = RealScalar(.5) * dirder / (dirder + RealScalar(.5) * actred);
+ if (RealScalar(.1) * fnorm1 >= m_fnorm || temp < RealScalar(.1))
+ temp = Scalar(.1);
+ /* Computing MIN */
+ m_delta = temp * (std::min)(m_delta, pnorm / RealScalar(.1));
+ m_par /= temp;
+ } else if (!(m_par != 0. && ratio < RealScalar(.75))) {
+ m_delta = pnorm / RealScalar(.5);
+ m_par = RealScalar(.5) * m_par;
+ }
+
+ /* test for successful iteration. */
+ if (ratio >= RealScalar(1e-4)) {
+ /* successful iteration. update x, m_fvec, and their norms. */
+ x = m_wa2;
+ m_wa2 = m_diag.cwiseProduct(x);
+ m_fvec = m_wa4;
+ xnorm = m_wa2.stableNorm();
+ m_fnorm = fnorm1;
+ ++m_iter;
+ }
+
+ /* tests for convergence. */
+ if (abs(actred) <= m_ftol && prered <= m_ftol && Scalar(.5) * ratio <= 1. && m_delta <= m_xtol * xnorm)
+ {
+ m_info = Success;
+ return LevenbergMarquardtSpace::RelativeErrorAndReductionTooSmall;
+ }
+ if (abs(actred) <= m_ftol && prered <= m_ftol && Scalar(.5) * ratio <= 1.)
+ {
+ m_info = Success;
+ return LevenbergMarquardtSpace::RelativeReductionTooSmall;
+ }
+ if (m_delta <= m_xtol * xnorm)
+ {
+ m_info = Success;
+ return LevenbergMarquardtSpace::RelativeErrorTooSmall;
+ }
+
+ /* tests for termination and stringent tolerances. */
+ if (m_nfev >= m_maxfev)
+ {
+ m_info = NoConvergence;
+ return LevenbergMarquardtSpace::TooManyFunctionEvaluation;
+ }
+ if (abs(actred) <= NumTraits<Scalar>::epsilon() && prered <= NumTraits<Scalar>::epsilon() && Scalar(.5) * ratio <= 1.)
+ {
+ m_info = Success;
+ return LevenbergMarquardtSpace::FtolTooSmall;
+ }
+ if (m_delta <= NumTraits<Scalar>::epsilon() * xnorm)
+ {
+ m_info = Success;
+ return LevenbergMarquardtSpace::XtolTooSmall;
+ }
+ if (m_gnorm <= NumTraits<Scalar>::epsilon())
+ {
+ m_info = Success;
+ return LevenbergMarquardtSpace::GtolTooSmall;
+ }
+
+ } while (ratio < Scalar(1e-4));
+
+ return LevenbergMarquardtSpace::Running;
+}
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_LMONESTEP_H
diff --git a/src/EigenUnsupported/src/LevenbergMarquardt/LMpar.h b/src/EigenUnsupported/src/LevenbergMarquardt/LMpar.h
new file mode 100644
index 0000000..9a48365
--- /dev/null
+++ b/src/EigenUnsupported/src/LevenbergMarquardt/LMpar.h
@@ -0,0 +1,160 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This code initially comes from MINPACK whose original authors are:
+// Copyright Jorge More - Argonne National Laboratory
+// Copyright Burt Garbow - Argonne National Laboratory
+// Copyright Ken Hillstrom - Argonne National Laboratory
+//
+// This Source Code Form is subject to the terms of the Minpack license
+// (a BSD-like license) described in the campaigned CopyrightMINPACK.txt file.
+
+#ifndef EIGEN_LMPAR_H
+#define EIGEN_LMPAR_H
+
+namespace Eigen {
+
+namespace internal {
+
+ template <typename QRSolver, typename VectorType>
+ void lmpar2(
+ const QRSolver &qr,
+ const VectorType &diag,
+ const VectorType &qtb,
+ typename VectorType::Scalar m_delta,
+ typename VectorType::Scalar &par,
+ VectorType &x)
+
+ {
+ using std::sqrt;
+ using std::abs;
+ typedef typename QRSolver::MatrixType MatrixType;
+ typedef typename QRSolver::Scalar Scalar;
+// typedef typename QRSolver::StorageIndex StorageIndex;
+
+ /* Local variables */
+ Index j;
+ Scalar fp;
+ Scalar parc, parl;
+ Index iter;
+ Scalar temp, paru;
+ Scalar gnorm;
+ Scalar dxnorm;
+
+ // Make a copy of the triangular factor.
+ // This copy is modified during call the qrsolv
+ MatrixType s;
+ s = qr.matrixR();
+
+ /* Function Body */
+ const Scalar dwarf = (std::numeric_limits<Scalar>::min)();
+ const Index n = qr.matrixR().cols();
+ eigen_assert(n==diag.size());
+ eigen_assert(n==qtb.size());
+
+ VectorType wa1, wa2;
+
+ /* compute and store in x the gauss-newton direction. if the */
+ /* jacobian is rank-deficient, obtain a least squares solution. */
+
+ // const Index rank = qr.nonzeroPivots(); // exactly double(0.)
+ const Index rank = qr.rank(); // use a threshold
+ wa1 = qtb;
+ wa1.tail(n-rank).setZero();
+ //FIXME There is no solve in place for sparse triangularView
+ wa1.head(rank) = s.topLeftCorner(rank,rank).template triangularView<Upper>().solve(qtb.head(rank));
+
+ x = qr.colsPermutation()*wa1;
+
+ /* initialize the iteration counter. */
+ /* evaluate the function at the origin, and test */
+ /* for acceptance of the gauss-newton direction. */
+ iter = 0;
+ wa2 = diag.cwiseProduct(x);
+ dxnorm = wa2.blueNorm();
+ fp = dxnorm - m_delta;
+ if (fp <= Scalar(0.1) * m_delta) {
+ par = 0;
+ return;
+ }
+
+ /* if the jacobian is not rank deficient, the newton */
+ /* step provides a lower bound, parl, for the zero of */
+ /* the function. otherwise set this bound to zero. */
+ parl = 0.;
+ if (rank==n) {
+ wa1 = qr.colsPermutation().inverse() * diag.cwiseProduct(wa2)/dxnorm;
+ s.topLeftCorner(n,n).transpose().template triangularView<Lower>().solveInPlace(wa1);
+ temp = wa1.blueNorm();
+ parl = fp / m_delta / temp / temp;
+ }
+
+ /* calculate an upper bound, paru, for the zero of the function. */
+ for (j = 0; j < n; ++j)
+ wa1[j] = s.col(j).head(j+1).dot(qtb.head(j+1)) / diag[qr.colsPermutation().indices()(j)];
+
+ gnorm = wa1.stableNorm();
+ paru = gnorm / m_delta;
+ if (paru == 0.)
+ paru = dwarf / (std::min)(m_delta,Scalar(0.1));
+
+ /* if the input par lies outside of the interval (parl,paru), */
+ /* set par to the closer endpoint. */
+ par = (std::max)(par,parl);
+ par = (std::min)(par,paru);
+ if (par == 0.)
+ par = gnorm / dxnorm;
+
+ /* beginning of an iteration. */
+ while (true) {
+ ++iter;
+
+ /* evaluate the function at the current value of par. */
+ if (par == 0.)
+ par = (std::max)(dwarf,Scalar(.001) * paru); /* Computing MAX */
+ wa1 = sqrt(par)* diag;
+
+ VectorType sdiag(n);
+ lmqrsolv(s, qr.colsPermutation(), wa1, qtb, x, sdiag);
+
+ wa2 = diag.cwiseProduct(x);
+ dxnorm = wa2.blueNorm();
+ temp = fp;
+ fp = dxnorm - m_delta;
+
+ /* if the function is small enough, accept the current value */
+ /* of par. also test for the exceptional cases where parl */
+ /* is zero or the number of iterations has reached 10. */
+ if (abs(fp) <= Scalar(0.1) * m_delta || (parl == 0. && fp <= temp && temp < 0.) || iter == 10)
+ break;
+
+ /* compute the newton correction. */
+ wa1 = qr.colsPermutation().inverse() * diag.cwiseProduct(wa2/dxnorm);
+ // we could almost use this here, but the diagonal is outside qr, in sdiag[]
+ for (j = 0; j < n; ++j) {
+ wa1[j] /= sdiag[j];
+ temp = wa1[j];
+ for (Index i = j+1; i < n; ++i)
+ wa1[i] -= s.coeff(i,j) * temp;
+ }
+ temp = wa1.blueNorm();
+ parc = fp / m_delta / temp / temp;
+
+ /* depending on the sign of the function, update parl or paru. */
+ if (fp > 0.)
+ parl = (std::max)(parl,par);
+ if (fp < 0.)
+ paru = (std::min)(paru,par);
+
+ /* compute an improved estimate for par. */
+ par = (std::max)(parl,par+parc);
+ }
+ if (iter == 0)
+ par = 0.;
+ return;
+ }
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_LMPAR_H
diff --git a/src/EigenUnsupported/src/LevenbergMarquardt/LMqrsolv.h b/src/EigenUnsupported/src/LevenbergMarquardt/LMqrsolv.h
new file mode 100644
index 0000000..1234858
--- /dev/null
+++ b/src/EigenUnsupported/src/LevenbergMarquardt/LMqrsolv.h
@@ -0,0 +1,188 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Thomas Capricelli <orzel@freehackers.org>
+// Copyright (C) 2012 Desire Nuentsa <desire.nuentsa_wakam@inria.fr>
+//
+// This code initially comes from MINPACK whose original authors are:
+// Copyright Jorge More - Argonne National Laboratory
+// Copyright Burt Garbow - Argonne National Laboratory
+// Copyright Ken Hillstrom - Argonne National Laboratory
+//
+// This Source Code Form is subject to the terms of the Minpack license
+// (a BSD-like license) described in the campaigned CopyrightMINPACK.txt file.
+
+#ifndef EIGEN_LMQRSOLV_H
+#define EIGEN_LMQRSOLV_H
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename Scalar,int Rows, int Cols, typename PermIndex>
+void lmqrsolv(
+ Matrix<Scalar,Rows,Cols> &s,
+ const PermutationMatrix<Dynamic,Dynamic,PermIndex> &iPerm,
+ const Matrix<Scalar,Dynamic,1> &diag,
+ const Matrix<Scalar,Dynamic,1> &qtb,
+ Matrix<Scalar,Dynamic,1> &x,
+ Matrix<Scalar,Dynamic,1> &sdiag)
+{
+ /* Local variables */
+ Index i, j, k;
+ Scalar temp;
+ Index n = s.cols();
+ Matrix<Scalar,Dynamic,1> wa(n);
+ JacobiRotation<Scalar> givens;
+
+ /* Function Body */
+ // the following will only change the lower triangular part of s, including
+ // the diagonal, though the diagonal is restored afterward
+
+ /* copy r and (q transpose)*b to preserve input and initialize s. */
+ /* in particular, save the diagonal elements of r in x. */
+ x = s.diagonal();
+ wa = qtb;
+
+
+ s.topLeftCorner(n,n).template triangularView<StrictlyLower>() = s.topLeftCorner(n,n).transpose();
+ /* eliminate the diagonal matrix d using a givens rotation. */
+ for (j = 0; j < n; ++j) {
+
+ /* prepare the row of d to be eliminated, locating the */
+ /* diagonal element using p from the qr factorization. */
+ const PermIndex l = iPerm.indices()(j);
+ if (diag[l] == 0.)
+ break;
+ sdiag.tail(n-j).setZero();
+ sdiag[j] = diag[l];
+
+ /* the transformations to eliminate the row of d */
+ /* modify only a single element of (q transpose)*b */
+ /* beyond the first n, which is initially zero. */
+ Scalar qtbpj = 0.;
+ for (k = j; k < n; ++k) {
+ /* determine a givens rotation which eliminates the */
+ /* appropriate element in the current row of d. */
+ givens.makeGivens(-s(k,k), sdiag[k]);
+
+ /* compute the modified diagonal element of r and */
+ /* the modified element of ((q transpose)*b,0). */
+ s(k,k) = givens.c() * s(k,k) + givens.s() * sdiag[k];
+ temp = givens.c() * wa[k] + givens.s() * qtbpj;
+ qtbpj = -givens.s() * wa[k] + givens.c() * qtbpj;
+ wa[k] = temp;
+
+ /* accumulate the transformation in the row of s. */
+ for (i = k+1; i<n; ++i) {
+ temp = givens.c() * s(i,k) + givens.s() * sdiag[i];
+ sdiag[i] = -givens.s() * s(i,k) + givens.c() * sdiag[i];
+ s(i,k) = temp;
+ }
+ }
+ }
+
+ /* solve the triangular system for z. if the system is */
+ /* singular, then obtain a least squares solution. */
+ Index nsing;
+ for(nsing=0; nsing<n && sdiag[nsing]!=0; nsing++) {}
+
+ wa.tail(n-nsing).setZero();
+ s.topLeftCorner(nsing, nsing).transpose().template triangularView<Upper>().solveInPlace(wa.head(nsing));
+
+ // restore
+ sdiag = s.diagonal();
+ s.diagonal() = x;
+
+ /* permute the components of z back to components of x. */
+ x = iPerm * wa;
+}
+
+template <typename Scalar, int _Options, typename Index>
+void lmqrsolv(
+ SparseMatrix<Scalar,_Options,Index> &s,
+ const PermutationMatrix<Dynamic,Dynamic> &iPerm,
+ const Matrix<Scalar,Dynamic,1> &diag,
+ const Matrix<Scalar,Dynamic,1> &qtb,
+ Matrix<Scalar,Dynamic,1> &x,
+ Matrix<Scalar,Dynamic,1> &sdiag)
+{
+ /* Local variables */
+ typedef SparseMatrix<Scalar,RowMajor,Index> FactorType;
+ Index i, j, k, l;
+ Scalar temp;
+ Index n = s.cols();
+ Matrix<Scalar,Dynamic,1> wa(n);
+ JacobiRotation<Scalar> givens;
+
+ /* Function Body */
+ // the following will only change the lower triangular part of s, including
+ // the diagonal, though the diagonal is restored afterward
+
+ /* copy r and (q transpose)*b to preserve input and initialize R. */
+ wa = qtb;
+ FactorType R(s);
+ // Eliminate the diagonal matrix d using a givens rotation
+ for (j = 0; j < n; ++j)
+ {
+ // Prepare the row of d to be eliminated, locating the
+ // diagonal element using p from the qr factorization
+ l = iPerm.indices()(j);
+ if (diag(l) == Scalar(0))
+ break;
+ sdiag.tail(n-j).setZero();
+ sdiag[j] = diag[l];
+ // the transformations to eliminate the row of d
+ // modify only a single element of (q transpose)*b
+ // beyond the first n, which is initially zero.
+
+ Scalar qtbpj = 0;
+ // Browse the nonzero elements of row j of the upper triangular s
+ for (k = j; k < n; ++k)
+ {
+ typename FactorType::InnerIterator itk(R,k);
+ for (; itk; ++itk){
+ if (itk.index() < k) continue;
+ else break;
+ }
+ //At this point, we have the diagonal element R(k,k)
+ // Determine a givens rotation which eliminates
+ // the appropriate element in the current row of d
+ givens.makeGivens(-itk.value(), sdiag(k));
+
+ // Compute the modified diagonal element of r and
+ // the modified element of ((q transpose)*b,0).
+ itk.valueRef() = givens.c() * itk.value() + givens.s() * sdiag(k);
+ temp = givens.c() * wa(k) + givens.s() * qtbpj;
+ qtbpj = -givens.s() * wa(k) + givens.c() * qtbpj;
+ wa(k) = temp;
+
+ // Accumulate the transformation in the remaining k row/column of R
+ for (++itk; itk; ++itk)
+ {
+ i = itk.index();
+ temp = givens.c() * itk.value() + givens.s() * sdiag(i);
+ sdiag(i) = -givens.s() * itk.value() + givens.c() * sdiag(i);
+ itk.valueRef() = temp;
+ }
+ }
+ }
+
+ // Solve the triangular system for z. If the system is
+ // singular, then obtain a least squares solution
+ Index nsing;
+ for(nsing = 0; nsing<n && sdiag(nsing) !=0; nsing++) {}
+
+ wa.tail(n-nsing).setZero();
+// x = wa;
+ wa.head(nsing) = R.topLeftCorner(nsing,nsing).template triangularView<Upper>().solve/*InPlace*/(wa.head(nsing));
+
+ sdiag = R.diagonal();
+ // Permute the components of z back to components of x
+ x = iPerm * wa;
+}
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_LMQRSOLV_H
diff --git a/src/EigenUnsupported/src/LevenbergMarquardt/LevenbergMarquardt.h b/src/EigenUnsupported/src/LevenbergMarquardt/LevenbergMarquardt.h
new file mode 100644
index 0000000..62561da
--- /dev/null
+++ b/src/EigenUnsupported/src/LevenbergMarquardt/LevenbergMarquardt.h
@@ -0,0 +1,396 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Thomas Capricelli <orzel@freehackers.org>
+// Copyright (C) 2012 Desire Nuentsa <desire.nuentsa_wakam@inria.fr>
+//
+// The algorithm of this class initially comes from MINPACK whose original authors are:
+// Copyright Jorge More - Argonne National Laboratory
+// Copyright Burt Garbow - Argonne National Laboratory
+// Copyright Ken Hillstrom - Argonne National Laboratory
+//
+// This Source Code Form is subject to the terms of the Minpack license
+// (a BSD-like license) described in the campaigned CopyrightMINPACK.txt file.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_LEVENBERGMARQUARDT_H
+#define EIGEN_LEVENBERGMARQUARDT_H
+
+
+namespace Eigen {
+namespace LevenbergMarquardtSpace {
+ enum Status {
+ NotStarted = -2,
+ Running = -1,
+ ImproperInputParameters = 0,
+ RelativeReductionTooSmall = 1,
+ RelativeErrorTooSmall = 2,
+ RelativeErrorAndReductionTooSmall = 3,
+ CosinusTooSmall = 4,
+ TooManyFunctionEvaluation = 5,
+ FtolTooSmall = 6,
+ XtolTooSmall = 7,
+ GtolTooSmall = 8,
+ UserAsked = 9
+ };
+}
+
+template <typename _Scalar, int NX=Dynamic, int NY=Dynamic>
+struct DenseFunctor
+{
+ typedef _Scalar Scalar;
+ enum {
+ InputsAtCompileTime = NX,
+ ValuesAtCompileTime = NY
+ };
+ typedef Matrix<Scalar,InputsAtCompileTime,1> InputType;
+ typedef Matrix<Scalar,ValuesAtCompileTime,1> ValueType;
+ typedef Matrix<Scalar,ValuesAtCompileTime,InputsAtCompileTime> JacobianType;
+ typedef ColPivHouseholderQR<JacobianType> QRSolver;
+ const int m_inputs, m_values;
+
+ DenseFunctor() : m_inputs(InputsAtCompileTime), m_values(ValuesAtCompileTime) {}
+ DenseFunctor(int inputs, int values) : m_inputs(inputs), m_values(values) {}
+
+ int inputs() const { return m_inputs; }
+ int values() const { return m_values; }
+
+ //int operator()(const InputType &x, ValueType& fvec) { }
+ // should be defined in derived classes
+
+ //int df(const InputType &x, JacobianType& fjac) { }
+ // should be defined in derived classes
+};
+
+template <typename _Scalar, typename _Index>
+struct SparseFunctor
+{
+ typedef _Scalar Scalar;
+ typedef _Index Index;
+ typedef Matrix<Scalar,Dynamic,1> InputType;
+ typedef Matrix<Scalar,Dynamic,1> ValueType;
+ typedef SparseMatrix<Scalar, ColMajor, Index> JacobianType;
+ typedef SparseQR<JacobianType, COLAMDOrdering<int> > QRSolver;
+ enum {
+ InputsAtCompileTime = Dynamic,
+ ValuesAtCompileTime = Dynamic
+ };
+
+ SparseFunctor(int inputs, int values) : m_inputs(inputs), m_values(values) {}
+
+ int inputs() const { return m_inputs; }
+ int values() const { return m_values; }
+
+ const int m_inputs, m_values;
+ //int operator()(const InputType &x, ValueType& fvec) { }
+ // to be defined in the functor
+
+ //int df(const InputType &x, JacobianType& fjac) { }
+ // to be defined in the functor if no automatic differentiation
+
+};
+namespace internal {
+template <typename QRSolver, typename VectorType>
+void lmpar2(const QRSolver &qr, const VectorType &diag, const VectorType &qtb,
+ typename VectorType::Scalar m_delta, typename VectorType::Scalar &par,
+ VectorType &x);
+ }
+/**
+ * \ingroup NonLinearOptimization_Module
+ * \brief Performs non linear optimization over a non-linear function,
+ * using a variant of the Levenberg Marquardt algorithm.
+ *
+ * Check wikipedia for more information.
+ * http://en.wikipedia.org/wiki/Levenberg%E2%80%93Marquardt_algorithm
+ */
+template<typename _FunctorType>
+class LevenbergMarquardt : internal::no_assignment_operator
+{
+ public:
+ typedef _FunctorType FunctorType;
+ typedef typename FunctorType::QRSolver QRSolver;
+ typedef typename FunctorType::JacobianType JacobianType;
+ typedef typename JacobianType::Scalar Scalar;
+ typedef typename JacobianType::RealScalar RealScalar;
+ typedef typename QRSolver::StorageIndex PermIndex;
+ typedef Matrix<Scalar,Dynamic,1> FVectorType;
+ typedef PermutationMatrix<Dynamic,Dynamic,int> PermutationType;
+ public:
+ LevenbergMarquardt(FunctorType& functor)
+ : m_functor(functor),m_nfev(0),m_njev(0),m_fnorm(0.0),m_gnorm(0),
+ m_isInitialized(false),m_info(InvalidInput)
+ {
+ resetParameters();
+ m_useExternalScaling=false;
+ }
+
+ LevenbergMarquardtSpace::Status minimize(FVectorType &x);
+ LevenbergMarquardtSpace::Status minimizeInit(FVectorType &x);
+ LevenbergMarquardtSpace::Status minimizeOneStep(FVectorType &x);
+ LevenbergMarquardtSpace::Status lmder1(
+ FVectorType &x,
+ const Scalar tol = std::sqrt(NumTraits<Scalar>::epsilon())
+ );
+ static LevenbergMarquardtSpace::Status lmdif1(
+ FunctorType &functor,
+ FVectorType &x,
+ Index *nfev,
+ const Scalar tol = std::sqrt(NumTraits<Scalar>::epsilon())
+ );
+
+ /** Sets the default parameters */
+ void resetParameters()
+ {
+ using std::sqrt;
+
+ m_factor = 100.;
+ m_maxfev = 400;
+ m_ftol = sqrt(NumTraits<RealScalar>::epsilon());
+ m_xtol = sqrt(NumTraits<RealScalar>::epsilon());
+ m_gtol = 0. ;
+ m_epsfcn = 0. ;
+ }
+
+ /** Sets the tolerance for the norm of the solution vector*/
+ void setXtol(RealScalar xtol) { m_xtol = xtol; }
+
+ /** Sets the tolerance for the norm of the vector function*/
+ void setFtol(RealScalar ftol) { m_ftol = ftol; }
+
+ /** Sets the tolerance for the norm of the gradient of the error vector*/
+ void setGtol(RealScalar gtol) { m_gtol = gtol; }
+
+ /** Sets the step bound for the diagonal shift */
+ void setFactor(RealScalar factor) { m_factor = factor; }
+
+ /** Sets the error precision */
+ void setEpsilon (RealScalar epsfcn) { m_epsfcn = epsfcn; }
+
+ /** Sets the maximum number of function evaluation */
+ void setMaxfev(Index maxfev) {m_maxfev = maxfev; }
+
+ /** Use an external Scaling. If set to true, pass a nonzero diagonal to diag() */
+ void setExternalScaling(bool value) {m_useExternalScaling = value; }
+
+ /** \returns the tolerance for the norm of the solution vector */
+ RealScalar xtol() const {return m_xtol; }
+
+ /** \returns the tolerance for the norm of the vector function */
+ RealScalar ftol() const {return m_ftol; }
+
+ /** \returns the tolerance for the norm of the gradient of the error vector */
+ RealScalar gtol() const {return m_gtol; }
+
+ /** \returns the step bound for the diagonal shift */
+ RealScalar factor() const {return m_factor; }
+
+ /** \returns the error precision */
+ RealScalar epsilon() const {return m_epsfcn; }
+
+ /** \returns the maximum number of function evaluation */
+ Index maxfev() const {return m_maxfev; }
+
+ /** \returns a reference to the diagonal of the jacobian */
+ FVectorType& diag() {return m_diag; }
+
+ /** \returns the number of iterations performed */
+ Index iterations() { return m_iter; }
+
+ /** \returns the number of functions evaluation */
+ Index nfev() { return m_nfev; }
+
+ /** \returns the number of jacobian evaluation */
+ Index njev() { return m_njev; }
+
+ /** \returns the norm of current vector function */
+ RealScalar fnorm() {return m_fnorm; }
+
+ /** \returns the norm of the gradient of the error */
+ RealScalar gnorm() {return m_gnorm; }
+
+ /** \returns the LevenbergMarquardt parameter */
+ RealScalar lm_param(void) { return m_par; }
+
+ /** \returns a reference to the current vector function
+ */
+ FVectorType& fvec() {return m_fvec; }
+
+ /** \returns a reference to the matrix where the current Jacobian matrix is stored
+ */
+ JacobianType& jacobian() {return m_fjac; }
+
+ /** \returns a reference to the triangular matrix R from the QR of the jacobian matrix.
+ * \sa jacobian()
+ */
+ JacobianType& matrixR() {return m_rfactor; }
+
+ /** the permutation used in the QR factorization
+ */
+ PermutationType permutation() {return m_permutation; }
+
+ /**
+ * \brief Reports whether the minimization was successful
+ * \returns \c Success if the minimization was successful,
+ * \c NumericalIssue if a numerical problem arises during the
+ * minimization process, for example during the QR factorization
+ * \c NoConvergence if the minimization did not converge after
+ * the maximum number of function evaluation allowed
+ * \c InvalidInput if the input matrix is invalid
+ */
+ ComputationInfo info() const
+ {
+
+ return m_info;
+ }
+ private:
+ JacobianType m_fjac;
+ JacobianType m_rfactor; // The triangular matrix R from the QR of the jacobian matrix m_fjac
+ FunctorType &m_functor;
+ FVectorType m_fvec, m_qtf, m_diag;
+ Index n;
+ Index m;
+ Index m_nfev;
+ Index m_njev;
+ RealScalar m_fnorm; // Norm of the current vector function
+ RealScalar m_gnorm; //Norm of the gradient of the error
+ RealScalar m_factor; //
+ Index m_maxfev; // Maximum number of function evaluation
+ RealScalar m_ftol; //Tolerance in the norm of the vector function
+ RealScalar m_xtol; //
+ RealScalar m_gtol; //tolerance of the norm of the error gradient
+ RealScalar m_epsfcn; //
+ Index m_iter; // Number of iterations performed
+ RealScalar m_delta;
+ bool m_useExternalScaling;
+ PermutationType m_permutation;
+ FVectorType m_wa1, m_wa2, m_wa3, m_wa4; //Temporary vectors
+ RealScalar m_par;
+ bool m_isInitialized; // Check whether the minimization step has been called
+ ComputationInfo m_info;
+};
+
+template<typename FunctorType>
+LevenbergMarquardtSpace::Status
+LevenbergMarquardt<FunctorType>::minimize(FVectorType &x)
+{
+ LevenbergMarquardtSpace::Status status = minimizeInit(x);
+ if (status==LevenbergMarquardtSpace::ImproperInputParameters) {
+ m_isInitialized = true;
+ return status;
+ }
+ do {
+// std::cout << " uv " << x.transpose() << "\n";
+ status = minimizeOneStep(x);
+ } while (status==LevenbergMarquardtSpace::Running);
+ m_isInitialized = true;
+ return status;
+}
+
+template<typename FunctorType>
+LevenbergMarquardtSpace::Status
+LevenbergMarquardt<FunctorType>::minimizeInit(FVectorType &x)
+{
+ n = x.size();
+ m = m_functor.values();
+
+ m_wa1.resize(n); m_wa2.resize(n); m_wa3.resize(n);
+ m_wa4.resize(m);
+ m_fvec.resize(m);
+ //FIXME Sparse Case : Allocate space for the jacobian
+ m_fjac.resize(m, n);
+// m_fjac.reserve(VectorXi::Constant(n,5)); // FIXME Find a better alternative
+ if (!m_useExternalScaling)
+ m_diag.resize(n);
+ eigen_assert( (!m_useExternalScaling || m_diag.size()==n) && "When m_useExternalScaling is set, the caller must provide a valid 'm_diag'");
+ m_qtf.resize(n);
+
+ /* Function Body */
+ m_nfev = 0;
+ m_njev = 0;
+
+ /* check the input parameters for errors. */
+ if (n <= 0 || m < n || m_ftol < 0. || m_xtol < 0. || m_gtol < 0. || m_maxfev <= 0 || m_factor <= 0.){
+ m_info = InvalidInput;
+ return LevenbergMarquardtSpace::ImproperInputParameters;
+ }
+
+ if (m_useExternalScaling)
+ for (Index j = 0; j < n; ++j)
+ if (m_diag[j] <= 0.)
+ {
+ m_info = InvalidInput;
+ return LevenbergMarquardtSpace::ImproperInputParameters;
+ }
+
+ /* evaluate the function at the starting point */
+ /* and calculate its norm. */
+ m_nfev = 1;
+ if ( m_functor(x, m_fvec) < 0)
+ return LevenbergMarquardtSpace::UserAsked;
+ m_fnorm = m_fvec.stableNorm();
+
+ /* initialize levenberg-marquardt parameter and iteration counter. */
+ m_par = 0.;
+ m_iter = 1;
+
+ return LevenbergMarquardtSpace::NotStarted;
+}
+
+template<typename FunctorType>
+LevenbergMarquardtSpace::Status
+LevenbergMarquardt<FunctorType>::lmder1(
+ FVectorType &x,
+ const Scalar tol
+ )
+{
+ n = x.size();
+ m = m_functor.values();
+
+ /* check the input parameters for errors. */
+ if (n <= 0 || m < n || tol < 0.)
+ return LevenbergMarquardtSpace::ImproperInputParameters;
+
+ resetParameters();
+ m_ftol = tol;
+ m_xtol = tol;
+ m_maxfev = 100*(n+1);
+
+ return minimize(x);
+}
+
+
+template<typename FunctorType>
+LevenbergMarquardtSpace::Status
+LevenbergMarquardt<FunctorType>::lmdif1(
+ FunctorType &functor,
+ FVectorType &x,
+ Index *nfev,
+ const Scalar tol
+ )
+{
+ Index n = x.size();
+ Index m = functor.values();
+
+ /* check the input parameters for errors. */
+ if (n <= 0 || m < n || tol < 0.)
+ return LevenbergMarquardtSpace::ImproperInputParameters;
+
+ NumericalDiff<FunctorType> numDiff(functor);
+ // embedded LevenbergMarquardt
+ LevenbergMarquardt<NumericalDiff<FunctorType> > lm(numDiff);
+ lm.setFtol(tol);
+ lm.setXtol(tol);
+ lm.setMaxfev(200*(n+1));
+
+ LevenbergMarquardtSpace::Status info = LevenbergMarquardtSpace::Status(lm.minimize(x));
+ if (nfev)
+ * nfev = lm.nfev();
+ return info;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_LEVENBERGMARQUARDT_H
diff --git a/src/EigenUnsupported/src/MatrixFunctions/MatrixExponential.h b/src/EigenUnsupported/src/MatrixFunctions/MatrixExponential.h
new file mode 100644
index 0000000..02284b0
--- /dev/null
+++ b/src/EigenUnsupported/src/MatrixFunctions/MatrixExponential.h
@@ -0,0 +1,441 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009, 2010, 2013 Jitse Niesen <jitse@maths.leeds.ac.uk>
+// Copyright (C) 2011, 2013 Chen-Pang He <jdh8@ms63.hinet.net>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATRIX_EXPONENTIAL
+#define EIGEN_MATRIX_EXPONENTIAL
+
+#include "StemFunction.h"
+
+namespace Eigen {
+namespace internal {
+
+/** \brief Scaling operator.
+ *
+ * This struct is used by CwiseUnaryOp to scale a matrix by \f$ 2^{-s} \f$.
+ */
+template <typename RealScalar>
+struct MatrixExponentialScalingOp
+{
+ /** \brief Constructor.
+ *
+ * \param[in] squarings The integer \f$ s \f$ in this document.
+ */
+ MatrixExponentialScalingOp(int squarings) : m_squarings(squarings) { }
+
+
+ /** \brief Scale a matrix coefficient.
+ *
+ * \param[in,out] x The scalar to be scaled, becoming \f$ 2^{-s} x \f$.
+ */
+ inline const RealScalar operator() (const RealScalar& x) const
+ {
+ using std::ldexp;
+ return ldexp(x, -m_squarings);
+ }
+
+ typedef std::complex<RealScalar> ComplexScalar;
+
+ /** \brief Scale a matrix coefficient.
+ *
+ * \param[in,out] x The scalar to be scaled, becoming \f$ 2^{-s} x \f$.
+ */
+ inline const ComplexScalar operator() (const ComplexScalar& x) const
+ {
+ using std::ldexp;
+ return ComplexScalar(ldexp(x.real(), -m_squarings), ldexp(x.imag(), -m_squarings));
+ }
+
+ private:
+ int m_squarings;
+};
+
+/** \brief Compute the (3,3)-Pad&eacute; approximant to the exponential.
+ *
+ * After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Pad&eacute;
+ * approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
+ */
+template <typename MatA, typename MatU, typename MatV>
+void matrix_exp_pade3(const MatA& A, MatU& U, MatV& V)
+{
+ typedef typename MatA::PlainObject MatrixType;
+ typedef typename NumTraits<typename traits<MatA>::Scalar>::Real RealScalar;
+ const RealScalar b[] = {120.L, 60.L, 12.L, 1.L};
+ const MatrixType A2 = A * A;
+ const MatrixType tmp = b[3] * A2 + b[1] * MatrixType::Identity(A.rows(), A.cols());
+ U.noalias() = A * tmp;
+ V = b[2] * A2 + b[0] * MatrixType::Identity(A.rows(), A.cols());
+}
+
+/** \brief Compute the (5,5)-Pad&eacute; approximant to the exponential.
+ *
+ * After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Pad&eacute;
+ * approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
+ */
+template <typename MatA, typename MatU, typename MatV>
+void matrix_exp_pade5(const MatA& A, MatU& U, MatV& V)
+{
+ typedef typename MatA::PlainObject MatrixType;
+ typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar;
+ const RealScalar b[] = {30240.L, 15120.L, 3360.L, 420.L, 30.L, 1.L};
+ const MatrixType A2 = A * A;
+ const MatrixType A4 = A2 * A2;
+ const MatrixType tmp = b[5] * A4 + b[3] * A2 + b[1] * MatrixType::Identity(A.rows(), A.cols());
+ U.noalias() = A * tmp;
+ V = b[4] * A4 + b[2] * A2 + b[0] * MatrixType::Identity(A.rows(), A.cols());
+}
+
+/** \brief Compute the (7,7)-Pad&eacute; approximant to the exponential.
+ *
+ * After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Pad&eacute;
+ * approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
+ */
+template <typename MatA, typename MatU, typename MatV>
+void matrix_exp_pade7(const MatA& A, MatU& U, MatV& V)
+{
+ typedef typename MatA::PlainObject MatrixType;
+ typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar;
+ const RealScalar b[] = {17297280.L, 8648640.L, 1995840.L, 277200.L, 25200.L, 1512.L, 56.L, 1.L};
+ const MatrixType A2 = A * A;
+ const MatrixType A4 = A2 * A2;
+ const MatrixType A6 = A4 * A2;
+ const MatrixType tmp = b[7] * A6 + b[5] * A4 + b[3] * A2
+ + b[1] * MatrixType::Identity(A.rows(), A.cols());
+ U.noalias() = A * tmp;
+ V = b[6] * A6 + b[4] * A4 + b[2] * A2 + b[0] * MatrixType::Identity(A.rows(), A.cols());
+
+}
+
+/** \brief Compute the (9,9)-Pad&eacute; approximant to the exponential.
+ *
+ * After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Pad&eacute;
+ * approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
+ */
+template <typename MatA, typename MatU, typename MatV>
+void matrix_exp_pade9(const MatA& A, MatU& U, MatV& V)
+{
+ typedef typename MatA::PlainObject MatrixType;
+ typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar;
+ const RealScalar b[] = {17643225600.L, 8821612800.L, 2075673600.L, 302702400.L, 30270240.L,
+ 2162160.L, 110880.L, 3960.L, 90.L, 1.L};
+ const MatrixType A2 = A * A;
+ const MatrixType A4 = A2 * A2;
+ const MatrixType A6 = A4 * A2;
+ const MatrixType A8 = A6 * A2;
+ const MatrixType tmp = b[9] * A8 + b[7] * A6 + b[5] * A4 + b[3] * A2
+ + b[1] * MatrixType::Identity(A.rows(), A.cols());
+ U.noalias() = A * tmp;
+ V = b[8] * A8 + b[6] * A6 + b[4] * A4 + b[2] * A2 + b[0] * MatrixType::Identity(A.rows(), A.cols());
+}
+
+/** \brief Compute the (13,13)-Pad&eacute; approximant to the exponential.
+ *
+ * After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Pad&eacute;
+ * approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
+ */
+template <typename MatA, typename MatU, typename MatV>
+void matrix_exp_pade13(const MatA& A, MatU& U, MatV& V)
+{
+ typedef typename MatA::PlainObject MatrixType;
+ typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar;
+ const RealScalar b[] = {64764752532480000.L, 32382376266240000.L, 7771770303897600.L,
+ 1187353796428800.L, 129060195264000.L, 10559470521600.L, 670442572800.L,
+ 33522128640.L, 1323241920.L, 40840800.L, 960960.L, 16380.L, 182.L, 1.L};
+ const MatrixType A2 = A * A;
+ const MatrixType A4 = A2 * A2;
+ const MatrixType A6 = A4 * A2;
+ V = b[13] * A6 + b[11] * A4 + b[9] * A2; // used for temporary storage
+ MatrixType tmp = A6 * V;
+ tmp += b[7] * A6 + b[5] * A4 + b[3] * A2 + b[1] * MatrixType::Identity(A.rows(), A.cols());
+ U.noalias() = A * tmp;
+ tmp = b[12] * A6 + b[10] * A4 + b[8] * A2;
+ V.noalias() = A6 * tmp;
+ V += b[6] * A6 + b[4] * A4 + b[2] * A2 + b[0] * MatrixType::Identity(A.rows(), A.cols());
+}
+
+/** \brief Compute the (17,17)-Pad&eacute; approximant to the exponential.
+ *
+ * After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Pad&eacute;
+ * approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
+ *
+ * This function activates only if your long double is double-double or quadruple.
+ */
+#if LDBL_MANT_DIG > 64
+template <typename MatA, typename MatU, typename MatV>
+void matrix_exp_pade17(const MatA& A, MatU& U, MatV& V)
+{
+ typedef typename MatA::PlainObject MatrixType;
+ typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar;
+ const RealScalar b[] = {830034394580628357120000.L, 415017197290314178560000.L,
+ 100610229646136770560000.L, 15720348382208870400000.L,
+ 1774878043152614400000.L, 153822763739893248000.L, 10608466464820224000.L,
+ 595373117923584000.L, 27563570274240000.L, 1060137318240000.L,
+ 33924394183680.L, 899510451840.L, 19554575040.L, 341863200.L, 4651200.L,
+ 46512.L, 306.L, 1.L};
+ const MatrixType A2 = A * A;
+ const MatrixType A4 = A2 * A2;
+ const MatrixType A6 = A4 * A2;
+ const MatrixType A8 = A4 * A4;
+ V = b[17] * A8 + b[15] * A6 + b[13] * A4 + b[11] * A2; // used for temporary storage
+ MatrixType tmp = A8 * V;
+ tmp += b[9] * A8 + b[7] * A6 + b[5] * A4 + b[3] * A2
+ + b[1] * MatrixType::Identity(A.rows(), A.cols());
+ U.noalias() = A * tmp;
+ tmp = b[16] * A8 + b[14] * A6 + b[12] * A4 + b[10] * A2;
+ V.noalias() = tmp * A8;
+ V += b[8] * A8 + b[6] * A6 + b[4] * A4 + b[2] * A2
+ + b[0] * MatrixType::Identity(A.rows(), A.cols());
+}
+#endif
+
+template <typename MatrixType, typename RealScalar = typename NumTraits<typename traits<MatrixType>::Scalar>::Real>
+struct matrix_exp_computeUV
+{
+ /** \brief Compute Pad&eacute; approximant to the exponential.
+ *
+ * Computes \c U, \c V and \c squarings such that \f$ (V+U)(V-U)^{-1} \f$ is a Pad&eacute;
+ * approximant of \f$ \exp(2^{-\mbox{squarings}}M) \f$ around \f$ M = 0 \f$, where \f$ M \f$
+ * denotes the matrix \c arg. The degree of the Pad&eacute; approximant and the value of squarings
+ * are chosen such that the approximation error is no more than the round-off error.
+ */
+ static void run(const MatrixType& arg, MatrixType& U, MatrixType& V, int& squarings);
+};
+
+template <typename MatrixType>
+struct matrix_exp_computeUV<MatrixType, float>
+{
+ template <typename ArgType>
+ static void run(const ArgType& arg, MatrixType& U, MatrixType& V, int& squarings)
+ {
+ using std::frexp;
+ using std::pow;
+ const float l1norm = arg.cwiseAbs().colwise().sum().maxCoeff();
+ squarings = 0;
+ if (l1norm < 4.258730016922831e-001f) {
+ matrix_exp_pade3(arg, U, V);
+ } else if (l1norm < 1.880152677804762e+000f) {
+ matrix_exp_pade5(arg, U, V);
+ } else {
+ const float maxnorm = 3.925724783138660f;
+ frexp(l1norm / maxnorm, &squarings);
+ if (squarings < 0) squarings = 0;
+ MatrixType A = arg.unaryExpr(MatrixExponentialScalingOp<float>(squarings));
+ matrix_exp_pade7(A, U, V);
+ }
+ }
+};
+
+template <typename MatrixType>
+struct matrix_exp_computeUV<MatrixType, double>
+{
+ typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar;
+ template <typename ArgType>
+ static void run(const ArgType& arg, MatrixType& U, MatrixType& V, int& squarings)
+ {
+ using std::frexp;
+ using std::pow;
+ const RealScalar l1norm = arg.cwiseAbs().colwise().sum().maxCoeff();
+ squarings = 0;
+ if (l1norm < 1.495585217958292e-002) {
+ matrix_exp_pade3(arg, U, V);
+ } else if (l1norm < 2.539398330063230e-001) {
+ matrix_exp_pade5(arg, U, V);
+ } else if (l1norm < 9.504178996162932e-001) {
+ matrix_exp_pade7(arg, U, V);
+ } else if (l1norm < 2.097847961257068e+000) {
+ matrix_exp_pade9(arg, U, V);
+ } else {
+ const RealScalar maxnorm = 5.371920351148152;
+ frexp(l1norm / maxnorm, &squarings);
+ if (squarings < 0) squarings = 0;
+ MatrixType A = arg.unaryExpr(MatrixExponentialScalingOp<RealScalar>(squarings));
+ matrix_exp_pade13(A, U, V);
+ }
+ }
+};
+
+template <typename MatrixType>
+struct matrix_exp_computeUV<MatrixType, long double>
+{
+ template <typename ArgType>
+ static void run(const ArgType& arg, MatrixType& U, MatrixType& V, int& squarings)
+ {
+#if LDBL_MANT_DIG == 53 // double precision
+ matrix_exp_computeUV<MatrixType, double>::run(arg, U, V, squarings);
+
+#else
+
+ using std::frexp;
+ using std::pow;
+ const long double l1norm = arg.cwiseAbs().colwise().sum().maxCoeff();
+ squarings = 0;
+
+#if LDBL_MANT_DIG <= 64 // extended precision
+
+ if (l1norm < 4.1968497232266989671e-003L) {
+ matrix_exp_pade3(arg, U, V);
+ } else if (l1norm < 1.1848116734693823091e-001L) {
+ matrix_exp_pade5(arg, U, V);
+ } else if (l1norm < 5.5170388480686700274e-001L) {
+ matrix_exp_pade7(arg, U, V);
+ } else if (l1norm < 1.3759868875587845383e+000L) {
+ matrix_exp_pade9(arg, U, V);
+ } else {
+ const long double maxnorm = 4.0246098906697353063L;
+ frexp(l1norm / maxnorm, &squarings);
+ if (squarings < 0) squarings = 0;
+ MatrixType A = arg.unaryExpr(MatrixExponentialScalingOp<long double>(squarings));
+ matrix_exp_pade13(A, U, V);
+ }
+
+#elif LDBL_MANT_DIG <= 106 // double-double
+
+ if (l1norm < 3.2787892205607026992947488108213e-005L) {
+ matrix_exp_pade3(arg, U, V);
+ } else if (l1norm < 6.4467025060072760084130906076332e-003L) {
+ matrix_exp_pade5(arg, U, V);
+ } else if (l1norm < 6.8988028496595374751374122881143e-002L) {
+ matrix_exp_pade7(arg, U, V);
+ } else if (l1norm < 2.7339737518502231741495857201670e-001L) {
+ matrix_exp_pade9(arg, U, V);
+ } else if (l1norm < 1.3203382096514474905666448850278e+000L) {
+ matrix_exp_pade13(arg, U, V);
+ } else {
+ const long double maxnorm = 3.2579440895405400856599663723517L;
+ frexp(l1norm / maxnorm, &squarings);
+ if (squarings < 0) squarings = 0;
+ MatrixType A = arg.unaryExpr(MatrixExponentialScalingOp<long double>(squarings));
+ matrix_exp_pade17(A, U, V);
+ }
+
+#elif LDBL_MANT_DIG <= 113 // quadruple precision
+
+ if (l1norm < 1.639394610288918690547467954466970e-005L) {
+ matrix_exp_pade3(arg, U, V);
+ } else if (l1norm < 4.253237712165275566025884344433009e-003L) {
+ matrix_exp_pade5(arg, U, V);
+ } else if (l1norm < 5.125804063165764409885122032933142e-002L) {
+ matrix_exp_pade7(arg, U, V);
+ } else if (l1norm < 2.170000765161155195453205651889853e-001L) {
+ matrix_exp_pade9(arg, U, V);
+ } else if (l1norm < 1.125358383453143065081397882891878e+000L) {
+ matrix_exp_pade13(arg, U, V);
+ } else {
+ const long double maxnorm = 2.884233277829519311757165057717815L;
+ frexp(l1norm / maxnorm, &squarings);
+ if (squarings < 0) squarings = 0;
+ MatrixType A = arg.unaryExpr(MatrixExponentialScalingOp<long double>(squarings));
+ matrix_exp_pade17(A, U, V);
+ }
+
+#else
+
+ // this case should be handled in compute()
+ eigen_assert(false && "Bug in MatrixExponential");
+
+#endif
+#endif // LDBL_MANT_DIG
+ }
+};
+
+template<typename T> struct is_exp_known_type : false_type {};
+template<> struct is_exp_known_type<float> : true_type {};
+template<> struct is_exp_known_type<double> : true_type {};
+#if LDBL_MANT_DIG <= 113
+template<> struct is_exp_known_type<long double> : true_type {};
+#endif
+
+template <typename ArgType, typename ResultType>
+void matrix_exp_compute(const ArgType& arg, ResultType &result, true_type) // natively supported scalar type
+{
+ typedef typename ArgType::PlainObject MatrixType;
+ MatrixType U, V;
+ int squarings;
+ matrix_exp_computeUV<MatrixType>::run(arg, U, V, squarings); // Pade approximant is (U+V) / (-U+V)
+ MatrixType numer = U + V;
+ MatrixType denom = -U + V;
+ result = denom.partialPivLu().solve(numer);
+ for (int i=0; i<squarings; i++)
+ result *= result; // undo scaling by repeated squaring
+}
+
+
+/* Computes the matrix exponential
+ *
+ * \param arg argument of matrix exponential (should be plain object)
+ * \param result variable in which result will be stored
+ */
+template <typename ArgType, typename ResultType>
+void matrix_exp_compute(const ArgType& arg, ResultType &result, false_type) // default
+{
+ typedef typename ArgType::PlainObject MatrixType;
+ typedef typename traits<MatrixType>::Scalar Scalar;
+ typedef typename NumTraits<Scalar>::Real RealScalar;
+ typedef typename std::complex<RealScalar> ComplexScalar;
+ result = arg.matrixFunction(internal::stem_function_exp<ComplexScalar>);
+}
+
+} // end namespace Eigen::internal
+
+/** \ingroup MatrixFunctions_Module
+ *
+ * \brief Proxy for the matrix exponential of some matrix (expression).
+ *
+ * \tparam Derived Type of the argument to the matrix exponential.
+ *
+ * This class holds the argument to the matrix exponential until it is assigned or evaluated for
+ * some other reason (so the argument should not be changed in the meantime). It is the return type
+ * of MatrixBase::exp() and most of the time this is the only way it is used.
+ */
+template<typename Derived> struct MatrixExponentialReturnValue
+: public ReturnByValue<MatrixExponentialReturnValue<Derived> >
+{
+ public:
+ /** \brief Constructor.
+ *
+ * \param src %Matrix (expression) forming the argument of the matrix exponential.
+ */
+ MatrixExponentialReturnValue(const Derived& src) : m_src(src) { }
+
+ /** \brief Compute the matrix exponential.
+ *
+ * \param result the matrix exponential of \p src in the constructor.
+ */
+ template <typename ResultType>
+ inline void evalTo(ResultType& result) const
+ {
+ const typename internal::nested_eval<Derived, 10>::type tmp(m_src);
+ internal::matrix_exp_compute(tmp, result, internal::is_exp_known_type<typename Derived::RealScalar>());
+ }
+
+ Index rows() const { return m_src.rows(); }
+ Index cols() const { return m_src.cols(); }
+
+ protected:
+ const typename internal::ref_selector<Derived>::type m_src;
+};
+
+namespace internal {
+template<typename Derived>
+struct traits<MatrixExponentialReturnValue<Derived> >
+{
+ typedef typename Derived::PlainObject ReturnType;
+};
+}
+
+template <typename Derived>
+const MatrixExponentialReturnValue<Derived> MatrixBase<Derived>::exp() const
+{
+ eigen_assert(rows() == cols());
+ return MatrixExponentialReturnValue<Derived>(derived());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATRIX_EXPONENTIAL
diff --git a/src/EigenUnsupported/src/MatrixFunctions/MatrixFunction.h b/src/EigenUnsupported/src/MatrixFunctions/MatrixFunction.h
new file mode 100644
index 0000000..cc12ab6
--- /dev/null
+++ b/src/EigenUnsupported/src/MatrixFunctions/MatrixFunction.h
@@ -0,0 +1,569 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2011, 2013 Jitse Niesen <jitse@maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATRIX_FUNCTION_H
+#define EIGEN_MATRIX_FUNCTION_H
+
+#include "StemFunction.h"
+
+
+namespace Eigen {
+
+namespace internal {
+
+/** \brief Maximum distance allowed between eigenvalues to be considered "close". */
+static const float matrix_function_separation = 0.1f;
+
+/** \ingroup MatrixFunctions_Module
+ * \class MatrixFunctionAtomic
+ * \brief Helper class for computing matrix functions of atomic matrices.
+ *
+ * Here, an atomic matrix is a triangular matrix whose diagonal entries are close to each other.
+ */
+template <typename MatrixType>
+class MatrixFunctionAtomic
+{
+ public:
+
+ typedef typename MatrixType::Scalar Scalar;
+ typedef typename stem_function<Scalar>::type StemFunction;
+
+ /** \brief Constructor
+ * \param[in] f matrix function to compute.
+ */
+ MatrixFunctionAtomic(StemFunction f) : m_f(f) { }
+
+ /** \brief Compute matrix function of atomic matrix
+ * \param[in] A argument of matrix function, should be upper triangular and atomic
+ * \returns f(A), the matrix function evaluated at the given matrix
+ */
+ MatrixType compute(const MatrixType& A);
+
+ private:
+ StemFunction* m_f;
+};
+
+template <typename MatrixType>
+typename NumTraits<typename MatrixType::Scalar>::Real matrix_function_compute_mu(const MatrixType& A)
+{
+ typedef typename plain_col_type<MatrixType>::type VectorType;
+ Index rows = A.rows();
+ const MatrixType N = MatrixType::Identity(rows, rows) - A;
+ VectorType e = VectorType::Ones(rows);
+ N.template triangularView<Upper>().solveInPlace(e);
+ return e.cwiseAbs().maxCoeff();
+}
+
+template <typename MatrixType>
+MatrixType MatrixFunctionAtomic<MatrixType>::compute(const MatrixType& A)
+{
+ // TODO: Use that A is upper triangular
+ typedef typename NumTraits<Scalar>::Real RealScalar;
+ Index rows = A.rows();
+ Scalar avgEival = A.trace() / Scalar(RealScalar(rows));
+ MatrixType Ashifted = A - avgEival * MatrixType::Identity(rows, rows);
+ RealScalar mu = matrix_function_compute_mu(Ashifted);
+ MatrixType F = m_f(avgEival, 0) * MatrixType::Identity(rows, rows);
+ MatrixType P = Ashifted;
+ MatrixType Fincr;
+ for (Index s = 1; double(s) < 1.1 * double(rows) + 10.0; s++) { // upper limit is fairly arbitrary
+ Fincr = m_f(avgEival, static_cast<int>(s)) * P;
+ F += Fincr;
+ P = Scalar(RealScalar(1)/RealScalar(s + 1)) * P * Ashifted;
+
+ // test whether Taylor series converged
+ const RealScalar F_norm = F.cwiseAbs().rowwise().sum().maxCoeff();
+ const RealScalar Fincr_norm = Fincr.cwiseAbs().rowwise().sum().maxCoeff();
+ if (Fincr_norm < NumTraits<Scalar>::epsilon() * F_norm) {
+ RealScalar delta = 0;
+ RealScalar rfactorial = 1;
+ for (Index r = 0; r < rows; r++) {
+ RealScalar mx = 0;
+ for (Index i = 0; i < rows; i++)
+ mx = (std::max)(mx, std::abs(m_f(Ashifted(i, i) + avgEival, static_cast<int>(s+r))));
+ if (r != 0)
+ rfactorial *= RealScalar(r);
+ delta = (std::max)(delta, mx / rfactorial);
+ }
+ const RealScalar P_norm = P.cwiseAbs().rowwise().sum().maxCoeff();
+ if (mu * delta * P_norm < NumTraits<Scalar>::epsilon() * F_norm) // series converged
+ break;
+ }
+ }
+ return F;
+}
+
+/** \brief Find cluster in \p clusters containing some value
+ * \param[in] key Value to find
+ * \returns Iterator to cluster containing \p key, or \c clusters.end() if no cluster in \p m_clusters
+ * contains \p key.
+ */
+template <typename Index, typename ListOfClusters>
+typename ListOfClusters::iterator matrix_function_find_cluster(Index key, ListOfClusters& clusters)
+{
+ typename std::list<Index>::iterator j;
+ for (typename ListOfClusters::iterator i = clusters.begin(); i != clusters.end(); ++i) {
+ j = std::find(i->begin(), i->end(), key);
+ if (j != i->end())
+ return i;
+ }
+ return clusters.end();
+}
+
+/** \brief Partition eigenvalues in clusters of ei'vals close to each other
+ *
+ * \param[in] eivals Eigenvalues
+ * \param[out] clusters Resulting partition of eigenvalues
+ *
+ * The partition satisfies the following two properties:
+ * # Any eigenvalue in a certain cluster is at most matrix_function_separation() away from another eigenvalue
+ * in the same cluster.
+ * # The distance between two eigenvalues in different clusters is more than matrix_function_separation().
+ * The implementation follows Algorithm 4.1 in the paper of Davies and Higham.
+ */
+template <typename EivalsType, typename Cluster>
+void matrix_function_partition_eigenvalues(const EivalsType& eivals, std::list<Cluster>& clusters)
+{
+ typedef typename EivalsType::RealScalar RealScalar;
+ for (Index i=0; i<eivals.rows(); ++i) {
+ // Find cluster containing i-th ei'val, adding a new cluster if necessary
+ typename std::list<Cluster>::iterator qi = matrix_function_find_cluster(i, clusters);
+ if (qi == clusters.end()) {
+ Cluster l;
+ l.push_back(i);
+ clusters.push_back(l);
+ qi = clusters.end();
+ --qi;
+ }
+
+ // Look for other element to add to the set
+ for (Index j=i+1; j<eivals.rows(); ++j) {
+ if (abs(eivals(j) - eivals(i)) <= RealScalar(matrix_function_separation)
+ && std::find(qi->begin(), qi->end(), j) == qi->end()) {
+ typename std::list<Cluster>::iterator qj = matrix_function_find_cluster(j, clusters);
+ if (qj == clusters.end()) {
+ qi->push_back(j);
+ } else {
+ qi->insert(qi->end(), qj->begin(), qj->end());
+ clusters.erase(qj);
+ }
+ }
+ }
+ }
+}
+
+/** \brief Compute size of each cluster given a partitioning */
+template <typename ListOfClusters, typename Index>
+void matrix_function_compute_cluster_size(const ListOfClusters& clusters, Matrix<Index, Dynamic, 1>& clusterSize)
+{
+ const Index numClusters = static_cast<Index>(clusters.size());
+ clusterSize.setZero(numClusters);
+ Index clusterIndex = 0;
+ for (typename ListOfClusters::const_iterator cluster = clusters.begin(); cluster != clusters.end(); ++cluster) {
+ clusterSize[clusterIndex] = cluster->size();
+ ++clusterIndex;
+ }
+}
+
+/** \brief Compute start of each block using clusterSize */
+template <typename VectorType>
+void matrix_function_compute_block_start(const VectorType& clusterSize, VectorType& blockStart)
+{
+ blockStart.resize(clusterSize.rows());
+ blockStart(0) = 0;
+ for (Index i = 1; i < clusterSize.rows(); i++) {
+ blockStart(i) = blockStart(i-1) + clusterSize(i-1);
+ }
+}
+
+/** \brief Compute mapping of eigenvalue indices to cluster indices */
+template <typename EivalsType, typename ListOfClusters, typename VectorType>
+void matrix_function_compute_map(const EivalsType& eivals, const ListOfClusters& clusters, VectorType& eivalToCluster)
+{
+ eivalToCluster.resize(eivals.rows());
+ Index clusterIndex = 0;
+ for (typename ListOfClusters::const_iterator cluster = clusters.begin(); cluster != clusters.end(); ++cluster) {
+ for (Index i = 0; i < eivals.rows(); ++i) {
+ if (std::find(cluster->begin(), cluster->end(), i) != cluster->end()) {
+ eivalToCluster[i] = clusterIndex;
+ }
+ }
+ ++clusterIndex;
+ }
+}
+
+/** \brief Compute permutation which groups ei'vals in same cluster together */
+template <typename DynVectorType, typename VectorType>
+void matrix_function_compute_permutation(const DynVectorType& blockStart, const DynVectorType& eivalToCluster, VectorType& permutation)
+{
+ DynVectorType indexNextEntry = blockStart;
+ permutation.resize(eivalToCluster.rows());
+ for (Index i = 0; i < eivalToCluster.rows(); i++) {
+ Index cluster = eivalToCluster[i];
+ permutation[i] = indexNextEntry[cluster];
+ ++indexNextEntry[cluster];
+ }
+}
+
+/** \brief Permute Schur decomposition in U and T according to permutation */
+template <typename VectorType, typename MatrixType>
+void matrix_function_permute_schur(VectorType& permutation, MatrixType& U, MatrixType& T)
+{
+ for (Index i = 0; i < permutation.rows() - 1; i++) {
+ Index j;
+ for (j = i; j < permutation.rows(); j++) {
+ if (permutation(j) == i) break;
+ }
+ eigen_assert(permutation(j) == i);
+ for (Index k = j-1; k >= i; k--) {
+ JacobiRotation<typename MatrixType::Scalar> rotation;
+ rotation.makeGivens(T(k, k+1), T(k+1, k+1) - T(k, k));
+ T.applyOnTheLeft(k, k+1, rotation.adjoint());
+ T.applyOnTheRight(k, k+1, rotation);
+ U.applyOnTheRight(k, k+1, rotation);
+ std::swap(permutation.coeffRef(k), permutation.coeffRef(k+1));
+ }
+ }
+}
+
+/** \brief Compute block diagonal part of matrix function.
+ *
+ * This routine computes the matrix function applied to the block diagonal part of \p T (which should be
+ * upper triangular), with the blocking given by \p blockStart and \p clusterSize. The matrix function of
+ * each diagonal block is computed by \p atomic. The off-diagonal parts of \p fT are set to zero.
+ */
+template <typename MatrixType, typename AtomicType, typename VectorType>
+void matrix_function_compute_block_atomic(const MatrixType& T, AtomicType& atomic, const VectorType& blockStart, const VectorType& clusterSize, MatrixType& fT)
+{
+ fT.setZero(T.rows(), T.cols());
+ for (Index i = 0; i < clusterSize.rows(); ++i) {
+ fT.block(blockStart(i), blockStart(i), clusterSize(i), clusterSize(i))
+ = atomic.compute(T.block(blockStart(i), blockStart(i), clusterSize(i), clusterSize(i)));
+ }
+}
+
+/** \brief Solve a triangular Sylvester equation AX + XB = C
+ *
+ * \param[in] A the matrix A; should be square and upper triangular
+ * \param[in] B the matrix B; should be square and upper triangular
+ * \param[in] C the matrix C; should have correct size.
+ *
+ * \returns the solution X.
+ *
+ * If A is m-by-m and B is n-by-n, then both C and X are m-by-n. The (i,j)-th component of the Sylvester
+ * equation is
+ * \f[
+ * \sum_{k=i}^m A_{ik} X_{kj} + \sum_{k=1}^j X_{ik} B_{kj} = C_{ij}.
+ * \f]
+ * This can be re-arranged to yield:
+ * \f[
+ * X_{ij} = \frac{1}{A_{ii} + B_{jj}} \Bigl( C_{ij}
+ * - \sum_{k=i+1}^m A_{ik} X_{kj} - \sum_{k=1}^{j-1} X_{ik} B_{kj} \Bigr).
+ * \f]
+ * It is assumed that A and B are such that the numerator is never zero (otherwise the Sylvester equation
+ * does not have a unique solution). In that case, these equations can be evaluated in the order
+ * \f$ i=m,\ldots,1 \f$ and \f$ j=1,\ldots,n \f$.
+ */
+template <typename MatrixType>
+MatrixType matrix_function_solve_triangular_sylvester(const MatrixType& A, const MatrixType& B, const MatrixType& C)
+{
+ eigen_assert(A.rows() == A.cols());
+ eigen_assert(A.isUpperTriangular());
+ eigen_assert(B.rows() == B.cols());
+ eigen_assert(B.isUpperTriangular());
+ eigen_assert(C.rows() == A.rows());
+ eigen_assert(C.cols() == B.rows());
+
+ typedef typename MatrixType::Scalar Scalar;
+
+ Index m = A.rows();
+ Index n = B.rows();
+ MatrixType X(m, n);
+
+ for (Index i = m - 1; i >= 0; --i) {
+ for (Index j = 0; j < n; ++j) {
+
+ // Compute AX = \sum_{k=i+1}^m A_{ik} X_{kj}
+ Scalar AX;
+ if (i == m - 1) {
+ AX = 0;
+ } else {
+ Matrix<Scalar,1,1> AXmatrix = A.row(i).tail(m-1-i) * X.col(j).tail(m-1-i);
+ AX = AXmatrix(0,0);
+ }
+
+ // Compute XB = \sum_{k=1}^{j-1} X_{ik} B_{kj}
+ Scalar XB;
+ if (j == 0) {
+ XB = 0;
+ } else {
+ Matrix<Scalar,1,1> XBmatrix = X.row(i).head(j) * B.col(j).head(j);
+ XB = XBmatrix(0,0);
+ }
+
+ X(i,j) = (C(i,j) - AX - XB) / (A(i,i) + B(j,j));
+ }
+ }
+ return X;
+}
+
+/** \brief Compute part of matrix function above block diagonal.
+ *
+ * This routine completes the computation of \p fT, denoting a matrix function applied to the triangular
+ * matrix \p T. It assumes that the block diagonal part of \p fT has already been computed. The part below
+ * the diagonal is zero, because \p T is upper triangular.
+ */
+template <typename MatrixType, typename VectorType>
+void matrix_function_compute_above_diagonal(const MatrixType& T, const VectorType& blockStart, const VectorType& clusterSize, MatrixType& fT)
+{
+ typedef internal::traits<MatrixType> Traits;
+ typedef typename MatrixType::Scalar Scalar;
+ static const int Options = MatrixType::Options;
+ typedef Matrix<Scalar, Dynamic, Dynamic, Options, Traits::RowsAtCompileTime, Traits::ColsAtCompileTime> DynMatrixType;
+
+ for (Index k = 1; k < clusterSize.rows(); k++) {
+ for (Index i = 0; i < clusterSize.rows() - k; i++) {
+ // compute (i, i+k) block
+ DynMatrixType A = T.block(blockStart(i), blockStart(i), clusterSize(i), clusterSize(i));
+ DynMatrixType B = -T.block(blockStart(i+k), blockStart(i+k), clusterSize(i+k), clusterSize(i+k));
+ DynMatrixType C = fT.block(blockStart(i), blockStart(i), clusterSize(i), clusterSize(i))
+ * T.block(blockStart(i), blockStart(i+k), clusterSize(i), clusterSize(i+k));
+ C -= T.block(blockStart(i), blockStart(i+k), clusterSize(i), clusterSize(i+k))
+ * fT.block(blockStart(i+k), blockStart(i+k), clusterSize(i+k), clusterSize(i+k));
+ for (Index m = i + 1; m < i + k; m++) {
+ C += fT.block(blockStart(i), blockStart(m), clusterSize(i), clusterSize(m))
+ * T.block(blockStart(m), blockStart(i+k), clusterSize(m), clusterSize(i+k));
+ C -= T.block(blockStart(i), blockStart(m), clusterSize(i), clusterSize(m))
+ * fT.block(blockStart(m), blockStart(i+k), clusterSize(m), clusterSize(i+k));
+ }
+ fT.block(blockStart(i), blockStart(i+k), clusterSize(i), clusterSize(i+k))
+ = matrix_function_solve_triangular_sylvester(A, B, C);
+ }
+ }
+}
+
+/** \ingroup MatrixFunctions_Module
+ * \brief Class for computing matrix functions.
+ * \tparam MatrixType type of the argument of the matrix function,
+ * expected to be an instantiation of the Matrix class template.
+ * \tparam AtomicType type for computing matrix function of atomic blocks.
+ * \tparam IsComplex used internally to select correct specialization.
+ *
+ * This class implements the Schur-Parlett algorithm for computing matrix functions. The spectrum of the
+ * matrix is divided in clustered of eigenvalues that lies close together. This class delegates the
+ * computation of the matrix function on every block corresponding to these clusters to an object of type
+ * \p AtomicType and uses these results to compute the matrix function of the whole matrix. The class
+ * \p AtomicType should have a \p compute() member function for computing the matrix function of a block.
+ *
+ * \sa class MatrixFunctionAtomic, class MatrixLogarithmAtomic
+ */
+template <typename MatrixType, int IsComplex = NumTraits<typename internal::traits<MatrixType>::Scalar>::IsComplex>
+struct matrix_function_compute
+{
+ /** \brief Compute the matrix function.
+ *
+ * \param[in] A argument of matrix function, should be a square matrix.
+ * \param[in] atomic class for computing matrix function of atomic blocks.
+ * \param[out] result the function \p f applied to \p A, as
+ * specified in the constructor.
+ *
+ * See MatrixBase::matrixFunction() for details on how this computation
+ * is implemented.
+ */
+ template <typename AtomicType, typename ResultType>
+ static void run(const MatrixType& A, AtomicType& atomic, ResultType &result);
+};
+
+/** \internal \ingroup MatrixFunctions_Module
+ * \brief Partial specialization of MatrixFunction for real matrices
+ *
+ * This converts the real matrix to a complex matrix, compute the matrix function of that matrix, and then
+ * converts the result back to a real matrix.
+ */
+template <typename MatrixType>
+struct matrix_function_compute<MatrixType, 0>
+{
+ template <typename MatA, typename AtomicType, typename ResultType>
+ static void run(const MatA& A, AtomicType& atomic, ResultType &result)
+ {
+ typedef internal::traits<MatrixType> Traits;
+ typedef typename Traits::Scalar Scalar;
+ static const int Rows = Traits::RowsAtCompileTime, Cols = Traits::ColsAtCompileTime;
+ static const int MaxRows = Traits::MaxRowsAtCompileTime, MaxCols = Traits::MaxColsAtCompileTime;
+
+ typedef std::complex<Scalar> ComplexScalar;
+ typedef Matrix<ComplexScalar, Rows, Cols, 0, MaxRows, MaxCols> ComplexMatrix;
+
+ ComplexMatrix CA = A.template cast<ComplexScalar>();
+ ComplexMatrix Cresult;
+ matrix_function_compute<ComplexMatrix>::run(CA, atomic, Cresult);
+ result = Cresult.real();
+ }
+};
+
+/** \internal \ingroup MatrixFunctions_Module
+ * \brief Partial specialization of MatrixFunction for complex matrices
+ */
+template <typename MatrixType>
+struct matrix_function_compute<MatrixType, 1>
+{
+ template <typename MatA, typename AtomicType, typename ResultType>
+ static void run(const MatA& A, AtomicType& atomic, ResultType &result)
+ {
+ typedef internal::traits<MatrixType> Traits;
+
+ // compute Schur decomposition of A
+ const ComplexSchur<MatrixType> schurOfA(A);
+ eigen_assert(schurOfA.info()==Success);
+ MatrixType T = schurOfA.matrixT();
+ MatrixType U = schurOfA.matrixU();
+
+ // partition eigenvalues into clusters of ei'vals "close" to each other
+ std::list<std::list<Index> > clusters;
+ matrix_function_partition_eigenvalues(T.diagonal(), clusters);
+
+ // compute size of each cluster
+ Matrix<Index, Dynamic, 1> clusterSize;
+ matrix_function_compute_cluster_size(clusters, clusterSize);
+
+ // blockStart[i] is row index at which block corresponding to i-th cluster starts
+ Matrix<Index, Dynamic, 1> blockStart;
+ matrix_function_compute_block_start(clusterSize, blockStart);
+
+ // compute map so that eivalToCluster[i] = j means that i-th ei'val is in j-th cluster
+ Matrix<Index, Dynamic, 1> eivalToCluster;
+ matrix_function_compute_map(T.diagonal(), clusters, eivalToCluster);
+
+ // compute permutation which groups ei'vals in same cluster together
+ Matrix<Index, Traits::RowsAtCompileTime, 1> permutation;
+ matrix_function_compute_permutation(blockStart, eivalToCluster, permutation);
+
+ // permute Schur decomposition
+ matrix_function_permute_schur(permutation, U, T);
+
+ // compute result
+ MatrixType fT; // matrix function applied to T
+ matrix_function_compute_block_atomic(T, atomic, blockStart, clusterSize, fT);
+ matrix_function_compute_above_diagonal(T, blockStart, clusterSize, fT);
+ result = U * (fT.template triangularView<Upper>() * U.adjoint());
+ }
+};
+
+} // end of namespace internal
+
+/** \ingroup MatrixFunctions_Module
+ *
+ * \brief Proxy for the matrix function of some matrix (expression).
+ *
+ * \tparam Derived Type of the argument to the matrix function.
+ *
+ * This class holds the argument to the matrix function until it is assigned or evaluated for some other
+ * reason (so the argument should not be changed in the meantime). It is the return type of
+ * matrixBase::matrixFunction() and related functions and most of the time this is the only way it is used.
+ */
+template<typename Derived> class MatrixFunctionReturnValue
+: public ReturnByValue<MatrixFunctionReturnValue<Derived> >
+{
+ public:
+ typedef typename Derived::Scalar Scalar;
+ typedef typename internal::stem_function<Scalar>::type StemFunction;
+
+ protected:
+ typedef typename internal::ref_selector<Derived>::type DerivedNested;
+
+ public:
+
+ /** \brief Constructor.
+ *
+ * \param[in] A %Matrix (expression) forming the argument of the matrix function.
+ * \param[in] f Stem function for matrix function under consideration.
+ */
+ MatrixFunctionReturnValue(const Derived& A, StemFunction f) : m_A(A), m_f(f) { }
+
+ /** \brief Compute the matrix function.
+ *
+ * \param[out] result \p f applied to \p A, where \p f and \p A are as in the constructor.
+ */
+ template <typename ResultType>
+ inline void evalTo(ResultType& result) const
+ {
+ typedef typename internal::nested_eval<Derived, 10>::type NestedEvalType;
+ typedef typename internal::remove_all<NestedEvalType>::type NestedEvalTypeClean;
+ typedef internal::traits<NestedEvalTypeClean> Traits;
+ typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
+ typedef Matrix<ComplexScalar, Dynamic, Dynamic, 0, Traits::RowsAtCompileTime, Traits::ColsAtCompileTime> DynMatrixType;
+
+ typedef internal::MatrixFunctionAtomic<DynMatrixType> AtomicType;
+ AtomicType atomic(m_f);
+
+ internal::matrix_function_compute<typename NestedEvalTypeClean::PlainObject>::run(m_A, atomic, result);
+ }
+
+ Index rows() const { return m_A.rows(); }
+ Index cols() const { return m_A.cols(); }
+
+ private:
+ const DerivedNested m_A;
+ StemFunction *m_f;
+};
+
+namespace internal {
+template<typename Derived>
+struct traits<MatrixFunctionReturnValue<Derived> >
+{
+ typedef typename Derived::PlainObject ReturnType;
+};
+}
+
+
+/********** MatrixBase methods **********/
+
+
+template <typename Derived>
+const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::matrixFunction(typename internal::stem_function<typename internal::traits<Derived>::Scalar>::type f) const
+{
+ eigen_assert(rows() == cols());
+ return MatrixFunctionReturnValue<Derived>(derived(), f);
+}
+
+template <typename Derived>
+const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::sin() const
+{
+ eigen_assert(rows() == cols());
+ typedef typename internal::stem_function<Scalar>::ComplexScalar ComplexScalar;
+ return MatrixFunctionReturnValue<Derived>(derived(), internal::stem_function_sin<ComplexScalar>);
+}
+
+template <typename Derived>
+const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::cos() const
+{
+ eigen_assert(rows() == cols());
+ typedef typename internal::stem_function<Scalar>::ComplexScalar ComplexScalar;
+ return MatrixFunctionReturnValue<Derived>(derived(), internal::stem_function_cos<ComplexScalar>);
+}
+
+template <typename Derived>
+const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::sinh() const
+{
+ eigen_assert(rows() == cols());
+ typedef typename internal::stem_function<Scalar>::ComplexScalar ComplexScalar;
+ return MatrixFunctionReturnValue<Derived>(derived(), internal::stem_function_sinh<ComplexScalar>);
+}
+
+template <typename Derived>
+const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::cosh() const
+{
+ eigen_assert(rows() == cols());
+ typedef typename internal::stem_function<Scalar>::ComplexScalar ComplexScalar;
+ return MatrixFunctionReturnValue<Derived>(derived(), internal::stem_function_cosh<ComplexScalar>);
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATRIX_FUNCTION_H
diff --git a/src/EigenUnsupported/src/MatrixFunctions/MatrixLogarithm.h b/src/EigenUnsupported/src/MatrixFunctions/MatrixLogarithm.h
new file mode 100644
index 0000000..e917013
--- /dev/null
+++ b/src/EigenUnsupported/src/MatrixFunctions/MatrixLogarithm.h
@@ -0,0 +1,373 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011, 2013 Jitse Niesen <jitse@maths.leeds.ac.uk>
+// Copyright (C) 2011 Chen-Pang He <jdh8@ms63.hinet.net>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATRIX_LOGARITHM
+#define EIGEN_MATRIX_LOGARITHM
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename Scalar>
+struct matrix_log_min_pade_degree
+{
+ static const int value = 3;
+};
+
+template <typename Scalar>
+struct matrix_log_max_pade_degree
+{
+ typedef typename NumTraits<Scalar>::Real RealScalar;
+ static const int value = std::numeric_limits<RealScalar>::digits<= 24? 5: // single precision
+ std::numeric_limits<RealScalar>::digits<= 53? 7: // double precision
+ std::numeric_limits<RealScalar>::digits<= 64? 8: // extended precision
+ std::numeric_limits<RealScalar>::digits<=106? 10: // double-double
+ 11; // quadruple precision
+};
+
+/** \brief Compute logarithm of 2x2 triangular matrix. */
+template <typename MatrixType>
+void matrix_log_compute_2x2(const MatrixType& A, MatrixType& result)
+{
+ typedef typename MatrixType::Scalar Scalar;
+ typedef typename MatrixType::RealScalar RealScalar;
+ using std::abs;
+ using std::ceil;
+ using std::imag;
+ using std::log;
+
+ Scalar logA00 = log(A(0,0));
+ Scalar logA11 = log(A(1,1));
+
+ result(0,0) = logA00;
+ result(1,0) = Scalar(0);
+ result(1,1) = logA11;
+
+ Scalar y = A(1,1) - A(0,0);
+ if (y==Scalar(0))
+ {
+ result(0,1) = A(0,1) / A(0,0);
+ }
+ else if ((abs(A(0,0)) < RealScalar(0.5)*abs(A(1,1))) || (abs(A(0,0)) > 2*abs(A(1,1))))
+ {
+ result(0,1) = A(0,1) * (logA11 - logA00) / y;
+ }
+ else
+ {
+ // computation in previous branch is inaccurate if A(1,1) \approx A(0,0)
+ RealScalar unwindingNumber = ceil((imag(logA11 - logA00) - RealScalar(EIGEN_PI)) / RealScalar(2*EIGEN_PI));
+ result(0,1) = A(0,1) * (numext::log1p(y/A(0,0)) + Scalar(0,RealScalar(2*EIGEN_PI)*unwindingNumber)) / y;
+ }
+}
+
+/* \brief Get suitable degree for Pade approximation. (specialized for RealScalar = float) */
+inline int matrix_log_get_pade_degree(float normTminusI)
+{
+ const float maxNormForPade[] = { 2.5111573934555054e-1 /* degree = 3 */ , 4.0535837411880493e-1,
+ 5.3149729967117310e-1 };
+ const int minPadeDegree = matrix_log_min_pade_degree<float>::value;
+ const int maxPadeDegree = matrix_log_max_pade_degree<float>::value;
+ int degree = minPadeDegree;
+ for (; degree <= maxPadeDegree; ++degree)
+ if (normTminusI <= maxNormForPade[degree - minPadeDegree])
+ break;
+ return degree;
+}
+
+/* \brief Get suitable degree for Pade approximation. (specialized for RealScalar = double) */
+inline int matrix_log_get_pade_degree(double normTminusI)
+{
+ const double maxNormForPade[] = { 1.6206284795015624e-2 /* degree = 3 */ , 5.3873532631381171e-2,
+ 1.1352802267628681e-1, 1.8662860613541288e-1, 2.642960831111435e-1 };
+ const int minPadeDegree = matrix_log_min_pade_degree<double>::value;
+ const int maxPadeDegree = matrix_log_max_pade_degree<double>::value;
+ int degree = minPadeDegree;
+ for (; degree <= maxPadeDegree; ++degree)
+ if (normTminusI <= maxNormForPade[degree - minPadeDegree])
+ break;
+ return degree;
+}
+
+/* \brief Get suitable degree for Pade approximation. (specialized for RealScalar = long double) */
+inline int matrix_log_get_pade_degree(long double normTminusI)
+{
+#if LDBL_MANT_DIG == 53 // double precision
+ const long double maxNormForPade[] = { 1.6206284795015624e-2L /* degree = 3 */ , 5.3873532631381171e-2L,
+ 1.1352802267628681e-1L, 1.8662860613541288e-1L, 2.642960831111435e-1L };
+#elif LDBL_MANT_DIG <= 64 // extended precision
+ const long double maxNormForPade[] = { 5.48256690357782863103e-3L /* degree = 3 */, 2.34559162387971167321e-2L,
+ 5.84603923897347449857e-2L, 1.08486423756725170223e-1L, 1.68385767881294446649e-1L,
+ 2.32777776523703892094e-1L };
+#elif LDBL_MANT_DIG <= 106 // double-double
+ const long double maxNormForPade[] = { 8.58970550342939562202529664318890e-5L /* degree = 3 */,
+ 9.34074328446359654039446552677759e-4L, 4.26117194647672175773064114582860e-3L,
+ 1.21546224740281848743149666560464e-2L, 2.61100544998339436713088248557444e-2L,
+ 4.66170074627052749243018566390567e-2L, 7.32585144444135027565872014932387e-2L,
+ 1.05026503471351080481093652651105e-1L };
+#else // quadruple precision
+ const long double maxNormForPade[] = { 4.7419931187193005048501568167858103e-5L /* degree = 3 */,
+ 5.8853168473544560470387769480192666e-4L, 2.9216120366601315391789493628113520e-3L,
+ 8.8415758124319434347116734705174308e-3L, 1.9850836029449446668518049562565291e-2L,
+ 3.6688019729653446926585242192447447e-2L, 5.9290962294020186998954055264528393e-2L,
+ 8.6998436081634343903250580992127677e-2L, 1.1880960220216759245467951592883642e-1L };
+#endif
+ const int minPadeDegree = matrix_log_min_pade_degree<long double>::value;
+ const int maxPadeDegree = matrix_log_max_pade_degree<long double>::value;
+ int degree = minPadeDegree;
+ for (; degree <= maxPadeDegree; ++degree)
+ if (normTminusI <= maxNormForPade[degree - minPadeDegree])
+ break;
+ return degree;
+}
+
+/* \brief Compute Pade approximation to matrix logarithm */
+template <typename MatrixType>
+void matrix_log_compute_pade(MatrixType& result, const MatrixType& T, int degree)
+{
+ typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
+ const int minPadeDegree = 3;
+ const int maxPadeDegree = 11;
+ assert(degree >= minPadeDegree && degree <= maxPadeDegree);
+ // FIXME this creates float-conversion-warnings if these are enabled.
+ // Either manually convert each value, or disable the warning locally
+ const RealScalar nodes[][maxPadeDegree] = {
+ { 0.1127016653792583114820734600217600L, 0.5000000000000000000000000000000000L, // degree 3
+ 0.8872983346207416885179265399782400L },
+ { 0.0694318442029737123880267555535953L, 0.3300094782075718675986671204483777L, // degree 4
+ 0.6699905217924281324013328795516223L, 0.9305681557970262876119732444464048L },
+ { 0.0469100770306680036011865608503035L, 0.2307653449471584544818427896498956L, // degree 5
+ 0.5000000000000000000000000000000000L, 0.7692346550528415455181572103501044L,
+ 0.9530899229693319963988134391496965L },
+ { 0.0337652428984239860938492227530027L, 0.1693953067668677431693002024900473L, // degree 6
+ 0.3806904069584015456847491391596440L, 0.6193095930415984543152508608403560L,
+ 0.8306046932331322568306997975099527L, 0.9662347571015760139061507772469973L },
+ { 0.0254460438286207377369051579760744L, 0.1292344072003027800680676133596058L, // degree 7
+ 0.2970774243113014165466967939615193L, 0.5000000000000000000000000000000000L,
+ 0.7029225756886985834533032060384807L, 0.8707655927996972199319323866403942L,
+ 0.9745539561713792622630948420239256L },
+ { 0.0198550717512318841582195657152635L, 0.1016667612931866302042230317620848L, // degree 8
+ 0.2372337950418355070911304754053768L, 0.4082826787521750975302619288199080L,
+ 0.5917173212478249024697380711800920L, 0.7627662049581644929088695245946232L,
+ 0.8983332387068133697957769682379152L, 0.9801449282487681158417804342847365L },
+ { 0.0159198802461869550822118985481636L, 0.0819844463366821028502851059651326L, // degree 9
+ 0.1933142836497048013456489803292629L, 0.3378732882980955354807309926783317L,
+ 0.5000000000000000000000000000000000L, 0.6621267117019044645192690073216683L,
+ 0.8066857163502951986543510196707371L, 0.9180155536633178971497148940348674L,
+ 0.9840801197538130449177881014518364L },
+ { 0.0130467357414141399610179939577740L, 0.0674683166555077446339516557882535L, // degree 10
+ 0.1602952158504877968828363174425632L, 0.2833023029353764046003670284171079L,
+ 0.4255628305091843945575869994351400L, 0.5744371694908156054424130005648600L,
+ 0.7166976970646235953996329715828921L, 0.8397047841495122031171636825574368L,
+ 0.9325316833444922553660483442117465L, 0.9869532642585858600389820060422260L },
+ { 0.0108856709269715035980309994385713L, 0.0564687001159523504624211153480364L, // degree 11
+ 0.1349239972129753379532918739844233L, 0.2404519353965940920371371652706952L,
+ 0.3652284220238275138342340072995692L, 0.5000000000000000000000000000000000L,
+ 0.6347715779761724861657659927004308L, 0.7595480646034059079628628347293048L,
+ 0.8650760027870246620467081260155767L, 0.9435312998840476495375788846519636L,
+ 0.9891143290730284964019690005614287L } };
+
+ const RealScalar weights[][maxPadeDegree] = {
+ { 0.2777777777777777777777777777777778L, 0.4444444444444444444444444444444444L, // degree 3
+ 0.2777777777777777777777777777777778L },
+ { 0.1739274225687269286865319746109997L, 0.3260725774312730713134680253890003L, // degree 4
+ 0.3260725774312730713134680253890003L, 0.1739274225687269286865319746109997L },
+ { 0.1184634425280945437571320203599587L, 0.2393143352496832340206457574178191L, // degree 5
+ 0.2844444444444444444444444444444444L, 0.2393143352496832340206457574178191L,
+ 0.1184634425280945437571320203599587L },
+ { 0.0856622461895851725201480710863665L, 0.1803807865240693037849167569188581L, // degree 6
+ 0.2339569672863455236949351719947755L, 0.2339569672863455236949351719947755L,
+ 0.1803807865240693037849167569188581L, 0.0856622461895851725201480710863665L },
+ { 0.0647424830844348466353057163395410L, 0.1398526957446383339507338857118898L, // degree 7
+ 0.1909150252525594724751848877444876L, 0.2089795918367346938775510204081633L,
+ 0.1909150252525594724751848877444876L, 0.1398526957446383339507338857118898L,
+ 0.0647424830844348466353057163395410L },
+ { 0.0506142681451881295762656771549811L, 0.1111905172266872352721779972131204L, // degree 8
+ 0.1568533229389436436689811009933007L, 0.1813418916891809914825752246385978L,
+ 0.1813418916891809914825752246385978L, 0.1568533229389436436689811009933007L,
+ 0.1111905172266872352721779972131204L, 0.0506142681451881295762656771549811L },
+ { 0.0406371941807872059859460790552618L, 0.0903240803474287020292360156214564L, // degree 9
+ 0.1303053482014677311593714347093164L, 0.1561735385200014200343152032922218L,
+ 0.1651196775006298815822625346434870L, 0.1561735385200014200343152032922218L,
+ 0.1303053482014677311593714347093164L, 0.0903240803474287020292360156214564L,
+ 0.0406371941807872059859460790552618L },
+ { 0.0333356721543440687967844049466659L, 0.0747256745752902965728881698288487L, // degree 10
+ 0.1095431812579910219977674671140816L, 0.1346333596549981775456134607847347L,
+ 0.1477621123573764350869464973256692L, 0.1477621123573764350869464973256692L,
+ 0.1346333596549981775456134607847347L, 0.1095431812579910219977674671140816L,
+ 0.0747256745752902965728881698288487L, 0.0333356721543440687967844049466659L },
+ { 0.0278342835580868332413768602212743L, 0.0627901847324523123173471496119701L, // degree 11
+ 0.0931451054638671257130488207158280L, 0.1165968822959952399592618524215876L,
+ 0.1314022722551233310903444349452546L, 0.1364625433889503153572417641681711L,
+ 0.1314022722551233310903444349452546L, 0.1165968822959952399592618524215876L,
+ 0.0931451054638671257130488207158280L, 0.0627901847324523123173471496119701L,
+ 0.0278342835580868332413768602212743L } };
+
+ MatrixType TminusI = T - MatrixType::Identity(T.rows(), T.rows());
+ result.setZero(T.rows(), T.rows());
+ for (int k = 0; k < degree; ++k) {
+ RealScalar weight = weights[degree-minPadeDegree][k];
+ RealScalar node = nodes[degree-minPadeDegree][k];
+ result += weight * (MatrixType::Identity(T.rows(), T.rows()) + node * TminusI)
+ .template triangularView<Upper>().solve(TminusI);
+ }
+}
+
+/** \brief Compute logarithm of triangular matrices with size > 2.
+ * \details This uses a inverse scale-and-square algorithm. */
+template <typename MatrixType>
+void matrix_log_compute_big(const MatrixType& A, MatrixType& result)
+{
+ typedef typename MatrixType::Scalar Scalar;
+ typedef typename NumTraits<Scalar>::Real RealScalar;
+ using std::pow;
+
+ int numberOfSquareRoots = 0;
+ int numberOfExtraSquareRoots = 0;
+ int degree;
+ MatrixType T = A, sqrtT;
+
+ const int maxPadeDegree = matrix_log_max_pade_degree<Scalar>::value;
+ const RealScalar maxNormForPade = RealScalar(
+ maxPadeDegree<= 5? 5.3149729967117310e-1L: // single precision
+ maxPadeDegree<= 7? 2.6429608311114350e-1L: // double precision
+ maxPadeDegree<= 8? 2.32777776523703892094e-1L: // extended precision
+ maxPadeDegree<=10? 1.05026503471351080481093652651105e-1L: // double-double
+ 1.1880960220216759245467951592883642e-1L); // quadruple precision
+
+ while (true) {
+ RealScalar normTminusI = (T - MatrixType::Identity(T.rows(), T.rows())).cwiseAbs().colwise().sum().maxCoeff();
+ if (normTminusI < maxNormForPade) {
+ degree = matrix_log_get_pade_degree(normTminusI);
+ int degree2 = matrix_log_get_pade_degree(normTminusI / RealScalar(2));
+ if ((degree - degree2 <= 1) || (numberOfExtraSquareRoots == 1))
+ break;
+ ++numberOfExtraSquareRoots;
+ }
+ matrix_sqrt_triangular(T, sqrtT);
+ T = sqrtT.template triangularView<Upper>();
+ ++numberOfSquareRoots;
+ }
+
+ matrix_log_compute_pade(result, T, degree);
+ result *= pow(RealScalar(2), RealScalar(numberOfSquareRoots)); // TODO replace by bitshift if possible
+}
+
+/** \ingroup MatrixFunctions_Module
+ * \class MatrixLogarithmAtomic
+ * \brief Helper class for computing matrix logarithm of atomic matrices.
+ *
+ * Here, an atomic matrix is a triangular matrix whose diagonal entries are close to each other.
+ *
+ * \sa class MatrixFunctionAtomic, MatrixBase::log()
+ */
+template <typename MatrixType>
+class MatrixLogarithmAtomic
+{
+public:
+ /** \brief Compute matrix logarithm of atomic matrix
+ * \param[in] A argument of matrix logarithm, should be upper triangular and atomic
+ * \returns The logarithm of \p A.
+ */
+ MatrixType compute(const MatrixType& A);
+};
+
+template <typename MatrixType>
+MatrixType MatrixLogarithmAtomic<MatrixType>::compute(const MatrixType& A)
+{
+ using std::log;
+ MatrixType result(A.rows(), A.rows());
+ if (A.rows() == 1)
+ result(0,0) = log(A(0,0));
+ else if (A.rows() == 2)
+ matrix_log_compute_2x2(A, result);
+ else
+ matrix_log_compute_big(A, result);
+ return result;
+}
+
+} // end of namespace internal
+
+/** \ingroup MatrixFunctions_Module
+ *
+ * \brief Proxy for the matrix logarithm of some matrix (expression).
+ *
+ * \tparam Derived Type of the argument to the matrix function.
+ *
+ * This class holds the argument to the matrix function until it is
+ * assigned or evaluated for some other reason (so the argument
+ * should not be changed in the meantime). It is the return type of
+ * MatrixBase::log() and most of the time this is the only way it
+ * is used.
+ */
+template<typename Derived> class MatrixLogarithmReturnValue
+: public ReturnByValue<MatrixLogarithmReturnValue<Derived> >
+{
+public:
+ typedef typename Derived::Scalar Scalar;
+ typedef typename Derived::Index Index;
+
+protected:
+ typedef typename internal::ref_selector<Derived>::type DerivedNested;
+
+public:
+
+ /** \brief Constructor.
+ *
+ * \param[in] A %Matrix (expression) forming the argument of the matrix logarithm.
+ */
+ explicit MatrixLogarithmReturnValue(const Derived& A) : m_A(A) { }
+
+ /** \brief Compute the matrix logarithm.
+ *
+ * \param[out] result Logarithm of \c A, where \c A is as specified in the constructor.
+ */
+ template <typename ResultType>
+ inline void evalTo(ResultType& result) const
+ {
+ typedef typename internal::nested_eval<Derived, 10>::type DerivedEvalType;
+ typedef typename internal::remove_all<DerivedEvalType>::type DerivedEvalTypeClean;
+ typedef internal::traits<DerivedEvalTypeClean> Traits;
+ typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
+ typedef Matrix<ComplexScalar, Dynamic, Dynamic, 0, Traits::RowsAtCompileTime, Traits::ColsAtCompileTime> DynMatrixType;
+ typedef internal::MatrixLogarithmAtomic<DynMatrixType> AtomicType;
+ AtomicType atomic;
+
+ internal::matrix_function_compute<typename DerivedEvalTypeClean::PlainObject>::run(m_A, atomic, result);
+ }
+
+ Index rows() const { return m_A.rows(); }
+ Index cols() const { return m_A.cols(); }
+
+private:
+ const DerivedNested m_A;
+};
+
+namespace internal {
+ template<typename Derived>
+ struct traits<MatrixLogarithmReturnValue<Derived> >
+ {
+ typedef typename Derived::PlainObject ReturnType;
+ };
+}
+
+
+/********** MatrixBase method **********/
+
+
+template <typename Derived>
+const MatrixLogarithmReturnValue<Derived> MatrixBase<Derived>::log() const
+{
+ eigen_assert(rows() == cols());
+ return MatrixLogarithmReturnValue<Derived>(derived());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATRIX_LOGARITHM
diff --git a/src/EigenUnsupported/src/MatrixFunctions/MatrixPower.h b/src/EigenUnsupported/src/MatrixFunctions/MatrixPower.h
new file mode 100644
index 0000000..d7672d7
--- /dev/null
+++ b/src/EigenUnsupported/src/MatrixFunctions/MatrixPower.h
@@ -0,0 +1,705 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012, 2013 Chen-Pang He <jdh8@ms63.hinet.net>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATRIX_POWER
+#define EIGEN_MATRIX_POWER
+
+namespace Eigen {
+
+template<typename MatrixType> class MatrixPower;
+
+/**
+ * \ingroup MatrixFunctions_Module
+ *
+ * \brief Proxy for the matrix power of some matrix.
+ *
+ * \tparam MatrixType type of the base, a matrix.
+ *
+ * This class holds the arguments to the matrix power until it is
+ * assigned or evaluated for some other reason (so the argument
+ * should not be changed in the meantime). It is the return type of
+ * MatrixPower::operator() and related functions and most of the
+ * time this is the only way it is used.
+ */
+/* TODO This class is only used by MatrixPower, so it should be nested
+ * into MatrixPower, like MatrixPower::ReturnValue. However, my
+ * compiler complained about unused template parameter in the
+ * following declaration in namespace internal.
+ *
+ * template<typename MatrixType>
+ * struct traits<MatrixPower<MatrixType>::ReturnValue>;
+ */
+template<typename MatrixType>
+class MatrixPowerParenthesesReturnValue : public ReturnByValue< MatrixPowerParenthesesReturnValue<MatrixType> >
+{
+ public:
+ typedef typename MatrixType::RealScalar RealScalar;
+
+ /**
+ * \brief Constructor.
+ *
+ * \param[in] pow %MatrixPower storing the base.
+ * \param[in] p scalar, the exponent of the matrix power.
+ */
+ MatrixPowerParenthesesReturnValue(MatrixPower<MatrixType>& pow, RealScalar p) : m_pow(pow), m_p(p)
+ { }
+
+ /**
+ * \brief Compute the matrix power.
+ *
+ * \param[out] result
+ */
+ template<typename ResultType>
+ inline void evalTo(ResultType& result) const
+ { m_pow.compute(result, m_p); }
+
+ Index rows() const { return m_pow.rows(); }
+ Index cols() const { return m_pow.cols(); }
+
+ private:
+ MatrixPower<MatrixType>& m_pow;
+ const RealScalar m_p;
+};
+
+/**
+ * \ingroup MatrixFunctions_Module
+ *
+ * \brief Class for computing matrix powers.
+ *
+ * \tparam MatrixType type of the base, expected to be an instantiation
+ * of the Matrix class template.
+ *
+ * This class is capable of computing triangular real/complex matrices
+ * raised to a power in the interval \f$ (-1, 1) \f$.
+ *
+ * \note Currently this class is only used by MatrixPower. One may
+ * insist that this be nested into MatrixPower. This class is here to
+ * facilitate future development of triangular matrix functions.
+ */
+template<typename MatrixType>
+class MatrixPowerAtomic : internal::noncopyable
+{
+ private:
+ enum {
+ RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+ MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime
+ };
+ typedef typename MatrixType::Scalar Scalar;
+ typedef typename MatrixType::RealScalar RealScalar;
+ typedef std::complex<RealScalar> ComplexScalar;
+ typedef Block<MatrixType,Dynamic,Dynamic> ResultType;
+
+ const MatrixType& m_A;
+ RealScalar m_p;
+
+ void computePade(int degree, const MatrixType& IminusT, ResultType& res) const;
+ void compute2x2(ResultType& res, RealScalar p) const;
+ void computeBig(ResultType& res) const;
+ static int getPadeDegree(float normIminusT);
+ static int getPadeDegree(double normIminusT);
+ static int getPadeDegree(long double normIminusT);
+ static ComplexScalar computeSuperDiag(const ComplexScalar&, const ComplexScalar&, RealScalar p);
+ static RealScalar computeSuperDiag(RealScalar, RealScalar, RealScalar p);
+
+ public:
+ /**
+ * \brief Constructor.
+ *
+ * \param[in] T the base of the matrix power.
+ * \param[in] p the exponent of the matrix power, should be in
+ * \f$ (-1, 1) \f$.
+ *
+ * The class stores a reference to T, so it should not be changed
+ * (or destroyed) before evaluation. Only the upper triangular
+ * part of T is read.
+ */
+ MatrixPowerAtomic(const MatrixType& T, RealScalar p);
+
+ /**
+ * \brief Compute the matrix power.
+ *
+ * \param[out] res \f$ A^p \f$ where A and p are specified in the
+ * constructor.
+ */
+ void compute(ResultType& res) const;
+};
+
+template<typename MatrixType>
+MatrixPowerAtomic<MatrixType>::MatrixPowerAtomic(const MatrixType& T, RealScalar p) :
+ m_A(T), m_p(p)
+{
+ eigen_assert(T.rows() == T.cols());
+ eigen_assert(p > -1 && p < 1);
+}
+
+template<typename MatrixType>
+void MatrixPowerAtomic<MatrixType>::compute(ResultType& res) const
+{
+ using std::pow;
+ switch (m_A.rows()) {
+ case 0:
+ break;
+ case 1:
+ res(0,0) = pow(m_A(0,0), m_p);
+ break;
+ case 2:
+ compute2x2(res, m_p);
+ break;
+ default:
+ computeBig(res);
+ }
+}
+
+template<typename MatrixType>
+void MatrixPowerAtomic<MatrixType>::computePade(int degree, const MatrixType& IminusT, ResultType& res) const
+{
+ int i = 2*degree;
+ res = (m_p-RealScalar(degree)) / RealScalar(2*i-2) * IminusT;
+
+ for (--i; i; --i) {
+ res = (MatrixType::Identity(IminusT.rows(), IminusT.cols()) + res).template triangularView<Upper>()
+ .solve((i==1 ? -m_p : i&1 ? (-m_p-RealScalar(i/2))/RealScalar(2*i) : (m_p-RealScalar(i/2))/RealScalar(2*i-2)) * IminusT).eval();
+ }
+ res += MatrixType::Identity(IminusT.rows(), IminusT.cols());
+}
+
+// This function assumes that res has the correct size (see bug 614)
+template<typename MatrixType>
+void MatrixPowerAtomic<MatrixType>::compute2x2(ResultType& res, RealScalar p) const
+{
+ using std::abs;
+ using std::pow;
+ res.coeffRef(0,0) = pow(m_A.coeff(0,0), p);
+
+ for (Index i=1; i < m_A.cols(); ++i) {
+ res.coeffRef(i,i) = pow(m_A.coeff(i,i), p);
+ if (m_A.coeff(i-1,i-1) == m_A.coeff(i,i))
+ res.coeffRef(i-1,i) = p * pow(m_A.coeff(i,i), p-1);
+ else if (2*abs(m_A.coeff(i-1,i-1)) < abs(m_A.coeff(i,i)) || 2*abs(m_A.coeff(i,i)) < abs(m_A.coeff(i-1,i-1)))
+ res.coeffRef(i-1,i) = (res.coeff(i,i)-res.coeff(i-1,i-1)) / (m_A.coeff(i,i)-m_A.coeff(i-1,i-1));
+ else
+ res.coeffRef(i-1,i) = computeSuperDiag(m_A.coeff(i,i), m_A.coeff(i-1,i-1), p);
+ res.coeffRef(i-1,i) *= m_A.coeff(i-1,i);
+ }
+}
+
+template<typename MatrixType>
+void MatrixPowerAtomic<MatrixType>::computeBig(ResultType& res) const
+{
+ using std::ldexp;
+ const int digits = std::numeric_limits<RealScalar>::digits;
+ const RealScalar maxNormForPade = RealScalar(
+ digits <= 24? 4.3386528e-1L // single precision
+ : digits <= 53? 2.789358995219730e-1L // double precision
+ : digits <= 64? 2.4471944416607995472e-1L // extended precision
+ : digits <= 106? 1.1016843812851143391275867258512e-1L // double-double
+ : 9.134603732914548552537150753385375e-2L); // quadruple precision
+ MatrixType IminusT, sqrtT, T = m_A.template triangularView<Upper>();
+ RealScalar normIminusT;
+ int degree, degree2, numberOfSquareRoots = 0;
+ bool hasExtraSquareRoot = false;
+
+ for (Index i=0; i < m_A.cols(); ++i)
+ eigen_assert(m_A(i,i) != RealScalar(0));
+
+ while (true) {
+ IminusT = MatrixType::Identity(m_A.rows(), m_A.cols()) - T;
+ normIminusT = IminusT.cwiseAbs().colwise().sum().maxCoeff();
+ if (normIminusT < maxNormForPade) {
+ degree = getPadeDegree(normIminusT);
+ degree2 = getPadeDegree(normIminusT/2);
+ if (degree - degree2 <= 1 || hasExtraSquareRoot)
+ break;
+ hasExtraSquareRoot = true;
+ }
+ matrix_sqrt_triangular(T, sqrtT);
+ T = sqrtT.template triangularView<Upper>();
+ ++numberOfSquareRoots;
+ }
+ computePade(degree, IminusT, res);
+
+ for (; numberOfSquareRoots; --numberOfSquareRoots) {
+ compute2x2(res, ldexp(m_p, -numberOfSquareRoots));
+ res = res.template triangularView<Upper>() * res;
+ }
+ compute2x2(res, m_p);
+}
+
+template<typename MatrixType>
+inline int MatrixPowerAtomic<MatrixType>::getPadeDegree(float normIminusT)
+{
+ const float maxNormForPade[] = { 2.8064004e-1f /* degree = 3 */ , 4.3386528e-1f };
+ int degree = 3;
+ for (; degree <= 4; ++degree)
+ if (normIminusT <= maxNormForPade[degree - 3])
+ break;
+ return degree;
+}
+
+template<typename MatrixType>
+inline int MatrixPowerAtomic<MatrixType>::getPadeDegree(double normIminusT)
+{
+ const double maxNormForPade[] = { 1.884160592658218e-2 /* degree = 3 */ , 6.038881904059573e-2, 1.239917516308172e-1,
+ 1.999045567181744e-1, 2.789358995219730e-1 };
+ int degree = 3;
+ for (; degree <= 7; ++degree)
+ if (normIminusT <= maxNormForPade[degree - 3])
+ break;
+ return degree;
+}
+
+template<typename MatrixType>
+inline int MatrixPowerAtomic<MatrixType>::getPadeDegree(long double normIminusT)
+{
+#if LDBL_MANT_DIG == 53
+ const int maxPadeDegree = 7;
+ const double maxNormForPade[] = { 1.884160592658218e-2L /* degree = 3 */ , 6.038881904059573e-2L, 1.239917516308172e-1L,
+ 1.999045567181744e-1L, 2.789358995219730e-1L };
+#elif LDBL_MANT_DIG <= 64
+ const int maxPadeDegree = 8;
+ const long double maxNormForPade[] = { 6.3854693117491799460e-3L /* degree = 3 */ , 2.6394893435456973676e-2L,
+ 6.4216043030404063729e-2L, 1.1701165502926694307e-1L, 1.7904284231268670284e-1L, 2.4471944416607995472e-1L };
+#elif LDBL_MANT_DIG <= 106
+ const int maxPadeDegree = 10;
+ const double maxNormForPade[] = { 1.0007161601787493236741409687186e-4L /* degree = 3 */ ,
+ 1.0007161601787493236741409687186e-3L, 4.7069769360887572939882574746264e-3L, 1.3220386624169159689406653101695e-2L,
+ 2.8063482381631737920612944054906e-2L, 4.9625993951953473052385361085058e-2L, 7.7367040706027886224557538328171e-2L,
+ 1.1016843812851143391275867258512e-1L };
+#else
+ const int maxPadeDegree = 10;
+ const double maxNormForPade[] = { 5.524506147036624377378713555116378e-5L /* degree = 3 */ ,
+ 6.640600568157479679823602193345995e-4L, 3.227716520106894279249709728084626e-3L,
+ 9.619593944683432960546978734646284e-3L, 2.134595382433742403911124458161147e-2L,
+ 3.908166513900489428442993794761185e-2L, 6.266780814639442865832535460550138e-2L,
+ 9.134603732914548552537150753385375e-2L };
+#endif
+ int degree = 3;
+ for (; degree <= maxPadeDegree; ++degree)
+ if (normIminusT <= maxNormForPade[degree - 3])
+ break;
+ return degree;
+}
+
+template<typename MatrixType>
+inline typename MatrixPowerAtomic<MatrixType>::ComplexScalar
+MatrixPowerAtomic<MatrixType>::computeSuperDiag(const ComplexScalar& curr, const ComplexScalar& prev, RealScalar p)
+{
+ using std::ceil;
+ using std::exp;
+ using std::log;
+ using std::sinh;
+
+ ComplexScalar logCurr = log(curr);
+ ComplexScalar logPrev = log(prev);
+ RealScalar unwindingNumber = ceil((numext::imag(logCurr - logPrev) - RealScalar(EIGEN_PI)) / RealScalar(2*EIGEN_PI));
+ ComplexScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2) + ComplexScalar(0, RealScalar(EIGEN_PI)*unwindingNumber);
+ return RealScalar(2) * exp(RealScalar(0.5) * p * (logCurr + logPrev)) * sinh(p * w) / (curr - prev);
+}
+
+template<typename MatrixType>
+inline typename MatrixPowerAtomic<MatrixType>::RealScalar
+MatrixPowerAtomic<MatrixType>::computeSuperDiag(RealScalar curr, RealScalar prev, RealScalar p)
+{
+ using std::exp;
+ using std::log;
+ using std::sinh;
+
+ RealScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2);
+ return 2 * exp(p * (log(curr) + log(prev)) / 2) * sinh(p * w) / (curr - prev);
+}
+
+/**
+ * \ingroup MatrixFunctions_Module
+ *
+ * \brief Class for computing matrix powers.
+ *
+ * \tparam MatrixType type of the base, expected to be an instantiation
+ * of the Matrix class template.
+ *
+ * This class is capable of computing real/complex matrices raised to
+ * an arbitrary real power. Meanwhile, it saves the result of Schur
+ * decomposition if an non-integral power has even been calculated.
+ * Therefore, if you want to compute multiple (>= 2) matrix powers
+ * for the same matrix, using the class directly is more efficient than
+ * calling MatrixBase::pow().
+ *
+ * Example:
+ * \include MatrixPower_optimal.cpp
+ * Output: \verbinclude MatrixPower_optimal.out
+ */
+template<typename MatrixType>
+class MatrixPower : internal::noncopyable
+{
+ private:
+ typedef typename MatrixType::Scalar Scalar;
+ typedef typename MatrixType::RealScalar RealScalar;
+
+ public:
+ /**
+ * \brief Constructor.
+ *
+ * \param[in] A the base of the matrix power.
+ *
+ * The class stores a reference to A, so it should not be changed
+ * (or destroyed) before evaluation.
+ */
+ explicit MatrixPower(const MatrixType& A) :
+ m_A(A),
+ m_conditionNumber(0),
+ m_rank(A.cols()),
+ m_nulls(0)
+ { eigen_assert(A.rows() == A.cols()); }
+
+ /**
+ * \brief Returns the matrix power.
+ *
+ * \param[in] p exponent, a real scalar.
+ * \return The expression \f$ A^p \f$, where A is specified in the
+ * constructor.
+ */
+ const MatrixPowerParenthesesReturnValue<MatrixType> operator()(RealScalar p)
+ { return MatrixPowerParenthesesReturnValue<MatrixType>(*this, p); }
+
+ /**
+ * \brief Compute the matrix power.
+ *
+ * \param[in] p exponent, a real scalar.
+ * \param[out] res \f$ A^p \f$ where A is specified in the
+ * constructor.
+ */
+ template<typename ResultType>
+ void compute(ResultType& res, RealScalar p);
+
+ Index rows() const { return m_A.rows(); }
+ Index cols() const { return m_A.cols(); }
+
+ private:
+ typedef std::complex<RealScalar> ComplexScalar;
+ typedef Matrix<ComplexScalar, Dynamic, Dynamic, 0,
+ MatrixType::RowsAtCompileTime, MatrixType::ColsAtCompileTime> ComplexMatrix;
+
+ /** \brief Reference to the base of matrix power. */
+ typename MatrixType::Nested m_A;
+
+ /** \brief Temporary storage. */
+ MatrixType m_tmp;
+
+ /** \brief Store the result of Schur decomposition. */
+ ComplexMatrix m_T, m_U;
+
+ /** \brief Store fractional power of m_T. */
+ ComplexMatrix m_fT;
+
+ /**
+ * \brief Condition number of m_A.
+ *
+ * It is initialized as 0 to avoid performing unnecessary Schur
+ * decomposition, which is the bottleneck.
+ */
+ RealScalar m_conditionNumber;
+
+ /** \brief Rank of m_A. */
+ Index m_rank;
+
+ /** \brief Rank deficiency of m_A. */
+ Index m_nulls;
+
+ /**
+ * \brief Split p into integral part and fractional part.
+ *
+ * \param[in] p The exponent.
+ * \param[out] p The fractional part ranging in \f$ (-1, 1) \f$.
+ * \param[out] intpart The integral part.
+ *
+ * Only if the fractional part is nonzero, it calls initialize().
+ */
+ void split(RealScalar& p, RealScalar& intpart);
+
+ /** \brief Perform Schur decomposition for fractional power. */
+ void initialize();
+
+ template<typename ResultType>
+ void computeIntPower(ResultType& res, RealScalar p);
+
+ template<typename ResultType>
+ void computeFracPower(ResultType& res, RealScalar p);
+
+ template<int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+ static void revertSchur(
+ Matrix<ComplexScalar, Rows, Cols, Options, MaxRows, MaxCols>& res,
+ const ComplexMatrix& T,
+ const ComplexMatrix& U);
+
+ template<int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+ static void revertSchur(
+ Matrix<RealScalar, Rows, Cols, Options, MaxRows, MaxCols>& res,
+ const ComplexMatrix& T,
+ const ComplexMatrix& U);
+};
+
+template<typename MatrixType>
+template<typename ResultType>
+void MatrixPower<MatrixType>::compute(ResultType& res, RealScalar p)
+{
+ using std::pow;
+ switch (cols()) {
+ case 0:
+ break;
+ case 1:
+ res(0,0) = pow(m_A.coeff(0,0), p);
+ break;
+ default:
+ RealScalar intpart;
+ split(p, intpart);
+
+ res = MatrixType::Identity(rows(), cols());
+ computeIntPower(res, intpart);
+ if (p) computeFracPower(res, p);
+ }
+}
+
+template<typename MatrixType>
+void MatrixPower<MatrixType>::split(RealScalar& p, RealScalar& intpart)
+{
+ using std::floor;
+ using std::pow;
+
+ intpart = floor(p);
+ p -= intpart;
+
+ // Perform Schur decomposition if it is not yet performed and the power is
+ // not an integer.
+ if (!m_conditionNumber && p)
+ initialize();
+
+ // Choose the more stable of intpart = floor(p) and intpart = ceil(p).
+ if (p > RealScalar(0.5) && p > (1-p) * pow(m_conditionNumber, p)) {
+ --p;
+ ++intpart;
+ }
+}
+
+template<typename MatrixType>
+void MatrixPower<MatrixType>::initialize()
+{
+ const ComplexSchur<MatrixType> schurOfA(m_A);
+ JacobiRotation<ComplexScalar> rot;
+ ComplexScalar eigenvalue;
+
+ m_fT.resizeLike(m_A);
+ m_T = schurOfA.matrixT();
+ m_U = schurOfA.matrixU();
+ m_conditionNumber = m_T.diagonal().array().abs().maxCoeff() / m_T.diagonal().array().abs().minCoeff();
+
+ // Move zero eigenvalues to the bottom right corner.
+ for (Index i = cols()-1; i>=0; --i) {
+ if (m_rank <= 2)
+ return;
+ if (m_T.coeff(i,i) == RealScalar(0)) {
+ for (Index j=i+1; j < m_rank; ++j) {
+ eigenvalue = m_T.coeff(j,j);
+ rot.makeGivens(m_T.coeff(j-1,j), eigenvalue);
+ m_T.applyOnTheRight(j-1, j, rot);
+ m_T.applyOnTheLeft(j-1, j, rot.adjoint());
+ m_T.coeffRef(j-1,j-1) = eigenvalue;
+ m_T.coeffRef(j,j) = RealScalar(0);
+ m_U.applyOnTheRight(j-1, j, rot);
+ }
+ --m_rank;
+ }
+ }
+
+ m_nulls = rows() - m_rank;
+ if (m_nulls) {
+ eigen_assert(m_T.bottomRightCorner(m_nulls, m_nulls).isZero()
+ && "Base of matrix power should be invertible or with a semisimple zero eigenvalue.");
+ m_fT.bottomRows(m_nulls).fill(RealScalar(0));
+ }
+}
+
+template<typename MatrixType>
+template<typename ResultType>
+void MatrixPower<MatrixType>::computeIntPower(ResultType& res, RealScalar p)
+{
+ using std::abs;
+ using std::fmod;
+ RealScalar pp = abs(p);
+
+ if (p<0)
+ m_tmp = m_A.inverse();
+ else
+ m_tmp = m_A;
+
+ while (true) {
+ if (fmod(pp, 2) >= 1)
+ res = m_tmp * res;
+ pp /= 2;
+ if (pp < 1)
+ break;
+ m_tmp *= m_tmp;
+ }
+}
+
+template<typename MatrixType>
+template<typename ResultType>
+void MatrixPower<MatrixType>::computeFracPower(ResultType& res, RealScalar p)
+{
+ Block<ComplexMatrix,Dynamic,Dynamic> blockTp(m_fT, 0, 0, m_rank, m_rank);
+ eigen_assert(m_conditionNumber);
+ eigen_assert(m_rank + m_nulls == rows());
+
+ MatrixPowerAtomic<ComplexMatrix>(m_T.topLeftCorner(m_rank, m_rank), p).compute(blockTp);
+ if (m_nulls) {
+ m_fT.topRightCorner(m_rank, m_nulls) = m_T.topLeftCorner(m_rank, m_rank).template triangularView<Upper>()
+ .solve(blockTp * m_T.topRightCorner(m_rank, m_nulls));
+ }
+ revertSchur(m_tmp, m_fT, m_U);
+ res = m_tmp * res;
+}
+
+template<typename MatrixType>
+template<int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+inline void MatrixPower<MatrixType>::revertSchur(
+ Matrix<ComplexScalar, Rows, Cols, Options, MaxRows, MaxCols>& res,
+ const ComplexMatrix& T,
+ const ComplexMatrix& U)
+{ res.noalias() = U * (T.template triangularView<Upper>() * U.adjoint()); }
+
+template<typename MatrixType>
+template<int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+inline void MatrixPower<MatrixType>::revertSchur(
+ Matrix<RealScalar, Rows, Cols, Options, MaxRows, MaxCols>& res,
+ const ComplexMatrix& T,
+ const ComplexMatrix& U)
+{ res.noalias() = (U * (T.template triangularView<Upper>() * U.adjoint())).real(); }
+
+/**
+ * \ingroup MatrixFunctions_Module
+ *
+ * \brief Proxy for the matrix power of some matrix (expression).
+ *
+ * \tparam Derived type of the base, a matrix (expression).
+ *
+ * This class holds the arguments to the matrix power until it is
+ * assigned or evaluated for some other reason (so the argument
+ * should not be changed in the meantime). It is the return type of
+ * MatrixBase::pow() and related functions and most of the
+ * time this is the only way it is used.
+ */
+template<typename Derived>
+class MatrixPowerReturnValue : public ReturnByValue< MatrixPowerReturnValue<Derived> >
+{
+ public:
+ typedef typename Derived::PlainObject PlainObject;
+ typedef typename Derived::RealScalar RealScalar;
+
+ /**
+ * \brief Constructor.
+ *
+ * \param[in] A %Matrix (expression), the base of the matrix power.
+ * \param[in] p real scalar, the exponent of the matrix power.
+ */
+ MatrixPowerReturnValue(const Derived& A, RealScalar p) : m_A(A), m_p(p)
+ { }
+
+ /**
+ * \brief Compute the matrix power.
+ *
+ * \param[out] result \f$ A^p \f$ where \p A and \p p are as in the
+ * constructor.
+ */
+ template<typename ResultType>
+ inline void evalTo(ResultType& result) const
+ { MatrixPower<PlainObject>(m_A.eval()).compute(result, m_p); }
+
+ Index rows() const { return m_A.rows(); }
+ Index cols() const { return m_A.cols(); }
+
+ private:
+ const Derived& m_A;
+ const RealScalar m_p;
+};
+
+/**
+ * \ingroup MatrixFunctions_Module
+ *
+ * \brief Proxy for the matrix power of some matrix (expression).
+ *
+ * \tparam Derived type of the base, a matrix (expression).
+ *
+ * This class holds the arguments to the matrix power until it is
+ * assigned or evaluated for some other reason (so the argument
+ * should not be changed in the meantime). It is the return type of
+ * MatrixBase::pow() and related functions and most of the
+ * time this is the only way it is used.
+ */
+template<typename Derived>
+class MatrixComplexPowerReturnValue : public ReturnByValue< MatrixComplexPowerReturnValue<Derived> >
+{
+ public:
+ typedef typename Derived::PlainObject PlainObject;
+ typedef typename std::complex<typename Derived::RealScalar> ComplexScalar;
+
+ /**
+ * \brief Constructor.
+ *
+ * \param[in] A %Matrix (expression), the base of the matrix power.
+ * \param[in] p complex scalar, the exponent of the matrix power.
+ */
+ MatrixComplexPowerReturnValue(const Derived& A, const ComplexScalar& p) : m_A(A), m_p(p)
+ { }
+
+ /**
+ * \brief Compute the matrix power.
+ *
+ * Because \p p is complex, \f$ A^p \f$ is simply evaluated as \f$
+ * \exp(p \log(A)) \f$.
+ *
+ * \param[out] result \f$ A^p \f$ where \p A and \p p are as in the
+ * constructor.
+ */
+ template<typename ResultType>
+ inline void evalTo(ResultType& result) const
+ { result = (m_p * m_A.log()).exp(); }
+
+ Index rows() const { return m_A.rows(); }
+ Index cols() const { return m_A.cols(); }
+
+ private:
+ const Derived& m_A;
+ const ComplexScalar m_p;
+};
+
+namespace internal {
+
+template<typename MatrixPowerType>
+struct traits< MatrixPowerParenthesesReturnValue<MatrixPowerType> >
+{ typedef typename MatrixPowerType::PlainObject ReturnType; };
+
+template<typename Derived>
+struct traits< MatrixPowerReturnValue<Derived> >
+{ typedef typename Derived::PlainObject ReturnType; };
+
+template<typename Derived>
+struct traits< MatrixComplexPowerReturnValue<Derived> >
+{ typedef typename Derived::PlainObject ReturnType; };
+
+}
+
+template<typename Derived>
+const MatrixPowerReturnValue<Derived> MatrixBase<Derived>::pow(const RealScalar& p) const
+{ return MatrixPowerReturnValue<Derived>(derived(), p); }
+
+template<typename Derived>
+const MatrixComplexPowerReturnValue<Derived> MatrixBase<Derived>::pow(const std::complex<RealScalar>& p) const
+{ return MatrixComplexPowerReturnValue<Derived>(derived(), p); }
+
+} // namespace Eigen
+
+#endif // EIGEN_MATRIX_POWER
diff --git a/src/EigenUnsupported/src/MatrixFunctions/MatrixSquareRoot.h b/src/EigenUnsupported/src/MatrixFunctions/MatrixSquareRoot.h
new file mode 100644
index 0000000..e363e77
--- /dev/null
+++ b/src/EigenUnsupported/src/MatrixFunctions/MatrixSquareRoot.h
@@ -0,0 +1,368 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011, 2013 Jitse Niesen <jitse@maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATRIX_SQUARE_ROOT
+#define EIGEN_MATRIX_SQUARE_ROOT
+
+namespace Eigen {
+
+namespace internal {
+
+// pre: T.block(i,i,2,2) has complex conjugate eigenvalues
+// post: sqrtT.block(i,i,2,2) is square root of T.block(i,i,2,2)
+template <typename MatrixType, typename ResultType>
+void matrix_sqrt_quasi_triangular_2x2_diagonal_block(const MatrixType& T, Index i, ResultType& sqrtT)
+{
+ // TODO: This case (2-by-2 blocks with complex conjugate eigenvalues) is probably hidden somewhere
+ // in EigenSolver. If we expose it, we could call it directly from here.
+ typedef typename traits<MatrixType>::Scalar Scalar;
+ Matrix<Scalar,2,2> block = T.template block<2,2>(i,i);
+ EigenSolver<Matrix<Scalar,2,2> > es(block);
+ sqrtT.template block<2,2>(i,i)
+ = (es.eigenvectors() * es.eigenvalues().cwiseSqrt().asDiagonal() * es.eigenvectors().inverse()).real();
+}
+
+// pre: block structure of T is such that (i,j) is a 1x1 block,
+// all blocks of sqrtT to left of and below (i,j) are correct
+// post: sqrtT(i,j) has the correct value
+template <typename MatrixType, typename ResultType>
+void matrix_sqrt_quasi_triangular_1x1_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT)
+{
+ typedef typename traits<MatrixType>::Scalar Scalar;
+ Scalar tmp = (sqrtT.row(i).segment(i+1,j-i-1) * sqrtT.col(j).segment(i+1,j-i-1)).value();
+ sqrtT.coeffRef(i,j) = (T.coeff(i,j) - tmp) / (sqrtT.coeff(i,i) + sqrtT.coeff(j,j));
+}
+
+// similar to compute1x1offDiagonalBlock()
+template <typename MatrixType, typename ResultType>
+void matrix_sqrt_quasi_triangular_1x2_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT)
+{
+ typedef typename traits<MatrixType>::Scalar Scalar;
+ Matrix<Scalar,1,2> rhs = T.template block<1,2>(i,j);
+ if (j-i > 1)
+ rhs -= sqrtT.block(i, i+1, 1, j-i-1) * sqrtT.block(i+1, j, j-i-1, 2);
+ Matrix<Scalar,2,2> A = sqrtT.coeff(i,i) * Matrix<Scalar,2,2>::Identity();
+ A += sqrtT.template block<2,2>(j,j).transpose();
+ sqrtT.template block<1,2>(i,j).transpose() = A.fullPivLu().solve(rhs.transpose());
+}
+
+// similar to compute1x1offDiagonalBlock()
+template <typename MatrixType, typename ResultType>
+void matrix_sqrt_quasi_triangular_2x1_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT)
+{
+ typedef typename traits<MatrixType>::Scalar Scalar;
+ Matrix<Scalar,2,1> rhs = T.template block<2,1>(i,j);
+ if (j-i > 2)
+ rhs -= sqrtT.block(i, i+2, 2, j-i-2) * sqrtT.block(i+2, j, j-i-2, 1);
+ Matrix<Scalar,2,2> A = sqrtT.coeff(j,j) * Matrix<Scalar,2,2>::Identity();
+ A += sqrtT.template block<2,2>(i,i);
+ sqrtT.template block<2,1>(i,j) = A.fullPivLu().solve(rhs);
+}
+
+// solves the equation A X + X B = C where all matrices are 2-by-2
+template <typename MatrixType>
+void matrix_sqrt_quasi_triangular_solve_auxiliary_equation(MatrixType& X, const MatrixType& A, const MatrixType& B, const MatrixType& C)
+{
+ typedef typename traits<MatrixType>::Scalar Scalar;
+ Matrix<Scalar,4,4> coeffMatrix = Matrix<Scalar,4,4>::Zero();
+ coeffMatrix.coeffRef(0,0) = A.coeff(0,0) + B.coeff(0,0);
+ coeffMatrix.coeffRef(1,1) = A.coeff(0,0) + B.coeff(1,1);
+ coeffMatrix.coeffRef(2,2) = A.coeff(1,1) + B.coeff(0,0);
+ coeffMatrix.coeffRef(3,3) = A.coeff(1,1) + B.coeff(1,1);
+ coeffMatrix.coeffRef(0,1) = B.coeff(1,0);
+ coeffMatrix.coeffRef(0,2) = A.coeff(0,1);
+ coeffMatrix.coeffRef(1,0) = B.coeff(0,1);
+ coeffMatrix.coeffRef(1,3) = A.coeff(0,1);
+ coeffMatrix.coeffRef(2,0) = A.coeff(1,0);
+ coeffMatrix.coeffRef(2,3) = B.coeff(1,0);
+ coeffMatrix.coeffRef(3,1) = A.coeff(1,0);
+ coeffMatrix.coeffRef(3,2) = B.coeff(0,1);
+
+ Matrix<Scalar,4,1> rhs;
+ rhs.coeffRef(0) = C.coeff(0,0);
+ rhs.coeffRef(1) = C.coeff(0,1);
+ rhs.coeffRef(2) = C.coeff(1,0);
+ rhs.coeffRef(3) = C.coeff(1,1);
+
+ Matrix<Scalar,4,1> result;
+ result = coeffMatrix.fullPivLu().solve(rhs);
+
+ X.coeffRef(0,0) = result.coeff(0);
+ X.coeffRef(0,1) = result.coeff(1);
+ X.coeffRef(1,0) = result.coeff(2);
+ X.coeffRef(1,1) = result.coeff(3);
+}
+
+// similar to compute1x1offDiagonalBlock()
+template <typename MatrixType, typename ResultType>
+void matrix_sqrt_quasi_triangular_2x2_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT)
+{
+ typedef typename traits<MatrixType>::Scalar Scalar;
+ Matrix<Scalar,2,2> A = sqrtT.template block<2,2>(i,i);
+ Matrix<Scalar,2,2> B = sqrtT.template block<2,2>(j,j);
+ Matrix<Scalar,2,2> C = T.template block<2,2>(i,j);
+ if (j-i > 2)
+ C -= sqrtT.block(i, i+2, 2, j-i-2) * sqrtT.block(i+2, j, j-i-2, 2);
+ Matrix<Scalar,2,2> X;
+ matrix_sqrt_quasi_triangular_solve_auxiliary_equation(X, A, B, C);
+ sqrtT.template block<2,2>(i,j) = X;
+}
+
+// pre: T is quasi-upper-triangular and sqrtT is a zero matrix of the same size
+// post: the diagonal blocks of sqrtT are the square roots of the diagonal blocks of T
+template <typename MatrixType, typename ResultType>
+void matrix_sqrt_quasi_triangular_diagonal(const MatrixType& T, ResultType& sqrtT)
+{
+ using std::sqrt;
+ const Index size = T.rows();
+ for (Index i = 0; i < size; i++) {
+ if (i == size - 1 || T.coeff(i+1, i) == 0) {
+ eigen_assert(T(i,i) >= 0);
+ sqrtT.coeffRef(i,i) = sqrt(T.coeff(i,i));
+ }
+ else {
+ matrix_sqrt_quasi_triangular_2x2_diagonal_block(T, i, sqrtT);
+ ++i;
+ }
+ }
+}
+
+// pre: T is quasi-upper-triangular and diagonal blocks of sqrtT are square root of diagonal blocks of T.
+// post: sqrtT is the square root of T.
+template <typename MatrixType, typename ResultType>
+void matrix_sqrt_quasi_triangular_off_diagonal(const MatrixType& T, ResultType& sqrtT)
+{
+ const Index size = T.rows();
+ for (Index j = 1; j < size; j++) {
+ if (T.coeff(j, j-1) != 0) // if T(j-1:j, j-1:j) is a 2-by-2 block
+ continue;
+ for (Index i = j-1; i >= 0; i--) {
+ if (i > 0 && T.coeff(i, i-1) != 0) // if T(i-1:i, i-1:i) is a 2-by-2 block
+ continue;
+ bool iBlockIs2x2 = (i < size - 1) && (T.coeff(i+1, i) != 0);
+ bool jBlockIs2x2 = (j < size - 1) && (T.coeff(j+1, j) != 0);
+ if (iBlockIs2x2 && jBlockIs2x2)
+ matrix_sqrt_quasi_triangular_2x2_off_diagonal_block(T, i, j, sqrtT);
+ else if (iBlockIs2x2 && !jBlockIs2x2)
+ matrix_sqrt_quasi_triangular_2x1_off_diagonal_block(T, i, j, sqrtT);
+ else if (!iBlockIs2x2 && jBlockIs2x2)
+ matrix_sqrt_quasi_triangular_1x2_off_diagonal_block(T, i, j, sqrtT);
+ else if (!iBlockIs2x2 && !jBlockIs2x2)
+ matrix_sqrt_quasi_triangular_1x1_off_diagonal_block(T, i, j, sqrtT);
+ }
+ }
+}
+
+} // end of namespace internal
+
+/** \ingroup MatrixFunctions_Module
+ * \brief Compute matrix square root of quasi-triangular matrix.
+ *
+ * \tparam MatrixType type of \p arg, the argument of matrix square root,
+ * expected to be an instantiation of the Matrix class template.
+ * \tparam ResultType type of \p result, where result is to be stored.
+ * \param[in] arg argument of matrix square root.
+ * \param[out] result matrix square root of upper Hessenberg part of \p arg.
+ *
+ * This function computes the square root of the upper quasi-triangular matrix stored in the upper
+ * Hessenberg part of \p arg. Only the upper Hessenberg part of \p result is updated, the rest is
+ * not touched. See MatrixBase::sqrt() for details on how this computation is implemented.
+ *
+ * \sa MatrixSquareRoot, MatrixSquareRootQuasiTriangular
+ */
+template <typename MatrixType, typename ResultType>
+void matrix_sqrt_quasi_triangular(const MatrixType &arg, ResultType &result)
+{
+ eigen_assert(arg.rows() == arg.cols());
+ result.resize(arg.rows(), arg.cols());
+ internal::matrix_sqrt_quasi_triangular_diagonal(arg, result);
+ internal::matrix_sqrt_quasi_triangular_off_diagonal(arg, result);
+}
+
+
+/** \ingroup MatrixFunctions_Module
+ * \brief Compute matrix square root of triangular matrix.
+ *
+ * \tparam MatrixType type of \p arg, the argument of matrix square root,
+ * expected to be an instantiation of the Matrix class template.
+ * \tparam ResultType type of \p result, where result is to be stored.
+ * \param[in] arg argument of matrix square root.
+ * \param[out] result matrix square root of upper triangular part of \p arg.
+ *
+ * Only the upper triangular part (including the diagonal) of \p result is updated, the rest is not
+ * touched. See MatrixBase::sqrt() for details on how this computation is implemented.
+ *
+ * \sa MatrixSquareRoot, MatrixSquareRootQuasiTriangular
+ */
+template <typename MatrixType, typename ResultType>
+void matrix_sqrt_triangular(const MatrixType &arg, ResultType &result)
+{
+ using std::sqrt;
+ typedef typename MatrixType::Scalar Scalar;
+
+ eigen_assert(arg.rows() == arg.cols());
+
+ // Compute square root of arg and store it in upper triangular part of result
+ // This uses that the square root of triangular matrices can be computed directly.
+ result.resize(arg.rows(), arg.cols());
+ for (Index i = 0; i < arg.rows(); i++) {
+ result.coeffRef(i,i) = sqrt(arg.coeff(i,i));
+ }
+ for (Index j = 1; j < arg.cols(); j++) {
+ for (Index i = j-1; i >= 0; i--) {
+ // if i = j-1, then segment has length 0 so tmp = 0
+ Scalar tmp = (result.row(i).segment(i+1,j-i-1) * result.col(j).segment(i+1,j-i-1)).value();
+ // denominator may be zero if original matrix is singular
+ result.coeffRef(i,j) = (arg.coeff(i,j) - tmp) / (result.coeff(i,i) + result.coeff(j,j));
+ }
+ }
+}
+
+
+namespace internal {
+
+/** \ingroup MatrixFunctions_Module
+ * \brief Helper struct for computing matrix square roots of general matrices.
+ * \tparam MatrixType type of the argument of the matrix square root,
+ * expected to be an instantiation of the Matrix class template.
+ *
+ * \sa MatrixSquareRootTriangular, MatrixSquareRootQuasiTriangular, MatrixBase::sqrt()
+ */
+template <typename MatrixType, int IsComplex = NumTraits<typename internal::traits<MatrixType>::Scalar>::IsComplex>
+struct matrix_sqrt_compute
+{
+ /** \brief Compute the matrix square root
+ *
+ * \param[in] arg matrix whose square root is to be computed.
+ * \param[out] result square root of \p arg.
+ *
+ * See MatrixBase::sqrt() for details on how this computation is implemented.
+ */
+ template <typename ResultType> static void run(const MatrixType &arg, ResultType &result);
+};
+
+
+// ********** Partial specialization for real matrices **********
+
+template <typename MatrixType>
+struct matrix_sqrt_compute<MatrixType, 0>
+{
+ typedef typename MatrixType::PlainObject PlainType;
+ template <typename ResultType>
+ static void run(const MatrixType &arg, ResultType &result)
+ {
+ eigen_assert(arg.rows() == arg.cols());
+
+ // Compute Schur decomposition of arg
+ const RealSchur<PlainType> schurOfA(arg);
+ const PlainType& T = schurOfA.matrixT();
+ const PlainType& U = schurOfA.matrixU();
+
+ // Compute square root of T
+ PlainType sqrtT = PlainType::Zero(arg.rows(), arg.cols());
+ matrix_sqrt_quasi_triangular(T, sqrtT);
+
+ // Compute square root of arg
+ result = U * sqrtT * U.adjoint();
+ }
+};
+
+
+// ********** Partial specialization for complex matrices **********
+
+template <typename MatrixType>
+struct matrix_sqrt_compute<MatrixType, 1>
+{
+ typedef typename MatrixType::PlainObject PlainType;
+ template <typename ResultType>
+ static void run(const MatrixType &arg, ResultType &result)
+ {
+ eigen_assert(arg.rows() == arg.cols());
+
+ // Compute Schur decomposition of arg
+ const ComplexSchur<PlainType> schurOfA(arg);
+ const PlainType& T = schurOfA.matrixT();
+ const PlainType& U = schurOfA.matrixU();
+
+ // Compute square root of T
+ PlainType sqrtT;
+ matrix_sqrt_triangular(T, sqrtT);
+
+ // Compute square root of arg
+ result = U * (sqrtT.template triangularView<Upper>() * U.adjoint());
+ }
+};
+
+} // end namespace internal
+
+/** \ingroup MatrixFunctions_Module
+ *
+ * \brief Proxy for the matrix square root of some matrix (expression).
+ *
+ * \tparam Derived Type of the argument to the matrix square root.
+ *
+ * This class holds the argument to the matrix square root until it
+ * is assigned or evaluated for some other reason (so the argument
+ * should not be changed in the meantime). It is the return type of
+ * MatrixBase::sqrt() and most of the time this is the only way it is
+ * used.
+ */
+template<typename Derived> class MatrixSquareRootReturnValue
+: public ReturnByValue<MatrixSquareRootReturnValue<Derived> >
+{
+ protected:
+ typedef typename internal::ref_selector<Derived>::type DerivedNested;
+
+ public:
+ /** \brief Constructor.
+ *
+ * \param[in] src %Matrix (expression) forming the argument of the
+ * matrix square root.
+ */
+ explicit MatrixSquareRootReturnValue(const Derived& src) : m_src(src) { }
+
+ /** \brief Compute the matrix square root.
+ *
+ * \param[out] result the matrix square root of \p src in the
+ * constructor.
+ */
+ template <typename ResultType>
+ inline void evalTo(ResultType& result) const
+ {
+ typedef typename internal::nested_eval<Derived, 10>::type DerivedEvalType;
+ typedef typename internal::remove_all<DerivedEvalType>::type DerivedEvalTypeClean;
+ DerivedEvalType tmp(m_src);
+ internal::matrix_sqrt_compute<DerivedEvalTypeClean>::run(tmp, result);
+ }
+
+ Index rows() const { return m_src.rows(); }
+ Index cols() const { return m_src.cols(); }
+
+ protected:
+ const DerivedNested m_src;
+};
+
+namespace internal {
+template<typename Derived>
+struct traits<MatrixSquareRootReturnValue<Derived> >
+{
+ typedef typename Derived::PlainObject ReturnType;
+};
+}
+
+template <typename Derived>
+const MatrixSquareRootReturnValue<Derived> MatrixBase<Derived>::sqrt() const
+{
+ eigen_assert(rows() == cols());
+ return MatrixSquareRootReturnValue<Derived>(derived());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_MATRIX_FUNCTION
diff --git a/src/EigenUnsupported/src/MatrixFunctions/StemFunction.h b/src/EigenUnsupported/src/MatrixFunctions/StemFunction.h
new file mode 100644
index 0000000..7604df9
--- /dev/null
+++ b/src/EigenUnsupported/src/MatrixFunctions/StemFunction.h
@@ -0,0 +1,117 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010, 2013 Jitse Niesen <jitse@maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_STEM_FUNCTION
+#define EIGEN_STEM_FUNCTION
+
+namespace Eigen {
+
+namespace internal {
+
+/** \brief The exponential function (and its derivatives). */
+template <typename Scalar>
+Scalar stem_function_exp(Scalar x, int)
+{
+ using std::exp;
+ return exp(x);
+}
+
+/** \brief Cosine (and its derivatives). */
+template <typename Scalar>
+Scalar stem_function_cos(Scalar x, int n)
+{
+ using std::cos;
+ using std::sin;
+ Scalar res;
+
+ switch (n % 4) {
+ case 0:
+ res = std::cos(x);
+ break;
+ case 1:
+ res = -std::sin(x);
+ break;
+ case 2:
+ res = -std::cos(x);
+ break;
+ case 3:
+ res = std::sin(x);
+ break;
+ }
+ return res;
+}
+
+/** \brief Sine (and its derivatives). */
+template <typename Scalar>
+Scalar stem_function_sin(Scalar x, int n)
+{
+ using std::cos;
+ using std::sin;
+ Scalar res;
+
+ switch (n % 4) {
+ case 0:
+ res = std::sin(x);
+ break;
+ case 1:
+ res = std::cos(x);
+ break;
+ case 2:
+ res = -std::sin(x);
+ break;
+ case 3:
+ res = -std::cos(x);
+ break;
+ }
+ return res;
+}
+
+/** \brief Hyperbolic cosine (and its derivatives). */
+template <typename Scalar>
+Scalar stem_function_cosh(Scalar x, int n)
+{
+ using std::cosh;
+ using std::sinh;
+ Scalar res;
+
+ switch (n % 2) {
+ case 0:
+ res = std::cosh(x);
+ break;
+ case 1:
+ res = std::sinh(x);
+ break;
+ }
+ return res;
+}
+
+/** \brief Hyperbolic sine (and its derivatives). */
+template <typename Scalar>
+Scalar stem_function_sinh(Scalar x, int n)
+{
+ using std::cosh;
+ using std::sinh;
+ Scalar res;
+
+ switch (n % 2) {
+ case 0:
+ res = std::sinh(x);
+ break;
+ case 1:
+ res = std::cosh(x);
+ break;
+ }
+ return res;
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_STEM_FUNCTION
diff --git a/src/EigenUnsupported/src/MoreVectorization/MathFunctions.h b/src/EigenUnsupported/src/MoreVectorization/MathFunctions.h
new file mode 100644
index 0000000..63cb28d
--- /dev/null
+++ b/src/EigenUnsupported/src/MoreVectorization/MathFunctions.h
@@ -0,0 +1,95 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Rohit Garg <rpg.314@gmail.com>
+// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MOREVECTORIZATION_MATHFUNCTIONS_H
+#define EIGEN_MOREVECTORIZATION_MATHFUNCTIONS_H
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal \returns the arcsin of \a a (coeff-wise) */
+template<typename Packet> inline static Packet pasin(Packet a) { return std::asin(a); }
+
+#ifdef EIGEN_VECTORIZE_SSE
+
+template<> EIGEN_DONT_INLINE Packet4f pasin(Packet4f x)
+{
+ _EIGEN_DECLARE_CONST_Packet4f(half, 0.5);
+ _EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5);
+ _EIGEN_DECLARE_CONST_Packet4f(3half, 1.5);
+
+ _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000);
+
+ _EIGEN_DECLARE_CONST_Packet4f(pi, 3.141592654);
+ _EIGEN_DECLARE_CONST_Packet4f(pi_over_2, 3.141592654*0.5);
+
+ _EIGEN_DECLARE_CONST_Packet4f(asin1, 4.2163199048E-2);
+ _EIGEN_DECLARE_CONST_Packet4f(asin2, 2.4181311049E-2);
+ _EIGEN_DECLARE_CONST_Packet4f(asin3, 4.5470025998E-2);
+ _EIGEN_DECLARE_CONST_Packet4f(asin4, 7.4953002686E-2);
+ _EIGEN_DECLARE_CONST_Packet4f(asin5, 1.6666752422E-1);
+
+ Packet4f a = pabs(x);//got the absolute value
+
+ Packet4f sign_bit= _mm_and_ps(x, p4f_sign_mask);//extracted the sign bit
+
+ Packet4f z1,z2;//will need them during computation
+
+
+//will compute the two branches for asin
+//so first compare with half
+
+ Packet4f branch_mask= _mm_cmpgt_ps(a, p4f_half);//this is to select which branch to take
+//both will be taken, and finally results will be merged
+//the branch for values >0.5
+
+ {
+//the core series expansion
+ z1=pmadd(p4f_minus_half,a,p4f_half);
+ Packet4f x1=psqrt(z1);
+ Packet4f s1=pmadd(p4f_asin1, z1, p4f_asin2);
+ Packet4f s2=pmadd(s1, z1, p4f_asin3);
+ Packet4f s3=pmadd(s2,z1, p4f_asin4);
+ Packet4f s4=pmadd(s3,z1, p4f_asin5);
+ Packet4f temp=pmul(s4,z1);//not really a madd but a mul by z so that the next term can be a madd
+ z1=pmadd(temp,x1,x1);
+ z1=padd(z1,z1);
+ z1=psub(p4f_pi_over_2,z1);
+ }
+
+ {
+//the core series expansion
+ Packet4f x2=a;
+ z2=pmul(x2,x2);
+ Packet4f s1=pmadd(p4f_asin1, z2, p4f_asin2);
+ Packet4f s2=pmadd(s1, z2, p4f_asin3);
+ Packet4f s3=pmadd(s2,z2, p4f_asin4);
+ Packet4f s4=pmadd(s3,z2, p4f_asin5);
+ Packet4f temp=pmul(s4,z2);//not really a madd but a mul by z so that the next term can be a madd
+ z2=pmadd(temp,x2,x2);
+ }
+
+/* select the correct result from the two branch evaluations */
+ z1 = _mm_and_ps(branch_mask, z1);
+ z2 = _mm_andnot_ps(branch_mask, z2);
+ Packet4f z = _mm_or_ps(z1,z2);
+
+/* update the sign */
+ return _mm_xor_ps(z, sign_bit);
+}
+
+#endif // EIGEN_VECTORIZE_SSE
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_MOREVECTORIZATION_MATHFUNCTIONS_H
diff --git a/src/EigenUnsupported/src/NonLinearOptimization/HybridNonLinearSolver.h b/src/EigenUnsupported/src/NonLinearOptimization/HybridNonLinearSolver.h
new file mode 100644
index 0000000..07c5ef0
--- /dev/null
+++ b/src/EigenUnsupported/src/NonLinearOptimization/HybridNonLinearSolver.h
@@ -0,0 +1,601 @@
+// -*- coding: utf-8
+// vim: set fileencoding=utf-8
+
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Thomas Capricelli <orzel@freehackers.org>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_HYBRIDNONLINEARSOLVER_H
+#define EIGEN_HYBRIDNONLINEARSOLVER_H
+
+namespace Eigen {
+
+namespace HybridNonLinearSolverSpace {
+ enum Status {
+ Running = -1,
+ ImproperInputParameters = 0,
+ RelativeErrorTooSmall = 1,
+ TooManyFunctionEvaluation = 2,
+ TolTooSmall = 3,
+ NotMakingProgressJacobian = 4,
+ NotMakingProgressIterations = 5,
+ UserAsked = 6
+ };
+}
+
+/**
+ * \ingroup NonLinearOptimization_Module
+ * \brief Finds a zero of a system of n
+ * nonlinear functions in n variables by a modification of the Powell
+ * hybrid method ("dogleg").
+ *
+ * The user must provide a subroutine which calculates the
+ * functions. The Jacobian is either provided by the user, or approximated
+ * using a forward-difference method.
+ *
+ */
+template<typename FunctorType, typename Scalar=double>
+class HybridNonLinearSolver
+{
+public:
+ typedef DenseIndex Index;
+
+ HybridNonLinearSolver(FunctorType &_functor)
+ : functor(_functor) { nfev=njev=iter = 0; fnorm= 0.; useExternalScaling=false;}
+
+ struct Parameters {
+ Parameters()
+ : factor(Scalar(100.))
+ , maxfev(1000)
+ , xtol(numext::sqrt(NumTraits<Scalar>::epsilon()))
+ , nb_of_subdiagonals(-1)
+ , nb_of_superdiagonals(-1)
+ , epsfcn(Scalar(0.)) {}
+ Scalar factor;
+ Index maxfev; // maximum number of function evaluation
+ Scalar xtol;
+ Index nb_of_subdiagonals;
+ Index nb_of_superdiagonals;
+ Scalar epsfcn;
+ };
+ typedef Matrix< Scalar, Dynamic, 1 > FVectorType;
+ typedef Matrix< Scalar, Dynamic, Dynamic > JacobianType;
+ /* TODO: if eigen provides a triangular storage, use it here */
+ typedef Matrix< Scalar, Dynamic, Dynamic > UpperTriangularType;
+
+ HybridNonLinearSolverSpace::Status hybrj1(
+ FVectorType &x,
+ const Scalar tol = numext::sqrt(NumTraits<Scalar>::epsilon())
+ );
+
+ HybridNonLinearSolverSpace::Status solveInit(FVectorType &x);
+ HybridNonLinearSolverSpace::Status solveOneStep(FVectorType &x);
+ HybridNonLinearSolverSpace::Status solve(FVectorType &x);
+
+ HybridNonLinearSolverSpace::Status hybrd1(
+ FVectorType &x,
+ const Scalar tol = numext::sqrt(NumTraits<Scalar>::epsilon())
+ );
+
+ HybridNonLinearSolverSpace::Status solveNumericalDiffInit(FVectorType &x);
+ HybridNonLinearSolverSpace::Status solveNumericalDiffOneStep(FVectorType &x);
+ HybridNonLinearSolverSpace::Status solveNumericalDiff(FVectorType &x);
+
+ void resetParameters(void) { parameters = Parameters(); }
+ Parameters parameters;
+ FVectorType fvec, qtf, diag;
+ JacobianType fjac;
+ UpperTriangularType R;
+ Index nfev;
+ Index njev;
+ Index iter;
+ Scalar fnorm;
+ bool useExternalScaling;
+private:
+ FunctorType &functor;
+ Index n;
+ Scalar sum;
+ bool sing;
+ Scalar temp;
+ Scalar delta;
+ bool jeval;
+ Index ncsuc;
+ Scalar ratio;
+ Scalar pnorm, xnorm, fnorm1;
+ Index nslow1, nslow2;
+ Index ncfail;
+ Scalar actred, prered;
+ FVectorType wa1, wa2, wa3, wa4;
+
+ HybridNonLinearSolver& operator=(const HybridNonLinearSolver&);
+};
+
+
+
+template<typename FunctorType, typename Scalar>
+HybridNonLinearSolverSpace::Status
+HybridNonLinearSolver<FunctorType,Scalar>::hybrj1(
+ FVectorType &x,
+ const Scalar tol
+ )
+{
+ n = x.size();
+
+ /* check the input parameters for errors. */
+ if (n <= 0 || tol < 0.)
+ return HybridNonLinearSolverSpace::ImproperInputParameters;
+
+ resetParameters();
+ parameters.maxfev = 100*(n+1);
+ parameters.xtol = tol;
+ diag.setConstant(n, 1.);
+ useExternalScaling = true;
+ return solve(x);
+}
+
+template<typename FunctorType, typename Scalar>
+HybridNonLinearSolverSpace::Status
+HybridNonLinearSolver<FunctorType,Scalar>::solveInit(FVectorType &x)
+{
+ n = x.size();
+
+ wa1.resize(n); wa2.resize(n); wa3.resize(n); wa4.resize(n);
+ fvec.resize(n);
+ qtf.resize(n);
+ fjac.resize(n, n);
+ if (!useExternalScaling)
+ diag.resize(n);
+ eigen_assert( (!useExternalScaling || diag.size()==n) && "When useExternalScaling is set, the caller must provide a valid 'diag'");
+
+ /* Function Body */
+ nfev = 0;
+ njev = 0;
+
+ /* check the input parameters for errors. */
+ if (n <= 0 || parameters.xtol < 0. || parameters.maxfev <= 0 || parameters.factor <= 0. )
+ return HybridNonLinearSolverSpace::ImproperInputParameters;
+ if (useExternalScaling)
+ for (Index j = 0; j < n; ++j)
+ if (diag[j] <= 0.)
+ return HybridNonLinearSolverSpace::ImproperInputParameters;
+
+ /* evaluate the function at the starting point */
+ /* and calculate its norm. */
+ nfev = 1;
+ if ( functor(x, fvec) < 0)
+ return HybridNonLinearSolverSpace::UserAsked;
+ fnorm = fvec.stableNorm();
+
+ /* initialize iteration counter and monitors. */
+ iter = 1;
+ ncsuc = 0;
+ ncfail = 0;
+ nslow1 = 0;
+ nslow2 = 0;
+
+ return HybridNonLinearSolverSpace::Running;
+}
+
+template<typename FunctorType, typename Scalar>
+HybridNonLinearSolverSpace::Status
+HybridNonLinearSolver<FunctorType,Scalar>::solveOneStep(FVectorType &x)
+{
+ using std::abs;
+
+ eigen_assert(x.size()==n); // check the caller is not cheating us
+
+ Index j;
+ std::vector<JacobiRotation<Scalar> > v_givens(n), w_givens(n);
+
+ jeval = true;
+
+ /* calculate the jacobian matrix. */
+ if ( functor.df(x, fjac) < 0)
+ return HybridNonLinearSolverSpace::UserAsked;
+ ++njev;
+
+ wa2 = fjac.colwise().blueNorm();
+
+ /* on the first iteration and if external scaling is not used, scale according */
+ /* to the norms of the columns of the initial jacobian. */
+ if (iter == 1) {
+ if (!useExternalScaling)
+ for (j = 0; j < n; ++j)
+ diag[j] = (wa2[j]==0.) ? 1. : wa2[j];
+
+ /* on the first iteration, calculate the norm of the scaled x */
+ /* and initialize the step bound delta. */
+ xnorm = diag.cwiseProduct(x).stableNorm();
+ delta = parameters.factor * xnorm;
+ if (delta == 0.)
+ delta = parameters.factor;
+ }
+
+ /* compute the qr factorization of the jacobian. */
+ HouseholderQR<JacobianType> qrfac(fjac); // no pivoting:
+
+ /* copy the triangular factor of the qr factorization into r. */
+ R = qrfac.matrixQR();
+
+ /* accumulate the orthogonal factor in fjac. */
+ fjac = qrfac.householderQ();
+
+ /* form (q transpose)*fvec and store in qtf. */
+ qtf = fjac.transpose() * fvec;
+
+ /* rescale if necessary. */
+ if (!useExternalScaling)
+ diag = diag.cwiseMax(wa2);
+
+ while (true) {
+ /* determine the direction p. */
+ internal::dogleg<Scalar>(R, diag, qtf, delta, wa1);
+
+ /* store the direction p and x + p. calculate the norm of p. */
+ wa1 = -wa1;
+ wa2 = x + wa1;
+ pnorm = diag.cwiseProduct(wa1).stableNorm();
+
+ /* on the first iteration, adjust the initial step bound. */
+ if (iter == 1)
+ delta = (std::min)(delta,pnorm);
+
+ /* evaluate the function at x + p and calculate its norm. */
+ if ( functor(wa2, wa4) < 0)
+ return HybridNonLinearSolverSpace::UserAsked;
+ ++nfev;
+ fnorm1 = wa4.stableNorm();
+
+ /* compute the scaled actual reduction. */
+ actred = -1.;
+ if (fnorm1 < fnorm) /* Computing 2nd power */
+ actred = 1. - numext::abs2(fnorm1 / fnorm);
+
+ /* compute the scaled predicted reduction. */
+ wa3 = R.template triangularView<Upper>()*wa1 + qtf;
+ temp = wa3.stableNorm();
+ prered = 0.;
+ if (temp < fnorm) /* Computing 2nd power */
+ prered = 1. - numext::abs2(temp / fnorm);
+
+ /* compute the ratio of the actual to the predicted reduction. */
+ ratio = 0.;
+ if (prered > 0.)
+ ratio = actred / prered;
+
+ /* update the step bound. */
+ if (ratio < Scalar(.1)) {
+ ncsuc = 0;
+ ++ncfail;
+ delta = Scalar(.5) * delta;
+ } else {
+ ncfail = 0;
+ ++ncsuc;
+ if (ratio >= Scalar(.5) || ncsuc > 1)
+ delta = (std::max)(delta, pnorm / Scalar(.5));
+ if (abs(ratio - 1.) <= Scalar(.1)) {
+ delta = pnorm / Scalar(.5);
+ }
+ }
+
+ /* test for successful iteration. */
+ if (ratio >= Scalar(1e-4)) {
+ /* successful iteration. update x, fvec, and their norms. */
+ x = wa2;
+ wa2 = diag.cwiseProduct(x);
+ fvec = wa4;
+ xnorm = wa2.stableNorm();
+ fnorm = fnorm1;
+ ++iter;
+ }
+
+ /* determine the progress of the iteration. */
+ ++nslow1;
+ if (actred >= Scalar(.001))
+ nslow1 = 0;
+ if (jeval)
+ ++nslow2;
+ if (actred >= Scalar(.1))
+ nslow2 = 0;
+
+ /* test for convergence. */
+ if (delta <= parameters.xtol * xnorm || fnorm == 0.)
+ return HybridNonLinearSolverSpace::RelativeErrorTooSmall;
+
+ /* tests for termination and stringent tolerances. */
+ if (nfev >= parameters.maxfev)
+ return HybridNonLinearSolverSpace::TooManyFunctionEvaluation;
+ if (Scalar(.1) * (std::max)(Scalar(.1) * delta, pnorm) <= NumTraits<Scalar>::epsilon() * xnorm)
+ return HybridNonLinearSolverSpace::TolTooSmall;
+ if (nslow2 == 5)
+ return HybridNonLinearSolverSpace::NotMakingProgressJacobian;
+ if (nslow1 == 10)
+ return HybridNonLinearSolverSpace::NotMakingProgressIterations;
+
+ /* criterion for recalculating jacobian. */
+ if (ncfail == 2)
+ break; // leave inner loop and go for the next outer loop iteration
+
+ /* calculate the rank one modification to the jacobian */
+ /* and update qtf if necessary. */
+ wa1 = diag.cwiseProduct( diag.cwiseProduct(wa1)/pnorm );
+ wa2 = fjac.transpose() * wa4;
+ if (ratio >= Scalar(1e-4))
+ qtf = wa2;
+ wa2 = (wa2-wa3)/pnorm;
+
+ /* compute the qr factorization of the updated jacobian. */
+ internal::r1updt<Scalar>(R, wa1, v_givens, w_givens, wa2, wa3, &sing);
+ internal::r1mpyq<Scalar>(n, n, fjac.data(), v_givens, w_givens);
+ internal::r1mpyq<Scalar>(1, n, qtf.data(), v_givens, w_givens);
+
+ jeval = false;
+ }
+ return HybridNonLinearSolverSpace::Running;
+}
+
+template<typename FunctorType, typename Scalar>
+HybridNonLinearSolverSpace::Status
+HybridNonLinearSolver<FunctorType,Scalar>::solve(FVectorType &x)
+{
+ HybridNonLinearSolverSpace::Status status = solveInit(x);
+ if (status==HybridNonLinearSolverSpace::ImproperInputParameters)
+ return status;
+ while (status==HybridNonLinearSolverSpace::Running)
+ status = solveOneStep(x);
+ return status;
+}
+
+
+
+template<typename FunctorType, typename Scalar>
+HybridNonLinearSolverSpace::Status
+HybridNonLinearSolver<FunctorType,Scalar>::hybrd1(
+ FVectorType &x,
+ const Scalar tol
+ )
+{
+ n = x.size();
+
+ /* check the input parameters for errors. */
+ if (n <= 0 || tol < 0.)
+ return HybridNonLinearSolverSpace::ImproperInputParameters;
+
+ resetParameters();
+ parameters.maxfev = 200*(n+1);
+ parameters.xtol = tol;
+
+ diag.setConstant(n, 1.);
+ useExternalScaling = true;
+ return solveNumericalDiff(x);
+}
+
+template<typename FunctorType, typename Scalar>
+HybridNonLinearSolverSpace::Status
+HybridNonLinearSolver<FunctorType,Scalar>::solveNumericalDiffInit(FVectorType &x)
+{
+ n = x.size();
+
+ if (parameters.nb_of_subdiagonals<0) parameters.nb_of_subdiagonals= n-1;
+ if (parameters.nb_of_superdiagonals<0) parameters.nb_of_superdiagonals= n-1;
+
+ wa1.resize(n); wa2.resize(n); wa3.resize(n); wa4.resize(n);
+ qtf.resize(n);
+ fjac.resize(n, n);
+ fvec.resize(n);
+ if (!useExternalScaling)
+ diag.resize(n);
+ eigen_assert( (!useExternalScaling || diag.size()==n) && "When useExternalScaling is set, the caller must provide a valid 'diag'");
+
+ /* Function Body */
+ nfev = 0;
+ njev = 0;
+
+ /* check the input parameters for errors. */
+ if (n <= 0 || parameters.xtol < 0. || parameters.maxfev <= 0 || parameters.nb_of_subdiagonals< 0 || parameters.nb_of_superdiagonals< 0 || parameters.factor <= 0. )
+ return HybridNonLinearSolverSpace::ImproperInputParameters;
+ if (useExternalScaling)
+ for (Index j = 0; j < n; ++j)
+ if (diag[j] <= 0.)
+ return HybridNonLinearSolverSpace::ImproperInputParameters;
+
+ /* evaluate the function at the starting point */
+ /* and calculate its norm. */
+ nfev = 1;
+ if ( functor(x, fvec) < 0)
+ return HybridNonLinearSolverSpace::UserAsked;
+ fnorm = fvec.stableNorm();
+
+ /* initialize iteration counter and monitors. */
+ iter = 1;
+ ncsuc = 0;
+ ncfail = 0;
+ nslow1 = 0;
+ nslow2 = 0;
+
+ return HybridNonLinearSolverSpace::Running;
+}
+
+template<typename FunctorType, typename Scalar>
+HybridNonLinearSolverSpace::Status
+HybridNonLinearSolver<FunctorType,Scalar>::solveNumericalDiffOneStep(FVectorType &x)
+{
+ using std::sqrt;
+ using std::abs;
+
+ assert(x.size()==n); // check the caller is not cheating us
+
+ Index j;
+ std::vector<JacobiRotation<Scalar> > v_givens(n), w_givens(n);
+
+ jeval = true;
+ if (parameters.nb_of_subdiagonals<0) parameters.nb_of_subdiagonals= n-1;
+ if (parameters.nb_of_superdiagonals<0) parameters.nb_of_superdiagonals= n-1;
+
+ /* calculate the jacobian matrix. */
+ if (internal::fdjac1(functor, x, fvec, fjac, parameters.nb_of_subdiagonals, parameters.nb_of_superdiagonals, parameters.epsfcn) <0)
+ return HybridNonLinearSolverSpace::UserAsked;
+ nfev += (std::min)(parameters.nb_of_subdiagonals+parameters.nb_of_superdiagonals+ 1, n);
+
+ wa2 = fjac.colwise().blueNorm();
+
+ /* on the first iteration and if external scaling is not used, scale according */
+ /* to the norms of the columns of the initial jacobian. */
+ if (iter == 1) {
+ if (!useExternalScaling)
+ for (j = 0; j < n; ++j)
+ diag[j] = (wa2[j]==0.) ? 1. : wa2[j];
+
+ /* on the first iteration, calculate the norm of the scaled x */
+ /* and initialize the step bound delta. */
+ xnorm = diag.cwiseProduct(x).stableNorm();
+ delta = parameters.factor * xnorm;
+ if (delta == 0.)
+ delta = parameters.factor;
+ }
+
+ /* compute the qr factorization of the jacobian. */
+ HouseholderQR<JacobianType> qrfac(fjac); // no pivoting:
+
+ /* copy the triangular factor of the qr factorization into r. */
+ R = qrfac.matrixQR();
+
+ /* accumulate the orthogonal factor in fjac. */
+ fjac = qrfac.householderQ();
+
+ /* form (q transpose)*fvec and store in qtf. */
+ qtf = fjac.transpose() * fvec;
+
+ /* rescale if necessary. */
+ if (!useExternalScaling)
+ diag = diag.cwiseMax(wa2);
+
+ while (true) {
+ /* determine the direction p. */
+ internal::dogleg<Scalar>(R, diag, qtf, delta, wa1);
+
+ /* store the direction p and x + p. calculate the norm of p. */
+ wa1 = -wa1;
+ wa2 = x + wa1;
+ pnorm = diag.cwiseProduct(wa1).stableNorm();
+
+ /* on the first iteration, adjust the initial step bound. */
+ if (iter == 1)
+ delta = (std::min)(delta,pnorm);
+
+ /* evaluate the function at x + p and calculate its norm. */
+ if ( functor(wa2, wa4) < 0)
+ return HybridNonLinearSolverSpace::UserAsked;
+ ++nfev;
+ fnorm1 = wa4.stableNorm();
+
+ /* compute the scaled actual reduction. */
+ actred = -1.;
+ if (fnorm1 < fnorm) /* Computing 2nd power */
+ actred = 1. - numext::abs2(fnorm1 / fnorm);
+
+ /* compute the scaled predicted reduction. */
+ wa3 = R.template triangularView<Upper>()*wa1 + qtf;
+ temp = wa3.stableNorm();
+ prered = 0.;
+ if (temp < fnorm) /* Computing 2nd power */
+ prered = 1. - numext::abs2(temp / fnorm);
+
+ /* compute the ratio of the actual to the predicted reduction. */
+ ratio = 0.;
+ if (prered > 0.)
+ ratio = actred / prered;
+
+ /* update the step bound. */
+ if (ratio < Scalar(.1)) {
+ ncsuc = 0;
+ ++ncfail;
+ delta = Scalar(.5) * delta;
+ } else {
+ ncfail = 0;
+ ++ncsuc;
+ if (ratio >= Scalar(.5) || ncsuc > 1)
+ delta = (std::max)(delta, pnorm / Scalar(.5));
+ if (abs(ratio - 1.) <= Scalar(.1)) {
+ delta = pnorm / Scalar(.5);
+ }
+ }
+
+ /* test for successful iteration. */
+ if (ratio >= Scalar(1e-4)) {
+ /* successful iteration. update x, fvec, and their norms. */
+ x = wa2;
+ wa2 = diag.cwiseProduct(x);
+ fvec = wa4;
+ xnorm = wa2.stableNorm();
+ fnorm = fnorm1;
+ ++iter;
+ }
+
+ /* determine the progress of the iteration. */
+ ++nslow1;
+ if (actred >= Scalar(.001))
+ nslow1 = 0;
+ if (jeval)
+ ++nslow2;
+ if (actred >= Scalar(.1))
+ nslow2 = 0;
+
+ /* test for convergence. */
+ if (delta <= parameters.xtol * xnorm || fnorm == 0.)
+ return HybridNonLinearSolverSpace::RelativeErrorTooSmall;
+
+ /* tests for termination and stringent tolerances. */
+ if (nfev >= parameters.maxfev)
+ return HybridNonLinearSolverSpace::TooManyFunctionEvaluation;
+ if (Scalar(.1) * (std::max)(Scalar(.1) * delta, pnorm) <= NumTraits<Scalar>::epsilon() * xnorm)
+ return HybridNonLinearSolverSpace::TolTooSmall;
+ if (nslow2 == 5)
+ return HybridNonLinearSolverSpace::NotMakingProgressJacobian;
+ if (nslow1 == 10)
+ return HybridNonLinearSolverSpace::NotMakingProgressIterations;
+
+ /* criterion for recalculating jacobian. */
+ if (ncfail == 2)
+ break; // leave inner loop and go for the next outer loop iteration
+
+ /* calculate the rank one modification to the jacobian */
+ /* and update qtf if necessary. */
+ wa1 = diag.cwiseProduct( diag.cwiseProduct(wa1)/pnorm );
+ wa2 = fjac.transpose() * wa4;
+ if (ratio >= Scalar(1e-4))
+ qtf = wa2;
+ wa2 = (wa2-wa3)/pnorm;
+
+ /* compute the qr factorization of the updated jacobian. */
+ internal::r1updt<Scalar>(R, wa1, v_givens, w_givens, wa2, wa3, &sing);
+ internal::r1mpyq<Scalar>(n, n, fjac.data(), v_givens, w_givens);
+ internal::r1mpyq<Scalar>(1, n, qtf.data(), v_givens, w_givens);
+
+ jeval = false;
+ }
+ return HybridNonLinearSolverSpace::Running;
+}
+
+template<typename FunctorType, typename Scalar>
+HybridNonLinearSolverSpace::Status
+HybridNonLinearSolver<FunctorType,Scalar>::solveNumericalDiff(FVectorType &x)
+{
+ HybridNonLinearSolverSpace::Status status = solveNumericalDiffInit(x);
+ if (status==HybridNonLinearSolverSpace::ImproperInputParameters)
+ return status;
+ while (status==HybridNonLinearSolverSpace::Running)
+ status = solveNumericalDiffOneStep(x);
+ return status;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_HYBRIDNONLINEARSOLVER_H
+
+//vim: ai ts=4 sts=4 et sw=4
diff --git a/src/EigenUnsupported/src/NonLinearOptimization/LevenbergMarquardt.h b/src/EigenUnsupported/src/NonLinearOptimization/LevenbergMarquardt.h
new file mode 100644
index 0000000..fe3b79c
--- /dev/null
+++ b/src/EigenUnsupported/src/NonLinearOptimization/LevenbergMarquardt.h
@@ -0,0 +1,657 @@
+// -*- coding: utf-8
+// vim: set fileencoding=utf-8
+
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Thomas Capricelli <orzel@freehackers.org>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_LEVENBERGMARQUARDT__H
+#define EIGEN_LEVENBERGMARQUARDT__H
+
+namespace Eigen {
+
+namespace LevenbergMarquardtSpace {
+ enum Status {
+ NotStarted = -2,
+ Running = -1,
+ ImproperInputParameters = 0,
+ RelativeReductionTooSmall = 1,
+ RelativeErrorTooSmall = 2,
+ RelativeErrorAndReductionTooSmall = 3,
+ CosinusTooSmall = 4,
+ TooManyFunctionEvaluation = 5,
+ FtolTooSmall = 6,
+ XtolTooSmall = 7,
+ GtolTooSmall = 8,
+ UserAsked = 9
+ };
+}
+
+
+
+/**
+ * \ingroup NonLinearOptimization_Module
+ * \brief Performs non linear optimization over a non-linear function,
+ * using a variant of the Levenberg Marquardt algorithm.
+ *
+ * Check wikipedia for more information.
+ * http://en.wikipedia.org/wiki/Levenberg%E2%80%93Marquardt_algorithm
+ */
+template<typename FunctorType, typename Scalar=double>
+class LevenbergMarquardt
+{
+ static Scalar sqrt_epsilon()
+ {
+ using std::sqrt;
+ return sqrt(NumTraits<Scalar>::epsilon());
+ }
+
+public:
+ LevenbergMarquardt(FunctorType &_functor)
+ : functor(_functor) { nfev = njev = iter = 0; fnorm = gnorm = 0.; useExternalScaling=false; }
+
+ typedef DenseIndex Index;
+
+ struct Parameters {
+ Parameters()
+ : factor(Scalar(100.))
+ , maxfev(400)
+ , ftol(sqrt_epsilon())
+ , xtol(sqrt_epsilon())
+ , gtol(Scalar(0.))
+ , epsfcn(Scalar(0.)) {}
+ Scalar factor;
+ Index maxfev; // maximum number of function evaluation
+ Scalar ftol;
+ Scalar xtol;
+ Scalar gtol;
+ Scalar epsfcn;
+ };
+
+ typedef Matrix< Scalar, Dynamic, 1 > FVectorType;
+ typedef Matrix< Scalar, Dynamic, Dynamic > JacobianType;
+
+ LevenbergMarquardtSpace::Status lmder1(
+ FVectorType &x,
+ const Scalar tol = sqrt_epsilon()
+ );
+
+ LevenbergMarquardtSpace::Status minimize(FVectorType &x);
+ LevenbergMarquardtSpace::Status minimizeInit(FVectorType &x);
+ LevenbergMarquardtSpace::Status minimizeOneStep(FVectorType &x);
+
+ static LevenbergMarquardtSpace::Status lmdif1(
+ FunctorType &functor,
+ FVectorType &x,
+ Index *nfev,
+ const Scalar tol = sqrt_epsilon()
+ );
+
+ LevenbergMarquardtSpace::Status lmstr1(
+ FVectorType &x,
+ const Scalar tol = sqrt_epsilon()
+ );
+
+ LevenbergMarquardtSpace::Status minimizeOptimumStorage(FVectorType &x);
+ LevenbergMarquardtSpace::Status minimizeOptimumStorageInit(FVectorType &x);
+ LevenbergMarquardtSpace::Status minimizeOptimumStorageOneStep(FVectorType &x);
+
+ void resetParameters(void) { parameters = Parameters(); }
+
+ Parameters parameters;
+ FVectorType fvec, qtf, diag;
+ JacobianType fjac;
+ PermutationMatrix<Dynamic,Dynamic> permutation;
+ Index nfev;
+ Index njev;
+ Index iter;
+ Scalar fnorm, gnorm;
+ bool useExternalScaling;
+
+ Scalar lm_param(void) { return par; }
+private:
+
+ FunctorType &functor;
+ Index n;
+ Index m;
+ FVectorType wa1, wa2, wa3, wa4;
+
+ Scalar par, sum;
+ Scalar temp, temp1, temp2;
+ Scalar delta;
+ Scalar ratio;
+ Scalar pnorm, xnorm, fnorm1, actred, dirder, prered;
+
+ LevenbergMarquardt& operator=(const LevenbergMarquardt&);
+};
+
+template<typename FunctorType, typename Scalar>
+LevenbergMarquardtSpace::Status
+LevenbergMarquardt<FunctorType,Scalar>::lmder1(
+ FVectorType &x,
+ const Scalar tol
+ )
+{
+ n = x.size();
+ m = functor.values();
+
+ /* check the input parameters for errors. */
+ if (n <= 0 || m < n || tol < 0.)
+ return LevenbergMarquardtSpace::ImproperInputParameters;
+
+ resetParameters();
+ parameters.ftol = tol;
+ parameters.xtol = tol;
+ parameters.maxfev = 100*(n+1);
+
+ return minimize(x);
+}
+
+
+template<typename FunctorType, typename Scalar>
+LevenbergMarquardtSpace::Status
+LevenbergMarquardt<FunctorType,Scalar>::minimize(FVectorType &x)
+{
+ LevenbergMarquardtSpace::Status status = minimizeInit(x);
+ if (status==LevenbergMarquardtSpace::ImproperInputParameters)
+ return status;
+ do {
+ status = minimizeOneStep(x);
+ } while (status==LevenbergMarquardtSpace::Running);
+ return status;
+}
+
+template<typename FunctorType, typename Scalar>
+LevenbergMarquardtSpace::Status
+LevenbergMarquardt<FunctorType,Scalar>::minimizeInit(FVectorType &x)
+{
+ n = x.size();
+ m = functor.values();
+
+ wa1.resize(n); wa2.resize(n); wa3.resize(n);
+ wa4.resize(m);
+ fvec.resize(m);
+ fjac.resize(m, n);
+ if (!useExternalScaling)
+ diag.resize(n);
+ eigen_assert( (!useExternalScaling || diag.size()==n) && "When useExternalScaling is set, the caller must provide a valid 'diag'");
+ qtf.resize(n);
+
+ /* Function Body */
+ nfev = 0;
+ njev = 0;
+
+ /* check the input parameters for errors. */
+ if (n <= 0 || m < n || parameters.ftol < 0. || parameters.xtol < 0. || parameters.gtol < 0. || parameters.maxfev <= 0 || parameters.factor <= 0.)
+ return LevenbergMarquardtSpace::ImproperInputParameters;
+
+ if (useExternalScaling)
+ for (Index j = 0; j < n; ++j)
+ if (diag[j] <= 0.)
+ return LevenbergMarquardtSpace::ImproperInputParameters;
+
+ /* evaluate the function at the starting point */
+ /* and calculate its norm. */
+ nfev = 1;
+ if ( functor(x, fvec) < 0)
+ return LevenbergMarquardtSpace::UserAsked;
+ fnorm = fvec.stableNorm();
+
+ /* initialize levenberg-marquardt parameter and iteration counter. */
+ par = 0.;
+ iter = 1;
+
+ return LevenbergMarquardtSpace::NotStarted;
+}
+
+template<typename FunctorType, typename Scalar>
+LevenbergMarquardtSpace::Status
+LevenbergMarquardt<FunctorType,Scalar>::minimizeOneStep(FVectorType &x)
+{
+ using std::abs;
+ using std::sqrt;
+
+ eigen_assert(x.size()==n); // check the caller is not cheating us
+
+ /* calculate the jacobian matrix. */
+ Index df_ret = functor.df(x, fjac);
+ if (df_ret<0)
+ return LevenbergMarquardtSpace::UserAsked;
+ if (df_ret>0)
+ // numerical diff, we evaluated the function df_ret times
+ nfev += df_ret;
+ else njev++;
+
+ /* compute the qr factorization of the jacobian. */
+ wa2 = fjac.colwise().blueNorm();
+ ColPivHouseholderQR<JacobianType> qrfac(fjac);
+ fjac = qrfac.matrixQR();
+ permutation = qrfac.colsPermutation();
+
+ /* on the first iteration and if external scaling is not used, scale according */
+ /* to the norms of the columns of the initial jacobian. */
+ if (iter == 1) {
+ if (!useExternalScaling)
+ for (Index j = 0; j < n; ++j)
+ diag[j] = (wa2[j]==0.)? 1. : wa2[j];
+
+ /* on the first iteration, calculate the norm of the scaled x */
+ /* and initialize the step bound delta. */
+ xnorm = diag.cwiseProduct(x).stableNorm();
+ delta = parameters.factor * xnorm;
+ if (delta == 0.)
+ delta = parameters.factor;
+ }
+
+ /* form (q transpose)*fvec and store the first n components in */
+ /* qtf. */
+ wa4 = fvec;
+ wa4.applyOnTheLeft(qrfac.householderQ().adjoint());
+ qtf = wa4.head(n);
+
+ /* compute the norm of the scaled gradient. */
+ gnorm = 0.;
+ if (fnorm != 0.)
+ for (Index j = 0; j < n; ++j)
+ if (wa2[permutation.indices()[j]] != 0.)
+ gnorm = (std::max)(gnorm, abs( fjac.col(j).head(j+1).dot(qtf.head(j+1)/fnorm) / wa2[permutation.indices()[j]]));
+
+ /* test for convergence of the gradient norm. */
+ if (gnorm <= parameters.gtol)
+ return LevenbergMarquardtSpace::CosinusTooSmall;
+
+ /* rescale if necessary. */
+ if (!useExternalScaling)
+ diag = diag.cwiseMax(wa2);
+
+ do {
+
+ /* determine the levenberg-marquardt parameter. */
+ internal::lmpar2<Scalar>(qrfac, diag, qtf, delta, par, wa1);
+
+ /* store the direction p and x + p. calculate the norm of p. */
+ wa1 = -wa1;
+ wa2 = x + wa1;
+ pnorm = diag.cwiseProduct(wa1).stableNorm();
+
+ /* on the first iteration, adjust the initial step bound. */
+ if (iter == 1)
+ delta = (std::min)(delta,pnorm);
+
+ /* evaluate the function at x + p and calculate its norm. */
+ if ( functor(wa2, wa4) < 0)
+ return LevenbergMarquardtSpace::UserAsked;
+ ++nfev;
+ fnorm1 = wa4.stableNorm();
+
+ /* compute the scaled actual reduction. */
+ actred = -1.;
+ if (Scalar(.1) * fnorm1 < fnorm)
+ actred = 1. - numext::abs2(fnorm1 / fnorm);
+
+ /* compute the scaled predicted reduction and */
+ /* the scaled directional derivative. */
+ wa3 = fjac.template triangularView<Upper>() * (qrfac.colsPermutation().inverse() *wa1);
+ temp1 = numext::abs2(wa3.stableNorm() / fnorm);
+ temp2 = numext::abs2(sqrt(par) * pnorm / fnorm);
+ prered = temp1 + temp2 / Scalar(.5);
+ dirder = -(temp1 + temp2);
+
+ /* compute the ratio of the actual to the predicted */
+ /* reduction. */
+ ratio = 0.;
+ if (prered != 0.)
+ ratio = actred / prered;
+
+ /* update the step bound. */
+ if (ratio <= Scalar(.25)) {
+ if (actred >= 0.)
+ temp = Scalar(.5);
+ if (actred < 0.)
+ temp = Scalar(.5) * dirder / (dirder + Scalar(.5) * actred);
+ if (Scalar(.1) * fnorm1 >= fnorm || temp < Scalar(.1))
+ temp = Scalar(.1);
+ /* Computing MIN */
+ delta = temp * (std::min)(delta, pnorm / Scalar(.1));
+ par /= temp;
+ } else if (!(par != 0. && ratio < Scalar(.75))) {
+ delta = pnorm / Scalar(.5);
+ par = Scalar(.5) * par;
+ }
+
+ /* test for successful iteration. */
+ if (ratio >= Scalar(1e-4)) {
+ /* successful iteration. update x, fvec, and their norms. */
+ x = wa2;
+ wa2 = diag.cwiseProduct(x);
+ fvec = wa4;
+ xnorm = wa2.stableNorm();
+ fnorm = fnorm1;
+ ++iter;
+ }
+
+ /* tests for convergence. */
+ if (abs(actred) <= parameters.ftol && prered <= parameters.ftol && Scalar(.5) * ratio <= 1. && delta <= parameters.xtol * xnorm)
+ return LevenbergMarquardtSpace::RelativeErrorAndReductionTooSmall;
+ if (abs(actred) <= parameters.ftol && prered <= parameters.ftol && Scalar(.5) * ratio <= 1.)
+ return LevenbergMarquardtSpace::RelativeReductionTooSmall;
+ if (delta <= parameters.xtol * xnorm)
+ return LevenbergMarquardtSpace::RelativeErrorTooSmall;
+
+ /* tests for termination and stringent tolerances. */
+ if (nfev >= parameters.maxfev)
+ return LevenbergMarquardtSpace::TooManyFunctionEvaluation;
+ if (abs(actred) <= NumTraits<Scalar>::epsilon() && prered <= NumTraits<Scalar>::epsilon() && Scalar(.5) * ratio <= 1.)
+ return LevenbergMarquardtSpace::FtolTooSmall;
+ if (delta <= NumTraits<Scalar>::epsilon() * xnorm)
+ return LevenbergMarquardtSpace::XtolTooSmall;
+ if (gnorm <= NumTraits<Scalar>::epsilon())
+ return LevenbergMarquardtSpace::GtolTooSmall;
+
+ } while (ratio < Scalar(1e-4));
+
+ return LevenbergMarquardtSpace::Running;
+}
+
+template<typename FunctorType, typename Scalar>
+LevenbergMarquardtSpace::Status
+LevenbergMarquardt<FunctorType,Scalar>::lmstr1(
+ FVectorType &x,
+ const Scalar tol
+ )
+{
+ n = x.size();
+ m = functor.values();
+
+ /* check the input parameters for errors. */
+ if (n <= 0 || m < n || tol < 0.)
+ return LevenbergMarquardtSpace::ImproperInputParameters;
+
+ resetParameters();
+ parameters.ftol = tol;
+ parameters.xtol = tol;
+ parameters.maxfev = 100*(n+1);
+
+ return minimizeOptimumStorage(x);
+}
+
+template<typename FunctorType, typename Scalar>
+LevenbergMarquardtSpace::Status
+LevenbergMarquardt<FunctorType,Scalar>::minimizeOptimumStorageInit(FVectorType &x)
+{
+ n = x.size();
+ m = functor.values();
+
+ wa1.resize(n); wa2.resize(n); wa3.resize(n);
+ wa4.resize(m);
+ fvec.resize(m);
+ // Only R is stored in fjac. Q is only used to compute 'qtf', which is
+ // Q.transpose()*rhs. qtf will be updated using givens rotation,
+ // instead of storing them in Q.
+ // The purpose it to only use a nxn matrix, instead of mxn here, so
+ // that we can handle cases where m>>n :
+ fjac.resize(n, n);
+ if (!useExternalScaling)
+ diag.resize(n);
+ eigen_assert( (!useExternalScaling || diag.size()==n) && "When useExternalScaling is set, the caller must provide a valid 'diag'");
+ qtf.resize(n);
+
+ /* Function Body */
+ nfev = 0;
+ njev = 0;
+
+ /* check the input parameters for errors. */
+ if (n <= 0 || m < n || parameters.ftol < 0. || parameters.xtol < 0. || parameters.gtol < 0. || parameters.maxfev <= 0 || parameters.factor <= 0.)
+ return LevenbergMarquardtSpace::ImproperInputParameters;
+
+ if (useExternalScaling)
+ for (Index j = 0; j < n; ++j)
+ if (diag[j] <= 0.)
+ return LevenbergMarquardtSpace::ImproperInputParameters;
+
+ /* evaluate the function at the starting point */
+ /* and calculate its norm. */
+ nfev = 1;
+ if ( functor(x, fvec) < 0)
+ return LevenbergMarquardtSpace::UserAsked;
+ fnorm = fvec.stableNorm();
+
+ /* initialize levenberg-marquardt parameter and iteration counter. */
+ par = 0.;
+ iter = 1;
+
+ return LevenbergMarquardtSpace::NotStarted;
+}
+
+
+template<typename FunctorType, typename Scalar>
+LevenbergMarquardtSpace::Status
+LevenbergMarquardt<FunctorType,Scalar>::minimizeOptimumStorageOneStep(FVectorType &x)
+{
+ using std::abs;
+ using std::sqrt;
+
+ eigen_assert(x.size()==n); // check the caller is not cheating us
+
+ Index i, j;
+ bool sing;
+
+ /* compute the qr factorization of the jacobian matrix */
+ /* calculated one row at a time, while simultaneously */
+ /* forming (q transpose)*fvec and storing the first */
+ /* n components in qtf. */
+ qtf.fill(0.);
+ fjac.fill(0.);
+ Index rownb = 2;
+ for (i = 0; i < m; ++i) {
+ if (functor.df(x, wa3, rownb) < 0) return LevenbergMarquardtSpace::UserAsked;
+ internal::rwupdt<Scalar>(fjac, wa3, qtf, fvec[i]);
+ ++rownb;
+ }
+ ++njev;
+
+ /* if the jacobian is rank deficient, call qrfac to */
+ /* reorder its columns and update the components of qtf. */
+ sing = false;
+ for (j = 0; j < n; ++j) {
+ if (fjac(j,j) == 0.)
+ sing = true;
+ wa2[j] = fjac.col(j).head(j).stableNorm();
+ }
+ permutation.setIdentity(n);
+ if (sing) {
+ wa2 = fjac.colwise().blueNorm();
+ // TODO We have no unit test covering this code path, do not modify
+ // until it is carefully tested
+ ColPivHouseholderQR<JacobianType> qrfac(fjac);
+ fjac = qrfac.matrixQR();
+ wa1 = fjac.diagonal();
+ fjac.diagonal() = qrfac.hCoeffs();
+ permutation = qrfac.colsPermutation();
+ // TODO : avoid this:
+ for(Index ii=0; ii< fjac.cols(); ii++) fjac.col(ii).segment(ii+1, fjac.rows()-ii-1) *= fjac(ii,ii); // rescale vectors
+
+ for (j = 0; j < n; ++j) {
+ if (fjac(j,j) != 0.) {
+ sum = 0.;
+ for (i = j; i < n; ++i)
+ sum += fjac(i,j) * qtf[i];
+ temp = -sum / fjac(j,j);
+ for (i = j; i < n; ++i)
+ qtf[i] += fjac(i,j) * temp;
+ }
+ fjac(j,j) = wa1[j];
+ }
+ }
+
+ /* on the first iteration and if external scaling is not used, scale according */
+ /* to the norms of the columns of the initial jacobian. */
+ if (iter == 1) {
+ if (!useExternalScaling)
+ for (j = 0; j < n; ++j)
+ diag[j] = (wa2[j]==0.)? 1. : wa2[j];
+
+ /* on the first iteration, calculate the norm of the scaled x */
+ /* and initialize the step bound delta. */
+ xnorm = diag.cwiseProduct(x).stableNorm();
+ delta = parameters.factor * xnorm;
+ if (delta == 0.)
+ delta = parameters.factor;
+ }
+
+ /* compute the norm of the scaled gradient. */
+ gnorm = 0.;
+ if (fnorm != 0.)
+ for (j = 0; j < n; ++j)
+ if (wa2[permutation.indices()[j]] != 0.)
+ gnorm = (std::max)(gnorm, abs( fjac.col(j).head(j+1).dot(qtf.head(j+1)/fnorm) / wa2[permutation.indices()[j]]));
+
+ /* test for convergence of the gradient norm. */
+ if (gnorm <= parameters.gtol)
+ return LevenbergMarquardtSpace::CosinusTooSmall;
+
+ /* rescale if necessary. */
+ if (!useExternalScaling)
+ diag = diag.cwiseMax(wa2);
+
+ do {
+
+ /* determine the levenberg-marquardt parameter. */
+ internal::lmpar<Scalar>(fjac, permutation.indices(), diag, qtf, delta, par, wa1);
+
+ /* store the direction p and x + p. calculate the norm of p. */
+ wa1 = -wa1;
+ wa2 = x + wa1;
+ pnorm = diag.cwiseProduct(wa1).stableNorm();
+
+ /* on the first iteration, adjust the initial step bound. */
+ if (iter == 1)
+ delta = (std::min)(delta,pnorm);
+
+ /* evaluate the function at x + p and calculate its norm. */
+ if ( functor(wa2, wa4) < 0)
+ return LevenbergMarquardtSpace::UserAsked;
+ ++nfev;
+ fnorm1 = wa4.stableNorm();
+
+ /* compute the scaled actual reduction. */
+ actred = -1.;
+ if (Scalar(.1) * fnorm1 < fnorm)
+ actred = 1. - numext::abs2(fnorm1 / fnorm);
+
+ /* compute the scaled predicted reduction and */
+ /* the scaled directional derivative. */
+ wa3 = fjac.topLeftCorner(n,n).template triangularView<Upper>() * (permutation.inverse() * wa1);
+ temp1 = numext::abs2(wa3.stableNorm() / fnorm);
+ temp2 = numext::abs2(sqrt(par) * pnorm / fnorm);
+ prered = temp1 + temp2 / Scalar(.5);
+ dirder = -(temp1 + temp2);
+
+ /* compute the ratio of the actual to the predicted */
+ /* reduction. */
+ ratio = 0.;
+ if (prered != 0.)
+ ratio = actred / prered;
+
+ /* update the step bound. */
+ if (ratio <= Scalar(.25)) {
+ if (actred >= 0.)
+ temp = Scalar(.5);
+ if (actred < 0.)
+ temp = Scalar(.5) * dirder / (dirder + Scalar(.5) * actred);
+ if (Scalar(.1) * fnorm1 >= fnorm || temp < Scalar(.1))
+ temp = Scalar(.1);
+ /* Computing MIN */
+ delta = temp * (std::min)(delta, pnorm / Scalar(.1));
+ par /= temp;
+ } else if (!(par != 0. && ratio < Scalar(.75))) {
+ delta = pnorm / Scalar(.5);
+ par = Scalar(.5) * par;
+ }
+
+ /* test for successful iteration. */
+ if (ratio >= Scalar(1e-4)) {
+ /* successful iteration. update x, fvec, and their norms. */
+ x = wa2;
+ wa2 = diag.cwiseProduct(x);
+ fvec = wa4;
+ xnorm = wa2.stableNorm();
+ fnorm = fnorm1;
+ ++iter;
+ }
+
+ /* tests for convergence. */
+ if (abs(actred) <= parameters.ftol && prered <= parameters.ftol && Scalar(.5) * ratio <= 1. && delta <= parameters.xtol * xnorm)
+ return LevenbergMarquardtSpace::RelativeErrorAndReductionTooSmall;
+ if (abs(actred) <= parameters.ftol && prered <= parameters.ftol && Scalar(.5) * ratio <= 1.)
+ return LevenbergMarquardtSpace::RelativeReductionTooSmall;
+ if (delta <= parameters.xtol * xnorm)
+ return LevenbergMarquardtSpace::RelativeErrorTooSmall;
+
+ /* tests for termination and stringent tolerances. */
+ if (nfev >= parameters.maxfev)
+ return LevenbergMarquardtSpace::TooManyFunctionEvaluation;
+ if (abs(actred) <= NumTraits<Scalar>::epsilon() && prered <= NumTraits<Scalar>::epsilon() && Scalar(.5) * ratio <= 1.)
+ return LevenbergMarquardtSpace::FtolTooSmall;
+ if (delta <= NumTraits<Scalar>::epsilon() * xnorm)
+ return LevenbergMarquardtSpace::XtolTooSmall;
+ if (gnorm <= NumTraits<Scalar>::epsilon())
+ return LevenbergMarquardtSpace::GtolTooSmall;
+
+ } while (ratio < Scalar(1e-4));
+
+ return LevenbergMarquardtSpace::Running;
+}
+
+template<typename FunctorType, typename Scalar>
+LevenbergMarquardtSpace::Status
+LevenbergMarquardt<FunctorType,Scalar>::minimizeOptimumStorage(FVectorType &x)
+{
+ LevenbergMarquardtSpace::Status status = minimizeOptimumStorageInit(x);
+ if (status==LevenbergMarquardtSpace::ImproperInputParameters)
+ return status;
+ do {
+ status = minimizeOptimumStorageOneStep(x);
+ } while (status==LevenbergMarquardtSpace::Running);
+ return status;
+}
+
+template<typename FunctorType, typename Scalar>
+LevenbergMarquardtSpace::Status
+LevenbergMarquardt<FunctorType,Scalar>::lmdif1(
+ FunctorType &functor,
+ FVectorType &x,
+ Index *nfev,
+ const Scalar tol
+ )
+{
+ Index n = x.size();
+ Index m = functor.values();
+
+ /* check the input parameters for errors. */
+ if (n <= 0 || m < n || tol < 0.)
+ return LevenbergMarquardtSpace::ImproperInputParameters;
+
+ NumericalDiff<FunctorType> numDiff(functor);
+ // embedded LevenbergMarquardt
+ LevenbergMarquardt<NumericalDiff<FunctorType>, Scalar > lm(numDiff);
+ lm.parameters.ftol = tol;
+ lm.parameters.xtol = tol;
+ lm.parameters.maxfev = 200*(n+1);
+
+ LevenbergMarquardtSpace::Status info = LevenbergMarquardtSpace::Status(lm.minimize(x));
+ if (nfev)
+ * nfev = lm.nfev;
+ return info;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_LEVENBERGMARQUARDT__H
+
+//vim: ai ts=4 sts=4 et sw=4
diff --git a/src/EigenUnsupported/src/NonLinearOptimization/chkder.h b/src/EigenUnsupported/src/NonLinearOptimization/chkder.h
new file mode 100644
index 0000000..db8ff7d
--- /dev/null
+++ b/src/EigenUnsupported/src/NonLinearOptimization/chkder.h
@@ -0,0 +1,66 @@
+#define chkder_log10e 0.43429448190325182765
+#define chkder_factor 100.
+
+namespace Eigen {
+
+namespace internal {
+
+template<typename Scalar>
+void chkder(
+ const Matrix< Scalar, Dynamic, 1 > &x,
+ const Matrix< Scalar, Dynamic, 1 > &fvec,
+ const Matrix< Scalar, Dynamic, Dynamic > &fjac,
+ Matrix< Scalar, Dynamic, 1 > &xp,
+ const Matrix< Scalar, Dynamic, 1 > &fvecp,
+ int mode,
+ Matrix< Scalar, Dynamic, 1 > &err
+ )
+{
+ using std::sqrt;
+ using std::abs;
+ using std::log;
+
+ typedef DenseIndex Index;
+
+ const Scalar eps = sqrt(NumTraits<Scalar>::epsilon());
+ const Scalar epsf = chkder_factor * NumTraits<Scalar>::epsilon();
+ const Scalar epslog = chkder_log10e * log(eps);
+ Scalar temp;
+
+ const Index m = fvec.size(), n = x.size();
+
+ if (mode != 2) {
+ /* mode = 1. */
+ xp.resize(n);
+ for (Index j = 0; j < n; ++j) {
+ temp = eps * abs(x[j]);
+ if (temp == 0.)
+ temp = eps;
+ xp[j] = x[j] + temp;
+ }
+ }
+ else {
+ /* mode = 2. */
+ err.setZero(m);
+ for (Index j = 0; j < n; ++j) {
+ temp = abs(x[j]);
+ if (temp == 0.)
+ temp = 1.;
+ err += temp * fjac.col(j);
+ }
+ for (Index i = 0; i < m; ++i) {
+ temp = 1.;
+ if (fvec[i] != 0. && fvecp[i] != 0. && abs(fvecp[i] - fvec[i]) >= epsf * abs(fvec[i]))
+ temp = eps * abs((fvecp[i] - fvec[i]) / eps - err[i]) / (abs(fvec[i]) + abs(fvecp[i]));
+ err[i] = 1.;
+ if (temp > NumTraits<Scalar>::epsilon() && temp < eps)
+ err[i] = (chkder_log10e * log(temp) - epslog) / epslog;
+ if (temp >= eps)
+ err[i] = 0.;
+ }
+ }
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
diff --git a/src/EigenUnsupported/src/NonLinearOptimization/covar.h b/src/EigenUnsupported/src/NonLinearOptimization/covar.h
new file mode 100644
index 0000000..68260d1
--- /dev/null
+++ b/src/EigenUnsupported/src/NonLinearOptimization/covar.h
@@ -0,0 +1,70 @@
+namespace Eigen {
+
+namespace internal {
+
+template <typename Scalar>
+void covar(
+ Matrix< Scalar, Dynamic, Dynamic > &r,
+ const VectorXi &ipvt,
+ Scalar tol = std::sqrt(NumTraits<Scalar>::epsilon()) )
+{
+ using std::abs;
+ typedef DenseIndex Index;
+
+ /* Local variables */
+ Index i, j, k, l, ii, jj;
+ bool sing;
+ Scalar temp;
+
+ /* Function Body */
+ const Index n = r.cols();
+ const Scalar tolr = tol * abs(r(0,0));
+ Matrix< Scalar, Dynamic, 1 > wa(n);
+ eigen_assert(ipvt.size()==n);
+
+ /* form the inverse of r in the full upper triangle of r. */
+ l = -1;
+ for (k = 0; k < n; ++k)
+ if (abs(r(k,k)) > tolr) {
+ r(k,k) = 1. / r(k,k);
+ for (j = 0; j <= k-1; ++j) {
+ temp = r(k,k) * r(j,k);
+ r(j,k) = 0.;
+ r.col(k).head(j+1) -= r.col(j).head(j+1) * temp;
+ }
+ l = k;
+ }
+
+ /* form the full upper triangle of the inverse of (r transpose)*r */
+ /* in the full upper triangle of r. */
+ for (k = 0; k <= l; ++k) {
+ for (j = 0; j <= k-1; ++j)
+ r.col(j).head(j+1) += r.col(k).head(j+1) * r(j,k);
+ r.col(k).head(k+1) *= r(k,k);
+ }
+
+ /* form the full lower triangle of the covariance matrix */
+ /* in the strict lower triangle of r and in wa. */
+ for (j = 0; j < n; ++j) {
+ jj = ipvt[j];
+ sing = j > l;
+ for (i = 0; i <= j; ++i) {
+ if (sing)
+ r(i,j) = 0.;
+ ii = ipvt[i];
+ if (ii > jj)
+ r(ii,jj) = r(i,j);
+ if (ii < jj)
+ r(jj,ii) = r(i,j);
+ }
+ wa[jj] = r(j,j);
+ }
+
+ /* symmetrize the covariance matrix in r. */
+ r.topLeftCorner(n,n).template triangularView<StrictlyUpper>() = r.topLeftCorner(n,n).transpose();
+ r.diagonal() = wa;
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
diff --git a/src/EigenUnsupported/src/NonLinearOptimization/dogleg.h b/src/EigenUnsupported/src/NonLinearOptimization/dogleg.h
new file mode 100644
index 0000000..80c5d27
--- /dev/null
+++ b/src/EigenUnsupported/src/NonLinearOptimization/dogleg.h
@@ -0,0 +1,107 @@
+namespace Eigen {
+
+namespace internal {
+
+template <typename Scalar>
+void dogleg(
+ const Matrix< Scalar, Dynamic, Dynamic > &qrfac,
+ const Matrix< Scalar, Dynamic, 1 > &diag,
+ const Matrix< Scalar, Dynamic, 1 > &qtb,
+ Scalar delta,
+ Matrix< Scalar, Dynamic, 1 > &x)
+{
+ using std::abs;
+ using std::sqrt;
+
+ typedef DenseIndex Index;
+
+ /* Local variables */
+ Index i, j;
+ Scalar sum, temp, alpha, bnorm;
+ Scalar gnorm, qnorm;
+ Scalar sgnorm;
+
+ /* Function Body */
+ const Scalar epsmch = NumTraits<Scalar>::epsilon();
+ const Index n = qrfac.cols();
+ eigen_assert(n==qtb.size());
+ eigen_assert(n==x.size());
+ eigen_assert(n==diag.size());
+ Matrix< Scalar, Dynamic, 1 > wa1(n), wa2(n);
+
+ /* first, calculate the gauss-newton direction. */
+ for (j = n-1; j >=0; --j) {
+ temp = qrfac(j,j);
+ if (temp == 0.) {
+ temp = epsmch * qrfac.col(j).head(j+1).maxCoeff();
+ if (temp == 0.)
+ temp = epsmch;
+ }
+ if (j==n-1)
+ x[j] = qtb[j] / temp;
+ else
+ x[j] = (qtb[j] - qrfac.row(j).tail(n-j-1).dot(x.tail(n-j-1))) / temp;
+ }
+
+ /* test whether the gauss-newton direction is acceptable. */
+ qnorm = diag.cwiseProduct(x).stableNorm();
+ if (qnorm <= delta)
+ return;
+
+ // TODO : this path is not tested by Eigen unit tests
+
+ /* the gauss-newton direction is not acceptable. */
+ /* next, calculate the scaled gradient direction. */
+
+ wa1.fill(0.);
+ for (j = 0; j < n; ++j) {
+ wa1.tail(n-j) += qrfac.row(j).tail(n-j) * qtb[j];
+ wa1[j] /= diag[j];
+ }
+
+ /* calculate the norm of the scaled gradient and test for */
+ /* the special case in which the scaled gradient is zero. */
+ gnorm = wa1.stableNorm();
+ sgnorm = 0.;
+ alpha = delta / qnorm;
+ if (gnorm == 0.)
+ goto algo_end;
+
+ /* calculate the point along the scaled gradient */
+ /* at which the quadratic is minimized. */
+ wa1.array() /= (diag*gnorm).array();
+ // TODO : once unit tests cover this part,:
+ // wa2 = qrfac.template triangularView<Upper>() * wa1;
+ for (j = 0; j < n; ++j) {
+ sum = 0.;
+ for (i = j; i < n; ++i) {
+ sum += qrfac(j,i) * wa1[i];
+ }
+ wa2[j] = sum;
+ }
+ temp = wa2.stableNorm();
+ sgnorm = gnorm / temp / temp;
+
+ /* test whether the scaled gradient direction is acceptable. */
+ alpha = 0.;
+ if (sgnorm >= delta)
+ goto algo_end;
+
+ /* the scaled gradient direction is not acceptable. */
+ /* finally, calculate the point along the dogleg */
+ /* at which the quadratic is minimized. */
+ bnorm = qtb.stableNorm();
+ temp = bnorm / gnorm * (bnorm / qnorm) * (sgnorm / delta);
+ temp = temp - delta / qnorm * numext::abs2(sgnorm / delta) + sqrt(numext::abs2(temp - delta / qnorm) + (1.-numext::abs2(delta / qnorm)) * (1.-numext::abs2(sgnorm / delta)));
+ alpha = delta / qnorm * (1. - numext::abs2(sgnorm / delta)) / temp;
+algo_end:
+
+ /* form appropriate convex combination of the gauss-newton */
+ /* direction and the scaled gradient direction. */
+ temp = (1.-alpha) * (std::min)(sgnorm,delta);
+ x = temp * wa1 + alpha * x;
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
diff --git a/src/EigenUnsupported/src/NonLinearOptimization/fdjac1.h b/src/EigenUnsupported/src/NonLinearOptimization/fdjac1.h
new file mode 100644
index 0000000..bb7cf26
--- /dev/null
+++ b/src/EigenUnsupported/src/NonLinearOptimization/fdjac1.h
@@ -0,0 +1,79 @@
+namespace Eigen {
+
+namespace internal {
+
+template<typename FunctorType, typename Scalar>
+DenseIndex fdjac1(
+ const FunctorType &Functor,
+ Matrix< Scalar, Dynamic, 1 > &x,
+ Matrix< Scalar, Dynamic, 1 > &fvec,
+ Matrix< Scalar, Dynamic, Dynamic > &fjac,
+ DenseIndex ml, DenseIndex mu,
+ Scalar epsfcn)
+{
+ using std::sqrt;
+ using std::abs;
+
+ typedef DenseIndex Index;
+
+ /* Local variables */
+ Scalar h;
+ Index j, k;
+ Scalar eps, temp;
+ Index msum;
+ int iflag;
+ Index start, length;
+
+ /* Function Body */
+ const Scalar epsmch = NumTraits<Scalar>::epsilon();
+ const Index n = x.size();
+ eigen_assert(fvec.size()==n);
+ Matrix< Scalar, Dynamic, 1 > wa1(n);
+ Matrix< Scalar, Dynamic, 1 > wa2(n);
+
+ eps = sqrt((std::max)(epsfcn,epsmch));
+ msum = ml + mu + 1;
+ if (msum >= n) {
+ /* computation of dense approximate jacobian. */
+ for (j = 0; j < n; ++j) {
+ temp = x[j];
+ h = eps * abs(temp);
+ if (h == 0.)
+ h = eps;
+ x[j] = temp + h;
+ iflag = Functor(x, wa1);
+ if (iflag < 0)
+ return iflag;
+ x[j] = temp;
+ fjac.col(j) = (wa1-fvec)/h;
+ }
+
+ }else {
+ /* computation of banded approximate jacobian. */
+ for (k = 0; k < msum; ++k) {
+ for (j = k; (msum<0) ? (j>n): (j<n); j += msum) {
+ wa2[j] = x[j];
+ h = eps * abs(wa2[j]);
+ if (h == 0.) h = eps;
+ x[j] = wa2[j] + h;
+ }
+ iflag = Functor(x, wa1);
+ if (iflag < 0)
+ return iflag;
+ for (j = k; (msum<0) ? (j>n): (j<n); j += msum) {
+ x[j] = wa2[j];
+ h = eps * abs(wa2[j]);
+ if (h == 0.) h = eps;
+ fjac.col(j).setZero();
+ start = std::max<Index>(0,j-mu);
+ length = (std::min)(n-1, j+ml) - start + 1;
+ fjac.col(j).segment(start, length) = ( wa1.segment(start, length)-fvec.segment(start, length))/h;
+ }
+ }
+ }
+ return 0;
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
diff --git a/src/EigenUnsupported/src/NonLinearOptimization/lmpar.h b/src/EigenUnsupported/src/NonLinearOptimization/lmpar.h
new file mode 100644
index 0000000..4c17d4c
--- /dev/null
+++ b/src/EigenUnsupported/src/NonLinearOptimization/lmpar.h
@@ -0,0 +1,298 @@
+namespace Eigen {
+
+namespace internal {
+
+template <typename Scalar>
+void lmpar(
+ Matrix< Scalar, Dynamic, Dynamic > &r,
+ const VectorXi &ipvt,
+ const Matrix< Scalar, Dynamic, 1 > &diag,
+ const Matrix< Scalar, Dynamic, 1 > &qtb,
+ Scalar delta,
+ Scalar &par,
+ Matrix< Scalar, Dynamic, 1 > &x)
+{
+ using std::abs;
+ using std::sqrt;
+ typedef DenseIndex Index;
+
+ /* Local variables */
+ Index i, j, l;
+ Scalar fp;
+ Scalar parc, parl;
+ Index iter;
+ Scalar temp, paru;
+ Scalar gnorm;
+ Scalar dxnorm;
+
+
+ /* Function Body */
+ const Scalar dwarf = (std::numeric_limits<Scalar>::min)();
+ const Index n = r.cols();
+ eigen_assert(n==diag.size());
+ eigen_assert(n==qtb.size());
+ eigen_assert(n==x.size());
+
+ Matrix< Scalar, Dynamic, 1 > wa1, wa2;
+
+ /* compute and store in x the gauss-newton direction. if the */
+ /* jacobian is rank-deficient, obtain a least squares solution. */
+ Index nsing = n-1;
+ wa1 = qtb;
+ for (j = 0; j < n; ++j) {
+ if (r(j,j) == 0. && nsing == n-1)
+ nsing = j - 1;
+ if (nsing < n-1)
+ wa1[j] = 0.;
+ }
+ for (j = nsing; j>=0; --j) {
+ wa1[j] /= r(j,j);
+ temp = wa1[j];
+ for (i = 0; i < j ; ++i)
+ wa1[i] -= r(i,j) * temp;
+ }
+
+ for (j = 0; j < n; ++j)
+ x[ipvt[j]] = wa1[j];
+
+ /* initialize the iteration counter. */
+ /* evaluate the function at the origin, and test */
+ /* for acceptance of the gauss-newton direction. */
+ iter = 0;
+ wa2 = diag.cwiseProduct(x);
+ dxnorm = wa2.blueNorm();
+ fp = dxnorm - delta;
+ if (fp <= Scalar(0.1) * delta) {
+ par = 0;
+ return;
+ }
+
+ /* if the jacobian is not rank deficient, the newton */
+ /* step provides a lower bound, parl, for the zero of */
+ /* the function. otherwise set this bound to zero. */
+ parl = 0.;
+ if (nsing >= n-1) {
+ for (j = 0; j < n; ++j) {
+ l = ipvt[j];
+ wa1[j] = diag[l] * (wa2[l] / dxnorm);
+ }
+ // it's actually a triangularView.solveInplace(), though in a weird
+ // way:
+ for (j = 0; j < n; ++j) {
+ Scalar sum = 0.;
+ for (i = 0; i < j; ++i)
+ sum += r(i,j) * wa1[i];
+ wa1[j] = (wa1[j] - sum) / r(j,j);
+ }
+ temp = wa1.blueNorm();
+ parl = fp / delta / temp / temp;
+ }
+
+ /* calculate an upper bound, paru, for the zero of the function. */
+ for (j = 0; j < n; ++j)
+ wa1[j] = r.col(j).head(j+1).dot(qtb.head(j+1)) / diag[ipvt[j]];
+
+ gnorm = wa1.stableNorm();
+ paru = gnorm / delta;
+ if (paru == 0.)
+ paru = dwarf / (std::min)(delta,Scalar(0.1));
+
+ /* if the input par lies outside of the interval (parl,paru), */
+ /* set par to the closer endpoint. */
+ par = (std::max)(par,parl);
+ par = (std::min)(par,paru);
+ if (par == 0.)
+ par = gnorm / dxnorm;
+
+ /* beginning of an iteration. */
+ while (true) {
+ ++iter;
+
+ /* evaluate the function at the current value of par. */
+ if (par == 0.)
+ par = (std::max)(dwarf,Scalar(.001) * paru); /* Computing MAX */
+ wa1 = sqrt(par)* diag;
+
+ Matrix< Scalar, Dynamic, 1 > sdiag(n);
+ qrsolv<Scalar>(r, ipvt, wa1, qtb, x, sdiag);
+
+ wa2 = diag.cwiseProduct(x);
+ dxnorm = wa2.blueNorm();
+ temp = fp;
+ fp = dxnorm - delta;
+
+ /* if the function is small enough, accept the current value */
+ /* of par. also test for the exceptional cases where parl */
+ /* is zero or the number of iterations has reached 10. */
+ if (abs(fp) <= Scalar(0.1) * delta || (parl == 0. && fp <= temp && temp < 0.) || iter == 10)
+ break;
+
+ /* compute the newton correction. */
+ for (j = 0; j < n; ++j) {
+ l = ipvt[j];
+ wa1[j] = diag[l] * (wa2[l] / dxnorm);
+ }
+ for (j = 0; j < n; ++j) {
+ wa1[j] /= sdiag[j];
+ temp = wa1[j];
+ for (i = j+1; i < n; ++i)
+ wa1[i] -= r(i,j) * temp;
+ }
+ temp = wa1.blueNorm();
+ parc = fp / delta / temp / temp;
+
+ /* depending on the sign of the function, update parl or paru. */
+ if (fp > 0.)
+ parl = (std::max)(parl,par);
+ if (fp < 0.)
+ paru = (std::min)(paru,par);
+
+ /* compute an improved estimate for par. */
+ /* Computing MAX */
+ par = (std::max)(parl,par+parc);
+
+ /* end of an iteration. */
+ }
+
+ /* termination. */
+ if (iter == 0)
+ par = 0.;
+ return;
+}
+
+template <typename Scalar>
+void lmpar2(
+ const ColPivHouseholderQR<Matrix< Scalar, Dynamic, Dynamic> > &qr,
+ const Matrix< Scalar, Dynamic, 1 > &diag,
+ const Matrix< Scalar, Dynamic, 1 > &qtb,
+ Scalar delta,
+ Scalar &par,
+ Matrix< Scalar, Dynamic, 1 > &x)
+
+{
+ using std::sqrt;
+ using std::abs;
+ typedef DenseIndex Index;
+
+ /* Local variables */
+ Index j;
+ Scalar fp;
+ Scalar parc, parl;
+ Index iter;
+ Scalar temp, paru;
+ Scalar gnorm;
+ Scalar dxnorm;
+
+
+ /* Function Body */
+ const Scalar dwarf = (std::numeric_limits<Scalar>::min)();
+ const Index n = qr.matrixQR().cols();
+ eigen_assert(n==diag.size());
+ eigen_assert(n==qtb.size());
+
+ Matrix< Scalar, Dynamic, 1 > wa1, wa2;
+
+ /* compute and store in x the gauss-newton direction. if the */
+ /* jacobian is rank-deficient, obtain a least squares solution. */
+
+// const Index rank = qr.nonzeroPivots(); // exactly double(0.)
+ const Index rank = qr.rank(); // use a threshold
+ wa1 = qtb;
+ wa1.tail(n-rank).setZero();
+ qr.matrixQR().topLeftCorner(rank, rank).template triangularView<Upper>().solveInPlace(wa1.head(rank));
+
+ x = qr.colsPermutation()*wa1;
+
+ /* initialize the iteration counter. */
+ /* evaluate the function at the origin, and test */
+ /* for acceptance of the gauss-newton direction. */
+ iter = 0;
+ wa2 = diag.cwiseProduct(x);
+ dxnorm = wa2.blueNorm();
+ fp = dxnorm - delta;
+ if (fp <= Scalar(0.1) * delta) {
+ par = 0;
+ return;
+ }
+
+ /* if the jacobian is not rank deficient, the newton */
+ /* step provides a lower bound, parl, for the zero of */
+ /* the function. otherwise set this bound to zero. */
+ parl = 0.;
+ if (rank==n) {
+ wa1 = qr.colsPermutation().inverse() * diag.cwiseProduct(wa2)/dxnorm;
+ qr.matrixQR().topLeftCorner(n, n).transpose().template triangularView<Lower>().solveInPlace(wa1);
+ temp = wa1.blueNorm();
+ parl = fp / delta / temp / temp;
+ }
+
+ /* calculate an upper bound, paru, for the zero of the function. */
+ for (j = 0; j < n; ++j)
+ wa1[j] = qr.matrixQR().col(j).head(j+1).dot(qtb.head(j+1)) / diag[qr.colsPermutation().indices()(j)];
+
+ gnorm = wa1.stableNorm();
+ paru = gnorm / delta;
+ if (paru == 0.)
+ paru = dwarf / (std::min)(delta,Scalar(0.1));
+
+ /* if the input par lies outside of the interval (parl,paru), */
+ /* set par to the closer endpoint. */
+ par = (std::max)(par,parl);
+ par = (std::min)(par,paru);
+ if (par == 0.)
+ par = gnorm / dxnorm;
+
+ /* beginning of an iteration. */
+ Matrix< Scalar, Dynamic, Dynamic > s = qr.matrixQR();
+ while (true) {
+ ++iter;
+
+ /* evaluate the function at the current value of par. */
+ if (par == 0.)
+ par = (std::max)(dwarf,Scalar(.001) * paru); /* Computing MAX */
+ wa1 = sqrt(par)* diag;
+
+ Matrix< Scalar, Dynamic, 1 > sdiag(n);
+ qrsolv<Scalar>(s, qr.colsPermutation().indices(), wa1, qtb, x, sdiag);
+
+ wa2 = diag.cwiseProduct(x);
+ dxnorm = wa2.blueNorm();
+ temp = fp;
+ fp = dxnorm - delta;
+
+ /* if the function is small enough, accept the current value */
+ /* of par. also test for the exceptional cases where parl */
+ /* is zero or the number of iterations has reached 10. */
+ if (abs(fp) <= Scalar(0.1) * delta || (parl == 0. && fp <= temp && temp < 0.) || iter == 10)
+ break;
+
+ /* compute the newton correction. */
+ wa1 = qr.colsPermutation().inverse() * diag.cwiseProduct(wa2/dxnorm);
+ // we could almost use this here, but the diagonal is outside qr, in sdiag[]
+ // qr.matrixQR().topLeftCorner(n, n).transpose().template triangularView<Lower>().solveInPlace(wa1);
+ for (j = 0; j < n; ++j) {
+ wa1[j] /= sdiag[j];
+ temp = wa1[j];
+ for (Index i = j+1; i < n; ++i)
+ wa1[i] -= s(i,j) * temp;
+ }
+ temp = wa1.blueNorm();
+ parc = fp / delta / temp / temp;
+
+ /* depending on the sign of the function, update parl or paru. */
+ if (fp > 0.)
+ parl = (std::max)(parl,par);
+ if (fp < 0.)
+ paru = (std::min)(paru,par);
+
+ /* compute an improved estimate for par. */
+ par = (std::max)(parl,par+parc);
+ }
+ if (iter == 0)
+ par = 0.;
+ return;
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
diff --git a/src/EigenUnsupported/src/NonLinearOptimization/qrsolv.h b/src/EigenUnsupported/src/NonLinearOptimization/qrsolv.h
new file mode 100644
index 0000000..4f2f560
--- /dev/null
+++ b/src/EigenUnsupported/src/NonLinearOptimization/qrsolv.h
@@ -0,0 +1,91 @@
+namespace Eigen {
+
+namespace internal {
+
+// TODO : once qrsolv2 is removed, use ColPivHouseholderQR or PermutationMatrix instead of ipvt
+template <typename Scalar>
+void qrsolv(
+ Matrix< Scalar, Dynamic, Dynamic > &s,
+ // TODO : use a PermutationMatrix once lmpar is no more:
+ const VectorXi &ipvt,
+ const Matrix< Scalar, Dynamic, 1 > &diag,
+ const Matrix< Scalar, Dynamic, 1 > &qtb,
+ Matrix< Scalar, Dynamic, 1 > &x,
+ Matrix< Scalar, Dynamic, 1 > &sdiag)
+
+{
+ typedef DenseIndex Index;
+
+ /* Local variables */
+ Index i, j, k, l;
+ Scalar temp;
+ Index n = s.cols();
+ Matrix< Scalar, Dynamic, 1 > wa(n);
+ JacobiRotation<Scalar> givens;
+
+ /* Function Body */
+ // the following will only change the lower triangular part of s, including
+ // the diagonal, though the diagonal is restored afterward
+
+ /* copy r and (q transpose)*b to preserve input and initialize s. */
+ /* in particular, save the diagonal elements of r in x. */
+ x = s.diagonal();
+ wa = qtb;
+
+ s.topLeftCorner(n,n).template triangularView<StrictlyLower>() = s.topLeftCorner(n,n).transpose();
+
+ /* eliminate the diagonal matrix d using a givens rotation. */
+ for (j = 0; j < n; ++j) {
+
+ /* prepare the row of d to be eliminated, locating the */
+ /* diagonal element using p from the qr factorization. */
+ l = ipvt[j];
+ if (diag[l] == 0.)
+ break;
+ sdiag.tail(n-j).setZero();
+ sdiag[j] = diag[l];
+
+ /* the transformations to eliminate the row of d */
+ /* modify only a single element of (q transpose)*b */
+ /* beyond the first n, which is initially zero. */
+ Scalar qtbpj = 0.;
+ for (k = j; k < n; ++k) {
+ /* determine a givens rotation which eliminates the */
+ /* appropriate element in the current row of d. */
+ givens.makeGivens(-s(k,k), sdiag[k]);
+
+ /* compute the modified diagonal element of r and */
+ /* the modified element of ((q transpose)*b,0). */
+ s(k,k) = givens.c() * s(k,k) + givens.s() * sdiag[k];
+ temp = givens.c() * wa[k] + givens.s() * qtbpj;
+ qtbpj = -givens.s() * wa[k] + givens.c() * qtbpj;
+ wa[k] = temp;
+
+ /* accumulate the transformation in the row of s. */
+ for (i = k+1; i<n; ++i) {
+ temp = givens.c() * s(i,k) + givens.s() * sdiag[i];
+ sdiag[i] = -givens.s() * s(i,k) + givens.c() * sdiag[i];
+ s(i,k) = temp;
+ }
+ }
+ }
+
+ /* solve the triangular system for z. if the system is */
+ /* singular, then obtain a least squares solution. */
+ Index nsing;
+ for(nsing=0; nsing<n && sdiag[nsing]!=0; nsing++) {}
+
+ wa.tail(n-nsing).setZero();
+ s.topLeftCorner(nsing, nsing).transpose().template triangularView<Upper>().solveInPlace(wa.head(nsing));
+
+ // restore
+ sdiag = s.diagonal();
+ s.diagonal() = x;
+
+ /* permute the components of z back to components of x. */
+ for (j = 0; j < n; ++j) x[ipvt[j]] = wa[j];
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
diff --git a/src/EigenUnsupported/src/NonLinearOptimization/r1mpyq.h b/src/EigenUnsupported/src/NonLinearOptimization/r1mpyq.h
new file mode 100644
index 0000000..36ff700
--- /dev/null
+++ b/src/EigenUnsupported/src/NonLinearOptimization/r1mpyq.h
@@ -0,0 +1,30 @@
+namespace Eigen {
+
+namespace internal {
+
+// TODO : move this to GivensQR once there's such a thing in Eigen
+
+template <typename Scalar>
+void r1mpyq(DenseIndex m, DenseIndex n, Scalar *a, const std::vector<JacobiRotation<Scalar> > &v_givens, const std::vector<JacobiRotation<Scalar> > &w_givens)
+{
+ typedef DenseIndex Index;
+
+ /* apply the first set of givens rotations to a. */
+ for (Index j = n-2; j>=0; --j)
+ for (Index i = 0; i<m; ++i) {
+ Scalar temp = v_givens[j].c() * a[i+m*j] - v_givens[j].s() * a[i+m*(n-1)];
+ a[i+m*(n-1)] = v_givens[j].s() * a[i+m*j] + v_givens[j].c() * a[i+m*(n-1)];
+ a[i+m*j] = temp;
+ }
+ /* apply the second set of givens rotations to a. */
+ for (Index j = 0; j<n-1; ++j)
+ for (Index i = 0; i<m; ++i) {
+ Scalar temp = w_givens[j].c() * a[i+m*j] + w_givens[j].s() * a[i+m*(n-1)];
+ a[i+m*(n-1)] = -w_givens[j].s() * a[i+m*j] + w_givens[j].c() * a[i+m*(n-1)];
+ a[i+m*j] = temp;
+ }
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
diff --git a/src/EigenUnsupported/src/NonLinearOptimization/r1updt.h b/src/EigenUnsupported/src/NonLinearOptimization/r1updt.h
new file mode 100644
index 0000000..09fc652
--- /dev/null
+++ b/src/EigenUnsupported/src/NonLinearOptimization/r1updt.h
@@ -0,0 +1,99 @@
+namespace Eigen {
+
+namespace internal {
+
+template <typename Scalar>
+void r1updt(
+ Matrix< Scalar, Dynamic, Dynamic > &s,
+ const Matrix< Scalar, Dynamic, 1> &u,
+ std::vector<JacobiRotation<Scalar> > &v_givens,
+ std::vector<JacobiRotation<Scalar> > &w_givens,
+ Matrix< Scalar, Dynamic, 1> &v,
+ Matrix< Scalar, Dynamic, 1> &w,
+ bool *sing)
+{
+ typedef DenseIndex Index;
+ const JacobiRotation<Scalar> IdentityRotation = JacobiRotation<Scalar>(1,0);
+
+ /* Local variables */
+ const Index m = s.rows();
+ const Index n = s.cols();
+ Index i, j=1;
+ Scalar temp;
+ JacobiRotation<Scalar> givens;
+
+ // r1updt had a broader usecase, but we don't use it here. And, more
+ // importantly, we can not test it.
+ eigen_assert(m==n);
+ eigen_assert(u.size()==m);
+ eigen_assert(v.size()==n);
+ eigen_assert(w.size()==n);
+
+ /* move the nontrivial part of the last column of s into w. */
+ w[n-1] = s(n-1,n-1);
+
+ /* rotate the vector v into a multiple of the n-th unit vector */
+ /* in such a way that a spike is introduced into w. */
+ for (j=n-2; j>=0; --j) {
+ w[j] = 0.;
+ if (v[j] != 0.) {
+ /* determine a givens rotation which eliminates the */
+ /* j-th element of v. */
+ givens.makeGivens(-v[n-1], v[j]);
+
+ /* apply the transformation to v and store the information */
+ /* necessary to recover the givens rotation. */
+ v[n-1] = givens.s() * v[j] + givens.c() * v[n-1];
+ v_givens[j] = givens;
+
+ /* apply the transformation to s and extend the spike in w. */
+ for (i = j; i < m; ++i) {
+ temp = givens.c() * s(j,i) - givens.s() * w[i];
+ w[i] = givens.s() * s(j,i) + givens.c() * w[i];
+ s(j,i) = temp;
+ }
+ } else
+ v_givens[j] = IdentityRotation;
+ }
+
+ /* add the spike from the rank 1 update to w. */
+ w += v[n-1] * u;
+
+ /* eliminate the spike. */
+ *sing = false;
+ for (j = 0; j < n-1; ++j) {
+ if (w[j] != 0.) {
+ /* determine a givens rotation which eliminates the */
+ /* j-th element of the spike. */
+ givens.makeGivens(-s(j,j), w[j]);
+
+ /* apply the transformation to s and reduce the spike in w. */
+ for (i = j; i < m; ++i) {
+ temp = givens.c() * s(j,i) + givens.s() * w[i];
+ w[i] = -givens.s() * s(j,i) + givens.c() * w[i];
+ s(j,i) = temp;
+ }
+
+ /* store the information necessary to recover the */
+ /* givens rotation. */
+ w_givens[j] = givens;
+ } else
+ v_givens[j] = IdentityRotation;
+
+ /* test for zero diagonal elements in the output s. */
+ if (s(j,j) == 0.) {
+ *sing = true;
+ }
+ }
+ /* move w back into the last column of the output s. */
+ s(n-1,n-1) = w[n-1];
+
+ if (s(j,j) == 0.) {
+ *sing = true;
+ }
+ return;
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
diff --git a/src/EigenUnsupported/src/NonLinearOptimization/rwupdt.h b/src/EigenUnsupported/src/NonLinearOptimization/rwupdt.h
new file mode 100644
index 0000000..6ebf856
--- /dev/null
+++ b/src/EigenUnsupported/src/NonLinearOptimization/rwupdt.h
@@ -0,0 +1,49 @@
+namespace Eigen {
+
+namespace internal {
+
+template <typename Scalar>
+void rwupdt(
+ Matrix< Scalar, Dynamic, Dynamic > &r,
+ const Matrix< Scalar, Dynamic, 1> &w,
+ Matrix< Scalar, Dynamic, 1> &b,
+ Scalar alpha)
+{
+ typedef DenseIndex Index;
+
+ const Index n = r.cols();
+ eigen_assert(r.rows()>=n);
+ std::vector<JacobiRotation<Scalar> > givens(n);
+
+ /* Local variables */
+ Scalar temp, rowj;
+
+ /* Function Body */
+ for (Index j = 0; j < n; ++j) {
+ rowj = w[j];
+
+ /* apply the previous transformations to */
+ /* r(i,j), i=0,1,...,j-1, and to w(j). */
+ for (Index i = 0; i < j; ++i) {
+ temp = givens[i].c() * r(i,j) + givens[i].s() * rowj;
+ rowj = -givens[i].s() * r(i,j) + givens[i].c() * rowj;
+ r(i,j) = temp;
+ }
+
+ /* determine a givens rotation which eliminates w(j). */
+ givens[j].makeGivens(-r(j,j), rowj);
+
+ if (rowj == 0.)
+ continue; // givens[j] is identity
+
+ /* apply the current transformation to r(j,j), b(j), and alpha. */
+ r(j,j) = givens[j].c() * r(j,j) + givens[j].s() * rowj;
+ temp = givens[j].c() * b[j] + givens[j].s() * alpha;
+ alpha = -givens[j].s() * b[j] + givens[j].c() * alpha;
+ b[j] = temp;
+ }
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
diff --git a/src/EigenUnsupported/src/NumericalDiff/NumericalDiff.h b/src/EigenUnsupported/src/NumericalDiff/NumericalDiff.h
new file mode 100644
index 0000000..ea5d8bc
--- /dev/null
+++ b/src/EigenUnsupported/src/NumericalDiff/NumericalDiff.h
@@ -0,0 +1,130 @@
+// -*- coding: utf-8
+// vim: set fileencoding=utf-8
+
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Thomas Capricelli <orzel@freehackers.org>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_NUMERICAL_DIFF_H
+#define EIGEN_NUMERICAL_DIFF_H
+
+namespace Eigen {
+
+enum NumericalDiffMode {
+ Forward,
+ Central
+};
+
+
+/**
+ * This class allows you to add a method df() to your functor, which will
+ * use numerical differentiation to compute an approximate of the
+ * derivative for the functor. Of course, if you have an analytical form
+ * for the derivative, you should rather implement df() by yourself.
+ *
+ * More information on
+ * http://en.wikipedia.org/wiki/Numerical_differentiation
+ *
+ * Currently only "Forward" and "Central" scheme are implemented.
+ */
+template<typename _Functor, NumericalDiffMode mode=Forward>
+class NumericalDiff : public _Functor
+{
+public:
+ typedef _Functor Functor;
+ typedef typename Functor::Scalar Scalar;
+ typedef typename Functor::InputType InputType;
+ typedef typename Functor::ValueType ValueType;
+ typedef typename Functor::JacobianType JacobianType;
+
+ NumericalDiff(Scalar _epsfcn=0.) : Functor(), epsfcn(_epsfcn) {}
+ NumericalDiff(const Functor& f, Scalar _epsfcn=0.) : Functor(f), epsfcn(_epsfcn) {}
+
+ // forward constructors
+ template<typename T0>
+ NumericalDiff(const T0& a0) : Functor(a0), epsfcn(0) {}
+ template<typename T0, typename T1>
+ NumericalDiff(const T0& a0, const T1& a1) : Functor(a0, a1), epsfcn(0) {}
+ template<typename T0, typename T1, typename T2>
+ NumericalDiff(const T0& a0, const T1& a1, const T2& a2) : Functor(a0, a1, a2), epsfcn(0) {}
+
+ enum {
+ InputsAtCompileTime = Functor::InputsAtCompileTime,
+ ValuesAtCompileTime = Functor::ValuesAtCompileTime
+ };
+
+ /**
+ * return the number of evaluation of functor
+ */
+ int df(const InputType& _x, JacobianType &jac) const
+ {
+ using std::sqrt;
+ using std::abs;
+ /* Local variables */
+ Scalar h;
+ int nfev=0;
+ const typename InputType::Index n = _x.size();
+ const Scalar eps = sqrt(((std::max)(epsfcn,NumTraits<Scalar>::epsilon() )));
+ ValueType val1, val2;
+ InputType x = _x;
+ // TODO : we should do this only if the size is not already known
+ val1.resize(Functor::values());
+ val2.resize(Functor::values());
+
+ // initialization
+ switch(mode) {
+ case Forward:
+ // compute f(x)
+ Functor::operator()(x, val1); nfev++;
+ break;
+ case Central:
+ // do nothing
+ break;
+ default:
+ eigen_assert(false);
+ };
+
+ // Function Body
+ for (int j = 0; j < n; ++j) {
+ h = eps * abs(x[j]);
+ if (h == 0.) {
+ h = eps;
+ }
+ switch(mode) {
+ case Forward:
+ x[j] += h;
+ Functor::operator()(x, val2);
+ nfev++;
+ x[j] = _x[j];
+ jac.col(j) = (val2-val1)/h;
+ break;
+ case Central:
+ x[j] += h;
+ Functor::operator()(x, val2); nfev++;
+ x[j] -= 2*h;
+ Functor::operator()(x, val1); nfev++;
+ x[j] = _x[j];
+ jac.col(j) = (val2-val1)/(2*h);
+ break;
+ default:
+ eigen_assert(false);
+ };
+ }
+ return nfev;
+ }
+private:
+ Scalar epsfcn;
+
+ NumericalDiff& operator=(const NumericalDiff&);
+};
+
+} // end namespace Eigen
+
+//vim: ai ts=4 sts=4 et sw=4
+#endif // EIGEN_NUMERICAL_DIFF_H
+
diff --git a/src/EigenUnsupported/src/Polynomials/Companion.h b/src/EigenUnsupported/src/Polynomials/Companion.h
new file mode 100644
index 0000000..59a15b0
--- /dev/null
+++ b/src/EigenUnsupported/src/Polynomials/Companion.h
@@ -0,0 +1,280 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Manuel Yguel <manuel.yguel@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPANION_H
+#define EIGEN_COMPANION_H
+
+// This file requires the user to include
+// * Eigen/Core
+// * Eigen/src/PolynomialSolver.h
+
+namespace Eigen {
+
+namespace internal {
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+
+template<int Size>
+struct decrement_if_fixed_size
+{
+ enum {
+ ret = (Size == Dynamic) ? Dynamic : Size-1 };
+};
+
+#endif
+
+template< typename _Scalar, int _Deg >
+class companion
+{
+ public:
+ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_Deg==Dynamic ? Dynamic : _Deg)
+
+ enum {
+ Deg = _Deg,
+ Deg_1=decrement_if_fixed_size<Deg>::ret
+ };
+
+ typedef _Scalar Scalar;
+ typedef typename NumTraits<Scalar>::Real RealScalar;
+ typedef Matrix<Scalar, Deg, 1> RightColumn;
+ //typedef DiagonalMatrix< Scalar, Deg_1, Deg_1 > BottomLeftDiagonal;
+ typedef Matrix<Scalar, Deg_1, 1> BottomLeftDiagonal;
+
+ typedef Matrix<Scalar, Deg, Deg> DenseCompanionMatrixType;
+ typedef Matrix< Scalar, _Deg, Deg_1 > LeftBlock;
+ typedef Matrix< Scalar, Deg_1, Deg_1 > BottomLeftBlock;
+ typedef Matrix< Scalar, 1, Deg_1 > LeftBlockFirstRow;
+
+ typedef DenseIndex Index;
+
+ public:
+ EIGEN_STRONG_INLINE const _Scalar operator()(Index row, Index col ) const
+ {
+ if( m_bl_diag.rows() > col )
+ {
+ if( 0 < row ){ return m_bl_diag[col]; }
+ else{ return 0; }
+ }
+ else{ return m_monic[row]; }
+ }
+
+ public:
+ template<typename VectorType>
+ void setPolynomial( const VectorType& poly )
+ {
+ const Index deg = poly.size()-1;
+ m_monic = -poly.head(deg)/poly[deg];
+ m_bl_diag.setOnes(deg-1);
+ }
+
+ template<typename VectorType>
+ companion( const VectorType& poly ){
+ setPolynomial( poly ); }
+
+ public:
+ DenseCompanionMatrixType denseMatrix() const
+ {
+ const Index deg = m_monic.size();
+ const Index deg_1 = deg-1;
+ DenseCompanionMatrixType companMat(deg,deg);
+ companMat <<
+ ( LeftBlock(deg,deg_1)
+ << LeftBlockFirstRow::Zero(1,deg_1),
+ BottomLeftBlock::Identity(deg-1,deg-1)*m_bl_diag.asDiagonal() ).finished()
+ , m_monic;
+ return companMat;
+ }
+
+
+
+ protected:
+ /** Helper function for the balancing algorithm.
+ * \returns true if the row and the column, having colNorm and rowNorm
+ * as norms, are balanced, false otherwise.
+ * colB and rowB are respectively the multipliers for
+ * the column and the row in order to balance them.
+ * */
+ bool balanced( RealScalar colNorm, RealScalar rowNorm,
+ bool& isBalanced, RealScalar& colB, RealScalar& rowB );
+
+ /** Helper function for the balancing algorithm.
+ * \returns true if the row and the column, having colNorm and rowNorm
+ * as norms, are balanced, false otherwise.
+ * colB and rowB are respectively the multipliers for
+ * the column and the row in order to balance them.
+ * */
+ bool balancedR( RealScalar colNorm, RealScalar rowNorm,
+ bool& isBalanced, RealScalar& colB, RealScalar& rowB );
+
+ public:
+ /**
+ * Balancing algorithm from B. N. PARLETT and C. REINSCH (1969)
+ * "Balancing a matrix for calculation of eigenvalues and eigenvectors"
+ * adapted to the case of companion matrices.
+ * A matrix with non zero row and non zero column is balanced
+ * for a certain norm if the i-th row and the i-th column
+ * have same norm for all i.
+ */
+ void balance();
+
+ protected:
+ RightColumn m_monic;
+ BottomLeftDiagonal m_bl_diag;
+};
+
+
+
+template< typename _Scalar, int _Deg >
+inline
+bool companion<_Scalar,_Deg>::balanced( RealScalar colNorm, RealScalar rowNorm,
+ bool& isBalanced, RealScalar& colB, RealScalar& rowB )
+{
+ if( RealScalar(0) == colNorm || RealScalar(0) == rowNorm
+ || !(numext::isfinite)(colNorm) || !(numext::isfinite)(rowNorm)){
+ return true;
+ }
+ else
+ {
+ //To find the balancing coefficients, if the radix is 2,
+ //one finds \f$ \sigma \f$ such that
+ // \f$ 2^{2\sigma-1} < rowNorm / colNorm \le 2^{2\sigma+1} \f$
+ // then the balancing coefficient for the row is \f$ 1/2^{\sigma} \f$
+ // and the balancing coefficient for the column is \f$ 2^{\sigma} \f$
+ const RealScalar radix = RealScalar(2);
+ const RealScalar radix2 = RealScalar(4);
+
+ rowB = rowNorm / radix;
+ colB = RealScalar(1);
+ const RealScalar s = colNorm + rowNorm;
+
+ // Find sigma s.t. rowNorm / 2 <= 2^(2*sigma) * colNorm
+ RealScalar scout = colNorm;
+ while (scout < rowB)
+ {
+ colB *= radix;
+ scout *= radix2;
+ }
+
+ // We now have an upper-bound for sigma, try to lower it.
+ // Find sigma s.t. 2^(2*sigma) * colNorm / 2 < rowNorm
+ scout = colNorm * (colB / radix) * colB; // Avoid overflow.
+ while (scout >= rowNorm)
+ {
+ colB /= radix;
+ scout /= radix2;
+ }
+
+ // This line is used to avoid insubstantial balancing.
+ if ((rowNorm + radix * scout) < RealScalar(0.95) * s * colB)
+ {
+ isBalanced = false;
+ rowB = RealScalar(1) / colB;
+ return false;
+ }
+ else
+ {
+ return true;
+ }
+ }
+}
+
+template< typename _Scalar, int _Deg >
+inline
+bool companion<_Scalar,_Deg>::balancedR( RealScalar colNorm, RealScalar rowNorm,
+ bool& isBalanced, RealScalar& colB, RealScalar& rowB )
+{
+ if( RealScalar(0) == colNorm || RealScalar(0) == rowNorm ){ return true; }
+ else
+ {
+ /**
+ * Set the norm of the column and the row to the geometric mean
+ * of the row and column norm
+ */
+ const RealScalar q = colNorm/rowNorm;
+ if( !isApprox( q, _Scalar(1) ) )
+ {
+ rowB = sqrt( colNorm/rowNorm );
+ colB = RealScalar(1)/rowB;
+
+ isBalanced = false;
+ return false;
+ }
+ else{
+ return true; }
+ }
+}
+
+
+template< typename _Scalar, int _Deg >
+void companion<_Scalar,_Deg>::balance()
+{
+ using std::abs;
+ EIGEN_STATIC_ASSERT( Deg == Dynamic || 1 < Deg, YOU_MADE_A_PROGRAMMING_MISTAKE );
+ const Index deg = m_monic.size();
+ const Index deg_1 = deg-1;
+
+ bool hasConverged=false;
+ while( !hasConverged )
+ {
+ hasConverged = true;
+ RealScalar colNorm,rowNorm;
+ RealScalar colB,rowB;
+
+ //First row, first column excluding the diagonal
+ //==============================================
+ colNorm = abs(m_bl_diag[0]);
+ rowNorm = abs(m_monic[0]);
+
+ //Compute balancing of the row and the column
+ if( !balanced( colNorm, rowNorm, hasConverged, colB, rowB ) )
+ {
+ m_bl_diag[0] *= colB;
+ m_monic[0] *= rowB;
+ }
+
+ //Middle rows and columns excluding the diagonal
+ //==============================================
+ for( Index i=1; i<deg_1; ++i )
+ {
+ // column norm, excluding the diagonal
+ colNorm = abs(m_bl_diag[i]);
+
+ // row norm, excluding the diagonal
+ rowNorm = abs(m_bl_diag[i-1]) + abs(m_monic[i]);
+
+ //Compute balancing of the row and the column
+ if( !balanced( colNorm, rowNorm, hasConverged, colB, rowB ) )
+ {
+ m_bl_diag[i] *= colB;
+ m_bl_diag[i-1] *= rowB;
+ m_monic[i] *= rowB;
+ }
+ }
+
+ //Last row, last column excluding the diagonal
+ //============================================
+ const Index ebl = m_bl_diag.size()-1;
+ VectorBlock<RightColumn,Deg_1> headMonic( m_monic, 0, deg_1 );
+ colNorm = headMonic.array().abs().sum();
+ rowNorm = abs( m_bl_diag[ebl] );
+
+ //Compute balancing of the row and the column
+ if( !balanced( colNorm, rowNorm, hasConverged, colB, rowB ) )
+ {
+ headMonic *= colB;
+ m_bl_diag[ebl] *= rowB;
+ }
+ }
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_COMPANION_H
diff --git a/src/EigenUnsupported/src/Polynomials/PolynomialSolver.h b/src/EigenUnsupported/src/Polynomials/PolynomialSolver.h
new file mode 100644
index 0000000..5e0ecbb
--- /dev/null
+++ b/src/EigenUnsupported/src/Polynomials/PolynomialSolver.h
@@ -0,0 +1,428 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Manuel Yguel <manuel.yguel@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_POLYNOMIAL_SOLVER_H
+#define EIGEN_POLYNOMIAL_SOLVER_H
+
+namespace Eigen {
+
+/** \ingroup Polynomials_Module
+ * \class PolynomialSolverBase.
+ *
+ * \brief Defined to be inherited by polynomial solvers: it provides
+ * convenient methods such as
+ * - real roots,
+ * - greatest, smallest complex roots,
+ * - real roots with greatest, smallest absolute real value,
+ * - greatest, smallest real roots.
+ *
+ * It stores the set of roots as a vector of complexes.
+ *
+ */
+template< typename _Scalar, int _Deg >
+class PolynomialSolverBase
+{
+ public:
+ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_Deg==Dynamic ? Dynamic : _Deg)
+
+ typedef _Scalar Scalar;
+ typedef typename NumTraits<Scalar>::Real RealScalar;
+ typedef std::complex<RealScalar> RootType;
+ typedef Matrix<RootType,_Deg,1> RootsType;
+
+ typedef DenseIndex Index;
+
+ protected:
+ template< typename OtherPolynomial >
+ inline void setPolynomial( const OtherPolynomial& poly ){
+ m_roots.resize(poly.size()-1); }
+
+ public:
+ template< typename OtherPolynomial >
+ inline PolynomialSolverBase( const OtherPolynomial& poly ){
+ setPolynomial( poly() ); }
+
+ inline PolynomialSolverBase(){}
+
+ public:
+ /** \returns the complex roots of the polynomial */
+ inline const RootsType& roots() const { return m_roots; }
+
+ public:
+ /** Clear and fills the back insertion sequence with the real roots of the polynomial
+ * i.e. the real part of the complex roots that have an imaginary part which
+ * absolute value is smaller than absImaginaryThreshold.
+ * absImaginaryThreshold takes the dummy_precision associated
+ * with the _Scalar template parameter of the PolynomialSolver class as the default value.
+ *
+ * \param[out] bi_seq : the back insertion sequence (stl concept)
+ * \param[in] absImaginaryThreshold : the maximum bound of the imaginary part of a complex
+ * number that is considered as real.
+ * */
+ template<typename Stl_back_insertion_sequence>
+ inline void realRoots( Stl_back_insertion_sequence& bi_seq,
+ const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision() ) const
+ {
+ using std::abs;
+ bi_seq.clear();
+ for(Index i=0; i<m_roots.size(); ++i )
+ {
+ if( abs( m_roots[i].imag() ) < absImaginaryThreshold ){
+ bi_seq.push_back( m_roots[i].real() ); }
+ }
+ }
+
+ protected:
+ template<typename squaredNormBinaryPredicate>
+ inline const RootType& selectComplexRoot_withRespectToNorm( squaredNormBinaryPredicate& pred ) const
+ {
+ Index res=0;
+ RealScalar norm2 = numext::abs2( m_roots[0] );
+ for( Index i=1; i<m_roots.size(); ++i )
+ {
+ const RealScalar currNorm2 = numext::abs2( m_roots[i] );
+ if( pred( currNorm2, norm2 ) ){
+ res=i; norm2=currNorm2; }
+ }
+ return m_roots[res];
+ }
+
+ public:
+ /**
+ * \returns the complex root with greatest norm.
+ */
+ inline const RootType& greatestRoot() const
+ {
+ std::greater<RealScalar> greater;
+ return selectComplexRoot_withRespectToNorm( greater );
+ }
+
+ /**
+ * \returns the complex root with smallest norm.
+ */
+ inline const RootType& smallestRoot() const
+ {
+ std::less<RealScalar> less;
+ return selectComplexRoot_withRespectToNorm( less );
+ }
+
+ protected:
+ template<typename squaredRealPartBinaryPredicate>
+ inline const RealScalar& selectRealRoot_withRespectToAbsRealPart(
+ squaredRealPartBinaryPredicate& pred,
+ bool& hasArealRoot,
+ const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision() ) const
+ {
+ using std::abs;
+ hasArealRoot = false;
+ Index res=0;
+ RealScalar abs2(0);
+
+ for( Index i=0; i<m_roots.size(); ++i )
+ {
+ if( abs( m_roots[i].imag() ) <= absImaginaryThreshold )
+ {
+ if( !hasArealRoot )
+ {
+ hasArealRoot = true;
+ res = i;
+ abs2 = m_roots[i].real() * m_roots[i].real();
+ }
+ else
+ {
+ const RealScalar currAbs2 = m_roots[i].real() * m_roots[i].real();
+ if( pred( currAbs2, abs2 ) )
+ {
+ abs2 = currAbs2;
+ res = i;
+ }
+ }
+ }
+ else if(!hasArealRoot)
+ {
+ if( abs( m_roots[i].imag() ) < abs( m_roots[res].imag() ) ){
+ res = i;}
+ }
+ }
+ return numext::real_ref(m_roots[res]);
+ }
+
+
+ template<typename RealPartBinaryPredicate>
+ inline const RealScalar& selectRealRoot_withRespectToRealPart(
+ RealPartBinaryPredicate& pred,
+ bool& hasArealRoot,
+ const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision() ) const
+ {
+ using std::abs;
+ hasArealRoot = false;
+ Index res=0;
+ RealScalar val(0);
+
+ for( Index i=0; i<m_roots.size(); ++i )
+ {
+ if( abs( m_roots[i].imag() ) <= absImaginaryThreshold )
+ {
+ if( !hasArealRoot )
+ {
+ hasArealRoot = true;
+ res = i;
+ val = m_roots[i].real();
+ }
+ else
+ {
+ const RealScalar curr = m_roots[i].real();
+ if( pred( curr, val ) )
+ {
+ val = curr;
+ res = i;
+ }
+ }
+ }
+ else
+ {
+ if( abs( m_roots[i].imag() ) < abs( m_roots[res].imag() ) ){
+ res = i; }
+ }
+ }
+ return numext::real_ref(m_roots[res]);
+ }
+
+ public:
+ /**
+ * \returns a real root with greatest absolute magnitude.
+ * A real root is defined as the real part of a complex root with absolute imaginary
+ * part smallest than absImaginaryThreshold.
+ * absImaginaryThreshold takes the dummy_precision associated
+ * with the _Scalar template parameter of the PolynomialSolver class as the default value.
+ * If no real root is found the boolean hasArealRoot is set to false and the real part of
+ * the root with smallest absolute imaginary part is returned instead.
+ *
+ * \param[out] hasArealRoot : boolean true if a real root is found according to the
+ * absImaginaryThreshold criterion, false otherwise.
+ * \param[in] absImaginaryThreshold : threshold on the absolute imaginary part to decide
+ * whether or not a root is real.
+ */
+ inline const RealScalar& absGreatestRealRoot(
+ bool& hasArealRoot,
+ const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision() ) const
+ {
+ std::greater<RealScalar> greater;
+ return selectRealRoot_withRespectToAbsRealPart( greater, hasArealRoot, absImaginaryThreshold );
+ }
+
+
+ /**
+ * \returns a real root with smallest absolute magnitude.
+ * A real root is defined as the real part of a complex root with absolute imaginary
+ * part smallest than absImaginaryThreshold.
+ * absImaginaryThreshold takes the dummy_precision associated
+ * with the _Scalar template parameter of the PolynomialSolver class as the default value.
+ * If no real root is found the boolean hasArealRoot is set to false and the real part of
+ * the root with smallest absolute imaginary part is returned instead.
+ *
+ * \param[out] hasArealRoot : boolean true if a real root is found according to the
+ * absImaginaryThreshold criterion, false otherwise.
+ * \param[in] absImaginaryThreshold : threshold on the absolute imaginary part to decide
+ * whether or not a root is real.
+ */
+ inline const RealScalar& absSmallestRealRoot(
+ bool& hasArealRoot,
+ const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision() ) const
+ {
+ std::less<RealScalar> less;
+ return selectRealRoot_withRespectToAbsRealPart( less, hasArealRoot, absImaginaryThreshold );
+ }
+
+
+ /**
+ * \returns the real root with greatest value.
+ * A real root is defined as the real part of a complex root with absolute imaginary
+ * part smallest than absImaginaryThreshold.
+ * absImaginaryThreshold takes the dummy_precision associated
+ * with the _Scalar template parameter of the PolynomialSolver class as the default value.
+ * If no real root is found the boolean hasArealRoot is set to false and the real part of
+ * the root with smallest absolute imaginary part is returned instead.
+ *
+ * \param[out] hasArealRoot : boolean true if a real root is found according to the
+ * absImaginaryThreshold criterion, false otherwise.
+ * \param[in] absImaginaryThreshold : threshold on the absolute imaginary part to decide
+ * whether or not a root is real.
+ */
+ inline const RealScalar& greatestRealRoot(
+ bool& hasArealRoot,
+ const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision() ) const
+ {
+ std::greater<RealScalar> greater;
+ return selectRealRoot_withRespectToRealPart( greater, hasArealRoot, absImaginaryThreshold );
+ }
+
+
+ /**
+ * \returns the real root with smallest value.
+ * A real root is defined as the real part of a complex root with absolute imaginary
+ * part smallest than absImaginaryThreshold.
+ * absImaginaryThreshold takes the dummy_precision associated
+ * with the _Scalar template parameter of the PolynomialSolver class as the default value.
+ * If no real root is found the boolean hasArealRoot is set to false and the real part of
+ * the root with smallest absolute imaginary part is returned instead.
+ *
+ * \param[out] hasArealRoot : boolean true if a real root is found according to the
+ * absImaginaryThreshold criterion, false otherwise.
+ * \param[in] absImaginaryThreshold : threshold on the absolute imaginary part to decide
+ * whether or not a root is real.
+ */
+ inline const RealScalar& smallestRealRoot(
+ bool& hasArealRoot,
+ const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision() ) const
+ {
+ std::less<RealScalar> less;
+ return selectRealRoot_withRespectToRealPart( less, hasArealRoot, absImaginaryThreshold );
+ }
+
+ protected:
+ RootsType m_roots;
+};
+
+#define EIGEN_POLYNOMIAL_SOLVER_BASE_INHERITED_TYPES( BASE ) \
+ typedef typename BASE::Scalar Scalar; \
+ typedef typename BASE::RealScalar RealScalar; \
+ typedef typename BASE::RootType RootType; \
+ typedef typename BASE::RootsType RootsType;
+
+
+
+/** \ingroup Polynomials_Module
+ *
+ * \class PolynomialSolver
+ *
+ * \brief A polynomial solver
+ *
+ * Computes the complex roots of a real polynomial.
+ *
+ * \param _Scalar the scalar type, i.e., the type of the polynomial coefficients
+ * \param _Deg the degree of the polynomial, can be a compile time value or Dynamic.
+ * Notice that the number of polynomial coefficients is _Deg+1.
+ *
+ * This class implements a polynomial solver and provides convenient methods such as
+ * - real roots,
+ * - greatest, smallest complex roots,
+ * - real roots with greatest, smallest absolute real value.
+ * - greatest, smallest real roots.
+ *
+ * WARNING: this polynomial solver is experimental, part of the unsupported Eigen modules.
+ *
+ *
+ * Currently a QR algorithm is used to compute the eigenvalues of the companion matrix of
+ * the polynomial to compute its roots.
+ * This supposes that the complex moduli of the roots are all distinct: e.g. there should
+ * be no multiple roots or conjugate roots for instance.
+ * With 32bit (float) floating types this problem shows up frequently.
+ * However, almost always, correct accuracy is reached even in these cases for 64bit
+ * (double) floating types and small polynomial degree (<20).
+ */
+template<typename _Scalar, int _Deg>
+class PolynomialSolver : public PolynomialSolverBase<_Scalar,_Deg>
+{
+ public:
+ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_Deg==Dynamic ? Dynamic : _Deg)
+
+ typedef PolynomialSolverBase<_Scalar,_Deg> PS_Base;
+ EIGEN_POLYNOMIAL_SOLVER_BASE_INHERITED_TYPES( PS_Base )
+
+ typedef Matrix<Scalar,_Deg,_Deg> CompanionMatrixType;
+ typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
+ ComplexEigenSolver<CompanionMatrixType>,
+ EigenSolver<CompanionMatrixType> >::type EigenSolverType;
+ typedef typename internal::conditional<NumTraits<Scalar>::IsComplex, Scalar, std::complex<Scalar> >::type ComplexScalar;
+
+ public:
+ /** Computes the complex roots of a new polynomial. */
+ template< typename OtherPolynomial >
+ void compute( const OtherPolynomial& poly )
+ {
+ eigen_assert( Scalar(0) != poly[poly.size()-1] );
+ eigen_assert( poly.size() > 1 );
+ if(poly.size() > 2 )
+ {
+ internal::companion<Scalar,_Deg> companion( poly );
+ companion.balance();
+ m_eigenSolver.compute( companion.denseMatrix() );
+ m_roots = m_eigenSolver.eigenvalues();
+ // cleanup noise in imaginary part of real roots:
+ // if the imaginary part is rather small compared to the real part
+ // and that cancelling the imaginary part yield a smaller evaluation,
+ // then it's safe to keep the real part only.
+ RealScalar coarse_prec = RealScalar(std::pow(4,poly.size()+1))*NumTraits<RealScalar>::epsilon();
+ for(Index i = 0; i<m_roots.size(); ++i)
+ {
+ if( internal::isMuchSmallerThan(numext::abs(numext::imag(m_roots[i])),
+ numext::abs(numext::real(m_roots[i])),
+ coarse_prec) )
+ {
+ ComplexScalar as_real_root = ComplexScalar(numext::real(m_roots[i]));
+ if( numext::abs(poly_eval(poly, as_real_root))
+ <= numext::abs(poly_eval(poly, m_roots[i])))
+ {
+ m_roots[i] = as_real_root;
+ }
+ }
+ }
+ }
+ else if(poly.size () == 2)
+ {
+ m_roots.resize(1);
+ m_roots[0] = -poly[0]/poly[1];
+ }
+ }
+
+ public:
+ template< typename OtherPolynomial >
+ inline PolynomialSolver( const OtherPolynomial& poly ){
+ compute( poly ); }
+
+ inline PolynomialSolver(){}
+
+ protected:
+ using PS_Base::m_roots;
+ EigenSolverType m_eigenSolver;
+};
+
+
+template< typename _Scalar >
+class PolynomialSolver<_Scalar,1> : public PolynomialSolverBase<_Scalar,1>
+{
+ public:
+ typedef PolynomialSolverBase<_Scalar,1> PS_Base;
+ EIGEN_POLYNOMIAL_SOLVER_BASE_INHERITED_TYPES( PS_Base )
+
+ public:
+ /** Computes the complex roots of a new polynomial. */
+ template< typename OtherPolynomial >
+ void compute( const OtherPolynomial& poly )
+ {
+ eigen_assert( poly.size() == 2 );
+ eigen_assert( Scalar(0) != poly[1] );
+ m_roots[0] = -poly[0]/poly[1];
+ }
+
+ public:
+ template< typename OtherPolynomial >
+ inline PolynomialSolver( const OtherPolynomial& poly ){
+ compute( poly ); }
+
+ inline PolynomialSolver(){}
+
+ protected:
+ using PS_Base::m_roots;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_POLYNOMIAL_SOLVER_H
diff --git a/src/EigenUnsupported/src/Polynomials/PolynomialUtils.h b/src/EigenUnsupported/src/Polynomials/PolynomialUtils.h
new file mode 100644
index 0000000..394e857
--- /dev/null
+++ b/src/EigenUnsupported/src/Polynomials/PolynomialUtils.h
@@ -0,0 +1,143 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Manuel Yguel <manuel.yguel@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_POLYNOMIAL_UTILS_H
+#define EIGEN_POLYNOMIAL_UTILS_H
+
+namespace Eigen {
+
+/** \ingroup Polynomials_Module
+ * \returns the evaluation of the polynomial at x using Horner algorithm.
+ *
+ * \param[in] poly : the vector of coefficients of the polynomial ordered
+ * by degrees i.e. poly[i] is the coefficient of degree i of the polynomial
+ * e.g. \f$ 1 + 3x^2 \f$ is stored as a vector \f$ [ 1, 0, 3 ] \f$.
+ * \param[in] x : the value to evaluate the polynomial at.
+ *
+ * \note for stability:
+ * \f$ |x| \le 1 \f$
+ */
+template <typename Polynomials, typename T>
+inline
+T poly_eval_horner( const Polynomials& poly, const T& x )
+{
+ T val=poly[poly.size()-1];
+ for(DenseIndex i=poly.size()-2; i>=0; --i ){
+ val = val*x + poly[i]; }
+ return val;
+}
+
+/** \ingroup Polynomials_Module
+ * \returns the evaluation of the polynomial at x using stabilized Horner algorithm.
+ *
+ * \param[in] poly : the vector of coefficients of the polynomial ordered
+ * by degrees i.e. poly[i] is the coefficient of degree i of the polynomial
+ * e.g. \f$ 1 + 3x^2 \f$ is stored as a vector \f$ [ 1, 0, 3 ] \f$.
+ * \param[in] x : the value to evaluate the polynomial at.
+ */
+template <typename Polynomials, typename T>
+inline
+T poly_eval( const Polynomials& poly, const T& x )
+{
+ typedef typename NumTraits<T>::Real Real;
+
+ if( numext::abs2( x ) <= Real(1) ){
+ return poly_eval_horner( poly, x ); }
+ else
+ {
+ T val=poly[0];
+ T inv_x = T(1)/x;
+ for( DenseIndex i=1; i<poly.size(); ++i ){
+ val = val*inv_x + poly[i]; }
+
+ return numext::pow(x,(T)(poly.size()-1)) * val;
+ }
+}
+
+/** \ingroup Polynomials_Module
+ * \returns a maximum bound for the absolute value of any root of the polynomial.
+ *
+ * \param[in] poly : the vector of coefficients of the polynomial ordered
+ * by degrees i.e. poly[i] is the coefficient of degree i of the polynomial
+ * e.g. \f$ 1 + 3x^2 \f$ is stored as a vector \f$ [ 1, 0, 3 ] \f$.
+ *
+ * \pre
+ * the leading coefficient of the input polynomial poly must be non zero
+ */
+template <typename Polynomial>
+inline
+typename NumTraits<typename Polynomial::Scalar>::Real cauchy_max_bound( const Polynomial& poly )
+{
+ using std::abs;
+ typedef typename Polynomial::Scalar Scalar;
+ typedef typename NumTraits<Scalar>::Real Real;
+
+ eigen_assert( Scalar(0) != poly[poly.size()-1] );
+ const Scalar inv_leading_coeff = Scalar(1)/poly[poly.size()-1];
+ Real cb(0);
+
+ for( DenseIndex i=0; i<poly.size()-1; ++i ){
+ cb += abs(poly[i]*inv_leading_coeff); }
+ return cb + Real(1);
+}
+
+/** \ingroup Polynomials_Module
+ * \returns a minimum bound for the absolute value of any non zero root of the polynomial.
+ * \param[in] poly : the vector of coefficients of the polynomial ordered
+ * by degrees i.e. poly[i] is the coefficient of degree i of the polynomial
+ * e.g. \f$ 1 + 3x^2 \f$ is stored as a vector \f$ [ 1, 0, 3 ] \f$.
+ */
+template <typename Polynomial>
+inline
+typename NumTraits<typename Polynomial::Scalar>::Real cauchy_min_bound( const Polynomial& poly )
+{
+ using std::abs;
+ typedef typename Polynomial::Scalar Scalar;
+ typedef typename NumTraits<Scalar>::Real Real;
+
+ DenseIndex i=0;
+ while( i<poly.size()-1 && Scalar(0) == poly(i) ){ ++i; }
+ if( poly.size()-1 == i ){
+ return Real(1); }
+
+ const Scalar inv_min_coeff = Scalar(1)/poly[i];
+ Real cb(1);
+ for( DenseIndex j=i+1; j<poly.size(); ++j ){
+ cb += abs(poly[j]*inv_min_coeff); }
+ return Real(1)/cb;
+}
+
+/** \ingroup Polynomials_Module
+ * Given the roots of a polynomial compute the coefficients in the
+ * monomial basis of the monic polynomial with same roots and minimal degree.
+ * If RootVector is a vector of complexes, Polynomial should also be a vector
+ * of complexes.
+ * \param[in] rv : a vector containing the roots of a polynomial.
+ * \param[out] poly : the vector of coefficients of the polynomial ordered
+ * by degrees i.e. poly[i] is the coefficient of degree i of the polynomial
+ * e.g. \f$ 3 + x^2 \f$ is stored as a vector \f$ [ 3, 0, 1 ] \f$.
+ */
+template <typename RootVector, typename Polynomial>
+void roots_to_monicPolynomial( const RootVector& rv, Polynomial& poly )
+{
+
+ typedef typename Polynomial::Scalar Scalar;
+
+ poly.setZero( rv.size()+1 );
+ poly[0] = -rv[0]; poly[1] = Scalar(1);
+ for( DenseIndex i=1; i< rv.size(); ++i )
+ {
+ for( DenseIndex j=i+1; j>0; --j ){ poly[j] = poly[j-1] - rv[i]*poly[j]; }
+ poly[0] = -rv[i]*poly[0];
+ }
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_POLYNOMIAL_UTILS_H
diff --git a/src/EigenUnsupported/src/Skyline/SkylineInplaceLU.h b/src/EigenUnsupported/src/Skyline/SkylineInplaceLU.h
new file mode 100644
index 0000000..6d0370d
--- /dev/null
+++ b/src/EigenUnsupported/src/Skyline/SkylineInplaceLU.h
@@ -0,0 +1,352 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Guillaume Saupin <guillaume.saupin@cea.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SKYLINEINPLACELU_H
+#define EIGEN_SKYLINEINPLACELU_H
+
+namespace Eigen {
+
+/** \ingroup Skyline_Module
+ *
+ * \class SkylineInplaceLU
+ *
+ * \brief Inplace LU decomposition of a skyline matrix and associated features
+ *
+ * \param MatrixType the type of the matrix of which we are computing the LU factorization
+ *
+ */
+template<typename MatrixType>
+class SkylineInplaceLU {
+protected:
+ typedef typename MatrixType::Scalar Scalar;
+ typedef typename MatrixType::Index Index;
+
+ typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
+
+public:
+
+ /** Creates a LU object and compute the respective factorization of \a matrix using
+ * flags \a flags. */
+ SkylineInplaceLU(MatrixType& matrix, int flags = 0)
+ : /*m_matrix(matrix.rows(), matrix.cols()),*/ m_flags(flags), m_status(0), m_lu(matrix) {
+ m_precision = RealScalar(0.1) * Eigen::dummy_precision<RealScalar > ();
+ m_lu.IsRowMajor ? computeRowMajor() : compute();
+ }
+
+ /** Sets the relative threshold value used to prune zero coefficients during the decomposition.
+ *
+ * Setting a value greater than zero speeds up computation, and yields to an incomplete
+ * factorization with fewer non zero coefficients. Such approximate factors are especially
+ * useful to initialize an iterative solver.
+ *
+ * Note that the exact meaning of this parameter might depends on the actual
+ * backend. Moreover, not all backends support this feature.
+ *
+ * \sa precision() */
+ void setPrecision(RealScalar v) {
+ m_precision = v;
+ }
+
+ /** \returns the current precision.
+ *
+ * \sa setPrecision() */
+ RealScalar precision() const {
+ return m_precision;
+ }
+
+ /** Sets the flags. Possible values are:
+ * - CompleteFactorization
+ * - IncompleteFactorization
+ * - MemoryEfficient
+ * - one of the ordering methods
+ * - etc...
+ *
+ * \sa flags() */
+ void setFlags(int f) {
+ m_flags = f;
+ }
+
+ /** \returns the current flags */
+ int flags() const {
+ return m_flags;
+ }
+
+ void setOrderingMethod(int m) {
+ m_flags = m;
+ }
+
+ int orderingMethod() const {
+ return m_flags;
+ }
+
+ /** Computes/re-computes the LU factorization */
+ void compute();
+ void computeRowMajor();
+
+ /** \returns the lower triangular matrix L */
+ //inline const MatrixType& matrixL() const { return m_matrixL; }
+
+ /** \returns the upper triangular matrix U */
+ //inline const MatrixType& matrixU() const { return m_matrixU; }
+
+ template<typename BDerived, typename XDerived>
+ bool solve(const MatrixBase<BDerived> &b, MatrixBase<XDerived>* x,
+ const int transposed = 0) const;
+
+ /** \returns true if the factorization succeeded */
+ inline bool succeeded(void) const {
+ return m_succeeded;
+ }
+
+protected:
+ RealScalar m_precision;
+ int m_flags;
+ mutable int m_status;
+ bool m_succeeded;
+ MatrixType& m_lu;
+};
+
+/** Computes / recomputes the in place LU decomposition of the SkylineInplaceLU.
+ * using the default algorithm.
+ */
+template<typename MatrixType>
+//template<typename _Scalar>
+void SkylineInplaceLU<MatrixType>::compute() {
+ const size_t rows = m_lu.rows();
+ const size_t cols = m_lu.cols();
+
+ eigen_assert(rows == cols && "We do not (yet) support rectangular LU.");
+ eigen_assert(!m_lu.IsRowMajor && "LU decomposition does not work with rowMajor Storage");
+
+ for (Index row = 0; row < rows; row++) {
+ const double pivot = m_lu.coeffDiag(row);
+
+ //Lower matrix Columns update
+ const Index& col = row;
+ for (typename MatrixType::InnerLowerIterator lIt(m_lu, col); lIt; ++lIt) {
+ lIt.valueRef() /= pivot;
+ }
+
+ //Upper matrix update -> contiguous memory access
+ typename MatrixType::InnerLowerIterator lIt(m_lu, col);
+ for (Index rrow = row + 1; rrow < m_lu.rows(); rrow++) {
+ typename MatrixType::InnerUpperIterator uItPivot(m_lu, row);
+ typename MatrixType::InnerUpperIterator uIt(m_lu, rrow);
+ const double coef = lIt.value();
+
+ uItPivot += (rrow - row - 1);
+
+ //update upper part -> contiguous memory access
+ for (++uItPivot; uIt && uItPivot;) {
+ uIt.valueRef() -= uItPivot.value() * coef;
+
+ ++uIt;
+ ++uItPivot;
+ }
+ ++lIt;
+ }
+
+ //Upper matrix update -> non contiguous memory access
+ typename MatrixType::InnerLowerIterator lIt3(m_lu, col);
+ for (Index rrow = row + 1; rrow < m_lu.rows(); rrow++) {
+ typename MatrixType::InnerUpperIterator uItPivot(m_lu, row);
+ const double coef = lIt3.value();
+
+ //update lower part -> non contiguous memory access
+ for (Index i = 0; i < rrow - row - 1; i++) {
+ m_lu.coeffRefLower(rrow, row + i + 1) -= uItPivot.value() * coef;
+ ++uItPivot;
+ }
+ ++lIt3;
+ }
+ //update diag -> contiguous
+ typename MatrixType::InnerLowerIterator lIt2(m_lu, col);
+ for (Index rrow = row + 1; rrow < m_lu.rows(); rrow++) {
+
+ typename MatrixType::InnerUpperIterator uItPivot(m_lu, row);
+ typename MatrixType::InnerUpperIterator uIt(m_lu, rrow);
+ const double coef = lIt2.value();
+
+ uItPivot += (rrow - row - 1);
+ m_lu.coeffRefDiag(rrow) -= uItPivot.value() * coef;
+ ++lIt2;
+ }
+ }
+}
+
+template<typename MatrixType>
+void SkylineInplaceLU<MatrixType>::computeRowMajor() {
+ const size_t rows = m_lu.rows();
+ const size_t cols = m_lu.cols();
+
+ eigen_assert(rows == cols && "We do not (yet) support rectangular LU.");
+ eigen_assert(m_lu.IsRowMajor && "You're trying to apply rowMajor decomposition on a ColMajor matrix !");
+
+ for (Index row = 0; row < rows; row++) {
+ typename MatrixType::InnerLowerIterator llIt(m_lu, row);
+
+
+ for (Index col = llIt.col(); col < row; col++) {
+ if (m_lu.coeffExistLower(row, col)) {
+ const double diag = m_lu.coeffDiag(col);
+
+ typename MatrixType::InnerLowerIterator lIt(m_lu, row);
+ typename MatrixType::InnerUpperIterator uIt(m_lu, col);
+
+
+ const Index offset = lIt.col() - uIt.row();
+
+
+ Index stop = offset > 0 ? col - lIt.col() : col - uIt.row();
+
+ //#define VECTORIZE
+#ifdef VECTORIZE
+ Map<VectorXd > rowVal(lIt.valuePtr() + (offset > 0 ? 0 : -offset), stop);
+ Map<VectorXd > colVal(uIt.valuePtr() + (offset > 0 ? offset : 0), stop);
+
+
+ Scalar newCoeff = m_lu.coeffLower(row, col) - rowVal.dot(colVal);
+#else
+ if (offset > 0) //Skip zero value of lIt
+ uIt += offset;
+ else //Skip zero values of uIt
+ lIt += -offset;
+ Scalar newCoeff = m_lu.coeffLower(row, col);
+
+ for (Index k = 0; k < stop; ++k) {
+ const Scalar tmp = newCoeff;
+ newCoeff = tmp - lIt.value() * uIt.value();
+ ++lIt;
+ ++uIt;
+ }
+#endif
+
+ m_lu.coeffRefLower(row, col) = newCoeff / diag;
+ }
+ }
+
+ //Upper matrix update
+ const Index col = row;
+ typename MatrixType::InnerUpperIterator uuIt(m_lu, col);
+ for (Index rrow = uuIt.row(); rrow < col; rrow++) {
+
+ typename MatrixType::InnerLowerIterator lIt(m_lu, rrow);
+ typename MatrixType::InnerUpperIterator uIt(m_lu, col);
+ const Index offset = lIt.col() - uIt.row();
+
+ Index stop = offset > 0 ? rrow - lIt.col() : rrow - uIt.row();
+
+#ifdef VECTORIZE
+ Map<VectorXd > rowVal(lIt.valuePtr() + (offset > 0 ? 0 : -offset), stop);
+ Map<VectorXd > colVal(uIt.valuePtr() + (offset > 0 ? offset : 0), stop);
+
+ Scalar newCoeff = m_lu.coeffUpper(rrow, col) - rowVal.dot(colVal);
+#else
+ if (offset > 0) //Skip zero value of lIt
+ uIt += offset;
+ else //Skip zero values of uIt
+ lIt += -offset;
+ Scalar newCoeff = m_lu.coeffUpper(rrow, col);
+ for (Index k = 0; k < stop; ++k) {
+ const Scalar tmp = newCoeff;
+ newCoeff = tmp - lIt.value() * uIt.value();
+
+ ++lIt;
+ ++uIt;
+ }
+#endif
+ m_lu.coeffRefUpper(rrow, col) = newCoeff;
+ }
+
+
+ //Diag matrix update
+ typename MatrixType::InnerLowerIterator lIt(m_lu, row);
+ typename MatrixType::InnerUpperIterator uIt(m_lu, row);
+
+ const Index offset = lIt.col() - uIt.row();
+
+
+ Index stop = offset > 0 ? lIt.size() : uIt.size();
+#ifdef VECTORIZE
+ Map<VectorXd > rowVal(lIt.valuePtr() + (offset > 0 ? 0 : -offset), stop);
+ Map<VectorXd > colVal(uIt.valuePtr() + (offset > 0 ? offset : 0), stop);
+ Scalar newCoeff = m_lu.coeffDiag(row) - rowVal.dot(colVal);
+#else
+ if (offset > 0) //Skip zero value of lIt
+ uIt += offset;
+ else //Skip zero values of uIt
+ lIt += -offset;
+ Scalar newCoeff = m_lu.coeffDiag(row);
+ for (Index k = 0; k < stop; ++k) {
+ const Scalar tmp = newCoeff;
+ newCoeff = tmp - lIt.value() * uIt.value();
+ ++lIt;
+ ++uIt;
+ }
+#endif
+ m_lu.coeffRefDiag(row) = newCoeff;
+ }
+}
+
+/** Computes *x = U^-1 L^-1 b
+ *
+ * If \a transpose is set to SvTranspose or SvAdjoint, the solution
+ * of the transposed/adjoint system is computed instead.
+ *
+ * Not all backends implement the solution of the transposed or
+ * adjoint system.
+ */
+template<typename MatrixType>
+template<typename BDerived, typename XDerived>
+bool SkylineInplaceLU<MatrixType>::solve(const MatrixBase<BDerived> &b, MatrixBase<XDerived>* x, const int transposed) const {
+ const size_t rows = m_lu.rows();
+ const size_t cols = m_lu.cols();
+
+
+ for (Index row = 0; row < rows; row++) {
+ x->coeffRef(row) = b.coeff(row);
+ Scalar newVal = x->coeff(row);
+ typename MatrixType::InnerLowerIterator lIt(m_lu, row);
+
+ Index col = lIt.col();
+ while (lIt.col() < row) {
+
+ newVal -= x->coeff(col++) * lIt.value();
+ ++lIt;
+ }
+
+ x->coeffRef(row) = newVal;
+ }
+
+
+ for (Index col = rows - 1; col > 0; col--) {
+ x->coeffRef(col) = x->coeff(col) / m_lu.coeffDiag(col);
+
+ const Scalar x_col = x->coeff(col);
+
+ typename MatrixType::InnerUpperIterator uIt(m_lu, col);
+ uIt += uIt.size()-1;
+
+
+ while (uIt) {
+ x->coeffRef(uIt.row()) -= x_col * uIt.value();
+ //TODO : introduce --operator
+ uIt += -1;
+ }
+
+
+ }
+ x->coeffRef(0) = x->coeff(0) / m_lu.coeffDiag(0);
+
+ return true;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SKYLINEINPLACELU_H
diff --git a/src/EigenUnsupported/src/Skyline/SkylineMatrix.h b/src/EigenUnsupported/src/Skyline/SkylineMatrix.h
new file mode 100644
index 0000000..7c7eace
--- /dev/null
+++ b/src/EigenUnsupported/src/Skyline/SkylineMatrix.h
@@ -0,0 +1,862 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Guillaume Saupin <guillaume.saupin@cea.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SKYLINEMATRIX_H
+#define EIGEN_SKYLINEMATRIX_H
+
+#include "SkylineStorage.h"
+#include "SkylineMatrixBase.h"
+
+namespace Eigen {
+
+/** \ingroup Skyline_Module
+ *
+ * \class SkylineMatrix
+ *
+ * \brief The main skyline matrix class
+ *
+ * This class implements a skyline matrix using the very uncommon storage
+ * scheme.
+ *
+ * \param _Scalar the scalar type, i.e. the type of the coefficients
+ * \param _Options Union of bit flags controlling the storage scheme. Currently the only possibility
+ * is RowMajor. The default is 0 which means column-major.
+ *
+ *
+ */
+namespace internal {
+template<typename _Scalar, int _Options>
+struct traits<SkylineMatrix<_Scalar, _Options> > {
+ typedef _Scalar Scalar;
+ typedef Sparse StorageKind;
+
+ enum {
+ RowsAtCompileTime = Dynamic,
+ ColsAtCompileTime = Dynamic,
+ MaxRowsAtCompileTime = Dynamic,
+ MaxColsAtCompileTime = Dynamic,
+ Flags = SkylineBit | _Options,
+ CoeffReadCost = NumTraits<Scalar>::ReadCost,
+ };
+};
+}
+
+template<typename _Scalar, int _Options>
+class SkylineMatrix
+: public SkylineMatrixBase<SkylineMatrix<_Scalar, _Options> > {
+public:
+ EIGEN_SKYLINE_GENERIC_PUBLIC_INTERFACE(SkylineMatrix)
+ EIGEN_SKYLINE_INHERIT_ASSIGNMENT_OPERATOR(SkylineMatrix, +=)
+ EIGEN_SKYLINE_INHERIT_ASSIGNMENT_OPERATOR(SkylineMatrix, -=)
+
+ using Base::IsRowMajor;
+
+protected:
+
+ typedef SkylineMatrix<Scalar, (Flags&~RowMajorBit) | (IsRowMajor ? RowMajorBit : 0) > TransposedSkylineMatrix;
+
+ Index m_outerSize;
+ Index m_innerSize;
+
+public:
+ Index* m_colStartIndex;
+ Index* m_rowStartIndex;
+ SkylineStorage<Scalar> m_data;
+
+public:
+
+ inline Index rows() const {
+ return IsRowMajor ? m_outerSize : m_innerSize;
+ }
+
+ inline Index cols() const {
+ return IsRowMajor ? m_innerSize : m_outerSize;
+ }
+
+ inline Index innerSize() const {
+ return m_innerSize;
+ }
+
+ inline Index outerSize() const {
+ return m_outerSize;
+ }
+
+ inline Index upperNonZeros() const {
+ return m_data.upperSize();
+ }
+
+ inline Index lowerNonZeros() const {
+ return m_data.lowerSize();
+ }
+
+ inline Index upperNonZeros(Index j) const {
+ return m_colStartIndex[j + 1] - m_colStartIndex[j];
+ }
+
+ inline Index lowerNonZeros(Index j) const {
+ return m_rowStartIndex[j + 1] - m_rowStartIndex[j];
+ }
+
+ inline const Scalar* _diagPtr() const {
+ return &m_data.diag(0);
+ }
+
+ inline Scalar* _diagPtr() {
+ return &m_data.diag(0);
+ }
+
+ inline const Scalar* _upperPtr() const {
+ return &m_data.upper(0);
+ }
+
+ inline Scalar* _upperPtr() {
+ return &m_data.upper(0);
+ }
+
+ inline const Scalar* _lowerPtr() const {
+ return &m_data.lower(0);
+ }
+
+ inline Scalar* _lowerPtr() {
+ return &m_data.lower(0);
+ }
+
+ inline const Index* _upperProfilePtr() const {
+ return &m_data.upperProfile(0);
+ }
+
+ inline Index* _upperProfilePtr() {
+ return &m_data.upperProfile(0);
+ }
+
+ inline const Index* _lowerProfilePtr() const {
+ return &m_data.lowerProfile(0);
+ }
+
+ inline Index* _lowerProfilePtr() {
+ return &m_data.lowerProfile(0);
+ }
+
+ inline Scalar coeff(Index row, Index col) const {
+ const Index outer = IsRowMajor ? row : col;
+ const Index inner = IsRowMajor ? col : row;
+
+ eigen_assert(outer < outerSize());
+ eigen_assert(inner < innerSize());
+
+ if (outer == inner)
+ return this->m_data.diag(outer);
+
+ if (IsRowMajor) {
+ if (inner > outer) //upper matrix
+ {
+ const Index minOuterIndex = inner - m_data.upperProfile(inner);
+ if (outer >= minOuterIndex)
+ return this->m_data.upper(m_colStartIndex[inner] + outer - (inner - m_data.upperProfile(inner)));
+ else
+ return Scalar(0);
+ }
+ if (inner < outer) //lower matrix
+ {
+ const Index minInnerIndex = outer - m_data.lowerProfile(outer);
+ if (inner >= minInnerIndex)
+ return this->m_data.lower(m_rowStartIndex[outer] + inner - (outer - m_data.lowerProfile(outer)));
+ else
+ return Scalar(0);
+ }
+ return m_data.upper(m_colStartIndex[inner] + outer - inner);
+ } else {
+ if (outer > inner) //upper matrix
+ {
+ const Index maxOuterIndex = inner + m_data.upperProfile(inner);
+ if (outer <= maxOuterIndex)
+ return this->m_data.upper(m_colStartIndex[inner] + (outer - inner));
+ else
+ return Scalar(0);
+ }
+ if (outer < inner) //lower matrix
+ {
+ const Index maxInnerIndex = outer + m_data.lowerProfile(outer);
+
+ if (inner <= maxInnerIndex)
+ return this->m_data.lower(m_rowStartIndex[outer] + (inner - outer));
+ else
+ return Scalar(0);
+ }
+ }
+ }
+
+ inline Scalar& coeffRef(Index row, Index col) {
+ const Index outer = IsRowMajor ? row : col;
+ const Index inner = IsRowMajor ? col : row;
+
+ eigen_assert(outer < outerSize());
+ eigen_assert(inner < innerSize());
+
+ if (outer == inner)
+ return this->m_data.diag(outer);
+
+ if (IsRowMajor) {
+ if (col > row) //upper matrix
+ {
+ const Index minOuterIndex = inner - m_data.upperProfile(inner);
+ eigen_assert(outer >= minOuterIndex && "You tried to access a coeff that does not exist in the storage");
+ return this->m_data.upper(m_colStartIndex[inner] + outer - (inner - m_data.upperProfile(inner)));
+ }
+ if (col < row) //lower matrix
+ {
+ const Index minInnerIndex = outer - m_data.lowerProfile(outer);
+ eigen_assert(inner >= minInnerIndex && "You tried to access a coeff that does not exist in the storage");
+ return this->m_data.lower(m_rowStartIndex[outer] + inner - (outer - m_data.lowerProfile(outer)));
+ }
+ } else {
+ if (outer > inner) //upper matrix
+ {
+ const Index maxOuterIndex = inner + m_data.upperProfile(inner);
+ eigen_assert(outer <= maxOuterIndex && "You tried to access a coeff that does not exist in the storage");
+ return this->m_data.upper(m_colStartIndex[inner] + (outer - inner));
+ }
+ if (outer < inner) //lower matrix
+ {
+ const Index maxInnerIndex = outer + m_data.lowerProfile(outer);
+ eigen_assert(inner <= maxInnerIndex && "You tried to access a coeff that does not exist in the storage");
+ return this->m_data.lower(m_rowStartIndex[outer] + (inner - outer));
+ }
+ }
+ }
+
+ inline Scalar coeffDiag(Index idx) const {
+ eigen_assert(idx < outerSize());
+ eigen_assert(idx < innerSize());
+ return this->m_data.diag(idx);
+ }
+
+ inline Scalar coeffLower(Index row, Index col) const {
+ const Index outer = IsRowMajor ? row : col;
+ const Index inner = IsRowMajor ? col : row;
+
+ eigen_assert(outer < outerSize());
+ eigen_assert(inner < innerSize());
+ eigen_assert(inner != outer);
+
+ if (IsRowMajor) {
+ const Index minInnerIndex = outer - m_data.lowerProfile(outer);
+ if (inner >= minInnerIndex)
+ return this->m_data.lower(m_rowStartIndex[outer] + inner - (outer - m_data.lowerProfile(outer)));
+ else
+ return Scalar(0);
+
+ } else {
+ const Index maxInnerIndex = outer + m_data.lowerProfile(outer);
+ if (inner <= maxInnerIndex)
+ return this->m_data.lower(m_rowStartIndex[outer] + (inner - outer));
+ else
+ return Scalar(0);
+ }
+ }
+
+ inline Scalar coeffUpper(Index row, Index col) const {
+ const Index outer = IsRowMajor ? row : col;
+ const Index inner = IsRowMajor ? col : row;
+
+ eigen_assert(outer < outerSize());
+ eigen_assert(inner < innerSize());
+ eigen_assert(inner != outer);
+
+ if (IsRowMajor) {
+ const Index minOuterIndex = inner - m_data.upperProfile(inner);
+ if (outer >= minOuterIndex)
+ return this->m_data.upper(m_colStartIndex[inner] + outer - (inner - m_data.upperProfile(inner)));
+ else
+ return Scalar(0);
+ } else {
+ const Index maxOuterIndex = inner + m_data.upperProfile(inner);
+ if (outer <= maxOuterIndex)
+ return this->m_data.upper(m_colStartIndex[inner] + (outer - inner));
+ else
+ return Scalar(0);
+ }
+ }
+
+ inline Scalar& coeffRefDiag(Index idx) {
+ eigen_assert(idx < outerSize());
+ eigen_assert(idx < innerSize());
+ return this->m_data.diag(idx);
+ }
+
+ inline Scalar& coeffRefLower(Index row, Index col) {
+ const Index outer = IsRowMajor ? row : col;
+ const Index inner = IsRowMajor ? col : row;
+
+ eigen_assert(outer < outerSize());
+ eigen_assert(inner < innerSize());
+ eigen_assert(inner != outer);
+
+ if (IsRowMajor) {
+ const Index minInnerIndex = outer - m_data.lowerProfile(outer);
+ eigen_assert(inner >= minInnerIndex && "You tried to access a coeff that does not exist in the storage");
+ return this->m_data.lower(m_rowStartIndex[outer] + inner - (outer - m_data.lowerProfile(outer)));
+ } else {
+ const Index maxInnerIndex = outer + m_data.lowerProfile(outer);
+ eigen_assert(inner <= maxInnerIndex && "You tried to access a coeff that does not exist in the storage");
+ return this->m_data.lower(m_rowStartIndex[outer] + (inner - outer));
+ }
+ }
+
+ inline bool coeffExistLower(Index row, Index col) {
+ const Index outer = IsRowMajor ? row : col;
+ const Index inner = IsRowMajor ? col : row;
+
+ eigen_assert(outer < outerSize());
+ eigen_assert(inner < innerSize());
+ eigen_assert(inner != outer);
+
+ if (IsRowMajor) {
+ const Index minInnerIndex = outer - m_data.lowerProfile(outer);
+ return inner >= minInnerIndex;
+ } else {
+ const Index maxInnerIndex = outer + m_data.lowerProfile(outer);
+ return inner <= maxInnerIndex;
+ }
+ }
+
+ inline Scalar& coeffRefUpper(Index row, Index col) {
+ const Index outer = IsRowMajor ? row : col;
+ const Index inner = IsRowMajor ? col : row;
+
+ eigen_assert(outer < outerSize());
+ eigen_assert(inner < innerSize());
+ eigen_assert(inner != outer);
+
+ if (IsRowMajor) {
+ const Index minOuterIndex = inner - m_data.upperProfile(inner);
+ eigen_assert(outer >= minOuterIndex && "You tried to access a coeff that does not exist in the storage");
+ return this->m_data.upper(m_colStartIndex[inner] + outer - (inner - m_data.upperProfile(inner)));
+ } else {
+ const Index maxOuterIndex = inner + m_data.upperProfile(inner);
+ eigen_assert(outer <= maxOuterIndex && "You tried to access a coeff that does not exist in the storage");
+ return this->m_data.upper(m_colStartIndex[inner] + (outer - inner));
+ }
+ }
+
+ inline bool coeffExistUpper(Index row, Index col) {
+ const Index outer = IsRowMajor ? row : col;
+ const Index inner = IsRowMajor ? col : row;
+
+ eigen_assert(outer < outerSize());
+ eigen_assert(inner < innerSize());
+ eigen_assert(inner != outer);
+
+ if (IsRowMajor) {
+ const Index minOuterIndex = inner - m_data.upperProfile(inner);
+ return outer >= minOuterIndex;
+ } else {
+ const Index maxOuterIndex = inner + m_data.upperProfile(inner);
+ return outer <= maxOuterIndex;
+ }
+ }
+
+
+protected:
+
+public:
+ class InnerUpperIterator;
+ class InnerLowerIterator;
+
+ class OuterUpperIterator;
+ class OuterLowerIterator;
+
+ /** Removes all non zeros */
+ inline void setZero() {
+ m_data.clear();
+ memset(m_colStartIndex, 0, (m_outerSize + 1) * sizeof (Index));
+ memset(m_rowStartIndex, 0, (m_outerSize + 1) * sizeof (Index));
+ }
+
+ /** \returns the number of non zero coefficients */
+ inline Index nonZeros() const {
+ return m_data.diagSize() + m_data.upperSize() + m_data.lowerSize();
+ }
+
+ /** Preallocates \a reserveSize non zeros */
+ inline void reserve(Index reserveSize, Index reserveUpperSize, Index reserveLowerSize) {
+ m_data.reserve(reserveSize, reserveUpperSize, reserveLowerSize);
+ }
+
+ /** \returns a reference to a novel non zero coefficient with coordinates \a row x \a col.
+
+ *
+ * \warning This function can be extremely slow if the non zero coefficients
+ * are not inserted in a coherent order.
+ *
+ * After an insertion session, you should call the finalize() function.
+ */
+ EIGEN_DONT_INLINE Scalar & insert(Index row, Index col) {
+ const Index outer = IsRowMajor ? row : col;
+ const Index inner = IsRowMajor ? col : row;
+
+ eigen_assert(outer < outerSize());
+ eigen_assert(inner < innerSize());
+
+ if (outer == inner)
+ return m_data.diag(col);
+
+ if (IsRowMajor) {
+ if (outer < inner) //upper matrix
+ {
+ Index minOuterIndex = 0;
+ minOuterIndex = inner - m_data.upperProfile(inner);
+
+ if (outer < minOuterIndex) //The value does not yet exist
+ {
+ const Index previousProfile = m_data.upperProfile(inner);
+
+ m_data.upperProfile(inner) = inner - outer;
+
+
+ const Index bandIncrement = m_data.upperProfile(inner) - previousProfile;
+ //shift data stored after this new one
+ const Index stop = m_colStartIndex[cols()];
+ const Index start = m_colStartIndex[inner];
+
+
+ for (Index innerIdx = stop; innerIdx >= start; innerIdx--) {
+ m_data.upper(innerIdx + bandIncrement) = m_data.upper(innerIdx);
+ }
+
+ for (Index innerIdx = cols(); innerIdx > inner; innerIdx--) {
+ m_colStartIndex[innerIdx] += bandIncrement;
+ }
+
+ //zeros new data
+ memset(this->_upperPtr() + start, 0, (bandIncrement - 1) * sizeof (Scalar));
+
+ return m_data.upper(m_colStartIndex[inner]);
+ } else {
+ return m_data.upper(m_colStartIndex[inner] + outer - (inner - m_data.upperProfile(inner)));
+ }
+ }
+
+ if (outer > inner) //lower matrix
+ {
+ const Index minInnerIndex = outer - m_data.lowerProfile(outer);
+ if (inner < minInnerIndex) //The value does not yet exist
+ {
+ const Index previousProfile = m_data.lowerProfile(outer);
+ m_data.lowerProfile(outer) = outer - inner;
+
+ const Index bandIncrement = m_data.lowerProfile(outer) - previousProfile;
+ //shift data stored after this new one
+ const Index stop = m_rowStartIndex[rows()];
+ const Index start = m_rowStartIndex[outer];
+
+
+ for (Index innerIdx = stop; innerIdx >= start; innerIdx--) {
+ m_data.lower(innerIdx + bandIncrement) = m_data.lower(innerIdx);
+ }
+
+ for (Index innerIdx = rows(); innerIdx > outer; innerIdx--) {
+ m_rowStartIndex[innerIdx] += bandIncrement;
+ }
+
+ //zeros new data
+ memset(this->_lowerPtr() + start, 0, (bandIncrement - 1) * sizeof (Scalar));
+ return m_data.lower(m_rowStartIndex[outer]);
+ } else {
+ return m_data.lower(m_rowStartIndex[outer] + inner - (outer - m_data.lowerProfile(outer)));
+ }
+ }
+ } else {
+ if (outer > inner) //upper matrix
+ {
+ const Index maxOuterIndex = inner + m_data.upperProfile(inner);
+ if (outer > maxOuterIndex) //The value does not yet exist
+ {
+ const Index previousProfile = m_data.upperProfile(inner);
+ m_data.upperProfile(inner) = outer - inner;
+
+ const Index bandIncrement = m_data.upperProfile(inner) - previousProfile;
+ //shift data stored after this new one
+ const Index stop = m_rowStartIndex[rows()];
+ const Index start = m_rowStartIndex[inner + 1];
+
+ for (Index innerIdx = stop; innerIdx >= start; innerIdx--) {
+ m_data.upper(innerIdx + bandIncrement) = m_data.upper(innerIdx);
+ }
+
+ for (Index innerIdx = inner + 1; innerIdx < outerSize() + 1; innerIdx++) {
+ m_rowStartIndex[innerIdx] += bandIncrement;
+ }
+ memset(this->_upperPtr() + m_rowStartIndex[inner] + previousProfile + 1, 0, (bandIncrement - 1) * sizeof (Scalar));
+ return m_data.upper(m_rowStartIndex[inner] + m_data.upperProfile(inner));
+ } else {
+ return m_data.upper(m_rowStartIndex[inner] + (outer - inner));
+ }
+ }
+
+ if (outer < inner) //lower matrix
+ {
+ const Index maxInnerIndex = outer + m_data.lowerProfile(outer);
+ if (inner > maxInnerIndex) //The value does not yet exist
+ {
+ const Index previousProfile = m_data.lowerProfile(outer);
+ m_data.lowerProfile(outer) = inner - outer;
+
+ const Index bandIncrement = m_data.lowerProfile(outer) - previousProfile;
+ //shift data stored after this new one
+ const Index stop = m_colStartIndex[cols()];
+ const Index start = m_colStartIndex[outer + 1];
+
+ for (Index innerIdx = stop; innerIdx >= start; innerIdx--) {
+ m_data.lower(innerIdx + bandIncrement) = m_data.lower(innerIdx);
+ }
+
+ for (Index innerIdx = outer + 1; innerIdx < outerSize() + 1; innerIdx++) {
+ m_colStartIndex[innerIdx] += bandIncrement;
+ }
+ memset(this->_lowerPtr() + m_colStartIndex[outer] + previousProfile + 1, 0, (bandIncrement - 1) * sizeof (Scalar));
+ return m_data.lower(m_colStartIndex[outer] + m_data.lowerProfile(outer));
+ } else {
+ return m_data.lower(m_colStartIndex[outer] + (inner - outer));
+ }
+ }
+ }
+ }
+
+ /** Must be called after inserting a set of non zero entries.
+ */
+ inline void finalize() {
+ if (IsRowMajor) {
+ if (rows() > cols())
+ m_data.resize(cols(), cols(), rows(), m_colStartIndex[cols()] + 1, m_rowStartIndex[rows()] + 1);
+ else
+ m_data.resize(rows(), cols(), rows(), m_colStartIndex[cols()] + 1, m_rowStartIndex[rows()] + 1);
+
+ // eigen_assert(rows() == cols() && "memory reorganisatrion only works with suare matrix");
+ //
+ // Scalar* newArray = new Scalar[m_colStartIndex[cols()] + 1 + m_rowStartIndex[rows()] + 1];
+ // Index dataIdx = 0;
+ // for (Index row = 0; row < rows(); row++) {
+ //
+ // const Index nbLowerElts = m_rowStartIndex[row + 1] - m_rowStartIndex[row];
+ // // std::cout << "nbLowerElts" << nbLowerElts << std::endl;
+ // memcpy(newArray + dataIdx, m_data.m_lower + m_rowStartIndex[row], nbLowerElts * sizeof (Scalar));
+ // m_rowStartIndex[row] = dataIdx;
+ // dataIdx += nbLowerElts;
+ //
+ // const Index nbUpperElts = m_colStartIndex[row + 1] - m_colStartIndex[row];
+ // memcpy(newArray + dataIdx, m_data.m_upper + m_colStartIndex[row], nbUpperElts * sizeof (Scalar));
+ // m_colStartIndex[row] = dataIdx;
+ // dataIdx += nbUpperElts;
+ //
+ //
+ // }
+ // //todo : don't access m_data profile directly : add an accessor from SkylineMatrix
+ // m_rowStartIndex[rows()] = m_rowStartIndex[rows()-1] + m_data.lowerProfile(rows()-1);
+ // m_colStartIndex[cols()] = m_colStartIndex[cols()-1] + m_data.upperProfile(cols()-1);
+ //
+ // delete[] m_data.m_lower;
+ // delete[] m_data.m_upper;
+ //
+ // m_data.m_lower = newArray;
+ // m_data.m_upper = newArray;
+ } else {
+ if (rows() > cols())
+ m_data.resize(cols(), rows(), cols(), m_rowStartIndex[cols()] + 1, m_colStartIndex[cols()] + 1);
+ else
+ m_data.resize(rows(), rows(), cols(), m_rowStartIndex[rows()] + 1, m_colStartIndex[rows()] + 1);
+ }
+ }
+
+ inline void squeeze() {
+ finalize();
+ m_data.squeeze();
+ }
+
+ void prune(Scalar reference, RealScalar epsilon = dummy_precision<RealScalar > ()) {
+ //TODO
+ }
+
+ /** Resizes the matrix to a \a rows x \a cols matrix and initializes it to zero
+ * \sa resizeNonZeros(Index), reserve(), setZero()
+ */
+ void resize(size_t rows, size_t cols) {
+ const Index diagSize = rows > cols ? cols : rows;
+ m_innerSize = IsRowMajor ? cols : rows;
+
+ eigen_assert(rows == cols && "Skyline matrix must be square matrix");
+
+ if (diagSize % 2) { // diagSize is odd
+ const Index k = (diagSize - 1) / 2;
+
+ m_data.resize(diagSize, IsRowMajor ? cols : rows, IsRowMajor ? rows : cols,
+ 2 * k * k + k + 1,
+ 2 * k * k + k + 1);
+
+ } else // diagSize is even
+ {
+ const Index k = diagSize / 2;
+ m_data.resize(diagSize, IsRowMajor ? cols : rows, IsRowMajor ? rows : cols,
+ 2 * k * k - k + 1,
+ 2 * k * k - k + 1);
+ }
+
+ if (m_colStartIndex && m_rowStartIndex) {
+ delete[] m_colStartIndex;
+ delete[] m_rowStartIndex;
+ }
+ m_colStartIndex = new Index [cols + 1];
+ m_rowStartIndex = new Index [rows + 1];
+ m_outerSize = diagSize;
+
+ m_data.reset();
+ m_data.clear();
+
+ m_outerSize = diagSize;
+ memset(m_colStartIndex, 0, (cols + 1) * sizeof (Index));
+ memset(m_rowStartIndex, 0, (rows + 1) * sizeof (Index));
+ }
+
+ void resizeNonZeros(Index size) {
+ m_data.resize(size);
+ }
+
+ inline SkylineMatrix()
+ : m_outerSize(-1), m_innerSize(0), m_colStartIndex(0), m_rowStartIndex(0) {
+ resize(0, 0);
+ }
+
+ inline SkylineMatrix(size_t rows, size_t cols)
+ : m_outerSize(0), m_innerSize(0), m_colStartIndex(0), m_rowStartIndex(0) {
+ resize(rows, cols);
+ }
+
+ template<typename OtherDerived>
+ inline SkylineMatrix(const SkylineMatrixBase<OtherDerived>& other)
+ : m_outerSize(0), m_innerSize(0), m_colStartIndex(0), m_rowStartIndex(0) {
+ *this = other.derived();
+ }
+
+ inline SkylineMatrix(const SkylineMatrix & other)
+ : Base(), m_outerSize(0), m_innerSize(0), m_colStartIndex(0), m_rowStartIndex(0) {
+ *this = other.derived();
+ }
+
+ inline void swap(SkylineMatrix & other) {
+ //EIGEN_DBG_SKYLINE(std::cout << "SkylineMatrix:: swap\n");
+ std::swap(m_colStartIndex, other.m_colStartIndex);
+ std::swap(m_rowStartIndex, other.m_rowStartIndex);
+ std::swap(m_innerSize, other.m_innerSize);
+ std::swap(m_outerSize, other.m_outerSize);
+ m_data.swap(other.m_data);
+ }
+
+ inline SkylineMatrix & operator=(const SkylineMatrix & other) {
+ std::cout << "SkylineMatrix& operator=(const SkylineMatrix& other)\n";
+ if (other.isRValue()) {
+ swap(other.const_cast_derived());
+ } else {
+ resize(other.rows(), other.cols());
+ memcpy(m_colStartIndex, other.m_colStartIndex, (m_outerSize + 1) * sizeof (Index));
+ memcpy(m_rowStartIndex, other.m_rowStartIndex, (m_outerSize + 1) * sizeof (Index));
+ m_data = other.m_data;
+ }
+ return *this;
+ }
+
+ template<typename OtherDerived>
+ inline SkylineMatrix & operator=(const SkylineMatrixBase<OtherDerived>& other) {
+ const bool needToTranspose = (Flags & RowMajorBit) != (OtherDerived::Flags & RowMajorBit);
+ if (needToTranspose) {
+ // TODO
+ // return *this;
+ } else {
+ // there is no special optimization
+ return SkylineMatrixBase<SkylineMatrix>::operator=(other.derived());
+ }
+ }
+
+ friend std::ostream & operator <<(std::ostream & s, const SkylineMatrix & m) {
+
+ EIGEN_DBG_SKYLINE(
+ std::cout << "upper elements : " << std::endl;
+ for (Index i = 0; i < m.m_data.upperSize(); i++)
+ std::cout << m.m_data.upper(i) << "\t";
+ std::cout << std::endl;
+ std::cout << "upper profile : " << std::endl;
+ for (Index i = 0; i < m.m_data.upperProfileSize(); i++)
+ std::cout << m.m_data.upperProfile(i) << "\t";
+ std::cout << std::endl;
+ std::cout << "lower startIdx : " << std::endl;
+ for (Index i = 0; i < m.m_data.upperProfileSize(); i++)
+ std::cout << (IsRowMajor ? m.m_colStartIndex[i] : m.m_rowStartIndex[i]) << "\t";
+ std::cout << std::endl;
+
+
+ std::cout << "lower elements : " << std::endl;
+ for (Index i = 0; i < m.m_data.lowerSize(); i++)
+ std::cout << m.m_data.lower(i) << "\t";
+ std::cout << std::endl;
+ std::cout << "lower profile : " << std::endl;
+ for (Index i = 0; i < m.m_data.lowerProfileSize(); i++)
+ std::cout << m.m_data.lowerProfile(i) << "\t";
+ std::cout << std::endl;
+ std::cout << "lower startIdx : " << std::endl;
+ for (Index i = 0; i < m.m_data.lowerProfileSize(); i++)
+ std::cout << (IsRowMajor ? m.m_rowStartIndex[i] : m.m_colStartIndex[i]) << "\t";
+ std::cout << std::endl;
+ );
+ for (Index rowIdx = 0; rowIdx < m.rows(); rowIdx++) {
+ for (Index colIdx = 0; colIdx < m.cols(); colIdx++) {
+ s << m.coeff(rowIdx, colIdx) << "\t";
+ }
+ s << std::endl;
+ }
+ return s;
+ }
+
+ /** Destructor */
+ inline ~SkylineMatrix() {
+ delete[] m_colStartIndex;
+ delete[] m_rowStartIndex;
+ }
+
+ /** Overloaded for performance */
+ Scalar sum() const;
+};
+
+template<typename Scalar, int _Options>
+class SkylineMatrix<Scalar, _Options>::InnerUpperIterator {
+public:
+
+ InnerUpperIterator(const SkylineMatrix& mat, Index outer)
+ : m_matrix(mat), m_outer(outer),
+ m_id(_Options == RowMajor ? mat.m_colStartIndex[outer] : mat.m_rowStartIndex[outer] + 1),
+ m_start(m_id),
+ m_end(_Options == RowMajor ? mat.m_colStartIndex[outer + 1] : mat.m_rowStartIndex[outer + 1] + 1) {
+ }
+
+ inline InnerUpperIterator & operator++() {
+ m_id++;
+ return *this;
+ }
+
+ inline InnerUpperIterator & operator+=(Index shift) {
+ m_id += shift;
+ return *this;
+ }
+
+ inline Scalar value() const {
+ return m_matrix.m_data.upper(m_id);
+ }
+
+ inline Scalar* valuePtr() {
+ return const_cast<Scalar*> (&(m_matrix.m_data.upper(m_id)));
+ }
+
+ inline Scalar& valueRef() {
+ return const_cast<Scalar&> (m_matrix.m_data.upper(m_id));
+ }
+
+ inline Index index() const {
+ return IsRowMajor ? m_outer - m_matrix.m_data.upperProfile(m_outer) + (m_id - m_start) :
+ m_outer + (m_id - m_start) + 1;
+ }
+
+ inline Index row() const {
+ return IsRowMajor ? index() : m_outer;
+ }
+
+ inline Index col() const {
+ return IsRowMajor ? m_outer : index();
+ }
+
+ inline size_t size() const {
+ return m_matrix.m_data.upperProfile(m_outer);
+ }
+
+ inline operator bool() const {
+ return (m_id < m_end) && (m_id >= m_start);
+ }
+
+protected:
+ const SkylineMatrix& m_matrix;
+ const Index m_outer;
+ Index m_id;
+ const Index m_start;
+ const Index m_end;
+};
+
+template<typename Scalar, int _Options>
+class SkylineMatrix<Scalar, _Options>::InnerLowerIterator {
+public:
+
+ InnerLowerIterator(const SkylineMatrix& mat, Index outer)
+ : m_matrix(mat),
+ m_outer(outer),
+ m_id(_Options == RowMajor ? mat.m_rowStartIndex[outer] : mat.m_colStartIndex[outer] + 1),
+ m_start(m_id),
+ m_end(_Options == RowMajor ? mat.m_rowStartIndex[outer + 1] : mat.m_colStartIndex[outer + 1] + 1) {
+ }
+
+ inline InnerLowerIterator & operator++() {
+ m_id++;
+ return *this;
+ }
+
+ inline InnerLowerIterator & operator+=(Index shift) {
+ m_id += shift;
+ return *this;
+ }
+
+ inline Scalar value() const {
+ return m_matrix.m_data.lower(m_id);
+ }
+
+ inline Scalar* valuePtr() {
+ return const_cast<Scalar*> (&(m_matrix.m_data.lower(m_id)));
+ }
+
+ inline Scalar& valueRef() {
+ return const_cast<Scalar&> (m_matrix.m_data.lower(m_id));
+ }
+
+ inline Index index() const {
+ return IsRowMajor ? m_outer - m_matrix.m_data.lowerProfile(m_outer) + (m_id - m_start) :
+ m_outer + (m_id - m_start) + 1;
+ ;
+ }
+
+ inline Index row() const {
+ return IsRowMajor ? m_outer : index();
+ }
+
+ inline Index col() const {
+ return IsRowMajor ? index() : m_outer;
+ }
+
+ inline size_t size() const {
+ return m_matrix.m_data.lowerProfile(m_outer);
+ }
+
+ inline operator bool() const {
+ return (m_id < m_end) && (m_id >= m_start);
+ }
+
+protected:
+ const SkylineMatrix& m_matrix;
+ const Index m_outer;
+ Index m_id;
+ const Index m_start;
+ const Index m_end;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_SKYLINEMATRIX_H
diff --git a/src/EigenUnsupported/src/Skyline/SkylineMatrixBase.h b/src/EigenUnsupported/src/Skyline/SkylineMatrixBase.h
new file mode 100644
index 0000000..b0d5e10
--- /dev/null
+++ b/src/EigenUnsupported/src/Skyline/SkylineMatrixBase.h
@@ -0,0 +1,212 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Guillaume Saupin <guillaume.saupin@cea.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SKYLINEMATRIXBASE_H
+#define EIGEN_SKYLINEMATRIXBASE_H
+
+#include "SkylineUtil.h"
+
+namespace Eigen {
+
+/** \ingroup Skyline_Module
+ *
+ * \class SkylineMatrixBase
+ *
+ * \brief Base class of any skyline matrices or skyline expressions
+ *
+ * \param Derived
+ *
+ */
+template<typename Derived> class SkylineMatrixBase : public EigenBase<Derived> {
+public:
+
+ typedef typename internal::traits<Derived>::Scalar Scalar;
+ typedef typename internal::traits<Derived>::StorageKind StorageKind;
+ typedef typename internal::index<StorageKind>::type Index;
+
+ enum {
+ RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
+ /**< The number of rows at compile-time. This is just a copy of the value provided
+ * by the \a Derived type. If a value is not known at compile-time,
+ * it is set to the \a Dynamic constant.
+ * \sa MatrixBase::rows(), MatrixBase::cols(), ColsAtCompileTime, SizeAtCompileTime */
+
+ ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
+ /**< The number of columns at compile-time. This is just a copy of the value provided
+ * by the \a Derived type. If a value is not known at compile-time,
+ * it is set to the \a Dynamic constant.
+ * \sa MatrixBase::rows(), MatrixBase::cols(), RowsAtCompileTime, SizeAtCompileTime */
+
+
+ SizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::RowsAtCompileTime,
+ internal::traits<Derived>::ColsAtCompileTime>::ret),
+ /**< This is equal to the number of coefficients, i.e. the number of
+ * rows times the number of columns, or to \a Dynamic if this is not
+ * known at compile-time. \sa RowsAtCompileTime, ColsAtCompileTime */
+
+ MaxRowsAtCompileTime = RowsAtCompileTime,
+ MaxColsAtCompileTime = ColsAtCompileTime,
+
+ MaxSizeAtCompileTime = (internal::size_at_compile_time<MaxRowsAtCompileTime,
+ MaxColsAtCompileTime>::ret),
+
+ IsVectorAtCompileTime = RowsAtCompileTime == 1 || ColsAtCompileTime == 1,
+ /**< This is set to true if either the number of rows or the number of
+ * columns is known at compile-time to be equal to 1. Indeed, in that case,
+ * we are dealing with a column-vector (if there is only one column) or with
+ * a row-vector (if there is only one row). */
+
+ Flags = internal::traits<Derived>::Flags,
+ /**< This stores expression \ref flags flags which may or may not be inherited by new expressions
+ * constructed from this one. See the \ref flags "list of flags".
+ */
+
+ CoeffReadCost = internal::traits<Derived>::CoeffReadCost,
+ /**< This is a rough measure of how expensive it is to read one coefficient from
+ * this expression.
+ */
+
+ IsRowMajor = Flags & RowMajorBit ? 1 : 0
+ };
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+ /** This is the "real scalar" type; if the \a Scalar type is already real numbers
+ * (e.g. int, float or double) then \a RealScalar is just the same as \a Scalar. If
+ * \a Scalar is \a std::complex<T> then RealScalar is \a T.
+ *
+ * \sa class NumTraits
+ */
+ typedef typename NumTraits<Scalar>::Real RealScalar;
+
+ /** type of the equivalent square matrix */
+ typedef Matrix<Scalar, EIGEN_SIZE_MAX(RowsAtCompileTime, ColsAtCompileTime),
+ EIGEN_SIZE_MAX(RowsAtCompileTime, ColsAtCompileTime) > SquareMatrixType;
+
+ inline const Derived& derived() const {
+ return *static_cast<const Derived*> (this);
+ }
+
+ inline Derived& derived() {
+ return *static_cast<Derived*> (this);
+ }
+
+ inline Derived& const_cast_derived() const {
+ return *static_cast<Derived*> (const_cast<SkylineMatrixBase*> (this));
+ }
+#endif // not EIGEN_PARSED_BY_DOXYGEN
+
+ /** \returns the number of rows. \sa cols(), RowsAtCompileTime */
+ inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT {
+ return derived().rows();
+ }
+
+ /** \returns the number of columns. \sa rows(), ColsAtCompileTime*/
+ inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT {
+ return derived().cols();
+ }
+
+ /** \returns the number of coefficients, which is \a rows()*cols().
+ * \sa rows(), cols(), SizeAtCompileTime. */
+ inline EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT {
+ return rows() * cols();
+ }
+
+ /** \returns the number of nonzero coefficients which is in practice the number
+ * of stored coefficients. */
+ inline Index nonZeros() const {
+ return derived().nonZeros();
+ }
+
+ /** \returns the size of the storage major dimension,
+ * i.e., the number of columns for a columns major matrix, and the number of rows otherwise */
+ Index outerSize() const {
+ return (int(Flags) & RowMajorBit) ? this->rows() : this->cols();
+ }
+
+ /** \returns the size of the inner dimension according to the storage order,
+ * i.e., the number of rows for a columns major matrix, and the number of cols otherwise */
+ Index innerSize() const {
+ return (int(Flags) & RowMajorBit) ? this->cols() : this->rows();
+ }
+
+ bool isRValue() const {
+ return m_isRValue;
+ }
+
+ Derived& markAsRValue() {
+ m_isRValue = true;
+ return derived();
+ }
+
+ SkylineMatrixBase() : m_isRValue(false) {
+ /* TODO check flags */
+ }
+
+ inline Derived & operator=(const Derived& other) {
+ this->operator=<Derived > (other);
+ return derived();
+ }
+
+ template<typename OtherDerived>
+ inline void assignGeneric(const OtherDerived& other) {
+ derived().resize(other.rows(), other.cols());
+ for (Index row = 0; row < rows(); row++)
+ for (Index col = 0; col < cols(); col++) {
+ if (other.coeff(row, col) != Scalar(0))
+ derived().insert(row, col) = other.coeff(row, col);
+ }
+ derived().finalize();
+ }
+
+ template<typename OtherDerived>
+ inline Derived & operator=(const SkylineMatrixBase<OtherDerived>& other) {
+ //TODO
+ }
+
+ template<typename Lhs, typename Rhs>
+ inline Derived & operator=(const SkylineProduct<Lhs, Rhs, SkylineTimeSkylineProduct>& product);
+
+ friend std::ostream & operator <<(std::ostream & s, const SkylineMatrixBase& m) {
+ s << m.derived();
+ return s;
+ }
+
+ template<typename OtherDerived>
+ const typename SkylineProductReturnType<Derived, OtherDerived>::Type
+ operator*(const MatrixBase<OtherDerived> &other) const;
+
+ /** \internal use operator= */
+ template<typename DenseDerived>
+ void evalTo(MatrixBase<DenseDerived>& dst) const {
+ dst.setZero();
+ for (Index i = 0; i < rows(); i++)
+ for (Index j = 0; j < rows(); j++)
+ dst(i, j) = derived().coeff(i, j);
+ }
+
+ Matrix<Scalar, RowsAtCompileTime, ColsAtCompileTime> toDense() const {
+ return derived();
+ }
+
+ /** \returns the matrix or vector obtained by evaluating this expression.
+ *
+ * Notice that in the case of a plain matrix or vector (not an expression) this function just returns
+ * a const reference, in order to avoid a useless copy.
+ */
+ EIGEN_STRONG_INLINE const typename internal::eval<Derived, IsSkyline>::type eval() const {
+ return typename internal::eval<Derived>::type(derived());
+ }
+
+protected:
+ bool m_isRValue;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_SKYLINEMATRIXBASE_H
diff --git a/src/EigenUnsupported/src/Skyline/SkylineProduct.h b/src/EigenUnsupported/src/Skyline/SkylineProduct.h
new file mode 100644
index 0000000..d9eb814
--- /dev/null
+++ b/src/EigenUnsupported/src/Skyline/SkylineProduct.h
@@ -0,0 +1,295 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Guillaume Saupin <guillaume.saupin@cea.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SKYLINEPRODUCT_H
+#define EIGEN_SKYLINEPRODUCT_H
+
+namespace Eigen {
+
+template<typename Lhs, typename Rhs, int ProductMode>
+struct SkylineProductReturnType {
+ typedef const typename internal::nested_eval<Lhs, Rhs::RowsAtCompileTime>::type LhsNested;
+ typedef const typename internal::nested_eval<Rhs, Lhs::RowsAtCompileTime>::type RhsNested;
+
+ typedef SkylineProduct<LhsNested, RhsNested, ProductMode> Type;
+};
+
+template<typename LhsNested, typename RhsNested, int ProductMode>
+struct internal::traits<SkylineProduct<LhsNested, RhsNested, ProductMode> > {
+ // clean the nested types:
+ typedef typename internal::remove_all<LhsNested>::type _LhsNested;
+ typedef typename internal::remove_all<RhsNested>::type _RhsNested;
+ typedef typename _LhsNested::Scalar Scalar;
+
+ enum {
+ LhsCoeffReadCost = _LhsNested::CoeffReadCost,
+ RhsCoeffReadCost = _RhsNested::CoeffReadCost,
+ LhsFlags = _LhsNested::Flags,
+ RhsFlags = _RhsNested::Flags,
+
+ RowsAtCompileTime = _LhsNested::RowsAtCompileTime,
+ ColsAtCompileTime = _RhsNested::ColsAtCompileTime,
+ InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(_LhsNested::ColsAtCompileTime, _RhsNested::RowsAtCompileTime),
+
+ MaxRowsAtCompileTime = _LhsNested::MaxRowsAtCompileTime,
+ MaxColsAtCompileTime = _RhsNested::MaxColsAtCompileTime,
+
+ EvalToRowMajor = (RhsFlags & LhsFlags & RowMajorBit),
+ ResultIsSkyline = ProductMode == SkylineTimeSkylineProduct,
+
+ RemovedBits = ~((EvalToRowMajor ? 0 : RowMajorBit) | (ResultIsSkyline ? 0 : SkylineBit)),
+
+ Flags = (int(LhsFlags | RhsFlags) & HereditaryBits & RemovedBits)
+ | EvalBeforeAssigningBit
+ | EvalBeforeNestingBit,
+
+ CoeffReadCost = HugeCost
+ };
+
+ typedef typename internal::conditional<ResultIsSkyline,
+ SkylineMatrixBase<SkylineProduct<LhsNested, RhsNested, ProductMode> >,
+ MatrixBase<SkylineProduct<LhsNested, RhsNested, ProductMode> > >::type Base;
+};
+
+namespace internal {
+template<typename LhsNested, typename RhsNested, int ProductMode>
+class SkylineProduct : no_assignment_operator,
+public traits<SkylineProduct<LhsNested, RhsNested, ProductMode> >::Base {
+public:
+
+ EIGEN_GENERIC_PUBLIC_INTERFACE(SkylineProduct)
+
+private:
+
+ typedef typename traits<SkylineProduct>::_LhsNested _LhsNested;
+ typedef typename traits<SkylineProduct>::_RhsNested _RhsNested;
+
+public:
+
+ template<typename Lhs, typename Rhs>
+ EIGEN_STRONG_INLINE SkylineProduct(const Lhs& lhs, const Rhs& rhs)
+ : m_lhs(lhs), m_rhs(rhs) {
+ eigen_assert(lhs.cols() == rhs.rows());
+
+ enum {
+ ProductIsValid = _LhsNested::ColsAtCompileTime == Dynamic
+ || _RhsNested::RowsAtCompileTime == Dynamic
+ || int(_LhsNested::ColsAtCompileTime) == int(_RhsNested::RowsAtCompileTime),
+ AreVectors = _LhsNested::IsVectorAtCompileTime && _RhsNested::IsVectorAtCompileTime,
+ SameSizes = EIGEN_PREDICATE_SAME_MATRIX_SIZE(_LhsNested, _RhsNested)
+ };
+ // note to the lost user:
+ // * for a dot product use: v1.dot(v2)
+ // * for a coeff-wise product use: v1.cwise()*v2
+ EIGEN_STATIC_ASSERT(ProductIsValid || !(AreVectors && SameSizes),
+ INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS)
+ EIGEN_STATIC_ASSERT(ProductIsValid || !(SameSizes && !AreVectors),
+ INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION)
+ EIGEN_STATIC_ASSERT(ProductIsValid || SameSizes, INVALID_MATRIX_PRODUCT)
+ }
+
+ EIGEN_STRONG_INLINE Index rows() const {
+ return m_lhs.rows();
+ }
+
+ EIGEN_STRONG_INLINE Index cols() const {
+ return m_rhs.cols();
+ }
+
+ EIGEN_STRONG_INLINE const _LhsNested& lhs() const {
+ return m_lhs;
+ }
+
+ EIGEN_STRONG_INLINE const _RhsNested& rhs() const {
+ return m_rhs;
+ }
+
+protected:
+ LhsNested m_lhs;
+ RhsNested m_rhs;
+};
+
+// dense = skyline * dense
+// Note that here we force no inlining and separate the setZero() because GCC messes up otherwise
+
+template<typename Lhs, typename Rhs, typename Dest>
+EIGEN_DONT_INLINE void skyline_row_major_time_dense_product(const Lhs& lhs, const Rhs& rhs, Dest& dst) {
+ typedef typename remove_all<Lhs>::type _Lhs;
+ typedef typename remove_all<Rhs>::type _Rhs;
+ typedef typename traits<Lhs>::Scalar Scalar;
+
+ enum {
+ LhsIsRowMajor = (_Lhs::Flags & RowMajorBit) == RowMajorBit,
+ LhsIsSelfAdjoint = (_Lhs::Flags & SelfAdjointBit) == SelfAdjointBit,
+ ProcessFirstHalf = LhsIsSelfAdjoint
+ && (((_Lhs::Flags & (UpperTriangularBit | LowerTriangularBit)) == 0)
+ || ((_Lhs::Flags & UpperTriangularBit) && !LhsIsRowMajor)
+ || ((_Lhs::Flags & LowerTriangularBit) && LhsIsRowMajor)),
+ ProcessSecondHalf = LhsIsSelfAdjoint && (!ProcessFirstHalf)
+ };
+
+ //Use matrix diagonal part <- Improvement : use inner iterator on dense matrix.
+ for (Index col = 0; col < rhs.cols(); col++) {
+ for (Index row = 0; row < lhs.rows(); row++) {
+ dst(row, col) = lhs.coeffDiag(row) * rhs(row, col);
+ }
+ }
+ //Use matrix lower triangular part
+ for (Index row = 0; row < lhs.rows(); row++) {
+ typename _Lhs::InnerLowerIterator lIt(lhs, row);
+ const Index stop = lIt.col() + lIt.size();
+ for (Index col = 0; col < rhs.cols(); col++) {
+
+ Index k = lIt.col();
+ Scalar tmp = 0;
+ while (k < stop) {
+ tmp +=
+ lIt.value() *
+ rhs(k++, col);
+ ++lIt;
+ }
+ dst(row, col) += tmp;
+ lIt += -lIt.size();
+ }
+
+ }
+
+ //Use matrix upper triangular part
+ for (Index lhscol = 0; lhscol < lhs.cols(); lhscol++) {
+ typename _Lhs::InnerUpperIterator uIt(lhs, lhscol);
+ const Index stop = uIt.size() + uIt.row();
+ for (Index rhscol = 0; rhscol < rhs.cols(); rhscol++) {
+
+
+ const Scalar rhsCoeff = rhs.coeff(lhscol, rhscol);
+ Index k = uIt.row();
+ while (k < stop) {
+ dst(k++, rhscol) +=
+ uIt.value() *
+ rhsCoeff;
+ ++uIt;
+ }
+ uIt += -uIt.size();
+ }
+ }
+
+}
+
+template<typename Lhs, typename Rhs, typename Dest>
+EIGEN_DONT_INLINE void skyline_col_major_time_dense_product(const Lhs& lhs, const Rhs& rhs, Dest& dst) {
+ typedef typename remove_all<Lhs>::type _Lhs;
+ typedef typename remove_all<Rhs>::type _Rhs;
+ typedef typename traits<Lhs>::Scalar Scalar;
+
+ enum {
+ LhsIsRowMajor = (_Lhs::Flags & RowMajorBit) == RowMajorBit,
+ LhsIsSelfAdjoint = (_Lhs::Flags & SelfAdjointBit) == SelfAdjointBit,
+ ProcessFirstHalf = LhsIsSelfAdjoint
+ && (((_Lhs::Flags & (UpperTriangularBit | LowerTriangularBit)) == 0)
+ || ((_Lhs::Flags & UpperTriangularBit) && !LhsIsRowMajor)
+ || ((_Lhs::Flags & LowerTriangularBit) && LhsIsRowMajor)),
+ ProcessSecondHalf = LhsIsSelfAdjoint && (!ProcessFirstHalf)
+ };
+
+ //Use matrix diagonal part <- Improvement : use inner iterator on dense matrix.
+ for (Index col = 0; col < rhs.cols(); col++) {
+ for (Index row = 0; row < lhs.rows(); row++) {
+ dst(row, col) = lhs.coeffDiag(row) * rhs(row, col);
+ }
+ }
+
+ //Use matrix upper triangular part
+ for (Index row = 0; row < lhs.rows(); row++) {
+ typename _Lhs::InnerUpperIterator uIt(lhs, row);
+ const Index stop = uIt.col() + uIt.size();
+ for (Index col = 0; col < rhs.cols(); col++) {
+
+ Index k = uIt.col();
+ Scalar tmp = 0;
+ while (k < stop) {
+ tmp +=
+ uIt.value() *
+ rhs(k++, col);
+ ++uIt;
+ }
+
+
+ dst(row, col) += tmp;
+ uIt += -uIt.size();
+ }
+ }
+
+ //Use matrix lower triangular part
+ for (Index lhscol = 0; lhscol < lhs.cols(); lhscol++) {
+ typename _Lhs::InnerLowerIterator lIt(lhs, lhscol);
+ const Index stop = lIt.size() + lIt.row();
+ for (Index rhscol = 0; rhscol < rhs.cols(); rhscol++) {
+
+ const Scalar rhsCoeff = rhs.coeff(lhscol, rhscol);
+ Index k = lIt.row();
+ while (k < stop) {
+ dst(k++, rhscol) +=
+ lIt.value() *
+ rhsCoeff;
+ ++lIt;
+ }
+ lIt += -lIt.size();
+ }
+ }
+
+}
+
+template<typename Lhs, typename Rhs, typename ResultType,
+ int LhsStorageOrder = traits<Lhs>::Flags&RowMajorBit>
+ struct skyline_product_selector;
+
+template<typename Lhs, typename Rhs, typename ResultType>
+struct skyline_product_selector<Lhs, Rhs, ResultType, RowMajor> {
+ typedef typename traits<typename remove_all<Lhs>::type>::Scalar Scalar;
+
+ static void run(const Lhs& lhs, const Rhs& rhs, ResultType & res) {
+ skyline_row_major_time_dense_product<Lhs, Rhs, ResultType > (lhs, rhs, res);
+ }
+};
+
+template<typename Lhs, typename Rhs, typename ResultType>
+struct skyline_product_selector<Lhs, Rhs, ResultType, ColMajor> {
+ typedef typename traits<typename remove_all<Lhs>::type>::Scalar Scalar;
+
+ static void run(const Lhs& lhs, const Rhs& rhs, ResultType & res) {
+ skyline_col_major_time_dense_product<Lhs, Rhs, ResultType > (lhs, rhs, res);
+ }
+};
+
+} // end namespace internal
+
+// template<typename Derived>
+// template<typename Lhs, typename Rhs >
+// Derived & MatrixBase<Derived>::lazyAssign(const SkylineProduct<Lhs, Rhs, SkylineTimeDenseProduct>& product) {
+// typedef typename internal::remove_all<Lhs>::type _Lhs;
+// internal::skyline_product_selector<typename internal::remove_all<Lhs>::type,
+// typename internal::remove_all<Rhs>::type,
+// Derived>::run(product.lhs(), product.rhs(), derived());
+//
+// return derived();
+// }
+
+// skyline * dense
+
+template<typename Derived>
+template<typename OtherDerived >
+EIGEN_STRONG_INLINE const typename SkylineProductReturnType<Derived, OtherDerived>::Type
+SkylineMatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const {
+
+ return typename SkylineProductReturnType<Derived, OtherDerived>::Type(derived(), other.derived());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SKYLINEPRODUCT_H
diff --git a/src/EigenUnsupported/src/Skyline/SkylineStorage.h b/src/EigenUnsupported/src/Skyline/SkylineStorage.h
new file mode 100644
index 0000000..cc7514f
--- /dev/null
+++ b/src/EigenUnsupported/src/Skyline/SkylineStorage.h
@@ -0,0 +1,259 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Guillaume Saupin <guillaume.saupin@cea.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SKYLINE_STORAGE_H
+#define EIGEN_SKYLINE_STORAGE_H
+
+namespace Eigen {
+
+/** Stores a skyline set of values in three structures :
+ * The diagonal elements
+ * The upper elements
+ * The lower elements
+ *
+ */
+template<typename Scalar>
+class SkylineStorage {
+ typedef typename NumTraits<Scalar>::Real RealScalar;
+ typedef SparseIndex Index;
+public:
+
+ SkylineStorage()
+ : m_diag(0),
+ m_lower(0),
+ m_upper(0),
+ m_lowerProfile(0),
+ m_upperProfile(0),
+ m_diagSize(0),
+ m_upperSize(0),
+ m_lowerSize(0),
+ m_upperProfileSize(0),
+ m_lowerProfileSize(0),
+ m_allocatedSize(0) {
+ }
+
+ SkylineStorage(const SkylineStorage& other)
+ : m_diag(0),
+ m_lower(0),
+ m_upper(0),
+ m_lowerProfile(0),
+ m_upperProfile(0),
+ m_diagSize(0),
+ m_upperSize(0),
+ m_lowerSize(0),
+ m_upperProfileSize(0),
+ m_lowerProfileSize(0),
+ m_allocatedSize(0) {
+ *this = other;
+ }
+
+ SkylineStorage & operator=(const SkylineStorage& other) {
+ resize(other.diagSize(), other.m_upperProfileSize, other.m_lowerProfileSize, other.upperSize(), other.lowerSize());
+ memcpy(m_diag, other.m_diag, m_diagSize * sizeof (Scalar));
+ memcpy(m_upper, other.m_upper, other.upperSize() * sizeof (Scalar));
+ memcpy(m_lower, other.m_lower, other.lowerSize() * sizeof (Scalar));
+ memcpy(m_upperProfile, other.m_upperProfile, m_upperProfileSize * sizeof (Index));
+ memcpy(m_lowerProfile, other.m_lowerProfile, m_lowerProfileSize * sizeof (Index));
+ return *this;
+ }
+
+ void swap(SkylineStorage& other) {
+ std::swap(m_diag, other.m_diag);
+ std::swap(m_upper, other.m_upper);
+ std::swap(m_lower, other.m_lower);
+ std::swap(m_upperProfile, other.m_upperProfile);
+ std::swap(m_lowerProfile, other.m_lowerProfile);
+ std::swap(m_diagSize, other.m_diagSize);
+ std::swap(m_upperSize, other.m_upperSize);
+ std::swap(m_lowerSize, other.m_lowerSize);
+ std::swap(m_allocatedSize, other.m_allocatedSize);
+ }
+
+ ~SkylineStorage() {
+ delete[] m_diag;
+ delete[] m_upper;
+ if (m_upper != m_lower)
+ delete[] m_lower;
+ delete[] m_upperProfile;
+ delete[] m_lowerProfile;
+ }
+
+ void reserve(Index size, Index upperProfileSize, Index lowerProfileSize, Index upperSize, Index lowerSize) {
+ Index newAllocatedSize = size + upperSize + lowerSize;
+ if (newAllocatedSize > m_allocatedSize)
+ reallocate(size, upperProfileSize, lowerProfileSize, upperSize, lowerSize);
+ }
+
+ void squeeze() {
+ if (m_allocatedSize > m_diagSize + m_upperSize + m_lowerSize)
+ reallocate(m_diagSize, m_upperProfileSize, m_lowerProfileSize, m_upperSize, m_lowerSize);
+ }
+
+ void resize(Index diagSize, Index upperProfileSize, Index lowerProfileSize, Index upperSize, Index lowerSize, float reserveSizeFactor = 0) {
+ if (m_allocatedSize < diagSize + upperSize + lowerSize)
+ reallocate(diagSize, upperProfileSize, lowerProfileSize, upperSize + Index(reserveSizeFactor * upperSize), lowerSize + Index(reserveSizeFactor * lowerSize));
+ m_diagSize = diagSize;
+ m_upperSize = upperSize;
+ m_lowerSize = lowerSize;
+ m_upperProfileSize = upperProfileSize;
+ m_lowerProfileSize = lowerProfileSize;
+ }
+
+ inline Index diagSize() const {
+ return m_diagSize;
+ }
+
+ inline Index upperSize() const {
+ return m_upperSize;
+ }
+
+ inline Index lowerSize() const {
+ return m_lowerSize;
+ }
+
+ inline Index upperProfileSize() const {
+ return m_upperProfileSize;
+ }
+
+ inline Index lowerProfileSize() const {
+ return m_lowerProfileSize;
+ }
+
+ inline Index allocatedSize() const {
+ return m_allocatedSize;
+ }
+
+ inline void clear() {
+ m_diagSize = 0;
+ }
+
+ inline Scalar& diag(Index i) {
+ return m_diag[i];
+ }
+
+ inline const Scalar& diag(Index i) const {
+ return m_diag[i];
+ }
+
+ inline Scalar& upper(Index i) {
+ return m_upper[i];
+ }
+
+ inline const Scalar& upper(Index i) const {
+ return m_upper[i];
+ }
+
+ inline Scalar& lower(Index i) {
+ return m_lower[i];
+ }
+
+ inline const Scalar& lower(Index i) const {
+ return m_lower[i];
+ }
+
+ inline Index& upperProfile(Index i) {
+ return m_upperProfile[i];
+ }
+
+ inline const Index& upperProfile(Index i) const {
+ return m_upperProfile[i];
+ }
+
+ inline Index& lowerProfile(Index i) {
+ return m_lowerProfile[i];
+ }
+
+ inline const Index& lowerProfile(Index i) const {
+ return m_lowerProfile[i];
+ }
+
+ static SkylineStorage Map(Index* upperProfile, Index* lowerProfile, Scalar* diag, Scalar* upper, Scalar* lower, Index size, Index upperSize, Index lowerSize) {
+ SkylineStorage res;
+ res.m_upperProfile = upperProfile;
+ res.m_lowerProfile = lowerProfile;
+ res.m_diag = diag;
+ res.m_upper = upper;
+ res.m_lower = lower;
+ res.m_allocatedSize = res.m_diagSize = size;
+ res.m_upperSize = upperSize;
+ res.m_lowerSize = lowerSize;
+ return res;
+ }
+
+ inline void reset() {
+ memset(m_diag, 0, m_diagSize * sizeof (Scalar));
+ memset(m_upper, 0, m_upperSize * sizeof (Scalar));
+ memset(m_lower, 0, m_lowerSize * sizeof (Scalar));
+ memset(m_upperProfile, 0, m_diagSize * sizeof (Index));
+ memset(m_lowerProfile, 0, m_diagSize * sizeof (Index));
+ }
+
+ void prune(Scalar reference, RealScalar epsilon = dummy_precision<RealScalar>()) {
+ //TODO
+ }
+
+protected:
+
+ inline void reallocate(Index diagSize, Index upperProfileSize, Index lowerProfileSize, Index upperSize, Index lowerSize) {
+
+ Scalar* diag = new Scalar[diagSize];
+ Scalar* upper = new Scalar[upperSize];
+ Scalar* lower = new Scalar[lowerSize];
+ Index* upperProfile = new Index[upperProfileSize];
+ Index* lowerProfile = new Index[lowerProfileSize];
+
+ Index copyDiagSize = (std::min)(diagSize, m_diagSize);
+ Index copyUpperSize = (std::min)(upperSize, m_upperSize);
+ Index copyLowerSize = (std::min)(lowerSize, m_lowerSize);
+ Index copyUpperProfileSize = (std::min)(upperProfileSize, m_upperProfileSize);
+ Index copyLowerProfileSize = (std::min)(lowerProfileSize, m_lowerProfileSize);
+
+ // copy
+ memcpy(diag, m_diag, copyDiagSize * sizeof (Scalar));
+ memcpy(upper, m_upper, copyUpperSize * sizeof (Scalar));
+ memcpy(lower, m_lower, copyLowerSize * sizeof (Scalar));
+ memcpy(upperProfile, m_upperProfile, copyUpperProfileSize * sizeof (Index));
+ memcpy(lowerProfile, m_lowerProfile, copyLowerProfileSize * sizeof (Index));
+
+
+
+ // delete old stuff
+ delete[] m_diag;
+ delete[] m_upper;
+ delete[] m_lower;
+ delete[] m_upperProfile;
+ delete[] m_lowerProfile;
+ m_diag = diag;
+ m_upper = upper;
+ m_lower = lower;
+ m_upperProfile = upperProfile;
+ m_lowerProfile = lowerProfile;
+ m_allocatedSize = diagSize + upperSize + lowerSize;
+ m_upperSize = upperSize;
+ m_lowerSize = lowerSize;
+ }
+
+public:
+ Scalar* m_diag;
+ Scalar* m_upper;
+ Scalar* m_lower;
+ Index* m_upperProfile;
+ Index* m_lowerProfile;
+ Index m_diagSize;
+ Index m_upperSize;
+ Index m_lowerSize;
+ Index m_upperProfileSize;
+ Index m_lowerProfileSize;
+ Index m_allocatedSize;
+
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_SKYLINE_STORAGE_H
diff --git a/src/EigenUnsupported/src/Skyline/SkylineUtil.h b/src/EigenUnsupported/src/Skyline/SkylineUtil.h
new file mode 100644
index 0000000..75eb612
--- /dev/null
+++ b/src/EigenUnsupported/src/Skyline/SkylineUtil.h
@@ -0,0 +1,89 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Guillaume Saupin <guillaume.saupin@cea.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SKYLINEUTIL_H
+#define EIGEN_SKYLINEUTIL_H
+
+namespace Eigen {
+
+#ifdef NDEBUG
+#define EIGEN_DBG_SKYLINE(X)
+#else
+#define EIGEN_DBG_SKYLINE(X) X
+#endif
+
+const unsigned int SkylineBit = 0x1200;
+template<typename Lhs, typename Rhs, int ProductMode> class SkylineProduct;
+enum AdditionalProductEvaluationMode {SkylineTimeDenseProduct, SkylineTimeSkylineProduct, DenseTimeSkylineProduct};
+enum {IsSkyline = SkylineBit};
+
+
+#define EIGEN_SKYLINE_INHERIT_ASSIGNMENT_OPERATOR(Derived, Op) \
+template<typename OtherDerived> \
+EIGEN_STRONG_INLINE Derived& operator Op(const Eigen::SkylineMatrixBase<OtherDerived>& other) \
+{ \
+ return Base::operator Op(other.derived()); \
+} \
+EIGEN_STRONG_INLINE Derived& operator Op(const Derived& other) \
+{ \
+ return Base::operator Op(other); \
+}
+
+#define EIGEN_SKYLINE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, Op) \
+template<typename Other> \
+EIGEN_STRONG_INLINE Derived& operator Op(const Other& scalar) \
+{ \
+ return Base::operator Op(scalar); \
+}
+
+#define EIGEN_SKYLINE_INHERIT_ASSIGNMENT_OPERATORS(Derived) \
+ EIGEN_SKYLINE_INHERIT_ASSIGNMENT_OPERATOR(Derived, =) \
+ EIGEN_SKYLINE_INHERIT_ASSIGNMENT_OPERATOR(Derived, +=) \
+ EIGEN_SKYLINE_INHERIT_ASSIGNMENT_OPERATOR(Derived, -=) \
+ EIGEN_SKYLINE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, *=) \
+ EIGEN_SKYLINE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, /=)
+
+#define _EIGEN_SKYLINE_GENERIC_PUBLIC_INTERFACE(Derived, BaseClass) \
+ typedef BaseClass Base; \
+ typedef typename Eigen::internal::traits<Derived>::Scalar Scalar; \
+ typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; \
+ typedef typename Eigen::internal::traits<Derived>::StorageKind StorageKind; \
+ typedef typename Eigen::internal::index<StorageKind>::type Index; \
+ enum { Flags = Eigen::internal::traits<Derived>::Flags, };
+
+#define EIGEN_SKYLINE_GENERIC_PUBLIC_INTERFACE(Derived) \
+ _EIGEN_SKYLINE_GENERIC_PUBLIC_INTERFACE(Derived, Eigen::SkylineMatrixBase<Derived>)
+
+template<typename Derived> class SkylineMatrixBase;
+template<typename _Scalar, int _Flags = 0> class SkylineMatrix;
+template<typename _Scalar, int _Flags = 0> class DynamicSkylineMatrix;
+template<typename _Scalar, int _Flags = 0> class SkylineVector;
+template<typename _Scalar, int _Flags = 0> class MappedSkylineMatrix;
+
+namespace internal {
+
+template<typename Lhs, typename Rhs> struct skyline_product_mode;
+template<typename Lhs, typename Rhs, int ProductMode = skyline_product_mode<Lhs,Rhs>::value> struct SkylineProductReturnType;
+
+template<typename T> class eval<T,IsSkyline>
+{
+ typedef typename traits<T>::Scalar _Scalar;
+ enum {
+ _Flags = traits<T>::Flags
+ };
+
+ public:
+ typedef SkylineMatrix<_Scalar, _Flags> type;
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SKYLINEUTIL_H
diff --git a/src/EigenUnsupported/src/SparseExtra/BlockOfDynamicSparseMatrix.h b/src/EigenUnsupported/src/SparseExtra/BlockOfDynamicSparseMatrix.h
new file mode 100644
index 0000000..e9ec746
--- /dev/null
+++ b/src/EigenUnsupported/src/SparseExtra/BlockOfDynamicSparseMatrix.h
@@ -0,0 +1,122 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSE_BLOCKFORDYNAMICMATRIX_H
+#define EIGEN_SPARSE_BLOCKFORDYNAMICMATRIX_H
+
+namespace Eigen {
+
+#if 0
+
+// NOTE Have to be reimplemented as a specialization of BlockImpl< DynamicSparseMatrix<_Scalar, _Options, _Index>, ... >
+// See SparseBlock.h for an example
+
+
+/***************************************************************************
+* specialisation for DynamicSparseMatrix
+***************************************************************************/
+
+template<typename _Scalar, int _Options, typename _Index, int Size>
+class SparseInnerVectorSet<DynamicSparseMatrix<_Scalar, _Options, _Index>, Size>
+ : public SparseMatrixBase<SparseInnerVectorSet<DynamicSparseMatrix<_Scalar, _Options, _Index>, Size> >
+{
+ typedef DynamicSparseMatrix<_Scalar, _Options, _Index> MatrixType;
+ public:
+
+ enum { IsRowMajor = internal::traits<SparseInnerVectorSet>::IsRowMajor };
+
+ EIGEN_SPARSE_PUBLIC_INTERFACE(SparseInnerVectorSet)
+ class InnerIterator: public MatrixType::InnerIterator
+ {
+ public:
+ inline InnerIterator(const SparseInnerVectorSet& xpr, Index outer)
+ : MatrixType::InnerIterator(xpr.m_matrix, xpr.m_outerStart + outer), m_outer(outer)
+ {}
+ inline Index row() const { return IsRowMajor ? m_outer : this->index(); }
+ inline Index col() const { return IsRowMajor ? this->index() : m_outer; }
+ protected:
+ Index m_outer;
+ };
+
+ inline SparseInnerVectorSet(const MatrixType& matrix, Index outerStart, Index outerSize)
+ : m_matrix(matrix), m_outerStart(outerStart), m_outerSize(outerSize)
+ {
+ eigen_assert( (outerStart>=0) && ((outerStart+outerSize)<=matrix.outerSize()) );
+ }
+
+ inline SparseInnerVectorSet(const MatrixType& matrix, Index outer)
+ : m_matrix(matrix), m_outerStart(outer), m_outerSize(Size)
+ {
+ eigen_assert(Size!=Dynamic);
+ eigen_assert( (outer>=0) && (outer<matrix.outerSize()) );
+ }
+
+ template<typename OtherDerived>
+ inline SparseInnerVectorSet& operator=(const SparseMatrixBase<OtherDerived>& other)
+ {
+ if (IsRowMajor != ((OtherDerived::Flags&RowMajorBit)==RowMajorBit))
+ {
+ // need to transpose => perform a block evaluation followed by a big swap
+ DynamicSparseMatrix<Scalar,IsRowMajor?RowMajorBit:0> aux(other);
+ *this = aux.markAsRValue();
+ }
+ else
+ {
+ // evaluate/copy vector per vector
+ for (Index j=0; j<m_outerSize.value(); ++j)
+ {
+ SparseVector<Scalar,IsRowMajor ? RowMajorBit : 0> aux(other.innerVector(j));
+ m_matrix.const_cast_derived()._data()[m_outerStart+j].swap(aux._data());
+ }
+ }
+ return *this;
+ }
+
+ inline SparseInnerVectorSet& operator=(const SparseInnerVectorSet& other)
+ {
+ return operator=<SparseInnerVectorSet>(other);
+ }
+
+ Index nonZeros() const
+ {
+ Index count = 0;
+ for (Index j=0; j<m_outerSize.value(); ++j)
+ count += m_matrix._data()[m_outerStart+j].size();
+ return count;
+ }
+
+ const Scalar& lastCoeff() const
+ {
+ EIGEN_STATIC_ASSERT_VECTOR_ONLY(SparseInnerVectorSet);
+ eigen_assert(m_matrix.data()[m_outerStart].size()>0);
+ return m_matrix.data()[m_outerStart].vale(m_matrix.data()[m_outerStart].size()-1);
+ }
+
+// template<typename Sparse>
+// inline SparseInnerVectorSet& operator=(const SparseMatrixBase<OtherDerived>& other)
+// {
+// return *this;
+// }
+
+ EIGEN_STRONG_INLINE Index rows() const { return IsRowMajor ? m_outerSize.value() : m_matrix.rows(); }
+ EIGEN_STRONG_INLINE Index cols() const { return IsRowMajor ? m_matrix.cols() : m_outerSize.value(); }
+
+ protected:
+
+ const typename MatrixType::Nested m_matrix;
+ Index m_outerStart;
+ const internal::variable_if_dynamic<Index, Size> m_outerSize;
+
+};
+
+#endif
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSE_BLOCKFORDYNAMICMATRIX_H
diff --git a/src/EigenUnsupported/src/SparseExtra/BlockSparseMatrix.h b/src/EigenUnsupported/src/SparseExtra/BlockSparseMatrix.h
new file mode 100644
index 0000000..536a0c3
--- /dev/null
+++ b/src/EigenUnsupported/src/SparseExtra/BlockSparseMatrix.h
@@ -0,0 +1,1079 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Desire Nuentsa <desire.nuentsa_wakam@inria.fr>
+// Copyright (C) 2013 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSEBLOCKMATRIX_H
+#define EIGEN_SPARSEBLOCKMATRIX_H
+
+namespace Eigen {
+/** \ingroup SparseCore_Module
+ *
+ * \class BlockSparseMatrix
+ *
+ * \brief A versatile sparse matrix representation where each element is a block
+ *
+ * This class provides routines to manipulate block sparse matrices stored in a
+ * BSR-like representation. There are two main types :
+ *
+ * 1. All blocks have the same number of rows and columns, called block size
+ * in the following. In this case, if this block size is known at compile time,
+ * it can be given as a template parameter like
+ * \code
+ * BlockSparseMatrix<Scalar, 3, ColMajor> bmat(b_rows, b_cols);
+ * \endcode
+ * Here, bmat is a b_rows x b_cols block sparse matrix
+ * where each coefficient is a 3x3 dense matrix.
+ * If the block size is fixed but will be given at runtime,
+ * \code
+ * BlockSparseMatrix<Scalar, Dynamic, ColMajor> bmat(b_rows, b_cols);
+ * bmat.setBlockSize(block_size);
+ * \endcode
+ *
+ * 2. The second case is for variable-block sparse matrices.
+ * Here each block has its own dimensions. The only restriction is that all the blocks
+ * in a row (resp. a column) should have the same number of rows (resp. of columns).
+ * It is thus required in this case to describe the layout of the matrix by calling
+ * setBlockLayout(rowBlocks, colBlocks).
+ *
+ * In any of the previous case, the matrix can be filled by calling setFromTriplets().
+ * A regular sparse matrix can be converted to a block sparse matrix and vice versa.
+ * It is obviously required to describe the block layout beforehand by calling either
+ * setBlockSize() for fixed-size blocks or setBlockLayout for variable-size blocks.
+ *
+ * \tparam _Scalar The Scalar type
+ * \tparam _BlockAtCompileTime The block layout option. It takes the following values
+ * Dynamic : block size known at runtime
+ * a numeric number : fixed-size block known at compile time
+ */
+template<typename _Scalar, int _BlockAtCompileTime=Dynamic, int _Options=ColMajor, typename _StorageIndex=int> class BlockSparseMatrix;
+
+template<typename BlockSparseMatrixT> class BlockSparseMatrixView;
+
+namespace internal {
+template<typename _Scalar, int _BlockAtCompileTime, int _Options, typename _Index>
+struct traits<BlockSparseMatrix<_Scalar,_BlockAtCompileTime,_Options, _Index> >
+{
+ typedef _Scalar Scalar;
+ typedef _Index Index;
+ typedef Sparse StorageKind; // FIXME Where is it used ??
+ typedef MatrixXpr XprKind;
+ enum {
+ RowsAtCompileTime = Dynamic,
+ ColsAtCompileTime = Dynamic,
+ MaxRowsAtCompileTime = Dynamic,
+ MaxColsAtCompileTime = Dynamic,
+ BlockSize = _BlockAtCompileTime,
+ Flags = _Options | NestByRefBit | LvalueBit,
+ CoeffReadCost = NumTraits<Scalar>::ReadCost,
+ SupportedAccessPatterns = InnerRandomAccessPattern
+ };
+};
+template<typename BlockSparseMatrixT>
+struct traits<BlockSparseMatrixView<BlockSparseMatrixT> >
+{
+ typedef Ref<Matrix<typename BlockSparseMatrixT::Scalar, BlockSparseMatrixT::BlockSize, BlockSparseMatrixT::BlockSize> > Scalar;
+ typedef Ref<Matrix<typename BlockSparseMatrixT::RealScalar, BlockSparseMatrixT::BlockSize, BlockSparseMatrixT::BlockSize> > RealScalar;
+
+};
+
+// Function object to sort a triplet list
+template<typename Iterator, bool IsColMajor>
+struct TripletComp
+{
+ typedef typename Iterator::value_type Triplet;
+ bool operator()(const Triplet& a, const Triplet& b)
+ { if(IsColMajor)
+ return ((a.col() == b.col() && a.row() < b.row()) || (a.col() < b.col()));
+ else
+ return ((a.row() == b.row() && a.col() < b.col()) || (a.row() < b.row()));
+ }
+};
+} // end namespace internal
+
+
+/* Proxy to view the block sparse matrix as a regular sparse matrix */
+template<typename BlockSparseMatrixT>
+class BlockSparseMatrixView : public SparseMatrixBase<BlockSparseMatrixT>
+{
+ public:
+ typedef Ref<typename BlockSparseMatrixT::BlockScalar> Scalar;
+ typedef Ref<typename BlockSparseMatrixT::BlockRealScalar> RealScalar;
+ typedef typename BlockSparseMatrixT::Index Index;
+ typedef BlockSparseMatrixT Nested;
+ enum {
+ Flags = BlockSparseMatrixT::Options,
+ Options = BlockSparseMatrixT::Options,
+ RowsAtCompileTime = BlockSparseMatrixT::RowsAtCompileTime,
+ ColsAtCompileTime = BlockSparseMatrixT::ColsAtCompileTime,
+ MaxColsAtCompileTime = BlockSparseMatrixT::MaxColsAtCompileTime,
+ MaxRowsAtCompileTime = BlockSparseMatrixT::MaxRowsAtCompileTime
+ };
+ public:
+ BlockSparseMatrixView(const BlockSparseMatrixT& spblockmat)
+ : m_spblockmat(spblockmat)
+ {}
+
+ Index outerSize() const
+ {
+ return (Flags&RowMajorBit) == 1 ? this->rows() : this->cols();
+ }
+ Index cols() const
+ {
+ return m_spblockmat.blockCols();
+ }
+ Index rows() const
+ {
+ return m_spblockmat.blockRows();
+ }
+ Scalar coeff(Index row, Index col)
+ {
+ return m_spblockmat.coeff(row, col);
+ }
+ Scalar coeffRef(Index row, Index col)
+ {
+ return m_spblockmat.coeffRef(row, col);
+ }
+ // Wrapper to iterate over all blocks
+ class InnerIterator : public BlockSparseMatrixT::BlockInnerIterator
+ {
+ public:
+ InnerIterator(const BlockSparseMatrixView& mat, Index outer)
+ : BlockSparseMatrixT::BlockInnerIterator(mat.m_spblockmat, outer)
+ {}
+
+ };
+
+ protected:
+ const BlockSparseMatrixT& m_spblockmat;
+};
+
+// Proxy to view a regular vector as a block vector
+template<typename BlockSparseMatrixT, typename VectorType>
+class BlockVectorView
+{
+ public:
+ enum {
+ BlockSize = BlockSparseMatrixT::BlockSize,
+ ColsAtCompileTime = VectorType::ColsAtCompileTime,
+ RowsAtCompileTime = VectorType::RowsAtCompileTime,
+ Flags = VectorType::Flags
+ };
+ typedef Ref<const Matrix<typename BlockSparseMatrixT::Scalar, (RowsAtCompileTime==1)? 1 : BlockSize, (ColsAtCompileTime==1)? 1 : BlockSize> >Scalar;
+ typedef typename BlockSparseMatrixT::Index Index;
+ public:
+ BlockVectorView(const BlockSparseMatrixT& spblockmat, const VectorType& vec)
+ : m_spblockmat(spblockmat),m_vec(vec)
+ { }
+ inline Index cols() const
+ {
+ return m_vec.cols();
+ }
+ inline Index size() const
+ {
+ return m_spblockmat.blockRows();
+ }
+ inline Scalar coeff(Index bi) const
+ {
+ Index startRow = m_spblockmat.blockRowsIndex(bi);
+ Index rowSize = m_spblockmat.blockRowsIndex(bi+1) - startRow;
+ return m_vec.middleRows(startRow, rowSize);
+ }
+ inline Scalar coeff(Index bi, Index j) const
+ {
+ Index startRow = m_spblockmat.blockRowsIndex(bi);
+ Index rowSize = m_spblockmat.blockRowsIndex(bi+1) - startRow;
+ return m_vec.block(startRow, j, rowSize, 1);
+ }
+ protected:
+ const BlockSparseMatrixT& m_spblockmat;
+ const VectorType& m_vec;
+};
+
+template<typename VectorType, typename Index> class BlockVectorReturn;
+
+
+// Proxy to view a regular vector as a block vector
+template<typename BlockSparseMatrixT, typename VectorType>
+class BlockVectorReturn
+{
+ public:
+ enum {
+ ColsAtCompileTime = VectorType::ColsAtCompileTime,
+ RowsAtCompileTime = VectorType::RowsAtCompileTime,
+ Flags = VectorType::Flags
+ };
+ typedef Ref<Matrix<typename VectorType::Scalar, RowsAtCompileTime, ColsAtCompileTime> > Scalar;
+ typedef typename BlockSparseMatrixT::Index Index;
+ public:
+ BlockVectorReturn(const BlockSparseMatrixT& spblockmat, VectorType& vec)
+ : m_spblockmat(spblockmat),m_vec(vec)
+ { }
+ inline Index size() const
+ {
+ return m_spblockmat.blockRows();
+ }
+ inline Scalar coeffRef(Index bi)
+ {
+ Index startRow = m_spblockmat.blockRowsIndex(bi);
+ Index rowSize = m_spblockmat.blockRowsIndex(bi+1) - startRow;
+ return m_vec.middleRows(startRow, rowSize);
+ }
+ inline Scalar coeffRef(Index bi, Index j)
+ {
+ Index startRow = m_spblockmat.blockRowsIndex(bi);
+ Index rowSize = m_spblockmat.blockRowsIndex(bi+1) - startRow;
+ return m_vec.block(startRow, j, rowSize, 1);
+ }
+
+ protected:
+ const BlockSparseMatrixT& m_spblockmat;
+ VectorType& m_vec;
+};
+
+// Block version of the sparse dense product
+template<typename Lhs, typename Rhs>
+class BlockSparseTimeDenseProduct;
+
+namespace internal {
+
+template<typename BlockSparseMatrixT, typename VecType>
+struct traits<BlockSparseTimeDenseProduct<BlockSparseMatrixT, VecType> >
+{
+ typedef Dense StorageKind;
+ typedef MatrixXpr XprKind;
+ typedef typename BlockSparseMatrixT::Scalar Scalar;
+ typedef typename BlockSparseMatrixT::Index Index;
+ enum {
+ RowsAtCompileTime = Dynamic,
+ ColsAtCompileTime = Dynamic,
+ MaxRowsAtCompileTime = Dynamic,
+ MaxColsAtCompileTime = Dynamic,
+ Flags = 0,
+ CoeffReadCost = internal::traits<BlockSparseMatrixT>::CoeffReadCost
+ };
+};
+} // end namespace internal
+
+template<typename Lhs, typename Rhs>
+class BlockSparseTimeDenseProduct
+ : public ProductBase<BlockSparseTimeDenseProduct<Lhs,Rhs>, Lhs, Rhs>
+{
+ public:
+ EIGEN_PRODUCT_PUBLIC_INTERFACE(BlockSparseTimeDenseProduct)
+
+ BlockSparseTimeDenseProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
+ {}
+
+ template<typename Dest> void scaleAndAddTo(Dest& dest, const typename Rhs::Scalar& alpha) const
+ {
+ BlockVectorReturn<Lhs,Dest> tmpDest(m_lhs, dest);
+ internal::sparse_time_dense_product( BlockSparseMatrixView<Lhs>(m_lhs), BlockVectorView<Lhs, Rhs>(m_lhs, m_rhs), tmpDest, alpha);
+ }
+
+ private:
+ BlockSparseTimeDenseProduct& operator=(const BlockSparseTimeDenseProduct&);
+};
+
+template<typename _Scalar, int _BlockAtCompileTime, int _Options, typename _StorageIndex>
+class BlockSparseMatrix : public SparseMatrixBase<BlockSparseMatrix<_Scalar,_BlockAtCompileTime, _Options,_StorageIndex> >
+{
+ public:
+ typedef _Scalar Scalar;
+ typedef typename NumTraits<Scalar>::Real RealScalar;
+ typedef _StorageIndex StorageIndex;
+ typedef typename internal::ref_selector<BlockSparseMatrix<_Scalar, _BlockAtCompileTime, _Options, _StorageIndex> >::type Nested;
+
+ enum {
+ Options = _Options,
+ Flags = Options,
+ BlockSize=_BlockAtCompileTime,
+ RowsAtCompileTime = Dynamic,
+ ColsAtCompileTime = Dynamic,
+ MaxRowsAtCompileTime = Dynamic,
+ MaxColsAtCompileTime = Dynamic,
+ IsVectorAtCompileTime = 0,
+ IsColMajor = Flags&RowMajorBit ? 0 : 1
+ };
+ typedef Matrix<Scalar, _BlockAtCompileTime, _BlockAtCompileTime,IsColMajor ? ColMajor : RowMajor> BlockScalar;
+ typedef Matrix<RealScalar, _BlockAtCompileTime, _BlockAtCompileTime,IsColMajor ? ColMajor : RowMajor> BlockRealScalar;
+ typedef typename internal::conditional<_BlockAtCompileTime==Dynamic, Scalar, BlockScalar>::type BlockScalarReturnType;
+ typedef BlockSparseMatrix<Scalar, BlockSize, IsColMajor ? ColMajor : RowMajor, StorageIndex> PlainObject;
+ public:
+ // Default constructor
+ BlockSparseMatrix()
+ : m_innerBSize(0),m_outerBSize(0),m_innerOffset(0),m_outerOffset(0),
+ m_nonzerosblocks(0),m_values(0),m_blockPtr(0),m_indices(0),
+ m_outerIndex(0),m_blockSize(BlockSize)
+ { }
+
+
+ /**
+ * \brief Construct and resize
+ *
+ */
+ BlockSparseMatrix(Index brow, Index bcol)
+ : m_innerBSize(IsColMajor ? brow : bcol),
+ m_outerBSize(IsColMajor ? bcol : brow),
+ m_innerOffset(0),m_outerOffset(0),m_nonzerosblocks(0),
+ m_values(0),m_blockPtr(0),m_indices(0),
+ m_outerIndex(0),m_blockSize(BlockSize)
+ { }
+
+ /**
+ * \brief Copy-constructor
+ */
+ BlockSparseMatrix(const BlockSparseMatrix& other)
+ : m_innerBSize(other.m_innerBSize),m_outerBSize(other.m_outerBSize),
+ m_nonzerosblocks(other.m_nonzerosblocks),m_nonzeros(other.m_nonzeros),
+ m_blockPtr(0),m_blockSize(other.m_blockSize)
+ {
+ // should we allow copying between variable-size blocks and fixed-size blocks ??
+ eigen_assert(m_blockSize == BlockSize && " CAN NOT COPY BETWEEN FIXED-SIZE AND VARIABLE-SIZE BLOCKS");
+
+ std::copy(other.m_innerOffset, other.m_innerOffset+m_innerBSize+1, m_innerOffset);
+ std::copy(other.m_outerOffset, other.m_outerOffset+m_outerBSize+1, m_outerOffset);
+ std::copy(other.m_values, other.m_values+m_nonzeros, m_values);
+
+ if(m_blockSize != Dynamic)
+ std::copy(other.m_blockPtr, other.m_blockPtr+m_nonzerosblocks, m_blockPtr);
+
+ std::copy(other.m_indices, other.m_indices+m_nonzerosblocks, m_indices);
+ std::copy(other.m_outerIndex, other.m_outerIndex+m_outerBSize, m_outerIndex);
+ }
+
+ friend void swap(BlockSparseMatrix& first, BlockSparseMatrix& second)
+ {
+ std::swap(first.m_innerBSize, second.m_innerBSize);
+ std::swap(first.m_outerBSize, second.m_outerBSize);
+ std::swap(first.m_innerOffset, second.m_innerOffset);
+ std::swap(first.m_outerOffset, second.m_outerOffset);
+ std::swap(first.m_nonzerosblocks, second.m_nonzerosblocks);
+ std::swap(first.m_nonzeros, second.m_nonzeros);
+ std::swap(first.m_values, second.m_values);
+ std::swap(first.m_blockPtr, second.m_blockPtr);
+ std::swap(first.m_indices, second.m_indices);
+ std::swap(first.m_outerIndex, second.m_outerIndex);
+ std::swap(first.m_BlockSize, second.m_blockSize);
+ }
+
+ BlockSparseMatrix& operator=(BlockSparseMatrix other)
+ {
+ //Copy-and-swap paradigm ... avoid leaked data if thrown
+ swap(*this, other);
+ return *this;
+ }
+
+ // Destructor
+ ~BlockSparseMatrix()
+ {
+ delete[] m_outerIndex;
+ delete[] m_innerOffset;
+ delete[] m_outerOffset;
+ delete[] m_indices;
+ delete[] m_blockPtr;
+ delete[] m_values;
+ }
+
+
+ /**
+ * \brief Constructor from a sparse matrix
+ *
+ */
+ template<typename MatrixType>
+ inline BlockSparseMatrix(const MatrixType& spmat) : m_blockSize(BlockSize)
+ {
+ EIGEN_STATIC_ASSERT((m_blockSize != Dynamic), THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE);
+
+ *this = spmat;
+ }
+
+ /**
+ * \brief Assignment from a sparse matrix with the same storage order
+ *
+ * Convert from a sparse matrix to block sparse matrix.
+ * \warning Before calling this function, tt is necessary to call
+ * either setBlockLayout() (matrices with variable-size blocks)
+ * or setBlockSize() (for fixed-size blocks).
+ */
+ template<typename MatrixType>
+ inline BlockSparseMatrix& operator=(const MatrixType& spmat)
+ {
+ eigen_assert((m_innerBSize != 0 && m_outerBSize != 0)
+ && "Trying to assign to a zero-size matrix, call resize() first");
+ eigen_assert(((MatrixType::Options&RowMajorBit) != IsColMajor) && "Wrong storage order");
+ typedef SparseMatrix<bool,MatrixType::Options,typename MatrixType::Index> MatrixPatternType;
+ MatrixPatternType blockPattern(blockRows(), blockCols());
+ m_nonzeros = 0;
+
+ // First, compute the number of nonzero blocks and their locations
+ for(StorageIndex bj = 0; bj < m_outerBSize; ++bj)
+ {
+ // Browse each outer block and compute the structure
+ std::vector<bool> nzblocksFlag(m_innerBSize,false); // Record the existing blocks
+ blockPattern.startVec(bj);
+ for(StorageIndex j = blockOuterIndex(bj); j < blockOuterIndex(bj+1); ++j)
+ {
+ typename MatrixType::InnerIterator it_spmat(spmat, j);
+ for(; it_spmat; ++it_spmat)
+ {
+ StorageIndex bi = innerToBlock(it_spmat.index()); // Index of the current nonzero block
+ if(!nzblocksFlag[bi])
+ {
+ // Save the index of this nonzero block
+ nzblocksFlag[bi] = true;
+ blockPattern.insertBackByOuterInnerUnordered(bj, bi) = true;
+ // Compute the total number of nonzeros (including explicit zeros in blocks)
+ m_nonzeros += blockOuterSize(bj) * blockInnerSize(bi);
+ }
+ }
+ } // end current outer block
+ }
+ blockPattern.finalize();
+
+ // Allocate the internal arrays
+ setBlockStructure(blockPattern);
+
+ for(StorageIndex nz = 0; nz < m_nonzeros; ++nz) m_values[nz] = Scalar(0);
+ for(StorageIndex bj = 0; bj < m_outerBSize; ++bj)
+ {
+ // Now copy the values
+ for(StorageIndex j = blockOuterIndex(bj); j < blockOuterIndex(bj+1); ++j)
+ {
+ // Browse the outer block column by column (for column-major matrices)
+ typename MatrixType::InnerIterator it_spmat(spmat, j);
+ for(; it_spmat; ++it_spmat)
+ {
+ StorageIndex idx = 0; // Position of this block in the column block
+ StorageIndex bi = innerToBlock(it_spmat.index()); // Index of the current nonzero block
+ // Go to the inner block where this element belongs to
+ while(bi > m_indices[m_outerIndex[bj]+idx]) ++idx; // Not expensive for ordered blocks
+ StorageIndex idxVal;// Get the right position in the array of values for this element
+ if(m_blockSize == Dynamic)
+ {
+ // Offset from all blocks before ...
+ idxVal = m_blockPtr[m_outerIndex[bj]+idx];
+ // ... and offset inside the block
+ idxVal += (j - blockOuterIndex(bj)) * blockOuterSize(bj) + it_spmat.index() - m_innerOffset[bi];
+ }
+ else
+ {
+ // All blocks before
+ idxVal = (m_outerIndex[bj] + idx) * m_blockSize * m_blockSize;
+ // inside the block
+ idxVal += (j - blockOuterIndex(bj)) * m_blockSize + (it_spmat.index()%m_blockSize);
+ }
+ // Insert the value
+ m_values[idxVal] = it_spmat.value();
+ } // end of this column
+ } // end of this block
+ } // end of this outer block
+
+ return *this;
+ }
+
+ /**
+ * \brief Set the nonzero block pattern of the matrix
+ *
+ * Given a sparse matrix describing the nonzero block pattern,
+ * this function prepares the internal pointers for values.
+ * After calling this function, any *nonzero* block (bi, bj) can be set
+ * with a simple call to coeffRef(bi,bj).
+ *
+ *
+ * \warning Before calling this function, tt is necessary to call
+ * either setBlockLayout() (matrices with variable-size blocks)
+ * or setBlockSize() (for fixed-size blocks).
+ *
+ * \param blockPattern Sparse matrix of boolean elements describing the block structure
+ *
+ * \sa setBlockLayout() \sa setBlockSize()
+ */
+ template<typename MatrixType>
+ void setBlockStructure(const MatrixType& blockPattern)
+ {
+ resize(blockPattern.rows(), blockPattern.cols());
+ reserve(blockPattern.nonZeros());
+
+ // Browse the block pattern and set up the various pointers
+ m_outerIndex[0] = 0;
+ if(m_blockSize == Dynamic) m_blockPtr[0] = 0;
+ for(StorageIndex nz = 0; nz < m_nonzeros; ++nz) m_values[nz] = Scalar(0);
+ for(StorageIndex bj = 0; bj < m_outerBSize; ++bj)
+ {
+ //Browse each outer block
+
+ //First, copy and save the indices of nonzero blocks
+ //FIXME : find a way to avoid this ...
+ std::vector<int> nzBlockIdx;
+ typename MatrixType::InnerIterator it(blockPattern, bj);
+ for(; it; ++it)
+ {
+ nzBlockIdx.push_back(it.index());
+ }
+ std::sort(nzBlockIdx.begin(), nzBlockIdx.end());
+
+ // Now, fill block indices and (eventually) pointers to blocks
+ for(StorageIndex idx = 0; idx < nzBlockIdx.size(); ++idx)
+ {
+ StorageIndex offset = m_outerIndex[bj]+idx; // offset in m_indices
+ m_indices[offset] = nzBlockIdx[idx];
+ if(m_blockSize == Dynamic)
+ m_blockPtr[offset] = m_blockPtr[offset-1] + blockInnerSize(nzBlockIdx[idx]) * blockOuterSize(bj);
+ // There is no blockPtr for fixed-size blocks... not needed !???
+ }
+ // Save the pointer to the next outer block
+ m_outerIndex[bj+1] = m_outerIndex[bj] + nzBlockIdx.size();
+ }
+ }
+
+ /**
+ * \brief Set the number of rows and columns blocks
+ */
+ inline void resize(Index brow, Index bcol)
+ {
+ m_innerBSize = IsColMajor ? brow : bcol;
+ m_outerBSize = IsColMajor ? bcol : brow;
+ }
+
+ /**
+ * \brief set the block size at runtime for fixed-size block layout
+ *
+ * Call this only for fixed-size blocks
+ */
+ inline void setBlockSize(Index blockSize)
+ {
+ m_blockSize = blockSize;
+ }
+
+ /**
+ * \brief Set the row and column block layouts,
+ *
+ * This function set the size of each row and column block.
+ * So this function should be used only for blocks with variable size.
+ * \param rowBlocks : Number of rows per row block
+ * \param colBlocks : Number of columns per column block
+ * \sa resize(), setBlockSize()
+ */
+ inline void setBlockLayout(const VectorXi& rowBlocks, const VectorXi& colBlocks)
+ {
+ const VectorXi& innerBlocks = IsColMajor ? rowBlocks : colBlocks;
+ const VectorXi& outerBlocks = IsColMajor ? colBlocks : rowBlocks;
+ eigen_assert(m_innerBSize == innerBlocks.size() && "CHECK THE NUMBER OF ROW OR COLUMN BLOCKS");
+ eigen_assert(m_outerBSize == outerBlocks.size() && "CHECK THE NUMBER OF ROW OR COLUMN BLOCKS");
+ m_outerBSize = outerBlocks.size();
+ // starting index of blocks... cumulative sums
+ m_innerOffset = new StorageIndex[m_innerBSize+1];
+ m_outerOffset = new StorageIndex[m_outerBSize+1];
+ m_innerOffset[0] = 0;
+ m_outerOffset[0] = 0;
+ std::partial_sum(&innerBlocks[0], &innerBlocks[m_innerBSize-1]+1, &m_innerOffset[1]);
+ std::partial_sum(&outerBlocks[0], &outerBlocks[m_outerBSize-1]+1, &m_outerOffset[1]);
+
+ // Compute the total number of nonzeros
+ m_nonzeros = 0;
+ for(StorageIndex bj = 0; bj < m_outerBSize; ++bj)
+ for(StorageIndex bi = 0; bi < m_innerBSize; ++bi)
+ m_nonzeros += outerBlocks[bj] * innerBlocks[bi];
+
+ }
+
+ /**
+ * \brief Allocate the internal array of pointers to blocks and their inner indices
+ *
+ * \note For fixed-size blocks, call setBlockSize() to set the block.
+ * And For variable-size blocks, call setBlockLayout() before using this function
+ *
+ * \param nonzerosblocks Number of nonzero blocks. The total number of nonzeros is
+ * is computed in setBlockLayout() for variable-size blocks
+ * \sa setBlockSize()
+ */
+ inline void reserve(const Index nonzerosblocks)
+ {
+ eigen_assert((m_innerBSize != 0 && m_outerBSize != 0) &&
+ "TRYING TO RESERVE ZERO-SIZE MATRICES, CALL resize() first");
+
+ //FIXME Should free if already allocated
+ m_outerIndex = new StorageIndex[m_outerBSize+1];
+
+ m_nonzerosblocks = nonzerosblocks;
+ if(m_blockSize != Dynamic)
+ {
+ m_nonzeros = nonzerosblocks * (m_blockSize * m_blockSize);
+ m_blockPtr = 0;
+ }
+ else
+ {
+ // m_nonzeros is already computed in setBlockLayout()
+ m_blockPtr = new StorageIndex[m_nonzerosblocks+1];
+ }
+ m_indices = new StorageIndex[m_nonzerosblocks+1];
+ m_values = new Scalar[m_nonzeros];
+ }
+
+
+ /**
+ * \brief Fill values in a matrix from a triplet list.
+ *
+ * Each triplet item has a block stored in an Eigen dense matrix.
+ * The InputIterator class should provide the functions row(), col() and value()
+ *
+ * \note For fixed-size blocks, call setBlockSize() before this function.
+ *
+ * FIXME Do not accept duplicates
+ */
+ template<typename InputIterator>
+ void setFromTriplets(const InputIterator& begin, const InputIterator& end)
+ {
+ eigen_assert((m_innerBSize!=0 && m_outerBSize !=0) && "ZERO BLOCKS, PLEASE CALL resize() before");
+
+ /* First, sort the triplet list
+ * FIXME This can be unnecessarily expensive since only the inner indices have to be sorted
+ * The best approach is like in SparseMatrix::setFromTriplets()
+ */
+ internal::TripletComp<InputIterator, IsColMajor> tripletcomp;
+ std::sort(begin, end, tripletcomp);
+
+ /* Count the number of rows and column blocks,
+ * and the number of nonzero blocks per outer dimension
+ */
+ VectorXi rowBlocks(m_innerBSize); // Size of each block row
+ VectorXi colBlocks(m_outerBSize); // Size of each block column
+ rowBlocks.setZero(); colBlocks.setZero();
+ VectorXi nzblock_outer(m_outerBSize); // Number of nz blocks per outer vector
+ VectorXi nz_outer(m_outerBSize); // Number of nz per outer vector...for variable-size blocks
+ nzblock_outer.setZero();
+ nz_outer.setZero();
+ for(InputIterator it(begin); it !=end; ++it)
+ {
+ eigen_assert(it->row() >= 0 && it->row() < this->blockRows() && it->col() >= 0 && it->col() < this->blockCols());
+ eigen_assert((it->value().rows() == it->value().cols() && (it->value().rows() == m_blockSize))
+ || (m_blockSize == Dynamic));
+
+ if(m_blockSize == Dynamic)
+ {
+ eigen_assert((rowBlocks[it->row()] == 0 || rowBlocks[it->row()] == it->value().rows()) &&
+ "NON CORRESPONDING SIZES FOR ROW BLOCKS");
+ eigen_assert((colBlocks[it->col()] == 0 || colBlocks[it->col()] == it->value().cols()) &&
+ "NON CORRESPONDING SIZES FOR COLUMN BLOCKS");
+ rowBlocks[it->row()] =it->value().rows();
+ colBlocks[it->col()] = it->value().cols();
+ }
+ nz_outer(IsColMajor ? it->col() : it->row()) += it->value().rows() * it->value().cols();
+ nzblock_outer(IsColMajor ? it->col() : it->row())++;
+ }
+ // Allocate member arrays
+ if(m_blockSize == Dynamic) setBlockLayout(rowBlocks, colBlocks);
+ StorageIndex nzblocks = nzblock_outer.sum();
+ reserve(nzblocks);
+
+ // Temporary markers
+ VectorXi block_id(m_outerBSize); // To be used as a block marker during insertion
+
+ // Setup outer index pointers and markers
+ m_outerIndex[0] = 0;
+ if (m_blockSize == Dynamic) m_blockPtr[0] = 0;
+ for(StorageIndex bj = 0; bj < m_outerBSize; ++bj)
+ {
+ m_outerIndex[bj+1] = m_outerIndex[bj] + nzblock_outer(bj);
+ block_id(bj) = m_outerIndex[bj];
+ if(m_blockSize==Dynamic)
+ {
+ m_blockPtr[m_outerIndex[bj+1]] = m_blockPtr[m_outerIndex[bj]] + nz_outer(bj);
+ }
+ }
+
+ // Fill the matrix
+ for(InputIterator it(begin); it!=end; ++it)
+ {
+ StorageIndex outer = IsColMajor ? it->col() : it->row();
+ StorageIndex inner = IsColMajor ? it->row() : it->col();
+ m_indices[block_id(outer)] = inner;
+ StorageIndex block_size = it->value().rows()*it->value().cols();
+ StorageIndex nz_marker = blockPtr(block_id[outer]);
+ memcpy(&(m_values[nz_marker]), it->value().data(), block_size * sizeof(Scalar));
+ if(m_blockSize == Dynamic)
+ {
+ m_blockPtr[block_id(outer)+1] = m_blockPtr[block_id(outer)] + block_size;
+ }
+ block_id(outer)++;
+ }
+
+ // An alternative when the outer indices are sorted...no need to use an array of markers
+// for(Index bcol = 0; bcol < m_outerBSize; ++bcol)
+// {
+// Index id = 0, id_nz = 0, id_nzblock = 0;
+// for(InputIterator it(begin); it!=end; ++it)
+// {
+// while (id<bcol) // one pass should do the job unless there are empty columns
+// {
+// id++;
+// m_outerIndex[id+1]=m_outerIndex[id];
+// }
+// m_outerIndex[id+1] += 1;
+// m_indices[id_nzblock]=brow;
+// Index block_size = it->value().rows()*it->value().cols();
+// m_blockPtr[id_nzblock+1] = m_blockPtr[id_nzblock] + block_size;
+// id_nzblock++;
+// memcpy(&(m_values[id_nz]),it->value().data(), block_size*sizeof(Scalar));
+// id_nz += block_size;
+// }
+// while(id < m_outerBSize-1) // Empty columns at the end
+// {
+// id++;
+// m_outerIndex[id+1]=m_outerIndex[id];
+// }
+// }
+ }
+
+
+ /**
+ * \returns the number of rows
+ */
+ inline Index rows() const
+ {
+// return blockRows();
+ return (IsColMajor ? innerSize() : outerSize());
+ }
+
+ /**
+ * \returns the number of cols
+ */
+ inline Index cols() const
+ {
+// return blockCols();
+ return (IsColMajor ? outerSize() : innerSize());
+ }
+
+ inline Index innerSize() const
+ {
+ if(m_blockSize == Dynamic) return m_innerOffset[m_innerBSize];
+ else return (m_innerBSize * m_blockSize) ;
+ }
+
+ inline Index outerSize() const
+ {
+ if(m_blockSize == Dynamic) return m_outerOffset[m_outerBSize];
+ else return (m_outerBSize * m_blockSize) ;
+ }
+ /** \returns the number of rows grouped by blocks */
+ inline Index blockRows() const
+ {
+ return (IsColMajor ? m_innerBSize : m_outerBSize);
+ }
+ /** \returns the number of columns grouped by blocks */
+ inline Index blockCols() const
+ {
+ return (IsColMajor ? m_outerBSize : m_innerBSize);
+ }
+
+ inline Index outerBlocks() const { return m_outerBSize; }
+ inline Index innerBlocks() const { return m_innerBSize; }
+
+ /** \returns the block index where outer belongs to */
+ inline Index outerToBlock(Index outer) const
+ {
+ eigen_assert(outer < outerSize() && "OUTER INDEX OUT OF BOUNDS");
+
+ if(m_blockSize != Dynamic)
+ return (outer / m_blockSize); // Integer division
+
+ StorageIndex b_outer = 0;
+ while(m_outerOffset[b_outer] <= outer) ++b_outer;
+ return b_outer - 1;
+ }
+ /** \returns the block index where inner belongs to */
+ inline Index innerToBlock(Index inner) const
+ {
+ eigen_assert(inner < innerSize() && "OUTER INDEX OUT OF BOUNDS");
+
+ if(m_blockSize != Dynamic)
+ return (inner / m_blockSize); // Integer division
+
+ StorageIndex b_inner = 0;
+ while(m_innerOffset[b_inner] <= inner) ++b_inner;
+ return b_inner - 1;
+ }
+
+ /**
+ *\returns a reference to the (i,j) block as an Eigen Dense Matrix
+ */
+ Ref<BlockScalar> coeffRef(Index brow, Index bcol)
+ {
+ eigen_assert(brow < blockRows() && "BLOCK ROW INDEX OUT OF BOUNDS");
+ eigen_assert(bcol < blockCols() && "BLOCK nzblocksFlagCOLUMN OUT OF BOUNDS");
+
+ StorageIndex rsize = IsColMajor ? blockInnerSize(brow): blockOuterSize(bcol);
+ StorageIndex csize = IsColMajor ? blockOuterSize(bcol) : blockInnerSize(brow);
+ StorageIndex inner = IsColMajor ? brow : bcol;
+ StorageIndex outer = IsColMajor ? bcol : brow;
+ StorageIndex offset = m_outerIndex[outer];
+ while(offset < m_outerIndex[outer+1] && m_indices[offset] != inner)
+ offset++;
+ if(m_indices[offset] == inner)
+ {
+ return Map<BlockScalar>(&(m_values[blockPtr(offset)]), rsize, csize);
+ }
+ else
+ {
+ //FIXME the block does not exist, Insert it !!!!!!!!!
+ eigen_assert("DYNAMIC INSERTION IS NOT YET SUPPORTED");
+ }
+ }
+
+ /**
+ * \returns the value of the (i,j) block as an Eigen Dense Matrix
+ */
+ Map<const BlockScalar> coeff(Index brow, Index bcol) const
+ {
+ eigen_assert(brow < blockRows() && "BLOCK ROW INDEX OUT OF BOUNDS");
+ eigen_assert(bcol < blockCols() && "BLOCK COLUMN OUT OF BOUNDS");
+
+ StorageIndex rsize = IsColMajor ? blockInnerSize(brow): blockOuterSize(bcol);
+ StorageIndex csize = IsColMajor ? blockOuterSize(bcol) : blockInnerSize(brow);
+ StorageIndex inner = IsColMajor ? brow : bcol;
+ StorageIndex outer = IsColMajor ? bcol : brow;
+ StorageIndex offset = m_outerIndex[outer];
+ while(offset < m_outerIndex[outer+1] && m_indices[offset] != inner) offset++;
+ if(m_indices[offset] == inner)
+ {
+ return Map<const BlockScalar> (&(m_values[blockPtr(offset)]), rsize, csize);
+ }
+ else
+// return BlockScalar::Zero(rsize, csize);
+ eigen_assert("NOT YET SUPPORTED");
+ }
+
+ // Block Matrix times vector product
+ template<typename VecType>
+ BlockSparseTimeDenseProduct<BlockSparseMatrix, VecType> operator*(const VecType& lhs) const
+ {
+ return BlockSparseTimeDenseProduct<BlockSparseMatrix, VecType>(*this, lhs);
+ }
+
+ /** \returns the number of nonzero blocks */
+ inline Index nonZerosBlocks() const { return m_nonzerosblocks; }
+ /** \returns the total number of nonzero elements, including eventual explicit zeros in blocks */
+ inline Index nonZeros() const { return m_nonzeros; }
+
+ inline BlockScalarReturnType *valuePtr() {return static_cast<BlockScalarReturnType *>(m_values);}
+// inline Scalar *valuePtr(){ return m_values; }
+ inline StorageIndex *innerIndexPtr() {return m_indices; }
+ inline const StorageIndex *innerIndexPtr() const {return m_indices; }
+ inline StorageIndex *outerIndexPtr() {return m_outerIndex; }
+ inline const StorageIndex* outerIndexPtr() const {return m_outerIndex; }
+
+ /** \brief for compatibility purposes with the SparseMatrix class */
+ inline bool isCompressed() const {return true;}
+ /**
+ * \returns the starting index of the bi row block
+ */
+ inline Index blockRowsIndex(Index bi) const
+ {
+ return IsColMajor ? blockInnerIndex(bi) : blockOuterIndex(bi);
+ }
+
+ /**
+ * \returns the starting index of the bj col block
+ */
+ inline Index blockColsIndex(Index bj) const
+ {
+ return IsColMajor ? blockOuterIndex(bj) : blockInnerIndex(bj);
+ }
+
+ inline Index blockOuterIndex(Index bj) const
+ {
+ return (m_blockSize == Dynamic) ? m_outerOffset[bj] : (bj * m_blockSize);
+ }
+ inline Index blockInnerIndex(Index bi) const
+ {
+ return (m_blockSize == Dynamic) ? m_innerOffset[bi] : (bi * m_blockSize);
+ }
+
+ // Not needed ???
+ inline Index blockInnerSize(Index bi) const
+ {
+ return (m_blockSize == Dynamic) ? (m_innerOffset[bi+1] - m_innerOffset[bi]) : m_blockSize;
+ }
+ inline Index blockOuterSize(Index bj) const
+ {
+ return (m_blockSize == Dynamic) ? (m_outerOffset[bj+1]- m_outerOffset[bj]) : m_blockSize;
+ }
+
+ /**
+ * \brief Browse the matrix by outer index
+ */
+ class InnerIterator; // Browse column by column
+
+ /**
+ * \brief Browse the matrix by block outer index
+ */
+ class BlockInnerIterator; // Browse block by block
+
+ friend std::ostream & operator << (std::ostream & s, const BlockSparseMatrix& m)
+ {
+ for (StorageIndex j = 0; j < m.outerBlocks(); ++j)
+ {
+ BlockInnerIterator itb(m, j);
+ for(; itb; ++itb)
+ {
+ s << "("<<itb.row() << ", " << itb.col() << ")\n";
+ s << itb.value() <<"\n";
+ }
+ }
+ s << std::endl;
+ return s;
+ }
+
+ /**
+ * \returns the starting position of the block \p id in the array of values
+ */
+ Index blockPtr(Index id) const
+ {
+ if(m_blockSize == Dynamic) return m_blockPtr[id];
+ else return id * m_blockSize * m_blockSize;
+ //return blockDynIdx(id, typename internal::conditional<(BlockSize==Dynamic), internal::true_type, internal::false_type>::type());
+ }
+
+
+ protected:
+// inline Index blockDynIdx(Index id, internal::true_type) const
+// {
+// return m_blockPtr[id];
+// }
+// inline Index blockDynIdx(Index id, internal::false_type) const
+// {
+// return id * BlockSize * BlockSize;
+// }
+
+ // To be implemented
+ // Insert a block at a particular location... need to make a room for that
+ Map<BlockScalar> insert(Index brow, Index bcol);
+
+ Index m_innerBSize; // Number of block rows
+ Index m_outerBSize; // Number of block columns
+ StorageIndex *m_innerOffset; // Starting index of each inner block (size m_innerBSize+1)
+ StorageIndex *m_outerOffset; // Starting index of each outer block (size m_outerBSize+1)
+ Index m_nonzerosblocks; // Total nonzeros blocks (lower than m_innerBSize x m_outerBSize)
+ Index m_nonzeros; // Total nonzeros elements
+ Scalar *m_values; //Values stored block column after block column (size m_nonzeros)
+ StorageIndex *m_blockPtr; // Pointer to the beginning of each block in m_values, size m_nonzeroblocks ... null for fixed-size blocks
+ StorageIndex *m_indices; //Inner block indices, size m_nonzerosblocks ... OK
+ StorageIndex *m_outerIndex; // Starting pointer of each block column in m_indices (size m_outerBSize)... OK
+ Index m_blockSize; // Size of a block for fixed-size blocks, otherwise -1
+};
+
+template<typename _Scalar, int _BlockAtCompileTime, int _Options, typename _StorageIndex>
+class BlockSparseMatrix<_Scalar, _BlockAtCompileTime, _Options, _StorageIndex>::BlockInnerIterator
+{
+ public:
+
+ enum{
+ Flags = _Options
+ };
+
+ BlockInnerIterator(const BlockSparseMatrix& mat, const Index outer)
+ : m_mat(mat),m_outer(outer),
+ m_id(mat.m_outerIndex[outer]),
+ m_end(mat.m_outerIndex[outer+1])
+ {
+ }
+
+ inline BlockInnerIterator& operator++() {m_id++; return *this; }
+
+ inline const Map<const BlockScalar> value() const
+ {
+ return Map<const BlockScalar>(&(m_mat.m_values[m_mat.blockPtr(m_id)]),
+ rows(),cols());
+ }
+ inline Map<BlockScalar> valueRef()
+ {
+ return Map<BlockScalar>(&(m_mat.m_values[m_mat.blockPtr(m_id)]),
+ rows(),cols());
+ }
+ // Block inner index
+ inline Index index() const {return m_mat.m_indices[m_id]; }
+ inline Index outer() const { return m_outer; }
+ // block row index
+ inline Index row() const {return index(); }
+ // block column index
+ inline Index col() const {return outer(); }
+ // FIXME Number of rows in the current block
+ inline Index rows() const { return (m_mat.m_blockSize==Dynamic) ? (m_mat.m_innerOffset[index()+1] - m_mat.m_innerOffset[index()]) : m_mat.m_blockSize; }
+ // Number of columns in the current block ...
+ inline Index cols() const { return (m_mat.m_blockSize==Dynamic) ? (m_mat.m_outerOffset[m_outer+1]-m_mat.m_outerOffset[m_outer]) : m_mat.m_blockSize;}
+ inline operator bool() const { return (m_id < m_end); }
+
+ protected:
+ const BlockSparseMatrix<_Scalar, _BlockAtCompileTime, _Options, StorageIndex>& m_mat;
+ const Index m_outer;
+ Index m_id;
+ Index m_end;
+};
+
+template<typename _Scalar, int _BlockAtCompileTime, int _Options, typename _StorageIndex>
+class BlockSparseMatrix<_Scalar, _BlockAtCompileTime, _Options, _StorageIndex>::InnerIterator
+{
+ public:
+ InnerIterator(const BlockSparseMatrix& mat, Index outer)
+ : m_mat(mat),m_outerB(mat.outerToBlock(outer)),m_outer(outer),
+ itb(mat, mat.outerToBlock(outer)),
+ m_offset(outer - mat.blockOuterIndex(m_outerB))
+ {
+ if (itb)
+ {
+ m_id = m_mat.blockInnerIndex(itb.index());
+ m_start = m_id;
+ m_end = m_mat.blockInnerIndex(itb.index()+1);
+ }
+ }
+ inline InnerIterator& operator++()
+ {
+ m_id++;
+ if (m_id >= m_end)
+ {
+ ++itb;
+ if (itb)
+ {
+ m_id = m_mat.blockInnerIndex(itb.index());
+ m_start = m_id;
+ m_end = m_mat.blockInnerIndex(itb.index()+1);
+ }
+ }
+ return *this;
+ }
+ inline const Scalar& value() const
+ {
+ return itb.value().coeff(m_id - m_start, m_offset);
+ }
+ inline Scalar& valueRef()
+ {
+ return itb.valueRef().coeff(m_id - m_start, m_offset);
+ }
+ inline Index index() const { return m_id; }
+ inline Index outer() const {return m_outer; }
+ inline Index col() const {return outer(); }
+ inline Index row() const { return index();}
+ inline operator bool() const
+ {
+ return itb;
+ }
+ protected:
+ const BlockSparseMatrix& m_mat;
+ const Index m_outer;
+ const Index m_outerB;
+ BlockInnerIterator itb; // Iterator through the blocks
+ const Index m_offset; // Position of this column in the block
+ Index m_start; // starting inner index of this block
+ Index m_id; // current inner index in the block
+ Index m_end; // starting inner index of the next block
+
+};
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSEBLOCKMATRIX_H
diff --git a/src/EigenUnsupported/src/SparseExtra/DynamicSparseMatrix.h b/src/EigenUnsupported/src/SparseExtra/DynamicSparseMatrix.h
new file mode 100644
index 0000000..42c99e4
--- /dev/null
+++ b/src/EigenUnsupported/src/SparseExtra/DynamicSparseMatrix.h
@@ -0,0 +1,404 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_DYNAMIC_SPARSEMATRIX_H
+#define EIGEN_DYNAMIC_SPARSEMATRIX_H
+
+namespace Eigen {
+
+/** \deprecated use a SparseMatrix in an uncompressed mode
+ *
+ * \class DynamicSparseMatrix
+ *
+ * \brief A sparse matrix class designed for matrix assembly purpose
+ *
+ * \param _Scalar the scalar type, i.e. the type of the coefficients
+ *
+ * Unlike SparseMatrix, this class provides a much higher degree of flexibility. In particular, it allows
+ * random read/write accesses in log(rho*outer_size) where \c rho is the probability that a coefficient is
+ * nonzero and outer_size is the number of columns if the matrix is column-major and the number of rows
+ * otherwise.
+ *
+ * Internally, the data are stored as a std::vector of compressed vector. The performances of random writes might
+ * decrease as the number of nonzeros per inner-vector increase. In practice, we observed very good performance
+ * till about 100 nonzeros/vector, and the performance remains relatively good till 500 nonzeros/vectors.
+ *
+ * \see SparseMatrix
+ */
+
+namespace internal {
+template<typename _Scalar, int _Options, typename _StorageIndex>
+struct traits<DynamicSparseMatrix<_Scalar, _Options, _StorageIndex> >
+{
+ typedef _Scalar Scalar;
+ typedef _StorageIndex StorageIndex;
+ typedef Sparse StorageKind;
+ typedef MatrixXpr XprKind;
+ enum {
+ RowsAtCompileTime = Dynamic,
+ ColsAtCompileTime = Dynamic,
+ MaxRowsAtCompileTime = Dynamic,
+ MaxColsAtCompileTime = Dynamic,
+ Flags = _Options | NestByRefBit | LvalueBit,
+ CoeffReadCost = NumTraits<Scalar>::ReadCost,
+ SupportedAccessPatterns = OuterRandomAccessPattern
+ };
+};
+}
+
+template<typename _Scalar, int _Options, typename _StorageIndex>
+ class DynamicSparseMatrix
+ : public SparseMatrixBase<DynamicSparseMatrix<_Scalar, _Options, _StorageIndex> >
+{
+ typedef SparseMatrixBase<DynamicSparseMatrix> Base;
+ using Base::convert_index;
+ public:
+ EIGEN_SPARSE_PUBLIC_INTERFACE(DynamicSparseMatrix)
+ // FIXME: why are these operator already alvailable ???
+ // EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(DynamicSparseMatrix, +=)
+ // EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(DynamicSparseMatrix, -=)
+ typedef MappedSparseMatrix<Scalar,Flags> Map;
+ using Base::IsRowMajor;
+ using Base::operator=;
+ enum {
+ Options = _Options
+ };
+
+ protected:
+
+ typedef DynamicSparseMatrix<Scalar,(Flags&~RowMajorBit)|(IsRowMajor?RowMajorBit:0), StorageIndex> TransposedSparseMatrix;
+
+ Index m_innerSize;
+ std::vector<internal::CompressedStorage<Scalar,StorageIndex> > m_data;
+
+ public:
+
+ inline Index rows() const { return IsRowMajor ? outerSize() : m_innerSize; }
+ inline Index cols() const { return IsRowMajor ? m_innerSize : outerSize(); }
+ inline Index innerSize() const { return m_innerSize; }
+ inline Index outerSize() const { return convert_index(m_data.size()); }
+ inline Index innerNonZeros(Index j) const { return m_data[j].size(); }
+
+ std::vector<internal::CompressedStorage<Scalar,StorageIndex> >& _data() { return m_data; }
+ const std::vector<internal::CompressedStorage<Scalar,StorageIndex> >& _data() const { return m_data; }
+
+ /** \returns the coefficient value at given position \a row, \a col
+ * This operation involes a log(rho*outer_size) binary search.
+ */
+ inline Scalar coeff(Index row, Index col) const
+ {
+ const Index outer = IsRowMajor ? row : col;
+ const Index inner = IsRowMajor ? col : row;
+ return m_data[outer].at(inner);
+ }
+
+ /** \returns a reference to the coefficient value at given position \a row, \a col
+ * This operation involes a log(rho*outer_size) binary search. If the coefficient does not
+ * exist yet, then a sorted insertion into a sequential buffer is performed.
+ */
+ inline Scalar& coeffRef(Index row, Index col)
+ {
+ const Index outer = IsRowMajor ? row : col;
+ const Index inner = IsRowMajor ? col : row;
+ return m_data[outer].atWithInsertion(inner);
+ }
+
+ class InnerIterator;
+ class ReverseInnerIterator;
+
+ void setZero()
+ {
+ for (Index j=0; j<outerSize(); ++j)
+ m_data[j].clear();
+ }
+
+ /** \returns the number of non zero coefficients */
+ Index nonZeros() const
+ {
+ Index res = 0;
+ for (Index j=0; j<outerSize(); ++j)
+ res += m_data[j].size();
+ return res;
+ }
+
+
+
+ void reserve(Index reserveSize = 1000)
+ {
+ if (outerSize()>0)
+ {
+ Index reserveSizePerVector = (std::max)(reserveSize/outerSize(),Index(4));
+ for (Index j=0; j<outerSize(); ++j)
+ {
+ m_data[j].reserve(reserveSizePerVector);
+ }
+ }
+ }
+
+ /** Does nothing: provided for compatibility with SparseMatrix */
+ inline void startVec(Index /*outer*/) {}
+
+ /** \returns a reference to the non zero coefficient at position \a row, \a col assuming that:
+ * - the nonzero does not already exist
+ * - the new coefficient is the last one of the given inner vector.
+ *
+ * \sa insert, insertBackByOuterInner */
+ inline Scalar& insertBack(Index row, Index col)
+ {
+ return insertBackByOuterInner(IsRowMajor?row:col, IsRowMajor?col:row);
+ }
+
+ /** \sa insertBack */
+ inline Scalar& insertBackByOuterInner(Index outer, Index inner)
+ {
+ eigen_assert(outer<Index(m_data.size()) && inner<m_innerSize && "out of range");
+ eigen_assert(((m_data[outer].size()==0) || (m_data[outer].index(m_data[outer].size()-1)<inner))
+ && "wrong sorted insertion");
+ m_data[outer].append(0, inner);
+ return m_data[outer].value(m_data[outer].size()-1);
+ }
+
+ inline Scalar& insert(Index row, Index col)
+ {
+ const Index outer = IsRowMajor ? row : col;
+ const Index inner = IsRowMajor ? col : row;
+
+ Index startId = 0;
+ Index id = static_cast<Index>(m_data[outer].size()) - 1;
+ m_data[outer].resize(id+2,1);
+
+ while ( (id >= startId) && (m_data[outer].index(id) > inner) )
+ {
+ m_data[outer].index(id+1) = m_data[outer].index(id);
+ m_data[outer].value(id+1) = m_data[outer].value(id);
+ --id;
+ }
+ m_data[outer].index(id+1) = inner;
+ m_data[outer].value(id+1) = 0;
+ return m_data[outer].value(id+1);
+ }
+
+ /** Does nothing: provided for compatibility with SparseMatrix */
+ inline void finalize() {}
+
+ /** Suppress all nonzeros which are smaller than \a reference under the tolerance \a epsilon */
+ void prune(Scalar reference, RealScalar epsilon = NumTraits<RealScalar>::dummy_precision())
+ {
+ for (Index j=0; j<outerSize(); ++j)
+ m_data[j].prune(reference,epsilon);
+ }
+
+ /** Resize the matrix without preserving the data (the matrix is set to zero)
+ */
+ void resize(Index rows, Index cols)
+ {
+ const Index outerSize = IsRowMajor ? rows : cols;
+ m_innerSize = convert_index(IsRowMajor ? cols : rows);
+ setZero();
+ if (Index(m_data.size()) != outerSize)
+ {
+ m_data.resize(outerSize);
+ }
+ }
+
+ void resizeAndKeepData(Index rows, Index cols)
+ {
+ const Index outerSize = IsRowMajor ? rows : cols;
+ const Index innerSize = IsRowMajor ? cols : rows;
+ if (m_innerSize>innerSize)
+ {
+ // remove all coefficients with innerCoord>=innerSize
+ // TODO
+ //std::cerr << "not implemented yet\n";
+ exit(2);
+ }
+ if (m_data.size() != outerSize)
+ {
+ m_data.resize(outerSize);
+ }
+ }
+
+ /** The class DynamicSparseMatrix is deprecated */
+ EIGEN_DEPRECATED inline DynamicSparseMatrix()
+ : m_innerSize(0), m_data(0)
+ {
+ #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+ EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+ #endif
+ eigen_assert(innerSize()==0 && outerSize()==0);
+ }
+
+ /** The class DynamicSparseMatrix is deprecated */
+ EIGEN_DEPRECATED inline DynamicSparseMatrix(Index rows, Index cols)
+ : m_innerSize(0)
+ {
+ #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+ EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+ #endif
+ resize(rows, cols);
+ }
+
+ /** The class DynamicSparseMatrix is deprecated */
+ template<typename OtherDerived>
+ EIGEN_DEPRECATED explicit inline DynamicSparseMatrix(const SparseMatrixBase<OtherDerived>& other)
+ : m_innerSize(0)
+ {
+ #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+ EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+ #endif
+ Base::operator=(other.derived());
+ }
+
+ inline DynamicSparseMatrix(const DynamicSparseMatrix& other)
+ : Base(), m_innerSize(0)
+ {
+ #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+ EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+ #endif
+ *this = other.derived();
+ }
+
+ inline void swap(DynamicSparseMatrix& other)
+ {
+ //EIGEN_DBG_SPARSE(std::cout << "SparseMatrix:: swap\n");
+ std::swap(m_innerSize, other.m_innerSize);
+ //std::swap(m_outerSize, other.m_outerSize);
+ m_data.swap(other.m_data);
+ }
+
+ inline DynamicSparseMatrix& operator=(const DynamicSparseMatrix& other)
+ {
+ if (other.isRValue())
+ {
+ swap(other.const_cast_derived());
+ }
+ else
+ {
+ resize(other.rows(), other.cols());
+ m_data = other.m_data;
+ }
+ return *this;
+ }
+
+ /** Destructor */
+ inline ~DynamicSparseMatrix() {}
+
+ public:
+
+ /** \deprecated
+ * Set the matrix to zero and reserve the memory for \a reserveSize nonzero coefficients. */
+ EIGEN_DEPRECATED void startFill(Index reserveSize = 1000)
+ {
+ setZero();
+ reserve(reserveSize);
+ }
+
+ /** \deprecated use insert()
+ * inserts a nonzero coefficient at given coordinates \a row, \a col and returns its reference assuming that:
+ * 1 - the coefficient does not exist yet
+ * 2 - this the coefficient with greater inner coordinate for the given outer coordinate.
+ * In other words, assuming \c *this is column-major, then there must not exists any nonzero coefficient of coordinates
+ * \c i \c x \a col such that \c i >= \a row. Otherwise the matrix is invalid.
+ *
+ * \see fillrand(), coeffRef()
+ */
+ EIGEN_DEPRECATED Scalar& fill(Index row, Index col)
+ {
+ const Index outer = IsRowMajor ? row : col;
+ const Index inner = IsRowMajor ? col : row;
+ return insertBack(outer,inner);
+ }
+
+ /** \deprecated use insert()
+ * Like fill() but with random inner coordinates.
+ * Compared to the generic coeffRef(), the unique limitation is that we assume
+ * the coefficient does not exist yet.
+ */
+ EIGEN_DEPRECATED Scalar& fillrand(Index row, Index col)
+ {
+ return insert(row,col);
+ }
+
+ /** \deprecated use finalize()
+ * Does nothing. Provided for compatibility with SparseMatrix. */
+ EIGEN_DEPRECATED void endFill() {}
+
+# ifdef EIGEN_DYNAMICSPARSEMATRIX_PLUGIN
+# include EIGEN_DYNAMICSPARSEMATRIX_PLUGIN
+# endif
+ };
+
+template<typename Scalar, int _Options, typename _StorageIndex>
+class DynamicSparseMatrix<Scalar,_Options,_StorageIndex>::InnerIterator : public SparseVector<Scalar,_Options,_StorageIndex>::InnerIterator
+{
+ typedef typename SparseVector<Scalar,_Options,_StorageIndex>::InnerIterator Base;
+ public:
+ InnerIterator(const DynamicSparseMatrix& mat, Index outer)
+ : Base(mat.m_data[outer]), m_outer(outer)
+ {}
+
+ inline Index row() const { return IsRowMajor ? m_outer : Base::index(); }
+ inline Index col() const { return IsRowMajor ? Base::index() : m_outer; }
+ inline Index outer() const { return m_outer; }
+
+ protected:
+ const Index m_outer;
+};
+
+template<typename Scalar, int _Options, typename _StorageIndex>
+class DynamicSparseMatrix<Scalar,_Options,_StorageIndex>::ReverseInnerIterator : public SparseVector<Scalar,_Options,_StorageIndex>::ReverseInnerIterator
+{
+ typedef typename SparseVector<Scalar,_Options,_StorageIndex>::ReverseInnerIterator Base;
+ public:
+ ReverseInnerIterator(const DynamicSparseMatrix& mat, Index outer)
+ : Base(mat.m_data[outer]), m_outer(outer)
+ {}
+
+ inline Index row() const { return IsRowMajor ? m_outer : Base::index(); }
+ inline Index col() const { return IsRowMajor ? Base::index() : m_outer; }
+ inline Index outer() const { return m_outer; }
+
+ protected:
+ const Index m_outer;
+};
+
+namespace internal {
+
+template<typename _Scalar, int _Options, typename _StorageIndex>
+struct evaluator<DynamicSparseMatrix<_Scalar,_Options,_StorageIndex> >
+ : evaluator_base<DynamicSparseMatrix<_Scalar,_Options,_StorageIndex> >
+{
+ typedef _Scalar Scalar;
+ typedef DynamicSparseMatrix<_Scalar,_Options,_StorageIndex> SparseMatrixType;
+ typedef typename SparseMatrixType::InnerIterator InnerIterator;
+ typedef typename SparseMatrixType::ReverseInnerIterator ReverseInnerIterator;
+
+ enum {
+ CoeffReadCost = NumTraits<_Scalar>::ReadCost,
+ Flags = SparseMatrixType::Flags
+ };
+
+ evaluator() : m_matrix(0) {}
+ evaluator(const SparseMatrixType &mat) : m_matrix(&mat) {}
+
+ operator SparseMatrixType&() { return m_matrix->const_cast_derived(); }
+ operator const SparseMatrixType&() const { return *m_matrix; }
+
+ Scalar coeff(Index row, Index col) const { return m_matrix->coeff(row,col); }
+
+ Index nonZerosEstimate() const { return m_matrix->nonZeros(); }
+
+ const SparseMatrixType *m_matrix;
+};
+
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_DYNAMIC_SPARSEMATRIX_H
diff --git a/src/EigenUnsupported/src/SparseExtra/MarketIO.h b/src/EigenUnsupported/src/SparseExtra/MarketIO.h
new file mode 100644
index 0000000..dd786d5
--- /dev/null
+++ b/src/EigenUnsupported/src/SparseExtra/MarketIO.h
@@ -0,0 +1,282 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2012 Desire NUENTSA WAKAM <desire.nuentsa_wakam@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSE_MARKET_IO_H
+#define EIGEN_SPARSE_MARKET_IO_H
+
+#include <iostream>
+#include <vector>
+
+namespace Eigen {
+
+namespace internal
+{
+ template <typename Scalar, typename StorageIndex>
+ inline void GetMarketLine (const char* line, StorageIndex& i, StorageIndex& j, Scalar& value)
+ {
+ std::stringstream sline(line);
+ sline >> i >> j >> value;
+ }
+
+ template<> inline void GetMarketLine (const char* line, int& i, int& j, float& value)
+ { std::sscanf(line, "%d %d %g", &i, &j, &value); }
+
+ template<> inline void GetMarketLine (const char* line, int& i, int& j, double& value)
+ { std::sscanf(line, "%d %d %lg", &i, &j, &value); }
+
+ template<> inline void GetMarketLine (const char* line, int& i, int& j, std::complex<float>& value)
+ { std::sscanf(line, "%d %d %g %g", &i, &j, &numext::real_ref(value), &numext::imag_ref(value)); }
+
+ template<> inline void GetMarketLine (const char* line, int& i, int& j, std::complex<double>& value)
+ { std::sscanf(line, "%d %d %lg %lg", &i, &j, &numext::real_ref(value), &numext::imag_ref(value)); }
+
+ template <typename Scalar, typename StorageIndex>
+ inline void GetMarketLine (const char* line, StorageIndex& i, StorageIndex& j, std::complex<Scalar>& value)
+ {
+ std::stringstream sline(line);
+ Scalar valR, valI;
+ sline >> i >> j >> valR >> valI;
+ value = std::complex<Scalar>(valR,valI);
+ }
+
+ template <typename RealScalar>
+ inline void GetVectorElt (const std::string& line, RealScalar& val)
+ {
+ std::istringstream newline(line);
+ newline >> val;
+ }
+
+ template <typename RealScalar>
+ inline void GetVectorElt (const std::string& line, std::complex<RealScalar>& val)
+ {
+ RealScalar valR, valI;
+ std::istringstream newline(line);
+ newline >> valR >> valI;
+ val = std::complex<RealScalar>(valR, valI);
+ }
+
+ template<typename Scalar>
+ inline void putMarketHeader(std::string& header,int sym)
+ {
+ header= "%%MatrixMarket matrix coordinate ";
+ if(internal::is_same<Scalar, std::complex<float> >::value || internal::is_same<Scalar, std::complex<double> >::value)
+ {
+ header += " complex";
+ if(sym == Symmetric) header += " symmetric";
+ else if (sym == SelfAdjoint) header += " Hermitian";
+ else header += " general";
+ }
+ else
+ {
+ header += " real";
+ if(sym == Symmetric) header += " symmetric";
+ else header += " general";
+ }
+ }
+
+ template<typename Scalar, typename StorageIndex>
+ inline void PutMatrixElt(Scalar value, StorageIndex row, StorageIndex col, std::ofstream& out)
+ {
+ out << row << " "<< col << " " << value << "\n";
+ }
+ template<typename Scalar, typename StorageIndex>
+ inline void PutMatrixElt(std::complex<Scalar> value, StorageIndex row, StorageIndex col, std::ofstream& out)
+ {
+ out << row << " " << col << " " << value.real() << " " << value.imag() << "\n";
+ }
+
+
+ template<typename Scalar>
+ inline void putVectorElt(Scalar value, std::ofstream& out)
+ {
+ out << value << "\n";
+ }
+ template<typename Scalar>
+ inline void putVectorElt(std::complex<Scalar> value, std::ofstream& out)
+ {
+ out << value.real() << " " << value.imag()<< "\n";
+ }
+
+} // end namespace internal
+
+inline bool getMarketHeader(const std::string& filename, int& sym, bool& iscomplex, bool& isvector)
+{
+ sym = 0;
+ iscomplex = false;
+ isvector = false;
+ std::ifstream in(filename.c_str(),std::ios::in);
+ if(!in)
+ return false;
+
+ std::string line;
+ // The matrix header is always the first line in the file
+ std::getline(in, line); eigen_assert(in.good());
+
+ std::stringstream fmtline(line);
+ std::string substr[5];
+ fmtline>> substr[0] >> substr[1] >> substr[2] >> substr[3] >> substr[4];
+ if(substr[2].compare("array") == 0) isvector = true;
+ if(substr[3].compare("complex") == 0) iscomplex = true;
+ if(substr[4].compare("symmetric") == 0) sym = Symmetric;
+ else if (substr[4].compare("Hermitian") == 0) sym = SelfAdjoint;
+
+ return true;
+}
+
+template<typename SparseMatrixType>
+bool loadMarket(SparseMatrixType& mat, const std::string& filename)
+{
+ typedef typename SparseMatrixType::Scalar Scalar;
+ typedef typename SparseMatrixType::StorageIndex StorageIndex;
+ std::ifstream input(filename.c_str(),std::ios::in);
+ if(!input)
+ return false;
+
+ char rdbuffer[4096];
+ input.rdbuf()->pubsetbuf(rdbuffer, 4096);
+
+ const int maxBuffersize = 2048;
+ char buffer[maxBuffersize];
+
+ bool readsizes = false;
+
+ typedef Triplet<Scalar,StorageIndex> T;
+ std::vector<T> elements;
+
+ Index M(-1), N(-1), NNZ(-1);
+ Index count = 0;
+ while(input.getline(buffer, maxBuffersize))
+ {
+ // skip comments
+ //NOTE An appropriate test should be done on the header to get the symmetry
+ if(buffer[0]=='%')
+ continue;
+
+ if(!readsizes)
+ {
+ std::stringstream line(buffer);
+ line >> M >> N >> NNZ;
+ if(M > 0 && N > 0)
+ {
+ readsizes = true;
+ mat.resize(M,N);
+ mat.reserve(NNZ);
+ }
+ }
+ else
+ {
+ StorageIndex i(-1), j(-1);
+ Scalar value;
+ internal::GetMarketLine(buffer, i, j, value);
+
+ i--;
+ j--;
+ if(i>=0 && j>=0 && i<M && j<N)
+ {
+ ++count;
+ elements.push_back(T(i,j,value));
+ }
+ else
+ std::cerr << "Invalid read: " << i << "," << j << "\n";
+ }
+ }
+
+ mat.setFromTriplets(elements.begin(), elements.end());
+ if(count!=NNZ)
+ std::cerr << count << "!=" << NNZ << "\n";
+
+ input.close();
+ return true;
+}
+
+template<typename VectorType>
+bool loadMarketVector(VectorType& vec, const std::string& filename)
+{
+ typedef typename VectorType::Scalar Scalar;
+ std::ifstream in(filename.c_str(), std::ios::in);
+ if(!in)
+ return false;
+
+ std::string line;
+ int n(0), col(0);
+ do
+ { // Skip comments
+ std::getline(in, line); eigen_assert(in.good());
+ } while (line[0] == '%');
+ std::istringstream newline(line);
+ newline >> n >> col;
+ eigen_assert(n>0 && col>0);
+ vec.resize(n);
+ int i = 0;
+ Scalar value;
+ while ( std::getline(in, line) && (i < n) ){
+ internal::GetVectorElt(line, value);
+ vec(i++) = value;
+ }
+ in.close();
+ if (i!=n){
+ std::cerr<< "Unable to read all elements from file " << filename << "\n";
+ return false;
+ }
+ return true;
+}
+
+template<typename SparseMatrixType>
+bool saveMarket(const SparseMatrixType& mat, const std::string& filename, int sym = 0)
+{
+ typedef typename SparseMatrixType::Scalar Scalar;
+ typedef typename SparseMatrixType::RealScalar RealScalar;
+ std::ofstream out(filename.c_str(),std::ios::out);
+ if(!out)
+ return false;
+
+ out.flags(std::ios_base::scientific);
+ out.precision(std::numeric_limits<RealScalar>::digits10 + 2);
+ std::string header;
+ internal::putMarketHeader<Scalar>(header, sym);
+ out << header << std::endl;
+ out << mat.rows() << " " << mat.cols() << " " << mat.nonZeros() << "\n";
+ int count = 0;
+ for(int j=0; j<mat.outerSize(); ++j)
+ for(typename SparseMatrixType::InnerIterator it(mat,j); it; ++it)
+ {
+ ++ count;
+ internal::PutMatrixElt(it.value(), it.row()+1, it.col()+1, out);
+ }
+ out.close();
+ return true;
+}
+
+template<typename VectorType>
+bool saveMarketVector (const VectorType& vec, const std::string& filename)
+{
+ typedef typename VectorType::Scalar Scalar;
+ typedef typename VectorType::RealScalar RealScalar;
+ std::ofstream out(filename.c_str(),std::ios::out);
+ if(!out)
+ return false;
+
+ out.flags(std::ios_base::scientific);
+ out.precision(std::numeric_limits<RealScalar>::digits10 + 2);
+ if(internal::is_same<Scalar, std::complex<float> >::value || internal::is_same<Scalar, std::complex<double> >::value)
+ out << "%%MatrixMarket matrix array complex general\n";
+ else
+ out << "%%MatrixMarket matrix array real general\n";
+ out << vec.size() << " "<< 1 << "\n";
+ for (int i=0; i < vec.size(); i++){
+ internal::putVectorElt(vec(i), out);
+ }
+ out.close();
+ return true;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPARSE_MARKET_IO_H
diff --git a/src/EigenUnsupported/src/SparseExtra/MatrixMarketIterator.h b/src/EigenUnsupported/src/SparseExtra/MatrixMarketIterator.h
new file mode 100644
index 0000000..02916ea
--- /dev/null
+++ b/src/EigenUnsupported/src/SparseExtra/MatrixMarketIterator.h
@@ -0,0 +1,247 @@
+
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Desire NUENTSA WAKAM <desire.nuentsa_wakam@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BROWSE_MATRICES_H
+#define EIGEN_BROWSE_MATRICES_H
+
+namespace Eigen {
+
+enum {
+ SPD = 0x100,
+ NonSymmetric = 0x0
+};
+
+/**
+ * @brief Iterator to browse matrices from a specified folder
+ *
+ * This is used to load all the matrices from a folder.
+ * The matrices should be in Matrix Market format
+ * It is assumed that the matrices are named as matname.mtx
+ * and matname_SPD.mtx if the matrix is Symmetric and positive definite (or Hermitian)
+ * The right hand side vectors are loaded as well, if they exist.
+ * They should be named as matname_b.mtx.
+ * Note that the right hand side for a SPD matrix is named as matname_SPD_b.mtx
+ *
+ * Sometimes a reference solution is available. In this case, it should be named as matname_x.mtx
+ *
+ * Sample code
+ * \code
+ *
+ * \endcode
+ *
+ * \tparam Scalar The scalar type
+ */
+template <typename Scalar>
+class MatrixMarketIterator
+{
+ typedef typename NumTraits<Scalar>::Real RealScalar;
+ public:
+ typedef Matrix<Scalar,Dynamic,1> VectorType;
+ typedef SparseMatrix<Scalar,ColMajor> MatrixType;
+
+ public:
+ MatrixMarketIterator(const std::string &folder)
+ : m_sym(0), m_isvalid(false), m_matIsLoaded(false), m_hasRhs(false), m_hasrefX(false), m_folder(folder)
+ {
+ m_folder_id = opendir(folder.c_str());
+ if(m_folder_id)
+ Getnextvalidmatrix();
+ }
+
+ ~MatrixMarketIterator()
+ {
+ if (m_folder_id) closedir(m_folder_id);
+ }
+
+ inline MatrixMarketIterator& operator++()
+ {
+ m_matIsLoaded = false;
+ m_hasrefX = false;
+ m_hasRhs = false;
+ Getnextvalidmatrix();
+ return *this;
+ }
+ inline operator bool() const { return m_isvalid;}
+
+ /** Return the sparse matrix corresponding to the current file */
+ inline MatrixType& matrix()
+ {
+ // Read the matrix
+ if (m_matIsLoaded) return m_mat;
+
+ std::string matrix_file = m_folder + "/" + m_matname + ".mtx";
+ if ( !loadMarket(m_mat, matrix_file))
+ {
+ std::cerr << "Warning loadMarket failed when loading \"" << matrix_file << "\"" << std::endl;
+ m_matIsLoaded = false;
+ return m_mat;
+ }
+ m_matIsLoaded = true;
+
+ if (m_sym != NonSymmetric)
+ {
+ // Check whether we need to restore a full matrix:
+ RealScalar diag_norm = m_mat.diagonal().norm();
+ RealScalar lower_norm = m_mat.template triangularView<Lower>().norm();
+ RealScalar upper_norm = m_mat.template triangularView<Upper>().norm();
+ if(lower_norm>diag_norm && upper_norm==diag_norm)
+ {
+ // only the lower part is stored
+ MatrixType tmp(m_mat);
+ m_mat = tmp.template selfadjointView<Lower>();
+ }
+ else if(upper_norm>diag_norm && lower_norm==diag_norm)
+ {
+ // only the upper part is stored
+ MatrixType tmp(m_mat);
+ m_mat = tmp.template selfadjointView<Upper>();
+ }
+ }
+ return m_mat;
+ }
+
+ /** Return the right hand side corresponding to the current matrix.
+ * If the rhs file is not provided, a random rhs is generated
+ */
+ inline VectorType& rhs()
+ {
+ // Get the right hand side
+ if (m_hasRhs) return m_rhs;
+
+ std::string rhs_file;
+ rhs_file = m_folder + "/" + m_matname + "_b.mtx"; // The pattern is matname_b.mtx
+ m_hasRhs = Fileexists(rhs_file);
+ if (m_hasRhs)
+ {
+ m_rhs.resize(m_mat.cols());
+ m_hasRhs = loadMarketVector(m_rhs, rhs_file);
+ }
+ if (!m_hasRhs)
+ {
+ // Generate a random right hand side
+ if (!m_matIsLoaded) this->matrix();
+ m_refX.resize(m_mat.cols());
+ m_refX.setRandom();
+ m_rhs = m_mat * m_refX;
+ m_hasrefX = true;
+ m_hasRhs = true;
+ }
+ return m_rhs;
+ }
+
+ /** Return a reference solution
+ * If it is not provided and if the right hand side is not available
+ * then refX is randomly generated such that A*refX = b
+ * where A and b are the matrix and the rhs.
+ * Note that when a rhs is provided, refX is not available
+ */
+ inline VectorType& refX()
+ {
+ // Check if a reference solution is provided
+ if (m_hasrefX) return m_refX;
+
+ std::string lhs_file;
+ lhs_file = m_folder + "/" + m_matname + "_x.mtx";
+ m_hasrefX = Fileexists(lhs_file);
+ if (m_hasrefX)
+ {
+ m_refX.resize(m_mat.cols());
+ m_hasrefX = loadMarketVector(m_refX, lhs_file);
+ }
+ else
+ m_refX.resize(0);
+ return m_refX;
+ }
+
+ inline std::string& matname() { return m_matname; }
+
+ inline int sym() { return m_sym; }
+
+ bool hasRhs() {return m_hasRhs; }
+ bool hasrefX() {return m_hasrefX; }
+ bool isFolderValid() { return bool(m_folder_id); }
+
+ protected:
+
+ inline bool Fileexists(std::string file)
+ {
+ std::ifstream file_id(file.c_str());
+ if (!file_id.good() )
+ {
+ return false;
+ }
+ else
+ {
+ file_id.close();
+ return true;
+ }
+ }
+
+ void Getnextvalidmatrix( )
+ {
+ m_isvalid = false;
+ // Here, we return with the next valid matrix in the folder
+ while ( (m_curs_id = readdir(m_folder_id)) != NULL) {
+ m_isvalid = false;
+ std::string curfile;
+ curfile = m_folder + "/" + m_curs_id->d_name;
+ // Discard if it is a folder
+ if (m_curs_id->d_type == DT_DIR) continue; //FIXME This may not be available on non BSD systems
+// struct stat st_buf;
+// stat (curfile.c_str(), &st_buf);
+// if (S_ISDIR(st_buf.st_mode)) continue;
+
+ // Determine from the header if it is a matrix or a right hand side
+ bool isvector,iscomplex=false;
+ if(!getMarketHeader(curfile,m_sym,iscomplex,isvector)) continue;
+ if(isvector) continue;
+ if (!iscomplex)
+ {
+ if(internal::is_same<Scalar, std::complex<float> >::value || internal::is_same<Scalar, std::complex<double> >::value)
+ continue;
+ }
+ if (iscomplex)
+ {
+ if(internal::is_same<Scalar, float>::value || internal::is_same<Scalar, double>::value)
+ continue;
+ }
+
+
+ // Get the matrix name
+ std::string filename = m_curs_id->d_name;
+ m_matname = filename.substr(0, filename.length()-4);
+
+ // Find if the matrix is SPD
+ size_t found = m_matname.find("SPD");
+ if( (found!=std::string::npos) && (m_sym != NonSymmetric) )
+ m_sym = SPD;
+
+ m_isvalid = true;
+ break;
+ }
+ }
+ int m_sym; // Symmetry of the matrix
+ MatrixType m_mat; // Current matrix
+ VectorType m_rhs; // Current vector
+ VectorType m_refX; // The reference solution, if exists
+ std::string m_matname; // Matrix Name
+ bool m_isvalid;
+ bool m_matIsLoaded; // Determine if the matrix has already been loaded from the file
+ bool m_hasRhs; // The right hand side exists
+ bool m_hasrefX; // A reference solution is provided
+ std::string m_folder;
+ DIR * m_folder_id;
+ struct dirent *m_curs_id;
+
+};
+
+} // end namespace Eigen
+
+#endif
diff --git a/src/EigenUnsupported/src/SparseExtra/RandomSetter.h b/src/EigenUnsupported/src/SparseExtra/RandomSetter.h
new file mode 100644
index 0000000..985702b
--- /dev/null
+++ b/src/EigenUnsupported/src/SparseExtra/RandomSetter.h
@@ -0,0 +1,349 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_RANDOMSETTER_H
+#define EIGEN_RANDOMSETTER_H
+
+#if defined(EIGEN_GOOGLEHASH_SUPPORT)
+// Ensure the ::google namespace exists, required for checking existence of
+// ::google::dense_hash_map and ::google::sparse_hash_map.
+namespace google {}
+#endif
+
+namespace Eigen {
+
+/** Represents a std::map
+ *
+ * \see RandomSetter
+ */
+template<typename Scalar> struct StdMapTraits
+{
+ typedef int KeyType;
+ typedef std::map<KeyType,Scalar> Type;
+ enum {
+ IsSorted = 1
+ };
+
+ static void setInvalidKey(Type&, const KeyType&) {}
+};
+
+#ifdef EIGEN_UNORDERED_MAP_SUPPORT
+/** Represents a std::unordered_map
+ *
+ * To use it you need to both define EIGEN_UNORDERED_MAP_SUPPORT and include the unordered_map header file
+ * yourself making sure that unordered_map is defined in the std namespace.
+ *
+ * For instance, with current version of gcc you can either enable C++0x standard (-std=c++0x) or do:
+ * \code
+ * #include <tr1/unordered_map>
+ * #define EIGEN_UNORDERED_MAP_SUPPORT
+ * namespace std {
+ * using std::tr1::unordered_map;
+ * }
+ * \endcode
+ *
+ * \see RandomSetter
+ */
+template<typename Scalar> struct StdUnorderedMapTraits
+{
+ typedef int KeyType;
+ typedef std::unordered_map<KeyType,Scalar> Type;
+ enum {
+ IsSorted = 0
+ };
+
+ static void setInvalidKey(Type&, const KeyType&) {}
+};
+#endif // EIGEN_UNORDERED_MAP_SUPPORT
+
+#if defined(EIGEN_GOOGLEHASH_SUPPORT)
+
+namespace google {
+
+// Namespace work-around, since sometimes dense_hash_map and sparse_hash_map
+// are in the global namespace, and other times they are under ::google.
+using namespace ::google;
+
+template<typename KeyType, typename Scalar>
+struct DenseHashMap {
+ typedef dense_hash_map<KeyType, Scalar> type;
+};
+
+template<typename KeyType, typename Scalar>
+struct SparseHashMap {
+ typedef sparse_hash_map<KeyType, Scalar> type;
+};
+
+} // namespace google
+
+/** Represents a google::dense_hash_map
+ *
+ * \see RandomSetter
+ */
+template<typename Scalar> struct GoogleDenseHashMapTraits
+{
+ typedef int KeyType;
+ typedef typename google::DenseHashMap<KeyType,Scalar>::type Type;
+ enum {
+ IsSorted = 0
+ };
+
+ static void setInvalidKey(Type& map, const KeyType& k)
+ { map.set_empty_key(k); }
+};
+
+/** Represents a google::sparse_hash_map
+ *
+ * \see RandomSetter
+ */
+template<typename Scalar> struct GoogleSparseHashMapTraits
+{
+ typedef int KeyType;
+ typedef typename google::SparseHashMap<KeyType,Scalar>::type Type;
+ enum {
+ IsSorted = 0
+ };
+
+ static void setInvalidKey(Type&, const KeyType&) {}
+};
+#endif
+
+/** \class RandomSetter
+ *
+ * \brief The RandomSetter is a wrapper object allowing to set/update a sparse matrix with random access
+ *
+ * \tparam SparseMatrixType the type of the sparse matrix we are updating
+ * \tparam MapTraits a traits class representing the map implementation used for the temporary sparse storage.
+ * Its default value depends on the system.
+ * \tparam OuterPacketBits defines the number of rows (or columns) manage by a single map object
+ * as a power of two exponent.
+ *
+ * This class temporarily represents a sparse matrix object using a generic map implementation allowing for
+ * efficient random access. The conversion from the compressed representation to a hash_map object is performed
+ * in the RandomSetter constructor, while the sparse matrix is updated back at destruction time. This strategy
+ * suggest the use of nested blocks as in this example:
+ *
+ * \code
+ * SparseMatrix<double> m(rows,cols);
+ * {
+ * RandomSetter<SparseMatrix<double> > w(m);
+ * // don't use m but w instead with read/write random access to the coefficients:
+ * for(;;)
+ * w(rand(),rand()) = rand;
+ * }
+ * // when w is deleted, the data are copied back to m
+ * // and m is ready to use.
+ * \endcode
+ *
+ * Since hash_map objects are not fully sorted, representing a full matrix as a single hash_map would
+ * involve a big and costly sort to update the compressed matrix back. To overcome this issue, a RandomSetter
+ * use multiple hash_map, each representing 2^OuterPacketBits columns or rows according to the storage order.
+ * To reach optimal performance, this value should be adjusted according to the average number of nonzeros
+ * per rows/columns.
+ *
+ * The possible values for the template parameter MapTraits are:
+ * - \b StdMapTraits: corresponds to std::map. (does not perform very well)
+ * - \b GnuHashMapTraits: corresponds to __gnu_cxx::hash_map (available only with GCC)
+ * - \b GoogleDenseHashMapTraits: corresponds to google::dense_hash_map (best efficiency, reasonable memory consumption)
+ * - \b GoogleSparseHashMapTraits: corresponds to google::sparse_hash_map (best memory consumption, relatively good performance)
+ *
+ * The default map implementation depends on the availability, and the preferred order is:
+ * GoogleSparseHashMapTraits, GnuHashMapTraits, and finally StdMapTraits.
+ *
+ * For performance and memory consumption reasons it is highly recommended to use one of
+ * Google's hash_map implementations. To enable the support for them, you must define
+ * EIGEN_GOOGLEHASH_SUPPORT. This will include both <google/dense_hash_map> and
+ * <google/sparse_hash_map> for you.
+ *
+ * \see https://github.com/sparsehash/sparsehash
+ */
+template<typename SparseMatrixType,
+ template <typename T> class MapTraits =
+#if defined(EIGEN_GOOGLEHASH_SUPPORT)
+ GoogleDenseHashMapTraits
+#elif defined(_HASH_MAP)
+ GnuHashMapTraits
+#else
+ StdMapTraits
+#endif
+ ,int OuterPacketBits = 6>
+class RandomSetter
+{
+ typedef typename SparseMatrixType::Scalar Scalar;
+ typedef typename SparseMatrixType::StorageIndex StorageIndex;
+
+ struct ScalarWrapper
+ {
+ ScalarWrapper() : value(0) {}
+ Scalar value;
+ };
+ typedef typename MapTraits<ScalarWrapper>::KeyType KeyType;
+ typedef typename MapTraits<ScalarWrapper>::Type HashMapType;
+ static const int OuterPacketMask = (1 << OuterPacketBits) - 1;
+ enum {
+ SwapStorage = 1 - MapTraits<ScalarWrapper>::IsSorted,
+ TargetRowMajor = (SparseMatrixType::Flags & RowMajorBit) ? 1 : 0,
+ SetterRowMajor = SwapStorage ? 1-TargetRowMajor : TargetRowMajor
+ };
+
+ public:
+
+ /** Constructs a random setter object from the sparse matrix \a target
+ *
+ * Note that the initial value of \a target are imported. If you want to re-set
+ * a sparse matrix from scratch, then you must set it to zero first using the
+ * setZero() function.
+ */
+ inline RandomSetter(SparseMatrixType& target)
+ : mp_target(&target)
+ {
+ const Index outerSize = SwapStorage ? target.innerSize() : target.outerSize();
+ const Index innerSize = SwapStorage ? target.outerSize() : target.innerSize();
+ m_outerPackets = outerSize >> OuterPacketBits;
+ if (outerSize&OuterPacketMask)
+ m_outerPackets += 1;
+ m_hashmaps = new HashMapType[m_outerPackets];
+ // compute number of bits needed to store inner indices
+ Index aux = innerSize - 1;
+ m_keyBitsOffset = 0;
+ while (aux)
+ {
+ ++m_keyBitsOffset;
+ aux = aux >> 1;
+ }
+ KeyType ik = (1<<(OuterPacketBits+m_keyBitsOffset));
+ for (Index k=0; k<m_outerPackets; ++k)
+ MapTraits<ScalarWrapper>::setInvalidKey(m_hashmaps[k],ik);
+
+ // insert current coeffs
+ for (Index j=0; j<mp_target->outerSize(); ++j)
+ for (typename SparseMatrixType::InnerIterator it(*mp_target,j); it; ++it)
+ (*this)(TargetRowMajor?j:it.index(), TargetRowMajor?it.index():j) = it.value();
+ }
+
+ /** Destructor updating back the sparse matrix target */
+ ~RandomSetter()
+ {
+ KeyType keyBitsMask = (1<<m_keyBitsOffset)-1;
+ if (!SwapStorage) // also means the map is sorted
+ {
+ mp_target->setZero();
+ mp_target->makeCompressed();
+ mp_target->reserve(nonZeros());
+ Index prevOuter = -1;
+ for (Index k=0; k<m_outerPackets; ++k)
+ {
+ const Index outerOffset = (1<<OuterPacketBits) * k;
+ typename HashMapType::iterator end = m_hashmaps[k].end();
+ for (typename HashMapType::iterator it = m_hashmaps[k].begin(); it!=end; ++it)
+ {
+ const Index outer = (it->first >> m_keyBitsOffset) + outerOffset;
+ const Index inner = it->first & keyBitsMask;
+ if (prevOuter!=outer)
+ {
+ for (Index j=prevOuter+1;j<=outer;++j)
+ mp_target->startVec(j);
+ prevOuter = outer;
+ }
+ mp_target->insertBackByOuterInner(outer, inner) = it->second.value;
+ }
+ }
+ mp_target->finalize();
+ }
+ else
+ {
+ VectorXi positions(mp_target->outerSize());
+ positions.setZero();
+ // pass 1
+ for (Index k=0; k<m_outerPackets; ++k)
+ {
+ typename HashMapType::iterator end = m_hashmaps[k].end();
+ for (typename HashMapType::iterator it = m_hashmaps[k].begin(); it!=end; ++it)
+ {
+ const Index outer = it->first & keyBitsMask;
+ ++positions[outer];
+ }
+ }
+ // prefix sum
+ StorageIndex count = 0;
+ for (Index j=0; j<mp_target->outerSize(); ++j)
+ {
+ StorageIndex tmp = positions[j];
+ mp_target->outerIndexPtr()[j] = count;
+ positions[j] = count;
+ count += tmp;
+ }
+ mp_target->makeCompressed();
+ mp_target->outerIndexPtr()[mp_target->outerSize()] = count;
+ mp_target->resizeNonZeros(count);
+ // pass 2
+ for (Index k=0; k<m_outerPackets; ++k)
+ {
+ const Index outerOffset = (1<<OuterPacketBits) * k;
+ typename HashMapType::iterator end = m_hashmaps[k].end();
+ for (typename HashMapType::iterator it = m_hashmaps[k].begin(); it!=end; ++it)
+ {
+ const Index inner = (it->first >> m_keyBitsOffset) + outerOffset;
+ const Index outer = it->first & keyBitsMask;
+ // sorted insertion
+ // Note that we have to deal with at most 2^OuterPacketBits unsorted coefficients,
+ // moreover those 2^OuterPacketBits coeffs are likely to be sparse, an so only a
+ // small fraction of them have to be sorted, whence the following simple procedure:
+ Index posStart = mp_target->outerIndexPtr()[outer];
+ Index i = (positions[outer]++) - 1;
+ while ( (i >= posStart) && (mp_target->innerIndexPtr()[i] > inner) )
+ {
+ mp_target->valuePtr()[i+1] = mp_target->valuePtr()[i];
+ mp_target->innerIndexPtr()[i+1] = mp_target->innerIndexPtr()[i];
+ --i;
+ }
+ mp_target->innerIndexPtr()[i+1] = internal::convert_index<StorageIndex>(inner);
+ mp_target->valuePtr()[i+1] = it->second.value;
+ }
+ }
+ }
+ delete[] m_hashmaps;
+ }
+
+ /** \returns a reference to the coefficient at given coordinates \a row, \a col */
+ Scalar& operator() (Index row, Index col)
+ {
+ const Index outer = SetterRowMajor ? row : col;
+ const Index inner = SetterRowMajor ? col : row;
+ const Index outerMajor = outer >> OuterPacketBits; // index of the packet/map
+ const Index outerMinor = outer & OuterPacketMask; // index of the inner vector in the packet
+ const KeyType key = internal::convert_index<KeyType>((outerMinor<<m_keyBitsOffset) | inner);
+ return m_hashmaps[outerMajor][key].value;
+ }
+
+ /** \returns the number of non zero coefficients
+ *
+ * \note According to the underlying map/hash_map implementation,
+ * this function might be quite expensive.
+ */
+ Index nonZeros() const
+ {
+ Index nz = 0;
+ for (Index k=0; k<m_outerPackets; ++k)
+ nz += static_cast<Index>(m_hashmaps[k].size());
+ return nz;
+ }
+
+
+ protected:
+
+ HashMapType* m_hashmaps;
+ SparseMatrixType* mp_target;
+ Index m_outerPackets;
+ unsigned char m_keyBitsOffset;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_RANDOMSETTER_H
diff --git a/src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsArrayAPI.h b/src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsArrayAPI.h
new file mode 100644
index 0000000..41d2bf6
--- /dev/null
+++ b/src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsArrayAPI.h
@@ -0,0 +1,286 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#ifndef EIGEN_BESSELFUNCTIONS_ARRAYAPI_H
+#define EIGEN_BESSELFUNCTIONS_ARRAYAPI_H
+
+namespace Eigen {
+
+/** \returns an expression of the coefficient-wise i0(\a x) to the given
+ * arrays.
+ *
+ * It returns the modified Bessel function of the first kind of order zero.
+ *
+ * \param x is the argument
+ *
+ * \note This function supports only float and double scalar types. To support
+ * other scalar types, the user has to provide implementations of i0(T) for
+ * any scalar type T to be supported.
+ *
+ * \sa ArrayBase::bessel_i0()
+ */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+ Eigen::internal::scalar_bessel_i0_op<typename Derived::Scalar>, const Derived>
+bessel_i0(const Eigen::ArrayBase<Derived>& x) {
+ return Eigen::CwiseUnaryOp<
+ Eigen::internal::scalar_bessel_i0_op<typename Derived::Scalar>,
+ const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise i0e(\a x) to the given
+ * arrays.
+ *
+ * It returns the exponentially scaled modified Bessel
+ * function of the first kind of order zero.
+ *
+ * \param x is the argument
+ *
+ * \note This function supports only float and double scalar types. To support
+ * other scalar types, the user has to provide implementations of i0e(T) for
+ * any scalar type T to be supported.
+ *
+ * \sa ArrayBase::bessel_i0e()
+ */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+ Eigen::internal::scalar_bessel_i0e_op<typename Derived::Scalar>, const Derived>
+bessel_i0e(const Eigen::ArrayBase<Derived>& x) {
+ return Eigen::CwiseUnaryOp<
+ Eigen::internal::scalar_bessel_i0e_op<typename Derived::Scalar>,
+ const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise i1(\a x) to the given
+ * arrays.
+ *
+ * It returns the modified Bessel function of the first kind of order one.
+ *
+ * \param x is the argument
+ *
+ * \note This function supports only float and double scalar types. To support
+ * other scalar types, the user has to provide implementations of i1(T) for
+ * any scalar type T to be supported.
+ *
+ * \sa ArrayBase::bessel_i1()
+ */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+ Eigen::internal::scalar_bessel_i1_op<typename Derived::Scalar>, const Derived>
+bessel_i1(const Eigen::ArrayBase<Derived>& x) {
+ return Eigen::CwiseUnaryOp<
+ Eigen::internal::scalar_bessel_i1_op<typename Derived::Scalar>,
+ const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise i1e(\a x) to the given
+ * arrays.
+ *
+ * It returns the exponentially scaled modified Bessel
+ * function of the first kind of order one.
+ *
+ * \param x is the argument
+ *
+ * \note This function supports only float and double scalar types. To support
+ * other scalar types, the user has to provide implementations of i1e(T) for
+ * any scalar type T to be supported.
+ *
+ * \sa ArrayBase::bessel_i1e()
+ */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+ Eigen::internal::scalar_bessel_i1e_op<typename Derived::Scalar>, const Derived>
+bessel_i1e(const Eigen::ArrayBase<Derived>& x) {
+ return Eigen::CwiseUnaryOp<
+ Eigen::internal::scalar_bessel_i1e_op<typename Derived::Scalar>,
+ const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise k0(\a x) to the given
+ * arrays.
+ *
+ * It returns the modified Bessel function of the second kind of order zero.
+ *
+ * \param x is the argument
+ *
+ * \note This function supports only float and double scalar types. To support
+ * other scalar types, the user has to provide implementations of k0(T) for
+ * any scalar type T to be supported.
+ *
+ * \sa ArrayBase::bessel_k0()
+ */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+ Eigen::internal::scalar_bessel_k0_op<typename Derived::Scalar>, const Derived>
+bessel_k0(const Eigen::ArrayBase<Derived>& x) {
+ return Eigen::CwiseUnaryOp<
+ Eigen::internal::scalar_bessel_k0_op<typename Derived::Scalar>,
+ const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise k0e(\a x) to the given
+ * arrays.
+ *
+ * It returns the exponentially scaled modified Bessel
+ * function of the second kind of order zero.
+ *
+ * \param x is the argument
+ *
+ * \note This function supports only float and double scalar types. To support
+ * other scalar types, the user has to provide implementations of k0e(T) for
+ * any scalar type T to be supported.
+ *
+ * \sa ArrayBase::bessel_k0e()
+ */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+ Eigen::internal::scalar_bessel_k0e_op<typename Derived::Scalar>, const Derived>
+bessel_k0e(const Eigen::ArrayBase<Derived>& x) {
+ return Eigen::CwiseUnaryOp<
+ Eigen::internal::scalar_bessel_k0e_op<typename Derived::Scalar>,
+ const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise k1(\a x) to the given
+ * arrays.
+ *
+ * It returns the modified Bessel function of the second kind of order one.
+ *
+ * \param x is the argument
+ *
+ * \note This function supports only float and double scalar types. To support
+ * other scalar types, the user has to provide implementations of k1(T) for
+ * any scalar type T to be supported.
+ *
+ * \sa ArrayBase::bessel_k1()
+ */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+ Eigen::internal::scalar_bessel_k1_op<typename Derived::Scalar>, const Derived>
+bessel_k1(const Eigen::ArrayBase<Derived>& x) {
+ return Eigen::CwiseUnaryOp<
+ Eigen::internal::scalar_bessel_k1_op<typename Derived::Scalar>,
+ const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise k1e(\a x) to the given
+ * arrays.
+ *
+ * It returns the exponentially scaled modified Bessel
+ * function of the second kind of order one.
+ *
+ * \param x is the argument
+ *
+ * \note This function supports only float and double scalar types. To support
+ * other scalar types, the user has to provide implementations of k1e(T) for
+ * any scalar type T to be supported.
+ *
+ * \sa ArrayBase::bessel_k1e()
+ */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+ Eigen::internal::scalar_bessel_k1e_op<typename Derived::Scalar>, const Derived>
+bessel_k1e(const Eigen::ArrayBase<Derived>& x) {
+ return Eigen::CwiseUnaryOp<
+ Eigen::internal::scalar_bessel_k1e_op<typename Derived::Scalar>,
+ const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise j0(\a x) to the given
+ * arrays.
+ *
+ * It returns the Bessel function of the first kind of order zero.
+ *
+ * \param x is the argument
+ *
+ * \note This function supports only float and double scalar types. To support
+ * other scalar types, the user has to provide implementations of j0(T) for
+ * any scalar type T to be supported.
+ *
+ * \sa ArrayBase::bessel_j0()
+ */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+ Eigen::internal::scalar_bessel_j0_op<typename Derived::Scalar>, const Derived>
+bessel_j0(const Eigen::ArrayBase<Derived>& x) {
+ return Eigen::CwiseUnaryOp<
+ Eigen::internal::scalar_bessel_j0_op<typename Derived::Scalar>,
+ const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise y0(\a x) to the given
+ * arrays.
+ *
+ * It returns the Bessel function of the second kind of order zero.
+ *
+ * \param x is the argument
+ *
+ * \note This function supports only float and double scalar types. To support
+ * other scalar types, the user has to provide implementations of y0(T) for
+ * any scalar type T to be supported.
+ *
+ * \sa ArrayBase::bessel_y0()
+ */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+ Eigen::internal::scalar_bessel_y0_op<typename Derived::Scalar>, const Derived>
+bessel_y0(const Eigen::ArrayBase<Derived>& x) {
+ return Eigen::CwiseUnaryOp<
+ Eigen::internal::scalar_bessel_y0_op<typename Derived::Scalar>,
+ const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise j1(\a x) to the given
+ * arrays.
+ *
+ * It returns the modified Bessel function of the first kind of order one.
+ *
+ * \param x is the argument
+ *
+ * \note This function supports only float and double scalar types. To support
+ * other scalar types, the user has to provide implementations of j1(T) for
+ * any scalar type T to be supported.
+ *
+ * \sa ArrayBase::bessel_j1()
+ */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+ Eigen::internal::scalar_bessel_j1_op<typename Derived::Scalar>, const Derived>
+bessel_j1(const Eigen::ArrayBase<Derived>& x) {
+ return Eigen::CwiseUnaryOp<
+ Eigen::internal::scalar_bessel_j1_op<typename Derived::Scalar>,
+ const Derived>(x.derived());
+}
+
+/** \returns an expression of the coefficient-wise y1(\a x) to the given
+ * arrays.
+ *
+ * It returns the Bessel function of the second kind of order one.
+ *
+ * \param x is the argument
+ *
+ * \note This function supports only float and double scalar types. To support
+ * other scalar types, the user has to provide implementations of y1(T) for
+ * any scalar type T to be supported.
+ *
+ * \sa ArrayBase::bessel_y1()
+ */
+template <typename Derived>
+EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
+ Eigen::internal::scalar_bessel_y1_op<typename Derived::Scalar>, const Derived>
+bessel_y1(const Eigen::ArrayBase<Derived>& x) {
+ return Eigen::CwiseUnaryOp<
+ Eigen::internal::scalar_bessel_y1_op<typename Derived::Scalar>,
+ const Derived>(x.derived());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_BESSELFUNCTIONS_ARRAYAPI_H
diff --git a/src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsBFloat16.h b/src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsBFloat16.h
new file mode 100644
index 0000000..6049cc2
--- /dev/null
+++ b/src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsBFloat16.h
@@ -0,0 +1,68 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BESSELFUNCTIONS_BFLOAT16_H
+#define EIGEN_BESSELFUNCTIONS_BFLOAT16_H
+
+namespace Eigen {
+namespace numext {
+
+#if EIGEN_HAS_C99_MATH
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_i0(const Eigen::bfloat16& x) {
+ return Eigen::bfloat16(Eigen::numext::bessel_i0(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_i0e(const Eigen::bfloat16& x) {
+ return Eigen::bfloat16(Eigen::numext::bessel_i0e(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_i1(const Eigen::bfloat16& x) {
+ return Eigen::bfloat16(Eigen::numext::bessel_i1(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_i1e(const Eigen::bfloat16& x) {
+ return Eigen::bfloat16(Eigen::numext::bessel_i1e(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_j0(const Eigen::bfloat16& x) {
+ return Eigen::bfloat16(Eigen::numext::bessel_j0(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_j1(const Eigen::bfloat16& x) {
+ return Eigen::bfloat16(Eigen::numext::bessel_j1(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_y0(const Eigen::bfloat16& x) {
+ return Eigen::bfloat16(Eigen::numext::bessel_y0(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_y1(const Eigen::bfloat16& x) {
+ return Eigen::bfloat16(Eigen::numext::bessel_y1(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_k0(const Eigen::bfloat16& x) {
+ return Eigen::bfloat16(Eigen::numext::bessel_k0(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_k0e(const Eigen::bfloat16& x) {
+ return Eigen::bfloat16(Eigen::numext::bessel_k0e(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_k1(const Eigen::bfloat16& x) {
+ return Eigen::bfloat16(Eigen::numext::bessel_k1(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_k1e(const Eigen::bfloat16& x) {
+ return Eigen::bfloat16(Eigen::numext::bessel_k1e(static_cast<float>(x)));
+}
+#endif
+
+} // end namespace numext
+} // end namespace Eigen
+
+#endif // EIGEN_BESSELFUNCTIONS_BFLOAT16_H
diff --git a/src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsFunctors.h b/src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsFunctors.h
new file mode 100644
index 0000000..8606a9f
--- /dev/null
+++ b/src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsFunctors.h
@@ -0,0 +1,357 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com>
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BESSELFUNCTIONS_FUNCTORS_H
+#define EIGEN_BESSELFUNCTIONS_FUNCTORS_H
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal
+ * \brief Template functor to compute the modified Bessel function of the first
+ * kind of order zero.
+ * \sa class CwiseUnaryOp, Cwise::bessel_i0()
+ */
+template <typename Scalar>
+struct scalar_bessel_i0_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_i0_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+ using numext::bessel_i0;
+ return bessel_i0(x);
+ }
+ typedef typename packet_traits<Scalar>::type Packet;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+ return internal::pbessel_i0(x);
+ }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_i0_op<Scalar> > {
+ enum {
+ // On average, a Chebyshev polynomial of order N=20 is computed.
+ // The cost is N multiplications and 2N additions. We also add
+ // the cost of an additional exp over i0e.
+ Cost = 28 * NumTraits<Scalar>::MulCost + 48 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasBessel
+ };
+};
+
+/** \internal
+ * \brief Template functor to compute the exponentially scaled modified Bessel
+ * function of the first kind of order zero
+ * \sa class CwiseUnaryOp, Cwise::bessel_i0e()
+ */
+template <typename Scalar>
+struct scalar_bessel_i0e_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_i0e_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+ using numext::bessel_i0e;
+ return bessel_i0e(x);
+ }
+ typedef typename packet_traits<Scalar>::type Packet;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+ return internal::pbessel_i0e(x);
+ }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_i0e_op<Scalar> > {
+ enum {
+ // On average, a Chebyshev polynomial of order N=20 is computed.
+ // The cost is N multiplications and 2N additions.
+ Cost = 20 * NumTraits<Scalar>::MulCost + 40 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasBessel
+ };
+};
+
+/** \internal
+ * \brief Template functor to compute the modified Bessel function of the first
+ * kind of order one
+ * \sa class CwiseUnaryOp, Cwise::bessel_i1()
+ */
+template <typename Scalar>
+struct scalar_bessel_i1_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_i1_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+ using numext::bessel_i1;
+ return bessel_i1(x);
+ }
+ typedef typename packet_traits<Scalar>::type Packet;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+ return internal::pbessel_i1(x);
+ }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_i1_op<Scalar> > {
+ enum {
+ // On average, a Chebyshev polynomial of order N=20 is computed.
+ // The cost is N multiplications and 2N additions. We also add
+ // the cost of an additional exp over i1e.
+ Cost = 28 * NumTraits<Scalar>::MulCost + 48 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasBessel
+ };
+};
+
+/** \internal
+ * \brief Template functor to compute the exponentially scaled modified Bessel
+ * function of the first kind of order zero
+ * \sa class CwiseUnaryOp, Cwise::bessel_i1e()
+ */
+template <typename Scalar>
+struct scalar_bessel_i1e_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_i1e_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+ using numext::bessel_i1e;
+ return bessel_i1e(x);
+ }
+ typedef typename packet_traits<Scalar>::type Packet;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+ return internal::pbessel_i1e(x);
+ }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_i1e_op<Scalar> > {
+ enum {
+ // On average, a Chebyshev polynomial of order N=20 is computed.
+ // The cost is N multiplications and 2N additions.
+ Cost = 20 * NumTraits<Scalar>::MulCost + 40 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasBessel
+ };
+};
+
+/** \internal
+ * \brief Template functor to compute the Bessel function of the second kind of
+ * order zero
+ * \sa class CwiseUnaryOp, Cwise::bessel_j0()
+ */
+template <typename Scalar>
+struct scalar_bessel_j0_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_j0_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+ using numext::bessel_j0;
+ return bessel_j0(x);
+ }
+ typedef typename packet_traits<Scalar>::type Packet;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+ return internal::pbessel_j0(x);
+ }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_j0_op<Scalar> > {
+ enum {
+ // 6 polynomial of order ~N=8 is computed.
+ // The cost is N multiplications and N additions each, along with a
+ // sine, cosine and rsqrt cost.
+ Cost = 63 * NumTraits<Scalar>::MulCost + 48 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasBessel
+ };
+};
+
+/** \internal
+ * \brief Template functor to compute the Bessel function of the second kind of
+ * order zero
+ * \sa class CwiseUnaryOp, Cwise::bessel_y0()
+ */
+template <typename Scalar>
+struct scalar_bessel_y0_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_y0_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+ using numext::bessel_y0;
+ return bessel_y0(x);
+ }
+ typedef typename packet_traits<Scalar>::type Packet;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+ return internal::pbessel_y0(x);
+ }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_y0_op<Scalar> > {
+ enum {
+ // 6 polynomial of order ~N=8 is computed.
+ // The cost is N multiplications and N additions each, along with a
+ // sine, cosine, rsqrt and j0 cost.
+ Cost = 126 * NumTraits<Scalar>::MulCost + 96 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasBessel
+ };
+};
+
+/** \internal
+ * \brief Template functor to compute the Bessel function of the first kind of
+ * order one
+ * \sa class CwiseUnaryOp, Cwise::bessel_j1()
+ */
+template <typename Scalar>
+struct scalar_bessel_j1_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_j1_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+ using numext::bessel_j1;
+ return bessel_j1(x);
+ }
+ typedef typename packet_traits<Scalar>::type Packet;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+ return internal::pbessel_j1(x);
+ }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_j1_op<Scalar> > {
+ enum {
+ // 6 polynomial of order ~N=8 is computed.
+ // The cost is N multiplications and N additions each, along with a
+ // sine, cosine and rsqrt cost.
+ Cost = 63 * NumTraits<Scalar>::MulCost + 48 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasBessel
+ };
+};
+
+/** \internal
+ * \brief Template functor to compute the Bessel function of the second kind of
+ * order one
+ * \sa class CwiseUnaryOp, Cwise::bessel_j1e()
+ */
+template <typename Scalar>
+struct scalar_bessel_y1_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_y1_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+ using numext::bessel_y1;
+ return bessel_y1(x);
+ }
+ typedef typename packet_traits<Scalar>::type Packet;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+ return internal::pbessel_y1(x);
+ }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_y1_op<Scalar> > {
+ enum {
+ // 6 polynomial of order ~N=8 is computed.
+ // The cost is N multiplications and N additions each, along with a
+ // sine, cosine, rsqrt and j1 cost.
+ Cost = 126 * NumTraits<Scalar>::MulCost + 96 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasBessel
+ };
+};
+
+/** \internal
+ * \brief Template functor to compute the modified Bessel function of the second
+ * kind of order zero
+ * \sa class CwiseUnaryOp, Cwise::bessel_k0()
+ */
+template <typename Scalar>
+struct scalar_bessel_k0_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_k0_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+ using numext::bessel_k0;
+ return bessel_k0(x);
+ }
+ typedef typename packet_traits<Scalar>::type Packet;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+ return internal::pbessel_k0(x);
+ }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_k0_op<Scalar> > {
+ enum {
+ // On average, a Chebyshev polynomial of order N=10 is computed.
+ // The cost is N multiplications and 2N additions. In addition we compute
+ // i0, a log, exp and prsqrt and sin and cos.
+ Cost = 68 * NumTraits<Scalar>::MulCost + 88 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasBessel
+ };
+};
+
+/** \internal
+ * \brief Template functor to compute the exponentially scaled modified Bessel
+ * function of the second kind of order zero
+ * \sa class CwiseUnaryOp, Cwise::bessel_k0e()
+ */
+template <typename Scalar>
+struct scalar_bessel_k0e_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_k0e_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+ using numext::bessel_k0e;
+ return bessel_k0e(x);
+ }
+ typedef typename packet_traits<Scalar>::type Packet;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+ return internal::pbessel_k0e(x);
+ }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_k0e_op<Scalar> > {
+ enum {
+ // On average, a Chebyshev polynomial of order N=10 is computed.
+ // The cost is N multiplications and 2N additions. In addition we compute
+ // i0, a log, exp and prsqrt and sin and cos.
+ Cost = 68 * NumTraits<Scalar>::MulCost + 88 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasBessel
+ };
+};
+
+/** \internal
+ * \brief Template functor to compute the modified Bessel function of the
+ * second kind of order one
+ * \sa class CwiseUnaryOp, Cwise::bessel_k1()
+ */
+template <typename Scalar>
+struct scalar_bessel_k1_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_k1_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+ using numext::bessel_k1;
+ return bessel_k1(x);
+ }
+ typedef typename packet_traits<Scalar>::type Packet;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+ return internal::pbessel_k1(x);
+ }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_k1_op<Scalar> > {
+ enum {
+ // On average, a Chebyshev polynomial of order N=10 is computed.
+ // The cost is N multiplications and 2N additions. In addition we compute
+ // i1, a log, exp and prsqrt and sin and cos.
+ Cost = 68 * NumTraits<Scalar>::MulCost + 88 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasBessel
+ };
+};
+
+/** \internal
+ * \brief Template functor to compute the exponentially scaled modified Bessel
+ * function of the second kind of order one
+ * \sa class CwiseUnaryOp, Cwise::bessel_k1e()
+ */
+template <typename Scalar>
+struct scalar_bessel_k1e_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_k1e_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+ using numext::bessel_k1e;
+ return bessel_k1e(x);
+ }
+ typedef typename packet_traits<Scalar>::type Packet;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+ return internal::pbessel_k1e(x);
+ }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_k1e_op<Scalar> > {
+ enum {
+ // On average, a Chebyshev polynomial of order N=10 is computed.
+ // The cost is N multiplications and 2N additions. In addition we compute
+ // i1, a log, exp and prsqrt and sin and cos.
+ Cost = 68 * NumTraits<Scalar>::MulCost + 88 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasBessel
+ };
+};
+
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_BESSELFUNCTIONS_FUNCTORS_H
diff --git a/src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsHalf.h b/src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsHalf.h
new file mode 100644
index 0000000..8930d1a
--- /dev/null
+++ b/src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsHalf.h
@@ -0,0 +1,66 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BESSELFUNCTIONS_HALF_H
+#define EIGEN_BESSELFUNCTIONS_HALF_H
+
+namespace Eigen {
+namespace numext {
+
+#if EIGEN_HAS_C99_MATH
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_i0(const Eigen::half& x) {
+ return Eigen::half(Eigen::numext::bessel_i0(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_i0e(const Eigen::half& x) {
+ return Eigen::half(Eigen::numext::bessel_i0e(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_i1(const Eigen::half& x) {
+ return Eigen::half(Eigen::numext::bessel_i1(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_i1e(const Eigen::half& x) {
+ return Eigen::half(Eigen::numext::bessel_i1e(static_cast<float>(x)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_j0(const Eigen::half& x) {
+ return Eigen::half(Eigen::numext::bessel_j0(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_j1(const Eigen::half& x) {
+ return Eigen::half(Eigen::numext::bessel_j1(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_y0(const Eigen::half& x) {
+ return Eigen::half(Eigen::numext::bessel_y0(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_y1(const Eigen::half& x) {
+ return Eigen::half(Eigen::numext::bessel_y1(static_cast<float>(x)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_k0(const Eigen::half& x) {
+ return Eigen::half(Eigen::numext::bessel_k0(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_k0e(const Eigen::half& x) {
+ return Eigen::half(Eigen::numext::bessel_k0e(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_k1(const Eigen::half& x) {
+ return Eigen::half(Eigen::numext::bessel_k1(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_k1e(const Eigen::half& x) {
+ return Eigen::half(Eigen::numext::bessel_k1e(static_cast<float>(x)));
+}
+#endif
+
+} // end namespace numext
+} // end namespace Eigen
+
+#endif // EIGEN_BESSELFUNCTIONS_HALF_H
diff --git a/src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsImpl.h b/src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsImpl.h
new file mode 100644
index 0000000..24812be
--- /dev/null
+++ b/src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsImpl.h
@@ -0,0 +1,1959 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Eugene Brevdo <ebrevdo@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BESSEL_FUNCTIONS_H
+#define EIGEN_BESSEL_FUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+// Parts of this code are based on the Cephes Math Library.
+//
+// Cephes Math Library Release 2.8: June, 2000
+// Copyright 1984, 1987, 1992, 2000 by Stephen L. Moshier
+//
+// Permission has been kindly provided by the original author
+// to incorporate the Cephes software into the Eigen codebase:
+//
+// From: Stephen Moshier
+// To: Eugene Brevdo
+// Subject: Re: Permission to wrap several cephes functions in Eigen
+//
+// Hello Eugene,
+//
+// Thank you for writing.
+//
+// If your licensing is similar to BSD, the formal way that has been
+// handled is simply to add a statement to the effect that you are incorporating
+// the Cephes software by permission of the author.
+//
+// Good luck with your project,
+// Steve
+
+
+/****************************************************************************
+ * Implementation of Bessel function, based on Cephes *
+ ****************************************************************************/
+
+template <typename Scalar>
+struct bessel_i0e_retval {
+ typedef Scalar type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_i0e {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T&) {
+ EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return ScalarType(0);
+ }
+};
+
+template <typename T>
+struct generic_i0e<T, float> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T& x) {
+ /* i0ef.c
+ *
+ * Modified Bessel function of order zero,
+ * exponentially scaled
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * float x, y, i0ef();
+ *
+ * y = i0ef( x );
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Returns exponentially scaled modified Bessel function
+ * of order zero of the argument.
+ *
+ * The function is defined as i0e(x) = exp(-|x|) j0( ix ).
+ *
+ *
+ *
+ * ACCURACY:
+ *
+ * Relative error:
+ * arithmetic domain # trials peak rms
+ * IEEE 0,30 100000 3.7e-7 7.0e-8
+ * See i0f().
+ *
+ */
+
+ const float A[] = {-1.30002500998624804212E-8f, 6.04699502254191894932E-8f,
+ -2.67079385394061173391E-7f, 1.11738753912010371815E-6f,
+ -4.41673835845875056359E-6f, 1.64484480707288970893E-5f,
+ -5.75419501008210370398E-5f, 1.88502885095841655729E-4f,
+ -5.76375574538582365885E-4f, 1.63947561694133579842E-3f,
+ -4.32430999505057594430E-3f, 1.05464603945949983183E-2f,
+ -2.37374148058994688156E-2f, 4.93052842396707084878E-2f,
+ -9.49010970480476444210E-2f, 1.71620901522208775349E-1f,
+ -3.04682672343198398683E-1f, 6.76795274409476084995E-1f};
+
+ const float B[] = {3.39623202570838634515E-9f, 2.26666899049817806459E-8f,
+ 2.04891858946906374183E-7f, 2.89137052083475648297E-6f,
+ 6.88975834691682398426E-5f, 3.36911647825569408990E-3f,
+ 8.04490411014108831608E-1f};
+ T y = pabs(x);
+ T y_le_eight = internal::pchebevl<T, 18>::run(
+ pmadd(pset1<T>(0.5f), y, pset1<T>(-2.0f)), A);
+ T y_gt_eight = pmul(
+ internal::pchebevl<T, 7>::run(
+ psub(pdiv(pset1<T>(32.0f), y), pset1<T>(2.0f)), B),
+ prsqrt(y));
+ // TODO: Perhaps instead check whether all packet elements are in
+ // [-8, 8] and evaluate a branch based off of that. It's possible
+ // in practice most elements are in this region.
+ return pselect(pcmp_le(y, pset1<T>(8.0f)), y_le_eight, y_gt_eight);
+ }
+};
+
+template <typename T>
+struct generic_i0e<T, double> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T& x) {
+ /* i0e.c
+ *
+ * Modified Bessel function of order zero,
+ * exponentially scaled
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * double x, y, i0e();
+ *
+ * y = i0e( x );
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Returns exponentially scaled modified Bessel function
+ * of order zero of the argument.
+ *
+ * The function is defined as i0e(x) = exp(-|x|) j0( ix ).
+ *
+ *
+ *
+ * ACCURACY:
+ *
+ * Relative error:
+ * arithmetic domain # trials peak rms
+ * IEEE 0,30 30000 5.4e-16 1.2e-16
+ * See i0().
+ *
+ */
+
+ const double A[] = {-4.41534164647933937950E-18, 3.33079451882223809783E-17,
+ -2.43127984654795469359E-16, 1.71539128555513303061E-15,
+ -1.16853328779934516808E-14, 7.67618549860493561688E-14,
+ -4.85644678311192946090E-13, 2.95505266312963983461E-12,
+ -1.72682629144155570723E-11, 9.67580903537323691224E-11,
+ -5.18979560163526290666E-10, 2.65982372468238665035E-9,
+ -1.30002500998624804212E-8, 6.04699502254191894932E-8,
+ -2.67079385394061173391E-7, 1.11738753912010371815E-6,
+ -4.41673835845875056359E-6, 1.64484480707288970893E-5,
+ -5.75419501008210370398E-5, 1.88502885095841655729E-4,
+ -5.76375574538582365885E-4, 1.63947561694133579842E-3,
+ -4.32430999505057594430E-3, 1.05464603945949983183E-2,
+ -2.37374148058994688156E-2, 4.93052842396707084878E-2,
+ -9.49010970480476444210E-2, 1.71620901522208775349E-1,
+ -3.04682672343198398683E-1, 6.76795274409476084995E-1};
+ const double B[] = {
+ -7.23318048787475395456E-18, -4.83050448594418207126E-18,
+ 4.46562142029675999901E-17, 3.46122286769746109310E-17,
+ -2.82762398051658348494E-16, -3.42548561967721913462E-16,
+ 1.77256013305652638360E-15, 3.81168066935262242075E-15,
+ -9.55484669882830764870E-15, -4.15056934728722208663E-14,
+ 1.54008621752140982691E-14, 3.85277838274214270114E-13,
+ 7.18012445138366623367E-13, -1.79417853150680611778E-12,
+ -1.32158118404477131188E-11, -3.14991652796324136454E-11,
+ 1.18891471078464383424E-11, 4.94060238822496958910E-10,
+ 3.39623202570838634515E-9, 2.26666899049817806459E-8,
+ 2.04891858946906374183E-7, 2.89137052083475648297E-6,
+ 6.88975834691682398426E-5, 3.36911647825569408990E-3,
+ 8.04490411014108831608E-1};
+ T y = pabs(x);
+ T y_le_eight = internal::pchebevl<T, 30>::run(
+ pmadd(pset1<T>(0.5), y, pset1<T>(-2.0)), A);
+ T y_gt_eight = pmul(
+ internal::pchebevl<T, 25>::run(
+ psub(pdiv(pset1<T>(32.0), y), pset1<T>(2.0)), B),
+ prsqrt(y));
+ // TODO: Perhaps instead check whether all packet elements are in
+ // [-8, 8] and evaluate a branch based off of that. It's possible
+ // in practice most elements are in this region.
+ return pselect(pcmp_le(y, pset1<T>(8.0)), y_le_eight, y_gt_eight);
+ }
+};
+
+template <typename T>
+struct bessel_i0e_impl {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T x) {
+ return generic_i0e<T>::run(x);
+ }
+};
+
+template <typename Scalar>
+struct bessel_i0_retval {
+ typedef Scalar type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_i0 {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T& x) {
+ return pmul(
+ pexp(pabs(x)),
+ generic_i0e<T, ScalarType>::run(x));
+ }
+};
+
+template <typename T>
+struct bessel_i0_impl {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T x) {
+ return generic_i0<T>::run(x);
+ }
+};
+
+template <typename Scalar>
+struct bessel_i1e_retval {
+ typedef Scalar type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type >
+struct generic_i1e {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T&) {
+ EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return ScalarType(0);
+ }
+};
+
+template <typename T>
+struct generic_i1e<T, float> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T& x) {
+ /* i1ef.c
+ *
+ * Modified Bessel function of order one,
+ * exponentially scaled
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * float x, y, i1ef();
+ *
+ * y = i1ef( x );
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Returns exponentially scaled modified Bessel function
+ * of order one of the argument.
+ *
+ * The function is defined as i1(x) = -i exp(-|x|) j1( ix ).
+ *
+ *
+ *
+ * ACCURACY:
+ *
+ * Relative error:
+ * arithmetic domain # trials peak rms
+ * IEEE 0, 30 30000 1.5e-6 1.5e-7
+ * See i1().
+ *
+ */
+ const float A[] = {9.38153738649577178388E-9f, -4.44505912879632808065E-8f,
+ 2.00329475355213526229E-7f, -8.56872026469545474066E-7f,
+ 3.47025130813767847674E-6f, -1.32731636560394358279E-5f,
+ 4.78156510755005422638E-5f, -1.61760815825896745588E-4f,
+ 5.12285956168575772895E-4f, -1.51357245063125314899E-3f,
+ 4.15642294431288815669E-3f, -1.05640848946261981558E-2f,
+ 2.47264490306265168283E-2f, -5.29459812080949914269E-2f,
+ 1.02643658689847095384E-1f, -1.76416518357834055153E-1f,
+ 2.52587186443633654823E-1f};
+
+ const float B[] = {-3.83538038596423702205E-9f, -2.63146884688951950684E-8f,
+ -2.51223623787020892529E-7f, -3.88256480887769039346E-6f,
+ -1.10588938762623716291E-4f, -9.76109749136146840777E-3f,
+ 7.78576235018280120474E-1f};
+
+
+ T y = pabs(x);
+ T y_le_eight = pmul(y, internal::pchebevl<T, 17>::run(
+ pmadd(pset1<T>(0.5f), y, pset1<T>(-2.0f)), A));
+ T y_gt_eight = pmul(
+ internal::pchebevl<T, 7>::run(
+ psub(pdiv(pset1<T>(32.0f), y),
+ pset1<T>(2.0f)), B),
+ prsqrt(y));
+ // TODO: Perhaps instead check whether all packet elements are in
+ // [-8, 8] and evaluate a branch based off of that. It's possible
+ // in practice most elements are in this region.
+ y = pselect(pcmp_le(y, pset1<T>(8.0f)), y_le_eight, y_gt_eight);
+ return pselect(pcmp_lt(x, pset1<T>(0.0f)), pnegate(y), y);
+ }
+};
+
+template <typename T>
+struct generic_i1e<T, double> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T& x) {
+ /* i1e.c
+ *
+ * Modified Bessel function of order one,
+ * exponentially scaled
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * double x, y, i1e();
+ *
+ * y = i1e( x );
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Returns exponentially scaled modified Bessel function
+ * of order one of the argument.
+ *
+ * The function is defined as i1(x) = -i exp(-|x|) j1( ix ).
+ *
+ *
+ *
+ * ACCURACY:
+ *
+ * Relative error:
+ * arithmetic domain # trials peak rms
+ * IEEE 0, 30 30000 2.0e-15 2.0e-16
+ * See i1().
+ *
+ */
+ const double A[] = {2.77791411276104639959E-18, -2.11142121435816608115E-17,
+ 1.55363195773620046921E-16, -1.10559694773538630805E-15,
+ 7.60068429473540693410E-15, -5.04218550472791168711E-14,
+ 3.22379336594557470981E-13, -1.98397439776494371520E-12,
+ 1.17361862988909016308E-11, -6.66348972350202774223E-11,
+ 3.62559028155211703701E-10, -1.88724975172282928790E-9,
+ 9.38153738649577178388E-9, -4.44505912879632808065E-8,
+ 2.00329475355213526229E-7, -8.56872026469545474066E-7,
+ 3.47025130813767847674E-6, -1.32731636560394358279E-5,
+ 4.78156510755005422638E-5, -1.61760815825896745588E-4,
+ 5.12285956168575772895E-4, -1.51357245063125314899E-3,
+ 4.15642294431288815669E-3, -1.05640848946261981558E-2,
+ 2.47264490306265168283E-2, -5.29459812080949914269E-2,
+ 1.02643658689847095384E-1, -1.76416518357834055153E-1,
+ 2.52587186443633654823E-1};
+ const double B[] = {
+ 7.51729631084210481353E-18, 4.41434832307170791151E-18,
+ -4.65030536848935832153E-17, -3.20952592199342395980E-17,
+ 2.96262899764595013876E-16, 3.30820231092092828324E-16,
+ -1.88035477551078244854E-15, -3.81440307243700780478E-15,
+ 1.04202769841288027642E-14, 4.27244001671195135429E-14,
+ -2.10154184277266431302E-14, -4.08355111109219731823E-13,
+ -7.19855177624590851209E-13, 2.03562854414708950722E-12,
+ 1.41258074366137813316E-11, 3.25260358301548823856E-11,
+ -1.89749581235054123450E-11, -5.58974346219658380687E-10,
+ -3.83538038596423702205E-9, -2.63146884688951950684E-8,
+ -2.51223623787020892529E-7, -3.88256480887769039346E-6,
+ -1.10588938762623716291E-4, -9.76109749136146840777E-3,
+ 7.78576235018280120474E-1};
+ T y = pabs(x);
+ T y_le_eight = pmul(y, internal::pchebevl<T, 29>::run(
+ pmadd(pset1<T>(0.5), y, pset1<T>(-2.0)), A));
+ T y_gt_eight = pmul(
+ internal::pchebevl<T, 25>::run(
+ psub(pdiv(pset1<T>(32.0), y),
+ pset1<T>(2.0)), B),
+ prsqrt(y));
+ // TODO: Perhaps instead check whether all packet elements are in
+ // [-8, 8] and evaluate a branch based off of that. It's possible
+ // in practice most elements are in this region.
+ y = pselect(pcmp_le(y, pset1<T>(8.0)), y_le_eight, y_gt_eight);
+ return pselect(pcmp_lt(x, pset1<T>(0.0)), pnegate(y), y);
+ }
+};
+
+template <typename T>
+struct bessel_i1e_impl {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T x) {
+ return generic_i1e<T>::run(x);
+ }
+};
+
+template <typename T>
+struct bessel_i1_retval {
+ typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_i1 {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T& x) {
+ return pmul(
+ pexp(pabs(x)),
+ generic_i1e<T, ScalarType>::run(x));
+ }
+};
+
+template <typename T>
+struct bessel_i1_impl {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T x) {
+ return generic_i1<T>::run(x);
+ }
+};
+
+template <typename T>
+struct bessel_k0e_retval {
+ typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_k0e {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T&) {
+ EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return ScalarType(0);
+ }
+};
+
+template <typename T>
+struct generic_k0e<T, float> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T& x) {
+ /* k0ef.c
+ * Modified Bessel function, third kind, order zero,
+ * exponentially scaled
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * float x, y, k0ef();
+ *
+ * y = k0ef( x );
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Returns exponentially scaled modified Bessel function
+ * of the third kind of order zero of the argument.
+ *
+ *
+ *
+ * ACCURACY:
+ *
+ * Relative error:
+ * arithmetic domain # trials peak rms
+ * IEEE 0, 30 30000 8.1e-7 7.8e-8
+ * See k0().
+ *
+ */
+
+ const float A[] = {1.90451637722020886025E-9f, 2.53479107902614945675E-7f,
+ 2.28621210311945178607E-5f, 1.26461541144692592338E-3f,
+ 3.59799365153615016266E-2f, 3.44289899924628486886E-1f,
+ -5.35327393233902768720E-1f};
+
+ const float B[] = {-1.69753450938905987466E-9f, 8.57403401741422608519E-9f,
+ -4.66048989768794782956E-8f, 2.76681363944501510342E-7f,
+ -1.83175552271911948767E-6f, 1.39498137188764993662E-5f,
+ -1.28495495816278026384E-4f, 1.56988388573005337491E-3f,
+ -3.14481013119645005427E-2f, 2.44030308206595545468E0f};
+ const T MAXNUM = pset1<T>(NumTraits<float>::infinity());
+ const T two = pset1<T>(2.0);
+ T x_le_two = internal::pchebevl<T, 7>::run(
+ pmadd(x, x, pset1<T>(-2.0)), A);
+ x_le_two = pmadd(
+ generic_i0<T, float>::run(x), pnegate(
+ plog(pmul(pset1<T>(0.5), x))), x_le_two);
+ x_le_two = pmul(pexp(x), x_le_two);
+ T x_gt_two = pmul(
+ internal::pchebevl<T, 10>::run(
+ psub(pdiv(pset1<T>(8.0), x), two), B),
+ prsqrt(x));
+ return pselect(
+ pcmp_le(x, pset1<T>(0.0)),
+ MAXNUM,
+ pselect(pcmp_le(x, two), x_le_two, x_gt_two));
+ }
+};
+
+template <typename T>
+struct generic_k0e<T, double> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T& x) {
+ /* k0e.c
+ * Modified Bessel function, third kind, order zero,
+ * exponentially scaled
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * double x, y, k0e();
+ *
+ * y = k0e( x );
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Returns exponentially scaled modified Bessel function
+ * of the third kind of order zero of the argument.
+ *
+ *
+ *
+ * ACCURACY:
+ *
+ * Relative error:
+ * arithmetic domain # trials peak rms
+ * IEEE 0, 30 30000 1.4e-15 1.4e-16
+ * See k0().
+ *
+ */
+
+ const double A[] = {
+ 1.37446543561352307156E-16,
+ 4.25981614279661018399E-14,
+ 1.03496952576338420167E-11,
+ 1.90451637722020886025E-9,
+ 2.53479107902614945675E-7,
+ 2.28621210311945178607E-5,
+ 1.26461541144692592338E-3,
+ 3.59799365153615016266E-2,
+ 3.44289899924628486886E-1,
+ -5.35327393233902768720E-1};
+ const double B[] = {
+ 5.30043377268626276149E-18, -1.64758043015242134646E-17,
+ 5.21039150503902756861E-17, -1.67823109680541210385E-16,
+ 5.51205597852431940784E-16, -1.84859337734377901440E-15,
+ 6.34007647740507060557E-15, -2.22751332699166985548E-14,
+ 8.03289077536357521100E-14, -2.98009692317273043925E-13,
+ 1.14034058820847496303E-12, -4.51459788337394416547E-12,
+ 1.85594911495471785253E-11, -7.95748924447710747776E-11,
+ 3.57739728140030116597E-10, -1.69753450938905987466E-9,
+ 8.57403401741422608519E-9, -4.66048989768794782956E-8,
+ 2.76681363944501510342E-7, -1.83175552271911948767E-6,
+ 1.39498137188764993662E-5, -1.28495495816278026384E-4,
+ 1.56988388573005337491E-3, -3.14481013119645005427E-2,
+ 2.44030308206595545468E0
+ };
+ const T MAXNUM = pset1<T>(NumTraits<double>::infinity());
+ const T two = pset1<T>(2.0);
+ T x_le_two = internal::pchebevl<T, 10>::run(
+ pmadd(x, x, pset1<T>(-2.0)), A);
+ x_le_two = pmadd(
+ generic_i0<T, double>::run(x), pmul(
+ pset1<T>(-1.0), plog(pmul(pset1<T>(0.5), x))), x_le_two);
+ x_le_two = pmul(pexp(x), x_le_two);
+ x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
+ T x_gt_two = pmul(
+ internal::pchebevl<T, 25>::run(
+ psub(pdiv(pset1<T>(8.0), x), two), B),
+ prsqrt(x));
+ return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
+ }
+};
+
+template <typename T>
+struct bessel_k0e_impl {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T x) {
+ return generic_k0e<T>::run(x);
+ }
+};
+
+template <typename T>
+struct bessel_k0_retval {
+ typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_k0 {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T&) {
+ EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return ScalarType(0);
+ }
+};
+
+template <typename T>
+struct generic_k0<T, float> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T& x) {
+ /* k0f.c
+ * Modified Bessel function, third kind, order zero
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * float x, y, k0f();
+ *
+ * y = k0f( x );
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Returns modified Bessel function of the third kind
+ * of order zero of the argument.
+ *
+ * The range is partitioned into the two intervals [0,8] and
+ * (8, infinity). Chebyshev polynomial expansions are employed
+ * in each interval.
+ *
+ *
+ *
+ * ACCURACY:
+ *
+ * Tested at 2000 random points between 0 and 8. Peak absolute
+ * error (relative when K0 > 1) was 1.46e-14; rms, 4.26e-15.
+ * Relative error:
+ * arithmetic domain # trials peak rms
+ * IEEE 0, 30 30000 7.8e-7 8.5e-8
+ *
+ * ERROR MESSAGES:
+ *
+ * message condition value returned
+ * K0 domain x <= 0 MAXNUM
+ *
+ */
+
+ const float A[] = {1.90451637722020886025E-9f, 2.53479107902614945675E-7f,
+ 2.28621210311945178607E-5f, 1.26461541144692592338E-3f,
+ 3.59799365153615016266E-2f, 3.44289899924628486886E-1f,
+ -5.35327393233902768720E-1f};
+
+ const float B[] = {-1.69753450938905987466E-9f, 8.57403401741422608519E-9f,
+ -4.66048989768794782956E-8f, 2.76681363944501510342E-7f,
+ -1.83175552271911948767E-6f, 1.39498137188764993662E-5f,
+ -1.28495495816278026384E-4f, 1.56988388573005337491E-3f,
+ -3.14481013119645005427E-2f, 2.44030308206595545468E0f};
+ const T MAXNUM = pset1<T>(NumTraits<float>::infinity());
+ const T two = pset1<T>(2.0);
+ T x_le_two = internal::pchebevl<T, 7>::run(
+ pmadd(x, x, pset1<T>(-2.0)), A);
+ x_le_two = pmadd(
+ generic_i0<T, float>::run(x), pnegate(
+ plog(pmul(pset1<T>(0.5), x))), x_le_two);
+ x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
+ T x_gt_two = pmul(
+ pmul(
+ pexp(pnegate(x)),
+ internal::pchebevl<T, 10>::run(
+ psub(pdiv(pset1<T>(8.0), x), two), B)),
+ prsqrt(x));
+ return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
+ }
+};
+
+template <typename T>
+struct generic_k0<T, double> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T& x) {
+ /*
+ *
+ * Modified Bessel function, third kind, order zero,
+ * exponentially scaled
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * double x, y, k0();
+ *
+ * y = k0( x );
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Returns exponentially scaled modified Bessel function
+ * of the third kind of order zero of the argument.
+ *
+ *
+ *
+ * ACCURACY:
+ *
+ * Relative error:
+ * arithmetic domain # trials peak rms
+ * IEEE 0, 30 30000 1.4e-15 1.4e-16
+ * See k0().
+ *
+ */
+ const double A[] = {
+ 1.37446543561352307156E-16,
+ 4.25981614279661018399E-14,
+ 1.03496952576338420167E-11,
+ 1.90451637722020886025E-9,
+ 2.53479107902614945675E-7,
+ 2.28621210311945178607E-5,
+ 1.26461541144692592338E-3,
+ 3.59799365153615016266E-2,
+ 3.44289899924628486886E-1,
+ -5.35327393233902768720E-1};
+ const double B[] = {
+ 5.30043377268626276149E-18, -1.64758043015242134646E-17,
+ 5.21039150503902756861E-17, -1.67823109680541210385E-16,
+ 5.51205597852431940784E-16, -1.84859337734377901440E-15,
+ 6.34007647740507060557E-15, -2.22751332699166985548E-14,
+ 8.03289077536357521100E-14, -2.98009692317273043925E-13,
+ 1.14034058820847496303E-12, -4.51459788337394416547E-12,
+ 1.85594911495471785253E-11, -7.95748924447710747776E-11,
+ 3.57739728140030116597E-10, -1.69753450938905987466E-9,
+ 8.57403401741422608519E-9, -4.66048989768794782956E-8,
+ 2.76681363944501510342E-7, -1.83175552271911948767E-6,
+ 1.39498137188764993662E-5, -1.28495495816278026384E-4,
+ 1.56988388573005337491E-3, -3.14481013119645005427E-2,
+ 2.44030308206595545468E0
+ };
+ const T MAXNUM = pset1<T>(NumTraits<double>::infinity());
+ const T two = pset1<T>(2.0);
+ T x_le_two = internal::pchebevl<T, 10>::run(
+ pmadd(x, x, pset1<T>(-2.0)), A);
+ x_le_two = pmadd(
+ generic_i0<T, double>::run(x), pnegate(
+ plog(pmul(pset1<T>(0.5), x))), x_le_two);
+ x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
+ T x_gt_two = pmul(
+ pmul(
+ pexp(-x),
+ internal::pchebevl<T, 25>::run(
+ psub(pdiv(pset1<T>(8.0), x), two), B)),
+ prsqrt(x));
+ return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
+ }
+};
+
+template <typename T>
+struct bessel_k0_impl {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T x) {
+ return generic_k0<T>::run(x);
+ }
+};
+
+template <typename T>
+struct bessel_k1e_retval {
+ typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_k1e {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T&) {
+ EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return ScalarType(0);
+ }
+};
+
+template <typename T>
+struct generic_k1e<T, float> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T& x) {
+ /* k1ef.c
+ *
+ * Modified Bessel function, third kind, order one,
+ * exponentially scaled
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * float x, y, k1ef();
+ *
+ * y = k1ef( x );
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Returns exponentially scaled modified Bessel function
+ * of the third kind of order one of the argument:
+ *
+ * k1e(x) = exp(x) * k1(x).
+ *
+ *
+ *
+ * ACCURACY:
+ *
+ * Relative error:
+ * arithmetic domain # trials peak rms
+ * IEEE 0, 30 30000 4.9e-7 6.7e-8
+ * See k1().
+ *
+ */
+
+ const float A[] = {-2.21338763073472585583E-8f, -2.43340614156596823496E-6f,
+ -1.73028895751305206302E-4f, -6.97572385963986435018E-3f,
+ -1.22611180822657148235E-1f, -3.53155960776544875667E-1f,
+ 1.52530022733894777053E0f};
+ const float B[] = {2.01504975519703286596E-9f, -1.03457624656780970260E-8f,
+ 5.74108412545004946722E-8f, -3.50196060308781257119E-7f,
+ 2.40648494783721712015E-6f, -1.93619797416608296024E-5f,
+ 1.95215518471351631108E-4f, -2.85781685962277938680E-3f,
+ 1.03923736576817238437E-1f, 2.72062619048444266945E0f};
+ const T MAXNUM = pset1<T>(NumTraits<float>::infinity());
+ const T two = pset1<T>(2.0);
+ T x_le_two = pdiv(internal::pchebevl<T, 7>::run(
+ pmadd(x, x, pset1<T>(-2.0)), A), x);
+ x_le_two = pmadd(
+ generic_i1<T, float>::run(x), plog(pmul(pset1<T>(0.5), x)), x_le_two);
+ x_le_two = pmul(x_le_two, pexp(x));
+ x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
+ T x_gt_two = pmul(
+ internal::pchebevl<T, 10>::run(
+ psub(pdiv(pset1<T>(8.0), x), two), B),
+ prsqrt(x));
+ return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
+ }
+};
+
+template <typename T>
+struct generic_k1e<T, double> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T& x) {
+ /* k1e.c
+ *
+ * Modified Bessel function, third kind, order one,
+ * exponentially scaled
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * double x, y, k1e();
+ *
+ * y = k1e( x );
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Returns exponentially scaled modified Bessel function
+ * of the third kind of order one of the argument:
+ *
+ * k1e(x) = exp(x) * k1(x).
+ *
+ *
+ *
+ * ACCURACY:
+ *
+ * Relative error:
+ * arithmetic domain # trials peak rms
+ * IEEE 0, 30 30000 7.8e-16 1.2e-16
+ * See k1().
+ *
+ */
+ const double A[] = {-7.02386347938628759343E-18, -2.42744985051936593393E-15,
+ -6.66690169419932900609E-13, -1.41148839263352776110E-10,
+ -2.21338763073472585583E-8, -2.43340614156596823496E-6,
+ -1.73028895751305206302E-4, -6.97572385963986435018E-3,
+ -1.22611180822657148235E-1, -3.53155960776544875667E-1,
+ 1.52530022733894777053E0};
+ const double B[] = {-5.75674448366501715755E-18, 1.79405087314755922667E-17,
+ -5.68946255844285935196E-17, 1.83809354436663880070E-16,
+ -6.05704724837331885336E-16, 2.03870316562433424052E-15,
+ -7.01983709041831346144E-15, 2.47715442448130437068E-14,
+ -8.97670518232499435011E-14, 3.34841966607842919884E-13,
+ -1.28917396095102890680E-12, 5.13963967348173025100E-12,
+ -2.12996783842756842877E-11, 9.21831518760500529508E-11,
+ -4.19035475934189648750E-10, 2.01504975519703286596E-9,
+ -1.03457624656780970260E-8, 5.74108412545004946722E-8,
+ -3.50196060308781257119E-7, 2.40648494783721712015E-6,
+ -1.93619797416608296024E-5, 1.95215518471351631108E-4,
+ -2.85781685962277938680E-3, 1.03923736576817238437E-1,
+ 2.72062619048444266945E0};
+ const T MAXNUM = pset1<T>(NumTraits<double>::infinity());
+ const T two = pset1<T>(2.0);
+ T x_le_two = pdiv(internal::pchebevl<T, 11>::run(
+ pmadd(x, x, pset1<T>(-2.0)), A), x);
+ x_le_two = pmadd(
+ generic_i1<T, double>::run(x), plog(pmul(pset1<T>(0.5), x)), x_le_two);
+ x_le_two = pmul(x_le_two, pexp(x));
+ x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
+ T x_gt_two = pmul(
+ internal::pchebevl<T, 25>::run(
+ psub(pdiv(pset1<T>(8.0), x), two), B),
+ prsqrt(x));
+ return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
+ }
+};
+
+template <typename T>
+struct bessel_k1e_impl {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T x) {
+ return generic_k1e<T>::run(x);
+ }
+};
+
+template <typename T>
+struct bessel_k1_retval {
+ typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_k1 {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T&) {
+ EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return ScalarType(0);
+ }
+};
+
+template <typename T>
+struct generic_k1<T, float> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T& x) {
+ /* k1f.c
+ * Modified Bessel function, third kind, order one
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * float x, y, k1f();
+ *
+ * y = k1f( x );
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Computes the modified Bessel function of the third kind
+ * of order one of the argument.
+ *
+ * The range is partitioned into the two intervals [0,2] and
+ * (2, infinity). Chebyshev polynomial expansions are employed
+ * in each interval.
+ *
+ *
+ *
+ * ACCURACY:
+ *
+ * Relative error:
+ * arithmetic domain # trials peak rms
+ * IEEE 0, 30 30000 4.6e-7 7.6e-8
+ *
+ * ERROR MESSAGES:
+ *
+ * message condition value returned
+ * k1 domain x <= 0 MAXNUM
+ *
+ */
+
+ const float A[] = {-2.21338763073472585583E-8f, -2.43340614156596823496E-6f,
+ -1.73028895751305206302E-4f, -6.97572385963986435018E-3f,
+ -1.22611180822657148235E-1f, -3.53155960776544875667E-1f,
+ 1.52530022733894777053E0f};
+ const float B[] = {2.01504975519703286596E-9f, -1.03457624656780970260E-8f,
+ 5.74108412545004946722E-8f, -3.50196060308781257119E-7f,
+ 2.40648494783721712015E-6f, -1.93619797416608296024E-5f,
+ 1.95215518471351631108E-4f, -2.85781685962277938680E-3f,
+ 1.03923736576817238437E-1f, 2.72062619048444266945E0f};
+ const T MAXNUM = pset1<T>(NumTraits<float>::infinity());
+ const T two = pset1<T>(2.0);
+ T x_le_two = pdiv(internal::pchebevl<T, 7>::run(
+ pmadd(x, x, pset1<T>(-2.0)), A), x);
+ x_le_two = pmadd(
+ generic_i1<T, float>::run(x), plog(pmul(pset1<T>(0.5), x)), x_le_two);
+ x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
+ T x_gt_two = pmul(
+ pexp(pnegate(x)),
+ pmul(
+ internal::pchebevl<T, 10>::run(
+ psub(pdiv(pset1<T>(8.0), x), two), B),
+ prsqrt(x)));
+ return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
+ }
+};
+
+template <typename T>
+struct generic_k1<T, double> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T& x) {
+ /* k1.c
+ * Modified Bessel function, third kind, order one
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * float x, y, k1f();
+ *
+ * y = k1f( x );
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Computes the modified Bessel function of the third kind
+ * of order one of the argument.
+ *
+ * The range is partitioned into the two intervals [0,2] and
+ * (2, infinity). Chebyshev polynomial expansions are employed
+ * in each interval.
+ *
+ *
+ *
+ * ACCURACY:
+ *
+ * Relative error:
+ * arithmetic domain # trials peak rms
+ * IEEE 0, 30 30000 4.6e-7 7.6e-8
+ *
+ * ERROR MESSAGES:
+ *
+ * message condition value returned
+ * k1 domain x <= 0 MAXNUM
+ *
+ */
+ const double A[] = {-7.02386347938628759343E-18, -2.42744985051936593393E-15,
+ -6.66690169419932900609E-13, -1.41148839263352776110E-10,
+ -2.21338763073472585583E-8, -2.43340614156596823496E-6,
+ -1.73028895751305206302E-4, -6.97572385963986435018E-3,
+ -1.22611180822657148235E-1, -3.53155960776544875667E-1,
+ 1.52530022733894777053E0};
+ const double B[] = {-5.75674448366501715755E-18, 1.79405087314755922667E-17,
+ -5.68946255844285935196E-17, 1.83809354436663880070E-16,
+ -6.05704724837331885336E-16, 2.03870316562433424052E-15,
+ -7.01983709041831346144E-15, 2.47715442448130437068E-14,
+ -8.97670518232499435011E-14, 3.34841966607842919884E-13,
+ -1.28917396095102890680E-12, 5.13963967348173025100E-12,
+ -2.12996783842756842877E-11, 9.21831518760500529508E-11,
+ -4.19035475934189648750E-10, 2.01504975519703286596E-9,
+ -1.03457624656780970260E-8, 5.74108412545004946722E-8,
+ -3.50196060308781257119E-7, 2.40648494783721712015E-6,
+ -1.93619797416608296024E-5, 1.95215518471351631108E-4,
+ -2.85781685962277938680E-3, 1.03923736576817238437E-1,
+ 2.72062619048444266945E0};
+ const T MAXNUM = pset1<T>(NumTraits<double>::infinity());
+ const T two = pset1<T>(2.0);
+ T x_le_two = pdiv(internal::pchebevl<T, 11>::run(
+ pmadd(x, x, pset1<T>(-2.0)), A), x);
+ x_le_two = pmadd(
+ generic_i1<T, double>::run(x), plog(pmul(pset1<T>(0.5), x)), x_le_two);
+ x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
+ T x_gt_two = pmul(
+ pexp(-x),
+ pmul(
+ internal::pchebevl<T, 25>::run(
+ psub(pdiv(pset1<T>(8.0), x), two), B),
+ prsqrt(x)));
+ return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
+ }
+};
+
+template <typename T>
+struct bessel_k1_impl {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T x) {
+ return generic_k1<T>::run(x);
+ }
+};
+
+template <typename T>
+struct bessel_j0_retval {
+ typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_j0 {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T&) {
+ EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return ScalarType(0);
+ }
+};
+
+template <typename T>
+struct generic_j0<T, float> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T& x) {
+ /* j0f.c
+ * Bessel function of order zero
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * float x, y, j0f();
+ *
+ * y = j0f( x );
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Returns Bessel function of order zero of the argument.
+ *
+ * The domain is divided into the intervals [0, 2] and
+ * (2, infinity). In the first interval the following polynomial
+ * approximation is used:
+ *
+ *
+ * 2 2 2
+ * (w - r ) (w - r ) (w - r ) P(w)
+ * 1 2 3
+ *
+ * 2
+ * where w = x and the three r's are zeros of the function.
+ *
+ * In the second interval, the modulus and phase are approximated
+ * by polynomials of the form Modulus(x) = sqrt(1/x) Q(1/x)
+ * and Phase(x) = x + 1/x R(1/x^2) - pi/4. The function is
+ *
+ * j0(x) = Modulus(x) cos( Phase(x) ).
+ *
+ *
+ *
+ * ACCURACY:
+ *
+ * Absolute error:
+ * arithmetic domain # trials peak rms
+ * IEEE 0, 2 100000 1.3e-7 3.6e-8
+ * IEEE 2, 32 100000 1.9e-7 5.4e-8
+ *
+ */
+
+ const float JP[] = {-6.068350350393235E-008f, 6.388945720783375E-006f,
+ -3.969646342510940E-004f, 1.332913422519003E-002f,
+ -1.729150680240724E-001f};
+ const float MO[] = {-6.838999669318810E-002f, 1.864949361379502E-001f,
+ -2.145007480346739E-001f, 1.197549369473540E-001f,
+ -3.560281861530129E-003f, -4.969382655296620E-002f,
+ -3.355424622293709E-006f, 7.978845717621440E-001f};
+ const float PH[] = {3.242077816988247E+001f, -3.630592630518434E+001f,
+ 1.756221482109099E+001f, -4.974978466280903E+000f,
+ 1.001973420681837E+000f, -1.939906941791308E-001f,
+ 6.490598792654666E-002f, -1.249992184872738E-001f};
+ const T DR1 = pset1<T>(5.78318596294678452118f);
+ const T NEG_PIO4F = pset1<T>(-0.7853981633974483096f); /* -pi / 4 */
+ T y = pabs(x);
+ T z = pmul(y, y);
+ T y_le_two = pselect(
+ pcmp_lt(y, pset1<T>(1.0e-3f)),
+ pmadd(z, pset1<T>(-0.25f), pset1<T>(1.0f)),
+ pmul(psub(z, DR1), internal::ppolevl<T, 4>::run(z, JP)));
+ T q = pdiv(pset1<T>(1.0f), y);
+ T w = prsqrt(y);
+ T p = pmul(w, internal::ppolevl<T, 7>::run(q, MO));
+ w = pmul(q, q);
+ T yn = pmadd(q, internal::ppolevl<T, 7>::run(w, PH), NEG_PIO4F);
+ T y_gt_two = pmul(p, pcos(padd(yn, y)));
+ return pselect(pcmp_le(y, pset1<T>(2.0)), y_le_two, y_gt_two);
+ }
+};
+
+template <typename T>
+struct generic_j0<T, double> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T& x) {
+ /* j0.c
+ * Bessel function of order zero
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * double x, y, j0();
+ *
+ * y = j0( x );
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Returns Bessel function of order zero of the argument.
+ *
+ * The domain is divided into the intervals [0, 5] and
+ * (5, infinity). In the first interval the following rational
+ * approximation is used:
+ *
+ *
+ * 2 2
+ * (w - r ) (w - r ) P (w) / Q (w)
+ * 1 2 3 8
+ *
+ * 2
+ * where w = x and the two r's are zeros of the function.
+ *
+ * In the second interval, the Hankel asymptotic expansion
+ * is employed with two rational functions of degree 6/6
+ * and 7/7.
+ *
+ *
+ *
+ * ACCURACY:
+ *
+ * Absolute error:
+ * arithmetic domain # trials peak rms
+ * DEC 0, 30 10000 4.4e-17 6.3e-18
+ * IEEE 0, 30 60000 4.2e-16 1.1e-16
+ *
+ */
+ const double PP[] = {7.96936729297347051624E-4, 8.28352392107440799803E-2,
+ 1.23953371646414299388E0, 5.44725003058768775090E0,
+ 8.74716500199817011941E0, 5.30324038235394892183E0,
+ 9.99999999999999997821E-1};
+ const double PQ[] = {9.24408810558863637013E-4, 8.56288474354474431428E-2,
+ 1.25352743901058953537E0, 5.47097740330417105182E0,
+ 8.76190883237069594232E0, 5.30605288235394617618E0,
+ 1.00000000000000000218E0};
+ const double QP[] = {-1.13663838898469149931E-2, -1.28252718670509318512E0,
+ -1.95539544257735972385E1, -9.32060152123768231369E1,
+ -1.77681167980488050595E2, -1.47077505154951170175E2,
+ -5.14105326766599330220E1, -6.05014350600728481186E0};
+ const double QQ[] = {1.00000000000000000000E0, 6.43178256118178023184E1,
+ 8.56430025976980587198E2, 3.88240183605401609683E3,
+ 7.24046774195652478189E3, 5.93072701187316984827E3,
+ 2.06209331660327847417E3, 2.42005740240291393179E2};
+ const double RP[] = {-4.79443220978201773821E9, 1.95617491946556577543E12,
+ -2.49248344360967716204E14, 9.70862251047306323952E15};
+ const double RQ[] = {1.00000000000000000000E0, 4.99563147152651017219E2,
+ 1.73785401676374683123E5, 4.84409658339962045305E7,
+ 1.11855537045356834862E10, 2.11277520115489217587E12,
+ 3.10518229857422583814E14, 3.18121955943204943306E16,
+ 1.71086294081043136091E18};
+ const T DR1 = pset1<T>(5.78318596294678452118E0);
+ const T DR2 = pset1<T>(3.04712623436620863991E1);
+ const T SQ2OPI = pset1<T>(7.9788456080286535587989E-1); /* sqrt(2 / pi) */
+ const T NEG_PIO4 = pset1<T>(-0.7853981633974483096); /* pi / 4 */
+
+ T y = pabs(x);
+ T z = pmul(y, y);
+ T y_le_five = pselect(
+ pcmp_lt(y, pset1<T>(1.0e-5)),
+ pmadd(z, pset1<T>(-0.25), pset1<T>(1.0)),
+ pmul(pmul(psub(z, DR1), psub(z, DR2)),
+ pdiv(internal::ppolevl<T, 3>::run(z, RP),
+ internal::ppolevl<T, 8>::run(z, RQ))));
+ T s = pdiv(pset1<T>(25.0), z);
+ T p = pdiv(
+ internal::ppolevl<T, 6>::run(s, PP),
+ internal::ppolevl<T, 6>::run(s, PQ));
+ T q = pdiv(
+ internal::ppolevl<T, 7>::run(s, QP),
+ internal::ppolevl<T, 7>::run(s, QQ));
+ T yn = padd(y, NEG_PIO4);
+ T w = pdiv(pset1<T>(-5.0), y);
+ p = pmadd(p, pcos(yn), pmul(w, pmul(q, psin(yn))));
+ T y_gt_five = pmul(p, pmul(SQ2OPI, prsqrt(y)));
+ return pselect(pcmp_le(y, pset1<T>(5.0)), y_le_five, y_gt_five);
+ }
+};
+
+template <typename T>
+struct bessel_j0_impl {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T x) {
+ return generic_j0<T>::run(x);
+ }
+};
+
+template <typename T>
+struct bessel_y0_retval {
+ typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_y0 {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T&) {
+ EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return ScalarType(0);
+ }
+};
+
+template <typename T>
+struct generic_y0<T, float> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T& x) {
+ /* j0f.c
+ * Bessel function of the second kind, order zero
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * float x, y, y0f();
+ *
+ * y = y0f( x );
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Returns Bessel function of the second kind, of order
+ * zero, of the argument.
+ *
+ * The domain is divided into the intervals [0, 2] and
+ * (2, infinity). In the first interval a rational approximation
+ * R(x) is employed to compute
+ *
+ * 2 2 2
+ * y0(x) = (w - r ) (w - r ) (w - r ) R(x) + 2/pi ln(x) j0(x).
+ * 1 2 3
+ *
+ * Thus a call to j0() is required. The three zeros are removed
+ * from R(x) to improve its numerical stability.
+ *
+ * In the second interval, the modulus and phase are approximated
+ * by polynomials of the form Modulus(x) = sqrt(1/x) Q(1/x)
+ * and Phase(x) = x + 1/x S(1/x^2) - pi/4. Then the function is
+ *
+ * y0(x) = Modulus(x) sin( Phase(x) ).
+ *
+ *
+ *
+ *
+ * ACCURACY:
+ *
+ * Absolute error, when y0(x) < 1; else relative error:
+ *
+ * arithmetic domain # trials peak rms
+ * IEEE 0, 2 100000 2.4e-7 3.4e-8
+ * IEEE 2, 32 100000 1.8e-7 5.3e-8
+ *
+ */
+
+ const float YP[] = {9.454583683980369E-008f, -9.413212653797057E-006f,
+ 5.344486707214273E-004f, -1.584289289821316E-002f,
+ 1.707584643733568E-001f};
+ const float MO[] = {-6.838999669318810E-002f, 1.864949361379502E-001f,
+ -2.145007480346739E-001f, 1.197549369473540E-001f,
+ -3.560281861530129E-003f, -4.969382655296620E-002f,
+ -3.355424622293709E-006f, 7.978845717621440E-001f};
+ const float PH[] = {3.242077816988247E+001f, -3.630592630518434E+001f,
+ 1.756221482109099E+001f, -4.974978466280903E+000f,
+ 1.001973420681837E+000f, -1.939906941791308E-001f,
+ 6.490598792654666E-002f, -1.249992184872738E-001f};
+ const T YZ1 = pset1<T>(0.43221455686510834878f);
+ const T TWOOPI = pset1<T>(0.636619772367581343075535f); /* 2 / pi */
+ const T NEG_PIO4F = pset1<T>(-0.7853981633974483096f); /* -pi / 4 */
+ const T NEG_MAXNUM = pset1<T>(-NumTraits<float>::infinity());
+ T z = pmul(x, x);
+ T x_le_two = pmul(TWOOPI, pmul(plog(x), generic_j0<T, float>::run(x)));
+ x_le_two = pmadd(
+ psub(z, YZ1), internal::ppolevl<T, 4>::run(z, YP), x_le_two);
+ x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), NEG_MAXNUM, x_le_two);
+ T q = pdiv(pset1<T>(1.0), x);
+ T w = prsqrt(x);
+ T p = pmul(w, internal::ppolevl<T, 7>::run(q, MO));
+ T u = pmul(q, q);
+ T xn = pmadd(q, internal::ppolevl<T, 7>::run(u, PH), NEG_PIO4F);
+ T x_gt_two = pmul(p, psin(padd(xn, x)));
+ return pselect(pcmp_le(x, pset1<T>(2.0)), x_le_two, x_gt_two);
+ }
+};
+
+template <typename T>
+struct generic_y0<T, double> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T& x) {
+ /* j0.c
+ * Bessel function of the second kind, order zero
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * double x, y, y0();
+ *
+ * y = y0( x );
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Returns Bessel function of the second kind, of order
+ * zero, of the argument.
+ *
+ * The domain is divided into the intervals [0, 5] and
+ * (5, infinity). In the first interval a rational approximation
+ * R(x) is employed to compute
+ * y0(x) = R(x) + 2 * log(x) * j0(x) / PI.
+ * Thus a call to j0() is required.
+ *
+ * In the second interval, the Hankel asymptotic expansion
+ * is employed with two rational functions of degree 6/6
+ * and 7/7.
+ *
+ *
+ *
+ * ACCURACY:
+ *
+ * Absolute error, when y0(x) < 1; else relative error:
+ *
+ * arithmetic domain # trials peak rms
+ * DEC 0, 30 9400 7.0e-17 7.9e-18
+ * IEEE 0, 30 30000 1.3e-15 1.6e-16
+ *
+ */
+ const double PP[] = {7.96936729297347051624E-4, 8.28352392107440799803E-2,
+ 1.23953371646414299388E0, 5.44725003058768775090E0,
+ 8.74716500199817011941E0, 5.30324038235394892183E0,
+ 9.99999999999999997821E-1};
+ const double PQ[] = {9.24408810558863637013E-4, 8.56288474354474431428E-2,
+ 1.25352743901058953537E0, 5.47097740330417105182E0,
+ 8.76190883237069594232E0, 5.30605288235394617618E0,
+ 1.00000000000000000218E0};
+ const double QP[] = {-1.13663838898469149931E-2, -1.28252718670509318512E0,
+ -1.95539544257735972385E1, -9.32060152123768231369E1,
+ -1.77681167980488050595E2, -1.47077505154951170175E2,
+ -5.14105326766599330220E1, -6.05014350600728481186E0};
+ const double QQ[] = {1.00000000000000000000E0, 6.43178256118178023184E1,
+ 8.56430025976980587198E2, 3.88240183605401609683E3,
+ 7.24046774195652478189E3, 5.93072701187316984827E3,
+ 2.06209331660327847417E3, 2.42005740240291393179E2};
+ const double YP[] = {1.55924367855235737965E4, -1.46639295903971606143E7,
+ 5.43526477051876500413E9, -9.82136065717911466409E11,
+ 8.75906394395366999549E13, -3.46628303384729719441E15,
+ 4.42733268572569800351E16, -1.84950800436986690637E16};
+ const double YQ[] = {1.00000000000000000000E0, 1.04128353664259848412E3,
+ 6.26107330137134956842E5, 2.68919633393814121987E8,
+ 8.64002487103935000337E10, 2.02979612750105546709E13,
+ 3.17157752842975028269E15, 2.50596256172653059228E17};
+ const T SQ2OPI = pset1<T>(7.9788456080286535587989E-1); /* sqrt(2 / pi) */
+ const T TWOOPI = pset1<T>(0.636619772367581343075535); /* 2 / pi */
+ const T NEG_PIO4 = pset1<T>(-0.7853981633974483096); /* -pi / 4 */
+ const T NEG_MAXNUM = pset1<T>(-NumTraits<double>::infinity());
+
+ T z = pmul(x, x);
+ T x_le_five = pdiv(internal::ppolevl<T, 7>::run(z, YP),
+ internal::ppolevl<T, 7>::run(z, YQ));
+ x_le_five = pmadd(
+ pmul(TWOOPI, plog(x)), generic_j0<T, double>::run(x), x_le_five);
+ x_le_five = pselect(pcmp_le(x, pset1<T>(0.0)), NEG_MAXNUM, x_le_five);
+ T s = pdiv(pset1<T>(25.0), z);
+ T p = pdiv(
+ internal::ppolevl<T, 6>::run(s, PP),
+ internal::ppolevl<T, 6>::run(s, PQ));
+ T q = pdiv(
+ internal::ppolevl<T, 7>::run(s, QP),
+ internal::ppolevl<T, 7>::run(s, QQ));
+ T xn = padd(x, NEG_PIO4);
+ T w = pdiv(pset1<T>(5.0), x);
+ p = pmadd(p, psin(xn), pmul(w, pmul(q, pcos(xn))));
+ T x_gt_five = pmul(p, pmul(SQ2OPI, prsqrt(x)));
+ return pselect(pcmp_le(x, pset1<T>(5.0)), x_le_five, x_gt_five);
+ }
+};
+
+template <typename T>
+struct bessel_y0_impl {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T x) {
+ return generic_y0<T>::run(x);
+ }
+};
+
+template <typename T>
+struct bessel_j1_retval {
+ typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_j1 {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T&) {
+ EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return ScalarType(0);
+ }
+};
+
+template <typename T>
+struct generic_j1<T, float> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T& x) {
+ /* j1f.c
+ * Bessel function of order one
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * float x, y, j1f();
+ *
+ * y = j1f( x );
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Returns Bessel function of order one of the argument.
+ *
+ * The domain is divided into the intervals [0, 2] and
+ * (2, infinity). In the first interval a polynomial approximation
+ * 2
+ * (w - r ) x P(w)
+ * 1
+ * 2
+ * is used, where w = x and r is the first zero of the function.
+ *
+ * In the second interval, the modulus and phase are approximated
+ * by polynomials of the form Modulus(x) = sqrt(1/x) Q(1/x)
+ * and Phase(x) = x + 1/x R(1/x^2) - 3pi/4. The function is
+ *
+ * j0(x) = Modulus(x) cos( Phase(x) ).
+ *
+ *
+ *
+ * ACCURACY:
+ *
+ * Absolute error:
+ * arithmetic domain # trials peak rms
+ * IEEE 0, 2 100000 1.2e-7 2.5e-8
+ * IEEE 2, 32 100000 2.0e-7 5.3e-8
+ *
+ *
+ */
+
+ const float JP[] = {-4.878788132172128E-009f, 6.009061827883699E-007f,
+ -4.541343896997497E-005f, 1.937383947804541E-003f,
+ -3.405537384615824E-002f};
+ const float MO1[] = {6.913942741265801E-002f, -2.284801500053359E-001f,
+ 3.138238455499697E-001f, -2.102302420403875E-001f,
+ 5.435364690523026E-003f, 1.493389585089498E-001f,
+ 4.976029650847191E-006f, 7.978845453073848E-001f};
+ const float PH1[] = {-4.497014141919556E+001f, 5.073465654089319E+001f,
+ -2.485774108720340E+001f, 7.222973196770240E+000f,
+ -1.544842782180211E+000f, 3.503787691653334E-001f,
+ -1.637986776941202E-001f, 3.749989509080821E-001f};
+ const T Z1 = pset1<T>(1.46819706421238932572E1f);
+ const T NEG_THPIO4F = pset1<T>(-2.35619449019234492885f); /* -3*pi/4 */
+
+ T y = pabs(x);
+ T z = pmul(y, y);
+ T y_le_two = pmul(
+ psub(z, Z1),
+ pmul(x, internal::ppolevl<T, 4>::run(z, JP)));
+ T q = pdiv(pset1<T>(1.0f), y);
+ T w = prsqrt(y);
+ T p = pmul(w, internal::ppolevl<T, 7>::run(q, MO1));
+ w = pmul(q, q);
+ T yn = pmadd(q, internal::ppolevl<T, 7>::run(w, PH1), NEG_THPIO4F);
+ T y_gt_two = pmul(p, pcos(padd(yn, y)));
+ // j1 is an odd function. This implementation differs from cephes to
+ // take this fact in to account. Cephes returns -j1(x) for y > 2 range.
+ y_gt_two = pselect(
+ pcmp_lt(x, pset1<T>(0.0f)), pnegate(y_gt_two), y_gt_two);
+ return pselect(pcmp_le(y, pset1<T>(2.0f)), y_le_two, y_gt_two);
+ }
+};
+
+template <typename T>
+struct generic_j1<T, double> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T& x) {
+ /* j1.c
+ * Bessel function of order one
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * double x, y, j1();
+ *
+ * y = j1( x );
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Returns Bessel function of order one of the argument.
+ *
+ * The domain is divided into the intervals [0, 8] and
+ * (8, infinity). In the first interval a 24 term Chebyshev
+ * expansion is used. In the second, the asymptotic
+ * trigonometric representation is employed using two
+ * rational functions of degree 5/5.
+ *
+ *
+ *
+ * ACCURACY:
+ *
+ * Absolute error:
+ * arithmetic domain # trials peak rms
+ * DEC 0, 30 10000 4.0e-17 1.1e-17
+ * IEEE 0, 30 30000 2.6e-16 1.1e-16
+ *
+ */
+ const double PP[] = {7.62125616208173112003E-4, 7.31397056940917570436E-2,
+ 1.12719608129684925192E0, 5.11207951146807644818E0,
+ 8.42404590141772420927E0, 5.21451598682361504063E0,
+ 1.00000000000000000254E0};
+ const double PQ[] = {5.71323128072548699714E-4, 6.88455908754495404082E-2,
+ 1.10514232634061696926E0, 5.07386386128601488557E0,
+ 8.39985554327604159757E0, 5.20982848682361821619E0,
+ 9.99999999999999997461E-1};
+ const double QP[] = {5.10862594750176621635E-2, 4.98213872951233449420E0,
+ 7.58238284132545283818E1, 3.66779609360150777800E2,
+ 7.10856304998926107277E2, 5.97489612400613639965E2,
+ 2.11688757100572135698E2, 2.52070205858023719784E1};
+ const double QQ[] = {1.00000000000000000000E0, 7.42373277035675149943E1,
+ 1.05644886038262816351E3, 4.98641058337653607651E3,
+ 9.56231892404756170795E3, 7.99704160447350683650E3,
+ 2.82619278517639096600E3, 3.36093607810698293419E2};
+ const double RP[] = {-8.99971225705559398224E8, 4.52228297998194034323E11,
+ -7.27494245221818276015E13, 3.68295732863852883286E15};
+ const double RQ[] = {1.00000000000000000000E0, 6.20836478118054335476E2,
+ 2.56987256757748830383E5, 8.35146791431949253037E7,
+ 2.21511595479792499675E10, 4.74914122079991414898E12,
+ 7.84369607876235854894E14, 8.95222336184627338078E16,
+ 5.32278620332680085395E18};
+ const T Z1 = pset1<T>(1.46819706421238932572E1);
+ const T Z2 = pset1<T>(4.92184563216946036703E1);
+ const T NEG_THPIO4 = pset1<T>(-2.35619449019234492885); /* -3*pi/4 */
+ const T SQ2OPI = pset1<T>(7.9788456080286535587989E-1); /* sqrt(2 / pi) */
+ T y = pabs(x);
+ T z = pmul(y, y);
+ T y_le_five = pdiv(internal::ppolevl<T, 3>::run(z, RP),
+ internal::ppolevl<T, 8>::run(z, RQ));
+ y_le_five = pmul(pmul(pmul(y_le_five, x), psub(z, Z1)), psub(z, Z2));
+ T s = pdiv(pset1<T>(25.0), z);
+ T p = pdiv(
+ internal::ppolevl<T, 6>::run(s, PP),
+ internal::ppolevl<T, 6>::run(s, PQ));
+ T q = pdiv(
+ internal::ppolevl<T, 7>::run(s, QP),
+ internal::ppolevl<T, 7>::run(s, QQ));
+ T yn = padd(y, NEG_THPIO4);
+ T w = pdiv(pset1<T>(-5.0), y);
+ p = pmadd(p, pcos(yn), pmul(w, pmul(q, psin(yn))));
+ T y_gt_five = pmul(p, pmul(SQ2OPI, prsqrt(y)));
+ // j1 is an odd function. This implementation differs from cephes to
+ // take this fact in to account. Cephes returns -j1(x) for y > 5 range.
+ y_gt_five = pselect(
+ pcmp_lt(x, pset1<T>(0.0)), pnegate(y_gt_five), y_gt_five);
+ return pselect(pcmp_le(y, pset1<T>(5.0)), y_le_five, y_gt_five);
+ }
+};
+
+template <typename T>
+struct bessel_j1_impl {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T x) {
+ return generic_j1<T>::run(x);
+ }
+};
+
+template <typename T>
+struct bessel_y1_retval {
+ typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_y1 {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T&) {
+ EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return ScalarType(0);
+ }
+};
+
+template <typename T>
+struct generic_y1<T, float> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T& x) {
+ /* j1f.c
+ * Bessel function of second kind of order one
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * double x, y, y1();
+ *
+ * y = y1( x );
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Returns Bessel function of the second kind of order one
+ * of the argument.
+ *
+ * The domain is divided into the intervals [0, 2] and
+ * (2, infinity). In the first interval a rational approximation
+ * R(x) is employed to compute
+ *
+ * 2
+ * y0(x) = (w - r ) x R(x^2) + 2/pi (ln(x) j1(x) - 1/x) .
+ * 1
+ *
+ * Thus a call to j1() is required.
+ *
+ * In the second interval, the modulus and phase are approximated
+ * by polynomials of the form Modulus(x) = sqrt(1/x) Q(1/x)
+ * and Phase(x) = x + 1/x S(1/x^2) - 3pi/4. Then the function is
+ *
+ * y0(x) = Modulus(x) sin( Phase(x) ).
+ *
+ *
+ *
+ *
+ * ACCURACY:
+ *
+ * Absolute error:
+ * arithmetic domain # trials peak rms
+ * IEEE 0, 2 100000 2.2e-7 4.6e-8
+ * IEEE 2, 32 100000 1.9e-7 5.3e-8
+ *
+ * (error criterion relative when |y1| > 1).
+ *
+ */
+
+ const float YP[] = {8.061978323326852E-009f, -9.496460629917016E-007f,
+ 6.719543806674249E-005f, -2.641785726447862E-003f,
+ 4.202369946500099E-002f};
+ const float MO1[] = {6.913942741265801E-002f, -2.284801500053359E-001f,
+ 3.138238455499697E-001f, -2.102302420403875E-001f,
+ 5.435364690523026E-003f, 1.493389585089498E-001f,
+ 4.976029650847191E-006f, 7.978845453073848E-001f};
+ const float PH1[] = {-4.497014141919556E+001f, 5.073465654089319E+001f,
+ -2.485774108720340E+001f, 7.222973196770240E+000f,
+ -1.544842782180211E+000f, 3.503787691653334E-001f,
+ -1.637986776941202E-001f, 3.749989509080821E-001f};
+ const T YO1 = pset1<T>(4.66539330185668857532f);
+ const T NEG_THPIO4F = pset1<T>(-2.35619449019234492885f); /* -3*pi/4 */
+ const T TWOOPI = pset1<T>(0.636619772367581343075535f); /* 2/pi */
+ const T NEG_MAXNUM = pset1<T>(-NumTraits<float>::infinity());
+
+ T z = pmul(x, x);
+ T x_le_two = pmul(psub(z, YO1), internal::ppolevl<T, 4>::run(z, YP));
+ x_le_two = pmadd(
+ x_le_two, x,
+ pmul(TWOOPI, pmadd(
+ generic_j1<T, float>::run(x), plog(x),
+ pdiv(pset1<T>(-1.0f), x))));
+ x_le_two = pselect(pcmp_lt(x, pset1<T>(0.0f)), NEG_MAXNUM, x_le_two);
+
+ T q = pdiv(pset1<T>(1.0), x);
+ T w = prsqrt(x);
+ T p = pmul(w, internal::ppolevl<T, 7>::run(q, MO1));
+ w = pmul(q, q);
+ T xn = pmadd(q, internal::ppolevl<T, 7>::run(w, PH1), NEG_THPIO4F);
+ T x_gt_two = pmul(p, psin(padd(xn, x)));
+ return pselect(pcmp_le(x, pset1<T>(2.0)), x_le_two, x_gt_two);
+ }
+};
+
+template <typename T>
+struct generic_y1<T, double> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T& x) {
+ /* j1.c
+ * Bessel function of second kind of order one
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * double x, y, y1();
+ *
+ * y = y1( x );
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Returns Bessel function of the second kind of order one
+ * of the argument.
+ *
+ * The domain is divided into the intervals [0, 8] and
+ * (8, infinity). In the first interval a 25 term Chebyshev
+ * expansion is used, and a call to j1() is required.
+ * In the second, the asymptotic trigonometric representation
+ * is employed using two rational functions of degree 5/5.
+ *
+ *
+ *
+ * ACCURACY:
+ *
+ * Absolute error:
+ * arithmetic domain # trials peak rms
+ * DEC 0, 30 10000 8.6e-17 1.3e-17
+ * IEEE 0, 30 30000 1.0e-15 1.3e-16
+ *
+ * (error criterion relative when |y1| > 1).
+ *
+ */
+ const double PP[] = {7.62125616208173112003E-4, 7.31397056940917570436E-2,
+ 1.12719608129684925192E0, 5.11207951146807644818E0,
+ 8.42404590141772420927E0, 5.21451598682361504063E0,
+ 1.00000000000000000254E0};
+ const double PQ[] = {5.71323128072548699714E-4, 6.88455908754495404082E-2,
+ 1.10514232634061696926E0, 5.07386386128601488557E0,
+ 8.39985554327604159757E0, 5.20982848682361821619E0,
+ 9.99999999999999997461E-1};
+ const double QP[] = {5.10862594750176621635E-2, 4.98213872951233449420E0,
+ 7.58238284132545283818E1, 3.66779609360150777800E2,
+ 7.10856304998926107277E2, 5.97489612400613639965E2,
+ 2.11688757100572135698E2, 2.52070205858023719784E1};
+ const double QQ[] = {1.00000000000000000000E0, 7.42373277035675149943E1,
+ 1.05644886038262816351E3, 4.98641058337653607651E3,
+ 9.56231892404756170795E3, 7.99704160447350683650E3,
+ 2.82619278517639096600E3, 3.36093607810698293419E2};
+ const double YP[] = {1.26320474790178026440E9, -6.47355876379160291031E11,
+ 1.14509511541823727583E14, -8.12770255501325109621E15,
+ 2.02439475713594898196E17, -7.78877196265950026825E17};
+ const double YQ[] = {1.00000000000000000000E0, 5.94301592346128195359E2,
+ 2.35564092943068577943E5, 7.34811944459721705660E7,
+ 1.87601316108706159478E10, 3.88231277496238566008E12,
+ 6.20557727146953693363E14, 6.87141087355300489866E16,
+ 3.97270608116560655612E18};
+ const T SQ2OPI = pset1<T>(.79788456080286535588);
+ const T NEG_THPIO4 = pset1<T>(-2.35619449019234492885); /* -3*pi/4 */
+ const T TWOOPI = pset1<T>(0.636619772367581343075535); /* 2/pi */
+ const T NEG_MAXNUM = pset1<T>(-NumTraits<double>::infinity());
+
+ T z = pmul(x, x);
+ T x_le_five = pdiv(internal::ppolevl<T, 5>::run(z, YP),
+ internal::ppolevl<T, 8>::run(z, YQ));
+ x_le_five = pmadd(
+ x_le_five, x, pmul(
+ TWOOPI, pmadd(generic_j1<T, double>::run(x), plog(x),
+ pdiv(pset1<T>(-1.0), x))));
+
+ x_le_five = pselect(pcmp_le(x, pset1<T>(0.0)), NEG_MAXNUM, x_le_five);
+ T s = pdiv(pset1<T>(25.0), z);
+ T p = pdiv(
+ internal::ppolevl<T, 6>::run(s, PP),
+ internal::ppolevl<T, 6>::run(s, PQ));
+ T q = pdiv(
+ internal::ppolevl<T, 7>::run(s, QP),
+ internal::ppolevl<T, 7>::run(s, QQ));
+ T xn = padd(x, NEG_THPIO4);
+ T w = pdiv(pset1<T>(5.0), x);
+ p = pmadd(p, psin(xn), pmul(w, pmul(q, pcos(xn))));
+ T x_gt_five = pmul(p, pmul(SQ2OPI, prsqrt(x)));
+ return pselect(pcmp_le(x, pset1<T>(5.0)), x_le_five, x_gt_five);
+ }
+};
+
+template <typename T>
+struct bessel_y1_impl {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T x) {
+ return generic_y1<T>::run(x);
+ }
+};
+
+} // end namespace internal
+
+namespace numext {
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_i0, Scalar)
+ bessel_i0(const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(bessel_i0, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_i0e, Scalar)
+ bessel_i0e(const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(bessel_i0e, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_i1, Scalar)
+ bessel_i1(const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(bessel_i1, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_i1e, Scalar)
+ bessel_i1e(const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(bessel_i1e, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_k0, Scalar)
+ bessel_k0(const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(bessel_k0, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_k0e, Scalar)
+ bessel_k0e(const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(bessel_k0e, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_k1, Scalar)
+ bessel_k1(const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(bessel_k1, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_k1e, Scalar)
+ bessel_k1e(const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(bessel_k1e, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_j0, Scalar)
+ bessel_j0(const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(bessel_j0, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_y0, Scalar)
+ bessel_y0(const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(bessel_y0, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_j1, Scalar)
+ bessel_j1(const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(bessel_j1, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_y1, Scalar)
+ bessel_y1(const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(bessel_y1, Scalar)::run(x);
+}
+
+} // end namespace numext
+
+} // end namespace Eigen
+
+#endif // EIGEN_BESSEL_FUNCTIONS_H
diff --git a/src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsPacketMath.h b/src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsPacketMath.h
new file mode 100644
index 0000000..943d10f
--- /dev/null
+++ b/src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsPacketMath.h
@@ -0,0 +1,118 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BESSELFUNCTIONS_PACKETMATH_H
+#define EIGEN_BESSELFUNCTIONS_PACKETMATH_H
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order zero i0(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_i0(const Packet& x) {
+ return numext::bessel_i0(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order zero i0e(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_i0e(const Packet& x) {
+ return numext::bessel_i0e(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order one i1(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_i1(const Packet& x) {
+ return numext::bessel_i1(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order one i1e(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_i1e(const Packet& x) {
+ return numext::bessel_i1e(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order zero j0(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_j0(const Packet& x) {
+ return numext::bessel_j0(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order zero j1(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_j1(const Packet& x) {
+ return numext::bessel_j1(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order one y0(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_y0(const Packet& x) {
+ return numext::bessel_y0(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order one y1(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_y1(const Packet& x) {
+ return numext::bessel_y1(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order zero k0(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_k0(const Packet& x) {
+ return numext::bessel_k0(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order zero k0e(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_k0e(const Packet& x) {
+ return numext::bessel_k0e(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order one k1e(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_k1(const Packet& x) {
+ return numext::bessel_k1(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order one k1e(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pbessel_k1e(const Packet& x) {
+ return numext::bessel_k1e(x);
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_BESSELFUNCTIONS_PACKETMATH_H
+
diff --git a/src/EigenUnsupported/src/SpecialFunctions/HipVectorCompatibility.h b/src/EigenUnsupported/src/SpecialFunctions/HipVectorCompatibility.h
new file mode 100644
index 0000000..d7b231a
--- /dev/null
+++ b/src/EigenUnsupported/src/SpecialFunctions/HipVectorCompatibility.h
@@ -0,0 +1,67 @@
+#ifndef HIP_VECTOR_COMPATIBILITY_H
+#define HIP_VECTOR_COMPATIBILITY_H
+
+namespace hip_impl {
+ template <typename, typename, unsigned int> struct Scalar_accessor;
+} // end namespace hip_impl
+
+namespace Eigen {
+namespace internal {
+
+#define HIP_SCALAR_ACCESSOR_BUILDER(NAME) \
+template <typename T, typename U, unsigned int n> \
+struct NAME <hip_impl::Scalar_accessor<T, U, n>> : NAME <T> {};
+
+#define HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(NAME) \
+template <typename T, typename U, unsigned int n> \
+struct NAME##_impl <hip_impl::Scalar_accessor<T, U, n>> : NAME##_impl <T> {}; \
+template <typename T, typename U, unsigned int n> \
+struct NAME##_retval <hip_impl::Scalar_accessor<T, U, n>> : NAME##_retval <T> {};
+
+#define HIP_SCALAR_ACCESSOR_BUILDER_IGAMMA(NAME) \
+template <typename T, typename U, unsigned int n, IgammaComputationMode mode> \
+struct NAME <hip_impl::Scalar_accessor<T, U, n>, mode> : NAME <T, mode> {};
+
+#if EIGEN_HAS_C99_MATH
+HIP_SCALAR_ACCESSOR_BUILDER(betainc_helper)
+HIP_SCALAR_ACCESSOR_BUILDER(incbeta_cfe)
+
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(erf)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(erfc)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(igammac)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(lgamma)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(ndtri)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(polygamma)
+
+HIP_SCALAR_ACCESSOR_BUILDER_IGAMMA(igamma_generic_impl)
+#endif
+
+HIP_SCALAR_ACCESSOR_BUILDER(digamma_impl_maybe_poly)
+HIP_SCALAR_ACCESSOR_BUILDER(zeta_impl_series)
+
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_i0)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_i0e)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_i1)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_i1e)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_j0)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_j1)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_k0)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_k0e)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_k1)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_k1e)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_y0)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_y1)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(betainc)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(digamma)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(gamma_sample_der_alpha)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(igamma_der_a)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(igamma)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(zeta)
+
+HIP_SCALAR_ACCESSOR_BUILDER_IGAMMA(igamma_series_impl)
+HIP_SCALAR_ACCESSOR_BUILDER_IGAMMA(igammac_cf_impl)
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // HIP_VECTOR_COMPATIBILITY_H
diff --git a/src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsArrayAPI.h b/src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsArrayAPI.h
new file mode 100644
index 0000000..691ff4d
--- /dev/null
+++ b/src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsArrayAPI.h
@@ -0,0 +1,167 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#ifndef EIGEN_SPECIALFUNCTIONS_ARRAYAPI_H
+#define EIGEN_SPECIALFUNCTIONS_ARRAYAPI_H
+
+namespace Eigen {
+
+/** \cpp11 \returns an expression of the coefficient-wise igamma(\a a, \a x) to the given arrays.
+ *
+ * This function computes the coefficient-wise incomplete gamma function.
+ *
+ * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+ * or float/double in non c++11 mode, the user has to provide implementations of igammac(T,T) for any scalar
+ * type T to be supported.
+ *
+ * \sa Eigen::igammac(), Eigen::lgamma()
+ */
+template<typename Derived,typename ExponentDerived>
+EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
+igamma(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x)
+{
+ return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived, const ExponentDerived>(
+ a.derived(),
+ x.derived()
+ );
+}
+
+/** \cpp11 \returns an expression of the coefficient-wise igamma_der_a(\a a, \a x) to the given arrays.
+ *
+ * This function computes the coefficient-wise derivative of the incomplete
+ * gamma function with respect to the parameter a.
+ *
+ * \note This function supports only float and double scalar types in c++11
+ * mode. To support other scalar types,
+ * or float/double in non c++11 mode, the user has to provide implementations
+ * of igamma_der_a(T,T) for any scalar
+ * type T to be supported.
+ *
+ * \sa Eigen::igamma(), Eigen::lgamma()
+ */
+template <typename Derived, typename ExponentDerived>
+EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_der_a_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
+igamma_der_a(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x) {
+ return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_der_a_op<typename Derived::Scalar>, const Derived, const ExponentDerived>(
+ a.derived(),
+ x.derived());
+}
+
+/** \cpp11 \returns an expression of the coefficient-wise gamma_sample_der_alpha(\a alpha, \a sample) to the given arrays.
+ *
+ * This function computes the coefficient-wise derivative of the sample
+ * of a Gamma(alpha, 1) random variable with respect to the parameter alpha.
+ *
+ * \note This function supports only float and double scalar types in c++11
+ * mode. To support other scalar types,
+ * or float/double in non c++11 mode, the user has to provide implementations
+ * of gamma_sample_der_alpha(T,T) for any scalar
+ * type T to be supported.
+ *
+ * \sa Eigen::igamma(), Eigen::lgamma()
+ */
+template <typename AlphaDerived, typename SampleDerived>
+EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp<Eigen::internal::scalar_gamma_sample_der_alpha_op<typename AlphaDerived::Scalar>, const AlphaDerived, const SampleDerived>
+gamma_sample_der_alpha(const Eigen::ArrayBase<AlphaDerived>& alpha, const Eigen::ArrayBase<SampleDerived>& sample) {
+ return Eigen::CwiseBinaryOp<Eigen::internal::scalar_gamma_sample_der_alpha_op<typename AlphaDerived::Scalar>, const AlphaDerived, const SampleDerived>(
+ alpha.derived(),
+ sample.derived());
+}
+
+/** \cpp11 \returns an expression of the coefficient-wise igammac(\a a, \a x) to the given arrays.
+ *
+ * This function computes the coefficient-wise complementary incomplete gamma function.
+ *
+ * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+ * or float/double in non c++11 mode, the user has to provide implementations of igammac(T,T) for any scalar
+ * type T to be supported.
+ *
+ * \sa Eigen::igamma(), Eigen::lgamma()
+ */
+template<typename Derived,typename ExponentDerived>
+EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
+igammac(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x)
+{
+ return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived, const ExponentDerived>(
+ a.derived(),
+ x.derived()
+ );
+}
+
+/** \cpp11 \returns an expression of the coefficient-wise polygamma(\a n, \a x) to the given arrays.
+ *
+ * It returns the \a n -th derivative of the digamma(psi) evaluated at \c x.
+ *
+ * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+ * or float/double in non c++11 mode, the user has to provide implementations of polygamma(T,T) for any scalar
+ * type T to be supported.
+ *
+ * \sa Eigen::digamma()
+ */
+// * \warning Be careful with the order of the parameters: x.polygamma(n) is equivalent to polygamma(n,x)
+// * \sa ArrayBase::polygamma()
+template<typename DerivedN,typename DerivedX>
+EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp<Eigen::internal::scalar_polygamma_op<typename DerivedX::Scalar>, const DerivedN, const DerivedX>
+polygamma(const Eigen::ArrayBase<DerivedN>& n, const Eigen::ArrayBase<DerivedX>& x)
+{
+ return Eigen::CwiseBinaryOp<Eigen::internal::scalar_polygamma_op<typename DerivedX::Scalar>, const DerivedN, const DerivedX>(
+ n.derived(),
+ x.derived()
+ );
+}
+
+/** \cpp11 \returns an expression of the coefficient-wise betainc(\a x, \a a, \a b) to the given arrays.
+ *
+ * This function computes the regularized incomplete beta function (integral).
+ *
+ * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+ * or float/double in non c++11 mode, the user has to provide implementations of betainc(T,T,T) for any scalar
+ * type T to be supported.
+ *
+ * \sa Eigen::betainc(), Eigen::lgamma()
+ */
+template<typename ArgADerived, typename ArgBDerived, typename ArgXDerived>
+EIGEN_STRONG_INLINE const Eigen::CwiseTernaryOp<Eigen::internal::scalar_betainc_op<typename ArgXDerived::Scalar>, const ArgADerived, const ArgBDerived, const ArgXDerived>
+betainc(const Eigen::ArrayBase<ArgADerived>& a, const Eigen::ArrayBase<ArgBDerived>& b, const Eigen::ArrayBase<ArgXDerived>& x)
+{
+ return Eigen::CwiseTernaryOp<Eigen::internal::scalar_betainc_op<typename ArgXDerived::Scalar>, const ArgADerived, const ArgBDerived, const ArgXDerived>(
+ a.derived(),
+ b.derived(),
+ x.derived()
+ );
+}
+
+
+/** \returns an expression of the coefficient-wise zeta(\a x, \a q) to the given arrays.
+ *
+ * It returns the Riemann zeta function of two arguments \a x and \a q:
+ *
+ * \param x is the exponent, it must be > 1
+ * \param q is the shift, it must be > 0
+ *
+ * \note This function supports only float and double scalar types. To support other scalar types, the user has
+ * to provide implementations of zeta(T,T) for any scalar type T to be supported.
+ *
+ * \sa ArrayBase::zeta()
+ */
+template<typename DerivedX,typename DerivedQ>
+EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp<Eigen::internal::scalar_zeta_op<typename DerivedX::Scalar>, const DerivedX, const DerivedQ>
+zeta(const Eigen::ArrayBase<DerivedX>& x, const Eigen::ArrayBase<DerivedQ>& q)
+{
+ return Eigen::CwiseBinaryOp<Eigen::internal::scalar_zeta_op<typename DerivedX::Scalar>, const DerivedX, const DerivedQ>(
+ x.derived(),
+ q.derived()
+ );
+}
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPECIALFUNCTIONS_ARRAYAPI_H
diff --git a/src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsBFloat16.h b/src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsBFloat16.h
new file mode 100644
index 0000000..2d94231
--- /dev/null
+++ b/src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsBFloat16.h
@@ -0,0 +1,58 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPECIALFUNCTIONS_BFLOAT16_H
+#define EIGEN_SPECIALFUNCTIONS_BFLOAT16_H
+
+namespace Eigen {
+namespace numext {
+
+#if EIGEN_HAS_C99_MATH
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 lgamma(const Eigen::bfloat16& a) {
+ return Eigen::bfloat16(Eigen::numext::lgamma(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 digamma(const Eigen::bfloat16& a) {
+ return Eigen::bfloat16(Eigen::numext::digamma(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 zeta(const Eigen::bfloat16& x, const Eigen::bfloat16& q) {
+ return Eigen::bfloat16(Eigen::numext::zeta(static_cast<float>(x), static_cast<float>(q)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 polygamma(const Eigen::bfloat16& n, const Eigen::bfloat16& x) {
+ return Eigen::bfloat16(Eigen::numext::polygamma(static_cast<float>(n), static_cast<float>(x)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 erf(const Eigen::bfloat16& a) {
+ return Eigen::bfloat16(Eigen::numext::erf(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 erfc(const Eigen::bfloat16& a) {
+ return Eigen::bfloat16(Eigen::numext::erfc(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 ndtri(const Eigen::bfloat16& a) {
+ return Eigen::bfloat16(Eigen::numext::ndtri(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 igamma(const Eigen::bfloat16& a, const Eigen::bfloat16& x) {
+ return Eigen::bfloat16(Eigen::numext::igamma(static_cast<float>(a), static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 igamma_der_a(const Eigen::bfloat16& a, const Eigen::bfloat16& x) {
+ return Eigen::bfloat16(Eigen::numext::igamma_der_a(static_cast<float>(a), static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 gamma_sample_der_alpha(const Eigen::bfloat16& alpha, const Eigen::bfloat16& sample) {
+ return Eigen::bfloat16(Eigen::numext::gamma_sample_der_alpha(static_cast<float>(alpha), static_cast<float>(sample)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 igammac(const Eigen::bfloat16& a, const Eigen::bfloat16& x) {
+ return Eigen::bfloat16(Eigen::numext::igammac(static_cast<float>(a), static_cast<float>(x)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 betainc(const Eigen::bfloat16& a, const Eigen::bfloat16& b, const Eigen::bfloat16& x) {
+ return Eigen::bfloat16(Eigen::numext::betainc(static_cast<float>(a), static_cast<float>(b), static_cast<float>(x)));
+}
+#endif
+
+} // end namespace numext
+} // end namespace Eigen
+
+#endif // EIGEN_SPECIALFUNCTIONS_BFLOAT16_H
diff --git a/src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsFunctors.h b/src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsFunctors.h
new file mode 100644
index 0000000..abefe99
--- /dev/null
+++ b/src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsFunctors.h
@@ -0,0 +1,330 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com>
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPECIALFUNCTIONS_FUNCTORS_H
+#define EIGEN_SPECIALFUNCTIONS_FUNCTORS_H
+
+namespace Eigen {
+
+namespace internal {
+
+
+/** \internal
+ * \brief Template functor to compute the incomplete gamma function igamma(a, x)
+ *
+ * \sa class CwiseBinaryOp, Cwise::igamma
+ */
+template<typename Scalar> struct scalar_igamma_op : binary_op_base<Scalar,Scalar>
+{
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_igamma_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& x) const {
+ using numext::igamma; return igamma(a, x);
+ }
+ template<typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const {
+ return internal::pigamma(a, x);
+ }
+};
+template<typename Scalar>
+struct functor_traits<scalar_igamma_op<Scalar> > {
+ enum {
+ // Guesstimate
+ Cost = 20 * NumTraits<Scalar>::MulCost + 10 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasIGamma
+ };
+};
+
+/** \internal
+ * \brief Template functor to compute the derivative of the incomplete gamma
+ * function igamma_der_a(a, x)
+ *
+ * \sa class CwiseBinaryOp, Cwise::igamma_der_a
+ */
+template <typename Scalar>
+struct scalar_igamma_der_a_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_igamma_der_a_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a, const Scalar& x) const {
+ using numext::igamma_der_a;
+ return igamma_der_a(a, x);
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const {
+ return internal::pigamma_der_a(a, x);
+ }
+};
+template <typename Scalar>
+struct functor_traits<scalar_igamma_der_a_op<Scalar> > {
+ enum {
+ // 2x the cost of igamma
+ Cost = 40 * NumTraits<Scalar>::MulCost + 20 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasIGammaDerA
+ };
+};
+
+/** \internal
+ * \brief Template functor to compute the derivative of the sample
+ * of a Gamma(alpha, 1) random variable with respect to the parameter alpha
+ * gamma_sample_der_alpha(alpha, sample)
+ *
+ * \sa class CwiseBinaryOp, Cwise::gamma_sample_der_alpha
+ */
+template <typename Scalar>
+struct scalar_gamma_sample_der_alpha_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_gamma_sample_der_alpha_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& alpha, const Scalar& sample) const {
+ using numext::gamma_sample_der_alpha;
+ return gamma_sample_der_alpha(alpha, sample);
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& alpha, const Packet& sample) const {
+ return internal::pgamma_sample_der_alpha(alpha, sample);
+ }
+};
+template <typename Scalar>
+struct functor_traits<scalar_gamma_sample_der_alpha_op<Scalar> > {
+ enum {
+ // 2x the cost of igamma, minus the lgamma cost (the lgamma cancels out)
+ Cost = 30 * NumTraits<Scalar>::MulCost + 15 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasGammaSampleDerAlpha
+ };
+};
+
+/** \internal
+ * \brief Template functor to compute the complementary incomplete gamma function igammac(a, x)
+ *
+ * \sa class CwiseBinaryOp, Cwise::igammac
+ */
+template<typename Scalar> struct scalar_igammac_op : binary_op_base<Scalar,Scalar>
+{
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_igammac_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& x) const {
+ using numext::igammac; return igammac(a, x);
+ }
+ template<typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const
+ {
+ return internal::pigammac(a, x);
+ }
+};
+template<typename Scalar>
+struct functor_traits<scalar_igammac_op<Scalar> > {
+ enum {
+ // Guesstimate
+ Cost = 20 * NumTraits<Scalar>::MulCost + 10 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasIGammac
+ };
+};
+
+
+/** \internal
+ * \brief Template functor to compute the incomplete beta integral betainc(a, b, x)
+ *
+ */
+template<typename Scalar> struct scalar_betainc_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_betainc_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& x, const Scalar& a, const Scalar& b) const {
+ using numext::betainc; return betainc(x, a, b);
+ }
+ template<typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& x, const Packet& a, const Packet& b) const
+ {
+ return internal::pbetainc(x, a, b);
+ }
+};
+template<typename Scalar>
+struct functor_traits<scalar_betainc_op<Scalar> > {
+ enum {
+ // Guesstimate
+ Cost = 400 * NumTraits<Scalar>::MulCost + 400 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasBetaInc
+ };
+};
+
+
+/** \internal
+ * \brief Template functor to compute the natural log of the absolute
+ * value of Gamma of a scalar
+ * \sa class CwiseUnaryOp, Cwise::lgamma()
+ */
+template<typename Scalar> struct scalar_lgamma_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_lgamma_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const {
+ using numext::lgamma; return lgamma(a);
+ }
+ typedef typename packet_traits<Scalar>::type Packet;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const { return internal::plgamma(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_lgamma_op<Scalar> >
+{
+ enum {
+ // Guesstimate
+ Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasLGamma
+ };
+};
+
+/** \internal
+ * \brief Template functor to compute psi, the derivative of lgamma of a scalar.
+ * \sa class CwiseUnaryOp, Cwise::digamma()
+ */
+template<typename Scalar> struct scalar_digamma_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_digamma_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const {
+ using numext::digamma; return digamma(a);
+ }
+ typedef typename packet_traits<Scalar>::type Packet;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const { return internal::pdigamma(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_digamma_op<Scalar> >
+{
+ enum {
+ // Guesstimate
+ Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasDiGamma
+ };
+};
+
+/** \internal
+ * \brief Template functor to compute the Riemann Zeta function of two arguments.
+ * \sa class CwiseUnaryOp, Cwise::zeta()
+ */
+template<typename Scalar> struct scalar_zeta_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_zeta_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& x, const Scalar& q) const {
+ using numext::zeta; return zeta(x, q);
+ }
+ typedef typename packet_traits<Scalar>::type Packet;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x, const Packet& q) const { return internal::pzeta(x, q); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_zeta_op<Scalar> >
+{
+ enum {
+ // Guesstimate
+ Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasZeta
+ };
+};
+
+/** \internal
+ * \brief Template functor to compute the polygamma function.
+ * \sa class CwiseUnaryOp, Cwise::polygamma()
+ */
+template<typename Scalar> struct scalar_polygamma_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_polygamma_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& n, const Scalar& x) const {
+ using numext::polygamma; return polygamma(n, x);
+ }
+ typedef typename packet_traits<Scalar>::type Packet;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& n, const Packet& x) const { return internal::ppolygamma(n, x); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_polygamma_op<Scalar> >
+{
+ enum {
+ // Guesstimate
+ Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasPolygamma
+ };
+};
+
+/** \internal
+ * \brief Template functor to compute the error function of a scalar
+ * \sa class CwiseUnaryOp, ArrayBase::erf()
+ */
+template<typename Scalar> struct scalar_erf_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_erf_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar
+ operator()(const Scalar& a) const {
+ return numext::erf(a);
+ }
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+ return perf(x);
+ }
+};
+template <typename Scalar>
+struct functor_traits<scalar_erf_op<Scalar> > {
+ enum {
+ PacketAccess = packet_traits<Scalar>::HasErf,
+ Cost =
+ (PacketAccess
+#ifdef EIGEN_VECTORIZE_FMA
+ // TODO(rmlarsen): Move the FMA cost model to a central location.
+ // Haswell can issue 2 add/mul/madd per cycle.
+ // 10 pmadd, 2 pmul, 1 div, 2 other
+ ? (2 * NumTraits<Scalar>::AddCost +
+ 7 * NumTraits<Scalar>::MulCost +
+ scalar_div_cost<Scalar, packet_traits<Scalar>::HasDiv>::value)
+#else
+ ? (12 * NumTraits<Scalar>::AddCost +
+ 12 * NumTraits<Scalar>::MulCost +
+ scalar_div_cost<Scalar, packet_traits<Scalar>::HasDiv>::value)
+#endif
+ // Assume for simplicity that this is as expensive as an exp().
+ : (functor_traits<scalar_exp_op<Scalar> >::Cost))
+ };
+};
+
+/** \internal
+ * \brief Template functor to compute the Complementary Error Function
+ * of a scalar
+ * \sa class CwiseUnaryOp, Cwise::erfc()
+ */
+template<typename Scalar> struct scalar_erfc_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_erfc_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const {
+ using numext::erfc; return erfc(a);
+ }
+ typedef typename packet_traits<Scalar>::type Packet;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const { return internal::perfc(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_erfc_op<Scalar> >
+{
+ enum {
+ // Guesstimate
+ Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasErfc
+ };
+};
+
+/** \internal
+ * \brief Template functor to compute the Inverse of the normal distribution
+ * function of a scalar
+ * \sa class CwiseUnaryOp, Cwise::ndtri()
+ */
+template<typename Scalar> struct scalar_ndtri_op {
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_ndtri_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const {
+ using numext::ndtri; return ndtri(a);
+ }
+ typedef typename packet_traits<Scalar>::type Packet;
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const { return internal::pndtri(a); }
+};
+template<typename Scalar>
+struct functor_traits<scalar_ndtri_op<Scalar> >
+{
+ enum {
+ // On average, We are evaluating rational functions with degree N=9 in the
+ // numerator and denominator. This results in 2*N additions and 2*N
+ // multiplications.
+ Cost = 18 * NumTraits<Scalar>::MulCost + 18 * NumTraits<Scalar>::AddCost,
+ PacketAccess = packet_traits<Scalar>::HasNdtri
+ };
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPECIALFUNCTIONS_FUNCTORS_H
diff --git a/src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsHalf.h b/src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsHalf.h
new file mode 100644
index 0000000..2a3a531
--- /dev/null
+++ b/src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsHalf.h
@@ -0,0 +1,58 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPECIALFUNCTIONS_HALF_H
+#define EIGEN_SPECIALFUNCTIONS_HALF_H
+
+namespace Eigen {
+namespace numext {
+
+#if EIGEN_HAS_C99_MATH
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half lgamma(const Eigen::half& a) {
+ return Eigen::half(Eigen::numext::lgamma(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half digamma(const Eigen::half& a) {
+ return Eigen::half(Eigen::numext::digamma(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half zeta(const Eigen::half& x, const Eigen::half& q) {
+ return Eigen::half(Eigen::numext::zeta(static_cast<float>(x), static_cast<float>(q)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half polygamma(const Eigen::half& n, const Eigen::half& x) {
+ return Eigen::half(Eigen::numext::polygamma(static_cast<float>(n), static_cast<float>(x)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erf(const Eigen::half& a) {
+ return Eigen::half(Eigen::numext::erf(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erfc(const Eigen::half& a) {
+ return Eigen::half(Eigen::numext::erfc(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ndtri(const Eigen::half& a) {
+ return Eigen::half(Eigen::numext::ndtri(static_cast<float>(a)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igamma(const Eigen::half& a, const Eigen::half& x) {
+ return Eigen::half(Eigen::numext::igamma(static_cast<float>(a), static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igamma_der_a(const Eigen::half& a, const Eigen::half& x) {
+ return Eigen::half(Eigen::numext::igamma_der_a(static_cast<float>(a), static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half gamma_sample_der_alpha(const Eigen::half& alpha, const Eigen::half& sample) {
+ return Eigen::half(Eigen::numext::gamma_sample_der_alpha(static_cast<float>(alpha), static_cast<float>(sample)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igammac(const Eigen::half& a, const Eigen::half& x) {
+ return Eigen::half(Eigen::numext::igammac(static_cast<float>(a), static_cast<float>(x)));
+}
+template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half betainc(const Eigen::half& a, const Eigen::half& b, const Eigen::half& x) {
+ return Eigen::half(Eigen::numext::betainc(static_cast<float>(a), static_cast<float>(b), static_cast<float>(x)));
+}
+#endif
+
+} // end namespace numext
+} // end namespace Eigen
+
+#endif // EIGEN_SPECIALFUNCTIONS_HALF_H
diff --git a/src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsImpl.h b/src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsImpl.h
new file mode 100644
index 0000000..f1c260e
--- /dev/null
+++ b/src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsImpl.h
@@ -0,0 +1,2045 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Eugene Brevdo <ebrevdo@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPECIAL_FUNCTIONS_H
+#define EIGEN_SPECIAL_FUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+// Parts of this code are based on the Cephes Math Library.
+//
+// Cephes Math Library Release 2.8: June, 2000
+// Copyright 1984, 1987, 1992, 2000 by Stephen L. Moshier
+//
+// Permission has been kindly provided by the original author
+// to incorporate the Cephes software into the Eigen codebase:
+//
+// From: Stephen Moshier
+// To: Eugene Brevdo
+// Subject: Re: Permission to wrap several cephes functions in Eigen
+//
+// Hello Eugene,
+//
+// Thank you for writing.
+//
+// If your licensing is similar to BSD, the formal way that has been
+// handled is simply to add a statement to the effect that you are incorporating
+// the Cephes software by permission of the author.
+//
+// Good luck with your project,
+// Steve
+
+
+/****************************************************************************
+ * Implementation of lgamma, requires C++11/C99 *
+ ****************************************************************************/
+
+template <typename Scalar>
+struct lgamma_impl {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
+ EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return Scalar(0);
+ }
+};
+
+template <typename Scalar>
+struct lgamma_retval {
+ typedef Scalar type;
+};
+
+#if EIGEN_HAS_C99_MATH
+// Since glibc 2.19
+#if defined(__GLIBC__) && ((__GLIBC__>=2 && __GLIBC_MINOR__ >= 19) || __GLIBC__>2) \
+ && (defined(_DEFAULT_SOURCE) || defined(_BSD_SOURCE) || defined(_SVID_SOURCE))
+#define EIGEN_HAS_LGAMMA_R
+#endif
+
+// Glibc versions before 2.19
+#if defined(__GLIBC__) && ((__GLIBC__==2 && __GLIBC_MINOR__ < 19) || __GLIBC__<2) \
+ && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE))
+#define EIGEN_HAS_LGAMMA_R
+#endif
+
+template <>
+struct lgamma_impl<float> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE float run(float x) {
+#if !defined(EIGEN_GPU_COMPILE_PHASE) && defined (EIGEN_HAS_LGAMMA_R) && !defined(__APPLE__)
+ int dummy;
+ return ::lgammaf_r(x, &dummy);
+#elif defined(SYCL_DEVICE_ONLY)
+ return cl::sycl::lgamma(x);
+#else
+ return ::lgammaf(x);
+#endif
+ }
+};
+
+template <>
+struct lgamma_impl<double> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE double run(double x) {
+#if !defined(EIGEN_GPU_COMPILE_PHASE) && defined(EIGEN_HAS_LGAMMA_R) && !defined(__APPLE__)
+ int dummy;
+ return ::lgamma_r(x, &dummy);
+#elif defined(SYCL_DEVICE_ONLY)
+ return cl::sycl::lgamma(x);
+#else
+ return ::lgamma(x);
+#endif
+ }
+};
+
+#undef EIGEN_HAS_LGAMMA_R
+#endif
+
+/****************************************************************************
+ * Implementation of digamma (psi), based on Cephes *
+ ****************************************************************************/
+
+template <typename Scalar>
+struct digamma_retval {
+ typedef Scalar type;
+};
+
+/*
+ *
+ * Polynomial evaluation helper for the Psi (digamma) function.
+ *
+ * digamma_impl_maybe_poly::run(s) evaluates the asymptotic Psi expansion for
+ * input Scalar s, assuming s is above 10.0.
+ *
+ * If s is above a certain threshold for the given Scalar type, zero
+ * is returned. Otherwise the polynomial is evaluated with enough
+ * coefficients for results matching Scalar machine precision.
+ *
+ *
+ */
+template <typename Scalar>
+struct digamma_impl_maybe_poly {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
+ EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return Scalar(0);
+ }
+};
+
+
+template <>
+struct digamma_impl_maybe_poly<float> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE float run(const float s) {
+ const float A[] = {
+ -4.16666666666666666667E-3f,
+ 3.96825396825396825397E-3f,
+ -8.33333333333333333333E-3f,
+ 8.33333333333333333333E-2f
+ };
+
+ float z;
+ if (s < 1.0e8f) {
+ z = 1.0f / (s * s);
+ return z * internal::ppolevl<float, 3>::run(z, A);
+ } else return 0.0f;
+ }
+};
+
+template <>
+struct digamma_impl_maybe_poly<double> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE double run(const double s) {
+ const double A[] = {
+ 8.33333333333333333333E-2,
+ -2.10927960927960927961E-2,
+ 7.57575757575757575758E-3,
+ -4.16666666666666666667E-3,
+ 3.96825396825396825397E-3,
+ -8.33333333333333333333E-3,
+ 8.33333333333333333333E-2
+ };
+
+ double z;
+ if (s < 1.0e17) {
+ z = 1.0 / (s * s);
+ return z * internal::ppolevl<double, 6>::run(z, A);
+ }
+ else return 0.0;
+ }
+};
+
+template <typename Scalar>
+struct digamma_impl {
+ EIGEN_DEVICE_FUNC
+ static Scalar run(Scalar x) {
+ /*
+ *
+ * Psi (digamma) function (modified for Eigen)
+ *
+ *
+ * SYNOPSIS:
+ *
+ * double x, y, psi();
+ *
+ * y = psi( x );
+ *
+ *
+ * DESCRIPTION:
+ *
+ * d -
+ * psi(x) = -- ln | (x)
+ * dx
+ *
+ * is the logarithmic derivative of the gamma function.
+ * For integer x,
+ * n-1
+ * -
+ * psi(n) = -EUL + > 1/k.
+ * -
+ * k=1
+ *
+ * If x is negative, it is transformed to a positive argument by the
+ * reflection formula psi(1-x) = psi(x) + pi cot(pi x).
+ * For general positive x, the argument is made greater than 10
+ * using the recurrence psi(x+1) = psi(x) + 1/x.
+ * Then the following asymptotic expansion is applied:
+ *
+ * inf. B
+ * - 2k
+ * psi(x) = log(x) - 1/2x - > -------
+ * - 2k
+ * k=1 2k x
+ *
+ * where the B2k are Bernoulli numbers.
+ *
+ * ACCURACY (float):
+ * Relative error (except absolute when |psi| < 1):
+ * arithmetic domain # trials peak rms
+ * IEEE 0,30 30000 1.3e-15 1.4e-16
+ * IEEE -30,0 40000 1.5e-15 2.2e-16
+ *
+ * ACCURACY (double):
+ * Absolute error, relative when |psi| > 1 :
+ * arithmetic domain # trials peak rms
+ * IEEE -33,0 30000 8.2e-7 1.2e-7
+ * IEEE 0,33 100000 7.3e-7 7.7e-8
+ *
+ * ERROR MESSAGES:
+ * message condition value returned
+ * psi singularity x integer <=0 INFINITY
+ */
+
+ Scalar p, q, nz, s, w, y;
+ bool negative = false;
+
+ const Scalar nan = NumTraits<Scalar>::quiet_NaN();
+ const Scalar m_pi = Scalar(EIGEN_PI);
+
+ const Scalar zero = Scalar(0);
+ const Scalar one = Scalar(1);
+ const Scalar half = Scalar(0.5);
+ nz = zero;
+
+ if (x <= zero) {
+ negative = true;
+ q = x;
+ p = numext::floor(q);
+ if (p == q) {
+ return nan;
+ }
+ /* Remove the zeros of tan(m_pi x)
+ * by subtracting the nearest integer from x
+ */
+ nz = q - p;
+ if (nz != half) {
+ if (nz > half) {
+ p += one;
+ nz = q - p;
+ }
+ nz = m_pi / numext::tan(m_pi * nz);
+ }
+ else {
+ nz = zero;
+ }
+ x = one - x;
+ }
+
+ /* use the recurrence psi(x+1) = psi(x) + 1/x. */
+ s = x;
+ w = zero;
+ while (s < Scalar(10)) {
+ w += one / s;
+ s += one;
+ }
+
+ y = digamma_impl_maybe_poly<Scalar>::run(s);
+
+ y = numext::log(s) - (half / s) - y - w;
+
+ return (negative) ? y - nz : y;
+ }
+};
+
+/****************************************************************************
+ * Implementation of erf, requires C++11/C99 *
+ ****************************************************************************/
+
+/** \internal \returns the error function of \a a (coeff-wise)
+ Doesn't do anything fancy, just a 13/8-degree rational interpolant which
+ is accurate up to a couple of ulp in the range [-4, 4], outside of which
+ fl(erf(x)) = +/-1.
+
+ This implementation works on both scalars and Ts.
+*/
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erf_float(const T& a_x) {
+ // Clamp the inputs to the range [-4, 4] since anything outside
+ // this range is +/-1.0f in single-precision.
+ const T plus_4 = pset1<T>(4.f);
+ const T minus_4 = pset1<T>(-4.f);
+ const T x = pmax(pmin(a_x, plus_4), minus_4);
+ // The monomial coefficients of the numerator polynomial (odd).
+ const T alpha_1 = pset1<T>(-1.60960333262415e-02f);
+ const T alpha_3 = pset1<T>(-2.95459980854025e-03f);
+ const T alpha_5 = pset1<T>(-7.34990630326855e-04f);
+ const T alpha_7 = pset1<T>(-5.69250639462346e-05f);
+ const T alpha_9 = pset1<T>(-2.10102402082508e-06f);
+ const T alpha_11 = pset1<T>(2.77068142495902e-08f);
+ const T alpha_13 = pset1<T>(-2.72614225801306e-10f);
+
+ // The monomial coefficients of the denominator polynomial (even).
+ const T beta_0 = pset1<T>(-1.42647390514189e-02f);
+ const T beta_2 = pset1<T>(-7.37332916720468e-03f);
+ const T beta_4 = pset1<T>(-1.68282697438203e-03f);
+ const T beta_6 = pset1<T>(-2.13374055278905e-04f);
+ const T beta_8 = pset1<T>(-1.45660718464996e-05f);
+
+ // Since the polynomials are odd/even, we need x^2.
+ const T x2 = pmul(x, x);
+
+ // Evaluate the numerator polynomial p.
+ T p = pmadd(x2, alpha_13, alpha_11);
+ p = pmadd(x2, p, alpha_9);
+ p = pmadd(x2, p, alpha_7);
+ p = pmadd(x2, p, alpha_5);
+ p = pmadd(x2, p, alpha_3);
+ p = pmadd(x2, p, alpha_1);
+ p = pmul(x, p);
+
+ // Evaluate the denominator polynomial p.
+ T q = pmadd(x2, beta_8, beta_6);
+ q = pmadd(x2, q, beta_4);
+ q = pmadd(x2, q, beta_2);
+ q = pmadd(x2, q, beta_0);
+
+ // Divide the numerator by the denominator.
+ return pdiv(p, q);
+}
+
+template <typename T>
+struct erf_impl {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE T run(const T& x) {
+ return generic_fast_erf_float(x);
+ }
+};
+
+template <typename Scalar>
+struct erf_retval {
+ typedef Scalar type;
+};
+
+#if EIGEN_HAS_C99_MATH
+template <>
+struct erf_impl<float> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE float run(float x) {
+#if defined(SYCL_DEVICE_ONLY)
+ return cl::sycl::erf(x);
+#else
+ return generic_fast_erf_float(x);
+#endif
+ }
+};
+
+template <>
+struct erf_impl<double> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE double run(double x) {
+#if defined(SYCL_DEVICE_ONLY)
+ return cl::sycl::erf(x);
+#else
+ return ::erf(x);
+#endif
+ }
+};
+#endif // EIGEN_HAS_C99_MATH
+
+/***************************************************************************
+* Implementation of erfc, requires C++11/C99 *
+****************************************************************************/
+
+template <typename Scalar>
+struct erfc_impl {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
+ EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return Scalar(0);
+ }
+};
+
+template <typename Scalar>
+struct erfc_retval {
+ typedef Scalar type;
+};
+
+#if EIGEN_HAS_C99_MATH
+template <>
+struct erfc_impl<float> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE float run(const float x) {
+#if defined(SYCL_DEVICE_ONLY)
+ return cl::sycl::erfc(x);
+#else
+ return ::erfcf(x);
+#endif
+ }
+};
+
+template <>
+struct erfc_impl<double> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE double run(const double x) {
+#if defined(SYCL_DEVICE_ONLY)
+ return cl::sycl::erfc(x);
+#else
+ return ::erfc(x);
+#endif
+ }
+};
+#endif // EIGEN_HAS_C99_MATH
+
+
+/***************************************************************************
+* Implementation of ndtri. *
+****************************************************************************/
+
+/* Inverse of Normal distribution function (modified for Eigen).
+ *
+ *
+ * SYNOPSIS:
+ *
+ * double x, y, ndtri();
+ *
+ * x = ndtri( y );
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Returns the argument, x, for which the area under the
+ * Gaussian probability density function (integrated from
+ * minus infinity to x) is equal to y.
+ *
+ *
+ * For small arguments 0 < y < exp(-2), the program computes
+ * z = sqrt( -2.0 * log(y) ); then the approximation is
+ * x = z - log(z)/z - (1/z) P(1/z) / Q(1/z).
+ * There are two rational functions P/Q, one for 0 < y < exp(-32)
+ * and the other for y up to exp(-2). For larger arguments,
+ * w = y - 0.5, and x/sqrt(2pi) = w + w**3 R(w**2)/S(w**2)).
+ *
+ *
+ * ACCURACY:
+ *
+ * Relative error:
+ * arithmetic domain # trials peak rms
+ * DEC 0.125, 1 5500 9.5e-17 2.1e-17
+ * DEC 6e-39, 0.135 3500 5.7e-17 1.3e-17
+ * IEEE 0.125, 1 20000 7.2e-16 1.3e-16
+ * IEEE 3e-308, 0.135 50000 4.6e-16 9.8e-17
+ *
+ *
+ * ERROR MESSAGES:
+ *
+ * message condition value returned
+ * ndtri domain x <= 0 -MAXNUM
+ * ndtri domain x >= 1 MAXNUM
+ *
+ */
+ /*
+ Cephes Math Library Release 2.2: June, 1992
+ Copyright 1985, 1987, 1992 by Stephen L. Moshier
+ Direct inquiries to 30 Frost Street, Cambridge, MA 02140
+ */
+
+
+// TODO: Add a cheaper approximation for float.
+
+
+template<typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T flipsign(
+ const T& should_flipsign, const T& x) {
+ typedef typename unpacket_traits<T>::type Scalar;
+ const T sign_mask = pset1<T>(Scalar(-0.0));
+ T sign_bit = pand<T>(should_flipsign, sign_mask);
+ return pxor<T>(sign_bit, x);
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double flipsign<double>(
+ const double& should_flipsign, const double& x) {
+ return should_flipsign == 0 ? x : -x;
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float flipsign<float>(
+ const float& should_flipsign, const float& x) {
+ return should_flipsign == 0 ? x : -x;
+}
+
+// We split this computation in to two so that in the scalar path
+// only one branch is evaluated (due to our template specialization of pselect
+// being an if statement.)
+
+template <typename T, typename ScalarType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_ndtri_gt_exp_neg_two(const T& b) {
+ const ScalarType p0[] = {
+ ScalarType(-5.99633501014107895267e1),
+ ScalarType(9.80010754185999661536e1),
+ ScalarType(-5.66762857469070293439e1),
+ ScalarType(1.39312609387279679503e1),
+ ScalarType(-1.23916583867381258016e0)
+ };
+ const ScalarType q0[] = {
+ ScalarType(1.0),
+ ScalarType(1.95448858338141759834e0),
+ ScalarType(4.67627912898881538453e0),
+ ScalarType(8.63602421390890590575e1),
+ ScalarType(-2.25462687854119370527e2),
+ ScalarType(2.00260212380060660359e2),
+ ScalarType(-8.20372256168333339912e1),
+ ScalarType(1.59056225126211695515e1),
+ ScalarType(-1.18331621121330003142e0)
+ };
+ const T sqrt2pi = pset1<T>(ScalarType(2.50662827463100050242e0));
+ const T half = pset1<T>(ScalarType(0.5));
+ T c, c2, ndtri_gt_exp_neg_two;
+
+ c = psub(b, half);
+ c2 = pmul(c, c);
+ ndtri_gt_exp_neg_two = pmadd(c, pmul(
+ c2, pdiv(
+ internal::ppolevl<T, 4>::run(c2, p0),
+ internal::ppolevl<T, 8>::run(c2, q0))), c);
+ return pmul(ndtri_gt_exp_neg_two, sqrt2pi);
+}
+
+template <typename T, typename ScalarType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_ndtri_lt_exp_neg_two(
+ const T& b, const T& should_flipsign) {
+ /* Approximation for interval z = sqrt(-2 log a ) between 2 and 8
+ * i.e., a between exp(-2) = .135 and exp(-32) = 1.27e-14.
+ */
+ const ScalarType p1[] = {
+ ScalarType(4.05544892305962419923e0),
+ ScalarType(3.15251094599893866154e1),
+ ScalarType(5.71628192246421288162e1),
+ ScalarType(4.40805073893200834700e1),
+ ScalarType(1.46849561928858024014e1),
+ ScalarType(2.18663306850790267539e0),
+ ScalarType(-1.40256079171354495875e-1),
+ ScalarType(-3.50424626827848203418e-2),
+ ScalarType(-8.57456785154685413611e-4)
+ };
+ const ScalarType q1[] = {
+ ScalarType(1.0),
+ ScalarType(1.57799883256466749731e1),
+ ScalarType(4.53907635128879210584e1),
+ ScalarType(4.13172038254672030440e1),
+ ScalarType(1.50425385692907503408e1),
+ ScalarType(2.50464946208309415979e0),
+ ScalarType(-1.42182922854787788574e-1),
+ ScalarType(-3.80806407691578277194e-2),
+ ScalarType(-9.33259480895457427372e-4)
+ };
+ /* Approximation for interval z = sqrt(-2 log a ) between 8 and 64
+ * i.e., a between exp(-32) = 1.27e-14 and exp(-2048) = 3.67e-890.
+ */
+ const ScalarType p2[] = {
+ ScalarType(3.23774891776946035970e0),
+ ScalarType(6.91522889068984211695e0),
+ ScalarType(3.93881025292474443415e0),
+ ScalarType(1.33303460815807542389e0),
+ ScalarType(2.01485389549179081538e-1),
+ ScalarType(1.23716634817820021358e-2),
+ ScalarType(3.01581553508235416007e-4),
+ ScalarType(2.65806974686737550832e-6),
+ ScalarType(6.23974539184983293730e-9)
+ };
+ const ScalarType q2[] = {
+ ScalarType(1.0),
+ ScalarType(6.02427039364742014255e0),
+ ScalarType(3.67983563856160859403e0),
+ ScalarType(1.37702099489081330271e0),
+ ScalarType(2.16236993594496635890e-1),
+ ScalarType(1.34204006088543189037e-2),
+ ScalarType(3.28014464682127739104e-4),
+ ScalarType(2.89247864745380683936e-6),
+ ScalarType(6.79019408009981274425e-9)
+ };
+ const T eight = pset1<T>(ScalarType(8.0));
+ const T one = pset1<T>(ScalarType(1));
+ const T neg_two = pset1<T>(ScalarType(-2));
+ T x, x0, x1, z;
+
+ x = psqrt(pmul(neg_two, plog(b)));
+ x0 = psub(x, pdiv(plog(x), x));
+ z = pdiv(one, x);
+ x1 = pmul(
+ z, pselect(
+ pcmp_lt(x, eight),
+ pdiv(internal::ppolevl<T, 8>::run(z, p1),
+ internal::ppolevl<T, 8>::run(z, q1)),
+ pdiv(internal::ppolevl<T, 8>::run(z, p2),
+ internal::ppolevl<T, 8>::run(z, q2))));
+ return flipsign(should_flipsign, psub(x0, x1));
+}
+
+template <typename T, typename ScalarType>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+T generic_ndtri(const T& a) {
+ const T maxnum = pset1<T>(NumTraits<ScalarType>::infinity());
+ const T neg_maxnum = pset1<T>(-NumTraits<ScalarType>::infinity());
+
+ const T zero = pset1<T>(ScalarType(0));
+ const T one = pset1<T>(ScalarType(1));
+ // exp(-2)
+ const T exp_neg_two = pset1<T>(ScalarType(0.13533528323661269189));
+ T b, ndtri, should_flipsign;
+
+ should_flipsign = pcmp_le(a, psub(one, exp_neg_two));
+ b = pselect(should_flipsign, a, psub(one, a));
+
+ ndtri = pselect(
+ pcmp_lt(exp_neg_two, b),
+ generic_ndtri_gt_exp_neg_two<T, ScalarType>(b),
+ generic_ndtri_lt_exp_neg_two<T, ScalarType>(b, should_flipsign));
+
+ return pselect(
+ pcmp_le(a, zero), neg_maxnum,
+ pselect(pcmp_le(one, a), maxnum, ndtri));
+}
+
+template <typename Scalar>
+struct ndtri_retval {
+ typedef Scalar type;
+};
+
+#if !EIGEN_HAS_C99_MATH
+
+template <typename Scalar>
+struct ndtri_impl {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
+ EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return Scalar(0);
+ }
+};
+
+# else
+
+template <typename Scalar>
+struct ndtri_impl {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar run(const Scalar x) {
+ return generic_ndtri<Scalar, Scalar>(x);
+ }
+};
+
+#endif // EIGEN_HAS_C99_MATH
+
+
+/**************************************************************************************************************
+ * Implementation of igammac (complemented incomplete gamma integral), based on Cephes but requires C++11/C99 *
+ **************************************************************************************************************/
+
+template <typename Scalar>
+struct igammac_retval {
+ typedef Scalar type;
+};
+
+// NOTE: cephes_helper is also used to implement zeta
+template <typename Scalar>
+struct cephes_helper {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar machep() { assert(false && "machep not supported for this type"); return 0.0; }
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar big() { assert(false && "big not supported for this type"); return 0.0; }
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar biginv() { assert(false && "biginv not supported for this type"); return 0.0; }
+};
+
+template <>
+struct cephes_helper<float> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE float machep() {
+ return NumTraits<float>::epsilon() / 2; // 1.0 - machep == 1.0
+ }
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE float big() {
+ // use epsneg (1.0 - epsneg == 1.0)
+ return 1.0f / (NumTraits<float>::epsilon() / 2);
+ }
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE float biginv() {
+ // epsneg
+ return machep();
+ }
+};
+
+template <>
+struct cephes_helper<double> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE double machep() {
+ return NumTraits<double>::epsilon() / 2; // 1.0 - machep == 1.0
+ }
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE double big() {
+ return 1.0 / NumTraits<double>::epsilon();
+ }
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE double biginv() {
+ // inverse of eps
+ return NumTraits<double>::epsilon();
+ }
+};
+
+enum IgammaComputationMode { VALUE, DERIVATIVE, SAMPLE_DERIVATIVE };
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC
+static EIGEN_STRONG_INLINE Scalar main_igamma_term(Scalar a, Scalar x) {
+ /* Compute x**a * exp(-x) / gamma(a) */
+ Scalar logax = a * numext::log(x) - x - lgamma_impl<Scalar>::run(a);
+ if (logax < -numext::log(NumTraits<Scalar>::highest()) ||
+ // Assuming x and a aren't Nan.
+ (numext::isnan)(logax)) {
+ return Scalar(0);
+ }
+ return numext::exp(logax);
+}
+
+template <typename Scalar, IgammaComputationMode mode>
+EIGEN_DEVICE_FUNC
+int igamma_num_iterations() {
+ /* Returns the maximum number of internal iterations for igamma computation.
+ */
+ if (mode == VALUE) {
+ return 2000;
+ }
+
+ if (internal::is_same<Scalar, float>::value) {
+ return 200;
+ } else if (internal::is_same<Scalar, double>::value) {
+ return 500;
+ } else {
+ return 2000;
+ }
+}
+
+template <typename Scalar, IgammaComputationMode mode>
+struct igammac_cf_impl {
+ /* Computes igamc(a, x) or derivative (depending on the mode)
+ * using the continued fraction expansion of the complementary
+ * incomplete Gamma function.
+ *
+ * Preconditions:
+ * a > 0
+ * x >= 1
+ * x >= a
+ */
+ EIGEN_DEVICE_FUNC
+ static Scalar run(Scalar a, Scalar x) {
+ const Scalar zero = 0;
+ const Scalar one = 1;
+ const Scalar two = 2;
+ const Scalar machep = cephes_helper<Scalar>::machep();
+ const Scalar big = cephes_helper<Scalar>::big();
+ const Scalar biginv = cephes_helper<Scalar>::biginv();
+
+ if ((numext::isinf)(x)) {
+ return zero;
+ }
+
+ Scalar ax = main_igamma_term<Scalar>(a, x);
+ // This is independent of mode. If this value is zero,
+ // then the function value is zero. If the function value is zero,
+ // then we are in a neighborhood where the function value evalutes to zero,
+ // so the derivative is zero.
+ if (ax == zero) {
+ return zero;
+ }
+
+ // continued fraction
+ Scalar y = one - a;
+ Scalar z = x + y + one;
+ Scalar c = zero;
+ Scalar pkm2 = one;
+ Scalar qkm2 = x;
+ Scalar pkm1 = x + one;
+ Scalar qkm1 = z * x;
+ Scalar ans = pkm1 / qkm1;
+
+ Scalar dpkm2_da = zero;
+ Scalar dqkm2_da = zero;
+ Scalar dpkm1_da = zero;
+ Scalar dqkm1_da = -x;
+ Scalar dans_da = (dpkm1_da - ans * dqkm1_da) / qkm1;
+
+ for (int i = 0; i < igamma_num_iterations<Scalar, mode>(); i++) {
+ c += one;
+ y += one;
+ z += two;
+
+ Scalar yc = y * c;
+ Scalar pk = pkm1 * z - pkm2 * yc;
+ Scalar qk = qkm1 * z - qkm2 * yc;
+
+ Scalar dpk_da = dpkm1_da * z - pkm1 - dpkm2_da * yc + pkm2 * c;
+ Scalar dqk_da = dqkm1_da * z - qkm1 - dqkm2_da * yc + qkm2 * c;
+
+ if (qk != zero) {
+ Scalar ans_prev = ans;
+ ans = pk / qk;
+
+ Scalar dans_da_prev = dans_da;
+ dans_da = (dpk_da - ans * dqk_da) / qk;
+
+ if (mode == VALUE) {
+ if (numext::abs(ans_prev - ans) <= machep * numext::abs(ans)) {
+ break;
+ }
+ } else {
+ if (numext::abs(dans_da - dans_da_prev) <= machep) {
+ break;
+ }
+ }
+ }
+
+ pkm2 = pkm1;
+ pkm1 = pk;
+ qkm2 = qkm1;
+ qkm1 = qk;
+
+ dpkm2_da = dpkm1_da;
+ dpkm1_da = dpk_da;
+ dqkm2_da = dqkm1_da;
+ dqkm1_da = dqk_da;
+
+ if (numext::abs(pk) > big) {
+ pkm2 *= biginv;
+ pkm1 *= biginv;
+ qkm2 *= biginv;
+ qkm1 *= biginv;
+
+ dpkm2_da *= biginv;
+ dpkm1_da *= biginv;
+ dqkm2_da *= biginv;
+ dqkm1_da *= biginv;
+ }
+ }
+
+ /* Compute x**a * exp(-x) / gamma(a) */
+ Scalar dlogax_da = numext::log(x) - digamma_impl<Scalar>::run(a);
+ Scalar dax_da = ax * dlogax_da;
+
+ switch (mode) {
+ case VALUE:
+ return ans * ax;
+ case DERIVATIVE:
+ return ans * dax_da + dans_da * ax;
+ case SAMPLE_DERIVATIVE:
+ default: // this is needed to suppress clang warning
+ return -(dans_da + ans * dlogax_da) * x;
+ }
+ }
+};
+
+template <typename Scalar, IgammaComputationMode mode>
+struct igamma_series_impl {
+ /* Computes igam(a, x) or its derivative (depending on the mode)
+ * using the series expansion of the incomplete Gamma function.
+ *
+ * Preconditions:
+ * x > 0
+ * a > 0
+ * !(x > 1 && x > a)
+ */
+ EIGEN_DEVICE_FUNC
+ static Scalar run(Scalar a, Scalar x) {
+ const Scalar zero = 0;
+ const Scalar one = 1;
+ const Scalar machep = cephes_helper<Scalar>::machep();
+
+ Scalar ax = main_igamma_term<Scalar>(a, x);
+
+ // This is independent of mode. If this value is zero,
+ // then the function value is zero. If the function value is zero,
+ // then we are in a neighborhood where the function value evalutes to zero,
+ // so the derivative is zero.
+ if (ax == zero) {
+ return zero;
+ }
+
+ ax /= a;
+
+ /* power series */
+ Scalar r = a;
+ Scalar c = one;
+ Scalar ans = one;
+
+ Scalar dc_da = zero;
+ Scalar dans_da = zero;
+
+ for (int i = 0; i < igamma_num_iterations<Scalar, mode>(); i++) {
+ r += one;
+ Scalar term = x / r;
+ Scalar dterm_da = -x / (r * r);
+ dc_da = term * dc_da + dterm_da * c;
+ dans_da += dc_da;
+ c *= term;
+ ans += c;
+
+ if (mode == VALUE) {
+ if (c <= machep * ans) {
+ break;
+ }
+ } else {
+ if (numext::abs(dc_da) <= machep * numext::abs(dans_da)) {
+ break;
+ }
+ }
+ }
+
+ Scalar dlogax_da = numext::log(x) - digamma_impl<Scalar>::run(a + one);
+ Scalar dax_da = ax * dlogax_da;
+
+ switch (mode) {
+ case VALUE:
+ return ans * ax;
+ case DERIVATIVE:
+ return ans * dax_da + dans_da * ax;
+ case SAMPLE_DERIVATIVE:
+ default: // this is needed to suppress clang warning
+ return -(dans_da + ans * dlogax_da) * x / a;
+ }
+ }
+};
+
+#if !EIGEN_HAS_C99_MATH
+
+template <typename Scalar>
+struct igammac_impl {
+ EIGEN_DEVICE_FUNC
+ static Scalar run(Scalar a, Scalar x) {
+ EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return Scalar(0);
+ }
+};
+
+#else
+
+template <typename Scalar>
+struct igammac_impl {
+ EIGEN_DEVICE_FUNC
+ static Scalar run(Scalar a, Scalar x) {
+ /* igamc()
+ *
+ * Incomplete gamma integral (modified for Eigen)
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * double a, x, y, igamc();
+ *
+ * y = igamc( a, x );
+ *
+ * DESCRIPTION:
+ *
+ * The function is defined by
+ *
+ *
+ * igamc(a,x) = 1 - igam(a,x)
+ *
+ * inf.
+ * -
+ * 1 | | -t a-1
+ * = ----- | e t dt.
+ * - | |
+ * | (a) -
+ * x
+ *
+ *
+ * In this implementation both arguments must be positive.
+ * The integral is evaluated by either a power series or
+ * continued fraction expansion, depending on the relative
+ * values of a and x.
+ *
+ * ACCURACY (float):
+ *
+ * Relative error:
+ * arithmetic domain # trials peak rms
+ * IEEE 0,30 30000 7.8e-6 5.9e-7
+ *
+ *
+ * ACCURACY (double):
+ *
+ * Tested at random a, x.
+ * a x Relative error:
+ * arithmetic domain domain # trials peak rms
+ * IEEE 0.5,100 0,100 200000 1.9e-14 1.7e-15
+ * IEEE 0.01,0.5 0,100 200000 1.4e-13 1.6e-15
+ *
+ */
+ /*
+ Cephes Math Library Release 2.2: June, 1992
+ Copyright 1985, 1987, 1992 by Stephen L. Moshier
+ Direct inquiries to 30 Frost Street, Cambridge, MA 02140
+ */
+ const Scalar zero = 0;
+ const Scalar one = 1;
+ const Scalar nan = NumTraits<Scalar>::quiet_NaN();
+
+ if ((x < zero) || (a <= zero)) {
+ // domain error
+ return nan;
+ }
+
+ if ((numext::isnan)(a) || (numext::isnan)(x)) { // propagate nans
+ return nan;
+ }
+
+ if ((x < one) || (x < a)) {
+ return (one - igamma_series_impl<Scalar, VALUE>::run(a, x));
+ }
+
+ return igammac_cf_impl<Scalar, VALUE>::run(a, x);
+ }
+};
+
+#endif // EIGEN_HAS_C99_MATH
+
+/************************************************************************************************
+ * Implementation of igamma (incomplete gamma integral), based on Cephes but requires C++11/C99 *
+ ************************************************************************************************/
+
+#if !EIGEN_HAS_C99_MATH
+
+template <typename Scalar, IgammaComputationMode mode>
+struct igamma_generic_impl {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar run(Scalar a, Scalar x) {
+ EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return Scalar(0);
+ }
+};
+
+#else
+
+template <typename Scalar, IgammaComputationMode mode>
+struct igamma_generic_impl {
+ EIGEN_DEVICE_FUNC
+ static Scalar run(Scalar a, Scalar x) {
+ /* Depending on the mode, returns
+ * - VALUE: incomplete Gamma function igamma(a, x)
+ * - DERIVATIVE: derivative of incomplete Gamma function d/da igamma(a, x)
+ * - SAMPLE_DERIVATIVE: implicit derivative of a Gamma random variable
+ * x ~ Gamma(x | a, 1), dx/da = -1 / Gamma(x | a, 1) * d igamma(a, x) / dx
+ *
+ * Derivatives are implemented by forward-mode differentiation.
+ */
+ const Scalar zero = 0;
+ const Scalar one = 1;
+ const Scalar nan = NumTraits<Scalar>::quiet_NaN();
+
+ if (x == zero) return zero;
+
+ if ((x < zero) || (a <= zero)) { // domain error
+ return nan;
+ }
+
+ if ((numext::isnan)(a) || (numext::isnan)(x)) { // propagate nans
+ return nan;
+ }
+
+ if ((x > one) && (x > a)) {
+ Scalar ret = igammac_cf_impl<Scalar, mode>::run(a, x);
+ if (mode == VALUE) {
+ return one - ret;
+ } else {
+ return -ret;
+ }
+ }
+
+ return igamma_series_impl<Scalar, mode>::run(a, x);
+ }
+};
+
+#endif // EIGEN_HAS_C99_MATH
+
+template <typename Scalar>
+struct igamma_retval {
+ typedef Scalar type;
+};
+
+template <typename Scalar>
+struct igamma_impl : igamma_generic_impl<Scalar, VALUE> {
+ /* igam()
+ * Incomplete gamma integral.
+ *
+ * The CDF of Gamma(a, 1) random variable at the point x.
+ *
+ * Accuracy estimation. For each a in [10^-2, 10^-1...10^3] we sample
+ * 50 Gamma random variables x ~ Gamma(x | a, 1), a total of 300 points.
+ * The ground truth is computed by mpmath. Mean absolute error:
+ * float: 1.26713e-05
+ * double: 2.33606e-12
+ *
+ * Cephes documentation below.
+ *
+ * SYNOPSIS:
+ *
+ * double a, x, y, igam();
+ *
+ * y = igam( a, x );
+ *
+ * DESCRIPTION:
+ *
+ * The function is defined by
+ *
+ * x
+ * -
+ * 1 | | -t a-1
+ * igam(a,x) = ----- | e t dt.
+ * - | |
+ * | (a) -
+ * 0
+ *
+ *
+ * In this implementation both arguments must be positive.
+ * The integral is evaluated by either a power series or
+ * continued fraction expansion, depending on the relative
+ * values of a and x.
+ *
+ * ACCURACY (double):
+ *
+ * Relative error:
+ * arithmetic domain # trials peak rms
+ * IEEE 0,30 200000 3.6e-14 2.9e-15
+ * IEEE 0,100 300000 9.9e-14 1.5e-14
+ *
+ *
+ * ACCURACY (float):
+ *
+ * Relative error:
+ * arithmetic domain # trials peak rms
+ * IEEE 0,30 20000 7.8e-6 5.9e-7
+ *
+ */
+ /*
+ Cephes Math Library Release 2.2: June, 1992
+ Copyright 1985, 1987, 1992 by Stephen L. Moshier
+ Direct inquiries to 30 Frost Street, Cambridge, MA 02140
+ */
+
+ /* left tail of incomplete gamma function:
+ *
+ * inf. k
+ * a -x - x
+ * x e > ----------
+ * - -
+ * k=0 | (a+k+1)
+ *
+ */
+};
+
+template <typename Scalar>
+struct igamma_der_a_retval : igamma_retval<Scalar> {};
+
+template <typename Scalar>
+struct igamma_der_a_impl : igamma_generic_impl<Scalar, DERIVATIVE> {
+ /* Derivative of the incomplete Gamma function with respect to a.
+ *
+ * Computes d/da igamma(a, x) by forward differentiation of the igamma code.
+ *
+ * Accuracy estimation. For each a in [10^-2, 10^-1...10^3] we sample
+ * 50 Gamma random variables x ~ Gamma(x | a, 1), a total of 300 points.
+ * The ground truth is computed by mpmath. Mean absolute error:
+ * float: 6.17992e-07
+ * double: 4.60453e-12
+ *
+ * Reference:
+ * R. Moore. "Algorithm AS 187: Derivatives of the incomplete gamma
+ * integral". Journal of the Royal Statistical Society. 1982
+ */
+};
+
+template <typename Scalar>
+struct gamma_sample_der_alpha_retval : igamma_retval<Scalar> {};
+
+template <typename Scalar>
+struct gamma_sample_der_alpha_impl
+ : igamma_generic_impl<Scalar, SAMPLE_DERIVATIVE> {
+ /* Derivative of a Gamma random variable sample with respect to alpha.
+ *
+ * Consider a sample of a Gamma random variable with the concentration
+ * parameter alpha: sample ~ Gamma(alpha, 1). The reparameterization
+ * derivative that we want to compute is dsample / dalpha =
+ * d igammainv(alpha, u) / dalpha, where u = igamma(alpha, sample).
+ * However, this formula is numerically unstable and expensive, so instead
+ * we use implicit differentiation:
+ *
+ * igamma(alpha, sample) = u, where u ~ Uniform(0, 1).
+ * Apply d / dalpha to both sides:
+ * d igamma(alpha, sample) / dalpha
+ * + d igamma(alpha, sample) / dsample * dsample/dalpha = 0
+ * d igamma(alpha, sample) / dalpha
+ * + Gamma(sample | alpha, 1) dsample / dalpha = 0
+ * dsample/dalpha = - (d igamma(alpha, sample) / dalpha)
+ * / Gamma(sample | alpha, 1)
+ *
+ * Here Gamma(sample | alpha, 1) is the PDF of the Gamma distribution
+ * (note that the derivative of the CDF w.r.t. sample is the PDF).
+ * See the reference below for more details.
+ *
+ * The derivative of igamma(alpha, sample) is computed by forward
+ * differentiation of the igamma code. Division by the Gamma PDF is performed
+ * in the same code, increasing the accuracy and speed due to cancellation
+ * of some terms.
+ *
+ * Accuracy estimation. For each alpha in [10^-2, 10^-1...10^3] we sample
+ * 50 Gamma random variables sample ~ Gamma(sample | alpha, 1), a total of 300
+ * points. The ground truth is computed by mpmath. Mean absolute error:
+ * float: 2.1686e-06
+ * double: 1.4774e-12
+ *
+ * Reference:
+ * M. Figurnov, S. Mohamed, A. Mnih "Implicit Reparameterization Gradients".
+ * 2018
+ */
+};
+
+/*****************************************************************************
+ * Implementation of Riemann zeta function of two arguments, based on Cephes *
+ *****************************************************************************/
+
+template <typename Scalar>
+struct zeta_retval {
+ typedef Scalar type;
+};
+
+template <typename Scalar>
+struct zeta_impl_series {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
+ EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return Scalar(0);
+ }
+};
+
+template <>
+struct zeta_impl_series<float> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE bool run(float& a, float& b, float& s, const float x, const float machep) {
+ int i = 0;
+ while(i < 9)
+ {
+ i += 1;
+ a += 1.0f;
+ b = numext::pow( a, -x );
+ s += b;
+ if( numext::abs(b/s) < machep )
+ return true;
+ }
+
+ //Return whether we are done
+ return false;
+ }
+};
+
+template <>
+struct zeta_impl_series<double> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE bool run(double& a, double& b, double& s, const double x, const double machep) {
+ int i = 0;
+ while( (i < 9) || (a <= 9.0) )
+ {
+ i += 1;
+ a += 1.0;
+ b = numext::pow( a, -x );
+ s += b;
+ if( numext::abs(b/s) < machep )
+ return true;
+ }
+
+ //Return whether we are done
+ return false;
+ }
+};
+
+template <typename Scalar>
+struct zeta_impl {
+ EIGEN_DEVICE_FUNC
+ static Scalar run(Scalar x, Scalar q) {
+ /* zeta.c
+ *
+ * Riemann zeta function of two arguments
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * double x, q, y, zeta();
+ *
+ * y = zeta( x, q );
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ *
+ *
+ * inf.
+ * - -x
+ * zeta(x,q) = > (k+q)
+ * -
+ * k=0
+ *
+ * where x > 1 and q is not a negative integer or zero.
+ * The Euler-Maclaurin summation formula is used to obtain
+ * the expansion
+ *
+ * n
+ * - -x
+ * zeta(x,q) = > (k+q)
+ * -
+ * k=1
+ *
+ * 1-x inf. B x(x+1)...(x+2j)
+ * (n+q) 1 - 2j
+ * + --------- - ------- + > --------------------
+ * x-1 x - x+2j+1
+ * 2(n+q) j=1 (2j)! (n+q)
+ *
+ * where the B2j are Bernoulli numbers. Note that (see zetac.c)
+ * zeta(x,1) = zetac(x) + 1.
+ *
+ *
+ *
+ * ACCURACY:
+ *
+ * Relative error for single precision:
+ * arithmetic domain # trials peak rms
+ * IEEE 0,25 10000 6.9e-7 1.0e-7
+ *
+ * Large arguments may produce underflow in powf(), in which
+ * case the results are inaccurate.
+ *
+ * REFERENCE:
+ *
+ * Gradshteyn, I. S., and I. M. Ryzhik, Tables of Integrals,
+ * Series, and Products, p. 1073; Academic Press, 1980.
+ *
+ */
+
+ int i;
+ Scalar p, r, a, b, k, s, t, w;
+
+ const Scalar A[] = {
+ Scalar(12.0),
+ Scalar(-720.0),
+ Scalar(30240.0),
+ Scalar(-1209600.0),
+ Scalar(47900160.0),
+ Scalar(-1.8924375803183791606e9), /*1.307674368e12/691*/
+ Scalar(7.47242496e10),
+ Scalar(-2.950130727918164224e12), /*1.067062284288e16/3617*/
+ Scalar(1.1646782814350067249e14), /*5.109094217170944e18/43867*/
+ Scalar(-4.5979787224074726105e15), /*8.028576626982912e20/174611*/
+ Scalar(1.8152105401943546773e17), /*1.5511210043330985984e23/854513*/
+ Scalar(-7.1661652561756670113e18) /*1.6938241367317436694528e27/236364091*/
+ };
+
+ const Scalar maxnum = NumTraits<Scalar>::infinity();
+ const Scalar zero = 0.0, half = 0.5, one = 1.0;
+ const Scalar machep = cephes_helper<Scalar>::machep();
+ const Scalar nan = NumTraits<Scalar>::quiet_NaN();
+
+ if( x == one )
+ return maxnum;
+
+ if( x < one )
+ {
+ return nan;
+ }
+
+ if( q <= zero )
+ {
+ if(q == numext::floor(q))
+ {
+ if (x == numext::floor(x) && long(x) % 2 == 0) {
+ return maxnum;
+ }
+ else {
+ return nan;
+ }
+ }
+ p = x;
+ r = numext::floor(p);
+ if (p != r)
+ return nan;
+ }
+
+ /* Permit negative q but continue sum until n+q > +9 .
+ * This case should be handled by a reflection formula.
+ * If q<0 and x is an integer, there is a relation to
+ * the polygamma function.
+ */
+ s = numext::pow( q, -x );
+ a = q;
+ b = zero;
+ // Run the summation in a helper function that is specific to the floating precision
+ if (zeta_impl_series<Scalar>::run(a, b, s, x, machep)) {
+ return s;
+ }
+
+ w = a;
+ s += b*w/(x-one);
+ s -= half * b;
+ a = one;
+ k = zero;
+ for( i=0; i<12; i++ )
+ {
+ a *= x + k;
+ b /= w;
+ t = a*b/A[i];
+ s = s + t;
+ t = numext::abs(t/s);
+ if( t < machep ) {
+ break;
+ }
+ k += one;
+ a *= x + k;
+ b /= w;
+ k += one;
+ }
+ return s;
+ }
+};
+
+/****************************************************************************
+ * Implementation of polygamma function, requires C++11/C99 *
+ ****************************************************************************/
+
+template <typename Scalar>
+struct polygamma_retval {
+ typedef Scalar type;
+};
+
+#if !EIGEN_HAS_C99_MATH
+
+template <typename Scalar>
+struct polygamma_impl {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar run(Scalar n, Scalar x) {
+ EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return Scalar(0);
+ }
+};
+
+#else
+
+template <typename Scalar>
+struct polygamma_impl {
+ EIGEN_DEVICE_FUNC
+ static Scalar run(Scalar n, Scalar x) {
+ Scalar zero = 0.0, one = 1.0;
+ Scalar nplus = n + one;
+ const Scalar nan = NumTraits<Scalar>::quiet_NaN();
+
+ // Check that n is a non-negative integer
+ if (numext::floor(n) != n || n < zero) {
+ return nan;
+ }
+ // Just return the digamma function for n = 0
+ else if (n == zero) {
+ return digamma_impl<Scalar>::run(x);
+ }
+ // Use the same implementation as scipy
+ else {
+ Scalar factorial = numext::exp(lgamma_impl<Scalar>::run(nplus));
+ return numext::pow(-one, nplus) * factorial * zeta_impl<Scalar>::run(nplus, x);
+ }
+ }
+};
+
+#endif // EIGEN_HAS_C99_MATH
+
+/************************************************************************************************
+ * Implementation of betainc (incomplete beta integral), based on Cephes but requires C++11/C99 *
+ ************************************************************************************************/
+
+template <typename Scalar>
+struct betainc_retval {
+ typedef Scalar type;
+};
+
+#if !EIGEN_HAS_C99_MATH
+
+template <typename Scalar>
+struct betainc_impl {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar run(Scalar a, Scalar b, Scalar x) {
+ EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return Scalar(0);
+ }
+};
+
+#else
+
+template <typename Scalar>
+struct betainc_impl {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar run(Scalar, Scalar, Scalar) {
+ /* betaincf.c
+ *
+ * Incomplete beta integral
+ *
+ *
+ * SYNOPSIS:
+ *
+ * float a, b, x, y, betaincf();
+ *
+ * y = betaincf( a, b, x );
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Returns incomplete beta integral of the arguments, evaluated
+ * from zero to x. The function is defined as
+ *
+ * x
+ * - -
+ * | (a+b) | | a-1 b-1
+ * ----------- | t (1-t) dt.
+ * - - | |
+ * | (a) | (b) -
+ * 0
+ *
+ * The domain of definition is 0 <= x <= 1. In this
+ * implementation a and b are restricted to positive values.
+ * The integral from x to 1 may be obtained by the symmetry
+ * relation
+ *
+ * 1 - betainc( a, b, x ) = betainc( b, a, 1-x ).
+ *
+ * The integral is evaluated by a continued fraction expansion.
+ * If a < 1, the function calls itself recursively after a
+ * transformation to increase a to a+1.
+ *
+ * ACCURACY (float):
+ *
+ * Tested at random points (a,b,x) with a and b in the indicated
+ * interval and x between 0 and 1.
+ *
+ * arithmetic domain # trials peak rms
+ * Relative error:
+ * IEEE 0,30 10000 3.7e-5 5.1e-6
+ * IEEE 0,100 10000 1.7e-4 2.5e-5
+ * The useful domain for relative error is limited by underflow
+ * of the single precision exponential function.
+ * Absolute error:
+ * IEEE 0,30 100000 2.2e-5 9.6e-7
+ * IEEE 0,100 10000 6.5e-5 3.7e-6
+ *
+ * Larger errors may occur for extreme ratios of a and b.
+ *
+ * ACCURACY (double):
+ * arithmetic domain # trials peak rms
+ * IEEE 0,5 10000 6.9e-15 4.5e-16
+ * IEEE 0,85 250000 2.2e-13 1.7e-14
+ * IEEE 0,1000 30000 5.3e-12 6.3e-13
+ * IEEE 0,10000 250000 9.3e-11 7.1e-12
+ * IEEE 0,100000 10000 8.7e-10 4.8e-11
+ * Outputs smaller than the IEEE gradual underflow threshold
+ * were excluded from these statistics.
+ *
+ * ERROR MESSAGES:
+ * message condition value returned
+ * incbet domain x<0, x>1 nan
+ * incbet underflow nan
+ */
+
+ EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ return Scalar(0);
+ }
+};
+
+/* Continued fraction expansion #1 for incomplete beta integral (small_branch = True)
+ * Continued fraction expansion #2 for incomplete beta integral (small_branch = False)
+ */
+template <typename Scalar>
+struct incbeta_cfe {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE Scalar run(Scalar a, Scalar b, Scalar x, bool small_branch) {
+ EIGEN_STATIC_ASSERT((internal::is_same<Scalar, float>::value ||
+ internal::is_same<Scalar, double>::value),
+ THIS_TYPE_IS_NOT_SUPPORTED);
+ const Scalar big = cephes_helper<Scalar>::big();
+ const Scalar machep = cephes_helper<Scalar>::machep();
+ const Scalar biginv = cephes_helper<Scalar>::biginv();
+
+ const Scalar zero = 0;
+ const Scalar one = 1;
+ const Scalar two = 2;
+
+ Scalar xk, pk, pkm1, pkm2, qk, qkm1, qkm2;
+ Scalar k1, k2, k3, k4, k5, k6, k7, k8, k26update;
+ Scalar ans;
+ int n;
+
+ const int num_iters = (internal::is_same<Scalar, float>::value) ? 100 : 300;
+ const Scalar thresh =
+ (internal::is_same<Scalar, float>::value) ? machep : Scalar(3) * machep;
+ Scalar r = (internal::is_same<Scalar, float>::value) ? zero : one;
+
+ if (small_branch) {
+ k1 = a;
+ k2 = a + b;
+ k3 = a;
+ k4 = a + one;
+ k5 = one;
+ k6 = b - one;
+ k7 = k4;
+ k8 = a + two;
+ k26update = one;
+ } else {
+ k1 = a;
+ k2 = b - one;
+ k3 = a;
+ k4 = a + one;
+ k5 = one;
+ k6 = a + b;
+ k7 = a + one;
+ k8 = a + two;
+ k26update = -one;
+ x = x / (one - x);
+ }
+
+ pkm2 = zero;
+ qkm2 = one;
+ pkm1 = one;
+ qkm1 = one;
+ ans = one;
+ n = 0;
+
+ do {
+ xk = -(x * k1 * k2) / (k3 * k4);
+ pk = pkm1 + pkm2 * xk;
+ qk = qkm1 + qkm2 * xk;
+ pkm2 = pkm1;
+ pkm1 = pk;
+ qkm2 = qkm1;
+ qkm1 = qk;
+
+ xk = (x * k5 * k6) / (k7 * k8);
+ pk = pkm1 + pkm2 * xk;
+ qk = qkm1 + qkm2 * xk;
+ pkm2 = pkm1;
+ pkm1 = pk;
+ qkm2 = qkm1;
+ qkm1 = qk;
+
+ if (qk != zero) {
+ r = pk / qk;
+ if (numext::abs(ans - r) < numext::abs(r) * thresh) {
+ return r;
+ }
+ ans = r;
+ }
+
+ k1 += one;
+ k2 += k26update;
+ k3 += two;
+ k4 += two;
+ k5 += one;
+ k6 -= k26update;
+ k7 += two;
+ k8 += two;
+
+ if ((numext::abs(qk) + numext::abs(pk)) > big) {
+ pkm2 *= biginv;
+ pkm1 *= biginv;
+ qkm2 *= biginv;
+ qkm1 *= biginv;
+ }
+ if ((numext::abs(qk) < biginv) || (numext::abs(pk) < biginv)) {
+ pkm2 *= big;
+ pkm1 *= big;
+ qkm2 *= big;
+ qkm1 *= big;
+ }
+ } while (++n < num_iters);
+
+ return ans;
+ }
+};
+
+/* Helper functions depending on the Scalar type */
+template <typename Scalar>
+struct betainc_helper {};
+
+template <>
+struct betainc_helper<float> {
+ /* Core implementation, assumes a large (> 1.0) */
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float incbsa(float aa, float bb,
+ float xx) {
+ float ans, a, b, t, x, onemx;
+ bool reversed_a_b = false;
+
+ onemx = 1.0f - xx;
+
+ /* see if x is greater than the mean */
+ if (xx > (aa / (aa + bb))) {
+ reversed_a_b = true;
+ a = bb;
+ b = aa;
+ t = xx;
+ x = onemx;
+ } else {
+ a = aa;
+ b = bb;
+ t = onemx;
+ x = xx;
+ }
+
+ /* Choose expansion for optimal convergence */
+ if (b > 10.0f) {
+ if (numext::abs(b * x / a) < 0.3f) {
+ t = betainc_helper<float>::incbps(a, b, x);
+ if (reversed_a_b) t = 1.0f - t;
+ return t;
+ }
+ }
+
+ ans = x * (a + b - 2.0f) / (a - 1.0f);
+ if (ans < 1.0f) {
+ ans = incbeta_cfe<float>::run(a, b, x, true /* small_branch */);
+ t = b * numext::log(t);
+ } else {
+ ans = incbeta_cfe<float>::run(a, b, x, false /* small_branch */);
+ t = (b - 1.0f) * numext::log(t);
+ }
+
+ t += a * numext::log(x) + lgamma_impl<float>::run(a + b) -
+ lgamma_impl<float>::run(a) - lgamma_impl<float>::run(b);
+ t += numext::log(ans / a);
+ t = numext::exp(t);
+
+ if (reversed_a_b) t = 1.0f - t;
+ return t;
+ }
+
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE float incbps(float a, float b, float x) {
+ float t, u, y, s;
+ const float machep = cephes_helper<float>::machep();
+
+ y = a * numext::log(x) + (b - 1.0f) * numext::log1p(-x) - numext::log(a);
+ y -= lgamma_impl<float>::run(a) + lgamma_impl<float>::run(b);
+ y += lgamma_impl<float>::run(a + b);
+
+ t = x / (1.0f - x);
+ s = 0.0f;
+ u = 1.0f;
+ do {
+ b -= 1.0f;
+ if (b == 0.0f) {
+ break;
+ }
+ a += 1.0f;
+ u *= t * b / a;
+ s += u;
+ } while (numext::abs(u) > machep);
+
+ return numext::exp(y) * (1.0f + s);
+ }
+};
+
+template <>
+struct betainc_impl<float> {
+ EIGEN_DEVICE_FUNC
+ static float run(float a, float b, float x) {
+ const float nan = NumTraits<float>::quiet_NaN();
+ float ans, t;
+
+ if (a <= 0.0f) return nan;
+ if (b <= 0.0f) return nan;
+ if ((x <= 0.0f) || (x >= 1.0f)) {
+ if (x == 0.0f) return 0.0f;
+ if (x == 1.0f) return 1.0f;
+ // mtherr("betaincf", DOMAIN);
+ return nan;
+ }
+
+ /* transformation for small aa */
+ if (a <= 1.0f) {
+ ans = betainc_helper<float>::incbsa(a + 1.0f, b, x);
+ t = a * numext::log(x) + b * numext::log1p(-x) +
+ lgamma_impl<float>::run(a + b) - lgamma_impl<float>::run(a + 1.0f) -
+ lgamma_impl<float>::run(b);
+ return (ans + numext::exp(t));
+ } else {
+ return betainc_helper<float>::incbsa(a, b, x);
+ }
+ }
+};
+
+template <>
+struct betainc_helper<double> {
+ EIGEN_DEVICE_FUNC
+ static EIGEN_STRONG_INLINE double incbps(double a, double b, double x) {
+ const double machep = cephes_helper<double>::machep();
+
+ double s, t, u, v, n, t1, z, ai;
+
+ ai = 1.0 / a;
+ u = (1.0 - b) * x;
+ v = u / (a + 1.0);
+ t1 = v;
+ t = u;
+ n = 2.0;
+ s = 0.0;
+ z = machep * ai;
+ while (numext::abs(v) > z) {
+ u = (n - b) * x / n;
+ t *= u;
+ v = t / (a + n);
+ s += v;
+ n += 1.0;
+ }
+ s += t1;
+ s += ai;
+
+ u = a * numext::log(x);
+ // TODO: gamma() is not directly implemented in Eigen.
+ /*
+ if ((a + b) < maxgam && numext::abs(u) < maxlog) {
+ t = gamma(a + b) / (gamma(a) * gamma(b));
+ s = s * t * pow(x, a);
+ }
+ */
+ t = lgamma_impl<double>::run(a + b) - lgamma_impl<double>::run(a) -
+ lgamma_impl<double>::run(b) + u + numext::log(s);
+ return s = numext::exp(t);
+ }
+};
+
+template <>
+struct betainc_impl<double> {
+ EIGEN_DEVICE_FUNC
+ static double run(double aa, double bb, double xx) {
+ const double nan = NumTraits<double>::quiet_NaN();
+ const double machep = cephes_helper<double>::machep();
+ // const double maxgam = 171.624376956302725;
+
+ double a, b, t, x, xc, w, y;
+ bool reversed_a_b = false;
+
+ if (aa <= 0.0 || bb <= 0.0) {
+ return nan; // goto domerr;
+ }
+
+ if ((xx <= 0.0) || (xx >= 1.0)) {
+ if (xx == 0.0) return (0.0);
+ if (xx == 1.0) return (1.0);
+ // mtherr("incbet", DOMAIN);
+ return nan;
+ }
+
+ if ((bb * xx) <= 1.0 && xx <= 0.95) {
+ return betainc_helper<double>::incbps(aa, bb, xx);
+ }
+
+ w = 1.0 - xx;
+
+ /* Reverse a and b if x is greater than the mean. */
+ if (xx > (aa / (aa + bb))) {
+ reversed_a_b = true;
+ a = bb;
+ b = aa;
+ xc = xx;
+ x = w;
+ } else {
+ a = aa;
+ b = bb;
+ xc = w;
+ x = xx;
+ }
+
+ if (reversed_a_b && (b * x) <= 1.0 && x <= 0.95) {
+ t = betainc_helper<double>::incbps(a, b, x);
+ if (t <= machep) {
+ t = 1.0 - machep;
+ } else {
+ t = 1.0 - t;
+ }
+ return t;
+ }
+
+ /* Choose expansion for better convergence. */
+ y = x * (a + b - 2.0) - (a - 1.0);
+ if (y < 0.0) {
+ w = incbeta_cfe<double>::run(a, b, x, true /* small_branch */);
+ } else {
+ w = incbeta_cfe<double>::run(a, b, x, false /* small_branch */) / xc;
+ }
+
+ /* Multiply w by the factor
+ a b _ _ _
+ x (1-x) | (a+b) / ( a | (a) | (b) ) . */
+
+ y = a * numext::log(x);
+ t = b * numext::log(xc);
+ // TODO: gamma is not directly implemented in Eigen.
+ /*
+ if ((a + b) < maxgam && numext::abs(y) < maxlog && numext::abs(t) < maxlog)
+ {
+ t = pow(xc, b);
+ t *= pow(x, a);
+ t /= a;
+ t *= w;
+ t *= gamma(a + b) / (gamma(a) * gamma(b));
+ } else {
+ */
+ /* Resort to logarithms. */
+ y += t + lgamma_impl<double>::run(a + b) - lgamma_impl<double>::run(a) -
+ lgamma_impl<double>::run(b);
+ y += numext::log(w / a);
+ t = numext::exp(y);
+
+ /* } */
+ // done:
+
+ if (reversed_a_b) {
+ if (t <= machep) {
+ t = 1.0 - machep;
+ } else {
+ t = 1.0 - t;
+ }
+ }
+ return t;
+ }
+};
+
+#endif // EIGEN_HAS_C99_MATH
+
+} // end namespace internal
+
+namespace numext {
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(lgamma, Scalar)
+ lgamma(const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(lgamma, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(digamma, Scalar)
+ digamma(const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(digamma, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(zeta, Scalar)
+zeta(const Scalar& x, const Scalar& q) {
+ return EIGEN_MATHFUNC_IMPL(zeta, Scalar)::run(x, q);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(polygamma, Scalar)
+polygamma(const Scalar& n, const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(polygamma, Scalar)::run(n, x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(erf, Scalar)
+ erf(const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(erf, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(erfc, Scalar)
+ erfc(const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(erfc, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(ndtri, Scalar)
+ ndtri(const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(ndtri, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igamma, Scalar)
+ igamma(const Scalar& a, const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(igamma, Scalar)::run(a, x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igamma_der_a, Scalar)
+ igamma_der_a(const Scalar& a, const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(igamma_der_a, Scalar)::run(a, x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(gamma_sample_der_alpha, Scalar)
+ gamma_sample_der_alpha(const Scalar& a, const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(gamma_sample_der_alpha, Scalar)::run(a, x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igammac, Scalar)
+ igammac(const Scalar& a, const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(igammac, Scalar)::run(a, x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(betainc, Scalar)
+ betainc(const Scalar& a, const Scalar& b, const Scalar& x) {
+ return EIGEN_MATHFUNC_IMPL(betainc, Scalar)::run(a, b, x);
+}
+
+} // end namespace numext
+} // end namespace Eigen
+
+#endif // EIGEN_SPECIAL_FUNCTIONS_H
diff --git a/src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsPacketMath.h b/src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsPacketMath.h
new file mode 100644
index 0000000..2bb0179
--- /dev/null
+++ b/src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsPacketMath.h
@@ -0,0 +1,79 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPECIALFUNCTIONS_PACKETMATH_H
+#define EIGEN_SPECIALFUNCTIONS_PACKETMATH_H
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal \returns the ln(|gamma(\a a)|) (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet plgamma(const Packet& a) { using numext::lgamma; return lgamma(a); }
+
+/** \internal \returns the derivative of lgamma, psi(\a a) (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pdigamma(const Packet& a) { using numext::digamma; return digamma(a); }
+
+/** \internal \returns the zeta function of two arguments (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pzeta(const Packet& x, const Packet& q) { using numext::zeta; return zeta(x, q); }
+
+/** \internal \returns the polygamma function (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet ppolygamma(const Packet& n, const Packet& x) { using numext::polygamma; return polygamma(n, x); }
+
+/** \internal \returns the erf(\a a) (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet perf(const Packet& a) { using numext::erf; return erf(a); }
+
+/** \internal \returns the erfc(\a a) (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet perfc(const Packet& a) { using numext::erfc; return erfc(a); }
+
+/** \internal \returns the ndtri(\a a) (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pndtri(const Packet& a) {
+ typedef typename unpacket_traits<Packet>::type ScalarType;
+ using internal::generic_ndtri; return generic_ndtri<Packet, ScalarType>(a);
+}
+
+/** \internal \returns the incomplete gamma function igamma(\a a, \a x) */
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+Packet pigamma(const Packet& a, const Packet& x) { using numext::igamma; return igamma(a, x); }
+
+/** \internal \returns the derivative of the incomplete gamma function
+ * igamma_der_a(\a a, \a x) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pigamma_der_a(const Packet& a, const Packet& x) {
+ using numext::igamma_der_a; return igamma_der_a(a, x);
+}
+
+/** \internal \returns compute the derivative of the sample
+ * of Gamma(alpha, 1) random variable with respect to the parameter a
+ * gamma_sample_der_alpha(\a alpha, \a sample) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pgamma_sample_der_alpha(const Packet& alpha, const Packet& sample) {
+ using numext::gamma_sample_der_alpha; return gamma_sample_der_alpha(alpha, sample);
+}
+
+/** \internal \returns the complementary incomplete gamma function igammac(\a a, \a x) */
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+Packet pigammac(const Packet& a, const Packet& x) { using numext::igammac; return igammac(a, x); }
+
+/** \internal \returns the complementary incomplete gamma function betainc(\a a, \a b, \a x) */
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+Packet pbetainc(const Packet& a, const Packet& b,const Packet& x) { using numext::betainc; return betainc(a, b, x); }
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPECIALFUNCTIONS_PACKETMATH_H
diff --git a/src/EigenUnsupported/src/SpecialFunctions/arch/AVX/BesselFunctions.h b/src/EigenUnsupported/src/SpecialFunctions/arch/AVX/BesselFunctions.h
new file mode 100644
index 0000000..2d76692
--- /dev/null
+++ b/src/EigenUnsupported/src/SpecialFunctions/arch/AVX/BesselFunctions.h
@@ -0,0 +1,46 @@
+#ifndef EIGEN_AVX_BESSELFUNCTIONS_H
+#define EIGEN_AVX_BESSELFUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_i0)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_i0)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_i0e)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_i0e)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_i1)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_i1)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_i1e)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_i1e)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_j0)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_j0)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_j1)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_j1)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_k0)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_k0)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_k0e)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_k0e)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_k1)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_k1)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_k1e)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_k1e)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_y0)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_y0)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_y1)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_y1)
+
+} // namespace internal
+} // namespace Eigen
+
+#endif // EIGEN_AVX_BESSELFUNCTIONS_H
diff --git a/src/EigenUnsupported/src/SpecialFunctions/arch/AVX/SpecialFunctions.h b/src/EigenUnsupported/src/SpecialFunctions/arch/AVX/SpecialFunctions.h
new file mode 100644
index 0000000..35e62a8
--- /dev/null
+++ b/src/EigenUnsupported/src/SpecialFunctions/arch/AVX/SpecialFunctions.h
@@ -0,0 +1,16 @@
+#ifndef EIGEN_AVX_SPECIALFUNCTIONS_H
+#define EIGEN_AVX_SPECIALFUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, perf)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, perf)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pndtri)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pndtri)
+
+} // namespace internal
+} // namespace Eigen
+
+#endif // EIGEN_AVX_SPECIAL_FUNCTIONS_H
diff --git a/src/EigenUnsupported/src/SpecialFunctions/arch/AVX512/BesselFunctions.h b/src/EigenUnsupported/src/SpecialFunctions/arch/AVX512/BesselFunctions.h
new file mode 100644
index 0000000..7dd3c3e
--- /dev/null
+++ b/src/EigenUnsupported/src/SpecialFunctions/arch/AVX512/BesselFunctions.h
@@ -0,0 +1,46 @@
+#ifndef EIGEN_AVX512_BESSELFUNCTIONS_H
+#define EIGEN_AVX512_BESSELFUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_i0)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_i0)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_i0e)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_i0e)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_i1)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_i1)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_i1e)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_i1e)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_j0)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_j0)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_j1)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_j1)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_k0)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_k0)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_k0e)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_k0e)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_k1)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_k1)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_k1e)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_k1e)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_y0)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_y0)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_y1)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_y1)
+
+} // namespace internal
+} // namespace Eigen
+
+#endif // EIGEN_AVX512_BESSELFUNCTIONS_H
diff --git a/src/EigenUnsupported/src/SpecialFunctions/arch/AVX512/SpecialFunctions.h b/src/EigenUnsupported/src/SpecialFunctions/arch/AVX512/SpecialFunctions.h
new file mode 100644
index 0000000..79878f2
--- /dev/null
+++ b/src/EigenUnsupported/src/SpecialFunctions/arch/AVX512/SpecialFunctions.h
@@ -0,0 +1,16 @@
+#ifndef EIGEN_AVX512_SPECIALFUNCTIONS_H
+#define EIGEN_AVX512_SPECIALFUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, perf)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, perf)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pndtri)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pndtri)
+
+} // namespace internal
+} // namespace Eigen
+
+#endif // EIGEN_AVX512_SPECIAL_FUNCTIONS_H
diff --git a/src/EigenUnsupported/src/SpecialFunctions/arch/GPU/SpecialFunctions.h b/src/EigenUnsupported/src/SpecialFunctions/arch/GPU/SpecialFunctions.h
new file mode 100644
index 0000000..dd3bf4d
--- /dev/null
+++ b/src/EigenUnsupported/src/SpecialFunctions/arch/GPU/SpecialFunctions.h
@@ -0,0 +1,369 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GPU_SPECIALFUNCTIONS_H
+#define EIGEN_GPU_SPECIALFUNCTIONS_H
+
+namespace Eigen {
+
+namespace internal {
+
+// Make sure this is only available when targeting a GPU: we don't want to
+// introduce conflicts between these packet_traits definitions and the ones
+// we'll use on the host side (SSE, AVX, ...)
+#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 plgamma<float4>(const float4& a)
+{
+ return make_float4(lgammaf(a.x), lgammaf(a.y), lgammaf(a.z), lgammaf(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 plgamma<double2>(const double2& a)
+{
+ using numext::lgamma;
+ return make_double2(lgamma(a.x), lgamma(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pdigamma<float4>(const float4& a)
+{
+ using numext::digamma;
+ return make_float4(digamma(a.x), digamma(a.y), digamma(a.z), digamma(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pdigamma<double2>(const double2& a)
+{
+ using numext::digamma;
+ return make_double2(digamma(a.x), digamma(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pzeta<float4>(const float4& x, const float4& q)
+{
+ using numext::zeta;
+ return make_float4(zeta(x.x, q.x), zeta(x.y, q.y), zeta(x.z, q.z), zeta(x.w, q.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pzeta<double2>(const double2& x, const double2& q)
+{
+ using numext::zeta;
+ return make_double2(zeta(x.x, q.x), zeta(x.y, q.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 ppolygamma<float4>(const float4& n, const float4& x)
+{
+ using numext::polygamma;
+ return make_float4(polygamma(n.x, x.x), polygamma(n.y, x.y), polygamma(n.z, x.z), polygamma(n.w, x.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 ppolygamma<double2>(const double2& n, const double2& x)
+{
+ using numext::polygamma;
+ return make_double2(polygamma(n.x, x.x), polygamma(n.y, x.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 perf<float4>(const float4& a)
+{
+ return make_float4(erff(a.x), erff(a.y), erff(a.z), erff(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 perf<double2>(const double2& a)
+{
+ using numext::erf;
+ return make_double2(erf(a.x), erf(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 perfc<float4>(const float4& a)
+{
+ using numext::erfc;
+ return make_float4(erfc(a.x), erfc(a.y), erfc(a.z), erfc(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 perfc<double2>(const double2& a)
+{
+ using numext::erfc;
+ return make_double2(erfc(a.x), erfc(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pndtri<float4>(const float4& a)
+{
+ using numext::ndtri;
+ return make_float4(ndtri(a.x), ndtri(a.y), ndtri(a.z), ndtri(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pndtri<double2>(const double2& a)
+{
+ using numext::ndtri;
+ return make_double2(ndtri(a.x), ndtri(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pigamma<float4>(const float4& a, const float4& x)
+{
+ using numext::igamma;
+ return make_float4(
+ igamma(a.x, x.x),
+ igamma(a.y, x.y),
+ igamma(a.z, x.z),
+ igamma(a.w, x.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pigamma<double2>(const double2& a, const double2& x)
+{
+ using numext::igamma;
+ return make_double2(igamma(a.x, x.x), igamma(a.y, x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pigamma_der_a<float4>(
+ const float4& a, const float4& x) {
+ using numext::igamma_der_a;
+ return make_float4(igamma_der_a(a.x, x.x), igamma_der_a(a.y, x.y),
+ igamma_der_a(a.z, x.z), igamma_der_a(a.w, x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pigamma_der_a<double2>(const double2& a, const double2& x) {
+ using numext::igamma_der_a;
+ return make_double2(igamma_der_a(a.x, x.x), igamma_der_a(a.y, x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pgamma_sample_der_alpha<float4>(
+ const float4& alpha, const float4& sample) {
+ using numext::gamma_sample_der_alpha;
+ return make_float4(
+ gamma_sample_der_alpha(alpha.x, sample.x),
+ gamma_sample_der_alpha(alpha.y, sample.y),
+ gamma_sample_der_alpha(alpha.z, sample.z),
+ gamma_sample_der_alpha(alpha.w, sample.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pgamma_sample_der_alpha<double2>(const double2& alpha, const double2& sample) {
+ using numext::gamma_sample_der_alpha;
+ return make_double2(
+ gamma_sample_der_alpha(alpha.x, sample.x),
+ gamma_sample_der_alpha(alpha.y, sample.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pigammac<float4>(const float4& a, const float4& x)
+{
+ using numext::igammac;
+ return make_float4(
+ igammac(a.x, x.x),
+ igammac(a.y, x.y),
+ igammac(a.z, x.z),
+ igammac(a.w, x.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pigammac<double2>(const double2& a, const double2& x)
+{
+ using numext::igammac;
+ return make_double2(igammac(a.x, x.x), igammac(a.y, x.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pbetainc<float4>(const float4& a, const float4& b, const float4& x)
+{
+ using numext::betainc;
+ return make_float4(
+ betainc(a.x, b.x, x.x),
+ betainc(a.y, b.y, x.y),
+ betainc(a.z, b.z, x.z),
+ betainc(a.w, b.w, x.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pbetainc<double2>(const double2& a, const double2& b, const double2& x)
+{
+ using numext::betainc;
+ return make_double2(betainc(a.x, b.x, x.x), betainc(a.y, b.y, x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_i0e<float4>(const float4& x) {
+ using numext::bessel_i0e;
+ return make_float4(bessel_i0e(x.x), bessel_i0e(x.y), bessel_i0e(x.z), bessel_i0e(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_i0e<double2>(const double2& x) {
+ using numext::bessel_i0e;
+ return make_double2(bessel_i0e(x.x), bessel_i0e(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_i0<float4>(const float4& x) {
+ using numext::bessel_i0;
+ return make_float4(bessel_i0(x.x), bessel_i0(x.y), bessel_i0(x.z), bessel_i0(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_i0<double2>(const double2& x) {
+ using numext::bessel_i0;
+ return make_double2(bessel_i0(x.x), bessel_i0(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_i1e<float4>(const float4& x) {
+ using numext::bessel_i1e;
+ return make_float4(bessel_i1e(x.x), bessel_i1e(x.y), bessel_i1e(x.z), bessel_i1e(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_i1e<double2>(const double2& x) {
+ using numext::bessel_i1e;
+ return make_double2(bessel_i1e(x.x), bessel_i1e(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_i1<float4>(const float4& x) {
+ using numext::bessel_i1;
+ return make_float4(bessel_i1(x.x), bessel_i1(x.y), bessel_i1(x.z), bessel_i1(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_i1<double2>(const double2& x) {
+ using numext::bessel_i1;
+ return make_double2(bessel_i1(x.x), bessel_i1(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_k0e<float4>(const float4& x) {
+ using numext::bessel_k0e;
+ return make_float4(bessel_k0e(x.x), bessel_k0e(x.y), bessel_k0e(x.z), bessel_k0e(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_k0e<double2>(const double2& x) {
+ using numext::bessel_k0e;
+ return make_double2(bessel_k0e(x.x), bessel_k0e(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_k0<float4>(const float4& x) {
+ using numext::bessel_k0;
+ return make_float4(bessel_k0(x.x), bessel_k0(x.y), bessel_k0(x.z), bessel_k0(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_k0<double2>(const double2& x) {
+ using numext::bessel_k0;
+ return make_double2(bessel_k0(x.x), bessel_k0(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_k1e<float4>(const float4& x) {
+ using numext::bessel_k1e;
+ return make_float4(bessel_k1e(x.x), bessel_k1e(x.y), bessel_k1e(x.z), bessel_k1e(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_k1e<double2>(const double2& x) {
+ using numext::bessel_k1e;
+ return make_double2(bessel_k1e(x.x), bessel_k1e(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_k1<float4>(const float4& x) {
+ using numext::bessel_k1;
+ return make_float4(bessel_k1(x.x), bessel_k1(x.y), bessel_k1(x.z), bessel_k1(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_k1<double2>(const double2& x) {
+ using numext::bessel_k1;
+ return make_double2(bessel_k1(x.x), bessel_k1(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_j0<float4>(const float4& x) {
+ using numext::bessel_j0;
+ return make_float4(bessel_j0(x.x), bessel_j0(x.y), bessel_j0(x.z), bessel_j0(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_j0<double2>(const double2& x) {
+ using numext::bessel_j0;
+ return make_double2(bessel_j0(x.x), bessel_j0(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_j1<float4>(const float4& x) {
+ using numext::bessel_j1;
+ return make_float4(bessel_j1(x.x), bessel_j1(x.y), bessel_j1(x.z), bessel_j1(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_j1<double2>(const double2& x) {
+ using numext::bessel_j1;
+ return make_double2(bessel_j1(x.x), bessel_j1(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_y0<float4>(const float4& x) {
+ using numext::bessel_y0;
+ return make_float4(bessel_y0(x.x), bessel_y0(x.y), bessel_y0(x.z), bessel_y0(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_y0<double2>(const double2& x) {
+ using numext::bessel_y0;
+ return make_double2(bessel_y0(x.x), bessel_y0(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_y1<float4>(const float4& x) {
+ using numext::bessel_y1;
+ return make_float4(bessel_y1(x.x), bessel_y1(x.y), bessel_y1(x.z), bessel_y1(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
+pbessel_y1<double2>(const double2& x) {
+ using numext::bessel_y1;
+ return make_double2(bessel_y1(x.x), bessel_y1(x.y));
+}
+
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_GPU_SPECIALFUNCTIONS_H
diff --git a/src/EigenUnsupported/src/SpecialFunctions/arch/NEON/BesselFunctions.h b/src/EigenUnsupported/src/SpecialFunctions/arch/NEON/BesselFunctions.h
new file mode 100644
index 0000000..67433b0
--- /dev/null
+++ b/src/EigenUnsupported/src/SpecialFunctions/arch/NEON/BesselFunctions.h
@@ -0,0 +1,54 @@
+#ifndef EIGEN_NEON_BESSELFUNCTIONS_H
+#define EIGEN_NEON_BESSELFUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+
+#define NEON_HALF_TO_FLOAT_FUNCTIONS(METHOD) \
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+Packet8hf METHOD<Packet8hf>(const Packet8hf& x) { \
+ const Packet4f lo = METHOD<Packet4f>(vcvt_f32_f16(vget_low_f16(x))); \
+ const Packet4f hi = METHOD<Packet4f>(vcvt_f32_f16(vget_high_f16(x))); \
+ return vcombine_f16(vcvt_f16_f32(lo), vcvt_f16_f32(hi)); \
+} \
+ \
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+Packet4hf METHOD<Packet4hf>(const Packet4hf& x) { \
+ return vcvt_f16_f32(METHOD<Packet4f>(vcvt_f32_f16(x))); \
+}
+
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_i0)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_i0e)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_i1)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_i1e)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_j0)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_j1)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_k0)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_k0e)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_k1)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_k1e)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_y0)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_y1)
+
+#undef NEON_HALF_TO_FLOAT_FUNCTIONS
+#endif
+
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_i0)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_i0e)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_i1)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_i1e)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_j0)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_j1)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_k0)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_k0e)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_k1)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_k1e)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_y0)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_y1)
+
+} // namespace internal
+} // namespace Eigen
+
+#endif // EIGEN_NEON_BESSELFUNCTIONS_H
diff --git a/src/EigenUnsupported/src/SpecialFunctions/arch/NEON/SpecialFunctions.h b/src/EigenUnsupported/src/SpecialFunctions/arch/NEON/SpecialFunctions.h
new file mode 100644
index 0000000..ec92951
--- /dev/null
+++ b/src/EigenUnsupported/src/SpecialFunctions/arch/NEON/SpecialFunctions.h
@@ -0,0 +1,34 @@
+#ifndef EIGEN_NEON_SPECIALFUNCTIONS_H
+#define EIGEN_NEON_SPECIALFUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+
+#define NEON_HALF_TO_FLOAT_FUNCTIONS(METHOD) \
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+Packet8hf METHOD<Packet8hf>(const Packet8hf& x) { \
+ const Packet4f lo = METHOD<Packet4f>(vcvt_f32_f16(vget_low_f16(x))); \
+ const Packet4f hi = METHOD<Packet4f>(vcvt_f32_f16(vget_high_f16(x))); \
+ return vcombine_f16(vcvt_f16_f32(lo), vcvt_f16_f32(hi)); \
+} \
+ \
+template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+Packet4hf METHOD<Packet4hf>(const Packet4hf& x) { \
+ return vcvt_f16_f32(METHOD<Packet4f>(vcvt_f32_f16(x))); \
+}
+
+NEON_HALF_TO_FLOAT_FUNCTIONS(perf)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pndtri)
+
+#undef NEON_HALF_TO_FLOAT_FUNCTIONS
+#endif
+
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, perf)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pndtri)
+
+} // namespace internal
+} // namespace Eigen
+
+#endif // EIGEN_NEON_SPECIALFUNCTIONS_H
diff --git a/src/EigenUnsupported/src/Splines/Spline.h b/src/EigenUnsupported/src/Splines/Spline.h
new file mode 100644
index 0000000..79edd52
--- /dev/null
+++ b/src/EigenUnsupported/src/Splines/Spline.h
@@ -0,0 +1,507 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 20010-2011 Hauke Heibel <hauke.heibel@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPLINE_H
+#define EIGEN_SPLINE_H
+
+#include "SplineFwd.h"
+
+namespace Eigen
+{
+ /**
+ * \ingroup Splines_Module
+ * \class Spline
+ * \brief A class representing multi-dimensional spline curves.
+ *
+ * The class represents B-splines with non-uniform knot vectors. Each control
+ * point of the B-spline is associated with a basis function
+ * \f{align*}
+ * C(u) & = \sum_{i=0}^{n}N_{i,p}(u)P_i
+ * \f}
+ *
+ * \tparam _Scalar The underlying data type (typically float or double)
+ * \tparam _Dim The curve dimension (e.g. 2 or 3)
+ * \tparam _Degree Per default set to Dynamic; could be set to the actual desired
+ * degree for optimization purposes (would result in stack allocation
+ * of several temporary variables).
+ **/
+ template <typename _Scalar, int _Dim, int _Degree>
+ class Spline
+ {
+ public:
+ typedef _Scalar Scalar; /*!< The spline curve's scalar type. */
+ enum { Dimension = _Dim /*!< The spline curve's dimension. */ };
+ enum { Degree = _Degree /*!< The spline curve's degree. */ };
+
+ /** \brief The point type the spline is representing. */
+ typedef typename SplineTraits<Spline>::PointType PointType;
+
+ /** \brief The data type used to store knot vectors. */
+ typedef typename SplineTraits<Spline>::KnotVectorType KnotVectorType;
+
+ /** \brief The data type used to store parameter vectors. */
+ typedef typename SplineTraits<Spline>::ParameterVectorType ParameterVectorType;
+
+ /** \brief The data type used to store non-zero basis functions. */
+ typedef typename SplineTraits<Spline>::BasisVectorType BasisVectorType;
+
+ /** \brief The data type used to store the values of the basis function derivatives. */
+ typedef typename SplineTraits<Spline>::BasisDerivativeType BasisDerivativeType;
+
+ /** \brief The data type representing the spline's control points. */
+ typedef typename SplineTraits<Spline>::ControlPointVectorType ControlPointVectorType;
+
+ /**
+ * \brief Creates a (constant) zero spline.
+ * For Splines with dynamic degree, the resulting degree will be 0.
+ **/
+ Spline()
+ : m_knots(1, (Degree==Dynamic ? 2 : 2*Degree+2))
+ , m_ctrls(ControlPointVectorType::Zero(Dimension,(Degree==Dynamic ? 1 : Degree+1)))
+ {
+ // in theory this code can go to the initializer list but it will get pretty
+ // much unreadable ...
+ enum { MinDegree = (Degree==Dynamic ? 0 : Degree) };
+ m_knots.template segment<MinDegree+1>(0) = Array<Scalar,1,MinDegree+1>::Zero();
+ m_knots.template segment<MinDegree+1>(MinDegree+1) = Array<Scalar,1,MinDegree+1>::Ones();
+ }
+
+ /**
+ * \brief Creates a spline from a knot vector and control points.
+ * \param knots The spline's knot vector.
+ * \param ctrls The spline's control point vector.
+ **/
+ template <typename OtherVectorType, typename OtherArrayType>
+ Spline(const OtherVectorType& knots, const OtherArrayType& ctrls) : m_knots(knots), m_ctrls(ctrls) {}
+
+ /**
+ * \brief Copy constructor for splines.
+ * \param spline The input spline.
+ **/
+ template <int OtherDegree>
+ Spline(const Spline<Scalar, Dimension, OtherDegree>& spline) :
+ m_knots(spline.knots()), m_ctrls(spline.ctrls()) {}
+
+ /**
+ * \brief Returns the knots of the underlying spline.
+ **/
+ const KnotVectorType& knots() const { return m_knots; }
+
+ /**
+ * \brief Returns the ctrls of the underlying spline.
+ **/
+ const ControlPointVectorType& ctrls() const { return m_ctrls; }
+
+ /**
+ * \brief Returns the spline value at a given site \f$u\f$.
+ *
+ * The function returns
+ * \f{align*}
+ * C(u) & = \sum_{i=0}^{n}N_{i,p}P_i
+ * \f}
+ *
+ * \param u Parameter \f$u \in [0;1]\f$ at which the spline is evaluated.
+ * \return The spline value at the given location \f$u\f$.
+ **/
+ PointType operator()(Scalar u) const;
+
+ /**
+ * \brief Evaluation of spline derivatives of up-to given order.
+ *
+ * The function returns
+ * \f{align*}
+ * \frac{d^i}{du^i}C(u) & = \sum_{i=0}^{n} \frac{d^i}{du^i} N_{i,p}(u)P_i
+ * \f}
+ * for i ranging between 0 and order.
+ *
+ * \param u Parameter \f$u \in [0;1]\f$ at which the spline derivative is evaluated.
+ * \param order The order up to which the derivatives are computed.
+ **/
+ typename SplineTraits<Spline>::DerivativeType
+ derivatives(Scalar u, DenseIndex order) const;
+
+ /**
+ * \copydoc Spline::derivatives
+ * Using the template version of this function is more efficieent since
+ * temporary objects are allocated on the stack whenever this is possible.
+ **/
+ template <int DerivativeOrder>
+ typename SplineTraits<Spline,DerivativeOrder>::DerivativeType
+ derivatives(Scalar u, DenseIndex order = DerivativeOrder) const;
+
+ /**
+ * \brief Computes the non-zero basis functions at the given site.
+ *
+ * Splines have local support and a point from their image is defined
+ * by exactly \f$p+1\f$ control points \f$P_i\f$ where \f$p\f$ is the
+ * spline degree.
+ *
+ * This function computes the \f$p+1\f$ non-zero basis function values
+ * for a given parameter value \f$u\f$. It returns
+ * \f{align*}{
+ * N_{i,p}(u), \hdots, N_{i+p+1,p}(u)
+ * \f}
+ *
+ * \param u Parameter \f$u \in [0;1]\f$ at which the non-zero basis functions
+ * are computed.
+ **/
+ typename SplineTraits<Spline>::BasisVectorType
+ basisFunctions(Scalar u) const;
+
+ /**
+ * \brief Computes the non-zero spline basis function derivatives up to given order.
+ *
+ * The function computes
+ * \f{align*}{
+ * \frac{d^i}{du^i} N_{i,p}(u), \hdots, \frac{d^i}{du^i} N_{i+p+1,p}(u)
+ * \f}
+ * with i ranging from 0 up to the specified order.
+ *
+ * \param u Parameter \f$u \in [0;1]\f$ at which the non-zero basis function
+ * derivatives are computed.
+ * \param order The order up to which the basis function derivatives are computes.
+ **/
+ typename SplineTraits<Spline>::BasisDerivativeType
+ basisFunctionDerivatives(Scalar u, DenseIndex order) const;
+
+ /**
+ * \copydoc Spline::basisFunctionDerivatives
+ * Using the template version of this function is more efficieent since
+ * temporary objects are allocated on the stack whenever this is possible.
+ **/
+ template <int DerivativeOrder>
+ typename SplineTraits<Spline,DerivativeOrder>::BasisDerivativeType
+ basisFunctionDerivatives(Scalar u, DenseIndex order = DerivativeOrder) const;
+
+ /**
+ * \brief Returns the spline degree.
+ **/
+ DenseIndex degree() const;
+
+ /**
+ * \brief Returns the span within the knot vector in which u is falling.
+ * \param u The site for which the span is determined.
+ **/
+ DenseIndex span(Scalar u) const;
+
+ /**
+ * \brief Computes the span within the provided knot vector in which u is falling.
+ **/
+ static DenseIndex Span(typename SplineTraits<Spline>::Scalar u, DenseIndex degree, const typename SplineTraits<Spline>::KnotVectorType& knots);
+
+ /**
+ * \brief Returns the spline's non-zero basis functions.
+ *
+ * The function computes and returns
+ * \f{align*}{
+ * N_{i,p}(u), \hdots, N_{i+p+1,p}(u)
+ * \f}
+ *
+ * \param u The site at which the basis functions are computed.
+ * \param degree The degree of the underlying spline.
+ * \param knots The underlying spline's knot vector.
+ **/
+ static BasisVectorType BasisFunctions(Scalar u, DenseIndex degree, const KnotVectorType& knots);
+
+ /**
+ * \copydoc Spline::basisFunctionDerivatives
+ * \param degree The degree of the underlying spline
+ * \param knots The underlying spline's knot vector.
+ **/
+ static BasisDerivativeType BasisFunctionDerivatives(
+ const Scalar u, const DenseIndex order, const DenseIndex degree, const KnotVectorType& knots);
+
+ private:
+ KnotVectorType m_knots; /*!< Knot vector. */
+ ControlPointVectorType m_ctrls; /*!< Control points. */
+
+ template <typename DerivativeType>
+ static void BasisFunctionDerivativesImpl(
+ const typename Spline<_Scalar, _Dim, _Degree>::Scalar u,
+ const DenseIndex order,
+ const DenseIndex p,
+ const typename Spline<_Scalar, _Dim, _Degree>::KnotVectorType& U,
+ DerivativeType& N_);
+ };
+
+ template <typename _Scalar, int _Dim, int _Degree>
+ DenseIndex Spline<_Scalar, _Dim, _Degree>::Span(
+ typename SplineTraits< Spline<_Scalar, _Dim, _Degree> >::Scalar u,
+ DenseIndex degree,
+ const typename SplineTraits< Spline<_Scalar, _Dim, _Degree> >::KnotVectorType& knots)
+ {
+ // Piegl & Tiller, "The NURBS Book", A2.1 (p. 68)
+ if (u <= knots(0)) return degree;
+ const Scalar* pos = std::upper_bound(knots.data()+degree-1, knots.data()+knots.size()-degree-1, u);
+ return static_cast<DenseIndex>( std::distance(knots.data(), pos) - 1 );
+ }
+
+ template <typename _Scalar, int _Dim, int _Degree>
+ typename Spline<_Scalar, _Dim, _Degree>::BasisVectorType
+ Spline<_Scalar, _Dim, _Degree>::BasisFunctions(
+ typename Spline<_Scalar, _Dim, _Degree>::Scalar u,
+ DenseIndex degree,
+ const typename Spline<_Scalar, _Dim, _Degree>::KnotVectorType& knots)
+ {
+ const DenseIndex p = degree;
+ const DenseIndex i = Spline::Span(u, degree, knots);
+
+ const KnotVectorType& U = knots;
+
+ BasisVectorType left(p+1); left(0) = Scalar(0);
+ BasisVectorType right(p+1); right(0) = Scalar(0);
+
+ VectorBlock<BasisVectorType,Degree>(left,1,p) = u - VectorBlock<const KnotVectorType,Degree>(U,i+1-p,p).reverse();
+ VectorBlock<BasisVectorType,Degree>(right,1,p) = VectorBlock<const KnotVectorType,Degree>(U,i+1,p) - u;
+
+ BasisVectorType N(1,p+1);
+ N(0) = Scalar(1);
+ for (DenseIndex j=1; j<=p; ++j)
+ {
+ Scalar saved = Scalar(0);
+ for (DenseIndex r=0; r<j; r++)
+ {
+ const Scalar tmp = N(r)/(right(r+1)+left(j-r));
+ N[r] = saved + right(r+1)*tmp;
+ saved = left(j-r)*tmp;
+ }
+ N(j) = saved;
+ }
+ return N;
+ }
+
+ template <typename _Scalar, int _Dim, int _Degree>
+ DenseIndex Spline<_Scalar, _Dim, _Degree>::degree() const
+ {
+ if (_Degree == Dynamic)
+ return m_knots.size() - m_ctrls.cols() - 1;
+ else
+ return _Degree;
+ }
+
+ template <typename _Scalar, int _Dim, int _Degree>
+ DenseIndex Spline<_Scalar, _Dim, _Degree>::span(Scalar u) const
+ {
+ return Spline::Span(u, degree(), knots());
+ }
+
+ template <typename _Scalar, int _Dim, int _Degree>
+ typename Spline<_Scalar, _Dim, _Degree>::PointType Spline<_Scalar, _Dim, _Degree>::operator()(Scalar u) const
+ {
+ enum { Order = SplineTraits<Spline>::OrderAtCompileTime };
+
+ const DenseIndex span = this->span(u);
+ const DenseIndex p = degree();
+ const BasisVectorType basis_funcs = basisFunctions(u);
+
+ const Replicate<BasisVectorType,Dimension,1> ctrl_weights(basis_funcs);
+ const Block<const ControlPointVectorType,Dimension,Order> ctrl_pts(ctrls(),0,span-p,Dimension,p+1);
+ return (ctrl_weights * ctrl_pts).rowwise().sum();
+ }
+
+ /* --------------------------------------------------------------------------------------------- */
+
+ template <typename SplineType, typename DerivativeType>
+ void derivativesImpl(const SplineType& spline, typename SplineType::Scalar u, DenseIndex order, DerivativeType& der)
+ {
+ enum { Dimension = SplineTraits<SplineType>::Dimension };
+ enum { Order = SplineTraits<SplineType>::OrderAtCompileTime };
+ enum { DerivativeOrder = DerivativeType::ColsAtCompileTime };
+
+ typedef typename SplineTraits<SplineType>::ControlPointVectorType ControlPointVectorType;
+ typedef typename SplineTraits<SplineType,DerivativeOrder>::BasisDerivativeType BasisDerivativeType;
+ typedef typename BasisDerivativeType::ConstRowXpr BasisDerivativeRowXpr;
+
+ const DenseIndex p = spline.degree();
+ const DenseIndex span = spline.span(u);
+
+ const DenseIndex n = (std::min)(p, order);
+
+ der.resize(Dimension,n+1);
+
+ // Retrieve the basis function derivatives up to the desired order...
+ const BasisDerivativeType basis_func_ders = spline.template basisFunctionDerivatives<DerivativeOrder>(u, n+1);
+
+ // ... and perform the linear combinations of the control points.
+ for (DenseIndex der_order=0; der_order<n+1; ++der_order)
+ {
+ const Replicate<BasisDerivativeRowXpr,Dimension,1> ctrl_weights( basis_func_ders.row(der_order) );
+ const Block<const ControlPointVectorType,Dimension,Order> ctrl_pts(spline.ctrls(),0,span-p,Dimension,p+1);
+ der.col(der_order) = (ctrl_weights * ctrl_pts).rowwise().sum();
+ }
+ }
+
+ template <typename _Scalar, int _Dim, int _Degree>
+ typename SplineTraits< Spline<_Scalar, _Dim, _Degree> >::DerivativeType
+ Spline<_Scalar, _Dim, _Degree>::derivatives(Scalar u, DenseIndex order) const
+ {
+ typename SplineTraits< Spline >::DerivativeType res;
+ derivativesImpl(*this, u, order, res);
+ return res;
+ }
+
+ template <typename _Scalar, int _Dim, int _Degree>
+ template <int DerivativeOrder>
+ typename SplineTraits< Spline<_Scalar, _Dim, _Degree>, DerivativeOrder >::DerivativeType
+ Spline<_Scalar, _Dim, _Degree>::derivatives(Scalar u, DenseIndex order) const
+ {
+ typename SplineTraits< Spline, DerivativeOrder >::DerivativeType res;
+ derivativesImpl(*this, u, order, res);
+ return res;
+ }
+
+ template <typename _Scalar, int _Dim, int _Degree>
+ typename SplineTraits< Spline<_Scalar, _Dim, _Degree> >::BasisVectorType
+ Spline<_Scalar, _Dim, _Degree>::basisFunctions(Scalar u) const
+ {
+ return Spline::BasisFunctions(u, degree(), knots());
+ }
+
+ /* --------------------------------------------------------------------------------------------- */
+
+
+ template <typename _Scalar, int _Dim, int _Degree>
+ template <typename DerivativeType>
+ void Spline<_Scalar, _Dim, _Degree>::BasisFunctionDerivativesImpl(
+ const typename Spline<_Scalar, _Dim, _Degree>::Scalar u,
+ const DenseIndex order,
+ const DenseIndex p,
+ const typename Spline<_Scalar, _Dim, _Degree>::KnotVectorType& U,
+ DerivativeType& N_)
+ {
+ typedef Spline<_Scalar, _Dim, _Degree> SplineType;
+ enum { Order = SplineTraits<SplineType>::OrderAtCompileTime };
+
+ const DenseIndex span = SplineType::Span(u, p, U);
+
+ const DenseIndex n = (std::min)(p, order);
+
+ N_.resize(n+1, p+1);
+
+ BasisVectorType left = BasisVectorType::Zero(p+1);
+ BasisVectorType right = BasisVectorType::Zero(p+1);
+
+ Matrix<Scalar,Order,Order> ndu(p+1,p+1);
+
+ Scalar saved, temp; // FIXME These were double instead of Scalar. Was there a reason for that?
+
+ ndu(0,0) = 1.0;
+
+ DenseIndex j;
+ for (j=1; j<=p; ++j)
+ {
+ left[j] = u-U[span+1-j];
+ right[j] = U[span+j]-u;
+ saved = 0.0;
+
+ for (DenseIndex r=0; r<j; ++r)
+ {
+ /* Lower triangle */
+ ndu(j,r) = right[r+1]+left[j-r];
+ temp = ndu(r,j-1)/ndu(j,r);
+ /* Upper triangle */
+ ndu(r,j) = static_cast<Scalar>(saved+right[r+1] * temp);
+ saved = left[j-r] * temp;
+ }
+
+ ndu(j,j) = static_cast<Scalar>(saved);
+ }
+
+ for (j = p; j>=0; --j)
+ N_(0,j) = ndu(j,p);
+
+ // Compute the derivatives
+ DerivativeType a(n+1,p+1);
+ DenseIndex r=0;
+ for (; r<=p; ++r)
+ {
+ DenseIndex s1,s2;
+ s1 = 0; s2 = 1; // alternate rows in array a
+ a(0,0) = 1.0;
+
+ // Compute the k-th derivative
+ for (DenseIndex k=1; k<=static_cast<DenseIndex>(n); ++k)
+ {
+ Scalar d = 0.0;
+ DenseIndex rk,pk,j1,j2;
+ rk = r-k; pk = p-k;
+
+ if (r>=k)
+ {
+ a(s2,0) = a(s1,0)/ndu(pk+1,rk);
+ d = a(s2,0)*ndu(rk,pk);
+ }
+
+ if (rk>=-1) j1 = 1;
+ else j1 = -rk;
+
+ if (r-1 <= pk) j2 = k-1;
+ else j2 = p-r;
+
+ for (j=j1; j<=j2; ++j)
+ {
+ a(s2,j) = (a(s1,j)-a(s1,j-1))/ndu(pk+1,rk+j);
+ d += a(s2,j)*ndu(rk+j,pk);
+ }
+
+ if (r<=pk)
+ {
+ a(s2,k) = -a(s1,k-1)/ndu(pk+1,r);
+ d += a(s2,k)*ndu(r,pk);
+ }
+
+ N_(k,r) = static_cast<Scalar>(d);
+ j = s1; s1 = s2; s2 = j; // Switch rows
+ }
+ }
+
+ /* Multiply through by the correct factors */
+ /* (Eq. [2.9]) */
+ r = p;
+ for (DenseIndex k=1; k<=static_cast<DenseIndex>(n); ++k)
+ {
+ for (j=p; j>=0; --j) N_(k,j) *= r;
+ r *= p-k;
+ }
+ }
+
+ template <typename _Scalar, int _Dim, int _Degree>
+ typename SplineTraits< Spline<_Scalar, _Dim, _Degree> >::BasisDerivativeType
+ Spline<_Scalar, _Dim, _Degree>::basisFunctionDerivatives(Scalar u, DenseIndex order) const
+ {
+ typename SplineTraits<Spline<_Scalar, _Dim, _Degree> >::BasisDerivativeType der;
+ BasisFunctionDerivativesImpl(u, order, degree(), knots(), der);
+ return der;
+ }
+
+ template <typename _Scalar, int _Dim, int _Degree>
+ template <int DerivativeOrder>
+ typename SplineTraits< Spline<_Scalar, _Dim, _Degree>, DerivativeOrder >::BasisDerivativeType
+ Spline<_Scalar, _Dim, _Degree>::basisFunctionDerivatives(Scalar u, DenseIndex order) const
+ {
+ typename SplineTraits< Spline<_Scalar, _Dim, _Degree>, DerivativeOrder >::BasisDerivativeType der;
+ BasisFunctionDerivativesImpl(u, order, degree(), knots(), der);
+ return der;
+ }
+
+ template <typename _Scalar, int _Dim, int _Degree>
+ typename SplineTraits<Spline<_Scalar, _Dim, _Degree> >::BasisDerivativeType
+ Spline<_Scalar, _Dim, _Degree>::BasisFunctionDerivatives(
+ const typename Spline<_Scalar, _Dim, _Degree>::Scalar u,
+ const DenseIndex order,
+ const DenseIndex degree,
+ const typename Spline<_Scalar, _Dim, _Degree>::KnotVectorType& knots)
+ {
+ typename SplineTraits<Spline>::BasisDerivativeType der;
+ BasisFunctionDerivativesImpl(u, order, degree, knots, der);
+ return der;
+ }
+}
+
+#endif // EIGEN_SPLINE_H
diff --git a/src/EigenUnsupported/src/Splines/SplineFitting.h b/src/EigenUnsupported/src/Splines/SplineFitting.h
new file mode 100644
index 0000000..9f6e8af
--- /dev/null
+++ b/src/EigenUnsupported/src/Splines/SplineFitting.h
@@ -0,0 +1,431 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 20010-2011 Hauke Heibel <hauke.heibel@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPLINE_FITTING_H
+#define EIGEN_SPLINE_FITTING_H
+
+#include <algorithm>
+#include <functional>
+#include <numeric>
+#include <vector>
+
+#include "SplineFwd.h"
+
+#include "../../../../Eigen/LU"
+#include "../../../../Eigen/QR"
+
+namespace Eigen
+{
+ /**
+ * \brief Computes knot averages.
+ * \ingroup Splines_Module
+ *
+ * The knots are computed as
+ * \f{align*}
+ * u_0 & = \hdots = u_p = 0 \\
+ * u_{m-p} & = \hdots = u_{m} = 1 \\
+ * u_{j+p} & = \frac{1}{p}\sum_{i=j}^{j+p-1}\bar{u}_i \quad\quad j=1,\hdots,n-p
+ * \f}
+ * where \f$p\f$ is the degree and \f$m+1\f$ the number knots
+ * of the desired interpolating spline.
+ *
+ * \param[in] parameters The input parameters. During interpolation one for each data point.
+ * \param[in] degree The spline degree which is used during the interpolation.
+ * \param[out] knots The output knot vector.
+ *
+ * \sa Les Piegl and Wayne Tiller, The NURBS book (2nd ed.), 1997, 9.2.1 Global Curve Interpolation to Point Data
+ **/
+ template <typename KnotVectorType>
+ void KnotAveraging(const KnotVectorType& parameters, DenseIndex degree, KnotVectorType& knots)
+ {
+ knots.resize(parameters.size()+degree+1);
+
+ for (DenseIndex j=1; j<parameters.size()-degree; ++j)
+ knots(j+degree) = parameters.segment(j,degree).mean();
+
+ knots.segment(0,degree+1) = KnotVectorType::Zero(degree+1);
+ knots.segment(knots.size()-degree-1,degree+1) = KnotVectorType::Ones(degree+1);
+ }
+
+ /**
+ * \brief Computes knot averages when derivative constraints are present.
+ * Note that this is a technical interpretation of the referenced article
+ * since the algorithm contained therein is incorrect as written.
+ * \ingroup Splines_Module
+ *
+ * \param[in] parameters The parameters at which the interpolation B-Spline
+ * will intersect the given interpolation points. The parameters
+ * are assumed to be a non-decreasing sequence.
+ * \param[in] degree The degree of the interpolating B-Spline. This must be
+ * greater than zero.
+ * \param[in] derivativeIndices The indices corresponding to parameters at
+ * which there are derivative constraints. The indices are assumed
+ * to be a non-decreasing sequence.
+ * \param[out] knots The calculated knot vector. These will be returned as a
+ * non-decreasing sequence
+ *
+ * \sa Les A. Piegl, Khairan Rajab, Volha Smarodzinana. 2008.
+ * Curve interpolation with directional constraints for engineering design.
+ * Engineering with Computers
+ **/
+ template <typename KnotVectorType, typename ParameterVectorType, typename IndexArray>
+ void KnotAveragingWithDerivatives(const ParameterVectorType& parameters,
+ const unsigned int degree,
+ const IndexArray& derivativeIndices,
+ KnotVectorType& knots)
+ {
+ typedef typename ParameterVectorType::Scalar Scalar;
+
+ DenseIndex numParameters = parameters.size();
+ DenseIndex numDerivatives = derivativeIndices.size();
+
+ if (numDerivatives < 1)
+ {
+ KnotAveraging(parameters, degree, knots);
+ return;
+ }
+
+ DenseIndex startIndex;
+ DenseIndex endIndex;
+
+ DenseIndex numInternalDerivatives = numDerivatives;
+
+ if (derivativeIndices[0] == 0)
+ {
+ startIndex = 0;
+ --numInternalDerivatives;
+ }
+ else
+ {
+ startIndex = 1;
+ }
+ if (derivativeIndices[numDerivatives - 1] == numParameters - 1)
+ {
+ endIndex = numParameters - degree;
+ --numInternalDerivatives;
+ }
+ else
+ {
+ endIndex = numParameters - degree - 1;
+ }
+
+ // There are (endIndex - startIndex + 1) knots obtained from the averaging
+ // and 2 for the first and last parameters.
+ DenseIndex numAverageKnots = endIndex - startIndex + 3;
+ KnotVectorType averageKnots(numAverageKnots);
+ averageKnots[0] = parameters[0];
+
+ int newKnotIndex = 0;
+ for (DenseIndex i = startIndex; i <= endIndex; ++i)
+ averageKnots[++newKnotIndex] = parameters.segment(i, degree).mean();
+ averageKnots[++newKnotIndex] = parameters[numParameters - 1];
+
+ newKnotIndex = -1;
+
+ ParameterVectorType temporaryParameters(numParameters + 1);
+ KnotVectorType derivativeKnots(numInternalDerivatives);
+ for (DenseIndex i = 0; i < numAverageKnots - 1; ++i)
+ {
+ temporaryParameters[0] = averageKnots[i];
+ ParameterVectorType parameterIndices(numParameters);
+ int temporaryParameterIndex = 1;
+ for (DenseIndex j = 0; j < numParameters; ++j)
+ {
+ Scalar parameter = parameters[j];
+ if (parameter >= averageKnots[i] && parameter < averageKnots[i + 1])
+ {
+ parameterIndices[temporaryParameterIndex] = j;
+ temporaryParameters[temporaryParameterIndex++] = parameter;
+ }
+ }
+ temporaryParameters[temporaryParameterIndex] = averageKnots[i + 1];
+
+ for (int j = 0; j <= temporaryParameterIndex - 2; ++j)
+ {
+ for (DenseIndex k = 0; k < derivativeIndices.size(); ++k)
+ {
+ if (parameterIndices[j + 1] == derivativeIndices[k]
+ && parameterIndices[j + 1] != 0
+ && parameterIndices[j + 1] != numParameters - 1)
+ {
+ derivativeKnots[++newKnotIndex] = temporaryParameters.segment(j, 3).mean();
+ break;
+ }
+ }
+ }
+ }
+
+ KnotVectorType temporaryKnots(averageKnots.size() + derivativeKnots.size());
+
+ std::merge(averageKnots.data(), averageKnots.data() + averageKnots.size(),
+ derivativeKnots.data(), derivativeKnots.data() + derivativeKnots.size(),
+ temporaryKnots.data());
+
+ // Number of knots (one for each point and derivative) plus spline order.
+ DenseIndex numKnots = numParameters + numDerivatives + degree + 1;
+ knots.resize(numKnots);
+
+ knots.head(degree).fill(temporaryKnots[0]);
+ knots.tail(degree).fill(temporaryKnots.template tail<1>()[0]);
+ knots.segment(degree, temporaryKnots.size()) = temporaryKnots;
+ }
+
+ /**
+ * \brief Computes chord length parameters which are required for spline interpolation.
+ * \ingroup Splines_Module
+ *
+ * \param[in] pts The data points to which a spline should be fit.
+ * \param[out] chord_lengths The resulting chord length vector.
+ *
+ * \sa Les Piegl and Wayne Tiller, The NURBS book (2nd ed.), 1997, 9.2.1 Global Curve Interpolation to Point Data
+ **/
+ template <typename PointArrayType, typename KnotVectorType>
+ void ChordLengths(const PointArrayType& pts, KnotVectorType& chord_lengths)
+ {
+ typedef typename KnotVectorType::Scalar Scalar;
+
+ const DenseIndex n = pts.cols();
+
+ // 1. compute the column-wise norms
+ chord_lengths.resize(pts.cols());
+ chord_lengths[0] = 0;
+ chord_lengths.rightCols(n-1) = (pts.array().leftCols(n-1) - pts.array().rightCols(n-1)).matrix().colwise().norm();
+
+ // 2. compute the partial sums
+ std::partial_sum(chord_lengths.data(), chord_lengths.data()+n, chord_lengths.data());
+
+ // 3. normalize the data
+ chord_lengths /= chord_lengths(n-1);
+ chord_lengths(n-1) = Scalar(1);
+ }
+
+ /**
+ * \brief Spline fitting methods.
+ * \ingroup Splines_Module
+ **/
+ template <typename SplineType>
+ struct SplineFitting
+ {
+ typedef typename SplineType::KnotVectorType KnotVectorType;
+ typedef typename SplineType::ParameterVectorType ParameterVectorType;
+
+ /**
+ * \brief Fits an interpolating Spline to the given data points.
+ *
+ * \param pts The points for which an interpolating spline will be computed.
+ * \param degree The degree of the interpolating spline.
+ *
+ * \returns A spline interpolating the initially provided points.
+ **/
+ template <typename PointArrayType>
+ static SplineType Interpolate(const PointArrayType& pts, DenseIndex degree);
+
+ /**
+ * \brief Fits an interpolating Spline to the given data points.
+ *
+ * \param pts The points for which an interpolating spline will be computed.
+ * \param degree The degree of the interpolating spline.
+ * \param knot_parameters The knot parameters for the interpolation.
+ *
+ * \returns A spline interpolating the initially provided points.
+ **/
+ template <typename PointArrayType>
+ static SplineType Interpolate(const PointArrayType& pts, DenseIndex degree, const KnotVectorType& knot_parameters);
+
+ /**
+ * \brief Fits an interpolating spline to the given data points and
+ * derivatives.
+ *
+ * \param points The points for which an interpolating spline will be computed.
+ * \param derivatives The desired derivatives of the interpolating spline at interpolation
+ * points.
+ * \param derivativeIndices An array indicating which point each derivative belongs to. This
+ * must be the same size as @a derivatives.
+ * \param degree The degree of the interpolating spline.
+ *
+ * \returns A spline interpolating @a points with @a derivatives at those points.
+ *
+ * \sa Les A. Piegl, Khairan Rajab, Volha Smarodzinana. 2008.
+ * Curve interpolation with directional constraints for engineering design.
+ * Engineering with Computers
+ **/
+ template <typename PointArrayType, typename IndexArray>
+ static SplineType InterpolateWithDerivatives(const PointArrayType& points,
+ const PointArrayType& derivatives,
+ const IndexArray& derivativeIndices,
+ const unsigned int degree);
+
+ /**
+ * \brief Fits an interpolating spline to the given data points and derivatives.
+ *
+ * \param points The points for which an interpolating spline will be computed.
+ * \param derivatives The desired derivatives of the interpolating spline at interpolation points.
+ * \param derivativeIndices An array indicating which point each derivative belongs to. This
+ * must be the same size as @a derivatives.
+ * \param degree The degree of the interpolating spline.
+ * \param parameters The parameters corresponding to the interpolation points.
+ *
+ * \returns A spline interpolating @a points with @a derivatives at those points.
+ *
+ * \sa Les A. Piegl, Khairan Rajab, Volha Smarodzinana. 2008.
+ * Curve interpolation with directional constraints for engineering design.
+ * Engineering with Computers
+ */
+ template <typename PointArrayType, typename IndexArray>
+ static SplineType InterpolateWithDerivatives(const PointArrayType& points,
+ const PointArrayType& derivatives,
+ const IndexArray& derivativeIndices,
+ const unsigned int degree,
+ const ParameterVectorType& parameters);
+ };
+
+ template <typename SplineType>
+ template <typename PointArrayType>
+ SplineType SplineFitting<SplineType>::Interpolate(const PointArrayType& pts, DenseIndex degree, const KnotVectorType& knot_parameters)
+ {
+ typedef typename SplineType::KnotVectorType::Scalar Scalar;
+ typedef typename SplineType::ControlPointVectorType ControlPointVectorType;
+
+ typedef Matrix<Scalar,Dynamic,Dynamic> MatrixType;
+
+ KnotVectorType knots;
+ KnotAveraging(knot_parameters, degree, knots);
+
+ DenseIndex n = pts.cols();
+ MatrixType A = MatrixType::Zero(n,n);
+ for (DenseIndex i=1; i<n-1; ++i)
+ {
+ const DenseIndex span = SplineType::Span(knot_parameters[i], degree, knots);
+
+ // The segment call should somehow be told the spline order at compile time.
+ A.row(i).segment(span-degree, degree+1) = SplineType::BasisFunctions(knot_parameters[i], degree, knots);
+ }
+ A(0,0) = 1.0;
+ A(n-1,n-1) = 1.0;
+
+ HouseholderQR<MatrixType> qr(A);
+
+ // Here, we are creating a temporary due to an Eigen issue.
+ ControlPointVectorType ctrls = qr.solve(MatrixType(pts.transpose())).transpose();
+
+ return SplineType(knots, ctrls);
+ }
+
+ template <typename SplineType>
+ template <typename PointArrayType>
+ SplineType SplineFitting<SplineType>::Interpolate(const PointArrayType& pts, DenseIndex degree)
+ {
+ KnotVectorType chord_lengths; // knot parameters
+ ChordLengths(pts, chord_lengths);
+ return Interpolate(pts, degree, chord_lengths);
+ }
+
+ template <typename SplineType>
+ template <typename PointArrayType, typename IndexArray>
+ SplineType
+ SplineFitting<SplineType>::InterpolateWithDerivatives(const PointArrayType& points,
+ const PointArrayType& derivatives,
+ const IndexArray& derivativeIndices,
+ const unsigned int degree,
+ const ParameterVectorType& parameters)
+ {
+ typedef typename SplineType::KnotVectorType::Scalar Scalar;
+ typedef typename SplineType::ControlPointVectorType ControlPointVectorType;
+
+ typedef Matrix<Scalar, Dynamic, Dynamic> MatrixType;
+
+ const DenseIndex n = points.cols() + derivatives.cols();
+
+ KnotVectorType knots;
+
+ KnotAveragingWithDerivatives(parameters, degree, derivativeIndices, knots);
+
+ // fill matrix
+ MatrixType A = MatrixType::Zero(n, n);
+
+ // Use these dimensions for quicker populating, then transpose for solving.
+ MatrixType b(points.rows(), n);
+
+ DenseIndex startRow;
+ DenseIndex derivativeStart;
+
+ // End derivatives.
+ if (derivativeIndices[0] == 0)
+ {
+ A.template block<1, 2>(1, 0) << -1, 1;
+
+ Scalar y = (knots(degree + 1) - knots(0)) / degree;
+ b.col(1) = y*derivatives.col(0);
+
+ startRow = 2;
+ derivativeStart = 1;
+ }
+ else
+ {
+ startRow = 1;
+ derivativeStart = 0;
+ }
+ if (derivativeIndices[derivatives.cols() - 1] == points.cols() - 1)
+ {
+ A.template block<1, 2>(n - 2, n - 2) << -1, 1;
+
+ Scalar y = (knots(knots.size() - 1) - knots(knots.size() - (degree + 2))) / degree;
+ b.col(b.cols() - 2) = y*derivatives.col(derivatives.cols() - 1);
+ }
+
+ DenseIndex row = startRow;
+ DenseIndex derivativeIndex = derivativeStart;
+ for (DenseIndex i = 1; i < parameters.size() - 1; ++i)
+ {
+ const DenseIndex span = SplineType::Span(parameters[i], degree, knots);
+
+ if (derivativeIndex < derivativeIndices.size() && derivativeIndices[derivativeIndex] == i)
+ {
+ A.block(row, span - degree, 2, degree + 1)
+ = SplineType::BasisFunctionDerivatives(parameters[i], 1, degree, knots);
+
+ b.col(row++) = points.col(i);
+ b.col(row++) = derivatives.col(derivativeIndex++);
+ }
+ else
+ {
+ A.row(row).segment(span - degree, degree + 1)
+ = SplineType::BasisFunctions(parameters[i], degree, knots);
+ b.col(row++) = points.col(i);
+ }
+ }
+ b.col(0) = points.col(0);
+ b.col(b.cols() - 1) = points.col(points.cols() - 1);
+ A(0,0) = 1;
+ A(n - 1, n - 1) = 1;
+
+ // Solve
+ FullPivLU<MatrixType> lu(A);
+ ControlPointVectorType controlPoints = lu.solve(MatrixType(b.transpose())).transpose();
+
+ SplineType spline(knots, controlPoints);
+
+ return spline;
+ }
+
+ template <typename SplineType>
+ template <typename PointArrayType, typename IndexArray>
+ SplineType
+ SplineFitting<SplineType>::InterpolateWithDerivatives(const PointArrayType& points,
+ const PointArrayType& derivatives,
+ const IndexArray& derivativeIndices,
+ const unsigned int degree)
+ {
+ ParameterVectorType parameters;
+ ChordLengths(points, parameters);
+ return InterpolateWithDerivatives(points, derivatives, derivativeIndices, degree, parameters);
+ }
+}
+
+#endif // EIGEN_SPLINE_FITTING_H
diff --git a/src/EigenUnsupported/src/Splines/SplineFwd.h b/src/EigenUnsupported/src/Splines/SplineFwd.h
new file mode 100644
index 0000000..00d6b49
--- /dev/null
+++ b/src/EigenUnsupported/src/Splines/SplineFwd.h
@@ -0,0 +1,93 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 20010-2011 Hauke Heibel <hauke.heibel@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPLINES_FWD_H
+#define EIGEN_SPLINES_FWD_H
+
+#include "../../../../Eigen/Core"
+
+namespace Eigen
+{
+ template <typename Scalar, int Dim, int Degree = Dynamic> class Spline;
+
+ template < typename SplineType, int DerivativeOrder = Dynamic > struct SplineTraits {};
+
+ /**
+ * \ingroup Splines_Module
+ * \brief Compile-time attributes of the Spline class for Dynamic degree.
+ **/
+ template <typename _Scalar, int _Dim, int _Degree>
+ struct SplineTraits< Spline<_Scalar, _Dim, _Degree>, Dynamic >
+ {
+ typedef _Scalar Scalar; /*!< The spline curve's scalar type. */
+ enum { Dimension = _Dim /*!< The spline curve's dimension. */ };
+ enum { Degree = _Degree /*!< The spline curve's degree. */ };
+
+ enum { OrderAtCompileTime = _Degree==Dynamic ? Dynamic : _Degree+1 /*!< The spline curve's order at compile-time. */ };
+ enum { NumOfDerivativesAtCompileTime = OrderAtCompileTime /*!< The number of derivatives defined for the current spline. */ };
+
+ enum { DerivativeMemoryLayout = Dimension==1 ? RowMajor : ColMajor /*!< The derivative type's memory layout. */ };
+
+ /** \brief The data type used to store non-zero basis functions. */
+ typedef Array<Scalar,1,OrderAtCompileTime> BasisVectorType;
+
+ /** \brief The data type used to store the values of the basis function derivatives. */
+ typedef Array<Scalar,Dynamic,Dynamic,RowMajor,NumOfDerivativesAtCompileTime,OrderAtCompileTime> BasisDerivativeType;
+
+ /** \brief The data type used to store the spline's derivative values. */
+ typedef Array<Scalar,Dimension,Dynamic,DerivativeMemoryLayout,Dimension,NumOfDerivativesAtCompileTime> DerivativeType;
+
+ /** \brief The point type the spline is representing. */
+ typedef Array<Scalar,Dimension,1> PointType;
+
+ /** \brief The data type used to store knot vectors. */
+ typedef Array<Scalar,1,Dynamic> KnotVectorType;
+
+ /** \brief The data type used to store parameter vectors. */
+ typedef Array<Scalar,1,Dynamic> ParameterVectorType;
+
+ /** \brief The data type representing the spline's control points. */
+ typedef Array<Scalar,Dimension,Dynamic> ControlPointVectorType;
+ };
+
+ /**
+ * \ingroup Splines_Module
+ * \brief Compile-time attributes of the Spline class for fixed degree.
+ *
+ * The traits class inherits all attributes from the SplineTraits of Dynamic degree.
+ **/
+ template < typename _Scalar, int _Dim, int _Degree, int _DerivativeOrder >
+ struct SplineTraits< Spline<_Scalar, _Dim, _Degree>, _DerivativeOrder > : public SplineTraits< Spline<_Scalar, _Dim, _Degree> >
+ {
+ enum { OrderAtCompileTime = _Degree==Dynamic ? Dynamic : _Degree+1 /*!< The spline curve's order at compile-time. */ };
+ enum { NumOfDerivativesAtCompileTime = _DerivativeOrder==Dynamic ? Dynamic : _DerivativeOrder+1 /*!< The number of derivatives defined for the current spline. */ };
+
+ enum { DerivativeMemoryLayout = _Dim==1 ? RowMajor : ColMajor /*!< The derivative type's memory layout. */ };
+
+ /** \brief The data type used to store the values of the basis function derivatives. */
+ typedef Array<_Scalar,Dynamic,Dynamic,RowMajor,NumOfDerivativesAtCompileTime,OrderAtCompileTime> BasisDerivativeType;
+
+ /** \brief The data type used to store the spline's derivative values. */
+ typedef Array<_Scalar,_Dim,Dynamic,DerivativeMemoryLayout,_Dim,NumOfDerivativesAtCompileTime> DerivativeType;
+ };
+
+ /** \brief 2D float B-spline with dynamic degree. */
+ typedef Spline<float,2> Spline2f;
+
+ /** \brief 3D float B-spline with dynamic degree. */
+ typedef Spline<float,3> Spline3f;
+
+ /** \brief 2D double B-spline with dynamic degree. */
+ typedef Spline<double,2> Spline2d;
+
+ /** \brief 3D double B-spline with dynamic degree. */
+ typedef Spline<double,3> Spline3d;
+}
+
+#endif // EIGEN_SPLINES_FWD_H