From eda5bc26f44ee9a6f83dcf8c91f17296d7fc509d Mon Sep 17 00:00:00 2001 From: Nao Pross Date: Mon, 12 Feb 2024 14:52:43 +0100 Subject: Move into version control --- src/EigenUnsupported/AdolcForward | 159 ++ src/EigenUnsupported/AlignedVector3 | 234 +++ src/EigenUnsupported/ArpackSupport | 30 + src/EigenUnsupported/AutoDiff | 46 + src/EigenUnsupported/BVH | 95 + src/EigenUnsupported/CMakeLists.txt | 32 + src/EigenUnsupported/CXX11/CMakeLists.txt | 8 + src/EigenUnsupported/CXX11/Tensor | 137 ++ src/EigenUnsupported/CXX11/TensorSymmetry | 42 + src/EigenUnsupported/CXX11/ThreadPool | 74 + src/EigenUnsupported/CXX11/src/Tensor/README.md | 1815 +++++++++++++++++ src/EigenUnsupported/CXX11/src/Tensor/Tensor.h | 554 ++++++ .../CXX11/src/Tensor/TensorArgMax.h | 329 ++++ .../CXX11/src/Tensor/TensorAssign.h | 247 +++ src/EigenUnsupported/CXX11/src/Tensor/TensorBase.h | 1176 +++++++++++ .../CXX11/src/Tensor/TensorBlock.h | 1559 +++++++++++++++ .../CXX11/src/Tensor/TensorBroadcasting.h | 1093 +++++++++++ .../CXX11/src/Tensor/TensorChipping.h | 518 +++++ .../CXX11/src/Tensor/TensorConcatenation.h | 377 ++++ .../CXX11/src/Tensor/TensorContraction.h | 1023 ++++++++++ .../CXX11/src/Tensor/TensorContractionBlocking.h | 73 + .../CXX11/src/Tensor/TensorContractionCuda.h | 6 + .../CXX11/src/Tensor/TensorContractionGpu.h | 1413 ++++++++++++++ .../CXX11/src/Tensor/TensorContractionMapper.h | 575 ++++++ .../CXX11/src/Tensor/TensorContractionSycl.h | 1650 ++++++++++++++++ .../CXX11/src/Tensor/TensorContractionThreadPool.h | 1679 ++++++++++++++++ .../CXX11/src/Tensor/TensorConversion.h | 456 +++++ .../CXX11/src/Tensor/TensorConvolution.h | 1132 +++++++++++ .../CXX11/src/Tensor/TensorConvolutionSycl.h | 544 ++++++ .../CXX11/src/Tensor/TensorCostModel.h | 214 ++ .../CXX11/src/Tensor/TensorCustomOp.h | 347 ++++ .../CXX11/src/Tensor/TensorDevice.h | 137 ++ .../CXX11/src/Tensor/TensorDeviceCuda.h | 6 + .../CXX11/src/Tensor/TensorDeviceDefault.h | 104 + .../CXX11/src/Tensor/TensorDeviceGpu.h | 
389 ++++ .../CXX11/src/Tensor/TensorDeviceSycl.h | 1048 ++++++++++ .../CXX11/src/Tensor/TensorDeviceThreadPool.h | 409 ++++ .../CXX11/src/Tensor/TensorDimensionList.h | 236 +++ .../CXX11/src/Tensor/TensorDimensions.h | 490 +++++ .../CXX11/src/Tensor/TensorEvalTo.h | 236 +++ .../CXX11/src/Tensor/TensorEvaluator.h | 983 ++++++++++ .../CXX11/src/Tensor/TensorExecutor.h | 703 +++++++ src/EigenUnsupported/CXX11/src/Tensor/TensorExpr.h | 388 ++++ src/EigenUnsupported/CXX11/src/Tensor/TensorFFT.h | 669 +++++++ .../CXX11/src/Tensor/TensorFixedSize.h | 379 ++++ .../CXX11/src/Tensor/TensorForcedEval.h | 237 +++ .../CXX11/src/Tensor/TensorForwardDeclarations.h | 191 ++ .../CXX11/src/Tensor/TensorFunctors.h | 488 +++++ .../CXX11/src/Tensor/TensorGenerator.h | 302 +++ .../CXX11/src/Tensor/TensorGlobalFunctions.h | 33 + .../CXX11/src/Tensor/TensorGpuHipCudaDefines.h | 99 + .../CXX11/src/Tensor/TensorGpuHipCudaUndefines.h | 44 + src/EigenUnsupported/CXX11/src/Tensor/TensorIO.h | 79 + .../CXX11/src/Tensor/TensorImagePatch.h | 603 ++++++ .../CXX11/src/Tensor/TensorIndexList.h | 738 +++++++ .../CXX11/src/Tensor/TensorInflation.h | 247 +++ .../CXX11/src/Tensor/TensorInitializer.h | 82 + .../CXX11/src/Tensor/TensorIntDiv.h | 263 +++ .../CXX11/src/Tensor/TensorLayoutSwap.h | 216 +++ .../CXX11/src/Tensor/TensorMacros.h | 98 + src/EigenUnsupported/CXX11/src/Tensor/TensorMap.h | 327 ++++ src/EigenUnsupported/CXX11/src/Tensor/TensorMeta.h | 311 +++ .../CXX11/src/Tensor/TensorMorphing.h | 1102 +++++++++++ .../CXX11/src/Tensor/TensorPadding.h | 708 +++++++ .../CXX11/src/Tensor/TensorPatch.h | 291 +++ .../CXX11/src/Tensor/TensorRandom.h | 322 +++ .../CXX11/src/Tensor/TensorReduction.h | 998 ++++++++++ .../CXX11/src/Tensor/TensorReductionCuda.h | 6 + .../CXX11/src/Tensor/TensorReductionGpu.h | 966 +++++++++ .../CXX11/src/Tensor/TensorReductionSycl.h | 582 ++++++ src/EigenUnsupported/CXX11/src/Tensor/TensorRef.h | 454 +++++ .../CXX11/src/Tensor/TensorReverse.h | 465 +++++ 
src/EigenUnsupported/CXX11/src/Tensor/TensorScan.h | 528 +++++ .../CXX11/src/Tensor/TensorScanSycl.h | 513 +++++ .../CXX11/src/Tensor/TensorShuffling.h | 471 +++++ .../CXX11/src/Tensor/TensorStorage.h | 161 ++ .../CXX11/src/Tensor/TensorStriding.h | 346 ++++ .../CXX11/src/Tensor/TensorTrace.h | 303 +++ .../CXX11/src/Tensor/TensorTraits.h | 264 +++ .../CXX11/src/Tensor/TensorUInt128.h | 249 +++ .../CXX11/src/Tensor/TensorVolumePatch.h | 629 ++++++ .../CXX11/src/TensorSymmetry/DynamicSymmetry.h | 293 +++ .../CXX11/src/TensorSymmetry/StaticSymmetry.h | 236 +++ .../CXX11/src/TensorSymmetry/Symmetry.h | 338 ++++ .../src/TensorSymmetry/util/TemplateGroupTheory.h | 669 +++++++ .../CXX11/src/ThreadPool/Barrier.h | 67 + .../CXX11/src/ThreadPool/EventCount.h | 249 +++ .../CXX11/src/ThreadPool/NonBlockingThreadPool.h | 486 +++++ .../CXX11/src/ThreadPool/RunQueue.h | 236 +++ .../CXX11/src/ThreadPool/ThreadCancel.h | 23 + .../CXX11/src/ThreadPool/ThreadEnvironment.h | 40 + .../CXX11/src/ThreadPool/ThreadLocal.h | 301 +++ .../CXX11/src/ThreadPool/ThreadPoolInterface.h | 48 + .../CXX11/src/ThreadPool/ThreadYield.h | 20 + src/EigenUnsupported/CXX11/src/util/CXX11Meta.h | 537 +++++ .../CXX11/src/util/CXX11Workarounds.h | 88 + src/EigenUnsupported/CXX11/src/util/EmulateArray.h | 261 +++ .../CXX11/src/util/MaxSizeVector.h | 158 ++ src/EigenUnsupported/EulerAngles | 43 + src/EigenUnsupported/FFT | 419 ++++ src/EigenUnsupported/IterativeSolvers | 51 + src/EigenUnsupported/KroneckerProduct | 36 + src/EigenUnsupported/LevenbergMarquardt | 49 + src/EigenUnsupported/MPRealSupport | 213 ++ src/EigenUnsupported/MatrixFunctions | 504 +++++ src/EigenUnsupported/MoreVectorization | 24 + src/EigenUnsupported/NonLinearOptimization | 140 ++ src/EigenUnsupported/NumericalDiff | 56 + src/EigenUnsupported/OpenGLSupport | 322 +++ src/EigenUnsupported/Polynomials | 137 ++ src/EigenUnsupported/Skyline | 39 + src/EigenUnsupported/SparseExtra | 54 + src/EigenUnsupported/SpecialFunctions | 103 + 
src/EigenUnsupported/Splines | 35 + .../src/AutoDiff/AutoDiffJacobian.h | 108 ++ src/EigenUnsupported/src/AutoDiff/AutoDiffScalar.h | 730 +++++++ src/EigenUnsupported/src/AutoDiff/AutoDiffVector.h | 220 +++ src/EigenUnsupported/src/BVH/BVAlgorithms.h | 293 +++ src/EigenUnsupported/src/BVH/KdBVH.h | 223 +++ .../src/Eigenvalues/ArpackSelfAdjointEigenSolver.h | 790 ++++++++ .../src/EulerAngles/CMakeLists.txt | 6 + src/EigenUnsupported/src/EulerAngles/EulerAngles.h | 355 ++++ src/EigenUnsupported/src/EulerAngles/EulerSystem.h | 305 +++ src/EigenUnsupported/src/FFT/ei_fftw_impl.h | 261 +++ src/EigenUnsupported/src/FFT/ei_kissfft_impl.h | 449 +++++ .../src/IterativeSolvers/ConstrainedConjGrad.h | 187 ++ src/EigenUnsupported/src/IterativeSolvers/DGMRES.h | 511 +++++ src/EigenUnsupported/src/IterativeSolvers/GMRES.h | 335 ++++ src/EigenUnsupported/src/IterativeSolvers/IDRS.h | 436 +++++ .../src/IterativeSolvers/IncompleteLU.h | 90 + .../src/IterativeSolvers/IterationController.h | 154 ++ src/EigenUnsupported/src/IterativeSolvers/MINRES.h | 267 +++ .../src/IterativeSolvers/Scaling.h | 193 ++ .../src/KroneckerProduct/KroneckerTensorProduct.h | 305 +++ .../src/LevenbergMarquardt/CopyrightMINPACK.txt | 52 + .../src/LevenbergMarquardt/LMcovar.h | 84 + .../src/LevenbergMarquardt/LMonestep.h | 202 ++ .../src/LevenbergMarquardt/LMpar.h | 160 ++ .../src/LevenbergMarquardt/LMqrsolv.h | 188 ++ .../src/LevenbergMarquardt/LevenbergMarquardt.h | 396 ++++ .../src/MatrixFunctions/MatrixExponential.h | 441 +++++ .../src/MatrixFunctions/MatrixFunction.h | 569 ++++++ .../src/MatrixFunctions/MatrixLogarithm.h | 373 ++++ .../src/MatrixFunctions/MatrixPower.h | 705 +++++++ .../src/MatrixFunctions/MatrixSquareRoot.h | 368 ++++ .../src/MatrixFunctions/StemFunction.h | 117 ++ .../src/MoreVectorization/MathFunctions.h | 95 + .../NonLinearOptimization/HybridNonLinearSolver.h | 601 ++++++ .../src/NonLinearOptimization/LevenbergMarquardt.h | 657 +++++++ .../src/NonLinearOptimization/chkder.h | 66 + 
.../src/NonLinearOptimization/covar.h | 70 + .../src/NonLinearOptimization/dogleg.h | 107 + .../src/NonLinearOptimization/fdjac1.h | 79 + .../src/NonLinearOptimization/lmpar.h | 298 +++ .../src/NonLinearOptimization/qrsolv.h | 91 + .../src/NonLinearOptimization/r1mpyq.h | 30 + .../src/NonLinearOptimization/r1updt.h | 99 + .../src/NonLinearOptimization/rwupdt.h | 49 + .../src/NumericalDiff/NumericalDiff.h | 130 ++ src/EigenUnsupported/src/Polynomials/Companion.h | 280 +++ .../src/Polynomials/PolynomialSolver.h | 428 ++++ .../src/Polynomials/PolynomialUtils.h | 143 ++ .../src/Skyline/SkylineInplaceLU.h | 352 ++++ src/EigenUnsupported/src/Skyline/SkylineMatrix.h | 862 +++++++++ .../src/Skyline/SkylineMatrixBase.h | 212 ++ src/EigenUnsupported/src/Skyline/SkylineProduct.h | 295 +++ src/EigenUnsupported/src/Skyline/SkylineStorage.h | 259 +++ src/EigenUnsupported/src/Skyline/SkylineUtil.h | 89 + .../src/SparseExtra/BlockOfDynamicSparseMatrix.h | 122 ++ .../src/SparseExtra/BlockSparseMatrix.h | 1079 +++++++++++ .../src/SparseExtra/DynamicSparseMatrix.h | 404 ++++ src/EigenUnsupported/src/SparseExtra/MarketIO.h | 282 +++ .../src/SparseExtra/MatrixMarketIterator.h | 247 +++ .../src/SparseExtra/RandomSetter.h | 349 ++++ .../src/SpecialFunctions/BesselFunctionsArrayAPI.h | 286 +++ .../src/SpecialFunctions/BesselFunctionsBFloat16.h | 68 + .../src/SpecialFunctions/BesselFunctionsFunctors.h | 357 ++++ .../src/SpecialFunctions/BesselFunctionsHalf.h | 66 + .../src/SpecialFunctions/BesselFunctionsImpl.h | 1959 +++++++++++++++++++ .../SpecialFunctions/BesselFunctionsPacketMath.h | 118 ++ .../src/SpecialFunctions/HipVectorCompatibility.h | 67 + .../SpecialFunctions/SpecialFunctionsArrayAPI.h | 167 ++ .../SpecialFunctions/SpecialFunctionsBFloat16.h | 58 + .../SpecialFunctions/SpecialFunctionsFunctors.h | 330 ++++ .../src/SpecialFunctions/SpecialFunctionsHalf.h | 58 + .../src/SpecialFunctions/SpecialFunctionsImpl.h | 2045 ++++++++++++++++++++ 
.../SpecialFunctions/SpecialFunctionsPacketMath.h | 79 + .../SpecialFunctions/arch/AVX/BesselFunctions.h | 46 + .../SpecialFunctions/arch/AVX/SpecialFunctions.h | 16 + .../SpecialFunctions/arch/AVX512/BesselFunctions.h | 46 + .../arch/AVX512/SpecialFunctions.h | 16 + .../SpecialFunctions/arch/GPU/SpecialFunctions.h | 369 ++++ .../SpecialFunctions/arch/NEON/BesselFunctions.h | 54 + .../SpecialFunctions/arch/NEON/SpecialFunctions.h | 34 + src/EigenUnsupported/src/Splines/Spline.h | 507 +++++ src/EigenUnsupported/src/Splines/SplineFitting.h | 431 +++++ src/EigenUnsupported/src/Splines/SplineFwd.h | 93 + 197 files changed, 68723 insertions(+) create mode 100644 src/EigenUnsupported/AdolcForward create mode 100644 src/EigenUnsupported/AlignedVector3 create mode 100644 src/EigenUnsupported/ArpackSupport create mode 100644 src/EigenUnsupported/AutoDiff create mode 100644 src/EigenUnsupported/BVH create mode 100644 src/EigenUnsupported/CMakeLists.txt create mode 100644 src/EigenUnsupported/CXX11/CMakeLists.txt create mode 100644 src/EigenUnsupported/CXX11/Tensor create mode 100644 src/EigenUnsupported/CXX11/TensorSymmetry create mode 100644 src/EigenUnsupported/CXX11/ThreadPool create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/README.md create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/Tensor.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorArgMax.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorAssign.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorBase.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorBlock.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorBroadcasting.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorChipping.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorConcatenation.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorContraction.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorContractionBlocking.h 
create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorContractionCuda.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorContractionGpu.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorContractionMapper.h create mode 100755 src/EigenUnsupported/CXX11/src/Tensor/TensorContractionSycl.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorContractionThreadPool.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorConversion.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorConvolution.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorConvolutionSycl.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorCostModel.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorCustomOp.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorDevice.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceCuda.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceDefault.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceGpu.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceSycl.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceThreadPool.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorDimensionList.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorDimensions.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorEvalTo.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorEvaluator.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorExecutor.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorExpr.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorFFT.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorFixedSize.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorForcedEval.h create mode 100644 
src/EigenUnsupported/CXX11/src/Tensor/TensorForwardDeclarations.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorFunctors.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorGenerator.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorGlobalFunctions.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorGpuHipCudaDefines.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorIO.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorImagePatch.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorIndexList.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorInflation.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorInitializer.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorIntDiv.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorLayoutSwap.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorMacros.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorMap.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorMeta.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorMorphing.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorPadding.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorPatch.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorRandom.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorReduction.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorReductionCuda.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorReductionGpu.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorReductionSycl.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorRef.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorReverse.h create mode 100644 
src/EigenUnsupported/CXX11/src/Tensor/TensorScan.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorScanSycl.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorShuffling.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorStorage.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorStriding.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorTrace.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorTraits.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorUInt128.h create mode 100644 src/EigenUnsupported/CXX11/src/Tensor/TensorVolumePatch.h create mode 100644 src/EigenUnsupported/CXX11/src/TensorSymmetry/DynamicSymmetry.h create mode 100644 src/EigenUnsupported/CXX11/src/TensorSymmetry/StaticSymmetry.h create mode 100644 src/EigenUnsupported/CXX11/src/TensorSymmetry/Symmetry.h create mode 100644 src/EigenUnsupported/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h create mode 100644 src/EigenUnsupported/CXX11/src/ThreadPool/Barrier.h create mode 100644 src/EigenUnsupported/CXX11/src/ThreadPool/EventCount.h create mode 100644 src/EigenUnsupported/CXX11/src/ThreadPool/NonBlockingThreadPool.h create mode 100644 src/EigenUnsupported/CXX11/src/ThreadPool/RunQueue.h create mode 100644 src/EigenUnsupported/CXX11/src/ThreadPool/ThreadCancel.h create mode 100644 src/EigenUnsupported/CXX11/src/ThreadPool/ThreadEnvironment.h create mode 100644 src/EigenUnsupported/CXX11/src/ThreadPool/ThreadLocal.h create mode 100644 src/EigenUnsupported/CXX11/src/ThreadPool/ThreadPoolInterface.h create mode 100644 src/EigenUnsupported/CXX11/src/ThreadPool/ThreadYield.h create mode 100644 src/EigenUnsupported/CXX11/src/util/CXX11Meta.h create mode 100644 src/EigenUnsupported/CXX11/src/util/CXX11Workarounds.h create mode 100644 src/EigenUnsupported/CXX11/src/util/EmulateArray.h create mode 100644 src/EigenUnsupported/CXX11/src/util/MaxSizeVector.h create mode 100644 src/EigenUnsupported/EulerAngles 
create mode 100644 src/EigenUnsupported/FFT create mode 100644 src/EigenUnsupported/IterativeSolvers create mode 100644 src/EigenUnsupported/KroneckerProduct create mode 100644 src/EigenUnsupported/LevenbergMarquardt create mode 100644 src/EigenUnsupported/MPRealSupport create mode 100644 src/EigenUnsupported/MatrixFunctions create mode 100644 src/EigenUnsupported/MoreVectorization create mode 100644 src/EigenUnsupported/NonLinearOptimization create mode 100644 src/EigenUnsupported/NumericalDiff create mode 100644 src/EigenUnsupported/OpenGLSupport create mode 100644 src/EigenUnsupported/Polynomials create mode 100644 src/EigenUnsupported/Skyline create mode 100644 src/EigenUnsupported/SparseExtra create mode 100644 src/EigenUnsupported/SpecialFunctions create mode 100644 src/EigenUnsupported/Splines create mode 100644 src/EigenUnsupported/src/AutoDiff/AutoDiffJacobian.h create mode 100755 src/EigenUnsupported/src/AutoDiff/AutoDiffScalar.h create mode 100644 src/EigenUnsupported/src/AutoDiff/AutoDiffVector.h create mode 100644 src/EigenUnsupported/src/BVH/BVAlgorithms.h create mode 100644 src/EigenUnsupported/src/BVH/KdBVH.h create mode 100644 src/EigenUnsupported/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h create mode 100644 src/EigenUnsupported/src/EulerAngles/CMakeLists.txt create mode 100644 src/EigenUnsupported/src/EulerAngles/EulerAngles.h create mode 100644 src/EigenUnsupported/src/EulerAngles/EulerSystem.h create mode 100644 src/EigenUnsupported/src/FFT/ei_fftw_impl.h create mode 100644 src/EigenUnsupported/src/FFT/ei_kissfft_impl.h create mode 100644 src/EigenUnsupported/src/IterativeSolvers/ConstrainedConjGrad.h create mode 100644 src/EigenUnsupported/src/IterativeSolvers/DGMRES.h create mode 100644 src/EigenUnsupported/src/IterativeSolvers/GMRES.h create mode 100755 src/EigenUnsupported/src/IterativeSolvers/IDRS.h create mode 100644 src/EigenUnsupported/src/IterativeSolvers/IncompleteLU.h create mode 100644 
src/EigenUnsupported/src/IterativeSolvers/IterationController.h create mode 100644 src/EigenUnsupported/src/IterativeSolvers/MINRES.h create mode 100644 src/EigenUnsupported/src/IterativeSolvers/Scaling.h create mode 100644 src/EigenUnsupported/src/KroneckerProduct/KroneckerTensorProduct.h create mode 100644 src/EigenUnsupported/src/LevenbergMarquardt/CopyrightMINPACK.txt create mode 100644 src/EigenUnsupported/src/LevenbergMarquardt/LMcovar.h create mode 100644 src/EigenUnsupported/src/LevenbergMarquardt/LMonestep.h create mode 100644 src/EigenUnsupported/src/LevenbergMarquardt/LMpar.h create mode 100644 src/EigenUnsupported/src/LevenbergMarquardt/LMqrsolv.h create mode 100644 src/EigenUnsupported/src/LevenbergMarquardt/LevenbergMarquardt.h create mode 100644 src/EigenUnsupported/src/MatrixFunctions/MatrixExponential.h create mode 100644 src/EigenUnsupported/src/MatrixFunctions/MatrixFunction.h create mode 100644 src/EigenUnsupported/src/MatrixFunctions/MatrixLogarithm.h create mode 100644 src/EigenUnsupported/src/MatrixFunctions/MatrixPower.h create mode 100644 src/EigenUnsupported/src/MatrixFunctions/MatrixSquareRoot.h create mode 100644 src/EigenUnsupported/src/MatrixFunctions/StemFunction.h create mode 100644 src/EigenUnsupported/src/MoreVectorization/MathFunctions.h create mode 100644 src/EigenUnsupported/src/NonLinearOptimization/HybridNonLinearSolver.h create mode 100644 src/EigenUnsupported/src/NonLinearOptimization/LevenbergMarquardt.h create mode 100644 src/EigenUnsupported/src/NonLinearOptimization/chkder.h create mode 100644 src/EigenUnsupported/src/NonLinearOptimization/covar.h create mode 100644 src/EigenUnsupported/src/NonLinearOptimization/dogleg.h create mode 100644 src/EigenUnsupported/src/NonLinearOptimization/fdjac1.h create mode 100644 src/EigenUnsupported/src/NonLinearOptimization/lmpar.h create mode 100644 src/EigenUnsupported/src/NonLinearOptimization/qrsolv.h create mode 100644 src/EigenUnsupported/src/NonLinearOptimization/r1mpyq.h create 
mode 100644 src/EigenUnsupported/src/NonLinearOptimization/r1updt.h create mode 100644 src/EigenUnsupported/src/NonLinearOptimization/rwupdt.h create mode 100644 src/EigenUnsupported/src/NumericalDiff/NumericalDiff.h create mode 100644 src/EigenUnsupported/src/Polynomials/Companion.h create mode 100644 src/EigenUnsupported/src/Polynomials/PolynomialSolver.h create mode 100644 src/EigenUnsupported/src/Polynomials/PolynomialUtils.h create mode 100644 src/EigenUnsupported/src/Skyline/SkylineInplaceLU.h create mode 100644 src/EigenUnsupported/src/Skyline/SkylineMatrix.h create mode 100644 src/EigenUnsupported/src/Skyline/SkylineMatrixBase.h create mode 100644 src/EigenUnsupported/src/Skyline/SkylineProduct.h create mode 100644 src/EigenUnsupported/src/Skyline/SkylineStorage.h create mode 100644 src/EigenUnsupported/src/Skyline/SkylineUtil.h create mode 100644 src/EigenUnsupported/src/SparseExtra/BlockOfDynamicSparseMatrix.h create mode 100644 src/EigenUnsupported/src/SparseExtra/BlockSparseMatrix.h create mode 100644 src/EigenUnsupported/src/SparseExtra/DynamicSparseMatrix.h create mode 100644 src/EigenUnsupported/src/SparseExtra/MarketIO.h create mode 100644 src/EigenUnsupported/src/SparseExtra/MatrixMarketIterator.h create mode 100644 src/EigenUnsupported/src/SparseExtra/RandomSetter.h create mode 100644 src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsArrayAPI.h create mode 100644 src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsBFloat16.h create mode 100644 src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsFunctors.h create mode 100644 src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsHalf.h create mode 100644 src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsImpl.h create mode 100644 src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsPacketMath.h create mode 100644 src/EigenUnsupported/src/SpecialFunctions/HipVectorCompatibility.h create mode 100644 src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsArrayAPI.h 
create mode 100644 src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsBFloat16.h create mode 100644 src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsFunctors.h create mode 100644 src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsHalf.h create mode 100644 src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsImpl.h create mode 100644 src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsPacketMath.h create mode 100644 src/EigenUnsupported/src/SpecialFunctions/arch/AVX/BesselFunctions.h create mode 100644 src/EigenUnsupported/src/SpecialFunctions/arch/AVX/SpecialFunctions.h create mode 100644 src/EigenUnsupported/src/SpecialFunctions/arch/AVX512/BesselFunctions.h create mode 100644 src/EigenUnsupported/src/SpecialFunctions/arch/AVX512/SpecialFunctions.h create mode 100644 src/EigenUnsupported/src/SpecialFunctions/arch/GPU/SpecialFunctions.h create mode 100644 src/EigenUnsupported/src/SpecialFunctions/arch/NEON/BesselFunctions.h create mode 100644 src/EigenUnsupported/src/SpecialFunctions/arch/NEON/SpecialFunctions.h create mode 100644 src/EigenUnsupported/src/Splines/Spline.h create mode 100644 src/EigenUnsupported/src/Splines/SplineFitting.h create mode 100644 src/EigenUnsupported/src/Splines/SplineFwd.h (limited to 'src/EigenUnsupported') diff --git a/src/EigenUnsupported/AdolcForward b/src/EigenUnsupported/AdolcForward new file mode 100644 index 0000000..56caeae --- /dev/null +++ b/src/EigenUnsupported/AdolcForward @@ -0,0 +1,159 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2009 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_ADLOC_FORWARD +#define EIGEN_ADLOC_FORWARD + +//-------------------------------------------------------------------------------- +// +// This file provides support for adolc's adouble type in forward mode. +// ADOL-C is a C++ automatic differentiation library, +// see https://projects.coin-or.org/ADOL-C for more information. +// +// Note that the maximal number of directions is controlled by +// the preprocessor token NUMBER_DIRECTIONS. The default is 2. +// +//-------------------------------------------------------------------------------- + +#define ADOLC_TAPELESS +#ifndef NUMBER_DIRECTIONS +# define NUMBER_DIRECTIONS 2 +#endif +#include + +// adolc defines some very stupid macros: +#if defined(malloc) +# undef malloc +#endif + +#if defined(calloc) +# undef calloc +#endif + +#if defined(realloc) +# undef realloc +#endif + +#include "../../Eigen/Core" + +namespace Eigen { + +/** + * \defgroup AdolcForward_Module Adolc forward module + * This module provides support for adolc's adouble type in forward mode. + * ADOL-C is a C++ automatic differentiation library, + * see https://projects.coin-or.org/ADOL-C for more information. + * It mainly consists in: + * - a struct Eigen::NumTraits specialization + * - overloads of internal::* math function for adtl::adouble type. + * + * Note that the maximal number of directions is controlled by + * the preprocessor token NUMBER_DIRECTIONS. The default is 2. 
+ * + * \code + * #include + * \endcode + */ + //@{ + +} // namespace Eigen + +// Eigen's require a few additional functions which must be defined in the same namespace +// than the custom scalar type own namespace +namespace adtl { + +inline const adouble& conj(const adouble& x) { return x; } +inline const adouble& real(const adouble& x) { return x; } +inline adouble imag(const adouble&) { return 0.; } +inline adouble abs(const adouble& x) { return fabs(x); } +inline adouble abs2(const adouble& x) { return x*x; } + +inline bool (isinf)(const adouble& x) { return (Eigen::numext::isinf)(x.getValue()); } +inline bool (isnan)(const adouble& x) { return (Eigen::numext::isnan)(x.getValue()); } + +} + +namespace Eigen { + +template<> struct NumTraits + : NumTraits +{ + typedef adtl::adouble Real; + typedef adtl::adouble NonInteger; + typedef adtl::adouble Nested; + enum { + IsComplex = 0, + IsInteger = 0, + IsSigned = 1, + RequireInitialization = 1, + ReadCost = 1, + AddCost = 1, + MulCost = 1 + }; +}; + +template class AdolcForwardJacobian : public Functor +{ + typedef adtl::adouble ActiveScalar; +public: + + AdolcForwardJacobian() : Functor() {} + AdolcForwardJacobian(const Functor& f) : Functor(f) {} + + // forward constructors + template + AdolcForwardJacobian(const T0& a0) : Functor(a0) {} + template + AdolcForwardJacobian(const T0& a0, const T1& a1) : Functor(a0, a1) {} + template + AdolcForwardJacobian(const T0& a0, const T1& a1, const T1& a2) : Functor(a0, a1, a2) {} + + typedef typename Functor::InputType InputType; + typedef typename Functor::ValueType ValueType; + typedef typename Functor::JacobianType JacobianType; + + typedef Matrix ActiveInput; + typedef Matrix ActiveValue; + + void operator() (const InputType& x, ValueType* v, JacobianType* _jac) const + { + eigen_assert(v!=0); + if (!_jac) + { + Functor::operator()(x, v); + return; + } + + JacobianType& jac = *_jac; + + ActiveInput ax = x.template cast(); + ActiveValue av(jac.rows()); + + for (int j=0; j 
+// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_ALIGNED_VECTOR3 +#define EIGEN_ALIGNED_VECTOR3 + +#include "../../Eigen/Geometry" + +#include "../../Eigen/src/Core/util/DisableStupidWarnings.h" + +namespace Eigen { + +/** + * \defgroup AlignedVector3_Module Aligned vector3 module + * + * \code + * #include + * \endcode + */ + //@{ + + +/** \class AlignedVector3 + * + * \brief A vectorization friendly 3D vector + * + * This class represents a 3D vector internally using a 4D vector + * such that vectorization can be seamlessly enabled. Of course, + * the same result can be achieved by directly using a 4D vector. + * This class makes this process simpler. + * + */ +// TODO specialize Cwise +template class AlignedVector3; + +namespace internal { +template struct traits > + : traits > +{ +}; +} + +template class AlignedVector3 + : public MatrixBase > +{ + typedef Matrix<_Scalar,4,1> CoeffType; + CoeffType m_coeffs; + public: + + typedef MatrixBase > Base; + EIGEN_DENSE_PUBLIC_INTERFACE(AlignedVector3) + using Base::operator*; + + inline Index rows() const { return 3; } + inline Index cols() const { return 1; } + + Scalar* data() { return m_coeffs.data(); } + const Scalar* data() const { return m_coeffs.data(); } + Index innerStride() const { return 1; } + Index outerStride() const { return 3; } + + inline const Scalar& coeff(Index row, Index col) const + { return m_coeffs.coeff(row, col); } + + inline Scalar& coeffRef(Index row, Index col) + { return m_coeffs.coeffRef(row, col); } + + inline const Scalar& coeff(Index index) const + { return m_coeffs.coeff(index); } + + inline Scalar& coeffRef(Index index) + { return m_coeffs.coeffRef(index);} + + + inline AlignedVector3() + {} + + inline AlignedVector3(const Scalar& x, const Scalar& y, const Scalar& z) + : m_coeffs(x, y, z, Scalar(0)) + {} + + 
inline AlignedVector3(const AlignedVector3& other) + : Base(), m_coeffs(other.m_coeffs) + {} + + template + struct generic_assign_selector {}; + + template struct generic_assign_selector + { + inline static void run(AlignedVector3& dest, const XprType& src) + { + dest.m_coeffs = src; + } + }; + + template struct generic_assign_selector + { + inline static void run(AlignedVector3& dest, const XprType& src) + { + dest.m_coeffs.template head<3>() = src; + dest.m_coeffs.w() = Scalar(0); + } + }; + + template + inline AlignedVector3(const MatrixBase& other) + { + generic_assign_selector::run(*this,other.derived()); + } + + inline AlignedVector3& operator=(const AlignedVector3& other) + { m_coeffs = other.m_coeffs; return *this; } + + template + inline AlignedVector3& operator=(const MatrixBase& other) + { + generic_assign_selector::run(*this,other.derived()); + return *this; + } + + inline AlignedVector3 operator+(const AlignedVector3& other) const + { return AlignedVector3(m_coeffs + other.m_coeffs); } + + inline AlignedVector3& operator+=(const AlignedVector3& other) + { m_coeffs += other.m_coeffs; return *this; } + + inline AlignedVector3 operator-(const AlignedVector3& other) const + { return AlignedVector3(m_coeffs - other.m_coeffs); } + + inline AlignedVector3 operator-() const + { return AlignedVector3(-m_coeffs); } + + inline AlignedVector3& operator-=(const AlignedVector3& other) + { m_coeffs -= other.m_coeffs; return *this; } // returns *this by reference (not a copy), consistent with +=, *= and /= + + inline AlignedVector3 operator*(const Scalar& s) const + { return AlignedVector3(m_coeffs * s); } + + inline friend AlignedVector3 operator*(const Scalar& s,const AlignedVector3& vec) + { return AlignedVector3(s * vec.m_coeffs); } + + inline AlignedVector3& operator*=(const Scalar& s) + { m_coeffs *= s; return *this; } + + inline AlignedVector3 operator/(const Scalar& s) const + { return AlignedVector3(m_coeffs / s); } + + inline AlignedVector3& operator/=(const Scalar& s) + { m_coeffs /= s; return *this; } + + inline Scalar dot(const
AlignedVector3& other) const + { + eigen_assert(m_coeffs.w()==Scalar(0)); + eigen_assert(other.m_coeffs.w()==Scalar(0)); + return m_coeffs.dot(other.m_coeffs); + } + + inline void normalize() + { + m_coeffs /= norm(); + } + + inline AlignedVector3 normalized() const + { + return AlignedVector3(m_coeffs / norm()); + } + + inline Scalar sum() const + { + eigen_assert(m_coeffs.w()==Scalar(0)); + return m_coeffs.sum(); + } + + inline Scalar squaredNorm() const + { + eigen_assert(m_coeffs.w()==Scalar(0)); + return m_coeffs.squaredNorm(); + } + + inline Scalar norm() const + { + using std::sqrt; + return sqrt(squaredNorm()); + } + + inline AlignedVector3 cross(const AlignedVector3& other) const + { + return AlignedVector3(m_coeffs.cross3(other.m_coeffs)); + } + + template + inline bool isApprox(const MatrixBase& other, const RealScalar& eps=NumTraits::dummy_precision()) const + { + return m_coeffs.template head<3>().isApprox(other,eps); + } + + CoeffType& coeffs() { return m_coeffs; } + const CoeffType& coeffs() const { return m_coeffs; } +}; + +namespace internal { + +template +struct eval, Dense> +{ + typedef const AlignedVector3<_Scalar>& type; +}; + +template +struct evaluator > + : evaluator > +{ + typedef AlignedVector3 XprType; + typedef evaluator > Base; + + evaluator(const XprType &m) : Base(m.coeffs()) {} +}; + +} + +//@} + +} + +#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h" + +#endif // EIGEN_ALIGNED_VECTOR3 diff --git a/src/EigenUnsupported/ArpackSupport b/src/EigenUnsupported/ArpackSupport new file mode 100644 index 0000000..67c4ac8 --- /dev/null +++ b/src/EigenUnsupported/ArpackSupport @@ -0,0 +1,30 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ +#ifndef EIGEN_ARPACKSUPPORT_MODULE_H +#define EIGEN_ARPACKSUPPORT_MODULE_H + +#include "../../Eigen/Core" + +/** \defgroup ArpackSupport_Module Arpack support module + * + * This module provides a wrapper to Arpack, a library for sparse eigenvalue decomposition. + * + * \code + * #include + * \endcode + */ + +#include "../../Eigen/SparseCholesky" + +#include "../../Eigen/src/Core/util/DisableStupidWarnings.h" +#include "src/Eigenvalues/ArpackSelfAdjointEigenSolver.h" + +#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h" + +#endif // EIGEN_ARPACKSUPPORT_MODULE_H diff --git a/src/EigenUnsupported/AutoDiff b/src/EigenUnsupported/AutoDiff new file mode 100644 index 0000000..7a4ff46 --- /dev/null +++ b/src/EigenUnsupported/AutoDiff @@ -0,0 +1,46 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2009 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_AUTODIFF_MODULE +#define EIGEN_AUTODIFF_MODULE + +namespace Eigen { + +/** + * \defgroup AutoDiff_Module Auto Diff module + * + * This module features forward automatic differentiation via a simple + * templated scalar type wrapper AutoDiffScalar. + * + * Warning : this should NOT be confused with numerical differentiation, which + * is a different method and has its own module in Eigen : \ref NumericalDiff_Module.
+ * + * \code + * #include + * \endcode + */ +//@{ + +} +#include "../../Eigen/src/Core/util/DisableStupidWarnings.h" + + +#include "src/AutoDiff/AutoDiffScalar.h" +// #include "src/AutoDiff/AutoDiffVector.h" +#include "src/AutoDiff/AutoDiffJacobian.h" + +#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h" + + + +namespace Eigen { +//@} +} + +#endif // EIGEN_AUTODIFF_MODULE diff --git a/src/EigenUnsupported/BVH b/src/EigenUnsupported/BVH new file mode 100644 index 0000000..666c983 --- /dev/null +++ b/src/EigenUnsupported/BVH @@ -0,0 +1,95 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009 Ilya Baran +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_BVH_MODULE_H +#define EIGEN_BVH_MODULE_H + +#include "../../Eigen/Core" +#include "../../Eigen/Geometry" +#include "../../Eigen/StdVector" +#include +#include + +namespace Eigen { + +/** + * \defgroup BVH_Module BVH module + * \brief This module provides generic bounding volume hierarchy algorithms + * and reference tree implementations. + * + * + * \code + * #include + * \endcode + * + * A bounding volume hierarchy (BVH) can accelerate many geometric queries. This module provides a generic implementation + * of the two basic algorithms over a BVH: intersection of a query object against all objects in the hierarchy and minimization + * of a function over the objects in the hierarchy. It also provides intersection and minimization over a cartesian product of + * two BVH's. A BVH accelerates intersection by using the fact that if a query object does not intersect a volume, then it cannot + * intersect any object contained in that volume. 
Similarly, a BVH accelerates minimization because the minimum of a function + * over a volume is no greater than the minimum of a function over any object contained in it. + * + * Some sample queries that can be written in terms of intersection are: + * - Determine all points where a ray intersects a triangle mesh + * - Given a set of points, determine which are contained in a query sphere + * - Given a set of spheres, determine which contain the query point + * - Given a set of disks, determine if any is completely contained in a query rectangle (represent each 2D disk as a point \f$(x,y,r)\f$ + * in 3D and represent the rectangle as a pyramid based on the original rectangle and shrinking in the \f$r\f$ direction) + * - Given a set of points, count how many pairs are \f$d\pm\epsilon\f$ apart (done by looking at the cartesian product of the set + * of points with itself) + * + * Some sample queries that can be written in terms of function minimization over a set of objects are: + * - Find the intersection between a ray and a triangle mesh closest to the ray origin (function is infinite off the ray) + * - Given a polyline and a query point, determine the closest point on the polyline to the query + * - Find the diameter of a point cloud (done by looking at the cartesian product and using negative distance as the function) + * - Determine how far two meshes are from colliding (this is also a cartesian product query) + * + * This implementation decouples the basic algorithms both from the type of hierarchy (and the types of the bounding volumes) and + * from the particulars of the query. To enable abstraction from the BVH, the BVH is required to implement a generic mechanism + * for traversal. To abstract from the query, the query is responsible for keeping track of results. 
+ * + * To be used in the algorithms, a hierarchy must implement the following traversal mechanism (see KdBVH for a sample implementation): \code + typedef Volume //the type of bounding volume + typedef Object //the type of object in the hierarchy + typedef Index //a reference to a node in the hierarchy--typically an int or a pointer + typedef VolumeIterator //an iterator type over node children--returns Index + typedef ObjectIterator //an iterator over object (leaf) children--returns const Object & + Index getRootIndex() const //returns the index of the hierarchy root + const Volume &getVolume(Index index) const //returns the bounding volume of the node at given index + void getChildren(Index index, VolumeIterator &outVBegin, VolumeIterator &outVEnd, + ObjectIterator &outOBegin, ObjectIterator &outOEnd) const + //getChildren takes a node index and makes [outVBegin, outVEnd) range over its node children + //and [outOBegin, outOEnd) range over its object children + \endcode + * + * To use the hierarchy, call BVIntersect or BVMinimize, passing it a BVH (or two, for cartesian product) and a minimizer or intersector. + * For an intersection query on a single BVH, the intersector encapsulates the query and must provide two functions: + * \code + bool intersectVolume(const Volume &volume) //returns true if the query intersects the volume + bool intersectObject(const Object &object) //returns true if the intersection search should terminate immediately + \endcode + * The guarantee that BVIntersect provides is that intersectObject will be called on every object whose bounding volume + * intersects the query (but possibly on other objects too) unless the search is terminated prematurely. It is the + * responsibility of the intersectObject function to keep track of the results in whatever manner is appropriate. + * The cartesian product intersection and the BVMinimize queries are similar--see their individual documentation. 
+ * + * The following is a simple but complete example for how to use the BVH to accelerate the search for a closest red-blue point pair: + * \include BVH_Example.cpp + * Output: \verbinclude BVH_Example.out + */ +} + +//@{ + +#include "src/BVH/BVAlgorithms.h" +#include "src/BVH/KdBVH.h" + +//@} + +#endif // EIGEN_BVH_MODULE_H diff --git a/src/EigenUnsupported/CMakeLists.txt b/src/EigenUnsupported/CMakeLists.txt new file mode 100644 index 0000000..631a060 --- /dev/null +++ b/src/EigenUnsupported/CMakeLists.txt @@ -0,0 +1,32 @@ +set(Eigen_HEADERS + AdolcForward + AlignedVector3 + ArpackSupport + AutoDiff + BVH + EulerAngles + FFT + IterativeSolvers + KroneckerProduct + LevenbergMarquardt + MatrixFunctions + MoreVectorization + MPRealSupport + NonLinearOptimization + NumericalDiff + OpenGLSupport + Polynomials + Skyline + SparseExtra + SpecialFunctions + Splines + ) + +install(FILES + ${Eigen_HEADERS} + DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel + ) + +install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel FILES_MATCHING PATTERN "*.h") + +add_subdirectory(CXX11) diff --git a/src/EigenUnsupported/CXX11/CMakeLists.txt b/src/EigenUnsupported/CXX11/CMakeLists.txt new file mode 100644 index 0000000..385ed24 --- /dev/null +++ b/src/EigenUnsupported/CXX11/CMakeLists.txt @@ -0,0 +1,8 @@ +set(Eigen_CXX11_HEADERS Tensor TensorSymmetry ThreadPool) + +install(FILES + ${Eigen_CXX11_HEADERS} + DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel + ) + +install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel FILES_MATCHING PATTERN "*.h") diff --git a/src/EigenUnsupported/CXX11/Tensor b/src/EigenUnsupported/CXX11/Tensor new file mode 100644 index 0000000..0938bb5 --- /dev/null +++ b/src/EigenUnsupported/CXX11/Tensor @@ -0,0 +1,137 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2014 Benoit Steiner +// Copyright (C) 2013 Christian Seiler +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +//#ifndef EIGEN_CXX11_TENSOR_MODULE +//#define EIGEN_CXX11_TENSOR_MODULE + +#include "../../../Eigen/Core" + +#if EIGEN_HAS_CXX11 + +#include "../SpecialFunctions" + +#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h" +#include "src/util/CXX11Meta.h" +#include "src/util/MaxSizeVector.h" + +/** \defgroup CXX11_Tensor_Module Tensor Module + * + * This module provides a Tensor class for storing arbitrarily indexed + * objects. + * + * \code + * #include + * \endcode + * + * Much of the documentation can be found \ref eigen_tensors "here". + */ + +#include +#include +#include +#include +#include +#include +#include + +#if defined(EIGEN_USE_THREADS) || defined(EIGEN_USE_SYCL) +#include "ThreadPool" +#endif + +#ifdef EIGEN_USE_GPU + #include + #if defined(EIGEN_USE_HIP) + #include + #else + #include + #endif +#endif + +#include "src/Tensor/TensorMacros.h" +#include "src/Tensor/TensorForwardDeclarations.h" +#include "src/Tensor/TensorMeta.h" +#include "src/Tensor/TensorFunctors.h" +#include "src/Tensor/TensorCostModel.h" +#include "src/Tensor/TensorDeviceDefault.h" +#include "src/Tensor/TensorDeviceThreadPool.h" +#include "src/Tensor/TensorDeviceGpu.h" +#ifndef gpu_assert +#define gpu_assert(x) +#endif +#include "src/Tensor/TensorDeviceSycl.h" +#include "src/Tensor/TensorIndexList.h" +#include "src/Tensor/TensorDimensionList.h" +#include "src/Tensor/TensorDimensions.h" +#include "src/Tensor/TensorInitializer.h" +#include "src/Tensor/TensorTraits.h" +#include "src/Tensor/TensorRandom.h" +#include "src/Tensor/TensorUInt128.h" +#include "src/Tensor/TensorIntDiv.h" +#include "src/Tensor/TensorGlobalFunctions.h" + +#include "src/Tensor/TensorBase.h" +#include 
"src/Tensor/TensorBlock.h" + +#include "src/Tensor/TensorEvaluator.h" +#include "src/Tensor/TensorExpr.h" +#include "src/Tensor/TensorReduction.h" +#include "src/Tensor/TensorReductionGpu.h" +#include "src/Tensor/TensorArgMax.h" +#include "src/Tensor/TensorConcatenation.h" +#include "src/Tensor/TensorContractionMapper.h" +#include "src/Tensor/TensorContractionBlocking.h" +#include "src/Tensor/TensorContraction.h" +#include "src/Tensor/TensorContractionThreadPool.h" +#include "src/Tensor/TensorContractionGpu.h" +#include "src/Tensor/TensorConversion.h" +#include "src/Tensor/TensorConvolution.h" +#include "src/Tensor/TensorFFT.h" +#include "src/Tensor/TensorPatch.h" +#include "src/Tensor/TensorImagePatch.h" +#include "src/Tensor/TensorVolumePatch.h" +#include "src/Tensor/TensorBroadcasting.h" +#include "src/Tensor/TensorChipping.h" +#include "src/Tensor/TensorInflation.h" +#include "src/Tensor/TensorLayoutSwap.h" +#include "src/Tensor/TensorMorphing.h" +#include "src/Tensor/TensorPadding.h" +#include "src/Tensor/TensorReverse.h" +#include "src/Tensor/TensorShuffling.h" +#include "src/Tensor/TensorStriding.h" +#include "src/Tensor/TensorCustomOp.h" +#include "src/Tensor/TensorEvalTo.h" +#include "src/Tensor/TensorForcedEval.h" +#include "src/Tensor/TensorGenerator.h" +#include "src/Tensor/TensorAssign.h" +#include "src/Tensor/TensorScan.h" +#include "src/Tensor/TensorTrace.h" + +#ifdef EIGEN_USE_SYCL +#include "src/Tensor/TensorReductionSycl.h" +#include "src/Tensor/TensorConvolutionSycl.h" +#include "src/Tensor/TensorContractionSycl.h" +#include "src/Tensor/TensorScanSycl.h" +#endif + +#include "src/Tensor/TensorExecutor.h" +#include "src/Tensor/TensorDevice.h" + +#include "src/Tensor/TensorStorage.h" +#include "src/Tensor/Tensor.h" +#include "src/Tensor/TensorFixedSize.h" +#include "src/Tensor/TensorMap.h" +#include "src/Tensor/TensorRef.h" + +#include "src/Tensor/TensorIO.h" + +#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h" + +#endif // 
EIGEN_HAS_CXX11 +//#endif // EIGEN_CXX11_TENSOR_MODULE diff --git a/src/EigenUnsupported/CXX11/TensorSymmetry b/src/EigenUnsupported/CXX11/TensorSymmetry new file mode 100644 index 0000000..b09c5e4 --- /dev/null +++ b/src/EigenUnsupported/CXX11/TensorSymmetry @@ -0,0 +1,42 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSORSYMMETRY_MODULE +#define EIGEN_CXX11_TENSORSYMMETRY_MODULE + +#include "Tensor" + +#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h" + +#include "src/util/CXX11Meta.h" + +/** \defgroup CXX11_TensorSymmetry_Module Tensor Symmetry Module + * + * This module provides classes that allow for the definition of + * symmetries w.r.t. tensor indices. + * + * Including this module will implicitly include the Tensor module. + * + * \code + * #include + * \endcode + */ + +#include "src/TensorSymmetry/util/TemplateGroupTheory.h" +#include "src/TensorSymmetry/Symmetry.h" +#include "src/TensorSymmetry/StaticSymmetry.h" +#include "src/TensorSymmetry/DynamicSymmetry.h" + +#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h" + +#endif // EIGEN_CXX11_TENSORSYMMETRY_MODULE + +/* + * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; + */ diff --git a/src/EigenUnsupported/CXX11/ThreadPool b/src/EigenUnsupported/CXX11/ThreadPool new file mode 100644 index 0000000..c5cafb2 --- /dev/null +++ b/src/EigenUnsupported/CXX11/ThreadPool @@ -0,0 +1,74 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0.
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_THREADPOOL_MODULE +#define EIGEN_CXX11_THREADPOOL_MODULE + +#include "../../../Eigen/Core" + +#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h" + +/** \defgroup CXX11_ThreadPool_Module C++11 ThreadPool Module + * + * This module provides 2 threadpool implementations + * - a simple reference implementation + * - a faster non blocking implementation + * + * This module requires C++11. + * + * \code + * #include + * \endcode + */ + + +// The code depends on CXX11, so only include the module if the +// compiler supports it. +#if (EIGEN_COMP_CXXVER >= 11) +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// There are non-parenthesized calls to "max" in the header, +// which trigger a check in test/main.h causing compilation to fail. +// We work around the check here by removing the check for max in +// the case where we have to emulate thread_local. 
+#ifdef max +#undef max +#endif +#include + +#include "src/util/CXX11Meta.h" +#include "src/util/MaxSizeVector.h" + +#include "src/ThreadPool/ThreadLocal.h" +#include "src/ThreadPool/ThreadYield.h" +#include "src/ThreadPool/ThreadCancel.h" +#include "src/ThreadPool/EventCount.h" +#include "src/ThreadPool/RunQueue.h" +#include "src/ThreadPool/ThreadPoolInterface.h" +#include "src/ThreadPool/ThreadEnvironment.h" +#include "src/ThreadPool/Barrier.h" +#include "src/ThreadPool/NonBlockingThreadPool.h" + +#endif + +#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h" + +#endif // EIGEN_CXX11_THREADPOOL_MODULE diff --git a/src/EigenUnsupported/CXX11/src/Tensor/README.md b/src/EigenUnsupported/CXX11/src/Tensor/README.md new file mode 100644 index 0000000..2f65b1b --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/README.md @@ -0,0 +1,1815 @@ +# Eigen Tensors {#eigen_tensors} + +Tensors are multidimensional arrays of elements. Elements are typically scalars, +but more complex types such as strings are also supported. + +## Tensor Classes + +You can manipulate a tensor with one of the following classes. They all are in +the namespace `::Eigen.` + + +### Class Tensor + +This is the class to use to create a tensor and allocate memory for it. The +class is templatized with the tensor datatype, such as float or int, and the +tensor rank. The rank is the number of dimensions, for example rank 2 is a +matrix. + +Tensors of this class are resizable. For example, if you assign a tensor of a +different size to a Tensor, that tensor is resized to match its new value. + +#### Constructor Tensor(size0, size1, ...) + +Constructor for a Tensor. The constructor must be passed `rank` integers +indicating the sizes of the instance along each of the `rank` +dimensions. + + // Create a tensor of rank 3 of sizes 2, 3, 4. This tensor owns + // memory to hold 24 floating point values (24 = 2 x 3 x 4).
+ Tensor t_3d(2, 3, 4); + + // Resize t_3d by assigning a tensor of different sizes, but same rank. + t_3d = Tensor(3, 4, 3); + +#### Constructor Tensor(size_array) + +Constructor where the sizes for the constructor are specified as an array of +values instead of an explicit list of parameters. The array type to use is +`Eigen::array`. The array can be constructed automatically +from an initializer list. + + // Create a tensor of strings of rank 2 with sizes 5, 7. + Tensor t_2d({5, 7}); + + +### Class TensorFixedSize> + +Class to use for tensors of fixed size, where the size is known at compile +time. Fixed sized tensors can provide very fast computations because all their +dimensions are known by the compiler. FixedSize tensors are not resizable. + +If the total number of elements in a fixed size tensor is small enough the +tensor data is held onto the stack and does not cause heap allocation and free. + + // Create a 4 x 3 tensor of floats. + TensorFixedSize> t_4x3; + +### Class TensorMap> + +This is the class to use to create a tensor on top of memory allocated and +owned by another part of your code. It allows to view any piece of allocated +memory as a Tensor. Instances of this class do not own the memory where the +data are stored. + +A TensorMap is not resizable because it does not own the memory where its data +are stored. + +#### Constructor TensorMap>(data, size0, size1, ...) + +Constructor for a Tensor. The constructor must be passed a pointer to the +storage for the data, and "rank" size attributes. The storage has to be +large enough to hold all the data. + + // Map a tensor of ints on top of stack-allocated storage. + int storage[128]; // 2 x 4 x 2 x 8 = 128 + TensorMap> t_4d(storage, 2, 4, 2, 8); + + // The same storage can be viewed as a different tensor. + // You can also pass the sizes as an array. + TensorMap> t_2d(storage, 16, 8); + + // You can also map fixed-size tensors. Here we get a 1d view of + // the 2d fixed-size tensor.
+ TensorFixedSize> t_4x3; + TensorMap> t_12(t_4x3.data(), 12); + + +#### Class TensorRef + +See Assigning to a TensorRef below. + +## Accessing Tensor Elements + +#### tensor(index0, index1...) + +Return the element at position `(index0, index1...)` in tensor +`tensor`. You must pass as many parameters as the rank of `tensor`. +The expression can be used as an l-value to set the value of the element at the +specified position. The value returned is of the datatype of the tensor. + + // Set the value of the element at position (0, 1, 0); + Tensor t_3d(2, 3, 4); + t_3d(0, 1, 0) = 12.0f; + + // Initialize all elements to random values. + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 4; ++k) { + t_3d(i, j, k) = ...some random value...; + } + } + } + + // Print elements of a tensor. + for (int i = 0; i < 2; ++i) { + LOG(INFO) << t_3d(i, 0, 0); + } + + +## TensorLayout + +The tensor library supports 2 layouts: `ColMajor` (the default) and +`RowMajor`. Only the default column major layout is currently fully +supported, and it is therefore not recommended to attempt to use the row major +layout at the moment. + +The layout of a tensor is optionally specified as part of its type. If not +specified explicitly column major is assumed. + + Tensor col_major; // equivalent to Tensor + TensorMap > row_major(data, ...); + +All the arguments to an expression must use the same layout. Attempting to mix +different layouts will result in a compilation error. + +It is possible to change the layout of a tensor or an expression using the +`swap_layout()` method. Note that this will also reverse the order of the +dimensions. 
+ + Tensor col_major(2, 4); + Tensor row_major(2, 4); + + Tensor col_major_result = col_major; // ok, layouts match + Tensor col_major_result = row_major; // will not compile + + // Simple layout swap + col_major_result = row_major.swap_layout(); + eigen_assert(col_major_result.dimension(0) == 4); + eigen_assert(col_major_result.dimension(1) == 2); + + // Swap the layout and preserve the order of the dimensions + array shuffle(1, 0); + col_major_result = row_major.swap_layout().shuffle(shuffle); + eigen_assert(col_major_result.dimension(0) == 2); + eigen_assert(col_major_result.dimension(1) == 4); + + +## Tensor Operations + +The Eigen Tensor library provides a vast library of operations on Tensors: +numerical operations such as addition and multiplication, geometry operations +such as slicing and shuffling, etc. These operations are available as methods +of the Tensor classes, and in some cases as operator overloads. For example +the following code computes the elementwise addition of two tensors: + + Tensor t1(2, 3, 4); + ...set some values in t1... + Tensor t2(2, 3, 4); + ...set some values in t2... + // Set t3 to the element wise sum of t1 and t2 + Tensor t3 = t1 + t2; + +While the code above looks easy enough, it is important to understand that the +expression `t1 + t2` is not actually adding the values of the tensors. The +expression instead constructs a "tensor operator" object of the class +TensorCwiseBinaryOp, which has references to the tensors +`t1` and `t2`. This is a small C++ object that knows how to add +`t1` and `t2`. It is only when the value of the expression is assigned +to the tensor `t3` that the addition is actually performed. Technically, +this happens through the overloading of `operator=()` in the Tensor class. + +This mechanism for computing tensor expressions allows for lazy evaluation and +optimizations which are what make the tensor library very fast. 
+ +Of course, the tensor operators do nest, and the expression `t1 + t2 * 0.3f` +is actually represented with the (approximate) tree of operators: + + TensorCwiseBinaryOp(t1, TensorCwiseUnaryOp(t2, 0.3f)) + + +### Tensor Operations and C++ "auto" + +Because Tensor operations create tensor operators, the C++ `auto` keyword +does not have its intuitive meaning. Consider these 2 lines of code: + + Tensor t3 = t1 + t2; + auto t4 = t1 + t2; + +In the first line we allocate the tensor `t3` and it will contain the +result of the addition of `t1` and `t2`. In the second line, `t4` +is actually the tree of tensor operators that will compute the addition of +`t1` and `t2`. In fact, `t4` is *not* a tensor and you cannot get +the values of its elements: + + Tensor t3 = t1 + t2; + cout << t3(0, 0, 0); // OK prints the value of t1(0, 0, 0) + t2(0, 0, 0) + + auto t4 = t1 + t2; + cout << t4(0, 0, 0); // Compilation error! + +When you use `auto` you do not get a Tensor as a result but instead a +non-evaluated expression. So only use `auto` to delay evaluation. + +Unfortunately, there is no single underlying concrete type for holding +non-evaluated expressions, hence you have to use auto in the case when you do +want to hold non-evaluated expressions. + +When you need the results of a set of tensor computations you have to assign the +result to a Tensor that will be capable of holding onto them. This can be +either a normal Tensor, a fixed size Tensor, or a TensorMap on an existing +piece of memory. All the following will work: + + auto t4 = t1 + t2; + + Tensor result = t4; // Could also be: result(t4); + cout << result(0, 0, 0); + + TensorMap result(, , ...) = t4; + cout << result(0, 0, 0); + + TensorFixedSize> result = t4; + cout << result(0, 0, 0); + +Until you need the results, you can keep the operation around, and even reuse +it for additional operations. As long as you keep the expression as an +operation, no computation is performed.
+ + // One way to compute exp((t1 + t2) * 0.2f); + auto t3 = t1 + t2; + auto t4 = t3 * 0.2f; + auto t5 = t4.exp(); + Tensor result = t5; + + // Another way, exactly as efficient as the previous one: + Tensor result = ((t1 + t2) * 0.2f).exp(); + +### Controlling When Expressions are Evaluated + +There are several ways to control when expressions are evaluated: + +* Assignment to a Tensor, TensorFixedSize, or TensorMap. +* Use of the eval() method. +* Assignment to a TensorRef. + +#### Assigning to a Tensor, TensorFixedSize, or TensorMap. + +The most common way to evaluate an expression is to assign it to a Tensor. In +the example below, the `auto` declarations make the intermediate values +"Operations", not Tensors, and do not cause the expressions to be evaluated. +The assignment to the Tensor `result` causes the evaluation of all the +operations. + + auto t3 = t1 + t2; // t3 is an Operation. + auto t4 = t3 * 0.2f; // t4 is an Operation. + auto t5 = t4.exp(); // t5 is an Operation. + Tensor result = t5; // The operations are evaluated. + +If you know the ranks and sizes of the Operation value you can assign the +Operation to a TensorFixedSize instead of a Tensor, which is a bit more +efficient. + + // We know that the result is a 4x4x2 tensor! + TensorFixedSize> result = t5; + +Similarly, assigning an expression to a TensorMap causes its evaluation. Like +tensors of type TensorFixedSize, TensorMaps cannot be resized so they have to +have the rank and sizes of the expression that are assigned to them. + +#### Calling eval(). + +When you compute large composite expressions, you sometimes want to tell Eigen +that an intermediate value in the expression tree is worth evaluating ahead of +time. This is done by inserting a call to the `eval()` method of the +expression Operation.
+ + // The previous example could have been written: + Tensor result = ((t1 + t2) * 0.2f).exp(); + + // If you want to compute (t1 + t2) once ahead of time you can write: + Tensor result = ((t1 + t2).eval() * 0.2f).exp(); + +Semantically, calling `eval()` is equivalent to materializing the value of +the expression in a temporary Tensor of the right size. The code above in +effect does: + + // .eval() knows the size! + TensorFixedSize> tmp = t1 + t2; + Tensor result = (tmp * 0.2f).exp(); + +Note that the return value of `eval()` is itself an Operation, so the +following code does not do what you may think: + + // Here t3 is an evaluation Operation. t3 has not been evaluated yet. + auto t3 = (t1 + t2).eval(); + + // You can use t3 in another expression. Still no evaluation. + auto t4 = (t3 * 0.2f).exp(); + + // The value is evaluated when you assign the Operation to a Tensor, using + // an intermediate tensor to represent t3. + Tensor result = t4; + +While in the examples above calling `eval()` does not make a difference in +performance, in other cases it can make a huge difference. In the expression +below the `broadcast()` expression causes the `X.maximum()` expression +to be evaluated many times: + + Tensor<...> X ...; + Tensor<...> Y = ((X - X.maximum(depth_dim).reshape(dims2d).broadcast(bcast)) + * beta).exp(); + +Inserting a call to `eval()` between the `maximum()` and +`reshape()` calls guarantees that maximum() is only computed once and +greatly speeds-up execution: + + Tensor<...> Y = + ((X - X.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)) + * beta).exp(); + +In the other example below, the tensor `Y` is both used in the expression +and its assignment.
This is an aliasing problem and if the evaluation is not +done in the right order Y will be updated incrementally during the evaluation +resulting in bogus results: + + Tensor<...> Y ...; + Y = Y / (Y.sum(depth_dim).reshape(dims2d).broadcast(bcast)); + +Inserting a call to `eval()` between the `sum()` and `reshape()` +expressions ensures that the sum is computed before any updates to `Y` are +done. + + Y = Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast)); + +Note that an eval around the full right hand side expression is not needed +because the generated code has to compute the i-th value of the right hand side +before assigning it to the left hand side. + +However, if you were assigning the expression value to a shuffle of `Y` +then you would need to force an eval for correctness by adding an `eval()` +call for the right hand side: + + Y.shuffle(...) = + (Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast))).eval(); + + +#### Assigning to a TensorRef. + +If you need to access only a few elements from the value of an expression you +can avoid materializing the value in a full tensor by using a TensorRef. + +A TensorRef is a small wrapper class for any Eigen Operation. It provides +overloads for the `()` operator that let you access individual values in +the expression. TensorRef is convenient, because the Operations themselves do +not provide a way to access individual elements. + + // Create a TensorRef for the expression. The expression is not + // evaluated yet. + TensorRef > ref = ((t1 + t2) * 0.2f).exp(); + + // Use "ref" to access individual elements. The expression is evaluated + // on the fly. + float at_0 = ref(0, 0, 0); + cout << ref(0, 1, 0); + +Only use TensorRef when you need a subset of the values of the expression. +TensorRef only computes the values you access. However note that if you are +going to access all the values it will be much faster to materialize the +results in a Tensor first.
+ +In some cases, if the full Tensor result would be very large, you may save +memory by accessing it as a TensorRef. But not always. So don't count on it. + + +### Controlling How Expressions Are Evaluated + +The tensor library provides several implementations of the various operations +such as contractions and convolutions. The implementations are optimized for +different environments: single threaded on CPU, multi threaded on CPU, or on a +GPU using cuda. Additional implementations may be added later. + +You can choose which implementation to use with the `device()` call. If +you do not choose an implementation explicitly the default implementation that +uses a single thread on the CPU is used. + +The default implementation has been optimized for recent Intel CPUs, taking +advantage of SSE, AVX, and FMA instructions. Work is ongoing to tune the +library on ARM CPUs. Note that you need to pass compiler-dependent flags +to enable the use of SSE, AVX, and other instructions. + +For example, the following code adds two tensors using the default +single-threaded CPU implementation: + + Tensor a(30, 40); + Tensor b(30, 40); + Tensor c = a + b; + +To choose a different implementation you have to insert a `device()` call +before the assignment of the result. For technical C++ reasons this requires +that the Tensor for the result be declared on its own. This means that you +have to know the size of the result. + + Eigen::Tensor c(30, 40); + c.device(...) = a + b; + +The call to `device()` must be the last call on the left of the operator=. + +You must pass to the `device()` call an Eigen device object. There are +presently three devices you can use: DefaultDevice, ThreadPoolDevice and +GpuDevice. + + +#### Evaluating With the DefaultDevice + +This is exactly the same as not inserting a `device()` call. 
+ + DefaultDevice my_device; + c.device(my_device) = a + b; + +#### Evaluating with a Thread Pool + + // Create the Eigen ThreadPool + Eigen::ThreadPool pool(8 /* number of threads in pool */) + + // Create the Eigen ThreadPoolDevice. + Eigen::ThreadPoolDevice my_device(&pool, 4 /* number of threads to use */); + + // Now just use the device when evaluating expressions. + Eigen::Tensor c(30, 50); + c.device(my_device) = a.contract(b, dot_product_dims); + + +#### Evaluating On GPU + +This is presently a bit more complicated than just using a thread pool device. +You need to create a GPU device but you also need to explicitly allocate the +memory for tensors with cuda. + + +## API Reference + +### Datatypes + +In the documentation of the tensor methods and Operation we mention datatypes +that are tensor-type specific: + +#### ::Dimensions + +Acts like an array of ints. Has an `int size` attribute, and can be +indexed like an array to access individual values. Used to represent the +dimensions of a tensor. See `dimensions()`. + +#### ::Index + +Acts like an `int`. Used for indexing tensors along their dimensions. See +`operator()`, `dimension()`, and `size()`. + +#### ::Scalar + +Represents the datatype of individual tensor elements. For example, for a +`Tensor`, `Scalar` is the type `float`. See +`setConstant()`. + +#### + +We use this pseudo type to indicate that a tensor Operation is returned by a +method. We indicate in the text the type and dimensions of the tensor that the +Operation returns after evaluation. + +The Operation will have to be evaluated, for example by assigning it to a +tensor, before you can access the values of the resulting tensor. You can also +access the values through a TensorRef. + + +## Built-in Tensor Methods + +These are usual C++ methods that act on tensors immediately. They are not +Operations which provide delayed evaluation of their results. 
Unless specified +otherwise, all the methods listed below are available on all tensor classes: +Tensor, TensorFixedSize, and TensorMap. + +## Metadata + +### int NumDimensions + +Constant value indicating the number of dimensions of a Tensor. This is also +known as the tensor "rank". + + Eigen::Tensor a(3, 4); + cout << "Dims " << a.NumDimensions; + => Dims 2 + +### Dimensions dimensions() + +Returns an array-like object representing the dimensions of the tensor. +The actual type of the `dimensions()` result is `::``Dimensions`. + + Eigen::Tensor a(3, 4); + const Eigen::Tensor::Dimensions& d = a.dimensions(); + cout << "Dim size: " << d.size << ", dim 0: " << d[0] + << ", dim 1: " << d[1]; + => Dim size: 2, dim 0: 3, dim 1: 4 + +If you use a C++11 compiler, you can use `auto` to simplify the code: + + const auto& d = a.dimensions(); + cout << "Dim size: " << d.size << ", dim 0: " << d[0] + << ", dim 1: " << d[1]; + => Dim size: 2, dim 0: 3, dim 1: 4 + +### Index dimension(Index n) + +Returns the n-th dimension of the tensor. The actual type of the +`dimension()` result is `::``Index`, but you can +always use it like an int. + + Eigen::Tensor a(3, 4); + int dim1 = a.dimension(1); + cout << "Dim 1: " << dim1; + => Dim 1: 4 + +### Index size() + +Returns the total number of elements in the tensor. This is the product of all +the tensor dimensions. The actual type of the `size()` result is +`::``Index`, but you can always use it like an int. + + Eigen::Tensor a(3, 4); + cout << "Size: " << a.size(); + => Size: 12 + + +### Getting Dimensions From An Operation + +A few operations provide `dimensions()` directly, +e.g. `TensorReslicingOp`. Most operations defer calculating dimensions +until the operation is being evaluated. If you need access to the dimensions +of a deferred operation, you can wrap it in a TensorRef (see Assigning to a +TensorRef above), which provides `dimensions()` and `dimension()` as +above. 
+ +TensorRef can also wrap the plain Tensor types, so this is a useful idiom in +templated contexts where the underlying object could be either a raw Tensor +or some deferred operation (e.g. a slice of a Tensor). In this case, the +template code can wrap the object in a TensorRef and reason about its +dimensionality while remaining agnostic to the underlying type. + + +## Constructors + +### Tensor + +Creates a tensor of the specified size. The number of arguments must be equal +to the rank of the tensor. The content of the tensor is not initialized. + + Eigen::Tensor a(3, 4); + cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl; + => NumRows: 3 NumCols: 4 + +### TensorFixedSize + +Creates a tensor of the specified size. The number of arguments in the Sizes<> +template parameter determines the rank of the tensor. The content of the tensor +is not initialized. + + Eigen::TensorFixedSize<float, Sizes<3, 4>> a; + cout << "Rank: " << a.rank() << endl; + => Rank: 2 + cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl; + => NumRows: 3 NumCols: 4 + +### TensorMap + +Creates a tensor mapping an existing array of data. The data must not be freed +until the TensorMap is discarded, and the size of the data must be large enough +to accommodate the coefficients of the tensor. + + float data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + Eigen::TensorMap<Tensor<float, 2>> a(data, 3, 4); + cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl; + => NumRows: 3 NumCols: 4 + cout << "a(1, 2): " << a(1, 2) << endl; + => a(1, 2): 7 + + +## Contents Initialization + +When a new Tensor or a new TensorFixedSize is created, memory is allocated to +hold all the tensor elements, but the memory is not initialized. Similarly, +when a new TensorMap is created on top of non-initialized memory, its +contents are not initialized. + +You can use one of the methods below to initialize the tensor memory.
These +have an immediate effect on the tensor and return the tensor itself as a +result. These are not tensor Operations which delay evaluation. + +### setConstant(const Scalar& val) + +Sets all elements of the tensor to the constant value `val`. `Scalar` +is the type of data stored in the tensor. You can pass any value that is +convertible to that type. + +Returns the tensor itself in case you want to chain another call. + + a.setConstant(12.3f); + cout << "Constant: " << endl << a << endl << endl; + => + Constant: + 12.3 12.3 12.3 12.3 + 12.3 12.3 12.3 12.3 + 12.3 12.3 12.3 12.3 + +Note that `setConstant()` can be used on any tensor where the element type +has a copy constructor and an `operator=()`: + + Eigen::Tensor a(2, 3); + a.setConstant("yolo"); + cout << "String tensor: " << endl << a << endl << endl; + => + String tensor: + yolo yolo yolo + yolo yolo yolo + + +### setZero() + +Fills the tensor with zeros. Equivalent to `setConstant(Scalar(0))`. +Returns the tensor itself in case you want to chain another call. + + a.setZero(); + cout << "Zeros: " << endl << a << endl << endl; + => + Zeros: + 0 0 0 0 + 0 0 0 0 + 0 0 0 0 + + +### setValues({..initializer_list}) + +Fills the tensor with explicit values specified in a std::initializer_list. +The type of the initializer list depends on the type and rank of the tensor. + +If the tensor has rank N, the initializer list must be nested N times. The +most deeply nested lists must contain P scalars of the Tensor type where P is +the size of the last dimension of the Tensor. + +For example, for a `TensorFixedSize<float, Sizes<2, 3>>` the initializer list must +contain 2 lists of 3 floats each. + +`setValues()` returns the tensor itself in case you want to chain another +call. + + Eigen::Tensor a(2, 3); + a.setValues({{0.0f, 1.0f, 2.0f}, {3.0f, 4.0f, 5.0f}}); + cout << "a" << endl << a << endl << endl; + => + a + 0 1 2 + 3 4 5 + +If a list is too short, the corresponding elements of the tensor will not be +changed.
This is valid at each level of nesting. For example the following +code only sets the values of the first row of the tensor. + + Eigen::Tensor a(2, 3); + a.setConstant(1000); + a.setValues({{10, 20, 30}}); + cout << "a" << endl << a << endl << endl; + => + a + 10 20 30 + 1000 1000 1000 + +### setRandom() + +Fills the tensor with random values. Returns the tensor itself in case you +want to chain another call. + + a.setRandom(); + cout << "Random: " << endl << a << endl << endl; + => + Random: + 0.680375 0.59688 -0.329554 0.10794 + -0.211234 0.823295 0.536459 -0.0452059 + 0.566198 -0.604897 -0.444451 0.257742 + +You can customize `setRandom()` by providing your own random number +generator as a template argument: + + a.setRandom(); + +Here, `MyRandomGenerator` must be a struct with the following member +functions, where Scalar and Index are the same as `::``Scalar` +and `::``Index`. + +See `struct UniformRandomGenerator` in TensorFunctors.h for an example. + + // Custom number generator for use with setRandom(). + struct MyRandomGenerator { + // Default and copy constructors. Both are needed + MyRandomGenerator() { } + MyRandomGenerator(const MyRandomGenerator& ) { } + + // Return a random value to be used. "element_location" is the + // location of the entry to set in the tensor, it can typically + // be ignored. + Scalar operator()(Eigen::DenseIndex element_location, + Eigen::DenseIndex /*unused*/ = 0) const { + return ; + } + + // Same as above but generates several numbers at a time. 
+ typename internal::packet_traits::type packetOp( + Eigen::DenseIndex packet_location, Eigen::DenseIndex /*unused*/ = 0) const { + return ; + } + }; + +You can also use one of the 2 random number generators that are part of the +tensor library: +* UniformRandomGenerator +* NormalRandomGenerator + + +## Data Access + +The Tensor, TensorFixedSize, and TensorRef classes provide the following +accessors to access the tensor coefficients: + + const Scalar& operator()(const array& indices) + const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) + Scalar& operator()(const array& indices) + Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) + +The number of indices must be equal to the rank of the tensor. Moreover, these +accessors are not available on tensor expressions. In order to access the +values of a tensor expression, the expression must either be evaluated or +wrapped in a TensorRef. + + +### Scalar* data() and const Scalar* data() const + +Returns a pointer to the storage for the tensor. The pointer is const if the +tensor was const. This allows direct access to the data. The layout of the +data depends on the tensor layout: RowMajor or ColMajor. + +This access is usually only needed for special cases, for example when mixing +Eigen Tensor code with other libraries. + +Scalar is the type of data stored in the tensor. + + Eigen::Tensor a(3, 4); + float* a_data = a.data(); + a_data[0] = 123.45f; + cout << "a(0, 0): " << a(0, 0); + => a(0, 0): 123.45 + + +## Tensor Operations + +All the methods documented below return non evaluated tensor `Operations`. +These can be chained: you can apply another Tensor Operation to the value +returned by the method. + +The chain of Operation is evaluated lazily, typically when it is assigned to a +tensor. See "Controlling when Expression are Evaluated" for more details about +their evaluation. 
+ +### constant(const Scalar& val) + +Returns a tensor of the same type and dimensions as the original tensor but +where all elements have the value `val`. + +This is useful, for example, when you want to add or subtract a constant from a +tensor, or multiply every element of a tensor by a scalar. + + Eigen::Tensor a(2, 3); + a.setConstant(1.0f); + Eigen::Tensor b = a + a.constant(2.0f); + Eigen::Tensor c = b * b.constant(0.2f); + cout << "a" << endl << a << endl << endl; + cout << "b" << endl << b << endl << endl; + cout << "c" << endl << c << endl << endl; + => + a + 1 1 1 + 1 1 1 + + b + 3 3 3 + 3 3 3 + + c + 0.6 0.6 0.6 + 0.6 0.6 0.6 + +### random() + +Returns a tensor of the same type and dimensions as the current tensor +but where all elements have random values. + +This is for example useful to add random values to an existing tensor. +The generation of random values can be customized in the same manner +as for `setRandom()`. + + Eigen::Tensor a(2, 3); + a.setConstant(1.0f); + Eigen::Tensor b = a + a.random(); + cout << "a" << endl << a << endl << endl; + cout << "b" << endl << b << endl << endl; + => + a + 1 1 1 + 1 1 1 + + b + 1.68038 1.5662 1.82329 + 0.788766 1.59688 0.395103 + + +## Unary Element Wise Operations + +All these operations take a single input tensor as argument and return a tensor +of the same type and dimensions as the tensor to which they are applied. The +requested operations are applied to each element independently. + +### operator-() + +Returns a tensor of the same type and dimensions as the original tensor +containing the opposite values of the original tensor. + + Eigen::Tensor a(2, 3); + a.setConstant(1.0f); + Eigen::Tensor b = -a; + cout << "a" << endl << a << endl << endl; + cout << "b" << endl << b << endl << endl; + => + a + 1 1 1 + 1 1 1 + + b + -1 -1 -1 + -1 -1 -1 + +### sqrt() + +Returns a tensor of the same type and dimensions as the original tensor +containing the square roots of the original tensor. 
+ +### rsqrt() + +Returns a tensor of the same type and dimensions as the original tensor +containing the inverse square roots of the original tensor. + +### square() + +Returns a tensor of the same type and dimensions as the original tensor +containing the squares of the original tensor values. + +### inverse() + +Returns a tensor of the same type and dimensions as the original tensor +containing the inverse of the original tensor values. + +### exp() + +Returns a tensor of the same type and dimensions as the original tensor +containing the exponential of the original tensor. + +### log() + +Returns a tensor of the same type and dimensions as the original tensor +containing the natural logarithms of the original tensor. + +### abs() + +Returns a tensor of the same type and dimensions as the original tensor +containing the absolute values of the original tensor. + +### pow(Scalar exponent) + +Returns a tensor of the same type and dimensions as the original tensor +containing the coefficients of the original tensor to the power of the +exponent. + +The type of the exponent, Scalar, is always the same as the type of the +tensor coefficients. For example, only integer exponents can be used in +conjunction with tensors of integer values. + +You can use cast() to lift this restriction. For example this computes +cubic roots of an int Tensor: + + Eigen::Tensor<int, 2> a(2, 3); + a.setValues({{0, 1, 8}, {27, 64, 125}}); + Eigen::Tensor<double, 2> b = a.cast<double>().pow(1.0 / 3.0); + cout << "a" << endl << a << endl << endl; + cout << "b" << endl << b << endl << endl; + => + a + 0 1 8 + 27 64 125 + + b + 0 1 2 + 3 4 5 + +### operator * (Scalar scale) + +Multiplies all the coefficients of the input tensor by the provided scale. + +### cwiseMax(Scalar threshold) +TODO + +### cwiseMin(Scalar threshold) +TODO + +### unaryExpr(const CustomUnaryOp& func) +TODO + + +## Binary Element Wise Operations + +These operations take two input tensors as arguments.
The 2 input tensors should +be of the same type and dimensions. The result is a tensor of the same +dimensions as the tensors to which they are applied, and unless otherwise +specified it is also of the same type. The requested operations are applied to +each pair of elements independently. + +### operator+(const OtherDerived& other) + +Returns a tensor of the same type and dimensions as the input tensors +containing the coefficient wise sums of the inputs. + +### operator-(const OtherDerived& other) + +Returns a tensor of the same type and dimensions as the input tensors +containing the coefficient wise differences of the inputs. + +### operator*(const OtherDerived& other) + +Returns a tensor of the same type and dimensions as the input tensors +containing the coefficient wise products of the inputs. + +### operator/(const OtherDerived& other) + +Returns a tensor of the same type and dimensions as the input tensors +containing the coefficient wise quotients of the inputs. + +This operator is not supported for integer types. + +### cwiseMax(const OtherDerived& other) + +Returns a tensor of the same type and dimensions as the input tensors +containing the coefficient wise maximums of the inputs. + +### cwiseMin(const OtherDerived& other) + +Returns a tensor of the same type and dimensions as the input tensors +containing the coefficient wise minimums of the inputs. + +### Logical operators + +The following logical operators are supported as well: + +* operator&&(const OtherDerived& other) +* operator||(const OtherDerived& other) +* operator<(const OtherDerived& other) +* operator<=(const OtherDerived& other) +* operator>(const OtherDerived& other) +* operator>=(const OtherDerived& other) +* operator==(const OtherDerived& other) +* operator!=(const OtherDerived& other) + +They all return a tensor of boolean values.
+ + +## Selection (select(const ThenDerived& thenTensor, const ElseDerived& elseTensor)) + +Selection is a coefficient-wise ternary operator that is the tensor equivalent +to the if-then-else operation. + + Tensor if = ...; + Tensor then = ...; + Tensor else = ...; + Tensor result = if.select(then, else); + +The 3 arguments must be of the same dimensions, which will also be the dimension +of the result. The 'if' tensor must be of type boolean, the 'then' and the +'else' tensor must be of the same type, which will also be the type of the +result. + +Each coefficient in the result is equal to the corresponding coefficient in the +'then' tensor if the corresponding value in the 'if' tensor is true. If not, the +resulting coefficient will come from the 'else' tensor. + + +## Contraction + +Tensor *contractions* are a generalization of the matrix product to the +multidimensional case. + + // Create 2 matrices using tensors of rank 2 + Eigen::Tensor a(2, 3); + a.setValues({{1, 2, 3}, {6, 5, 4}}); + Eigen::Tensor b(3, 2); + b.setValues({{1, 2}, {4, 5}, {5, 6}}); + + // Compute the traditional matrix product + Eigen::array<Eigen::IndexPair<int>, 1> product_dims = { Eigen::IndexPair<int>(1, 0) }; + Eigen::Tensor AB = a.contract(b, product_dims); + + // Compute the product of the transpose of the matrices + Eigen::array<Eigen::IndexPair<int>, 1> transposed_product_dims = { Eigen::IndexPair<int>(0, 1) }; + Eigen::Tensor AtBt = a.contract(b, transposed_product_dims); + + // Contraction to scalar value using a double contraction. + // The first coordinates of both tensors are contracted, as well as both second coordinates, i.e., this computes the sum of the squares of the elements.
+ Eigen::array<Eigen::IndexPair<int>, 2> double_contraction_product_dims = { Eigen::IndexPair<int>(0, 0), Eigen::IndexPair<int>(1, 1) }; + Eigen::Tensor AdoubleContractedA = a.contract(a, double_contraction_product_dims); + + // Extracting the scalar value of the tensor contraction for further usage + int value = AdoubleContractedA(0); + +## Reduction Operations + +A *Reduction* operation returns a tensor with fewer dimensions than the +original tensor. The values in the returned tensor are computed by applying a +*reduction operator* to slices of values from the original tensor. You specify +the dimensions along which the slices are made. + +The Eigen Tensor library provides a set of predefined reduction operators such +as `maximum()` and `sum()` and lets you define additional operators by +implementing a few methods from a reductor template. + +### Reduction Dimensions + +All reduction operations take a single parameter of type +`::``Dimensions` which can always be specified as an array of +ints. These are called the "reduction dimensions." The values are the indices +of the dimensions of the input tensor over which the reduction is done. The +parameter can have at most as many elements as the rank of the input tensor; +each element must be less than the tensor rank, as it indicates one of the +dimensions to reduce. + +Each dimension of the input tensor should occur at most once in the reduction +dimensions as the implementation does not remove duplicates. + +The order of the values in the reduction dimensions does not affect the +results, but the code may execute faster if you list the dimensions in +increasing order. + +Example: Reduction along one dimension. + + // Create a tensor of 2 dimensions + Eigen::Tensor a(2, 3); + a.setValues({{1, 2, 3}, {6, 5, 4}}); + // Reduce it along the second dimension (1)... + Eigen::array dims({1 /* dimension to reduce */}); + // ...using the "maximum" operator. + // The result is a tensor with one dimension.
The size of + // that dimension is the same as the first (non-reduced) dimension of a. + Eigen::Tensor b = a.maximum(dims); + cout << "a" << endl << a << endl << endl; + cout << "b" << endl << b << endl << endl; + => + a + 1 2 3 + 6 5 4 + + b + 3 + 6 + +Example: Reduction along two dimensions. + + Eigen::Tensor a(2, 3, 4); + a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f}, + {7.0f, 6.0f, 5.0f, 4.0f}, + {8.0f, 9.0f, 10.0f, 11.0f}}, + {{12.0f, 13.0f, 14.0f, 15.0f}, + {19.0f, 18.0f, 17.0f, 16.0f}, + {20.0f, 21.0f, 22.0f, 23.0f}}}); + // The tensor a has 3 dimensions. We reduce along the + // first 2, resulting in a tensor with a single dimension + // of size 4 (the last dimension of a.) + // Note that we pass the array of reduction dimensions + // directly to the maximum() call. + Eigen::Tensor b = + a.maximum(Eigen::array({0, 1})); + cout << "b" << endl << b << endl << endl; + => + b + 20 + 21 + 22 + 23 + +#### Reduction along all dimensions + +As a special case, if you pass no parameter to a reduction operation the +original tensor is reduced along *all* its dimensions. The result is a +scalar, represented as a zero-dimension tensor. + + Eigen::Tensor a(2, 3, 4); + a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f}, + {7.0f, 6.0f, 5.0f, 4.0f}, + {8.0f, 9.0f, 10.0f, 11.0f}}, + {{12.0f, 13.0f, 14.0f, 15.0f}, + {19.0f, 18.0f, 17.0f, 16.0f}, + {20.0f, 21.0f, 22.0f, 23.0f}}}); + // Reduce along all dimensions using the sum() operator. + Eigen::Tensor b = a.sum(); + cout << "b" << endl << b << endl << endl; + => + b + 276 + + +### sum(const Dimensions& new_dims) +### sum() + +Reduce a tensor using the sum() operator. The resulting values +are the sum of the reduced values. + +### mean(const Dimensions& new_dims) +### mean() + +Reduce a tensor using the mean() operator. The resulting values +are the mean of the reduced values. + +### maximum(const Dimensions& new_dims) +### maximum() + +Reduce a tensor using the maximum() operator. The resulting values are the +largest of the reduced values. 
+ +### minimum(const Dimensions& new_dims) +### minimum() + +Reduce a tensor using the minimum() operator. The resulting values +are the smallest of the reduced values. + +### prod(const Dimensions& new_dims) +### prod() + +Reduce a tensor using the prod() operator. The resulting values +are the product of the reduced values. + +### all(const Dimensions& new_dims) +### all() +Reduce a tensor using the all() operator. Casts tensor to bool and then checks +whether all elements are true. Runs through all elements rather than +short-circuiting, so may be significantly inefficient. + +### any(const Dimensions& new_dims) +### any() +Reduce a tensor using the any() operator. Casts tensor to bool and then checks +whether any element is true. Runs through all elements rather than +short-circuiting, so may be significantly inefficient. + + +### reduce(const Dimensions& new_dims, const Reducer& reducer) + +Reduce a tensor using a user-defined reduction operator. See `SumReducer` +in TensorFunctors.h for information on how to implement a reduction operator. + + +## Trace + +A *Trace* operation returns a tensor with fewer dimensions than the original +tensor. It returns a tensor whose elements are the sum of the elements of the +original tensor along the main diagonal for a list of specified dimensions, the +"trace dimensions". Similar to the `Reduction Dimensions`, the trace dimensions +are passed as an input parameter to the operation, are of type `::``Dimensions` +, and have the same requirements when passed as an input parameter. In addition, +the trace dimensions must have the same size. + +Example: Trace along 2 dimensions. + + // Create a tensor of 3 dimensions + Eigen::Tensor a(2, 2, 3); + a.setValues({{{1, 2, 3}, {4, 5, 6}}, {{7, 8, 9}, {10, 11, 12}}}); + // Specify the dimensions along which the trace will be computed. 
+ // In this example, the trace can only be computed along the dimensions + // with indices 0 and 1 + Eigen::array dims({0, 1}); + // The output tensor contains all but the trace dimensions. + Tensor a_trace = a.trace(dims); + cout << "a_trace:" << endl; + cout << a_trace << endl; + => + a_trace: + 11 + 13 + 15 + + +### trace(const Dimensions& new_dims) +### trace() + +As a special case, if no parameter is passed to the operation, trace is computed +along *all* dimensions of the input tensor. + +Example: Trace along all dimensions. + + // Create a tensor of 3 dimensions, with all dimensions having the same size. + Eigen::Tensor a(3, 3, 3); + a.setValues({{{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, + {{10, 11, 12}, {13, 14, 15}, {16, 17, 18}}, + {{19, 20, 21}, {22, 23, 24}, {25, 26, 27}}}); + // Result is a zero dimension tensor + Tensor a_trace = a.trace(); + cout<<"a_trace:"<<endl; + cout<<a_trace<<endl; + => + a_trace: + 42 + + +## Scan Operations + +A *Scan* operation returns a tensor with the same dimensions as the original +tensor. The operation performs an inclusive scan along the specified +axis, which means it computes a running total along the axis for a given +reduction operation. +If the reduction operation corresponds to summation, then this computes the +prefix sum of the tensor along the given axis. + +Example: + + // Create a tensor of 2 dimensions + Eigen::Tensor a(2, 3); + a.setValues({{1, 2, 3}, {4, 5, 6}}); + // Scan it along the second dimension (1) using summation + Eigen::Tensor b = a.cumsum(1); + // The result is a tensor with the same size as the input + cout << "a" << endl << a << endl << endl; + cout << "b" << endl << b << endl << endl; + => + a + 1 2 3 + 4 5 6 + + b + 1 3 6 + 4 9 15 + +### cumsum(const Index& axis) + +Perform a scan by summing consecutive entries. + +### cumprod(const Index& axis) + +Perform a scan by multiplying consecutive entries.
+ + +## Convolutions + +### convolve(const Kernel& kernel, const Dimensions& dims) + +Returns a tensor that is the output of the convolution of the input tensor with the kernel, +along the specified dimensions of the input tensor. The dimension size for dimensions of the output tensor +which were part of the convolution will be reduced by the formula: +output_dim_size = input_dim_size - kernel_dim_size + 1 (requires: input_dim_size >= kernel_dim_size). +The dimension sizes for dimensions that were not part of the convolution will remain the same. +Performance of the convolution can depend on the length of the stride(s) of the input tensor dimension(s) along which the +convolution is computed (the first dimension has the shortest stride for ColMajor, whereas RowMajor's shortest stride is +for the last dimension). + + // Compute convolution along the second and third dimension. + Tensor input(3, 3, 7, 11); + Tensor kernel(2, 2); + Tensor output(3, 2, 6, 11); + input.setRandom(); + kernel.setRandom(); + + Eigen::array dims({1, 2}); // Specify second and third dimension for convolution. + output = input.convolve(kernel, dims); + + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 6; ++k) { + for (int l = 0; l < 11; ++l) { + const float result = output(i,j,k,l); + const float expected = input(i,j+0,k+0,l) * kernel(0,0) + + input(i,j+1,k+0,l) * kernel(1,0) + + input(i,j+0,k+1,l) * kernel(0,1) + + input(i,j+1,k+1,l) * kernel(1,1); + VERIFY_IS_APPROX(result, expected); + } + } + } + } + + +## Geometrical Operations + +These operations return a Tensor with different dimensions than the original +Tensor. They can be used to access slices of tensors, see them with different +dimensions, or pad tensors with additional data. + +### reshape(const Dimensions& new_dims) + +Returns a view of the input tensor that has been reshaped to the specified +new dimensions. The argument new_dims is an array of Index values. 
The +rank of the resulting tensor is equal to the number of elements in new_dims. + +The product of all the sizes in the new dimension array must be equal to +the number of elements in the input tensor. + + // Increase the rank of the input tensor by introducing a new dimension + // of size 1. + Tensor input(7, 11); + array three_dims{{7, 11, 1}}; + Tensor result = input.reshape(three_dims); + + // Decrease the rank of the input tensor by merging 2 dimensions; + array one_dim{{7 * 11}}; + Tensor result = input.reshape(one_dim); + +This operation does not move any data in the input tensor, so the resulting +contents of a reshaped Tensor depend on the data layout of the original Tensor. + +For example this is what happens when you `reshape()` a 2D ColMajor tensor +to one dimension: + + Eigen::Tensor<float, 2, Eigen::ColMajor> a(2, 3); + a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}}); + Eigen::array<Eigen::DenseIndex, 1> one_dim({3 * 2}); + Eigen::Tensor<float, 1, Eigen::ColMajor> b = a.reshape(one_dim); + cout << "b" << endl << b << endl; + => + b + 0 + 300 + 100 + 400 + 200 + 500 + +This is what happens when the 2D Tensor is RowMajor: + + Eigen::Tensor<float, 2, Eigen::RowMajor> a(2, 3); + a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}}); + Eigen::array<Eigen::DenseIndex, 1> one_dim({3 * 2}); + Eigen::Tensor<float, 1, Eigen::RowMajor> b = a.reshape(one_dim); + cout << "b" << endl << b << endl; + => + b + 0 + 100 + 200 + 300 + 400 + 500 + +The reshape operation is an lvalue. In other words, it can be used on the left +side of the assignment operator. + +The previous example can be rewritten as follows: + + Eigen::Tensor<float, 2, Eigen::ColMajor> a(2, 3); + a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}}); + Eigen::array<Eigen::DenseIndex, 2> two_dim({2, 3}); + Eigen::Tensor<float, 1, Eigen::ColMajor> b(6); + b.reshape(two_dim) = a; + cout << "b" << endl << b << endl; + => + b + 0 + 300 + 100 + 400 + 200 + 500 + +Note that "b" itself was not reshaped but that instead the assignment is done to +the reshape view of b.
+ + +### shuffle(const Shuffle& shuffle) + +Returns a copy of the input tensor whose dimensions have been +reordered according to the specified permutation. The argument shuffle +is an array of Index values. Its size is the rank of the input +tensor. It must contain a permutation of 0, 1, ..., rank - 1. The size of the i-th +dimension of the output tensor equals the size of the shuffle[i]-th +dimension of the input tensor. For example: + + // Shuffle all dimensions to the left by 1. + Tensor input(20, 30, 50); + // ... set some values in input. + Tensor output = input.shuffle({1, 2, 0}); + + eigen_assert(output.dimension(0) == 30); + eigen_assert(output.dimension(1) == 50); + eigen_assert(output.dimension(2) == 20); + +Indices into the output tensor are shuffled accordingly to formulate +indices into the input tensor. For example, one can assert in the above +code snippet that: + + eigen_assert(output(3, 7, 11) == input(11, 3, 7)); + +In general, one can assert that + + eigen_assert(output(..., indices[shuffle[i]], ...) == + input(..., indices[i], ...)) + +The shuffle operation results in an lvalue, which means that it can be assigned +to. In other words, it can be used on the left side of the assignment operator. + +Let's rewrite the previous example to take advantage of this feature: + + // Shuffle all dimensions to the left by 1. + Tensor input(20, 30, 50); + // ... set some values in input. + Tensor output(30, 50, 20); + output.shuffle({2, 0, 1}) = input; + + +### stride(const Strides& strides) + +Returns a view of the input tensor that strides (skips stride-1 +elements) along each of the dimensions. The argument strides is an +array of Index values. The dimensions of the resulting tensor are +ceil(input_dimensions[i] / strides[i]).
+ +For example this is what happens when you `stride()` a 2D tensor: + + Eigen::Tensor a(4, 3); + a.setValues({{0, 100, 200}, {300, 400, 500}, {600, 700, 800}, {900, 1000, 1100}}); + Eigen::array strides({3, 2}); + Eigen::Tensor b = a.stride(strides); + cout << "b" << endl << b << endl; + => + b + 0 200 + 900 1100 + +It is possible to assign a tensor to a stride: + Tensor input(20, 30, 50); + // ... set some values in input. + Tensor output(40, 90, 200); + output.stride({2, 3, 4}) = input; + + +### slice(const StartIndices& offsets, const Sizes& extents) + +Returns a sub-tensor of the given tensor. For each dimension i, the slice is +made of the coefficients stored between offset[i] and offset[i] + extents[i] in +the input tensor. + + Eigen::Tensor a(4, 3); + a.setValues({{0, 100, 200}, {300, 400, 500}, + {600, 700, 800}, {900, 1000, 1100}}); + Eigen::array offsets = {1, 0}; + Eigen::array extents = {2, 2}; + Eigen::Tensor slice = a.slice(offsets, extents); + cout << "a" << endl << a << endl; + => + a + 0 100 200 + 300 400 500 + 600 700 800 + 900 1000 1100 + cout << "slice" << endl << slice << endl; + => + slice + 300 400 + 600 700 + + +### chip(const Index offset, const Index dim) + +A chip is a special kind of slice. It is the subtensor at the given offset in +the dimension dim. The returned tensor has one fewer dimension than the input +tensor: the dimension dim is removed. + +For example, a matrix chip would be either a row or a column of the input +matrix. 
+ + Eigen::Tensor a(4, 3); + a.setValues({{0, 100, 200}, {300, 400, 500}, + {600, 700, 800}, {900, 1000, 1100}}); + Eigen::Tensor row_3 = a.chip(2, 0); + Eigen::Tensor col_2 = a.chip(1, 1); + cout << "a" << endl << a << endl; + => + a + 0 100 200 + 300 400 500 + 600 700 800 + 900 1000 1100 + cout << "row_3" << endl << row_3 << endl; + => + row_3 + 600 700 800 + cout << "col_2" << endl << col_2 << endl; + => + col_2 + 100 400 700 1000 + +It is possible to assign values to a tensor chip since the chip operation is a +lvalue. For example: + + Eigen::Tensor a(3); + a.setValues({{100, 200, 300}}); + Eigen::Tensor b(2, 3); + b.setZero(); + b.chip(0, 0) = a; + cout << "a" << endl << a << endl; + => + a + 100 + 200 + 300 + cout << "b" << endl << b << endl; + => + b + 100 200 300 + 0 0 0 + + +### reverse(const ReverseDimensions& reverse) + +Returns a view of the input tensor that reverses the order of the coefficients +along a subset of the dimensions. The argument reverse is an array of boolean +values that indicates whether or not the order of the coefficients should be +reversed along each of the dimensions. This operation preserves the dimensions +of the input tensor. + +For example this is what happens when you `reverse()` the first dimension +of a 2D tensor: + + Eigen::Tensor a(4, 3); + a.setValues({{0, 100, 200}, {300, 400, 500}, + {600, 700, 800}, {900, 1000, 1100}}); + Eigen::array reverse({true, false}); + Eigen::Tensor b = a.reverse(reverse); + cout << "a" << endl << a << endl << "b" << endl << b << endl; + => + a + 0 100 200 + 300 400 500 + 600 700 800 + 900 1000 1100 + b + 900 1000 1100 + 600 700 800 + 300 400 500 + 0 100 200 + + +### broadcast(const Broadcast& broadcast) + +Returns a view of the input tensor in which the input is replicated one to many +times. +The broadcast argument specifies how many copies of the input tensor need to be +made in each of the dimensions. 
+ + Eigen::Tensor a(2, 3); + a.setValues({{0, 100, 200}, {300, 400, 500}}); + Eigen::array bcast({3, 2}); + Eigen::Tensor b = a.broadcast(bcast); + cout << "a" << endl << a << endl << "b" << endl << b << endl; + => + a + 0 100 200 + 300 400 500 + b + 0 100 200 0 100 200 + 300 400 500 300 400 500 + 0 100 200 0 100 200 + 300 400 500 300 400 500 + 0 100 200 0 100 200 + 300 400 500 300 400 500 + +### concatenate(const OtherDerived& other, Axis axis) + +TODO + +### pad(const PaddingDimensions& padding) + +Returns a view of the input tensor in which the input is padded with zeros. + + Eigen::Tensor a(2, 3); + a.setValues({{0, 100, 200}, {300, 400, 500}}); + Eigen::array<pair<int, int>, 2> paddings; + paddings[0] = make_pair(2, 3); + paddings[1] = make_pair(0, 1); + Eigen::Tensor b = a.pad(paddings); + cout << "a" << endl << a << endl << "b" << endl << b << endl; + => + a + 0 100 200 + 300 400 500 + b + 0 0 0 0 + 0 0 0 0 + 0 100 200 0 + 300 400 500 0 + 0 0 0 0 + 0 0 0 0 + 0 0 0 0 + + +### extract_patches(const PatchDims& patch_dims) + +Returns a tensor of coefficient patches extracted from the input tensor, where +each patch is of dimension specified by 'patch_dims'. The returned tensor has +one greater dimension than the input tensor, which is used to index each patch. +The patch index in the output tensor depends on the data layout of the input +tensor: the patch index is the last dimension in ColMajor layout, and the first +dimension in RowMajor layout. 
+ +For example, given the following input tensor: + + Eigen::Tensor tensor(3,4); + tensor.setValues({{0.0f, 1.0f, 2.0f, 3.0f}, + {4.0f, 5.0f, 6.0f, 7.0f}, + {8.0f, 9.0f, 10.0f, 11.0f}}); + + cout << "tensor: " << endl << tensor << endl; + => + tensor: + 0 1 2 3 + 4 5 6 7 + 8 9 10 11 + +Six 2x2 patches can be extracted and indexed using the following code: + + Eigen::Tensor patch; + Eigen::array patch_dims; + patch_dims[0] = 2; + patch_dims[1] = 2; + patch = tensor.extract_patches(patch_dims); + for (int k = 0; k < 6; ++k) { + cout << "patch index: " << k << endl; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + if (DataLayout == ColMajor) { + cout << patch(i, j, k) << " "; + } else { + cout << patch(k, i, j) << " "; + } + } + cout << endl; + } + } + +This code results in the following output when the data layout is ColMajor: + + patch index: 0 + 0 1 + 4 5 + patch index: 1 + 4 5 + 8 9 + patch index: 2 + 1 2 + 5 6 + patch index: 3 + 5 6 + 9 10 + patch index: 4 + 2 3 + 6 7 + patch index: 5 + 6 7 + 10 11 + +This code results in the following output when the data layout is RowMajor: +(NOTE: the set of patches is the same as in ColMajor, but are indexed differently). + + patch index: 0 + 0 1 + 4 5 + patch index: 1 + 1 2 + 5 6 + patch index: 2 + 2 3 + 6 7 + patch index: 3 + 4 5 + 8 9 + patch index: 4 + 5 6 + 9 10 + patch index: 5 + 6 7 + 10 11 + +### extract_image_patches(const Index patch_rows, const Index patch_cols, const Index row_stride, const Index col_stride, const PaddingType padding_type) + +Returns a tensor of coefficient image patches extracted from the input tensor, +which is expected to have dimensions ordered as follows (depending on the data +layout of the input tensor, and the number of additional dimensions 'N'): + +*) ColMajor +1st dimension: channels (of size d) +2nd dimension: rows (of size r) +3rd dimension: columns (of size c) +4th-Nth dimension: time (for video) or batch (for bulk processing). 
+ +*) RowMajor (reverse order of ColMajor) +1st-Nth dimension: time (for video) or batch (for bulk processing). +N+1'th dimension: columns (of size c) +N+2'th dimension: rows (of size r) +N+3'th dimension: channels (of size d) + +The returned tensor has one greater dimension than the input tensor, which is +used to index each patch. The patch index in the output tensor depends on the +data layout of the input tensor: the patch index is the 4'th dimension in +ColMajor layout, and the 4'th from the last dimension in RowMajor layout. + +For example, given the following input tensor with the following dimension +sizes: + *) depth: 2 + *) rows: 3 + *) columns: 5 + *) batch: 7 + + Tensor tensor(2,3,5,7); + Tensor tensor_row_major = tensor.swap_layout(); + +2x2 image patches can be extracted and indexed using the following code: + +*) 2D patch: ColMajor (patch indexed by second-to-last dimension) + + Tensor twod_patch; + twod_patch = tensor.extract_image_patches<2, 2>(); + // twod_patch.dimension(0) == 2 + // twod_patch.dimension(1) == 2 + // twod_patch.dimension(2) == 2 + // twod_patch.dimension(3) == 3*5 + // twod_patch.dimension(4) == 7 + +*) 2D patch: RowMajor (patch indexed by the second dimension) + + Tensor twod_patch_row_major; + twod_patch_row_major = tensor_row_major.extract_image_patches<2, 2>(); + // twod_patch_row_major.dimension(0) == 7 + // twod_patch_row_major.dimension(1) == 3*5 + // twod_patch_row_major.dimension(2) == 2 + // twod_patch_row_major.dimension(3) == 2 + // twod_patch_row_major.dimension(4) == 2 + +## Special Operations + +### cast() + +Returns a tensor of type T with the same dimensions as the original tensor. +The returned tensor contains the values of the original tensor converted to +type T. + + Eigen::Tensor a(2, 3); + Eigen::Tensor b = a.cast(); + +This can be useful for example if you need to do element-wise division of +Tensors of integers. 
This is not currently supported by the Tensor library +but you can easily cast the tensors to floats to do the division: + + Eigen::Tensor a(2, 3); + a.setValues({{0, 1, 2}, {3, 4, 5}}); + Eigen::Tensor b = + (a.cast() / a.constant(2).cast()).cast(); + cout << "a" << endl << a << endl << endl; + cout << "b" << endl << b << endl << endl; + => + a + 0 1 2 + 3 4 5 + + b + 0 0 1 + 1 2 2 + + +### eval() + +TODO + + +## Representation of scalar values + +Scalar values are often represented by tensors of size 1 and rank 0. For example, +Tensor::maximum() currently returns a Tensor. Similarly, the inner +product of 2 1d tensors (through contractions) returns a 0d tensor. + +## Limitations + +* The number of tensor dimensions is currently limited to 250 when using a + compiler that supports cxx11. It is limited to only 5 for older compilers. +* The IndexList class requires a cxx11 compliant compiler. You can use an + array of indices instead if you don't have access to a modern compiler. +* On GPUs only floating point values are properly tested and optimized for. +* Complex and integer values are known to be broken on GPUs. If you try to use + them you'll most likely end up triggering a static assertion failure such as + EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + + diff --git a/src/EigenUnsupported/CXX11/src/Tensor/Tensor.h b/src/EigenUnsupported/CXX11/src/Tensor/Tensor.h new file mode 100644 index 0000000..8cac2bb --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/Tensor.h @@ -0,0 +1,554 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// Copyright (C) 2013 Christian Seiler +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_H +#define EIGEN_CXX11_TENSOR_TENSOR_H + +namespace Eigen { + +/** \class Tensor + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor class. + * + * The %Tensor class is the work-horse for all \em dense tensors within Eigen. + * + * The %Tensor class encompasses only dynamic-size objects so far. + * + * The first two template parameters are required: + * \tparam Scalar_ Numeric type, e.g. float, double, int or `std::complex`. + * User defined scalar types are supported as well (see \ref user_defined_scalars "here"). + * \tparam NumIndices_ Number of indices (i.e. rank of the tensor) + * + * The remaining template parameters are optional -- in most cases you don't have to worry about them. + * \tparam Options_ A combination of either \b #RowMajor or \b #ColMajor, and of either + * \b #AutoAlign or \b #DontAlign. + * The former controls \ref TopicStorageOrders "storage order", and defaults to column-major. The latter controls alignment, which is required + * for vectorization. It defaults to aligning tensors. Note that tensors currently do not support any operations that profit from vectorization. + * Support for such operations (i.e. adding two tensors etc.) is planned. + * + * You can access elements of tensors using normal subscripting: + * + * \code + * Eigen::Tensor t(10, 10, 10, 10); + * t(0, 1, 2, 3) = 42.0; + * \endcode + * + * This class can be extended with the help of the plugin mechanism described on the page + * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_TENSOR_PLUGIN. + * + * Some notes: + * + *
+ *
Relation to other parts of Eigen:
+ *
The midterm development goal for this class is to have a similar hierarchy as Eigen uses for matrices, so that + * taking blocks or using tensors in expressions is easily possible, including an interface with the vector/matrix code + * by providing .asMatrix() and .asVector() (or similar) methods for rank 2 and 1 tensors. However, currently, the %Tensor + * class does not provide any of these features and is only available as a stand-alone class that just allows for + * coefficient access. Also, when fixed-size tensors are implemented, the number of template arguments is likely to + * change dramatically.
+ *
+ * + * \ref TopicStorageOrders + */ + +template +class Tensor : public TensorBase > +{ + public: + typedef Tensor Self; + typedef TensorBase > Base; + typedef typename Eigen::internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + typedef Scalar_ Scalar; + typedef typename NumTraits::Real RealScalar; + typedef typename Base::CoeffReturnType CoeffReturnType; + + enum { + IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0) & !(Options_&DontAlign), /* aligned only if EIGEN_MAX_ALIGN_BYTES is positive and Options_ lacks DontAlign */ + Layout = Options_ & RowMajor ? RowMajor : ColMajor, + CoordAccess = true, + RawAccess = true + }; + + static const int Options = Options_; + static const int NumIndices = NumIndices_; + typedef DSizes Dimensions; + + protected: + TensorStorage m_storage; /* supplies data(), size() and dimensions() for the accessors below */ + +#ifdef EIGEN_HAS_SFINAE + template + struct isOfNormalIndex{ + static const bool is_array = internal::is_base_of, CustomIndices>::value; + static const bool is_int = NumTraits::IsInteger; + static const bool value = is_array | is_int; + }; +#endif + + public: + // Metadata + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; } /* the rank is the compile-time constant NumIndices */ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } /* n is not range-checked in this accessor */ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); } + + // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + // work, because that uses base().coeffRef() - and we don't yet + // implement a similar class hierarchy + inline Self& base() { return *this; } + inline const Self& base() const { return *this; } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE 
const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeff(array{{firstIndex, secondIndex, otherIndices...}}); + } +#endif + + // normal indices + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(const array& indices) const + { /* the range check runs only through eigen_internal_assert; the multi-index is then flattened by linearizedIndex() */ + eigen_internal_assert(checkIndexRange(indices)); + return m_storage.data()[linearizedIndex(indices)]; + } + + // custom indices +#ifdef EIGEN_HAS_SFINAE + template::value) ) + > + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(CustomIndices& indices) const + { + return coeff(internal::customIndices2Array(indices)); + } +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff() const + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return m_storage.data()[0]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const + { /* index is used directly as an offset into the data buffer */ + eigen_internal_assert(index >= 0 && index < size()); + return m_storage.data()[index]; + } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template + inline Scalar& coeffRef(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. 
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeffRef(array{{firstIndex, secondIndex, otherIndices...}}); + } +#endif + + // normal indices + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array& indices) + { /* writable counterpart of coeff(indices): same assert, same linearization */ + eigen_internal_assert(checkIndexRange(indices)); + return m_storage.data()[linearizedIndex(indices)]; + } + + // custom indices +#ifdef EIGEN_HAS_SFINAE + template::value) ) + > + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(CustomIndices& indices) + { + return coeffRef(internal::customIndices2Array(indices)); + } +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef() + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return m_storage.data()[0]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) + { + eigen_internal_assert(index >= 0 && index < size()); + return m_storage.data()[index]; + } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template + inline const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. 
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return this->operator()(array{{firstIndex, secondIndex, otherIndices...}}); + } +#else /* without variadic templates only the 2- to 5-index overloads below are provided */ + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const + { + return coeff(array(i0, i1)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const + { + return coeff(array(i0, i1, i2)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const + { + return coeff(array(i0, i1, i2, i3)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const + { + return coeff(array(i0, i1, i2, i3, i4)); + } +#endif + + // custom indices +#ifdef EIGEN_HAS_SFINAE + template::value) ) + > + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(CustomIndices& indices) const + { + return coeff(internal::customIndices2Array(indices)); + } +#endif + + // normal indices + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(const array& indices) const + { + return coeff(indices); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const + { + eigen_internal_assert(index >= 0 && index < size()); + return coeff(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()() const + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeff(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const + { + // The bracket operator is only for vectors, use the parenthesis operator instead. + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeff(index); + } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template + inline Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... 
otherIndices) + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return operator()(array{{firstIndex, secondIndex, otherIndices...}}); + } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1) + { + return coeffRef(array(i0, i1)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2) + { + return coeffRef(array(i0, i1, i2)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3) + { + return coeffRef(array(i0, i1, i2, i3)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) + { + return coeffRef(array(i0, i1, i2, i3, i4)); + } +#endif + + // normal indices + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(const array& indices) + { + return coeffRef(indices); + } + + // custom indices +#ifdef EIGEN_HAS_SFINAE + template::value) ) + > + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(CustomIndices& indices) + { + return coeffRef(internal::customIndices2Array(indices)); + } +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index index) + { /* NOTE(review): this overload uses eigen_assert while the const overload uses eigen_internal_assert - confirm the difference is intended */ + eigen_assert(index >= 0 && index < size()); + return coeffRef(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()() + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeffRef(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator[](Index index) + { + // The bracket operator is only for vectors, use the parenthesis operator instead + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeffRef(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor() + : m_storage() + { + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor(const Self& other) + : 
m_storage(other.m_storage) + { + } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index firstDimension, IndexTypes... otherDimensions) + : m_storage(firstDimension, otherDimensions...) + { + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } +#else + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1) + : m_storage(dim1, array(dim1)) + { + EIGEN_STATIC_ASSERT(1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2) + : m_storage(dim1*dim2, array(dim1, dim2)) + { + EIGEN_STATIC_ASSERT(2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3) + : m_storage(dim1*dim2*dim3, array(dim1, dim2, dim3)) + { + EIGEN_STATIC_ASSERT(3 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3, Index dim4) + : m_storage(dim1*dim2*dim3*dim4, array(dim1, dim2, dim3, dim4)) + { + EIGEN_STATIC_ASSERT(4 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) + : m_storage(dim1*dim2*dim3*dim4*dim5, array(dim1, dim2, dim3, dim4, dim5)) + { + EIGEN_STATIC_ASSERT(5 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + } +#endif + + /** Normal Dimension */ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(const array& dimensions) + : m_storage(internal::array_prod(dimensions), dimensions) + { /* storage is sized to the product of the requested dimensions */ + EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor(const TensorBase& other) + { /* evaluate the expression into this tensor: resize to the dimensions reported by the assignment evaluator, then run the assignment on DefaultDevice */ + typedef TensorAssignOp Assign; + Assign assign(*this, other.derived()); + resize(TensorEvaluator(assign, 
DefaultDevice()).dimensions()); + internal::TensorExecutor::run(assign, DefaultDevice()); + } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor(const TensorBase& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other.derived()); + resize(TensorEvaluator(assign, DefaultDevice()).dimensions()); + internal::TensorExecutor::run(assign, DefaultDevice()); + } + + #if EIGEN_HAS_RVALUE_REFERENCES + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor(Self&& other) + : m_storage(std::move(other.m_storage)) + { + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor& operator=(Self&& other) + { + m_storage = std::move(other.m_storage); + return *this; + } + #endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor& operator=(const Tensor& other) + { /* copy assignment goes through the same resize-then-execute sequence as the expression constructors */ + typedef TensorAssignOp Assign; + Assign assign(*this, other); + resize(TensorEvaluator(assign, DefaultDevice()).dimensions()); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Tensor& operator=(const OtherDerived& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other); + resize(TensorEvaluator(assign, DefaultDevice()).dimensions()); + internal::TensorExecutor::run(assign, DefaultDevice()); + return *this; + } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template EIGEN_DEVICE_FUNC + void resize(Index firstDimension, IndexTypes... otherDimensions) + { + // The number of dimensions used to resize a tensor must be equal to the rank of the tensor. 
+ EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + resize(array{{firstDimension, otherDimensions...}}); + } +#endif + + /** Normal Dimension */ + EIGEN_DEVICE_FUNC void resize(const array& dimensions) + { /* compute the total element count, running the overflow check before each multiplication */ + int i; + Index size = Index(1); + for (i = 0; i < NumIndices; i++) { + internal::check_rows_cols_for_overflow::run(size, dimensions[i]); + size *= dimensions[i]; + } + #ifdef EIGEN_INITIALIZE_COEFFS + bool size_changed = size != this->size(); + m_storage.resize(size, dimensions); + if(size_changed) EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED /* coefficients are re-initialized only when the element count changed */ + #else + m_storage.resize(size, dimensions); + #endif + } + + // Why this overload, DSizes is derived from array ??? // + EIGEN_DEVICE_FUNC void resize(const DSizes& dimensions) { + array dims; + for (int i = 0; i < NumIndices; ++i) { + dims[i] = dimensions[i]; + } + resize(dims); + } + + EIGEN_DEVICE_FUNC + void resize() + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + // Nothing to do: rank 0 tensors have fixed size + } + +#ifdef EIGEN_HAS_INDEX_LIST + template + EIGEN_DEVICE_FUNC + void resize(const Eigen::IndexList& dimensions) { + array dims; + for (int i = 0; i < NumIndices; ++i) { + dims[i] = static_cast(dimensions[i]); + } + resize(dims); + } +#endif + + /** Custom Dimension */ +#ifdef EIGEN_HAS_SFINAE + template::value) ) + > + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize(CustomDimension& dimensions) + { + resize(internal::customIndices2Array(dimensions)); + } +#endif + +#ifndef EIGEN_EMULATE_CXX11_META_H + template + EIGEN_DEVICE_FUNC + void resize(const Sizes& dimensions) { + array dims; + for (int i = 0; i < NumIndices; ++i) { + dims[i] = static_cast(dimensions[i]); + } + resize(dims); + } +#else + template + EIGEN_DEVICE_FUNC + void resize(const Sizes& dimensions) { + array dims; + for (int i = 0; i < NumIndices; ++i) { + dims[i] = static_cast(dimensions[i]); + } + resize(dims); + } +#endif + + protected: + + bool 
checkIndexRange(const array& indices) const + { /* NOTE(review): unlike linearizedIndex() below, this helper carries no EIGEN_DEVICE_FUNC although device-callable accessors reference it inside eigen_internal_assert - confirm */ + using internal::array_apply_and_reduce; + using internal::array_zip_and_reduce; + using internal::greater_equal_zero_op; + using internal::logical_and_op; + using internal::lesser_op; + + return + // check whether the indices are all >= 0 + array_apply_and_reduce(indices) && + // check whether the indices fit in the dimensions + array_zip_and_reduce(indices, m_storage.dimensions()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index linearizedIndex(const array& indices) const + { /* the RowMajor bit of Options selects which linearization of the dimensions is used */ + if (Options&RowMajor) { + return m_storage.dimensions().IndexOfRowMajor(indices); + } else { + return m_storage.dimensions().IndexOfColMajor(indices); + } + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorArgMax.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorArgMax.h new file mode 100644 index 0000000..8b8fb92 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorArgMax.h @@ -0,0 +1,329 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Eugene Brevdo +// Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H +#define EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H + +namespace Eigen { +namespace internal { + +/** \class TensorIndexTuple + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor + Index Tuple class. 
+ * + * + */ +template +struct traits > : public traits +{ + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef Tuple Scalar; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; /* rank and layout are taken over unchanged from the wrapped expression's traits */ + static const int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorIndexTupleOpEIGEN_DEVICE_REF type; +}; + +template +struct nested, 1, + typename eval >::type> +{ + typedef TensorIndexTupleOp type; +}; + +} // end namespace internal + +template +class TensorIndexTupleOp : public TensorBase, ReadOnlyAccessors> +{ /* expression node: it only keeps the wrapped expression (m_xpr); coefficients are produced by the TensorEvaluator specialization that follows */ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + typedef Tuple CoeffReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIndexTupleOp(const XprType& expr) + : m_xpr(expr) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorIndexTupleOp XprType; + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + + typedef typename TensorEvaluator::Dimensions Dimensions; + static const int NumDims = internal::array_size::value; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + enum { + IsAligned = /*TensorEvaluator::IsAligned*/ false, + PacketAccess = /*TensorEvaluator::PacketAccess*/ false, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + Layout = 
TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { + return m_impl.dimensions(); + } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { /* the wrapped evaluator is given a NULL destination; this evaluator always reports true */ + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { /* pairs the linear index with the wrapped expression's coefficient at that index */ + return CoeffReturnType(index, m_impl.coeff(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, 1); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } /* no buffer is exposed; RawAccess is false above */ + +#ifdef EIGEN_USE_SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif + + protected: + TensorEvaluator m_impl; +}; + +namespace internal { + +/** \class TensorTupleIndex + * \ingroup CXX11_Tensor_Module + * + * \brief Converts to Tensor > and reduces to Tensor. 
+ * + */ +template +struct traits > : public traits +{ + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef Index Scalar; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions - array_size::value; + static const int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorTupleReducerOpEIGEN_DEVICE_REF type; +}; + +template +struct nested, 1, + typename eval >::type> +{ + typedef TensorTupleReducerOp type; +}; + +} // end namespace internal + +template +class TensorTupleReducerOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + typedef Index CoeffReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTupleReducerOp(const XprType& expr, + const ReduceOp& reduce_op, + const Index return_dim, + const Dims& reduce_dims) + : m_xpr(expr), m_reduce_op(reduce_op), m_return_dim(return_dim), m_reduce_dims(reduce_dims) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + const ReduceOp& reduce_op() const { return m_reduce_op; } + + EIGEN_DEVICE_FUNC + const Dims& reduce_dims() const { return m_reduce_dims; } + + EIGEN_DEVICE_FUNC + Index return_dim() const { return m_return_dim; } + + protected: + typename XprType::Nested m_xpr; + const ReduceOp m_reduce_op; + const Index m_return_dim; + const Dims m_reduce_dims; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorTupleReducerOp XprType; + typedef typename XprType::Index Index; + typedef 
typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename TensorIndexTupleOp::CoeffReturnType TupleType; + typedef typename TensorEvaluator >, Device>::Dimensions Dimensions; + typedef typename TensorEvaluator , Device>::Dimensions InputDimensions; + static const int NumDims = internal::array_size::value; + typedef array StrideDims; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + typedef StorageMemory TupleStorageMem; + + enum { + IsAligned = /*TensorEvaluator::IsAligned*/ false, + PacketAccess = /*TensorEvaluator::PacketAccess*/ false, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator >, Device>::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_orig_impl(op.expression(), device), + m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device), + m_return_dim(op.return_dim()) + { + gen_strides(m_orig_impl.dimensions(), m_strides); + if (Layout == static_cast(ColMajor)) { + const Index total_size = internal::array_prod(m_orig_impl.dimensions()); + m_stride_mod = (m_return_dim < NumDims - 1) ? m_strides[m_return_dim + 1] : total_size; + } else { + const Index total_size = internal::array_prod(m_orig_impl.dimensions()); + m_stride_mod = (m_return_dim > 0) ? m_strides[m_return_dim - 1] : total_size; + } + // If m_return_dim is not a valid index, returns 1 or this can crash on Windows. + m_stride_div = ((m_return_dim >= 0) && + (m_return_dim < static_cast(m_strides.size()))) + ? 
m_strides[m_return_dim] : 1; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { + return m_impl.dimensions(); + } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + const TupleType v = m_impl.coeff(index); + return (m_return_dim < 0) ? v.first : (v.first % m_stride_mod) / m_stride_div; + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } +#ifdef EIGEN_USE_SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + m_orig_impl.bind(cgh); + } +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + const double compute_cost = 1.0 + + (m_return_dim < 0 ? 0.0 : (TensorOpCost::ModCost() + TensorOpCost::DivCost())); + return m_orig_impl.costPerCoeff(vectorized) + + m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost); + } + + private: + EIGEN_DEVICE_FUNC void gen_strides(const InputDimensions& dims, StrideDims& strides) { + if (m_return_dim < 0) { + return; // Won't be using the strides. + } + eigen_assert(m_return_dim < NumDims && + "Asking to convert index to a dimension outside of the rank"); + + // Calculate m_stride_div and m_stride_mod, which are used to + // calculate the value of an index w.r.t. the m_return_dim. 
+ if (Layout == static_cast(ColMajor)) { + strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + strides[i] = strides[i-1] * dims[i-1]; + } + } else { + strides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + strides[i] = strides[i+1] * dims[i+1]; + } + } + } + + protected: + TensorEvaluator, Device> m_orig_impl; + TensorEvaluator >, Device> m_impl; + const Index m_return_dim; + StrideDims m_strides; + Index m_stride_mod; + Index m_stride_div; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorAssign.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorAssign.h new file mode 100644 index 0000000..e5811d6 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorAssign.h @@ -0,0 +1,247 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H +#define EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H + +namespace Eigen { + +/** \class TensorAssign + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor assignment class. + * + * This class represents the assignment of the values resulting from the evaluation of + * the rhs expression to the memory locations denoted by the lhs expression.
+ */ +namespace internal { +template +struct traits > +{ + typedef typename LhsXprType::Scalar Scalar; + typedef typename traits::StorageKind StorageKind; + typedef typename promote_index_type::Index, + typename traits::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference::type _LhsNested; + typedef typename remove_reference::type _RhsNested; + static const std::size_t NumDimensions = internal::traits::NumDimensions; + static const int Layout = internal::traits::Layout; + typedef typename traits::PointerType PointerType; + + enum { + Flags = 0 + }; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorAssignOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorAssignOp type; +}; + +} // end namespace internal + + + +template +class TensorAssignOp : public TensorBase > +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename LhsXprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + static const int NumDims = Eigen::internal::traits::NumDimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorAssignOp(LhsXprType& lhs, const RhsXprType& rhs) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs) {} + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC + typename internal::remove_all::type& + lhsExpression() const { return *((typename internal::remove_all::type*)&m_lhs_xpr); } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + rhsExpression() const { return m_rhs_xpr; } + + protected: + typename internal::remove_all::type& m_lhs_xpr; + const typename internal::remove_all::type& m_rhs_xpr; +}; + + +template +struct TensorEvaluator, Device> +{ + typedef 
TensorAssignOp XprType; + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename TensorEvaluator::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static const int PacketSize = PacketType::size; + static const int NumDims = XprType::NumDims; + + enum { + IsAligned = int(TensorEvaluator::IsAligned) & + int(TensorEvaluator::IsAligned), + PacketAccess = int(TensorEvaluator::PacketAccess) & + int(TensorEvaluator::PacketAccess), + BlockAccess = int(TensorEvaluator::BlockAccess) & + int(TensorEvaluator::BlockAccess), + PreferBlockAccess = int(TensorEvaluator::PreferBlockAccess) | + int(TensorEvaluator::PreferBlockAccess), + Layout = TensorEvaluator::Layout, + RawAccess = TensorEvaluator::RawAccess + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock + RightTensorBlock; + //===--------------------------------------------------------------------===// + + TensorEvaluator(const XprType& op, const Device& device) : + m_leftImpl(op.lhsExpression(), device), + m_rightImpl(op.rhsExpression(), device) + { + EIGEN_STATIC_ASSERT( + (static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + } + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const + { + // The dimensions of the lhs and the rhs tensors should be equal to prevent + // overflows and ensure the result is fully initialized. + // TODO: use left impl instead if right impl dimensions are known at compile time. 
+ return m_rightImpl.dimensions(); + } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); + m_leftImpl.evalSubExprsIfNeeded(NULL); + // If the lhs provides raw access to its storage area (i.e. if m_leftImpl.data() returns a non + // null value), attempt to evaluate the rhs expression in place. Returns true iff in place + // evaluation isn't supported and the caller still needs to manually assign the values generated + // by the rhs to the lhs. + return m_rightImpl.evalSubExprsIfNeeded(m_leftImpl.data()); + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) { + m_rightImpl.evalSubExprsIfNeededAsync( + m_leftImpl.data(), [done](bool need_assign) { done(need_assign); }); + }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { + m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { + + const int LhsStoreMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; + const int RhsLoadMode = TensorEvaluator::IsAligned ? 
Aligned : Unaligned; + m_leftImpl.template writePacket(i, m_rightImpl.template packet(i)); + } + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_leftImpl.coeff(index); + } + template + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + { + return m_leftImpl.template packet(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + // We assume that evalPacket or evalScalar is called to perform the + // assignment and account for the cost of the write here, but reduce left + // cost by one load because we are using m_leftImpl.coeffRef. + TensorOpCost left = m_leftImpl.costPerCoeff(vectorized); + return m_rightImpl.costPerCoeff(vectorized) + + TensorOpCost( + numext::maxi(0.0, left.bytes_loaded() - sizeof(CoeffReturnType)), + left.bytes_stored(), left.compute_cycles()) + + TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + return internal::TensorBlockResourceRequirements::merge( + m_leftImpl.getResourceRequirements(), + m_rightImpl.getResourceRequirements()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock( + TensorBlockDesc& desc, TensorBlockScratch& scratch) { + if (TensorEvaluator::RawAccess && + m_leftImpl.data() != NULL) { + // If destination has raw data access, we pass it as a potential + // destination for a block descriptor evaluation. + desc.template AddDestinationBuffer( + /*dst_base=*/m_leftImpl.data() + desc.offset(), + /*dst_strides=*/internal::strides(m_leftImpl.dimensions())); + } + + RightTensorBlock block = m_rightImpl.block(desc, scratch, /*root_of_expr_ast=*/true); + // If block was evaluated into a destination, there is no need to do assignment. 
+ if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) { + m_leftImpl.writeBlock(desc, block); + } + block.cleanup(); + } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_leftImpl.bind(cgh); + m_rightImpl.bind(cgh); + } +#endif + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_leftImpl.data(); } + + private: + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; +}; + +} + + +#endif // EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorBase.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorBase.h new file mode 100644 index 0000000..35b6458 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorBase.h @@ -0,0 +1,1176 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_BASE_H +#define EIGEN_CXX11_TENSOR_TENSOR_BASE_H + +// clang-format off + +namespace Eigen { + +/** \class TensorBase + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor base class. + * + * This class is the common parent of the Tensor and TensorMap class, thus + * making it possible to use either class interchangeably in expressions. 
+ */ +#ifndef EIGEN_PARSED_BY_DOXYGEN +// FIXME Doxygen does not like the inheritance with different template parameters +// Since there is no doxygen documentation inside, we disable it for now +template +class TensorBase +{ + public: + typedef internal::traits DerivedTraits; + typedef typename DerivedTraits::Scalar Scalar; + typedef typename DerivedTraits::Index Index; + typedef typename internal::remove_const::type CoeffReturnType; + static const int NumDimensions = DerivedTraits::NumDimensions; + + // Generic nullary operation support. + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp + nullaryExpr(const CustomNullaryOp& func) const { + return TensorCwiseNullaryOp(derived(), func); + } + + // Coefficient-wise nullary operators + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp, const Derived> + constant(const Scalar& value) const { + return nullaryExpr(internal::scalar_constant_op(value)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp, const Derived> + random() const { + return nullaryExpr(internal::UniformRandomGenerator()); + } + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseNullaryOp + random(const RandomGenerator& gen = RandomGenerator()) const { + return nullaryExpr(gen); + } + + // Tensor generation + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorGeneratorOp + generate(const Generator& generator) const { + return TensorGeneratorOp(derived(), generator); + } + + // Generic unary operation support. 
+ template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp + unaryExpr(const CustomUnaryOp& func) const { + return TensorCwiseUnaryOp(derived(), func); + } + + // Coefficient-wise unary operators + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + operator-() const { + return unaryExpr(internal::scalar_opposite_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + sqrt() const { + return unaryExpr(internal::scalar_sqrt_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + sign() const { + return unaryExpr(internal::scalar_sign_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + rsqrt() const { + return unaryExpr(internal::scalar_rsqrt_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + square() const { + return unaryExpr(internal::scalar_square_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + cube() const { + return unaryExpr(internal::scalar_cube_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + inverse() const { + return unaryExpr(internal::scalar_inverse_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + tanh() const { + return unaryExpr(internal::scalar_tanh_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + lgamma() const { + return unaryExpr(internal::scalar_lgamma_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + digamma() const { + return unaryExpr(internal::scalar_digamma_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_i0() const { + return unaryExpr(internal::scalar_bessel_i0_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const 
TensorCwiseUnaryOp, const Derived> + bessel_i0e() const { + return unaryExpr(internal::scalar_bessel_i0e_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_i1() const { + return unaryExpr(internal::scalar_bessel_i1_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_i1e() const { + return unaryExpr(internal::scalar_bessel_i1e_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_j0() const { + return unaryExpr(internal::scalar_bessel_j0_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_y0() const { + return unaryExpr(internal::scalar_bessel_y0_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_j1() const { + return unaryExpr(internal::scalar_bessel_j1_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_y1() const { + return unaryExpr(internal::scalar_bessel_y1_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_k0() const { + return unaryExpr(internal::scalar_bessel_k0_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_k0e() const { + return unaryExpr(internal::scalar_bessel_k0e_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_k1() const { + return unaryExpr(internal::scalar_bessel_k1_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + bessel_k1e() const { + return unaryExpr(internal::scalar_bessel_k1e_op()); + } + + // igamma(a = this, x = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + igamma(const OtherDerived& other) const { + return binaryExpr(other.derived(), 
internal::scalar_igamma_op()); + } + + // igamma_der_a(a = this, x = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + igamma_der_a(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_igamma_der_a_op()); + } + + // gamma_sample_der_alpha(alpha = this, sample = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + gamma_sample_der_alpha(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_gamma_sample_der_alpha_op()); + } + + // igammac(a = this, x = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + igammac(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_igammac_op()); + } + + // zeta(x = this, q = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + zeta(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_zeta_op()); + } + + // polygamma(n = this, x = other) + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + polygamma(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_polygamma_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + erf() const { + return unaryExpr(internal::scalar_erf_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + erfc() const { + return unaryExpr(internal::scalar_erfc_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + ndtri() const { + return unaryExpr(internal::scalar_ndtri_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + sigmoid() 
const { + return unaryExpr(internal::scalar_logistic_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + exp() const { + return unaryExpr(internal::scalar_exp_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + expm1() const { + return unaryExpr(internal::scalar_expm1_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + log() const { + return unaryExpr(internal::scalar_log_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + log1p() const { + return unaryExpr(internal::scalar_log1p_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + log2() const { + return unaryExpr(internal::scalar_log2_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + abs() const { + return unaryExpr(internal::scalar_abs_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + clip(Scalar min, Scalar max) const { + return unaryExpr(internal::scalar_clamp_op(min, max)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const typename internal::conditional::IsComplex, + TensorCwiseUnaryOp, const Derived>, + Derived>::type + conjugate() const { + return choose(Cond::IsComplex>(), unaryExpr(internal::scalar_conjugate_op()), derived()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp >, const Derived> + pow(Scalar exponent) const { + return unaryExpr(internal::bind2nd_op >(exponent)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + real() const { + return unaryExpr(internal::scalar_real_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + imag() const { + return unaryExpr(internal::scalar_imag_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp >, const 
Derived> + operator+ (Scalar rhs) const { + return unaryExpr(internal::bind2nd_op >(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE friend + const TensorCwiseUnaryOp >, const Derived> + operator+ (Scalar lhs, const Derived& rhs) { + return rhs.unaryExpr(internal::bind1st_op >(lhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp >, const Derived> + operator- (Scalar rhs) const { + EIGEN_STATIC_ASSERT((NumTraits::IsSigned || internal::is_same >::value), YOU_MADE_A_PROGRAMMING_MISTAKE); + return unaryExpr(internal::bind2nd_op >(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE friend + const TensorCwiseUnaryOp >, const Derived> + operator- (Scalar lhs, const Derived& rhs) { + return rhs.unaryExpr(internal::bind1st_op >(lhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp >, const Derived> + operator* (Scalar rhs) const { + return unaryExpr(internal::bind2nd_op >(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE friend + const TensorCwiseUnaryOp >, const Derived> + operator* (Scalar lhs, const Derived& rhs) { + return rhs.unaryExpr(internal::bind1st_op >(lhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp >, const Derived> + operator/ (Scalar rhs) const { + return unaryExpr(internal::bind2nd_op >(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE friend + const TensorCwiseUnaryOp >, const Derived> + operator/ (Scalar lhs, const Derived& rhs) { + return rhs.unaryExpr(internal::bind1st_op >(lhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + operator% (Scalar rhs) const { + EIGEN_STATIC_ASSERT(NumTraits::IsInteger, YOU_MADE_A_PROGRAMMING_MISTAKE_TRY_MOD); + return unaryExpr(internal::scalar_mod_op(rhs)); + } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + cwiseMax(Scalar threshold) const { + return cwiseMax(constant(threshold)); + 
} + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + cwiseMin(Scalar threshold) const { + return cwiseMin(constant(threshold)); + } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const typename internal::conditional::value, + Derived, + TensorConversionOp >::type + cast() const { + return choose(Cond::value>(), derived(), TensorConversionOp(derived())); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + round() const { + return unaryExpr(internal::scalar_round_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + rint() const { + return unaryExpr(internal::scalar_rint_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + ceil() const { + return unaryExpr(internal::scalar_ceil_op()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + floor() const { + return unaryExpr(internal::scalar_floor_op()); + } + + // Generic binary operation support. + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp + binaryExpr(const OtherDerived& other, const CustomBinaryOp& func) const { + return TensorCwiseBinaryOp(derived(), other, func); + } + + // Coefficient-wise binary operators. 
+ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator+(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_sum_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator-(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_difference_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator*(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_product_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator/(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_quotient_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + cwiseMax(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_max_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + cwiseMin(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_min_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp + operator&&(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_boolean_and_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp + operator||(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_boolean_or_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp + operator^(const OtherDerived& other) const { + return binaryExpr(other.derived(), 
internal::scalar_boolean_xor_op()); + } + + // Comparisons and tests. + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator<(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_cmp_op()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator<=(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_cmp_op()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator>(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_cmp_op()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator>=(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_cmp_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator==(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_cmp_op()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCwiseBinaryOp, const Derived, const OtherDerived> + operator!=(const OtherDerived& other) const { + return binaryExpr(other.derived(), internal::scalar_cmp_op()); + } + + // comparisons and tests for Scalars + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + operator<(Scalar threshold) const { + return operator<(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + operator<=(Scalar threshold) const { + return operator<=(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const 
Derived, const TensorCwiseNullaryOp, const Derived> > + operator>(Scalar threshold) const { + return operator>(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + operator>=(Scalar threshold) const { + return operator>=(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + operator==(Scalar threshold) const { + return operator==(constant(threshold)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const TensorCwiseNullaryOp, const Derived> > + operator!=(Scalar threshold) const { + return operator!=(constant(threshold)); + } + + // Checks + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + (isnan)() const { + return unaryExpr(internal::scalar_isnan_op()); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + (isinf)() const { + return unaryExpr(internal::scalar_isinf_op()); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + (isfinite)() const { + return unaryExpr(internal::scalar_isfinite_op()); + } + + // Coefficient-wise ternary operators. + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorSelectOp + select(const ThenDerived& thenTensor, const ElseDerived& elseTensor) const { + return TensorSelectOp(derived(), thenTensor.derived(), elseTensor.derived()); + } + + // Contractions. 
+ typedef Eigen::IndexPair DimensionPair; + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorContractionOp + contract(const OtherDerived& other, const Dimensions& dims) const { + return TensorContractionOp(derived(), other.derived(), dims); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorContractionOp + contract(const OtherDerived& other, const Dimensions& dims, const OutputKernel& output_kernel) const { + return TensorContractionOp(derived(), other.derived(), dims, output_kernel); + } + + // Convolutions. + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorConvolutionOp + convolve(const KernelDerived& kernel, const Dimensions& dims) const { + return TensorConvolutionOp(derived(), kernel.derived(), dims); + } + + // Fourier transforms + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorFFTOp + fft(const FFT& dims) const { + return TensorFFTOp(derived(), dims); + } + + // Scan. + typedef TensorScanOp, const Derived> TensorScanSumOp; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorScanSumOp + cumsum(const Index& axis, bool exclusive = false) const { + return TensorScanSumOp(derived(), axis, exclusive); + } + + typedef TensorScanOp, const Derived> TensorScanProdOp; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorScanProdOp + cumprod(const Index& axis, bool exclusive = false) const { + return TensorScanProdOp(derived(), axis, exclusive); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorScanOp + scan(const Index& axis, const Reducer& reducer, bool exclusive = false) const { + return TensorScanOp(derived(), axis, exclusive, reducer); + } + + // Reductions. 
+ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> + sum(const Dims& dims) const { + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::SumReducer()); + } + + const TensorReductionOp, const DimensionList, const Derived> + sum() const { + DimensionList in_dims; + return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::SumReducer()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> + mean(const Dims& dims) const { + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MeanReducer()); + } + + const TensorReductionOp, const DimensionList, const Derived> + mean() const { + DimensionList in_dims; + return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::MeanReducer()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> + prod(const Dims& dims) const { + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::ProdReducer()); + } + + const TensorReductionOp, const DimensionList, const Derived> + prod() const { + DimensionList in_dims; + return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::ProdReducer()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> + maximum(const Dims& dims) const { + return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MaxReducer()); + } + + template + const TensorReductionOp, const DimensionList, const Derived> + maximum() const { + DimensionList in_dims; + return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::MaxReducer()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const Dims, const Derived> + minimum(const Dims& dims) const { + 
return TensorReductionOp, const Dims, const Derived>(derived(), dims, internal::MinReducer()); + } + + template + const TensorReductionOp, const DimensionList, const Derived> + minimum() const { + DimensionList in_dims; + return TensorReductionOp, const DimensionList, const Derived>(derived(), in_dims, internal::MinReducer()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp::value, Derived, TensorConversionOp >::type > + all(const Dims& dims) const { + return cast().reduce(dims, internal::AndReducer()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const typename internal::conditional::value, Derived, TensorConversionOp >::type > + all() const { + DimensionList in_dims; + return cast().reduce(in_dims, internal::AndReducer()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp::value, Derived, TensorConversionOp >::type > + any(const Dims& dims) const { + return cast().reduce(dims, internal::OrReducer()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp, const typename internal::conditional::value, Derived, TensorConversionOp >::type > + any() const { + DimensionList in_dims; + return cast().reduce(in_dims, internal::OrReducer()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorTupleReducerOp< + internal::ArgMaxTupleReducer >, + const array, const Derived> + argmax() const { + array in_dims; + for (Index d = 0; d < NumDimensions; ++d) in_dims[d] = d; + return TensorTupleReducerOp< + internal::ArgMaxTupleReducer >, + const array, + const Derived>(derived(), internal::ArgMaxTupleReducer >(), -1, in_dims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorTupleReducerOp< + internal::ArgMinTupleReducer >, + const array, const Derived> + argmin() const { + array in_dims; + for (Index d = 0; d < NumDimensions; ++d) in_dims[d] = d; + return TensorTupleReducerOp< + internal::ArgMinTupleReducer >, + const array, + const Derived>(derived(), 
internal::ArgMinTupleReducer >(), -1, in_dims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorTupleReducerOp< + internal::ArgMaxTupleReducer >, + const array, const Derived> + argmax(const Index return_dim) const { + array in_dims; + in_dims[0] = return_dim; + return TensorTupleReducerOp< + internal::ArgMaxTupleReducer >, + const array, + const Derived>(derived(), internal::ArgMaxTupleReducer >(), return_dim, in_dims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorTupleReducerOp< + internal::ArgMinTupleReducer >, + const array, const Derived> + argmin(const Index return_dim) const { + array in_dims; + in_dims[0] = return_dim; + return TensorTupleReducerOp< + internal::ArgMinTupleReducer >, + const array, + const Derived>(derived(), internal::ArgMinTupleReducer >(), return_dim, in_dims); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReductionOp + reduce(const Dims& dims, const Reducer& reducer) const { + return TensorReductionOp(derived(), dims, reducer); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorTraceOp + trace(const Dims& dims) const { + return TensorTraceOp(derived(), dims); + } + + const TensorTraceOp, const Derived> + trace() const { + DimensionList in_dims; + return TensorTraceOp, const Derived>(derived(), in_dims); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorBroadcastingOp + broadcast(const Broadcast& bcast) const { + return TensorBroadcastingOp(derived(), bcast); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorConcatenationOp + concatenate(const OtherDerived& other, Axis axis) const { + return TensorConcatenationOp(derived(), other.derived(), axis); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPatchOp + extract_patches(const PatchDims& patch_dims) const { + return TensorPatchOp(derived(), patch_dims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp + extract_image_patches(const Index 
patch_rows = 1, const Index patch_cols = 1, + const Index row_stride = 1, const Index col_stride = 1, + const Index in_row_stride = 1, const Index in_col_stride = 1, + const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = Scalar(0)) const { + return TensorImagePatchOp(derived(), patch_rows, patch_cols, row_stride, col_stride, + in_row_stride, in_col_stride, 1, 1, padding_type, padding_value); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorImagePatchOp + extract_image_patches(const Index patch_rows, const Index patch_cols, + const Index row_stride, const Index col_stride, + const Index in_row_stride, const Index in_col_stride, + const Index row_inflate_stride, const Index col_inflate_stride, + const Index padding_top, const Index padding_bottom, + const Index padding_left,const Index padding_right, + const Scalar padding_value) const { + return TensorImagePatchOp(derived(), patch_rows, patch_cols, row_stride, col_stride, + in_row_stride, in_col_stride, row_inflate_stride, col_inflate_stride, + padding_top, padding_bottom, padding_left, padding_right, padding_value); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorVolumePatchOp + extract_volume_patches(const Index patch_planes, const Index patch_rows, const Index patch_cols, + const Index plane_stride = 1, const Index row_stride = 1, const Index col_stride = 1, + const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = Scalar(0)) const { + return TensorVolumePatchOp(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, 1, 1, 1, padding_type, padding_value); + } + + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorVolumePatchOp + extract_volume_patches(const Index patch_planes, const Index patch_rows, const Index patch_cols, + const Index plane_stride, const Index row_stride, const Index col_stride, + const Index plane_inflate_stride, const Index row_inflate_stride, const Index col_inflate_stride, + const 
Index padding_top_z, const Index padding_bottom_z, + const Index padding_top, const Index padding_bottom, + const Index padding_left, const Index padding_right, const Scalar padding_value = Scalar(0)) const { + return TensorVolumePatchOp(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, plane_inflate_stride, row_inflate_stride, col_inflate_stride, padding_top_z, padding_bottom_z, padding_top, padding_bottom, padding_left, padding_right, padding_value); + } + + // Morphing operators. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorLayoutSwapOp + swap_layout() const { + return TensorLayoutSwapOp(derived()); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReshapingOp + reshape(const NewDimensions& newDimensions) const { + return TensorReshapingOp(derived(), newDimensions); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorSlicingOp + slice(const StartIndices& startIndices, const Sizes& sizes) const { + return TensorSlicingOp(derived(), startIndices, sizes); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorStridingSlicingOp + stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) const { + return TensorStridingSlicingOp(derived(), startIndices, stopIndices, strides); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp + chip(const Index offset) const { + return TensorChippingOp(derived(), offset, DimId); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp + chip(const Index offset, const Index dim) const { + return TensorChippingOp(derived(), offset, dim); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReverseOp + reverse(const ReverseDimensions& rev) const { + return TensorReverseOp(derived(), rev); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPaddingOp + pad(const PaddingDimensions& padding) const { + return 
TensorPaddingOp(derived(), padding, internal::scalar_cast_op()(0)); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorPaddingOp + pad(const PaddingDimensions& padding, const Scalar padding_value) const { + return TensorPaddingOp(derived(), padding, padding_value); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorShufflingOp + shuffle(const Shuffle& shfl) const { + return TensorShufflingOp(derived(), shfl); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorStridingOp + stride(const Strides& strides) const { + return TensorStridingOp(derived(), strides); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorInflationOp + inflate(const Strides& strides) const { + return TensorInflationOp(derived(), strides); + } + + // Returns a tensor containing index/value tuples + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorIndexTupleOp + index_tuples() const { + return TensorIndexTupleOp(derived()); + } + + // Support for custom unary and binary operations + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCustomUnaryOp customOp(const CustomUnaryFunc& op) const { + return TensorCustomUnaryOp(derived(), op); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorCustomBinaryOp customOp(const OtherDerived& other, const CustomBinaryFunc& op) const { + return TensorCustomBinaryOp(derived(), other, op); + } + + // Force the evaluation of the expression. 
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorForcedEvalOp eval() const { + return TensorForcedEvalOp(derived()); + } + + protected: + template friend class Tensor; + template friend class TensorFixedSize; + // the Eigen:: prefix is required to workaround a compilation issue with nvcc 9.0 + template friend class Eigen::TensorBase; + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast(this); } +}; + +template::value> +class TensorBase : public TensorBase { + public: + typedef TensorBase Base; + typedef internal::traits DerivedTraits; + typedef typename DerivedTraits::Scalar Scalar; + typedef typename DerivedTraits::Index Index; + typedef Scalar CoeffReturnType; + static const int NumDimensions = DerivedTraits::NumDimensions; + + template friend class Tensor; + template friend class TensorFixedSize; + // the Eigen:: prefix is required to workaround a compilation issue with nvcc 9.0 + template friend class Eigen::TensorBase; + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setZero() { + return setConstant(Scalar(0)); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setConstant(const Scalar& val) { + return derived() = this->constant(val); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setRandom() { + return derived() = this->random(); + } + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setRandom() { + return derived() = this->template random(); + } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& setValues( + const typename internal::Initializer::InitList& vals) { + TensorEvaluator eval(derived(), DefaultDevice()); + internal::initialize_tensor(eval, vals); + return derived(); + } +#endif // EIGEN_HAS_VARIADIC_TEMPLATES + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator+=(const OtherDerived& other) { + return derived() = derived() + other.derived(); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator-=(const 
OtherDerived& other) { + return derived() = derived() - other.derived(); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator*=(const OtherDerived& other) { + return derived() = derived() * other.derived(); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Derived& operator/=(const OtherDerived& other) { + return derived() = derived() / other.derived(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorLayoutSwapOp + swap_layout() const { + return TensorLayoutSwapOp(derived()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorLayoutSwapOp + swap_layout() { + return TensorLayoutSwapOp(derived()); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorConcatenationOp + concatenate(const OtherDerived& other, const Axis& axis) const { + return TensorConcatenationOp(derived(), other, axis); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorConcatenationOp + concatenate(const OtherDerived& other, const Axis& axis) { + return TensorConcatenationOp(derived(), other, axis); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReshapingOp + reshape(const NewDimensions& newDimensions) const { + return TensorReshapingOp(derived(), newDimensions); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorReshapingOp + reshape(const NewDimensions& newDimensions) { + return TensorReshapingOp(derived(), newDimensions); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorSlicingOp + slice(const StartIndices& startIndices, const Sizes& sizes) const { + return TensorSlicingOp(derived(), startIndices, sizes); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorSlicingOp + slice(const StartIndices& startIndices, const Sizes& sizes) { + return TensorSlicingOp(derived(), startIndices, sizes); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorStridingSlicingOp + stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) 
const { + return TensorStridingSlicingOp(derived(), startIndices, stopIndices, strides); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorStridingSlicingOp + stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) { + return TensorStridingSlicingOp(derived(), startIndices, stopIndices, strides); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp + chip(const Index offset) const { + return TensorChippingOp(derived(), offset, DimId); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorChippingOp + chip(const Index offset) { + return TensorChippingOp(derived(), offset, DimId); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorChippingOp + chip(const Index offset, const Index dim) const { + return TensorChippingOp(derived(), offset, dim); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorChippingOp + chip(const Index offset, const Index dim) { + return TensorChippingOp(derived(), offset, dim); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorReverseOp + reverse(const ReverseDimensions& rev) const { + return TensorReverseOp(derived(), rev); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorReverseOp + reverse(const ReverseDimensions& rev) { + return TensorReverseOp(derived(), rev); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorShufflingOp + shuffle(const Shuffle& shfl) const { + return TensorShufflingOp(derived(), shfl); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorShufflingOp + shuffle(const Shuffle& shfl) { + return TensorShufflingOp(derived(), shfl); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorStridingOp + stride(const Strides& strides) const { + return TensorStridingOp(derived(), strides); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorStridingOp + stride(const Strides& strides) { + return TensorStridingOp(derived(), strides); + } + + // Select the 
device on which to evaluate the expression. + template + TensorDevice device(const DeviceType& dev) { + return TensorDevice(dev, derived()); + } + + // Select the async device on which to evaluate the expression. + template + TensorAsyncDevice device(const DeviceType& dev, DoneCallback done) { + return TensorAsyncDevice(dev, derived(), std::move(done)); + } + + protected: + EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TensorBase) + EIGEN_DEFAULT_COPY_CONSTRUCTOR(TensorBase) + + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& operator=(const OtherDerived& other) + { + typedef TensorAssignOp Assign; + Assign assign(derived(), other.derived()); + internal::TensorExecutor::run(assign, DefaultDevice()); + return derived(); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Derived& derived() { return *static_cast(this); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast(this); } +}; +#endif // EIGEN_PARSED_BY_DOXYGEN +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_BASE_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorBlock.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorBlock.h new file mode 100644 index 0000000..1e55d12 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorBlock.h @@ -0,0 +1,1559 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H +#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H + +namespace Eigen { +namespace internal { + +// -------------------------------------------------------------------------- // +// Forward declarations for templates defined below. 
+template +class TensorBlockIO; + +// -------------------------------------------------------------------------- // +// Helper function to compute strides for a densely stored buffer of the given +// dimensions. + +// TODO(ezhulenev): We compute strides 1000 times in different evaluators, use +// this function instead everywhere. +template +EIGEN_ALWAYS_INLINE DSizes strides( + const DSizes& dimensions) { + DSizes strides; + if (NumDims == 0) return strides; + + // TODO(ezhulenev): Use templates to unroll this loop (similar to + // h_array_reduce in CXX11meta.h)? Benchmark it. + if (static_cast(Layout) == static_cast(ColMajor)) { + strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + strides[i] = strides[i - 1] * dimensions[i - 1]; + } + } else { + strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * dimensions[i + 1]; + } + } + + return strides; +} + +template +EIGEN_ALWAYS_INLINE DSizes strides( + const Eigen::array& dimensions) { + return strides(DSizes(dimensions)); +} + +template +EIGEN_STRONG_INLINE DSizes strides( + const Sizes& sizes) { + return strides(DSizes(sizes)); +} + +// -------------------------------------------------------------------------- // + +// Tensor block shape type defines the shape preference for the blocks +// extracted from the larger tensor. 
+// +// Example: blocks of 100 elements from the large 100x100 tensor: +// - tensor: 100x100 +// - target_block_size: 100 +// +// TensorBlockShapeType: +// - kUniformAllDims: 100 blocks of size 10x10 +// - kSkewedInnerDims: 100 blocks of size 100x1 (or 1x100 depending on a column +// or row major layout) +enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims }; + +struct TensorBlockResourceRequirements { + TensorBlockShapeType shape_type; // target block shape + size_t size; // target block size + TensorOpCost cost_per_coeff; // cost of computing a single block element + +#ifdef EIGEN_HIPCC + // For HIPCC, we need to explicitly declare as a "device fun", the constructor + // which is implicitly invoked in the "merge" / "any" routines. else HIPCC + // errors out complaining about the lack of a matching constructor + EIGEN_DEVICE_FUNC + TensorBlockResourceRequirements(TensorBlockShapeType shape_type_, size_t size_, + TensorOpCost cost_) + : shape_type(shape_type_), size(size_), cost_per_coeff(cost_) + {} +#endif + + template + EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize( + TensorBlockShapeType shape_type, size_t size_in_bytes, + TensorOpCost cost) { + const size_t size = numext::maxi(size_t(1), size_in_bytes / sizeof(Scalar)); + return {shape_type, size, cost}; + } + + template + EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize( + TensorBlockShapeType shape_type, size_t size_in_bytes) { + // This default cost per coefficient is valid for most materialized tensor + // block evaluation implementations, because they typically just read + // coefficients from the underlying tensor storage, and write to the tensor + // block buffer (scratch or destination memory, reads and writes have linear + // access pattern). We ignore the fixed cost of block evaluation, because in + // practice it should be negligible. + // + // Lazy block evaluation adds the cost of calling a functor for each + // coefficient. 
+ // + // All non-trivial block evaluation implementations must provide their own + // cost approximation (e.g. shuffling inner dimension has a much higher cost + // because it reads memory randomly, although the total number of moved + // bytes is the same). + return withShapeAndSize(shape_type, size_in_bytes, + {/*bytes_loaded=*/sizeof(Scalar), + /*bytes_stored=*/sizeof(Scalar), + /*compute_cycles=*/0}); + } + + template + EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements skewed( + size_t size_in_bytes) { + return withShapeAndSize(TensorBlockShapeType::kSkewedInnerDims, + size_in_bytes); + } + + template + EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements uniform( + size_t size_in_bytes) { + return withShapeAndSize(TensorBlockShapeType::kUniformAllDims, + size_in_bytes); + } + + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE TensorBlockResourceRequirements + merge(const TensorBlockResourceRequirements& lhs, + const TensorBlockResourceRequirements& rhs) { + return {merge(lhs.shape_type, rhs.shape_type), // shape_type + merge(lhs.size, rhs.size), // size + merge(lhs.cost_per_coeff, rhs.cost_per_coeff)}; // cost_per_coeff + } + + EIGEN_DEVICE_FUNC TensorBlockResourceRequirements& addCostPerCoeff( + TensorOpCost cost) { + cost_per_coeff += cost; + return *this; + } + + // This is a resource requirement that should be returned from expressions + // that do not have any block evaluation preference (e.g. default tensor + // expression with raw buffer access). 
+ EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() { + return {TensorBlockShapeType::kUniformAllDims, 1, {0, 0, 0}}; + } + + private: + using Requirements = TensorBlockResourceRequirements; + + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) { + return numext::maxi(lhs_size, rhs_size); + } + + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE TensorBlockShapeType + merge(TensorBlockShapeType lhs, TensorBlockShapeType rhs) { + return (lhs == TensorBlockShapeType::kSkewedInnerDims || + rhs == TensorBlockShapeType::kSkewedInnerDims) + ? TensorBlockShapeType::kSkewedInnerDims + : TensorBlockShapeType::kUniformAllDims; + } + + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost, + TensorOpCost rhs_cost) { + return lhs_cost + rhs_cost; + } +}; + +// -------------------------------------------------------------------------- // +// TensorBlockDescriptor specifies a block offset within a tensor and the block +// sizes along each of the tensor dimensions. + +template +class TensorBlockDescriptor { + public: + typedef DSizes Dimensions; + + // If we evaluate a Tensor assignment and the expression on the left already + // has a memory buffer, then we might do performance optimization, and evaluate + // the root expression directly into the final output memory. Sometimes it's + // possible to reuse it for materializing subexpressions inside an expression + // tree, to avoid dynamic memory allocation. + // + // The pointer type of the underlying storage is erased, because passing + // Scalar type through all the expression evaluation layers is way too many + // templates. In practice destination buffer type should always match the + // evaluated expression scalar type. 
+ class DestinationBuffer { + public: + enum DestinationBufferKind : int { + // The above explicit specification of "int" as the enum basetype is + // needed to get around a HIPCC link error ("the field type is not + // amp-compatible") + // which is issued for class members with the enum type. + // TODO(rocm): + // remove the "int" basetype once HIPCC has been fixed to not error out + // in the above scenario. + + // Destination buffer is not defined (`m_data` == nullptr). + kEmpty, + + // Tensor block defined by an owning tensor block descriptor can fit + // contiguously into the destination buffer. In this case it's safe to + // materialize tensor block in the destination buffer, wrap it in a + // TensorMap, and use to build Eigen expression on top of it. + kContiguous, + + // Destination buffer strides do not match strides of the contiguously + // stored block, and it's impossible to define a TensorMap over this + // buffer. However if we are evaluating a root of an expression tree, we + // still can materialize an output into this destination, because we can + // guarantee that no one will ever access it through block API. + // + // In theory it is possible to build valid TensorStriding + // expression on top of this destination buffer, however it has + // inefficient coeff/packet access, and defeats the purpose of fast block + // evaluation API. 
+ kStrided + }; + + template + Scalar* data() const { + eigen_assert(m_data_type_size == sizeof(Scalar)); + return static_cast(m_data); + } + + const Dimensions& strides() const { return m_strides; } + const DestinationBufferKind& kind() const { return m_kind; } + + private: + friend class TensorBlockDescriptor; + + DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {} + + template + DestinationBuffer(Scalar* data, const Dimensions& strides, + DestinationBufferKind kind) + : m_data(static_cast(data)), + m_data_type_size(sizeof(Scalar)), + m_strides(strides), + m_kind(kind) {} + + template + static DestinationBuffer make(const TensorBlockDescriptor& desc, + Scalar* data, const Dimensions& strides) { + return DestinationBuffer(data, strides, kind(desc, strides)); + } + + template + static DestinationBufferKind kind(const TensorBlockDescriptor& desc, + const Dimensions& strides) { + const Dimensions& desc_dims = desc.dimensions(); + const Dimensions& desc_strides = internal::strides(desc_dims); + for (int i = 0; i < NumDims; ++i) { + if (desc_dims[i] == 1) continue; + if (desc_strides[i] != strides[i]) return kStrided; + } + return kContiguous; + } + + // Storage pointer is type erased, to reduce template bloat, but we still + // keep the size of the underlying element type for error checking. + void* m_data; + size_t m_data_type_size; + + // Destination buffer dimensions always match the dimensions of a tensor + // block descriptor it belongs to, however strides might be different. 
+ Dimensions m_strides; + + DestinationBufferKind m_kind; + }; + + TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions, + const DestinationBuffer& destination) + : m_offset(offset), + m_dimensions(dimensions), + m_destination(destination) {} + + TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions) + : m_offset(offset), + m_dimensions(dimensions), + m_destination(DestinationBuffer()) {} + + IndexType offset() const { return m_offset; } + const Dimensions& dimensions() const { return m_dimensions; } + IndexType dimension(int index) const { return m_dimensions[index]; } + IndexType size() const { return array_prod(m_dimensions); } + + const DestinationBuffer& destination() const { return m_destination; } + + template + void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) { + eigen_assert(dst_base != NULL); + m_destination = + DestinationBuffer::template make(*this, dst_base, dst_strides); + } + + template + void AddDestinationBuffer( + Scalar* dst_base, + const DSizes& dst_strides) { + // DSizes constructor will do index type promotion if it's safe. + AddDestinationBuffer(dst_base, Dimensions(dst_strides)); + } + + TensorBlockDescriptor& DropDestinationBuffer() { + m_destination.m_data = NULL; + m_destination.m_kind = DestinationBuffer::kEmpty; + return *this; + } + + bool HasDestinationBuffer() const { + return m_destination.kind() != DestinationBuffer::kEmpty; + } + + // Returns a copy of `*this` with updated offset. + TensorBlockDescriptor WithOffset(IndexType offset) const { + return TensorBlockDescriptor(offset, m_dimensions, m_destination); + } + + private: + // Offset and dimensions are immutable after construction. Block descriptor + // can only be mutated by adding or dropping destination. 
+ const IndexType m_offset; + const Dimensions m_dimensions; + DestinationBuffer m_destination; +}; + +// -------------------------------------------------------------------------- // +// TensorBlockMapper is responsible for iterating over the blocks of a tensor. + +template +class TensorBlockMapper { + typedef TensorBlockDescriptor BlockDescriptor; + + public: + typedef DSizes Dimensions; + + TensorBlockMapper() = default; + TensorBlockMapper(const DSizes& dimensions, + const TensorBlockResourceRequirements& requirements) + : m_tensor_dimensions(dimensions), m_requirements(requirements) { + // Compute block dimensions and the total number of blocks. + InitializeBlockDimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const { + return m_total_block_count; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const { + return m_block_dimensions.TotalSize(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes& + blockDimensions() const { + return m_block_dimensions; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor + blockDescriptor(IndexType block_index) const { + static const bool isColMajor = Layout == static_cast(ColMajor); + + IndexType offset = 0; + DSizes dimensions; + + if (NumDims == 0) return BlockDescriptor(offset, dimensions); + + // Iterate outer -> inner dimensions. + for (int i = NumDims - 1; i >= 0; --i) { + const int dim = isColMajor ? i : NumDims - i - 1; + + const IndexType idx = block_index / m_block_strides[dim]; + block_index -= idx * m_block_strides[dim]; + + const IndexType coord = idx * m_block_dimensions[dim]; + dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord, + m_block_dimensions[dim]); + offset += coord * m_tensor_strides[dim]; + } + + return {offset, dimensions}; + } + + private: + void InitializeBlockDimensions() { + // Requested block shape and size. 
+ const TensorBlockShapeType shape_type = m_requirements.shape_type; + IndexType target_block_size = + numext::maxi(1, static_cast(m_requirements.size)); + + IndexType tensor_size = m_tensor_dimensions.TotalSize(); + + // Corner case: one of the dimensions is zero. Logic below is too complex + // to handle this case on a general basis, just use unit block size. + // Note: we must not yield blocks with zero dimensions (recipe for + // overflows/underflows, divisions by zero and NaNs later). + if (tensor_size == 0) { + for (int i = 0; i < NumDims; ++i) { + m_block_dimensions[i] = 1; + } + m_total_block_count = 0; + return; + } + + // If tensor fits into a target block size, evaluate it as a single block. + if (tensor_size <= target_block_size) { + m_block_dimensions = m_tensor_dimensions; + m_total_block_count = 1; + // The only valid block index is `0`, and in this case we do not need + // to compute real strides for tensor or blocks (see blockDescriptor). + for (int i = 0; i < NumDims; ++i) { + m_tensor_strides[i] = 0; + m_block_strides[i] = 1; + } + return; + } + + static const bool isColMajor = Layout == static_cast(ColMajor); + + // Block shape skewed towards inner dimension. + if (shape_type == TensorBlockShapeType::kSkewedInnerDims) { + IndexType coeff_to_allocate = target_block_size; + + for (int i = 0; i < NumDims; ++i) { + const int dim = isColMajor ? i : NumDims - i - 1; + m_block_dimensions[dim] = + numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]); + coeff_to_allocate = divup( + coeff_to_allocate, + numext::maxi(static_cast(1), m_block_dimensions[dim])); + } + eigen_assert(coeff_to_allocate == 1); + + } else if (shape_type == TensorBlockShapeType::kUniformAllDims) { + // Tensor will not fit within 'target_block_size' budget: calculate tensor + // block dimension sizes based on "square" dimension size target. 
+ const IndexType dim_size_target = convert_index( + std::pow(static_cast(target_block_size), + 1.0f / static_cast(m_block_dimensions.rank()))); + + for (int i = 0; i < NumDims; ++i) { + // TODO(andydavis) Adjust the inner most 'block_dim_size' to make it + // a multiple of the packet size. Note that reducing + // 'block_dim_size' in this manner can increase the number of + // blocks, and so will amplify any per-block overhead. + m_block_dimensions[i] = + numext::mini(dim_size_target, m_tensor_dimensions[i]); + } + + // Add any un-allocated coefficients to inner dimension(s). + IndexType total_size = m_block_dimensions.TotalSize(); + for (int i = 0; i < NumDims; ++i) { + const int dim = isColMajor ? i : NumDims - i - 1; + + if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) { + const IndexType total_size_other_dims = + total_size / m_block_dimensions[dim]; + const IndexType alloc_avail = + divup(target_block_size, total_size_other_dims); + if (alloc_avail == m_block_dimensions[dim]) { + // Insufficient excess coefficients to allocate. + break; + } + m_block_dimensions[dim] = + numext::mini(m_tensor_dimensions[dim], alloc_avail); + total_size = total_size_other_dims * m_block_dimensions[dim]; + } + } + + } else { + eigen_assert(false); // unknown block shape + } + + eigen_assert(m_block_dimensions.TotalSize() >= + numext::mini(target_block_size, + m_tensor_dimensions.TotalSize())); + + // Calculate block counts by dimension and total block count. + DSizes block_count; + for (int i = 0; i < NumDims; ++i) { + block_count[i] = divup(m_tensor_dimensions[i], m_block_dimensions[i]); + } + m_total_block_count = array_prod(block_count); + + // Calculate block strides (used for enumerating blocks). 
+ m_tensor_strides = strides(m_tensor_dimensions); + m_block_strides = strides(block_count); + } + + DSizes m_tensor_dimensions; + TensorBlockResourceRequirements m_requirements; + + DSizes m_block_dimensions; + IndexType m_total_block_count; + + DSizes m_tensor_strides; + DSizes m_block_strides; +}; + +// -------------------------------------------------------------------------- // +// TensorBlockScratchAllocator is responsible for allocating temporary buffers +// for block evaluation (output or input block materialization). Given that +// Eigen expression traversal order is deterministic, all temporary allocations +// are happening in the same order, and usually have exactly the same size. +// Scratch allocator keeps a trace of all dynamic allocations, and after the +// first block evaluation is completed, we should be able to reuse all the +// temporary buffers for the next block evaluation. + +template +class TensorBlockScratchAllocator { + public: + explicit TensorBlockScratchAllocator(const Device& device) + : m_device(device), m_allocation_index(0) {} + + ~TensorBlockScratchAllocator() { + for (size_t i = 0; i < m_allocations.size(); ++i) { + m_device.deallocate(m_allocations[i].ptr); + } + } + + void* allocate(size_t size) { + // TODO(ezhulenev): Remove when replaced with inlined vector. + if (m_allocations.capacity() == 0) m_allocations.reserve(8); + + // Check if we already have an existing allocation att current index. + const int num_allocations = static_cast(m_allocations.size()); + const bool has_allocation = m_allocation_index < num_allocations; + + // Allocation index can't be larger than the number of allocations. + eigen_assert(m_allocation_index <= num_allocations); + + // If we have existing allocation, and its size is larger or equal to + // requested size, we do nothing. + + // If current allocation can't fit requested size, we deallocate it, and + // replace with a larger allocation. 
+ if (has_allocation && m_allocations[m_allocation_index].size < size) { + m_device.deallocate(m_allocations[m_allocation_index].ptr); + m_allocations[m_allocation_index].ptr = m_device.allocate(size); + m_allocations[m_allocation_index].size = size; + } + + // Make a new allocation if we don't have and existing one. + if (!has_allocation) { + Allocation allocation; + allocation.ptr = m_device.allocate(size); + allocation.size = size; + m_allocations.push_back(allocation); + } + + eigen_assert(m_allocations[m_allocation_index].ptr != NULL); + eigen_assert(m_allocations[m_allocation_index].size >= size); + + return m_allocations[m_allocation_index++].ptr; + } + + void reset() { m_allocation_index = 0; } + + private: + struct Allocation { + void* ptr; + size_t size; + }; + + const Device& m_device; + int m_allocation_index; + // TODO(ezhulenev): This should be an inlined vector. + std::vector m_allocations; +}; + +// -------------------------------------------------------------------------- // +// TensorBlockKind represents all possible block kinds, that can be produced by +// TensorEvaluator::evalBlock function. +enum TensorBlockKind { + // Tensor block that is a lazy expression that must be assigned to a + // destination using TensorBlockAssign. + kExpr, + + // Tensor block that is a view into a memory buffer owned by an underlying + // Tensor expression (e.g. it can be a view into a Tensor buffer). + kView, + + // Tensor block that was materialized in a scratch memory buffer, allocated + // with TensorBlockScratchAllocator. This block must be copied to a + // destination, similar to a block of `kExpr` type. + kMaterializedInScratch, + + // Tensor block that was materialized directly into the final output memory + // buffer. For example if the left side of an assignment is a Tensor, we can + // directly materialize the block in the destination memory. 
+ // + // If strides in the output buffer do not match tensor block strides, the + // Tensor expression will be invalid, and should not be used by + // TensorBlockAssign or for constructing another block expression. + kMaterializedInOutput +}; + +// -------------------------------------------------------------------------- // +// TensorBlockNotImplemented should be used to defined TensorBlock typedef in +// TensorEvaluators that do not support block evaluation. + +class TensorBlockNotImplemented { + public: + typedef void XprType; +}; + +// -------------------------------------------------------------------------- // +// XprScalar extracts Scalar type from the Eigen expressions (if expression type +// is not void). It's required to be able to define lazy block expression for +// argument types, that do not support block evaluation. + +template +struct XprScalar { + typedef typename XprType::Scalar type; +}; +template <> +struct XprScalar { + typedef void type; +}; + +// -------------------------------------------------------------------------- // +// TensorMaterializedBlock is a fully evaluated block of the original tensor, +// and XprType is just a TensorMap over the data. This block type is typically +// used to materialize blocks of tensor expressions, that can't be efficiently +// represented as lazy Tensor expressions with fast coeff/packet operations, +// e.g. we materialize all broadcasts into evaluated blocks. +// +// TensorMaterializedBlock does not own its memory buffer, it's either a memory +// buffer that backs the original expression (e.g. block is just a view into a +// Tensor), or a memory buffer allocated with scratch allocator, and in this +// case the scratch allocator will deallocate it at the end of block based +// expression execution. 
+// +// If the block was evaluated directly into the output buffer, and strides in +// the output buffer do not match block strides, the TensorMap expression will +// be invalid, and should never be used in block assignment or any other tensor +// expression. + +template +class TensorMaterializedBlock { + public: + typedef DSizes Dimensions; + typedef TensorMap > XprType; + + TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data, + const Dimensions& dimensions, bool valid_expr = true) + : m_kind(kind), + m_data(data), + m_dimensions(dimensions), + m_expr(m_data, m_dimensions), + m_valid_expr(valid_expr) { + eigen_assert(m_kind == internal::TensorBlockKind::kView || + m_kind == internal::TensorBlockKind::kMaterializedInScratch || + m_kind == internal::TensorBlockKind::kMaterializedInOutput); + } + + TensorBlockKind kind() const { return m_kind; } + // NOTE(ezhulenev): Returning XprType by value like in other block types + // causes asan failures. The theory is that XprType::Nested doesn't work + // properly for TensorMap. + const XprType& expr() const { + eigen_assert(m_valid_expr); + return m_expr; + } + const Scalar* data() const { return m_data; } + void cleanup() {} + + typedef internal::TensorBlockDescriptor TensorBlockDesc; + + // TensorMaterializedBlock can be backed by different types of storage: + // + // (1) Contiguous block of memory allocated with scratch allocator. + // (2) Contiguous block of memory reused from tensor block descriptor + // destination buffer. + // (3) Strided block of memory reused from tensor block descriptor + // destination buffer. + // + class Storage { + public: + Scalar* data() const { return m_data; } + const Dimensions& dimensions() const { return m_dimensions; } + const Dimensions& strides() const { return m_strides; } + + TensorMaterializedBlock AsTensorMaterializedBlock() const { + return TensorMaterializedBlock( + m_materialized_in_output + ? 
internal::TensorBlockKind::kMaterializedInOutput + : internal::TensorBlockKind::kMaterializedInScratch, + m_data, m_dimensions, !m_strided_storage); + } + + private: + friend class TensorMaterializedBlock; + + Storage(Scalar* data, const Dimensions& dimensions, + const Dimensions& strides, bool materialized_in_output, + bool strided_storage) + : m_data(data), + m_dimensions(dimensions), + m_strides(strides), + m_materialized_in_output(materialized_in_output), + m_strided_storage(strided_storage) {} + + Scalar* m_data; + Dimensions m_dimensions; + Dimensions m_strides; + bool m_materialized_in_output; + bool m_strided_storage; + }; + + // Creates a storage for materialized block either from the block descriptor + // destination buffer, or allocates a new buffer with scratch allocator. + template + EIGEN_STRONG_INLINE static Storage prepareStorage( + TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool allow_strided_storage = false) { + // Try to reuse destination as an output block buffer. 
+ typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer; + + if (desc.destination().kind() == DestinationBuffer::kContiguous) { + Scalar* buffer = desc.destination().template data(); + desc.DropDestinationBuffer(); + return Storage(buffer, desc.dimensions(), + internal::strides(desc.dimensions()), + /*materialized_in_output=*/true, + /*strided_storage=*/false); + + } else if (desc.destination().kind() == DestinationBuffer::kStrided && + allow_strided_storage) { + Scalar* buffer = desc.destination().template data(); + desc.DropDestinationBuffer(); + return Storage(buffer, desc.dimensions(), desc.destination().strides(), + /*materialized_in_output=*/true, /*strided_storage=*/true); + + } else { + void* mem = scratch.allocate(desc.size() * sizeof(Scalar)); + return Storage(static_cast(mem), desc.dimensions(), + internal::strides(desc.dimensions()), + /*materialized_in_output=*/false, + /*strided_storage=*/false); + } + } + + // Creates a materialized block for the given descriptor from a memory buffer. + template + EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize( + const Scalar* data, const DataDimensions& data_dims, + TensorBlockDesc& desc, TensorBlockScratch& scratch) { + eigen_assert(array_size::value == desc.dimensions().size()); + + // If a tensor block dimensions covers a contiguous block of the underlying + // memory, we can skip block buffer memory allocation, and construct a block + // from existing `data` memory buffer. + // + // Example: (RowMajor layout) + // data_dims: [11, 12, 13, 14] + // desc.dimensions(): [1, 1, 3, 14] + // + // In this case we can construct a TensorBlock starting at + // `data + desc.offset()`, with a `desc.dimensions()` block sizes. + static const bool is_col_major = Layout == ColMajor; + + // Find out how many inner dimensions have a matching size. + int num_matching_inner_dims = 0; + for (int i = 0; i < NumDims; ++i) { + int dim = is_col_major ? 
i : NumDims - i - 1; + if (data_dims[dim] != desc.dimensions()[dim]) break; + ++num_matching_inner_dims; + } + + // All the outer dimensions must be of size `1`, except a single dimension + // before the matching inner dimension (`3` in the example above). + bool can_use_direct_access = true; + for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) { + int dim = is_col_major ? i : NumDims - i - 1; + if (desc.dimension(dim) != 1) { + can_use_direct_access = false; + break; + } + } + + if (can_use_direct_access) { + const Scalar* block_start = data + desc.offset(); + return TensorMaterializedBlock(internal::TensorBlockKind::kView, + block_start, desc.dimensions()); + + } else { + // Reuse destination buffer or allocate new buffer with scratch allocator. + const Storage storage = prepareStorage(desc, scratch); + + typedef internal::TensorBlockIO + TensorBlockIO; + typedef typename TensorBlockIO::Dst TensorBlockIODst; + typedef typename TensorBlockIO::Src TensorBlockIOSrc; + + TensorBlockIOSrc src(internal::strides(Dimensions(data_dims)), + data, desc.offset()); + TensorBlockIODst dst(storage.dimensions(), storage.strides(), + storage.data()); + + TensorBlockIO::Copy(dst, src); + return storage.AsTensorMaterializedBlock(); + } + } + + private: + TensorBlockKind m_kind; + const Scalar* m_data; + Dimensions m_dimensions; + XprType m_expr; + bool m_valid_expr; +}; + +// -------------------------------------------------------------------------- // +// TensorCwiseUnaryBlock is a lazy tensor expression block that applies UnaryOp +// functor to the blocks produced by the underlying Tensor expression. 
+ +template +class TensorCwiseUnaryBlock { + static const bool NoArgBlockAccess = + internal::is_void::value; + + public: + typedef typename conditional< + NoArgBlockAccess, void, + TensorCwiseUnaryOp >:: + type XprType; + + typedef typename XprScalar::type Scalar; + + TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor) + : m_arg_block(arg_block), m_functor(functor) {} + + TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } + + XprType expr() const { return XprType(m_arg_block.expr(), m_functor); } + const Scalar* data() const { return NULL; } + void cleanup() { m_arg_block.cleanup(); } + + private: + ArgTensorBlock m_arg_block; + UnaryOp m_functor; +}; + +// -------------------------------------------------------------------------- // +// TensorCwiseUnaryBlock is a lazy tensor expression block that applies BinaryOp +// functor to the blocks produced by the underlying Tensor expression. + +template +class TensorCwiseBinaryBlock { + static const bool NoArgBlockAccess = + internal::is_void::value || + internal::is_void::value; + + public: + typedef typename conditional< + NoArgBlockAccess, void, + TensorCwiseBinaryOp >::type + XprType; + + typedef typename XprScalar::type Scalar; + + TensorCwiseBinaryBlock(const LhsTensorBlock& left_block, + const RhsTensorBlock& right_block, + const BinaryOp& functor) + : m_left_block(left_block), + m_right_block(right_block), + m_functor(functor) {} + + TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } + + XprType expr() const { + return XprType(m_left_block.expr(), m_right_block.expr(), m_functor); + } + + const Scalar* data() const { return NULL; } + + void cleanup() { + m_left_block.cleanup(); + m_right_block.cleanup(); + } + + private: + LhsTensorBlock m_left_block; + RhsTensorBlock m_right_block; + BinaryOp m_functor; +}; + +// -------------------------------------------------------------------------- // +// TensorUnaryExprBlock is a lazy tensor 
expression block that can construct +// an arbitrary tensor expression from a block of the underlying type (this is a +// generalization of the TensorCwiseUnaryBlock for arbitrary expressions). + +template +class TensorUnaryExprBlock { + typedef typename ArgTensorBlock::XprType ArgXprType; + static const bool NoArgBlockAccess = internal::is_void::value; + + public: + typedef typename conditional< + NoArgBlockAccess, void, + typename BlockFactory::template XprType::type>::type XprType; + + typedef typename XprScalar::type Scalar; + + TensorUnaryExprBlock(const ArgTensorBlock& arg_block, + const BlockFactory& factory) + : m_arg_block(arg_block), m_factory(factory) {} + + TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } + XprType expr() const { return m_factory.expr(m_arg_block.expr()); } + const Scalar* data() const { return NULL; } + void cleanup() { m_arg_block.cleanup(); } + + private: + ArgTensorBlock m_arg_block; + BlockFactory m_factory; +}; + +// -------------------------------------------------------------------------- // +// TensorTernaryExprBlock is a lazy tensor expression block that can construct +// an arbitrary tensor expression from three blocks of the underlying type. 
+ +template +class TensorTernaryExprBlock { + typedef typename Arg1TensorBlock::XprType Arg1XprType; + typedef typename Arg2TensorBlock::XprType Arg2XprType; + typedef typename Arg3TensorBlock::XprType Arg3XprType; + + static const bool NoArgBlockAccess = internal::is_void::value || + internal::is_void::value || + internal::is_void::value; + + public: + typedef typename conditional< + NoArgBlockAccess, void, + typename BlockFactory::template XprType::type>::type XprType; + + typedef typename XprScalar::type Scalar; + + TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block, + const Arg2TensorBlock& arg2_block, + const Arg3TensorBlock& arg3_block, + const BlockFactory& factory) + : m_arg1_block(arg1_block), + m_arg2_block(arg2_block), + m_arg3_block(arg3_block), + m_factory(factory) {} + + TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } + XprType expr() const { + return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(), + m_arg3_block.expr()); + } + const Scalar* data() const { return NULL; } + void cleanup() { + m_arg1_block.cleanup(); + m_arg2_block.cleanup(); + m_arg3_block.cleanup(); + } + + private: + Arg1TensorBlock m_arg1_block; + Arg2TensorBlock m_arg2_block; + Arg3TensorBlock m_arg3_block; + BlockFactory m_factory; +}; + +// -------------------------------------------------------------------------- // +// StridedLinearBufferCopy provides a method to copy data between two linear +// buffers with different strides, with optimized paths for scatter/gather. + +template +class StridedLinearBufferCopy { + typedef typename packet_traits::type Packet; + enum { + Vectorizable = packet_traits::Vectorizable, + PacketSize = packet_traits::size + }; + + public: + // Specifying linear copy kind statically gives ~30% speedup for small sizes. 
+ enum class Kind { + Linear = 0, // src_stride == 1 && dst_stride == 1 + Scatter = 1, // src_stride == 1 && dst_stride != 1 + FillLinear = 2, // src_stride == 0 && dst_stride == 1 + FillScatter = 3, // src_stride == 0 && dst_stride != 1 + Gather = 4, // dst_stride == 1 + Random = 5 // everything else + }; + + struct Dst { + Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {} + + IndexType offset; + IndexType stride; + Scalar* data; + }; + + struct Src { + Src(IndexType o, IndexType s, const Scalar* d) + : offset(o), stride(s), data(d) {} + + IndexType offset; + IndexType stride; + const Scalar* data; + }; + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst, + const Src& src, + const size_t count) { + Run(count, dst.offset, dst.stride, dst.data, src.offset, src.stride, + src.data); + } + + private: + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const IndexType count, const IndexType dst_offset, + const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data, + const IndexType src_offset, const IndexType src_stride, + const Scalar* EIGEN_RESTRICT src_data) { + const Scalar* src = &src_data[src_offset]; + Scalar* dst = &dst_data[dst_offset]; + + if (!Vectorizable) { + for (Index i = 0; i < count; ++i) { + dst[i * dst_stride] = src[i * src_stride]; + } + return; + } + + const IndexType vectorized_size = count - PacketSize; + IndexType i = 0; + + if (kind == StridedLinearBufferCopy::Kind::Linear) { + // ******************************************************************** // + // Linear copy from `src` to `dst`. 
+ const IndexType unrolled_size = count - 4 * PacketSize; + eigen_assert(src_stride == 1 && dst_stride == 1); + for (; i <= unrolled_size; i += 4 * PacketSize) { + for (int j = 0; j < 4; ++j) { + Packet p = ploadu(src + i + j * PacketSize); + pstoreu(dst + i + j * PacketSize, p); + } + } + for (; i <= vectorized_size; i += PacketSize) { + Packet p = ploadu(src + i); + pstoreu(dst + i, p); + } + for (; i < count; ++i) { + dst[i] = src[i]; + } + // ******************************************************************** // + } else if (kind == StridedLinearBufferCopy::Kind::Scatter) { + // Scatter from `src` to `dst`. + eigen_assert(src_stride == 1 && dst_stride != 1); + for (; i <= vectorized_size; i += PacketSize) { + Packet p = ploadu(src + i); + pscatter(dst + i * dst_stride, p, dst_stride); + } + for (; i < count; ++i) { + dst[i * dst_stride] = src[i]; + } + // ******************************************************************** // + } else if (kind == StridedLinearBufferCopy::Kind::FillLinear) { + // Fill `dst` with value at `*src`. + eigen_assert(src_stride == 0 && dst_stride == 1); + const IndexType unrolled_size = count - 4 * PacketSize; + Packet p = pload1(src); + for (; i <= unrolled_size; i += 4 * PacketSize) { + for (int j = 0; j < 4; ++j) { + pstoreu(dst + i + j * PacketSize, p); + } + } + for (; i <= vectorized_size; i += PacketSize) { + pstoreu(dst + i, p); + } + for (; i < count; ++i) { + dst[i] = *src; + } + // ******************************************************************** // + } else if (kind == StridedLinearBufferCopy::Kind::FillScatter) { + // Scatter `*src` into `dst`. 
+ eigen_assert(src_stride == 0 && dst_stride != 1); + Packet p = pload1(src); + for (; i <= vectorized_size; i += PacketSize) { + pscatter(dst + i * dst_stride, p, dst_stride); + } + for (; i < count; ++i) { + dst[i * dst_stride] = *src; + } + // ******************************************************************** // + } else if (kind == StridedLinearBufferCopy::Kind::Gather) { + // Gather from `src` into `dst`. + eigen_assert(dst_stride == 1); + for (; i <= vectorized_size; i += PacketSize) { + Packet p = pgather(src + i * src_stride, src_stride); + pstoreu(dst + i, p); + } + for (; i < count; ++i) { + dst[i] = src[i * src_stride]; + } + // ******************************************************************** // + } else if (kind == StridedLinearBufferCopy::Kind::Random) { + // Random. + for (; i < count; ++i) { + dst[i * dst_stride] = src[i * src_stride]; + } + } else { + eigen_assert(false); + } + } +}; + +// -------------------------------------------------------------------------- // +// TensorBlockIO copies data from `src` tensor block, to the `dst` tensor block. +// It's possible to specify src->dst dimension mapping for the copy operation. +// Dimensions of `dst` specify how many elements have to be copied, for the +// `src` we need to know only stride to navigate through source memory buffer. 
+ +template +class TensorBlockIO { + static const bool IsColMajor = (Layout == ColMajor); + + typedef StridedLinearBufferCopy LinCopy; + + public: + typedef DSizes Dimensions; + typedef DSizes DimensionsMap; + + struct Dst { + Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst, + IndexType dst_offset = 0) + : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {} + + Dimensions dims; + Dimensions strides; + Scalar* data; + IndexType offset; + }; + + struct Src { + Src(const Dimensions& src_strides, const Scalar* src, + IndexType src_offset = 0) + : strides(src_strides), data(src), offset(src_offset) {} + + Dimensions strides; + const Scalar* data; + IndexType offset; + }; + + // Copies data to `dst` from `src`, using provided dimensions mapping: + // + // src_dimension_index = dst_to_src_dim_map[dst_dimension_index] + // + // Returns the number of copied elements. + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy( + const Dst& dst, const Src& src, const DimensionsMap& dst_to_src_dim_map) { + // Copy single scalar value from `src` to `dst`. + if (NumDims == 0) { + *(dst.data + dst.offset) = *(src.data + src.offset); + return 1; + } + + // Both `dst` and `src` must have contiguous innermost dimension. We also + // accept the special case with stride '0', because it's used as a trick to + // implement broadcasting. + { + int inner_dim = IsColMajor ? 0 : NumDims - 1; + EIGEN_UNUSED_VARIABLE(inner_dim); + eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0); + eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0); + } + + // Give a shorter name to `dst_to_src_dim_map`. + const DimensionsMap& dim_map = dst_to_src_dim_map; + + // Do not squeeze reordered inner dimensions. 
+ int num_squeezable_dims = NumSqueezableInnerDims(dim_map); + + // NOTE: We find the innermost dimension (contiguous in memory) in the dst + // block, and we write data linearly into that dimension, reading it from + // the src. If dimensions are reordered, we might end up reading data from + // the src with `stride != 1`. + // + // NOTE: Random-Read/Linear-Write can be up to ~2X faster than + // Linear-Read/Random-Write: https://stackoverflow.com/a/54935680 + + // Find the innermost dimension in the dst whose size is not 1. This is the + // effective inner dim. + int num_size_one_inner_dims = 0; + for (int i = 0; i < num_squeezable_dims; ++i) { + const int dst_dim = IsColMajor ? i : NumDims - i - 1; + if (dst.dims[dst_dim] != 1) break; + num_size_one_inner_dims++; + } + + // If all dimensions are of size 1, just copy a scalar from `src` to `dst`. + if (num_size_one_inner_dims == NumDims) { + *(dst.data + dst.offset) = *(src.data + src.offset); + return 1; + } + + // Outermost dimension in the dst with `stride == 1` (contiguous in memory). + const int dst_stride1_dim = IsColMajor + ? num_size_one_inner_dims + : NumDims - num_size_one_inner_dims - 1; + + // Dimension in the src that corresponds to the dst innermost dimension. + const int src_dim_for_dst_stride1_dim = + NumDims == 0 ? 1 : dim_map[dst_stride1_dim]; + + // Size of the innermost dimension (length of contiguous blocks of memory). + IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim]; + + // Squeeze multiple inner dims into one if they are contiguous in `dst` and + // `src` memory, so we can do less linear copy calls. + for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) { + const int dst_dim = IsColMajor ? 
i : NumDims - i - 1; + const IndexType dst_stride = dst.strides[dst_dim]; + const IndexType src_stride = src.strides[dim_map[dst_dim]]; + if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) { + dst_inner_dim_size *= dst.dims[dst_dim]; + ++num_size_one_inner_dims; + } else { + break; + } + } + + // Setup strides to read data from `src` and write to `dst`. + IndexType input_offset = src.offset; + IndexType output_offset = dst.offset; + IndexType input_stride = + NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim]; + IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim]; + + const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1; + array it; + + // Initialize block iterator state. Squeeze away any dimension of size 1. + int idx = 0; // currently initialized iterator state index + for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) { + const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2; + if (dst.dims[dst_dim] == 1) continue; + + it[idx].size = dst.dims[dst_dim]; + it[idx].input_stride = src.strides[dim_map[dst_dim]]; + it[idx].output_stride = dst.strides[dst_dim]; + + it[idx].input_span = it[idx].input_stride * (it[idx].size - 1); + it[idx].output_span = it[idx].output_stride * (it[idx].size - 1); + + idx++; + } + + // Iterate copying data from src to dst. + const IndexType block_total_size = NumDims == 0 ? 
1 : dst.dims.TotalSize(); + +#define COPY_INNER_DIM(KIND) \ + IndexType num_copied = 0; \ + for (num_copied = 0; num_copied < block_total_size; \ + num_copied += dst_inner_dim_size) { \ + LinCopy::template Run( \ + typename LinCopy::Dst(output_offset, output_stride, dst.data), \ + typename LinCopy::Src(input_offset, input_stride, src.data), \ + dst_inner_dim_size); \ + \ + for (int j = 0; j < idx; ++j) { \ + if (++it[j].count < it[j].size) { \ + input_offset += it[j].input_stride; \ + output_offset += it[j].output_stride; \ + break; \ + } \ + it[j].count = 0; \ + input_offset -= it[j].input_span; \ + output_offset -= it[j].output_span; \ + } \ + } \ + return num_copied; + + if (input_stride == 1 && output_stride == 1) { + COPY_INNER_DIM(LinCopy::Kind::Linear); + } else if (input_stride == 1 && output_stride != 1) { + COPY_INNER_DIM(LinCopy::Kind::Scatter); + } else if (input_stride == 0 && output_stride == 1) { + COPY_INNER_DIM(LinCopy::Kind::FillLinear); + } else if (input_stride == 0 && output_stride != 1) { + COPY_INNER_DIM(LinCopy::Kind::FillScatter); + } else if (output_stride == 1) { + COPY_INNER_DIM(LinCopy::Kind::Gather); + } else { + COPY_INNER_DIM(LinCopy::Kind::Random); + } + +#undef COPY_INNER_DIM + } + + // Copy from `src` to `dst` with an identity src->dst dimension map. Returns + // the number of copied elements. 
+ static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst, + const Src& src) { + DimensionsMap dst_to_src_map; + for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i; + return Copy(dst, src, dst_to_src_map); + } + + private: + struct BlockIteratorState { + BlockIteratorState() + : size(0), + count(0), + input_stride(0), + output_stride(0), + input_span(0), + output_span(0) {} + + IndexType size; + IndexType count; + IndexType input_stride; + IndexType output_stride; + IndexType input_span; + IndexType output_span; + }; + + // Compute how many inner dimensions it's allowed to squeeze when doing IO + // between two tensor blocks. It's safe to squeeze inner dimensions, only + // if they are not reordered. + static int NumSqueezableInnerDims(const DimensionsMap& dim_map) { + int num_squeezable_dims = 0; + for (int i = 0; i < NumDims; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + if (dim_map[dim] != dim) break; + num_squeezable_dims++; + } + return num_squeezable_dims; + } +}; + +// -------------------------------------------------------------------------- // +// TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to +// a Tensor block defined by `desc`, backed by a memory buffer at `target`. +// +// Currently there is no way to write from a Tensor expression to a block of +// memory, if dimensions are reordered. If you need to do that, you should +// materialize a Tensor block expression into a memory buffer, and then use +// TensorBlockIO to copy data between two memory buffers with a custom +// `target->src` dimension map (see definition above). +// +// Also currently the innermost dimension of `target` must have a stride '1' +// (contiguous in memory). This restriction could be lifted with a `pscatter`, +// but in practice it's never needed, and there is a similar TensorBlockIO +// workaround for that. 
+// +// TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO +// where `src` is a tensor expression. Explore if it is possible to rewrite IO +// to use expressions instead of pointers, and after that TensorBlockAssignment +// will become an alias to IO. +template +class TensorBlockAssignment { + // We will use coeff/packet path to evaluate block expressions. + typedef TensorEvaluator + TensorBlockEvaluator; + + typedef DSizes Dimensions; + + enum { + Vectorizable = packet_traits::Vectorizable, + PacketSize = packet_traits::size + }; + + template + struct InnerDimAssign { + EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count, + const Evaluator& eval, + IndexType eval_offset) { + for (IndexType i = 0; i < count; ++i) { + target[i] = eval.coeff(eval_offset + i); + } + } + }; + + template + struct InnerDimAssign { + EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count, + const Evaluator& eval, + IndexType eval_offset) { + typedef typename packet_traits::type Packet; + + const IndexType unrolled_size = count - 4 * PacketSize; + const IndexType vectorized_size = count - PacketSize; + IndexType i = 0; + + for (; i <= unrolled_size; i += 4 * PacketSize) { + for (int j = 0; j < 4; ++j) { + const IndexType idx = eval_offset + i + j * PacketSize; + Packet p = eval.template packet(idx); + pstoreu(target + i + j * PacketSize, p); + } + } + + for (; i <= vectorized_size; i += PacketSize) { + Packet p = eval.template packet(eval_offset + i); + pstoreu(target + i, p); + } + + for (; i < count; ++i) { + target[i] = eval.coeff(eval_offset + i); + } + } + }; + + public: + struct Target { + Target(const Dimensions& target_dims, const Dimensions& target_strides, + Scalar* target_data, IndexType target_offset = 0) + : dims(target_dims), + strides(target_strides), + data(target_data), + offset(target_offset) {} + + Dimensions dims; + Dimensions strides; + Scalar* data; + IndexType offset; + }; + + static Target target(const Dimensions& 
target_dims, + const Dimensions& target_strides, Scalar* target_data, + IndexType target_offset = 0) { + return Target(target_dims, target_strides, target_data, target_offset); + } + + template + static Target target( + const DSizes& target_dims, + const DSizes& target_strides, + Scalar* target_data, IndexType target_offset = 0) { + // DSizes constructor will do index type promotion if it's safe. + return Target(Dimensions(target_dims), Dimensions(target_strides), + target_data, target_offset); + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const Target& target, const TensorBlockExpr& expr) { + // Prepare evaluator for block expression. + DefaultDevice default_device; + TensorBlockEvaluator eval(expr, default_device); + + // Tensor block expression dimension should match destination dimensions. + eigen_assert(dimensions_match(target.dims, eval.dimensions())); + + static const int Layout = TensorBlockEvaluator::Layout; + static const bool is_col_major = Layout == ColMajor; + + // Initialize output inner dimension size based on a layout. + const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize(); + const int inner_dim_idx = is_col_major ? 0 : NumDims - 1; + IndexType output_inner_dim_size = target.dims[inner_dim_idx]; + + // Target inner dimension stride must be '1'. + eigen_assert(target.strides[inner_dim_idx] == 1); + + // Squeeze multiple inner dims into one if they are contiguous in `target`. + IndexType num_squeezed_dims = 0; + for (Index i = 1; i < NumDims; ++i) { + const Index dim = is_col_major ? i : NumDims - i - 1; + const IndexType target_stride = target.strides[dim]; + + if (output_inner_dim_size == target_stride) { + output_inner_dim_size *= target.dims[dim]; + num_squeezed_dims++; + } else { + break; + } + } + + // Initialize output block iterator state. Dimension in this array are + // always in inner_most -> outer_most order (col major layout). 
+ array it; + + int idx = 0; // currently initialized iterator state index + for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) { + const Index dim = is_col_major ? i + 1 : NumDims - i - 2; + + it[idx].count = 0; + it[idx].size = target.dims[dim]; + it[idx].output_stride = target.strides[dim]; + it[idx].output_span = it[idx].output_stride * (it[idx].size - 1); + idx++; + } + + // We read block expression from the beginning, and start writing data to + // `target` at given offset. + IndexType input_offset = 0; + IndexType output_offset = target.offset; + + // Iterate copying data from `eval` to `target`. + for (IndexType i = 0; i < output_size; i += output_inner_dim_size) { + // Assign to `target` at current offset. + InnerDimAssign::Run(target.data + output_offset, + output_inner_dim_size, eval, + input_offset); + + // Move input offset forward by the number of assigned coefficients. + input_offset += output_inner_dim_size; + + // Update index. + for (int j = 0; j < idx; ++j) { + if (++it[j].count < it[j].size) { + output_offset += it[j].output_stride; + break; + } + it[j].count = 0; + output_offset -= it[j].output_span; + } + } + } + + private: + struct BlockIteratorState { + BlockIteratorState() + : count(0), size(0), output_stride(0), output_span(0) {} + + IndexType count; + IndexType size; + IndexType output_stride; + IndexType output_span; + }; +}; + +// -------------------------------------------------------------------------- // + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorBroadcasting.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorBroadcasting.h new file mode 100644 index 0000000..a354132 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorBroadcasting.h @@ -0,0 +1,1093 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H +#define EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H + +namespace Eigen { + +/** \class TensorBroadcasting + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor broadcasting class. + * + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorBroadcastingOp EIGEN_DEVICE_REF type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorBroadcastingOp type; +}; + +template +struct is_input_scalar { + static const bool value = false; +}; +template <> +struct is_input_scalar > { + static const bool value = true; +}; +#ifndef EIGEN_EMULATE_CXX11_META_H +template +struct is_input_scalar > { + static const bool value = (Sizes::total_size == 1); +}; +#endif + +} // end namespace internal + + + +template +class TensorBroadcastingOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE TensorBroadcastingOp(const XprType& expr, const Broadcast& broadcast) + : m_xpr(expr), m_broadcast(broadcast) {} + + EIGEN_DEVICE_FUNC + const Broadcast& broadcast() const { return m_broadcast; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const Broadcast m_broadcast; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorBroadcastingOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename TensorEvaluator::Dimensions InputDimensions; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = PacketType::size; + protected: // all the non-static fields must have the same access control, otherwise the TensorEvaluator wont be standard layout; + bool isCopy, nByOne, oneByN; + public: + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + RawAccess = false + }; + + typedef typename internal::remove_const::type ScalarNoConst; + + // We do block based broadcasting using a trick with 2x tensor rank and 0 + // strides. See block method implementation for details. 
+ typedef DSizes BroadcastDimensions; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock + ArgTensorBlock; + + typedef typename internal::TensorMaterializedBlock + TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : isCopy(false), nByOne(false), oneByN(false), + m_device(device), m_broadcast(op.broadcast()), m_impl(op.expression(), device) + { + + // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar + // and store the result in a scalar. Instead one should reshape the scalar into a a N-D + // tensor with N >= 1 of 1 element first and then broadcast. + EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + const InputDimensions& input_dims = m_impl.dimensions(); + isCopy = true; + for (int i = 0; i < NumDims; ++i) { + eigen_assert(input_dims[i] > 0); + m_dimensions[i] = input_dims[i] * m_broadcast[i]; + if (m_broadcast[i] != 1) { + isCopy = false; + } + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputStrides[0] = 1; + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } + } else { + m_inputStrides[NumDims-1] = 1; + m_outputStrides[NumDims-1] = 1; + for (int i = NumDims-2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; + } + } + + if (input_dims[0] == 1) { + oneByN = true; + for (int i = 1; i < NumDims; ++i) { + if (m_broadcast[i] != 1) { + oneByN = false; + break; + } + } + } else if (input_dims[NumDims-1] == 1) { + nByOne = true; + for 
(int i = 0; i < NumDims-1; ++i) { + if (m_broadcast[i] != 1) { + nByOne = false; + break; + } + } + } + + // Handle special format like NCHW, its input shape is '[1, N..., 1]' and + // broadcast shape is '[N, 1..., N]' + if (!oneByN && !nByOne) { + if (input_dims[0] == 1 && input_dims[NumDims-1] == 1 && NumDims > 2) { + nByOne = true; + oneByN = true; + for (int i = 1; i < NumDims-1; ++i) { + if (m_broadcast[i] != 1) { + nByOne = false; + oneByN = false; + break; + } + } + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const + { + if (internal::is_input_scalar::type>::value) { + return m_impl.coeff(0); + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + if (isCopy) { + return m_impl.coeff(index); + } else { + return coeffColMajor(index); + } + } else { + if (isCopy) { + return m_impl.coeff(index); + } else { + return coeffRowMajor(index); + } + } + } + + // TODO: attempt to speed this up. 
The integer divisions and modulo are slow + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexColMajor(Index index) const { + Index inputIndex = 0; + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; + } else { + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } + } + index -= idx * m_outputStrides[i]; + } + if (internal::index_statically_eq(0, 1)) { + eigen_assert(index < m_impl.dimensions()[0]); + inputIndex += index; + } else { + if (internal::index_statically_eq(0, 1)) { + eigen_assert(index % m_impl.dimensions()[0] == 0); + } else { + inputIndex += (index % m_impl.dimensions()[0]); + } + } + return inputIndex; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const + { + return m_impl.coeff(indexColMajor(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexRowMajor(Index index) const { + Index inputIndex = 0; + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; + } else { + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } + } + index -= idx * m_outputStrides[i]; + } + if (internal::index_statically_eq(NumDims - 1, 1)) { + eigen_assert(index < m_impl.dimensions()[NumDims - 1]); + inputIndex += index; + } else { + if (internal::index_statically_eq(NumDims - 1, 1)) { + eigen_assert(index % m_impl.dimensions()[NumDims - 1] == 0); + } else { + inputIndex += (index % m_impl.dimensions()[NumDims 
- 1]); + } + } + return inputIndex; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const + { + return m_impl.coeff(indexRowMajor(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const + { + if (internal::is_input_scalar::type>::value) { + return internal::pset1(m_impl.coeff(0)); + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + if (isCopy) { + #ifdef EIGEN_GPU_COMPILE_PHASE + // See PR 437: on NVIDIA P100 and K20m we observed a x3-4 speed up by enforcing + // unaligned loads here. The reason is unclear though. + return m_impl.template packet(index); + #else + return m_impl.template packet(index); + #endif + } else if (oneByN && !nByOne) { + return packetNByOne(index); + } else if (!oneByN && nByOne) { + return packetOneByN(index); + } else if (oneByN && nByOne) { + return packetOneByNByOne(index); + } else { + return packetColMajor(index); + } + } else { + if (isCopy) { + #ifdef EIGEN_GPU_COMPILE_PHASE + // See above. 
+ return m_impl.template packet(index); + #else + return m_impl.template packet(index); + #endif + } else if (oneByN && !nByOne) { + return packetOneByN(index); + } else if (!oneByN && nByOne) { + return packetNByOne(index); + } else if (oneByN && nByOne) { + return packetOneByNByOne(index); + } else { + return packetRowMajor(index); + } + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByNByOne + (Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + Index startDim, endDim; + Index inputIndex, outputOffset, batchedIndex; + + if (static_cast(Layout) == static_cast(ColMajor)) { + startDim = NumDims - 1; + endDim = 1; + } else { + startDim = 0; + endDim = NumDims - 2; + } + + batchedIndex = index % m_outputStrides[startDim]; + inputIndex = batchedIndex / m_outputStrides[endDim]; + outputOffset = batchedIndex % m_outputStrides[endDim]; + + if (outputOffset + PacketSize <= m_outputStrides[endDim]) { + values[0] = m_impl.coeff(inputIndex); + return internal::pload1(values); + } else { + EIGEN_UNROLL_LOOP + for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) { + if (outputOffset + cur < m_outputStrides[endDim]) { + values[i] = m_impl.coeff(inputIndex); + } else { + ++inputIndex; + inputIndex = (inputIndex == m_inputStrides[startDim] ? 
0 : inputIndex); + values[i] = m_impl.coeff(inputIndex); + outputOffset = 0; + cur = 0; + } + } + return internal::pload(values); + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByN(Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + + Index dim, inputIndex; + + if (static_cast(Layout) == static_cast(ColMajor)) { + dim = NumDims - 1; + } else { + dim = 0; + } + + inputIndex = index % m_inputStrides[dim]; + if (inputIndex + PacketSize <= m_inputStrides[dim]) { + return m_impl.template packet(inputIndex); + } else { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + if (inputIndex > m_inputStrides[dim]-1) { + inputIndex = 0; + } + values[i] = m_impl.coeff(inputIndex++); + } + return internal::pload(values); + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetNByOne(Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + Index dim, inputIndex, outputOffset; + + if (static_cast(Layout) == static_cast(ColMajor)) { + dim = 1; + } else { + dim = NumDims - 2; + } + + inputIndex = index / m_outputStrides[dim]; + outputOffset = index % m_outputStrides[dim]; + if (outputOffset + PacketSize <= m_outputStrides[dim]) { + values[0] = m_impl.coeff(inputIndex); + return internal::pload1(values); + } else { + EIGEN_UNROLL_LOOP + for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) { + if (outputOffset + cur < m_outputStrides[dim]) { + values[i] = m_impl.coeff(inputIndex); + } else { + values[i] = m_impl.coeff(++inputIndex); + outputOffset = 0; + cur = 0; + } + } + return internal::pload(values); + } + } + + // Ignore the LoadMode and always 
use unaligned loads since we can't guarantee + // the alignment at compile time. + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + + const Index originalIndex = index; + + Index inputIndex = 0; + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; + } else { + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } + } + index -= idx * m_outputStrides[i]; + } + Index innermostLoc; + if (internal::index_statically_eq(0, 1)) { + eigen_assert(index < m_impl.dimensions()[0]); + innermostLoc = index; + } else { + if (internal::index_statically_eq(0, 1)) { + eigen_assert(index % m_impl.dimensions()[0] == 0); + innermostLoc = 0; + } else { + innermostLoc = index % m_impl.dimensions()[0]; + } + } + inputIndex += innermostLoc; + + // Todo: this could be extended to the second dimension if we're not + // broadcasting alongside the first dimension, and so on. 
+ if (innermostLoc + PacketSize <= m_impl.dimensions()[0]) { + return m_impl.template packet(inputIndex); + } else { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + values[0] = m_impl.coeff(inputIndex); + EIGEN_UNROLL_LOOP + for (int i = 1; i < PacketSize; ++i) { + if (innermostLoc + i < m_impl.dimensions()[0]) { + values[i] = m_impl.coeff(inputIndex+i); + } else { + values[i] = coeffColMajor(originalIndex+i); + } + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + + const Index originalIndex = index; + + Index inputIndex = 0; + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx < m_impl.dimensions()[i]); + inputIndex += idx * m_inputStrides[i]; + } else { + if (internal::index_statically_eq(i, 1)) { + eigen_assert(idx % m_impl.dimensions()[i] == 0); + } else { + inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i]; + } + } + index -= idx * m_outputStrides[i]; + } + Index innermostLoc; + if (internal::index_statically_eq(NumDims-1, 1)) { + eigen_assert(index < m_impl.dimensions()[NumDims-1]); + innermostLoc = index; + } else { + if (internal::index_statically_eq(NumDims-1, 1)) { + eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0); + innermostLoc = 0; + } else { + innermostLoc = index % m_impl.dimensions()[NumDims-1]; + } + } + inputIndex += innermostLoc; + + // Todo: this could be extended to the second dimension if we're not + // broadcasting alongside the first dimension, and so on. 
+ if (innermostLoc + PacketSize <= m_impl.dimensions()[NumDims-1]) { + return m_impl.template packet(inputIndex); + } else { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + values[0] = m_impl.coeff(inputIndex); + EIGEN_UNROLL_LOOP + for (int i = 1; i < PacketSize; ++i) { + if (innermostLoc + i < m_impl.dimensions()[NumDims-1]) { + values[i] = m_impl.coeff(inputIndex+i); + } else { + values[i] = coeffRowMajor(originalIndex+i); + } + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + double compute_cost = TensorOpCost::AddCost(); + if (!isCopy && NumDims > 0) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + compute_cost += TensorOpCost::DivCost(); + if (internal::index_statically_eq(i, 1)) { + compute_cost += + TensorOpCost::MulCost() + TensorOpCost::AddCost(); + } else { + if (!internal::index_statically_eq(i, 1)) { + compute_cost += TensorOpCost::MulCost() + + TensorOpCost::ModCost() + + TensorOpCost::AddCost(); + } + } + compute_cost += + TensorOpCost::MulCost() + TensorOpCost::AddCost(); + } + } + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large + // tensors. But this might need further tuning. 
+ const size_t target_size = m_device.firstLevelCacheSize(); + return internal::TensorBlockResourceRequirements::merge( + m_impl.getResourceRequirements(), + internal::TensorBlockResourceRequirements::skewed(target_size)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + BlockBroadcastingParams params = blockBroadcastingParams(desc); + + if (params.inner_dim_size == 0 || params.bcast_dim_size == 0) { + return emptyBlock(); + } + + // Prepare storage for the materialized broadcasting result. + const typename TensorBlock::Storage block_storage = + TensorBlock::prepareStorage(desc, scratch); + ScalarNoConst* materialized_output = block_storage.data(); + + // We potentially will need to materialize input blocks. + size_t materialized_input_size = 0; + ScalarNoConst* materialized_input = NULL; + + // Initialize block broadcating iterator state for outer dimensions (outer + // with regard to bcast dimension). Dimension in this array are always in + // inner_most -> outer_most order (col major layout). + array it; + int idx = 0; + + for (int i = params.inner_dim_count + 1; i < NumDims; ++i) { + const Index dim = IsColMajor ? i : NumDims - 1 - i; + it[idx].size = params.output_dims[dim]; + it[idx].count = 0; + it[idx].output_stride = m_outputStrides[dim]; + it[idx].output_span = it[idx].output_stride * (it[idx].size - 1); + idx++; + } + + // Write output into the beginning of `materialized_output`. + Index output_offset = 0; + + // We will fill output block by broadcasting along the bcast dim, and + // iterating over outer dimension. + const Index output_size = NumDims == 0 ? 1 : params.output_dims.TotalSize(); + + for (Index num_output_coeffs = 0; num_output_coeffs < output_size;) { + ScalarNoConst* bcast_output = materialized_output + num_output_coeffs; + Index bcast_offset = desc.offset() + output_offset; + + // Broadcast along the bcast dimension. 
+ num_output_coeffs += BroadcastBlockAlongBcastDim( + params, bcast_offset, scratch, bcast_output, &materialized_input, + &materialized_input_size); + + // Switch to the next outer dimension. + for (int j = 0; j < idx; ++j) { + if (++it[j].count < it[j].size) { + output_offset += it[j].output_stride; + break; + } + it[j].count = 0; + output_offset -= it[j].output_span; + } + } + + return block_storage.AsTensorMaterializedBlock(); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + const TensorEvaluator& impl() const { return m_impl; } + + Broadcast functor() const { return m_broadcast; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind( + cl::sycl::handler& cgh) const { + m_impl.bind(cgh); + } +#endif + private: + static const bool IsColMajor = + static_cast(Layout) == static_cast(ColMajor); + + // We will build a general case block broadcasting on top of broadcasting + // primitive that will do broadcasting only for the inner dimension(s) along + // the first dimension smaller than the input size (it's called `bcast_dim`). + // + // Example: + // dim: 0 1 2 (ColMajor) + // input size: [9, 3, 6] + // block size: [9, 2, 6] + // + // We will compute broadcasted block by iterating over the outer dimensions + // before `bcast_dim` (only dimension `2` in this example) and computing + // broadcasts along the `bcast_dim` (dimension `1` in this example). + + // BlockBroadcastingParams holds precomputed parameters for broadcasting a + // single block along the broadcasting dimension. Sizes and strides along the + // `bcast_dim` might be invalid, they will be adjusted later in + // `BroadcastBlockAlongBcastDim`. 
+ struct BlockBroadcastingParams { + Dimensions input_dims; // input expression dimensions + Dimensions output_dims; // output block sizes + Dimensions output_strides; // output block strides + + int inner_dim_count; // count inner dimensions matching in size + int bcast_dim; // broadcasting dimension index + Index bcast_dim_size; // broadcasting dimension size + Index inner_dim_size; // inner dimensions size + + // Block sizes and strides for the input block where all dimensions before + // `bcast_dim` are equal to `1`. + Dimensions input_block_sizes; + Dimensions input_block_strides; + + // Block sizes and strides for blocks with extra dimensions and strides `0`. + BroadcastDimensions bcast_block_sizes; + BroadcastDimensions bcast_block_strides; + BroadcastDimensions bcast_input_strides; + }; + + struct BlockBroadcastingIteratorState { + Index size; + Index count; + Index output_stride; + Index output_span; + }; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlockBroadcastingParams + blockBroadcastingParams(TensorBlockDesc& desc) const { + BlockBroadcastingParams params; + + params.input_dims = Dimensions(m_impl.dimensions()); + + // Output block sizes and strides. + params.output_dims = desc.dimensions(); + params.output_strides = internal::strides(params.output_dims); + + // Find the broadcasting dimension (first dimension with output size smaller + // than the input size). + params.bcast_dim = 0; + params.bcast_dim_size = 1; + params.inner_dim_size = 1; + + // Count the number of inner dimensions that have the same size in the block + // and in the broadcast expression. + params.inner_dim_count = 0; + + for (int i = 0; i < NumDims; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + + if (params.output_dims[dim] == m_dimensions[dim]) { + params.inner_dim_size *= params.output_dims[dim]; + ++params.inner_dim_count; + continue; + } + + // First non-matching dimension is the broadcasting dimension. 
+ eigen_assert(params.output_dims[dim] < m_dimensions[dim]); + params.bcast_dim = dim; + params.bcast_dim_size = params.output_dims[dim]; + break; + } + + // Calculate the input block size for looking into the input. + for (int i = 0; i < params.inner_dim_count; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + params.input_block_sizes[dim] = params.input_dims[dim]; + } + for (int i = params.inner_dim_count; i < NumDims; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + params.input_block_sizes[dim] = 1; + } + params.input_block_strides = + internal::strides(params.input_block_sizes); + + // Broadcast with the 0-stride trick: Create 1 extra dim for each + // broadcast, set the input stride to 0. + // + // When ColMajor: + // + // - bcast_block_sizes: + // [d_0, b_0, d_1, b_1, ...] + // + // - bcast_block_strides: + // [output_block_strides[0], output_block_strides[0] * d_0, + // output_block_strides[1], output_block_strides[1] * d_1, + // ...] + // + // - bcast_input_strides: + // [input_block_strides[0], 0, + // input_block_strides[1], 0, + // ...]. + // + for (int i = 0; i < params.inner_dim_count; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + + const int copy_dim = IsColMajor ? 2 * i : 2 * NumDims - 2 * i - 1; + const int broadcast_dim = IsColMajor ? copy_dim + 1 : copy_dim - 1; + + params.bcast_block_sizes[copy_dim] = params.input_dims[dim]; + params.bcast_block_sizes[broadcast_dim] = m_broadcast[dim]; + params.bcast_block_strides[copy_dim] = params.output_strides[dim]; + params.bcast_block_strides[broadcast_dim] = + params.output_strides[dim] * params.input_dims[dim]; + params.bcast_input_strides[copy_dim] = params.input_block_strides[dim]; + params.bcast_input_strides[broadcast_dim] = 0; + } + + for (int i = 2 * params.inner_dim_count; i < 2 * NumDims; ++i) { + const int dim = IsColMajor ? 
i : 2 * NumDims - i - 1; + params.bcast_block_sizes[dim] = 1; + params.bcast_block_strides[dim] = 0; + params.bcast_input_strides[dim] = 0; + } + + return params; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock emptyBlock() const { + DSizes dimensions; + for (int i = 0; i < NumDims; ++i) dimensions[i] = 0; + return TensorBlock(internal::TensorBlockKind::kView, NULL, dimensions); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlockAlongBcastDim( + BlockBroadcastingParams params, Index bcast_offset, + TensorBlockScratch& scratch, ScalarNoConst* materialized_output, + ScalarNoConst** materialized_input, + size_t* materialized_input_size) const { + if (params.bcast_dim_size == 1) { + // We just need one block read using the ready-set values above. + return BroadcastBlock( + params.input_block_sizes, params.input_block_strides, + params.bcast_block_sizes, params.bcast_block_strides, + params.bcast_input_strides, bcast_offset, 0, scratch, + materialized_output, materialized_input, materialized_input_size); + + } else if (params.input_dims[params.bcast_dim] == 1) { + // Broadcast bcast dimension (< NumDims) by bcast_dim_size. + const int broadcast_bcast_dim = + IsColMajor ? 2 * params.inner_dim_count + 1 + : 2 * NumDims - 2 * params.inner_dim_count - 2; + + params.bcast_block_sizes[broadcast_bcast_dim] = params.bcast_dim_size; + params.bcast_input_strides[broadcast_bcast_dim] = 0; + params.bcast_block_strides[broadcast_bcast_dim] = + params.output_strides[params.bcast_dim]; + + return BroadcastBlock( + params.input_block_sizes, params.input_block_strides, + params.bcast_block_sizes, params.bcast_block_strides, + params.bcast_input_strides, bcast_offset, 0, scratch, + materialized_output, materialized_input, materialized_input_size); + + } else { + // Keep track of the total number of the coefficients written to the + // output block. + Index num_output_coeffs = 0; + + // The general case. 
Let's denote the output block as + // + // x[..., a:a+bcast_dim_size, :, ..., :] + // + // where a:a+bcast_dim_size is a slice on the bcast_dim dimension + // (< NumDims). We need to split the a:a+bcast_dim_size into possibly 3 + // sub-blocks: + // + // (1) a:b, where b is the smallest multiple of + // input_dims[bcast_dim] in [a, a+bcast_dim_size]. + // + // (2) b:c, where c is the largest multiple of input_dims[bcast_dim] + // in [a, a+bcast_dim_size]. + // + // (3) c:a+bcast_dim_size . + // + // Or, when b and c do not exist, we just need to process the whole block + // together. + + // Find a. + const Index bcast_dim_left_index = + bcast_offset / m_outputStrides[params.bcast_dim]; + + // Find b and c. + const Index input_bcast_dim_size = params.input_dims[params.bcast_dim]; + + // First multiple after a. This is b when <= bcast_dim_left_index + + // bcast_dim_size. + const Index first_multiple = + divup(bcast_dim_left_index, input_bcast_dim_size) * + input_bcast_dim_size; + + if (first_multiple <= bcast_dim_left_index + params.bcast_dim_size) { + // b exists, so does c. Find it. + const Index last_multiple = + (bcast_dim_left_index + params.bcast_dim_size) / + input_bcast_dim_size * input_bcast_dim_size; + const int copy_bcast_dim = + IsColMajor ? 2 * params.inner_dim_count + : 2 * NumDims - 2 * params.inner_dim_count - 1; + const int broadcast_bcast_dim = + IsColMajor ? 
2 * params.inner_dim_count + 1 + : 2 * NumDims - 2 * params.inner_dim_count - 2; + + if (first_multiple > bcast_dim_left_index) { + const Index head_size = first_multiple - bcast_dim_left_index; + params.input_block_sizes[params.bcast_dim] = head_size; + params.bcast_block_sizes[copy_bcast_dim] = head_size; + params.bcast_input_strides[copy_bcast_dim] = + params.input_block_strides[params.bcast_dim]; + params.bcast_block_strides[copy_bcast_dim] = + params.output_strides[params.bcast_dim]; + params.bcast_block_sizes[broadcast_bcast_dim] = 1; + params.bcast_input_strides[broadcast_bcast_dim] = 0; + params.bcast_block_strides[broadcast_bcast_dim] = + params.output_strides[params.bcast_dim] * + params.input_dims[params.bcast_dim]; + + num_output_coeffs += BroadcastBlock( + params.input_block_sizes, params.input_block_strides, + params.bcast_block_sizes, params.bcast_block_strides, + params.bcast_input_strides, bcast_offset, 0, scratch, + materialized_output, materialized_input, materialized_input_size); + } + if (first_multiple < last_multiple) { + params.input_block_sizes[params.bcast_dim] = input_bcast_dim_size; + params.bcast_block_sizes[copy_bcast_dim] = input_bcast_dim_size; + params.bcast_input_strides[copy_bcast_dim] = + params.input_block_strides[params.bcast_dim]; + params.bcast_block_strides[copy_bcast_dim] = + params.output_strides[params.bcast_dim]; + params.bcast_block_sizes[broadcast_bcast_dim] = + (last_multiple - first_multiple) / input_bcast_dim_size; + params.bcast_input_strides[broadcast_bcast_dim] = 0; + params.bcast_block_strides[broadcast_bcast_dim] = + params.output_strides[params.bcast_dim] * + params.input_dims[params.bcast_dim]; + const Index offset = (first_multiple - bcast_dim_left_index) * + m_outputStrides[params.bcast_dim]; + + num_output_coeffs += BroadcastBlock( + params.input_block_sizes, params.input_block_strides, + params.bcast_block_sizes, params.bcast_block_strides, + params.bcast_input_strides, bcast_offset, offset, scratch, + 
materialized_output, materialized_input, materialized_input_size); + } + if (last_multiple < bcast_dim_left_index + params.bcast_dim_size) { + const Index tail_size = + bcast_dim_left_index + params.bcast_dim_size - last_multiple; + params.input_block_sizes[params.bcast_dim] = tail_size; + params.bcast_block_sizes[copy_bcast_dim] = tail_size; + params.bcast_input_strides[copy_bcast_dim] = + params.input_block_strides[params.bcast_dim]; + params.bcast_block_strides[copy_bcast_dim] = + params.output_strides[params.bcast_dim]; + params.bcast_block_sizes[broadcast_bcast_dim] = 1; + params.bcast_input_strides[broadcast_bcast_dim] = 0; + params.bcast_block_strides[broadcast_bcast_dim] = + params.output_strides[params.bcast_dim] * + params.input_dims[params.bcast_dim]; + const Index offset = (last_multiple - bcast_dim_left_index) * + m_outputStrides[params.bcast_dim]; + + num_output_coeffs += BroadcastBlock( + params.input_block_sizes, params.input_block_strides, + params.bcast_block_sizes, params.bcast_block_strides, + params.bcast_input_strides, bcast_offset, offset, scratch, + materialized_output, materialized_input, materialized_input_size); + } + } else { + // b and c do not exist. + const int copy_bcast_dim = + IsColMajor ? 
2 * params.inner_dim_count + : 2 * NumDims - 2 * params.inner_dim_count - 1; + params.input_block_sizes[params.bcast_dim] = params.bcast_dim_size; + params.bcast_block_sizes[copy_bcast_dim] = params.bcast_dim_size; + params.bcast_input_strides[copy_bcast_dim] = + params.input_block_strides[params.bcast_dim]; + params.bcast_block_strides[copy_bcast_dim] = + params.output_strides[params.bcast_dim]; + + num_output_coeffs += BroadcastBlock( + params.input_block_sizes, params.input_block_strides, + params.bcast_block_sizes, params.bcast_block_strides, + params.bcast_input_strides, bcast_offset, 0, scratch, + materialized_output, materialized_input, materialized_input_size); + } + + return num_output_coeffs; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlock( + const Dimensions& input_block_sizes, + const Dimensions& input_block_strides, + const BroadcastDimensions& bcast_block_sizes, + const BroadcastDimensions& bcast_block_strides, + const BroadcastDimensions& bcast_input_strides, Index bcast_offset, + Index offset, TensorBlockScratch& scratch, + ScalarNoConst* materialized_output, ScalarNoConst** materialized_input, + size_t* materialized_input_size) const { + // ---------------------------------------------------------------------- // + // Tensor block descriptor for reading block from the input. + const Index input_offset = bcast_offset + offset; + TensorBlockDesc input_desc( + IsColMajor ? indexColMajor(input_offset) : indexRowMajor(input_offset), + input_block_sizes); + + ArgTensorBlock input_block = m_impl.block(input_desc, scratch); + + // ---------------------------------------------------------------------- // + // Materialize input block into a temporary memory buffer only if it's not + // already available in the arg block. + const ScalarNoConst* input_buffer = NULL; + + if (input_block.data() != NULL) { + // Input block already has raw data, there is no need to materialize it. 
+ input_buffer = input_block.data(); + + } else { + // Otherwise we have to do block assignment into a temporary buffer. + + // Maybe reuse previously allocated buffer, or allocate a new one with a + // scratch allocator. + const size_t input_total_size = input_block_sizes.TotalSize(); + if (*materialized_input == NULL || + *materialized_input_size < input_total_size) { + *materialized_input_size = input_total_size; + void* mem = scratch.allocate(*materialized_input_size * sizeof(Scalar)); + *materialized_input = static_cast(mem); + } + + typedef internal::TensorBlockAssignment< + ScalarNoConst, NumDims, typename ArgTensorBlock::XprType, Index> + TensorBlockAssignment; + + TensorBlockAssignment::Run( + TensorBlockAssignment::target(input_block_sizes, input_block_strides, + *materialized_input), + input_block.expr()); + + input_buffer = *materialized_input; + } + + // ---------------------------------------------------------------------- // + // Copy data from materialized input block to the materialized output, using + // given broadcast strides (strides with zeroes). 
+ typedef internal::TensorBlockIO + TensorBlockIO; + + typename TensorBlockIO::Src src(bcast_input_strides, input_buffer); + typename TensorBlockIO::Dst dst(bcast_block_sizes, bcast_block_strides, + materialized_output + offset); + + return TensorBlockIO::Copy(dst, src); + } + +protected: + const Device EIGEN_DEVICE_REF m_device; + const typename internal::remove_reference::type m_broadcast; + Dimensions m_dimensions; + array m_outputStrides; + array m_inputStrides; + TensorEvaluator m_impl; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorChipping.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorChipping.h new file mode 100644 index 0000000..3764573 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorChipping.h @@ -0,0 +1,518 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H +#define EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H + +namespace Eigen { + +/** \class TensorChippingOp + * \ingroup CXX11_Tensor_Module + * + * \brief A chip is a thin slice, corresponding to a column or a row in a 2-d tensor. 
+ * + * + */ + +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions - 1; + static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorChippingOp EIGEN_DEVICE_REF type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorChippingOp type; +}; + +template +struct DimensionId +{ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) { + EIGEN_UNUSED_VARIABLE(dim); + eigen_assert(dim == DimId); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const { + return DimId; + } +}; +template <> +struct DimensionId +{ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) : actual_dim(dim) { + eigen_assert(dim >= 0); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const { + return actual_dim; + } + private: + const DenseIndex actual_dim; +}; + + +} // end namespace internal + + + +template +class TensorChippingOp : public TensorBase > +{ + public: + typedef TensorBase > Base; + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset, const Index dim) + : m_xpr(expr), m_offset(offset), m_dim(dim) { + } + + EIGEN_DEVICE_FUNC + const Index offset() const { return 
m_offset; } + EIGEN_DEVICE_FUNC + const Index dim() const { return m_dim.actualDim(); } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorChippingOp) + + protected: + typename XprType::Nested m_xpr; + const Index m_offset; + const internal::DimensionId m_dim; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorChippingOp XprType; + static const int NumInputDims = internal::array_size::Dimensions>::value; + static const int NumDims = NumInputDims-1; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + enum { + // Alignment can't be guaranteed at compile time since it depends on the + // slice offsets. + IsAligned = false, + Layout = TensorEvaluator::Layout, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + // Chipping of outer-most dimension is a trivial operation, because we can + // read and write directly from the underlying tensor using single offset. + IsOuterChipping = (static_cast(Layout) == ColMajor && DimId == NumInputDims - 1) || + (static_cast(Layout) == RowMajor && DimId == 0), + // Chipping inner-most dimension. + IsInnerChipping = (static_cast(Layout) == ColMajor && DimId == 0) || + (static_cast(Layout) == RowMajor && DimId == NumInputDims - 1), + // Prefer block access if the underlying expression prefers it, otherwise + // only if chipping is not trivial. 
+ PreferBlockAccess = TensorEvaluator::PreferBlockAccess || + !IsOuterChipping, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + typedef typename internal::remove_const::type ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef internal::TensorBlockDescriptor + ArgTensorBlockDesc; + typedef typename TensorEvaluator::TensorBlock + ArgTensorBlock; + + typedef typename internal::TensorMaterializedBlock + TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device) + { + EIGEN_STATIC_ASSERT((NumInputDims >= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(NumInputDims > m_dim.actualDim()); + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + eigen_assert(op.offset() < input_dims[m_dim.actualDim()]); + + int j = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (i != m_dim.actualDim()) { + m_dimensions[j] = input_dims[i]; + ++j; + } + } + + m_stride = 1; + m_inputStride = 1; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < m_dim.actualDim(); ++i) { + m_stride *= input_dims[i]; + m_inputStride *= input_dims[i]; + } + } else { + for (int i = NumInputDims-1; i > m_dim.actualDim(); --i) { + m_stride *= input_dims[i]; + m_inputStride *= input_dims[i]; + } + } + m_inputStride *= input_dims[m_dim.actualDim()]; + m_inputOffset = m_stride * op.offset(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_STRONG_INLINE void cleanup() { + 
m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(srcCoeff(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + + if (isInnerChipping()) { + // m_stride is equal to 1, so let's avoid the integer division. + eigen_assert(m_stride == 1); + Index inputIndex = index * m_inputStride + m_inputOffset; + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = m_impl.coeff(inputIndex); + inputIndex += m_inputStride; + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } else if (isOuterChipping()) { + // m_stride is always greater than index, so let's avoid the integer division. + eigen_assert(m_stride > index); + return m_impl.template packet(index + m_inputOffset); + } else { + const Index idx = index / m_stride; + const Index rem = index - idx * m_stride; + if (rem + PacketSize <= m_stride) { + Index inputIndex = idx * m_inputStride + m_inputOffset + rem; + return m_impl.template packet(inputIndex); + } else { + // Cross the stride boundary. Fallback to slow path. 
+ EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index); + ++index; + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + double cost = 0; + if ((static_cast(Layout) == static_cast(ColMajor) && + m_dim.actualDim() == 0) || + (static_cast(Layout) == static_cast(RowMajor) && + m_dim.actualDim() == NumInputDims - 1)) { + cost += TensorOpCost::MulCost() + TensorOpCost::AddCost(); + } else if ((static_cast(Layout) == static_cast(ColMajor) && + m_dim.actualDim() == NumInputDims - 1) || + (static_cast(Layout) == static_cast(RowMajor) && + m_dim.actualDim() == 0)) { + cost += TensorOpCost::AddCost(); + } else { + cost += 3 * TensorOpCost::MulCost() + TensorOpCost::DivCost() + + 3 * TensorOpCost::AddCost(); + } + + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + const size_t target_size = m_device.lastLevelCacheSize(); + return internal::TensorBlockResourceRequirements::merge( + internal::TensorBlockResourceRequirements::skewed(target_size), + m_impl.getResourceRequirements()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool root_of_expr_ast = false) const { + const Index chip_dim = m_dim.actualDim(); + + DSizes input_block_dims; + for (int i = 0; i < NumInputDims; ++i) { + input_block_dims[i] + = i < chip_dim ? desc.dimension(i) + : i > chip_dim ? desc.dimension(i - 1) + : 1; + } + + ArgTensorBlockDesc arg_desc(srcCoeff(desc.offset()), input_block_dims); + + // Try to reuse destination buffer for materializing argument block. 
+ if (desc.HasDestinationBuffer()) { + DSizes arg_destination_strides; + for (int i = 0; i < NumInputDims; ++i) { + arg_destination_strides[i] + = i < chip_dim ? desc.destination().strides()[i] + : i > chip_dim ? desc.destination().strides()[i - 1] + : 0; // for dimensions of size `1` stride should never be used. + } + + arg_desc.template AddDestinationBuffer( + desc.destination().template data(), + arg_destination_strides); + } + + ArgTensorBlock arg_block = m_impl.block(arg_desc, scratch, root_of_expr_ast); + if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer(); + + if (arg_block.data() != NULL) { + // Forward argument block buffer if possible. + return TensorBlock(arg_block.kind(), arg_block.data(), + desc.dimensions()); + + } else { + // Assign argument block expression to a buffer. + + // Prepare storage for the materialized chipping result. + const typename TensorBlock::Storage block_storage = + TensorBlock::prepareStorage(desc, scratch); + + typedef internal::TensorBlockAssignment< + ScalarNoConst, NumInputDims, typename ArgTensorBlock::XprType, Index> + TensorBlockAssignment; + + TensorBlockAssignment::Run( + TensorBlockAssignment::target( + arg_desc.dimensions(), + internal::strides(arg_desc.dimensions()), + block_storage.data()), + arg_block.expr()); + + return block_storage.AsTensorMaterializedBlock(); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const { + typename Storage::Type result = constCast(m_impl.data()); + if (isOuterChipping() && result) { + return result + m_inputOffset; + } else { + return NULL; + } + } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const + { + Index inputIndex; + if (isInnerChipping()) { + // m_stride is equal to 1, so let's avoid the 
integer division. + eigen_assert(m_stride == 1); + inputIndex = index * m_inputStride + m_inputOffset; + } else if (isOuterChipping()) { + // m_stride is always greater than index, so let's avoid the integer + // division. + eigen_assert(m_stride > index); + inputIndex = index + m_inputOffset; + } else { + const Index idx = index / m_stride; + inputIndex = idx * m_inputStride + m_inputOffset; + index -= idx * m_stride; + inputIndex += index; + } + return inputIndex; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isInnerChipping() const { + return IsInnerChipping || + (static_cast(Layout) == ColMajor && m_dim.actualDim() == 0) || + (static_cast(Layout) == RowMajor && m_dim.actualDim() == NumInputDims - 1); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isOuterChipping() const { + return IsOuterChipping || + (static_cast(Layout) == ColMajor && m_dim.actualDim() == NumInputDims-1) || + (static_cast(Layout) == RowMajor && m_dim.actualDim() == 0); + } + + Dimensions m_dimensions; + Index m_stride; + Index m_inputOffset; + Index m_inputStride; + TensorEvaluator m_impl; + const internal::DimensionId m_dim; + const Device EIGEN_DEVICE_REF m_device; +}; + + +// Eval as lvalue +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> +{ + typedef TensorEvaluator, Device> Base; + typedef TensorChippingOp XprType; + static const int NumInputDims = internal::array_size::Dimensions>::value; + static const int NumDims = NumInputDims-1; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = PacketType::size; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::RawAccess, + Layout = TensorEvaluator::Layout, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see 
TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + + if (this->isInnerChipping()) { + // m_stride is equal to 1, so let's avoid the integer division. + eigen_assert(this->m_stride == 1); + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + internal::pstore(values, x); + Index inputIndex = index * this->m_inputStride + this->m_inputOffset; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + this->m_impl.coeffRef(inputIndex) = values[i]; + inputIndex += this->m_inputStride; + } + } else if (this->isOuterChipping()) { + // m_stride is always greater than index, so let's avoid the integer division. + eigen_assert(this->m_stride > index); + this->m_impl.template writePacket(index + this->m_inputOffset, x); + } else { + const Index idx = index / this->m_stride; + const Index rem = index - idx * this->m_stride; + if (rem + PacketSize <= this->m_stride) { + const Index inputIndex = idx * this->m_inputStride + this->m_inputOffset + rem; + this->m_impl.template writePacket(inputIndex, x); + } else { + // Cross stride boundary. Fallback to slow path. 
+ EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + internal::pstore(values, x); + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + this->coeffRef(index) = values[i]; + ++index; + } + } + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const TensorBlockDesc& desc, const TensorBlock& block) { + assert(this->m_impl.data() != NULL); + + const Index chip_dim = this->m_dim.actualDim(); + + DSizes input_block_dims; + for (int i = 0; i < NumInputDims; ++i) { + input_block_dims[i] = i < chip_dim ? desc.dimension(i) + : i > chip_dim ? desc.dimension(i - 1) + : 1; + } + + typedef TensorReshapingOp, + const typename TensorBlock::XprType> + TensorBlockExpr; + + typedef internal::TensorBlockAssignment + TensorBlockAssign; + + TensorBlockAssign::Run( + TensorBlockAssign::target( + input_block_dims, + internal::strides(this->m_impl.dimensions()), + this->m_impl.data(), this->srcCoeff(desc.offset())), + block.expr().reshape(input_block_dims)); + } +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorConcatenation.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorConcatenation.h new file mode 100644 index 0000000..5235a8e --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorConcatenation.h @@ -0,0 +1,377 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H + +namespace Eigen { + +/** \class TensorConcatenationOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor concatenation class. 
+ * + * + */ +namespace internal { +template +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef typename promote_storage_type::ret Scalar; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef typename promote_index_type::Index, + typename traits::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference::type _LhsNested; + typedef typename remove_reference::type _RhsNested; + static const int NumDimensions = traits::NumDimensions; + static const int Layout = traits::Layout; + enum { Flags = 0 }; + typedef typename conditional::val, + typename traits::PointerType, typename traits::PointerType>::type PointerType; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorConcatenationOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorConcatenationOp type; +}; + +} // end namespace internal + + +template +class TensorConcatenationOp : public TensorBase, WriteAccessors> +{ + public: + typedef TensorBase, WriteAccessors> Base; + typedef typename internal::traits::Scalar Scalar; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + typedef typename internal::nested::type Nested; + typedef typename internal::promote_storage_type::ret CoeffReturnType; + typedef typename NumTraits::Real RealScalar; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConcatenationOp(const LhsXprType& lhs, const RhsXprType& rhs, Axis axis) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_axis(axis) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + lhsExpression() const { return m_lhs_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + rhsExpression() const { return m_rhs_xpr; } + + EIGEN_DEVICE_FUNC const Axis& axis() const { return 
m_axis; } + + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorConcatenationOp) + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const Axis m_axis; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorConcatenationOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::Dimensions>::value; + static const int RightNumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess && + TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess || + TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator::Layout, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis()) + { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout) || NumDims == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((NumDims == RightNumDims), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + + eigen_assert(0 <= m_axis && m_axis < NumDims); + const Dimensions& lhs_dims = m_leftImpl.dimensions(); + const Dimensions& rhs_dims = m_rightImpl.dimensions(); + { + int i = 0; + for (; i < m_axis; ++i) { + 
eigen_assert(lhs_dims[i] > 0); + eigen_assert(lhs_dims[i] == rhs_dims[i]); + m_dimensions[i] = lhs_dims[i]; + } + eigen_assert(lhs_dims[i] > 0); // Now i == m_axis. + eigen_assert(rhs_dims[i] > 0); + m_dimensions[i] = lhs_dims[i] + rhs_dims[i]; + for (++i; i < NumDims; ++i) { + eigen_assert(lhs_dims[i] > 0); + eigen_assert(lhs_dims[i] == rhs_dims[i]); + m_dimensions[i] = lhs_dims[i]; + } + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_leftStrides[0] = 1; + m_rightStrides[0] = 1; + m_outputStrides[0] = 1; + + for (int j = 1; j < NumDims; ++j) { + m_leftStrides[j] = m_leftStrides[j-1] * lhs_dims[j-1]; + m_rightStrides[j] = m_rightStrides[j-1] * rhs_dims[j-1]; + m_outputStrides[j] = m_outputStrides[j-1] * m_dimensions[j-1]; + } + } else { + m_leftStrides[NumDims - 1] = 1; + m_rightStrides[NumDims - 1] = 1; + m_outputStrides[NumDims - 1] = 1; + + for (int j = NumDims - 2; j >= 0; --j) { + m_leftStrides[j] = m_leftStrides[j+1] * lhs_dims[j+1]; + m_rightStrides[j] = m_rightStrides[j+1] * rhs_dims[j+1]; + m_outputStrides[j] = m_outputStrides[j+1] * m_dimensions[j+1]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + // TODO(phli): Add short-circuit memcpy evaluation if underlying data are linear? + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) + { + m_leftImpl.evalSubExprsIfNeeded(NULL); + m_rightImpl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_STRONG_INLINE void cleanup() + { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + } + + // TODO(phli): attempt to speed this up. The integer divisions and modulo are slow. + // See CL/76180724 comments for more ideas. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + // Collect dimension-wise indices (subs). 
+ array subs; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + subs[i] = index / m_outputStrides[i]; + index -= subs[i] * m_outputStrides[i]; + } + subs[0] = index; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + subs[i] = index / m_outputStrides[i]; + index -= subs[i] * m_outputStrides[i]; + } + subs[NumDims - 1] = index; + } + + const Dimensions& left_dims = m_leftImpl.dimensions(); + if (subs[m_axis] < left_dims[m_axis]) { + Index left_index; + if (static_cast(Layout) == static_cast(ColMajor)) { + left_index = subs[0]; + EIGEN_UNROLL_LOOP + for (int i = 1; i < NumDims; ++i) { + left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; + } + } else { + left_index = subs[NumDims - 1]; + EIGEN_UNROLL_LOOP + for (int i = NumDims - 2; i >= 0; --i) { + left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; + } + } + return m_leftImpl.coeff(left_index); + } else { + subs[m_axis] -= left_dims[m_axis]; + const Dimensions& right_dims = m_rightImpl.dimensions(); + Index right_index; + if (static_cast(Layout) == static_cast(ColMajor)) { + right_index = subs[0]; + EIGEN_UNROLL_LOOP + for (int i = 1; i < NumDims; ++i) { + right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; + } + } else { + right_index = subs[NumDims - 1]; + EIGEN_UNROLL_LOOP + for (int i = NumDims - 2; i >= 0; --i) { + right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; + } + } + return m_rightImpl.coeff(right_index); + } + } + + // TODO(phli): Add a real vectorization. 
+ template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int packetSize = PacketType::size; + EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + packetSize - 1 < dimensions().TotalSize()); + + EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + const double compute_cost = NumDims * (2 * TensorOpCost::AddCost() + + 2 * TensorOpCost::MulCost() + + TensorOpCost::DivCost() + + TensorOpCost::ModCost()); + const double lhs_size = m_leftImpl.dimensions().TotalSize(); + const double rhs_size = m_rightImpl.dimensions().TotalSize(); + return (lhs_size / (lhs_size + rhs_size)) * + m_leftImpl.costPerCoeff(vectorized) + + (rhs_size / (lhs_size + rhs_size)) * + m_rightImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, compute_cost); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + #ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_leftImpl.bind(cgh); + m_rightImpl.bind(cgh); + } + #endif + + protected: + Dimensions m_dimensions; + array m_outputStrides; + array m_leftStrides; + array m_rightStrides; + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; + const Axis m_axis; +}; + +// Eval as lvalue +template + struct TensorEvaluator, Device> + : public TensorEvaluator, Device> +{ + typedef TensorEvaluator, Device> Base; + typedef TensorConcatenationOp XprType; + typedef typename Base::Dimensions Dimensions; + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess && + TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = 
TensorEvaluator::PreferBlockAccess || + TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator::Layout, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(XprType& op, const Device& device) + : Base(op, device) + { + EIGEN_STATIC_ASSERT((static_cast(Layout) == static_cast(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE); + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + // Collect dimension-wise indices (subs). + array subs; + for (int i = Base::NumDims - 1; i > 0; --i) { + subs[i] = index / this->m_outputStrides[i]; + index -= subs[i] * this->m_outputStrides[i]; + } + subs[0] = index; + + const Dimensions& left_dims = this->m_leftImpl.dimensions(); + if (subs[this->m_axis] < left_dims[this->m_axis]) { + Index left_index = subs[0]; + for (int i = 1; i < Base::NumDims; ++i) { + left_index += (subs[i] % left_dims[i]) * this->m_leftStrides[i]; + } + return this->m_leftImpl.coeffRef(left_index); + } else { + subs[this->m_axis] -= left_dims[this->m_axis]; + const Dimensions& right_dims = this->m_rightImpl.dimensions(); + Index right_index = subs[0]; + for (int i = 1; i < Base::NumDims; ++i) { + right_index += (subs[i] % right_dims[i]) * this->m_rightStrides[i]; + } + return this->m_rightImpl.coeffRef(right_index); + } + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + const int packetSize = PacketType::size; + EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + packetSize - 1 < 
this->dimensions().TotalSize()); + + EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; + internal::pstore(values, x); + for (int i = 0; i < packetSize; ++i) { + coeffRef(index+i) = values[i]; + } + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorContraction.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorContraction.h new file mode 100644 index 0000000..8b35f79 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorContraction.h @@ -0,0 +1,1023 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H + +namespace Eigen { + +/** \class TensorContraction + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor contraction class. + * + * + */ +namespace internal { + +template +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef typename gebp_traits::type, + typename remove_const::type>::ResScalar Scalar; + + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef typename promote_index_type::Index, + typename traits::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference::type _LhsNested; + typedef typename remove_reference::type _RhsNested; + + // From NumDims below. 
+ static const int NumDimensions = traits::NumDimensions + traits::NumDimensions - 2 * array_size::value; + static const int Layout = traits::Layout; + typedef typename conditional::val, + typename traits::PointerType, + typename traits::PointerType>::type + PointerType; + + enum { + Flags = 0 + }; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorContractionOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorContractionOp type; +}; + +template +struct traits, Device_> > { + typedef Indices_ Indices; + typedef LeftArgType_ LeftArgType; + typedef RightArgType_ RightArgType; + typedef OutputKernelType_ OutputKernelType; + typedef Device_ Device; + + // From NumDims below. + static const int NumDimensions = traits::NumDimensions + traits::NumDimensions - 2 * array_size::value; +}; + +// Helper class to allocate and deallocate temporary memory for packed buffers. +template +struct TensorContractionBlockMemAllocator { + typedef void* BlockMemHandle; + + template + EIGEN_DEVICE_FUNC static BlockMemHandle allocate(Device& d, const Index bm, + const Index bk, + const Index bn, + LhsScalar** lhs_block, + RhsScalar** rhs_block) { + eigen_assert(lhs_block); + eigen_assert(rhs_block); + BlockSizes sz = ComputeLhsRhsBlockSizes(bm, bk, bn); + char* block_mem = static_cast(d.allocate(sz.lhs_size + sz.rhs_size)); + eigen_assert(block_mem); + *lhs_block = reinterpret_cast(block_mem); + *rhs_block = reinterpret_cast(block_mem + sz.lhs_size); + return block_mem; + } + + template + EIGEN_DEVICE_FUNC static BlockMemHandle allocateSlices( + Device& d, const Index bm, const Index bk, const Index bn, + const Index num_lhs, const Index num_rhs, const Index num_slices, + std::vector* lhs_blocks, + std::vector* rhs_blocks) { + eigen_assert(num_slices > 0); + eigen_assert(num_lhs >= 0 && num_rhs >= 0); + eigen_assert(num_lhs == 0 || lhs_blocks); + eigen_assert(num_rhs == 0 || rhs_blocks); + BlockSizes sz = ComputeLhsRhsBlockSizes(bm, bk, 
bn); + void* block_mem = d.allocate( + (num_lhs * sz.lhs_size + num_rhs * sz.rhs_size) * num_slices); + eigen_assert(block_mem); + char* mem = static_cast(block_mem); + + for (Index x = 0; x < num_slices; x++) { + if (num_lhs > 0) lhs_blocks[x].resize(num_lhs); + for (Index m = 0; m < num_lhs; m++) { + lhs_blocks[x][m] = reinterpret_cast(mem); + mem += sz.lhs_size; + } + if (num_rhs > 0) rhs_blocks[x].resize(num_rhs); + for (Index n = 0; n < num_rhs; n++) { + rhs_blocks[x][n] = reinterpret_cast(mem); + mem += sz.rhs_size; + } + } + + return block_mem; + } + + template + EIGEN_DEVICE_FUNC static void deallocate(Device& d, BlockMemHandle handle) { + d.deallocate(handle); + } + + private: + struct BlockSizes { + Index lhs_size; + Index rhs_size; + }; + EIGEN_DEVICE_FUNC static BlockSizes ComputeLhsRhsBlockSizes(const Index bm, + const Index bk, + const Index bn) { + Index align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1); + BlockSizes sz; + sz.lhs_size = divup(bm * bk * sizeof(LhsScalar), align) * align; + sz.rhs_size = divup(bn * bk * sizeof(RhsScalar), align) * align; + return sz; + } +}; + +// WARNING: In this code we assume that Lhs and Rhs tensor expressions are in +// ColMajor storage order. This property is guaranteed by the +// TensorContractionOp evaluator. TensorContractionKernel specifies how we pack +// blocks of Lhs and Rhs tensor expressions, and how we invoke matrix +// multiplication for these blocks. Default tensor contraction uses +// gemm_pack_rhs, gemm_pack_lhs and gebp_kernel from Eigen Core (see +// GeneralBlocPanelKernel.h for details). +// +// By specializing contraction kernels we can use other low level libraries to +// perform matrix multiplication, and still rely on Eigen contraction evaluator. +// This also includes full support in TensorContractionThreadPool, assuming that +// underlying gemm do not use it's own threading. 
+// +// - ResScalar/LhsScalar/RhsScalar - scalar type for the result of +// multiplication, lhs tensor and rhs tensor respectively. +// +// - StorageIndex - index type for the tensor expressions. In practice almost +// always is Eigen::Index. +// +// - OutputMapper provides access to the memory of the output matrix. In +// practice it's always column major blas_data_mapper (it must be of ResScalar +// type). +// +// - LhsMapper/RhsMapper similarly to blas_data_mapper provide a two dimensional +// view into the Lhs/Rhs tensor expressions. In practice it's +// TensorContractionInputMapper, or some specialization of it based on the +// type of tensor expression (e.g. TensorImagePatchOp has optimized input +// mapper). +template +struct TensorContractionKernel { + // True if `invoke()` supports `beta` in `C <- alpha * A * B + beta * C` + // (otherwise beta should be always equal to 1). + enum { HasBeta = false }; + + EIGEN_DEVICE_FUNC + TensorContractionKernel(StorageIndex m_, StorageIndex k_, StorageIndex n_, + StorageIndex bm_, StorageIndex bk_, StorageIndex bn_) + : m(m_), k(k_), n(n_), bm(bm_), bk(bk_), bn(bn_) {} + + // Pack blocks of Lhs and Rhs into contiguous blocks in memory. + typedef LhsScalar* LhsBlock; + typedef RhsScalar* RhsBlock; + + // Packed Lhs/Rhs block memory allocator. 
+ typedef TensorContractionBlockMemAllocator + BlockMemAllocator; + typedef typename BlockMemAllocator::BlockMemHandle BlockMemHandle; + + typedef typename internal::gebp_traits Traits; + + typedef internal::gemm_pack_lhs< + LhsScalar, StorageIndex, typename LhsMapper::SubMapper, Traits::mr, + Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor> + LhsPacker; + + typedef internal::gemm_pack_rhs + RhsPacker; + + typedef internal::gebp_kernel + GebpKernel; + + template + EIGEN_DEVICE_FUNC BlockMemHandle allocate(Device& d, LhsBlock* lhs_block, + RhsBlock* rhs_block) { + return BlockMemAllocator::allocate(d, bm, bk, bn, lhs_block, rhs_block); + } + + template + EIGEN_DEVICE_FUNC BlockMemHandle allocateSlices( + Device& d, const StorageIndex num_lhs, const StorageIndex num_rhs, + const StorageIndex num_slices, std::vector* lhs_blocks, + std::vector* rhs_blocks) { + return BlockMemAllocator::allocateSlices( + d, bm, bk, bn, num_lhs, num_rhs, num_slices, lhs_blocks, rhs_blocks); + } + + template + EIGEN_DEVICE_FUNC static void deallocate(Device& d, BlockMemHandle handle) { + BlockMemAllocator::deallocate(d, handle); + } + + EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void packLhs( + LhsBlock* lhsBlock, const typename LhsMapper::SubMapper& data_mapper, + const StorageIndex depth, const StorageIndex rows) { + LhsPacker()(*lhsBlock, data_mapper, depth, rows, /*stride*/ 0, + /*offset*/ 0); + } + + EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void packRhs( + RhsBlock* rhsBlock, const typename RhsMapper::SubMapper& data_mapper, + const StorageIndex depth, const StorageIndex cols) { + RhsPacker()(*rhsBlock, data_mapper, depth, cols); + } + + EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void invoke( + const OutputMapper& output_mapper, const LhsBlock& lhsBlock, + const RhsBlock& rhsBlock, const StorageIndex rows, + const StorageIndex depth, const StorageIndex cols, + const ResScalar alpha, const ResScalar beta) { + // Default GEBP kernel does not support beta. 
+ eigen_assert(beta == ResScalar(1)); + static const int kComputeStrideFromBlockDimensions = -1; + GebpKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha, + /*strideA*/ kComputeStrideFromBlockDimensions, + /*strideB*/ kComputeStrideFromBlockDimensions, + /*offsetA*/ 0, /*offsetB*/ 0); + } + + private: + // These are dimensions of the original Tensors, and selected block sizes. The + // actual block sizes passed to all function above might be smaller because of + // the partial blocks at the end. + const StorageIndex m; + const StorageIndex k; + const StorageIndex n; + const StorageIndex bm; + const StorageIndex bk; + const StorageIndex bn; +}; + +} // end namespace internal + +// Tensor contraction params that should enable to get from output matrix +// 2-dimensional coordinates to the output tensor dimensions. +struct TensorContractionParams { + // TensorContraction evaluator assumes that both tensors are in ColMajor + // layout, if tensors are in RowMajor evaluator swap lhs with rhs. + bool swapped_arguments; +}; + +// Output kernel allows to fuse operations into the tensor contraction. +// +// Examples: +// 1. Elementwise Relu transformation following Conv2D. +// 2. AddBias to the Conv2D output channels dimension. +// +// The NoOpOutputKernel implements an output kernel that does absolutely nothing. +struct NoOpOutputKernel { + /** + * Tensor contraction evaluator calls this kernel after finishing each block + * of output matrix. Output blocks belong to the 2-dimensional output tensor. + * + * TensorContractionParams contains contraction dimensions information + * required to map output 2-d space into the expected output tensor space + * (potentially higher dimensional). 
+ * + * \param[in] output_mapper Access to output tensor memory + * \param[in] params Tensor contraction parameters + * \param[in] i Index of a first row available through output_mapper + * \param[in] j Index of a first column available through output_mapper + * \param[in] num_rows Number of available rows + * \param[in] num_cols Number of available columns + */ + template + EIGEN_ALWAYS_INLINE void operator()( + const internal::blas_data_mapper& output_mapper, + const TensorContractionParams& params, Index i, + Index j, Index num_rows, Index num_cols) const { + EIGEN_UNUSED_VARIABLE(output_mapper); + EIGEN_UNUSED_VARIABLE(params); + EIGEN_UNUSED_VARIABLE(i); + EIGEN_UNUSED_VARIABLE(j); + EIGEN_UNUSED_VARIABLE(num_rows); + EIGEN_UNUSED_VARIABLE(num_cols); + } +}; + +template +class TensorContractionOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename internal::gebp_traits::ResScalar CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp( + const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims, + const OutputKernelType& output_kernel = OutputKernelType()) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims), + m_output_kernel(output_kernel) {} + + EIGEN_DEVICE_FUNC + const Indices& indices() const { return m_indices; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + lhsExpression() const { return m_lhs_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + rhsExpression() const { return m_rhs_xpr; } + + EIGEN_DEVICE_FUNC + const OutputKernelType& outputKernel() const { return m_output_kernel; } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + 
const Indices m_indices; + const OutputKernelType m_output_kernel; +}; + + +template +struct TensorContractionEvaluatorBase : internal::no_assignment_operator +{ + typedef typename internal::traits::Indices Indices; + typedef typename internal::traits::LeftArgType LeftArgType; + typedef typename internal::traits::RightArgType RightArgType; + typedef typename internal::traits::OutputKernelType OutputKernelType; + typedef typename internal::traits::Device Device; + + typedef TensorContractionOp XprType; + typedef typename internal::remove_const::type Scalar; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + enum { + IsAligned = true, + PacketAccess = (PacketType::size > 1), + BlockAccess = false, + PreferBlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = true + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. 
+ typedef typename internal::conditional< + static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + + typedef TensorEvaluator LeftEvaluatorType; + typedef TensorEvaluator RightEvaluatorType; + + static const int LDims = + internal::array_size::Dimensions>::value; + static const int RDims = + internal::array_size::Dimensions>::value; + static const int ContractDims = internal::array_size::value; + static const int NumDims = LDims + RDims - 2 * ContractDims; + + typedef array contract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; + + typedef DSizes Dimensions; + + EIGEN_STRONG_INLINE + TensorContractionEvaluatorBase(const XprType& op, const Device& device) + : m_leftImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), + op.lhsExpression(), op.rhsExpression()), device), + m_rightImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), + op.rhsExpression(), op.lhsExpression()), device), + m_device(device), + m_output_kernel(op.outputKernel()), + m_result(NULL) { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + + + DSizes eval_left_dims; + DSizes eval_right_dims; + array, ContractDims> eval_op_indices; + if (static_cast(Layout) == static_cast(ColMajor)) { + // For ColMajor, we keep using the existing dimensions + for (int i = 0; i < LDims; i++) { + eval_left_dims[i] = m_leftImpl.dimensions()[i]; + } + for (int i = 0; i < RDims; i++) { + eval_right_dims[i] = m_rightImpl.dimensions()[i]; + } + // We keep the pairs of contracting indices. 
+ for (int i = 0; i < ContractDims; i++) { + eval_op_indices[i].first = op.indices()[i].first; + eval_op_indices[i].second = op.indices()[i].second; + } + } else { + // For RowMajor, we need to reverse the existing dimensions + for (int i = 0; i < LDims; i++) { + eval_left_dims[i] = m_leftImpl.dimensions()[LDims - i - 1]; + } + for (int i = 0; i < RDims; i++) { + eval_right_dims[i] = m_rightImpl.dimensions()[RDims - i - 1]; + } + // We need to flip all the pairs of contracting indices as well as + // reversing the dimensions. + for (int i = 0; i < ContractDims; i++) { + eval_op_indices[i].first = LDims - 1 - op.indices()[ContractDims - 1 - i].second; + eval_op_indices[i].second = RDims - 1 - op.indices()[ContractDims - 1 - i].first; + } + } + + // Check for duplicate axes and make sure the first index in eval_op_indices + // is increasing. Using O(n^2) sorting is OK since ContractDims is small + for (int i = 0; i < ContractDims; i++) { + for (int j = i + 1; j < ContractDims; j++) { + eigen_assert(eval_op_indices[j].first != eval_op_indices[i].first && + eval_op_indices[j].second != eval_op_indices[i].second && + "contraction axes should be unique"); + if (eval_op_indices[j].first < eval_op_indices[i].first) { + numext::swap(eval_op_indices[j], eval_op_indices[i]); + } + } + } + + array lhs_strides; + lhs_strides[0] = 1; + for (int i = 0; i < LDims-1; ++i) { + lhs_strides[i+1] = lhs_strides[i] * eval_left_dims[i]; + } + + array rhs_strides; + rhs_strides[0] = 1; + for (int i = 0; i < RDims-1; ++i) { + rhs_strides[i+1] = rhs_strides[i] * eval_right_dims[i]; + } + + if (m_i_strides.size() > 0) m_i_strides[0] = 1; + if (m_j_strides.size() > 0) m_j_strides[0] = 1; + if (m_k_strides.size() > 0) m_k_strides[0] = 1; + + m_i_size = 1; + m_j_size = 1; + m_k_size = 1; + + // To compute the dimension, we simply concatenate the non-contracting + // dimensions of the left and then the right tensor. 
Additionally, we also + // compute the strides corresponding to the left non-contracting + // dimensions and right non-contracting dimensions. + m_lhs_inner_dim_contiguous = true; + int dim_idx = 0; + Index nocontract_idx = 0; + + for (int i = 0; i < LDims; i++) { + // find if we are contracting on index i of left tensor + bool contracting = false; + for (int j = 0; j < ContractDims; j++) { + if (eval_op_indices[j].first == i) { + contracting = true; + break; + } + } + if (!contracting) { + // add dimension size to output dimensions + m_dimensions[dim_idx] = eval_left_dims[i]; + m_left_nocontract_strides[nocontract_idx] = lhs_strides[i]; + if (dim_idx != i) { + m_lhs_inner_dim_contiguous = false; + } + if (nocontract_idx+1 < internal::array_size::value) { + m_i_strides[nocontract_idx+1] = + m_i_strides[nocontract_idx] * eval_left_dims[i]; + } else { + m_i_size = m_i_strides[nocontract_idx] * eval_left_dims[i]; + } + dim_idx++; + nocontract_idx++; + } + } + + nocontract_idx = 0; + for (int i = 0; i < RDims; i++) { + bool contracting = false; + // find if we are contracting on index i of right tensor + for (int j = 0; j < ContractDims; j++) { + if (eval_op_indices[j].second == i) { + contracting = true; + break; + } + } + if (!contracting) { + m_dimensions[dim_idx] = eval_right_dims[i]; + if (nocontract_idx+1 < internal::array_size::value) { + m_j_strides[nocontract_idx+1] = + m_j_strides[nocontract_idx] * eval_right_dims[i]; + } else { + m_j_size = m_j_strides[nocontract_idx] * eval_right_dims[i]; + } + m_right_nocontract_strides[nocontract_idx] = rhs_strides[i]; + dim_idx++; + nocontract_idx++; + } + } + + // Now compute the strides corresponding to the contracting dimensions. We + // assumed above that non-contracting axes are represented in the same order + // in the matrix as they are in the tensor. This is not the case for + // contracting axes. As the contracting axes must be of the same size in + // each tensor, we'll only look at the first tensor here. 
+ m_rhs_inner_dim_contiguous = true; + m_rhs_inner_dim_reordered = false; + for (int i = 0; i < ContractDims; i++) { + Index left = eval_op_indices[i].first; + Index right = eval_op_indices[i].second; + + Index size = eval_left_dims[left]; + eigen_assert(size == eval_right_dims[right] && + "Contraction axes must be same size"); + + if (i+1 < static_cast(internal::array_size::value)) { + m_k_strides[i+1] = m_k_strides[i] * size; + } else { + m_k_size = m_k_strides[i] * size; + } + m_left_contracting_strides[i] = lhs_strides[left]; + m_right_contracting_strides[i] = rhs_strides[right]; + + if (i > 0 && right < eval_op_indices[i-1].second) { + m_rhs_inner_dim_reordered = true; + } + if (right != i) { + m_rhs_inner_dim_contiguous = false; + } + } + + // If the layout is RowMajor, we need to reverse the m_dimensions + if (static_cast(Layout) == static_cast(RowMajor)) { + for (int i = 0, j = NumDims - 1; i < j; i++, j--) { + numext::swap(m_dimensions[i], m_dimensions[j]); + } + } + + // A set of parameters that will allow output kernel to get from output + // tensor dimensions (i, j) into the original tensor dimensions. + // TODO(ezhulenev): Add parameters required to infer output tensor index for + // more complex contractions than 2x2 on internal dimension. 
+ m_tensor_contraction_params.swapped_arguments = static_cast(Layout) == RowMajor; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + m_leftImpl.evalSubExprsIfNeeded(NULL); + m_rightImpl.evalSubExprsIfNeeded(NULL); + if (data) { + evalTo(data); + return false; + } else { + m_result = static_cast(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); + evalTo(m_result); + return true; + } + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType dest, EvalSubExprsCallback done) { + m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done, dest](bool) { + m_rightImpl.evalSubExprsIfNeededAsync(nullptr, [this, done, dest](bool) { + if (dest) { + evalToAsync(dest, [done]() { done(false); }); + } else { + m_result = static_cast( + m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); + evalToAsync(m_result, [done]() { done(true); }); + } + }); + }); + } +#endif // EIGEN_USE_THREADS + +#ifndef TENSOR_CONTRACTION_DISPATCH +#define TENSOR_CONTRACTION_DISPATCH(METHOD, ALIGNMENT, ARGS) \ + if (this->m_lhs_inner_dim_contiguous) { \ + if (this->m_rhs_inner_dim_contiguous) { \ + if (this->m_rhs_inner_dim_reordered) { \ + METHOD ARGS; \ + } else { \ + METHOD ARGS; \ + } \ + } else { \ + if (this->m_rhs_inner_dim_reordered) { \ + METHOD ARGS; \ + } else { \ + METHOD ARGS; \ + } \ + } \ + } else { \ + if (this->m_rhs_inner_dim_contiguous) { \ + if (this->m_rhs_inner_dim_reordered) { \ + METHOD ARGS; \ + } else { \ + METHOD ARGS; \ + } \ + } else { \ + if (this->m_rhs_inner_dim_reordered) { \ + METHOD ARGS; \ + } else { \ + METHOD ARGS; \ + } \ + } \ + } +#endif + +#ifndef TENSOR_CONTRACTION_ASYNC_DISPATCH +#define TENSOR_CONTRACTION_ASYNC_DISPATCH(METHOD, DONE, ALIGNMENT, ARGS, FN) \ + if (this->m_lhs_inner_dim_contiguous) { \ + if (this->m_rhs_inner_dim_contiguous) { 
\ + if (this->m_rhs_inner_dim_reordered) { \ + (new METHOD ARGS)->FN; \ + } else { \ + (new METHOD ARGS)->FN; \ + } \ + } else { \ + if (this->m_rhs_inner_dim_reordered) { \ + (new METHOD ARGS)->FN; \ + } else { \ + (new METHOD ARGS)->FN; \ + } \ + } \ + } else { \ + if (this->m_rhs_inner_dim_contiguous) { \ + if (this->m_rhs_inner_dim_reordered) { \ + (new METHOD ARGS)->FN; \ + } else { \ + (new METHOD ARGS)->FN; \ + } \ + } else { \ + if (this->m_rhs_inner_dim_reordered) { \ + (new METHOD ARGS)->FN; \ + } else { \ + (new METHOD ARGS)->FN; \ + } \ + } \ + } +#endif + + EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const { + static_cast(this)->template evalProduct(buffer); + } + +#ifdef EIGEN_USE_THREADS + template + void evalToAsync(Scalar* buffer, EvalToCallback done) const { + static_cast(this) + ->template evalProductAsync(buffer, + std::move(done)); + } +#endif // EIGEN_USE_THREADS + + template + void evalProductSequential(Scalar* buffer) const { + if (this->m_j_size == 1) { + this->template evalGemv(buffer); + } else { + this->template evalGemm(buffer); + } + } + + template + #if !defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC + #endif + void evalGemv(Scalar* buffer) const { + const Index rows = m_i_size; + const Index cols = m_k_size; + + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + const Index lhs_packet_size = internal::unpacket_traits::size; + const Index rhs_packet_size = internal::unpacket_traits::size; + const int lhs_alignment = LeftEvaluator::IsAligned ? Aligned : Unaligned; + const int rhs_alignment = RightEvaluator::IsAligned ? 
Aligned : Unaligned; + typedef internal::TensorContractionInputMapper LhsMapper; + + typedef internal::TensorContractionInputMapper RhsMapper; + + LhsMapper lhs(m_leftImpl, m_left_nocontract_strides, m_i_strides, + m_left_contracting_strides, m_k_strides); + RhsMapper rhs(m_rightImpl, m_right_nocontract_strides, m_j_strides, + m_right_contracting_strides, m_k_strides); + + const Scalar alpha(1); + const Index resIncr(1); + + // zero out the result buffer (which must be of size at least rows * sizeof(Scalar) + m_device.memset(buffer, 0, rows * sizeof(Scalar)); + + internal::general_matrix_vector_product::run( + rows, cols, lhs, rhs, + buffer, resIncr, alpha); + + typedef internal::blas_data_mapper OutputMapper; + m_output_kernel(OutputMapper(buffer, rows), m_tensor_contraction_params, + static_cast(0), static_cast(0), rows, + static_cast(1)); + } + + template + #if !defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC + #endif + void evalGemm(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + this->template evalGemmPartial(buffer, 0, k, 1); + } + + template + EIGEN_DEVICE_FUNC void evalGemmPartialWithoutOutputKernel( + Scalar* buffer, Index k_start, Index k_end, int num_threads) const { + evalGemmPartial(buffer, k_start, k_end, + num_threads); + } + + template + EIGEN_DEVICE_FUNC void evalGemmPartial(Scalar* buffer, Index k_start, Index k_end, int num_threads) const { + eigen_assert(k_end >= k_start && k_start >= 0 && k_end <= this->m_k_size); + // columns in slice on left side, rows on right side + const Index k_slice = k_end - k_start; + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // define data mappers for Lhs and Rhs + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; + + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + + const Index 
lhs_packet_size = internal::unpacket_traits::size; + const Index rhs_packet_size = internal::unpacket_traits::size; + + typedef internal::TensorContractionInputMapper LhsMapper; + + typedef internal::TensorContractionInputMapper RhsMapper; + + typedef internal::blas_data_mapper OutputMapper; + + typedef internal::TensorContractionKernel< + Scalar, LhsScalar, RhsScalar, Index, OutputMapper, LhsMapper, RhsMapper> + TensorContractionKernel; + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + + // Sizes of the blocks to load in cache. See the Goto paper for details. + internal::TensorContractionBlocking + blocking(k_slice, m, n, num_threads); + const Index kc = blocking.kc(); + const Index mc = numext::mini(m, blocking.mc()); + const Index nc = numext::mini(n, blocking.nc()); + + typedef typename TensorContractionKernel::LhsBlock LhsBlock; + typedef typename TensorContractionKernel::RhsBlock RhsBlock; + + LhsBlock blockA; + RhsBlock blockB; + + TensorContractionKernel kernel(m, k_slice, n, mc, kc, nc); + + typedef typename TensorContractionKernel::BlockMemHandle BlockMemHandle; + const BlockMemHandle packed_mem = + kernel.allocate(this->m_device, &blockA, &blockB); + + // If a contraction kernel does not support beta, explicitly initialize + // output buffer with zeroes. 
+ if (!TensorContractionKernel::HasBeta) { + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + } + + for(Index i2=0; i2= k_end) { + m_output_kernel(output_mapper, m_tensor_contraction_params, i2, j2, + actual_mc, actual_nc); + } + } + } + } + + kernel.deallocate(this->m_device, packed_mem); + } + + EIGEN_STRONG_INLINE void cleanup() { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + + if (m_result != NULL) { + m_device.deallocate(m_result); + m_result = NULL; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return m_result[index]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + return internal::ploadt(m_result + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_result; } + +protected: + Dimensions m_dimensions; + + contract_t m_k_strides; + contract_t m_left_contracting_strides; + contract_t m_right_contracting_strides; + + bool m_lhs_inner_dim_contiguous; + bool m_rhs_inner_dim_contiguous; + bool m_rhs_inner_dim_reordered; + + left_nocontract_t m_i_strides; + right_nocontract_t m_j_strides; + left_nocontract_t m_left_nocontract_strides; + right_nocontract_t m_right_nocontract_strides; + + Index m_i_size; + Index m_j_size; + Index m_k_size; + + TensorContractionParams m_tensor_contraction_params; + + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; + const Device EIGEN_DEVICE_REF m_device; + OutputKernelType m_output_kernel; + EvaluatorPointerType m_result; +}; + + +// evaluator for default device +template +struct TensorEvaluator, Device> : + public TensorContractionEvaluatorBase< + TensorEvaluator, Device> > { + typedef TensorEvaluator, Device> Self; + typedef TensorContractionEvaluatorBase Base; + + typedef TensorContractionOp XprType; + typedef typename 
internal::remove_const::type Scalar; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + enum { + Layout = TensorEvaluator::Layout + }; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. + typedef typename internal::conditional< + static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size::Dimensions>::value; + static const int RDims = + internal::array_size::Dimensions>::value; + static const int ContractDims = internal::array_size::value; + + typedef array contract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; + + static const int NumDims = LDims + RDims - 2 * ContractDims; + + // Could we use NumDimensions here? + typedef DSizes Dimensions; + + TensorEvaluator(const XprType& op, const Device& device) : + Base(op, device) { } + + template + void evalProduct(Scalar* buffer) const { + TENSOR_CONTRACTION_DISPATCH(this->template evalProductSequential, Alignment, (buffer)); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionBlocking.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionBlocking.h new file mode 100644 index 0000000..974feb0 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionBlocking.h @@ -0,0 +1,73 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H + + +namespace Eigen { +namespace internal { + +enum { + ShardByRow = 0, + ShardByCol = 1 +}; + + +// Default Blocking Strategy +template +class TensorContractionBlocking { + public: + + /* + adding EIGEN_DEVICE_FUNC unconditionally to 'TensorContractionBlocking' constructor in `TensorContractionBlocking.h` + requires adding EIGEN_DEVICE_FUNC to `computeProductBlockingSizes` in `GeneralBlockPanelKernel.h` + which in turn, requires adding EIGEN_DEVICE_FUNC to `evaluateProductBlockingSizesHeuristic` in `GeneralBlockPanelKernel.h` + which in turn, requires adding EIGEN_DEVICE_FUNC to `manage_caching_sizes` in `GeneralBlockPanelKernel.h` + (else HIPCC will error out) + + However adding EIGEN_DEVICE_FUNC to `manage_caching_sizes` in `GeneralBlockPanelKernel.h` + results in NVCC erroring out with the following error + + ../Eigen/src/Core/products/GeneralBlockPanelKernel.h(57): error #2901: + dynamic initialization is not supported for function-scope static variables within a __device__/__global__ function + */ + + #if !defined(EIGEN_HIPCC) + EIGEN_DEVICE_FUNC + #endif + TensorContractionBlocking(StorageIndex k, StorageIndex m, StorageIndex n, StorageIndex num_threads = 1) : + kc_(k), mc_(m), nc_(n) + { + if (ShardingType == ShardByCol) { + computeProductBlockingSizes(kc_, mc_, nc_, num_threads); + } + else { + computeProductBlockingSizes(kc_, nc_, mc_, num_threads); + } + + const int rhs_packet_size = internal::packet_traits::size; + kc_ = (rhs_packet_size <= 8 || kc_ <= rhs_packet_size) ? 
+ kc_ : (kc_ / rhs_packet_size) * rhs_packet_size; + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex kc() const { return kc_; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex mc() const { return mc_; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex nc() const { return nc_; } + + private: + StorageIndex kc_; + StorageIndex mc_; + StorageIndex nc_; +}; + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionCuda.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionCuda.h new file mode 100644 index 0000000..3f315fe --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionCuda.h @@ -0,0 +1,6 @@ + +#if defined(__clang__) || defined(__GNUC__) +#warning "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorContractionGpu.h file" +#endif + +#include "TensorContractionGpu.h" diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionGpu.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionGpu.h new file mode 100644 index 0000000..c818038 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionGpu.h @@ -0,0 +1,1413 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014-2015 Benoit Steiner +// Copyright (C) 2015 Navdeep Jaitly +// Copyright (C) 2014 Eric Martin +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H + +#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC) + +namespace Eigen { + +template +__device__ EIGEN_STRONG_INLINE void +EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem, + const Index m_size, const Index n_size, const Index k_size) { + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + // declare and initialize 64 registers for output 8x8 block + + // prefetch registers + Scalar lhs_pf0; + Scalar lhs_pf1; + Scalar lhs_pf2; + Scalar lhs_pf3; + Scalar lhs_pf4; + Scalar lhs_pf5; + Scalar lhs_pf6; + Scalar lhs_pf7; + + Scalar rhs_pf0; + Scalar rhs_pf1; + Scalar rhs_pf2; + Scalar rhs_pf3; + Scalar rhs_pf4; + Scalar rhs_pf5; + Scalar rhs_pf6; + Scalar rhs_pf7; + + // shared memory is formatted + // (contract idx in block, nocontract idx in block, block idx) + // where block idx is column major. This transposition limits the number of + // bank conflicts when reading the LHS. The core idea is that since the contracting + // index is shared by both sides, then the contracting index should be in threadIdx.x. + + // On the LHS, we pad each row inside of each block with an extra element. This makes + // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts + // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks. + + // On the RHS we just add 8 padding elements to the end of each block. This gives no bank + // conflicts on writes and also none on reads. 
+ + // storage indices + const Index lhs_store_idx_base = threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z; + const Index rhs_store_idx_base = threadIdx.y * 72 + threadIdx.z * 8 + threadIdx.x; + + const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0; + const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1; + const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2; + const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3; + const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4; + const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5; + const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6; + const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7; + + const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0; + const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1; + const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2; + const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3; + const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4; + const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5; + const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6; + const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7; + + // in the loading code, the following variables are important: + // threadIdx.x: the vertical position in an 8x8 block + // threadIdx.y: the vertical index of the 8x8 block in the grid + // threadIdx.z: the horizontal position in an 8x8 block + // k: the horizontal index of the 8x8 block in the grid + // + // The k parameter is implicit (it was the loop counter for a loop that went + // from 0 to <8, but now that loop is unrolled in the below code. 
+ + const Index load_idx_vert = threadIdx.x + 8 * threadIdx.y; + const Index lhs_vert = base_m + load_idx_vert; + +#define prefetchIntoRegisters(base_k) \ + { \ + lhs_pf0 = conv(0); \ + lhs_pf1 = conv(0); \ + lhs_pf2 = conv(0); \ + lhs_pf3 = conv(0); \ + lhs_pf4 = conv(0); \ + lhs_pf5 = conv(0); \ + lhs_pf6 = conv(0); \ + lhs_pf7 = conv(0); \ + \ + rhs_pf0 = conv(0); \ + rhs_pf1 = conv(0); \ + rhs_pf2 = conv(0); \ + rhs_pf3 = conv(0); \ + rhs_pf4 = conv(0); \ + rhs_pf5 = conv(0); \ + rhs_pf6 = conv(0); \ + rhs_pf7 = conv(0); \ + \ + if (!needs_edge_check || lhs_vert < m_size) { \ + const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8; \ + const Index lhs_horiz_1 = base_k + threadIdx.z + 1 * 8; \ + const Index lhs_horiz_2 = base_k + threadIdx.z + 2 * 8; \ + const Index lhs_horiz_3 = base_k + threadIdx.z + 3 * 8; \ + const Index lhs_horiz_4 = base_k + threadIdx.z + 4 * 8; \ + const Index lhs_horiz_5 = base_k + threadIdx.z + 5 * 8; \ + const Index lhs_horiz_6 = base_k + threadIdx.z + 6 * 8; \ + const Index lhs_horiz_7 = base_k + threadIdx.z + 7 * 8; \ + \ + if (!needs_edge_check || lhs_horiz_7 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ + lhs_pf7 = lhs(lhs_vert, lhs_horiz_7); \ + } else if (lhs_horiz_6 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ + } else if (lhs_horiz_5 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ 
+ lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + } else if (lhs_horiz_4 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + } else if (lhs_horiz_3 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + } else if (lhs_horiz_2 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + } else if (lhs_horiz_1 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + } else if (lhs_horiz_0 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + } \ + } \ + \ + const Index rhs_vert = base_k + load_idx_vert; \ + if (!needs_edge_check || rhs_vert < k_size) { \ + const Index rhs_horiz_0 = base_n + threadIdx.z + 0 * 8; \ + const Index rhs_horiz_1 = base_n + threadIdx.z + 1 * 8; \ + const Index rhs_horiz_2 = base_n + threadIdx.z + 2 * 8; \ + const Index rhs_horiz_3 = base_n + threadIdx.z + 3 * 8; \ + const Index rhs_horiz_4 = base_n + threadIdx.z + 4 * 8; \ + const Index rhs_horiz_5 = base_n + threadIdx.z + 5 * 8; \ + const Index rhs_horiz_6 = base_n + threadIdx.z + 6 * 8; \ + const Index rhs_horiz_7 = base_n + threadIdx.z + 7 * 8; \ + \ + if (rhs_horiz_7 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ + rhs_pf7 = rhs(rhs_vert, rhs_horiz_7); \ + } else if (rhs_horiz_6 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); 
\ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ + } else if (rhs_horiz_5 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + } else if (rhs_horiz_4 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + } else if (rhs_horiz_3 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + } else if (rhs_horiz_2 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + } else if (rhs_horiz_1 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + } else if (rhs_horiz_0 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + } \ + } \ + } \ + +#define writeRegToShmem(_) \ + lhs_shmem[lhs_store_idx_0] = lhs_pf0; \ + rhs_shmem[rhs_store_idx_0] = rhs_pf0; \ + \ + lhs_shmem[lhs_store_idx_1] = lhs_pf1; \ + rhs_shmem[rhs_store_idx_1] = rhs_pf1; \ + \ + lhs_shmem[lhs_store_idx_2] = lhs_pf2; \ + rhs_shmem[rhs_store_idx_2] = rhs_pf2; \ + \ + lhs_shmem[lhs_store_idx_3] = lhs_pf3; \ + rhs_shmem[rhs_store_idx_3] = rhs_pf3; \ + \ + lhs_shmem[lhs_store_idx_4] = lhs_pf4; \ + rhs_shmem[rhs_store_idx_4] = rhs_pf4; \ + \ + lhs_shmem[lhs_store_idx_5] = lhs_pf5; \ + rhs_shmem[rhs_store_idx_5] = rhs_pf5; \ + \ + lhs_shmem[lhs_store_idx_6] = lhs_pf6; \ + rhs_shmem[rhs_store_idx_6] = rhs_pf6; \ + \ + 
lhs_shmem[lhs_store_idx_7] = lhs_pf7; \ + rhs_shmem[rhs_store_idx_7] = rhs_pf7; \ + + // declare and initialize result array +#define res(i, j) _res_##i##j +#define initResultRow(i) \ + Scalar res(i, 0) = conv(0); \ + Scalar res(i, 1) = conv(0); \ + Scalar res(i, 2) = conv(0); \ + Scalar res(i, 3) = conv(0); \ + Scalar res(i, 4) = conv(0); \ + Scalar res(i, 5) = conv(0); \ + Scalar res(i, 6) = conv(0); \ + Scalar res(i, 7) = conv(0); \ + + internal::scalar_cast_op conv; + initResultRow(0); + initResultRow(1); + initResultRow(2); + initResultRow(3); + initResultRow(4); + initResultRow(5); + initResultRow(6); + initResultRow(7); +#undef initResultRow + + for (Index base_k = 0; base_k < k_size; base_k += 64) { + // wait for previous iteration to finish with shmem. Despite common sense, + // the code is a bit faster with this here then at bottom of loop + __syncthreads(); + + prefetchIntoRegisters(base_k); + writeRegToShmem(); + + #undef prefetchIntoRegisters + #undef writeRegToShmem + + // wait for shared mem packing to be done before starting computation + __syncthreads(); + + // compute 8x8 matrix product by outer product. This involves packing one column + // of LHS and one row of RHS into registers (takes 16 registers). 
+ +#define lcol(i) _lcol##i + Scalar lcol(0); + Scalar lcol(1); + Scalar lcol(2); + Scalar lcol(3); + Scalar lcol(4); + Scalar lcol(5); + Scalar lcol(6); + Scalar lcol(7); + +#define rrow(j) _rrow##j + Scalar rrow(0); + Scalar rrow(1); + Scalar rrow(2); + Scalar rrow(3); + Scalar rrow(4); + Scalar rrow(5); + Scalar rrow(6); + Scalar rrow(7); + + // Now x corresponds to k, y to m, and z to n + const Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y]; + const Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z]; + +#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))] +#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))] + +#define loadData(i, j) \ + lcol(0) = lhs_element(0, j); \ + rrow(0) = rhs_element(i, 0); \ + lcol(1) = lhs_element(1, j); \ + rrow(1) = rhs_element(i, 1); \ + lcol(2) = lhs_element(2, j); \ + rrow(2) = rhs_element(i, 2); \ + lcol(3) = lhs_element(3, j); \ + rrow(3) = rhs_element(i, 3); \ + lcol(4) = lhs_element(4, j); \ + rrow(4) = rhs_element(i, 4); \ + lcol(5) = lhs_element(5, j); \ + rrow(5) = rhs_element(i, 5); \ + lcol(6) = lhs_element(6, j); \ + rrow(6) = rhs_element(i, 6); \ + lcol(7) = lhs_element(7, j); \ + rrow(7) = rhs_element(i, 7); \ + +#define computeCol(j) \ + res(0, j) += lcol(0) * rrow(j); \ + res(1, j) += lcol(1) * rrow(j); \ + res(2, j) += lcol(2) * rrow(j); \ + res(3, j) += lcol(3) * rrow(j); \ + res(4, j) += lcol(4) * rrow(j); \ + res(5, j) += lcol(5) * rrow(j); \ + res(6, j) += lcol(6) * rrow(j); \ + res(7, j) += lcol(7) * rrow(j); \ + +#define computePass(i) \ + loadData(i, i); \ + \ + computeCol(0); \ + computeCol(1); \ + computeCol(2); \ + computeCol(3); \ + computeCol(4); \ + computeCol(5); \ + computeCol(6); \ + computeCol(7); \ + + computePass(0); + computePass(1); + computePass(2); + computePass(3); + computePass(4); + computePass(5); + computePass(6); + computePass(7); + +#undef lcol +#undef rrow +#undef lhs_element +#undef rhs_element +#undef loadData +#undef computeCol +#undef 
computePass + } // end loop over k + + // we've now iterated over all of the large (ie width 64) k blocks and + // accumulated results in registers. At this point thread (x, y, z) contains + // the sum across all big k blocks of the product of little k block of index (x, y) + // with block of index (y, z). To compute the final output, we need to reduce + // the 8 threads over y by summation. +#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000) +#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask) +#else +#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask) +#endif + +#define reduceRow(i, mask) \ + shuffleInc(i, 0, mask); \ + shuffleInc(i, 1, mask); \ + shuffleInc(i, 2, mask); \ + shuffleInc(i, 3, mask); \ + shuffleInc(i, 4, mask); \ + shuffleInc(i, 5, mask); \ + shuffleInc(i, 6, mask); \ + shuffleInc(i, 7, mask); \ + +#define reduceMatrix(mask) \ + reduceRow(0, mask); \ + reduceRow(1, mask); \ + reduceRow(2, mask); \ + reduceRow(3, mask); \ + reduceRow(4, mask); \ + reduceRow(5, mask); \ + reduceRow(6, mask); \ + reduceRow(7, mask); \ + + // actually perform the reduction, now each thread of index (_, y, z) + // contains the correct values in its registers that belong in the output + // block + reduceMatrix(1); + reduceMatrix(2); + reduceMatrix(4); + +#undef shuffleInc +#undef reduceRow +#undef reduceMatrix + + // now we need to copy the 64 values into main memory. We can't split work + // among threads because all variables are in registers. There's 2 ways + // to do this: + // (1) have 1 thread do 64 writes from registers into global memory + // (2) have 1 thread do 64 writes into shared memory, and then 8 threads + // each do 8 writes into global memory. We can just overwrite the shared + // memory from the problem we just solved. 
+ // (2) is slightly faster than (1) due to less branching and more ILP + + // TODO: won't yield much gain, but could just use currently unused shared mem + // and then we won't have to sync + // wait for shared mem to be out of use + __syncthreads(); + +#define writeResultShmem(i, j) \ + lhs_shmem[i + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j] = res(i, j); \ + +#define writeRow(i) \ + writeResultShmem(i, 0); \ + writeResultShmem(i, 1); \ + writeResultShmem(i, 2); \ + writeResultShmem(i, 3); \ + writeResultShmem(i, 4); \ + writeResultShmem(i, 5); \ + writeResultShmem(i, 6); \ + writeResultShmem(i, 7); \ + + if (threadIdx.x == 0) { + writeRow(0); + writeRow(1); + writeRow(2); + writeRow(3); + writeRow(4); + writeRow(5); + writeRow(6); + writeRow(7); + } +#undef writeResultShmem +#undef writeRow + + const int max_i_write = numext::mini((int)((m_size - base_m - threadIdx.y + 7) / 8), 8); + const int max_j_write = numext::mini((int)((n_size - base_n - threadIdx.z + 7) / 8), 8); + + if (threadIdx.x < max_i_write) { + if (max_j_write == 8) { + // TODO: can i trade bank conflicts for coalesced writes? 
+ Scalar val0 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 0]; + Scalar val1 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 1]; + Scalar val2 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 2]; + Scalar val3 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 3]; + Scalar val4 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 4]; + Scalar val5 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 5]; + Scalar val6 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 6]; + Scalar val7 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 7]; + + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 0) = val0; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 1) = val1; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 2) = val2; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 3) = val3; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 4) = val4; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 5) = val5; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 6) = val6; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 7) = val7; + } else { +#pragma unroll 7 + for (int j = 0; j < max_j_write; j++) { + Scalar val = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j]; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * j) = val; + } + } + } +#undef res +} + + +template +__global__ void +#if defined(EIGEN_HIPCC) +__launch_bounds__(512, 1) +#else +__launch_bounds__(512) +#endif +EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ Scalar lhs_shmem[72 * 64]; + __shared__ 
Scalar rhs_shmem[72 * 64]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + if (base_m + 63 < m_size && base_n + 63 < n_size) { + EigenContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } else { + EigenContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } +} + + +template +__device__ __forceinline__ void +EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, float2 lhs_shmem2[][16], + float2 rhs_shmem2[][8], const Index m_size, + const Index n_size, const Index k_size, + const Index base_m, const Index base_n) { + + // prefetch registers + float4 lhs_pf0, rhs_pf0; + + float4 results[4]; + for (int i=0; i < 4; i++) { + results[i].x = results[i].y = results[i].z = results[i].w = 0; + } + +#define prefetch_lhs(reg, row, col) \ + if (!CHECK_LHS_BOUNDARY) { \ + if (col < k_size) { \ + reg =lhs.template loadPacket(row, col); \ + } \ + } else { \ + if (col < k_size) { \ + if (row + 3 < m_size) { \ + reg =lhs.template loadPacket(row, col); \ + } else if (row + 2 < m_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + reg.z =lhs(row + 2, col); \ + } else if (row + 1 < m_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + } else if (row < m_size) { \ + reg.x =lhs(row + 0, col); \ + } \ + } \ + } \ + + Index lhs_vert = base_m+threadIdx.x*4; + + for (Index k = 0; k < k_size; k += 16) { + + lhs_pf0 = internal::pset1(0); + rhs_pf0 = internal::pset1(0); + + Index lhs_horiz = threadIdx.y+k; + prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz) + + Index rhs_vert = k+(threadIdx.x%4)*4; + Index rhs_horiz0 = (threadIdx.x>>2)+threadIdx.y*4+base_n; + + if (!CHECK_RHS_BOUNDARY) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); 
+ } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if (rhs_vert + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } else { + if (rhs_horiz0 < n_size) { + if ((rhs_vert + 3) < k_size) { + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + } else if ((rhs_vert + 2) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if ((rhs_vert + 1) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } + } + float x1, x2 ; + // the following can be a bitwise operation..... some day. + if((threadIdx.x%8) < 4) { + x1 = rhs_pf0.y; + x2 = rhs_pf0.w; + } else { + x1 = rhs_pf0.x; + x2 = rhs_pf0.z; + } + #if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000) + x1 = __shfl_xor(x1, 4); + x2 = __shfl_xor(x2, 4); + #else + x1 = __shfl_xor_sync(0xFFFFFFFF, x1, 4); + x2 = __shfl_xor_sync(0xFFFFFFFF, x2, 4); + #endif + if((threadIdx.x%8) < 4) { + rhs_pf0.y = x1; + rhs_pf0.w = x2; + } else { + rhs_pf0.x = x1; + rhs_pf0.z = x2; + } + + // We have 64 features. + // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1. + // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3. + // ... + // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63 + // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1 + // ... 
+ rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2][threadIdx.x%8] = make_float2(rhs_pf0.x, rhs_pf0.y); + rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2+32][threadIdx.x%8] = make_float2(rhs_pf0.z, rhs_pf0.w); + + // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // ... + // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) + // ... + + lhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(lhs_pf0.x, lhs_pf0.y); + lhs_shmem2[threadIdx.y+16][threadIdx.x] = make_float2(lhs_pf0.z, lhs_pf0.w); + + +#define add_vals(fl1, fl2, fr1, fr2)\ + results[0].x += fl1.x * fr1.x;\ + results[0].y += fl1.y * fr1.x;\ + results[0].z += fl2.x * fr1.x;\ + results[0].w += fl2.y * fr1.x;\ +\ + results[1].x += fl1.x * fr1.y;\ + results[1].y += fl1.y * fr1.y;\ + results[1].z += fl2.x * fr1.y;\ + results[1].w += fl2.y * fr1.y;\ +\ + results[2].x += fl1.x * fr2.x;\ + results[2].y += fl1.y * fr2.x;\ + results[2].z += fl2.x * fr2.x;\ + results[2].w += fl2.y * fr2.x;\ +\ + results[3].x += fl1.x * fr2.y;\ + results[3].y += fl1.y * fr2.y;\ + results[3].z += fl2.x * fr2.y;\ + results[3].w += fl2.y * fr2.y;\ + + __syncthreads(); + + // Do the multiplies. + #pragma unroll + for (int koff = 0; koff < 16; koff ++) { + // 32 x threads. 
+ float2 fl1 = lhs_shmem2[koff][threadIdx.x]; + float2 fl2 = lhs_shmem2[koff + 16][threadIdx.x]; + + int start_feature = threadIdx.y * 4; + float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; + float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; + + add_vals(fl1, fl2, fr1, fr2) + } + __syncthreads(); + } + +#undef prefetch_lhs +#undef add_vals + + Index horiz_base = threadIdx.y*4+base_n; + if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (!CHECK_RHS_BOUNDARY) { + // CHECK LHS + if (lhs_vert + 3 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (lhs_vert + 2 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + } + } else if (lhs_vert + 1 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + } + } else if (lhs_vert < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + } + } + } else if (!CHECK_LHS_BOUNDARY) { + // CHECK RHS + /* + int ncols_rem = fminf(n_size- horiz_base, 4); + for (int i = 0; i < ncols_rem; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + 
}*/ + for (int i = 0; i < 4; i++) { + if (horiz_base+i < n_size) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } else { + // CHECK both boundaries. + for (int i = 0; i < 4; i++) { + if (horiz_base+i < n_size) { + if (lhs_vert < m_size) + output(lhs_vert, horiz_base + i) = results[i].x; + if (lhs_vert + 1 < m_size) + output(lhs_vert + 1, horiz_base + i) = results[i].y; + if (lhs_vert + 2 < m_size) + output(lhs_vert + 2, horiz_base + i) = results[i].z; + if (lhs_vert + 3 < m_size) + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } +} + + +template +__device__ __forceinline__ void +EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, float2 lhs_shmem2[][32], + float2 rhs_shmem2[][8], const Index m_size, + const Index n_size, const Index k_size, + const Index base_m, const Index base_n) { + + // prefetch registers + float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3; + float4 rhs_pf0, rhs_pf1; + + float4 results[8]; + for (int i=0; i < 8; i++) { + results[i].x = results[i].y = results[i].z = results[i].w = 0; + } + + Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32; + for (Index k = 0; k < k_size; k += 32) { + lhs_pf0 = internal::pset1(0); + lhs_pf1 = internal::pset1(0); + lhs_pf2 = internal::pset1(0); + lhs_pf3 = internal::pset1(0); + + rhs_pf0 = internal::pset1(0); + rhs_pf1 = internal::pset1(0); + + if (!CHECK_LHS_BOUNDARY) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf3 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0 =lhs.template loadPacket(lhs_vert, 
(threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + } + } else { + // just CHECK_LHS_BOUNDARY + if (lhs_vert + 3 < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf3 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); + } + } else if (lhs_vert + 2 < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); + lhs_pf3.y =lhs(lhs_vert + 1, 
(threadIdx.y/4+k+24)); + lhs_pf3.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + } + } else if (lhs_vert + 1 < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); + lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, 
(threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + } + } else if (lhs_vert < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + } + } + } + __syncthreads(); + Index rhs_vert = k+threadIdx.x*4; + Index rhs_horiz0 = threadIdx.y*2+base_n; + Index rhs_horiz1 = threadIdx.y*2+1+base_n; + if (!CHECK_RHS_BOUNDARY) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.template loadPacket(rhs_vert, rhs_horiz1); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + } else if (rhs_vert + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf1.x = 
rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + } + } else { + if (rhs_horiz1 < n_size) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.template loadPacket(rhs_vert, rhs_horiz1); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + } else if (k+threadIdx.x*4 + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + } else if (k+threadIdx.x*4 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + } + } else if (rhs_horiz0 < n_size) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); + } else if ((rhs_vert + 2) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if ((rhs_vert + 1) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } + } + __syncthreads(); + // Loaded. Do computation + // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1. + // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3. + // .. + // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63 + rhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(rhs_pf0.x, rhs_pf1.x); + // Row 32 -> times (1, 5, 9, .. 
29) for features 0, 1. + // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3. + // .. + rhs_shmem2[threadIdx.y+32][threadIdx.x] = make_float2(rhs_pf0.y, rhs_pf1.y); + // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1. + // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3. + rhs_shmem2[threadIdx.y+64][threadIdx.x] = make_float2(rhs_pf0.z, rhs_pf1.z); + // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1. + // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3. + rhs_shmem2[threadIdx.y+96][threadIdx.x] = make_float2(rhs_pf0.w, rhs_pf1.w); + + // LHS. + // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) + // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) + // ... + // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) + // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) + + +#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\ + results[0].x += a_feat1.x * f1.x;\ + results[1].x += a_feat1.x * f1.y;\ + results[2].x += a_feat1.x * f2.x;\ + results[3].x += a_feat1.x * f2.y;\ + results[4].x += a_feat1.x * f3.x;\ + results[5].x += a_feat1.x * f3.y;\ + results[6].x += a_feat1.x * f4.x;\ + results[7].x += a_feat1.x * f4.y;\ +\ + results[0].y += a_feat1.y * f1.x;\ + results[1].y += a_feat1.y * f1.y;\ + results[2].y += a_feat1.y * f2.x;\ + results[3].y += a_feat1.y * f2.y;\ + results[4].y += a_feat1.y * f3.x;\ + results[5].y += a_feat1.y * f3.y;\ + results[6].y += a_feat1.y * f4.x;\ + results[7].y += a_feat1.y * f4.y;\ +\ + results[0].z += a_feat2.x * f1.x;\ + results[1].z += a_feat2.x * f1.y;\ + results[2].z += a_feat2.x * f2.x;\ + results[3].z += a_feat2.x * f2.y;\ + results[4].z += a_feat2.x * f3.x;\ + results[5].z += a_feat2.x * f3.y;\ + results[6].z += a_feat2.x * f4.x;\ + results[7].z += a_feat2.x * f4.y;\ +\ + results[0].w += a_feat2.y * f1.x;\ + results[1].w += a_feat2.y * f1.y;\ + 
results[2].w += a_feat2.y * f2.x;\ + results[3].w += a_feat2.y * f2.y;\ + results[4].w += a_feat2.y * f3.x;\ + results[5].w += a_feat2.y * f3.y;\ + results[6].w += a_feat2.y * f4.x;\ + results[7].w += a_feat2.y * f4.y;\ + + lhs_shmem2[threadIdx.y/4][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y); + lhs_shmem2[threadIdx.y/4+8][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y); + lhs_shmem2[threadIdx.y/4+16][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.x, lhs_pf2.y); + lhs_shmem2[threadIdx.y/4+24][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y); + + lhs_shmem2[threadIdx.y/4 + 32][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w); + lhs_shmem2[threadIdx.y/4 + 40][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w); + lhs_shmem2[threadIdx.y/4 + 48][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w); + lhs_shmem2[threadIdx.y/4 + 56][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w); + + __syncthreads(); + + // Do the multiplies. + #pragma unroll + for (int koff = 0; koff < 32; koff ++) { + float2 a3 = lhs_shmem2[koff][threadIdx.x + (threadIdx.y % 4) * 8]; + float2 a4 = lhs_shmem2[koff + 32][threadIdx.x + (threadIdx.y % 4) * 8]; + + // first feature is at (threadIdx.y/4) * 8 last is at start + 8. 
+ int start_feature = (threadIdx.y / 4) * 8; + + float2 br1 = rhs_shmem2[start_feature/2 + (koff % 4) * 32][koff/4]; + float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4]; + float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4]; + float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4]; + + add_vals(a3, a4, br1, br2, br3, br4) + } + __syncthreads(); + } // end loop over k + + __syncthreads(); + Index horiz_base = (threadIdx.y/4)*8+base_n; + if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (!CHECK_RHS_BOUNDARY) { + if (lhs_vert + 3 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (lhs_vert + 2 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + } + } else if (lhs_vert + 1 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + } + } else if (lhs_vert < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + } + } + } else if (!CHECK_LHS_BOUNDARY) { + // CHECK BOUNDARY_B + for (int i = 0; i < 8; i++) { + if (horiz_base + i < n_size) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } 
+ } + } else { + // CHECK both boundaries. + for (int i = 0; i < 8; i++) { + if (horiz_base + i < n_size) { + if (lhs_vert < m_size) + output(lhs_vert, horiz_base + i) = results[i].x; + if (lhs_vert + 1 < m_size) + output(lhs_vert + 1, horiz_base + i) = results[i].y; + if (lhs_vert + 2 < m_size) + output(lhs_vert + 2, horiz_base + i) = results[i].z; + if (lhs_vert + 3 < m_size) + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } +} + + +template +__global__ void +#if defined(EIGEN_HIPCC) +__launch_bounds__(256, 1) +#else +__launch_bounds__(256) +#endif +EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ float2 lhs_shmem[64*32]; + __shared__ float2 rhs_shmem[128*8]; + + typedef float2 LHS_MEM[64][32]; + typedef float2 RHS_MEM[128][8]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 128 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + bool check_rhs = (base_n + 63) >= n_size; + bool check_lhs128 = (base_m + 127) >= m_size; + + if (!check_rhs) { + if (!check_lhs128) { + // >= 128 rows left + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } + } else { + if (!check_lhs128) { + // >= 128 rows left + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } + } +} + +template +__global__ void +#if defined(EIGEN_HIPCC) +__launch_bounds__(256, 1) 
+#else +__launch_bounds__(256) +#endif +EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ float2 lhs_shmem[32][16]; + __shared__ float2 rhs_shmem[64][8]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + if (base_m + 63 < m_size) { + if (base_n + 63 < n_size) { + EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } + } else { + if (base_n + 63 < n_size) { + EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } + } +} + + +template +struct TensorEvaluator, GpuDevice> : + public TensorContractionEvaluatorBase, GpuDevice> > { + + typedef GpuDevice Device; + + typedef TensorEvaluator, Device> Self; + typedef TensorContractionEvaluatorBase Base; + + typedef TensorContractionOp XprType; + typedef typename internal::remove_const::type Scalar; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + enum { + Layout = TensorEvaluator::Layout, + }; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. 
+ typedef typename internal::conditional< + static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size::Dimensions>::value; + static const int RDims = + internal::array_size::Dimensions>::value; + static const int ContractDims = internal::array_size::value; + + typedef array left_dim_mapper_t; + typedef array right_dim_mapper_t; + + typedef array contract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; + + static const int NumDims = LDims + RDims - 2 * ContractDims; + + typedef DSizes Dimensions; + + // typedefs needed in evalTo + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; + + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + + typedef typename LeftEvaluator::Dimensions LeftDimensions; + typedef typename RightEvaluator::Dimensions RightDimensions; + + TensorEvaluator(const XprType& op, const Device& device) : + Base(op, device) + { + EIGEN_STATIC_ASSERT( (internal::is_same::value), + GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS); + } + + // We need to redefine this method to make nvcc happy + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + this->m_leftImpl.evalSubExprsIfNeeded(NULL); + this->m_rightImpl.evalSubExprsIfNeeded(NULL); + if (data) { + evalTo(data); + return false; + } else { + this->m_result = static_cast(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar))); + evalTo(this->m_result); + return true; + } + } + + void evalTo(Scalar* buffer) const { + if (this->m_lhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + else { + if 
(this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + } + else { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + } + } + + template struct LaunchKernels { + static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) { + const Index m_blocks = (m + 63) / 64; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 8, 8); + LAUNCH_GPU_KERNEL((EigenContractionKernel), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); + } + }; + + template struct LaunchKernels { + static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) { + if (m < 768 || n < 768) { + const Index m_blocks = (m + 63) / 64; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(16, 16, 1); + LAUNCH_GPU_KERNEL((EigenFloatContractionKernel16x16), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); + } else { + const Index m_blocks = (m + 127) / 128; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 32, 1); + LAUNCH_GPU_KERNEL((EigenFloatContractionKernel), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); + } + } + }; + + template + void evalTyped(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + EIGEN_UNUSED_VARIABLE(k) + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * 
sizeof(Scalar) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + + typedef internal::TensorContractionInputMapper LhsMapper; + + typedef internal::TensorContractionInputMapper RhsMapper; + + typedef internal::blas_data_mapper OutputMapper; + + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + +#if defined(EIGEN_USE_HIP) + setGpuSharedMemConfig(hipSharedMemBankSizeEightByte); +#else + setGpuSharedMemConfig(cudaSharedMemBankSizeEightByte); +#endif + + LaunchKernels::Run(lhs, rhs, output, m, n, k, this->m_device); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_USE_GPU and EIGEN_GPUCC +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionMapper.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionMapper.h new file mode 100644 index 0000000..9ab900b --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionMapper.h @@ -0,0 +1,575 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H + +namespace Eigen { + +namespace internal { + +enum { + Rhs = 0, + Lhs = 1 +}; + +/* + * Implementation of the Eigen blas_data_mapper class for tensors. + */ +/// The make pointer class is used by sycl in order to build the mapper class on the device. 
For other platform the default make pointer is used which +/// is scalar * for CoeffLoader. +template class MakePointer_ = MakePointer> +struct CoeffLoader; + +template class MakePointer_ = MakePointer> +class BaseTensorContractionMapper; + +template class MakePointer_> +struct CoeffLoader { + enum { + DirectOffsets = false + }; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_tensor(tensor) { } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index) { + eigen_assert(false && "unsupported"); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename MakePointer_::Type + data() const { + eigen_assert(false && "unsupported"); + return NULL; + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return m_tensor.coeff(index); } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename Tensor::PacketReturnType packet(typename Tensor::Index index) const + { + return m_tensor.template packet(index); + } + + #ifdef EIGEN_USE_SYCL + // The placeholder accessors require to be bound to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_tensor.bind(cgh); + } + #endif + + private: + const Tensor m_tensor; +}; + +template class MakePointer_> +struct CoeffLoader { + enum { + DirectOffsets = true + }; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_data(tensor.data()) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) { + m_data += offset; + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename MakePointer_::Type + data() const { + return m_data; + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return loadConstant(m_data+index); } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename Tensor::PacketReturnType packet(typename Tensor::Index 
index) const + { + return internal::ploadt_ro(m_data + index); + } + + #ifdef EIGEN_USE_SYCL + // The placeholder accessors require to be bound to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_data.bind(cgh); + } + #endif + private: + typedef typename Tensor::Scalar Scalar; + + typename MakePointer_::Type m_data; +}; + +template class MakePointer_ = MakePointer> +class SimpleTensorContractionMapper { + public: + EIGEN_DEVICE_FUNC + SimpleTensorContractionMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) : + m_tensor(tensor), + m_nocontract_strides(nocontract_strides), + m_ij_strides(ij_strides), + m_contract_strides(contract_strides), + m_k_strides(k_strides) { } + + enum { + DirectOffsets = CoeffLoader::DirectOffsets + }; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) { + m_tensor.offsetBuffer(offset); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE void prefetch(Index /*i*/) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar operator()(Index row) const { + // column major assumption + return operator()(row, 0); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar operator()(Index row, Index col) const { + return m_tensor.coeff(computeIndex(row, col)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const { + const bool left = (side == Lhs); + EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963 + Index nocontract_val = left ? 
row : col; + Index linidx = 0; + EIGEN_UNROLL_LOOP + for (int i = static_cast(array_size::value) - 1; i > 0; i--) { + const Index idx = nocontract_val / m_ij_strides[i]; + linidx += idx * m_nocontract_strides[i]; + nocontract_val -= idx * m_ij_strides[i]; + } + if (array_size::value > array_size::value) { + if (side == Lhs && inner_dim_contiguous) { + eigen_assert(m_nocontract_strides[0] == 1); + linidx += nocontract_val; + } else { + linidx += nocontract_val * m_nocontract_strides[0]; + } + } + + Index contract_val = left ? col : row; + if(array_size::value > 0) { + EIGEN_UNROLL_LOOP + for (int i = static_cast(array_size::value) - 1; i > 0; i--) { + const Index idx = contract_val / m_k_strides[i]; + linidx += idx * m_contract_strides[i]; + contract_val -= idx * m_k_strides[i]; + } + + if (side == Rhs && inner_dim_contiguous) { + eigen_assert(m_contract_strides[0] == 1); + linidx += contract_val; + } else { + linidx += contract_val * m_contract_strides[0]; + } + } + + return linidx; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE IndexPair computeIndexPair(Index row, Index col, const Index distance) const { + const bool left = (side == Lhs); + EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963 + Index nocontract_val[2] = {left ? row : col, left ? 
row + distance : col}; + Index linidx[2] = {0, 0}; + if (array_size::value > array_size::value) { + EIGEN_UNROLL_LOOP + for (int i = static_cast(array_size::value) - 1; i > 0; i--) { + const Index idx0 = nocontract_val[0] / m_ij_strides[i]; + const Index idx1 = nocontract_val[1] / m_ij_strides[i]; + linidx[0] += idx0 * m_nocontract_strides[i]; + linidx[1] += idx1 * m_nocontract_strides[i]; + nocontract_val[0] -= idx0 * m_ij_strides[i]; + nocontract_val[1] -= idx1 * m_ij_strides[i]; + } + if (side == Lhs && inner_dim_contiguous) { + eigen_assert(m_nocontract_strides[0] == 1); + linidx[0] += nocontract_val[0]; + linidx[1] += nocontract_val[1]; + } else { + linidx[0] += nocontract_val[0] * m_nocontract_strides[0]; + linidx[1] += nocontract_val[1] * m_nocontract_strides[0]; + } + } + + Index contract_val[2] = {left ? col : row, left ? col : row + distance}; + if (array_size::value> 0) { + EIGEN_UNROLL_LOOP + for (int i = static_cast(array_size::value) - 1; i > 0; i--) { + const Index idx0 = contract_val[0] / m_k_strides[i]; + const Index idx1 = contract_val[1] / m_k_strides[i]; + linidx[0] += idx0 * m_contract_strides[i]; + linidx[1] += idx1 * m_contract_strides[i]; + contract_val[0] -= idx0 * m_k_strides[i]; + contract_val[1] -= idx1 * m_k_strides[i]; + } + + if (side == Rhs && inner_dim_contiguous) { + eigen_assert(m_contract_strides[0] == 1); + linidx[0] += contract_val[0]; + linidx[1] += contract_val[1]; + } else { + linidx[0] += contract_val[0] * m_contract_strides[0]; + linidx[1] += contract_val[1] * m_contract_strides[0]; + } + } + return IndexPair(linidx[0], linidx[1]); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index firstAligned(Index size) const { + // Only claim alignment when we can compute the actual stride (ie when we're + // dealing with the lhs with inner_dim_contiguous. This is because the + // matrix-vector product relies on the stride when dealing with aligned inputs. + return (Alignment == Aligned) && (side == Lhs) && inner_dim_contiguous ? 
0 : size; + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index stride() const { + return ((side == Lhs) && inner_dim_contiguous && array_size::value > 0) ? m_contract_strides[0] : 1; + } + + #ifdef EIGEN_USE_SYCL + // The placeholder accessors require to be bound to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_tensor.bind(cgh); + } + #endif + + const CoeffLoader& tensor() const { + return m_tensor; + } + + const nocontract_t& nocontract_strides() const { + return m_nocontract_strides; + } + const nocontract_t& ij_strides() const { return m_ij_strides; } + const contract_t& contract_strides() const { return m_contract_strides; } + const contract_t& k_strides() const { return m_k_strides; } + + protected: + CoeffLoader m_tensor; + const nocontract_t m_nocontract_strides; + const nocontract_t m_ij_strides; + const contract_t m_contract_strides; + const contract_t m_k_strides; +}; + +template class MakePointer_> +class BaseTensorContractionMapper : public SimpleTensorContractionMapper +{ + public: + typedef SimpleTensorContractionMapper ParentMapper; + + EIGEN_DEVICE_FUNC + BaseTensorContractionMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) : + ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename internal::enable_if::size==packet_size,PacketT>::type + load(Index i, Index j) const + { + // whole method makes column major assumption + + // don't need to add offsets for now (because operator handles that) + // current code assumes packet size must be a multiple of 2 + EIGEN_STATIC_ASSERT(packet_size % 2 == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + + if (Tensor::PacketAccess && inner_dim_contiguous && !inner_dim_reordered) { + const Index index = this->computeIndex(i, j); + 
eigen_assert(this->computeIndex(i+packet_size-1, j) == index + packet_size-1); + return this->m_tensor.template packet(index); + } + + const IndexPair indexPair = this->computeIndexPair(i, j, packet_size - 1); + const Index first = indexPair.first; + const Index lastIdx = indexPair.second; + + // We can always do optimized packet reads from left hand side right now, because + // the vertical matrix dimension on the left hand side is never contracting. + // On the right hand side we need to check if the contracting dimensions may have + // been shuffled first. + if (Tensor::PacketAccess && + (side == Lhs || internal::array_size::value <= 1 || !inner_dim_reordered) && + (lastIdx - first) == (packet_size - 1)) { + + return this->m_tensor.template packet(first); + } + + EIGEN_ALIGN_MAX Scalar data[packet_size]; + + data[0] = this->m_tensor.coeff(first); + EIGEN_UNROLL_LOOP + for (Index k = 1; k < packet_size - 1; k += 2) { + const IndexPair internal_pair = this->computeIndexPair(i + k, j, 1); + data[k] = this->m_tensor.coeff(internal_pair.first); + data[k + 1] = this->m_tensor.coeff(internal_pair.second); + } + data[packet_size - 1] = this->m_tensor.coeff(lastIdx); + + return pload(data); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename internal::enable_if::size!=packet_size,PacketT>::type + load(Index i, Index j) const + { + const Index requested_packet_size = internal::unpacket_traits::size; + EIGEN_ALIGN_MAX Scalar data[requested_packet_size]; + + const IndexPair indexPair = this->computeIndexPair(i, j, requested_packet_size - 1); + const Index first = indexPair.first; + const Index lastIdx = indexPair.second; + + data[0] = this->m_tensor.coeff(first); + for (Index k = 1; k < requested_packet_size - 1; k += 2) { + const IndexPair internal_pair = this->computeIndexPair(i + k, j, 1); + data[k] = this->m_tensor.coeff(internal_pair.first); + data[k + 1] = this->m_tensor.coeff(internal_pair.second); + } + data[requested_packet_size - 1] = 
this->m_tensor.coeff(lastIdx); + + return pload(data); + } + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const { + return this->load(i,j); + } +}; + + +template class MakePointer_> +class BaseTensorContractionMapper + : public SimpleTensorContractionMapper +{ + public: + typedef SimpleTensorContractionMapper ParentMapper; + + EIGEN_DEVICE_FUNC + BaseTensorContractionMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) : + ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } + + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const { + EIGEN_ALIGN_MAX Scalar data[1]; + data[0] = this->m_tensor.coeff(this->computeIndex(i, j)); + return pload(data); + } + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE PacketT load(Index i, Index j) const { + EIGEN_ALIGN_MAX Scalar data[1]; + data[0] = this->m_tensor.coeff(this->computeIndex(i, j)); + return pload(data); + } +}; + + +template class MakePointer_=MakePointer> +class TensorContractionSubMapper { + public: + + typedef BaseTensorContractionMapper ParentMapper; + typedef TensorContractionSubMapper Self; + typedef Self LinearMapper; + + enum { + // We can use direct offsets iff the parent mapper supports then and we can compute the strides. + // TODO: we should also enable direct offsets for the Rhs case. + UseDirectOffsets = ParentMapper::DirectOffsets && (side == Lhs) && inner_dim_contiguous && (array_size::value > 0) + }; + + EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset) + : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) { + // Bake the offsets into the buffer used by the base mapper whenever possible. 
This avoids the need to recompute + // this offset every time we attempt to access a coefficient. + if (UseDirectOffsets) { + Index stride = m_base_mapper.stride(); + m_base_mapper.offsetBuffer(vert_offset + horiz_offset * stride); + } + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { + if (UseDirectOffsets) { + return m_base_mapper(i, 0); + } + return m_base_mapper(i + m_vert_offset, m_horiz_offset); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const { + if (UseDirectOffsets) { + return m_base_mapper(i, j); + } + return m_base_mapper(i + m_vert_offset, j + m_horiz_offset); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i) const { + if (UseDirectOffsets) { + return m_base_mapper.template loadPacket(i, 0); + } + return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i, Index j) const { + if (UseDirectOffsets) { + return m_base_mapper.template loadPacket(i, j); + } + return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i, Index j) const { + if (UseDirectOffsets) { + return m_base_mapper.template load(i, j); + } + return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketT& p) const { + if (UseDirectOffsets) { + m_base_mapper.storePacket(i, 0, p); + } + m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + if (UseDirectOffsets) { + return LinearMapper(m_base_mapper, i, j); + } + return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) 
const { + EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MADE_A_PROGRAMMING_MISTAKE); + const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ? Aligned : Unaligned; + if (UseDirectOffsets) { + return m_base_mapper.template loadPacket(i, 0); + } + return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool aligned(Index) const { + return false; + } + + #ifdef EIGEN_USE_SYCL + // The placeholder accessors require to be bound to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_base_mapper.bind(cgh); + } + #endif + + const ParentMapper& base_mapper() const { return m_base_mapper; } + Index vert_offset() const { return m_vert_offset; } + Index horiz_offset() const { return m_horiz_offset; } + + private: + ParentMapper m_base_mapper; + const Index m_vert_offset; + const Index m_horiz_offset; +}; + + +template class MakePointer_=MakePointer> +class TensorContractionInputMapper + : public BaseTensorContractionMapper { + + public: + typedef Scalar_ Scalar; + typedef BaseTensorContractionMapper Base; + typedef TensorContractionSubMapper SubMapper; + typedef SubMapper VectorMapper; + + EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor, + const nocontract_t& nocontract_strides, + const nocontract_t& ij_strides, + const contract_t& contract_strides, + const contract_t& k_strides) + : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const { + return SubMapper(*this, i, j); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { + return VectorMapper(*this, i, j); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const CoeffLoader& get_tensor() const { + return Base::m_tensor; + } +}; + + +template struct 
TensorContractionInputMapperTrait; + +template class MakePointer_> +struct TensorContractionInputMapperTrait > { + + typedef Tensor_ XprType; + static const bool inner_dim_contiguous = inner_dim_contiguous_; + static const bool inner_dim_reordered = inner_dim_reordered_; + }; + + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionSycl.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionSycl.h new file mode 100755 index 0000000..473c228 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionSycl.h @@ -0,0 +1,1650 @@ +// This file is part of Eigen, a lightweight C++ template library for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla Public License v. 2.0. If a copy of the MPL was not +// distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +/***************************************************************** + * TensorContractionSycl.h + * + * \brief: + * TensorContractionSycl.h, provides various tensor contraction kernel for SYCL backend + * + *****************************************************************/ + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H + +namespace Eigen { + +namespace TensorSycl { +namespace internal { + +#ifndef EIGEN_SYCL_DISABLE_GEMV +/*! + * \brief TVPanelSize, a template class used for setting the panel size required for launching General TensorVector + * contraction kernel on various hardware devices. + * + * \tparam Scalar: determines the element type of the tensor/vector + * + * \tparam StorageIndex determines the Index type. 
+ * + * \tparam NCWindow: determines the number of non-contracting element to be process by each work-group + * + * \tparam CFactor: determines the number of contracting element to be process by each thread + * + * \tparam NCFactor: determines the number of non-contracting element to be process by each thread + */ +template +struct TVPanelSize { + // LocalThreadSizeC: determines total number of threads per workgroup for the contracting dimension + static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeC = EIGEN_SYCL_LOCAL_THREAD_DIM0; + // LocalThreadSizeNC: determines total number of threads per workgroup for the non-contracting dimension + static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC = EIGEN_SYCL_LOCAL_THREAD_DIM1; + // TileSizeDimNC: determines the tile size for the non-contracting dimension + static EIGEN_CONSTEXPR StorageIndex TileSizeDimNC = NCWindow / NCFactor; + // TileSizeDimC: determines the tile size for the contracting dimension + static EIGEN_CONSTEXPR StorageIndex TileSizeDimC = CFactor * LocalThreadSizeNC * LocalThreadSizeC; + // WorkLoadPerThreadNC : determines workload per thread for loading the non-contracting dimension + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC = TileSizeDimNC / LocalThreadSizeNC; + // WorkLoadPerThreadC: determines workload per thread for loading the contracting dimension (TileSizeDimC split across LocalThreadSizeC threads) + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadC = TileSizeDimC / LocalThreadSizeC; + // BC : determines if supporting bank conflict is required + static EIGEN_CONSTEXPR bool BC = false; +}; +#endif + +/*! + * \brief TTPanelSize, a template class used for setting the panel size required for launching General Tensor Tensor + contraction kernel on various hardware devices. + * + * \tparam Scalar: determines the element type of the tensor + * + * \tparam StorageIndex: determines the Index type.
+ * + * \tparam REG_SIZE_M: determines workload per thread for loading the M dimension This can be varied based on the + available register on a chosen device(can be controlled by EIGEN_SYCL_REG_M macro). + * + * \tparam REG_SIZE_N: determines workload per thread for loading the N dimension This can be varied based on the + available register on a chosen device(can be controlled by EIGEN_SYCL_REG_N macro). + * + * \tparam TSDK: determines Tile size for dimension K. The packet size is assumed to be considered + */ + +template +struct TTPanelSize { + // TileSizeDimK: determines Tile size for dimension K. The packet size is assumed to be considered + static EIGEN_CONSTEXPR StorageIndex TileSizeDimK = TSDK; + // WorkLoadPerThreadM : determines workload per thread for loading the M dimension This can be varied based on the + // available register on a chosen device(can be controlled by EIGEN_SYCL_REG_M macro// +#ifndef EIGEN_SYCL_REG_M + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = REG_SIZE_M; +#else + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = EIGEN_SYCL_REG_M; +#endif +// WorkLoadPerThreadN : determines workload per thread for loading the N dimension This can be varied based on the +// available register on a chosen device(can be controlled by EIGEN_SYCL_REG_N macro +#ifndef EIGEN_SYCL_REG_N + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = REG_SIZE_N; +#else + static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = EIGEN_SYCL_REG_N; +#endif + // LocalThreadSizeM: determines total number of thread per workgroup for the m dimension + static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeM = EIGEN_SYCL_LOCAL_THREAD_DIM0; + // LocalThreadSizeN: determines total number of thread per workgroup for the n dimension + static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeN = EIGEN_SYCL_LOCAL_THREAD_DIM1; + // TileSizeDimM: determines the tile size for the m dimension + static EIGEN_CONSTEXPR StorageIndex TileSizeDimM = LocalThreadSizeM * 
WorkLoadPerThreadM; + // TileSizeDimN: determines the tile size for the n dimension + static EIGEN_CONSTEXPR StorageIndex TileSizeDimN = LocalThreadSizeN * WorkLoadPerThreadN; + // LoadPerThreadLhs: determines workload per thread for loading Lhs Tensor. This must be divisable by packetsize + static EIGEN_CONSTEXPR StorageIndex LoadPerThreadLhs = + ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimN)); + // LoadPerThreadRhs: determines workload per thread for loading Rhs Tensor. This must be divisable by packetsize + static EIGEN_CONSTEXPR StorageIndex LoadPerThreadRhs = + ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimM)); + // BC : determines if supporting bank conflict is required + static EIGEN_CONSTEXPR bool BC = true; + // DoubleBuffer: determines if double buffering technique should be used (This can be disabled by + // EIGEN_SYCL_DISABLE_DOUBLE_BUFFER macro when the device doesnot have sufficient local memory) + static EIGEN_CONSTEXPR bool DoubleBuffer = +#ifdef EIGEN_SYCL_DISABLE_DOUBLE_BUFFER + false; +#else + true; +#endif +}; + +/* ! + * \brief contraction_type: an enum class representing the Tensor Contraction implementation algorithm. This is used to + * specialize the contraction algorithm based on device support for dedicated local memory. + */ +enum class contraction_type { local, no_local }; +/* ! + * \brief data_source an enum class determining the location of the data in a memory hierarchy (global, local, private). + */ +enum class data_source { global_mem, local_mem, private_mem }; + +/*! + * \brief read, a template function used for loading the data from global + memory. 
This function is used to guarantee coalesced and vectorized load whenever possible + * + * \tparam PacketLoad: determines if the each element of this tensor block should be loaded in a packet mode + * + * \param is_coalesced_layout: determines whether or not the Tensor data in a memory can be access coalesced and + vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the + contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case + when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed. + * + * \tparam PacketType: determines the type of packet + * + * \tparam TensorMapper: determines the input tensor mapper type + * + * \tparam StorageIndex: determines the Index type + + * \param tensorMapper: is the input tensor + * + * \param NCIndex: is the non-contracting dim index + * + * \param CIndex is the contracting dim index + * + * \param ld: is the leading dimension of the flattened tensor + */ +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if::type read( + const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &ld) { + const StorageIndex row = (is_coalesced_layout) ? NCIndex : CIndex; + const StorageIndex col = (is_coalesced_layout) ? CIndex : NCIndex; + return tensorMapper.get_tensor().template packet(row + (col * ld)); +} + +/*! + * \brief read, special overload of read function, when the read access is not vectorized + * + * \tparam PacketLoad: determines if the each element of this tensor block should be loaded in a packet mode + * + * \param is_coalesced_layout: determines whether or not the Tensor data in a memory can be access coalesced and + vectorized when possible. Coalesced memory access is a key factor in Kernel performance. 
When a tensor is 2d and the + contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case + when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed. + * + * \tparam PacketType: determines the type of packet + * + * \tparam TensorMapper: determines the input tensor mapper type + * + * \tparam StorageIndex: determines the Index type + + * \param tensorMapper: is the input tensor + * + * \param NCIndex: is the non-contracting dim index + * + * \param CIndex: is the contracting dim index + */ +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if::type read( + const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &) { + const StorageIndex row = (IsRhs) ? CIndex : NCIndex; + const StorageIndex col = (IsRhs) ? NCIndex : CIndex; + return tensorMapper(row, col); +} + +/*! + * \brief write, a template function used for storing the data to local memory. This function is used to guarantee + * coalesced and vectorized store whenever possible. + * + * \tparam StorageIndex: determines the Index type + * + * \param ld is the leading dimension of the local memory. ld is a compile time value for the local memory + * + * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy. + * + * \tparam PacketType: determines the type of packet + * + * \tparam DataScalar: determines the output data type + * + * \param packet_data: the data to be written in the local memory + * + * \param ptr: a pointer to the local memory + * + * \param CIndex is the contracting dim index + */ + +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if
::type + write(PacketType &packet_data, DataScalar ptr) { + EIGEN_CONSTEXPR int PacketSize = Eigen::internal::unpacket_traits::size; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; i++) { + *ptr = PacketWrapper::scalarize(i, packet_data); + ptr += ld; + } +} + +/*! + * \brief Overloading the write function for storing the data to global memory, when vectorization enabled This function + * is used to guarantee coalesced and vectorized store whenever possible. + * + * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy. + * + * \tparam PacketType: determines the type of packet + * + * \tparam DataScalar: determines the output data type + * + * \param packet_data: the data to be written in the local memory + * + * \param ptr: a pointer to the local memory + */ + +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if< + Eigen::internal::unpacket_traits::size != 1 && dt == data_source::global_mem, void>::type +write(PacketType &packet_data, DataScalar *ptr) { + ::Eigen::internal::pstoreu(ptr, packet_data); +} + +/*! + * \brief Overloading the write function for storing the data to global memory, when vectorization is disabled. + * + * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy. + * + * \tparam PacketType: determines the type of packet + * + * \tparam DataScalar: determines the output data type + * + * \param packet_data: the data to be written in the local memory + * + * \param ptr: a pointer to the local memory + */ +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if< + Eigen::internal::unpacket_traits::size == 1 && dt == data_source::global_mem, void>::type +write(PacketType &packet_data, DataScalar *ptr) { + *ptr = packet_data; +} + +/*! + * \brief check_boundary: is used to check the edge condition for non-internal blocks. 
+ * + * \tparam is_internal: determines if the block is internal + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary(bool) { + return true; +} + +/*! + * \brief check_boundary: specialization of the check_boundary for non-internal blocks. + * + * \param cond: true when the data is in range. Otherwise false + */ +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary(bool cond) { + return cond; +} + +/*! + * \brief BlockProperties is a template class that provides different characteristic of a block of each Tensor processed + * by each workgroup. + * + * \tparam is_transposed: iff true, determines whether or not the block of the Tensor is transposed + * + * \tparam packet_load_: determines if the each element of this tensor block should be loaded in a packet mode + * + * \tparam PacketType: determines the type of packet + * + * \tparam OutType: determines the type of each element for this block of tensor. If packet load is true, it will be + * packetType; Otherwise it will be scalar Type + * + * \param elements_per_access determines the size of each element based on OutType + * + * \param is_coalesced_layout determines whether or not the Tensor data in a memory can be access coalesced and + * vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the + * contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case + * when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed. 
+ * + * \param nc_stride determines the stride of non-contracting dimension to access the next adjustment element within the + * Tensor Block for each workgroup + * + * \param c_stride determines the stride of contracting dimension to access the next adjustment element within the + * Tensor Block for each workgroup + */ +template +struct BlockProperties { + static EIGEN_CONSTEXPR bool packet_load = packet_load_; + typedef typename Eigen::internal::unpacket_traits::type OutScalar; + static EIGEN_CONSTEXPR bool is_rhs = is_rhs_; + typedef typename Eigen::internal::conditional::type OutType; + static EIGEN_CONSTEXPR int elements_per_access = Eigen::internal::unpacket_traits::size; + static EIGEN_CONSTEXPR bool is_coalesced_layout = !(is_transposed ^ is_rhs); + static EIGEN_CONSTEXPR int nc_stride = (is_coalesced_layout ? elements_per_access : 1); + static EIGEN_CONSTEXPR int c_stride = (is_coalesced_layout ? 1 : elements_per_access); +}; + +/*! + * \brief ThreadProperties is a template class that provides each thread's properties within a workgroup. Please see + * the sycl-1.2.1 specification (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for the workgroup, + * work-items + * + * \tparam StorageIndex: determines the StorageIndex Type + * + * \param linearLocalThreadId: determines the linearized location of a thread within a work-group + * + * \param kGroupId: determines the logical group id in a k dimension of the flattened tensor. It will be > 1 when + * tall/skinny algorithm is used + * + * \param mGroupOffset: determines the logical start position of all thread within a workgroup for the m dimension of + * the flattened tensor. + * + * \param kGroupOffset determines the logical start position of all thread within a workgroup for the k dimension of the + * flattened tensor. It will be > 1 when tall/skinny algorithm is used. 
+ * + * \param mLocalOffset: determines the logical start position of each thread within a workgroup for the m dimension of a + * flattened tensor. The position determines the distance of each thread within the workgroup from each other + * independent from their global position. + * + * \param nLocalOffset: determines the logical start position of each thread within a workgroup for the n dimension of a + * flattened tensor. The position determines the distance of each thread within the workgroup from each other + * independent from their global position. + * + * \param mGlobalOffset: determines the logical start position of each thread for the m dimension on a + * flattened tensor + * + * \param nGlobalOffset: determines the logical start position of each thread for the n dimension on a + * flattened tensor + * + * \param kSize : determines the number of the k elements of the flattened Tensor to be processed by each thread for the + * given tensor block. This differs from the K dimension of the flattened Tensor when the Tall/Skinny algorithm is used. + * + * \param is_internal : determines whether the thread within the work-group computes an internal block of the tensor or + * the edge blocks. When it is internal, there is no need to check the boundaries and all the if statements can be + * resolved by the compiler.
+ */ +template +struct ThreadProperties { + const StorageIndex linearLocalThreadId; + const StorageIndex kGroupId; + const StorageIndex mGroupOffset; + const StorageIndex nGroupOffset; + const StorageIndex kGroupOffset; + const StorageIndex mLocalOffset; + const StorageIndex nLocalOffset; + const StorageIndex mGlobalOffset; + const StorageIndex nGlobalOffset; + /* kSize is the only non-const member: TensorContractionKernel::compute_tile_per_panel subtracts Properties::TileSizeDimK from it after each tile. */ + StorageIndex kSize; + const bool is_internal; + // Copies every argument into the member of the same name. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ThreadProperties( + const StorageIndex linearLocalThreadId_, const StorageIndex kGroupId_, const StorageIndex mGroupOffset_, + const StorageIndex nGroupOffset_, const StorageIndex kGroupOffset_, const StorageIndex mLocalOffset_, + const StorageIndex nLocalOffset_, const StorageIndex mGlobalOffset_, const StorageIndex nGlobalOffset_, + StorageIndex kSize_, const bool is_internal_) + : linearLocalThreadId(linearLocalThreadId_), + kGroupId(kGroupId_), + mGroupOffset(mGroupOffset_), + nGroupOffset(nGroupOffset_), + kGroupOffset(kGroupOffset_), + mLocalOffset(mLocalOffset_), + nLocalOffset(nLocalOffset_), + mGlobalOffset(mGlobalOffset_), + nGlobalOffset(nGlobalOffset_), + kSize(kSize_), + is_internal(is_internal_) {} +}; + +/*! + * \brief TensorContractionKernel is a template class that provides Tensor-Tensor contraction operation.
+ * + * \tparam OutScalar: determines the output scalar type + * + * \tparam LhsScalar: determines the left-hand-side scalar type + * + * \tparam RhsScalar: determines the right-hand-side scalar type + * + * \tparam OutAccessor: determines the sycl accessor type for output (please see the sycl-1.2.1 specification + (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition) + * + * \tparam LhsMapper determines the tensor contraction mapper type for left-hand-side matrix + * + * \tparam RhsMapper determines the tensor contraction mapper type for right-hand-side matrix + * + * \tparam StorageIndex: determines the StorageIndex Type + * + * \tparam Properties: determines the Contraction Panel properties + * + * \tparam TripleDim: determines the M, K, N dimensions for the flattened tensors in order to treat them as a matrix + * + * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression. + * + * \tparam input_mapper_properties : determines if the input tensors are matrices. If they are matrices, special memory + access is used to guarantee that the memory accesses are always coalesced. + * + * \tparam IsFinal : determines if this is the final kernel. If so, the result will be written in a final output. + Otherwise, the result of contraction will be written in a temporary buffer. This is the case when Tall/Skinny + contraction is used. So in this case, a final reduction step is required to compute final output.
+ + * \tparam contraction_tp: it is an enum value representing whether the local memory/no local memory implementation of + the algorithm to be used + * + * \param scratch: local memory containing tiles of LHS and RHS tensors for each work-group + * + * \param lhs: determines the left-hand-side flattened tensor (tensor mapper) + * + * \param rhs: determines the right-hand-side flattened tensor (tensor mapper) + * + * \param out_res: determines the output tensor containing the contraction result + * + * \param groupSizeM: a logical number determining the number of work-group for m dimension + * + * \param groupSizeN: a logical number determining the number of work-group for n dimension + * + * \param numTiles: determines total number of tiles on the k dimension + * + * \param triple_dim: determines the M, K, N dimensions for the flattened tensors in order to treat them as a matrix + */ +template +class TensorContractionKernel { + public: + typedef typename Eigen::TensorSycl::internal::Vectorise::PacketReturnType + PacketReturnType; + static EIGEN_CONSTEXPR int PacketSize = + Eigen::TensorSycl::internal::Vectorise::PacketSize; + static EIGEN_CONSTEXPR bool is_lhs_transposed = + !::Eigen::internal::TensorContractionInputMapperTrait::inner_dim_contiguous; + static EIGEN_CONSTEXPR bool is_rhs_transposed = + !::Eigen::internal::TensorContractionInputMapperTrait::inner_dim_contiguous; + + typedef BlockProperties + LHSBlockProperties; + + typedef BlockProperties + RHSBlockProperties; + + static EIGEN_CONSTEXPR StorageIndex NStride = + contraction_tp == contraction_type::local ? Properties::WorkLoadPerThreadN : RHSBlockProperties::nc_stride; + + typedef cl::sycl::accessor Scratch; + typedef cl::sycl::multi_ptr local_ptr; + typedef OutScalar * /*cl::sycl::multi_ptr*/ private_ptr; + typedef + typename ::Eigen::internal::conditional::type + tile_ptr; + static EIGEN_CONSTEXPR StorageIndex LSDL = contraction_tp == contraction_type::local + ?
Properties::TileSizeDimM + Properties::BC + : Properties::WorkLoadPerThreadM; + static EIGEN_CONSTEXPR StorageIndex LSDR = contraction_tp == contraction_type::local + ? Properties::TileSizeDimN + Properties::BC + : Properties::WorkLoadPerThreadN; + static EIGEN_CONSTEXPR StorageIndex LocalOffset = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN; + + /** + * \brief MemHolder this is a place holder struct for creating memory hierarchy in SYCL. Inside SYCL kernel it is not + * allowed to have dynamic memory allocation. While the local memory is created outside of the kernel and passed to + * the kernel as an accessor, the private memory can only be allocated statically. Since we are abstracting + * the TiledMemory for both local and private memory, the MemHolder structs is used as a helper to abstract out + * different type of memory needed when local/no_local memory computation is called. + * + * \tparam contraction_type: it is an enum value representing whether the local memory/no local memory implementation + of the algorithm to be used + * \tparam MemSize: the private memory size + * \param ptr the tile memory pointer type + */ + template + struct MemHolder { + tile_ptr ptr; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MemHolder(local_ptr block_start_ptr) : ptr(block_start_ptr) {} + }; + /** + * \brief specialization of MemHolder class when no local memory kernel is used. + */ + template + struct MemHolder { + OutScalar ptr[MemSize] = {OutScalar{0}}; + }; + /** + * \brief TiledMemory: contains required memory pointer for loading each tile of the TensorContraction panel from + * global memory to local/private memory when local/no_local algorithm used. + * + * \param lhs_scratch_extract : determines the LHS tile memory. It is either private or local memory based on the + * selected contraction_type. + * + * \param rhs_scratch_extract : determines the RHS tile memory. It is either private or local memory based on the + * selected contraction_type.
+ * + * \param lhs_extract_index: determines the position of each thread on a local memory for lhs input. When private + * memory is used this is set to zero as this is not applicable in case of private memory. + * + * \param rhs_extract_index: determines the position of each thread on a local memory for rhs input. When private + * memory is used this is set to zero as this is not applicable in case of private memory. + * + * \param lhs_scratch_ptr_compute : determines the location to load for computation for lhs_local memory. This is the + * same as lhs_scratch_extract for private memory. + * + * \param rhs_scratch_ptr_compute : determines the location to load for computation for rhs_local memory. This is the + * same as rhs_scratch_extract for private memory. + */ + struct TiledMemory { + MemHolder lhs_scratch_extract; + MemHolder rhs_scratch_extract; + tile_ptr lhs_scratch_ptr_compute; + tile_ptr rhs_scratch_ptr_compute; + const std::pair lhs_extract_index; + const std::pair rhs_extract_index; + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TiledMemory(const ThreadProperties &, local_ptr, + typename ::Eigen::internal::enable_if::type * = 0) + : lhs_scratch_extract{}, + rhs_scratch_extract{}, + lhs_scratch_ptr_compute(lhs_scratch_extract.ptr), + rhs_scratch_ptr_compute(rhs_scratch_extract.ptr), + lhs_extract_index(std::pair(StorageIndex{0}, StorageIndex{0})), + rhs_extract_index(std::pair(StorageIndex{0}, StorageIndex{0})) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TiledMemory(const ThreadProperties &thread_properties, local_ptr block_start_ptr, + typename ::Eigen::internal::enable_if::type * = 0) + : lhs_scratch_extract{block_start_ptr}, + rhs_scratch_extract{lhs_scratch_extract.ptr + + ((Properties::DoubleBuffer + 1) * LSDL * Properties::TileSizeDimK)}, + lhs_scratch_ptr_compute(lhs_scratch_extract.ptr + thread_properties.mLocalOffset), + rhs_scratch_ptr_compute(rhs_scratch_extract.ptr + thread_properties.nLocalOffset), + lhs_extract_index( +
local_id_extract(thread_properties.linearLocalThreadId)), + rhs_extract_index( + local_id_extract(thread_properties.linearLocalThreadId)) {} + }; + + Scratch scratch; + const LhsMapper lhs; + const RhsMapper rhs; + OutAccessor out_res; + const StorageIndex groupSizeM; + const StorageIndex groupSizeN; + const StorageIndex numTiles; + const TripleDim triple_dim; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_, + const RhsMapper rhs_, OutAccessor out_res_, + const StorageIndex groupSizeM_, + const StorageIndex groupSizeN_, + const StorageIndex numTiles_, + const TripleDim triple_dim_) + : scratch(scratch_), + lhs(lhs_), + rhs(rhs_), + out_res(out_res_), + groupSizeM(groupSizeM_), + groupSizeN(groupSizeN_), + numTiles(numTiles_), + triple_dim(triple_dim_) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_, + const RhsMapper rhs_, OutAccessor out_res_, + const StorageIndex groupSizeM_, + const StorageIndex numTiles_, + const TripleDim triple_dim_) + : TensorContractionKernel(scratch_, lhs_, rhs_, out_res_, groupSizeM_, 1, numTiles_, triple_dim_) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { + const StorageIndex linearLocalThreadId = itemID.get_local_id(0); + const StorageIndex nLocalThreadId = linearLocalThreadId / Properties::LocalThreadSizeM; + const StorageIndex mLocalThreadId = linearLocalThreadId % Properties::LocalThreadSizeM; + const StorageIndex mGroupId = itemID.get_group(0) % groupSizeM; + const StorageIndex tmp = itemID.get_group(0) / groupSizeM; + const StorageIndex nGroupId = IsFinal ? tmp : tmp % groupSizeN; + const StorageIndex kGroupId = IsFinal ?
0 : tmp / groupSizeN; + const StorageIndex mGroupOffset = mGroupId * Properties::TileSizeDimM; + const StorageIndex nGroupOffset = nGroupId * Properties::TileSizeDimN; + const StorageIndex mLocalOffset = PacketSize * mLocalThreadId; + const StorageIndex nLocalOffset = NStride * nLocalThreadId; + const StorageIndex mGlobalOffset = mGroupOffset + mLocalOffset; + const StorageIndex nGlobalOffset = nGroupOffset + nLocalOffset; + + const StorageIndex kSizePerWG = IsFinal ? triple_dim.K : numTiles * Properties::TileSizeDimK; + StorageIndex kGroupOffset = kGroupId * kSizePerWG; + const bool is_internal = triple_dim.M - mGroupOffset >= Properties::TileSizeDimM && + triple_dim.N - nGroupOffset >= Properties::TileSizeDimN && + triple_dim.K - kGroupOffset >= kSizePerWG; + // this is used to adjust the last block + StorageIndex kSize = IsFinal ? triple_dim.K : std::min(kSizePerWG, triple_dim.K - kGroupOffset); + // This is used to find out the last K offset so that kGroupOffset - kSize can compute the cOffset for loading to + // tile + kGroupOffset += kSize; + + auto thread_properties = + ThreadProperties(linearLocalThreadId, kGroupId, mGroupOffset, nGroupOffset, kGroupOffset, + mLocalOffset, nLocalOffset, mGlobalOffset, nGlobalOffset, kSize, is_internal); + + auto out_ptr = out_res.get_pointer() + (IsFinal ? 0 : thread_properties.kGroupId * triple_dim.M * triple_dim.N); + + (thread_properties.is_internal) ?
compute_panel(itemID, thread_properties, out_ptr) + : compute_panel(itemID, thread_properties, out_ptr); + } + // The compute block computes the contraction operation private block for each thread and stores the result in the + // privateRes memory. The compute block function is independent of local and no local concepts as + // it only computes the block on each thread's private memory space + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_block_per_tile(OutScalar *lhs_block_ptr, OutScalar *rhs_block_ptr, + PacketReturnType *privateRes) { + StorageIndex idx = 0; + EIGEN_CONSTEXPR StorageIndex lhs_stride = + contraction_tp == contraction_type::local ? (PacketSize * Properties::LocalThreadSizeM) : 1; + EIGEN_UNROLL_LOOP + for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN; wLPTN++) { + auto rhsPacket = PacketReturnType{*(rhs_block_ptr + wLPTN)}; + StorageIndex lhs_index = 0; + EIGEN_UNROLL_LOOP + for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) { + PacketReturnType lhsPack{}; + Eigen::TensorSycl::internal::PacketWrapper::set_packet(lhsPack, + lhs_block_ptr + lhs_index); + privateRes[idx] = ::Eigen::internal::pmadd(lhsPack, rhsPacket, privateRes[idx]); + + lhs_index += lhs_stride; + idx++; + } + } + } + // The store function writes the computed contraction operation in the private memory of each thread to the global + // memory. The store function is independent of local and no local concepts so that it can be abstracted out in the base + // class. + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void store(OutPtr *out_ptr, PacketReturnType *privateRes, + StorageIndex mGlobalOffset, StorageIndex nGlobalOffset) { + auto chk_bound = [&](const StorageIndex &mIndex, const StorageIndex &nIndex) EIGEN_DEVICE_FUNC { + return (mIndex + PacketSize - 1 < triple_dim.M && nGlobalOffset + nIndex < triple_dim.N); + }; + // when local memory is not used M and N are both accessed in a coalesced way.
However, when local memory is + // available the k*N is transposed in the local to N*K therefore, each block operates on blockId* + // WorkLoadPerThreadN slice of N + EIGEN_CONSTEXPR StorageIndex GlobalNStride = + contraction_tp == contraction_type::local ? 1 : Properties::LocalThreadSizeN; + EIGEN_UNROLL_LOOP + for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN / PrivateNStride; wLPTN++) { + // output leading dimension + StorageIndex outputLD = 0; + // When local memory is used the PrivateNstride is always 1 because the coalesced access on N is loaded into Local + // memory and extracting from local to global is the same as no transposed version. However, when local memory is + // not used and RHS is transposed we packetize the load for RHS. + EIGEN_UNROLL_LOOP + for (StorageIndex nId = 0; nId < PrivateNStride; nId++) { + StorageIndex globalRow = mGlobalOffset; + EIGEN_UNROLL_LOOP + for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) { + PacketReturnType privetOut = privateRes[wLPTM]; + if (check_boundary(chk_bound(globalRow, nId))) { + // Store the final results in C.
The C matrix has always M as a first StorageIndex and N as a second + // StorageIndex. Therefore it is always coalesced layout + write(privetOut, out_ptr + outputLD + globalRow); + } else { + EIGEN_UNROLL_LOOP + for (StorageIndex mId = 0; mId < PacketSize; mId++) { + StorageIndex mOffset = globalRow + mId; + if (mOffset < triple_dim.M && (nGlobalOffset + nId < triple_dim.N)) { + out_ptr[mOffset + outputLD] = + Eigen::TensorSycl::internal::PacketWrapper::scalarize(mId, privetOut); + } + } + } + globalRow += (PacketSize * Properties::LocalThreadSizeM); + } + outputLD += triple_dim.M; + privateRes += Properties::WorkLoadPerThreadM / PacketSize; + } + out_ptr += (GlobalNStride * outputLD); + + nGlobalOffset += (PrivateNStride * GlobalNStride); + } + } + // when no local memory is used the following extract_block will be enabled + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if::type + extract_block(const Input &inpt, PrivateReg private_ptr, const std::pair &, + const StorageIndex &ncOffset, const StorageIndex cOffset) { + EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC = + InputBlockProperties::is_rhs ? Properties::LocalThreadSizeN : Properties::LocalThreadSizeM; + EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC = + InputBlockProperties::is_rhs ? Properties::WorkLoadPerThreadN : Properties::WorkLoadPerThreadM; + const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M; + + auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC { + return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) && + (NCIndex + InputBlockProperties::nc_stride - 1 < NC)); + }; + const StorageIndex ld = InputBlockProperties::is_coalesced_layout ?
NC : triple_dim.K; + StorageIndex cIndex = cOffset; + + EIGEN_UNROLL_LOOP + for (StorageIndex cId = 0; cId < Properties::TileSizeDimK / InputBlockProperties::c_stride; cId++) { + StorageIndex ncIndex = ncOffset; + EIGEN_UNROLL_LOOP + for (StorageIndex ncId = 0; ncId < WorkLoadPerThreadNC / InputBlockProperties::nc_stride; ncId++) { + if (check_boundary(chk_bound(cIndex, ncIndex))) { + auto val = + read(inpt, ncIndex, cIndex, ld); + + write(val, private_ptr); + } else { + EIGEN_UNROLL_LOOP + for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) { + const StorageIndex ncInd = ncIndex + (InputBlockProperties::is_coalesced_layout ? i : 0); + const StorageIndex cInd = cIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i); + OutScalar val = + (ncInd < NC && cInd < triple_dim.K) + ? read( + inpt, ncInd, cInd, ld) + : OutScalar(0); + write( + val, private_ptr + (InputBlockProperties::is_coalesced_layout ? i : 0) + + ((InputBlockProperties::is_coalesced_layout ? 0 : i) * WorkLoadPerThreadNC)); + } + } + + // if it is lhs we have to load it packetised when the packet size is > 1, because the output is coalesced. So + // even if M is not accessed in a coalesced mode, we have to load packet_size number of m per thread. + ncIndex = (!InputBlockProperties::is_rhs && InputBlockProperties::nc_stride == 1 && PacketSize != 1) + ?
ncOffset + (ncId + 1) % PacketSize + ((ncId + 1) / PacketSize) * LocalThreadSizeNC + : (ncIndex + InputBlockProperties::nc_stride * LocalThreadSizeNC); + private_ptr += InputBlockProperties::nc_stride; + } + // the previous for loop ( private_ptr += (ncId * nc_stride)) has already moved ptr with one WorkLoadPerThreadNC + private_ptr += (InputBlockProperties::c_stride - 1) * WorkLoadPerThreadNC; + cIndex += InputBlockProperties::c_stride; + } + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::pair local_id_extract( + const StorageIndex &linearLocalThreadId) { + const StorageIndex localThreadNC = + (InputBlockProperties::is_coalesced_layout) + ? linearLocalThreadId % (TileSizeDimNC / InputBlockProperties::nc_stride) + : linearLocalThreadId / (Properties::TileSizeDimK / InputBlockProperties::c_stride); + const StorageIndex localThreadC = + (InputBlockProperties::is_coalesced_layout) + ? linearLocalThreadId / (TileSizeDimNC / InputBlockProperties::nc_stride) + : linearLocalThreadId % (Properties::TileSizeDimK / InputBlockProperties::c_stride); + return std::pair(localThreadNC, localThreadC); + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if::type + sync_mem(const cl::sycl::nd_item<1> &, bool &db_offset) noexcept { + db_offset = !db_offset; + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if::type + sync_mem(const cl::sycl::nd_item<1> &itemID, bool &) noexcept { + itemID.barrier(cl::sycl::access::fence_space::local_space); + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if::type + sync_mem(const cl::sycl::nd_item<1> &, bool &) noexcept { + return; + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if::type + sync_thread(const cl::sycl::nd_item<1> & +#ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION + itemID +#endif + ) noexcept { +#ifdef
EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION + itemID.barrier(cl::sycl::access::fence_space::local_space); +#else + return; +#endif + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if::type + sync_thread(const cl::sycl::nd_item<1> &itemID) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if::type sync_thread( + const cl::sycl::nd_item<1> &) { + return; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_tile_per_panel(const cl::sycl::nd_item<1> &itemID, + ThreadProperties &thread_properties, + TiledMemory &tiled_input_block, + PacketReturnType *privateRes, bool &db_offset) { + // Tiling the Rhs block from global to local memory + extract_block( + rhs, tiled_input_block.rhs_scratch_extract.ptr + (db_offset * Properties::TileSizeDimK * LSDR), + tiled_input_block.rhs_extract_index, + contraction_tp == contraction_type::local ? thread_properties.nGroupOffset : thread_properties.nGlobalOffset, + thread_properties.kGroupOffset - thread_properties.kSize); + + sync_thread(itemID); + + // Tiling the Lhs block from global to local memory + extract_block( + lhs, tiled_input_block.lhs_scratch_extract.ptr + (db_offset * LSDL * Properties::TileSizeDimK), + tiled_input_block.lhs_extract_index, + contraction_tp == contraction_type::local ?
thread_properties.mGroupOffset : thread_properties.mGlobalOffset, + thread_properties.kGroupOffset - thread_properties.kSize); + + // itemID.barrier(cl::sycl::access::fence_space::local_space); + sync_thread(itemID); + // switch to compute mode + StorageIndex lhs_offset = (db_offset * LSDL * Properties::TileSizeDimK); + StorageIndex rhs_offset = (db_offset * Properties::TileSizeDimK * LSDR); + // Loop over the values of a single tile + for (StorageIndex k = 0; k < Properties::TileSizeDimK; k++) { + compute_block_per_tile(tiled_input_block.lhs_scratch_ptr_compute + lhs_offset, + tiled_input_block.rhs_scratch_ptr_compute + rhs_offset, privateRes); + lhs_offset += LSDL; + rhs_offset += LSDR; + } + // computing the K index for the next tile + thread_properties.kSize -= Properties::TileSizeDimK; + sync_mem(itemID, db_offset); + } + + // when local memory is available the following compute_panel will be enabled + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(const cl::sycl::nd_item<1> &itemID, + ThreadProperties &thread_properties, + OutPtr out_ptr) { + auto tiled_input_block = TiledMemory{thread_properties, scratch.get_pointer()}; + // Allocate register space + PacketReturnType privateRes[Properties::WorkLoadPerThreadM * Properties::WorkLoadPerThreadN / PacketSize] = { + PacketReturnType{0}}; + bool db_offset = 0; + + while (thread_properties.kSize >= Properties::TileSizeDimK) { + compute_tile_per_panel(itemID, thread_properties, tiled_input_block, privateRes, db_offset); + } + if (thread_properties.kSize > 0) { + compute_tile_per_panel(itemID, thread_properties, tiled_input_block, privateRes, db_offset); + } + + // Storing the final results in the output + store(1) : RHSBlockProperties::nc_stride>( + out_ptr + thread_properties.nGlobalOffset * triple_dim.M, privateRes, thread_properties.mGlobalOffset, + thread_properties.nGlobalOffset); + } + // When local memory is available the following extract_block will be enabled + template +
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename ::Eigen::internal::enable_if::type + extract_block(const Input &inpt, Local local_ptr, const std::pair& local_index, + const StorageIndex &ncOffset, const StorageIndex cOffset) { + EIGEN_CONSTEXPR StorageIndex TileSizeDimNC = + InputBlockProperties::is_rhs ? Properties::TileSizeDimN : Properties::TileSizeDimM; + EIGEN_CONSTEXPR StorageIndex LoadPerThread = + InputBlockProperties::is_rhs ? Properties::LoadPerThreadRhs : Properties::LoadPerThreadLhs; + EIGEN_CONSTEXPR StorageIndex LSD = InputBlockProperties::is_rhs ? LSDR : LSDL; + static_assert(((LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride) == 0) && + (LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride) == 0)), + " LocalOffset must be divisable by stride"); + const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M; + StorageIndex localThreadNC = local_index.first; + StorageIndex localThreadC = local_index.second; + auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC { + return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) && + (NCIndex + InputBlockProperties::nc_stride - 1 < NC)); + }; + EIGEN_UNROLL_LOOP + for (StorageIndex lPT = 0; lPT < LoadPerThread / InputBlockProperties::elements_per_access; lPT++) { + const StorageIndex CIndex = cOffset + (InputBlockProperties::c_stride * localThreadC); + const StorageIndex NCIndex = ncOffset + (InputBlockProperties::nc_stride * localThreadNC); + const StorageIndex ld = InputBlockProperties::is_coalesced_layout ?
NC : triple_dim.K; + if (check_boundary(chk_bound(CIndex, NCIndex))) { + auto val = + read(inpt, NCIndex, CIndex, ld); + write( + val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) + + (InputBlockProperties::c_stride * localThreadC * LSD)); + } else { + EIGEN_UNROLL_LOOP + for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) { + const StorageIndex nCInd = NCIndex + (InputBlockProperties::is_coalesced_layout ? i : 0); + const StorageIndex cInd = CIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i); + OutScalar val = + (nCInd < NC && cInd < triple_dim.K) + ? read( + inpt, nCInd, cInd, ld) + : OutScalar(0); + + write( + val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) + + (InputBlockProperties::is_coalesced_layout ? i : 0) + + ((InputBlockProperties::c_stride * localThreadC + + (InputBlockProperties::is_coalesced_layout ? 0 : i)) * + LSD)); + } + } + localThreadNC += (InputBlockProperties::is_coalesced_layout) + ? LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride) + : LocalOffset / (Properties::TileSizeDimK / InputBlockProperties::c_stride); + localThreadC += (InputBlockProperties::is_coalesced_layout) + ? LocalOffset / (TileSizeDimNC / InputBlockProperties::nc_stride) + : LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride); + } + } +}; + +#ifndef EIGEN_SYCL_DISABLE_GEMV + +/*! + * \brief GeneralVectorTensor is a template class that provides Tensor-vector contraction operation, which is a special + * case of Tensor-Tensor contraction.
+ * + * \tparam OutScalar: determines the output scalar type + * + * \tparam OutAccessor: determines the sycl accessor type for output (please see the sycl-1.2.1 specification + * (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition) + * + * \tparam VectorMapper: determines the tensor contraction mapper for the vector input (can be lhs or rhs) + * + * \tparam TensorMapper: determines the tensor contraction mapper for the tensor input (can be lhs or rhs) + * + * \tparam StorageIndex: determines the StorageIndex Type + * + * \tparam Properties: determines the Contraction Panel properties + * + * \tparam KFactor: determines the number of elements in K dimension in a Tile + * + * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression. + * + * \tparam is_lhs_vec: determines whether lhs is a vector or rhs is a vector + * + * \tparam IsFinal: determines if this is the final kernel. If so, the result will be written in a final output. + * Otherwise, the result of contraction will be written in a temporary buffer.
+ * + * \param scratch: determines the local memory containing the vector block for each work-group + * + * \param vec: determines the vector input (tensor mapper) + * + * \param mat: determines the tensor input (tensor mapper) + * + * \param out_res: determines the output vector containing the contraction result + * + * \param nonContractGroupSize: a logical number determining the number of work-group for non-contracting dimension + * + * \param nonContractDim: determines the size of non contracting dimension for the flattened tensor + * + * \param contractDim: determines the size of contracting dimension for the flattened tensor + * + */ +template +struct GeneralVectorTensor { + typedef typename Eigen::TensorSycl::internal::Vectorise::PacketReturnType + PacketReturnType; + static EIGEN_CONSTEXPR int PacketSize = + Eigen::TensorSycl::internal::Vectorise::PacketSize; + typedef cl::sycl::accessor Scratch; + + static EIGEN_CONSTEXPR StorageIndex OutScratchOffset = + KFactor * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC; + + // Since the access layout for a vector can always be coalesced, when LHS is a vector, we pass false and false to make + // sure that the !^ is true. When RHS is a vector, we pass true and true to make sure that the !^ is true.
+ typedef BlockProperties + VecBlockProperties; + + Scratch scratch; + const VectorMapper vec; + const TensorMapper mat; + OutAccessor out_res; + const StorageIndex nonContractGroupSize; + const StorageIndex nonContractDim; + const StorageIndex contractDim; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE GeneralVectorTensor(Scratch scratch_, const VectorMapper vec_, + const TensorMapper mat_, OutAccessor out_res_, + const StorageIndex nonContractGroupSize_, + const StorageIndex nonContractDim_, + const StorageIndex contractDim_) + : scratch(scratch_), + vec(vec_), + mat(mat_), + out_res(out_res_), + nonContractGroupSize(nonContractGroupSize_), + nonContractDim(nonContractDim_), + contractDim(contractDim_) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { + auto scratch_ptr = scratch.get_pointer(); + const StorageIndex linearLocalThreadId = itemID.get_local_id(0); + StorageIndex nonContractId = is_lhs_vec ? linearLocalThreadId / Properties::LocalThreadSizeC + : linearLocalThreadId % Properties::LocalThreadSizeNC; + StorageIndex contractId = is_lhs_vec ? linearLocalThreadId % Properties::LocalThreadSizeC + : linearLocalThreadId / Properties::LocalThreadSizeNC; + const StorageIndex cGroupSize = itemID.get_group_range(0) / nonContractGroupSize; + const StorageIndex nonContractGroupId = + is_lhs_vec ? itemID.get_group(0) / cGroupSize : itemID.get_group(0) % nonContractGroupSize; + const StorageIndex contractGroupId = + is_lhs_vec ? itemID.get_group(0) % cGroupSize : itemID.get_group(0) / nonContractGroupSize; + auto out_ptr = out_res.get_pointer() + (IsFinal ? 
0 : contractGroupId * nonContractDim); + + const StorageIndex nonContractGroupOffset = nonContractGroupId * Properties::TileSizeDimNC; + const StorageIndex contractGroupOffset = contractGroupId * Properties::TileSizeDimC; + auto outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC; + const StorageIndex globalNonContractDimOffset = nonContractGroupOffset + nonContractId; + const StorageIndex globalContractDimOffset = contractGroupOffset + contractId; + auto local_output = scratch_ptr + OutScratchOffset; + const bool is_internal = nonContractDim - nonContractGroupOffset >= Properties::TileSizeDimNC && + contractDim - contractGroupOffset >= Properties::TileSizeDimC; + is_internal + ? compute_panel(itemID, vec, mat, local_output, out_ptr, +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + scratch_ptr, contractGroupOffset, +#endif + nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId, + nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex) + : compute_panel(itemID, vec, mat, local_output, out_ptr, +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + scratch_ptr, contractGroupOffset, +#endif + nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId, + nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex); + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel( + const cl::sycl::nd_item<1> &itemID, const VectorMapper &vec, const TensorMapper &mat, OutScalar *local_output, + OutPtr out_ptr, +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + OutScalar *scratch_ptr, const StorageIndex contractGroupOffset, +#endif + const StorageIndex nonContractGroupOffset, const StorageIndex linearLocalThreadId, StorageIndex contractDim, + StorageIndex nonContractDim, StorageIndex contractId, StorageIndex nonContractId, + StorageIndex globalContractDimOffset, StorageIndex globalNonContractDimOffset, StorageIndex outScratchIndex) { + OutScalar 
outScalar[Properties::WorkLoadPerThreadNC] = {OutScalar(0)}; + // Reading the vector +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + const StorageIndex vectorOffset = contractGroupOffset + linearLocalThreadId; + extract_block(vec, scratch_ptr, linearLocalThreadId, + vectorOffset, contractDim); + + itemID.barrier(cl::sycl::access::fence_space::local_space); + auto in_scratch_ptr = scratch_ptr + contractId; +#endif + + StorageIndex privateOffsetC = 0; + EIGEN_UNROLL_LOOP + for (StorageIndex i = 0; i < Properties::WorkLoadPerThreadC; i++) { + StorageIndex privateOffsetNC = 0; + bool contract_conds = ((globalContractDimOffset + privateOffsetC) < contractDim); +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + auto vecScalar = *in_scratch_ptr; +#else + auto vecScalar = (check_boundary(contract_conds)) + ? vec(is_lhs_vec ? StorageIndex(0) : globalContractDimOffset + privateOffsetC, + is_lhs_vec ? globalContractDimOffset + privateOffsetC : StorageIndex(0)) + : OutScalar(0); +#endif + EIGEN_UNROLL_LOOP + for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) { + auto matScalar = (check_boundary( + contract_conds && ((globalNonContractDimOffset + privateOffsetNC) < nonContractDim))) + ? mat(is_lhs_vec ? globalContractDimOffset + privateOffsetC + : globalNonContractDimOffset + privateOffsetNC, + is_lhs_vec ? 
globalNonContractDimOffset + privateOffsetNC + : globalContractDimOffset + privateOffsetC) + : OutScalar(0); + + outScalar[j] = cl::sycl::mad(matScalar, vecScalar, outScalar[j]); + privateOffsetNC += Properties::LocalThreadSizeNC; + } + privateOffsetC += Properties::LocalThreadSizeC; +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + in_scratch_ptr += Properties::LocalThreadSizeC; +#endif + } + + auto out_scratch_ptr = local_output + outScratchIndex; + // Each block of 16*16 element in shared memory should reduce to 16*1 + EIGEN_UNROLL_LOOP + for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) { + *out_scratch_ptr = outScalar[j]; + + out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC); + } + if (is_lhs_vec) { + nonContractId = linearLocalThreadId % Properties::LocalThreadSizeNC; + contractId = linearLocalThreadId / Properties::LocalThreadSizeNC; + outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC; + } + + out_scratch_ptr = local_output + outScratchIndex; + EIGEN_UNROLL_LOOP + for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) { + EIGEN_UNROLL_LOOP + for (StorageIndex offset = Properties::LocalThreadSizeC >> 1; offset > 0; offset >>= 1) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (contractId < offset) { + StorageIndex myNeigbourId = (Properties::LocalThreadSizeNC * offset); + *out_scratch_ptr += out_scratch_ptr[myNeigbourId]; + } + } + // moving to next 16 by 16 block + out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC); + } + + if (contractId == 0) { + out_scratch_ptr = local_output + nonContractId; + StorageIndex global_final_offset = nonContractGroupOffset + nonContractId; + out_ptr += global_final_offset; + EIGEN_UNROLL_LOOP + for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) { + if (check_boundary(global_final_offset < nonContractDim)) { + auto res = *out_scratch_ptr; + + *out_ptr = res; + out_ptr += 
Properties::LocalThreadSizeNC; + } + // moving to next 16 by 16 block to ge the next 16 reduced elements + out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC); + if (!(is_internal_block)) global_final_offset += Properties::LocalThreadSizeNC; + } + } + } + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void extract_block(const Input &inpt, Local *local_ptr, + const StorageIndex &linearLocalThreadId, + const StorageIndex &cOffset, const StorageIndex &C) { + local_ptr += InputBlockProperties::c_stride * linearLocalThreadId; + StorageIndex cIndex = cOffset; + for (StorageIndex cId = 0; cId < CFactor / InputBlockProperties::c_stride; cId++) { + if (check_boundary(cIndex + InputBlockProperties::c_stride - 1 < C)) { + auto val = read(inpt, StorageIndex(0), + cIndex, StorageIndex(1)); + write(val, local_ptr); + } else { + EIGEN_UNROLL_LOOP + for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) { + OutScalar val = + (cIndex + i < C) + ? read( + inpt, StorageIndex(0), cIndex + i, StorageIndex(1)) + : OutScalar(0); + write(val, local_ptr + i); + } + } + local_ptr += InputBlockProperties::c_stride * GroupSize; + cIndex += InputBlockProperties::c_stride * GroupSize; + } + } +}; +#endif + +#ifndef EIGEN_SYCL_DISABLE_SCALAR + +/*! + * \brief GeneralScalarContraction is a template class that provides the scalar value of Tensor -Tensor contraction + * operation, when all the dimensions are contracting dimensions. 
This Kernel reduces two tensors to a scalar + * + * \tparam OutScalar: determines the output scalar type + * + * \tparam LhsScalar: determines the left-hand-side scalar type + * + * \tparam RhsScalar: determines the right-hand-side scalar type + * + * \tparam OutAccessor: determines the sycl accessor type for output (please see the sycl-1.2.1 specification + * (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition) + * + * \tparam LhsMapper: determines the tensor contraction mapper type for left-hand-side matrix + * + * \tparam RhsMapper: determines the tensor contraction mapper type for right-hand-side matrix + * + * \tparam StorageIndex: determines the StorageIndex Type + * + * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression. + * + * \param scratch: local memory holding one partial sum per work-item, reduced within each work-group + * + * \param lhs: determines the left-hand-side flattened tensor (tensor mapper) + * + * \param rhs: determines the right-hand-side flattened tensor (tensor mapper) + * + * \param out_res: determines the output tensor containing the contraction result + * + * \param rng: determines the total input data size + */ +template +struct GeneralScalarContraction { + typedef cl::sycl::accessor Scratch; + Scratch scratch; + const LhsMapper lhs; + const RhsMapper rhs; + OutAccessor out_res; + const StorageIndex rng; + + EIGEN_DEVICE_FUNC + GeneralScalarContraction(Scratch scratch_, const LhsMapper lhs_, const RhsMapper rhs_, OutAccessor out_res_, + const StorageIndex rng_) + : scratch(scratch_), lhs(lhs_), rhs(rhs_), out_res(out_res_), rng(rng_) {} + + EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) { + auto out_ptr = out_res.get_pointer(); + auto scratch_ptr = scratch.get_pointer().get(); + + StorageIndex globalid = itemID.get_global_id(0); + StorageIndex localid = itemID.get_local_id(0); + OutScalar accumulator = OutScalar(0); + for (StorageIndex i = 
globalid; i < rng; i += itemID.get_global_range(0)) { + accumulator = cl::sycl::mad(lhs(0, i), rhs(i, 0), accumulator); + } + auto out_scratch_ptr = scratch_ptr + localid; + *out_scratch_ptr = accumulator; + for (StorageIndex offset = itemID.get_local_range(0) >> 1; offset > 0; offset >>= 1) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (localid < offset) { + *out_scratch_ptr = (accumulator += out_scratch_ptr[offset]); + } + } + if (localid == 0) { + out_ptr[itemID.get_group(0)] = accumulator; + } + } +}; +#endif + +} // namespace internal +} // namespace TensorSycl + +template +struct TensorEvaluator, + Eigen::SyclDevice> + : public TensorContractionEvaluatorBase, Eigen::SyclDevice>> { + static_assert(std::is_same::value, + "SYCL tensor contraction does not support output kernels."); + + typedef Eigen::SyclDevice Device; + + typedef TensorEvaluator, Device> Self; + typedef TensorContractionEvaluatorBase Base; + typedef TensorContractionOp XprType; + typedef typename internal::remove_const::type Scalar; + typedef typename XprType::Index StorageIndex; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename Base::Storage Storage; + typedef typename Base::EvaluatorPointerType EvaluatorPointerType; + struct TripleDim { + const StorageIndex M; + const StorageIndex N; + const StorageIndex K; + TripleDim(const StorageIndex M_, const StorageIndex N_, const StorageIndex K_) : M(M_), N(N_), K(K_) {} + }; + enum { + Layout = TensorEvaluator::Layout, + PacketAccess = (PacketType::size > 1), + BlockAccess = false, + }; + + static EIGEN_CONSTEXPR int LDims = Base::LDims; + static EIGEN_CONSTEXPR int RDims = Base::RDims; + static EIGEN_CONSTEXPR int ContractDims = Base::ContractDims; + + typedef array left_dim_mapper_t; + typedef array right_dim_mapper_t; + + typedef array contract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; + + static const int 
NumDims = LDims + RDims - 2 * ContractDims; + + typedef DSizes Dimensions; + + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + typedef typename Eigen::internal::remove_const::type LhsScalar; + typedef typename Eigen::internal::remove_const::type RhsScalar; + + typedef typename LeftEvaluator::Dimensions LeftDimensions; + typedef typename RightEvaluator::Dimensions RightDimensions; + + template + struct input_mapper_propertis { + static EIGEN_CONSTEXPR bool is_lhs_matrix = (LDims == 2 && ContractDims == 1) || lhs_inner_dim_contiguous; + static EIGEN_CONSTEXPR bool is_rhs_matrix = + (RDims == 2 && ContractDims == 1) || (rhs_inner_dim_contiguous && !rhs_inner_dim_reordered); + }; + + TensorEvaluator(const XprType &op, const Device &device) : Base(op, device) {} + + // We need to redefine this method to make nvcc happy + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(typename Base::EvaluatorPointerType data) { + this->m_leftImpl.evalSubExprsIfNeeded(NULL); + this->m_rightImpl.evalSubExprsIfNeeded(NULL); + if (!data) { + this->m_result = this->m_device.get( + static_cast(this->m_device.allocate_temp(this->dimensions().TotalSize() * sizeof(Scalar)))); + data = this->m_result; + } + evalToSycl(data); + return (this->m_result != NULL); + } + const Eigen::SyclDevice &device() const { return this->m_device; } + void evalToSycl(typename Base::EvaluatorPointerType buffer) const { + if (this->m_lhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } else { + evalTyped(buffer); + } + } else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } else { + evalTyped(buffer); + } + } + } else { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } else { + evalTyped(buffer); + } + } else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } else { + evalTyped(buffer); + } + } + } + } + + 
template + void evalTyped(typename Base::EvaluatorPointerType buffer) const { + const auto triple_dim = TripleDim{this->m_i_size, this->m_j_size, this->m_k_size}; + typedef internal::TensorContractionInputMapper< + LhsScalar, StorageIndex, internal::Lhs, LeftEvaluator, left_nocontract_t, contract_t, + PacketType::size, lhs_inner_dim_contiguous, false, Unaligned, MakeSYCLPointer> + LhsMapper; + + typedef internal::TensorContractionInputMapper::size, rhs_inner_dim_contiguous, + rhs_inner_dim_reordered, Unaligned, MakeSYCLPointer> + RhsMapper; + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + +#ifndef EIGEN_SYCL_DISABLE_SCALAR + if (triple_dim.M == 1 && triple_dim.N == 1) { + launchSC(buffer, lhs, rhs, triple_dim.K); + } else +#endif +#ifndef EIGEN_SYCL_DISABLE_GEMV + if (triple_dim.M != 1 && triple_dim.N == 1) { + LaunchVT(buffer, rhs, lhs, triple_dim.M, triple_dim.K); + } else if (triple_dim.M == 1 && triple_dim.N != 1) { + LaunchVT(buffer, lhs, rhs, triple_dim.N, triple_dim.K); + } else // This is equivalent of if (m!=1 && n!=1) +#endif + { + typedef input_mapper_propertis + inpt_mapper_properties; +#ifndef EIGEN_SYCL_DISABLE_SKINNY + bool skinny = false; + auto platform_name = this->device().getPlatformName(); + // This is based on empirical calculation for AMD r9-nano and Fiji + if (platform_name.find("AMD") == 0) { + skinny = (triple_dim.M < triple_dim.K || triple_dim.N < triple_dim.K) && + ((triple_dim.M < 1024 && triple_dim.N < 1024) || + (uint64_t(triple_dim.M * triple_dim.N) < uint64_t(triple_dim.K))); + } else { + skinny = (((std::max(triple_dim.K, triple_dim.N) / std::min(triple_dim.K, triple_dim.N)) > 100) || + ((std::max(triple_dim.K, triple_dim.M) / std::min(triple_dim.K, triple_dim.M)) 
> 100) || + ((std::max(triple_dim.N, triple_dim.M) / std::min(triple_dim.N, triple_dim.M)) > 100)); + } + if (skinny) + adjustTT(buffer, lhs, rhs, triple_dim); + else +#endif // EIGEN_SYCL_DISABLE_SKINNY + adjustTT(buffer, lhs, rhs, triple_dim); + } + } + + template + void EIGEN_ALWAYS_INLINE adjustTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs, + const TripleDim &triple_dim) const { +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON + if (device().has_local_memory()) { + typedef TensorSycl::internal::TTPanelSize PanelParameters; + launchTT( + buffer, lhs, rhs, triple_dim); + } +#endif +#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF + if (!(device().has_local_memory())) { + typedef TensorSycl::internal::TTPanelSize PanelParameters; + launchTT( + buffer, lhs, rhs, triple_dim); + } +#endif + } + + template + void launchTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs, + const TripleDim &triple_dim) const { + const StorageIndex roundUpM = Eigen::TensorSycl::internal::roundUp(triple_dim.M, Properties::TileSizeDimM); + const StorageIndex roundUpN = Eigen::TensorSycl::internal::roundUp(triple_dim.N, Properties::TileSizeDimN); + const StorageIndex groupSizeM = roundUpM / Properties::TileSizeDimM; + const StorageIndex groupSizeN = roundUpN / Properties::TileSizeDimN; + + const StorageIndex roundUpK = Eigen::TensorSycl::internal::roundUp(triple_dim.K, Properties::TileSizeDimK); + StorageIndex totalTilesK = roundUpK / Properties::TileSizeDimK; + StorageIndex groupSizeK = + skinny + ? 
std::max(std::min(totalTilesK, + (StorageIndex)(device().getPowerOfTwo(device().getNumSyclMultiProcessors(), true) * 4) / + (groupSizeM * groupSizeN)), + StorageIndex(1)) + : StorageIndex(1); + + const StorageIndex numTilesPerGroup = Eigen::TensorSycl::internal::roundUp(totalTilesK, groupSizeK) / groupSizeK; + + const StorageIndex totalGroupSize = groupSizeM * groupSizeN * groupSizeK; + + const StorageIndex localRange = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN; + const StorageIndex globalRange = totalGroupSize * localRange; + + const StorageIndex scratchSize = (ct == TensorSycl::internal::contraction_type::local) + ? ((Properties::DoubleBuffer + 1) * + (Properties::TileSizeDimM + Properties::BC) * (Properties::TileSizeDimK)) + + ((Properties::DoubleBuffer + 1) * (Properties::TileSizeDimK) * + (Properties::TileSizeDimN + Properties::BC)) + : StorageIndex(1); + + auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange)); + if (groupSizeK == 1) { + typedef TensorSycl::internal::TensorContractionKernel + ContractKernelName; + device().template binary_kernel_launcher( + lhs, rhs, buffer, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, triple_dim); + } else { + typedef TensorSycl::internal::TensorContractionKernel + ContractKernelName; + CoeffReturnType *temp_pointer = static_cast( + device().allocate_temp(triple_dim.M * triple_dim.N * groupSizeK * sizeof(CoeffReturnType))); + EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer); + + device().template binary_kernel_launcher( + lhs, rhs, tmp_global_accessor, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, + triple_dim); + + typedef Eigen::internal::SumReducer Op; + auto op = Op(); + typedef TensorSycl::internal::SecondStepPartialReduction + ReductionKernel; + + device().template unary_kernel_launcher( + tmp_global_accessor, buffer, + cl::sycl::nd_range<1>(cl::sycl::range<1>(StorageIndex( + 
Eigen::TensorSycl::internal::roundUp(triple_dim.M * triple_dim.N, localRange))), + cl::sycl::range<1>(localRange)), + StorageIndex(1), op, StorageIndex(triple_dim.M * triple_dim.N), groupSizeK); + + device().deallocate_temp(temp_pointer); + } + } + +#ifndef EIGEN_SYCL_DISABLE_GEMV + template + void EIGEN_ALWAYS_INLINE LaunchVT(EvaluatorPointerType buffer, const VectorMapper &vec, const TensorMapper &mat, + StorageIndex NC, StorageIndex C) const { + const StorageIndex nonContractDim = NC; + EIGEN_CONSTEXPR StorageIndex NCFactor = 1; + EIGEN_CONSTEXPR StorageIndex CFactor = 1; + EIGEN_CONSTEXPR StorageIndex NCWindow = 16; + typedef Eigen::TensorSycl::internal::TVPanelSize + Properties; + const StorageIndex roundUpC = Eigen::TensorSycl::internal::roundUp(C, Properties::TileSizeDimC); + const StorageIndex cNumGroups = roundUpC / (Properties::LocalThreadSizeC * Properties::WorkLoadPerThreadC); + const StorageIndex roundUpNC = Eigen::TensorSycl::internal::roundUp(nonContractDim, Properties::TileSizeDimNC); + const StorageIndex nCNumGroups = roundUpNC / (Properties::LocalThreadSizeNC * Properties::WorkLoadPerThreadNC); + const StorageIndex globalRange = + (roundUpNC / (Properties::WorkLoadPerThreadNC)) * (roundUpC / (Properties::WorkLoadPerThreadC)); + const StorageIndex localRange = Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC; + const StorageIndex scratchSize = + (Properties::WorkLoadPerThreadNC + CFactor) * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC; + auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange)); + if (cNumGroups > 1) { + typedef Eigen::TensorSycl::internal::GeneralVectorTensor + ContractKernelName; + CoeffReturnType *temp_pointer = + static_cast(device().allocate_temp(nonContractDim * cNumGroups * sizeof(CoeffReturnType))); + EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer); + + device().template binary_kernel_launcher( + vec, mat, tmp_global_accessor, 
thread_range, scratchSize, nCNumGroups, nonContractDim, C); + + typedef Eigen::internal::SumReducer Op; + typedef TensorSycl::internal::SecondStepPartialReduction + ReductionKernel; + + device().template unary_kernel_launcher( + tmp_global_accessor, buffer, + cl::sycl::nd_range<1>(cl::sycl::range<1>(Eigen::TensorSycl::internal::roundUp(nonContractDim, localRange)), + cl::sycl::range<1>(localRange)), + StorageIndex(1), Op(), nonContractDim, cNumGroups); + + device().deallocate_temp(temp_pointer); + } else { + typedef Eigen::TensorSycl::internal::GeneralVectorTensor + ContractKernelName; + device().template binary_kernel_launcher( + vec, mat, buffer, thread_range, scratchSize, nCNumGroups, nonContractDim, C); + } + } +#endif + +#ifndef EIGEN_SYCL_DISABLE_SCALAR + template + EIGEN_ALWAYS_INLINE void launchSC(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs, + StorageIndex K) const { + EIGEN_STATIC_ASSERT(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) & + (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)), + "The Local thread size must be a power of 2 for the reduction " + "operation"); + EIGEN_CONSTEXPR StorageIndex local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1; + + // Here we force the code not to be more than 2-step reduction: Our empirical research shows that if each thread + // reduces at least 512 elementss individually, we get better performance. + const StorageIndex num_work_group = ((K + (512 * local_range - 1)) / (512 * local_range) > 1 ? 
local_range : 1); + const StorageIndex global_range = num_work_group * local_range; + + typedef Eigen::TensorSycl::internal::GeneralScalarContraction< + CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType, LhsMapper, RhsMapper, StorageIndex, false> + ContractKernelName; + auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range)); + if (num_work_group > 1) { + CoeffReturnType *temp_pointer = + static_cast(device().allocate_temp(num_work_group * sizeof(CoeffReturnType))); + EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer); + device().template binary_kernel_launcher(lhs, rhs, tmp_global_accessor, + thread_range, local_range, K); + typedef Eigen::internal::SumReducer Op; + typedef TensorSycl::internal::SecondStepFullReducer + GenericRKernel; + device().template unary_kernel_launcher( + tmp_global_accessor, buffer, + cl::sycl::nd_range<1>(cl::sycl::range<1>(local_range), cl::sycl::range<1>(local_range)), local_range, Op()); + + device().deallocate_temp(temp_pointer); + } else { + device().template binary_kernel_launcher(lhs, rhs, buffer, thread_range, + local_range, K); + } + } +#endif + + EIGEN_STRONG_INLINE void cleanup() { + this->m_leftImpl.cleanup(); + this->m_rightImpl.cleanup(); + + if (this->m_result) { + this->m_device.deallocate_temp(this->m_result); + this->m_result = NULL; + } + } + // The placeholder accessors must bound to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + this->m_leftImpl.bind(cgh); + this->m_rightImpl.bind(cgh); + this->m_result.bind(cgh); + } +}; +} // namespace Eigen +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionThreadPool.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionThreadPool.h new file mode 100644 index 0000000..21be6ea --- /dev/null +++ 
b/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -0,0 +1,1679 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H + +// evaluator for thread pool device +#ifdef EIGEN_USE_THREADS + +namespace Eigen { + +template +struct TensorEvaluator, ThreadPoolDevice> : + public TensorContractionEvaluatorBase, ThreadPoolDevice> > { + + typedef ThreadPoolDevice Device; + + typedef TensorEvaluator, Device> Self; + typedef TensorContractionEvaluatorBase Base; + + typedef TensorContractionOp XprType; + typedef typename internal::remove_const::type Scalar; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + enum { + Layout = TensorEvaluator::Layout, + }; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. 
+ typedef typename internal::conditional< + static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size::Dimensions>::value; + static const int RDims = + internal::array_size::Dimensions>::value; + static const int ContractDims = internal::array_size::value; + + typedef array left_dim_mapper_t; + typedef array right_dim_mapper_t; + + typedef array contract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; + + static const int NumDims = LDims + RDims - 2 * ContractDims; + + typedef DSizes Dimensions; + + // typedefs needed in evalTo + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; + typedef typename internal::gebp_traits Traits; + + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + + TensorEvaluator(const XprType& op, const Device& device) : + Base(op, device) {} + + template + void evalProduct(Scalar* buffer) const { + evalProductImpl(buffer, NoCallback()); + } + + template + void evalProductAsync(Scalar* buffer, EvalToCallback done) const { + evalProductImpl(buffer, std::move(done)); + } + + template + void evalProductImpl(Scalar* buffer, DoneCallback done) const { + // This function computes a lot of heuristics in multiple steps, and it + // also has multiple exit points. To keep it sane, readable and all in one + // place, sync/async execution decision is made at runtime at the very end. + // + // (1) In sync mode we allocate Context on the stack, submit computations + // to the device thread pool, and block on a barrier until it is + // completed. 
+ // + // (2) In async mode we allocate Context on the heap, and after all tasks + // are finished, we call the provided done callback, and delete a + // context from the heap. + // + // (*) EvalParallelContext & EvalShardedByInnerDimContext own all the state + // and temporary buffers required for executing the tensor contraction. + // They are responsible for cleaning them up after contraction is done. + static const bool IsEvalInSyncMode = + std::is_same::value; + + const Index m = this->m_i_size; + const Index n = this->m_j_size; + const Index k = this->m_k_size; + if (m == 0 || n == 0 || k == 0) return; + + // Compute a set of algorithm parameters: + // - kernel block sizes (bm, bn, bk) + // - task grain sizes (number of kernels executed per task: gm, gn) + // - number of threads + // - sharding by row/column + // - parallel packing or first lhs then rhs + // and some derived parameters: + // - number of tasks (nm, nn, nk) + // - number of kernels (nm0, nn0) + // Unfortunately, all these parameters are tightly interdependent. + // So in some cases we first compute approximate values, then compute other + // values based on these approximations and then refine the approximations. + + // There are lots of heuristics here. There is some reasoning behind them, + // but ultimately they are just tuned on contraction benchmarks for + // different input configurations, thread counts and instruction sets. + // So feel free to question any of them. + + // Compute whether we want to shard by row or by column. + // This is a first approximation, it will be refined later. Since we don't + // know number of threads yet we use 2, because what we are most + // interested in at this point is whether it makes sense to use + // parallelization at all or not. + bool shard_by_col = shardByCol(m, n, 2); + + // First approximation of kernel blocking sizes. + // Again, we don't know number of threads yet, so we use 2. 
+ Index bm, bn, bk; + if (shard_by_col) { + internal::TensorContractionBlocking + blocking(k, m, n, 2); + bm = blocking.mc(); + bn = blocking.nc(); + bk = blocking.kc(); + } else { + internal::TensorContractionBlocking + blocking(k, m, n, 2); + bm = blocking.mc(); + bn = blocking.nc(); + bk = blocking.kc(); + } + + // Compute optimal number of threads. + // Note: we use bk instead of k here because we are interested in amount of + // _parallelizable_ computations, and computations are not parallelizable + // across k dimension. + const TensorOpCost cost = + contractionCost(m, n, bm, bn, bk, shard_by_col, false); + int num_threads = TensorCostModel::numThreads( + static_cast(n) * m, cost, this->m_device.numThreads()); + int num_threads_by_k = numThreadsInnerDim(m, n, k); + if (shardByInnerDim(m, n, k, num_threads, num_threads_by_k)) { + // We are in the scenario where it is more effective to shard by the + // inner dimension. + if (IsEvalInSyncMode) { + EvalShardedByInnerDimContext ctx( + this, num_threads_by_k, buffer, m, n, k, std::move(done)); + ctx.template run(); + } else { + auto* ctx = new EvalShardedByInnerDimContext( + this, num_threads_by_k, buffer, m, n, k, std::move(done)); + ctx->template runAsync(); + } + + return; + } + + // TODO(dvyukov): this is a stop-gap to prevent regressions while the cost + // model is not tuned. Remove this when the cost model is tuned. + if (n == 1) num_threads = 1; + + if (num_threads == 1) { + TENSOR_CONTRACTION_DISPATCH(this->template evalProductSequential, + Unaligned, (buffer)); + if (!IsEvalInSyncMode) done(); + return; + } + + // Now that we know number of threads, recalculate sharding and blocking. 
+ shard_by_col = shardByCol(m, n, num_threads); + if (shard_by_col) { + internal::TensorContractionBlocking + blocking(k, m, n, num_threads); + bm = blocking.mc(); + bn = blocking.nc(); + bk = blocking.kc(); + } else { + internal::TensorContractionBlocking + blocking(k, m, n, num_threads); + bm = blocking.mc(); + bn = blocking.nc(); + bk = blocking.kc(); + } + + // Number of kernels for each dimension. + Index nm0 = divup(m, bm); + Index nn0 = divup(n, bn); + Index nk = divup(k, bk); + + // Calculate task grain size (number of kernels executed per task). + // This task size coarsening serves two purposes: + // 1. It reduces per-task overheads including synchronization overheads. + // 2. It allows to use caches better (reuse the same packed rhs in several + // consecutive kernels). + Index gm = 1; + Index gn = 1; + // If we are sharding by column, then we prefer to reduce rows first. + if (shard_by_col) { + gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col); + gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col); + } else { + gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col); + gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col); + } + // Number of tasks in each dimension. + Index nm = divup(nm0, gm); + Index nn = divup(nn0, gn); + + // If there is enough concurrency in the sharding dimension, we choose not + // to paralellize by the other dimension, and execute all kernels in sync + // mode. This reduces parallelism from the nm x nn down to nn + // (shard_by_col==true) or nm (shard_by_col==false). + const Index sharding_dim_tasks = shard_by_col ? nn : nm; + const int num_worker_threads = this->m_device.numThreadsInPool(); + + // With small number of threads we want to make sure that we do not reduce + // parallelism too much. With large number of threads we trade maximum + // parallelism for better memory locality. + const float oversharding_factor = + num_worker_threads <= 4 ? 8.0 : + num_worker_threads <= 8 ? 
4.0 : + num_worker_threads <= 16 ? 2.0 : + num_worker_threads <= 32 ? 1.0 : + num_worker_threads <= 64 ? 0.8 : /* num_worker_threads > 64 */ 0.6; + + const bool parallelize_by_sharding_dim_only = + sharding_dim_tasks >= oversharding_factor * num_worker_threads; + + // Last by not least, decide whether we want to issue both lhs and rhs + // packing in parallel; or issue lhs packing first, and then issue rhs + // packing when lhs packing completes (for !shard_by_col lhs and rhs are + // swapped). Parallel packing allows more parallelism (for both packing and + // kernels), while sequential packing provides better locality (once + // a thread finishes rhs packing it proceed to kernels with that rhs). + // First, we are interested in parallel packing if there are few tasks. + bool parallel_pack = num_threads >= nm * nn; + // Also do parallel packing if all data fits into L2$. + if (m * bk * Index(sizeof(LhsScalar)) + n * bk * Index(sizeof(RhsScalar)) <= + l2CacheSize() * num_threads) + parallel_pack = true; + // But don't do it if we will use each rhs only once. Locality seems to be + // more important in this case. + if ((shard_by_col ? nm : nn) == 1) parallel_pack = false; + // Also don't get in the way of parallelize_by_sharding_dim_only + // optimization. + if (parallelize_by_sharding_dim_only) parallel_pack = false; + + // TODO(ezhulnev): With if contexpr we don't need SyncEvalParallelContext. 
+ if (IsEvalInSyncMode) { +#define CONTEXT_ARGS \ + (this, num_threads, buffer, m, n, k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, \ + nn0, shard_by_col, parallel_pack, parallelize_by_sharding_dim_only, \ + NoCallback()) \ + .run() + TENSOR_CONTRACTION_DISPATCH(SyncEvalParallelContext, Alignment, + CONTEXT_ARGS); +#undef CONTEXT_ARGS + + } else { +#define CONTEXT_ARGS \ + (this, num_threads, buffer, m, n, k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, \ + nn0, shard_by_col, parallel_pack, parallelize_by_sharding_dim_only, \ + std::move(done)) + TENSOR_CONTRACTION_ASYNC_DISPATCH(EvalParallelContext, DoneCallback, + Alignment, CONTEXT_ARGS, run()); +#undef CONTEXT_ARGS + } + } + + // ------------------------------------------------------------------------ // + + // Dummy struct to represent an empty DoneCallback. + + struct NoCallback { + void operator()() { + eigen_assert(false && "NoCallback should never be called"); + } + }; + + // ------------------------------------------------------------------------ // + + template + class EvalParallelNotification; + + // Synchronous evaluation notification that blocks caller thread in Wait(). + template + class EvalParallelNotification { + public: + EvalParallelNotification(Context*, NoCallback) {} + void Notify() { done_.Notify(); } + void Wait() { done_.Wait(); } + private: + Eigen::Notification done_; + }; + + // Asynchronous evaluation notification that does not block in Wait(). + template + class EvalParallelNotification { + public: + EvalParallelNotification(Context* ctx, DoneCallback done) + : ctx_(ctx), done_(std::move(done)) {} + + void Notify() { + // Make a copy of done callback, because it will be destructed when we + // will delete context in the next line (EvalParallelNotification is a + // data member of EvalParallelContext class). + DoneCallback done_copy = std::move(done_); + + // Delete parallel evaluation context. + delete ctx_; + + // Now safely call the done callback. 
+ done_copy(); + } + + void Wait() {} + + private: + Context* ctx_; + DoneCallback done_; + }; + + // Context orchestrates sync/async parallel contraction evaluation. When it is + // executed in asynchronous mode, it owns all the shared state that might be + // accessible by block packing and kernel tasks. + + template + class EvalParallelContext { + public: + typedef internal::TensorContractionInputMapper< + LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t, + contract_t, internal::packet_traits::size, + lhs_inner_dim_contiguous, false, Unaligned> + LhsMapper; + typedef internal::TensorContractionInputMapper< + RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t, + contract_t, internal::packet_traits::size, + rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned> + RhsMapper; + + typedef internal::blas_data_mapper OutputMapper; + + typedef internal::TensorContractionKernel< + Scalar, LhsScalar, RhsScalar, Index, OutputMapper, LhsMapper, RhsMapper> + TensorContractionKernel; + + typedef typename TensorContractionKernel::LhsBlock LhsBlock; + typedef typename TensorContractionKernel::RhsBlock RhsBlock; + typedef typename TensorContractionKernel::BlockMemHandle BlockMemHandle; + + EvalParallelContext(const Self* self, int num_threads, Scalar* buffer, + Index tm, Index tn, Index tk, Index bm, Index bn, + Index bk, Index nm, Index nn, Index nk, Index gm, + Index gn, Index nm0, Index nn0, bool shard_by_col, + bool parallel_pack, + bool parallelize_by_sharding_dim_only, + DoneCallback done) + : created_by_thread_id_(std::this_thread::get_id()), + done_(this, std::move(done)), + device_(self->m_device), + lhs_(self->m_leftImpl, self->m_left_nocontract_strides, + self->m_i_strides, self->m_left_contracting_strides, + self->m_k_strides), + rhs_(self->m_rightImpl, self->m_right_nocontract_strides, + self->m_j_strides, self->m_right_contracting_strides, + self->m_k_strides), + buffer_(buffer), + output_(buffer, tm), + 
output_kernel_(self->m_output_kernel), + tensor_contraction_params_(self->m_tensor_contraction_params), + num_threads_(num_threads), + shard_by_col_(shard_by_col), + parallel_pack_(parallel_pack), + parallelize_by_sharding_dim_only_(parallelize_by_sharding_dim_only), + m_(tm), + n_(tn), + k_(tk), + bm_(bm), + bn_(bn), + bk_(bk), + nm_(nm), + nn_(nn), + nk_(nk), + gm_(gm), + gn_(gn), + nm0_(nm0), + nn0_(nn0), + kernel_(m_, k_, n_, bm_, bk_, bn_), + num_thread_local_allocations_(0), + // We reserve 2X more capacity for a thread local values, than the + // number of threads in the pool to efficiently handle task stealing + // by threads that are not managed by the pool. + thread_local_capacity(2 * (parallelize_by_sharding_dim_only_ + ? device_.numThreadsInPool() + : 0)), + // We will use only one of the Lhs/Rhs thread local storage depending + // on the shard_by_col value and we parallelize by sharding dim ONLY. + lhs_thread_local_blocks_(shard_by_col_ ? 0 : thread_local_capacity, + {*this}, {*this}), + rhs_thread_local_blocks_(shard_by_col_ ? thread_local_capacity : 0, + {*this}, {*this}) { + // These two options are mutually exclusive. + eigen_assert(!(parallel_pack && parallelize_by_sharding_dim_only)); + + for (Index x = 0; x < P; x++) { + // Normal number of notifications for k slice switch is + // nm_ + nn_ + nm_ * nn_. However, first P - 1 slices will receive only + // nm_ + nn_ notifications, because they will not receive notifications + // from preceding kernels. + state_switch_[x] = + x == 0 + ? 1 + : (parallel_pack_ ? nn_ + nm_ : (shard_by_col_ ? nn_ : nm_)) + + (x == P - 1 ? nm_ * nn_ : 0); + state_packing_ready_[x] = + parallel_pack_ ? 0 : (shard_by_col_ ? nm_ : nn_); + state_kernel_[x] = new std::atomic*[nm_]; + for (Index m = 0; m < nm_; m++) { + state_kernel_[x][m] = new std::atomic[nn_]; + // Kernels generally receive 3 notifications (previous kernel + 2 + // packing), but the first slice won't get notifications from previous + // kernels. 
+ for (Index n = 0; n < nn_; n++) + state_kernel_[x][m][n].store( + (x == 0 ? 0 : 1) + (parallel_pack_ ? 2 : 1), + std::memory_order_relaxed); + } + } + + // Allocate memory for packed rhs/lhs matrices. + packed_mem_ = kernel_.allocateSlices( // + device_, // + /*num_lhs=*/nm0_, // + /*num_rhs=*/nn0_, // + /*num_slices=*/std::min(nk_, P - 1), // + packed_lhs_, packed_rhs_); + + if (parallelize_by_sharding_dim_only_) { + const int num_worker_threads = device_.numThreadsInPool(); + + if (shard_by_col) { + can_use_thread_local_packed_ = new std::atomic[nn_]; + for (int i = 0; i < nn_; ++i) + can_use_thread_local_packed_[i].store(true, + std::memory_order_relaxed); + + Index num_blocks = num_worker_threads * gn_; + thread_local_pre_alocated_mem_ = kernel_.allocateSlices( // + device_, // + /*num_lhs=*/0, // + /*num_rhs=*/num_blocks, // + /*num_slices=*/1, // + /*lhs_blocks=*/nullptr, &rhs_thread_local_pre_allocated_); + + } else { + can_use_thread_local_packed_ = new std::atomic[nm_]; + for (int i = 0; i < nm_; ++i) + can_use_thread_local_packed_[i].store(true, + std::memory_order_relaxed); + + Index num_blocks = num_worker_threads * gm_; + thread_local_pre_alocated_mem_ = kernel_.allocateSlices( // + device_, // + /*num_lhs=*/num_blocks, // + /*num_rhs=*/0, // + /*num_slices=*/1, &lhs_thread_local_pre_allocated_, // + /*rhs_blocks=*/nullptr); + } + } + } + + ~EvalParallelContext() { + for (Index x = 0; x < P; x++) { + for (Index m = 0; m < nm_; m++) delete[] state_kernel_[x][m]; + delete[] state_kernel_[x]; + } + kernel_.deallocate(device_, packed_mem_); + if (parallelize_by_sharding_dim_only_) { + kernel_.deallocate(device_, thread_local_pre_alocated_mem_); + delete[] can_use_thread_local_packed_; + } + } + + void run() { + // Kick off packing of the first slice. + signal_switch(0, 1); + + // Wait for overall completion. + // + // If parallel evaluation is executed in async mode, this is a no-op, and + // Wait() will return immediately. 
In synchronous mode it will block the + // caller thread until it will receive notification from last task. + // + // In async mode, last task when completed will call done callback from + // the same thread, and will delete this context. + // + // TODO(dvyukov): This wait can lead to deadlock if contraction is + // evaluated in synchronous mode. If nthreads contractions are + // concurrently submitted from worker threads, this wait will block all + // worker threads and the system will deadlock. + done_.Wait(); + } + + private: + std::thread::id created_by_thread_id_; + + // This notification is specialized on the type of DoneCallback and can be + // blocking or non-blocking. + EvalParallelNotification done_; + + const Device& device_; + LhsMapper lhs_; + RhsMapper rhs_; + Scalar* const buffer_; + OutputMapper output_; + OutputKernelType output_kernel_; + TensorContractionParams tensor_contraction_params_; + const int num_threads_; + const bool shard_by_col_; + const bool parallel_pack_; + const bool parallelize_by_sharding_dim_only_; + // Matrix sizes. + const Index m_; + const Index n_; + const Index k_; + // Block sizes. + const Index bm_; + const Index bn_; + const Index bk_; + // Number of tasks. + const Index nm_; + const Index nn_; + const Index nk_; + // Task grain sizes (number of kernels executed per task). + const Index gm_; + const Index gn_; + // Number of blocks (this is different from ni_/nn_ because of task size + // coarsening). + const Index nm0_; + const Index nn0_; + // Tensor contraction kernel. + TensorContractionKernel kernel_; + + // Parallelization strategy. + // + // Blocks related to the same k block can run in parallel because they write + // to different output blocks. So we parallelize within k slices, this + // gives us parallelism level of m x n. Before we can start any kernels + // related to k-th slice, we need to issue m lhs packing tasks and n rhs + // packing tasks. 
+ // + // However, there is a bottleneck when we are finishing kernels for k-th + // slice (at the very end there is only 1 runnable kernel). To mitigate this + // bottleneck we allow kernels from k-th and k+1-th slices to run in + // parallel. Note that (m, n, k) and (m, n, k+1) kernels write to the same + // output block, so they must not run in parallel. + // + // This gives us the following dependency graph. + // On each k slice we have m x n kernel tasks, m lhs packing tasks and n rhs + // packing tasks. + // Kernel (m, n, k) can start when: + // - kernel (m, n, k-1) has finished + // - lhs packing (m, k) has finished + // - rhs packing (n, k) has finished + // Lhs/rhs packing can start when: + // - all k-1 packing has finished (artificially imposed to limit amount of + // parallel packing) + // + // On top of that we limit runnable tasks to two consecutive k slices. + // This is done to limit amount of memory we need for packed lhs/rhs + // (for each k slice we need m*bk + n*bk memory in packed_lhs_/packed_rhs_). + // + // state_switch_ tracks when we are ready to switch to the next k slice. + // state_kernel_[m][n] tracks when we are ready to kick off kernel (m, n). + // These variables are rolling over 3 consecutive k slices: first two we are + // actively executing + one to track completion of kernels in the second + // slice. + static const Index P = 3; + + // Handle to the allocated temporary storage for Lhs/Rhs blocks. + BlockMemHandle packed_mem_; + std::vector packed_lhs_[P - 1]; + std::vector packed_rhs_[P - 1]; + + // If we choose to parallelize only by the sharding dimension, each thread + // will have its own "thread local" (not a c++ thread local storage) memory + // for packed_lhs or packed_rhs (shard_by_col = false or true). This memory + // can't be passed to a kernel that might execute on a different thread.
+ // + // In practice when we are ready to pack memory for the sharding dimension + // (rhs if shard_by_col==true) of the K-th slice, all kernels for K-1 slice + // are already computed (99% of the time), and we can pack data into the thread + // local storage, and guarantee that all the kernels will be executed + // immediately in the same thread. This significantly increases L1 cache hit + // ratio and reduces pressure on the memory bus. + // + // It's still possible that kernel for the K-th slice will be ready before + // completion of the K-1 kernel, so we have to allocate "global" packed_lhs_ + // and packed_rhs_ to allow kernels to be executed later on a thread + // different from the thread that was used for packing. + + // Handle for pre-allocated thread local memory buffers. + BlockMemHandle thread_local_pre_alocated_mem_; + + // Only one of these will be initialized depending on shard_by_col value + // (the size will be `num_worker_threads * num_grains_in_the_sharding_dim`). + std::vector lhs_thread_local_pre_allocated_; + std::vector rhs_thread_local_pre_allocated_; + + // How many thread local blocks were already allocated. + std::atomic num_thread_local_allocations_; + const int thread_local_capacity; + + // We will use pre-allocated Lhs/Rhs blocks defined above, if the number of + // unique threads in a system is below or equal to the number of threads in + // a thread pool. We will fall back on dynamic memory allocation after that. + + // ThreadLocalBlocks is a container for Lhs or Rhs thread local buffers. Its + // size is equal to the grain size in Lhs/Rhs sharding dimension.
+ template + class ThreadLocalBlocks { + public: + ThreadLocalBlocks() = default; + + ThreadLocalBlocks(BlockType* base, size_t grain_size) + : is_pre_allocated_(true), + thread_local_pre_allocated_base_(base), + grain_size_(grain_size) {} + + ThreadLocalBlocks(BlockMemHandle mem_handle, + std::vector blocks) + : is_pre_allocated_(false), + mem_handle_(std::move(mem_handle)), + blocks_(std::move(blocks)) {} + + BlockType& block(int grain_index) { + eigen_assert(grain_index >= 0); + eigen_assert(static_cast(grain_index) < size()); + return is_pre_allocated_ ? thread_local_pre_allocated_base_[grain_index] + : blocks_[grain_index]; + } + + void Release(EvalParallelContext& ctx) const { + if (!is_pre_allocated_) { + ctx.kernel_.deallocate(ctx.device_, mem_handle_); + } + } + + size_t size() const { + return is_pre_allocated_ ? grain_size_ : blocks_.size(); + } + + private: + bool is_pre_allocated_ = false; // Never indeterminate after default-init; size()/Release() read it. + + // Reuse pre-allocated thread local buffers. + BlockType* thread_local_pre_allocated_base_ = nullptr; + size_t grain_size_ = 0; + + // These will be initialized only if `is_pre_allocated == false`. + BlockMemHandle mem_handle_{}; + std::vector blocks_; + }; + + // ThreadLocalBlocksInitialize callable does custom thread local blocks + // initialization, and will reuse pre-allocated buffers if possible, or will + // dynamically allocate new memory. + // + // Lhs/Rhs blocks might be of the same type, so we have to pass explicitly + // for what side do we plan to do block allocation.
+ template + class ThreadLocalBlocksInitialize { + static constexpr bool kIsLhs = + !is_rhs && std::is_same::value; + static const bool kIsRhs = + is_rhs && std::is_same::value; + static_assert(kIsLhs || kIsRhs, "Unkown block type"); + + using Blocks = ThreadLocalBlocks; + + public: + ThreadLocalBlocksInitialize(EvalParallelContext& ctx) + : ctx_(ctx), + num_worker_threads_(ctx_.device_.numThreadsInPool()) {} + + void operator()(Blocks& blocks) { + const int n = ctx_.num_thread_local_allocations_.fetch_add( + 1, std::memory_order_relaxed); + + if (n >= num_worker_threads_) { + ThreadLocalBlocksAllocator::allocate(ctx_, blocks); + } else { + ThreadLocalBlocksAllocator::reuse(ctx_, n, blocks); + } + } + + private: + // NOTE(ezhulenev): Without 'if constexpr' we have to put calls to + // TensorContractionKernel::allocateSlices into template specializations. + // Also explicit specializations are not allowed at class scope in C++03, + // EvalCtx type parameter is just a workaround for that limitation. 
+ template + struct ThreadLocalBlocksAllocator; + + template + struct ThreadLocalBlocksAllocator { + static void allocate(EvalCtx& ctx, Blocks& blocks) { + std::vector rhs_blocks; + BlockMemHandle mem_handle = ctx.kernel_.allocateSlices( + ctx.device_, + /*num_lhs=*/0, + /*num_rhs=*/ctx.gn_, + /*num_slices=*/1, + /*lhs_blocks=*/nullptr, /*rhs_blocks=*/&rhs_blocks); + + blocks = ThreadLocalBlocks(std::move(mem_handle), + std::move(rhs_blocks)); + } + + static void reuse(EvalCtx& ctx, int index, Blocks& blocks) { + RhsBlock* ptr = &ctx.rhs_thread_local_pre_allocated_[ctx.gn_ * index]; + blocks = ThreadLocalBlocks(ptr, ctx.gn_); + } + }; + + template + struct ThreadLocalBlocksAllocator { + static void allocate(EvalCtx& ctx, Blocks& blocks) { + std::vector lhs_blocks; + BlockMemHandle mem_handle = ctx.kernel_.allocateSlices( + ctx.device_, + /*num_lhs=*/ctx.gm_, + /*num_rhs=*/0, + /*num_slices=*/1, + /*lhs_blocks=*/&lhs_blocks, /*rhs_blocks=*/nullptr); + + blocks = ThreadLocalBlocks(std::move(mem_handle), + std::move(lhs_blocks)); + } + + static void reuse(EvalCtx& ctx, int index, Blocks& blocks) { + LhsBlock* ptr = &ctx.lhs_thread_local_pre_allocated_[ctx.gm_ * index]; + blocks = ThreadLocalBlocks(ptr, ctx.gm_); + } + }; + + EvalParallelContext& ctx_; + const int num_worker_threads_; + }; + + template + class ThreadLocalBlocksRelease { + public: + using Blocks = ThreadLocalBlocks; + ThreadLocalBlocksRelease(EvalParallelContext& ctx) : ctx_(ctx) {} + void operator()(Blocks& blocks) { blocks.Release(ctx_); } + + private: + EvalParallelContext& ctx_; + }; + + // ThreadLocalBlocks initialization callables. + using ThreadLocalLhsInit = + ThreadLocalBlocksInitialize; + using ThreadLocalRhsInit = + ThreadLocalBlocksInitialize; + + // ThreadLocalBlocks release callables. + using ThreadLocalLhsRelease = ThreadLocalBlocksRelease; + using ThreadLocalRhsRelease = ThreadLocalBlocksRelease; + + // Thread local containers for Lhs/Rhs block packs. 
In practice only one of + // them will be used, depending on the shard_by_col value. + Eigen::ThreadLocal, ThreadLocalLhsInit, + ThreadLocalLhsRelease> + lhs_thread_local_blocks_; + Eigen::ThreadLocal, ThreadLocalRhsInit, + ThreadLocalRhsRelease> + rhs_thread_local_blocks_; + + // After a particular shard for Kth slice missed thread local execution + // opportunity (K-1 slice didn't complete kernels execution), we can no + // longer schedule K+1 and following slices in thread local mode, because + // there is no more guarantee that previous kernels were executed + // sequentially in the same thread (size is nn_ or nm_). + std::atomic* can_use_thread_local_packed_; + + std::atomic** state_kernel_[P]; + // state_switch_ is frequently modified by worker threads, while other + // fields are read-only after constructor. Let's move it to a separate cache + // line to reduce cache-coherency traffic. + char pad_[128]; + std::atomic state_packing_ready_[P]; + std::atomic state_switch_[P]; + + LhsBlock& packed_lhs(Index m, Index k, Index m1, bool use_thread_local) { + if (use_thread_local) { + eigen_assert(!shard_by_col_); + ThreadLocalBlocks& blocks = lhs_thread_local_blocks_.local(); + + Index grain_index = m1 - m * gm_; + return blocks.block(internal::convert_index(grain_index)); // FIXME better make ThreadLocalBlocks use Eigen::Index? + } else { + return packed_lhs_[k % (P - 1)][m1]; + } + } + + RhsBlock& packed_rhs(Index n, Index k, Index n1, bool use_thread_local) { + if (use_thread_local) { + eigen_assert(shard_by_col_); + ThreadLocalBlocks& blocks = rhs_thread_local_blocks_.local(); + + Index grain_index = n1 - n * gn_; + return blocks.block(internal::convert_index(grain_index)); // FIXME better make ThreadLocalBlocks use Eigen::Index? 
+ } else { + return packed_rhs_[k % (P - 1)][n1]; + } + } + + // In following two methods (pack_lhs and pack_rhs), if we know for sure + // that we'll be able to immediately call a kernel with packed data, and do + // not submit it to the thread pool, we can use thread local memory for + // packed data. + // + // We can only reliably check it if we are running all kernels in sync mode + // (parallelize only by sharding dim). If kernel for m==0 (n==0) is ready to + // run, it's guaranteed that all kernels with larger values of m (n) are + // also ready, because we execute them in the same order for all K slices. + + void pack_lhs(Index m, Index k) { + bool use_thread_local = false; + + if (parallelize_by_sharding_dim_only_ && !shard_by_col_ && + can_use_thread_local_packed_[m].load(std::memory_order_relaxed)) { + if (state_kernel_[k % P][m][0].load(std::memory_order_relaxed) == 1) { + use_thread_local = true; + } else { + // If we can't guarantee that all kernels in `k` slice will be + // executed sequentially in current thread, it's no longer safe to use + // thread local memory in following slices along the k dimensions. 
+ eigen_assert(k > 0); + can_use_thread_local_packed_[m].store(false, + std::memory_order_relaxed); + } + } + + const Index mend = m * gm_ + gm(m); + for (Index m1 = m * gm_; m1 < mend; m1++) + kernel_.packLhs(&packed_lhs(m, k, m1, use_thread_local), + lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1)); + + if (!parallel_pack_ && shard_by_col_) { + assert(!use_thread_local); + signal_packing(k); + } else { + signal_switch(k + 1); + for (Index n = nn_ - 1; n >= 0; n--) { + bool sync = parallelize_by_sharding_dim_only_ || n == 0; + signal_kernel(m, n, k, sync, use_thread_local); + } + } + } + + void pack_rhs(Index n, Index k) { + bool use_thread_local = false; + + if (parallelize_by_sharding_dim_only_ && shard_by_col_ && + can_use_thread_local_packed_[n].load(std::memory_order_relaxed)) { + if (state_kernel_[k % P][0][n].load(std::memory_order_relaxed) == 1) { + use_thread_local = true; + } else { + // If we can't guarantee that all kernels in `k` slice will be + // executed sequentially in current thread, it's no longer safe to use + // thread local memory in followig slices along the k dimensions. + eigen_assert(k > 0); + can_use_thread_local_packed_[n].store(false, + std::memory_order_relaxed); + } + } + + const Index nend = n * gn_ + gn(n); + for (Index n1 = n * gn_; n1 < nend; n1++) { + if (!TensorContractionKernel::HasBeta && k == 0) { + // Zero the output memory in parallel, only if contraction kernel does + // not support `beta`. Otherwise we will pass beta 0.0 to the first + // call to the `TensorContractionKernel::invoke()`. + // + // On 10000x2x10000 mm zeroing can easily take half of time. Zero (bn + // x m) row. Safe to do here because all kernels that will write to + // this memory depend on completion of this task. Note: don't call + // device_.memset() here. device_.memset() blocks on thread pool + // worker thread, which can lead to underutilization and deadlocks. 
+ memset(buffer_ + n1 * bn_ * m_, 0, bn(n1) * m_ * sizeof(Scalar)); + } + kernel_.packRhs(&packed_rhs(n, k, n1, use_thread_local), + rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1)); + } + + if (parallel_pack_ || shard_by_col_) { + signal_switch(k + 1); + for (Index m = nm_ - 1; m >= 0; m--) { + bool sync = parallelize_by_sharding_dim_only_ || m == 0; + signal_kernel(m, n, k, sync, use_thread_local); + } + } else { + assert(!use_thread_local); + signal_packing(k); + } + } + + void kernel(Index m, Index n, Index k, bool use_thread_local) { + // Note: order of iteration matters here. Iteration over m is innermost + // because we want to reuse the same packed rhs in consecutive tasks + // (rhs fits into L2$ while lhs only into L3$). + const Index nend = n * gn_ + gn(n); + const Index mend = m * gm_ + gm(m); + + // NOTE: output = alpha * LHS * RHS + beta * output. + const Scalar alpha = Scalar(1); + const Scalar beta = + (TensorContractionKernel::HasBeta && k == 0) ? Scalar(0) : Scalar(1); + + if (shard_by_col_) { + for (Index n1 = n * gn_; n1 < nend; n1++) { + for (Index m1 = m * gm_; m1 < mend; m1++) { + const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_); + kernel_.invoke( + output_mapper, + packed_lhs(m, k, m1, !shard_by_col_ && use_thread_local), + packed_rhs(n, k, n1, shard_by_col_ && use_thread_local), bm(m1), + bk(k), bn(n1), alpha, beta); + + // We are done with the last task for the [m1, n1] block. 
+ if (k + 1 == nk_) { + output_kernel_(output_mapper, tensor_contraction_params_, + m1 * bm_, n1 * bn_, bm(m1), bn(n1)); + } + } + } + } else { + for (Index m1 = m * gm_; m1 < mend; m1++) + for (Index n1 = n * gn_; n1 < nend; n1++) { + const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_); + kernel_.invoke( + output_mapper, + packed_lhs(m, k, m1, !shard_by_col_ && use_thread_local), + packed_rhs(n, k, n1, shard_by_col_ && use_thread_local), bm(m1), + bk(k), bn(n1), alpha, beta); + + // We are done with the last task for the [m1, n1] block. + if (k + 1 == nk_) { + output_kernel_(output_mapper, tensor_contraction_params_, + m1 * bm_, n1 * bn_, bm(m1), bn(n1)); + } + } + } + signal_kernel(m, n, k + 1, /*sync=*/false, /*use_thread_local=*/false); + signal_switch(k + 2); + } + + void signal_packing(Index k) { + eigen_assert(!parallel_pack_); + Index s = state_packing_ready_[k % P].fetch_sub(1); + eigen_assert(s > 0); + if (s != 1) return; + state_packing_ready_[k % P] = shard_by_col_ ? nm_ : nn_; + enqueue_packing(k, shard_by_col_); + } + + void signal_kernel(Index m, Index n, Index k, bool sync, + bool use_thread_local) { + std::atomic* state = &state_kernel_[k % P][m][n]; + Index s = state->load(); + eigen_assert(s > 0); + if (s != 1 && state->fetch_sub(1) != 1) { + eigen_assert(!use_thread_local); + return; + } + state->store(parallel_pack_ ? 3 : 2, std::memory_order_relaxed); + if (sync) { + kernel(m, n, k, use_thread_local); + } else { + eigen_assert(!use_thread_local); + device_.enqueueNoNotification( + [=]() { kernel(m, n, k, use_thread_local); }); + } + } + + void signal_switch(Index k, Index v = 1) { + Index s = state_switch_[k % P].fetch_sub(v); + eigen_assert(s >= v); + if (s != v) return; + + // Ready to switch to the next k slice. + // Reset counter for the next iteration. + state_switch_[k % P] = + (parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_)) + + nm_ * nn_; + if (k < nk_) { + // Issue lhs/rhs packing. 
Their completion will in turn kick off + // kernels. + if (parallel_pack_) { + enqueue_packing(k, !shard_by_col_); + enqueue_packing(k, shard_by_col_); + } else if (shard_by_col_) { + enqueue_packing(k, false); + } else { + enqueue_packing(k, true); + } + + // Termination handling. + // Because kernel completion signals k + 2 switch, we need to finish nk + // + 2 slices without issuing any tasks on nk + 1 slice. So here we + // pretend that all nk + 1 packing tasks just finish instantly; so that + // nk + 2 switch only waits for completion of nk kernels. + } else if (k == nk_) { + signal_switch(k + 1, + parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_)); + } else { + done_.Notify(); + } + } + + // Enqueue all rhs/lhs packing for k-th slice. + void enqueue_packing(Index k, bool rhs) { + enqueue_packing_helper(0, rhs ? nn_ : nm_, k, rhs); + } + + void enqueue_packing_helper(Index start, Index end, Index k, bool rhs) { + if (end - start == 1) { + if (rhs) + pack_rhs(start, k); + else + pack_lhs(start, k); + } else { + while (end - start > 1) { + Index mid = (start + end) / 2; + device_.enqueueNoNotification( + [=]() { enqueue_packing_helper(mid, end, k, rhs); }); + end = mid; + } + + // Decide if we want to run first packing task (start == 0) in + // async mode if we parallelize only by sharding dim: + // (1) pack_lhs and pack_rhs call signal_switch before completing + // all calls to signal_kernel, which in sync mode might lead + // to the execution of the first kernel of the k+1 slice, before + // completing a call to the last kernel of the k slice. + // (2) all pack tasks for sharded dim must be executed in a thread + // pool to get pre-allocated thead local buffers. 
+ bool pack_async = + (start == 0) && + (parallelize_by_sharding_dim_only_&& shard_by_col_ == rhs) && + (k > 0 || std::this_thread::get_id() == created_by_thread_id_); + + if (pack_async) { + device_.enqueueNoNotification( + [=]() { enqueue_packing_helper(start, end, k, rhs); }); + } else { + enqueue_packing_helper(start, end, k, rhs); + } + } + } + + // Block sizes with accounting for potentially incomplete last block. + Index bm(Index m) const { return m + 1 < nm0_ ? bm_ : m_ + bm_ - bm_ * nm0_; } + Index bn(Index n) const { return n + 1 < nn0_ ? bn_ : n_ + bn_ - bn_ * nn0_; } + Index bk(Index k) const { return k + 1 < nk_ ? bk_ : k_ + bk_ - bk_ * nk_; } + // Task grain sizes accounting for potentially incomplete last task. + Index gm(Index m) const { return m + 1 < nm_ ? gm_ : nm0_ + gm_ - gm_ * nm_; } + Index gn(Index n) const { return n + 1 < nn_ ? gn_ : nn0_ + gn_ - gn_ * nn_; } + + EvalParallelContext(const EvalParallelContext&) = delete; + void operator=(const EvalParallelContext&) = delete; + }; + + template + using SyncEvalParallelContext = + EvalParallelContext; + + // ------------------------------------------------------------------------ // + + // EvalShardedByInnerDimContext orchestrates sync/async contraction + // evaluation, when we shard by inner dimension. When it is executed in + // asynchronous mode, it owns all the shared state that might be accessible by + // block processing tasks. 
+ + template + struct EvalShardedByInnerDimContext { + EvalShardedByInnerDimContext(const Self* self, int num_threads, + Scalar* result_buffer, + Index m_size, Index n_size, Index k_size, + DoneCallback done_callback) + : evaluator(self), + m_lhs_inner_dim_contiguous(evaluator->m_lhs_inner_dim_contiguous), + m_rhs_inner_dim_contiguous(evaluator->m_rhs_inner_dim_contiguous), + m_rhs_inner_dim_reordered(evaluator->m_rhs_inner_dim_reordered), + result(result_buffer), + m(m_size), + n(n_size), + k(k_size), + done(std::move(done_callback)), + buffer_size_bytes(m * n * sizeof(Scalar)), + block_size(blockSize(k, num_threads)), + num_blocks(divup(k, block_size)), + num_pending_blocks(internal::convert_index(num_blocks)), + l0_ranges(divup(num_blocks, l0_size)), + l0_state(l0_ranges), + block_buffers(num_blocks) { + // Keep count of pending gemm tasks for each l0 range. + for (int i = 0; i < l0_ranges; ++i) { + const Index num_pending_tasks = actualRangeSize(l0_ranges, l0_size, i); + l0_state.emplace_back(internal::convert_index(num_pending_tasks)); + } + + // Allocate temporary buffers for each block. + for (Index block_idx = 0; block_idx < num_blocks; ++block_idx) { + Scalar* buf = block_idx == 0 + ? result + : static_cast(evaluator->m_device.allocate( + buffer_size_bytes)); + block_buffers.emplace_back(buf); + } + } + + ~EvalShardedByInnerDimContext() { + for (Index i = 1; i < num_blocks; ++i) { + evaluator->m_device.deallocate(block_buffers[i]); + } + } + + template + void run() { + Barrier barrier(internal::convert_index(num_blocks)); + eval(barrier, 0, num_blocks); + barrier.Wait(); + + // Aggregate partial sums from l0 ranges. + aggregateL0Blocks(); + + // Apply output kernel. + applyOutputKernel(); + } + + template + void runAsync() { + evalAsync(0, num_blocks); + } + + private: + // The underlying GEMM kernel assumes that k is a multiple of + // the packet size and subtle breakage occurs if this is violated. 
+ static const Index packet_size = internal::packet_traits::size; + + const Self* evaluator; // TensorContraction evaluator + + // These fields required fromTENSOR_CONTRACTION_DISPATCH macro. + bool m_lhs_inner_dim_contiguous; + bool m_rhs_inner_dim_contiguous; + bool m_rhs_inner_dim_reordered; + + Scalar* result; + + Index m; + Index n; + Index k; + + DoneCallback done; + + // ----------------------------------------------------------------------// + // Algorithm parameters. + + // We will compute partial results into the buffers of this size. + Index buffer_size_bytes; + + Index block_size; + Index num_blocks; + + // Keep track of pending tasks when evaluate in async mode. + std::atomic num_pending_blocks; + + // We compute partial gemm results in parallel, and to get the final result + // we need to add them all together. For the large number of threads (>= 48) + // this adds a very expensive sequential step at the end. + // + // We split the [0, num_blocks) into small ranges, and when a task for the + // block finishes its partial gemm computation, it checks if it was the last + // gemm in the range, and if so, it will add all blocks of the range. + // + // After all tasks done, we need to add only these pre-aggregated blocks. + + // For now we use just a single level of ranges to compute pre-aggregated + // partial sums, but in general we can use more layers to compute tree + // aggregation in parallel and reduce the size of the sequential step. + // + // TODO(ezhulenev): Add multilevel tree aggregation? Probably will make + // sense only if number of threads >= ~128? + static const Index l0_size = 4; + Index l0_ranges; + + // Keep count of pending gemm tasks for each l0 range. + MaxSizeVector> l0_state; // [0, l0_ranges) + + // Buffers allocated for each temporary block computation. 
+ MaxSizeVector block_buffers; // [0, num_blocks) + + template + void processBlock(Index block_idx, Index begin, Index end) { + Scalar* buf = block_buffers[block_idx]; + + TENSOR_CONTRACTION_DISPATCH( + evaluator->template evalGemmPartialWithoutOutputKernel, Alignment, + (buf, begin, end, + /*num_threads=*/internal::convert_index(num_blocks))); + + // Check if it was the last task in l0 range. + const Index l0_index = block_idx / l0_size; + const int v = l0_state[l0_index].fetch_sub(1); + eigen_assert(v >= 1); + + // If we processed the last block of the range, we can aggregate all + // partial results into the first block of the range. + if (v == 1) { + const Index rng_size = actualRangeSize(l0_ranges, l0_size, l0_index); + const Index dst_block_idx = l0_index * l0_size; + + if (rng_size == l0_size) { + addAllToBuffer( + m * n, + /*src_buf0=*/block_buffers[dst_block_idx + 1], + /*src_buf1=*/block_buffers[dst_block_idx + 2], + /*src_buf2=*/block_buffers[dst_block_idx + 3], + /*dst_buf= */ block_buffers[dst_block_idx]); + } else { + // Aggregate blocks of potentially incomplete last range. + for (int i = 1; i < rng_size; ++i) { + addToBuffer(m * n, + /*src_buf=*/block_buffers[dst_block_idx + i], + /*dst_buf=*/block_buffers[dst_block_idx]); + } + } + } + } + + // Aggregate partial sums from l0 ranges. 
+ template + void aggregateL0Blocks() const { + Index l0_index = 1; + + for (; l0_index + 2 < l0_ranges; l0_index += 3) { + addAllToBuffer( + m * n, + /*src_buf0=*/block_buffers[(l0_index + 0) * l0_size], + /*src_buf1=*/block_buffers[(l0_index + 1) * l0_size], + /*src_buf2=*/block_buffers[(l0_index + 2) * l0_size], + /*dst_buf= */ block_buffers[0]); + } + + for (; l0_index < l0_ranges; ++l0_index) { + addToBuffer(m * n, block_buffers[l0_index * l0_size], + block_buffers[0]); + } + } + + void applyOutputKernel() const { + typedef internal::blas_data_mapper OutputMapper; + evaluator->m_output_kernel( + OutputMapper(result, m), evaluator->m_tensor_contraction_params, + static_cast(0), static_cast(0), m, n); + } + + // Compute block size with accounting for potentially incomplete last block. + Index actualBlockSize(Index block_idx) const { + return block_idx + 1 < num_blocks + ? block_size + : k + block_size - block_size * num_blocks; + }; + + // Compute range size with accounting for potentially incomplete last range. + Index actualRangeSize(Index num_ranges, Index range_size, + Index range_idx) const { + eigen_assert(range_idx < num_ranges); + return range_idx + 1 < num_ranges + ? 
range_size + : num_blocks + range_size - range_size * num_ranges; + }; + + template + EIGEN_STRONG_INLINE static void addToBuffer(size_t n, const Scalar* src_buf, + Scalar* tgt_buf) { + const int output_packet_size = + internal::unpacket_traits::size; + size_t i = 0; + const size_t num_packets = n / output_packet_size; + for (; i < output_packet_size * num_packets; i += output_packet_size) { + const PacketReturnType src_val = + internal::pload(src_buf + i); + const PacketReturnType tgt_val = + internal::ploadt(tgt_buf + i); + const PacketReturnType sum = internal::padd(src_val, tgt_val); + internal::pstoret(tgt_buf + i, + sum); + } + for (; i < n; ++i) { + tgt_buf[i] += src_buf[i]; + } + } + + template + EIGEN_STRONG_INLINE static void addAllToBuffer(size_t n, + const Scalar* src_buf0, + const Scalar* src_buf1, + const Scalar* src_buf2, + Scalar* dst_buf) { + using ::Eigen::internal::padd; + using ::Eigen::internal::pload; + using ::Eigen::internal::ploadt; + using ::Eigen::internal::pstoret; + + const int output_packet_size = + internal::unpacket_traits::size; + + size_t i = 0; + const size_t num_packets = n / output_packet_size; + for (; i < output_packet_size * num_packets; i += output_packet_size) { + const auto src_val0 = pload(src_buf0 + i); + const auto src_val1 = pload(src_buf1 + i); + const auto src_val2 = pload(src_buf2 + i); + + const auto dst_val = ploadt(dst_buf + i); + const auto sum = + padd(padd(dst_val, src_val0), padd(src_val1, src_val2)); + + pstoret(dst_buf + i, sum); + } + for (; i < n; ++i) { + dst_buf[i] += src_buf0[i] + src_buf1[i] + src_buf2[i]; + } + } + + template + void eval(Barrier& barrier, Index start_block_idx, Index end_block_idx) { + while (end_block_idx - start_block_idx > 1) { + Index mid_block_idx = (start_block_idx + end_block_idx) / 2; + evaluator->m_device.enqueueNoNotification( + [this, &barrier, mid_block_idx, end_block_idx]() { + eval(barrier, mid_block_idx, end_block_idx); + }); + end_block_idx = mid_block_idx; + } + + 
Index block_idx = start_block_idx; + Index block_start = block_idx * block_size; + Index block_end = block_start + actualBlockSize(block_idx); + + processBlock(block_idx, block_start, block_end); + barrier.Notify(); + } + + template + void evalAsync(Index start_block_idx, Index end_block_idx) { + while (end_block_idx - start_block_idx > 1) { + Index mid_block_idx = (start_block_idx + end_block_idx) / 2; + evaluator->m_device.enqueueNoNotification( + [this, mid_block_idx, end_block_idx]() { + evalAsync(mid_block_idx, end_block_idx); + }); + end_block_idx = mid_block_idx; + } + + Index block_idx = start_block_idx; + + Index block_start = block_idx * block_size; + Index block_end = block_start + actualBlockSize(block_idx); + + processBlock(block_idx, block_start, block_end); + + int v = num_pending_blocks.fetch_sub(1); + eigen_assert(v >= 1); + + if (v == 1) { + // Aggregate partial sums from l0 ranges. + aggregateL0Blocks(); + + // Apply output kernel. + applyOutputKernel(); + + // NOTE: If we call `done` callback before deleting this (context), + // it might deallocate Self* pointer captured by context, and we'll + // fail in destructor trying to deallocate temporary buffers. + + // Move done call back from context before it will be destructed. + DoneCallback done_copy = std::move(done); + + // We are confident that we are the last one who touches context. + delete this; + + // Now safely call the done callback. + done_copy(); + } + } + + // Cost model doesn't capture well the cost associated with constructing + // tensor contraction mappers and computing loop bounds in gemm_pack_lhs + // and gemm_pack_rhs, so we specify minimum desired block size. + static Index blockSize(Index k, int num_threads) { + const auto round_up = [=](Index index) -> Index { + const Index kmultiple = packet_size <= 8 ? 
8 : packet_size; + return divup(index, kmultiple) * kmultiple; + }; + + const Index target_block_size = round_up(divup(k, num_threads)); + const Index desired_min_block_size = 12 * packet_size; + + return numext::mini( + k, numext::maxi(desired_min_block_size, target_block_size)); + } + + EvalShardedByInnerDimContext(const EvalShardedByInnerDimContext&) = delete; + void operator=(const EvalShardedByInnerDimContext&) = delete; + }; + + // ------------------------------------------------------------------------ // + + // Below are the function used by evalProductImpl heuristics, trying to select + // optimcal parameters for parallelization algorithm. + + // Decide whether we want to shard m x n contraction by columns or by rows. + static bool shardByCol(Index m, Index n, Index num_threads) { + // Note: we are comparing both n and m against Traits::nr, it is not + // a mistake. We are trying to figure out how both n and m will fit into + // the main sharding dimension. + + // Sharding by column is the default + // ... unless there is enough data for vectorization over rows + if (m / num_threads >= Traits::nr && + // and not enough data for vectorization over columns + (n / num_threads < Traits::nr || + // ... or barely enough data for vectorization over columns, + // but it is not evenly dividable across threads + (n / num_threads < 4 * Traits::nr && + (n % (num_threads * Traits::nr)) != 0 && + // ... and it is evenly dividable across threads for rows + ((m % (num_threads * Traits::nr)) == 0 || + // .. or it is not evenly dividable for both dimensions but + // there is much more data over rows so that corner effects are + // mitigated. + (m / n >= 6))))) + return false; + // Wait, or if matrices are just substantially prolonged over the other + // dimension. 
+ if (n / num_threads < 16 * Traits::nr && m > n * 32) return false; + return true; + } + + Index coarsenM(Index m, Index n, Index bm, Index bn, Index bk, Index gn, + int num_threads, bool shard_by_col) const { + Index gm = 1; + Index gm1 = 1; + Index nm0 = divup(m, bm); + Index nm1 = nm0; + for (;;) { + // Find the next candidate for m grain size. It needs to result in + // different number of blocks. E.g. if we have 10 kernels, we want to try + // 5 and 10, but not 6, 7, 8 and 9. + while (gm1 <= nm0 && nm1 == divup(nm0, gm1)) gm1++; + if (gm1 > nm0) break; + // Check the candidate. + int res = checkGrain(m, n, bm, bn, bk, gm1, gn, gm, gn, num_threads, + shard_by_col); + if (res < 0) break; + nm1 = divup(nm0, gm1); + if (res == 0) continue; + // Commit new grain size. + gm = gm1; + } + return gm; + } + + Index coarsenN(Index m, Index n, Index bm, Index bn, Index bk, Index gm, + int num_threads, bool shard_by_col) const { + Index gn = 1; + Index gn1 = 1; + Index nn0 = divup(n, bn); + Index nn1 = nn0; + for (;;) { + while (gn1 <= nn0 && nn1 == divup(nn0, gn1)) gn1++; + if (gn1 > nn0) break; + int res = checkGrain(m, n, bm, bn, bk, gm, gn1, gm, gn, num_threads, + shard_by_col); + if (res < 0) break; + nn1 = divup(nn0, gn1); + if (res == 0) continue; + gn = gn1; + } + return gn; + } + + // checkGrain checks whether grain (gm, gn) is suitable and is better than + // (oldgm, oldgn). + int checkGrain(Index m, Index n, Index bm, Index bn, Index bk, Index gm, + Index gn, Index oldgm, Index oldgn, int num_threads, + bool shard_by_col) const { + const TensorOpCost cost = + contractionCost(bm * gm, bn * gn, bm, bn, bk, shard_by_col, true); + double taskSize = TensorCostModel::taskSize( + static_cast(bm) * gm * bn * gn, cost); + // If the task is too small, then we agree on it regardless of anything + // else. Otherwise synchronization overheads will dominate. + if (taskSize < 1) return 1; + // If it is too large, then we reject it and all larger tasks. 
+ if (taskSize > 2) return -1; + // Now we are in presumably good task size range. + // The main deciding factor here is parallelism. Consider that we have 12 + // kernels and 4 threads. Grains of 2, 3 and 4 all yield good task sizes. + // But 2/4 yield 6/3 tasks, which gives us parallelism of 0.75 (at most 3/4 + // of cores will be busy). While grain size 3 gives us 4 tasks, which gives + // us parallelism of 1 (we can load all cores). + Index nm0 = divup(m, bm); + Index nn0 = divup(n, bn); + Index new_tasks = divup(nm0, gm) * divup(nn0, gn); + double new_parallelism = static_cast(new_tasks) / + (divup(new_tasks, num_threads) * num_threads); + Index old_tasks = divup(nm0, oldgm) * divup(nn0, oldgn); + double old_parallelism = static_cast(old_tasks) / + (divup(old_tasks, num_threads) * num_threads); + if (new_parallelism > old_parallelism || new_parallelism == 1) return 1; + return 0; + } + + TensorOpCost contractionCost(Index m, Index n, Index bm, Index bn, Index bk, + bool shard_by_col, bool prepacked) const { + const int packed_size = std::min(PacketType::size, + PacketType::size); + const int output_packet_size = internal::unpacket_traits::size; + const double kd = static_cast(bk); + double compute_bandwidth = computeBandwidth(false, bm, bn, bk); + // Computations. + TensorOpCost cost = TensorOpCost(0, 0, kd * compute_bandwidth, true, packed_size); + // Output stores. + cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size); + if (prepacked) { + // Packing and kernels are executed in different tasks. When we calculate + // task grain size we look only at kernel cost assuming that kernel + // is more expensive than packing. + return cost; + } + // Lhs/rhs loads + computations. 
+ TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * (kd / n); + TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * (kd / m); + // Lhs packing memory cost does not contribute considerably to overall + // execution time because lhs is prefetched early and accessed sequentially. + if (shard_by_col) + lhsCost.dropMemoryCost(); + else + rhsCost.dropMemoryCost(); + return cost + lhsCost + rhsCost; + } + + // Decide whether we want to shard m x k x n contraction over the inner + // (contraction) dimension (k). + static bool shardByInnerDim(Index m, Index n, Index k, int num_threads, + int num_threads_by_k) { + std::ptrdiff_t bufsize = m * n * sizeof(Scalar); + bool shard_by_k = false; + if (n == 1 || // If mat*vec or... + num_threads_by_k < 2 || // running single threaded or... + num_threads_by_k < + num_threads || // sharding by k gives less parallelism or... + bufsize > l3CacheSize() / num_threads_by_k || // need more buffer space + // than L3 cache or... + k / num_threads_by_k < 2 * Traits::nr) { // k per thread is tiny. + shard_by_k = false; + } else if (numext::maxi(m, n) / num_threads < + Traits::nr || // both other dimensions are tiny or... + // k per thread is not small and... + (k / num_threads_by_k > 8 * Traits::nr && + // one of the outer dimensions is tiny or sharding by k offers + // more parallelism. + (numext::mini(m, n) < 2 * Traits::nr || + num_threads_by_k > num_threads))) { + shard_by_k = true; + } + return shard_by_k; + } + + TensorOpCost contractionCostPerInnerDim(Index m, Index n, Index k) const { + // Compute cost. + const int output_packet_size = internal::unpacket_traits::size; + TensorOpCost cost(0, 0, (computeBandwidth(true, m, n, k) * m) * n, true, output_packet_size); + // Output stores. 
+ cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size); + TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * m; + TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * n; + // Since the inner gemm kernel is always sharded by column, the lhs + // load cost is negligible. + lhsCost.dropMemoryCost(); + return cost + lhsCost + rhsCost; + } + + int numThreadsInnerDim(Index m, Index n, Index k) const { + const int output_packet_size = internal::unpacket_traits::size; + TensorOpCost cost = contractionCostPerInnerDim(m, n, k); + double total_parallel_cost = + TensorCostModel::totalCost(k, cost); + // Cost of reduction step accumulating the m*n per-thread buffers into the + // result. + double reduction_cost = TensorCostModel::totalCost( + m * n, TensorOpCost(2, 1, 1, true, output_packet_size)); + int num_threads = 1; + double min_cost = total_parallel_cost; + double kPerThreadOverHead = 3000; + double kFixedOverHead = 100000; + for (int nt = 2; nt <= this->m_device.numThreads(); nt += 2) { + double sequential_cost = + kFixedOverHead + nt * (reduction_cost + kPerThreadOverHead); + double parallel_cost = total_parallel_cost / nt + sequential_cost; + if (parallel_cost < min_cost) { + num_threads = nt; + min_cost = parallel_cost; + } + } + return num_threads; + } + + double computeBandwidth(bool shard_by_col, Index bm, Index bn, + Index bk) const { + // Peak VFMA bandwidth is 0.5. However if we have not enough data for + // vectorization bandwidth drops. The 4.0 and 2.0 bandwidth is determined + // experimentally. + double computeBandwidth = + bk == 1 ? 4.0 + : (shard_by_col ? bn : bm) < Traits::nr || + (shard_by_col ? bm : bn) < Traits::mr + ? 2.0 + : 0.5; +#ifndef EIGEN_VECTORIZE_FMA + // Bandwidth of all of VFMA/MULPS/ADDPS is 0.5 on latest Intel processors. + // However for MULPS/ADDPS we have dependent sequence of 2 such + // instructions, + // so overall bandwidth is 1.0. 
+ if (computeBandwidth == 0.5) computeBandwidth = 1.0; +#endif + return computeBandwidth; + } + +}; + +} // end namespace Eigen + +#endif // EIGEN_USE_THREADS +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorConversion.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorConversion.h new file mode 100644 index 0000000..09d2da9 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorConversion.h @@ -0,0 +1,456 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H + +namespace Eigen { + +/** \class TensorConversionOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor conversion class. This class makes it possible to vectorize + * type casting operations when the number of scalars per packet in the source + * and the destination type differ + */ +namespace internal { +template +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. 
+ typedef TargetType Scalar; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = traits::NumDimensions; + static const int Layout = traits::Layout; + enum { Flags = 0 }; + typedef typename TypeConversion::PointerType>::type PointerType; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorConversionOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorConversionOp type; +}; + +} // end namespace internal + + +template +struct PacketConverter; + +template +struct PacketConverter { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketConverter(const TensorEvaluator& impl) + : m_impl(impl) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { + return internal::pcast(m_impl.template packet(index)); + } + + private: + const TensorEvaluator& m_impl; +}; + + +template +struct PacketConverter { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketConverter(const TensorEvaluator& impl) + : m_impl(impl) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { + const int SrcPacketSize = internal::unpacket_traits::size; + + SrcPacket src1 = m_impl.template packet(index); + SrcPacket src2 = m_impl.template packet(index + SrcPacketSize); + TgtPacket result = internal::pcast(src1, src2); + return result; + } + + private: + const TensorEvaluator& m_impl; +}; + +template +struct PacketConverter { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketConverter(const TensorEvaluator& impl) + : m_impl(impl) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { + const int SrcPacketSize = internal::unpacket_traits::size; + + SrcPacket src1 = m_impl.template packet(index); + SrcPacket src2 = m_impl.template packet(index + SrcPacketSize); + SrcPacket src3 = m_impl.template 
packet(index + 2 * SrcPacketSize); + SrcPacket src4 = m_impl.template packet(index + 3 * SrcPacketSize); + TgtPacket result = internal::pcast(src1, src2, src3, src4); + return result; + } + + private: + const TensorEvaluator& m_impl; +}; + +template +struct PacketConverter { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketConverter(const TensorEvaluator& impl) + : m_impl(impl) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { + const int SrcPacketSize = internal::unpacket_traits::size; + + SrcPacket src1 = m_impl.template packet(index); + SrcPacket src2 = m_impl.template packet(index + 1 * SrcPacketSize); + SrcPacket src3 = m_impl.template packet(index + 2 * SrcPacketSize); + SrcPacket src4 = m_impl.template packet(index + 3 * SrcPacketSize); + SrcPacket src5 = m_impl.template packet(index + 4 * SrcPacketSize); + SrcPacket src6 = m_impl.template packet(index + 5 * SrcPacketSize); + SrcPacket src7 = m_impl.template packet(index + 6 * SrcPacketSize); + SrcPacket src8 = m_impl.template packet(index + 7 * SrcPacketSize); + TgtPacket result = internal::pcast(src1, src2, src3, src4, src5, src6, src7, src8); + return result; + } + + private: + const TensorEvaluator& m_impl; +}; + +template +struct PacketConverter { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketConverter(const TensorEvaluator& impl) + : m_impl(impl), m_maxIndex(impl.dimensions().TotalSize()) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const { + const int SrcPacketSize = internal::unpacket_traits::size; + // Only call m_impl.packet() when we have direct access to the underlying data. This + // ensures that we don't compute the subexpression twice. We may however load some + // coefficients twice, but in practice this doesn't negatively impact performance. 
+ if (m_impl.data() && (index + SrcPacketSize < m_maxIndex)) { + // Force unaligned memory loads since we can't ensure alignment anymore + return internal::pcast(m_impl.template packet(index)); + } else { + const int TgtPacketSize = internal::unpacket_traits::size; + typedef typename internal::unpacket_traits::type SrcType; + typedef typename internal::unpacket_traits::type TgtType; + internal::scalar_cast_op converter; + EIGEN_ALIGN_MAX typename internal::unpacket_traits::type values[TgtPacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < TgtPacketSize; ++i) { + values[i] = converter(m_impl.coeff(index+i)); + } + TgtPacket rslt = internal::pload(values); + return rslt; + } + } + + private: + const TensorEvaluator& m_impl; + const typename TensorEvaluator::Index m_maxIndex; +}; + +template +class TensorConversionOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename internal::traits::Scalar Scalar; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + typedef typename internal::nested::type Nested; + typedef Scalar CoeffReturnType; + typedef typename NumTraits::Real RealScalar; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConversionOp(const XprType& xpr) + : m_xpr(xpr) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; +}; + +template struct ConversionSubExprEval { + static EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType) { + impl.evalSubExprsIfNeeded(NULL); + return true; + } +}; + +template struct ConversionSubExprEval { + static EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType data) { + return impl.evalSubExprsIfNeeded(data); + } +}; + +#ifdef EIGEN_USE_THREADS +template +struct ConversionSubExprEvalAsync { + static EIGEN_STRONG_INLINE void run(Eval& impl, EvalPointerType, EvalSubExprsCallback done) { + impl.evalSubExprsIfNeededAsync(nullptr, 
std::move(done)); + } +}; + +template +struct ConversionSubExprEvalAsync { + static EIGEN_STRONG_INLINE void run(Eval& impl, EvalPointerType data, EvalSubExprsCallback done) { + impl.evalSubExprsIfNeededAsync(data, std::move(done)); + } +}; +#endif + +namespace internal { + +template +struct CoeffConv { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetType run(const TensorEvaluator& impl, Index index) { + internal::scalar_cast_op converter; + return converter(impl.coeff(index)); + } +}; + +template +struct CoeffConv { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetType run(const TensorEvaluator& impl, Index index) { + return impl.coeff(index); + } +}; + +template +struct PacketConv { + typedef typename internal::unpacket_traits::type SrcType; + typedef typename internal::unpacket_traits::type TargetType; + + static const int PacketSize = internal::unpacket_traits::size; + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator& impl, Index index) { + internal::scalar_cast_op converter; + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = converter(impl.coeff(index+i)); + } + TargetPacket rslt = internal::pload(values); + return rslt; + } +}; + +template +struct PacketConv { + typedef typename internal::unpacket_traits::type SrcType; + typedef typename internal::unpacket_traits::type TargetType; + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator& impl, Index index) { + const int SrcCoeffRatio = internal::type_casting_traits::SrcCoeffRatio; + const int TgtCoeffRatio = internal::type_casting_traits::TgtCoeffRatio; + PacketConverter, SrcPacket, TargetPacket, + SrcCoeffRatio, TgtCoeffRatio> converter(impl); + return converter.template packet(index); + } +}; + +template +struct PacketConv { + typedef typename internal::unpacket_traits::type TargetType; + 
static const int PacketSize = internal::unpacket_traits::size; + + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator& impl, Index index) { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) values[i] = impl.coeff(index+i); + return internal::pload(values); + } +}; + +template +struct PacketConv { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator& impl, Index index) { + return impl.template packet(index); + } +}; + +} // namespace internal + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorConversionOp XprType; + typedef typename XprType::Index Index; + typedef typename TensorEvaluator::Dimensions Dimensions; + typedef TargetType Scalar; + typedef TargetType CoeffReturnType; + typedef typename internal::remove_all::Scalar>::type SrcType; + typedef typename PacketType::type PacketReturnType; + typedef typename PacketType::type PacketSourceType; + static const int PacketSize = PacketType::size; + static const bool IsSameType = internal::is_same::value; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + enum { + IsAligned = false, + PacketAccess = + #ifndef EIGEN_USE_SYCL + true, + #else + TensorEvaluator::PacketAccess & + internal::type_casting_traits::VectorizedCast, + #endif + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator::Layout, + RawAccess = false + }; + + static const int NumDims = internal::array_size::value; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock + ArgTensorBlock; + + struct TensorConversionOpBlockFactory { + template + struct XprType { 
+ typedef TensorConversionOp type; + }; + + template + typename XprType::type expr(const ArgXprType& expr) const { + return typename XprType::type(expr); + } + }; + + typedef internal::TensorUnaryExprBlock + TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) + { + return ConversionSubExprEval, EvaluatorPointerType>::run(m_impl, data); + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType data, EvalSubExprsCallback done) { + ConversionSubExprEvalAsync, + EvaluatorPointerType, + EvalSubExprsCallback>::run(m_impl, data, std::move(done)); + } +#endif + + EIGEN_STRONG_INLINE void cleanup() + { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return internal::CoeffConv::run(m_impl,index); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType + packet(Index index) const { + // If we are not going to do the cast, we just need to check that base + // TensorEvaluator has packet access. Otherwise we also need to make sure, + // that we have an implementation of vectorized cast. + const bool Vectorizable = + IsSameType + ? 
TensorEvaluator::PacketAccess + : int(TensorEvaluator::PacketAccess) & + int(internal::type_casting_traits::VectorizedCast); + + return internal::PacketConv::run(m_impl, index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + const double cast_cost = TensorOpCost::CastCost(); + if (vectorized) { + const double SrcCoeffRatio = + internal::type_casting_traits::SrcCoeffRatio; + const double TgtCoeffRatio = + internal::type_casting_traits::TgtCoeffRatio; + return m_impl.costPerCoeff(vectorized) * (SrcCoeffRatio / PacketSize) + + TensorOpCost(0, 0, TgtCoeffRatio * (cast_cost / PacketSize)); + } else { + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, cast_cost); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + return m_impl.getResourceRequirements(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + return TensorBlock(m_impl.block(desc, scratch), + TensorConversionOpBlockFactory()); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + /// required by sycl in order to extract the sycl accessor + const TensorEvaluator& impl() const { return m_impl; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif + + protected: + TensorEvaluator m_impl; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorConvolution.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorConvolution.h new file mode 100644 index 0000000..b20f80b --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorConvolution.h @@ -0,0 +1,1132 @@ +// This file is part of Eigen, a lightweight 
C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H + +namespace Eigen { + +/** \class TensorConvolution + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor convolution class. + * + * + */ +namespace internal { + +template +class IndexMapper { + public: + IndexMapper(const InputDims& input_dims, const array& kernel_dims, + const array& indices) { + + array dimensions = input_dims; + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = indices[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + dimensions[index] = result_dim; + } + + array inputStrides; + array outputStrides; + if (static_cast(Layout) == static_cast(ColMajor)) { + inputStrides[0] = 1; + outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + inputStrides[i] = inputStrides[i-1] * input_dims[i-1]; + outputStrides[i] = outputStrides[i-1] * dimensions[i-1]; + } + } else { + inputStrides[NumDims - 1] = 1; + outputStrides[NumDims - 1] = 1; + for (int i = static_cast(NumDims) - 2; i >= 0; --i) { + inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1]; + outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1]; + } + } + + array gpuInputDimensions; + array gpuOutputDimensions; + array tmp = dimensions; + array ordering; + const size_t offset = static_cast(Layout) == static_cast(ColMajor) + ? 
0 + : NumDims - NumKernelDims; + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = i + offset; + ordering[index] = indices[i]; + tmp[indices[i]] = -1; + gpuInputDimensions[index] = input_dims[indices[i]]; + gpuOutputDimensions[index] = dimensions[indices[i]]; + } + + int written = static_cast(Layout) == static_cast(ColMajor) + ? NumKernelDims + : 0; + for (int i = 0; i < NumDims; ++i) { + if (tmp[i] >= 0) { + ordering[written] = i; + gpuInputDimensions[written] = input_dims[i]; + gpuOutputDimensions[written] = dimensions[i]; + ++written; + } + } + + for (int i = 0; i < NumDims; ++i) { + m_inputStrides[i] = inputStrides[ordering[i]]; + m_outputStrides[i] = outputStrides[ordering[i]]; + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < NumDims; ++i) { + if (i > NumKernelDims) { + m_gpuInputStrides[i] = + m_gpuInputStrides[i - 1] * gpuInputDimensions[i - 1]; + m_gpuOutputStrides[i] = + m_gpuOutputStrides[i - 1] * gpuOutputDimensions[i - 1]; + } else { + m_gpuInputStrides[i] = 1; + m_gpuOutputStrides[i] = 1; + } + } + } else { + for (int i = NumDims - 1; i >= 0; --i) { + if (static_cast(i + 1) < offset) { + m_gpuInputStrides[i] = + m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1]; + m_gpuOutputStrides[i] = + m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1]; + } else { + m_gpuInputStrides[i] = 1; + m_gpuOutputStrides[i] = 1; + } + } + } + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const { + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int d = NumDims - 1; d > NumKernelDims; --d) { + const Index idx = p / m_gpuInputStrides[d]; + inputIndex += idx * m_inputStrides[d]; + p -= idx * m_gpuInputStrides[d]; + } + inputIndex += p * m_inputStrides[NumKernelDims]; + } else { + std::ptrdiff_t limit = 0; + if (NumKernelDims < NumDims) { + limit = NumDims - NumKernelDims - 1; + } + for (int d = 0; d < limit; ++d) { + const Index idx = p / 
m_gpuInputStrides[d]; + inputIndex += idx * m_inputStrides[d]; + p -= idx * m_gpuInputStrides[d]; + } + inputIndex += p * m_inputStrides[limit]; + } + return inputIndex; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const { + Index outputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int d = NumDims - 1; d > NumKernelDims; --d) { + const Index idx = p / m_gpuOutputStrides[d]; + outputIndex += idx * m_outputStrides[d]; + p -= idx * m_gpuOutputStrides[d]; + } + outputIndex += p * m_outputStrides[NumKernelDims]; + } else { + std::ptrdiff_t limit = 0; + if (NumKernelDims < NumDims) { + limit = NumDims - NumKernelDims - 1; + } + for (int d = 0; d < limit; ++d) { + const Index idx = p / m_gpuOutputStrides[d]; + outputIndex += idx * m_outputStrides[d]; + p -= idx * m_gpuOutputStrides[d]; + } + outputIndex += p * m_outputStrides[limit]; + } + return outputIndex; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_inputStrides[offset]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_outputStrides[offset]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) + ? 
0 + : NumDims - NumKernelDims; + return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] + + k * m_inputStrides[offset + 2]; + } + + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const { + const size_t offset = static_cast(Layout) == static_cast(ColMajor) + ? 0 + : NumDims - NumKernelDims; + return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] + + k * m_outputStrides[offset + 2]; + } + + private: + static const int NumDims = internal::array_size::value; + array m_inputStrides; + array m_outputStrides; + array m_gpuInputStrides; + array m_gpuOutputStrides; +}; + + + +template +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. 
+ typedef typename promote_storage_type::ret Scalar; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef typename promote_index_type::Index, + typename traits::Index>::type Index; + typedef typename InputXprType::Nested LhsNested; + typedef typename KernelXprType::Nested RhsNested; + typedef typename remove_reference::type _LhsNested; + typedef typename remove_reference::type _RhsNested; + static const int NumDimensions = traits::NumDimensions; + static const int Layout = traits::Layout; + typedef typename conditional::val, + typename traits::PointerType, typename traits::PointerType>::type PointerType; + + enum { + Flags = 0 + }; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorConvolutionOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorConvolutionOp type; +}; + +} // end namespace internal + + + +template +class TensorConvolutionOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::promote_storage_type::ret CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims) + : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Indices& indices() const { return m_indices; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const typename internal::remove_all::type& + inputExpression() const { return m_input_xpr; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const typename internal::remove_all::type& + kernelExpression() const { return 
m_kernel_xpr; } + + protected: + typename InputXprType::Nested m_input_xpr; + typename KernelXprType::Nested m_kernel_xpr; + const Indices m_indices; +}; + + +template +struct TensorEvaluator, Device> +{ + typedef TensorConvolutionOp XprType; + + static const int NumDims = internal::array_size::Dimensions>::value; + static const int NumKernelDims = internal::array_size::value; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + enum { + IsAligned = int(TensorEvaluator::IsAligned) & int(TensorEvaluator::IsAligned), + PacketAccess = int(TensorEvaluator::PacketAccess) & int(TensorEvaluator::PacketAccess), + BlockAccess = false, + PreferBlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device) + { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputStride[0] = 1; + for (int i = 1; i < NumDims; 
++i) { + m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1]; + } + } else { + m_inputStride[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1]; + } + } + + m_dimensions = m_inputImpl.dimensions(); + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + if (i > 0) { + m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1]; + } else { + m_kernelStride[0] = 1; + } + m_indexStride[i] = m_inputStride[index]; + } + + m_outputStride[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1]; + } + } else { + for (int i = NumKernelDims - 1; i >= 0; --i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + if (i < NumKernelDims - 1) { + m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1]; + } else { + m_kernelStride[NumKernelDims - 1] = 1; + } + m_indexStride[i] = m_inputStride[index]; + } + + m_outputStride[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + m_inputImpl.evalSubExprsIfNeeded(NULL); + preloadKernel(); + return true; + } + EIGEN_STRONG_INLINE void cleanup() { + m_inputImpl.cleanup(); + if (m_local_kernel) { + m_device.deallocate((void*)m_kernel); + m_local_kernel = false; + } + m_kernel = NULL; + } + + void evalTo(typename 
XprType::Scalar* buffer) { + evalSubExprsIfNeeded(NULL); + for (int i = 0; i < dimensions().TotalSize(); ++i) { + buffer[i] += coeff(i); + } + cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + CoeffReturnType result = CoeffReturnType(0); + convolve(firstInput(index), 0, NumKernelDims-1, result); + return result; + } + + template + EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const + { + Index indices[2] = {index, index+PacketSize-1}; + Index startInputs[2] = {0, 0}; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_outputStride[i]; + const Index idx1 = indices[1] / m_outputStride[i]; + startInputs[0] += idx0 * m_inputStride[i]; + startInputs[1] += idx1 * m_inputStride[i]; + indices[0] -= idx0 * m_outputStride[i]; + indices[1] -= idx1 * m_outputStride[i]; + } + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / m_outputStride[i]; + const Index idx1 = indices[1] / m_outputStride[i]; + startInputs[0] += idx0 * m_inputStride[i]; + startInputs[1] += idx1 * m_inputStride[i]; + indices[0] -= idx0 * m_outputStride[i]; + indices[1] -= idx1 * m_outputStride[i]; + } + } + startInputs[0] += indices[0]; + startInputs[1] += indices[1]; + + if (startInputs[1]-startInputs[0] == PacketSize-1) { + PacketReturnType result = internal::pset1(0); + convolvePacket(startInputs[0], 0, NumKernelDims-1, result); + return result; + } else { + EIGEN_ALIGN_MAX Scalar data[PacketSize]; + data[0] = Scalar(0); + convolve(startInputs[0], 0, NumKernelDims-1, data[0]); + for (int i = 1; i < PacketSize-1; ++i) { + data[i] = Scalar(0); + convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]); + } + data[PacketSize-1] = Scalar(0); + convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]); + return internal::pload(data); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool 
vectorized) const { + const double kernel_size = m_kernelImpl.dimensions().TotalSize(); + // We ignore the use of fused multiply-add. + const double convolve_compute_cost = + TensorOpCost::AddCost() + TensorOpCost::MulCost(); + const double firstIndex_compute_cost = + NumDims * + (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + + TensorOpCost::DivCost()); + return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + + kernel_size * (m_inputImpl.costPerCoeff(vectorized) + + m_kernelImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, convolve_compute_cost, vectorized, + PacketSize)); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + private: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { + Index startInput = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStride[i]; + startInput += idx * m_inputStride[i]; + index -= idx * m_outputStride[i]; + } + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStride[i]; + startInput += idx * m_inputStride[i]; + index -= idx * m_outputStride[i]; + } + } + startInput += index; + return startInput; + } + + EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const { + for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { + const Index input = firstIndex + j * m_indexStride[DimIndex]; + const Index kernel = firstKernel + j * m_kernelStride[DimIndex]; + if (DimIndex > 0) { + convolve(input, kernel, DimIndex-1, accum); + } else { + accum += m_inputImpl.coeff(input) * m_kernel[kernel]; + } + } + } + + template + EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const { + for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) { + const Index input = firstIndex + j * m_indexStride[DimIndex]; + const Index 
kernel = firstKernel + j * m_kernelStride[DimIndex]; + if (DimIndex > 0) { + convolvePacket(input, kernel, DimIndex-1, accum); + } else { + accum = internal::pmadd(m_inputImpl.template packet(input), internal::pset1(m_kernel[kernel]), accum); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() { + // Don't make a local copy of the kernel unless we have to (i.e. it's an + // expression that needs to be evaluated) + const Scalar* in_place = m_kernelImpl.data(); + if (in_place) { + m_kernel = in_place; + m_local_kernel = false; + } else { + size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + Scalar* local = (Scalar*)m_device.allocate_temp(kernel_sz); + typedef TensorEvalToOp EvalTo; + EvalTo evalToTmp(local, m_kernelArg); + const bool Vectorize = internal::IsVectorizable::value; + internal::TensorExecutor::run(evalToTmp, m_device); + + m_kernel = local; + m_local_kernel = true; + } + } + + array m_inputStride; + array m_outputStride; + + array m_indexStride; + array m_kernelStride; + TensorEvaluator m_inputImpl; + TensorEvaluator m_kernelImpl; + Dimensions m_dimensions; + + KernelArgType m_kernelArg; + const Scalar* m_kernel; + bool m_local_kernel; + const Device EIGEN_DEVICE_REF m_device; +}; + + + + +// Use an optimized implementation of the evaluation code for GPUs whenever possible. 
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC) + +template +struct GetKernelSize { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const { + return StaticKernelSize; + } +}; +template <> +struct GetKernelSize { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const { + return kernelSize; + } +}; + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D( + InputEvaluator eval, + const internal::IndexMapper + indexMapper, + const float* __restrict kernel, const int numPlanes, const int numX, + const int maxX, const int kernelSize, float* buffer) { +#if defined(EIGEN_HIPCC) + HIP_DYNAMIC_SHARED(float, s) +#else + extern __shared__ float s[]; +#endif + + const int first_x = blockIdx.x * maxX; + const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + GetKernelSize()(kernelSize); + const int num_x_output = last_x - first_x + 1; + + const int first_plane = blockIdx.y * blockDim.y; + const int plane_stride = blockDim.y * gridDim.y; + + for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) { + // Load inputs to shared memory + const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = threadIdx.y * num_x_input; + #pragma unroll + for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { + const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x); + s[i + plane_kernel_offset] = eval.coeff(tensor_index); + } + + __syncthreads(); + + // Compute the convolution + const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p); + + #pragma unroll + for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { + const int kernel_offset = plane_kernel_offset + i; + float result = 0.0f; + #pragma unroll + for (int k = 0; k < GetKernelSize()(kernelSize); ++k) { + result += s[k + 
kernel_offset] * kernel[k]; + } + const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x); + buffer[tensor_index] = result; + } + __syncthreads(); + } +}; + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D( + InputEvaluator eval, + const internal::IndexMapper + indexMapper, + const float* __restrict kernel, const int numPlanes, const int numX, + const int maxX, const int numY, const int maxY, const int kernelSizeX, + const int kernelSizeY, float* buffer) { +#if defined(EIGEN_HIPCC) + HIP_DYNAMIC_SHARED(float, s) +#else + extern __shared__ float s[]; +#endif + + const int first_x = blockIdx.x * maxX; + const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + GetKernelSize()(kernelSizeX); + const int num_x_output = last_x - first_x + 1; + + const int first_y = blockIdx.y * maxY; + const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1; + const int num_y_input = last_y - first_y + GetKernelSize()(kernelSizeY); + const int num_y_output = last_y - first_y + 1; + + const int first_plane = blockIdx.z * blockDim.z; + const int plane_stride = blockDim.z * gridDim.z; + + for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) { + + const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = threadIdx.z * num_y_input; + + // Load inputs to shared memory + #pragma unroll + for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) { + const int input_offset = num_x_input * (j + plane_kernel_offset); + #pragma unroll + for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { + const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y); + s[i + input_offset] = eval.coeff(tensor_index); + } + } + + __syncthreads(); + + // Convolution + const int plane_output_offset = 
indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p); + + #pragma unroll + for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { + #pragma unroll + for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { + float result = 0.0f; + #pragma unroll + for (int l = 0; l < GetKernelSize()(kernelSizeY); ++l) { + const int kernel_offset = kernelSizeX * l; + const int input_offset = i + num_x_input * (j + l + plane_kernel_offset); + #pragma unroll + for (int k = 0; k < GetKernelSize()(kernelSizeX); ++k) { + result += s[k + input_offset] * kernel[k + kernel_offset]; + } + } + const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y); + buffer[tensor_index] = result; + } + } + + __syncthreads(); + } +}; + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D( + InputEvaluator eval, + const internal::IndexMapper + indexMapper, + const float* __restrict kernel, const size_t numPlanes, const size_t numX, + const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ, + const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY, + const size_t kernelSizeZ, float* buffer) { +#if defined(EIGEN_HIPCC) + HIP_DYNAMIC_SHARED(float, s) +#else + extern __shared__ float s[]; +#endif + + // Load inputs to shared memory + const int first_x = blockIdx.x * maxX; + const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; + const int num_x_input = last_x - first_x + kernelSizeX; + + const int first_y = blockIdx.y * maxY; + const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1; + const int num_y_input = last_y - first_y + kernelSizeY; + + const int first_z = blockIdx.z * maxZ; + const int last_z = (first_z + maxZ < numZ ? 
first_z + maxZ : numZ) - 1; + const int num_z_input = last_z - first_z + kernelSizeZ; + + for (int p = 0; p < numPlanes; ++p) { + + const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p); + const int plane_kernel_offset = 0; + + for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) { + for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) { + for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { + const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z); + s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index); + } + } + } + + __syncthreads(); + + // Convolution + const int num_z_output = last_z - first_z + 1; + const int num_y_output = last_y - first_y + 1; + const int num_x_output = last_x - first_x + 1; + const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p); + + for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) { + for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { + for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { + float result = 0.0f; + for (int n = 0; n < kernelSizeZ; ++n) { + for (int m = 0; m < kernelSizeY; ++m) { + for (int l = 0; l < kernelSizeX; ++l) { + result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)]; + } + } + } + const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z); + buffer[tensor_index] = result; + } + } + } + __syncthreads(); + } +}; + + + +template +struct TensorEvaluator, GpuDevice> +{ + typedef TensorConvolutionOp XprType; + + static const int NumDims = internal::array_size::Dimensions>::value; + static const int NumKernelDims = internal::array_size::value; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + typedef typename 
TensorEvaluator::Dimensions KernelDimensions; + + enum { + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = false, + BlockAccess = false, + PreferBlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + TensorEvaluator(const XprType& op, const GpuDevice& device) + : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device) + { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator::Dimensions& input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator::Dimensions& kernel_dims = m_kernelImpl.dimensions(); + + m_dimensions = m_inputImpl.dimensions(); + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + } + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename InputArgType::Scalar Scalar; + static const int PacketSize = internal::unpacket_traits::size; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + preloadKernel(); + m_inputImpl.evalSubExprsIfNeeded(NULL); + if (data) { + executeEval(data); + return false; + } else { + m_buf = 
(Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)); + executeEval(m_buf); + return true; + } + } + + EIGEN_STRONG_INLINE void cleanup() { + m_inputImpl.cleanup(); + if (m_buf) { + m_device.deallocate(m_buf); + m_buf = NULL; + } + if (m_local_kernel) { + m_device.deallocate((void*)m_kernel); + m_local_kernel = false; + } + m_kernel = NULL; + } + + EIGEN_STRONG_INLINE void preloadKernel() { + // Don't make a local copy of the kernel unless we have to (i.e. it's an + // expression that needs to be evaluated) + const Scalar* in_place = m_kernelImpl.data(); + if (in_place) { + m_kernel = in_place; + m_local_kernel = false; + } else { + size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + Scalar* local = (Scalar*)m_device.allocate(kernel_sz); + typedef TensorEvalToOp EvalTo; + EvalTo evalToTmp(local, m_kernelArg); + const bool PacketAccess = internal::IsVectorizable::value; + internal::TensorExecutor::run(evalToTmp, m_device); + + m_kernel = local; + m_local_kernel = true; + } + } + + static unsigned int ceil(unsigned int num, unsigned int denom) { + const unsigned int rounded_toward_zero = num / denom; + if (num > rounded_toward_zero * denom) { + return rounded_toward_zero + 1; + } + return rounded_toward_zero; + } + + void executeEval(Scalar* data) const { + typedef typename TensorEvaluator::Dimensions InputDims; + + const int maxSharedMem = m_device.sharedMemPerBlock(); + const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock(); + const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock; + const int numMultiProcessors = m_device.getNumGpuMultiProcessors(); + const int warpSize = 32; + + switch (NumKernelDims) { + case 1: { + const int kernel_size = m_kernelImpl.dimensions().TotalSize(); + + const int numX = dimensions()[m_indices[0]]; + const int numP = dimensions().TotalSize() / numX; + int maxX; + dim3 block_size; + + const int single_stride_dim = + static_cast(Layout) == 
static_cast(ColMajor) + ? 0 + : m_inputImpl.dimensions().rank() - 1; + if (m_indices[0] == single_stride_dim) { + // Maximum the reuse + const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32; + maxX = numext::mini(inner_dim, numX); + const int maxP = numext::mini(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP); + block_size.x = numext::mini(maxThreadsPerBlock, maxX); + block_size.y = numext::mini(maxThreadsPerBlock / block_size.x, maxP); + } + else { + // Read as much as possible alongside the inner most dimension, that is the plane + const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar)); + const int maxP = numext::mini(inner_dim, numP); + maxX = numext::mini(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX); + + block_size.x = numext::mini(warpSize, maxX); + block_size.y = numext::mini(maxThreadsPerBlock/block_size.x, maxP); + } + + const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar); + gpu_assert(shared_mem <= maxSharedMem); + + const int num_x_blocks = ceil(numX, maxX); + const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem); + const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks); + + dim3 num_blocks(num_x_blocks, numext::mini(num_y_blocks, ceil(numP, block_size.y))); + + + //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; + + const array indices(m_indices[0]); + const array kernel_dims(m_kernelImpl.dimensions()[0]); + internal::IndexMapper indexMapper( + m_inputImpl.dimensions(), kernel_dims, indices); + switch(kernel_size) { + case 4: { + LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D, Index, InputDims, 4>), num_blocks, block_size, 
shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data); + break; + } + case 7: { + LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data); + break; + } + default: { + LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data); + } + } + break; + } + + case 2: { + const int idxX = + static_cast(Layout) == static_cast(ColMajor) ? 0 : 1; + const int idxY = + static_cast(Layout) == static_cast(ColMajor) ? 1 : 0; + const int kernel_size_x = m_kernelImpl.dimensions()[idxX]; + const int kernel_size_y = m_kernelImpl.dimensions()[idxY]; + + const int numX = dimensions()[m_indices[idxX]]; + const int numY = dimensions()[m_indices[idxY]]; + const int numP = dimensions().TotalSize() / (numX*numY); + + const float scaling_factor = sqrtf(static_cast(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x)); + + // Snap maxX to warp size + int inner_dim = ((static_cast(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32; + const int maxX = numext::mini(inner_dim, numX); + const int maxY = numext::mini(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY); + const int maxP = numext::mini(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP); + + dim3 block_size; + block_size.x = numext::mini(1024, maxX); + block_size.y = numext::mini(1024/block_size.x, maxY); + block_size.z = numext::mini(1024/(block_size.x*block_size.y), maxP); + + const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar); + gpu_assert(shared_mem <= maxSharedMem); + + const int num_x_blocks = ceil(numX, maxX); + const int num_y_blocks = ceil(numY, maxY); + const int 
blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem); + const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks); + + dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini(num_z_blocks, ceil(numP, block_size.z))); + + + //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; + + const array indices(m_indices[idxX], m_indices[idxY]); + const array kernel_dims(m_kernelImpl.dimensions()[idxX], + m_kernelImpl.dimensions()[idxY]); + internal::IndexMapper indexMapper( + m_inputImpl.dimensions(), kernel_dims, indices); + switch (kernel_size_x) { + case 4: { + switch (kernel_size_y) { + case 7: { + LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data); + break; + } + default: { + LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data); + break; + } + } + break; + } + case 7: { + switch (kernel_size_y) { + case 4: { + LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data); + break; + } + default: { + LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data); + break; + } + } + break; 
+ } + default: { + LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data); + break; + } + } + break; + } + + case 3: { + const int idxX = + static_cast(Layout) == static_cast(ColMajor) ? 0 : 2; + const int idxY = + static_cast(Layout) == static_cast(ColMajor) ? 1 : 1; + const int idxZ = + static_cast(Layout) == static_cast(ColMajor) ? 2 : 0; + + const int kernel_size_x = m_kernelImpl.dimensions()[idxX]; + const int kernel_size_y = m_kernelImpl.dimensions()[idxY]; + const int kernel_size_z = m_kernelImpl.dimensions()[idxZ]; + + const int numX = dimensions()[m_indices[idxX]]; + const int numY = dimensions()[m_indices[idxY]]; + const int numZ = dimensions()[m_indices[idxZ]]; + const int numP = dimensions().TotalSize() / (numX*numY*numZ); + + const int maxX = numext::mini(128, numext::mini(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX)); + const int maxY = numext::mini(128, numext::mini(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY)); + const int maxZ = numext::mini(128, numext::mini(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ)); + + dim3 block_size; + block_size.x = numext::mini(32, maxX); + block_size.y = numext::mini(32, maxY); + block_size.z = numext::mini(1024/(block_size.x*block_size.y), maxZ); + dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ)); + + const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar); + gpu_assert(shared_mem <= maxSharedMem); + + //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " 
num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl; + const array indices(m_indices[idxX], m_indices[idxY], + m_indices[idxZ]); + const array kernel_dims(m_kernelImpl.dimensions()[idxX], + m_kernelImpl.dimensions()[idxY], + m_kernelImpl.dimensions()[idxZ]); + internal::IndexMapper indexMapper( + m_inputImpl.dimensions(), kernel_dims, indices); + + LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data); + break; + } + + default: { + EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + eigen_assert(m_buf); + eigen_assert(index < m_dimensions.TotalSize()); + return m_buf[index]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const + { + eigen_assert(m_buf); + eigen_assert(index < m_dimensions.TotalSize()); + return internal::ploadt(m_buf+index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost + // model. + const double kernel_size = m_kernelImpl.dimensions().TotalSize(); + // We ignore the use of fused multiply-add. 
+ const double convolve_compute_cost = + TensorOpCost::AddCost() + TensorOpCost::MulCost(); + const double firstIndex_compute_cost = + NumDims * + (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + + TensorOpCost::DivCost()); + return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + + kernel_size * (m_inputImpl.costPerCoeff(vectorized) + + m_kernelImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, convolve_compute_cost, vectorized, + PacketSize)); + } + + private: + // No assignment (copies are needed by the kernels) + TensorEvaluator& operator = (const TensorEvaluator&); + + TensorEvaluator m_inputImpl; + TensorEvaluator m_kernelImpl; + KernelArgType m_kernelArg; + Indices m_indices; + Dimensions m_dimensions; + Scalar* m_buf; + const Scalar* m_kernel; + bool m_local_kernel; + + const GpuDevice& m_device; +}; +#endif + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorConvolutionSycl.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorConvolutionSycl.h new file mode 100644 index 0000000..033318f --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorConvolutionSycl.h @@ -0,0 +1,544 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// Copyright (C) 2016 Benoit Steiner + +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H + +namespace Eigen { + +/** \class TensorConvolution + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor convolution class. 
+ * + * + */ + +enum class convolution_type { CONV1D, CONV2D, CONV3D }; +template +struct EigenConvolutionKernel; +template +struct EigenConvolutionKernel { + typedef cl::sycl::accessor + Local_accessor; + Local_accessor local_acc; + Evaluator device_evaluator; + Kernel_accessor kernel_filter; + Buffer_accessor buffer_acc; + internal::IndexMapper indexMapper; + const size_t kernelSize; + const cl::sycl::range<2> input_range; + EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_, + Buffer_accessor buffer_acc_, + internal::IndexMapper indexMapper_, + const size_t kernelSize_, const cl::sycl::range<2> input_range_) + : local_acc(local_acc_), + device_evaluator(device_evaluator_), + kernel_filter(kernel_filter_), + buffer_acc(buffer_acc_), + indexMapper(indexMapper_), + kernelSize(kernelSize_), + input_range(input_range_) {} + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) { + return (boolean_check[0] && boolean_check[1]); + } + void operator()(cl::sycl::nd_item<2> itemID) { + auto buffer_ptr = buffer_acc.get_pointer(); + auto kernel_ptr = kernel_filter.get_pointer(); + // the required row to be calculated for the for each plane in shered memory + const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1); + const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input; + const size_t input_offset = itemID.get_group(0) * itemID.get_local_range()[0]; + const size_t plane_tensor_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(1)); + /// fill the shared memory + for (size_t i = itemID.get_local_id(0); i < num_input; i += itemID.get_local_range()[0]) { + const size_t local_index = i + plane_kernel_offset; + const size_t tensor_index = + plane_tensor_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + input_offset); + + local_acc[local_index] = + (((i + input_offset) < (input_range[0] + kernelSize - 
1)) && itemID.get_global_id(1) < input_range[1]) + ? device_evaluator.coeff(tensor_index) + : CoeffReturnType(0); + } + + itemID.barrier(cl::sycl::access::fence_space::local_space); + + // calculate the convolution // output start x + const size_t first_output_start = itemID.get_group(0) * (itemID.get_local_range()[0]); + if (boundary_check(itemID.get_global_id() < input_range)) { + CoeffReturnType result = static_cast(0); + const size_t index = plane_kernel_offset + itemID.get_local_id(0); + for (size_t k = 0; k < kernelSize; ++k) { + result += (local_acc[k + index] * kernel_ptr[k]); + } + const size_t tensor_index = + indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(1)) + + indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + first_output_start); + buffer_ptr[tensor_index] = result; + } + } +}; + +template +struct EigenConvolutionKernel { + typedef cl::sycl::accessor + Local_accessor; + Local_accessor local_acc; + Evaluator device_evaluator; + Kernel_accessor kernel_filter; + Buffer_accessor buffer_acc; + internal::IndexMapper indexMapper; + const cl::sycl::range<2> kernel_size; + const cl::sycl::range<3> input_range; + EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_, + Buffer_accessor buffer_acc_, + internal::IndexMapper indexMapper_, + const cl::sycl::range<2> kernel_size_, const cl::sycl::range<3> input_range_) + : local_acc(local_acc_), + device_evaluator(device_evaluator_), + kernel_filter(kernel_filter_), + buffer_acc(buffer_acc_), + indexMapper(indexMapper_), + kernel_size(kernel_size_), + input_range(input_range_) {} + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) { + return (boolean_check[0] && boolean_check[1] && boolean_check[2]); + } + + void operator()(cl::sycl::nd_item<3> itemID) { + auto buffer_ptr = buffer_acc.get_pointer(); + auto kernel_ptr = kernel_filter.get_pointer(); + // the 
required row to be calculated for the for each plane in shered memory + const auto num_input = cl::sycl::range<2>{ + (cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)}; + + const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(2)); + const size_t plane_kernel_offset = itemID.get_local_id(2) * num_input[1]; + + const auto input_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0], + itemID.get_group(1) * itemID.get_local_range()[1]}; + + // fill the local memory + bool in_range_dim2 = itemID.get_global_id(2) < input_range[2]; + for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) { + const size_t local_input_offset = num_input[0] * (j + plane_kernel_offset); + bool in_range_dim1 = ((j + input_offset[1]) < (input_range[1] + kernel_size[1] - 1)); + for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) { + const size_t local_index = i + local_input_offset; + const size_t tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset( + i + input_offset[0], j + input_offset[1]); + local_acc[local_index] = (((i + input_offset[0]) < (input_range[0] + kernel_size[0] - 1)) && + in_range_dim1 && in_range_dim2) + ? 
device_evaluator.coeff(tensor_index) + : CoeffReturnType(0); + } + } + + itemID.barrier(cl::sycl::access::fence_space::local_space); + + // output offset start for each thread + const auto output_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0], + itemID.get_group(1) * itemID.get_local_range()[1]}; + + if (boundary_check(itemID.get_global_id() < input_range)) { + CoeffReturnType result = static_cast(0); + + for (size_t j = 0; j < kernel_size[1]; j++) { + size_t kernel_offset = kernel_size[0] * j; + const size_t index = + (num_input[0] * (plane_kernel_offset + j + itemID.get_local_id(1))) + itemID.get_local_id(0); + for (size_t i = 0; i < kernel_size[0]; i++) { + result += (local_acc[i + index] * kernel_ptr[i + kernel_offset]); + } + } + const size_t tensor_index = + indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(2)) + + indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + output_offset[0], + itemID.get_local_id(1) + output_offset[1]); + + buffer_ptr[tensor_index] = result; + } + } +}; + +template +struct EigenConvolutionKernel { + typedef cl::sycl::accessor + Local_accessor; + Local_accessor local_acc; + Evaluator device_evaluator; + Kernel_accessor kernel_filter; + Buffer_accessor buffer_acc; + internal::IndexMapper indexMapper; + const cl::sycl::range<3> kernel_size; + const cl::sycl::range<3> input_range; + const size_t numP; + + EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_, + Buffer_accessor buffer_acc_, + internal::IndexMapper indexMapper_, + const cl::sycl::range<3> kernel_size_, const cl::sycl::range<3> input_range_, + const size_t numP_) + : local_acc(local_acc_), + device_evaluator(device_evaluator_), + kernel_filter(kernel_filter_), + buffer_acc(buffer_acc_), + indexMapper(indexMapper_), + kernel_size(kernel_size_), + input_range(input_range_), + numP(numP_) {} + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool 
boundary_check(const BooleanDim3 boolean_check) { + return (boolean_check[0] && boolean_check[1] && boolean_check[2]); + } + void operator()(cl::sycl::nd_item<3> itemID) { + auto buffer_ptr = buffer_acc.get_pointer(); + auto kernel_ptr = kernel_filter.get_pointer(); + const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1}; + + const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()}; + + const auto output_offset = + cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range() + itemID.get_local_id()}; + + for (size_t p = 0; p < numP; p++) { + /// fill the shared memory + const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p); + for (size_t k = itemID.get_local_id(2); k < num_input[2]; k += itemID.get_local_range()[2]) { + size_t local_index_dim2 = num_input[0] * num_input[1] * k; + bool cond_k_dim = (k + input_offset[2] < (input_range[2] + kernel_size[2] - 1)); + for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) { + bool cond_j_dim = cond_k_dim && (j + input_offset[1] < (input_range[1] + kernel_size[1] - 1)); + size_t local_index_dim1 = (num_input[0] * j) + local_index_dim2; + for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) { + bool conds = cond_j_dim && (i + input_offset[0] < (input_range[0] + kernel_size[0] - 1)); + const size_t local_index = local_index_dim1 + i; + const size_t tensor_index = + plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset( + i + input_offset[0], j + input_offset[1], k + input_offset[2]); + local_acc[local_index] = conds ? 
device_evaluator.coeff(tensor_index) : CoeffReturnType(0); + } + } + } + itemID.barrier(cl::sycl::access::fence_space::local_space); + + // calculate the convolution + + if (boundary_check(itemID.get_global_id() < input_range)) { + CoeffReturnType result = static_cast(0); + for (size_t k = 0; k < kernel_size[2]; k++) { + for (size_t j = 0; j < kernel_size[1]; j++) { + for (size_t i = 0; i < kernel_size[0]; i++) { + const size_t kernel_index = i + kernel_size[0] * (j + kernel_size[1] * k); + const size_t local_index = + ((i + itemID.get_local_id(0)) + + num_input[0] * ((j + itemID.get_local_id(1)) + num_input[1] * (k + itemID.get_local_id(2)))); + + result += (local_acc[local_index] * kernel_ptr[kernel_index]); + } + } + } + const size_t tensor_index = + indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p) + + indexMapper.mapGpuOutputKernelToTensorOutputOffset(output_offset[0], output_offset[1], output_offset[2]); + buffer_ptr[tensor_index] = result; + } + + itemID.barrier(cl::sycl::access::fence_space::local_space); + } + } +}; + +template +struct TensorEvaluator, Eigen::SyclDevice> { + typedef TensorConvolutionOp XprType; + + static const int NumDims = + internal::array_size::Dimensions>::value; + static const int NumKernelDims = internal::array_size::value; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + typedef typename TensorEvaluator::Dimensions KernelDimensions; + typedef const Eigen::SyclDevice Device; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename InputArgType::Scalar Scalar; + static const int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + typedef StorageMemory KernelStorage; + + enum { + IsAligned = TensorEvaluator::IsAligned & + TensorEvaluator::IsAligned, + PacketAccess = false, + BlockAccess = false, + PreferBlockAccess = false, + Layout = TensorEvaluator::Layout, + 
CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + TensorEvaluator(const XprType &op, const Eigen::SyclDevice &device) + : m_inputImpl(op.inputExpression(), device), + m_kernelArg(op.kernelExpression()), + m_kernelImpl(op.kernelExpression(), device), + m_indices(op.indices()), + m_buf(NULL), + m_kernel(NULL), + m_local_kernel(false), + m_device(device) { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == + static_cast(TensorEvaluator::Layout)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + + const typename TensorEvaluator::Dimensions &input_dims = m_inputImpl.dimensions(); + const typename TensorEvaluator::Dimensions &kernel_dims = + m_kernelImpl.dimensions(); + + m_dimensions = m_inputImpl.dimensions(); + for (int i = 0; i < NumKernelDims; ++i) { + const Index index = op.indices()[i]; + const Index input_dim = input_dims[index]; + const Index kernel_dim = kernel_dims[i]; + const Index result_dim = input_dim - kernel_dim + 1; + m_dimensions[index] = result_dim; + } + } + + EIGEN_DEVICE_FUNC const Dimensions &dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + preloadKernel(); + m_inputImpl.evalSubExprsIfNeeded(NULL); + if (data) { + executeEval(data); + return false; + } else { + m_buf = (EvaluatorPointerType)m_device.get( + (Scalar *)m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar))); + executeEval(m_buf); + return true; + } + } + + EIGEN_STRONG_INLINE void cleanup() { + m_inputImpl.cleanup(); + if (m_buf) { + m_device.deallocate_temp(m_buf); + m_buf = NULL; + } + if (m_local_kernel) { + m_device.deallocate_temp(m_kernel); + m_local_kernel = false; + } + m_kernel = NULL; + } + /// used by sycl in order to build the sycl buffer + 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device &device() const { return m_device; } + /// used by sycl in order to build the sycl buffer + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_buf; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() { + // Don't make a local copy of the kernel unless we have to (i.e. it's an + // expression that needs to be evaluated) + typename KernelStorage::Type in_place = m_kernelImpl.data(); + if (in_place) { + m_kernel = in_place; + m_local_kernel = false; + } else { + ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); + EvaluatorPointerType local = (EvaluatorPointerType)m_device.get((Scalar *)m_device.allocate_temp(kernel_sz)); + typedef TensorEvalToOp EvalTo; + EvalTo evalToTmp(m_device.get(local), m_kernelArg); + const bool PacketAccess = internal::IsVectorizable::value; + internal::TensorExecutor::run(evalToTmp, m_device); + m_kernel = local; + m_local_kernel = true; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(EvaluatorPointerType data) const { + typedef TensorEvaluator InputEvaluator; + typedef typename InputEvaluator::Dimensions InputDims; + switch (NumKernelDims) { + case 1: { + const size_t numX = dimensions()[m_indices[0]]; + const size_t numP = dimensions().TotalSize() / numX; + const auto input_dim = std::array{numX, numP}; + auto global_range = cl::sycl::range<2>{}; + auto local_range = cl::sycl::range<2>{}; + const size_t kernel_size = m_kernelImpl.dimensions().TotalSize(); + + m_device.parallel_for_setup(input_dim, global_range, local_range); + const size_t local_memory_size = (local_range[0] + kernel_size - 1) * (local_range[1]); + gpu_assert(static_cast(local_memory_size) <= m_device.sharedMemPerBlock()); + const array indices{{m_indices[0]}}; + const array kernel_dims{{m_kernelImpl.dimensions()[0]}}; + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + + typedef 
EigenConvolutionKernel + ConvKernel; + + m_device.template binary_kernel_launcher( + m_inputImpl, m_kernel, data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size, + indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1])); + break; + } + + case 2: { + auto kernel_index = std::array{static_cast(Layout) == static_cast(ColMajor) ? 0 : 1, + static_cast(Layout) == static_cast(ColMajor) ? 1 : 0}; + auto kernel_size = cl::sycl::range<2>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]], + (size_t)m_kernelImpl.dimensions()[kernel_index[1]]}; + const size_t numX = dimensions()[m_indices[kernel_index[0]]]; + const size_t numY = dimensions()[m_indices[kernel_index[1]]]; + const size_t numP = dimensions().TotalSize() / (numX * numY); + auto input_dim = std::array{numX, numY, numP}; + + auto global_range = cl::sycl::range<3>{}; + auto local_range = cl::sycl::range<3>{}; + + m_device.parallel_for_setup(input_dim, global_range, local_range); + + const size_t local_memory_size = + (local_range[0] + kernel_size[0] - 1) * (local_range[1] + kernel_size[1] - 1) * local_range[2]; + gpu_assert(static_cast(local_memory_size) <= m_device.sharedMemPerBlock()); + const array indices{{m_indices[kernel_index[0]], m_indices[kernel_index[1]]}}; + const array kernel_dims{ + {m_kernelImpl.dimensions()[kernel_index[0]], m_kernelImpl.dimensions()[kernel_index[1]]}}; + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + typedef EigenConvolutionKernel + ConvKernel; + m_device.template binary_kernel_launcher( + m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size, + indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]}); + break; + } + + case 3: { + auto kernel_index = std::array{static_cast(Layout) == static_cast(ColMajor) ? 0 : 2, + static_cast(Layout) == static_cast(ColMajor) ? 1 : 1, + static_cast(Layout) == static_cast(ColMajor) ? 
2 : 0}; + + auto kernel_size = cl::sycl::range<3>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]], + (size_t)m_kernelImpl.dimensions()[kernel_index[1]], + (size_t)m_kernelImpl.dimensions()[kernel_index[2]]}; + + const size_t numX = dimensions()[m_indices[kernel_index[0]]]; + const size_t numY = dimensions()[m_indices[kernel_index[1]]]; + const size_t numZ = dimensions()[m_indices[kernel_index[2]]]; + auto input_dim = std::array{numX, numY, numZ}; + const size_t numP = dimensions().TotalSize() / (numX * numY * numZ); + + const array indices{ + {m_indices[kernel_index[0]], m_indices[kernel_index[1]], m_indices[kernel_index[2]]}}; + const array kernel_dims{{m_kernelImpl.dimensions()[kernel_index[0]], + m_kernelImpl.dimensions()[kernel_index[1]], + m_kernelImpl.dimensions()[kernel_index[2]]}}; + + internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); + + auto global_range = cl::sycl::range<3>{}; + auto local_range = cl::sycl::range<3>{}; + + m_device.parallel_for_setup(input_dim, global_range, local_range); + auto local_memory_range = (local_range + kernel_size - 1); + const size_t local_memory_size = local_memory_range[0] * local_memory_range[1] * local_memory_range[2]; + + gpu_assert(static_cast(local_memory_size) <= m_device.sharedMemPerBlock()); + typedef EigenConvolutionKernel + ConvKernel; + m_device.template binary_kernel_launcher( + m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size, + indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP); + break; + } + + default: { + EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), + THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + eigen_assert(m_buf != NULL); + eigen_assert(index < m_dimensions.TotalSize()); + return m_buf[index]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE 
PacketReturnType packet(const Index index) const { + eigen_assert(m_buf != NULL); + eigen_assert(index < m_dimensions.TotalSize()); + return internal::ploadt(m_buf + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost + // model. + const double kernel_size = m_kernelImpl.dimensions().TotalSize(); + // We ignore the use of fused multiply-add. + const double convolve_compute_cost = TensorOpCost::AddCost() + TensorOpCost::MulCost(); + const double firstIndex_compute_cost = + NumDims * + (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + TensorOpCost::DivCost()); + return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + + kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize)); + } + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_kernelImpl.bind(cgh); + m_inputImpl.bind(cgh); + m_buf.bind(cgh); + m_kernel.bind(cgh); + } + + private: + // No assignment (copies are needed by the kernels) + TensorEvaluator &operator=(const TensorEvaluator &); + TensorEvaluator m_inputImpl; + KernelArgType m_kernelArg; + TensorEvaluator m_kernelImpl; + Indices m_indices; + Dimensions m_dimensions; + EvaluatorPointerType m_buf; + typename KernelStorage::Type m_kernel; + bool m_local_kernel; + const Eigen::SyclDevice EIGEN_DEVICE_REF m_device; +}; // namespace Eigen + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorCostModel.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorCostModel.h new file mode 100644 index 0000000..195267c --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorCostModel.h @@ -0,0 +1,214 @@ +// This file is part 
of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Rasmus Munk Larsen +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H +#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H + +namespace Eigen { + +/** \class TensorEvaluator + * \ingroup CXX11_Tensor_Module + * + * \brief A cost model used to limit the number of threads used for evaluating + * tensor expression. + * + */ + +// Class storing the cost of evaluating a tensor expression in terms of the +// estimated number of operand bytes loads, bytes stored, and compute cycles. +class TensorOpCost { + public: + // TODO(rmlarsen): Fix the scalar op costs in Eigen proper. Even a simple + // model based on minimal reciprocal throughput numbers from Intel or + // Agner Fog's tables would be better than what is there now. 
+ template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int MulCost() { + return internal::functor_traits< + internal::scalar_product_op >::Cost; + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int AddCost() { + return internal::functor_traits >::Cost; + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int DivCost() { + return internal::functor_traits< + internal::scalar_quotient_op >::Cost; + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int ModCost() { + return internal::functor_traits >::Cost; + } + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int CastCost() { + return internal::functor_traits< + internal::scalar_cast_op >::Cost; + } + + EIGEN_DEVICE_FUNC + TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {} + EIGEN_DEVICE_FUNC + TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles) + : bytes_loaded_(bytes_loaded), + bytes_stored_(bytes_stored), + compute_cycles_(compute_cycles) {} + + EIGEN_DEVICE_FUNC + TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles, + bool vectorized, double packet_size) + : bytes_loaded_(bytes_loaded), + bytes_stored_(bytes_stored), + compute_cycles_(vectorized ? 
compute_cycles / packet_size + : compute_cycles) { + eigen_assert(bytes_loaded >= 0 && (numext::isfinite)(bytes_loaded)); + eigen_assert(bytes_stored >= 0 && (numext::isfinite)(bytes_stored)); + eigen_assert(compute_cycles >= 0 && (numext::isfinite)(compute_cycles)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const { + return bytes_loaded_; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const { + return bytes_stored_; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const { + return compute_cycles_; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost( + double load_cost, double store_cost, double compute_cost) const { + return load_cost * bytes_loaded_ + store_cost * bytes_stored_ + + compute_cost * compute_cycles_; + } + + // Drop memory access component. Intended for cases when memory accesses are + // sequential or are completely masked by computations. + EIGEN_DEVICE_FUNC void dropMemoryCost() { + bytes_loaded_ = 0; + bytes_stored_ = 0; + } + + // TODO(rmlarsen): Define min in terms of total cost, not elementwise. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin( + const TensorOpCost& rhs) const { + double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded()); + double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored()); + double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles()); + return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles); + } + + // TODO(rmlarsen): Define max in terms of total cost, not elementwise. 
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax( + const TensorOpCost& rhs) const { + double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded()); + double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored()); + double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles()); + return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=( + const TensorOpCost& rhs) { + bytes_loaded_ += rhs.bytes_loaded(); + bytes_stored_ += rhs.bytes_stored(); + compute_cycles_ += rhs.compute_cycles(); + return *this; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(double rhs) { + bytes_loaded_ *= rhs; + bytes_stored_ *= rhs; + compute_cycles_ *= rhs; + return *this; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+( + TensorOpCost lhs, const TensorOpCost& rhs) { + lhs += rhs; + return lhs; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*( + TensorOpCost lhs, double rhs) { + lhs *= rhs; + return lhs; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*( + double lhs, TensorOpCost rhs) { + rhs *= lhs; + return rhs; + } + + friend std::ostream& operator<<(std::ostream& os, const TensorOpCost& tc) { + return os << "[bytes_loaded = " << tc.bytes_loaded() + << ", bytes_stored = " << tc.bytes_stored() + << ", compute_cycles = " << tc.compute_cycles() << "]"; + } + + private: + double bytes_loaded_; + double bytes_stored_; + double compute_cycles_; +}; + +// TODO(rmlarsen): Implement a policy that chooses an "optimal" number of theads +// in [1:max_threads] instead of just switching multi-threading off for small +// work units. +template +class TensorCostModel { + public: + // Scaling from Eigen compute cost to device cycles. + static const int kDeviceCyclesPerComputeCycle = 1; + + // Costs in device cycles. 
+ static const int kStartupCycles = 100000; + static const int kPerThreadCycles = 100000; + static const int kTaskSize = 40000; + + // Returns the number of threads in [1:max_threads] to use for + // evaluating an expression with the given output size and cost per + // coefficient. + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads( + double output_size, const TensorOpCost& cost_per_coeff, int max_threads) { + double cost = totalCost(output_size, cost_per_coeff); + double threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9; + // Make sure we don't invoke undefined behavior when we convert to an int. + threads = numext::mini(threads, GenericNumTraits::highest()); + return numext::mini(max_threads, + numext::maxi(1, static_cast(threads))); + } + + // taskSize assesses parallel task size. + // Value of 1.0 means ideal parallel task size. Values < 1.0 mean that task + // granularity needs to be increased to mitigate parallelization overheads. + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize( + double output_size, const TensorOpCost& cost_per_coeff) { + return totalCost(output_size, cost_per_coeff) / kTaskSize; + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost( + double output_size, const TensorOpCost& cost_per_coeff) { + // Cost of memory fetches from L2 cache. 64 is typical cache line size. + // 11 is L2 cache latency on Haswell. + // We don't know whether data is in L1, L2 or L3. But we are most interested + // in single-threaded computational time around 100us-10ms (smaller time + // is too small for parallelization, larger time is not interesting + // either because we are probably using all available threads already). + // And for the target time range, L2 seems to be what matters. Data set + // fitting into L1 is too small to take noticeable time. Data set fitting + // only into L3 presumably will take more than 10ms to load and process. 
+ const double kLoadCycles = 1.0 / 64 * 11; + const double kStoreCycles = 1.0 / 64 * 11; + // Scaling from Eigen compute cost to device cycles. + return output_size * + cost_per_coeff.total_cost(kLoadCycles, kStoreCycles, + kDeviceCyclesPerComputeCycle); + } +}; + +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorCustomOp.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorCustomOp.h new file mode 100644 index 0000000..95a8a84 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorCustomOp.h @@ -0,0 +1,347 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H +#define EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H + +namespace Eigen { + +/** \class TensorCustomUnaryOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor custom class. 
+ * + * + */ +namespace internal { +template +struct traits > +{ + typedef typename XprType::Scalar Scalar; + typedef typename XprType::StorageKind StorageKind; + typedef typename XprType::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = traits::NumDimensions; + static const int Layout = traits::Layout; + typedef typename traits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorCustomUnaryOpEIGEN_DEVICE_REF type; +}; + +template +struct nested > +{ + typedef TensorCustomUnaryOp type; +}; + +} // end namespace internal + + + +template +class TensorCustomUnaryOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCustomUnaryOp(const XprType& expr, const CustomUnaryFunc& func) + : m_expr(expr), m_func(func) {} + + EIGEN_DEVICE_FUNC + const CustomUnaryFunc& func() const { return m_func; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_expr; } + + protected: + typename XprType::Nested m_expr; + const CustomUnaryFunc m_func; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorCustomUnaryOp ArgType; + typedef typename internal::traits::Index Index; + static const int NumDims = internal::traits::NumDimensions; + typedef DSizes Dimensions; + typedef typename internal::remove_const::type Scalar; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = PacketType::size; + typedef typename 
Eigen::internal::traits::PointerType TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + enum { + IsAligned = false, + PacketAccess = (PacketType::size > 1), + BlockAccess = false, + PreferBlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device) + : m_op(op), m_device(device), m_result(NULL) + { + m_dimensions = op.func().dimensions(op.expression()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + if (data) { + evalTo(data); + return false; + } else { + m_result = static_cast(m_device.get( (CoeffReturnType*) + m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)))); + evalTo(m_result); + return true; + } + } + + EIGEN_STRONG_INLINE void cleanup() { + if (m_result) { + m_device.deallocate_temp(m_result); + m_result = NULL; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return m_result[index]; + } + + template + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + return internal::ploadt(m_result + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + // TODO(rmlarsen): Extend CustomOp API to return its cost estimate. 
+ return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_result.bind(cgh); + } +#endif + + protected: + void evalTo(EvaluatorPointerType data) { + TensorMap > result(m_device.get(data), m_dimensions); + m_op.func().eval(m_op.expression(), result, m_device); + } + + Dimensions m_dimensions; + const ArgType m_op; + const Device EIGEN_DEVICE_REF m_device; + EvaluatorPointerType m_result; +}; + + + +/** \class TensorCustomBinaryOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor custom class. + * + * + */ +namespace internal { +template +struct traits > +{ + typedef typename internal::promote_storage_type::ret Scalar; + typedef typename internal::promote_storage_type::ret CoeffReturnType; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef typename promote_index_type::Index, + typename traits::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference::type _LhsNested; + typedef typename remove_reference::type _RhsNested; + static const int NumDimensions = traits::NumDimensions; + static const int Layout = traits::Layout; + typedef typename conditional::val, + typename traits::PointerType, typename traits::PointerType>::type PointerType; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorCustomBinaryOp& type; +}; + +template +struct nested > +{ + typedef TensorCustomBinaryOp type; +}; + +} // end namespace internal + + + +template +class TensorCustomBinaryOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real 
RealScalar; + typedef typename internal::traits::CoeffReturnType CoeffReturnType; + typedef typename internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCustomBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const CustomBinaryFunc& func) + + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_func(func) {} + + EIGEN_DEVICE_FUNC + const CustomBinaryFunc& func() const { return m_func; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + lhsExpression() const { return m_lhs_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + rhsExpression() const { return m_rhs_xpr; } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const CustomBinaryFunc m_func; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorCustomBinaryOp XprType; + typedef typename internal::traits::Index Index; + static const int NumDims = internal::traits::NumDimensions; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = PacketType::size; + + typedef typename Eigen::internal::traits::PointerType TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + enum { + IsAligned = false, + PacketAccess = (PacketType::size > 1), + BlockAccess = false, + PreferBlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE 
TensorEvaluator(const XprType& op, const Device& device) + : m_op(op), m_device(device), m_result(NULL) + { + m_dimensions = op.func().dimensions(op.lhsExpression(), op.rhsExpression()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + if (data) { + evalTo(data); + return false; + } else { + m_result = static_cast(m_device.get( (CoeffReturnType*) + m_device.allocate_temp(dimensions().TotalSize() * sizeof(CoeffReturnType)))); + evalTo(m_result); + return true; + } + } + + EIGEN_STRONG_INLINE void cleanup() { + if (m_result != NULL) { + m_device.deallocate_temp(m_result); + m_result = NULL; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return m_result[index]; + } + + template + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + return internal::ploadt(m_result + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + // TODO(rmlarsen): Extend CustomOp API to return its cost estimate. 
+ return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_result.bind(cgh); + } +#endif + + protected: + void evalTo(EvaluatorPointerType data) { + TensorMap > result(m_device.get(data), m_dimensions); + m_op.func().eval(m_op.lhsExpression(), m_op.rhsExpression(), result, m_device); + } + + Dimensions m_dimensions; + const XprType m_op; + const Device EIGEN_DEVICE_REF m_device; + EvaluatorPointerType m_result; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorDevice.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorDevice.h new file mode 100644 index 0000000..96fa46c --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorDevice.h @@ -0,0 +1,137 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H + +namespace Eigen { + +/** \class TensorDevice + * \ingroup CXX11_Tensor_Module + * + * \brief Pseudo expression providing an operator = that will evaluate its argument + * on the specified computing 'device' (GPU, thread pool, ...) + * + * Example: + * C.device(EIGEN_GPU) = A + B; + * + * Todo: operator *= and /=. 
+ */ + +template class TensorDevice { + public: + TensorDevice(const DeviceType& device, ExpressionType& expression) : m_device(device), m_expression(expression) {} + + EIGEN_DEFAULT_COPY_CONSTRUCTOR(TensorDevice) + + template + EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) { + typedef TensorAssignOp Assign; + Assign assign(m_expression, other); + internal::TensorExecutor::run(assign, m_device); + return *this; + } + + template + EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp, const ExpressionType, const OtherDerived> Sum; + Sum sum(m_expression, other); + typedef TensorAssignOp Assign; + Assign assign(m_expression, sum); + internal::TensorExecutor::run(assign, m_device); + return *this; + } + + template + EIGEN_STRONG_INLINE TensorDevice& operator-=(const OtherDerived& other) { + typedef typename OtherDerived::Scalar Scalar; + typedef TensorCwiseBinaryOp, const ExpressionType, const OtherDerived> Difference; + Difference difference(m_expression, other); + typedef TensorAssignOp Assign; + Assign assign(m_expression, difference); + internal::TensorExecutor::run(assign, m_device); + return *this; + } + + protected: + const DeviceType& m_device; + ExpressionType& m_expression; +}; + +/** \class TensorAsyncDevice + * \ingroup CXX11_Tensor_Module + * + * \brief Pseudo expression providing an operator = that will evaluate its + * argument asynchronously on the specified device. Currently only + * ThreadPoolDevice implements proper asynchronous execution, while the default + * and GPU devices just run the expression synchronously and call m_done() on + * completion.. + * + * Example: + * auto done = []() { ... expression evaluation done ... 
}; + * C.device(thread_pool_device, std::move(done)) = A + B; + */ + +template +class TensorAsyncDevice { + public: + TensorAsyncDevice(const DeviceType& device, ExpressionType& expression, + DoneCallback done) + : m_device(device), m_expression(expression), m_done(std::move(done)) {} + + template + EIGEN_STRONG_INLINE TensorAsyncDevice& operator=(const OtherDerived& other) { + typedef TensorAssignOp Assign; + typedef internal::TensorExecutor Executor; + + Assign assign(m_expression, other); + Executor::run(assign, m_device); + m_done(); + + return *this; + } + + protected: + const DeviceType& m_device; + ExpressionType& m_expression; + DoneCallback m_done; +}; + + +#ifdef EIGEN_USE_THREADS +template +class TensorAsyncDevice { + public: + TensorAsyncDevice(const ThreadPoolDevice& device, ExpressionType& expression, + DoneCallback done) + : m_device(device), m_expression(expression), m_done(std::move(done)) {} + + template + EIGEN_STRONG_INLINE TensorAsyncDevice& operator=(const OtherDerived& other) { + typedef TensorAssignOp Assign; + typedef internal::TensorAsyncExecutor Executor; + + // WARNING: After assignment 'm_done' callback will be in undefined state. 
+ Assign assign(m_expression, other); + Executor::runAsync(assign, m_device, std::move(m_done)); + + return *this; + } + + protected: + const ThreadPoolDevice& m_device; + ExpressionType& m_expression; + DoneCallback m_done; +}; +#endif + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceCuda.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceCuda.h new file mode 100644 index 0000000..f779239 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceCuda.h @@ -0,0 +1,6 @@ + +#if defined(__clang__) || defined(__GNUC__) +#warning "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorDeviceGpu.h file" +#endif + +#include "TensorDeviceGpu.h" diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceDefault.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceDefault.h new file mode 100644 index 0000000..46b9d3a --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceDefault.h @@ -0,0 +1,104 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H + + +namespace Eigen { + +// Default device for the machine (typically a single cpu core) +struct DefaultDevice { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + return internal::aligned_malloc(num_bytes); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + internal::aligned_free(buffer); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const { + return allocate(num_bytes); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const { + deallocate(buffer); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { + ::memcpy(dst, src, n); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { + memcpy(dst, src, n); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { + memcpy(dst, src, n); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { + ::memset(buffer, c, n); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { + return data; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { +#if !defined(EIGEN_GPU_COMPILE_PHASE) + // Running on the host CPU + return 1; +#elif defined(EIGEN_HIP_DEVICE_COMPILE) + // Running on a HIP device + return 64; +#else + // Running on a CUDA device + return 32; +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { +#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY) + // Running on the host CPU + return l1CacheSize(); +#elif defined(EIGEN_HIP_DEVICE_COMPILE) + // Running on a HIP device + return 48*1024; // FIXME : update this number for HIP +#else + // Running on a CUDA device, return the amount of 
shared memory available. + return 48*1024; +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { +#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY) + // Running single threaded on the host CPU + return l3CacheSize(); +#elif defined(EIGEN_HIP_DEVICE_COMPILE) + // Running on a HIP device + return firstLevelCacheSize(); // FIXME : update this number for HIP +#else + // Running on a CUDA device + return firstLevelCacheSize(); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { +#if !defined(EIGEN_GPU_COMPILE_PHASE) + // Running single threaded on the host CPU + // Should return an enum that encodes the ISA supported by the CPU + return 1; +#elif defined(EIGEN_HIP_DEVICE_COMPILE) + // Running on a HIP device + // return 1 as major for HIP + return 1; +#else + // Running on a CUDA device + return EIGEN_CUDA_ARCH / 100; +#endif + } +}; + +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceGpu.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceGpu.h new file mode 100644 index 0000000..ec2e3cb --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceGpu.h @@ -0,0 +1,389 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H) +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H + +// This header file container defines fo gpu* macros which will resolve to +// their equivalent hip* or cuda* versions depending on the compiler in use +// A separate header (included at the end of this file) will undefine all +#include "TensorGpuHipCudaDefines.h" + +namespace Eigen { + +static const int kGpuScratchSize = 1024; + +// This defines an interface that GPUDevice can take to use +// HIP / CUDA streams underneath. +class StreamInterface { + public: + virtual ~StreamInterface() {} + + virtual const gpuStream_t& stream() const = 0; + virtual const gpuDeviceProp_t& deviceProperties() const = 0; + + // Allocate memory on the actual device where the computation will run + virtual void* allocate(size_t num_bytes) const = 0; + virtual void deallocate(void* buffer) const = 0; + + // Return a scratchpad buffer of size 1k + virtual void* scratchpad() const = 0; + + // Return a semaphore. The semaphore is initially initialized to 0, and + // each kernel using it is responsible for resetting to 0 upon completion + // to maintain the invariant that the semaphore is always equal to 0 upon + // each kernel start. + virtual unsigned int* semaphore() const = 0; +}; + +class GpuDeviceProperties { + public: + GpuDeviceProperties() : + initialized_(false), first_(true), device_properties_(nullptr) {} + + ~GpuDeviceProperties() { + if (device_properties_) { + delete[] device_properties_; + } + } + + EIGEN_STRONG_INLINE const gpuDeviceProp_t& get(int device) const { + return device_properties_[device]; + } + + EIGEN_STRONG_INLINE bool isInitialized() const { + return initialized_; + } + + void initialize() { + if (!initialized_) { + // Attempts to ensure proper behavior in the case of multiple threads + // calling this function simultaneously. 
This would be trivial to + // implement if we could use std::mutex, but unfortunately mutex don't + // compile with nvcc, so we resort to atomics and thread fences instead. + // Note that if the caller uses a compiler that doesn't support c++11 we + // can't ensure that the initialization is thread safe. + if (first_.exchange(false)) { + // We're the first thread to reach this point. + int num_devices; + gpuError_t status = gpuGetDeviceCount(&num_devices); + if (status != gpuSuccess) { + std::cerr << "Failed to get the number of GPU devices: " + << gpuGetErrorString(status) + << std::endl; + gpu_assert(status == gpuSuccess); + } + device_properties_ = new gpuDeviceProp_t[num_devices]; + for (int i = 0; i < num_devices; ++i) { + status = gpuGetDeviceProperties(&device_properties_[i], i); + if (status != gpuSuccess) { + std::cerr << "Failed to initialize GPU device #" + << i + << ": " + << gpuGetErrorString(status) + << std::endl; + gpu_assert(status == gpuSuccess); + } + } + + std::atomic_thread_fence(std::memory_order_release); + initialized_ = true; + } else { + // Wait for the other thread to inititialize the properties. 
+ while (!initialized_) { + std::atomic_thread_fence(std::memory_order_acquire); + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } + } + } + } + + private: + volatile bool initialized_; + std::atomic first_; + gpuDeviceProp_t* device_properties_; +}; + +EIGEN_ALWAYS_INLINE const GpuDeviceProperties& GetGpuDeviceProperties() { + static GpuDeviceProperties* deviceProperties = new GpuDeviceProperties(); + if (!deviceProperties->isInitialized()) { + deviceProperties->initialize(); + } + return *deviceProperties; +} + +EIGEN_ALWAYS_INLINE const gpuDeviceProp_t& GetGpuDeviceProperties(int device) { + return GetGpuDeviceProperties().get(device); +} + +static const gpuStream_t default_stream = gpuStreamDefault; + +class GpuStreamDevice : public StreamInterface { + public: + // Use the default stream on the current device + GpuStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) { + gpuGetDevice(&device_); + } + // Use the default stream on the specified device + GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {} + // Use the specified stream. Note that it's the + // caller responsibility to ensure that the stream can run on + // the specified device. If no device is specified the code + // assumes that the stream is associated to the current gpu device. 
+ GpuStreamDevice(const gpuStream_t* stream, int device = -1) + : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) { + if (device < 0) { + gpuGetDevice(&device_); + } else { + int num_devices; + gpuError_t err = gpuGetDeviceCount(&num_devices); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); + gpu_assert(device < num_devices); + device_ = device; + } + } + + virtual ~GpuStreamDevice() { + if (scratch_) { + deallocate(scratch_); + } + } + + const gpuStream_t& stream() const { return *stream_; } + const gpuDeviceProp_t& deviceProperties() const { + return GetGpuDeviceProperties(device_); + } + virtual void* allocate(size_t num_bytes) const { + gpuError_t err = gpuSetDevice(device_); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); + void* result; + err = gpuMalloc(&result, num_bytes); + gpu_assert(err == gpuSuccess); + gpu_assert(result != NULL); + return result; + } + virtual void deallocate(void* buffer) const { + gpuError_t err = gpuSetDevice(device_); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); + gpu_assert(buffer != NULL); + err = gpuFree(buffer); + gpu_assert(err == gpuSuccess); + } + + virtual void* scratchpad() const { + if (scratch_ == NULL) { + scratch_ = allocate(kGpuScratchSize + sizeof(unsigned int)); + } + return scratch_; + } + + virtual unsigned int* semaphore() const { + if (semaphore_ == NULL) { + char* scratch = static_cast(scratchpad()) + kGpuScratchSize; + semaphore_ = reinterpret_cast(scratch); + gpuError_t err = gpuMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); + } + return semaphore_; + } + + private: + const gpuStream_t* stream_; + int device_; + mutable void* scratch_; + mutable unsigned int* semaphore_; +}; + +struct GpuDevice { + // The StreamInterface is not owned: the caller is + // responsible for its initialization and eventual destruction. 
+ explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) { + eigen_assert(stream); + } + explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) { + eigen_assert(stream); + } + // TODO(bsteiner): This is an internal API, we should not expose it. + EIGEN_STRONG_INLINE const gpuStream_t& stream() const { + return stream_->stream(); + } + + EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + return stream_->allocate(num_bytes); + } + + EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + stream_->deallocate(buffer); + } + + EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const { + return stream_->allocate(num_bytes); + } + + EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const { + stream_->deallocate(buffer); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { + return data; + } + + EIGEN_STRONG_INLINE void* scratchpad() const { + return stream_->scratchpad(); + } + + EIGEN_STRONG_INLINE unsigned int* semaphore() const { + return stream_->semaphore(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { +#ifndef EIGEN_GPU_COMPILE_PHASE + gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToDevice, + stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); +#else + EIGEN_UNUSED_VARIABLE(dst); + EIGEN_UNUSED_VARIABLE(src); + EIGEN_UNUSED_VARIABLE(n); + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { + gpuError_t err = + gpuMemcpyAsync(dst, src, n, gpuMemcpyHostToDevice, stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); + } + + EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { + gpuError_t err = + 
gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToHost, stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { +#ifndef EIGEN_GPU_COMPILE_PHASE + gpuError_t err = gpuMemsetAsync(buffer, c, n, stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + gpu_assert(err == gpuSuccess); +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_STRONG_INLINE size_t numThreads() const { + // FIXME + return 32; + } + + EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { + // FIXME + return 48*1024; + } + + EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { + // We won't try to take advantage of the l2 cache for the time being, and + // there is no l3 cache on hip/cuda devices. + return firstLevelCacheSize(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { +#ifndef EIGEN_GPU_COMPILE_PHASE + gpuError_t err = gpuStreamSynchronize(stream_->stream()); + if (err != gpuSuccess) { + std::cerr << "Error detected in GPU stream: " + << gpuGetErrorString(err) + << std::endl; + gpu_assert(err == gpuSuccess); + } +#else + gpu_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_STRONG_INLINE int getNumGpuMultiProcessors() const { + return stream_->deviceProperties().multiProcessorCount; + } + EIGEN_STRONG_INLINE int maxGpuThreadsPerBlock() const { + return stream_->deviceProperties().maxThreadsPerBlock; + } + EIGEN_STRONG_INLINE int maxGpuThreadsPerMultiProcessor() const { + return stream_->deviceProperties().maxThreadsPerMultiProcessor; + } + EIGEN_STRONG_INLINE int sharedMemPerBlock() const { + return stream_->deviceProperties().sharedMemPerBlock; + } + EIGEN_STRONG_INLINE int majorDeviceVersion() const { + return stream_->deviceProperties().major; + } + EIGEN_STRONG_INLINE int minorDeviceVersion() const { + return 
stream_->deviceProperties().minor; + } + + EIGEN_STRONG_INLINE int maxBlocks() const { + return max_blocks_; + } + + // This function checks if the GPU runtime recorded an error for the + // underlying stream device. + inline bool ok() const { +#ifdef EIGEN_GPUCC + gpuError_t error = gpuStreamQuery(stream_->stream()); + return (error == gpuSuccess) || (error == gpuErrorNotReady); +#else + return false; +#endif + } + + private: + const StreamInterface* stream_; + int max_blocks_; +}; + +#if defined(EIGEN_HIPCC) + +#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ + hipLaunchKernelGGL(kernel, dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), __VA_ARGS__); \ + gpu_assert(hipGetLastError() == hipSuccess); + +#else + +#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ + (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ + gpu_assert(cudaGetLastError() == cudaSuccess); + +#endif + +// FIXME: Should be device and kernel specific. +#ifdef EIGEN_GPUCC +static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig config) { +#ifndef EIGEN_GPU_COMPILE_PHASE + gpuError_t status = gpuDeviceSetSharedMemConfig(config); + EIGEN_UNUSED_VARIABLE(status) + gpu_assert(status == gpuSuccess); +#else + EIGEN_UNUSED_VARIABLE(config) +#endif +} +#endif + +} // end namespace Eigen + +// undefine all the gpu* macros we defined at the beginning of the file +#include "TensorGpuHipCudaUndefines.h" + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceSycl.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceSycl.h new file mode 100644 index 0000000..df591c2 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceSycl.h @@ -0,0 +1,1048 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. 
+// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// Copyright (C) 2016 Benoit Steiner + +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H) +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H +#include + +namespace Eigen { + +namespace TensorSycl { +namespace internal { + +/// Cache all the device information needed +struct SyclDeviceInfo { + SyclDeviceInfo(cl::sycl::queue queue) + : local_mem_type( + queue.get_device() + .template get_info()), + max_work_item_sizes( + queue.get_device() + .template get_info< + cl::sycl::info::device::max_work_item_sizes>()), + max_mem_alloc_size( + queue.get_device() + .template get_info< + cl::sycl::info::device::max_mem_alloc_size>()), + max_compute_units(queue.get_device() + .template get_info< + cl::sycl::info::device::max_compute_units>()), + max_work_group_size( + queue.get_device() + .template get_info< + cl::sycl::info::device::max_work_group_size>()), + local_mem_size( + queue.get_device() + .template get_info()), + platform_name(queue.get_device() + .get_platform() + .template get_info()), + device_name(queue.get_device() + .template get_info()), + device_vendor( + queue.get_device() + .template get_info()) {} + + cl::sycl::info::local_mem_type local_mem_type; + cl::sycl::id<3> max_work_item_sizes; + unsigned long max_mem_alloc_size; + unsigned long max_compute_units; + unsigned long max_work_group_size; + size_t local_mem_size; + std::string platform_name; + std::string device_name; + std::string device_vendor; +}; + +} // end namespace internal +} // end namespace TensorSycl + +typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t; +// All devices (even AMD CPU with intel OpenCL runtime) that support OpenCL and +// can consume SPIR 
or SPIRV can use the Eigen SYCL backend and consequently +// TensorFlow via the Eigen SYCL Backend. +EIGEN_STRONG_INLINE auto get_sycl_supported_devices() + -> decltype(cl::sycl::device::get_devices()) { +#ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR + return {cl::sycl::device(cl::sycl::default_selector())}; +#else + std::vector supported_devices; + auto platform_list = cl::sycl::platform::get_platforms(); + for (const auto &platform : platform_list) { + auto device_list = platform.get_devices(); + auto platform_name = + platform.template get_info(); + std::transform(platform_name.begin(), platform_name.end(), + platform_name.begin(), ::tolower); + for (const auto &device : device_list) { + auto vendor = device.template get_info(); + std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower); + bool unsupported_condition = + (device.is_cpu() && platform_name.find("amd") != std::string::npos && + vendor.find("apu") == std::string::npos) || + (platform_name.find("experimental") != std::string::npos) || + device.is_host(); + if (!unsupported_condition) { + supported_devices.push_back(device); + } + } + } + return supported_devices; +#endif +} + +class QueueInterface { + public: + /// Creating device by using cl::sycl::selector or cl::sycl::device. 
+ template + explicit QueueInterface( + const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler, + unsigned num_threads = std::thread::hardware_concurrency()) + : m_queue(dev_or_sel, handler), +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + m_prog(m_queue.get_context(), get_sycl_supported_devices()), +#endif + m_thread_pool(num_threads), + m_device_info(m_queue) { +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + m_prog.build_with_kernel_type(); + auto f = [&](cl::sycl::handler &cgh) { + cgh.single_task(m_prog.get_kernel(), + [=]() {}) + }; + EIGEN_SYCL_TRY_CATCH(m_queue.submit(f)); +#endif + } + + template + explicit QueueInterface( + const DeviceOrSelector &dev_or_sel, + unsigned num_threads = std::thread::hardware_concurrency()) + : QueueInterface(dev_or_sel, + [this](cl::sycl::exception_list l) { + this->exception_caught_ = this->sycl_async_handler(l); + }, + num_threads) {} + +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + EIGEN_STRONG_INLINE cl::sycl::program &program() const { return m_prog; } +#endif + + /// Attach an existing buffer to the pointer map, Eigen will not reuse it + EIGEN_STRONG_INLINE void *attach_buffer( + cl::sycl::buffer &buf) const { + std::lock_guard lock(pmapper_mutex_); + return static_cast(pMapper.add_pointer(buf)); + } + + /// Detach previously attached buffer + EIGEN_STRONG_INLINE void detach_buffer(void *p) const { + std::lock_guard lock(pmapper_mutex_); + TensorSycl::internal::SYCLfree(p, pMapper); + } + + /// Allocating device pointer. This pointer is actually an 8 bytes host + /// pointer used as key to access the sycl device buffer. The reason is that + /// we cannot use device buffer as a pointer as a m_data in Eigen leafNode + /// expressions. So we create a key pointer to be used in Eigen expression + /// construction. When we convert the Eigen construction into the sycl + /// construction we use this pointer as a key in our buffer_map and we make + /// sure that we dedicate only one buffer only for this pointer. 
The device + /// pointer would be deleted by calling deallocate function. + EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { +#if EIGEN_MAX_ALIGN_BYTES > 0 + size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES; + if (align > 0) { + num_bytes += EIGEN_MAX_ALIGN_BYTES - align; + } +#endif + std::lock_guard lock(pmapper_mutex_); + return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper); + } + + EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const { +#if EIGEN_MAX_ALIGN_BYTES > 0 + size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES; + if (align > 0) { + num_bytes += EIGEN_MAX_ALIGN_BYTES - align; + } +#endif + std::lock_guard lock(pmapper_mutex_); +#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS + if (scratch_buffers.empty()) { + return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper); + ; + } else { + for (auto it = scratch_buffers.begin(); it != scratch_buffers.end();) { + auto buff = pMapper.get_buffer(*it); + if (buff.get_size() >= num_bytes) { + auto ptr = *it; + scratch_buffers.erase(it); + return ptr; + } else { + ++it; + } + } + return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper); + } +#else + return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper); +#endif + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess< + cl::sycl::access::mode::read_write, data_t> + get(data_t *data) const { + return get_range_accessor(data); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get( + TensorSycl::internal::RangeAccess + data) const { + return static_cast(data.get_virtual_pointer()); + } + + EIGEN_STRONG_INLINE void deallocate_temp(void *p) const { + std::lock_guard lock(pmapper_mutex_); +#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS + scratch_buffers.insert(p); +#else + TensorSycl::internal::SYCLfree(p, pMapper); +#endif + } + template + EIGEN_STRONG_INLINE void deallocate_temp( + const TensorSycl::internal::RangeAccess &p) const { + deallocate_temp(p.get_virtual_pointer()); + } + + /// This is used to 
deallocate the device pointer. p is used as a key inside + /// the map to find the device buffer and delete it. + EIGEN_STRONG_INLINE void deallocate(void *p) const { + std::lock_guard lock(pmapper_mutex_); + TensorSycl::internal::SYCLfree(p, pMapper); + } + + EIGEN_STRONG_INLINE void deallocate_all() const { + std::lock_guard lock(pmapper_mutex_); + TensorSycl::internal::SYCLfreeAll(pMapper); +#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS + scratch_buffers.clear(); +#endif + } + + /// The memcpyHostToDevice is used to copy the data from host to device. + /// The destination pointer could be deleted before the copy happened, which is + /// why a callback function is needed. By default, if none is provided, the + /// function is blocking. + EIGEN_STRONG_INLINE void memcpyHostToDevice( + void *dst, const void *src, size_t n, + std::function callback) const { + static const auto write_mode = cl::sycl::access::mode::discard_write; + static const auto global_access = cl::sycl::access::target::global_buffer; + typedef cl::sycl::accessor + write_accessor; + if (n == 0) { + if (callback) callback(); + return; + } + n /= sizeof(buffer_scalar_t); + auto f = [&](cl::sycl::handler &cgh) { + write_accessor dst_acc = get_range_accessor(cgh, dst, n); + buffer_scalar_t const *ptr = static_cast(src); + auto non_deleter = [](buffer_scalar_t const *) {}; + std::shared_ptr s_ptr(ptr, non_deleter); + cgh.copy(s_ptr, dst_acc); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f)); + synchronize_and_callback(e, callback); + } + + /// The memcpyDeviceToHost is used to copy the data from device to host. + /// The source pointer could be deleted before the copy happened, which is + /// why a callback function is needed. By default, if none is provided, the + /// function is blocking. 
+ EIGEN_STRONG_INLINE void memcpyDeviceToHost( + void *dst, const void *src, size_t n, + std::function callback) const { + static const auto read_mode = cl::sycl::access::mode::read; + static const auto global_access = cl::sycl::access::target::global_buffer; + typedef cl::sycl::accessor + read_accessor; + if (n == 0) { + if (callback) callback(); + return; + } + n /= sizeof(buffer_scalar_t); + auto f = [&](cl::sycl::handler &cgh) { + read_accessor src_acc = get_range_accessor(cgh, src, n); + buffer_scalar_t *ptr = static_cast(dst); + auto non_deleter = [](buffer_scalar_t *) {}; + std::shared_ptr s_ptr(ptr, non_deleter); + cgh.copy(src_acc, s_ptr); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f)); + synchronize_and_callback(e, callback); + } + + /// The memcpy function. + /// No callback is required here as both arguments are on the device + /// and SYCL can handle the dependency. + EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const { + static const auto read_mode = cl::sycl::access::mode::read; + static const auto write_mode = cl::sycl::access::mode::discard_write; + if (n == 0) { + return; + } + n /= sizeof(buffer_scalar_t); + auto f = [&](cl::sycl::handler &cgh) { + auto src_acc = get_range_accessor(cgh, src, n); + auto dst_acc = get_range_accessor(cgh, dst, n); + cgh.copy(src_acc, dst_acc); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f)); + async_synchronize(e); + } + + /// the memset function. + /// No callback is required here as both arguments are on the device + /// and SYCL can handle the dependency. + EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { + static const auto write_mode = cl::sycl::access::mode::discard_write; + if (n == 0) { + return; + } + n /= sizeof(buffer_scalar_t); + auto f = [&](cl::sycl::handler &cgh) { + auto dst_acc = get_range_accessor(cgh, data, n); + // The cast to uint8_t is here to match the behaviour of the standard + // memset. 
The cast to buffer_scalar_t is needed to match the type of the + // accessor (in case buffer_scalar_t is not uint8_t) + cgh.fill(dst_acc, static_cast(static_cast(c))); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f)); + async_synchronize(e); + } + + /// Get a range accessor to the virtual pointer's device memory. This range + /// accessor will allow access to the memory from the pointer to the end of + /// the buffer. + /// + /// NOTE: Inside a kernel the range accessor will always be indexed from the + /// start of the buffer, so the offset in the accessor is only used by + /// methods like handler::copy and will not be available inside a kernel. + template + EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess + get_range_accessor(const void *ptr) const { + static const auto global_access = cl::sycl::access::target::global_buffer; + static const auto is_place_holder = cl::sycl::access::placeholder::true_t; + typedef TensorSycl::internal::RangeAccess ret_type; + typedef const TensorSycl::internal::buffer_data_type_t *internal_ptr_t; + + std::lock_guard lock(pmapper_mutex_); + + auto original_buffer = pMapper.get_buffer(ptr); + const ptrdiff_t offset = pMapper.get_offset(ptr); + const ptrdiff_t typed_offset = offset / sizeof(T); + eigen_assert(typed_offset >= 0); + const auto typed_size = original_buffer.get_size() / sizeof(T); + auto buffer = original_buffer.template reinterpret< + typename Eigen::internal::remove_const::type>( + cl::sycl::range<1>(typed_size)); + const ptrdiff_t size = buffer.get_count() - typed_offset; + eigen_assert(size >= 0); + typedef cl::sycl::accessor::type, + 1, AcMd, global_access, is_place_holder> + placeholder_accessor_t; + const auto start_ptr = static_cast(ptr) - offset; + return ret_type(placeholder_accessor_t(buffer, cl::sycl::range<1>(size), + cl::sycl::id<1>(typed_offset)), + static_cast(typed_offset), + reinterpret_cast(start_ptr)); + } + + /// Get a range accessor to the virtual pointer's device memory 
with a + /// specified size. + template + EIGEN_STRONG_INLINE cl::sycl::accessor< + buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer> + get_range_accessor(cl::sycl::handler &cgh, const void *ptr, + const Index n_bytes) const { + static const auto global_access = cl::sycl::access::target::global_buffer; + eigen_assert(n_bytes >= 0); + std::lock_guard lock(pmapper_mutex_); + auto buffer = pMapper.get_buffer(ptr); + const ptrdiff_t offset = pMapper.get_offset(ptr); + eigen_assert(offset >= 0); + eigen_assert(offset + n_bytes <= buffer.get_size()); + return buffer.template get_access( + cgh, cl::sycl::range<1>(n_bytes), cl::sycl::id<1>(offset)); + } + + /// Creation of sycl accessor for a buffer. This function first tries to find + /// the buffer in the buffer_map. If found it gets the accessor from it, if + /// not, the function then adds an entry by creating a sycl buffer for that + /// particular pointer. + template + EIGEN_STRONG_INLINE cl::sycl::accessor< + buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer> + get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const { + std::lock_guard lock(pmapper_mutex_); + return pMapper.get_buffer(ptr) + .template get_access( + cgh); + } + + EIGEN_STRONG_INLINE cl::sycl::buffer get_sycl_buffer( + const void *ptr) const { + std::lock_guard lock(pmapper_mutex_); + return pMapper.get_buffer(ptr); + } + + EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const { + std::lock_guard lock(pmapper_mutex_); + return pMapper.get_offset(ptr); + } + + template + EIGEN_ALWAYS_INLINE void binary_kernel_launcher(const Lhs &lhs, + const Rhs &rhs, OutPtr outptr, + Range thread_range, + Index scratchSize, + T... 
var) const { + auto kernel_functor = [=](cl::sycl::handler &cgh) { + // binding the placeholder accessors to a commandgroup handler + lhs.bind(cgh); + rhs.bind(cgh); + outptr.bind(cgh); + typedef cl::sycl::accessor + LocalAccessor; + + LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh); + cgh.parallel_for( +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + program().template get_kernel(), +#endif + thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...)); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor)); + async_synchronize(e); + } + + template + EIGEN_ALWAYS_INLINE void unary_kernel_launcher(const InPtr &inptr, + OutPtr &outptr, + Range thread_range, + Index scratchSize, + T... var) const { + auto kernel_functor = [=](cl::sycl::handler &cgh) { + // binding the placeholder accessors to a commandgroup handler + inptr.bind(cgh); + outptr.bind(cgh); + typedef cl::sycl::accessor + LocalAccessor; + + LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh); + cgh.parallel_for( +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + program().template get_kernel(), +#endif + thread_range, sycl_kernel(scratch, inptr, outptr, var...)); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor)); + async_synchronize(e); + } + + template + EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(const InPtr &inptr, + Range thread_range, + Index scratchSize, + T... 
var) const { + auto kernel_functor = [=](cl::sycl::handler &cgh) { + // binding the placeholder accessors to a commandgroup handler + inptr.bind(cgh); + typedef cl::sycl::accessor + LocalAccessor; + + LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh); + cgh.parallel_for( +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + program().template get_kernel(), +#endif + thread_range, sycl_kernel(scratch, inptr, var...)); + }; + cl::sycl::event e; + EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor)); + async_synchronize(e); + } + + + EIGEN_STRONG_INLINE void synchronize() const { +#ifdef EIGEN_EXCEPTIONS + m_queue.wait_and_throw(); +#else + m_queue.wait(); +#endif + } + + + EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const { + set_latest_event(e); +#ifndef EIGEN_SYCL_ASYNC_EXECUTION + synchronize(); +#endif + } + + template + EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, + Index &rng, Index &GRange) const { + tileSize = static_cast(getNearestPowerOfTwoWorkGroupSize()); + tileSize = std::min(static_cast(EIGEN_SYCL_LOCAL_THREAD_DIM0 * + EIGEN_SYCL_LOCAL_THREAD_DIM1), + static_cast(tileSize)); + rng = n; + if (rng == 0) rng = static_cast(1); + GRange = rng; + if (tileSize > GRange) + tileSize = GRange; + else if (GRange > tileSize) { + Index xMode = static_cast(GRange % tileSize); + if (xMode != 0) GRange += static_cast(tileSize - xMode); + } + } + + /// This is used to prepare the number of threads and also the number of + /// threads per block for sycl kernels + template + EIGEN_STRONG_INLINE void parallel_for_setup( + const std::array &input_dim, cl::sycl::range<2> &global_range, + cl::sycl::range<2> &local_range) const { + std::array input_range = input_dim; + Index max_workgroup_Size = + static_cast(getNearestPowerOfTwoWorkGroupSize()); + max_workgroup_Size = + std::min(static_cast(EIGEN_SYCL_LOCAL_THREAD_DIM0 * + EIGEN_SYCL_LOCAL_THREAD_DIM1), + static_cast(max_workgroup_Size)); + Index pow_of_2 = 
static_cast(std::log2(max_workgroup_Size)); + local_range[1] = + static_cast(std::pow(2, static_cast(pow_of_2 / 2))); + input_range[1] = input_dim[1]; + if (input_range[1] == 0) input_range[1] = static_cast(1); + global_range[1] = input_range[1]; + if (local_range[1] > global_range[1]) + local_range[1] = global_range[1]; + else if (global_range[1] > local_range[1]) { + Index xMode = static_cast(global_range[1] % local_range[1]); + if (xMode != 0) + global_range[1] += static_cast(local_range[1] - xMode); + } + local_range[0] = static_cast(max_workgroup_Size / local_range[1]); + input_range[0] = input_dim[0]; + if (input_range[0] == 0) input_range[0] = static_cast(1); + global_range[0] = input_range[0]; + if (local_range[0] > global_range[0]) + local_range[0] = global_range[0]; + else if (global_range[0] > local_range[0]) { + Index xMode = static_cast(global_range[0] % local_range[0]); + if (xMode != 0) + global_range[0] += static_cast(local_range[0] - xMode); + } + } + + /// This is used to prepare the number of threads and also the number of + /// threads per block for sycl kernels (3-D overload). Dimensions are + /// processed in the order 2, 1, 0; for each one a zero extent is treated as + /// one, the local size is clamped to the global size, and the global size + /// is rounded up to a multiple of the local size. + template + EIGEN_STRONG_INLINE void parallel_for_setup( + const std::array &input_dim, cl::sycl::range<3> &global_range, + cl::sycl::range<3> &local_range) const { + std::array input_range = input_dim; + Index max_workgroup_Size = + static_cast(getNearestPowerOfTwoWorkGroupSize()); + max_workgroup_Size = + std::min(static_cast(EIGEN_SYCL_LOCAL_THREAD_DIM0 * + EIGEN_SYCL_LOCAL_THREAD_DIM1), + static_cast(max_workgroup_Size)); + Index pow_of_2 = static_cast(std::log2(max_workgroup_Size)); + local_range[2] = + static_cast(std::pow(2, static_cast(pow_of_2 / 3))); + input_range[2] = input_dim[2]; + // Guard dimension 2 itself. The guard used to write index 1, so a zero + // extent here left global_range[2] == 0, clamped local_range[2] to 0 and + // then divided max_workgroup_Size by it below. + if (input_range[2] == 0) input_range[2] = static_cast(1); + global_range[2] = input_range[2]; + if (local_range[2] > global_range[2]) + local_range[2] = global_range[2]; + else if (global_range[2] > local_range[2]) { + Index xMode = static_cast(global_range[2] % local_range[2]); + if (xMode != 0) 
+ global_range[2] += static_cast(local_range[2] - xMode); + } + pow_of_2 = static_cast( + std::log2(static_cast(max_workgroup_Size / local_range[2]))); + local_range[1] = + static_cast(std::pow(2, static_cast(pow_of_2 / 2))); + input_range[1] = input_dim[1]; + if (input_range[1] == 0) input_range[1] = static_cast(1); + global_range[1] = input_range[1]; + if (local_range[1] > global_range[1]) + local_range[1] = global_range[1]; + else if (global_range[1] > local_range[1]) { + Index xMode = static_cast(global_range[1] % local_range[1]); + if (xMode != 0) + global_range[1] += static_cast(local_range[1] - xMode); + } + local_range[0] = static_cast(max_workgroup_Size / + (local_range[1] * local_range[2])); + input_range[0] = input_dim[0]; + if (input_range[0] == 0) input_range[0] = static_cast(1); + global_range[0] = input_range[0]; + if (local_range[0] > global_range[0]) + local_range[0] = global_range[0]; + else if (global_range[0] > local_range[0]) { + Index xMode = static_cast(global_range[0] % local_range[0]); + if (xMode != 0) + global_range[0] += static_cast(local_range[0] - xMode); + } + } + + EIGEN_STRONG_INLINE bool has_local_memory() const { +#if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM) + return false; +#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM) + return true; +#else + return m_device_info.local_mem_type == + cl::sycl::info::local_mem_type::local; +#endif + } + + EIGEN_STRONG_INLINE unsigned long max_buffer_size() const { + return m_device_info.max_mem_alloc_size; + } + + EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const { + return m_device_info.max_compute_units; + } + + EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { + return m_device_info.max_work_group_size; + } + + EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const { + return m_device_info.max_work_item_sizes; + } + + /// No need for sycl it should act the same as CPU version + EIGEN_STRONG_INLINE int 
majorDeviceVersion() const { return 1; } + + EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const { + // OpenCL does not have such a concept; a fixed value is returned. + return 2; + } + + EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { + return m_device_info.local_mem_size; + } + + // This function returns the nearest power of 2 work-group size which is <= + // the maximum device work-group size. + EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const { + return getPowerOfTwo(m_device_info.max_work_group_size, false); + } + + EIGEN_STRONG_INLINE std::string getPlatformName() const { + return m_device_info.platform_name; + } + + EIGEN_STRONG_INLINE std::string getDeviceName() const { + return m_device_info.device_name; + } + + EIGEN_STRONG_INLINE std::string getDeviceVendor() const { + return m_device_info.device_vendor; + } + + // This function returns the nearest power of 2: + // if roundUp is true it returns a result >= wGSize, + // otherwise it returns a result <= wGSize. + // It works by copying the highest set bit into every lower bit position. + EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t wGSize, bool roundUp) const { + if (roundUp) --wGSize; + wGSize |= (wGSize >> 1); + wGSize |= (wGSize >> 2); + wGSize |= (wGSize >> 4); + wGSize |= (wGSize >> 8); + wGSize |= (wGSize >> 16); +#if EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64 || EIGEN_OS_WIN64 + wGSize |= (wGSize >> 32); +#endif + return ((!roundUp) ? (wGSize - (wGSize >> 1)) : ++wGSize); + } + + EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { return m_queue; } + + // This function checks if the runtime recorded an error for the + // underlying stream device. 
+ EIGEN_STRONG_INLINE bool ok() const { + if (!exception_caught_) { + synchronize(); + } + return !exception_caught_; + } + + EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const { +#ifdef EIGEN_SYCL_STORE_LATEST_EVENT + std::lock_guard lock(event_mutex_); + return latest_events_[std::this_thread::get_id()]; +#else + eigen_assert(false); + return cl::sycl::event(); +#endif + } + + // destructor + ~QueueInterface() { + pMapper.clear(); +#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS + scratch_buffers.clear(); +#endif + } + + protected: + EIGEN_STRONG_INLINE void set_latest_event(cl::sycl::event e) const { +#ifdef EIGEN_SYCL_STORE_LATEST_EVENT + std::lock_guard lock(event_mutex_); + latest_events_[std::this_thread::get_id()] = e; +#else + EIGEN_UNUSED_VARIABLE(e); +#endif + } + + void synchronize_and_callback(cl::sycl::event e, + const std::function &callback) const { + set_latest_event(e); + if (callback) { + auto callback_ = [=]() { +#ifdef EIGEN_EXCEPTIONS + cl::sycl::event(e).wait_and_throw(); +#else + cl::sycl::event(e).wait(); +#endif + callback(); + }; + m_thread_pool.Schedule(std::move(callback_)); + } else { +#ifdef EIGEN_EXCEPTIONS + m_queue.wait_and_throw(); +#else + m_queue.wait(); +#endif + } + } + + bool sycl_async_handler(cl::sycl::exception_list exceptions) const { + bool exception_caught = false; + for (const auto &e : exceptions) { + if (e) { + exception_caught = true; + EIGEN_THROW_X(e); + } + } + return exception_caught; + } + + /// class members: + bool exception_caught_ = false; + + mutable std::mutex pmapper_mutex_; + +#ifdef EIGEN_SYCL_STORE_LATEST_EVENT + mutable std::mutex event_mutex_; + mutable std::unordered_map latest_events_; +#endif + + /// std::map is the container used to make sure that we create only one buffer + /// per pointer. The lifespan of the buffer now depends on the lifespan of + /// SyclDevice. If a non-read-only pointer is needed to be accessed on the + /// host we should manually deallocate it. 
+ mutable TensorSycl::internal::PointerMapper pMapper; +#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS + mutable std::unordered_set scratch_buffers; +#endif + /// sycl queue + mutable cl::sycl::queue m_queue; +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + mutable cl::sycl::program m_prog; +#endif + + /// The thread pool is used to wait on events and call callbacks + /// asynchronously + mutable Eigen::ThreadPool m_thread_pool; + + const TensorSycl::internal::SyclDeviceInfo m_device_info; +}; + +struct SyclDeviceBase { + /// QueueInterface is not owned. it is the caller's responsibility to destroy + /// it + const QueueInterface *m_queue_stream; + explicit SyclDeviceBase(const QueueInterface *queue_stream) + : m_queue_stream(queue_stream) {} + EIGEN_STRONG_INLINE const QueueInterface *queue_stream() const { + return m_queue_stream; + } +}; + +// Here is a sycl device struct which accept the sycl queue interface +// as an input +struct SyclDevice : public SyclDeviceBase { + explicit SyclDevice(const QueueInterface *queue_stream) + : SyclDeviceBase(queue_stream) {} + + // this is the accessor used to construct the evaluator + template + EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess + get_range_accessor(const void *ptr) const { + return queue_stream()->template get_range_accessor(ptr); + } + + // get sycl accessor + template + EIGEN_STRONG_INLINE cl::sycl::accessor< + buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer> + get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const { + return queue_stream()->template get_sycl_accessor(cgh, ptr); + } + + /// Accessing the created sycl device buffer for the device pointer + EIGEN_STRONG_INLINE cl::sycl::buffer get_sycl_buffer( + const void *ptr) const { + return queue_stream()->get_sycl_buffer(ptr); + } + + /// This is used to prepare the number of threads and also the number of + /// threads per block for sycl kernels + template + EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, + Index &rng, 
Index &GRange) const { + queue_stream()->parallel_for_setup(n, tileSize, rng, GRange); + } + + /// This is used to prepare the number of threads and also the number of + /// threads per block for sycl kernels + template + EIGEN_STRONG_INLINE void parallel_for_setup( + const std::array &input_dim, cl::sycl::range<2> &global_range, + cl::sycl::range<2> &local_range) const { + queue_stream()->parallel_for_setup(input_dim, global_range, local_range); + } + + /// This is used to prepare the number of threads and also the number of + /// threads per block for sycl kernels + template + EIGEN_STRONG_INLINE void parallel_for_setup( + const std::array &input_dim, cl::sycl::range<3> &global_range, + cl::sycl::range<3> &local_range) const { + queue_stream()->parallel_for_setup(input_dim, global_range, local_range); + } + + /// allocate device memory + EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { + return queue_stream()->allocate(num_bytes); + } + + EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const { + return queue_stream()->allocate_temp(num_bytes); + } + + /// deallocate device memory + EIGEN_STRONG_INLINE void deallocate(void *p) const { + queue_stream()->deallocate(p); + } + + EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const { + queue_stream()->deallocate_temp(buffer); + } + template + EIGEN_STRONG_INLINE void deallocate_temp( + const TensorSycl::internal::RangeAccess &buffer) const { + queue_stream()->deallocate_temp(buffer); + } + EIGEN_STRONG_INLINE void deallocate_all() const { + queue_stream()->deallocate_all(); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess< + cl::sycl::access::mode::read_write, data_t> + get(data_t *data) const { + return queue_stream()->get(data); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get( + TensorSycl::internal::RangeAccess + data) const { + return queue_stream()->get(data); + } + + /// attach existing buffer + EIGEN_STRONG_INLINE void 
*attach_buffer( + cl::sycl::buffer &buf) const { + return queue_stream()->attach_buffer(buf); + } + /// detach buffer + EIGEN_STRONG_INLINE void detach_buffer(void *p) const { + queue_stream()->detach_buffer(p); + } + EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const { + return queue_stream()->get_offset(ptr); + } + + // some runtime conditions that can be applied here + EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; } + + /// memcpyHostToDevice + template + EIGEN_STRONG_INLINE void memcpyHostToDevice( + Index *dst, const Index *src, size_t n, + std::function callback = {}) const { + queue_stream()->memcpyHostToDevice(dst, src, n, callback); + } + /// memcpyDeviceToHost + template + EIGEN_STRONG_INLINE void memcpyDeviceToHost( + void *dst, const Index *src, size_t n, + std::function callback = {}) const { + queue_stream()->memcpyDeviceToHost(dst, src, n, callback); + } + /// the memcpy function + template + EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const { + queue_stream()->memcpy(dst, src, n); + } + /// the memset function + EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { + queue_stream()->memset(data, c, n); + } + /// returning the sycl queue + EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { + return queue_stream()->sycl_queue(); + } +#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS + EIGEN_STRONG_INLINE cl::sycl::program &program() const { + return queue_stream()->program(); + } +#endif + + EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return 48 * 1024; } + + EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { + // We won't try to take advantage of the l2 cache for the time being, and + // there is no l3 cache on sycl devices. 
+ return firstLevelCacheSize(); + } + EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const { + return queue_stream()->getNumSyclMultiProcessors(); + } + EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { + return queue_stream()->maxSyclThreadsPerBlock(); + } + EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const { + return queue_stream()->maxWorkItemSizes(); + } + EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const { + // OpenCL doesnot have such concept + return queue_stream()->maxSyclThreadsPerMultiProcessor(); + } + EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { + return queue_stream()->sharedMemPerBlock(); + } + EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const { + return queue_stream()->getNearestPowerOfTwoWorkGroupSize(); + } + + EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t val, bool roundUp) const { + return queue_stream()->getPowerOfTwo(val, roundUp); + } + /// No need for sycl it should act the same as CPU version + EIGEN_STRONG_INLINE int majorDeviceVersion() const { + return queue_stream()->majorDeviceVersion(); + } + + EIGEN_STRONG_INLINE void synchronize() const { + queue_stream()->synchronize(); + } + EIGEN_STRONG_INLINE void async_synchronize( + cl::sycl::event e = cl::sycl::event()) const { + queue_stream()->async_synchronize(e); + } + EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const { + return queue_stream()->get_latest_event(); + } + + // This function checks if the runtime recorded an error for the + // underlying stream device. 
+ EIGEN_STRONG_INLINE bool ok() const { return queue_stream()->ok(); } + + EIGEN_STRONG_INLINE bool has_local_memory() const { + return queue_stream()->has_local_memory(); + } + EIGEN_STRONG_INLINE long max_buffer_size() const { + return queue_stream()->max_buffer_size(); + } + EIGEN_STRONG_INLINE std::string getPlatformName() const { + return queue_stream()->getPlatformName(); + } + EIGEN_STRONG_INLINE std::string getDeviceName() const { + return queue_stream()->getDeviceName(); + } + EIGEN_STRONG_INLINE std::string getDeviceVendor() const { + return queue_stream()->getDeviceVendor(); + } + template + EIGEN_ALWAYS_INLINE void binary_kernel_launcher(T... var) const { + queue_stream()->template binary_kernel_launcher( + var...); + } + template + EIGEN_ALWAYS_INLINE void unary_kernel_launcher(T... var) const { + queue_stream()->template unary_kernel_launcher( + var...); + } + + template + EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(T... var) const { + queue_stream()->template nullary_kernel_launcher( + var...); + } +}; +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceThreadPool.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceThreadPool.h new file mode 100644 index 0000000..e524b53 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -0,0 +1,409 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#if defined(EIGEN_USE_THREADS) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H) +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H + +namespace Eigen { + +// Runs an arbitrary function and then calls Notify() on the passed in +// Notification. +template struct FunctionWrapperWithNotification +{ + static void run(Notification* n, Function f, Args... args) { + f(args...); + if (n) { + n->Notify(); + } + } +}; + +template struct FunctionWrapperWithBarrier +{ + static void run(Barrier* b, Function f, Args... args) { + f(args...); + if (b) { + b->Notify(); + } + } +}; + +template +static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) { + if (n) { + n->Wait(); + } +} + +// An abstract interface to a device specific memory allocator. +class Allocator { + public: + virtual ~Allocator() {} + virtual void* allocate(size_t num_bytes) const = 0; + virtual void deallocate(void* buffer) const = 0; +}; + +// Build a thread pool device on top the an existing pool of threads. +struct ThreadPoolDevice { + // The ownership of the thread pool remains with the caller. + ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores, Allocator* allocator = nullptr) + : pool_(pool), num_threads_(num_cores), allocator_(allocator) { } + + EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + return allocator_ ? 
allocator_->allocate(num_bytes) + : internal::aligned_malloc(num_bytes); + } + + EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + if (allocator_) { + allocator_->deallocate(buffer); + } else { + internal::aligned_free(buffer); + } + } + + EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const { + return allocate(num_bytes); + } + + EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const { + deallocate(buffer); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { + return data; + } + + EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { +#ifdef __ANDROID__ + ::memcpy(dst, src, n); +#else + // TODO(rmlarsen): Align blocks on cache lines. + // We have observed that going beyond 4 threads usually just wastes + // CPU cycles due to the threads competing for memory bandwidth, so we + // statically schedule at most 4 block copies here. + const size_t kMinBlockSize = 32768; + const size_t num_threads = CostModel::numThreads(n, TensorOpCost(1.0, 1.0, 0), 4); + if (n <= kMinBlockSize || num_threads < 2) { + ::memcpy(dst, src, n); + } else { + const char* src_ptr = static_cast(src); + char* dst_ptr = static_cast(dst); + const size_t blocksize = (n + (num_threads - 1)) / num_threads; + Barrier barrier(static_cast(num_threads - 1)); + // Launch the last 3 blocks on worker threads. + for (size_t i = 1; i < num_threads; ++i) { + enqueue_with_barrier(&barrier, [n, i, src_ptr, dst_ptr, blocksize] { + ::memcpy(dst_ptr + i * blocksize, src_ptr + i * blocksize, + numext::mini(blocksize, n - (i * blocksize))); + }); + } + // Launch the first block on the main thread. 
+ ::memcpy(dst_ptr, src_ptr, blocksize); + barrier.Wait(); + } +#endif + } + EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { + memcpy(dst, src, n); + } + EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { + memcpy(dst, src, n); + } + + EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { + ::memset(buffer, c, n); + } + + EIGEN_STRONG_INLINE int numThreads() const { + return num_threads_; + } + + // Number of theads available in the underlying thread pool. This number can + // be different from the value returned by numThreads(). + EIGEN_STRONG_INLINE int numThreadsInPool() const { + return pool_->NumThreads(); + } + + EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { + return l1CacheSize(); + } + + EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { + // The l3 cache size is shared between all the cores. + return l3CacheSize() / num_threads_; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { + // Should return an enum that encodes the ISA supported by the CPU + return 1; + } + + template + EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, + Args&&... args) const { + Notification* n = new Notification(); + pool_->Schedule( + std::bind(&FunctionWrapperWithNotification::run, n, + std::move(f), args...)); + return n; + } + + template + EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b, Function&& f, + Args&&... args) const { + pool_->Schedule( + std::bind(&FunctionWrapperWithBarrier::run, b, + std::move(f), args...)); + } + + template + EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, + Args&&... args) const { + if (sizeof...(args) > 0) { + pool_->Schedule(std::bind(std::move(f), args...)); + } else { + pool_->Schedule(std::move(f)); + } + } + + // Returns a logical thread index between 0 and pool_->NumThreads() - 1 if + // called from one of the threads in pool_. Returns -1 otherwise. 
+ EIGEN_STRONG_INLINE int currentThreadId() const { + return pool_->CurrentThreadId(); + } + + // WARNING: This function is synchronous and will block the calling thread. + // + // Synchronous parallelFor executes f with [0, n) arguments in parallel and + // waits for completion. F accepts a half-open interval [first, last). Block + // size is chosen based on the iteration cost and resulting parallel + // efficiency. If block_align is not nullptr, it is called to round up the + // block size. + void parallelFor(Index n, const TensorOpCost& cost, + std::function block_align, + std::function f) const { + if (EIGEN_PREDICT_FALSE(n <= 0)){ + return; + // Compute small problems directly in the caller thread. + } else if (n == 1 || numThreads() == 1 || + CostModel::numThreads(n, cost, static_cast(numThreads())) == 1) { + f(0, n); + return; + } + + // Compute block size and total count of blocks. + ParallelForBlock block = CalculateParallelForBlock(n, cost, block_align); + + // Recursively divide size into halves until we reach block_size. + // Division code rounds mid to block_size, so we are guaranteed to get + // block_count leaves that do actual computations. + Barrier barrier(static_cast(block.count)); + std::function handleRange; + handleRange = [=, &handleRange, &barrier, &f](Index firstIdx, + Index lastIdx) { + while (lastIdx - firstIdx > block.size) { + // Split into halves and schedule the second half on a different thread. + const Index midIdx = firstIdx + divup((lastIdx - firstIdx) / 2, block.size) * block.size; + pool_->Schedule([=, &handleRange]() { handleRange(midIdx, lastIdx); }); + lastIdx = midIdx; + } + // Single block or less, execute directly. + f(firstIdx, lastIdx); + barrier.Notify(); + }; + + if (block.count <= numThreads()) { + // Avoid a thread hop by running the root of the tree and one block on the + // main thread. 
+ handleRange(0, n); + } else { + // Execute the root in the thread pool to avoid running work on more than + // numThreads() threads. + pool_->Schedule([=, &handleRange]() { handleRange(0, n); }); + } + + barrier.Wait(); + } + + // Convenience wrapper for parallelFor that does not align blocks. + void parallelFor(Index n, const TensorOpCost& cost, + std::function f) const { + parallelFor(n, cost, nullptr, std::move(f)); + } + + // WARNING: This function is asynchronous and will not block the calling thread. + // + // Asynchronous parallelFor executes f with [0, n) arguments in parallel + // without waiting for completion. When the last block finished, it will call + // 'done' callback. F accepts a half-open interval [first, last). Block size + // is chosen based on the iteration cost and resulting parallel efficiency. If + // block_align is not nullptr, it is called to round up the block size. + void parallelForAsync(Index n, const TensorOpCost& cost, + std::function block_align, + std::function f, + std::function done) const { + // Compute small problems directly in the caller thread. + if (n <= 1 || numThreads() == 1 || + CostModel::numThreads(n, cost, static_cast(numThreads())) == 1) { + f(0, n); + done(); + return; + } + + // Compute block size and total count of blocks. + ParallelForBlock block = CalculateParallelForBlock(n, cost, block_align); + + ParallelForAsyncContext* const ctx = + new ParallelForAsyncContext(block.count, std::move(f), std::move(done)); + + // Recursively divide size into halves until we reach block_size. + // Division code rounds mid to block_size, so we are guaranteed to get + // block_count leaves that do actual computations. + ctx->handle_range = [this, ctx, block](Index firstIdx, Index lastIdx) { + while (lastIdx - firstIdx > block.size) { + // Split into halves and schedule the second half on a different thread. 
+ const Index midIdx = firstIdx + divup((lastIdx - firstIdx) / 2, block.size) * block.size; + pool_->Schedule( + [ctx, midIdx, lastIdx]() { ctx->handle_range(midIdx, lastIdx); }); + lastIdx = midIdx; + } + + // Single block or less, execute directly. + ctx->f(firstIdx, lastIdx); + + // Delete async context if it was the last block. + if (ctx->count.fetch_sub(1) == 1) delete ctx; + }; + + if (block.count <= numThreads()) { + // Avoid a thread hop by running the root of the tree and one block on the + // main thread. + ctx->handle_range(0, n); + } else { + // Execute the root in the thread pool to avoid running work on more than + // numThreads() threads. + pool_->Schedule([ctx, n]() { ctx->handle_range(0, n); }); + } + } + + // Convenience wrapper for parallelForAsync that does not align blocks. + void parallelForAsync(Index n, const TensorOpCost& cost, + std::function f, + std::function done) const { + parallelForAsync(n, cost, nullptr, std::move(f), std::move(done)); + } + + // Thread pool accessor. + ThreadPoolInterface* getPool() const { return pool_; } + + // Allocator accessor. + Allocator* allocator() const { return allocator_; } + + private: + typedef TensorCostModel CostModel; + + // For parallelForAsync we must keep passed in closures on the heap, and + // delete them only after `done` callback finished. + struct ParallelForAsyncContext { + ParallelForAsyncContext(Index block_count, + std::function block_f, + std::function done_callback) + : count(block_count), + f(std::move(block_f)), + done(std::move(done_callback)) {} + ~ParallelForAsyncContext() { done(); } + + std::atomic count; + std::function f; + std::function done; + + std::function handle_range; + }; + + struct ParallelForBlock { + Index size; // block size + Index count; // number of blocks + }; + + // Calculates block size based on (1) the iteration cost and (2) parallel + // efficiency. 
We want blocks to be not too small to mitigate parallelization + // overheads; not too large to mitigate tail effect and potential load + // imbalance and we also want number of blocks to be evenly dividable across + // threads. + ParallelForBlock CalculateParallelForBlock( + const Index n, const TensorOpCost& cost, + std::function block_align) const { + const double block_size_f = 1.0 / CostModel::taskSize(1, cost); + const Index max_oversharding_factor = 4; + Index block_size = numext::mini( + n, numext::maxi( + divup(n, max_oversharding_factor * numThreads()), + block_size_f)); + const Index max_block_size = numext::mini(n, 2 * block_size); + + if (block_align) { + Index new_block_size = block_align(block_size); + eigen_assert(new_block_size >= block_size); + block_size = numext::mini(n, new_block_size); + } + + Index block_count = divup(n, block_size); + + // Calculate parallel efficiency as fraction of total CPU time used for + // computations: + double max_efficiency = + static_cast(block_count) / + (divup(block_count, numThreads()) * numThreads()); + + // Now try to increase block size up to max_block_size as long as it + // doesn't decrease parallel efficiency. + for (Index prev_block_count = block_count; + max_efficiency < 1.0 && prev_block_count > 1;) { + // This is the next block size that divides size into a smaller number + // of blocks than the current block_size. + Index coarser_block_size = divup(n, prev_block_count - 1); + if (block_align) { + Index new_block_size = block_align(coarser_block_size); + eigen_assert(new_block_size >= coarser_block_size); + coarser_block_size = numext::mini(n, new_block_size); + } + if (coarser_block_size > max_block_size) { + break; // Reached max block size. Stop. + } + // Recalculate parallel efficiency. 
+ const Index coarser_block_count = divup(n, coarser_block_size); + eigen_assert(coarser_block_count < prev_block_count); + prev_block_count = coarser_block_count; + const double coarser_efficiency = + static_cast(coarser_block_count) / + (divup(coarser_block_count, numThreads()) * numThreads()); + if (coarser_efficiency + 0.01 >= max_efficiency) { + // Taking it. + block_size = coarser_block_size; + block_count = coarser_block_count; + if (max_efficiency < coarser_efficiency) { + max_efficiency = coarser_efficiency; + } + } + } + + return {block_size, block_count}; + } + + ThreadPoolInterface* pool_; + int num_threads_; + Allocator* allocator_; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorDimensionList.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorDimensionList.h new file mode 100644 index 0000000..1a30e45 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorDimensionList.h @@ -0,0 +1,236 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H +#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H + +namespace Eigen { + +/** \internal + * + * \class TensorDimensionList + * \ingroup CXX11_Tensor_Module + * + * \brief Special case of tensor index list used to list all the dimensions of a tensor of rank n. 
+ * + * \sa Tensor + */ + +template struct DimensionList { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + const Index operator[] (const Index i) const { return i; } +}; + +namespace internal { + +template struct array_size > { + static const size_t value = Rank; +}; +template struct array_size > { + static const size_t value = Rank; +}; + +template const Index array_get(DimensionList&) { + return n; +} +template const Index array_get(const DimensionList&) { + return n; +} + + +#if EIGEN_HAS_CONSTEXPR +template +struct index_known_statically_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) { + return true; + } +}; +template +struct index_known_statically_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) { + return true; + } +}; + +template +struct all_indices_known_statically_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run() { + return true; + } +}; +template +struct all_indices_known_statically_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run() { + return true; + } +}; + +template +struct indices_statically_known_to_increase_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run() { + return true; + } +}; +template +struct indices_statically_known_to_increase_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run() { + return true; + } +}; + +template +struct index_statically_eq_impl > { + static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return i == value; + } +}; +template +struct index_statically_eq_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return i == value; + } +}; + +template +struct index_statically_ne_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return i != value; + } +}; +template +struct index_statically_ne_impl > { + static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return i != value; + } +}; + +template +struct index_statically_gt_impl 
> { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return i > value; + } +}; +template +struct index_statically_gt_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return i > value; + } +}; + +template +struct index_statically_lt_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return i < value; + } +}; +template +struct index_statically_lt_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return i < value; + } +}; + +#else +template +struct index_known_statically_impl > { + EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run(const DenseIndex) { + return true; + } +}; +template +struct index_known_statically_impl > { + EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run(const DenseIndex) { + return true; + } +}; + +template +struct all_indices_known_statically_impl > { + EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run() { + return true; + } +}; +template +struct all_indices_known_statically_impl > { + EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run() { + return true; + } +}; + +template +struct indices_statically_known_to_increase_impl > { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() { + return true; + } +}; +template +struct indices_statically_known_to_increase_impl > { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() { + return true; + } +}; + +template +struct index_statically_eq_impl > { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { + return false; + } +}; +template +struct index_statically_eq_impl > { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { + return false; + } +}; + +template +struct index_statically_ne_impl > { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex){ + return false; 
+ } +}; +template +struct index_statically_ne_impl > { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { + return false; + } +}; + +template +struct index_statically_gt_impl > { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { + return false; + } +}; +template +struct index_statically_gt_impl > { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { + return false; + } +}; + +template +struct index_statically_lt_impl > { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { + return false; + } +}; +template +struct index_statically_lt_impl > { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) { + return false; + } +}; +#endif + +} // end namespace internal +} // end namespace Eigen + + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorDimensions.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorDimensions.h new file mode 100644 index 0000000..f0f1e83 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorDimensions.h @@ -0,0 +1,490 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H +#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H + + +namespace Eigen { + +/** \internal + * + * \class TensorDimensions + * \ingroup CXX11_Tensor_Module + * + * \brief Set of classes used to encode and store the dimensions of a Tensor. + * + * The Sizes class encodes as part of the type the number of dimensions and the + * sizes corresponding to each dimension. 
It uses no storage space since it is + * entirely known at compile time. + * The DSizes class is its dynamic sibling: the number of dimensions is known + * at compile time but the sizes are set during execution. + * + * \sa Tensor + */ + +// Boilerplate code +namespace internal { + +template struct dget { + static const std::ptrdiff_t value = get::value; +}; + + +template +struct fixed_size_tensor_index_linearization_helper +{ + template EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Index run(array const& indices, + const Dimensions& dimensions) + { + return array_get(indices) + + dget::value * + fixed_size_tensor_index_linearization_helper::run(indices, dimensions); + } +}; + +template +struct fixed_size_tensor_index_linearization_helper +{ + template EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Index run(array const&, const Dimensions&) + { + return 0; + } +}; + +template +struct fixed_size_tensor_index_extraction_helper +{ + template EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Index run(const Index index, + const Dimensions& dimensions) + { + const Index mult = (index == n-1) ? 
1 : 0; + return array_get(dimensions) * mult + + fixed_size_tensor_index_extraction_helper::run(index, dimensions); + } +}; + +template +struct fixed_size_tensor_index_extraction_helper +{ + template EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Index run(const Index, + const Dimensions&) + { + return 0; + } + }; + +} // end namespace internal + + +// Fixed size +#ifndef EIGEN_EMULATE_CXX11_META_H +template +struct Sizes { + typedef internal::numeric_list Base; + const Base t = Base(); + static const std::ptrdiff_t total_size = internal::arg_prod(Indices...); + static const ptrdiff_t count = Base::count; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t rank() const { + return Base::count; + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t TotalSize() { + return internal::arg_prod(Indices...); + } + + EIGEN_DEVICE_FUNC Sizes() { } + template + explicit EIGEN_DEVICE_FUNC Sizes(const array& /*indices*/) { + // todo: add assertion + } +#if EIGEN_HAS_VARIADIC_TEMPLATES + template EIGEN_DEVICE_FUNC Sizes(DenseIndex...) 
{ } + explicit EIGEN_DEVICE_FUNC Sizes(std::initializer_list /*l*/) { + // todo: add assertion + } +#endif + + template Sizes& operator = (const T& /*other*/) { + // add assertion failure if the size of other is different + return *this; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[] (const std::ptrdiff_t index) const { + return internal::fixed_size_tensor_index_extraction_helper::run(index, t); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + ptrdiff_t IndexOfColMajor(const array& indices) const { + return internal::fixed_size_tensor_index_linearization_helper::run(indices, t); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + ptrdiff_t IndexOfRowMajor(const array& indices) const { + return internal::fixed_size_tensor_index_linearization_helper::run(indices, t); + } +}; + +namespace internal { +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes&) { + return Sizes::total_size; +} +} + +#else + +template +struct non_zero_size { + typedef internal::type2val type; +}; +template <> +struct non_zero_size<0> { + typedef internal::null_type type; +}; + +template struct Sizes { + typedef typename internal::make_type_list::type, typename non_zero_size::type, typename non_zero_size::type, typename non_zero_size::type, typename non_zero_size::type >::type Base; + static const std::ptrdiff_t count = Base::count; + static const std::ptrdiff_t total_size = internal::arg_prod::value; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptrdiff_t rank() const { + return count; + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptrdiff_t TotalSize() { + return internal::arg_prod::value; + } + + Sizes() { } + template + explicit Sizes(const array& /*indices*/) { + // todo: add assertion + } + template Sizes& operator = (const T& /*other*/) { + // add assertion failure if the size of other is different + return *this; + } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template Sizes(DenseIndex... 
/*indices*/) { } + explicit Sizes(std::initializer_list) { + // todo: add assertion + } +#else + EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex) { + } + EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex) { + } + EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex) { + } + EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) { + } + EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) { + } +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index operator[] (const Index index) const { + switch (index) { + case 0: + return internal::get<0, Base>::value; + case 1: + return internal::get<1, Base>::value; + case 2: + return internal::get<2, Base>::value; + case 3: + return internal::get<3, Base>::value; + case 4: + return internal::get<4, Base>::value; + default: + eigen_assert(false && "index overflow"); + return static_cast(-1); + } + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + ptrdiff_t IndexOfColMajor(const array& indices) const { + return internal::fixed_size_tensor_index_linearization_helper::run(indices, *reinterpret_cast(this)); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + ptrdiff_t IndexOfRowMajor(const array& indices) const { + return internal::fixed_size_tensor_index_linearization_helper::run(indices, *reinterpret_cast(this)); + } +}; + +namespace internal { +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes&) { + return Sizes::total_size; +} +} + +#endif + +// Boilerplate +namespace internal { +template +struct tensor_index_linearization_helper +{ + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index run(array const& indices, array const& dimensions) + { + return array_get(indices) + + array_get(dimensions) * + tensor_index_linearization_helper::run(indices, dimensions); + } +}; + +template +struct tensor_index_linearization_helper +{ + static EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE + Index run(array const& indices, array const&) + { + return array_get(indices); + } +}; +} // end namespace internal + + + +// Dynamic size +template +struct DSizes : array { + typedef array Base; + static const int count = NumDims; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { + return NumDims; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex TotalSize() const { + return (NumDims == 0) ? 1 : internal::array_prod(*static_cast(this)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DSizes() { + for (int i = 0 ; i < NumDims; ++i) { + (*this)[i] = 0; + } + } + EIGEN_DEVICE_FUNC explicit DSizes(const array& a) : Base(a) { } + + EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0) { + eigen_assert(NumDims == 1); + (*this)[0] = i0; + } + + EIGEN_DEVICE_FUNC DSizes(const DimensionList& a) { + for (int i = 0 ; i < NumDims; ++i) { + (*this)[i] = a[i]; + } + } + + // Enable DSizes index type promotion only if we are promoting to the + // larger type, e.g. allow to promote dimensions of type int to long. + template + EIGEN_DEVICE_FUNC + explicit DSizes(const array& other, + // Default template parameters require c++11. 
+ typename internal::enable_if< + internal::is_same< + DenseIndex, + typename internal::promote_index_type< + DenseIndex, + OtherIndex + >::type + >::value, void*>::type = 0) { + for (int i = 0; i < NumDims; ++i) { + (*this)[i] = static_cast(other[i]); + } + } + +#ifdef EIGEN_HAS_INDEX_LIST + template + EIGEN_DEVICE_FUNC + explicit DSizes(const Eigen::IndexList& dimensions) { + for (int i = 0; i < dimensions.count; ++i) { + (*this)[i] = dimensions[i]; + } + } +#endif + +#ifndef EIGEN_EMULATE_CXX11_META_H + template + EIGEN_DEVICE_FUNC DSizes(const Sizes& a) { + for (int i = 0 ; i < NumDims; ++i) { + (*this)[i] = a[i]; + } + } +#else + template + EIGEN_DEVICE_FUNC DSizes(const Sizes& a) { + for (int i = 0 ; i < NumDims; ++i) { + (*this)[i] = a[i]; + } + } +#endif + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, DenseIndex secondDimension, IndexTypes... otherDimensions) : Base({{firstDimension, secondDimension, otherDimensions...}}) { + EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 2 == NumDims, YOU_MADE_A_PROGRAMMING_MISTAKE) + } +#else + EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1) { + eigen_assert(NumDims == 2); + (*this)[0] = i0; + (*this)[1] = i1; + } + EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) { + eigen_assert(NumDims == 3); + (*this)[0] = i0; + (*this)[1] = i1; + (*this)[2] = i2; + } + EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) { + eigen_assert(NumDims == 4); + (*this)[0] = i0; + (*this)[1] = i1; + (*this)[2] = i2; + (*this)[3] = i3; + } + EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) { + eigen_assert(NumDims == 5); + (*this)[0] = i0; + (*this)[1] = i1; + (*this)[2] = i2; + (*this)[3] = i3; + (*this)[4] = i4; + } +#endif + + EIGEN_DEVICE_FUNC DSizes& operator = 
(const array& other) { + *static_cast(this) = other; + return *this; + } + + // A constexpr would be so much better here + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex IndexOfColMajor(const array& indices) const { + return internal::tensor_index_linearization_helper::run(indices, *static_cast(this)); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex IndexOfRowMajor(const array& indices) const { + return internal::tensor_index_linearization_helper::run(indices, *static_cast(this)); + } +}; + +template +std::ostream& operator<<(std::ostream& os, + const DSizes& dims) { + os << "["; + for (int i = 0; i < NumDims; ++i) { + if (i > 0) os << ", "; + os << dims[i]; + } + os << "]"; + return os; +} + +// Boilerplate +namespace internal { +template +struct tensor_vsize_index_linearization_helper +{ + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index run(array const& indices, std::vector const& dimensions) + { + return array_get(indices) + + array_get(dimensions) * + tensor_vsize_index_linearization_helper::run(indices, dimensions); + } +}; + +template +struct tensor_vsize_index_linearization_helper +{ + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Index run(array const& indices, std::vector const&) + { + return array_get(indices); + } +}; +} // end namespace internal + + +namespace internal { + +template struct array_size > { + static const ptrdiff_t value = NumDims; +}; +template struct array_size > { + static const ptrdiff_t value = NumDims; +}; +#ifndef EIGEN_EMULATE_CXX11_META_H +template struct array_size > { +static const std::ptrdiff_t value = Sizes::count; +}; +template struct array_size > { +static const std::ptrdiff_t value = Sizes::count; +}; +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes&) { + return get >::value; +} +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<>&) { + eigen_assert(false && "should never be called"); + return -1; +} +#else +template struct array_size > { 
+ static const ptrdiff_t value = Sizes::count; +}; +template struct array_size > { + static const ptrdiff_t value = Sizes::count; +}; +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes&) { + return get::Base>::value; +} + +#endif + + +template +struct sizes_match_below_dim { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1&, Dims2&) { + return false; + } +}; +template +struct sizes_match_below_dim { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1& dims1, Dims2& dims2) { + return (array_get(dims1) == array_get(dims2)) && + sizes_match_below_dim::run(dims1, dims2); + } +}; +template +struct sizes_match_below_dim { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1&, Dims2&) { + return true; + } +}; + +} // end namespace internal + + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool dimensions_match(Dims1 dims1, Dims2 dims2) { + return internal::sizes_match_below_dim::value, internal::array_size::value>::run(dims1, dims2); +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorEvalTo.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorEvalTo.h new file mode 100644 index 0000000..a48d035 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorEvalTo.h @@ -0,0 +1,236 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H +#define EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H + +namespace Eigen { + +/** \class TensorForcedEval + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reshaping class. 
+ * + * + */ +namespace internal { +template class MakePointer_> +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + typedef typename MakePointer_::Type PointerType; + + enum { + Flags = 0 + }; + template + struct MakePointer { + // Intermediate typedef to workaround MSVC issue. + typedef MakePointer_ MakePointerT; + typedef typename MakePointerT::Type Type; + + + }; +}; + +template class MakePointer_> +struct eval, Eigen::Dense> +{ + typedef const TensorEvalToOp& type; +}; + +template class MakePointer_> +struct nested, 1, typename eval >::type> +{ + typedef TensorEvalToOp type; +}; + +} // end namespace internal + + + + +template class MakePointer_> +class TensorEvalToOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename MakePointer_::Type PointerType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + static const int NumDims = Eigen::internal::traits::NumDimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(PointerType buffer, const XprType& expr) + : m_xpr(expr), m_buffer(buffer) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC PointerType buffer() const { return m_buffer; } + + protected: + typename 
XprType::Nested m_xpr; + PointerType m_buffer; +}; + + + +template class MakePointer_> +struct TensorEvaluator, Device> +{ + typedef TensorEvalToOp XprType; + typedef typename ArgType::Scalar Scalar; + typedef typename TensorEvaluator::Dimensions Dimensions; + typedef typename XprType::Index Index; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = PacketType::size; + typedef typename Eigen::internal::traits::PointerType TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = true, + PreferBlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = true + }; + + static const int NumDims = internal::traits::NumDimensions; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock + ArgTensorBlock; + + typedef internal::TensorBlockAssignment< + CoeffReturnType, NumDims, typename ArgTensorBlock::XprType, Index> + TensorBlockAssignment; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_buffer(device.get(op.buffer())), m_expression(op.expression()){} + + + EIGEN_STRONG_INLINE ~TensorEvaluator() { + } + + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType scalar) { + EIGEN_UNUSED_VARIABLE(scalar); + eigen_assert(scalar == NULL); + return m_impl.evalSubExprsIfNeeded(m_buffer); + } + +#ifdef 
EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType scalar, EvalSubExprsCallback done) { + EIGEN_UNUSED_VARIABLE(scalar); + eigen_assert(scalar == NULL); + m_impl.evalSubExprsIfNeededAsync(m_buffer, std::move(done)); + } +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { + m_buffer[i] = m_impl.coeff(i); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { + internal::pstoret(m_buffer + i, m_impl.template packet::IsAligned ? Aligned : Unaligned>(i)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + return m_impl.getResourceRequirements(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock( + TensorBlockDesc& desc, TensorBlockScratch& scratch) { + // Add `m_buffer` as destination buffer to the block descriptor. + desc.template AddDestinationBuffer( + /*dst_base=*/m_buffer + desc.offset(), + /*dst_strides=*/internal::strides(m_impl.dimensions())); + + ArgTensorBlock block = + m_impl.block(desc, scratch, /*root_of_expr_ast=*/true); + + // If block was evaluated into a destination buffer, there is no need to do + // an assignment. 
+ if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) { + TensorBlockAssignment::Run( + TensorBlockAssignment::target( + desc.dimensions(), internal::strides(m_impl.dimensions()), + m_buffer, desc.offset()), + block.expr()); + } + block.cleanup(); + } + + EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_buffer[index]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return internal::ploadt(m_buffer + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + // We assume that evalPacket or evalScalar is called to perform the + // assignment and account for the cost of the write here. + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_buffer; } + ArgType expression() const { return m_expression; } + #ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + m_buffer.bind(cgh); + } + #endif + + + private: + TensorEvaluator m_impl; + EvaluatorPointerType m_buffer; + const ArgType m_expression; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorEvaluator.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorEvaluator.h new file mode 100644 index 0000000..3aff7fa --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorEvaluator.h @@ -0,0 +1,983 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H +#define EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H + +namespace Eigen { + +/** \class TensorEvaluator + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor evaluator classes. + * + * These classes are responsible for the evaluation of the tensor expression. + * + * TODO: add support for more types of expressions, in particular expressions + * leading to lvalues (slicing, reshaping, etc...) + */ + +// Generic evaluator +template +struct TensorEvaluator +{ + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename Derived::Dimensions Dimensions; + typedef Derived XprType; + static const int PacketSize = PacketType::size; + typedef typename internal::traits::template MakePointer::Type TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + // NumDimensions is -1 for variable dim tensors + static const int NumCoords = internal::traits::NumDimensions > 0 ? 
+ internal::traits::NumDimensions : 0; + + enum { + IsAligned = Derived::IsAligned, + PacketAccess = (PacketType::size > 1), + BlockAccess = internal::is_arithmetic::type>::value, + PreferBlockAccess = false, + Layout = Derived::Layout, + CoordAccess = NumCoords > 0, + RawAccess = true + }; + + typedef typename internal::remove_const::type ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock + TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) + : m_data(device.get((const_cast(m.data())))), + m_dims(m.dimensions()), + m_device(device) + { } + + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType dest) { + if (!NumTraits::type>::RequireInitialization && dest) { + m_device.memcpy((void*)(m_device.get(dest)), m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar)); + return false; + } + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType dest, EvalSubExprsCallback done) { + // TODO(ezhulenev): ThreadPoolDevice memcpy is blockign operation. 
+ done(evalSubExprsIfNeeded(dest)); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + eigen_assert(m_data != NULL); + return m_data[index]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { + eigen_assert(m_data != NULL); + return m_data[index]; + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketReturnType packet(Index index) const + { + return internal::ploadt(m_data + index); + } + + // Return a packet starting at `index` where `umask` specifies which elements + // have to be loaded. Type/size of mask depends on PacketReturnType, e.g. for + // Packet16f, `umask` is of type uint16_t and if a bit is 1, corresponding + // float element will be loaded, otherwise 0 will be loaded. + // Function has been templatized to enable Sfinae. + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename internal::enable_if::masked_load_available, PacketReturnTypeT>::type + partialPacket(Index index, typename internal::unpacket_traits::mask_t umask) const + { + return internal::ploadu(m_data + index, umask); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + return internal::pstoret(m_data + index, x); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) const { + eigen_assert(m_data != NULL); + if (static_cast(Layout) == static_cast(ColMajor)) { + return m_data[m_dims.IndexOfColMajor(coords)]; + } else { + return m_data[m_dims.IndexOfRowMajor(coords)]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& + coeffRef(const array& coords) { + eigen_assert(m_data != NULL); + if (static_cast(Layout) == static_cast(ColMajor)) { + return m_data[m_dims.IndexOfColMajor(coords)]; + } else { + return m_data[m_dims.IndexOfRowMajor(coords)]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost 
costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, + PacketType::size); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + return internal::TensorBlockResourceRequirements::any(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + assert(m_data != NULL); + return TensorBlock::materialize(m_data, m_dims, desc, scratch); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const TensorBlockDesc& desc, const TensorBlock& block) { + assert(m_data != NULL); + + typedef typename TensorBlock::XprType TensorBlockExpr; + typedef internal::TensorBlockAssignment + TensorBlockAssign; + + TensorBlockAssign::Run( + TensorBlockAssign::target(desc.dimensions(), + internal::strides(m_dims), m_data, + desc.offset()), + block.expr()); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_data.bind(cgh); + } +#endif + protected: + EvaluatorPointerType m_data; + Dimensions m_dims; + const Device EIGEN_DEVICE_REF m_device; +}; + +namespace { +template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T loadConstant(const T* address) { + return *address; +} +// Use the texture cache on CUDA devices whenever possible +#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 +template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float loadConstant(const float* address) { + return __ldg(address); +} +template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double loadConstant(const double* address) { + return __ldg(address); +} +template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +Eigen::half loadConstant(const Eigen::half* address) { + return 
Eigen::half(half_impl::raw_uint16_to_half(__ldg(&address->x))); +} +#endif +#ifdef EIGEN_USE_SYCL +// overload of load constant should be implemented here based on range access +template +T &loadConstant(const Eigen::TensorSycl::internal::RangeAccess &address) { + return *address; +} +#endif +} + + +// Default evaluator for rvalues +template +struct TensorEvaluator +{ + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename Derived::Dimensions Dimensions; + typedef const Derived XprType; + typedef typename internal::traits::template MakePointer::Type TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + typedef typename internal::remove_const::type ScalarNoConst; + + // NumDimensions is -1 for variable dim tensors + static const int NumCoords = internal::traits::NumDimensions > 0 ? 
+ internal::traits::NumDimensions : 0; + static const int PacketSize = PacketType::size; + + enum { + IsAligned = Derived::IsAligned, + PacketAccess = (PacketType::size > 1), + BlockAccess = internal::is_arithmetic::value, + PreferBlockAccess = false, + Layout = Derived::Layout, + CoordAccess = NumCoords > 0, + RawAccess = true + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock + TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) + : m_data(device.get(m.data())), m_dims(m.dimensions()), m_device(device) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + if (!NumTraits::type>::RequireInitialization && data) { + m_device.memcpy((void*)(m_device.get(data)),m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar)); + return false; + } + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType dest, EvalSubExprsCallback done) { + // TODO(ezhulenev): ThreadPoolDevice memcpy is a blockign operation. + done(evalSubExprsIfNeeded(dest)); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + eigen_assert(m_data != NULL); + return loadConstant(m_data+index); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketReturnType packet(Index index) const + { + return internal::ploadt_ro(m_data + index); + } + + // Return a packet starting at `index` where `umask` specifies which elements + // have to be loaded. 
Type/size of mask depends on PacketReturnType, e.g. for + // Packet16f, `umask` is of type uint16_t and if a bit is 1, corresponding + // float element will be loaded, otherwise 0 will be loaded. + // Function has been templatized to enable Sfinae. + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename internal::enable_if::masked_load_available, PacketReturnTypeT>::type + partialPacket(Index index, typename internal::unpacket_traits::mask_t umask) const + { + return internal::ploadu(m_data + index, umask); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) const { + eigen_assert(m_data != NULL); + const Index index = (static_cast(Layout) == static_cast(ColMajor)) ? m_dims.IndexOfColMajor(coords) + : m_dims.IndexOfRowMajor(coords); + return loadConstant(m_data+index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, + PacketType::size); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + return internal::TensorBlockResourceRequirements::any(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + assert(m_data != NULL); + return TensorBlock::materialize(m_data, m_dims, desc, scratch); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_data.bind(cgh); + } +#endif + protected: + EvaluatorPointerType m_data; + Dimensions m_dims; + const Device EIGEN_DEVICE_REF m_device; +}; + + + + +// -------------------- CwiseNullaryOp -------------------- + +template +struct TensorEvaluator, Device> +{ + typedef TensorCwiseNullaryOp XprType; + 
+ TensorEvaluator(const XprType& op, const Device& device) + : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper() + { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::traits::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = PacketType::size; + typedef typename TensorEvaluator::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + enum { + IsAligned = true, + PacketAccess = internal::functor_traits::PacketAccess + #ifdef EIGEN_USE_SYCL + && (PacketType::size >1) + #endif + , + BlockAccess = false, + PreferBlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { return true; } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + done(true); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_wrapper(m_functor, index); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_wrapper.template packetOp(m_functor, index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, + PacketType::size); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() 
const { return NULL; } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_argImpl.bind(cgh); + } +#endif + + private: + const NullaryOp m_functor; + TensorEvaluator m_argImpl; + const internal::nullary_wrapper m_wrapper; +}; + + + +// -------------------- CwiseUnaryOp -------------------- + +template +struct TensorEvaluator, Device> +{ + typedef TensorCwiseUnaryOp XprType; + + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = int(TensorEvaluator::PacketAccess) & + int(internal::functor_traits::PacketAccess), + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + TensorEvaluator(const XprType& op, const Device& device) + : m_device(device), + m_functor(op.functor()), + m_argImpl(op.nestedExpression(), device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const::type ScalarNoConst; + typedef typename internal::traits::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = PacketType::size; + typedef typename TensorEvaluator::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + static const int NumDims = internal::array_size::value; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock + ArgTensorBlock; + + typedef internal::TensorCwiseUnaryBlock + TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_DEVICE_FUNC const 
Dimensions& dimensions() const { return m_argImpl.dimensions(); } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_argImpl.evalSubExprsIfNeeded(NULL); + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + m_argImpl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { + m_argImpl.cleanup(); + } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_functor(m_argImpl.coeff(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_functor.packetOp(m_argImpl.template packet(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double functor_cost = internal::functor_traits::Cost; + return m_argImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + static const double functor_cost = internal::functor_traits::Cost; + return m_argImpl.getResourceRequirements().addCostPerCoeff( + {0, 0, functor_cost / PacketSize}); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + return TensorBlock(m_argImpl.block(desc, scratch), m_functor); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const{ + m_argImpl.bind(cgh); + } +#endif + + + private: + const Device EIGEN_DEVICE_REF m_device; + const UnaryOp m_functor; + TensorEvaluator m_argImpl; +}; + + 
+// -------------------- CwiseBinaryOp -------------------- + +template +struct TensorEvaluator, Device> +{ + typedef TensorCwiseBinaryOp XprType; + + enum { + IsAligned = int(TensorEvaluator::IsAligned) & + int(TensorEvaluator::IsAligned), + PacketAccess = int(TensorEvaluator::PacketAccess) & + int(TensorEvaluator::PacketAccess) & + int(internal::functor_traits::PacketAccess), + BlockAccess = int(TensorEvaluator::BlockAccess) & + int(TensorEvaluator::BlockAccess), + PreferBlockAccess = int(TensorEvaluator::PreferBlockAccess) | + int(TensorEvaluator::PreferBlockAccess), + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + TensorEvaluator(const XprType& op, const Device& device) + : m_device(device), + m_functor(op.functor()), + m_leftImpl(op.lhsExpression(), device), + m_rightImpl(op.rhsExpression(), device) + { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout) || internal::traits::NumDimensions <= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::traits::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = PacketType::size; + typedef typename TensorEvaluator::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static const int NumDims = internal::array_size< + typename TensorEvaluator::Dimensions>::value; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock + LeftTensorBlock; + typedef typename TensorEvaluator::TensorBlock + RightTensorBlock; + + 
typedef internal::TensorCwiseBinaryBlock + TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const + { + // TODO: use right impl instead if right impl dimensions are known at compile time. + return m_leftImpl.dimensions(); + } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_leftImpl.evalSubExprsIfNeeded(NULL); + m_rightImpl.evalSubExprsIfNeeded(NULL); + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + // TODO(ezhulenev): Evaluate two expression in parallel? + m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) { + m_rightImpl.evalSubExprsIfNeededAsync(nullptr, + [done](bool) { done(true); }); + }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { + m_leftImpl.cleanup(); + m_rightImpl.cleanup(); + } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_functor(m_leftImpl.coeff(index), m_rightImpl.coeff(index)); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_functor.packetOp(m_leftImpl.template packet(index), m_rightImpl.template packet(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + const double functor_cost = internal::functor_traits::Cost; + return m_leftImpl.costPerCoeff(vectorized) + + m_rightImpl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + static const double functor_cost = internal::functor_traits::Cost; + return internal::TensorBlockResourceRequirements::merge( + m_leftImpl.getResourceRequirements(), + m_rightImpl.getResourceRequirements()) + .addCostPerCoeff({0, 0, 
functor_cost / PacketSize}); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + desc.DropDestinationBuffer(); + return TensorBlock(m_leftImpl.block(desc, scratch), + m_rightImpl.block(desc, scratch), m_functor); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + #ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_leftImpl.bind(cgh); + m_rightImpl.bind(cgh); + } + #endif + private: + const Device EIGEN_DEVICE_REF m_device; + const BinaryOp m_functor; + TensorEvaluator m_leftImpl; + TensorEvaluator m_rightImpl; +}; + +// -------------------- CwiseTernaryOp -------------------- + +template +struct TensorEvaluator, Device> +{ + typedef TensorCwiseTernaryOp XprType; + + enum { + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess && + TensorEvaluator::PacketAccess && + TensorEvaluator::PacketAccess && + internal::functor_traits::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess || + TensorEvaluator::PreferBlockAccess || + TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + TensorEvaluator(const XprType& op, const Device& device) + : m_functor(op.functor()), + m_arg1Impl(op.arg1Expression(), device), + m_arg2Impl(op.arg2Expression(), device), + m_arg3Impl(op.arg3Expression(), device) + { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout) || internal::traits::NumDimensions <= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + + EIGEN_STATIC_ASSERT((internal::is_same::StorageKind, + typename internal::traits::StorageKind>::value), + 
STORAGE_KIND_MUST_MATCH) + EIGEN_STATIC_ASSERT((internal::is_same::StorageKind, + typename internal::traits::StorageKind>::value), + STORAGE_KIND_MUST_MATCH) + EIGEN_STATIC_ASSERT((internal::is_same::Index, + typename internal::traits::Index>::value), + STORAGE_INDEX_MUST_MATCH) + EIGEN_STATIC_ASSERT((internal::is_same::Index, + typename internal::traits::Index>::value), + STORAGE_INDEX_MUST_MATCH) + + eigen_assert(dimensions_match(m_arg1Impl.dimensions(), m_arg2Impl.dimensions()) && dimensions_match(m_arg1Impl.dimensions(), m_arg3Impl.dimensions())); + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::traits::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = PacketType::size; + typedef typename TensorEvaluator::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const + { + // TODO: use arg2 or arg3 dimensions if they are known at compile time. 
+ return m_arg1Impl.dimensions(); + } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_arg1Impl.evalSubExprsIfNeeded(NULL); + m_arg2Impl.evalSubExprsIfNeeded(NULL); + m_arg3Impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_STRONG_INLINE void cleanup() { + m_arg1Impl.cleanup(); + m_arg2Impl.cleanup(); + m_arg3Impl.cleanup(); + } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_functor(m_arg1Impl.coeff(index), m_arg2Impl.coeff(index), m_arg3Impl.coeff(index)); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_functor.packetOp(m_arg1Impl.template packet(index), + m_arg2Impl.template packet(index), + m_arg3Impl.template packet(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + const double functor_cost = internal::functor_traits::Cost; + return m_arg1Impl.costPerCoeff(vectorized) + + m_arg2Impl.costPerCoeff(vectorized) + + m_arg3Impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_arg1Impl.bind(cgh); + m_arg2Impl.bind(cgh); + m_arg3Impl.bind(cgh); + } +#endif + + private: + const TernaryOp m_functor; + TensorEvaluator m_arg1Impl; + TensorEvaluator m_arg2Impl; + TensorEvaluator m_arg3Impl; +}; + + +// -------------------- SelectOp -------------------- + +template +struct TensorEvaluator, Device> +{ + typedef TensorSelectOp XprType; + typedef typename XprType::Scalar Scalar; + + enum { + IsAligned = TensorEvaluator::IsAligned & + TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & + TensorEvaluator::PacketAccess & + PacketType::HasBlend, + BlockAccess = 
TensorEvaluator::BlockAccess && + TensorEvaluator::BlockAccess && + TensorEvaluator::BlockAccess, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess || + TensorEvaluator::PreferBlockAccess || + TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + TensorEvaluator(const XprType& op, const Device& device) + : m_condImpl(op.ifExpression(), device), + m_thenImpl(op.thenExpression(), device), + m_elseImpl(op.elseExpression(), device) + { + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(dimensions_match(m_condImpl.dimensions(), m_thenImpl.dimensions())); + eigen_assert(dimensions_match(m_thenImpl.dimensions(), m_elseImpl.dimensions())); + } + + typedef typename XprType::Index Index; + typedef typename internal::traits::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = PacketType::size; + typedef typename TensorEvaluator::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static const int NumDims = internal::array_size::value; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock + IfArgTensorBlock; + typedef typename TensorEvaluator::TensorBlock + ThenArgTensorBlock; + typedef typename TensorEvaluator::TensorBlock + ElseArgTensorBlock; + + struct TensorSelectOpBlockFactory { + template + struct XprType { + typedef TensorSelectOp type; + }; + + template + typename XprType::type expr( + const IfArgXprType& if_expr, const 
ThenArgXprType& then_expr, const ElseArgXprType& else_expr) const { + return typename XprType::type(if_expr, then_expr, else_expr); + } + }; + + typedef internal::TensorTernaryExprBlock + TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const + { + // TODO: use then or else impl instead if they happen to be known at compile time. + return m_condImpl.dimensions(); + } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_condImpl.evalSubExprsIfNeeded(NULL); + m_thenImpl.evalSubExprsIfNeeded(NULL); + m_elseImpl.evalSubExprsIfNeeded(NULL); + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { // chain cond -> then -> else via the async entry point; done(true) fires after the last one + m_condImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) { + m_thenImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) { + m_elseImpl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + }); + }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { + m_condImpl.cleanup(); + m_thenImpl.cleanup(); + m_elseImpl.cleanup(); + } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_condImpl.coeff(index) ?
m_thenImpl.coeff(index) : m_elseImpl.coeff(index); + } + template + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const + { + internal::Selector select; + EIGEN_UNROLL_LOOP + for (Index i = 0; i < PacketSize; ++i) { + select.select[i] = m_condImpl.coeff(index+i); + } + return internal::pblend(select, + m_thenImpl.template packet(index), + m_elseImpl.template packet(index)); + + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + return m_condImpl.costPerCoeff(vectorized) + + m_thenImpl.costPerCoeff(vectorized) + .cwiseMax(m_elseImpl.costPerCoeff(vectorized)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + auto then_req = m_thenImpl.getResourceRequirements(); + auto else_req = m_elseImpl.getResourceRequirements(); + + auto merged_req = + internal::TensorBlockResourceRequirements::merge(then_req, else_req); + merged_req.cost_per_coeff = + then_req.cost_per_coeff.cwiseMax(else_req.cost_per_coeff); + + return internal::TensorBlockResourceRequirements::merge( + m_condImpl.getResourceRequirements(), merged_req); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + // It's unsafe to pass destination buffer to underlying expressions, because + // output might be aliased with one of the inputs. 
+ desc.DropDestinationBuffer(); + + return TensorBlock( + m_condImpl.block(desc, scratch), m_thenImpl.block(desc, scratch), + m_elseImpl.block(desc, scratch), TensorSelectOpBlockFactory()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_condImpl.bind(cgh); + m_thenImpl.bind(cgh); + m_elseImpl.bind(cgh); + } +#endif + private: + TensorEvaluator m_condImpl; + TensorEvaluator m_thenImpl; + TensorEvaluator m_elseImpl; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorExecutor.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorExecutor.h new file mode 100644 index 0000000..c52fb77 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorExecutor.h @@ -0,0 +1,703 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H +#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H + +namespace Eigen { + +/** + * \class TensorExecutor + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor executor class. + * + * This class is responsible for launching the evaluation of the expression on + * the specified computing device. + * + * @tparam Vectorizable can use packet math (SSE/AVX/etc... registers and + * instructions) + * @tparam Tiling can use block based tensor evaluation + * (see TensorBlock.h) + */ +namespace internal { + +/** + * Evaluating TensorBroadcastingOp via coefficient or packet path is extremely + * expensive.
If expression has at least one broadcast op in it, and it supports + * block based evaluation, we always prefer it, even for the small tensors. For + * all other tileable ops, block evaluation overhead for small tensors (fits + * into L1) is too large, and we fallback on vectorized evaluation. + */ + +// TODO(ezhulenev): Add specializations for all other types of Tensor ops. + +template +struct ExpressionHasTensorBroadcastingOp { + enum { value = false }; +}; + +template +struct ExpressionHasTensorBroadcastingOp< + const TensorAssignOp > { + enum { value = ExpressionHasTensorBroadcastingOp::value }; +}; + +template +struct ExpressionHasTensorBroadcastingOp< + const TensorCwiseUnaryOp > { + enum { value = ExpressionHasTensorBroadcastingOp::value }; +}; + +template +struct ExpressionHasTensorBroadcastingOp< + const TensorCwiseBinaryOp > { + enum { + value = ExpressionHasTensorBroadcastingOp::value || + ExpressionHasTensorBroadcastingOp::value + }; +}; + +template +struct ExpressionHasTensorBroadcastingOp< + const TensorBroadcastingOp > { + enum { value = true }; +}; + +// -------------------------------------------------------------------------- // + +/** + * Default strategy: the expression is evaluated sequentially with a single cpu + * thread, without vectorization and block evaluation. + */ +template +class TensorExecutor { + public: + typedef typename Expression::Index StorageIndex; + + // Including `unsupported/Eigen/CXX11/Tensor` in different translation units + // with/without `EIGEN_USE_THREADS` or `EIGEN_USE_GPU` is a potential ODR + // violation. If this template is instantiated with a non-default device, it + // means that this header file was included without defining + // `EIGEN_USE_THREADS`, `EIGEN_USE_GPU` or `EIGEN_USE_SYCL`. + static_assert(std::is_same::value, + "Default executor instantiated with non-default device. 
" + "You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or " + "EIGEN_USE_SYCL before including Eigen headers."); + + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE void run(const Expression& expr, + const Device& device = Device()) { + TensorEvaluator evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) { + const StorageIndex size = array_prod(evaluator.dimensions()); + for (StorageIndex i = 0; i < size; ++i) { + evaluator.evalScalar(i); + } + } + evaluator.cleanup(); + } +}; + +/** + * Default async execution strategy is not implemented. Currently it's only + * available for ThreadPoolDevice (see definition below). + */ +template +class TensorAsyncExecutor {}; + +/** + * Process all the data with a single cpu thread, using vectorized instructions. + */ +template +class TensorExecutor { + public: + typedef typename Expression::Index StorageIndex; + + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE void run( + const Expression& expr, const DefaultDevice& device = DefaultDevice()) { + TensorEvaluator evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) { + const StorageIndex size = array_prod(evaluator.dimensions()); + const int PacketSize = unpacket_traits::PacketReturnType>::size; + + // Give compiler a strong possibility to unroll the loop. But don't insist + // on unrolling, because if the function is expensive compiler should not + // unroll the loop at the expense of inlining. 
+ const StorageIndex UnrolledSize = + (size / (4 * PacketSize)) * 4 * PacketSize; + for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) { + for (StorageIndex j = 0; j < 4; j++) { + evaluator.evalPacket(i + j * PacketSize); + } + } + const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize; + for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) { + evaluator.evalPacket(i); + } + for (StorageIndex i = VectorizedSize; i < size; ++i) { + evaluator.evalScalar(i); + } + } + evaluator.cleanup(); + } +}; + +/** + * Process all the data with a single cpu thread, using blocks of data. By + * sizing a block to fit L1 cache we get better cache performance. + */ +template +class TensorExecutor { + public: + typedef typename traits::Scalar Scalar; + typedef typename remove_const::type ScalarNoConst; + + typedef TensorEvaluator Evaluator; + typedef typename traits::Index StorageIndex; + + static const int NumDims = traits::NumDimensions; + + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE void run(const Expression& expr, + const DefaultDevice& device = DefaultDevice()) { + typedef TensorBlockMapper + TensorBlockMapper; + + typedef internal::TensorBlockDescriptor + TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator + TensorBlockScratch; + + Evaluator evaluator(expr, device); + + // TODO(ezhulenev): Do not use tiling for small tensors? + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + + if (needs_assign) { + // Query expression tree for desired block size/shape. + const TensorBlockResourceRequirements requirements = + evaluator.getResourceRequirements(); + + const TensorBlockMapper block_mapper( + typename TensorBlockDesc::Dimensions(evaluator.dimensions()), + requirements); + + // Share scratch memory allocator between all blocks. 
+ TensorBlockScratch scratch(device); + + const StorageIndex total_block_count = block_mapper.blockCount(); + for (StorageIndex i = 0; i < total_block_count; ++i) { + TensorBlockDesc desc = block_mapper.blockDescriptor(i); + evaluator.evalBlock(desc, scratch); + scratch.reset(); + } + } + evaluator.cleanup(); + } +}; + +/** + * Multicore strategy: the index space is partitioned and each partition is + * executed on a single core. + * + * (1) TensorExecutor will submit work to the ThreadPoolDevice managed thread + * pool, and will block the caller thread until all tasks are finished. + * + * (2) TensorAsyncExecutor is a non-blocking version, that will submit work to + * the ThreadPoolDevice managed thread pool, and will return immediately. + * It will call 'done' callback after all tasks are finished. + */ +#ifdef EIGEN_USE_THREADS + +template +struct TensorExecutorTilingContext { + TensorExecutorTilingContext() = default; + TensorExecutorTilingContext(const TensorBlockMapper& b_mapper, + const TensorOpCost& b_cost, size_t b_aligned_size) + : block_mapper(b_mapper), + cost(b_cost), + aligned_blocksize(b_aligned_size) {} + + TensorBlockMapper block_mapper; // navigate through blocks + TensorOpCost cost; // cost of computing a single block + size_t aligned_blocksize; // block size after memory alignment +}; + +// Computes block evaluation parameters (block mapper, per-block cost and aligned +// block size). See TensorExecutor/TensorAsyncExecutor (Tiling=On) below. +template +TensorExecutorTilingContext GetTensorExecutorTilingContext( + const Evaluator& evaluator) { + // Query expression tree for desired block size/shape. + TensorBlockResourceRequirements requirements = + evaluator.getResourceRequirements(); + + // Update target block size based on cost model.
+ double taskSize = TensorCostModel::taskSize( + 1, requirements.cost_per_coeff); + requirements.size = static_cast(1.0 / taskSize); + + TensorBlockMapper block_mapper( + typename TensorBlockMapper::Dimensions(evaluator.dimensions()), + requirements); + + size_t block_size = block_mapper.blockTotalSize(); + const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1); + const size_t aligned_blocksize = + align * + divup(block_size * sizeof(typename Evaluator::Scalar), align); + + return {block_mapper, requirements.cost_per_coeff * block_size, + aligned_blocksize}; +} + +template +struct EvalRange { + static void run(Evaluator* evaluator_in, const StorageIndex firstIdx, + const StorageIndex lastIdx) { + Evaluator evaluator = *evaluator_in; + eigen_assert(lastIdx >= firstIdx); + for (StorageIndex i = firstIdx; i < lastIdx; ++i) { + evaluator.evalScalar(i); + } + } + + static StorageIndex alignBlockSize(StorageIndex size) { return size; } +}; + +template +struct EvalRange { + static const int PacketSize = + unpacket_traits::size; + + static void run(Evaluator* evaluator_in, const StorageIndex firstIdx, + const StorageIndex lastIdx) { + Evaluator evaluator = *evaluator_in; + eigen_assert(lastIdx >= firstIdx); + StorageIndex i = firstIdx; + if (lastIdx - firstIdx >= PacketSize) { + eigen_assert(firstIdx % PacketSize == 0); + StorageIndex last_chunk_offset = lastIdx - 4 * PacketSize; + // Give compiler a strong possibility to unroll the loop. But don't insist + // on unrolling, because if the function is expensive compiler should not + // unroll the loop at the expense of inlining. 
+ for (; i <= last_chunk_offset; i += 4 * PacketSize) { + for (StorageIndex j = 0; j < 4; j++) { + evaluator.evalPacket(i + j * PacketSize); + } + } + last_chunk_offset = lastIdx - PacketSize; + for (; i <= last_chunk_offset; i += PacketSize) { + evaluator.evalPacket(i); + } + } + for (; i < lastIdx; ++i) { + evaluator.evalScalar(i); + } + } + + static StorageIndex alignBlockSize(StorageIndex size) { + // Align block size to packet size and account for unrolling in run above. + if (size >= 16 * PacketSize) { + return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1); + } + // Aligning to 4 * PacketSize would increase block size by more than 25%. + return (size + PacketSize - 1) & ~(PacketSize - 1); + } +}; + +template +class TensorExecutor { + public: + typedef typename Expression::Index StorageIndex; + + static EIGEN_STRONG_INLINE void run(const Expression& expr, + const ThreadPoolDevice& device) { + typedef TensorEvaluator Evaluator; + typedef EvalRange EvalRange; + + Evaluator evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); + if (needs_assign) { + const StorageIndex size = array_prod(evaluator.dimensions()); + device.parallelFor(size, evaluator.costPerCoeff(Vectorizable), + EvalRange::alignBlockSize, + [&evaluator](StorageIndex firstIdx, StorageIndex lastIdx) { + EvalRange::run(&evaluator, firstIdx, lastIdx); + }); + } + evaluator.cleanup(); + } +}; + +template +class TensorExecutor { + public: + typedef typename traits::Index IndexType; + typedef typename traits::Scalar Scalar; + typedef typename remove_const::type ScalarNoConst; + + static const int NumDims = traits::NumDimensions; + + typedef TensorEvaluator Evaluator; + typedef TensorBlockMapper BlockMapper; + typedef TensorExecutorTilingContext TilingContext; + + typedef internal::TensorBlockDescriptor + TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator + TensorBlockScratch; + + static EIGEN_STRONG_INLINE void run(const Expression& expr, + const 
ThreadPoolDevice& device) { + Evaluator evaluator(expr, device); + + const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); + if (needs_assign) { + const TilingContext tiling = + internal::GetTensorExecutorTilingContext(evaluator); + + auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx, + IndexType lastBlockIdx) { + TensorBlockScratch scratch(device); + + for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx; + ++block_idx) { + TensorBlockDesc desc = tiling.block_mapper.blockDescriptor(block_idx); + evaluator.evalBlock(desc, scratch); + scratch.reset(); + } + }; + + // Evaluate small expressions directly as a single block. + if (tiling.block_mapper.blockCount() == 1) { + TensorBlockScratch scratch(device); + TensorBlockDesc desc(0, tiling.block_mapper.blockDimensions()); + evaluator.evalBlock(desc, scratch); + } else { + device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost, + eval_block); + } + } + evaluator.cleanup(); + } +}; + +template +class TensorAsyncExecutor { + public: + typedef typename Expression::Index StorageIndex; + typedef TensorEvaluator Evaluator; + + static EIGEN_STRONG_INLINE void runAsync(const Expression& expr, + const ThreadPoolDevice& device, + DoneCallback done) { + TensorAsyncExecutorContext* const ctx = + new TensorAsyncExecutorContext(expr, device, std::move(done)); + + const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void { + if (!need_assign) { + delete ctx; + return; + } + + typedef EvalRange EvalRange; + const StorageIndex size = array_prod(ctx->evaluator.dimensions()); + device.parallelForAsync( + size, ctx->evaluator.costPerCoeff(Vectorizable), + EvalRange::alignBlockSize, + [ctx](StorageIndex firstIdx, StorageIndex lastIdx) { + EvalRange::run(&ctx->evaluator, firstIdx, lastIdx); + }, + [ctx]() { delete ctx; }); + }; + + ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs); + } + + private: + struct TensorAsyncExecutorContext { + 
TensorAsyncExecutorContext(const Expression& expr, + const ThreadPoolDevice& thread_pool, + DoneCallback done) + : evaluator(expr, thread_pool), on_done(std::move(done)) {} + + ~TensorAsyncExecutorContext() { + evaluator.cleanup(); + on_done(); + } + + Evaluator evaluator; + + private: + DoneCallback on_done; + }; +}; + +template +class TensorAsyncExecutor { + public: + typedef typename traits::Index IndexType; + typedef typename traits::Scalar Scalar; + typedef typename remove_const::type ScalarNoConst; + + static const int NumDims = traits::NumDimensions; + + typedef TensorEvaluator Evaluator; + typedef TensorBlockMapper BlockMapper; + typedef TensorExecutorTilingContext TilingContext; + + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator + TensorBlockScratch; + + static EIGEN_STRONG_INLINE void runAsync(const Expression& expr, + const ThreadPoolDevice& device, + DoneCallback done) { + + TensorAsyncExecutorContext* const ctx = + new TensorAsyncExecutorContext(expr, device, std::move(done)); + + const auto on_eval_subexprs = [ctx](bool need_assign) -> void { + if (!need_assign) { + delete ctx; + return; + } + + ctx->tiling = internal::GetTensorExecutorTilingContext< + Evaluator, BlockMapper, Vectorizable>(ctx->evaluator); + + auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) { + TensorBlockScratch scratch(ctx->device); + + for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx; + ++block_idx) { + TensorBlockDesc desc = + ctx->tiling.block_mapper.blockDescriptor(block_idx); + ctx->evaluator.evalBlock(desc, scratch); + scratch.reset(); + } + }; + + // Evaluate small expressions directly as a single block. 
+ if (ctx->tiling.block_mapper.blockCount() == 1) { + TensorBlockScratch scratch(ctx->device); + TensorBlockDesc desc(0, ctx->tiling.block_mapper.blockDimensions()); + ctx->evaluator.evalBlock(desc, scratch); + delete ctx; + } else { + ctx->device.parallelForAsync(ctx->tiling.block_mapper.blockCount(), + ctx->tiling.cost, eval_block, + [ctx]() { delete ctx; }); + } + }; + + ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs); + } + + private: + struct TensorAsyncExecutorContext { + TensorAsyncExecutorContext(const Expression& expr, + const ThreadPoolDevice& thread_pool, + DoneCallback done) + : device(thread_pool), + evaluator(expr, thread_pool), + on_done(std::move(done)) {} + + ~TensorAsyncExecutorContext() { + evaluator.cleanup(); + on_done(); + } + + const ThreadPoolDevice& device; + Evaluator evaluator; + TilingContext tiling; + + private: + DoneCallback on_done; + }; +}; + +#endif // EIGEN_USE_THREADS + +// GPU: the evaluation of the expression is offloaded to a GPU. 
+#if defined(EIGEN_USE_GPU) + +template +class TensorExecutor { + public: + typedef typename Expression::Index StorageIndex; + static void run(const Expression& expr, const GpuDevice& device); +}; + +#if defined(EIGEN_GPUCC) +template +struct EigenMetaKernelEval { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) { + for (StorageIndex i = firstIdx; i < lastIdx; i += step_size) { + eval.evalScalar(i); + } + } +}; + +template +struct EigenMetaKernelEval { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) { + const StorageIndex PacketSize = unpacket_traits::size; + const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize; + const StorageIndex vectorized_step_size = step_size * PacketSize; + + // Use the vector path + for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size; + i += vectorized_step_size) { + eval.evalPacket(i); + } + for (StorageIndex i = vectorized_size + firstIdx; i < lastIdx; i += step_size) { + eval.evalScalar(i); + } + } +}; + +template +__global__ void +__launch_bounds__(1024) +EigenMetaKernel(Evaluator eval, StorageIndex size) { + + const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x; + const StorageIndex step_size = blockDim.x * gridDim.x; + + const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned; + EigenMetaKernelEval::run(eval, first_index, size, step_size); +} + +/*static*/ +template +EIGEN_STRONG_INLINE void TensorExecutor::run( + const Expression& expr, const GpuDevice& device) { + TensorEvaluator evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); + if (needs_assign) { + + const int block_size = device.maxGpuThreadsPerBlock(); + const int max_blocks = device.getNumGpuMultiProcessors() * + device.maxGpuThreadsPerMultiProcessor() / block_size; + const 
StorageIndex size = array_prod(evaluator.dimensions()); + // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0. + const int num_blocks = numext::maxi(numext::mini(max_blocks, divup(size, block_size)), 1); + + LAUNCH_GPU_KERNEL( + (EigenMetaKernel, StorageIndex>), + num_blocks, block_size, 0, device, evaluator, size); + } + evaluator.cleanup(); +} + +#endif // EIGEN_GPUCC +#endif // EIGEN_USE_GPU + +// SYCL Executor policy +#ifdef EIGEN_USE_SYCL + +template +struct ExecExprFunctorKernel { + typedef typename Evaluator::Index Index; + Evaluator evaluator; + const Index range; + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel( + const Scratch, Evaluator evaluator_, const Index range_) + : evaluator(evaluator_), range(range_) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()( + cl::sycl::nd_item<1> itemID) { + compute(itemID); + } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if::type + compute(const cl::sycl::nd_item<1>& itemID) { + Index gId = static_cast(itemID.get_global_linear_id()); + Index total_threads = itemID.get_global_range(0); + + for (Index i = gId; i < range; i += total_threads) { + evaluator.evalScalar(i); + } + } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if::type + compute(const cl::sycl::nd_item<1>& itemID) { + const Index vectorizedRange = + (range / Evaluator::PacketSize) * Evaluator::PacketSize; + Index gId = static_cast(itemID.get_global_linear_id()); + const Index step = Evaluator::PacketSize * itemID.get_global_range(0); + const Index start = Evaluator::PacketSize * gId; + for (Index i = start; i < vectorizedRange; i += step) { + evaluator.evalPacket(i); + } + gId += vectorizedRange; + for (Index i = gId; i < range; i += itemID.get_global_range(0)) { + evaluator.evalScalar(i); + } + } +}; + +template +class TensorExecutor { + public: + typedef typename Expression::Index Index; + static EIGEN_STRONG_INLINE void 
run(const Expression& expr, + const Eigen::SyclDevice& dev) { + typedef Eigen::TensorEvaluator Evaluator; + Evaluator evaluator(expr, dev); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) { + Index range, GRange, tileSize; + Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions()); + total_size = (total_size == 0) ? 1 : total_size; + const int PacketSize = + Eigen::PacketType::size; + Index vectorizable_threads = static_cast(total_size / PacketSize); + dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange); + range = total_size; + + dev.template nullary_kernel_launcher< + typename Evaluator::CoeffReturnType, + ExecExprFunctorKernel >( + evaluator, + cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), + cl::sycl::range<1>(tileSize)), + Index(1), range); + } + evaluator.cleanup(); + } +}; + +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorExpr.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorExpr.h new file mode 100644 index 0000000..c9bccfc --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorExpr.h @@ -0,0 +1,388 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXPR_H +#define EIGEN_CXX11_TENSOR_TENSOR_EXPR_H + +namespace Eigen { + +/** \class TensorExpr + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor expression classes. + * + * The TensorCwiseNullaryOp class applies a nullary operator to an expression. + * This is typically used to generate constants.
+ * + * The TensorCwiseUnaryOp class represents an expression where a unary operator + * (e.g. cwiseSqrt) is applied to an expression. + * + * The TensorCwiseBinaryOp class represents an expression where a binary + * operator (e.g. addition) is applied to a lhs and a rhs expression. + * + */ +namespace internal { +template +struct traits > + : traits +{ + typedef traits XprTraits; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::Nested XprTypeNested; + typedef typename remove_reference::type _XprTypeNested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; + enum { + Flags = 0 + }; +}; + +} // end namespace internal + + + +template +class TensorCwiseNullaryOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef TensorCwiseNullaryOp Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseNullaryOp(const XprType& xpr, const NullaryOp& func = NullaryOp()) + : m_xpr(xpr), m_functor(func) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + nestedExpression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + const NullaryOp& functor() const { return m_functor; } + + protected: + typename XprType::Nested m_xpr; + const NullaryOp m_functor; +}; + + + +namespace internal { +template +struct traits > + : traits +{ + // TODO(phli): Add InputScalar, InputPacket. Check references to + // current Scalar/Packet to see if the intent is Input or Output. 
+ typedef typename result_of::type Scalar; + typedef traits XprTraits; + typedef typename XprType::Nested XprTypeNested; + typedef typename remove_reference::type _XprTypeNested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + typedef typename TypeConversion::type + PointerType; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorCwiseUnaryOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorCwiseUnaryOp type; +}; + +} // end namespace internal + + + +template +class TensorCwiseUnaryOp : public TensorBase, ReadOnlyAccessors> +{ + public: + // TODO(phli): Add InputScalar, InputPacket. Check references to + // current Scalar/Packet to see if the intent is Input or Output. + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef Scalar CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp()) + : m_xpr(xpr), m_functor(func) {} + + EIGEN_DEVICE_FUNC + const UnaryOp& functor() const { return m_functor; } + + /** \returns the nested expression */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + nestedExpression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const UnaryOp m_functor; +}; + + +namespace internal { +template +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs + // are different. + // TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. Check references to + // current Scalar/Packet to see if the intent is Inputs or Output. 
+ typedef typename result_of< + BinaryOp(typename LhsXprType::Scalar, + typename RhsXprType::Scalar)>::type Scalar; + typedef traits XprTraits; + typedef typename promote_storage_type< + typename traits::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef typename promote_index_type< + typename traits::Index, + typename traits::Index>::type Index; + typedef typename LhsXprType::Nested LhsNested; + typedef typename RhsXprType::Nested RhsNested; + typedef typename remove_reference::type _LhsNested; + typedef typename remove_reference::type _RhsNested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + typedef typename TypeConversion::val, + typename traits::PointerType, + typename traits::PointerType>::type + >::type + PointerType; + enum { + Flags = 0 + }; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorCwiseBinaryOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorCwiseBinaryOp type; +}; + +} // end namespace internal + + + +template +class TensorCwiseBinaryOp : public TensorBase, ReadOnlyAccessors> +{ + public: + // TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. Check references to + // current Scalar/Packet to see if the intent is Inputs or Output. 
+ typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef Scalar CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const BinaryOp& func = BinaryOp()) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_functor(func) {} + + EIGEN_DEVICE_FUNC + const BinaryOp& functor() const { return m_functor; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + lhsExpression() const { return m_lhs_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + rhsExpression() const { return m_rhs_xpr; } + + protected: + typename LhsXprType::Nested m_lhs_xpr; + typename RhsXprType::Nested m_rhs_xpr; + const BinaryOp m_functor; +}; + + +namespace internal { +template +struct traits > +{ + // Type promotion to handle the case where the types of the args are different. 
+ typedef typename result_of< + TernaryOp(typename Arg1XprType::Scalar, + typename Arg2XprType::Scalar, + typename Arg3XprType::Scalar)>::type Scalar; + typedef traits XprTraits; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename Arg1XprType::Nested Arg1Nested; + typedef typename Arg2XprType::Nested Arg2Nested; + typedef typename Arg3XprType::Nested Arg3Nested; + typedef typename remove_reference::type _Arg1Nested; + typedef typename remove_reference::type _Arg2Nested; + typedef typename remove_reference::type _Arg3Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + typedef typename TypeConversion::val, + typename traits::PointerType, + typename traits::PointerType>::type + >::type + PointerType; + enum { + Flags = 0 + }; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorCwiseTernaryOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorCwiseTernaryOp type; +}; + +} // end namespace internal + + + +template +class TensorCwiseTernaryOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef Scalar CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseTernaryOp(const Arg1XprType& arg1, const Arg2XprType& arg2, const Arg3XprType& arg3, const TernaryOp& func = TernaryOp()) + : m_arg1_xpr(arg1), m_arg2_xpr(arg2), m_arg3_xpr(arg3), m_functor(func) {} + + EIGEN_DEVICE_FUNC + const TernaryOp& functor() const { return m_functor; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + arg1Expression() const { return m_arg1_xpr; } + + 
EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + arg2Expression() const { return m_arg2_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + arg3Expression() const { return m_arg3_xpr; } + + protected: + typename Arg1XprType::Nested m_arg1_xpr; + typename Arg2XprType::Nested m_arg2_xpr; + typename Arg3XprType::Nested m_arg3_xpr; + const TernaryOp m_functor; +}; + + +namespace internal { +template +struct traits > + : traits +{ + typedef typename traits::Scalar Scalar; + typedef traits XprTraits; + typedef typename promote_storage_type::StorageKind, + typename traits::StorageKind>::ret StorageKind; + typedef typename promote_index_type::Index, + typename traits::Index>::type Index; + typedef typename IfXprType::Nested IfNested; + typedef typename ThenXprType::Nested ThenNested; + typedef typename ElseXprType::Nested ElseNested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + typedef typename conditional::val, + typename traits::PointerType, + typename traits::PointerType>::type PointerType; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorSelectOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorSelectOp type; +}; + +} // end namespace internal + + +template +class TensorSelectOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::promote_storage_type::ret CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC + TensorSelectOp(const IfXprType& a_condition, + const ThenXprType& a_then, + const ElseXprType& a_else) + : m_condition(a_condition), m_then(a_then), m_else(a_else) + { } + + EIGEN_DEVICE_FUNC + const 
IfXprType& ifExpression() const { return m_condition; } + + EIGEN_DEVICE_FUNC + const ThenXprType& thenExpression() const { return m_then; } + + EIGEN_DEVICE_FUNC + const ElseXprType& elseExpression() const { return m_else; } + + protected: + typename IfXprType::Nested m_condition; + typename ThenXprType::Nested m_then; + typename ElseXprType::Nested m_else; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EXPR_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorFFT.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorFFT.h new file mode 100644 index 0000000..4a1a068 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorFFT.h @@ -0,0 +1,669 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Jianwei Cui +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FFT_H +#define EIGEN_CXX11_TENSOR_TENSOR_FFT_H + +namespace Eigen { + +/** \class TensorFFT + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor FFT class. 
+ * + * TODO: + * Vectorize the Cooley Tukey and the Bluestein algorithm + * Add support for multithreaded evaluation + * Improve the performance on GPU + */ + +template struct MakeComplex { + template + EIGEN_DEVICE_FUNC + T operator() (const T& val) const { return val; } +}; + +template <> struct MakeComplex { + template + EIGEN_DEVICE_FUNC + std::complex operator() (const T& val) const { return std::complex(val, 0); } +}; + +template <> struct MakeComplex { + template + EIGEN_DEVICE_FUNC + std::complex operator() (const std::complex& val) const { return val; } +}; + +template struct PartOf { + template T operator() (const T& val) const { return val; } +}; + +template <> struct PartOf { + template T operator() (const std::complex& val) const { return val.real(); } +}; + +template <> struct PartOf { + template T operator() (const std::complex& val) const { return val.imag(); } +}; + +namespace internal { +template +struct traits > : public traits { + typedef traits XprTraits; + typedef typename NumTraits::Real RealScalar; + typedef typename std::complex ComplexScalar; + typedef typename XprTraits::Scalar InputScalar; + typedef typename conditional::type OutputScalar; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + typedef typename traits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> { + typedef const TensorFFTOp& type; +}; + +template +struct nested, 1, typename eval >::type> { + typedef TensorFFTOp type; +}; + +} // end namespace internal + +template +class TensorFFTOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename std::complex ComplexScalar; + typedef 
typename internal::conditional::type OutputScalar; + typedef OutputScalar CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFFTOp(const XprType& expr, const FFT& fft) + : m_xpr(expr), m_fft(fft) {} + + EIGEN_DEVICE_FUNC + const FFT& fft() const { return m_fft; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& expression() const { + return m_xpr; + } + + protected: + typename XprType::Nested m_xpr; + const FFT m_fft; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + typedef TensorFFTOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename std::complex ComplexScalar; + typedef typename TensorEvaluator::Dimensions InputDimensions; + typedef internal::traits XprTraits; + typedef typename XprTraits::Scalar InputScalar; + typedef typename internal::conditional::type OutputScalar; + typedef OutputScalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + enum { + IsAligned = false, + PacketAccess = true, + BlockAccess = false, + PreferBlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), 
m_impl(op.expression(), device), m_data(NULL), m_device(device) { + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + eigen_assert(input_dims[i] > 0); + m_dimensions[i] = input_dims[i]; + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1]; + } + } else { + m_strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1]; + } + } + m_size = m_dimensions.TotalSize(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { + return m_dimensions; + } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + m_impl.evalSubExprsIfNeeded(NULL); + if (data) { + evalToBuf(data); + return false; + } else { + m_data = (EvaluatorPointerType)m_device.get((CoeffReturnType*)(m_device.allocate_temp(sizeof(CoeffReturnType) * m_size))); + evalToBuf(m_data); + return true; + } + } + + EIGEN_STRONG_INLINE void cleanup() { + if (m_data) { + m_device.deallocate(m_data); + m_data = NULL; + } + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const { + return m_data[index]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType + packet(Index index) const { + return internal::ploadt(m_data + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_data.bind(cgh); + } +#endif + + private: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void 
evalToBuf(EvaluatorPointerType data) { + const bool write_to_out = internal::is_same::value; + ComplexScalar* buf = write_to_out ? (ComplexScalar*)data : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * m_size); + + for (Index i = 0; i < m_size; ++i) { + buf[i] = MakeComplex::value>()(m_impl.coeff(i)); + } + + for (size_t i = 0; i < m_fft.size(); ++i) { + Index dim = m_fft[i]; + eigen_assert(dim >= 0 && dim < NumDims); + Index line_len = m_dimensions[dim]; + eigen_assert(line_len >= 1); + ComplexScalar* line_buf = (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * line_len); + const bool is_power_of_two = isPowerOfTwo(line_len); + const Index good_composite = is_power_of_two ? 0 : findGoodComposite(line_len); + const Index log_len = is_power_of_two ? getLog2(line_len) : getLog2(good_composite); + + ComplexScalar* a = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * good_composite); + ComplexScalar* b = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * good_composite); + ComplexScalar* pos_j_base_powered = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * (line_len + 1)); + if (!is_power_of_two) { + // Compute twiddle factors + // t_n = exp(sqrt(-1) * pi * n^2 / line_len) + // for n = 0, 1,..., line_len-1. + // For n > 2 we use the recurrence t_n = t_{n-1}^2 / t_{n-2} * t_1^2 + + // The recurrence is correct in exact arithmetic, but causes + // numerical issues for large transforms, especially in + // single-precision floating point. 
+ // + // pos_j_base_powered[0] = ComplexScalar(1, 0); + // if (line_len > 1) { + // const ComplexScalar pos_j_base = ComplexScalar( + // numext::cos(M_PI / line_len), numext::sin(M_PI / line_len)); + // pos_j_base_powered[1] = pos_j_base; + // if (line_len > 2) { + // const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base; + // for (int i = 2; i < line_len + 1; ++i) { + // pos_j_base_powered[i] = pos_j_base_powered[i - 1] * + // pos_j_base_powered[i - 1] / + // pos_j_base_powered[i - 2] * + // pos_j_base_sq; + // } + // } + // } + // TODO(rmlarsen): Find a way to use Eigen's vectorized sin + // and cosine functions here. + for (int j = 0; j < line_len + 1; ++j) { + double arg = ((EIGEN_PI * j) * j) / line_len; + std::complex tmp(numext::cos(arg), numext::sin(arg)); + pos_j_base_powered[j] = static_cast(tmp); + } + } + + for (Index partial_index = 0; partial_index < m_size / line_len; ++partial_index) { + const Index base_offset = getBaseOffsetFromIndex(partial_index, dim); + + // get data into line_buf + const Index stride = m_strides[dim]; + if (stride == 1) { + m_device.memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar)); + } else { + Index offset = base_offset; + for (int j = 0; j < line_len; ++j, offset += stride) { + line_buf[j] = buf[offset]; + } + } + + // process the line + if (is_power_of_two) { + processDataLineCooleyTukey(line_buf, line_len, log_len); + } + else { + processDataLineBluestein(line_buf, line_len, good_composite, log_len, a, b, pos_j_base_powered); + } + + // write back + if (FFTDir == FFT_FORWARD && stride == 1) { + m_device.memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar)); + } else { + Index offset = base_offset; + const ComplexScalar div_factor = ComplexScalar(1.0 / line_len, 0); + for (int j = 0; j < line_len; ++j, offset += stride) { + buf[offset] = (FFTDir == FFT_FORWARD) ? 
line_buf[j] : line_buf[j] * div_factor; + } + } + } + m_device.deallocate(line_buf); + if (!is_power_of_two) { + m_device.deallocate(a); + m_device.deallocate(b); + m_device.deallocate(pos_j_base_powered); + } + } + + if(!write_to_out) { + for (Index i = 0; i < m_size; ++i) { + data[i] = PartOf()(buf[i]); + } + m_device.deallocate(buf); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static bool isPowerOfTwo(Index x) { + eigen_assert(x > 0); + return !(x & (x - 1)); + } + + // The composite number for padding, used in Bluestein's FFT algorithm + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Index findGoodComposite(Index n) { + Index i = 2; + while (i < 2 * n - 1) i *= 2; + return i; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Index getLog2(Index m) { + Index log2m = 0; + while (m >>= 1) log2m++; + return log2m; + } + + // Call Cooley Tukey algorithm directly, data length must be power of 2 + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void processDataLineCooleyTukey(ComplexScalar* line_buf, Index line_len, Index log_len) { + eigen_assert(isPowerOfTwo(line_len)); + scramble_FFT(line_buf, line_len); + compute_1D_Butterfly(line_buf, line_len, log_len); + } + + // Call Bluestein's FFT algorithm, m is a good composite number greater than (2 * n - 1), used as the padding length + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void processDataLineBluestein(ComplexScalar* line_buf, Index line_len, Index good_composite, Index log_len, ComplexScalar* a, ComplexScalar* b, const ComplexScalar* pos_j_base_powered) { + Index n = line_len; + Index m = good_composite; + ComplexScalar* data = line_buf; + + for (Index i = 0; i < n; ++i) { + if(FFTDir == FFT_FORWARD) { + a[i] = data[i] * numext::conj(pos_j_base_powered[i]); + } + else { + a[i] = data[i] * pos_j_base_powered[i]; + } + } + for (Index i = n; i < m; ++i) { + a[i] = ComplexScalar(0, 0); + } + + for (Index i = 0; i < n; ++i) { + if(FFTDir == FFT_FORWARD) { + b[i] = pos_j_base_powered[i]; + } + else { + b[i] = 
numext::conj(pos_j_base_powered[i]); + } + } + for (Index i = n; i < m - n; ++i) { + b[i] = ComplexScalar(0, 0); + } + for (Index i = m - n; i < m; ++i) { + if(FFTDir == FFT_FORWARD) { + b[i] = pos_j_base_powered[m-i]; + } + else { + b[i] = numext::conj(pos_j_base_powered[m-i]); + } + } + + scramble_FFT(a, m); + compute_1D_Butterfly(a, m, log_len); + + scramble_FFT(b, m); + compute_1D_Butterfly(b, m, log_len); + + for (Index i = 0; i < m; ++i) { + a[i] *= b[i]; + } + + scramble_FFT(a, m); + compute_1D_Butterfly(a, m, log_len); + + //Do the scaling after ifft + for (Index i = 0; i < m; ++i) { + a[i] /= m; + } + + for (Index i = 0; i < n; ++i) { + if(FFTDir == FFT_FORWARD) { + data[i] = a[i] * numext::conj(pos_j_base_powered[i]); + } + else { + data[i] = a[i] * pos_j_base_powered[i]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void scramble_FFT(ComplexScalar* data, Index n) { + eigen_assert(isPowerOfTwo(n)); + Index j = 1; + for (Index i = 1; i < n; ++i){ + if (j > i) { + std::swap(data[j-1], data[i-1]); + } + Index m = n >> 1; + while (m >= 2 && j > m) { + j -= m; + m >>= 1; + } + j += m; + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_2(ComplexScalar* data) { + ComplexScalar tmp = data[1]; + data[1] = data[0] - data[1]; + data[0] += tmp; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_4(ComplexScalar* data) { + ComplexScalar tmp[4]; + tmp[0] = data[0] + data[1]; + tmp[1] = data[0] - data[1]; + tmp[2] = data[2] + data[3]; + if (Dir == FFT_FORWARD) { + tmp[3] = ComplexScalar(0.0, -1.0) * (data[2] - data[3]); + } else { + tmp[3] = ComplexScalar(0.0, 1.0) * (data[2] - data[3]); + } + data[0] = tmp[0] + tmp[2]; + data[1] = tmp[1] + tmp[3]; + data[2] = tmp[0] - tmp[2]; + data[3] = tmp[1] - tmp[3]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_8(ComplexScalar* data) { + ComplexScalar tmp_1[8]; + ComplexScalar tmp_2[8]; + + tmp_1[0] = data[0] + data[1]; + tmp_1[1] = data[0] - 
data[1]; + tmp_1[2] = data[2] + data[3]; + if (Dir == FFT_FORWARD) { + tmp_1[3] = (data[2] - data[3]) * ComplexScalar(0, -1); + } else { + tmp_1[3] = (data[2] - data[3]) * ComplexScalar(0, 1); + } + tmp_1[4] = data[4] + data[5]; + tmp_1[5] = data[4] - data[5]; + tmp_1[6] = data[6] + data[7]; + if (Dir == FFT_FORWARD) { + tmp_1[7] = (data[6] - data[7]) * ComplexScalar(0, -1); + } else { + tmp_1[7] = (data[6] - data[7]) * ComplexScalar(0, 1); + } + tmp_2[0] = tmp_1[0] + tmp_1[2]; + tmp_2[1] = tmp_1[1] + tmp_1[3]; + tmp_2[2] = tmp_1[0] - tmp_1[2]; + tmp_2[3] = tmp_1[1] - tmp_1[3]; + tmp_2[4] = tmp_1[4] + tmp_1[6]; +// SQRT2DIV2 = sqrt(2)/2 +#define SQRT2DIV2 0.7071067811865476 + if (Dir == FFT_FORWARD) { + tmp_2[5] = (tmp_1[5] + tmp_1[7]) * ComplexScalar(SQRT2DIV2, -SQRT2DIV2); + tmp_2[6] = (tmp_1[4] - tmp_1[6]) * ComplexScalar(0, -1); + tmp_2[7] = (tmp_1[5] - tmp_1[7]) * ComplexScalar(-SQRT2DIV2, -SQRT2DIV2); + } else { + tmp_2[5] = (tmp_1[5] + tmp_1[7]) * ComplexScalar(SQRT2DIV2, SQRT2DIV2); + tmp_2[6] = (tmp_1[4] - tmp_1[6]) * ComplexScalar(0, 1); + tmp_2[7] = (tmp_1[5] - tmp_1[7]) * ComplexScalar(-SQRT2DIV2, SQRT2DIV2); + } + data[0] = tmp_2[0] + tmp_2[4]; + data[1] = tmp_2[1] + tmp_2[5]; + data[2] = tmp_2[2] + tmp_2[6]; + data[3] = tmp_2[3] + tmp_2[7]; + data[4] = tmp_2[0] - tmp_2[4]; + data[5] = tmp_2[1] - tmp_2[5]; + data[6] = tmp_2[2] - tmp_2[6]; + data[7] = tmp_2[3] - tmp_2[7]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_1D_merge( + ComplexScalar* data, Index n, Index n_power_of_2) { + // Original code: + // RealScalar wtemp = std::sin(M_PI/n); + // RealScalar wpi = -std::sin(2 * M_PI/n); + const RealScalar wtemp = m_sin_PI_div_n_LUT[n_power_of_2]; + const RealScalar wpi = (Dir == FFT_FORWARD) + ? 
m_minus_sin_2_PI_div_n_LUT[n_power_of_2] + : -m_minus_sin_2_PI_div_n_LUT[n_power_of_2]; + + const ComplexScalar wp(wtemp, wpi); + const ComplexScalar wp_one = wp + ComplexScalar(1, 0); + const ComplexScalar wp_one_2 = wp_one * wp_one; + const ComplexScalar wp_one_3 = wp_one_2 * wp_one; + const ComplexScalar wp_one_4 = wp_one_3 * wp_one; + const Index n2 = n / 2; + ComplexScalar w(1.0, 0.0); + for (Index i = 0; i < n2; i += 4) { + ComplexScalar temp0(data[i + n2] * w); + ComplexScalar temp1(data[i + 1 + n2] * w * wp_one); + ComplexScalar temp2(data[i + 2 + n2] * w * wp_one_2); + ComplexScalar temp3(data[i + 3 + n2] * w * wp_one_3); + w = w * wp_one_4; + + data[i + n2] = data[i] - temp0; + data[i] += temp0; + + data[i + 1 + n2] = data[i + 1] - temp1; + data[i + 1] += temp1; + + data[i + 2 + n2] = data[i + 2] - temp2; + data[i + 2] += temp2; + + data[i + 3 + n2] = data[i + 3] - temp3; + data[i + 3] += temp3; + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_1D_Butterfly( + ComplexScalar* data, Index n, Index n_power_of_2) { + eigen_assert(isPowerOfTwo(n)); + if (n > 8) { + compute_1D_Butterfly(data, n / 2, n_power_of_2 - 1); + compute_1D_Butterfly(data + n / 2, n / 2, n_power_of_2 - 1); + butterfly_1D_merge(data, n, n_power_of_2); + } else if (n == 8) { + butterfly_8(data); + } else if (n == 4) { + butterfly_4(data); + } else if (n == 2) { + butterfly_2(data); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getBaseOffsetFromIndex(Index index, Index omitted_dim) const { + Index result = 0; + + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > omitted_dim; --i) { + const Index partial_m_stride = m_strides[i] / m_dimensions[omitted_dim]; + const Index idx = index / partial_m_stride; + index -= idx * partial_m_stride; + result += idx * m_strides[i]; + } + result += index; + } + else { + for (Index i = 0; i < omitted_dim; ++i) { + const Index partial_m_stride = m_strides[i] / m_dimensions[omitted_dim]; + 
const Index idx = index / partial_m_stride; + index -= idx * partial_m_stride; + result += idx * m_strides[i]; + } + result += index; + } + // Value of index_coords[omitted_dim] is not determined to this step + return result; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getIndexFromOffset(Index base, Index omitted_dim, Index offset) const { + Index result = base + offset * m_strides[omitted_dim] ; + return result; + } + + protected: + Index m_size; + const FFT EIGEN_DEVICE_REF m_fft; + Dimensions m_dimensions; + array m_strides; + TensorEvaluator m_impl; + EvaluatorPointerType m_data; + const Device EIGEN_DEVICE_REF m_device; + + // This will support a maximum FFT size of 2^32 for each dimension + // m_sin_PI_div_n_LUT[i] = (-2) * std::sin(M_PI / std::pow(2,i)) ^ 2; + const RealScalar m_sin_PI_div_n_LUT[32] = { + RealScalar(0.0), + RealScalar(-2), + RealScalar(-0.999999999999999), + RealScalar(-0.292893218813453), + RealScalar(-0.0761204674887130), + RealScalar(-0.0192147195967696), + RealScalar(-0.00481527332780311), + RealScalar(-0.00120454379482761), + RealScalar(-3.01181303795779e-04), + RealScalar(-7.52981608554592e-05), + RealScalar(-1.88247173988574e-05), + RealScalar(-4.70619042382852e-06), + RealScalar(-1.17654829809007e-06), + RealScalar(-2.94137117780840e-07), + RealScalar(-7.35342821488550e-08), + RealScalar(-1.83835707061916e-08), + RealScalar(-4.59589268710903e-09), + RealScalar(-1.14897317243732e-09), + RealScalar(-2.87243293150586e-10), + RealScalar( -7.18108232902250e-11), + RealScalar(-1.79527058227174e-11), + RealScalar(-4.48817645568941e-12), + RealScalar(-1.12204411392298e-12), + RealScalar(-2.80511028480785e-13), + RealScalar(-7.01277571201985e-14), + RealScalar(-1.75319392800498e-14), + RealScalar(-4.38298482001247e-15), + RealScalar(-1.09574620500312e-15), + RealScalar(-2.73936551250781e-16), + RealScalar(-6.84841378126949e-17), + RealScalar(-1.71210344531737e-17), + RealScalar(-4.28025861329343e-18) + }; + + // 
m_minus_sin_2_PI_div_n_LUT[i] = -std::sin(2 * M_PI / std::pow(2,i)); + const RealScalar m_minus_sin_2_PI_div_n_LUT[32] = { + RealScalar(0.0), + RealScalar(0.0), + RealScalar(-1.00000000000000e+00), + RealScalar(-7.07106781186547e-01), + RealScalar(-3.82683432365090e-01), + RealScalar(-1.95090322016128e-01), + RealScalar(-9.80171403295606e-02), + RealScalar(-4.90676743274180e-02), + RealScalar(-2.45412285229123e-02), + RealScalar(-1.22715382857199e-02), + RealScalar(-6.13588464915448e-03), + RealScalar(-3.06795676296598e-03), + RealScalar(-1.53398018628477e-03), + RealScalar(-7.66990318742704e-04), + RealScalar(-3.83495187571396e-04), + RealScalar(-1.91747597310703e-04), + RealScalar(-9.58737990959773e-05), + RealScalar(-4.79368996030669e-05), + RealScalar(-2.39684498084182e-05), + RealScalar(-1.19842249050697e-05), + RealScalar(-5.99211245264243e-06), + RealScalar(-2.99605622633466e-06), + RealScalar(-1.49802811316901e-06), + RealScalar(-7.49014056584716e-07), + RealScalar(-3.74507028292384e-07), + RealScalar(-1.87253514146195e-07), + RealScalar(-9.36267570730981e-08), + RealScalar(-4.68133785365491e-08), + RealScalar(-2.34066892682746e-08), + RealScalar(-1.17033446341373e-08), + RealScalar(-5.85167231706864e-09), + RealScalar(-2.92583615853432e-09) + }; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FFT_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorFixedSize.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorFixedSize.h new file mode 100644 index 0000000..ca39bb8 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorFixedSize.h @@ -0,0 +1,379 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H +#define EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H + +namespace Eigen { + +/** \class TensorFixedSize + * \ingroup CXX11_Tensor_Module + * + * \brief The fixed sized version of the tensor class. + * + * The fixed sized equivalent of + * Eigen::Tensor t(3, 5, 7); + * is + * Eigen::TensorFixedSize> t; + */ + +template +class TensorFixedSize : public TensorBase > +{ + public: + typedef TensorFixedSize Self; + typedef TensorBase > Base; + typedef typename Eigen::internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + typedef Scalar_ Scalar; + typedef typename NumTraits::Real RealScalar; + typedef typename Base::CoeffReturnType CoeffReturnType; + + static const int Options = Options_; + + enum { + IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0), + PacketAccess = (internal::packet_traits::size > 1), + BlockAccess = false, + PreferBlockAccess = false, + Layout = Options_ & RowMajor ? 
RowMajor : ColMajor, + CoordAccess = true, + RawAccess = true + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + typedef Dimensions_ Dimensions; + static const std::size_t NumIndices = Dimensions::count; + + protected: + TensorStorage m_storage; + + public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); } + + // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED + // work, because that uses base().coeffRef() - and we don't yet + // implement a similar class hierarchy + inline Self& base() { return *this; } + inline const Self& base() const { return *this; } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index firstIndex, IndexTypes... otherIndices) const + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. 
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeff(array{{firstIndex, otherIndices...}}); + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& coeff(const array& indices) const + { + eigen_internal_assert(checkIndexRange(indices)); + return m_storage.data()[linearizedIndex(indices)]; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const + { + eigen_internal_assert(index >= 0 && index < size()); + return m_storage.data()[index]; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& coeff() const + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return m_storage.data()[0]; + } + + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices) + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeffRef(array{{firstIndex, otherIndices...}}); + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(const array& indices) + { + eigen_internal_assert(checkIndexRange(indices)); + return m_storage.data()[linearizedIndex(indices)]; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) + { + eigen_internal_assert(index >= 0 && index < size()); + return m_storage.data()[index]; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef() + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return m_storage.data()[0]; + } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) const + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. 
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return this->operator()(array{{firstIndex, otherIndices...}}); + } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const + { + if (Options&RowMajor) { + const Index index = i1 + i0 * m_storage.dimensions()[1]; + return m_storage.data()[index]; + } else { + const Index index = i0 + i1 * m_storage.dimensions()[0]; + return m_storage.data()[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const + { + if (Options&RowMajor) { + const Index index = i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0); + return m_storage.data()[index]; + } else { + const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * i2); + return m_storage.data()[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const + { + if (Options&RowMajor) { + const Index index = i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0)); + return m_storage.data()[index]; + } else { + const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * i3)); + return m_storage.data()[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const + { + if (Options&RowMajor) { + const Index index = i4 + m_storage.dimensions()[4] * (i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0))); + return m_storage.data()[index]; + } else { + const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * (i3 + m_storage.dimensions()[3] * i4))); + return m_storage.data()[index]; + } + } +#endif + + + EIGEN_DEVICE_FUNC + 
EIGEN_STRONG_INLINE const Scalar& operator()(const array& indices) const + { + eigen_assert(checkIndexRange(indices)); + return coeff(indices); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const + { + eigen_internal_assert(index >= 0 && index < size()); + return coeff(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator()() const + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeff(); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const + { + // The bracket operator is only for vectors, use the parenthesis operator instead. + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeff(index); + } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) + { + // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. 
+ EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + return operator()(array{{firstIndex, otherIndices...}}); + } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1) + { + if (Options&RowMajor) { + const Index index = i1 + i0 * m_storage.dimensions()[1]; + return m_storage.data()[index]; + } else { + const Index index = i0 + i1 * m_storage.dimensions()[0]; + return m_storage.data()[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2) + { + if (Options&RowMajor) { + const Index index = i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0); + return m_storage.data()[index]; + } else { + const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * i2); + return m_storage.data()[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3) + { + if (Options&RowMajor) { + const Index index = i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0)); + return m_storage.data()[index]; + } else { + const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * i3)); + return m_storage.data()[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) + { + if (Options&RowMajor) { + const Index index = i4 + m_storage.dimensions()[4] * (i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0))); + return m_storage.data()[index]; + } else { + const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * (i3 + m_storage.dimensions()[3] * i4))); + return m_storage.data()[index]; + } + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(const array& 
indices) + { + eigen_assert(checkIndexRange(indices)); + return coeffRef(indices); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index index) + { + eigen_assert(index >= 0 && index < size()); + return coeffRef(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()() + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + return coeffRef(); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator[](Index index) + { + // The bracket operator is only for vectors, use the parenthesis operator instead + EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + return coeffRef(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorFixedSize() + : m_storage() + { + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorFixedSize(const Self& other) + : m_storage(other.m_storage) + { + } + +#if EIGEN_HAS_RVALUE_REFERENCES + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize(Self&& other) + : m_storage(other.m_storage) + { + } +#endif + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorFixedSize(const TensorBase& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other.derived()); + internal::TensorExecutor::run(assign, DefaultDevice()); + } + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorFixedSize(const TensorBase& other) + { + typedef TensorAssignOp Assign; + Assign assign(*this, other.derived()); + internal::TensorExecutor::run(assign, DefaultDevice()); + } + + // FIXME: check that the dimensions of other match the dimensions of *this. + // Unfortunately this isn't possible yet when the rhs is an expression. 
+ EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(TensorFixedSize) + + + protected: + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE bool checkIndexRange(const array& /*indices*/) const + { + using internal::array_apply_and_reduce; + using internal::array_zip_and_reduce; + using internal::greater_equal_zero_op; + using internal::logical_and_op; + using internal::lesser_op; + + return true; + // check whether the indices are all >= 0 + /* array_apply_and_reduce(indices) && + // check whether the indices fit in the dimensions + array_zip_and_reduce(indices, m_storage.dimensions());*/ + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index linearizedIndex(const array& indices) const + { + if (Options&RowMajor) { + return m_storage.dimensions().IndexOfRowMajor(indices); + } else { + return m_storage.dimensions().IndexOfColMajor(indices); + } + } +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorForcedEval.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorForcedEval.h new file mode 100644 index 0000000..e800ded --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorForcedEval.h @@ -0,0 +1,237 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H +#define EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H + +namespace Eigen { + +/** \class TensorForcedEval + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reshaping class. + * + * + */ +namespace internal { +template +struct traits > +{ + // Type promotion to handle the case where the types of the lhs and the rhs are different. 
+ typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename traits::StorageKind StorageKind; + typedef typename traits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; + + enum { + Flags = 0 + }; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorForcedEvalOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorForcedEvalOp type; +}; + +} // end namespace internal + + + +template +class TensorForcedEvalOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorForcedEvalOp(const XprType& expr) + : m_xpr(expr) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; +}; + +namespace internal { +template +struct non_integral_type_placement_new{ + template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index numValues, StorageType m_buffer) { + // Initialize non-trivially constructible types. 
+ if (!internal::is_arithmetic::value) { + for (Index i = 0; i < numValues; ++i) new (m_buffer + i) CoeffReturnType(); + } +} +}; + +// SYCL does not support non-integral types +// having new (m_buffer + i) CoeffReturnType() causes the following compiler error for SYCL Devices +// no matching function for call to 'operator new' +template +struct non_integral_type_placement_new { + template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index, StorageType) { +} +}; +} // end namespace internal + +template +struct TensorEvaluator, Device> +{ + typedef const typename internal::remove_all::type ArgType; + typedef TensorForcedEvalOp XprType; + typedef typename ArgType::Scalar Scalar; + typedef typename TensorEvaluator::Dimensions Dimensions; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = PacketType::size; + typedef typename Eigen::internal::traits::PointerType TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + enum { + IsAligned = true, + PacketAccess = (PacketType::size > 1), + BlockAccess = internal::is_arithmetic::value, + PreferBlockAccess = false, + Layout = TensorEvaluator::Layout, + RawAccess = true + }; + + static const int NumDims = internal::traits::NumDimensions; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock + TensorBlock; + //===--------------------------------------------------------------------===// + + TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_op(op.expression()), + m_device(device), m_buffer(NULL) + { } + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return 
m_impl.dimensions(); } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + const Index numValues = internal::array_prod(m_impl.dimensions()); + m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp(numValues * sizeof(CoeffReturnType))); + + internal::non_integral_type_placement_new()(numValues, m_buffer); + + typedef TensorEvalToOp< const typename internal::remove_const::type > EvalTo; + EvalTo evalToTmp(m_device.get(m_buffer), m_op); + + internal::TensorExecutor< + const EvalTo, typename internal::remove_const::type, + /*Vectorizable=*/internal::IsVectorizable::value, + /*Tiling=*/internal::IsTileable::value>:: + run(evalToTmp, m_device); + + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + const Index numValues = internal::array_prod(m_impl.dimensions()); + m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp( + numValues * sizeof(CoeffReturnType))); + typedef TensorEvalToOp::type> + EvalTo; + EvalTo evalToTmp(m_device.get(m_buffer), m_op); + + auto on_done = std::bind([](EvalSubExprsCallback done_) { done_(true); }, + std::move(done)); + internal::TensorAsyncExecutor< + const EvalTo, typename internal::remove_const::type, + decltype(on_done), + /*Vectorizable=*/internal::IsVectorizable::value, + /*Tiling=*/internal::IsTileable::value>:: + runAsync(evalToTmp, m_device, std::move(on_done)); + } +#endif + + EIGEN_STRONG_INLINE void cleanup() { + m_device.deallocate_temp(m_buffer); + m_buffer = NULL; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_buffer[index]; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return internal::ploadt(m_buffer + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + return 
internal::TensorBlockResourceRequirements::any(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + assert(m_buffer != NULL); + return TensorBlock::materialize(m_buffer, m_impl.dimensions(), desc, scratch); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + EvaluatorPointerType data() const { return m_buffer; } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_buffer.bind(cgh); + m_impl.bind(cgh); + } +#endif + private: + TensorEvaluator m_impl; + const ArgType m_op; + const Device EIGEN_DEVICE_REF m_device; + EvaluatorPointerType m_buffer; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorForwardDeclarations.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorForwardDeclarations.h new file mode 100644 index 0000000..246ebe4 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -0,0 +1,191 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H +#define EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H + +namespace Eigen { + +// MakePointer class is used as a container of the address space of the pointer +// on the host and on the device. 
From the host side it generates the T* pointer +// and when EIGEN_USE_SYCL is used it construct a buffer with a map_allocator to +// T* m_data on the host. It is always called on the device. +// Specialisation of MakePointer class for creating the sycl buffer with +// map_allocator. +template struct MakePointer { + typedef T* Type; + typedef const T* ConstType; +}; + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T* constCast(const T* data) { + return const_cast(data); +} + +// The StorageMemory class is a container of the device specific pointer +// used for refering to a Pointer on TensorEvaluator class. While the TensorExpression +// is a device-agnostic type and need MakePointer class for type conversion, +// the TensorEvaluator class can be specialized for a device, hence it is possible +// to construct different types of temproray storage memory in TensorEvaluator +// for different devices by specializing the following StorageMemory class. +template struct StorageMemory: MakePointer {}; + +namespace internal{ +template struct Pointer_type_promotion { + static const bool val=false; +}; +template struct Pointer_type_promotion { + static const bool val = true; +}; +template struct TypeConversion { + typedef A* type; +}; +} + + +template class MakePointer_ = MakePointer> class TensorMap; +template class Tensor; +template class TensorFixedSize; +template class TensorRef; +template class TensorBase; + +template class TensorCwiseNullaryOp; +template class TensorCwiseUnaryOp; +template class TensorCwiseBinaryOp; +template class TensorCwiseTernaryOp; +template class TensorSelectOp; +template class MakePointer_ = MakePointer > class TensorReductionOp; +template class TensorIndexTupleOp; +template class TensorTupleReducerOp; +template class TensorConcatenationOp; +template class TensorContractionOp; +template class TensorConversionOp; +template class TensorConvolutionOp; +template class TensorFFTOp; +template class TensorPatchOp; +template class TensorImagePatchOp; 
+template class TensorVolumePatchOp; +template class TensorBroadcastingOp; +template class TensorChippingOp; +template class TensorReshapingOp; +template class TensorLayoutSwapOp; +template class TensorSlicingOp; +template class TensorReverseOp; +template class TensorPaddingOp; +template class TensorShufflingOp; +template class TensorStridingOp; +template class TensorStridingSlicingOp; +template class TensorInflationOp; +template class TensorGeneratorOp; +template class TensorAssignOp; +template class TensorScanOp; +template class TensorTraceOp; + +template class TensorCustomUnaryOp; +template class TensorCustomBinaryOp; + +template class MakePointer_ = MakePointer> class TensorEvalToOp; +template class TensorForcedEvalOp; + +template class TensorDevice; +template class TensorAsyncDevice; +template struct TensorEvaluator; + +struct NoOpOutputKernel; + +struct DefaultDevice; +struct ThreadPoolDevice; +struct GpuDevice; +struct SyclDevice; + +#ifdef EIGEN_USE_SYCL + +template struct MakeSYCLPointer { + typedef Eigen::TensorSycl::internal::RangeAccess Type; +}; + +template +EIGEN_STRONG_INLINE const Eigen::TensorSycl::internal::RangeAccess& +constCast(const Eigen::TensorSycl::internal::RangeAccess& data) { + return data; +} + +template +struct StorageMemory : MakeSYCLPointer {}; +template +struct StorageMemory : StorageMemory {}; + +namespace TensorSycl { +namespace internal{ +template class GenericNondeterministicReducer; +} +} +#endif + + +enum FFTResultType { + RealPart = 0, + ImagPart = 1, + BothParts = 2 +}; + +enum FFTDirection { + FFT_FORWARD = 0, + FFT_REVERSE = 1 +}; + + +namespace internal { + +template +struct IsVectorizable { + static const bool value = TensorEvaluator::PacketAccess; +}; + +template +struct IsVectorizable { + static const bool value = TensorEvaluator::PacketAccess && + TensorEvaluator::IsAligned; +}; + +// Tiled evaluation strategy. 
+enum TiledEvaluation { + Off = 0, // tiled evaluation is not supported + On = 1, // still work in progress (see TensorBlock.h) +}; + +template +struct IsTileable { + // Check that block evaluation is supported and it's a preferred option (at + // least one sub-expression has much faster block evaluation, e.g. + // broadcasting). + static const bool BlockAccess = + TensorEvaluator::BlockAccess && + TensorEvaluator::PreferBlockAccess; + + static const TiledEvaluation value = + BlockAccess ? TiledEvaluation::On : TiledEvaluation::Off; +}; + +template ::value, + TiledEvaluation Tiling = IsTileable::value> +class TensorExecutor; + +template ::value, + TiledEvaluation Tiling = IsTileable::value> +class TensorAsyncExecutor; + + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorFunctors.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorFunctors.h new file mode 100644 index 0000000..d963032 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorFunctors.h @@ -0,0 +1,488 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H +#define EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H + +namespace Eigen { +namespace internal { + + +/** \internal + * \brief Template functor to compute the modulo between an array and a scalar. 
+ */ +template +struct scalar_mod_op { + EIGEN_DEVICE_FUNC scalar_mod_op(const Scalar& divisor) : m_divisor(divisor) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a % m_divisor; } + const Scalar m_divisor; +}; +template +struct functor_traits > +{ enum { Cost = scalar_div_cost::value, PacketAccess = false }; }; + + +/** \internal + * \brief Template functor to compute the modulo between 2 arrays. + */ +template +struct scalar_mod2_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; } +}; +template +struct functor_traits > +{ enum { Cost = scalar_div_cost::value, PacketAccess = false }; }; + +template +struct scalar_fmod_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar + operator()(const Scalar& a, const Scalar& b) const { + return numext::fmod(a, b); + } +}; +template +struct functor_traits > { + enum { Cost = 13, // Reciprocal throughput of FPREM on Haswell. 
+ PacketAccess = false }; +}; + +template +struct reducer_traits { + enum { + Cost = 1, + PacketAccess = false, + IsStateful = false, + IsExactlyAssociative = true + }; +}; + +// Standard reduction functors +template struct SumReducer +{ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + internal::scalar_sum_op sum_op; + *accum = sum_op(*accum, t); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + (*accum) = padd(*accum, p); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + internal::scalar_cast_op conv; + return conv(0); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1(initialize()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + internal::scalar_sum_op sum_op; + return sum_op(saccum, predux(vaccum)); + } +}; + +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = PacketType::HasAdd, + IsStateful = false, + IsExactlyAssociative = NumTraits::IsInteger + }; +}; + +template struct MeanReducer +{ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + MeanReducer() : scalarCount_(0), packetCount_(0) { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) { + internal::scalar_sum_op sum_op; + *accum = sum_op(*accum, t); + scalarCount_++; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) { + (*accum) = padd(*accum, p); + packetCount_++; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + internal::scalar_cast_op conv; + return conv(0); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE 
Packet initializePacket() const { + return pset1(initialize()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + internal::scalar_quotient_op quotient_op; + return quotient_op(accum, T(scalarCount_)); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return pdiv(vaccum, pset1(T(packetCount_))); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + internal::scalar_sum_op sum_op; + internal::scalar_quotient_op quotient_op; + return quotient_op( + sum_op(saccum, predux(vaccum)), + T(scalarCount_ + packetCount_ * unpacket_traits::size)); + } + + protected: + DenseIndex scalarCount_; + DenseIndex packetCount_; +}; + +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = PacketType::HasAdd && + PacketType::HasDiv && !NumTraits::IsInteger, + IsStateful = true, + IsExactlyAssociative = NumTraits::IsInteger + }; +}; + + +template +struct MinMaxBottomValue { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { + return Eigen::NumTraits::lowest(); + } +}; +template +struct MinMaxBottomValue { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { + return -Eigen::NumTraits::infinity(); + } +}; +template +struct MinMaxBottomValue { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { + return Eigen::NumTraits::highest(); + } +}; +template +struct MinMaxBottomValue { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { + return Eigen::NumTraits::infinity(); + } +}; + + +template struct MaxReducer +{ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + scalar_max_op op; + *accum = op(t, *accum); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + scalar_max_op op; + (*accum) = op.packetOp(*accum, p); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE 
T initialize() const { + return MinMaxBottomValue::IsInteger>::bottom_value(); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1(initialize()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + scalar_max_op op; + return op(saccum, op.predux(vaccum)); + } +}; + +template + struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = PacketType::HasMax, + IsStateful = false, + IsExactlyAssociative = (NaNPropagation!=PropagateFast) + }; +}; + +template struct MinReducer +{ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + scalar_min_op op; + *accum = op(t, *accum); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + scalar_min_op op; + (*accum) = op.packetOp(*accum, p); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return MinMaxBottomValue::IsInteger>::bottom_value(); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1(initialize()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + scalar_min_op op; + return op(saccum, op.predux(vaccum)); + } +}; + +template + struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = PacketType::HasMin, + IsStateful = false, + IsExactlyAssociative = (NaNPropagation!=PropagateFast) + }; +}; + 
+template struct ProdReducer +{ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + internal::scalar_product_op prod_op; + (*accum) = prod_op(*accum, t); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { + (*accum) = pmul(*accum, p); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + internal::scalar_cast_op conv; + return conv(1); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return pset1(initialize()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { + return vaccum; + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { + internal::scalar_product_op prod_op; + return prod_op(saccum, predux_mul(vaccum)); + } +}; + +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::MulCost, + PacketAccess = PacketType::HasMul, + IsStateful = false, + IsExactlyAssociative = true + }; +}; + + +struct AndReducer +{ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const { + *accum = *accum && t; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool initialize() const { + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool finalize(bool accum) const { + return accum; + } +}; + +template +struct reducer_traits { + enum { + Cost = 1, + PacketAccess = false, + IsStateful = false, + IsExactlyAssociative = true + }; +}; + + +struct OrReducer { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const { + *accum = *accum || t; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool initialize() const { + return false; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool finalize(bool accum) const { + return accum; + } +}; + +template +struct reducer_traits { + enum { + 
Cost = 1, + PacketAccess = false, + IsStateful = false, + IsExactlyAssociative = true + }; +}; + +// Argmin/Argmax reducers. Returns the first occurrence if multiple locations +// contain the same min/max value. +template struct ArgMaxTupleReducer +{ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + if (t.second < accum->second) { + return; + } else if (t.second > accum->second || accum->first > t.first ) { + *accum = t; + } + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return T(0, NumTraits::lowest()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T& accum) const { + return accum; + } +}; + +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = false, + IsStateful = false, + IsExactlyAssociative = true + }; +}; + + +template struct ArgMinTupleReducer +{ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T& t, T* accum) const { + if (t.second > accum->second) { + return; + } else if (t.second < accum->second || accum->first > t.first) { + *accum = t; + } + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return T(0, NumTraits::highest()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T& accum) const { + return accum; + } +}; + +template +struct reducer_traits, Device> { + enum { + Cost = NumTraits::AddCost, + PacketAccess = false, + IsStateful = false, + IsExactlyAssociative = true + }; +}; + + +template +class GaussianGenerator { + public: + static const bool PacketAccess = false; + + EIGEN_DEVICE_FUNC GaussianGenerator(const array& means, + const array& std_devs) + : m_means(means) + { + EIGEN_UNROLL_LOOP + for (size_t i = 0; i < NumDims; ++i) { + m_two_sigmas[i] = std_devs[i] * std_devs[i] * 2; + } + } + + EIGEN_DEVICE_FUNC T operator()(const array& coordinates) const { + T tmp = T(0); + EIGEN_UNROLL_LOOP + for (size_t i = 0; i < NumDims; ++i) { + T offset = coordinates[i] - m_means[i]; + tmp += offset * 
offset / m_two_sigmas[i]; + } + return numext::exp(-tmp); + } + + private: + array m_means; + array m_two_sigmas; +}; + +template +struct functor_traits > { + enum { + Cost = NumDims * (2 * NumTraits::AddCost + NumTraits::MulCost + + functor_traits >::Cost) + + functor_traits >::Cost, + PacketAccess = GaussianGenerator::PacketAccess + }; +}; + +template +struct scalar_clamp_op { + EIGEN_DEVICE_FUNC inline scalar_clamp_op(const Scalar& _min, const Scalar& _max) : m_min(_min), m_max(_max) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar + operator()(const Scalar& x) const { + return numext::mini(numext::maxi(x, m_min), m_max); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet + packetOp(const Packet& x) const { + return internal::pmin(internal::pmax(x, pset1(m_min)), pset1(m_max)); + } + const Scalar m_min; + const Scalar m_max; +}; +template +struct functor_traits > +{ enum { Cost = 2 * NumTraits::AddCost, PacketAccess = (packet_traits::HasMin && packet_traits::HasMax)}; }; + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorGenerator.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorGenerator.h new file mode 100644 index 0000000..174bf06 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorGenerator.h @@ -0,0 +1,302 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H +#define EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H + +namespace Eigen { + +/** \class TensorGeneratorOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor generator class. 
+ * + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorGeneratorOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorGeneratorOp type; +}; + +} // end namespace internal + + + +template +class TensorGeneratorOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorGeneratorOp(const XprType& expr, const Generator& generator) + : m_xpr(expr), m_generator(generator) {} + + EIGEN_DEVICE_FUNC + const Generator& generator() const { return m_generator; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const Generator m_generator; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorGeneratorOp XprType; + typedef typename XprType::Index Index; + typedef typename TensorEvaluator::Dimensions Dimensions; + static const int NumDims = internal::array_size::value; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename 
PacketType::type PacketReturnType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + enum { + IsAligned = false, + PacketAccess = (PacketType::size > 1), + BlockAccess = true, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + typedef internal::TensorIntDivisor IndexDivisor; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock + TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_device(device), m_generator(op.generator()) + { + TensorEvaluator argImpl(op.expression(), device); + m_dimensions = argImpl.dimensions(); + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_strides[0] = 1; + EIGEN_UNROLL_LOOP + for (int i = 1; i < NumDims; ++i) { + m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1]; + if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]); + } + } else { + m_strides[NumDims - 1] = 1; + EIGEN_UNROLL_LOOP + for (int i = NumDims - 2; i >= 0; --i) { + m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1]; + if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + return true; + } + EIGEN_STRONG_INLINE void cleanup() { + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + array coords; + extract_coordinates(index, coords); + return m_generator(coords); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType 
packet(Index index) const + { + const int packetSize = PacketType::size; + EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < dimensions().TotalSize()); + + EIGEN_ALIGN_MAX typename internal::remove_const::type values[packetSize]; + for (int i = 0; i < packetSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + const size_t target_size = m_device.firstLevelCacheSize(); + // TODO(ezhulenev): Generator should have a cost. + return internal::TensorBlockResourceRequirements::skewed( + target_size); + } + + struct BlockIteratorState { + Index stride; + Index span; + Index size; + Index count; + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + static const bool is_col_major = + static_cast(Layout) == static_cast(ColMajor); + + // Compute spatial coordinates for the first block element. + array coords; + extract_coordinates(desc.offset(), coords); + array initial_coords = coords; + + // Offset in the output block buffer. + Index offset = 0; + + // Initialize output block iterator state. Dimension in this array are + // always in inner_most -> outer_most order (col major layout). + array it; + for (int i = 0; i < NumDims; ++i) { + const int dim = is_col_major ? i : NumDims - 1 - i; + it[i].size = desc.dimension(dim); + it[i].stride = i == 0 ? 1 : (it[i - 1].size * it[i - 1].stride); + it[i].span = it[i].stride * (it[i].size - 1); + it[i].count = 0; + } + eigen_assert(it[0].stride == 1); + + // Prepare storage for the materialized generator result. 
+ const typename TensorBlock::Storage block_storage = + TensorBlock::prepareStorage(desc, scratch); + + CoeffReturnType* block_buffer = block_storage.data(); + + static const int packet_size = PacketType::size; + + static const int inner_dim = is_col_major ? 0 : NumDims - 1; + const Index inner_dim_size = it[0].size; + const Index inner_dim_vectorized = inner_dim_size - packet_size; + + while (it[NumDims - 1].count < it[NumDims - 1].size) { + Index i = 0; + // Generate data for the vectorized part of the inner-most dimension. + for (; i <= inner_dim_vectorized; i += packet_size) { + for (Index j = 0; j < packet_size; ++j) { + array j_coords = coords; // Break loop dependence. + j_coords[inner_dim] += j; + *(block_buffer + offset + i + j) = m_generator(j_coords); + } + coords[inner_dim] += packet_size; + } + // Finalize non-vectorized part of the inner-most dimension. + for (; i < inner_dim_size; ++i) { + *(block_buffer + offset + i) = m_generator(coords); + coords[inner_dim]++; + } + coords[inner_dim] = initial_coords[inner_dim]; + + // For the 1d tensor we need to generate only one inner-most dimension. + if (NumDims == 1) break; + + // Update offset. + for (i = 1; i < NumDims; ++i) { + if (++it[i].count < it[i].size) { + offset += it[i].stride; + coords[is_col_major ? i : NumDims - 1 - i]++; + break; + } + if (i != NumDims - 1) it[i].count = 0; + coords[is_col_major ? i : NumDims - 1 - i] = + initial_coords[is_col_major ? i : NumDims - 1 - i]; + offset -= it[i].span; + } + } + + return block_storage.AsTensorMaterializedBlock(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool) const { + // TODO(rmlarsen): This is just a placeholder. Define interface to make + // generators return their cost. 
+ return TensorOpCost(0, 0, TensorOpCost::AddCost() + + TensorOpCost::MulCost()); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler&) const {} +#endif + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void extract_coordinates(Index index, array& coords) const { + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_fast_strides[i]; + index -= idx * m_strides[i]; + coords[i] = idx; + } + coords[0] = index; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_fast_strides[i]; + index -= idx * m_strides[i]; + coords[i] = idx; + } + coords[NumDims-1] = index; + } + } + + const Device EIGEN_DEVICE_REF m_device; + Dimensions m_dimensions; + array m_strides; + array m_fast_strides; + Generator m_generator; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorGlobalFunctions.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorGlobalFunctions.h new file mode 100644 index 0000000..665b861 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorGlobalFunctions.h @@ -0,0 +1,33 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Eugene Brevdo +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H +#define EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H + +namespace Eigen { + +/** \cpp11 \returns an expression of the coefficient-wise betainc(\a x, \a a, \a b) to the given tensors. 
+ * + * This function computes the regularized incomplete beta function (integral). + * + */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const + TensorCwiseTernaryOp, + const ADerived, const BDerived, const XDerived> + betainc(const ADerived& a, const BDerived& b, const XDerived& x) { + return TensorCwiseTernaryOp< + internal::scalar_betainc_op, const ADerived, + const BDerived, const XDerived>( + a, b, x, internal::scalar_betainc_op()); +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorGpuHipCudaDefines.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorGpuHipCudaDefines.h new file mode 100644 index 0000000..cb53ce2 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorGpuHipCudaDefines.h @@ -0,0 +1,99 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// Copyright (C) 2018 Deven Desai +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H) +#define EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H + +// Note that we are using EIGEN_USE_HIP here instead of EIGEN_HIPCC...this is by design +// There is code in the Tensorflow codebase that will define EIGEN_USE_GPU, but +// for some reason gets sent to the gcc/host compiler instead of the gpu/nvcc/hipcc compiler +// When compiling such files, gcc will end up trying to pick up the CUDA headers by +// default (see the code within "unsupported/Eigen/CXX11/Tensor" that is guarded by EIGEN_USE_GPU) +// This will obviously not work when trying to compile tensorflow on a system with no CUDA +// To work around this issue for HIP systems (and leave the default behaviour intact), the +// HIP tensorflow build defines EIGEN_USE_HIP when compiling all source files, and +// "unsupported/Eigen/CXX11/Tensor" has been updated to use HIP header when EIGEN_USE_HIP is +// defined. In continuation of that requirement, the guard here needs to be EIGEN_USE_HIP as well + +#if defined(EIGEN_USE_HIP) + +#define gpuStream_t hipStream_t +#define gpuDeviceProp_t hipDeviceProp_t +#define gpuError_t hipError_t +#define gpuSuccess hipSuccess +#define gpuErrorNotReady hipErrorNotReady +#define gpuGetDeviceCount hipGetDeviceCount +#define gpuGetLastError hipGetLastError +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorName hipGetErrorName +#define gpuGetErrorString hipGetErrorString +#define gpuGetDeviceProperties hipGetDeviceProperties +#define gpuStreamDefault hipStreamDefault +#define gpuGetDevice hipGetDevice +#define gpuSetDevice hipSetDevice +#define gpuMalloc hipMalloc +#define gpuFree hipFree +#define gpuMemsetAsync hipMemsetAsync +#define gpuMemcpyAsync hipMemcpyAsync +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuStreamQuery hipStreamQuery +#define 
gpuSharedMemConfig hipSharedMemConfig +#define gpuDeviceSetSharedMemConfig hipDeviceSetSharedMemConfig +#define gpuStreamSynchronize hipStreamSynchronize +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuMemcpy hipMemcpy + +#else + +#define gpuStream_t cudaStream_t +#define gpuDeviceProp_t cudaDeviceProp +#define gpuError_t cudaError_t +#define gpuSuccess cudaSuccess +#define gpuErrorNotReady cudaErrorNotReady +#define gpuGetDeviceCount cudaGetDeviceCount +#define gpuGetLastError cudaGetLastError +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorName cudaGetErrorName +#define gpuGetErrorString cudaGetErrorString +#define gpuGetDeviceProperties cudaGetDeviceProperties +#define gpuStreamDefault cudaStreamDefault +#define gpuGetDevice cudaGetDevice +#define gpuSetDevice cudaSetDevice +#define gpuMalloc cudaMalloc +#define gpuFree cudaFree +#define gpuMemsetAsync cudaMemsetAsync +#define gpuMemcpyAsync cudaMemcpyAsync +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuStreamQuery cudaStreamQuery +#define gpuSharedMemConfig cudaSharedMemConfig +#define gpuDeviceSetSharedMemConfig cudaDeviceSetSharedMemConfig +#define gpuStreamSynchronize cudaStreamSynchronize +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuMemcpy cudaMemcpy + +#endif + +// gpu_assert can be overridden +#ifndef gpu_assert + +#if defined(EIGEN_HIP_DEVICE_COMPILE) +// HIPCC do not support the use of assert on the GPU side. 
+#define gpu_assert(COND) +#else +#define gpu_assert(COND) assert(COND) +#endif + +#endif // gpu_assert + +#endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h new file mode 100644 index 0000000..1d142f2 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h @@ -0,0 +1,44 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// Copyright (C) 2018 Deven Desai +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#if defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H) + +#ifndef EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES + +#undef gpuStream_t +#undef gpuDeviceProp_t +#undef gpuError_t +#undef gpuSuccess +#undef gpuErrorNotReady +#undef gpuGetDeviceCount +#undef gpuGetErrorString +#undef gpuGetDeviceProperties +#undef gpuStreamDefault +#undef gpuGetDevice +#undef gpuSetDevice +#undef gpuMalloc +#undef gpuFree +#undef gpuMemsetAsync +#undef gpuMemcpyAsync +#undef gpuMemcpyDeviceToDevice +#undef gpuMemcpyDeviceToHost +#undef gpuMemcpyHostToDevice +#undef gpuStreamQuery +#undef gpuSharedMemConfig +#undef gpuDeviceSetSharedMemConfig +#undef gpuStreamSynchronize +#undef gpuDeviceSynchronize +#undef gpuMemcpy + +#endif // EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES + +#undef EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H + +#endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorIO.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorIO.h new file mode 100644 index 0000000..a901c5d --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorIO.h @@ -0,0 +1,79 @@ +// This file is part of Eigen, a lightweight C++ 
template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_IO_H +#define EIGEN_CXX11_TENSOR_TENSOR_IO_H + +namespace Eigen { + +namespace internal { + +// Print the tensor as a 2d matrix +template +struct TensorPrinter { + static void run (std::ostream& os, const Tensor& tensor) { + typedef typename internal::remove_const::type Scalar; + typedef typename Tensor::Index Index; + const Index total_size = internal::array_prod(tensor.dimensions()); + if (total_size > 0) { + const Index first_dim = Eigen::internal::array_get<0>(tensor.dimensions()); + static const int layout = Tensor::Layout; + Map > matrix(const_cast(tensor.data()), first_dim, total_size/first_dim); + os << matrix; + } + } +}; + + +// Print the tensor as a vector +template +struct TensorPrinter { + static void run (std::ostream& os, const Tensor& tensor) { + typedef typename internal::remove_const::type Scalar; + typedef typename Tensor::Index Index; + const Index total_size = internal::array_prod(tensor.dimensions()); + if (total_size > 0) { + Map > array(const_cast(tensor.data()), total_size); + os << array; + } + } +}; + + +// Print the tensor as a scalar +template +struct TensorPrinter { + static void run (std::ostream& os, const Tensor& tensor) { + os << tensor.coeff(0); + } +}; +} + +template +std::ostream& operator << (std::ostream& os, const TensorBase& expr) { + typedef TensorEvaluator, DefaultDevice> Evaluator; + typedef typename Evaluator::Dimensions Dimensions; + + // Evaluate the expression if needed + TensorForcedEvalOp eval = expr.eval(); + Evaluator tensor(eval, DefaultDevice()); + tensor.evalSubExprsIfNeeded(NULL); + + // Print the result + static const int rank = internal::array_size::value; + 
internal::TensorPrinter::run(os, tensor); + + // Cleanup. + tensor.cleanup(); + return os; +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_IO_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorImagePatch.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorImagePatch.h new file mode 100644 index 0000000..dd51850 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorImagePatch.h @@ -0,0 +1,603 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H +#define EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H + +namespace Eigen { + +/** \class TensorImagePatch + * \ingroup CXX11_Tensor_Module + * + * \brief Patch extraction specialized for image processing. + * This assumes that the input has a least 3 dimensions ordered as follow: + * 1st dimension: channels (of size d) + * 2nd dimension: rows (of size r) + * 3rd dimension: columns (of size c) + * There can be additional dimensions such as time (for video) or batch (for + * bulk processing after the first 3. + * Calling the image patch code with patch_rows and patch_cols is equivalent + * to calling the regular patch extraction code with parameters d, patch_rows, + * patch_cols, and 1 for all the additional dimensions. 
+ */ +namespace internal { + +template +struct traits > : public traits +{ + typedef typename internal::remove_const::type Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions + 1; + static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorImagePatchOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorImagePatchOp type; +}; + +template +struct ImagePatchCopyOp { + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename Self::Impl Impl; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const Self& self, const Index num_coeff_to_copy, const Index dst_index, + Scalar* dst_data, const Index src_index) { + const Impl& impl = self.impl(); + for (Index i = 0; i < num_coeff_to_copy; ++i) { + dst_data[dst_index + i] = impl.coeff(src_index + i); + } + } +}; + +template +struct ImagePatchCopyOp { + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename Self::Impl Impl; + typedef typename packet_traits::type Packet; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const Self& self, const Index num_coeff_to_copy, const Index dst_index, + Scalar* dst_data, const Index src_index) { + const Impl& impl = self.impl(); + const Index packet_size = internal::unpacket_traits::size; + const Index vectorized_size = + (num_coeff_to_copy / packet_size) * packet_size; + for (Index i = 0; i < vectorized_size; i += packet_size) { + Packet p = impl.template packet(src_index + i); + internal::pstoret(dst_data + dst_index + i, p); + } + for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) { + dst_data[dst_index + i] = 
impl.coeff(src_index + i); + } + } +}; + +template +struct ImagePatchPaddingOp { + typedef typename Self::Index Index; + typedef typename Self::Scalar Scalar; + typedef typename packet_traits::type Packet; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( + const Index num_coeff_to_pad, const Scalar padding_value, + const Index dst_index, Scalar* dst_data) { + const Index packet_size = internal::unpacket_traits::size; + const Packet padded_packet = internal::pset1(padding_value); + const Index vectorized_size = + (num_coeff_to_pad / packet_size) * packet_size; + for (Index i = 0; i < vectorized_size; i += packet_size) { + internal::pstoret(dst_data + dst_index + i, + padded_packet); + } + for (Index i = vectorized_size; i < num_coeff_to_pad; ++i) { + dst_data[dst_index + i] = padding_value; + } + } +}; + +} // end namespace internal + +template +class TensorImagePatchOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols, + DenseIndex row_strides, DenseIndex col_strides, + DenseIndex in_row_strides, DenseIndex in_col_strides, + DenseIndex row_inflate_strides, DenseIndex col_inflate_strides, + PaddingType padding_type, Scalar padding_value) + : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols), + m_row_strides(row_strides), m_col_strides(col_strides), + m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides), + m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides), + m_padding_explicit(false), m_padding_top(0), 
m_padding_bottom(0), m_padding_left(0), m_padding_right(0), + m_padding_type(padding_type), m_padding_value(padding_value) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols, + DenseIndex row_strides, DenseIndex col_strides, + DenseIndex in_row_strides, DenseIndex in_col_strides, + DenseIndex row_inflate_strides, DenseIndex col_inflate_strides, + DenseIndex padding_top, DenseIndex padding_bottom, + DenseIndex padding_left, DenseIndex padding_right, + Scalar padding_value) + : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols), + m_row_strides(row_strides), m_col_strides(col_strides), + m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides), + m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides), + m_padding_explicit(true), m_padding_top(padding_top), m_padding_bottom(padding_bottom), + m_padding_left(padding_left), m_padding_right(padding_right), + m_padding_type(PADDING_VALID), m_padding_value(padding_value) {} + + + EIGEN_DEVICE_FUNC + DenseIndex patch_rows() const { return m_patch_rows; } + EIGEN_DEVICE_FUNC + DenseIndex patch_cols() const { return m_patch_cols; } + EIGEN_DEVICE_FUNC + DenseIndex row_strides() const { return m_row_strides; } + EIGEN_DEVICE_FUNC + DenseIndex col_strides() const { return m_col_strides; } + EIGEN_DEVICE_FUNC + DenseIndex in_row_strides() const { return m_in_row_strides; } + EIGEN_DEVICE_FUNC + DenseIndex in_col_strides() const { return m_in_col_strides; } + EIGEN_DEVICE_FUNC + DenseIndex row_inflate_strides() const { return m_row_inflate_strides; } + EIGEN_DEVICE_FUNC + DenseIndex col_inflate_strides() const { return m_col_inflate_strides; } + EIGEN_DEVICE_FUNC + bool padding_explicit() const { return m_padding_explicit; } + EIGEN_DEVICE_FUNC + DenseIndex padding_top() const { return m_padding_top; } + EIGEN_DEVICE_FUNC + DenseIndex padding_bottom() const { return m_padding_bottom; } + 
EIGEN_DEVICE_FUNC + DenseIndex padding_left() const { return m_padding_left; } + EIGEN_DEVICE_FUNC + DenseIndex padding_right() const { return m_padding_right; } + EIGEN_DEVICE_FUNC + PaddingType padding_type() const { return m_padding_type; } + EIGEN_DEVICE_FUNC + Scalar padding_value() const { return m_padding_value; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const DenseIndex m_patch_rows; + const DenseIndex m_patch_cols; + const DenseIndex m_row_strides; + const DenseIndex m_col_strides; + const DenseIndex m_in_row_strides; + const DenseIndex m_in_col_strides; + const DenseIndex m_row_inflate_strides; + const DenseIndex m_col_inflate_strides; + const bool m_padding_explicit; + const DenseIndex m_padding_top; + const DenseIndex m_padding_bottom; + const DenseIndex m_padding_left; + const DenseIndex m_padding_right; + const PaddingType m_padding_type; + const Scalar m_padding_value; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorImagePatchOp XprType; + typedef typename XprType::Index Index; + static const int NumInputDims = internal::array_size::Dimensions>::value; + static const int NumDims = NumInputDims + 1; + typedef DSizes Dimensions; + typedef typename internal::remove_const::type Scalar; + typedef TensorEvaluator, + Device> Self; + typedef TensorEvaluator Impl; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + 
typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + // Builds the evaluator: caches the padding value, the input depth/rows/cols + // (picked according to Layout) and the strides requested by the op. + EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device) + : m_device(device), m_impl(op.expression(), device) + { + EIGEN_STATIC_ASSERT((NumDims >= 4), YOU_MADE_A_PROGRAMMING_MISTAKE); + + m_paddingValue = op.padding_value(); + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + + // Caches a few variables. + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputDepth = input_dims[0]; + m_inputRows = input_dims[1]; + m_inputCols = input_dims[2]; + } else { + m_inputDepth = input_dims[NumInputDims-1]; + m_inputRows = input_dims[NumInputDims-2]; + m_inputCols = input_dims[NumInputDims-3]; + } + + m_row_strides = op.row_strides(); + m_col_strides = op.col_strides(); + + // Input strides and effective input/patch size + m_in_row_strides = op.in_row_strides(); + m_in_col_strides = op.in_col_strides(); + m_row_inflate_strides = op.row_inflate_strides(); + m_col_inflate_strides = op.col_inflate_strides(); + // The "effective" input rows and input cols are the input rows and cols + // after inflating them with zeros. + // For example, a 2x3 matrix with row_inflate_strides and + // col_inflate_strides of 2 goes from: + // A B C + // D E F + // + // to a 3 x 5 matrix: + // + // A . B . C + // . . . . . + // D . E . 
F + + m_input_rows_eff = (m_inputRows - 1) * m_row_inflate_strides + 1; + m_input_cols_eff = (m_inputCols - 1) * m_col_inflate_strides + 1; + m_patch_rows_eff = op.patch_rows() + (op.patch_rows() - 1) * (m_in_row_strides - 1); + m_patch_cols_eff = op.patch_cols() + (op.patch_cols() - 1) * (m_in_col_strides - 1); + + if (op.padding_explicit()) { + m_outputRows = numext::ceil((m_input_rows_eff + op.padding_top() + op.padding_bottom() - m_patch_rows_eff + 1.f) / static_cast(m_row_strides)); + m_outputCols = numext::ceil((m_input_cols_eff + op.padding_left() + op.padding_right() - m_patch_cols_eff + 1.f) / static_cast(m_col_strides)); + m_rowPaddingTop = op.padding_top(); + m_colPaddingLeft = op.padding_left(); + } else { + // Computing padding from the type + switch (op.padding_type()) { + case PADDING_VALID: + m_outputRows = numext::ceil((m_input_rows_eff - m_patch_rows_eff + 1.f) / static_cast(m_row_strides)); + m_outputCols = numext::ceil((m_input_cols_eff - m_patch_cols_eff + 1.f) / static_cast(m_col_strides)); + // Calculate the padding + m_rowPaddingTop = numext::maxi(0, ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2); + m_colPaddingLeft = numext::maxi(0, ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2); + break; + case PADDING_SAME: + m_outputRows = numext::ceil(m_input_rows_eff / static_cast(m_row_strides)); + m_outputCols = numext::ceil(m_input_cols_eff / static_cast(m_col_strides)); + // Calculate the padding + m_rowPaddingTop = ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2; + m_colPaddingLeft = ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2; + // The padding size calculation for PADDING_SAME has been updated to + // be consistent with how TensorFlow extracts its paddings. 
+ m_rowPaddingTop = numext::maxi(0, m_rowPaddingTop); + m_colPaddingLeft = numext::maxi(0, m_colPaddingLeft); + break; + default: + eigen_assert(false && "unexpected padding"); + m_outputCols=0; // Silence the uninitialised-variable warning. + m_outputRows=0; // Silence the uninitialised-variable warning. + // NOTE(review): m_rowPaddingTop and m_colPaddingLeft are not assigned on this branch - confirm this is acceptable. + } + } + eigen_assert(m_outputRows > 0); + eigen_assert(m_outputCols > 0); + + // Dimensions for result of extraction. + if (static_cast(Layout) == static_cast(ColMajor)) { + // ColMajor + // 0: depth + // 1: patch_rows + // 2: patch_cols + // 3: number of patches + // 4 and beyond: anything else (such as batch). + m_dimensions[0] = input_dims[0]; + m_dimensions[1] = op.patch_rows(); + m_dimensions[2] = op.patch_cols(); + m_dimensions[3] = m_outputRows * m_outputCols; + for (int i = 4; i < NumDims; ++i) { + m_dimensions[i] = input_dims[i-1]; + } + } else { + // RowMajor + // NumDims-1: depth + // NumDims-2: patch_rows + // NumDims-3: patch_cols + // NumDims-4: number of patches + // NumDims-5 and beyond: anything else (such as batch). + m_dimensions[NumDims-1] = input_dims[NumInputDims-1]; + m_dimensions[NumDims-2] = op.patch_rows(); + m_dimensions[NumDims-3] = op.patch_cols(); + m_dimensions[NumDims-4] = m_outputRows * m_outputCols; + for (int i = NumDims-5; i >= 0; --i) { + m_dimensions[i] = input_dims[i]; + } + } + + // Strides for moving the patch in various dimensions. + if (static_cast(Layout) == static_cast(ColMajor)) { + m_colStride = m_dimensions[1]; + m_patchStride = m_colStride * m_dimensions[2] * m_dimensions[0]; + m_otherStride = m_patchStride * m_dimensions[3]; + } else { + m_colStride = m_dimensions[NumDims-2]; + m_patchStride = m_colStride * m_dimensions[NumDims-3] * m_dimensions[NumDims-1]; + m_otherStride = m_patchStride * m_dimensions[NumDims-4]; + } + + // Strides for navigating through the input tensor. 
+ m_rowInputStride = m_inputDepth; + m_colInputStride = m_inputDepth * m_inputRows; + m_patchInputStride = m_inputDepth * m_inputRows * m_inputCols; + + // Fast representations of different variables. + m_fastOtherStride = internal::TensorIntDivisor(m_otherStride); + m_fastPatchStride = internal::TensorIntDivisor(m_patchStride); + m_fastColStride = internal::TensorIntDivisor(m_colStride); + m_fastInflateRowStride = internal::TensorIntDivisor(m_row_inflate_strides); + m_fastInflateColStride = internal::TensorIntDivisor(m_col_inflate_strides); + m_fastInputColsEff = internal::TensorIntDivisor(m_input_cols_eff); + + // Number of patches in the width dimension. + m_fastOutputRows = internal::TensorIntDivisor(m_outputRows); + if (static_cast(Layout) == static_cast(ColMajor)) { + m_fastOutputDepth = internal::TensorIntDivisor(m_dimensions[0]); + } else { + m_fastOutputDepth = internal::TensorIntDivisor(m_dimensions[NumDims-1]); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + // Patch index corresponding to the passed in index. + const Index patchIndex = index / m_fastPatchStride; + // Find the offset of the element wrt the location of the first element. + const Index patchOffset = (index - patchIndex * m_patchStride) / m_fastOutputDepth; + + // Other ways to index this element. + const Index otherIndex = (NumDims == 4) ? 
0 : index / m_fastOtherStride; + const Index patch2DIndex = (NumDims == 4) ? patchIndex : (index - otherIndex * m_otherStride) / m_fastPatchStride; + + // Calculate col index in the input original tensor. + const Index colIndex = patch2DIndex / m_fastOutputRows; + const Index colOffset = patchOffset / m_fastColStride; + const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft; + const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInflateColStride) : 0); + if (inputCol < 0 || inputCol >= m_input_cols_eff || + ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) { + return Scalar(m_paddingValue); + } + + // Calculate row index in the original input tensor. + const Index rowIndex = patch2DIndex - colIndex * m_outputRows; + const Index rowOffset = patchOffset - colOffset * m_colStride; + const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop; + const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInflateRowStride) : 0); + if (inputRow < 0 || inputRow >= m_input_rows_eff || + ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) { + return Scalar(m_paddingValue); + } + + const int depth_index = static_cast(Layout) == static_cast(ColMajor) ? 
0 : NumDims - 1; + const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index]; + + const Index inputIndex = depth + origInputRow * m_rowInputStride + origInputCol * m_colInputStride + otherIndex * m_patchInputStride; + return m_impl.coeff(inputIndex); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + + if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1) { + return packetWithPossibleZero(index); + } + + const Index indices[2] = {index, index + PacketSize - 1}; + const Index patchIndex = indices[0] / m_fastPatchStride; + if (patchIndex != indices[1] / m_fastPatchStride) { + return packetWithPossibleZero(index); + } + const Index otherIndex = (NumDims == 4) ? 0 : indices[0] / m_fastOtherStride; + eigen_assert(otherIndex == indices[1] / m_fastOtherStride); + + // Find the offset of the element wrt the location of the first element. + const Index patchOffsets[2] = {(indices[0] - patchIndex * m_patchStride) / m_fastOutputDepth, + (indices[1] - patchIndex * m_patchStride) / m_fastOutputDepth}; + + const Index patch2DIndex = (NumDims == 4) ? patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride; + eigen_assert(patch2DIndex == (indices[1] - otherIndex * m_otherStride) / m_fastPatchStride); + + const Index colIndex = patch2DIndex / m_fastOutputRows; + const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride, patchOffsets[1] / m_fastColStride}; + + // Calculate col indices in the original input tensor. 
+ const Index inputCols[2] = {colIndex * m_col_strides + colOffsets[0] - + m_colPaddingLeft, colIndex * m_col_strides + colOffsets[1] - m_colPaddingLeft}; + if (inputCols[1] < 0 || inputCols[0] >= m_inputCols) { + return internal::pset1(Scalar(m_paddingValue)); + } + + if (inputCols[0] == inputCols[1]) { + const Index rowIndex = patch2DIndex - colIndex * m_outputRows; + const Index rowOffsets[2] = {patchOffsets[0] - colOffsets[0]*m_colStride, patchOffsets[1] - colOffsets[1]*m_colStride}; + eigen_assert(rowOffsets[0] <= rowOffsets[1]); + // Calculate row indices in the original input tensor. + const Index inputRows[2] = {rowIndex * m_row_strides + rowOffsets[0] - + m_rowPaddingTop, rowIndex * m_row_strides + rowOffsets[1] - m_rowPaddingTop}; + + if (inputRows[1] < 0 || inputRows[0] >= m_inputRows) { + return internal::pset1(Scalar(m_paddingValue)); + } + + if (inputRows[0] >= 0 && inputRows[1] < m_inputRows) { + // no padding + const int depth_index = static_cast(Layout) == static_cast(ColMajor) ? 
0 : NumDims - 1; + const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index]; + const Index inputIndex = depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex * m_patchInputStride; + return m_impl.template packet(inputIndex); + } + } + + return packetWithPossibleZero(index); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return m_impl; } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowPaddingTop() const { return m_rowPaddingTop; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colPaddingLeft() const { return m_colPaddingLeft; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputRows() const { return m_outputRows; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputCols() const { return m_outputCols; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userRowStride() const { return m_row_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userColStride() const { return m_col_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInRowStride() const { return m_in_row_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInColStride() const { return m_in_col_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowInflateStride() const { return m_row_inflate_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colInflateStride() const { return m_col_inflate_strides; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + // We conservatively estimate the cost for the code path where the computed + // index is inside the original image and + // TensorEvaluator::CoordAccess is false. 
+ const double compute_cost = 3 * TensorOpCost::DivCost() + + 6 * TensorOpCost::MulCost() + + 8 * TensorOpCost::MulCost(); + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const + { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + Dimensions m_dimensions; + + Index m_otherStride; + Index m_patchStride; + Index m_colStride; + Index m_row_strides; + Index m_col_strides; + + Index m_in_row_strides; + Index m_in_col_strides; + Index m_row_inflate_strides; + Index m_col_inflate_strides; + + Index m_input_rows_eff; + Index m_input_cols_eff; + Index m_patch_rows_eff; + Index m_patch_cols_eff; + + internal::TensorIntDivisor m_fastOtherStride; + internal::TensorIntDivisor m_fastPatchStride; + internal::TensorIntDivisor m_fastColStride; + internal::TensorIntDivisor m_fastInflateRowStride; + internal::TensorIntDivisor m_fastInflateColStride; + internal::TensorIntDivisor m_fastInputColsEff; + + Index m_rowInputStride; + Index m_colInputStride; + Index m_patchInputStride; + + Index m_inputDepth; + Index m_inputRows; + Index m_inputCols; + + Index m_outputRows; + Index m_outputCols; + + Index m_rowPaddingTop; + Index m_colPaddingLeft; + + internal::TensorIntDivisor m_fastOutputRows; + internal::TensorIntDivisor m_fastOutputDepth; + + Scalar m_paddingValue; + + const Device EIGEN_DEVICE_REF m_device; + TensorEvaluator m_impl; +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorIndexList.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorIndexList.h new file mode 100644 index 0000000..2d8c7b9 --- /dev/null +++ 
b/src/EigenUnsupported/CXX11/src/Tensor/TensorIndexList.h @@ -0,0 +1,738 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H +#define EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H + + +#if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES + +#define EIGEN_HAS_INDEX_LIST + +namespace Eigen { + +/** \internal + * + * \class TensorIndexList + * \ingroup CXX11_Tensor_Module + * + * \brief Set of classes used to encode a set of Tensor dimensions/indices. + * + * The indices in the list can be known at compile time or at runtime. A mix + * of static and dynamic indices can also be provided if needed. The tensor + * code will attempt to take advantage of the indices that are known at + * compile time to optimize the code it generates. + * + * This functionality requires a c++11 compliant compiler. If your compiler + * is older you need to use arrays of indices instead. + * + * Several examples are provided in the cxx11_tensor_index_list.cpp file. + * + * \sa Tensor + */ + +// Index whose value n is carried by the type: it exposes n as `value`, +// converts to Index as n, and set() only asserts that the new value equals n. +template +struct type2index { + static const Index value = n; + EIGEN_DEVICE_FUNC constexpr operator Index() const { return n; } + EIGEN_DEVICE_FUNC void set(Index val) { + eigen_assert(val == n); + } +}; + +// This can be used with IndexPairList to get compile-time constant pairs, +// such as IndexPairList<type2indexpair<1,2>, type2indexpair<3,4>>(). 
+template +struct type2indexpair { + static const Index first = f; + static const Index second = s; + + constexpr EIGEN_DEVICE_FUNC operator IndexPair() const { + return IndexPair(f, s); + } + + EIGEN_DEVICE_FUNC void set(const IndexPair& val) { + eigen_assert(val.first == f); + eigen_assert(val.second == s); + } +}; + + +template struct NumTraits > +{ + typedef Index Real; + enum { + IsComplex = 0, + RequireInitialization = false, + ReadCost = 1, + AddCost = 1, + MulCost = 1 + }; + + EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real epsilon() { return 0; } + EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real dummy_precision() { return 0; } + EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real highest() { return n; } + EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real lowest() { return n; } +}; + +namespace internal { +template +EIGEN_DEVICE_FUNC void update_value(T& val, Index new_val) { + val = internal::convert_index(new_val); +} +template +EIGEN_DEVICE_FUNC void update_value(type2index& val, Index new_val) { + val.set(new_val); +} + +template +EIGEN_DEVICE_FUNC void update_value(T& val, IndexPair new_val) { + val = new_val; +} +template +EIGEN_DEVICE_FUNC void update_value(type2indexpair& val, IndexPair new_val) { + val.set(new_val); +} + + +template +struct is_compile_time_constant { + static constexpr bool value = false; +}; + +template +struct is_compile_time_constant > { + static constexpr bool value = true; +}; +template +struct is_compile_time_constant > { + static constexpr bool value = true; +}; +template +struct is_compile_time_constant& > { + static constexpr bool value = true; +}; +template +struct is_compile_time_constant& > { + static constexpr bool value = true; +}; + +template +struct is_compile_time_constant > { + static constexpr bool value = true; +}; +template +struct is_compile_time_constant > { + static constexpr bool value = true; +}; +template +struct is_compile_time_constant& > 
{ + static constexpr bool value = true; +}; +template +struct is_compile_time_constant& > { + static constexpr bool value = true; +}; + + +template +struct IndexTuple; + +template +struct IndexTuple { + EIGEN_DEVICE_FUNC constexpr IndexTuple() : head(), others() { } + EIGEN_DEVICE_FUNC constexpr IndexTuple(const T& v, const O... o) : head(v), others(o...) { } + + constexpr static int count = 1 + sizeof...(O); + T head; + IndexTuple others; + typedef T Head; + typedef IndexTuple Other; +}; + +template + struct IndexTuple { + EIGEN_DEVICE_FUNC constexpr IndexTuple() : head() { } + EIGEN_DEVICE_FUNC constexpr IndexTuple(const T& v) : head(v) { } + + constexpr static int count = 1; + T head; + typedef T Head; +}; + + +template +struct IndexTupleExtractor; + +template +struct IndexTupleExtractor { + + typedef typename IndexTupleExtractor::ValType ValType; + + EIGEN_DEVICE_FUNC static constexpr ValType& get_val(IndexTuple& val) { + return IndexTupleExtractor::get_val(val.others); + } + + EIGEN_DEVICE_FUNC static constexpr const ValType& get_val(const IndexTuple& val) { + return IndexTupleExtractor::get_val(val.others); + } + template + EIGEN_DEVICE_FUNC static void set_val(IndexTuple& val, V& new_val) { + IndexTupleExtractor::set_val(val.others, new_val); + } + +}; + +template + struct IndexTupleExtractor<0, T, O...> { + + typedef T ValType; + + EIGEN_DEVICE_FUNC static constexpr ValType& get_val(IndexTuple& val) { + return val.head; + } + EIGEN_DEVICE_FUNC static constexpr const ValType& get_val(const IndexTuple& val) { + return val.head; + } + template + EIGEN_DEVICE_FUNC static void set_val(IndexTuple& val, V& new_val) { + val.head = new_val; + } +}; + + + +template +EIGEN_DEVICE_FUNC constexpr typename IndexTupleExtractor::ValType& array_get(IndexTuple& tuple) { + return IndexTupleExtractor::get_val(tuple); +} +template +EIGEN_DEVICE_FUNC constexpr const typename IndexTupleExtractor::ValType& array_get(const IndexTuple& tuple) { + return 
IndexTupleExtractor::get_val(tuple); +} +template + struct array_size > { + static const size_t value = IndexTuple::count; +}; +template + struct array_size > { + static const size_t value = IndexTuple::count; +}; + + + + +template +struct tuple_coeff { + template + EIGEN_DEVICE_FUNC static constexpr ValueT get(const Index i, const IndexTuple& t) { + // return array_get(t) * (i == Idx) + tuple_coeff::get(i, t) * (i != Idx); + return (i == Idx ? array_get(t) : tuple_coeff::get(i, t)); + } + template + EIGEN_DEVICE_FUNC static void set(const Index i, IndexTuple& t, const ValueT& value) { + if (i == Idx) { + update_value(array_get(t), value); + } else { + tuple_coeff::set(i, t, value); + } + } + + template + EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const Index i, const IndexTuple& t) { + return ((i == Idx) & is_compile_time_constant::ValType>::value) || + tuple_coeff::value_known_statically(i, t); + } + + template + EIGEN_DEVICE_FUNC static constexpr bool values_up_to_known_statically(const IndexTuple& t) { + return is_compile_time_constant::ValType>::value && + tuple_coeff::values_up_to_known_statically(t); + } + + template + EIGEN_DEVICE_FUNC static constexpr bool values_up_to_statically_known_to_increase(const IndexTuple& t) { + return is_compile_time_constant::ValType>::value && + is_compile_time_constant::ValType>::value && + array_get(t) > array_get(t) && + tuple_coeff::values_up_to_statically_known_to_increase(t); + } +}; + +template +struct tuple_coeff<0, ValueT> { + template + EIGEN_DEVICE_FUNC static constexpr ValueT get(const Index /*i*/, const IndexTuple& t) { + // eigen_assert (i == 0); // gcc fails to compile assertions in constexpr + return array_get<0>(t)/* * (i == 0)*/; + } + template + EIGEN_DEVICE_FUNC static void set(const Index i, IndexTuple& t, const ValueT value) { + eigen_assert (i == 0); + update_value(array_get<0>(t), value); + } + template + EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const Index i, 
const IndexTuple&) { + return is_compile_time_constant::ValType>::value && (i == 0); + } + + template + EIGEN_DEVICE_FUNC static constexpr bool values_up_to_known_statically(const IndexTuple&) { + return is_compile_time_constant::ValType>::value; + } + + template + EIGEN_DEVICE_FUNC static constexpr bool values_up_to_statically_known_to_increase(const IndexTuple&) { + return true; + } +}; +} // namespace internal + + + +template +struct IndexList : internal::IndexTuple { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Index operator[] (const Index i) const { + return internal::tuple_coeff >::value-1, Index>::get(i, *this); + } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Index get(const Index i) const { + return internal::tuple_coeff >::value-1, Index>::get(i, *this); + } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const Index i, const Index value) { + return internal::tuple_coeff >::value-1, Index>::set(i, *this, value); + } + + EIGEN_DEVICE_FUNC constexpr IndexList(const internal::IndexTuple& other) : internal::IndexTuple(other) { } + EIGEN_DEVICE_FUNC constexpr IndexList(FirstType& first, OtherTypes... other) : internal::IndexTuple(first, other...) 
{ } + EIGEN_DEVICE_FUNC constexpr IndexList() : internal::IndexTuple() { } + + EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const Index i) const { + return internal::tuple_coeff >::value-1, Index>::value_known_statically(i, *this); + } + EIGEN_DEVICE_FUNC constexpr bool all_values_known_statically() const { + return internal::tuple_coeff >::value-1, Index>::values_up_to_known_statically(*this); + } + + EIGEN_DEVICE_FUNC constexpr bool values_statically_known_to_increase() const { + return internal::tuple_coeff >::value-1, Index>::values_up_to_statically_known_to_increase(*this); + } +}; + +template +std::ostream& operator<<(std::ostream& os, + const IndexList& dims) { + os << "["; + for (size_t i = 0; i < 1 + sizeof...(OtherTypes); ++i) { + if (i > 0) os << ", "; + os << dims[i]; + } + os << "]"; + return os; +} + +template +constexpr IndexList make_index_list(FirstType val1, OtherTypes... other_vals) { + return IndexList(val1, other_vals...); +} + + +template +struct IndexPairList : internal::IndexTuple { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr IndexPair operator[] (const Index i) const { + return internal::tuple_coeff >::value-1, IndexPair>::get(i, *this); + } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const Index i, const IndexPair value) { + return internal::tuple_coeff>::value-1, IndexPair >::set(i, *this, value); + } + + EIGEN_DEVICE_FUNC constexpr IndexPairList(const internal::IndexTuple& other) : internal::IndexTuple(other) { } + EIGEN_DEVICE_FUNC constexpr IndexPairList() : internal::IndexTuple() { } + + EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const Index i) const { + return internal::tuple_coeff >::value-1, Index>::value_known_statically(i, *this); + } +}; + +namespace internal { + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index array_prod(const IndexList& sizes) { + Index result = 1; + EIGEN_UNROLL_LOOP + for (size_t i = 0; i < array_size >::value; ++i) { + result *= sizes[i]; + } + return result; +} + 
+template struct array_size > { + static const size_t value = array_size >::value; +}; +template struct array_size > { + static const size_t value = array_size >::value; +}; + +template struct array_size > { + static const size_t value = std::tuple_size >::value; +}; +template struct array_size > { + static const size_t value = std::tuple_size >::value; +}; + +template EIGEN_DEVICE_FUNC constexpr Index array_get(IndexList& a) { + return IndexTupleExtractor::get_val(a); +} +template EIGEN_DEVICE_FUNC constexpr Index array_get(const IndexList& a) { + return IndexTupleExtractor::get_val(a); +} + +template +struct index_known_statically_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index) { + return false; + } +}; + +template +struct index_known_statically_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i) { + return IndexList().value_known_statically(i); + } +}; + +template +struct index_known_statically_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i) { + return IndexList().value_known_statically(i); + } +}; + + +template +struct all_indices_known_statically_impl { + static constexpr bool run() { + return false; + } +}; + +template +struct all_indices_known_statically_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run() { + return IndexList().all_values_known_statically(); + } +}; + +template +struct all_indices_known_statically_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run() { + return IndexList().all_values_known_statically(); + } +}; + + +template +struct indices_statically_known_to_increase_impl { + EIGEN_DEVICE_FUNC static constexpr bool run() { + return false; + } +}; + +template + struct indices_statically_known_to_increase_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run() { + return Eigen::IndexList().values_statically_known_to_increase(); + } +}; + +template + struct indices_statically_known_to_increase_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run() { + return 
Eigen::IndexList().values_statically_known_to_increase(); + } +}; + + +template +struct index_statically_eq_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { + return false; + } +}; + +template +struct index_statically_eq_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexList().value_known_statically(i) & + (IndexList().get(i) == value); + } +}; + +template +struct index_statically_eq_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexList().value_known_statically(i) & + (IndexList().get(i) == value); + } +}; + + +template +struct index_statically_ne_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { + return false; + } +}; + +template +struct index_statically_ne_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexList().value_known_statically(i) & + (IndexList().get(i) != value); + } +}; + +template +struct index_statically_ne_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexList().value_known_statically(i) & + (IndexList().get(i) != value); + } +}; + + +template +struct index_statically_gt_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { + return false; + } +}; + +template +struct index_statically_gt_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexList().value_known_statically(i) & + (IndexList().get(i) > value); + } +}; + +template +struct index_statically_gt_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexList().value_known_statically(i) & + (IndexList().get(i) > value); + } +}; + + + +template +struct index_statically_lt_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { + return false; + } +}; + +template +struct index_statically_lt_impl > { + 
EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexList().value_known_statically(i) & + (IndexList().get(i) < value); + } +}; + +template +struct index_statically_lt_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexList().value_known_statically(i) & + (IndexList().get(i) < value); + } +}; + + + +template +struct index_pair_first_statically_eq_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { + return false; + } +}; + +template +struct index_pair_first_statically_eq_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexPairList().value_known_statically(i) & + (IndexPairList().operator[](i).first == value); + } +}; + +template +struct index_pair_first_statically_eq_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexPairList().value_known_statically(i) & + (IndexPairList().operator[](i).first == value); + } +}; + + + +template +struct index_pair_second_statically_eq_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { + return false; + } +}; + +template +struct index_pair_second_statically_eq_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexPairList().value_known_statically(i) & + (IndexPairList().operator[](i).second == value); + } +}; + +template +struct index_pair_second_statically_eq_impl > { + EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + return IndexPairList().value_known_statically(i) & + (IndexPairList().operator[](i).second == value); + } +}; + + +} // end namespace internal +} // end namespace Eigen + +#else + +namespace Eigen { +namespace internal { + +template +struct index_known_statically_impl { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const Index) { + return false; + } +}; + +template +struct 
all_indices_known_statically_impl { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() { + return false; + } +}; + +template +struct indices_statically_known_to_increase_impl { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() { + return false; + } +}; + +template +struct index_statically_eq_impl { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) { + return false; + } +}; + +template +struct index_statically_ne_impl { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) { + return false; + } +}; + +template +struct index_statically_gt_impl { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) { + return false; + } +}; + +template +struct index_statically_lt_impl { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) { + return false; + } +}; + +template +struct index_pair_first_statically_eq_impl { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) { + return false; + } +}; + +template +struct index_pair_second_statically_eq_impl { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) { + return false; + } +}; + + + +} // end namespace internal +} // end namespace Eigen + +#endif + + +namespace Eigen { +namespace internal { +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_known_statically(Index i) { + return index_known_statically_impl::run(i); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool all_indices_known_statically() { + return all_indices_known_statically_impl::run(); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool indices_statically_known_to_increase() { + return indices_statically_known_to_increase_impl::run(); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_eq(Index i, Index value) { + return index_statically_eq_impl::run(i, value); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_ne(Index i, Index value) { + return 
index_statically_ne_impl::run(i, value); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_gt(Index i, Index value) { + return index_statically_gt_impl::run(i, value); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_lt(Index i, Index value) { + return index_statically_lt_impl::run(i, value); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_first_statically_eq(Index i, Index value) { + return index_pair_first_statically_eq_impl::run(i, value); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_second_statically_eq(Index i, Index value) { + return index_pair_second_statically_eq_impl::run(i, value); +} + +} // end namespace internal +} // end namespace Eigen + + +#endif // EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorInflation.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorInflation.h new file mode 100644 index 0000000..c5cb61a --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorInflation.h @@ -0,0 +1,247 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Ke Yang +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H +#define EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H + +namespace Eigen { + +/** \class TensorInflation + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor inflation class. 
+ * + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorInflationOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorInflationOp type; +}; + +} // end namespace internal + +template +class TensorInflationOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorInflationOp(const XprType& expr, const Strides& strides) + : m_xpr(expr), m_strides(strides) {} + + EIGEN_DEVICE_FUNC + const Strides& strides() const { return m_strides; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const Strides m_strides; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorInflationOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const 
int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + enum { + IsAligned = /*TensorEvaluator::IsAligned*/ false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_strides(op.strides()) + { + m_dimensions = m_impl.dimensions(); + // Expand each dimension to the inflated dimension. + for (int i = 0; i < NumDims; ++i) { + m_dimensions[i] = (m_dimensions[i] - 1) * op.strides()[i] + 1; + } + + // Remember the strides for fast division. 
+ for (int i = 0; i < NumDims; ++i) { + m_fastStrides[i] = internal::TensorIntDivisor(m_strides[i]); + } + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + if (static_cast(Layout) == static_cast(ColMajor)) { + m_outputStrides[0] = 1; + m_inputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + } + } else { // RowMajor + m_outputStrides[NumDims-1] = 1; + m_inputStrides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + // Computes the input index given the output index. Returns true if the output + // index doesn't fall into a hole. 
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool getInputIndex(Index index, Index* inputIndex) const + { + eigen_assert(index < dimensions().TotalSize()); + *inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + if (idx != idx / m_fastStrides[i] * m_strides[i]) { + return false; + } + *inputIndex += idx / m_strides[i] * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + if (index != index / m_fastStrides[0] * m_strides[0]) { + return false; + } + *inputIndex += index / m_strides[0]; + return true; + } else { + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + if (idx != idx / m_fastStrides[i] * m_strides[i]) { + return false; + } + *inputIndex += idx / m_strides[i] * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + if (index != index / m_fastStrides[NumDims-1] * m_strides[NumDims-1]) { + return false; + } + *inputIndex += index / m_strides[NumDims - 1]; + } + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + Index inputIndex = 0; + if (getInputIndex(index, &inputIndex)) { + return m_impl.coeff(inputIndex); + } else { + return Scalar(0); + } + } + + // TODO(yangke): optimize this function so that we can detect and produce + // all-zero packets + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + 
const double compute_cost = NumDims * (3 * TensorOpCost::DivCost() + + 3 * TensorOpCost::MulCost() + + 2 * TensorOpCost::AddCost()); + const double input_size = m_impl.dimensions().TotalSize(); + const double output_size = m_dimensions.TotalSize(); + if (output_size == 0) + return TensorOpCost(); + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(sizeof(CoeffReturnType) * input_size / output_size, 0, + compute_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif + + protected: + Dimensions m_dimensions; + array m_outputStrides; + array m_inputStrides; + TensorEvaluator m_impl; + const Strides m_strides; + array, NumDims> m_fastStrides; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorInitializer.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorInitializer.h new file mode 100644 index 0000000..26a3818 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorInitializer.h @@ -0,0 +1,82 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H +#define EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H + +#if EIGEN_HAS_VARIADIC_TEMPLATES + +#include + +namespace Eigen { + +/** \class TensorInitializer + * \ingroup CXX11_Tensor_Module + * + * \brief Helper template to initialize Tensors from std::initializer_lists. 
+ */ +namespace internal { + +template +struct Initializer { + typedef std::initializer_list< + typename Initializer::InitList> InitList; + + static void run(TensorEvaluator& tensor, + Eigen::array::Index, traits::NumDimensions>* indices, + const InitList& vals) { + int i = 0; + for (const auto& v : vals) { + (*indices)[traits::NumDimensions - N] = i++; + Initializer::run(tensor, indices, v); + } + } +}; + +template +struct Initializer { + typedef std::initializer_list::Scalar> InitList; + + static void run(TensorEvaluator& tensor, + Eigen::array::Index, traits::NumDimensions>* indices, + const InitList& vals) { + int i = 0; + // There is likely a faster way to do that than iterating. + for (const auto& v : vals) { + (*indices)[traits::NumDimensions - 1] = i++; + tensor.coeffRef(*indices) = v; + } + } +}; + +template +struct Initializer { + typedef typename traits::Scalar InitList; + + static void run(TensorEvaluator& tensor, + Eigen::array::Index, traits::NumDimensions>*, + const InitList& v) { + tensor.coeffRef(0) = v; + } +}; + + +template +void initialize_tensor(TensorEvaluator& tensor, + const typename Initializer::NumDimensions>::InitList& vals) { + Eigen::array::Index, traits::NumDimensions> indices; + Initializer::NumDimensions>::run(tensor, &indices, vals); +} + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_HAS_VARIADIC_TEMPLATES + +#endif // EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorIntDiv.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorIntDiv.h new file mode 100644 index 0000000..6d5cce4 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorIntDiv.h @@ -0,0 +1,263 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H +#define EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H + + +namespace Eigen { + +/** \internal + * + * \class TensorIntDiv + * \ingroup CXX11_Tensor_Module + * + * \brief Fast integer division by a constant. + * + * See the paper from Granlund and Montgomery for explanation. + * (at https://doi.org/10.1145/773473.178249) + * + * \sa Tensor + */ + +namespace internal { + +namespace { + + // Note: result is undefined if val == 0 + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + typename internal::enable_if::type count_leading_zeros(const T val) + { +#ifdef EIGEN_GPU_COMPILE_PHASE + return __clz(val); +#elif defined(SYCL_DEVICE_ONLY) + return cl::sycl::clz(val); +#elif EIGEN_COMP_MSVC + unsigned long index; + _BitScanReverse(&index, val); + return 31 - index; +#else + EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE); + return __builtin_clz(static_cast(val)); +#endif + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + typename internal::enable_if::type count_leading_zeros(const T val) + { +#ifdef EIGEN_GPU_COMPILE_PHASE + return __clzll(val); +#elif defined(SYCL_DEVICE_ONLY) + return static_cast(cl::sycl::clz(val)); +#elif EIGEN_COMP_MSVC && EIGEN_ARCH_x86_64 + unsigned long index; + _BitScanReverse64(&index, val); + return 63 - index; +#elif EIGEN_COMP_MSVC + // MSVC's _BitScanReverse64 is not available for 32bits builds. 
+ unsigned int lo = (unsigned int)(val&0xffffffff); + unsigned int hi = (unsigned int)((val>>32)&0xffffffff); + int n; + if(hi==0) + n = 32 + count_leading_zeros(lo); + else + n = count_leading_zeros(hi); + return n; +#else + EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE); + return __builtin_clzll(static_cast(val)); +#endif + } + + template + struct UnsignedTraits { + typedef typename conditional::type type; + }; + + template + struct DividerTraits { + typedef typename UnsignedTraits::type type; + static const int N = sizeof(T) * 8; + }; + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) { +#if defined(EIGEN_GPU_COMPILE_PHASE) + return __umulhi(a, b); +#elif defined(SYCL_DEVICE_ONLY) + return cl::sycl::mul_hi(a, static_cast(b)); +#else + return (static_cast(a) * b) >> 32; +#endif + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) { +#if defined(EIGEN_GPU_COMPILE_PHASE) + return __umul64hi(a, b); +#elif defined(SYCL_DEVICE_ONLY) + return cl::sycl::mul_hi(a, static_cast(b)); +#elif EIGEN_HAS_BUILTIN_INT128 + __uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b); + return static_cast(v >> 64); +#else + return (TensorUInt128, uint64_t>(a) * TensorUInt128, uint64_t>(b)).upper(); +#endif + } + + template + struct DividerHelper { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t computeMultiplier(const int log_div, const T divider) { + EIGEN_STATIC_ASSERT(N == 32, YOU_MADE_A_PROGRAMMING_MISTAKE); + return static_cast((static_cast(1) << (N+log_div)) / divider - (static_cast(1) << N) + 1); + } + }; + + template + struct DividerHelper<64, T> { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) { +#if EIGEN_HAS_BUILTIN_INT128 && !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY) + return static_cast((static_cast<__uint128_t>(1) << (64+log_div)) / 
static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1); +#else + const uint64_t shift = 1ULL << log_div; + TensorUInt128 result = TensorUInt128 >(shift, 0) / TensorUInt128, uint64_t>(divider) + - TensorUInt128, static_val<0> >(1, 0) + + TensorUInt128, static_val<1> >(1); + return static_cast(result); +#endif + } + }; +} + + +template +struct TensorIntDivisor { + public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() { + multiplier = 0; + shift1 = 0; + shift2 = 0; + } + + // Must have 0 < divider < 2^31. This is relaxed to + // 0 < divider < 2^63 when using 64-bit indices on platforms that support + // the __uint128_t type. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor(const T divider) { + const int N = DividerTraits::N; + eigen_assert(static_cast::type>(divider) < NumTraits::highest()/2); + eigen_assert(divider > 0); + + // fast ln2 + const int leading_zeros = count_leading_zeros(static_cast(divider)); + int log_div = N - leading_zeros; + // if divider is a power of two then log_div is 1 more than it should be. + if ((static_cast::type>(1) << (log_div-1)) == static_cast::type>(divider)) + log_div--; + + multiplier = DividerHelper::computeMultiplier(log_div, divider); + shift1 = log_div > 1 ? 1 : log_div; + shift2 = log_div > 1 ? log_div-1 : 0; + } + + // Must have 0 <= numerator. On platforms that don't support the __uint128_t + // type numerator should also be less than 2^32-1. 
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T divide(const T numerator) const { + eigen_assert(static_cast::type>(numerator) < NumTraits::highest()/2); + //eigen_assert(numerator >= 0); // this is implicitly asserted by the line above + + UnsignedType t1 = muluh(multiplier, numerator); + UnsignedType t = (static_cast(numerator) - t1) >> shift1; + return (t1 + t) >> shift2; + } + + private: + typedef typename DividerTraits::type UnsignedType; + UnsignedType multiplier; + int32_t shift1; + int32_t shift2; +}; + + +// Optimized version for signed 32 bit integers. +// Derived from Hacker's Delight. +// Only works for divisors strictly greater than one +template <> +class TensorIntDivisor { + public: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() { + magic = 0; + shift = 0; + } + // Must have 2 <= divider + EIGEN_DEVICE_FUNC TensorIntDivisor(int32_t divider) { + eigen_assert(divider >= 2); + calcMagic(divider); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const { +#ifdef EIGEN_GPU_COMPILE_PHASE + return (__umulhi(magic, n) >> shift); +#elif defined(SYCL_DEVICE_ONLY) + return (cl::sycl::mul_hi(magic, static_cast(n)) >> shift); +#else + uint64_t v = static_cast(magic) * static_cast(n); + return (static_cast(v >> 32) >> shift); +#endif + } + +private: + // Compute the magic numbers. See Hacker's Delight section 10 for an in + // depth explanation. + EIGEN_DEVICE_FUNC void calcMagic(int32_t d) { + const unsigned two31 = 0x80000000; // 2**31. + unsigned ad = d; + unsigned t = two31 + (ad >> 31); + unsigned anc = t - 1 - t%ad; // Absolute value of nc. + int p = 31; // Init. p. + unsigned q1 = two31/anc; // Init. q1 = 2**p/|nc|. + unsigned r1 = two31 - q1*anc; // Init. r1 = rem(2**p, |nc|). + unsigned q2 = two31/ad; // Init. q2 = 2**p/|d|. + unsigned r2 = two31 - q2*ad; // Init. r2 = rem(2**p, |d|). + unsigned delta = 0; + do { + p = p + 1; + q1 = 2*q1; // Update q1 = 2**p/|nc|. + r1 = 2*r1; // Update r1 = rem(2**p, |nc|). 
+ if (r1 >= anc) { // (Must be an unsigned + q1 = q1 + 1; // comparison here). + r1 = r1 - anc;} + q2 = 2*q2; // Update q2 = 2**p/|d|. + r2 = 2*r2; // Update r2 = rem(2**p, |d|). + if (r2 >= ad) { // (Must be an unsigned + q2 = q2 + 1; // comparison here). + r2 = r2 - ad;} + delta = ad - r2; + } while (q1 < delta || (q1 == delta && r1 == 0)); + + magic = (unsigned)(q2 + 1); + shift = p - 32; + } + + uint32_t magic; + int32_t shift; +}; + + +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator / (const T& numerator, const TensorIntDivisor& divisor) { + return divisor.divide(numerator); +} + + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorLayoutSwap.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorLayoutSwap.h new file mode 100644 index 0000000..80106c1 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorLayoutSwap.h @@ -0,0 +1,216 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H +#define EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H + +namespace Eigen { + +/** \class TensorLayoutSwap + * \ingroup CXX11_Tensor_Module + * + * \brief Swap the layout from col-major to row-major, or row-major + * to col-major, and invert the order of the dimensions. + * + * Beware: the dimensions are reversed by this operation. If you want to + * preserve the ordering of the dimensions, you need to combine this + * operation with a shuffle. 
+ * + * \example: + * Tensor input(2, 4); + * Tensor output = input.swap_layout(); + * eigen_assert(output.dimension(0) == 4); + * eigen_assert(output.dimension(1) == 2); + * + * array shuffle(1, 0); + * output = input.swap_layout().shuffle(shuffle); + * eigen_assert(output.dimension(0) == 2); + * eigen_assert(output.dimension(1) == 4); + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = traits::NumDimensions; + static const int Layout = (traits::Layout == ColMajor) ? RowMajor : ColMajor; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorLayoutSwapOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorLayoutSwapOp type; +}; + +} // end namespace internal + + + +template +class TensorLayoutSwapOp : public TensorBase, WriteAccessors> +{ + public: + typedef TensorBase, WriteAccessors> Base; + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorLayoutSwapOp(const XprType& expr) + : m_xpr(expr) {} + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorLayoutSwapOp) + protected: + typename XprType::Nested m_xpr; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef 
TensorLayoutSwapOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? RowMajor : ColMajor, + CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator::RawAccess + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + for(int i = 0; i < NumDims; ++i) { + m_dimensions[i] = m_impl.dimensions()[NumDims-1-i]; + } + } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + return m_impl.evalSubExprsIfNeeded(data); + } + EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(index); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_impl.template packet(index); + } 
+ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized); + } + + EIGEN_DEVICE_FUNC typename Storage::Type data() const { + return constCast(m_impl.data()); + } + + const TensorEvaluator& impl() const { return m_impl; } + + protected: + TensorEvaluator m_impl; + Dimensions m_dimensions; +}; + + +// Eval as lvalue +template + struct TensorEvaluator, Device> + : public TensorEvaluator, Device> +{ + typedef TensorEvaluator, Device> Base; + typedef TensorLayoutSwapOp XprType; + + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? RowMajor : ColMajor, + CoordAccess = false // to be implemented + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return this->m_impl.coeffRef(index); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + this->m_impl.template writePacket(index, x); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorMacros.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorMacros.h new file mode 100644 index 0000000..73ff3d2 --- /dev/null +++ 
b/src/EigenUnsupported/CXX11/src/Tensor/TensorMacros.h @@ -0,0 +1,98 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_META_MACROS_H +#define EIGEN_CXX11_TENSOR_TENSOR_META_MACROS_H + + +/** use this macro in sfinae selection in templated functions + * + * template::value , int >::type = 0 + * > + * void foo(){} + * + * becomes => + * + * template::value ) + * > + * void foo(){} + */ + +// SFINAE requires variadic templates +#if !defined(EIGEN_GPUCC) +#if EIGEN_HAS_VARIADIC_TEMPLATES + // SFINAE doesn't work for gcc <= 4.7 + #ifdef EIGEN_COMP_GNUC + #if EIGEN_GNUC_AT_LEAST(4,8) + #define EIGEN_HAS_SFINAE + #endif + #else + #define EIGEN_HAS_SFINAE + #endif +#endif +#endif + +#define EIGEN_SFINAE_ENABLE_IF( __condition__ ) \ + typename internal::enable_if< ( __condition__ ) , int >::type = 0 + +// Define a macro to use a reference on the host but a value on the device +#if defined(SYCL_DEVICE_ONLY) + #define EIGEN_DEVICE_REF +#else + #define EIGEN_DEVICE_REF & +#endif + +// Define a macro for catching SYCL exceptions if exceptions are enabled +#define EIGEN_SYCL_TRY_CATCH(X) \ + do { \ + EIGEN_TRY {X;} \ + EIGEN_CATCH(const cl::sycl::exception& e) { \ + EIGEN_THROW_X(std::runtime_error("SYCL exception at " + \ + std::string(__FILE__) + ":" + \ + std::to_string(__LINE__) + "\n" + \ + e.what())); \ + } \ + } while (false) + +// Define a macro if local memory flags are unset or one of them is set +// Setting both flags is the same as unsetting them +#if (!defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)) || \ + (defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)) + #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON 1 + 
#define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF 1 +#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM) + #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON 1 +#elif !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM) + #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF 1 +#endif + +#if EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653) + #define EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ + using Base::operator =; \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) { Base::operator=(other); return *this; } \ + template \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const OtherDerived& other) { Base::operator=(other); return *this; } +#else + #define EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ + EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) +#endif + +/** \internal + * \brief Macro to manually inherit assignment operators. + * This is necessary, because the implicitly defined assignment operator gets deleted when a custom operator= is defined. + * This also inherits template operator=(const OtherDerived&) assignments. + * With C++11 or later this also default-implements the copy-constructor + */ +#define EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(Derived) \ + EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ + EIGEN_DEFAULT_COPY_CONSTRUCTOR(Derived) + +#endif diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorMap.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorMap.h new file mode 100644 index 0000000..6834c97 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorMap.h @@ -0,0 +1,327 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_MAP_H +#define EIGEN_CXX11_TENSOR_TENSOR_MAP_H + +namespace Eigen { + +// FIXME use proper doxygen documentation (e.g. \tparam MakePointer_) + +/** \class TensorMap + * \ingroup CXX11_Tensor_Module + * + * \brief A tensor expression mapping an existing array of data. + * + */ +/// `template class MakePointer_` is added to convert the host pointer to the device pointer. +/// It is added due to the fact that for our device compiler `T*` is not allowed. +/// If we wanted to use the same Evaluator functions we have to convert that type to our pointer `T`. +/// This is done through our `MakePointer_` class. By default the Type in the `MakePointer_` is `T*` . +/// Therefore, by adding the default value, we managed to convert the type and it does not break any +/// existing code as its default value is `T*`. +template class MakePointer_> class TensorMap : public TensorBase > +{ + public: + typedef TensorMap Self; + typedef TensorBase > Base; + #ifdef EIGEN_USE_SYCL + typedef typename Eigen::internal::remove_reference::type>::type Nested; + #else + typedef typename Eigen::internal::nested::type Nested; + #endif + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + typedef typename internal::traits::Scalar Scalar; + typedef typename NumTraits::Real RealScalar; + typedef typename PlainObjectType::Base::CoeffReturnType CoeffReturnType; + + typedef typename MakePointer_::Type PointerType; + typedef typename MakePointer_::ConstType PointerConstType; + + // WARN: PointerType still can be a pointer to const (const Scalar*), for + // example in TensorMap> expression. This type of + // expression should be illegal, but adding this restriction is not possible + // in practice (see https://bitbucket.org/eigen/eigen/pull-requests/488). 
+ typedef typename internal::conditional< + bool(internal::is_lvalue::value), + PointerType, // use simple pointer in lvalue expressions + PointerConstType // use const pointer in rvalue expressions + >::type StoragePointerType; + + // If TensorMap was constructed over rvalue expression (e.g. const Tensor), + // we should return a reference to const from operator() (and others), even + // if TensorMap itself is not const. + typedef typename internal::conditional< + bool(internal::is_lvalue::value), + Scalar&, + const Scalar& + >::type StorageRefType; + + static const int Options = Options_; + + static const Index NumIndices = PlainObjectType::NumIndices; + typedef typename PlainObjectType::Dimensions Dimensions; + + enum { + IsAligned = ((int(Options_)&Aligned)==Aligned), + Layout = PlainObjectType::Layout, + CoordAccess = true, + RawAccess = true + }; + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr) : m_data(dataPtr), m_dimensions() { + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT((0 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) + } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) { + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. + EIGEN_STATIC_ASSERT((sizeof...(otherDimensions) + 1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) + } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) { + // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. 
+ EIGEN_STATIC_ASSERT((1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2) : m_data(dataPtr), m_dimensions(dim1, dim2) { + EIGEN_STATIC_ASSERT(2 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3) { + EIGEN_STATIC_ASSERT(3 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4) { + EIGEN_STATIC_ASSERT(4 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4, dim5) { + EIGEN_STATIC_ASSERT(5 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) + } +#endif + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, const array& dimensions) + : m_data(dataPtr), m_dimensions(dimensions) + { } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, const Dimensions& dimensions) + : m_data(dataPtr), m_dimensions(dimensions) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PlainObjectType& tensor) + : m_data(tensor.data()), m_dimensions(tensor.dimensions()) + { } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index rank() const { return m_dimensions.rank(); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_dimensions[n]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + 
EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE StoragePointerType data() { return m_data; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE StoragePointerType data() const { return m_data; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE StorageRefType operator()(const array& indices) const + { + // eigen_assert(checkIndexRange(indices)); + if (PlainObjectType::Options&RowMajor) { + const Index index = m_dimensions.IndexOfRowMajor(indices); + return m_data[index]; + } else { + const Index index = m_dimensions.IndexOfColMajor(indices); + return m_data[index]; + } + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE StorageRefType operator()() const + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE) + return m_data[0]; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE StorageRefType operator()(Index index) const + { + eigen_internal_assert(index >= 0 && index < size()); + return m_data[index]; + } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE StorageRefType operator()(Index firstIndex, Index secondIndex, IndexTypes... 
otherIndices) const + { + EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(internal::all((Eigen::NumTraits::highest() >= otherIndices)...)); + if (PlainObjectType::Options&RowMajor) { + const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, secondIndex, otherIndices...}}); + return m_data[index]; + } else { + const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, secondIndex, otherIndices...}}); + return m_data[index]; + } + } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1) const + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i1 + i0 * m_dimensions[1]; + return m_data[index]; + } else { + const Index index = i0 + i1 * m_dimensions[0]; + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2) const + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2); + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3) const + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * i3)); + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0))); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * 
(i1 + m_dimensions[1] * (i2 + m_dimensions[2] * (i3 + m_dimensions[3] * i4))); + return m_data[index]; + } + } +#endif + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE StorageRefType operator()(const array& indices) + { + // eigen_assert(checkIndexRange(indices)); + if (PlainObjectType::Options&RowMajor) { + const Index index = m_dimensions.IndexOfRowMajor(indices); + return m_data[index]; + } else { + const Index index = m_dimensions.IndexOfColMajor(indices); + return m_data[index]; + } + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE StorageRefType operator()() + { + EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE) + return m_data[0]; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE StorageRefType operator()(Index index) + { + eigen_internal_assert(index >= 0 && index < size()); + return m_data[index]; + } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE StorageRefType operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) + { + static_assert(sizeof...(otherIndices) + 2 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + eigen_assert(internal::all((Eigen::NumTraits::highest() >= otherIndices)...)); + const std::size_t NumDims = sizeof...(otherIndices) + 2; + if (PlainObjectType::Options&RowMajor) { + const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, secondIndex, otherIndices...}}); + return m_data[index]; + } else { + const Index index = m_dimensions.IndexOfColMajor(array{{firstIndex, secondIndex, otherIndices...}}); + return m_data[index]; + } + } +#else + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1) + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i1 + i0 * m_dimensions[1]; + return m_data[index]; + } else { + const Index index = i0 + i1 * m_dimensions[0]; + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + 
EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2) + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2); + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3) + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * i3)); + return m_data[index]; + } + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3, Index i4) + { + if (PlainObjectType::Options&RowMajor) { + const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0))); + return m_data[index]; + } else { + const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * (i3 + m_dimensions[3] * i4))); + return m_data[index]; + } + } +#endif + + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorMap) + + private: + StoragePointerType m_data; + Dimensions m_dimensions; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_MAP_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorMeta.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorMeta.h new file mode 100644 index 0000000..a6181d3 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorMeta.h @@ -0,0 +1,311 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_META_H +#define EIGEN_CXX11_TENSOR_TENSOR_META_H + +namespace Eigen { + +template struct Cond {}; + +template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +const T1& choose(Cond, const T1& first, const T2&) { + return first; +} + +template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +const T2& choose(Cond, const T1&, const T2& second) { + return second; +} + + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T divup(const X x, const Y y) { + return static_cast((x + y - 1) / y); +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T divup(const T x, const T y) { + return static_cast((x + y - 1) / y); +} + +template struct max_n_1 { + static const size_t size = n; +}; +template <> struct max_n_1<0> { + static const size_t size = 1; +}; + + +// Default packet types +template +struct PacketType : internal::packet_traits { + typedef typename internal::packet_traits::type type; +}; + +// For CUDA packet types when using a GpuDevice +#if defined(EIGEN_USE_GPU) && defined(EIGEN_HAS_GPU_FP16) + +typedef ulonglong2 Packet4h2; +template<> +struct PacketType { + typedef Packet4h2 type; + static const int size = 8; + enum { + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 0, + HasSetLinear = 0, + HasBlend = 0, + + HasDiv = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasExp = 1, + HasExpm1 = 0, + HasLog = 1, + HasLog1p = 0, + HasLog10 = 0, + HasPow = 1, + }; +}; +#endif + +#if defined(EIGEN_USE_SYCL) + +namespace TensorSycl { +namespace internal { + +template struct PlusOp { + static constexpr Index Value = A + B; +}; + +template struct DivOp { + static constexpr Index Value = A / B; +}; + +template class StepOp> +struct static_for { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void loop(UnaryOperator op) { + op(start); + static_for::Value, end, 
step, + StepOp>::loop(op); + } +}; +template class StepOp> +struct static_for { + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void loop(UnaryOperator) {} +}; + +template +struct Vectorise { + static const int PacketSize = 1; + typedef OutScalar PacketReturnType; +}; + +template +struct Vectorise { + static const int PacketSize = Eigen::PacketType::size; + typedef typename Eigen::PacketType::type PacketReturnType; +}; + +static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index roundUp(Index x, Index y) { + return ((((x) + (y)-1) / (y)) * (y)); +} + +} // namespace internal +} // namespace TensorSycl + +template <> + struct PacketType { + typedef half type; + static const int size = 1; + enum { + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasArg = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSetLinear = 0, + HasBlend = 0 + }; +}; +template +struct PacketType : internal::default_packet_traits { + typedef Scalar type; + typedef Scalar half; + enum { + Vectorizable = 0, + size = 1, + AlignedOnScalar = 0, + HasHalfPacket = 0 + }; + enum { + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSetLinear = 0 + }; + +}; + +template +struct PacketType : PacketType{}; + +#ifndef EIGEN_DONT_VECTORIZE_SYCL +#define PACKET_TYPE(CVQual, Type, val, lengths, DEV)\ +template<> struct PacketType : internal::sycl_packet_traits \ +{\ + typedef typename internal::packet_traits::type type;\ + typedef typename internal::packet_traits::half half;\ +}; + + +PACKET_TYPE(const, float, 1, 4, SyclDevice) +PACKET_TYPE(, float, 1, 4, SyclDevice) +PACKET_TYPE(const, float, 1, 4, const SyclDevice) +PACKET_TYPE(, float, 1, 4, const SyclDevice) + +PACKET_TYPE(const, double, 0, 2, SyclDevice) +PACKET_TYPE(, double, 0, 2, SyclDevice) +PACKET_TYPE(const, double, 0, 2, const SyclDevice) +PACKET_TYPE(, double, 0, 2, const SyclDevice) +#undef PACKET_TYPE + +template<> 
struct PacketType: PacketType{}; +template<> struct PacketType: PacketType{}; +#endif +#endif + +// Tuple mimics std::pair but works on e.g. nvcc. +template struct Tuple { + public: + U first; + V second; + + typedef U first_type; + typedef V second_type; + + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Tuple() : first(), second() {} + + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Tuple(const U& f, const V& s) : first(f), second(s) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void swap(Tuple& rhs) { + using numext::swap; + swap(first, rhs.first); + swap(second, rhs.second); + } +}; + +template +EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +bool operator==(const Tuple& x, const Tuple& y) { + return (x.first == y.first && x.second == y.second); +} + +template +EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +bool operator!=(const Tuple& x, const Tuple& y) { + return !(x == y); +} + + +// Can't use std::pairs on cuda devices +template struct IndexPair { + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) {} + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Idx f, Idx s) : first(f), second(s) {} + + EIGEN_DEVICE_FUNC void set(IndexPair val) { + first = val.first; + second = val.second; + } + + Idx first; + Idx second; +}; + + +#ifdef EIGEN_HAS_SFINAE +namespace internal { + + template + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + array customIndices2Array(IndexType& idx, numeric_list) { + return { idx[Is]... 
}; + } + template + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + array customIndices2Array(IndexType&, numeric_list) { + return array(); + } + + /** Make an array (for index/dimensions) out of a custom index */ + template + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + array customIndices2Array(IndexType& idx) { + return customIndices2Array(idx, typename gen_numeric_list::type{}); + } + + + template + struct is_base_of + { + + typedef char (&yes)[1]; + typedef char (&no)[2]; + + template + struct Host + { + operator BB*() const; + operator DD*(); + }; + + template + static yes check(D*, T); + static no check(B*, int); + + static const bool value = sizeof(check(Host(), int())) == sizeof(yes); + }; + +} +#endif + + + +} // namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_META_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorMorphing.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorMorphing.h new file mode 100644 index 0000000..b3f00f7 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorMorphing.h @@ -0,0 +1,1102 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H +#define EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H + +namespace Eigen { + +/** \class TensorReshaping + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reshaping class. 
+ * + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = array_size::value; + static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorReshapingOpEIGEN_DEVICE_REF type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorReshapingOp type; +}; + +} // end namespace internal + + + +template +class TensorReshapingOp : public TensorBase, WriteAccessors> +{ + public: + typedef TensorBase, WriteAccessors> Base; + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReshapingOp(const XprType& expr, const NewDimensions& dims) + : m_xpr(expr), m_dims(dims) {} + + EIGEN_DEVICE_FUNC + const NewDimensions& dimensions() const { return m_dims; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorReshapingOp) + + protected: + typename XprType::Nested m_xpr; + const NewDimensions m_dims; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorReshapingOp XprType; + typedef NewDimensions Dimensions; + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + 
typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + typedef StorageMemory::type, Device> ConstCastStorage; + + static const int NumOutputDims = internal::array_size::value; + static const int NumInputDims = internal::array_size::Dimensions>::value; + + enum ReshapingKind { + // We do not use layout information to determine reshaping kind. + // Depending on the layout `N` can be inner or outer dimension. + OneByN = 0, // expr.reshape(1, N) + NByOne = 1, // expr.reshape(N, 1) + Runtime = 2 // Reshape dimensions are dynamic (specified at runtime). + }; + + // clang-format off + static const ReshapingKind kind = +#if defined(EIGEN_HAS_INDEX_LIST) + (NumOutputDims == 2 && internal::index_statically_eq(/*index=*/0, /*value=*/1)) ? OneByN + : (NumOutputDims == 2 && internal::index_statically_eq(/*index=*/1, /*value=*/1)) ? NByOne + : Runtime; +#else + Runtime; +#endif + // clang-format on + + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + // For trivial reshapes with raw access to underlying data we will provide + // zero overhead block access. + // TODO(ezhulenev): Consider adding block access without raw access? 
+ BlockAccess = TensorEvaluator::RawAccess && + NumInputDims > 0 && NumOutputDims > 0, + PreferBlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator::RawAccess + }; + + typedef typename internal::remove_const::type ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef + typename internal::TensorMaterializedBlock + TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_dimensions(op.dimensions()) + { + // The total size of the reshaped tensor must be equal to the total size + // of the input tensor. + eigen_assert(internal::array_prod(m_impl.dimensions()) == internal::array_prod(op.dimensions())); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType data, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(data, std::move(done)); + } +#endif + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + return m_impl.evalSubExprsIfNeeded(data); + } + EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(index); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_impl.template packet(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + 
internal::TensorBlockResourceRequirements getResourceRequirements() const { + return internal::TensorBlockResourceRequirements::any(); + } + + // required in block(OutputTensorBlock* output_block) const + // For C++03 compatibility this must be defined outside the method + struct BlockIteratorState { + Index stride; + Index span; + Index size; + Index count; + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + eigen_assert(m_impl.data() != NULL); + eigen_assert((kind == Runtime) || + (kind == OneByN && desc.dimensions()[0] == 1) || + (kind == NByOne && desc.dimensions()[1] == 1)); + + if (kind == OneByN || kind == NByOne) { + // We can guarantee at compile time that block is just a contiguous slice + // of the underlying expression memory buffer. + return TensorBlock(internal::TensorBlockKind::kView, + m_impl.data() + desc.offset(), desc.dimensions()); + } else { + // This will do additional runtime checks, and in the end it might be also + // a view, or it might be a block materialized in the temporary buffer. 
+ return TensorBlock::materialize(m_impl.data(), m_dimensions, desc, + scratch); + } + } + + EIGEN_DEVICE_FUNC typename Storage::Type data() const { + return constCast(m_impl.data()); + } + + EIGEN_DEVICE_FUNC const TensorEvaluator& impl() const { return m_impl; } + + #ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } + #endif + protected: + TensorEvaluator m_impl; + NewDimensions m_dimensions; +}; + + +// Eval as lvalue +template + struct TensorEvaluator, Device> + : public TensorEvaluator, Device> + +{ + typedef TensorEvaluator, Device> Base; + typedef TensorReshapingOp XprType; + typedef NewDimensions Dimensions; + + enum { + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::RawAccess, + PreferBlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator::RawAccess + }; + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor + TensorBlockDesc; + //===--------------------------------------------------------------------===// + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return this->m_impl.coeffRef(index); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + this->m_impl.template writePacket(index, x); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const TensorBlockDesc& desc, 
const TensorBlock& block) { + assert(this->m_impl.data() != NULL); + + typedef typename TensorBlock::XprType TensorBlockExpr; + typedef internal::TensorBlockAssignment< + Scalar, TensorEvaluator::NumOutputDims, TensorBlockExpr, Index> + TensorBlockAssign; + + TensorBlockAssign::Run( + TensorBlockAssign::target(desc.dimensions(), + internal::strides(this->dimensions()), + this->m_impl.data(), desc.offset()), + block.expr()); + } +}; + + +/** \class TensorSlicing + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor slicing class. + * + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = array_size::value; + static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorSlicingOpEIGEN_DEVICE_REF type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorSlicingOp type; +}; + +} // end namespace internal + + + +template +class TensorSlicingOp : public TensorBase > +{ + public: + typedef TensorBase > Base; + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSlicingOp(const XprType& expr, const StartIndices& indices, const Sizes& sizes) + : m_xpr(expr), m_indices(indices), m_sizes(sizes) {} + + EIGEN_DEVICE_FUNC + const StartIndices& startIndices() const { return m_indices; } + EIGEN_DEVICE_FUNC + const Sizes& sizes() const { return 
m_sizes; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorSlicingOp) + + protected: + typename XprType::Nested m_xpr; + const StartIndices m_indices; + const Sizes m_sizes; +}; + + +// Fixme: figure out the exact threshold +namespace { +template struct MemcpyTriggerForSlicing { + EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const Device& device) : threshold_(2 * device.numThreads()) { } + EIGEN_DEVICE_FUNC bool operator ()(Index total, Index contiguous) const { + const bool prefer_block_evaluation = BlockAccess && total > 32*1024; + return !prefer_block_evaluation && contiguous > threshold_; + } + + private: + Index threshold_; +}; + +// It is very expensive to start the memcpy kernel on GPU: we therefore only +// use it for large copies. +#ifdef EIGEN_USE_GPU +template struct MemcpyTriggerForSlicing { + EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const GpuDevice&) { } + EIGEN_DEVICE_FUNC bool operator ()(Index, Index contiguous) const { return contiguous > 4*1024*1024; } +}; +#endif + +// It is very expensive to start the memcpy kernel on GPU: we therefore only +// use it for large copies. 
+#ifdef EIGEN_USE_SYCL +template struct MemcpyTriggerForSlicing { + EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const SyclDevice&) { } + EIGEN_DEVICE_FUNC bool operator ()(Index, Index contiguous) const { return contiguous > 4*1024*1024; } +}; +#endif + +} + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorSlicingOp XprType; + static const int NumDims = internal::array_size::value; + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef Sizes Dimensions; + typedef StorageMemory Storage; + typedef StorageMemory::type, Device> ConstCastStorage; + typedef typename Storage::Type EvaluatorPointerType; + + enum { + // Alignment can't be guaranteed at compile time since it depends on the + // slice offsets and sizes. + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess && + // FIXME: Temporary workaround for bug in slicing of bool tensors. + !internal::is_same::type, bool>::value, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false + }; + + typedef typename internal::remove_const::type ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + // Tensor slicing does not change the block type. 
+ typedef typename TensorEvaluator::TensorBlock + TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) + { + m_is_identity = true; + for (int i = 0; i < internal::array_size::value; ++i) { + eigen_assert(m_impl.dimensions()[i] >= + op.sizes()[i] + op.startIndices()[i]); + if (m_impl.dimensions()[i] != op.sizes()[i] || + op.startIndices()[i] != 0) { + m_is_identity = false; + } + } + + // No strides for scalars. + if (NumDims == 0) return; + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + const Sizes& output_dims = op.sizes(); + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + } + + // Don't initialize m_fastOutputStrides[0] since it won't ever be accessed. + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1); + } + } else { + m_inputStrides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + } + + // Don't initialize m_fastOutputStrides[NumDims-1] since it won't ever be accessed. + m_outputStrides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i] > 0 ? 
m_outputStrides[i] : 1); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + m_impl.evalSubExprsIfNeeded(NULL); + if (!NumTraits::type>::RequireInitialization + && data && m_impl.data()) { + Index contiguous_values = 1; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < NumDims; ++i) { + contiguous_values *= dimensions()[i]; + if (dimensions()[i] != m_impl.dimensions()[i]) { + break; + } + } + } else { + for (int i = NumDims-1; i >= 0; --i) { + contiguous_values *= dimensions()[i]; + if (dimensions()[i] != m_impl.dimensions()[i]) { + break; + } + } + } + // Use memcpy if it's going to be faster than using the regular evaluation. + const MemcpyTriggerForSlicing trigger(m_device); + if (trigger(internal::array_prod(dimensions()), contiguous_values)) { + EvaluatorPointerType src = (EvaluatorPointerType)m_impl.data(); + for (Index i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) { + Index offset = srcCoeff(i); + m_device.memcpy((void*)(m_device.get(data + i)), m_device.get(src+offset), contiguous_values * sizeof(Scalar)); + } + return false; + } + } + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType /*data*/, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + if (m_is_identity) { + return m_impl.coeff(index); + } else { + return m_impl.coeff(srcCoeff(index)); + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const int packetSize = PacketType::size; + EIGEN_STATIC_ASSERT((packetSize > 1), 
YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+packetSize-1 < internal::array_prod(dimensions())); + + if (m_is_identity) { + return m_impl.template packet(index); + } + + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + packetSize - 1}; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_fastOutputStrides[i]; + const Index idx1 = indices[1] / m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; + inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + m_offsets[0]); + inputIndices[1] += (indices[1] + m_offsets[0]); + } else { + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / m_fastOutputStrides[i]; + const Index idx1 = indices[1] / m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i]; + inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + m_offsets[NumDims-1]); + inputIndices[1] += (indices[1] + m_offsets[NumDims-1]); + } + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + PacketReturnType rslt = m_impl.template packet(inputIndices[0]); + return rslt; + } + else { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[packetSize]; + values[0] = m_impl.coeff(inputIndices[0]); + values[packetSize-1] = m_impl.coeff(inputIndices[1]); + EIGEN_UNROLL_LOOP + for (int i = 1; i < packetSize-1; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, 
m_is_identity ? 1 : NumDims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + const size_t target_size = m_device.lastLevelCacheSize(); + return internal::TensorBlockResourceRequirements::merge( + internal::TensorBlockResourceRequirements::skewed(target_size), + m_impl.getResourceRequirements()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + TensorBlockDesc arg_desc = desc.WithOffset(srcCoeff(desc.offset())); + TensorBlock block = m_impl.block(arg_desc, scratch); + if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer(); + return block; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const { + typename Storage::Type result = constCast(m_impl.data()); + if (result) { + Index offset = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < NumDims; ++i) { + if (m_dimensions[i] != m_impl.dimensions()[i]) { + offset += m_offsets[i] * m_inputStrides[i]; + for (int j = i+1; j < NumDims; ++j) { + if (m_dimensions[j] > 1) { + return NULL; + } + offset += m_offsets[j] * m_inputStrides[j]; + } + break; + } + } + } else { + for (int i = NumDims - 1; i >= 0; --i) { + if (m_dimensions[i] != m_impl.dimensions()[i]) { + offset += m_offsets[i] * m_inputStrides[i]; + for (int j = i-1; j >= 0; --j) { + if (m_dimensions[j] > 1) { + return NULL; + } + offset += m_offsets[j] * m_inputStrides[j]; + } + break; + } + } + } + return result + offset; + } + return NULL; + } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const + { + Index inputIndex = 0; + if (static_cast(Layout) == 
static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += (index + m_offsets[0]); + } else { + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += (index + m_offsets[NumDims-1]); + } + return inputIndex; + } + + array m_outputStrides; + array, NumDims> m_fastOutputStrides; + array m_inputStrides; + TensorEvaluator m_impl; + const Device EIGEN_DEVICE_REF m_device; + Dimensions m_dimensions; + bool m_is_identity; + const StartIndices m_offsets; +}; + + +// Eval as lvalue +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> +{ + typedef TensorEvaluator, Device> Base; + typedef TensorSlicingOp XprType; + static const int NumDims = internal::array_size::value; + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef Sizes Dimensions; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::BlockAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = (NumDims == 1) & TensorEvaluator::RawAccess + }; + + typedef typename internal::remove_const::type ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& 
device) + : Base(op, device) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + if (this->m_is_identity) { + return this->m_impl.coeffRef(index); + } else { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + if (this->m_is_identity) { + this->m_impl.template writePacket(index, x); + return; + } + + const int packetSize = PacketType::size; + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + packetSize - 1}; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; + const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i]; + inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + this->m_offsets[0]); + inputIndices[1] += (indices[1] + this->m_offsets[0]); + } else { + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; + const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; + inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i]; + inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += (indices[0] + this->m_offsets[NumDims-1]); + inputIndices[1] += (indices[1] + this->m_offsets[NumDims-1]); + } + if (inputIndices[1] - inputIndices[0] == packetSize - 1) { + this->m_impl.template writePacket(inputIndices[0], x); + } + else { + EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; + 
internal::pstore(values, x); + this->m_impl.coeffRef(inputIndices[0]) = values[0]; + this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1]; + EIGEN_UNROLL_LOOP + for (int i = 1; i < packetSize-1; ++i) { + this->coeffRef(index+i) = values[i]; + } + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const TensorBlockDesc& desc, const TensorBlock& block) { + TensorBlockDesc arg_desc = desc.WithOffset(this->srcCoeff(desc.offset())); + this->m_impl.writeBlock(arg_desc, block); + } +}; + +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = array_size::value; + static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorStridingSlicingOpEIGEN_DEVICE_REF type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorStridingSlicingOp type; +}; + +} // end namespace internal + + +template +class TensorStridingSlicingOp : public TensorBase > +{ + public: + typedef TensorBase > Base; + typedef typename internal::traits::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingSlicingOp( + const XprType& expr, const StartIndices& startIndices, + const StopIndices& stopIndices, const Strides& strides) + : m_xpr(expr), m_startIndices(startIndices), m_stopIndices(stopIndices), + m_strides(strides) {} + + EIGEN_DEVICE_FUNC + const StartIndices& startIndices() const { return 
m_startIndices; } + // stopIndices() and strides() are typed after the members they expose, so StopIndices/Strides may differ from StartIndices without the returned reference binding to a converted temporary. + EIGEN_DEVICE_FUNC + const StopIndices& stopIndices() const { return m_stopIndices; } + EIGEN_DEVICE_FUNC + const Strides& strides() const { return m_strides; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorStridingSlicingOp) + + protected: + typename XprType::Nested m_xpr; + const StartIndices m_startIndices; + const StopIndices m_stopIndices; + const Strides m_strides; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorStridingSlicingOp XprType; + static const int NumDims = internal::array_size::value; + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + typedef Strides Dimensions; + + enum { + // Alignment can't be guaranteed at compile time since it depends on the + // slice offsets and sizes. 
+ IsAligned = false, + PacketAccess = false, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator::Layout, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), + m_device(device), + m_strides(op.strides()) + { + // Handle degenerate intervals by gracefully clamping and allowing m_dimensions to be zero + DSizes startIndicesClamped, stopIndicesClamped; + for (ptrdiff_t i = 0; i < internal::array_size::value; ++i) { + eigen_assert(m_strides[i] != 0 && "0 stride is invalid"); + if (m_strides[i] > 0) { + startIndicesClamped[i] = + clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]); + stopIndicesClamped[i] = + clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]); + } else { + /* implies m_strides[i] < 0 by assert */ + startIndicesClamped[i] = + clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1); + stopIndicesClamped[i] = + clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1); + } + m_startIndices[i] = startIndicesClamped[i]; + } + + typedef typename TensorEvaluator::Dimensions InputDimensions; + const InputDimensions& input_dims = m_impl.dimensions(); + + // compute output tensor shape + m_is_identity = true; + for (int i = 0; i < NumDims; i++) { + Index interval = stopIndicesClamped[i] - startIndicesClamped[i]; + if (interval == 0 || ((interval < 0) != (m_strides[i] < 0))) { + m_dimensions[i] = 0; + } else { + m_dimensions[i] = + (interval / m_strides[i]) + (interval % m_strides[i] != 0 ? 
1 : 0); + eigen_assert(m_dimensions[i] >= 0); + } + if (m_strides[i] != 1 || interval != m_impl.dimensions()[i]) { + m_is_identity = false; + } + } + + Strides output_dims = m_dimensions; + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputStrides[0] = m_strides[0]; + m_offsets[0] = startIndicesClamped[0]; + Index previousDimProduct = 1; + for (int i = 1; i < NumDims; ++i) { + previousDimProduct *= input_dims[i-1]; + m_inputStrides[i] = previousDimProduct * m_strides[i]; + m_offsets[i] = startIndicesClamped[i] * previousDimProduct; + } + + // Don't initialize m_fastOutputStrides[0] since it won't ever be accessed. + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1); + } + } else { + m_inputStrides[NumDims-1] = m_strides[NumDims-1]; + m_offsets[NumDims-1] = startIndicesClamped[NumDims-1]; + Index previousDimProduct = 1; + for (int i = NumDims - 2; i >= 0; --i) { + previousDimProduct *= input_dims[i+1]; + m_inputStrides[i] = previousDimProduct * m_strides[i]; + m_offsets[i] = startIndicesClamped[i] * previousDimProduct; + } + + m_outputStrides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i] > 0 ? 
m_outputStrides[i] : 1); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + if (m_is_identity) { + return m_impl.coeff(index); + } else { + return m_impl.coeff(srcCoeff(index)); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, m_is_identity ? 1 : NumDims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const { + return NULL; + } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const + { + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i >= 0; --i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += idx * m_inputStrides[i] + m_offsets[i]; + index -= idx * m_outputStrides[i]; + } + } else { + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims; ++i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += idx * m_inputStrides[i] + m_offsets[i]; + index -= idx * m_outputStrides[i]; + } + } + return inputIndex; + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) { +#ifndef SYCL_DEVICE_ONLY + return numext::maxi(min, numext::mini(max,value)); +#else + return cl::sycl::clamp(value, min, max); +#endif + } + + array m_outputStrides; + array, NumDims> m_fastOutputStrides; + array m_inputStrides; + bool 
m_is_identity; + TensorEvaluator m_impl; + const Device EIGEN_DEVICE_REF m_device; + DSizes m_startIndices; // clamped startIndices + DSizes m_dimensions; + DSizes m_offsets; // offset in a flattened shape + const Strides m_strides; +}; + +// Eval as lvalue +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> +{ + typedef TensorEvaluator, Device> Base; + typedef TensorStridingSlicingOp XprType; + static const int NumDims = internal::array_size::value; + + enum { + IsAligned = false, + PacketAccess = false, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = TensorEvaluator::CoordAccess, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef Strides Dimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + if (this->m_is_identity) { + return this->m_impl.coeffRef(index); + } else { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + } +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorPadding.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorPadding.h new file mode 100644 index 0000000..ee44382 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorPadding.h @@ -0,0 +1,708 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_PADDING_H +#define EIGEN_CXX11_TENSOR_TENSOR_PADDING_H + +namespace Eigen { + +/** \class TensorPadding + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor padding class. + * At the moment only padding with a constant value is supported. + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorPaddingOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorPaddingOp type; +}; + +} // end namespace internal + + + +template +class TensorPaddingOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPaddingOp(const XprType& expr, const PaddingDimensions& padding_dims, const Scalar padding_value) + : m_xpr(expr), m_padding_dims(padding_dims), m_padding_value(padding_value) {} + + EIGEN_DEVICE_FUNC + const PaddingDimensions& padding() const { return 
m_padding_dims; } + EIGEN_DEVICE_FUNC + Scalar padding_value() const { return m_padding_value; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const PaddingDimensions m_padding_dims; + const Scalar m_padding_value; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorPaddingOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + enum { + IsAligned = true, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = TensorEvaluator::RawAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = true, + RawAccess = false + }; + + typedef typename internal::remove_const::type ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock + TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value()), m_device(device) + { + // The padding op doesn't change the rank of the tensor. Directly padding a scalar would lead + // to a vector, which doesn't make sense. Instead one should reshape the scalar into a vector + // of 1 element first and then pad. 
+ EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + + // Compute dimensions + m_dimensions = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + m_dimensions[i] += m_padding[i].first + m_padding[i].second; + } + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputStrides[0] = 1; + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } + m_outputStrides[NumDims] = m_outputStrides[NumDims-1] * m_dimensions[NumDims-1]; + } else { + m_inputStrides[NumDims - 1] = 1; + m_outputStrides[NumDims] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + m_outputStrides[i+1] = m_outputStrides[i+2] * m_dimensions[i+1]; + } + m_outputStrides[0] = m_outputStrides[1] * m_dimensions[0]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + eigen_assert(index < dimensions().TotalSize()); + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + if (isPaddingAtIndexForDim(idx, i)) { + return m_paddingValue; + } + inputIndex += (idx - m_padding[i].first) * 
m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + if (isPaddingAtIndexForDim(index, 0)) { + return m_paddingValue; + } + inputIndex += (index - m_padding[0].first); + } else { + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i+1]; + if (isPaddingAtIndexForDim(idx, i)) { + return m_paddingValue; + } + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i+1]; + } + if (isPaddingAtIndexForDim(index, NumDims-1)) { + return m_paddingValue; + } + inputIndex += (index - m_padding[NumDims-1].first); + } + return m_impl.coeff(inputIndex); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + if (static_cast(Layout) == static_cast(ColMajor)) { + return packetColMajor(index); + } + return packetRowMajor(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + TensorOpCost cost = m_impl.costPerCoeff(vectorized); + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims; ++i) + updateCostPerDimension(cost, i, i == 0); + } else { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i >= 0; --i) + updateCostPerDimension(cost, i, i == NumDims - 1); + } + return cost; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + const size_t target_size = m_device.lastLevelCacheSize(); + return internal::TensorBlockResourceRequirements::merge( + internal::TensorBlockResourceRequirements::skewed(target_size), + m_impl.getResourceRequirements()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + // If one of the dimensions is zero, return empty block view. 
+ if (desc.size() == 0) { + return TensorBlock(internal::TensorBlockKind::kView, NULL, + desc.dimensions()); + } + + static const bool IsColMajor = Layout == static_cast(ColMajor); + const int inner_dim_idx = IsColMajor ? 0 : NumDims - 1; + + Index offset = desc.offset(); + + // Compute offsets in the output tensor corresponding to the desc.offset(). + DSizes output_offsets; + for (int i = NumDims - 1; i > 0; --i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + const int stride_dim = IsColMajor ? dim : dim + 1; + output_offsets[dim] = offset / m_outputStrides[stride_dim]; + offset -= output_offsets[dim] * m_outputStrides[stride_dim]; + } + output_offsets[inner_dim_idx] = offset; + + // Offsets in the input corresponding to output offsets. + DSizes input_offsets = output_offsets; + for (int i = 0; i < NumDims; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + input_offsets[dim] = input_offsets[dim] - m_padding[dim].first; + } + + // Compute offset in the input buffer (at this point it might be illegal and + // point outside of the input buffer, because we don't check for negative + // offsets, it will be autocorrected in the block iteration loop below). + Index input_offset = 0; + for (int i = 0; i < NumDims; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + input_offset += input_offsets[dim] * m_inputStrides[dim]; + } + + // Destination buffer and scratch buffer both indexed from 0 and have the + // same dimensions as the requested block (for destination buffer this + // property is guaranteed by `desc.destination()`). + Index output_offset = 0; + const DSizes output_strides = + internal::strides(desc.dimensions()); + + // NOTE(ezhulenev): We initialize block iteration state for `NumDims - 1` + // dimensions, skipping innermost dimension. In theory it should be possible + // to squeeze matching innermost dimensions, however in practice that did + // not show any improvements in benchmarks.
Also in practice first outer + // dimension usually has padding, and will prevent squeezing. + + // Initialize output block iterator state. Dimension in this array are + // always in inner_most -> outer_most order (col major layout). + array it; + for (int i = 0; i < NumDims - 1; ++i) { + const int dim = IsColMajor ? i + 1 : NumDims - i - 2; + it[i].count = 0; + it[i].size = desc.dimension(dim); + + it[i].input_stride = m_inputStrides[dim]; + it[i].input_span = it[i].input_stride * (it[i].size - 1); + + it[i].output_stride = output_strides[dim]; + it[i].output_span = it[i].output_stride * (it[i].size - 1); + } + + const Index input_inner_dim_size = + static_cast(m_impl.dimensions()[inner_dim_idx]); + + // Total output size. + const Index output_size = desc.size(); + + // We will fill inner dimension of this size in the output. It might be + // larger than the inner dimension in the input, so we might have to pad + // before/after we copy values from the input inner dimension. + const Index output_inner_dim_size = desc.dimension(inner_dim_idx); + + // How many values to fill with padding BEFORE reading from the input inner + // dimension. + const Index output_inner_pad_before_size = + input_offsets[inner_dim_idx] < 0 + ? numext::mini(numext::abs(input_offsets[inner_dim_idx]), + output_inner_dim_size) + : 0; + + // How many values we can actually copy from the input inner dimension. + const Index output_inner_copy_size = numext::mini( + // Want to copy from input. + (output_inner_dim_size - output_inner_pad_before_size), + // Can copy from input. + numext::maxi(input_inner_dim_size - (input_offsets[inner_dim_idx] + + output_inner_pad_before_size), + Index(0))); + + eigen_assert(output_inner_copy_size >= 0); + + // How many values to fill with padding AFTER reading from the input inner + // dimension. 
+ const Index output_inner_pad_after_size = + (output_inner_dim_size - output_inner_copy_size - + output_inner_pad_before_size); + + // Sanity check, sum of all sizes must be equal to the output size. + eigen_assert(output_inner_dim_size == + (output_inner_pad_before_size + output_inner_copy_size + + output_inner_pad_after_size)); + + // Keep track of current coordinates and padding in the output. + DSizes output_coord = output_offsets; + DSizes output_padded; + for (int i = 0; i < NumDims; ++i) { + const int dim = IsColMajor ? i : NumDims - i - 1; + output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim); + } + + typedef internal::StridedLinearBufferCopy LinCopy; + + // Prepare storage for the materialized padding result. + const typename TensorBlock::Storage block_storage = + TensorBlock::prepareStorage(desc, scratch); + + // TODO(ezhulenev): Squeeze multiple non-padded inner dimensions into a + // single logical inner dimension. + + // When possible we squeeze writes for the innermost (only if non-padded) + // dimension with the first padded dimension. This allows to reduce the + // number of calls to LinCopy and better utilize vector instructions. + const bool squeeze_writes = + NumDims > 1 && + // inner dimension is not padded + (input_inner_dim_size == m_dimensions[inner_dim_idx]) && + // and equal to the block inner dimension + (input_inner_dim_size == output_inner_dim_size); + + const int squeeze_dim = IsColMajor ? inner_dim_idx + 1 : inner_dim_idx - 1; + + // Maximum coordinate on a squeeze dimension that we can write to. + const Index squeeze_max_coord = + squeeze_writes ? numext::mini( + // max non-padded element in the input + static_cast(m_dimensions[squeeze_dim] - + m_padding[squeeze_dim].second), + // max element in the output buffer + static_cast(output_offsets[squeeze_dim] + + desc.dimension(squeeze_dim))) + : static_cast(0); + + // Iterate copying data from `m_impl.data()` to the output buffer. 
+ for (Index size = 0; size < output_size;) { + // Detect if we are in the padded region (exclude innermost dimension). + bool is_padded = false; + for (int j = 1; j < NumDims; ++j) { + const int dim = IsColMajor ? j : NumDims - j - 1; + is_padded = output_padded[dim]; + if (is_padded) break; + } + + if (is_padded) { + // Fill single innermost dimension with padding value. + size += output_inner_dim_size; + + LinCopy::template Run( + typename LinCopy::Dst(output_offset, 1, block_storage.data()), + typename LinCopy::Src(0, 0, &m_paddingValue), + output_inner_dim_size); + + + } else if (squeeze_writes) { + // Squeeze multiple reads from innermost dimensions. + const Index squeeze_num = squeeze_max_coord - output_coord[squeeze_dim]; + size += output_inner_dim_size * squeeze_num; + + // Copy `squeeze_num` inner dimensions from input to output. + LinCopy::template Run( + typename LinCopy::Dst(output_offset, 1, block_storage.data()), + typename LinCopy::Src(input_offset, 1, m_impl.data()), + output_inner_dim_size * squeeze_num); + + // Update iteration state for only `squeeze_num - 1` processed inner + // dimensions, because we have another iteration state update at the end + // of the loop that will update iteration state for the last inner + // processed dimension. + it[0].count += (squeeze_num - 1); + input_offset += it[0].input_stride * (squeeze_num - 1); + output_offset += it[0].output_stride * (squeeze_num - 1); + output_coord[squeeze_dim] += (squeeze_num - 1); + + } else { + // Single read from innermost dimension. + size += output_inner_dim_size; + + { // Fill with padding before copying from input inner dimension. + const Index out = output_offset; + + LinCopy::template Run( + typename LinCopy::Dst(out, 1, block_storage.data()), + typename LinCopy::Src(0, 0, &m_paddingValue), + output_inner_pad_before_size); + } + + { // Copy data from input inner dimension. 
+ const Index out = output_offset + output_inner_pad_before_size; + const Index in = input_offset + output_inner_pad_before_size; + + eigen_assert(output_inner_copy_size == 0 || m_impl.data() != NULL); + + LinCopy::template Run( + typename LinCopy::Dst(out, 1, block_storage.data()), + typename LinCopy::Src(in, 1, m_impl.data()), + output_inner_copy_size); + } + + { // Fill with padding after copying from input inner dimension. + const Index out = output_offset + output_inner_pad_before_size + + output_inner_copy_size; + + LinCopy::template Run( + typename LinCopy::Dst(out, 1, block_storage.data()), + typename LinCopy::Src(0, 0, &m_paddingValue), + output_inner_pad_after_size); + } + } + + for (int j = 0; j < NumDims - 1; ++j) { + const int dim = IsColMajor ? j + 1 : NumDims - j - 2; + + if (++it[j].count < it[j].size) { + input_offset += it[j].input_stride; + output_offset += it[j].output_stride; + output_coord[dim] += 1; + output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim); + break; + } + it[j].count = 0; + input_offset -= it[j].input_span; + output_offset -= it[j].output_span; + output_coord[dim] -= it[j].size - 1; + output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim); + } + } + + return block_storage.AsTensorMaterializedBlock(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif + + private: + struct BlockIteratorState { + BlockIteratorState() + : count(0), + size(0), + input_stride(0), + input_span(0), + output_stride(0), + output_span(0) {} + + Index count; + Index size; + Index input_stride; + Index input_span; + Index output_stride; + Index output_span; + }; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim( + Index index, int dim_index) const { +#if 
defined(EIGEN_HAS_INDEX_LIST) + return (!internal::index_pair_first_statically_eq(dim_index, 0) && + index < m_padding[dim_index].first) || + (!internal::index_pair_second_statically_eq(dim_index, 0) && + index >= m_dimensions[dim_index] - m_padding[dim_index].second); +#else + return (index < m_padding[dim_index].first) || + (index >= m_dimensions[dim_index] - m_padding[dim_index].second); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isLeftPaddingCompileTimeZero( + int dim_index) const { +#if defined(EIGEN_HAS_INDEX_LIST) + return internal::index_pair_first_statically_eq(dim_index, 0); +#else + EIGEN_UNUSED_VARIABLE(dim_index); + return false; +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isRightPaddingCompileTimeZero( + int dim_index) const { +#if defined(EIGEN_HAS_INDEX_LIST) + return internal::index_pair_second_statically_eq(dim_index, 0); +#else + EIGEN_UNUSED_VARIABLE(dim_index); + return false; +#endif + } + + + void updateCostPerDimension(TensorOpCost& cost, int i, bool first) const { + const double in = static_cast(m_impl.dimensions()[i]); + const double out = in + m_padding[i].first + m_padding[i].second; + if (out == 0) + return; + const double reduction = in / out; + cost *= reduction; + if (first) { + cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost() + + reduction * (1 * TensorOpCost::AddCost())); + } else { + cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost() + + 2 * TensorOpCost::MulCost() + + reduction * (2 * TensorOpCost::MulCost() + + 1 * TensorOpCost::DivCost())); + } + } + + protected: + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + + const Index initialIndex = index; + Index inputIndex = 0; + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index firstIdx = index; + const Index lastIdx = index + PacketSize - 
1; + const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i]; + const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i]; + const Index lastPaddedRight = m_outputStrides[i+1]; + + if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) { + // all the coefficient are in the padding zone. + return internal::pset1(m_paddingValue); + } + else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) { + // all the coefficient are in the padding zone. + return internal::pset1(m_paddingValue); + } + else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) { + // all the coefficient are between the 2 padding zones. + const Index idx = index / m_outputStrides[i]; + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + else { + // Every other case + return packetWithPossibleZero(initialIndex); + } + } + + const Index lastIdx = index + PacketSize - 1; + const Index firstIdx = index; + const Index lastPaddedLeft = m_padding[0].first; + const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second); + const Index lastPaddedRight = m_outputStrides[1]; + + if (!isLeftPaddingCompileTimeZero(0) && lastIdx < lastPaddedLeft) { + // all the coefficient are in the padding zone. + return internal::pset1(m_paddingValue); + } + else if (!isRightPaddingCompileTimeZero(0) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) { + // all the coefficient are in the padding zone. + return internal::pset1(m_paddingValue); + } + else if ((isLeftPaddingCompileTimeZero(0) && isRightPaddingCompileTimeZero(0)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) { + // all the coefficient are between the 2 padding zones. 
+ inputIndex += (index - m_padding[0].first); + return m_impl.template packet(inputIndex); + } + // Every other case + return packetWithPossibleZero(initialIndex); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + + const Index initialIndex = index; + Index inputIndex = 0; + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index firstIdx = index; + const Index lastIdx = index + PacketSize - 1; + const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i+1]; + const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1]; + const Index lastPaddedRight = m_outputStrides[i]; + + if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) { + // all the coefficient are in the padding zone. + return internal::pset1(m_paddingValue); + } + else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) { + // all the coefficient are in the padding zone. + return internal::pset1(m_paddingValue); + } + else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) { + // all the coefficient are between the 2 padding zones. 
+ const Index idx = index / m_outputStrides[i+1]; + inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; + index -= idx * m_outputStrides[i+1]; + } + else { + // Every other case + return packetWithPossibleZero(initialIndex); + } + } + + const Index lastIdx = index + PacketSize - 1; + const Index firstIdx = index; + const Index lastPaddedLeft = m_padding[NumDims-1].first; + const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second); + const Index lastPaddedRight = m_outputStrides[NumDims-1]; + + if (!isLeftPaddingCompileTimeZero(NumDims-1) && lastIdx < lastPaddedLeft) { + // all the coefficient are in the padding zone. + return internal::pset1(m_paddingValue); + } + else if (!isRightPaddingCompileTimeZero(NumDims-1) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) { + // all the coefficient are in the padding zone. + return internal::pset1(m_paddingValue); + } + else if ((isLeftPaddingCompileTimeZero(NumDims-1) && isRightPaddingCompileTimeZero(NumDims-1)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) { + // all the coefficient are between the 2 padding zones. 
+ inputIndex += (index - m_padding[NumDims-1].first); + return m_impl.template packet(inputIndex); + } + // Every other case + return packetWithPossibleZero(initialIndex); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const + { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + Dimensions m_dimensions; + array m_outputStrides; + array m_inputStrides; + TensorEvaluator m_impl; + PaddingDimensions m_padding; + + Scalar m_paddingValue; + + const Device EIGEN_DEVICE_REF m_device; +}; + + + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_PADDING_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorPatch.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorPatch.h new file mode 100644 index 0000000..413d25d --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorPatch.h @@ -0,0 +1,291 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_PATCH_H +#define EIGEN_CXX11_TENSOR_TENSOR_PATCH_H + +namespace Eigen { + +/** \class TensorPatch + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor patch class. 
+ * + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions + 1; + static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorPatchOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorPatchOp type; +}; + +} // end namespace internal + + + +template +class TensorPatchOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPatchOp(const XprType& expr, const PatchDim& patch_dims) + : m_xpr(expr), m_patch_dims(patch_dims) {} + + EIGEN_DEVICE_FUNC + const PatchDim& patch_dims() const { return m_patch_dims; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const PatchDim m_patch_dims; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorPatchOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::Dimensions>::value + 1; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + 
static const int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + Index num_patches = 1; + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + const PatchDim& patch_dims = op.patch_dims(); + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < NumDims-1; ++i) { + m_dimensions[i] = patch_dims[i]; + num_patches *= (input_dims[i] - patch_dims[i] + 1); + } + m_dimensions[NumDims-1] = num_patches; + + m_inputStrides[0] = 1; + m_patchStrides[0] = 1; + for (int i = 1; i < NumDims-1; ++i) { + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_patchStrides[i] = m_patchStrides[i-1] * (input_dims[i-1] - patch_dims[i-1] + 1); + } + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + } + } else { + for (int i = 0; i < NumDims-1; ++i) { + m_dimensions[i+1] = patch_dims[i]; + num_patches *= (input_dims[i] - patch_dims[i] + 1); + } + m_dimensions[0] = num_patches; + + m_inputStrides[NumDims-2] = 1; + m_patchStrides[NumDims-2] = 1; + for (int i = NumDims-3; i >= 0; --i) { + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + m_patchStrides[i] = m_patchStrides[i+1] * (input_dims[i+1] - patch_dims[i+1] + 1); + } + m_outputStrides[NumDims-1] = 1; + for (int i = NumDims-2; i >= 0; --i) { + 
m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + Index output_stride_index = (static_cast(Layout) == static_cast(ColMajor)) ? NumDims - 1 : 0; + // Find the location of the first element of the patch. + Index patchIndex = index / m_outputStrides[output_stride_index]; + // Find the offset of the element wrt the location of the first element. + Index patchOffset = index - patchIndex * m_outputStrides[output_stride_index]; + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 2; i > 0; --i) { + const Index patchIdx = patchIndex / m_patchStrides[i]; + patchIndex -= patchIdx * m_patchStrides[i]; + const Index offsetIdx = patchOffset / m_outputStrides[i]; + patchOffset -= offsetIdx * m_outputStrides[i]; + inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; + } + } else { + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 2; ++i) { + const Index patchIdx = patchIndex / m_patchStrides[i]; + patchIndex -= patchIdx * m_patchStrides[i]; + const Index offsetIdx = patchOffset / m_outputStrides[i+1]; + patchOffset -= offsetIdx * m_outputStrides[i+1]; + inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; + } + } + inputIndex += (patchIndex + patchOffset); + return m_impl.coeff(inputIndex); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + + Index output_stride_index = (static_cast(Layout) == 
static_cast(ColMajor)) ? NumDims - 1 : 0; + Index indices[2] = {index, index + PacketSize - 1}; + Index patchIndices[2] = {indices[0] / m_outputStrides[output_stride_index], + indices[1] / m_outputStrides[output_stride_index]}; + Index patchOffsets[2] = {indices[0] - patchIndices[0] * m_outputStrides[output_stride_index], + indices[1] - patchIndices[1] * m_outputStrides[output_stride_index]}; + + Index inputIndices[2] = {0, 0}; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 2; i > 0; --i) { + const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i], + patchIndices[1] / m_patchStrides[i]}; + patchIndices[0] -= patchIdx[0] * m_patchStrides[i]; + patchIndices[1] -= patchIdx[1] * m_patchStrides[i]; + + const Index offsetIdx[2] = {patchOffsets[0] / m_outputStrides[i], + patchOffsets[1] / m_outputStrides[i]}; + patchOffsets[0] -= offsetIdx[0] * m_outputStrides[i]; + patchOffsets[1] -= offsetIdx[1] * m_outputStrides[i]; + + inputIndices[0] += (patchIdx[0] + offsetIdx[0]) * m_inputStrides[i]; + inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i]; + } + } else { + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 2; ++i) { + const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i], + patchIndices[1] / m_patchStrides[i]}; + patchIndices[0] -= patchIdx[0] * m_patchStrides[i]; + patchIndices[1] -= patchIdx[1] * m_patchStrides[i]; + + const Index offsetIdx[2] = {patchOffsets[0] / m_outputStrides[i+1], + patchOffsets[1] / m_outputStrides[i+1]}; + patchOffsets[0] -= offsetIdx[0] * m_outputStrides[i+1]; + patchOffsets[1] -= offsetIdx[1] * m_outputStrides[i+1]; + + inputIndices[0] += (patchIdx[0] + offsetIdx[0]) * m_inputStrides[i]; + inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i]; + } + } + inputIndices[0] += (patchIndices[0] + patchOffsets[0]); + inputIndices[1] += (patchIndices[1] + patchOffsets[1]); + + if (inputIndices[1] - inputIndices[0] == PacketSize - 1) { + 
PacketReturnType rslt = m_impl.template packet(inputIndices[0]); + return rslt; + } + else { + EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize]; + values[0] = m_impl.coeff(inputIndices[0]); + values[PacketSize-1] = m_impl.coeff(inputIndices[1]); + EIGEN_UNROLL_LOOP + for (int i = 1; i < PacketSize-1; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double compute_cost = NumDims * (TensorOpCost::DivCost() + + TensorOpCost::MulCost() + + 2 * TensorOpCost::AddCost()); + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif + + protected: + Dimensions m_dimensions; + array m_outputStrides; + array m_inputStrides; + array m_patchStrides; + + TensorEvaluator m_impl; + +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_PATCH_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorRandom.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorRandom.h new file mode 100644 index 0000000..37c1d1c --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorRandom.h @@ -0,0 +1,322 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner +// Copyright (C) 2018 Mehdi Goli Codeplay Software Ltd. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H +#define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H + +namespace Eigen { +namespace internal { + +namespace { + +EIGEN_DEVICE_FUNC uint64_t get_random_seed() { +#if defined(EIGEN_GPU_COMPILE_PHASE) + // We don't support 3d kernels since we currently only use 1 and + // 2d kernels. + gpu_assert(threadIdx.z == 0); + return blockIdx.x * blockDim.x + threadIdx.x + + gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y); +#else + // Rely on Eigen's random implementation. + return random(); +#endif +} + +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state, uint64_t stream) { + // TODO: Unify with the implementation in the non blocking thread pool. + uint64_t current = *state; + // Update the internal state + *state = current * 6364136223846793005ULL + (stream << 1 | 1); + // Generate the random output (using the PCG-XSH-RS scheme) + return static_cast((current ^ (current >> 22)) >> (22 + (current >> 61))); +} + +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) { + seed = seed ? seed : get_random_seed(); + return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL; +} + +} // namespace + + +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +T RandomToTypeUniform(uint64_t* state, uint64_t stream) { + unsigned rnd = PCG_XSH_RS_generator(state, stream); + return static_cast(rnd); +} + + +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +Eigen::half RandomToTypeUniform(uint64_t* state, uint64_t stream) { + // Generate 10 random bits for the mantissa, merge with exponent. 
+ unsigned rnd = PCG_XSH_RS_generator(state, stream); + const uint16_t half_bits = static_cast(rnd & 0x3ffu) | (static_cast(15) << 10); + Eigen::half result = Eigen::numext::bit_cast(half_bits); + // Return the final result + return result - Eigen::half(1.0f); +} + +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +Eigen::bfloat16 RandomToTypeUniform(uint64_t* state, uint64_t stream) { + + // Generate 7 random bits for the mantissa, merge with exponent. + unsigned rnd = PCG_XSH_RS_generator(state, stream); + const uint16_t half_bits = static_cast(rnd & 0x7fu) | (static_cast(127) << 7); + Eigen::bfloat16 result = Eigen::numext::bit_cast(half_bits); + // Return the final result + return result - Eigen::bfloat16(1.0f); +} + +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float RandomToTypeUniform(uint64_t* state, uint64_t stream) { + typedef union { + uint32_t raw; + float fp; + } internal; + internal result; + // Generate 23 random bits for the mantissa + const unsigned rnd = PCG_XSH_RS_generator(state, stream); + result.raw = rnd & 0x7fffffu; + // Set the exponent + result.raw |= (static_cast(127) << 23); + // Return the final result + return result.fp - 1.0f; +} + +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double RandomToTypeUniform(uint64_t* state, uint64_t stream) { + typedef union { + uint64_t raw; + double dp; + } internal; + internal result; + result.raw = 0; + // Generate 52 random bits for the mantissa + // First generate the upper 20 bits + unsigned rnd1 = PCG_XSH_RS_generator(state, stream) & 0xfffffu; + // Then generate the lower 32 bits + unsigned rnd2 = PCG_XSH_RS_generator(state, stream); + result.raw = (static_cast(rnd1) << 32) | rnd2; + // Set the exponent + result.raw |= (static_cast(1023) << 52); + // Return the final result + return result.dp - 1.0; +} + +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +std::complex RandomToTypeUniform >(uint64_t* state, uint64_t stream) { + return std::complex(RandomToTypeUniform(state,
stream), + RandomToTypeUniform(state, stream)); +} +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +std::complex RandomToTypeUniform >(uint64_t* state, uint64_t stream) { + return std::complex(RandomToTypeUniform(state, stream), + RandomToTypeUniform(state, stream)); +} + +template class UniformRandomGenerator { + public: + static const bool PacketAccess = true; + + // Uses the given "seed" if non-zero, otherwise uses a random seed. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator( + uint64_t seed = 0) { + m_state = PCG_XSH_RS_state(seed); + #ifdef EIGEN_USE_SYCL + // In SYCL it is not possible to build PCG_XSH_RS_state in one step. + // Therefor, we need two step to initializate the m_state. + // IN SYCL, the constructor of the functor is s called on the CPU + // and we get the clock seed here from the CPU. However, This seed is + //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function. + // and only available on the Operator() function (which is called on the GPU). + // Thus for CUDA (((CLOCK + global_thread_id)* 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread + // but for SYCL ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread and each thread adds + // the (global_thread_id* 6364136223846793005ULL) for itself only once, in order to complete the construction + // similar to CUDA Therefore, the thread Id injection is not available at this stage. + //However when the operator() is called the thread ID will be avilable. So inside the opeator, + // we add the thrreadID, BlockId,... (which is equivalent of i) + //to the seed and construct the unique m_state per thead similar to cuda. 
+ m_exec_once =false; + #endif + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator( + const UniformRandomGenerator& other) { + m_state = other.m_state; + #ifdef EIGEN_USE_SYCL + m_exec_once =other.m_exec_once; + #endif + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + T operator()(Index i) const { + #ifdef EIGEN_USE_SYCL + if(!m_exec_once) { + // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread + // The (i * 6364136223846793005ULL) is the remaining part of the PCG_XSH_RS_state on the GPU side + m_state += (i * 6364136223846793005ULL); + m_exec_once =true; + } + #endif + T result = RandomToTypeUniform(&m_state, i); + return result; + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Packet packetOp(Index i) const { + const int packetSize = internal::unpacket_traits::size; + EIGEN_ALIGN_MAX T values[packetSize]; + #ifdef EIGEN_USE_SYCL + if(!m_exec_once) { + // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread + m_state += (i * 6364136223846793005ULL); + m_exec_once =true; + } + #endif + EIGEN_UNROLL_LOOP + for (int j = 0; j < packetSize; ++j) { + values[j] = RandomToTypeUniform(&m_state, i); + } + return internal::pload(values); + } + + private: + mutable uint64_t m_state; + #ifdef EIGEN_USE_SYCL + mutable bool m_exec_once; + #endif +}; + +template +struct functor_traits > { + enum { + // Rough estimate for floating point, multiplied by ceil(sizeof(T) / sizeof(float)). + Cost = 12 * NumTraits::AddCost * + ((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)), + PacketAccess = UniformRandomGenerator::PacketAccess + }; +}; + + + +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +T RandomToTypeNormal(uint64_t* state, uint64_t stream) { + // Use the ratio of uniform method to generate numbers following a normal + // distribution. See for example Numerical Recipes chapter 7.3.9 for the + // details. 
+ T u, v, q; + do { + u = RandomToTypeUniform(state, stream); + v = T(1.7156) * (RandomToTypeUniform(state, stream) - T(0.5)); + const T x = u - T(0.449871); + const T y = numext::abs(v) + T(0.386595); + q = x*x + y * (T(0.196)*y - T(0.25472)*x); + } while (q > T(0.27597) && + (q > T(0.27846) || v*v > T(-4) * numext::log(u) * u*u)); + + return v/u; +} + +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +std::complex RandomToTypeNormal >(uint64_t* state, uint64_t stream) { + return std::complex(RandomToTypeNormal(state, stream), + RandomToTypeNormal(state, stream)); +} +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +std::complex RandomToTypeNormal >(uint64_t* state, uint64_t stream) { + return std::complex(RandomToTypeNormal(state, stream), + RandomToTypeNormal(state, stream)); +} + + +template class NormalRandomGenerator { + public: + static const bool PacketAccess = true; + + // Uses the given "seed" if non-zero, otherwise uses a random seed. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) { + m_state = PCG_XSH_RS_state(seed); + #ifdef EIGEN_USE_SYCL + // In SYCL it is not possible to build PCG_XSH_RS_state in one step. + // Therefor, we need two steps to initializate the m_state. + // IN SYCL, the constructor of the functor is s called on the CPU + // and we get the clock seed here from the CPU. However, This seed is + //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function. + // and only available on the Operator() function (which is called on the GPU). + // Therefore, the thread Id injection is not available at this stage. However when the operator() + //is called the thread ID will be avilable. So inside the opeator, + // we add the thrreadID, BlockId,... (which is equivalent of i) + //to the seed and construct the unique m_state per thead similar to cuda. 
+ m_exec_once =false; + #endif + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator( + const NormalRandomGenerator& other) { + m_state = other.m_state; +#ifdef EIGEN_USE_SYCL + m_exec_once=other.m_exec_once; +#endif + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + T operator()(Index i) const { + #ifdef EIGEN_USE_SYCL + if(!m_exec_once) { + // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread + m_state += (i * 6364136223846793005ULL); + m_exec_once =true; + } + #endif + T result = RandomToTypeNormal(&m_state, i); + return result; + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Packet packetOp(Index i) const { + const int packetSize = internal::unpacket_traits::size; + EIGEN_ALIGN_MAX T values[packetSize]; + #ifdef EIGEN_USE_SYCL + if(!m_exec_once) { + // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread + m_state += (i * 6364136223846793005ULL); + m_exec_once =true; + } + #endif + EIGEN_UNROLL_LOOP + for (int j = 0; j < packetSize; ++j) { + values[j] = RandomToTypeNormal(&m_state, i); + } + return internal::pload(values); + } + + private: + mutable uint64_t m_state; + #ifdef EIGEN_USE_SYCL + mutable bool m_exec_once; + #endif +}; + + +template +struct functor_traits > { + enum { + // On average, we need to generate about 3 random numbers + // 15 mul, 8 add, 1.5 logs + Cost = 3 * functor_traits >::Cost + + 15 * NumTraits::AddCost + 8 * NumTraits::AddCost + + 3 * functor_traits >::Cost / 2, + PacketAccess = NormalRandomGenerator::PacketAccess + }; +}; + + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorReduction.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorReduction.h new file mode 100644 index 0000000..583f462 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorReduction.h @@ -0,0 +1,998 @@ +// This file is part of 
Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// Copyright (C) 2016 Mehdi Goli, Codeplay Software Ltd +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H +#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H + +// clang is incompatible with the CUDA syntax wrt making a kernel a class friend, +// so we'll use a macro to make clang happy. +#ifndef KERNEL_FRIEND +#if defined(__clang__) && (defined(__CUDA__) || defined(__HIP__)) +#define KERNEL_FRIEND friend __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 +#else +#define KERNEL_FRIEND friend +#endif +#endif + + +namespace Eigen { + + +/** \class TensorReduction + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reduction class. + * + */ + +namespace internal { + template class MakePointer_ > + struct traits > + : traits +{ + typedef traits XprTraits; + typedef typename XprTraits::Scalar Scalar; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + static const int NumDimensions = XprTraits::NumDimensions - array_size::value; + static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; + + template struct MakePointer { + // Intermediate typedef to workaround MSVC issue. 
+ typedef MakePointer_ MakePointerT; + typedef typename MakePointerT::Type Type; + }; +}; + +template class MakePointer_> +struct eval, Eigen::Dense> +{ + typedef const TensorReductionOp& type; +}; + +template class MakePointer_> +struct nested, 1, typename eval >::type> +{ + typedef TensorReductionOp type; +}; + + +template struct DimInitializer { + template EIGEN_DEVICE_FUNC + static void run(const InputDims& input_dims, + const array::value>& reduced, + OutputDims* output_dims, ReducedDims* reduced_dims) { + const int NumInputDims = internal::array_size::value; + int outputIndex = 0; + int reduceIndex = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (reduced[i]) { + (*reduced_dims)[reduceIndex] = input_dims[i]; + ++reduceIndex; + } else { + (*output_dims)[outputIndex] = input_dims[i]; + ++outputIndex; + } + } + } +}; + +template <> struct DimInitializer > { + template EIGEN_DEVICE_FUNC + static void run(const InputDims& input_dims, const array&, + Sizes<>*, array* reduced_dims) { + const int NumInputDims = internal::array_size::value; + for (int i = 0; i < NumInputDims; ++i) { + (*reduced_dims)[i] = input_dims[i]; + } + } +}; + + +template +struct are_inner_most_dims { + static const bool value = false; +}; +template +struct preserve_inner_most_dims { + static const bool value = false; +}; + +#if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES +template +struct are_inner_most_dims{ + static const bool tmp1 = indices_statically_known_to_increase(); + static const bool tmp2 = index_statically_eq(0, 0); + static const bool tmp3 = index_statically_eq(array_size::value-1, array_size::value-1); + static const bool value = tmp1 & tmp2 & tmp3; +}; +template +struct are_inner_most_dims{ + static const bool tmp1 = indices_statically_known_to_increase(); + static const bool tmp2 = index_statically_eq(0, NumTensorDims - array_size::value); + static const bool tmp3 = index_statically_eq(array_size::value - 1, NumTensorDims - 1); + static const bool value = tmp1 & 
tmp2 & tmp3; + +}; +template +struct preserve_inner_most_dims{ + static const bool tmp1 = indices_statically_known_to_increase(); + static const bool tmp2 = index_statically_gt(0, 0); + static const bool value = tmp1 & tmp2; + +}; +template +struct preserve_inner_most_dims{ + static const bool tmp1 = indices_statically_known_to_increase(); + static const bool tmp2 = index_statically_lt(array_size::value - 1, NumTensorDims - 1); + static const bool value = tmp1 & tmp2; +}; +#endif + + +template +struct GenericDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) { + EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) { + const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex]; + GenericDimReducer::reduce(self, input, reducer, accum); + } + } +}; +template +struct GenericDimReducer<0, Self, Op> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) { + for (int j = 0; j < self.m_reducedDims[0]; ++j) { + const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0]; + reducer.reduce(self.m_impl.coeff(input), accum); + } + } +}; +template +struct GenericDimReducer<-1, Self, Op> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index index, Op& reducer, typename Self::CoeffReturnType* accum) { + reducer.reduce(self.m_impl.coeff(index), accum); + } +}; + +template +struct InnerMostDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { + typename Self::CoeffReturnType accum = reducer.initialize(); + for (typename Self::Index j = 0; j < 
numValuesToReduce; ++j) { + reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); + } + return reducer.finalize(accum); + } +}; + +template +struct InnerMostDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { + const typename Self::Index packetSize = internal::unpacket_traits::size; + const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize; + typename Self::PacketReturnType paccum = reducer.template initializePacket(); + for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) { + reducer.reducePacket(self.m_impl.template packet(firstIndex + j), &paccum); + } + typename Self::CoeffReturnType accum = reducer.initialize(); + for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) { + reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); + } + return reducer.finalizeBoth(accum, paccum); + } +}; + +#if !defined(EIGEN_HIPCC) +static const int kLeafSize = 1024; + +template +struct InnerMostDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType + reduce(const Self& self, typename Self::Index firstIndex, + typename Self::Index numValuesToReduce, Op& reducer) { + typename Self::CoeffReturnType accum = reducer.initialize(); + if (numValuesToReduce > kLeafSize) { + const typename Self::Index half = numValuesToReduce / 2; + reducer.reduce(reduce(self, firstIndex, half, reducer), &accum); + reducer.reduce( + reduce(self, firstIndex + half, numValuesToReduce - half, reducer), + &accum); + } else { + for (typename Self::Index j = 0; j < numValuesToReduce; ++j) { + reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); + } + } + return reducer.finalize(accum); + } +}; + +template +struct InnerMostDimReducer { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType + reduce(const Self& self, 
typename Self::Index firstIndex, + typename Self::Index numValuesToReduce, Op& reducer) { + const typename Self::Index packetSize = + internal::unpacket_traits::size; + typename Self::CoeffReturnType accum = reducer.initialize(); + if (numValuesToReduce > packetSize * kLeafSize) { + // Make sure the split point is aligned on a packet boundary. + const typename Self::Index split = + packetSize * + divup(firstIndex + divup(numValuesToReduce, typename Self::Index(2)), + packetSize); + const typename Self::Index num_left = + numext::mini(split - firstIndex, numValuesToReduce); + reducer.reduce(reduce(self, firstIndex, num_left, reducer), &accum); + if (num_left < numValuesToReduce) { + reducer.reduce( + reduce(self, split, numValuesToReduce - num_left, reducer), &accum); + } + return reducer.finalize(accum); + } else { + const typename Self::Index UnrollSize = + (numValuesToReduce / (2*packetSize)) * 2*packetSize; + const typename Self::Index VectorizedSize = + (numValuesToReduce / packetSize) * packetSize; + typename Self::PacketReturnType paccum = + reducer.template initializePacket(); + typename Self::PacketReturnType paccum2 = + reducer.template initializePacket(); + for (typename Self::Index j = 0; j < UnrollSize; j += packetSize * 2) { + reducer.reducePacket( + self.m_impl.template packet(firstIndex + j), &paccum); + reducer.reducePacket( + self.m_impl.template packet(firstIndex + j + packetSize), + &paccum2); + } + for (typename Self::Index j = UnrollSize; j < VectorizedSize; j+= packetSize) { + reducer.reducePacket(self.m_impl.template packet( + firstIndex + j), &paccum); + } + reducer.reducePacket(paccum2, &paccum); + for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; + ++j) { + reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); + } + return reducer.finalizeBoth(accum, paccum); + } + } +}; +#endif + +template +struct InnerMostDimPreserver { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, 
Op&, typename Self::PacketReturnType*) { + eigen_assert(false && "should never be called"); + } +}; + +template +struct InnerMostDimPreserver { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) { + EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) { + const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex]; + InnerMostDimPreserver::reduce(self, input, reducer, accum); + } + } +}; + +template +struct InnerMostDimPreserver<0, Self, Op, true> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) { + for (typename Self::Index j = 0; j < self.m_reducedDims[0]; ++j) { + const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0]; + reducer.reducePacket(self.m_impl.template packet(input), accum); + } + } +}; +template +struct InnerMostDimPreserver<-1, Self, Op, true> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) { + eigen_assert(false && "should never be called"); + } +}; + +// Default full reducer +template +struct FullReducer { + static const bool HasOptimizedImplementation = false; + + static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::EvaluatorPointerType output) { + const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions()); + *output = InnerMostDimReducer::reduce(self, 0, num_coeffs, reducer); + } +}; + + +#ifdef EIGEN_USE_THREADS +// Multithreaded full reducers +template +struct FullReducerShard { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex, + typename Self::Index numValuesToReduce, Op& reducer, + 
typename Self::CoeffReturnType* output) { + *output = InnerMostDimReducer::reduce( + self, firstIndex, numValuesToReduce, reducer); + } +}; + +// Multithreaded full reducer +template +struct FullReducer { + static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful; + static const Index PacketSize = + unpacket_traits::size; + + // launch one reducer per thread and accumulate the result. + static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device, + typename Self::CoeffReturnType* output) { + typedef typename Self::Index Index; + const Index num_coeffs = array_prod(self.m_impl.dimensions()); + if (num_coeffs == 0) { + *output = reducer.finalize(reducer.initialize()); + return; + } + const TensorOpCost cost = + self.m_impl.costPerCoeff(Vectorizable) + + TensorOpCost(0, 0, internal::functor_traits::Cost, Vectorizable, + PacketSize); + const int num_threads = TensorCostModel::numThreads( + num_coeffs, cost, device.numThreads()); + if (num_threads == 1) { + *output = + InnerMostDimReducer::reduce(self, 0, num_coeffs, reducer); + return; + } + const Index blocksize = + std::floor(static_cast(num_coeffs) / num_threads); + const Index numblocks = blocksize > 0 ? 
num_coeffs / blocksize : 0; + eigen_assert(num_coeffs >= numblocks * blocksize); + + Barrier barrier(internal::convert_index(numblocks)); + MaxSizeVector shards(numblocks, reducer.initialize()); + for (Index i = 0; i < numblocks; ++i) { + device.enqueue_with_barrier(&barrier, &FullReducerShard::run, + self, i * blocksize, blocksize, reducer, + &shards[i]); + } + typename Self::CoeffReturnType finalShard; + if (numblocks * blocksize < num_coeffs) { + finalShard = InnerMostDimReducer::reduce( + self, numblocks * blocksize, num_coeffs - numblocks * blocksize, + reducer); + } else { + finalShard = reducer.initialize(); + } + barrier.Wait(); + + for (Index i = 0; i < numblocks; ++i) { + reducer.reduce(shards[i], &finalShard); + } + *output = reducer.finalize(finalShard); + } +}; + +#endif + + +// Default inner reducer +template +struct InnerReducer { + static const bool HasOptimizedImplementation = false; + + EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { + eigen_assert(false && "Not implemented"); + return true; + } +}; + +// Default outer reducer +template +struct OuterReducer { + static const bool HasOptimizedImplementation = false; + + EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { + eigen_assert(false && "Not implemented"); + return true; + } +}; + +#ifdef EIGEN_USE_SYCL +// Default Generic reducer +template +struct GenericReducer { + static const bool HasOptimizedImplementation = false; + + EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { + eigen_assert(false && "Not implemented"); + return true; + } +}; +#endif + +#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(R, const S, I_, typename 
S::CoeffReturnType*, unsigned int*); + + +#if defined(EIGEN_HAS_GPU_FP16) +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits::type*); +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits::type*); +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*); + +#endif + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); +#endif + +/** + * For SYCL, the return type of the reduction is deduced from the initialize method of the given Op. + * This allows the reduction to have a different type for the accumulator than the input data type. + * If this is the case, the functor needs to have two reduce method: one for reducing an element of the input + * with the accumulator and the other for reducing two accumulators. + * Such a reducer can be useful for instance when the accumulator is a boolean or a bitset that checks for + * some properties of the input. 
+ */ +template +struct ReductionReturnType { +#if defined(EIGEN_USE_SYCL) + typedef typename remove_const().initialize())>::type type; +#else + typedef typename remove_const::type type; +#endif +}; + +} // end namespace internal + + +template class MakePointer_> +class TensorReductionOp : public TensorBase, ReadOnlyAccessors> { + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename internal::remove_const::type CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorReductionOp(const XprType& expr, const Dims& dims) : m_expr(expr), m_dims(dims) + { } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer) : m_expr(expr), m_dims(dims), m_reducer(reducer) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const XprType& expression() const { return m_expr; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Dims& dims() const { return m_dims; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Op& reducer() const { return m_reducer; } + + protected: + typename XprType::Nested m_expr; + const Dims m_dims; + const Op m_reducer; +}; + +template +struct TensorReductionEvaluatorBase; + +// Eval as rvalue +template class MakePointer_, typename Device> +struct TensorReductionEvaluatorBase, Device> +{ + typedef internal::reducer_traits ReducerTraits; + typedef Dims ReducedDims; + typedef TensorReductionOp XprType; + typedef typename XprType::Index Index; + typedef ArgType ChildType; + typedef typename TensorEvaluator::Dimensions InputDimensions; + static const int NumInputDims = internal::array_size::value; + static const int NumReducedDims = internal::array_size::value; + static const int NumOutputDims = NumInputDims - NumReducedDims; + typedef 
typename internal::conditional, DSizes >::type Dimensions; + typedef typename XprType::Scalar Scalar; + typedef TensorReductionEvaluatorBase, Device> Self; + static const bool InputPacketAccess = TensorEvaluator::PacketAccess; + typedef typename internal::ReductionReturnType::type CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const Index PacketSize = PacketType::size; + + typedef typename Eigen::internal::traits::PointerType TensorPointerType; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + // Subset of strides of the input tensor for the non-reduced dimensions. + // Indexed by output dimensions. + static const int NumPreservedStrides = max_n_1::size; + + enum { + IsAligned = false, + PacketAccess = Self::InputPacketAccess && ReducerTraits::PacketAccess, + BlockAccess = false, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + typedef typename internal::remove_const::type ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + static const bool ReducingInnerMostDims = internal::are_inner_most_dims::value; + static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims::value; + static const bool RunningFullReduction = (NumOutputDims==0); + + EIGEN_STRONG_INLINE TensorReductionEvaluatorBase(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device) + { + EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)), + YOU_MADE_A_PROGRAMMING_MISTAKE); + + // Build the bitmap indicating if an input 
dimension is reduced or not. + for (int i = 0; i < NumInputDims; ++i) { + m_reduced[i] = false; + } + for (int i = 0; i < NumReducedDims; ++i) { + eigen_assert(op.dims()[i] >= 0); + eigen_assert(op.dims()[i] < NumInputDims); + m_reduced[op.dims()[i]] = true; + } + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + internal::DimInitializer::run(input_dims, m_reduced, &m_dimensions, &m_reducedDims); + + // Precompute output strides. + if (NumOutputDims > 0) { + if (static_cast(Layout) == static_cast(ColMajor)) { + m_outputStrides[0] = 1; + for (int i = 1; i < NumOutputDims; ++i) { + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); + } + } else { + m_outputStrides[NumOutputDims - 1] = 1; + for (int i = NumOutputDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); + } + } + } + + // Precompute input strides. + if (NumInputDims > 0) { + array input_strides; + if (static_cast(Layout) == static_cast(ColMajor)) { + input_strides[0] = 1; + for (int i = 1; i < NumInputDims; ++i) { + input_strides[i] = input_strides[i-1] * input_dims[i-1]; + } + } else { + input_strides.back() = 1; + for (int i = NumInputDims - 2; i >= 0; --i) { + input_strides[i] = input_strides[i + 1] * input_dims[i + 1]; + } + } + + int outputIndex = 0; + int reduceIndex = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (m_reduced[i]) { + m_reducedStrides[reduceIndex] = input_strides[i]; + ++reduceIndex; + } else { + m_preservedStrides[outputIndex] = input_strides[i]; + m_output_to_input_dim_map[outputIndex] = i; + ++outputIndex; + } + } + } + + // Special case for full reductions + if (NumOutputDims == 0) { + m_preservedStrides[0] = internal::array_prod(input_dims); + } + + m_numValuesToReduce = + NumOutputDims == 0 + ? 
internal::array_prod(input_dims) + : (static_cast(Layout) == static_cast(ColMajor)) + ? m_preservedStrides[0] + : m_preservedStrides[NumOutputDims - 1]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE + bool evalSubExprsIfNeededCommon(EvaluatorPointerType data) { + // Use the FullReducer if possible. + if ((RunningFullReduction && RunningOnSycl) ||(RunningFullReduction && + internal::FullReducer::HasOptimizedImplementation && + ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || + !RunningOnGPU))) { + bool need_assign = false; + if (!data) { + m_result = static_cast(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType)))); + data = m_result; + need_assign = true; + } + Op reducer(m_reducer); + internal::FullReducer::run(*this, reducer, m_device, data); + return need_assign; + } + + // Attempt to use an optimized reduction. + else if ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || (RunningOnSycl)) { + bool reducing_inner_dims = true; + for (int i = 0; i < NumReducedDims; ++i) { + if (static_cast(Layout) == static_cast(ColMajor)) { + reducing_inner_dims &= m_reduced[i]; + } else { + reducing_inner_dims &= m_reduced[NumInputDims - 1 - i]; + } + } + if (internal::InnerReducer::HasOptimizedImplementation && + (reducing_inner_dims || ReducingInnerMostDims)) { + const Index num_values_to_reduce = internal::array_prod(m_reducedDims); + const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); + if (!data) { + if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) || (RunningOnSycl)) { + data = static_cast(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve))); + m_result = data; + } + else { + return true; + } + } + Op reducer(m_reducer); + // For SYCL this if always return false + if (internal::InnerReducer::run(*this, 
reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) { + if (m_result) { + m_device.deallocate_temp(m_result); + m_result = NULL; + } + return true; + } else { + return (m_result != NULL); + } + } + + bool preserving_inner_dims = true; + for (int i = 0; i < NumReducedDims; ++i) { + if (static_cast(Layout) == static_cast(ColMajor)) { + preserving_inner_dims &= m_reduced[NumInputDims - 1 - i]; + } else { + preserving_inner_dims &= m_reduced[i]; + } + } + if (internal::OuterReducer::HasOptimizedImplementation && + preserving_inner_dims) { + const Index num_values_to_reduce = internal::array_prod(m_reducedDims); + const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); + if (!data) { + if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) || (RunningOnSycl)) { + data = static_cast(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve))); + m_result = data; + } + else { + return true; + } + } + Op reducer(m_reducer); + // For SYCL this if always return false + if (internal::OuterReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) { + if (m_result) { + m_device.deallocate_temp(m_result); + m_result = NULL; + } + return true; + } else { + return (m_result != NULL); + } + } + #if defined(EIGEN_USE_SYCL) + // If there is no Optimised version for SYCL, the reduction expression + // must break into two subexpression and use the SYCL generic Reducer on the device. 
+ if(RunningOnSycl) { + const Index num_values_to_reduce = internal::array_prod(m_reducedDims); + const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); + if (!data) { + data = static_cast(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve))); + m_result = data; + } + Op reducer(m_reducer); + internal::GenericReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); + return (m_result != NULL); + } + #endif + } + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE + void + evalSubExprsIfNeededAsync(EvaluatorPointerType data, + EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(NULL, [this, data, done](bool) { + done(evalSubExprsIfNeededCommon(data)); + }); + } +#endif + + EIGEN_STRONG_INLINE + bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + m_impl.evalSubExprsIfNeeded(NULL); + return evalSubExprsIfNeededCommon(data); + } + + EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + if (m_result) { + m_device.deallocate_temp(m_result); + m_result = NULL; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + if (( RunningFullReduction || RunningOnGPU) && m_result ) { + return *(m_result + index); + } + Op reducer(m_reducer); + if (ReducingInnerMostDims || RunningFullReduction) { + const Index num_values_to_reduce = + (static_cast(Layout) == static_cast(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1]; + return internal::InnerMostDimReducer::reduce(*this, firstInput(index), + num_values_to_reduce, reducer); + } else { + typename Self::CoeffReturnType accum = reducer.initialize(); + internal::GenericDimReducer::reduce(*this, firstInput(index), reducer, &accum); + return reducer.finalize(accum); + } + } + + // TODO(bsteiner): provide a more efficient implementation. 
+ /* Returns PacketSize consecutive output coefficients, starting at output index "index", as one packet. When running on GPU with a result buffer in m_result the packet is loaded straight from that buffer. Otherwise each lane is reduced individually, except when the innermost dimension is preserved and the packet does not cross its end, in which case the packet-wise reducer is used. */ template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + PacketSize - 1 < Index(internal::array_prod(dimensions()))); + + if (RunningOnGPU && m_result) { + return internal::pload(m_result + index); + } + + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + if (ReducingInnerMostDims) { + const Index num_values_to_reduce = + (static_cast(Layout) == static_cast(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1]; + const Index firstIndex = firstInput(index); + for (Index i = 0; i < PacketSize; ++i) { + Op reducer(m_reducer); + values[i] = internal::InnerMostDimReducer::reduce(*this, firstIndex + i * num_values_to_reduce, + num_values_to_reduce, reducer); + } + } else if (PreservingInnerMostDims) { + const Index firstIndex = firstInput(index); + const int innermost_dim = (static_cast(Layout) == static_cast(ColMajor)) ? 0 : NumOutputDims - 1; + // TBD: extend this to the n innermost dimensions that we preserve. + if (((firstIndex % m_dimensions[innermost_dim]) + PacketSize - 1) < m_dimensions[innermost_dim]) { + Op reducer(m_reducer); + typename Self::PacketReturnType accum = reducer.template initializePacket(); + internal::InnerMostDimPreserver::reduce(*this, firstIndex, reducer, &accum); + return reducer.finalizePacket(accum); + } else { + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index + i); + } + } + } else { + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index + i); + } + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + // Must be called after evalSubExprsIfNeeded().
+ /* Cost estimate for producing one output coefficient. When a full reduction has already been stored in m_result, only sizeof(CoeffReturnType) is charged in the first TensorOpCost field (presumably bytes loaded - confirm against TensorOpCost). Otherwise the input evaluator's per-coefficient cost is scaled by the number of reduced values and the reducer functor's cost is added as compute cost. */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + if (RunningFullReduction && m_result) { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); + } else { + const Index num_values_to_reduce = internal::array_prod(m_reducedDims); + const double compute_cost = num_values_to_reduce * internal::functor_traits::Cost; + return m_impl.costPerCoeff(vectorized) * num_values_to_reduce + + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; } + EIGEN_DEVICE_FUNC const TensorEvaluator& impl() const { return m_impl; } + EIGEN_DEVICE_FUNC const Device& device() const { return m_device; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + m_result.bind(cgh); + } +#endif + + private: + template friend struct internal::GenericDimReducer; + template friend struct internal::InnerMostDimReducer; + template friend struct internal::InnerMostDimPreserver; + template friend struct internal::FullReducer; +#ifdef EIGEN_USE_THREADS + template friend struct internal::FullReducerShard; +#endif +#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) + template KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*); +#if defined(EIGEN_HAS_GPU_FP16) + template KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits::type*); + template KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits::type*); + template KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I_, I_, half*); +#endif + template KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); + + template KERNEL_FRIEND void
internal::OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); +#endif + +#if defined(EIGEN_USE_SYCL) + template < typename Evaluator_, typename Op__> friend class TensorSycl::internal::GenericNondeterministicReducer; + // SYCL needs the generic reducer for the case where the reduction algorithm is neither an inner, an outer, nor a full reducer + template friend struct internal::GenericReducer; +#endif + + + template friend struct internal::InnerReducer; + + struct BlockIteratorState { + Index input_dim; + Index output_size; + Index output_count; + }; + + // Returns the Index in the input tensor of the first value that needs to be + // used to compute the reduction at output index "index". + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { + if (ReducingInnerMostDims) { + if (static_cast(Layout) == static_cast(ColMajor)) { + return index * m_preservedStrides[0]; + } else { + return index * m_preservedStrides[NumPreservedStrides - 1]; + } + } + // TBD: optimize the case where we preserve the innermost dimensions. + Index startInput = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumOutputDims - 1; i > 0; --i) { + // This is index_i in the output tensor. + const Index idx = index / m_outputStrides[i]; + startInput += idx * m_preservedStrides[i]; + index -= idx * m_outputStrides[i]; + } + if (PreservingInnerMostDims) { + eigen_assert(m_preservedStrides[0] == 1); + startInput += index; + } else { + startInput += index * m_preservedStrides[0]; + } + } else { + for (int i = 0; i < NumOutputDims - 1; ++i) { + // This is index_i in the output tensor.
+ const Index idx = index / m_outputStrides[i]; + startInput += idx * m_preservedStrides[i]; + index -= idx * m_outputStrides[i]; + } + if (PreservingInnerMostDims) { + eigen_assert(m_preservedStrides[NumPreservedStrides - 1] == 1); + startInput += index; + } else { + startInput += index * m_preservedStrides[NumPreservedStrides - 1]; + } + } + return startInput; + } + + // Bitmap indicating if an input dimension is reduced or not. + array m_reduced; + // Dimensions of the output of the operation. + Dimensions m_dimensions; + // Precomputed strides for the output tensor. + array m_outputStrides; + array, NumOutputDims> m_fastOutputStrides; + array m_preservedStrides; + // Map from output to input dimension index. + array m_output_to_input_dim_map; + // How many values go into each reduction. + Index m_numValuesToReduce; + + // Subset of strides of the input tensor for the reduced dimensions. + // Indexed by reduced dimensions. + array m_reducedStrides; + // Size of the input dimensions that are reduced. + // Indexed by reduced dimensions. + array m_reducedDims; + + // Evaluator for the input expression. + TensorEvaluator m_impl; + + // Operation to apply for computing the reduction.
+ Op m_reducer; + + // For full reductions +#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) + static const bool RunningOnGPU = internal::is_same::value; + static const bool RunningOnSycl = false; +#elif defined(EIGEN_USE_SYCL) +static const bool RunningOnSycl = internal::is_same::type, Eigen::SyclDevice>::value; +static const bool RunningOnGPU = false; +#else + static const bool RunningOnGPU = false; + static const bool RunningOnSycl = false; +#endif + EvaluatorPointerType m_result; + + const Device EIGEN_DEVICE_REF m_device; +}; + +template class MakePointer_, typename Device> +struct TensorEvaluator, Device> +: public TensorReductionEvaluatorBase, Device> { + typedef TensorReductionEvaluatorBase, Device> Base; + EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Device& device) : Base(op, device){} +}; + + +template class MakePointer_> +struct TensorEvaluator, Eigen::SyclDevice> +: public TensorReductionEvaluatorBase, Eigen::SyclDevice> { + + typedef TensorReductionEvaluatorBase, Eigen::SyclDevice> Base; + EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Eigen::SyclDevice& device) : Base(op, device){} + // The coeff function in the base class is a recursive method, which is not standard layout and cannot be used in a SYCL kernel. + // Therefore the coeff function is overridden for the SYCL kernel; it reads the result directly from data(). + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::CoeffReturnType coeff(typename Base::Index index) const { + return *(this->data() + index); + } + // The packet function in the base class is a recursive method, which is not standard layout and cannot be used in a SYCL kernel. + // Therefore the packet function is overridden for the SYCL kernel; it loads the packet directly from data(). + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::PacketReturnType packet(typename Base::Index index) const { + return internal::pload(this->data() + index); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H diff
--git a/src/EigenUnsupported/CXX11/src/Tensor/TensorReductionCuda.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorReductionCuda.h new file mode 100644 index 0000000..68780cd --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorReductionCuda.h @@ -0,0 +1,6 @@ + +#if defined(__clang__) || defined(__GNUC__) +#warning "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorReductionGpu.h file" +#endif + +#include "TensorReductionGpu.h" diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorReductionGpu.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorReductionGpu.h new file mode 100644 index 0000000..db4e8d8 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorReductionGpu.h @@ -0,0 +1,966 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H +#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H + +namespace Eigen { +namespace internal { + + +#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC) +// Full reducers for GPU, don't vectorize for now + +// Reducer function that enables multiple gpu threads to safely accumulate at the same +// output address. It basically reads the current value of the output variable, and +// attempts to update it with the new value. If in the meantime another gpu thread +// updated the content of the output address it will try again.
+template +__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) { +#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300) + if (sizeof(T) == 4) + { + unsigned int oldval = *reinterpret_cast(output); + unsigned int newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + unsigned int readback; + while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) { + oldval = readback; + newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + } + } + else if (sizeof(T) == 8) { + unsigned long long oldval = *reinterpret_cast(output); + unsigned long long newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + unsigned long long readback; + while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) { + oldval = readback; + newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + } + } + else { + gpu_assert(0 && "Wordsize not supported"); + } +#else // EIGEN_CUDA_ARCH >= 300 + gpu_assert(0 && "Shouldn't be called on unsupported device"); +#endif // EIGEN_CUDA_ARCH >= 300 +} + +// We extend atomicExch to support extra data types +template +__device__ inline Type atomicExchCustom(Type* address, Type val) { + return atomicExch(address, val); +} + +template <> +__device__ inline double atomicExchCustom(double* address, double val) { + unsigned long long int* address_as_ull = reinterpret_cast(address); + return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val))); +} + +#ifdef EIGEN_HAS_GPU_FP16 +template +__device__ inline void atomicReduce(half2* output, half2 accum, R& reducer) { + unsigned int oldval = *reinterpret_cast(output); + unsigned int newval = oldval; + reducer.reducePacket(accum, 
reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + unsigned int readback; + while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) { + oldval = readback; + newval = oldval; + reducer.reducePacket(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + } +} +// reduction should be associative since reduction is not atomic in wide vector but atomic in half2 operations +template +__device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reducer) { + half2* houtput=reinterpret_cast(output); + half2* haccum=reinterpret_cast(&accum); + for(int i=0;i<4;++i){ + atomicReduce(houtput+i,*(haccum+i),reducer); + } +} +#endif // EIGEN_HAS_GPU_FP16 + +template <> +__device__ inline void atomicReduce(float* output, float accum, SumReducer&) { +#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300) + atomicAdd(output, accum); +#else // EIGEN_CUDA_ARCH >= 300 + gpu_assert(0 && "Shouldn't be called on unsupported device"); +#endif // EIGEN_CUDA_ARCH >= 300 +} + + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) { + const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const Index num_threads = blockDim.x * gridDim.x; + for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) { + output[i] = val; + } +} + + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs, + typename Self::CoeffReturnType* output, unsigned int* semaphore) { +#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300) + // Initialize the output value + const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x; + if (gridDim.x == 1) { + if (first_index == 0) { + *output = reducer.initialize(); + } + } + else { + if 
(threadIdx.x == 0) { + unsigned int block = atomicCAS(semaphore, 0u, 1u); + if (block == 0) { + // We're the first block to run, initialize the output value + atomicExchCustom(output, reducer.initialize()); + __threadfence(); + atomicExch(semaphore, 2u); + } + else { + // Wait for the first block to initialize the output value. + // Use atomicCAS here to ensure that the reads aren't cached + unsigned int val; + do { + val = atomicCAS(semaphore, 2u, 2u); + } + while (val < 2u); + } + } + } + + __syncthreads(); + + eigen_assert(gridDim.x == 1 || *semaphore >= 2u); + + typename Self::CoeffReturnType accum = reducer.initialize(); + Index max_iter = numext::mini(num_coeffs - first_index, NumPerThread*BlockSize); + for (Index i = 0; i < max_iter; i+=BlockSize) { + const Index index = first_index + i; + eigen_assert(index < num_coeffs); + typename Self::CoeffReturnType val = input.m_impl.coeff(index); + reducer.reduce(val, &accum); + } + +#pragma unroll + for (int offset = warpSize/2; offset > 0; offset /= 2) { + #if defined(EIGEN_HIPCC) + // use std::is_floating_point to determine the type of reduced_val + // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambguous" error + // and list the float and int versions of __shfl_down as the candidate functions. 
+ if (std::is_floating_point::value) { + reducer.reduce(__shfl_down(static_cast(accum), offset, warpSize), &accum); + } else { + reducer.reduce(__shfl_down(static_cast(accum), offset, warpSize), &accum); + } + #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 + reducer.reduce(__shfl_down(accum, offset, warpSize), &accum); + #else + reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum); + #endif + } + + if ((threadIdx.x & (warpSize - 1)) == 0) { + atomicReduce(output, accum, reducer); + } + + if (gridDim.x > 1 && threadIdx.x == 0) { + // Let the last block reset the semaphore + atomicInc(semaphore, gridDim.x + 1); +#if defined(EIGEN_HIPCC) + __threadfence_system(); +#endif + } +#else // EIGEN_CUDA_ARCH >= 300 + gpu_assert(0 && "Shouldn't be called on unsupported device"); +#endif // EIGEN_CUDA_ARCH >= 300 +} + + +#ifdef EIGEN_HAS_GPU_FP16 +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, + packet_traits::type* scratch) { + eigen_assert(blockDim.x == 1); + eigen_assert(gridDim.x == 1); + typedef packet_traits::type packet_type; + Index packet_remainder = + num_coeffs % Index(unpacket_traits::size); + if (packet_remainder != 0) { + half2* h2scratch = reinterpret_cast(scratch); + for (Index i = num_coeffs - packet_remainder; i + 2 <= num_coeffs; i += 2) { + *h2scratch = + __halves2half2(input.m_impl.coeff(i), input.m_impl.coeff(i + 1)); + h2scratch++; + } + if ((num_coeffs & 1) != 0) { + half lastCoeff = input.m_impl.coeff(num_coeffs - 1); + *h2scratch = __halves2half2(lastCoeff, reducer.initialize()); + } + } else { + *scratch = reducer.template initializePacket(); + } +} + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) { + const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const Index num_threads = blockDim.x * 
gridDim.x; + typedef typename packet_traits::type PacketType; + + const Index num_packets = + num_coeffs / Index(unpacket_traits::size); + PacketType* p_output = reinterpret_cast(output); + for (Index i = thread_id; i < num_packets; i += num_threads) { + p_output[i] = reducer.template initializePacket(); + } + Index packet_remainder = + num_coeffs % Index(unpacket_traits::size); + if (thread_id < packet_remainder) { + output[num_coeffs - packet_remainder + thread_id] = reducer.initialize(); + } +} + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, + half* output, packet_traits::type* scratch) { + typedef typename packet_traits::type PacketType; + const int packet_width = unpacket_traits::size; + eigen_assert(NumPerThread % packet_width == 0); + const Index first_index = + blockIdx.x * BlockSize * NumPerThread + packet_width * threadIdx.x; + + // Initialize the output value if it wasn't initialized by the ReductionInitKernel + + if (gridDim.x == 1) { + if (first_index == 0) { + int rem = num_coeffs % packet_width; + if (rem != 0) { + half2* p_scratch = reinterpret_cast(scratch); + *scratch = reducer.template initializePacket(); + for (int i = 0; i < rem / 2; i++) { + *p_scratch = __halves2half2( + input.m_impl.coeff(num_coeffs - packet_width + 2 * i), + input.m_impl.coeff(num_coeffs - packet_width + 2 * i + 1)); + p_scratch++; + } + if ((num_coeffs & 1) != 0) { + half last = input.m_impl.coeff(num_coeffs - 1); + *p_scratch = __halves2half2(last, reducer.initialize()); + } + } else { + *scratch = reducer.template initializePacket(); + } + } + __syncthreads(); + } + + PacketType accum = reducer.template initializePacket(); + const Index max_iter = + numext::mini((num_coeffs - first_index) / packet_width, + NumPerThread * BlockSize / packet_width); + for (Index i = 0; i < max_iter; i += BlockSize) { + const Index index = first_index + packet_width * i; + eigen_assert(index + 
packet_width < num_coeffs); + PacketType val = input.m_impl.template packet(index); + reducer.reducePacket(val, &accum); + } + +#pragma unroll + for (int offset = warpSize/2; offset > 0; offset /= 2) { + #if defined(EIGEN_HIPCC) + PacketType r1; + half2* hr = reinterpret_cast(&r1); + half2* hacc = reinterpret_cast(&accum); + for (int i = 0; i < packet_width / 2; i++) { + // FIXME : remove this workaround once we have native half/half2 support for __shfl_down + union { int i; half2 h; } wka_in, wka_out; + wka_in.h = hacc[i]; + wka_out.i = __shfl_down(wka_in.i, offset, warpSize); + hr[i] = wka_out.h; + } + reducer.reducePacket(r1, &accum); + #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 + PacketType r1; + half2* hr = reinterpret_cast(&r1); + half2* hacc = reinterpret_cast(&accum); + for (int i = 0; i < packet_width / 2; i++) { + hr[i] = __shfl_down(hacc[i], offset, warpSize); + } + reducer.reducePacket(r1, &accum); + #else + PacketType r1; + half2* hr = reinterpret_cast(&r1); + half2* hacc = reinterpret_cast(&accum); + for (int i = 0; i < packet_width / 2; i++) { + hr[i] = __shfl_down_sync(0xFFFFFFFF, hacc[i], (unsigned)offset, warpSize); + } + reducer.reducePacket(r1, &accum); + + #endif + } + + if ((threadIdx.x & (warpSize - 1)) == 0) { + atomicReduce(scratch, accum, reducer); + } + + __syncthreads(); + half2* rv1 = reinterpret_cast(scratch); + if (packet_width > 2) { + reducer.reducePacket(rv1[2], rv1); + reducer.reducePacket(rv1[3], rv1 + 1); + reducer.reducePacket(rv1[1], rv1); + } + if (gridDim.x == 1) { + if (first_index == 0) { + half tmp = __low2half(*rv1); + reducer.reduce(__high2half(*rv1), &tmp); + *output = tmp; + } + } +} + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, packet_traits::type* scratch) { + eigen_assert(threadIdx.x == 1); + half2* pscratch = reinterpret_cast(scratch); + half tmp = __float2half(0.f); + typedef packet_traits::type packet_type; + for (int i = 
0; i < unpacket_traits::size; i += 2) { + reducer.reduce(__low2half(*pscratch), &tmp); + reducer.reduce(__high2half(*pscratch), &tmp); + pscratch++; + } + *output = tmp; +} + +#endif // EIGEN_HAS_GPU_FP16 + +template +struct FullReductionLauncher { + static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) { + gpu_assert(false && "Should only be called on doubles, floats and half floats"); + } +}; + +// Specialization for float and double +template +struct FullReductionLauncher< + Self, Op, OutputType, PacketAccess, + typename internal::enable_if< + internal::is_same::value || + internal::is_same::value, + void>::type> { + static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) { + + typedef typename Self::Index Index; + const int block_size = 256; + const int num_per_thread = 128; + const int num_blocks = divup(num_coeffs, block_size * num_per_thread); + + unsigned int* semaphore = NULL; + if (num_blocks > 1) { + semaphore = device.semaphore(); + } + + LAUNCH_GPU_KERNEL((FullReductionKernel), + num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore); + } +}; + +#ifdef EIGEN_HAS_GPU_FP16 +template +struct FullReductionLauncher { + static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) { + gpu_assert(false && "Should not be called since there is no packet accessor"); + } +}; + +template +struct FullReductionLauncher { + static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) { + typedef typename Self::Index Index; + typedef typename packet_traits::type PacketType; + + const int block_size = 256; + const int num_per_thread = 128; + const int num_blocks = divup(num_coeffs, block_size * num_per_thread); + PacketType* scratch = static_cast(device.scratchpad()); + // half2* scratch = static_cast(device.scratchpad()); + + if (num_blocks > 1) { + // We initialize 
the output and the scrathpad outside the reduction kernel when we can't be sure that there + // won't be a race conditions between multiple thread blocks. + LAUNCH_GPU_KERNEL((ReductionInitFullReduxKernelHalfFloat), + 1, 1, 0, device, reducer, self, num_coeffs, scratch); + } + + LAUNCH_GPU_KERNEL((FullReductionKernelHalfFloat), + num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch); + + if (num_blocks > 1) { + LAUNCH_GPU_KERNEL((ReductionCleanupKernelHalfFloat), + 1, 1, 0, device, reducer, output, scratch); + } + } +}; +#endif // EIGEN_HAS_GPU_FP16 + + +template +struct FullReducer { + // Unfortunately nvidia doesn't support well exotic types such as complex, + // so reduce the scope of the optimized version of the code to the simple cases + // of doubles, floats and half floats +#ifdef EIGEN_HAS_GPU_FP16 + static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful && + (internal::is_same::value || + internal::is_same::value || + (internal::is_same::value && reducer_traits::PacketAccess)); +#else // EIGEN_HAS_GPU_FP16 + static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful && + (internal::is_same::value || + internal::is_same::value); +#endif // EIGEN_HAS_GPU_FP16 + + template + static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) { + gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats"); + const Index num_coeffs = array_prod(self.m_impl.dimensions()); + // Don't crash when we're called with an input tensor of size 0. 
+ if (num_coeffs == 0) { + return; + } + + FullReductionLauncher::PacketAccess>::run(self, reducer, device, output, num_coeffs); + } +}; + + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs, + typename Self::CoeffReturnType* output) { +#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300) + typedef typename Self::CoeffReturnType Type; + eigen_assert(blockDim.y == 1); + eigen_assert(blockDim.z == 1); + eigen_assert(gridDim.y == 1); + eigen_assert(gridDim.z == 1); + + const int unroll_times = 16; + eigen_assert(NumPerThread % unroll_times == 0); + + const Index input_col_blocks = divup(num_coeffs_to_reduce, blockDim.x * NumPerThread); + const Index num_input_blocks = input_col_blocks * num_preserved_coeffs; + + const Index num_threads = blockDim.x * gridDim.x; + const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + // Initialize the output values if they weren't initialized by the ReductionInitKernel + if (gridDim.x == 1) { + for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) { + output[i] = reducer.initialize(); + } + __syncthreads(); + } + + for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) { + const Index row = i / input_col_blocks; + + if (row < num_preserved_coeffs) { + const Index col_block = i % input_col_blocks; + const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x; + + Type reduced_val = reducer.initialize(); + + for (Index j = 0; j < NumPerThread; j += unroll_times) { + const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1); + if (last_col >= num_coeffs_to_reduce) { + for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) { + const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col); + reducer.reduce(val, &reduced_val); + } + break; + } else { + // 
Faster version of the loop with no branches after unrolling. +#pragma unroll + for (int k = 0; k < unroll_times; ++k) { + const Index col = col_begin + blockDim.x * (j + k); + reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val); + } + } + } + +#pragma unroll + for (int offset = warpSize/2; offset > 0; offset /= 2) { + #if defined(EIGEN_HIPCC) + // use std::is_floating_point to determine the type of reduced_val + // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambguous" error + // and list the float and int versions of __shfl_down as the candidate functions. + if (std::is_floating_point::value) { + reducer.reduce(__shfl_down(static_cast(reduced_val), offset), &reduced_val); + } else { + reducer.reduce(__shfl_down(static_cast(reduced_val), offset), &reduced_val); + } + #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 + reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val); + #else + reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val); + #endif + } + + if ((threadIdx.x & (warpSize - 1)) == 0) { + atomicReduce(&(output[row]), reduced_val, reducer); + } + } + } +#else // EIGEN_CUDA_ARCH >= 300 + gpu_assert(0 && "Shouldn't be called on unsupported device"); +#endif // EIGEN_CUDA_ARCH >= 300 +} + +#ifdef EIGEN_HAS_GPU_FP16 + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs, + half* output) { + eigen_assert(blockDim.y == 1); + eigen_assert(blockDim.z == 1); + eigen_assert(gridDim.y == 1); + eigen_assert(gridDim.z == 1); + + typedef typename packet_traits::type PacketType; + const int packet_width = unpacket_traits::size; + const int unroll_times = 16 / packet_width; + eigen_assert(NumPerThread % unroll_times == 0); + eigen_assert(unroll_times % 2 == 0); + + const Index input_col_blocks = divup(num_coeffs_to_reduce, blockDim.x 
* NumPerThread * 2); + const Index num_input_blocks = divup(input_col_blocks * num_preserved_coeffs, 2); + + const Index num_threads = blockDim.x * gridDim.x; + const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + // Initialize the output values if they weren't initialized by the ReductionInitKernel + if (gridDim.x == 1) { + Index i = packet_width * thread_id; + for (; i + packet_width <= num_preserved_coeffs; + i += packet_width * num_threads) { + PacketType* poutput = reinterpret_cast(output + i); + *poutput = reducer.template initializePacket(); + } + if (i < num_preserved_coeffs) { + output[i] = reducer.initialize(); + } + __syncthreads(); + } + + for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) { + const Index row = 2 * (i / input_col_blocks); // everybody takes 2 rows + + if (row + 1 < num_preserved_coeffs) { + const Index col_block = i % input_col_blocks; + const Index col_begin = + packet_width * (col_block * blockDim.x * NumPerThread + threadIdx.x); + + PacketType reduced_val1 = reducer.template initializePacket(); + PacketType reduced_val2 = reducer.template initializePacket(); + + for (Index j = 0; j < NumPerThread; j += unroll_times) { + const Index last_col = + col_begin + blockDim.x * (j + unroll_times - 1) * packet_width; + if (last_col >= num_coeffs_to_reduce) { + Index col = col_begin + blockDim.x * j; + for (; col + packet_width <= num_coeffs_to_reduce; + col += blockDim.x) { + const PacketType val1 = input.m_impl.template packet( + row * num_coeffs_to_reduce + col); + reducer.reducePacket(val1, &reduced_val1); + const PacketType val2 = input.m_impl.template packet( + (row + 1) * num_coeffs_to_reduce + col); + reducer.reducePacket(val2, &reduced_val2); + } + if (col < num_coeffs_to_reduce) { + PacketType r1 = reducer.template initializePacket(); + PacketType r2 = reducer.template initializePacket(); + half2* hr1 = reinterpret_cast(&r1); + half2* hr2 = reinterpret_cast(&r2); + while (col + 1 < num_coeffs_to_reduce) { + 
*hr1 = __halves2half2( + input.m_impl.coeff(row * num_coeffs_to_reduce + col), + input.m_impl.coeff(row * num_coeffs_to_reduce + col + 1)); + *hr2 = __halves2half2( + input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col), + input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col + + 1)); + hr1++; + hr2++; + col += 2; + } + if (col < num_coeffs_to_reduce) { + // Peel; + const half last1 = + input.m_impl.coeff(row * num_coeffs_to_reduce + col); + *hr1 = __halves2half2(last1, reducer.initialize()); + const half last2 = + input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col); + *hr2 = __halves2half2(last2, reducer.initialize()); + } + reducer.reducePacket(r1, &reduced_val1); + reducer.reducePacket(r2, &reduced_val2); + } + break; + } else { + // Faster version of the loop with no branches after unrolling. +#pragma unroll + for (int k = 0; k < unroll_times; ++k) { + const Index col = col_begin + blockDim.x * (j + k) * packet_width; + reducer.reducePacket(input.m_impl.template packet( + row * num_coeffs_to_reduce + col), + &reduced_val1); + reducer.reducePacket(input.m_impl.template packet( + (row + 1) * num_coeffs_to_reduce + col), + &reduced_val2); + } + } + } + +#pragma unroll + for (int offset = warpSize/2; offset > 0; offset /= 2) { + #if defined(EIGEN_HIPCC) + PacketType r1; + PacketType r2; + half2* hr1 = reinterpret_cast(&r1); + half2* hr2 = reinterpret_cast(&r2); + half2* rv1 = reinterpret_cast(&reduced_val1); + half2* rv2 = reinterpret_cast(&reduced_val2); + for (int i = 0; i < packet_width / 2; i++) { + // FIXME : remove this workaround once we have native half/half2 support for __shfl_down + union { int i; half2 h; } wka_in1, wka_out1; + wka_in1.h = rv1[i]; + wka_out1.i = __shfl_down(wka_in1.i, offset, warpSize); + hr1[i] = wka_out1.h; + + union { int i; half2 h; } wka_in2, wka_out2; + wka_in2.h = rv2[i]; + wka_out2.i = __shfl_down(wka_in2.i, offset, warpSize); + hr2[i] = wka_out2.h; + } + reducer.reducePacket(r1, &reduced_val1); + 
reducer.reducePacket(r2, &reduced_val2); + #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 + PacketType r1; + PacketType r2; + half2* hr1 = reinterpret_cast(&r1); + half2* hr2 = reinterpret_cast(&r2); + half2* rv1 = reinterpret_cast(&reduced_val1); + half2* rv2 = reinterpret_cast(&reduced_val2); + for (int i = 0; i < packet_width / 2; i++) { + hr1[i] = __shfl_down(rv1[i], offset, warpSize); + hr2[i] = __shfl_down(rv2[i], offset, warpSize); + } + reducer.reducePacket(r1, &reduced_val1); + reducer.reducePacket(r2, &reduced_val2); + #else + PacketType r1; + PacketType r2; + half2* hr1 = reinterpret_cast(&r1); + half2* hr2 = reinterpret_cast(&r2); + half2* rr1 = reinterpret_cast(&reduced_val1); + half2* rr2 = reinterpret_cast(&reduced_val2); + for (int i = 0; i < packet_width / 2; i++) { + hr1[i] = + __shfl_down_sync(0xFFFFFFFF, rr1[i], (unsigned)offset, warpSize); + hr2[i] = + __shfl_down_sync(0xFFFFFFFF, rr2[i], (unsigned)offset, warpSize); + } + reducer.reducePacket(r1, &reduced_val1); + reducer.reducePacket(r2, &reduced_val2); + + #endif + } + half2* rv1 = reinterpret_cast(&reduced_val1); + half2* rv2 = reinterpret_cast(&reduced_val2); + half2 val; + if (packet_width > 2) { + reducer.reducePacket(rv1[2], rv1); + reducer.reducePacket(rv1[3], rv1 + 1); + reducer.reducePacket(rv1[1], rv1); + reducer.reducePacket(rv2[2], rv2); + reducer.reducePacket(rv2[3], rv2 + 1); + reducer.reducePacket(rv2[1], rv2); + } + half val1 = __low2half(*rv1); + reducer.reduce(__high2half(*rv1), &val1); + half val2 = __low2half(*rv2); + reducer.reduce(__high2half(*rv2), &val2); + val = __halves2half2(val1, val2); + if ((threadIdx.x & (warpSize - 1)) == 0) { + half* loc = output + row; + atomicReduce((half2*)loc, val, reducer); + } + } + } +} + +#endif // EIGEN_HAS_GPU_FP16 + +template +struct InnerReductionLauncher { + static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) { + gpu_assert(false && "Should 
only be called to reduce doubles, floats and half floats on a gpu device"); + return true; + } +}; + +// Specialization for float and double +template +struct InnerReductionLauncher< + Self, Op, OutputType, PacketAccess, + typename internal::enable_if< + internal::is_same::value || + internal::is_same::value, + void>::type> { + static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + typedef typename Self::Index Index; + + const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; + const int block_size = 256; + const int num_per_thread = 128; + const int dyn_blocks = divup(num_coeffs, block_size * num_per_thread); + const int max_blocks = device.getNumGpuMultiProcessors() * + device.maxGpuThreadsPerMultiProcessor() / block_size; + const int num_blocks = numext::mini(max_blocks, dyn_blocks); + + if (num_blocks > 1) { + // We initialize the outputs outside the reduction kernel when we can't be sure that there + // won't be race conditions between multiple thread blocks. 
+ const int dyn_blocks = divup(num_preserved_vals, 1024); + const int max_blocks = device.getNumGpuMultiProcessors() * + device.maxGpuThreadsPerMultiProcessor() / 1024; + const int num_blocks = numext::mini(max_blocks, dyn_blocks); + LAUNCH_GPU_KERNEL((ReductionInitKernel), + num_blocks, 1024, 0, device, reducer.initialize(), + num_preserved_vals, output); + } + + LAUNCH_GPU_KERNEL((InnerReductionKernel), + num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); + + return false; + } +}; + +#ifdef EIGEN_HAS_GPU_FP16 +template +struct InnerReductionLauncher { + static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) { + gpu_assert(false && "Should not be called since there is no packet accessor"); + return true; + } +}; + +template +struct InnerReductionLauncher { + static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + typedef typename Self::Index Index; + + if (num_preserved_vals % 2 != 0) { + // Not supported yet, revert to the slower code path + return true; + } + + const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; + const int block_size = /*256*/128; + const int num_per_thread = /*128*/64; + const int dyn_blocks = divup(num_coeffs, block_size * num_per_thread); + const int max_blocks = device.getNumGpuMultiProcessors() * + device.maxGpuThreadsPerMultiProcessor() / block_size; + const int num_blocks = numext::mini(max_blocks, dyn_blocks); + + if (num_blocks > 1) { + // We initialize the outputs outside the reduction kernel when we can't be sure that there + // won't be race conditions between multiple thread blocks. 
+ LAUNCH_GPU_KERNEL((ReductionInitKernelHalfFloat), + 1, 1, 0, device, reducer, self, num_preserved_vals, output); + } + + LAUNCH_GPU_KERNEL((InnerReductionKernelHalfFloat), + num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); + + return false; + } +}; +#endif // EIGEN_HAS_GPU_FP16 + + +template +struct InnerReducer { + // Unfortunately nvidia doesn't support well exotic types such as complex, + // so reduce the scope of the optimized version of the code to the simple case + // of floats and half floats. +#ifdef EIGEN_HAS_GPU_FP16 + static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful && + (internal::is_same::value || + internal::is_same::value || + (internal::is_same::value && reducer_traits::PacketAccess)); +#else // EIGEN_HAS_GPU_FP16 + static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful && + (internal::is_same::value || + internal::is_same::value); +#endif // EIGEN_HAS_GPU_FP16 + + template + static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats"); + const Index num_coeffs = array_prod(self.m_impl.dimensions()); + // Don't crash when we're called with an input tensor of size 0. + if (num_coeffs == 0) { + return true; + } + // It's faster to use the usual code. 
+ if (num_coeffs_to_reduce <= 128) { + return true; + } + + return InnerReductionLauncher::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals); + } +}; + +template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs, + typename Self::CoeffReturnType* output) { + const Index num_threads = blockDim.x * gridDim.x; + const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; + // Initialize the output values if they weren't initialized by the ReductionInitKernel + if (gridDim.x == 1) { + for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) { + output[i] = reducer.initialize(); + } + __syncthreads(); + } + + // Do the reduction. + const Index max_iter = num_preserved_coeffs * divup(num_coeffs_to_reduce, NumPerThread); + for (Index i = thread_id; i < max_iter; i += num_threads) { + const Index input_col = i % num_preserved_coeffs; + const Index input_row = (i / num_preserved_coeffs) * NumPerThread; + typename Self::CoeffReturnType reduced_val = reducer.initialize(); + const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce); + for (Index j = input_row; j < max_row; j++) { + typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col); + reducer.reduce(val, &reduced_val); + } + atomicReduce(&(output[input_col]), reduced_val, reducer); + } +} + + +template +struct OuterReducer { + // Unfortunately nvidia doesn't support well exotic types such as complex, + // so reduce the scope of the optimized version of the code to the simple case + // of floats. 
+ static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful && + (internal::is_same::value || + internal::is_same::value); + template + static + #if !defined(EIGEN_HIPCC) + // FIXME : leaving this EIGEN_DEVICE_FUNC in, results in the following runtime error + // (in the cxx11_tensor_reduction_gpu test) + // + // terminate called after throwing an instance of 'std::runtime_error' + // what(): No device code available for function: _ZN5Eigen8internal20OuterReductionKernelIL... + // + // don't know why this happens (and why is it a runtime error instead of a compile time error) + // + // this will be fixed by HIP PR#457 + EIGEN_DEVICE_FUNC + #endif + bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { + gpu_assert(false && "Should only be called to reduce doubles or floats on a gpu device"); + return true; + } + + static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + typedef typename Self::Index Index; + + // It's faster to use the usual code. + if (num_coeffs_to_reduce <= 32) { + return true; + } + + const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; + const int block_size = 256; + const int num_per_thread = 16; + const int dyn_blocks = divup(num_coeffs, block_size * num_per_thread); + const int max_blocks = device.getNumGpuMultiProcessors() * + device.maxGpuThreadsPerMultiProcessor() / block_size; + const int num_blocks = numext::mini(max_blocks, dyn_blocks); + + if (num_blocks > 1) { + // We initialize the outputs outside the reduction kernel when we can't be sure that there + // won't be race conditions between multiple thread blocks. 
+ const int dyn_blocks = divup(num_preserved_vals, 1024); + const int max_blocks = device.getNumGpuMultiProcessors() * + device.maxGpuThreadsPerMultiProcessor() / 1024; + const int num_blocks = numext::mini(max_blocks, dyn_blocks); + LAUNCH_GPU_KERNEL((ReductionInitKernel), + num_blocks, 1024, 0, device, reducer.initialize(), + num_preserved_vals, output); + } + + LAUNCH_GPU_KERNEL((OuterReductionKernel), + num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); + + return false; + } +}; + +#endif // defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC) + + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorReductionSycl.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorReductionSycl.h new file mode 100644 index 0000000..474eba0 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorReductionSycl.h @@ -0,0 +1,582 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +/***************************************************************** + * TensorReductionSycl.h + * + * \brief: + * This is the specialization of the reduction operation. Two phase reduction approach + * is used since the GPU does not have Global Synchronization for global memory among + * different work-group/thread block. To solve the problem, we need to create two kernels + * to reduce the data, where the first kernel reduce the data locally and each local + * workgroup/thread-block save the input data into global memory. 
In the second phase (global reduction) + * one work-group uses one work-group/thread-block to reduces the intermediate data into one single element. + * Here is an NVIDIA presentation explaining the optimized two phase reduction algorithm on GPU: + * https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf + * + *****************************************************************/ + +#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP +#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP +namespace Eigen { +namespace TensorSycl { +namespace internal { + +template +struct OpDefiner { + typedef typename Vectorise::PacketReturnType PacketReturnType; + typedef Op type; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Op &op) { return op; } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator, + const Index &) { + return accumulator; + } +}; + +template +struct OpDefiner, CoeffReturnType, Index, false> { + typedef Eigen::internal::SumReducer type; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer &) { + return type(); + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType finalise_op(const CoeffReturnType &accumulator, + const Index &scale) { + ::Eigen::internal::scalar_quotient_op quotient_op; + return quotient_op(accumulator, CoeffReturnType(scale)); + } +}; + +template +struct OpDefiner, CoeffReturnType, Index, true> { + typedef typename Vectorise::PacketReturnType PacketReturnType; + typedef Eigen::internal::SumReducer type; + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer &) { + return type(); + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator, + const Index &scale) { + return ::Eigen::internal::pdiv(accumulator, ::Eigen::internal::pset1(CoeffReturnType(scale))); + } +}; + +template +struct 
SecondStepFullReducer { + typedef cl::sycl::accessor + LocalAccessor; + typedef OpDefiner OpDef; + typedef typename OpDef::type Op; + LocalAccessor scratch; + InputAccessor aI; + OutputAccessor outAcc; + Op op; + SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_) + : scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {} + + void operator()(cl::sycl::nd_item<1> itemID) { + // Our empirical research shows that the best performance will be achieved + // when there is only one element per thread to reduce in the second step. + // in this step the second step reduction time is almost negligible. + // Hence, in the second step of reduction the input size is fixed to the + // local size, thus, there is only one element read per thread. The + // algorithm must be changed if the number of reduce per thread in the + // second step is greater than 1. Otherwise, the result will be wrong. + const Index localid = itemID.get_local_id(0); + auto aInPtr = aI.get_pointer() + localid; + auto aOutPtr = outAcc.get_pointer(); + CoeffReturnType *scratchptr = scratch.get_pointer(); + CoeffReturnType accumulator = *aInPtr; + + scratchptr[localid] = op.finalize(accumulator); + for (Index offset = itemID.get_local_range(0) / 2; offset > 0; offset /= 2) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (localid < offset) { + op.reduce(scratchptr[localid + offset], &accumulator); + scratchptr[localid] = op.finalize(accumulator); + } + } + if (localid == 0) *aOutPtr = op.finalize(accumulator); + } +}; + +// Full reduction first phase. In this version the vectorization is true and the reduction accept +// any generic reducerOp e.g( max, min, sum, mean, iamax, iamin, etc ). 
+template +class FullReductionKernelFunctor { + public: + typedef typename Evaluator::CoeffReturnType CoeffReturnType; + typedef typename Evaluator::Index Index; + typedef OpDefiner + OpDef; + + typedef typename OpDef::type Op; + typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType; + typedef typename Evaluator::PacketReturnType PacketReturnType; + typedef + typename ::Eigen::internal::conditional<(Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess), + PacketReturnType, CoeffReturnType>::type OutType; + typedef cl::sycl::accessor + LocalAccessor; + LocalAccessor scratch; + Evaluator evaluator; + EvaluatorPointerType final_output; + Index rng; + Op op; + + FullReductionKernelFunctor(LocalAccessor scratch_, Evaluator evaluator_, EvaluatorPointerType final_output_, + Index rng_, OpType op_) + : scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {} + + void operator()(cl::sycl::nd_item<1> itemID) { compute_reduction(itemID); } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if::type compute_reduction( + const cl::sycl::nd_item<1> &itemID) { + auto output_ptr = final_output.get_pointer(); + Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize; + Index globalid = itemID.get_global_id(0); + Index localid = itemID.get_local_id(0); + Index step = Evaluator::PacketSize * itemID.get_global_range(0); + Index start = Evaluator::PacketSize * globalid; + // vectorizable parts + PacketReturnType packetAccumulator = op.template initializePacket(); + for (Index i = start; i < VectorizedRange; i += step) { + op.template reducePacket(evaluator.impl().template packet(i), &packetAccumulator); + } + globalid += VectorizedRange; + // non vectorizable parts + for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) { + op.template reducePacket( + ::Eigen::TensorSycl::internal::PacketWrapper::convert_to_packet_type( + 
evaluator.impl().coeff(i), op.initialize()), + &packetAccumulator); + } + scratch[localid] = packetAccumulator = + OpDef::finalise_op(op.template finalizePacket(packetAccumulator), rng); + // reduction parts // Local size is always power of 2 + EIGEN_UNROLL_LOOP + for (Index offset = local_range / 2; offset > 0; offset /= 2) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (localid < offset) { + op.template reducePacket(scratch[localid + offset], &packetAccumulator); + scratch[localid] = op.template finalizePacket(packetAccumulator); + } + } + if (localid == 0) { + output_ptr[itemID.get_group(0)] = + op.finalizeBoth(op.initialize(), op.template finalizePacket(packetAccumulator)); + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if::type compute_reduction( + const cl::sycl::nd_item<1> &itemID) { + auto output_ptr = final_output.get_pointer(); + Index globalid = itemID.get_global_id(0); + Index localid = itemID.get_local_id(0); + // vectorizable parts + CoeffReturnType accumulator = op.initialize(); + // non vectorizable parts + for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) { + op.reduce(evaluator.impl().coeff(i), &accumulator); + } + scratch[localid] = accumulator = OpDef::finalise_op(op.finalize(accumulator), rng); + + // reduction parts. 
the local size is always power of 2 + EIGEN_UNROLL_LOOP + for (Index offset = local_range / 2; offset > 0; offset /= 2) { + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (localid < offset) { + op.reduce(scratch[localid + offset], &accumulator); + scratch[localid] = op.finalize(accumulator); + } + } + if (localid == 0) { + output_ptr[itemID.get_group(0)] = op.finalize(accumulator); + } + } +}; + +template +class GenericNondeterministicReducer { + public: + typedef typename Evaluator::CoeffReturnType CoeffReturnType; + typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType; + typedef typename Evaluator::Index Index; + typedef OpDefiner OpDef; + typedef typename OpDef::type Op; + template + GenericNondeterministicReducer(Scratch, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType functor_, + Index range_, Index num_values_to_reduce_) + : evaluator(evaluator_), + output_accessor(output_accessor_), + functor(OpDef::get_op(functor_)), + range(range_), + num_values_to_reduce(num_values_to_reduce_) {} + + void operator()(cl::sycl::nd_item<1> itemID) { + auto output_accessor_ptr = output_accessor.get_pointer(); + /// const cast added as a naive solution to solve the qualifier drop error + Index globalid = static_cast(itemID.get_global_linear_id()); + if (globalid < range) { + CoeffReturnType accum = functor.initialize(); + Eigen::internal::GenericDimReducer::reduce( + evaluator, evaluator.firstInput(globalid), functor, &accum); + output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce); + } + } + + private: + Evaluator evaluator; + EvaluatorPointerType output_accessor; + Op functor; + Index range; + Index num_values_to_reduce; +}; + +enum class reduction_dim { inner_most, outer_most }; +// default is preserver +template +struct PartialReductionKernel { + typedef typename Evaluator::CoeffReturnType CoeffReturnType; + typedef typename Evaluator::EvaluatorPointerType 
EvaluatorPointerType; + typedef typename Evaluator::Index Index; + typedef OpDefiner OpDef; + typedef typename OpDef::type Op; + typedef cl::sycl::accessor + ScratchAcc; + ScratchAcc scratch; + Evaluator evaluator; + EvaluatorPointerType output_accessor; + Op op; + const Index preserve_elements_num_groups; + const Index reduce_elements_num_groups; + const Index num_coeffs_to_preserve; + const Index num_coeffs_to_reduce; + + PartialReductionKernel(ScratchAcc scratch_, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType op_, + const Index preserve_elements_num_groups_, const Index reduce_elements_num_groups_, + const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_) + : scratch(scratch_), + evaluator(evaluator_), + output_accessor(output_accessor_), + op(OpDef::get_op(op_)), + preserve_elements_num_groups(preserve_elements_num_groups_), + reduce_elements_num_groups(reduce_elements_num_groups_), + num_coeffs_to_preserve(num_coeffs_to_preserve_), + num_coeffs_to_reduce(num_coeffs_to_reduce_) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId, + CoeffReturnType &accumulator) { + if (globalPId >= num_coeffs_to_preserve) { + return; + } + Index global_offset = rt == reduction_dim::outer_most ? globalPId + (globalRId * num_coeffs_to_preserve) + : globalRId + (globalPId * num_coeffs_to_reduce); + Index localOffset = globalRId; + + const Index per_thread_local_stride = PannelParameters::LocalThreadSizeR * reduce_elements_num_groups; + const Index per_thread_global_stride = + rt == reduction_dim::outer_most ? 
num_coeffs_to_preserve * per_thread_local_stride : per_thread_local_stride; + for (Index i = globalRId; i < num_coeffs_to_reduce; i += per_thread_local_stride) { + op.reduce(evaluator.impl().coeff(global_offset), &accumulator); + localOffset += per_thread_local_stride; + global_offset += per_thread_global_stride; + } + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { + const Index linearLocalThreadId = itemID.get_local_id(0); + Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP + : linearLocalThreadId / PannelParameters::LocalThreadSizeR; + Index rLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId / PannelParameters::LocalThreadSizeP + : linearLocalThreadId % PannelParameters::LocalThreadSizeR; + const Index pGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) % preserve_elements_num_groups + : itemID.get_group(0) / reduce_elements_num_groups; + const Index rGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) / preserve_elements_num_groups + : itemID.get_group(0) % reduce_elements_num_groups; + + Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId; + const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId; + auto scratchPtr = scratch.get_pointer().get(); + auto outPtr = + output_accessor.get_pointer() + (reduce_elements_num_groups > 1 ? 
rGroupId * num_coeffs_to_preserve : 0); + CoeffReturnType accumulator = op.initialize(); + + element_wise_reduce(globalRId, globalPId, accumulator); + + accumulator = OpDef::finalise_op(op.finalize(accumulator), num_coeffs_to_reduce); + scratchPtr[pLocalThreadId + rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)] = + accumulator; + if (rt == reduction_dim::inner_most) { + pLocalThreadId = linearLocalThreadId % PannelParameters::LocalThreadSizeP; + rLocalThreadId = linearLocalThreadId / PannelParameters::LocalThreadSizeP; + globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId; + } + + /* Apply the reduction operation between the current local + * id and the one on the other half of the vector. */ + auto out_scratch_ptr = + scratchPtr + (pLocalThreadId + (rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC))); + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (rt == reduction_dim::inner_most) { + accumulator = *out_scratch_ptr; + } + // The Local LocalThreadSizeR is always power of 2 + EIGEN_UNROLL_LOOP + for (Index offset = PannelParameters::LocalThreadSizeR >> 1; offset > 0; offset >>= 1) { + if (rLocalThreadId < offset) { + op.reduce(out_scratch_ptr[(PannelParameters::LocalThreadSizeP + PannelParameters::BC) * offset], &accumulator); + // The result has already been divided for mean reducer in the + // previous reduction so no need to divide furthermore + *out_scratch_ptr = op.finalize(accumulator); + } + /* All threads collectively read from global memory into local. + * The barrier ensures all threads' IO is resolved before + * execution continues (strictly speaking, all threads within + * a single work-group - there is no co-ordination between + * work-groups, only work-items). 
*/ + itemID.barrier(cl::sycl::access::fence_space::local_space); + } + + if (rLocalThreadId == 0 && (globalPId < num_coeffs_to_preserve)) { + outPtr[globalPId] = op.finalize(accumulator); + } + } +}; + +template +struct SecondStepPartialReduction { + typedef OpDefiner OpDef; + typedef typename OpDef::type Op; + typedef cl::sycl::accessor + ScratchAccessor; + InputAccessor input_accessor; + OutputAccessor output_accessor; + Op op; + const Index num_coeffs_to_preserve; + const Index num_coeffs_to_reduce; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SecondStepPartialReduction(ScratchAccessor, InputAccessor input_accessor_, + OutputAccessor output_accessor_, OpType op_, + const Index num_coeffs_to_preserve_, + const Index num_coeffs_to_reduce_) + : input_accessor(input_accessor_), + output_accessor(output_accessor_), + op(OpDef::get_op(op_)), + num_coeffs_to_preserve(num_coeffs_to_preserve_), + num_coeffs_to_reduce(num_coeffs_to_reduce_) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { + const Index globalId = itemID.get_global_id(0); + + if (globalId >= num_coeffs_to_preserve) return; + + auto in_ptr = input_accessor.get_pointer() + globalId; + + OutScalar accumulator = op.initialize(); +// num_coeffs_to_reduce is not bigger than 256 + for (Index i = 0; i < num_coeffs_to_reduce; i++) { + op.reduce(*in_ptr, &accumulator); + in_ptr += num_coeffs_to_preserve; + } + output_accessor.get_pointer()[globalId] = op.finalize(accumulator); + } +}; // struct SecondStepPartialReduction + +template +struct ReductionPannel { + static EIGEN_CONSTEXPR Index LocalThreadSizeP = LTP; + static EIGEN_CONSTEXPR Index LocalThreadSizeR = LTR; + static EIGEN_CONSTEXPR bool BC = BC_; +}; + +template +struct PartialReducerLauncher { + typedef typename Self::EvaluatorPointerType EvaluatorPointerType; + typedef typename Self::CoeffReturnType CoeffReturnType; + typedef typename Self::Storage Storage; + typedef typename Self::Index Index; + typedef ReductionPannel + 
PannelParameters; + + typedef PartialReductionKernel SyclReducerKerneType; + + static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output, + Index num_coeffs_to_reduce, Index num_coeffs_to_preserve) { + Index roundUpP = roundUp(num_coeffs_to_preserve, PannelParameters::LocalThreadSizeP); + + // getPowerOfTwo makes sure local range is power of 2 and <= + // maxSyclThreadPerBlock this will help us to avoid extra check on the + // kernel + static_assert(!((PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR) & + (PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR - 1)), + "The Local thread size must be a power of 2 for the reduction " + "operation"); + + EIGEN_CONSTEXPR Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR; + // In this step, we force the code not to be more than 2-step reduction: + // Our empirical research shows that if each thread reduces at least 64 + // elements individually, we get better performance. However, this can change + // on different platforms. In this step we force the code not to be + // more than 2-step reduction: Our empirical research shows that for inner_most + // dim reducer, it is better to have 8 groups in a reduce dimension for sizes + // > 1024 to achieve the best performance. + const Index reductionPerThread = 64; + Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(), true); + const Index pNumGroups = roundUpP / PannelParameters::LocalThreadSizeP; + Index rGroups = (cu + pNumGroups - 1) / pNumGroups; + const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? 
std::min(rGroups, localRange) : 1; + const Index globalRange = pNumGroups * rNumGroups * localRange; + + EIGEN_CONSTEXPR Index scratchSize = + PannelParameters::LocalThreadSizeR * (PannelParameters::LocalThreadSizeP + PannelParameters::BC); + auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange)); + if (rNumGroups > 1) { + CoeffReturnType *temp_pointer = static_cast( + dev.allocate_temp(num_coeffs_to_preserve * rNumGroups * sizeof(CoeffReturnType))); + EvaluatorPointerType temp_accessor = dev.get(temp_pointer); + dev.template unary_kernel_launcher( + self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve, + num_coeffs_to_reduce); + + typedef SecondStepPartialReduction + SecondStepPartialReductionKernel; + + dev.template unary_kernel_launcher( + temp_accessor, output, + cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)), Index(1), + reducer, num_coeffs_to_preserve, rNumGroups); + + self.device().deallocate_temp(temp_pointer); + } else { + dev.template unary_kernel_launcher( + self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve, + num_coeffs_to_reduce); + } + return false; + } +}; +} // namespace internal +} // namespace TensorSycl + +namespace internal { + +template +struct FullReducer { + typedef typename Self::CoeffReturnType CoeffReturnType; + typedef typename Self::EvaluatorPointerType EvaluatorPointerType; + static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true; + static EIGEN_CONSTEXPR int PacketSize = Self::PacketAccess ? 
Self::PacketSize : 1; + static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data) { + typedef typename conditional::type OutType; + static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) & + (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)), + "The Local thread size must be a power of 2 for the reduction " + "operation"); + EIGEN_CONSTEXPR Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1; + + typename Self::Index inputSize = self.impl().dimensions().TotalSize(); + // In this step we force the code not to be more than 2-step reduction: + // Our empirical research shows that if each thread reduces at least 512 + // elemnts individually, we get better performance. + const Index reductionPerThread = 2048; + // const Index num_work_group = + Index reductionGroup = dev.getPowerOfTwo( + (inputSize + (reductionPerThread * local_range - 1)) / (reductionPerThread * local_range), true); + const Index num_work_group = std::min(reductionGroup, local_range); + // 1 + // ? 
local_range + // : 1); + const Index global_range = num_work_group * local_range; + + auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range)); + typedef TensorSycl::internal::FullReductionKernelFunctor reduction_kernel_t; + if (num_work_group > 1) { + CoeffReturnType *temp_pointer = + static_cast(dev.allocate_temp(num_work_group * sizeof(CoeffReturnType))); + typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer); + dev.template unary_kernel_launcher(self, tmp_global_accessor, thread_range, + local_range, inputSize, reducer); + + typedef TensorSycl::internal::SecondStepFullReducer + GenericRKernel; + dev.template unary_kernel_launcher( + tmp_global_accessor, data, + cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)), num_work_group, + reducer); + + dev.deallocate_temp(temp_pointer); + } else { + dev.template unary_kernel_launcher(self, data, thread_range, local_range, inputSize, + reducer); + } + } +}; +// vectorizable inner_most most dim preserver +// col reduction +template +struct OuterReducer { + static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true; + + static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, + typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce, + typename Self::Index num_coeffs_to_preserve) { + return ::Eigen::TensorSycl::internal::PartialReducerLauncher< + Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::outer_most>::run(self, reducer, dev, output, + num_coeffs_to_reduce, + num_coeffs_to_preserve); + } +}; +// row reduction +template +struct InnerReducer { + static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true; + + static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, + typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce, + typename Self::Index num_coeffs_to_preserve) { + 
return ::Eigen::TensorSycl::internal::PartialReducerLauncher< + Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::inner_most>::run(self, reducer, dev, output, + num_coeffs_to_reduce, + num_coeffs_to_preserve); + } +}; + +// ArmgMax uses this kernel for partial reduction// +// TODO(@mehdi.goli) come up with a better kernel +// generic partial reduction +template +struct GenericReducer { + static EIGEN_CONSTEXPR bool HasOptimizedImplementation = false; + static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, + typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce, + typename Self::Index num_coeffs_to_preserve) { + typename Self::Index range, GRange, tileSize; + dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange); + + dev.template unary_kernel_launcher>( + self, output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1), + reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast(1)); + return false; + } +}; + +} // namespace internal +} // namespace Eigen + +#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorRef.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorRef.h new file mode 100644 index 0000000..a27d364 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorRef.h @@ -0,0 +1,454 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REF_H +#define EIGEN_CXX11_TENSOR_TENSOR_REF_H + +namespace Eigen { + +namespace internal { + +template +class TensorLazyBaseEvaluator { + public: + TensorLazyBaseEvaluator() : m_refcount(0) { } + virtual ~TensorLazyBaseEvaluator() { } + + EIGEN_DEVICE_FUNC virtual const Dimensions& dimensions() const = 0; + EIGEN_DEVICE_FUNC virtual const Scalar* data() const = 0; + + EIGEN_DEVICE_FUNC virtual const Scalar coeff(DenseIndex index) const = 0; + EIGEN_DEVICE_FUNC virtual Scalar& coeffRef(DenseIndex index) = 0; + + void incrRefCount() { ++m_refcount; } + void decrRefCount() { --m_refcount; } + int refCount() const { return m_refcount; } + + private: + // No copy, no assignment; + TensorLazyBaseEvaluator(const TensorLazyBaseEvaluator& other); + TensorLazyBaseEvaluator& operator = (const TensorLazyBaseEvaluator& other); + + int m_refcount; +}; + + +template +class TensorLazyEvaluatorReadOnly : public TensorLazyBaseEvaluator::Scalar> { + public: + // typedef typename TensorEvaluator::Dimensions Dimensions; + typedef typename TensorEvaluator::Scalar Scalar; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + typedef TensorEvaluator EvalType; + + TensorLazyEvaluatorReadOnly(const Expr& expr, const Device& device) : m_impl(expr, device), m_dummy(Scalar(0)) { + m_dims = m_impl.dimensions(); + m_impl.evalSubExprsIfNeeded(NULL); + } + virtual ~TensorLazyEvaluatorReadOnly() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC virtual const Dimensions& dimensions() const { + return m_dims; + } + EIGEN_DEVICE_FUNC virtual const Scalar* data() const { + return m_impl.data(); + } + + EIGEN_DEVICE_FUNC virtual const Scalar coeff(DenseIndex index) const { + return m_impl.coeff(index); + } + EIGEN_DEVICE_FUNC virtual Scalar& coeffRef(DenseIndex /*index*/) { + eigen_assert(false && "can't reference the coefficient of a rvalue"); + return m_dummy; + }; + + protected: + TensorEvaluator m_impl; + Dimensions m_dims; + 
Scalar m_dummy; +}; + +template +class TensorLazyEvaluatorWritable : public TensorLazyEvaluatorReadOnly { + public: + typedef TensorLazyEvaluatorReadOnly Base; + typedef typename Base::Scalar Scalar; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + TensorLazyEvaluatorWritable(const Expr& expr, const Device& device) : Base(expr, device) { + } + virtual ~TensorLazyEvaluatorWritable() { + } + + EIGEN_DEVICE_FUNC virtual Scalar& coeffRef(DenseIndex index) { + return this->m_impl.coeffRef(index); + } +}; + +template +class TensorLazyEvaluator : public internal::conditional::value), + TensorLazyEvaluatorWritable, + TensorLazyEvaluatorReadOnly >::type { + public: + typedef typename internal::conditional::value), + TensorLazyEvaluatorWritable, + TensorLazyEvaluatorReadOnly >::type Base; + typedef typename Base::Scalar Scalar; + + TensorLazyEvaluator(const Expr& expr, const Device& device) : Base(expr, device) { + } + virtual ~TensorLazyEvaluator() { + } +}; + +} // namespace internal + + +/** \class TensorRef + * \ingroup CXX11_Tensor_Module + * + * \brief A reference to a tensor expression + * The expression will be evaluated lazily (as much as possible). 
+ * + */ +template class TensorRef : public TensorBase > +{ + public: + typedef TensorRef Self; + typedef typename PlainObjectType::Base Base; + typedef typename Eigen::internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::traits::Index Index; + typedef typename internal::traits::Scalar Scalar; + typedef typename NumTraits::Real RealScalar; + typedef typename Base::CoeffReturnType CoeffReturnType; + typedef Scalar* PointerType; + typedef PointerType PointerArgType; + + static const Index NumIndices = PlainObjectType::NumIndices; + typedef typename PlainObjectType::Dimensions Dimensions; + + enum { + IsAligned = false, + PacketAccess = false, + BlockAccess = false, + PreferBlockAccess = false, + Layout = PlainObjectType::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -----------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorRef() : m_evaluator(NULL) { + } + + template + EIGEN_STRONG_INLINE TensorRef(const Expression& expr) : m_evaluator(new internal::TensorLazyEvaluator(expr, DefaultDevice())) { + m_evaluator->incrRefCount(); + } + + template + EIGEN_STRONG_INLINE TensorRef& operator = (const Expression& expr) { + unrefEvaluator(); + m_evaluator = new internal::TensorLazyEvaluator(expr, DefaultDevice()); + m_evaluator->incrRefCount(); + return *this; + } + + ~TensorRef() { + unrefEvaluator(); + } + + TensorRef(const TensorRef& other) : m_evaluator(other.m_evaluator) { + eigen_assert(m_evaluator->refCount() > 0); + m_evaluator->incrRefCount(); + } + + TensorRef& operator = (const TensorRef& other) { + if (this != &other) { + unrefEvaluator(); + m_evaluator = other.m_evaluator; + eigen_assert(m_evaluator->refCount() > 0); + m_evaluator->incrRefCount(); + } + return *this; + } + + 
EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index rank() const { return m_evaluator->dimensions().size(); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_evaluator->dimensions()[n]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_evaluator->dimensions(); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index size() const { return m_evaluator->dimensions().TotalSize(); } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar* data() const { return m_evaluator->data(); } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar operator()(Index index) const + { + return m_evaluator->coeff(index); + } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar operator()(Index firstIndex, IndexTypes... otherIndices) const + { + const std::size_t num_indices = (sizeof...(otherIndices) + 1); + const array indices{{firstIndex, otherIndices...}}; + return coeff(indices); + } + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... 
otherIndices) + { + const std::size_t num_indices = (sizeof...(otherIndices) + 1); + const array indices{{firstIndex, otherIndices...}}; + return coeffRef(indices); + } +#else + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1) const + { + array indices; + indices[0] = i0; + indices[1] = i1; + return coeff(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2) const + { + array indices; + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + return coeff(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2, Index i3) const + { + array indices; + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + indices[3] = i3; + return coeff(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const + { + array indices; + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + indices[3] = i3; + indices[4] = i4; + return coeff(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1) + { + array indices; + indices[0] = i0; + indices[1] = i1; + return coeffRef(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1, Index i2) + { + array indices; + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + return coeffRef(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3) + { + array indices; + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + indices[3] = i3; + return coeffRef(indices); + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1, Index i2, Index i3, Index i4) + { + array indices; + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + indices[3] = i3; + indices[4] = i4; + return coeffRef(indices); + } +#endif + + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const 
Scalar coeff(const array& indices) const + { + const Dimensions& dims = this->dimensions(); + Index index = 0; + if (PlainObjectType::Options & RowMajor) { + index += indices[0]; + for (size_t i = 1; i < NumIndices; ++i) { + index = index * dims[i] + indices[i]; + } + } else { + index += indices[NumIndices-1]; + for (int i = NumIndices-2; i >= 0; --i) { + index = index * dims[i] + indices[i]; + } + } + return m_evaluator->coeff(index); + } + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(const array& indices) + { + const Dimensions& dims = this->dimensions(); + Index index = 0; + if (PlainObjectType::Options & RowMajor) { + index += indices[0]; + for (size_t i = 1; i < NumIndices; ++i) { + index = index * dims[i] + indices[i]; + } + } else { + index += indices[NumIndices-1]; + for (int i = NumIndices-2; i >= 0; --i) { + index = index * dims[i] + indices[i]; + } + } + return m_evaluator->coeffRef(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const Scalar coeff(Index index) const + { + return m_evaluator->coeff(index); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) + { + return m_evaluator->coeffRef(index); + } + + private: + EIGEN_STRONG_INLINE void unrefEvaluator() { + if (m_evaluator) { + m_evaluator->decrRefCount(); + if (m_evaluator->refCount() == 0) { + delete m_evaluator; + } + } + } + + internal::TensorLazyBaseEvaluator* m_evaluator; +}; + + +// evaluator for rvalues +template +struct TensorEvaluator, Device> +{ + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename Derived::Dimensions Dimensions; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + enum { + IsAligned = false, + PacketAccess = false, + BlockAccess = false, + PreferBlockAccess = false, + Layout = TensorRef::Layout, + CoordAccess = false, // to 
be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef& m, const Device&) + : m_ref(m) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_ref.dimensions(); } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + return true; + } + + EIGEN_STRONG_INLINE void cleanup() { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { + return m_ref.coeff(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { + return m_ref.coeffRef(index); + } + + EIGEN_DEVICE_FUNC const Scalar* data() const { return m_ref.data(); } + + protected: + TensorRef m_ref; +}; + + +// evaluator for lvalues +template +struct TensorEvaluator, Device> : public TensorEvaluator, Device> +{ + typedef typename Derived::Index Index; + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Scalar CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef typename Derived::Dimensions Dimensions; + + typedef TensorEvaluator, Device> Base; + + enum { + IsAligned = false, + PacketAccess = false, + BlockAccess = false, + PreferBlockAccess = false, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(TensorRef& m, const Device& d) : Base(m, d) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { + return this->m_ref.coeffRef(index); + } +}; + + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_REF_H diff --git 
a/src/EigenUnsupported/CXX11/src/Tensor/TensorReverse.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorReverse.h new file mode 100644 index 0000000..586ce68 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorReverse.h @@ -0,0 +1,465 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Navdeep Jaitly +// Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H +#define EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H +namespace Eigen { + +/** \class TensorReverse + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reverse elements class. + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorReverseOp& type; +}; + +template +struct nested, 1, + typename eval >::type> +{ + typedef TensorReverseOp type; +}; + +} // end namespace internal + +template +class TensorReverseOp : public TensorBase, WriteAccessors> +{ + public: + typedef TensorBase, WriteAccessors>Base; + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind + StorageKind; + typedef typename 
Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReverseOp( + const XprType& expr, const ReverseDimensions& reverse_dims) + : m_xpr(expr), m_reverse_dims(reverse_dims) { } + + EIGEN_DEVICE_FUNC + const ReverseDimensions& reverse() const { return m_reverse_dims; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorReverseOp) + + + protected: + typename XprType::Nested m_xpr; + const ReverseDimensions m_reverse_dims; +}; + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorReverseOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = NumDims > 0, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + typedef internal::TensorIntDivisor IndexDivisor; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename TensorEvaluator::TensorBlock + ArgTensorBlock; + + typedef typename internal::TensorMaterializedBlock + TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), + m_reverse(op.reverse()), + m_device(device) + { + // Reversing 
a scalar isn't supported yet. It would be a no-op anyway. + EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + + // Compute strides + m_dimensions = m_impl.dimensions(); + if (static_cast(Layout) == static_cast(ColMajor)) { + m_strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_strides[i] = m_strides[i-1] * m_dimensions[i-1]; + if (m_strides[i] > 0) m_fastStrides[i] = IndexDivisor(m_strides[i]); + } + } else { + m_strides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_strides[i] = m_strides[i+1] * m_dimensions[i+1]; + if (m_strides[i] > 0) m_fastStrides[i] = IndexDivisor(m_strides[i]); + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index reverseIndex( + Index index) const { + eigen_assert(index < dimensions().TotalSize()); + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + Index idx = index / m_fastStrides[i]; + index -= idx * m_strides[i]; + if (m_reverse[i]) { + idx = m_dimensions[i] - idx - 1; + } + inputIndex += idx * m_strides[i] ; + } + if (m_reverse[0]) { + inputIndex += (m_dimensions[0] - index - 1); + } else { + inputIndex += index; + } + } else { + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + Index idx = index / m_fastStrides[i]; + index -= idx * m_strides[i]; + if (m_reverse[i]) { + idx = m_dimensions[i] - idx - 1; + } + inputIndex += idx * m_strides[i] ; 
+ } + if (m_reverse[NumDims-1]) { + inputIndex += (m_dimensions[NumDims-1] - index - 1); + } else { + inputIndex += index; + } + } + return inputIndex; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff( + Index index) const { + return m_impl.coeff(reverseIndex(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + PacketReturnType packet(Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + + // TODO(ndjaitly): write a better packing routine that uses + // local structure. + EIGEN_ALIGN_MAX typename internal::remove_const::type + values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + const size_t target_size = m_device.lastLevelCacheSize(); + // Block evaluation reads underlying memory in reverse order, and default + // cost model does not properly catch this in bytes stored/loaded. + return internal::TensorBlockResourceRequirements::skewed( + target_size) + .addCostPerCoeff({0, 0, 24}); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + // TODO(ezhulenev): If underlying tensor expression supports and prefers + // block evaluation we must use it. Currently we use coeff and packet + // access into the underlying tensor expression. + // static const bool useBlockAccessForArgType = + // TensorEvaluator::BlockAccess && + // TensorEvaluator::PreferBlockAccess; + + static const bool isColMajor = + static_cast(Layout) == static_cast(ColMajor); + + static const Index inner_dim_idx = isColMajor ? 
0 : NumDims - 1; + const bool inner_dim_reversed = m_reverse[inner_dim_idx]; + + // Offset in the output block. + Index block_offset = 0; + + // Offset in the input Tensor. + Index input_offset = reverseIndex(desc.offset()); + + // Initialize output block iterator state. Dimension in this array are + // always in inner_most -> outer_most order (col major layout). + array it; + for (int i = 0; i < NumDims; ++i) { + const int dim = isColMajor ? i : NumDims - 1 - i; + it[i].size = desc.dimension(dim); + it[i].count = 0; + it[i].reverse = m_reverse[dim]; + + it[i].block_stride = + i == 0 ? 1 : (it[i - 1].size * it[i - 1].block_stride); + it[i].block_span = it[i].block_stride * (it[i].size - 1); + + it[i].input_stride = m_strides[dim]; + it[i].input_span = it[i].input_stride * (it[i].size - 1); + + if (it[i].reverse) { + it[i].input_stride = -1 * it[i].input_stride; + it[i].input_span = -1 * it[i].input_span; + } + } + + // If multiple inner dimensions have the same reverse flag, check if we can + // merge them into a single virtual inner dimension. + int effective_inner_dim = 0; + for (int i = 1; i < NumDims; ++i) { + if (it[i].reverse != it[effective_inner_dim].reverse) break; + if (it[i].block_stride != it[effective_inner_dim].size) break; + if (it[i].block_stride != numext::abs(it[i].input_stride)) break; + + it[i].size = it[effective_inner_dim].size * it[i].size; + + it[i].block_stride = 1; + it[i].input_stride = (inner_dim_reversed ? -1 : 1); + + it[i].block_span = it[i].block_stride * (it[i].size - 1); + it[i].input_span = it[i].input_stride * (it[i].size - 1); + + effective_inner_dim = i; + } + + eigen_assert(it[effective_inner_dim].block_stride == 1); + eigen_assert(it[effective_inner_dim].input_stride == + (inner_dim_reversed ? -1 : 1)); + + const Index inner_dim_size = it[effective_inner_dim].size; + + // Prepare storage for the materialized reverse result. 
+ const typename TensorBlock::Storage block_storage = + TensorBlock::prepareStorage(desc, scratch); + CoeffReturnType* block_buffer = block_storage.data(); + + while (it[NumDims - 1].count < it[NumDims - 1].size) { + // Copy inner-most dimension data from reversed location in input. + Index dst = block_offset; + Index src = input_offset; + + // NOTE(ezhulenev): Adding vectorized path with internal::preverse showed + // worse results in benchmarks than a simple coefficient loop. + if (inner_dim_reversed) { + for (Index i = 0; i < inner_dim_size; ++i) { + block_buffer[dst] = m_impl.coeff(src); + ++dst; + --src; + } + } else { + for (Index i = 0; i < inner_dim_size; ++i) { + block_buffer[dst] = m_impl.coeff(src); + ++dst; + ++src; + } + } + + // For the 1d tensor we need to generate only one inner-most dimension. + if ((NumDims - effective_inner_dim) == 1) break; + + // Update offset. + for (Index i = effective_inner_dim + 1; i < NumDims; ++i) { + if (++it[i].count < it[i].size) { + block_offset += it[i].block_stride; + input_offset += it[i].input_stride; + break; + } + if (i != NumDims - 1) it[i].count = 0; + block_offset -= it[i].block_span; + input_offset -= it[i].input_span; + } + } + + return block_storage.AsTensorMaterializedBlock(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + double compute_cost = NumDims * (2 * TensorOpCost::AddCost() + + 2 * TensorOpCost::MulCost() + + TensorOpCost::DivCost()); + for (int i = 0; i < NumDims; ++i) { + if (m_reverse[i]) { + compute_cost += 2 * TensorOpCost::AddCost(); + } + } + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize); + } + + EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif + 
+ protected: + Dimensions m_dimensions; + array m_strides; + array m_fastStrides; + TensorEvaluator m_impl; + ReverseDimensions m_reverse; + const Device EIGEN_DEVICE_REF m_device; + + private: + struct BlockIteratorState { + BlockIteratorState() + : size(0), + count(0), + reverse(false), + block_stride(0), + block_span(0), + input_stride(0), + input_span(0) {} + + Index size; + Index count; + bool reverse; + Index block_stride; + Index block_span; + Index input_stride; + Index input_span; + }; +}; + +// Eval as lvalue + +template +struct TensorEvaluator, Device> + : public TensorEvaluator, + Device> { + typedef TensorEvaluator, + Device> Base; + typedef TensorReverseOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::value; + typedef DSizes Dimensions; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) {} + + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = PacketType::size; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Dimensions& dimensions() const { return this->m_dimensions; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { + return this->m_impl.coeffRef(this->reverseIndex(index)); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) { + EIGEN_STATIC_ASSERT((PacketSize > 1), 
YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + + // This code is pilfered from TensorMorphing.h + EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize]; + internal::pstore(values, x); + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + this->coeffRef(index+i) = values[i]; + } + } +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorScan.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorScan.h new file mode 100644 index 0000000..beae854 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorScan.h @@ -0,0 +1,528 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Igor Babuschkin +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_SCAN_H +#define EIGEN_CXX11_TENSOR_TENSOR_SCAN_H + +namespace Eigen { + +namespace internal { + +template +struct traits > + : public traits { + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorScanOp& type; +}; + +template +struct nested, 1, + typename eval >::type> +{ + typedef TensorScanOp type; +}; +} // end namespace internal + +/** \class TensorScan + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor scan class. 
+ */ +template +class TensorScanOp + : public TensorBase, ReadOnlyAccessors> { +public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorScanOp( + const XprType& expr, const Index& axis, bool exclusive = false, const Op& op = Op()) + : m_expr(expr), m_axis(axis), m_accumulator(op), m_exclusive(exclusive) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Index axis() const { return m_axis; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const XprType& expression() const { return m_expr; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Op accumulator() const { return m_accumulator; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + bool exclusive() const { return m_exclusive; } + +protected: + typename XprType::Nested m_expr; + const Index m_axis; + const Op m_accumulator; + const bool m_exclusive; +}; + + +namespace internal { + +template +EIGEN_STRONG_INLINE void ReduceScalar(Self& self, Index offset, + typename Self::CoeffReturnType* data) { + // Compute the scan along the axis, starting at the given offset + typename Self::CoeffReturnType accum = self.accumulator().initialize(); + if (self.stride() == 1) { + if (self.exclusive()) { + for (Index curr = offset; curr < offset + self.size(); ++curr) { + data[curr] = self.accumulator().finalize(accum); + self.accumulator().reduce(self.inner().coeff(curr), &accum); + } + } else { + for (Index curr = offset; curr < offset + self.size(); ++curr) { + self.accumulator().reduce(self.inner().coeff(curr), &accum); + data[curr] = self.accumulator().finalize(accum); + } + } + } else { + if (self.exclusive()) { + for (Index idx3 = 0; idx3 < self.size(); idx3++) { + Index curr = offset + 
idx3 * self.stride(); + data[curr] = self.accumulator().finalize(accum); + self.accumulator().reduce(self.inner().coeff(curr), &accum); + } + } else { + for (Index idx3 = 0; idx3 < self.size(); idx3++) { + Index curr = offset + idx3 * self.stride(); + self.accumulator().reduce(self.inner().coeff(curr), &accum); + data[curr] = self.accumulator().finalize(accum); + } + } + } +} + +template +EIGEN_STRONG_INLINE void ReducePacket(Self& self, Index offset, + typename Self::CoeffReturnType* data) { + using Scalar = typename Self::CoeffReturnType; + using Packet = typename Self::PacketReturnType; + // Compute the scan along the axis, starting at the calculated offset + Packet accum = self.accumulator().template initializePacket(); + if (self.stride() == 1) { + if (self.exclusive()) { + for (Index curr = offset; curr < offset + self.size(); ++curr) { + internal::pstoreu(data + curr, self.accumulator().finalizePacket(accum)); + self.accumulator().reducePacket(self.inner().template packet(curr), &accum); + } + } else { + for (Index curr = offset; curr < offset + self.size(); ++curr) { + self.accumulator().reducePacket(self.inner().template packet(curr), &accum); + internal::pstoreu(data + curr, self.accumulator().finalizePacket(accum)); + } + } + } else { + if (self.exclusive()) { + for (Index idx3 = 0; idx3 < self.size(); idx3++) { + const Index curr = offset + idx3 * self.stride(); + internal::pstoreu(data + curr, self.accumulator().finalizePacket(accum)); + self.accumulator().reducePacket(self.inner().template packet(curr), &accum); + } + } else { + for (Index idx3 = 0; idx3 < self.size(); idx3++) { + const Index curr = offset + idx3 * self.stride(); + self.accumulator().reducePacket(self.inner().template packet(curr), &accum); + internal::pstoreu(data + curr, self.accumulator().finalizePacket(accum)); + } + } + } +} + +template +struct ReduceBlock { + EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1, + typename Self::CoeffReturnType* data) { + for (Index idx2 = 
0; idx2 < self.stride(); idx2++) { + // Calculate the starting offset for the scan + Index offset = idx1 + idx2; + ReduceScalar(self, offset, data); + } + } +}; + +// Specialization for vectorized reduction. +template +struct ReduceBlock { + EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1, + typename Self::CoeffReturnType* data) { + using Packet = typename Self::PacketReturnType; + const int PacketSize = internal::unpacket_traits::size; + Index idx2 = 0; + for (; idx2 + PacketSize <= self.stride(); idx2 += PacketSize) { + // Calculate the starting offset for the packet scan + Index offset = idx1 + idx2; + ReducePacket(self, offset, data); + } + for (; idx2 < self.stride(); idx2++) { + // Calculate the starting offset for the scan + Index offset = idx1 + idx2; + ReduceScalar(self, offset, data); + } + } +}; + +// Single-threaded CPU implementation of scan +template ::PacketAccess && + internal::reducer_traits::PacketAccess)> +struct ScanLauncher { + void operator()(Self& self, typename Self::CoeffReturnType* data) { + Index total_size = internal::array_prod(self.dimensions()); + + // We fix the index along the scan axis to 0 and perform a + // scan per remaining entry. The iteration is split into two nested + // loops to avoid an integer division by keeping track of each idx1 and + // idx2. + for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) { + ReduceBlock block_reducer; + block_reducer(self, idx1, data); + } + } +}; + +#ifdef EIGEN_USE_THREADS + +// Adjust block_size to avoid false sharing of cachelines among +// threads. Currently set to twice the cache line size on Intel and ARM +// processors. 
+EIGEN_STRONG_INLINE Index AdjustBlockSize(Index item_size, Index block_size) { + EIGEN_CONSTEXPR Index kBlockAlignment = 128; + const Index items_per_cacheline = + numext::maxi(1, kBlockAlignment / item_size); + return items_per_cacheline * divup(block_size, items_per_cacheline); +} + +template +struct ReduceBlock { + EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1, + typename Self::CoeffReturnType* data) { + using Scalar = typename Self::CoeffReturnType; + using Packet = typename Self::PacketReturnType; + const int PacketSize = internal::unpacket_traits::size; + Index num_scalars = self.stride(); + Index num_packets = 0; + if (self.stride() >= PacketSize) { + num_packets = self.stride() / PacketSize; + self.device().parallelFor( + num_packets, + TensorOpCost(PacketSize * self.size(), PacketSize * self.size(), + 16 * PacketSize * self.size(), true, PacketSize), + // Make the shard size large enough that two neighboring threads + // won't write to the same cacheline of `data`. + [=](Index blk_size) { + return AdjustBlockSize(PacketSize * sizeof(Scalar), blk_size); + }, + [&](Index first, Index last) { + for (Index packet = first; packet < last; ++packet) { + const Index idx2 = packet * PacketSize; + ReducePacket(self, idx1 + idx2, data); + } + }); + num_scalars -= num_packets * PacketSize; + } + self.device().parallelFor( + num_scalars, TensorOpCost(self.size(), self.size(), 16 * self.size()), + // Make the shard size large enough that two neighboring threads + // won't write to the same cacheline of `data`. 
+ [=](Index blk_size) { + return AdjustBlockSize(sizeof(Scalar), blk_size); + }, + [&](Index first, Index last) { + for (Index scalar = first; scalar < last; ++scalar) { + const Index idx2 = num_packets * PacketSize + scalar; + ReduceScalar(self, idx1 + idx2, data); + } + }); + } +}; + +template +struct ReduceBlock { + EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1, + typename Self::CoeffReturnType* data) { + using Scalar = typename Self::CoeffReturnType; + self.device().parallelFor( + self.stride(), TensorOpCost(self.size(), self.size(), 16 * self.size()), + // Make the shard size large enough that two neighboring threads + // won't write to the same cacheline of `data`. + [=](Index blk_size) { + return AdjustBlockSize(sizeof(Scalar), blk_size); + }, + [&](Index first, Index last) { + for (Index idx2 = first; idx2 < last; ++idx2) { + ReduceScalar(self, idx1 + idx2, data); + } + }); + } +}; + +// Specialization for multi-threaded execution. +template +struct ScanLauncher { + void operator()(Self& self, typename Self::CoeffReturnType* data) { + using Scalar = typename Self::CoeffReturnType; + using Packet = typename Self::PacketReturnType; + const int PacketSize = internal::unpacket_traits::size; + const Index total_size = internal::array_prod(self.dimensions()); + const Index inner_block_size = self.stride() * self.size(); + bool parallelize_by_outer_blocks = (total_size >= (self.stride() * inner_block_size)); + + if ((parallelize_by_outer_blocks && total_size <= 4096) || + (!parallelize_by_outer_blocks && self.stride() < PacketSize)) { + ScanLauncher launcher; + launcher(self, data); + return; + } + + if (parallelize_by_outer_blocks) { + // Parallelize over outer blocks. 
+ const Index num_outer_blocks = total_size / inner_block_size; + self.device().parallelFor( + num_outer_blocks, + TensorOpCost(inner_block_size, inner_block_size, + 16 * PacketSize * inner_block_size, Vectorize, + PacketSize), + [=](Index blk_size) { + return AdjustBlockSize(inner_block_size * sizeof(Scalar), blk_size); + }, + [&](Index first, Index last) { + for (Index idx1 = first; idx1 < last; ++idx1) { + ReduceBlock block_reducer; + block_reducer(self, idx1 * inner_block_size, data); + } + }); + } else { + // Parallelize over inner packets/scalars dimensions when the reduction + // axis is not an inner dimension. + ReduceBlock block_reducer; + for (Index idx1 = 0; idx1 < total_size; + idx1 += self.stride() * self.size()) { + block_reducer(self, idx1, data); + } + } + } +}; +#endif // EIGEN_USE_THREADS + +#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) + +// GPU implementation of scan +// TODO(ibab) This placeholder implementation performs multiple scans in +// parallel, but it would be better to use a parallel scan algorithm and +// optimize memory access. 
+template +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) { + // Compute offset as in the CPU version + Index val = threadIdx.x + blockIdx.x * blockDim.x; + Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride(); + + if (offset + (self.size() - 1) * self.stride() < total_size) { + // Compute the scan along the axis, starting at the calculated offset + typename Self::CoeffReturnType accum = self.accumulator().initialize(); + for (Index idx = 0; idx < self.size(); idx++) { + Index curr = offset + idx * self.stride(); + if (self.exclusive()) { + data[curr] = self.accumulator().finalize(accum); + self.accumulator().reduce(self.inner().coeff(curr), &accum); + } else { + self.accumulator().reduce(self.inner().coeff(curr), &accum); + data[curr] = self.accumulator().finalize(accum); + } + } + } + __syncthreads(); + +} + +template +struct ScanLauncher { + void operator()(const Self& self, typename Self::CoeffReturnType* data) { + Index total_size = internal::array_prod(self.dimensions()); + Index num_blocks = (total_size / self.size() + 63) / 64; + Index block_size = 64; + + LAUNCH_GPU_KERNEL((ScanKernel), num_blocks, block_size, 0, self.device(), self, total_size, data); + } +}; +#endif // EIGEN_USE_GPU && (EIGEN_GPUCC) + +} // namespace internal + +// Eval as rvalue +template +struct TensorEvaluator, Device> { + + typedef TensorScanOp XprType; + typedef typename XprType::Index Index; + typedef const ArgType ChildTypeNoConst; + typedef const ArgType ChildType; + static const int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename internal::remove_const::type Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef TensorEvaluator, Device> Self; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + enum { + 
IsAligned = false, + PacketAccess = (PacketType::size > 1), + BlockAccess = false, + PreferBlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = true + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), + m_device(device), + m_exclusive(op.exclusive()), + m_accumulator(op.accumulator()), + m_size(m_impl.dimensions()[op.axis()]), + m_stride(1), m_consume_dim(op.axis()), + m_output(NULL) { + + // Accumulating a scalar isn't supported. + EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(op.axis() >= 0 && op.axis() < NumDims); + + // Compute stride of scan axis + const Dimensions& dims = m_impl.dimensions(); + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = 0; i < op.axis(); ++i) { + m_stride = m_stride * dims[i]; + } + } else { + // dims can only be indexed through unsigned integers, + // so let's use an unsigned type to let the compiler knows. 
+ // This prevents stupid warnings: ""'*((void*)(& evaluator)+64)[18446744073709551615]' may be used uninitialized in this function" + unsigned int axis = internal::convert_index(op.axis()); + for (unsigned int i = NumDims - 1; i > axis; --i) { + m_stride = m_stride * dims[i]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { + return m_impl.dimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride() const { + return m_stride; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& consume_dim() const { + return m_consume_dim; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const { + return m_size; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& accumulator() const { + return m_accumulator; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const { + return m_exclusive; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& inner() const { + return m_impl; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const { + return m_device; + } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + m_impl.evalSubExprsIfNeeded(NULL); + internal::ScanLauncher launcher; + if (data) { + launcher(*this, data); + return false; + } + + const Index total_size = internal::array_prod(dimensions()); + m_output = static_cast(m_device.get((Scalar*) m_device.allocate_temp(total_size * sizeof(Scalar)))); + launcher(*this, m_output); + return true; + } + + template + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + return internal::ploadt(m_output + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const + { + return m_output; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_output[index]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0); + 
} + + EIGEN_STRONG_INLINE void cleanup() { + if (m_output) { + m_device.deallocate_temp(m_output); + m_output = NULL; + } + m_impl.cleanup(); + } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + m_output.bind(cgh); + } +#endif +protected: + TensorEvaluator m_impl; + const Device EIGEN_DEVICE_REF m_device; + const bool m_exclusive; + Op m_accumulator; + const Index m_size; + Index m_stride; + Index m_consume_dim; + EvaluatorPointerType m_output; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_SCAN_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorScanSycl.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorScanSycl.h new file mode 100644 index 0000000..7f68ecb --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorScanSycl.h @@ -0,0 +1,513 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +/***************************************************************** + * TensorScanSycl.h + * + * \brief: + * Tensor Scan Sycl implement the extend version of + * "Efficient parallel scan algorithms for GPUs." .for Tensor operations. + * The algorithm requires up to 3 stage (consequently 3 kernels) depending on + * the size of the tensor. In the first kernel (ScanKernelFunctor), each + * threads within the work-group individually reduces the allocated elements per + * thread in order to reduces the total number of blocks. 
In the next step all + * thread within the work-group will reduce the associated blocks into the + * temporary buffers. In the next kernel(ScanBlockKernelFunctor), the temporary + * buffer is given as an input and all the threads within a work-group scan and + * reduces the boundaries between the blocks (generated from the previous + * kernel). and write the data on the temporary buffer. If the second kernel is + * required, the third and final kerenl (ScanAdjustmentKernelFunctor) will + * adjust the final result into the output buffer. + * The original algorithm for the parallel prefix sum can be found here: + * + * Sengupta, Shubhabrata, Mark Harris, and Michael Garland. "Efficient parallel + * scan algorithms for GPUs." NVIDIA, Santa Clara, CA, Tech. Rep. NVR-2008-003 + *1, no. 1 (2008): 1-17. + *****************************************************************/ + +#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP +#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP + +namespace Eigen { +namespace TensorSycl { +namespace internal { + +#ifndef EIGEN_SYCL_MAX_GLOBAL_RANGE +#define EIGEN_SYCL_MAX_GLOBAL_RANGE (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 * 4) +#endif + +template +struct ScanParameters { + // must be power of 2 + static EIGEN_CONSTEXPR index_t ScanPerThread = 8; + const index_t total_size; + const index_t non_scan_size; + const index_t scan_size; + const index_t non_scan_stride; + const index_t scan_stride; + const index_t panel_threads; + const index_t group_threads; + const index_t block_threads; + const index_t elements_per_group; + const index_t elements_per_block; + const index_t loop_range; + + ScanParameters(index_t total_size_, index_t non_scan_size_, index_t scan_size_, index_t non_scan_stride_, + index_t scan_stride_, index_t panel_threads_, index_t group_threads_, index_t block_threads_, + index_t elements_per_group_, index_t elements_per_block_, index_t loop_range_) + : total_size(total_size_), + 
non_scan_size(non_scan_size_), + scan_size(scan_size_), + non_scan_stride(non_scan_stride_), + scan_stride(scan_stride_), + panel_threads(panel_threads_), + group_threads(group_threads_), + block_threads(block_threads_), + elements_per_group(elements_per_group_), + elements_per_block(elements_per_block_), + loop_range(loop_range_) {} +}; + +enum class scan_step { first, second }; +template +struct ScanKernelFunctor { + typedef cl::sycl::accessor + LocalAccessor; + static EIGEN_CONSTEXPR int PacketSize = ScanParameters::ScanPerThread / 2; + + LocalAccessor scratch; + Evaluator dev_eval; + OutAccessor out_accessor; + OutAccessor temp_accessor; + const ScanParameters scanParameters; + Op accumulator; + const bool inclusive; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanKernelFunctor(LocalAccessor scratch_, const Evaluator dev_eval_, + OutAccessor out_accessor_, OutAccessor temp_accessor_, + const ScanParameters scanParameters_, Op accumulator_, + const bool inclusive_) + : scratch(scratch_), + dev_eval(dev_eval_), + out_accessor(out_accessor_), + temp_accessor(temp_accessor_), + scanParameters(scanParameters_), + accumulator(accumulator_), + inclusive(inclusive_) {} + + template + typename ::Eigen::internal::enable_if::type EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE + read(const Input &inpt, Index global_id) { + return inpt.coeff(global_id); + } + + template + typename ::Eigen::internal::enable_if::type EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE + read(const Input &inpt, Index global_id) { + return inpt[global_id]; + } + + template + typename ::Eigen::internal::enable_if::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + first_step_inclusive_Operation(InclusiveOp inclusive_op) { + inclusive_op(); + } + + template + typename ::Eigen::internal::enable_if::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + first_step_inclusive_Operation(InclusiveOp) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { + auto out_ptr = out_accessor.get_pointer(); + 
auto tmp_ptr = temp_accessor.get_pointer(); + auto scratch_ptr = scratch.get_pointer().get(); + + for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) { + Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset)); + Index tmp = data_offset % scanParameters.panel_threads; + const Index panel_id = data_offset / scanParameters.panel_threads; + const Index group_id = tmp / scanParameters.group_threads; + tmp = tmp % scanParameters.group_threads; + const Index block_id = tmp / scanParameters.block_threads; + const Index local_id = tmp % scanParameters.block_threads; + // we put one element per packet in scratch_mem + const Index scratch_stride = scanParameters.elements_per_block / PacketSize; + const Index scratch_offset = (itemID.get_local_id(0) / scanParameters.block_threads) * scratch_stride; + CoeffReturnType private_scan[ScanParameters::ScanPerThread]; + CoeffReturnType inclusive_scan; + // the actual panel size is scan_size * non_scan_size. + // elements_per_panel is roundup to power of 2 for binary tree + const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size; + const Index group_offset = group_id * scanParameters.non_scan_stride; + // This will be effective when the size is bigger than elements_per_block + const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride; + const Index thread_offset = (ScanParameters::ScanPerThread * local_id * scanParameters.scan_stride); + const Index global_offset = panel_offset + group_offset + block_offset + thread_offset; + Index next_elements = 0; + EIGEN_UNROLL_LOOP + for (int i = 0; i < ScanParameters::ScanPerThread; i++) { + Index global_id = global_offset + next_elements; + private_scan[i] = ((((block_id * scanParameters.elements_per_block) + + (ScanParameters::ScanPerThread * local_id) + i) < scanParameters.scan_size) && + (global_id < scanParameters.total_size)) + ? 
read(dev_eval, global_id) + : accumulator.initialize(); + next_elements += scanParameters.scan_stride; + } + first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC { + if (inclusive) { + inclusive_scan = private_scan[ScanParameters::ScanPerThread - 1]; + } + }); + // This for loop must be 2 + EIGEN_UNROLL_LOOP + for (int packetIndex = 0; packetIndex < ScanParameters::ScanPerThread; packetIndex += PacketSize) { + Index private_offset = 1; + // build sum in place up the tree + EIGEN_UNROLL_LOOP + for (Index d = PacketSize >> 1; d > 0; d >>= 1) { + EIGEN_UNROLL_LOOP + for (Index l = 0; l < d; l++) { + Index ai = private_offset * (2 * l + 1) - 1 + packetIndex; + Index bi = private_offset * (2 * l + 2) - 1 + packetIndex; + CoeffReturnType accum = accumulator.initialize(); + accumulator.reduce(private_scan[ai], &accum); + accumulator.reduce(private_scan[bi], &accum); + private_scan[bi] = accumulator.finalize(accum); + } + private_offset *= 2; + } + scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset] = + private_scan[PacketSize - 1 + packetIndex]; + private_scan[PacketSize - 1 + packetIndex] = accumulator.initialize(); + // traverse down tree & build scan + EIGEN_UNROLL_LOOP + for (Index d = 1; d < PacketSize; d *= 2) { + private_offset >>= 1; + EIGEN_UNROLL_LOOP + for (Index l = 0; l < d; l++) { + Index ai = private_offset * (2 * l + 1) - 1 + packetIndex; + Index bi = private_offset * (2 * l + 2) - 1 + packetIndex; + CoeffReturnType accum = accumulator.initialize(); + accumulator.reduce(private_scan[ai], &accum); + accumulator.reduce(private_scan[bi], &accum); + private_scan[ai] = private_scan[bi]; + private_scan[bi] = accumulator.finalize(accum); + } + } + } + + Index offset = 1; + // build sum in place up the tree + for (Index d = scratch_stride >> 1; d > 0; d >>= 1) { + // Synchronise + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (local_id < d) { + Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset; + Index bi = offset 
* (2 * local_id + 2) - 1 + scratch_offset; + CoeffReturnType accum = accumulator.initialize(); + accumulator.reduce(scratch_ptr[ai], &accum); + accumulator.reduce(scratch_ptr[bi], &accum); + scratch_ptr[bi] = accumulator.finalize(accum); + } + offset *= 2; + } + // Synchronise + itemID.barrier(cl::sycl::access::fence_space::local_space); + // next step optimisation + if (local_id == 0) { + if (((scanParameters.elements_per_group / scanParameters.elements_per_block) > 1)) { + const Index temp_id = panel_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) * + scanParameters.non_scan_size + + group_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) + + block_id; + tmp_ptr[temp_id] = scratch_ptr[scratch_stride - 1 + scratch_offset]; + } + // clear the last element + scratch_ptr[scratch_stride - 1 + scratch_offset] = accumulator.initialize(); + } + // traverse down tree & build scan + for (Index d = 1; d < scratch_stride; d *= 2) { + offset >>= 1; + // Synchronise + itemID.barrier(cl::sycl::access::fence_space::local_space); + if (local_id < d) { + Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset; + Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset; + CoeffReturnType accum = accumulator.initialize(); + accumulator.reduce(scratch_ptr[ai], &accum); + accumulator.reduce(scratch_ptr[bi], &accum); + scratch_ptr[ai] = scratch_ptr[bi]; + scratch_ptr[bi] = accumulator.finalize(accum); + } + } + // Synchronise + itemID.barrier(cl::sycl::access::fence_space::local_space); + // This for loop must be 2 + EIGEN_UNROLL_LOOP + for (int packetIndex = 0; packetIndex < ScanParameters::ScanPerThread; packetIndex += PacketSize) { + EIGEN_UNROLL_LOOP + for (Index i = 0; i < PacketSize; i++) { + CoeffReturnType accum = private_scan[packetIndex + i]; + accumulator.reduce(scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset], &accum); + private_scan[packetIndex + i] = accumulator.finalize(accum); + } + } 
+ first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC { + if (inclusive) { + accumulator.reduce(private_scan[ScanParameters::ScanPerThread - 1], &inclusive_scan); + private_scan[0] = accumulator.finalize(inclusive_scan); + } + }); + next_elements = 0; + // right the first set of private param + EIGEN_UNROLL_LOOP + for (Index i = 0; i < ScanParameters::ScanPerThread; i++) { + Index global_id = global_offset + next_elements; + if ((((block_id * scanParameters.elements_per_block) + (ScanParameters::ScanPerThread * local_id) + i) < + scanParameters.scan_size) && + (global_id < scanParameters.total_size)) { + Index private_id = (i * !inclusive) + (((i + 1) % ScanParameters::ScanPerThread) * (inclusive)); + out_ptr[global_id] = private_scan[private_id]; + } + next_elements += scanParameters.scan_stride; + } + } // end for loop + } +}; + +template +struct ScanAdjustmentKernelFunctor { + typedef cl::sycl::accessor + LocalAccessor; + static EIGEN_CONSTEXPR int PacketSize = ScanParameters::ScanPerThread / 2; + InAccessor in_accessor; + OutAccessor out_accessor; + const ScanParameters scanParameters; + Op accumulator; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanAdjustmentKernelFunctor(LocalAccessor, InAccessor in_accessor_, + OutAccessor out_accessor_, + const ScanParameters scanParameters_, + Op accumulator_) + : in_accessor(in_accessor_), + out_accessor(out_accessor_), + scanParameters(scanParameters_), + accumulator(accumulator_) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { + auto in_ptr = in_accessor.get_pointer(); + auto out_ptr = out_accessor.get_pointer(); + + for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) { + Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset)); + Index tmp = data_offset % scanParameters.panel_threads; + const Index panel_id = data_offset / scanParameters.panel_threads; + const Index group_id = tmp / 
scanParameters.group_threads; + tmp = tmp % scanParameters.group_threads; + const Index block_id = tmp / scanParameters.block_threads; + const Index local_id = tmp % scanParameters.block_threads; + + // the actual panel size is scan_size * non_scan_size. + // elements_per_panel is roundup to power of 2 for binary tree + const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size; + const Index group_offset = group_id * scanParameters.non_scan_stride; + // This will be effective when the size is bigger than elements_per_block + const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride; + const Index thread_offset = ScanParameters::ScanPerThread * local_id * scanParameters.scan_stride; + + const Index global_offset = panel_offset + group_offset + block_offset + thread_offset; + const Index block_size = scanParameters.elements_per_group / scanParameters.elements_per_block; + const Index in_id = (panel_id * block_size * scanParameters.non_scan_size) + (group_id * block_size) + block_id; + CoeffReturnType adjust_val = in_ptr[in_id]; + + Index next_elements = 0; + EIGEN_UNROLL_LOOP + for (Index i = 0; i < ScanParameters::ScanPerThread; i++) { + Index global_id = global_offset + next_elements; + if ((((block_id * scanParameters.elements_per_block) + (ScanParameters::ScanPerThread * local_id) + i) < + scanParameters.scan_size) && + (global_id < scanParameters.total_size)) { + CoeffReturnType accum = adjust_val; + accumulator.reduce(out_ptr[global_id], &accum); + out_ptr[global_id] = accumulator.finalize(accum); + } + next_elements += scanParameters.scan_stride; + } + } + } +}; + +template +struct ScanInfo { + const Index &total_size; + const Index &scan_size; + const Index &panel_size; + const Index &non_scan_size; + const Index &scan_stride; + const Index &non_scan_stride; + + Index max_elements_per_block; + Index block_size; + Index panel_threads; + Index group_threads; + Index block_threads; + 
Index elements_per_group; + Index elements_per_block; + Index loop_range; + Index global_range; + Index local_range; + const Eigen::SyclDevice &dev; + EIGEN_STRONG_INLINE ScanInfo(const Index &total_size_, const Index &scan_size_, const Index &panel_size_, + const Index &non_scan_size_, const Index &scan_stride_, const Index &non_scan_stride_, + const Eigen::SyclDevice &dev_) + : total_size(total_size_), + scan_size(scan_size_), + panel_size(panel_size_), + non_scan_size(non_scan_size_), + scan_stride(scan_stride_), + non_scan_stride(non_scan_stride_), + dev(dev_) { + // must be power of 2 + local_range = std::min(Index(dev.getNearestPowerOfTwoWorkGroupSize()), + Index(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1)); + + max_elements_per_block = local_range * ScanParameters::ScanPerThread; + + elements_per_group = + dev.getPowerOfTwo(Index(roundUp(Index(scan_size), ScanParameters::ScanPerThread)), true); + const Index elements_per_panel = elements_per_group * non_scan_size; + elements_per_block = std::min(Index(elements_per_group), Index(max_elements_per_block)); + panel_threads = elements_per_panel / ScanParameters::ScanPerThread; + group_threads = elements_per_group / ScanParameters::ScanPerThread; + block_threads = elements_per_block / ScanParameters::ScanPerThread; + block_size = elements_per_group / elements_per_block; +#ifdef EIGEN_SYCL_MAX_GLOBAL_RANGE + const Index max_threads = std::min(Index(panel_threads * panel_size), Index(EIGEN_SYCL_MAX_GLOBAL_RANGE)); +#else + const Index max_threads = panel_threads * panel_size; +#endif + global_range = roundUp(max_threads, local_range); + loop_range = Index( + std::ceil(double(elements_per_panel * panel_size) / (global_range * ScanParameters::ScanPerThread))); + } + inline ScanParameters get_scan_parameter() { + return ScanParameters(total_size, non_scan_size, scan_size, non_scan_stride, scan_stride, panel_threads, + group_threads, block_threads, elements_per_group, elements_per_block, loop_range); + 
} + inline cl::sycl::nd_range<1> get_thread_range() { + return cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range)); + } +}; + +template +struct SYCLAdjustBlockOffset { + EIGEN_STRONG_INLINE static void adjust_scan_block_offset(EvaluatorPointerType in_ptr, EvaluatorPointerType out_ptr, + Reducer &accumulator, const Index total_size, + const Index scan_size, const Index panel_size, + const Index non_scan_size, const Index scan_stride, + const Index non_scan_stride, const Eigen::SyclDevice &dev) { + auto scan_info = + ScanInfo(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev); + + typedef ScanAdjustmentKernelFunctor + AdjustFuctor; + dev.template unary_kernel_launcher(in_ptr, out_ptr, scan_info.get_thread_range(), + scan_info.max_elements_per_block, + scan_info.get_scan_parameter(), accumulator); + } +}; + +template +struct ScanLauncher_impl { + template + EIGEN_STRONG_INLINE static void scan_block(Input in_ptr, EvaluatorPointerType out_ptr, Reducer &accumulator, + const Index total_size, const Index scan_size, const Index panel_size, + const Index non_scan_size, const Index scan_stride, + const Index non_scan_stride, const bool inclusive, + const Eigen::SyclDevice &dev) { + auto scan_info = + ScanInfo(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev); + const Index temp_pointer_size = scan_info.block_size * non_scan_size * panel_size; + const Index scratch_size = scan_info.max_elements_per_block / (ScanParameters::ScanPerThread / 2); + CoeffReturnType *temp_pointer = + static_cast(dev.allocate_temp(temp_pointer_size * sizeof(CoeffReturnType))); + EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer); + + typedef ScanKernelFunctor ScanFunctor; + dev.template binary_kernel_launcher( + in_ptr, out_ptr, tmp_global_accessor, scan_info.get_thread_range(), scratch_size, + scan_info.get_scan_parameter(), accumulator, inclusive); + + if (scan_info.block_size > 
1) { + ScanLauncher_impl::scan_block( + tmp_global_accessor, tmp_global_accessor, accumulator, temp_pointer_size, scan_info.block_size, panel_size, + non_scan_size, Index(1), scan_info.block_size, false, dev); + + SYCLAdjustBlockOffset::adjust_scan_block_offset( + tmp_global_accessor, out_ptr, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride, + non_scan_stride, dev); + } + dev.deallocate_temp(temp_pointer); + } +}; + +} // namespace internal +} // namespace TensorSycl +namespace internal { +template +struct ScanLauncher { + typedef typename Self::Index Index; + typedef typename Self::CoeffReturnType CoeffReturnType; + typedef typename Self::Storage Storage; + typedef typename Self::EvaluatorPointerType EvaluatorPointerType; + void operator()(Self &self, EvaluatorPointerType data) { + const Index total_size = internal::array_prod(self.dimensions()); + const Index scan_size = self.size(); + const Index scan_stride = self.stride(); + // this is the scan op (can be sum or ...) + auto accumulator = self.accumulator(); + auto inclusive = !self.exclusive(); + auto consume_dim = self.consume_dim(); + auto dev = self.device(); + + auto dims = self.inner().dimensions(); + + Index non_scan_size = 1; + Index panel_size = 1; + if (static_cast(Self::Layout) == static_cast(ColMajor)) { + for (int i = 0; i < consume_dim; i++) { + non_scan_size *= dims[i]; + } + for (int i = consume_dim + 1; i < Self::NumDims; i++) { + panel_size *= dims[i]; + } + } else { + for (int i = Self::NumDims - 1; i > consume_dim; i--) { + non_scan_size *= dims[i]; + } + for (int i = consume_dim - 1; i >= 0; i--) { + panel_size *= dims[i]; + } + } + const Index non_scan_stride = (scan_stride > 1) ? 
1 : scan_size; + auto eval_impl = self.inner(); + TensorSycl::internal::ScanLauncher_impl::scan_block( + eval_impl, data, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, + inclusive, dev); + } +}; +} // namespace internal +} // namespace Eigen + +#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorShuffling.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorShuffling.h new file mode 100644 index 0000000..e5e5efd --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorShuffling.h @@ -0,0 +1,471 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H +#define EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H + +namespace Eigen { + +/** \class TensorShuffling + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor shuffling class. 
+ * + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorShufflingOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorShufflingOp type; +}; + +} // end namespace internal + + + +template +class TensorShufflingOp : public TensorBase > +{ + public: + typedef TensorBase > Base; + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp(const XprType& expr, const Shuffle& shfl) + : m_xpr(expr), m_shuffle(shfl) {} + + EIGEN_DEVICE_FUNC + const Shuffle& shufflePermutation() const { return m_shuffle; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorShufflingOp) + + + protected: + typename XprType::Nested m_xpr; + const Shuffle m_shuffle; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorEvaluator, Device> Self; + typedef TensorShufflingOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + 
typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + enum { + IsAligned = false, + PacketAccess = (PacketType::size > 1), + BlockAccess = TensorEvaluator::RawAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + typedef typename internal::remove_const::type ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock + TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_device(device), + m_impl(op.expression(), device) + { + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + const Shuffle& shuffle = op.shufflePermutation(); + m_is_identity = true; + for (int i = 0; i < NumDims; ++i) { + m_shuffle[i] = static_cast(shuffle[i]); + m_dimensions[i] = input_dims[shuffle[i]]; + m_inverseShuffle[shuffle[i]] = i; + if (m_is_identity && shuffle[i] != i) { + m_is_identity = false; + } + } + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_unshuffledInputStrides[0] = 1; + m_outputStrides[0] = 1; + + for (int i = 1; i < NumDims; ++i) { + m_unshuffledInputStrides[i] = + m_unshuffledInputStrides[i - 1] * input_dims[i - 1]; + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor( + m_outputStrides[i] > 0 ? 
m_outputStrides[i] : Index(1)); + } + } else { + m_unshuffledInputStrides[NumDims - 1] = 1; + m_outputStrides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_unshuffledInputStrides[i] = + m_unshuffledInputStrides[i + 1] * input_dims[i + 1]; + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + m_fastOutputStrides[i] = internal::TensorIntDivisor( + m_outputStrides[i] > 0 ? m_outputStrides[i] : Index(1)); + } + } + + for (int i = 0; i < NumDims; ++i) { + m_inputStrides[i] = m_unshuffledInputStrides[shuffle[i]]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + +#ifdef EIGEN_USE_THREADS + template + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + if (m_is_identity) { + return m_impl.coeff(index); + } else { + return m_impl.coeff(srcCoeff(index)); + } + } + + template + struct PacketLoader { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static PacketReturnType Run(const Self& self, Index index) { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = self.coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + }; + + template + struct PacketLoader { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static PacketReturnType Run(const Self& self, Index index) { + if (self.m_is_identity) { + return self.m_impl.template packet(index); + } else { + EIGEN_ALIGN_MAX typename internal::remove_const::type 
values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = self.coeff(index + i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + }; + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + return PacketLoader::PacketAccess>::Run(*this, index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + static const int inner_dim = + Layout == static_cast(ColMajor) ? 0 : NumDims - 1; + + const size_t target_size = m_device.firstLevelCacheSize(); + const bool inner_dim_shuffled = m_shuffle[inner_dim] != inner_dim; + + // Shuffled inner dimensions leads to a random memory access, which is not + // captured by default cost model bytes loaded/stored. We add this cost + // explicitly. The number of cycles picked based on the benchmarks. + // TODO(ezhulenev): This number was picked based on a very questionable + // benchmarks, add benchmarks that are representative of real workloads. 
+ using BlockRequirements = internal::TensorBlockResourceRequirements; + if (inner_dim_shuffled) { + return BlockRequirements::uniform(target_size) + .addCostPerCoeff({0, 0, NumDims * 28}); + } else { + return BlockRequirements::skewed(target_size); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool root_of_expr_ast = false) const { + assert(m_impl.data() != NULL); + + typedef internal::TensorBlockIO + TensorBlockIO; + typedef typename TensorBlockIO::Dst TensorBlockIODst; + typedef typename TensorBlockIO::Src TensorBlockIOSrc; + + const typename TensorBlock::Storage block_storage = + TensorBlock::prepareStorage( + desc, scratch, /*allow_strided_storage=*/root_of_expr_ast); + + typename TensorBlockIO::Dimensions input_strides(m_unshuffledInputStrides); + TensorBlockIOSrc src(input_strides, m_impl.data(), srcCoeff(desc.offset())); + + TensorBlockIODst dst(block_storage.dimensions(), block_storage.strides(), + block_storage.data()); + + typename TensorBlockIO::DimensionsMap dst_to_src_dim_map(m_shuffle); + TensorBlockIO::Copy(dst, src, dst_to_src_dim_map); + + return block_storage.AsTensorMaterializedBlock(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + const double compute_cost = m_is_identity ? 
TensorOpCost::AddCost() : + NumDims * (2 * TensorOpCost::AddCost() + + 2 * TensorOpCost::MulCost() + + TensorOpCost::DivCost()); + return m_impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, compute_cost, m_is_identity /* vectorized */, PacketSize); + } + + EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index GetBlockOutputIndex( + Index input_index, + const DSizes& input_block_strides, + const DSizes& output_block_strides, + const DSizes, NumDims>& fast_input_block_strides) const { + Index output_index = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = input_index / fast_input_block_strides[i]; + output_index += idx * output_block_strides[m_inverseShuffle[i]]; + input_index -= idx * input_block_strides[i]; + } + return output_index + input_index * + output_block_strides[m_inverseShuffle[0]]; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = input_index / fast_input_block_strides[i]; + output_index += idx * output_block_strides[m_inverseShuffle[i]]; + input_index -= idx * input_block_strides[i]; + } + return output_index + input_index * + output_block_strides[m_inverseShuffle[NumDims - 1]]; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + return inputIndex + index * m_inputStrides[0]; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += 
idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + return inputIndex + index * m_inputStrides[NumDims - 1]; + } + } + + Dimensions m_dimensions; + bool m_is_identity; + array m_shuffle; + array m_inverseShuffle; // TODO(ezhulenev): Make it int type. + array m_outputStrides; + array, NumDims> m_fastOutputStrides; + array m_inputStrides; + array m_unshuffledInputStrides; + + const Device EIGEN_DEVICE_REF m_device; + TensorEvaluator m_impl; +}; + + +// Eval as lvalue +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> +{ + typedef TensorEvaluator, Device> Base; + + typedef TensorShufflingOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = PacketType::size; + + enum { + IsAligned = false, + PacketAccess = (PacketType::size > 1), + BlockAccess = TensorEvaluator::RawAccess, + PreferBlockAccess = true, + Layout = TensorEvaluator::Layout, + RawAccess = false + }; + + typedef typename internal::remove_const::type ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor TensorBlockDesc; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + + template EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + 
internal::pstore(values, x); + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + this->coeffRef(index+i) = values[i]; + } + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const TensorBlockDesc& desc, const TensorBlock& block) { + eigen_assert(this->m_impl.data() != NULL); + + typedef internal::TensorBlockIO + TensorBlockIO; + typedef typename TensorBlockIO::Dst TensorBlockIODst; + typedef typename TensorBlockIO::Src TensorBlockIOSrc; + + const Scalar* block_buffer = block.data(); + + // TODO(ezhulenev): TensorBlockIO should be able to read from any Eigen + // expression with coefficient and packet access as `src`. + void* mem = NULL; + if (block_buffer == NULL) { + mem = this->m_device.allocate(desc.size() * sizeof(Scalar)); + ScalarNoConst* buf = static_cast(mem); + + typedef internal::TensorBlockAssignment< + ScalarNoConst, NumDims, typename TensorBlock::XprType, Index> + TensorBlockAssignment; + + TensorBlockAssignment::Run( + TensorBlockAssignment::target( + desc.dimensions(), internal::strides(desc.dimensions()), + buf), + block.expr()); + + block_buffer = buf; + } + + // Read from block. + TensorBlockIOSrc src(internal::strides(desc.dimensions()), + block_buffer); + + // Write to the output buffer. + typename TensorBlockIO::Dimensions output_strides( + this->m_unshuffledInputStrides); + typename TensorBlockIO::Dimensions output_dimensions; + for (int i = 0; i < NumDims; ++i) { + output_dimensions[this->m_shuffle[i]] = desc.dimension(i); + } + TensorBlockIODst dst(output_dimensions, output_strides, this->m_impl.data(), + this->srcCoeff(desc.offset())); + + // Reorder dimensions according to the shuffle. + typename TensorBlockIO::DimensionsMap dst_to_src_dim_map; + for (int i = 0; i < NumDims; ++i) { + dst_to_src_dim_map[i] = static_cast(this->m_inverseShuffle[i]); + } + TensorBlockIO::Copy(dst, src, dst_to_src_dim_map); + + // Deallocate temporary buffer used for the block materialization. 
+ if (mem != NULL) this->m_device.deallocate(mem); + } +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorStorage.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorStorage.h new file mode 100644 index 0000000..5ff0880 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorStorage.h @@ -0,0 +1,161 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler +// Copyright (C) 2014-2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSORSTORAGE_H +#define EIGEN_CXX11_TENSOR_TENSORSTORAGE_H + +#ifdef EIGEN_TENSOR_STORAGE_CTOR_PLUGIN + #define EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN EIGEN_TENSOR_STORAGE_CTOR_PLUGIN; +#else + #define EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN +#endif + +namespace Eigen { + +/** \internal + * + * \class TensorStorage + * \ingroup CXX11_Tensor_Module + * + * \brief Stores the data of a tensor + * + * This class stores the data of fixed-size, dynamic-size or mixed tensors + * in a way as compact as possible. + * + * \sa Tensor + */ +template class TensorStorage; + + +// Pure fixed-size storage +template +class TensorStorage +{ + private: + static const std::size_t Size = FixedDimensions::total_size; + + // Allocate an array of size at least one to prevent compiler warnings. 
+ static const std::size_t MinSize = max_n_1::size; + EIGEN_ALIGN_MAX T m_data[MinSize]; + + public: + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorStorage() { + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T *data() { return m_data; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T *data() const { return m_data; } + + static EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const FixedDimensions& dimensions() + { + static const FixedDimensions* singleton_dimensions = new FixedDimensions(); + return *singleton_dimensions; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE DenseIndex size() const { return Size; } +}; + +// pure dynamic +template +class TensorStorage, Options_> +{ + public: + typedef IndexType Index; + typedef DSizes Dimensions; + typedef TensorStorage, Options_> Self; + + EIGEN_DEVICE_FUNC TensorStorage() : m_data(0), m_dimensions() { + if (NumIndices_ == 0) { + m_data = internal::conditional_aligned_new_auto(1); + } + } + EIGEN_DEVICE_FUNC TensorStorage(internal::constructor_without_unaligned_array_assert) + : m_data(0), m_dimensions(internal::template repeat(0)) {} + EIGEN_DEVICE_FUNC TensorStorage(Index size, const array& dimensions) + : m_data(internal::conditional_aligned_new_auto(size)), m_dimensions(dimensions) + { EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + template + EIGEN_DEVICE_FUNC TensorStorage(DenseIndex... indices) : m_dimensions(indices...) 
{ + m_data = internal::conditional_aligned_new_auto(internal::array_prod(m_dimensions)); + } +#endif + + EIGEN_DEVICE_FUNC TensorStorage(const Self& other) + : m_data(internal::conditional_aligned_new_auto(internal::array_prod(other.m_dimensions))) + , m_dimensions(other.m_dimensions) + { + internal::smart_copy(other.m_data, other.m_data+internal::array_prod(other.m_dimensions), m_data); + } + EIGEN_DEVICE_FUNC Self& operator=(const Self& other) + { + if (this != &other) { + Self tmp(other); + this->swap(tmp); + } + return *this; + } + +#if EIGEN_HAS_RVALUE_REFERENCES + EIGEN_DEVICE_FUNC TensorStorage(Self&& other) : TensorStorage() + { + *this = std::move(other); + } + + EIGEN_DEVICE_FUNC Self& operator=(Self&& other) + { + numext::swap(m_data, other.m_data); + numext::swap(m_dimensions, other.m_dimensions); + return *this; + } +#endif + + EIGEN_DEVICE_FUNC ~TensorStorage() { internal::conditional_aligned_delete_auto(m_data, internal::array_prod(m_dimensions)); } + EIGEN_DEVICE_FUNC void swap(Self& other) + { numext::swap(m_data,other.m_data); numext::swap(m_dimensions,other.m_dimensions); } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {return m_dimensions;} + + EIGEN_DEVICE_FUNC void resize(Index size, const array& nbDimensions) + { + const Index currentSz = internal::array_prod(m_dimensions); + if(size != currentSz) + { + internal::conditional_aligned_delete_auto(m_data, currentSz); + if (size) + m_data = internal::conditional_aligned_new_auto(size); + else if (NumIndices_ == 0) { + m_data = internal::conditional_aligned_new_auto(1); + } + else + m_data = 0; + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) + } + m_dimensions = nbDimensions; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T *data() { return m_data; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T *data() const { return m_data; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); } + + private: + T *m_data; + Dimensions 
m_dimensions; +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSORSTORAGE_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorStriding.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorStriding.h new file mode 100644 index 0000000..2f62a66 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorStriding.h @@ -0,0 +1,346 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H +#define EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H + +namespace Eigen { + +/** \class TensorStriding + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor striding class. + * + * + */ +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorStridingOpEIGEN_DEVICE_REF type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorStridingOp type; +}; + +} // end namespace internal + + + +template +class TensorStridingOp : public TensorBase > +{ + public: + typedef TensorBase > Base; + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef 
typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingOp(const XprType& expr, const Strides& dims) + : m_xpr(expr), m_dims(dims) {} + + EIGEN_DEVICE_FUNC + const Strides& strides() const { return m_dims; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorStridingOp) + + protected: + typename XprType::Nested m_xpr; + const Strides m_dims; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorStridingOp XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::Dimensions>::value; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + enum { + IsAligned = /*TensorEvaluator::IsAligned*/false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) + { + m_dimensions = m_impl.dimensions(); + for (int i = 0; i < NumDims; ++i) { + m_dimensions[i] =Eigen::numext::ceil(static_cast(m_dimensions[i]) / op.strides()[i]); + } + + const typename TensorEvaluator::Dimensions& input_dims = 
m_impl.dimensions(); + if (static_cast(Layout) == static_cast(ColMajor)) { + m_outputStrides[0] = 1; + m_inputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1]; + m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1]; + m_inputStrides[i-1] *= op.strides()[i-1]; + } + m_inputStrides[NumDims-1] *= op.strides()[NumDims-1]; + } else { // RowMajor + m_outputStrides[NumDims-1] = 1; + m_inputStrides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; + m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1]; + m_inputStrides[i+1] *= op.strides()[i+1]; + } + m_inputStrides[0] *= op.strides()[0]; + } + } + + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType/*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(srcCoeff(index)); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + PacketSize - 1}; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / m_outputStrides[i]; + const Index idx1 = indices[1] / m_outputStrides[i]; + inputIndices[0] += idx0 * m_inputStrides[i]; + inputIndices[1] += idx1 * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += indices[0] * m_inputStrides[0]; + inputIndices[1] += 
indices[1] * m_inputStrides[0]; + } else { // RowMajor + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / m_outputStrides[i]; + const Index idx1 = indices[1] / m_outputStrides[i]; + inputIndices[0] += idx0 * m_inputStrides[i]; + inputIndices[1] += idx1 * m_inputStrides[i]; + indices[0] -= idx0 * m_outputStrides[i]; + indices[1] -= idx1 * m_outputStrides[i]; + } + inputIndices[0] += indices[0] * m_inputStrides[NumDims-1]; + inputIndices[1] += indices[1] * m_inputStrides[NumDims-1]; + } + if (inputIndices[1] - inputIndices[0] == PacketSize - 1) { + PacketReturnType rslt = m_impl.template packet(inputIndices[0]); + return rslt; + } + else { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + values[0] = m_impl.coeff(inputIndices[0]); + values[PacketSize-1] = m_impl.coeff(inputIndices[1]); + EIGEN_UNROLL_LOOP + for (int i = 1; i < PacketSize-1; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + double compute_cost = (NumDims - 1) * (TensorOpCost::AddCost() + + TensorOpCost::MulCost() + + TensorOpCost::DivCost()) + + TensorOpCost::MulCost(); + if (vectorized) { + compute_cost *= 2; // packet() computes two indices + } + const int innerDim = (static_cast(Layout) == static_cast(ColMajor)) ? 0 : (NumDims - 1); + return m_impl.costPerCoeff(vectorized && m_inputStrides[innerDim] == 1) + + // Computation is not vectorized per se, but it is done once per packet. 
+ TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const + { + Index inputIndex = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += index * m_inputStrides[0]; + } else { // RowMajor + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + inputIndex += idx * m_inputStrides[i]; + index -= idx * m_outputStrides[i]; + } + inputIndex += index * m_inputStrides[NumDims-1]; + } + return inputIndex; + } + + Dimensions m_dimensions; + array m_outputStrides; + array m_inputStrides; + TensorEvaluator m_impl; +}; + +// Eval as lvalue +template +struct TensorEvaluator, Device> + : public TensorEvaluator, Device> +{ + typedef TensorStridingOp XprType; + typedef TensorEvaluator Base; + // typedef typename XprType::Index Index; + static const int NumDims = internal::array_size::Dimensions>::value; + // typedef DSizes Dimensions; + + enum { + IsAligned = /*TensorEvaluator::IsAligned*/false, + PacketAccess = TensorEvaluator::PacketAccess, + PreferBlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename 
PacketType::type PacketReturnType; + static const int PacketSize = PacketType::size; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) + { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void writePacket(Index index, const PacketReturnType& x) + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < this->dimensions().TotalSize()); + + Index inputIndices[] = {0, 0}; + Index indices[] = {index, index + PacketSize - 1}; + if (static_cast(Layout) == static_cast(ColMajor)) { + EIGEN_UNROLL_LOOP + for (int i = NumDims - 1; i > 0; --i) { + const Index idx0 = indices[0] / this->m_outputStrides[i]; + const Index idx1 = indices[1] / this->m_outputStrides[i]; + inputIndices[0] += idx0 * this->m_inputStrides[i]; + inputIndices[1] += idx1 * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += indices[0] * this->m_inputStrides[0]; + inputIndices[1] += indices[1] * this->m_inputStrides[0]; + } else { // RowMajor + EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx0 = indices[0] / this->m_outputStrides[i]; + const Index idx1 = indices[1] / this->m_outputStrides[i]; + inputIndices[0] += idx0 * this->m_inputStrides[i]; + inputIndices[1] += idx1 * this->m_inputStrides[i]; + indices[0] -= idx0 * this->m_outputStrides[i]; + indices[1] -= idx1 * this->m_outputStrides[i]; + } + inputIndices[0] += indices[0] * this->m_inputStrides[NumDims-1]; + inputIndices[1] += indices[1] * this->m_inputStrides[NumDims-1]; + } + if (inputIndices[1] - inputIndices[0] == PacketSize - 1) { + this->m_impl.template writePacket(inputIndices[0], x); + } + else { + EIGEN_ALIGN_MAX Scalar values[PacketSize]; + internal::pstore(values, x); + this->m_impl.coeffRef(inputIndices[0]) = values[0]; + this->m_impl.coeffRef(inputIndices[1]) = 
values[PacketSize-1]; + EIGEN_UNROLL_LOOP + for (int i = 1; i < PacketSize-1; ++i) { + this->coeffRef(index+i) = values[i]; + } + } + } +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorTrace.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorTrace.h new file mode 100644 index 0000000..926ecdd --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorTrace.h @@ -0,0 +1,303 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Gagan Goel +// Copyright (C) 2017 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_TRACE_H +#define EIGEN_CXX11_TENSOR_TENSOR_TRACE_H + +namespace Eigen { + +/** \class TensorTrace + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor Trace class. 
+ * + * + */ + +namespace internal { +template +struct traits > : public traits +{ + typedef typename XprType::Scalar Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions - array_size::value; + static const int Layout = XprTraits::Layout; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorTraceOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorTraceOp type; +}; + +} // end namespace internal + + +template +class TensorTraceOp : public TensorBase > +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTraceOp(const XprType& expr, const Dims& dims) + : m_xpr(expr), m_dims(dims) { + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Dims& dims() const { return m_dims; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const typename internal::remove_all::type& expression() const { return m_xpr; } + + protected: + typename XprType::Nested m_xpr; + const Dims m_dims; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorTraceOp XprType; + static const int NumInputDims = internal::array_size::Dimensions>::value; + static const int NumReducedDims = internal::array_size::value; + static const int NumOutputDims = NumInputDims - NumReducedDims; + typedef typename XprType::Index Index; + typedef DSizes Dimensions; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType 
CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_traceDim(1), m_device(device) + { + + EIGEN_STATIC_ASSERT((NumOutputDims >= 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((NumReducedDims >= 2) || ((NumReducedDims == 0) && (NumInputDims == 0)), YOU_MADE_A_PROGRAMMING_MISTAKE); + + for (int i = 0; i < NumInputDims; ++i) { + m_reduced[i] = false; + } + + const Dims& op_dims = op.dims(); + for (int i = 0; i < NumReducedDims; ++i) { + eigen_assert(op_dims[i] >= 0); + eigen_assert(op_dims[i] < NumInputDims); + m_reduced[op_dims[i]] = true; + } + + // All the dimensions should be distinct to compute the trace + int num_distinct_reduce_dims = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (m_reduced[i]) { + ++num_distinct_reduce_dims; + } + } + + eigen_assert(num_distinct_reduce_dims == NumReducedDims); + + // Compute the dimensions of the result. 
+ const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + + int output_index = 0; + int reduced_index = 0; + for (int i = 0; i < NumInputDims; ++i) { + if (m_reduced[i]) { + m_reducedDims[reduced_index] = input_dims[i]; + if (reduced_index > 0) { + // All the trace dimensions must have the same size + eigen_assert(m_reducedDims[0] == m_reducedDims[reduced_index]); + } + ++reduced_index; + } + else { + m_dimensions[output_index] = input_dims[i]; + ++output_index; + } + } + + if (NumReducedDims != 0) { + m_traceDim = m_reducedDims[0]; + } + + // Compute the output strides + if (NumOutputDims > 0) { + if (static_cast(Layout) == static_cast(ColMajor)) { + m_outputStrides[0] = 1; + for (int i = 1; i < NumOutputDims; ++i) { + m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; + } + } + else { + m_outputStrides.back() = 1; + for (int i = NumOutputDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; + } + } + } + + // Compute the input strides + if (NumInputDims > 0) { + array input_strides; + if (static_cast(Layout) == static_cast(ColMajor)) { + input_strides[0] = 1; + for (int i = 1; i < NumInputDims; ++i) { + input_strides[i] = input_strides[i - 1] * input_dims[i - 1]; + } + } + else { + input_strides.back() = 1; + for (int i = NumInputDims - 2; i >= 0; --i) { + input_strides[i] = input_strides[i + 1] * input_dims[i + 1]; + } + } + + output_index = 0; + reduced_index = 0; + for (int i = 0; i < NumInputDims; ++i) { + if(m_reduced[i]) { + m_reducedStrides[reduced_index] = input_strides[i]; + ++reduced_index; + } + else { + m_preservedStrides[output_index] = input_strides[i]; + ++output_index; + } + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { + return m_dimensions; + } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_STRONG_INLINE void cleanup() { + 
m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + // Initialize the result + CoeffReturnType result = internal::cast(0); + Index index_stride = 0; + for (int i = 0; i < NumReducedDims; ++i) { + index_stride += m_reducedStrides[i]; + } + + // If trace is requested along all dimensions, starting index would be 0 + Index cur_index = 0; + if (NumOutputDims != 0) + cur_index = firstInput(index); + for (Index i = 0; i < m_traceDim; ++i) { + result += m_impl.coeff(cur_index); + cur_index += index_stride; + } + + return result; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { + + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index + i); + } + PacketReturnType result = internal::ploadt(values); + return result; + } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif + + protected: + // Given the output index, finds the first index in the input tensor used to compute the trace + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { + Index startInput = 0; + if (static_cast(Layout) == static_cast(ColMajor)) { + for (int i = NumOutputDims - 1; i > 0; --i) { + const Index idx = index / m_outputStrides[i]; + startInput += idx * m_preservedStrides[i]; + index -= idx * m_outputStrides[i]; + } + startInput += index * m_preservedStrides[0]; + } + else { + for (int i = 0; i < NumOutputDims - 1; ++i) { + const Index idx = index / m_outputStrides[i]; + startInput += idx * m_preservedStrides[i]; + index -= idx * m_outputStrides[i]; + } + startInput += index * 
m_preservedStrides[NumOutputDims - 1]; + } + return startInput; + } + + Dimensions m_dimensions; + TensorEvaluator m_impl; + // Initialize the size of the trace dimension + Index m_traceDim; + const Device EIGEN_DEVICE_REF m_device; + array m_reduced; + array m_reducedDims; + array m_outputStrides; + array m_reducedStrides; + array m_preservedStrides; +}; + + +} // End namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_TRACE_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorTraits.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorTraits.h new file mode 100644 index 0000000..4f7fd34 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorTraits.h @@ -0,0 +1,264 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H +#define EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H + +namespace Eigen { +namespace internal { + + +template +class compute_tensor_flags +{ + enum { + is_dynamic_size_storage = 1, + + is_aligned = + ( + ((Options&DontAlign)==0) && ( +#if EIGEN_MAX_STATIC_ALIGN_BYTES>0 + (!is_dynamic_size_storage) +#else + 0 +#endif + | +#if EIGEN_MAX_ALIGN_BYTES>0 + is_dynamic_size_storage +#else + 0 +#endif + ) + ), + packet_access_bit = packet_traits::Vectorizable && is_aligned ? PacketAccessBit : 0 + }; + + public: + enum { ret = packet_access_bit }; +}; + + +template +struct traits > +{ + typedef Scalar_ Scalar; + typedef Dense StorageKind; + typedef IndexType_ Index; + static const int NumDimensions = NumIndices_; + static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor; + enum { + Options = Options_, + Flags = compute_tensor_flags::ret | (is_const::value ? 
0 : LvalueBit) + }; + template struct MakePointer { + typedef T* Type; + }; + typedef typename MakePointer::Type PointerType; +}; + + +template +struct traits > +{ + typedef Scalar_ Scalar; + typedef Dense StorageKind; + typedef IndexType_ Index; + static const int NumDimensions = array_size::value; + static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor; + enum { + Options = Options_, + Flags = compute_tensor_flags::ret | (is_const::value ? 0: LvalueBit) + }; + template struct MakePointer { + typedef T* Type; + }; + typedef typename MakePointer::Type PointerType; +}; + + +template class MakePointer_> +struct traits > + : public traits +{ + typedef traits BaseTraits; + typedef typename BaseTraits::Scalar Scalar; + typedef typename BaseTraits::StorageKind StorageKind; + typedef typename BaseTraits::Index Index; + static const int NumDimensions = BaseTraits::NumDimensions; + static const int Layout = BaseTraits::Layout; + enum { + Options = Options_, + Flags = BaseTraits::Flags + }; + template struct MakePointer { + // Intermediate typedef to workaround MSVC issue. 
+ typedef MakePointer_ MakePointerT; + typedef typename MakePointerT::Type Type; + }; + typedef typename MakePointer::Type PointerType; +}; + +template +struct traits > + : public traits +{ + typedef traits BaseTraits; + typedef typename BaseTraits::Scalar Scalar; + typedef typename BaseTraits::StorageKind StorageKind; + typedef typename BaseTraits::Index Index; + static const int NumDimensions = BaseTraits::NumDimensions; + static const int Layout = BaseTraits::Layout; + enum { + Options = BaseTraits::Options, + Flags = BaseTraits::Flags + }; + typedef typename BaseTraits::PointerType PointerType; +}; + + +template +struct eval, Eigen::Dense> +{ + typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>EIGEN_DEVICE_REF type; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>EIGEN_DEVICE_REF type; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorFixedSizeEIGEN_DEVICE_REF type; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorFixedSizeEIGEN_DEVICE_REF type; +}; + +template class MakePointer> +struct eval, Eigen::Dense> +{ + typedef const TensorMapEIGEN_DEVICE_REF type; +}; + +template class MakePointer> +struct eval, Eigen::Dense> +{ + typedef const TensorMapEIGEN_DEVICE_REF type; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorRefEIGEN_DEVICE_REF type; +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorRefEIGEN_DEVICE_REF type; +}; + +// TODO nested<> does not exist anymore in Eigen/Core, and it thus has to be removed in favor of ref_selector. 
+template struct nested +{ + typedef typename ref_selector::type type; +}; + +template +struct nested > +{ + typedef const TensorEIGEN_DEVICE_REF type; +}; + +template +struct nested > +{ + typedef const TensorEIGEN_DEVICE_REF type; +}; + +template +struct nested > +{ + typedef const TensorFixedSizeEIGEN_DEVICE_REF type; +}; + +template +struct nested > +{ + typedef const TensorFixedSizeEIGEN_DEVICE_REF type; +}; + + +template +struct nested > +{ + typedef const TensorRefEIGEN_DEVICE_REF type; +}; + +template +struct nested > +{ + typedef const TensorRefEIGEN_DEVICE_REF type; +}; + +} // end namespace internal + +// Convolutional layers take in an input tensor of shape (D, R, C, B), or (D, C, +// R, B), and convolve it with a set of filters, which can also be presented as +// a tensor (D, K, K, M), where M is the number of filters, K is the filter +// size, and each 3-dimensional tensor of size (D, K, K) is a filter. For +// simplicity we assume that we always use square filters (which is usually the +// case in images), hence the two Ks in the tensor dimension. It also takes in +// a few additional parameters: +// Stride (S): The convolution stride is the offset between locations where we +// apply the filters. A larger stride means that the output will be +// spatially smaller. +// Padding (P): The padding we apply to the input tensor along the R and C +// dimensions. This is usually used to make sure that the spatial +// dimensions of the output matches our intention. +// +// Two types of padding are often used: +// SAME: The pad value is computed so that the output will have size +// R/S and C/S. +// VALID: no padding is carried out. +// When we do padding, the padded values at the padded locations are usually +// zero. 
+// +// The output dimensions for convolution, when given all the parameters above, +// are as follows: +// When Padding = SAME: the output size is (B, R', C', M), where +// R' = ceil(float(R) / float(S)) +// C' = ceil(float(C) / float(S)) +// where ceil is the ceiling function. The input tensor is padded with 0 as +// needed. The number of padded rows and columns are computed as: +// Pr = ((R' - 1) * S + K - R) / 2 +// Pc = ((C' - 1) * S + K - C) / 2 +// when the stride is 1, we have the simplified case R'=R, C'=C, Pr=Pc=(K-1)/2. +// This is where SAME comes from - the output has the same size as the input has. +// When Padding = VALID: the output size is computed as +// R' = ceil(float(R - K + 1) / float(S)) +// C' = ceil(float(C - K + 1) / float(S)) +// and the number of padded rows and columns are computed in the same way as in +// the SAME case. +// When the stride is 1, we have the simplified case R'=R-K+1, C'=C-K+1, Pr=0, +// Pc=0. +typedef enum { + PADDING_VALID = 1, + PADDING_SAME = 2 +} PaddingType; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorUInt128.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorUInt128.h new file mode 100644 index 0000000..d23f2e4 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorUInt128.h @@ -0,0 +1,249 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_CXX11_TENSOR_TENSOR_UINT128_H +#define EIGEN_CXX11_TENSOR_TENSOR_UINT128_H + +namespace Eigen { +namespace internal { + + +template +struct static_val { + static const uint64_t value = n; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE operator uint64_t() const { return n; } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val() { } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val(const T& v) { + EIGEN_UNUSED_VARIABLE(v); + eigen_assert(v == n); + } +}; + + +template +struct TensorUInt128 +{ + HIGH high; + LOW low; + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + TensorUInt128(const TensorUInt128& other) : high(other.high), low(other.low) { + EIGEN_STATIC_ASSERT(sizeof(OTHER_HIGH) <= sizeof(HIGH), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(sizeof(OTHER_LOW) <= sizeof(LOW), YOU_MADE_A_PROGRAMMING_MISTAKE); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + TensorUInt128& operator = (const TensorUInt128& other) { + EIGEN_STATIC_ASSERT(sizeof(OTHER_HIGH) <= sizeof(HIGH), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT(sizeof(OTHER_LOW) <= sizeof(LOW), YOU_MADE_A_PROGRAMMING_MISTAKE); + high = other.high; + low = other.low; + return *this; + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + explicit TensorUInt128(const T& x) : high(0), low(x) { + eigen_assert((static_cast::type>(x) <= NumTraits::highest())); + eigen_assert(x >= 0); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + TensorUInt128(HIGH y, LOW x) : high(y), low(x) { } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE operator LOW() const { + return low; + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LOW lower() const { + return low; + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HIGH upper() const { + return high; + } +}; + + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +bool operator == (const TensorUInt128& lhs, const TensorUInt128& rhs) +{ + return (lhs.high == rhs.high) & (lhs.low == rhs.low); +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE 
+bool operator != (const TensorUInt128& lhs, const TensorUInt128& rhs) +{ + return (lhs.high != rhs.high) | (lhs.low != rhs.low); +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +bool operator >= (const TensorUInt128& lhs, const TensorUInt128& rhs) +{ + if (lhs.high != rhs.high) { + return lhs.high > rhs.high; + } + return lhs.low >= rhs.low; +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +bool operator < (const TensorUInt128& lhs, const TensorUInt128& rhs) +{ + if (lhs.high != rhs.high) { + return lhs.high < rhs.high; + } + return lhs.low < rhs.low; +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +TensorUInt128 operator + (const TensorUInt128& lhs, const TensorUInt128& rhs) +{ + TensorUInt128 result(lhs.high + rhs.high, lhs.low + rhs.low); + if (result.low < rhs.low) { + result.high += 1; + } + return result; +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +TensorUInt128 operator - (const TensorUInt128& lhs, const TensorUInt128& rhs) +{ + TensorUInt128 result(lhs.high - rhs.high, lhs.low - rhs.low); + if (result.low > lhs.low) { + result.high -= 1; + } + return result; +} + + +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +TensorUInt128 operator * (const TensorUInt128& lhs, const TensorUInt128& rhs) +{ + // Split each 128-bit integer into 4 32-bit integers, and then do the + // multiplications by hand as follow: + // lhs a b c d + // rhs e f g h + // ----------- + // ah bh ch dh + // bg cg dg + // cf df + // de + // The result is stored in 2 64bit integers, high and low. 
+ + const uint64_t LOW = 0x00000000FFFFFFFFLL; + const uint64_t HIGH = 0xFFFFFFFF00000000LL; + + uint64_t d = lhs.low & LOW; + uint64_t c = (lhs.low & HIGH) >> 32LL; + uint64_t b = lhs.high & LOW; + uint64_t a = (lhs.high & HIGH) >> 32LL; + + uint64_t h = rhs.low & LOW; + uint64_t g = (rhs.low & HIGH) >> 32LL; + uint64_t f = rhs.high & LOW; + uint64_t e = (rhs.high & HIGH) >> 32LL; + + // Compute the low 32 bits of low + uint64_t acc = d * h; + uint64_t low = acc & LOW; + // Compute the high 32 bits of low. Add a carry every time we wrap around + acc >>= 32LL; + uint64_t carry = 0; + uint64_t acc2 = acc + c * h; + if (acc2 < acc) { + carry++; + } + acc = acc2 + d * g; + if (acc < acc2) { + carry++; + } + low |= (acc << 32LL); + + // Carry forward the high bits of acc to initiate the computation of the + // low 32 bits of high + acc2 = (acc >> 32LL) | (carry << 32LL); + carry = 0; + + acc = acc2 + b * h; + if (acc < acc2) { + carry++; + } + acc2 = acc + c * g; + if (acc2 < acc) { + carry++; + } + acc = acc2 + d * f; + if (acc < acc2) { + carry++; + } + uint64_t high = acc & LOW; + + // Start to compute the high 32 bits of high. 
+ acc2 = (acc >> 32LL) | (carry << 32LL); + + acc = acc2 + a * h; + acc2 = acc + b * g; + acc = acc2 + c * f; + acc2 = acc + d * e; + high |= (acc2 << 32LL); + + return TensorUInt128(high, low); +} + +template +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +TensorUInt128 operator / (const TensorUInt128& lhs, const TensorUInt128& rhs) +{ + if (rhs == TensorUInt128, static_val<1> >(1)) { + return TensorUInt128(lhs.high, lhs.low); + } else if (lhs < rhs) { + return TensorUInt128(0); + } else { + // calculate the biggest power of 2 times rhs that's less than or equal to lhs + TensorUInt128 power2(1); + TensorUInt128 d(rhs); + TensorUInt128 tmp(lhs - d); + while (lhs >= d) { + tmp = tmp - d; + d = d + d; + power2 = power2 + power2; + } + + tmp = TensorUInt128(lhs.high, lhs.low); + TensorUInt128 result(0); + while (power2 != TensorUInt128, static_val<0> >(0)) { + if (tmp >= d) { + tmp = tmp - d; + result = result + power2; + } + // Shift right + power2 = TensorUInt128(power2.high >> 1, (power2.low >> 1) | (power2.high << 63)); + d = TensorUInt128(d.high >> 1, (d.low >> 1) | (d.high << 63)); + } + + return result; + } +} + + +} // namespace internal +} // namespace Eigen + + +#endif // EIGEN_CXX11_TENSOR_TENSOR_UINT128_H diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorVolumePatch.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorVolumePatch.h new file mode 100644 index 0000000..0beb9ff --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/Tensor/TensorVolumePatch.h @@ -0,0 +1,629 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H +#define EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H + +namespace Eigen { + +/** \class TensorVolumePatch + * \ingroup CXX11_Tensor_Module + * + * \brief Patch extraction specialized for processing of volumetric data. 
+ * This assumes that the input has a least 4 dimensions ordered as follows: + * - channels + * - planes + * - rows + * - columns + * - (optional) additional dimensions such as time or batch size. + * Calling the volume patch code with patch_planes, patch_rows, and patch_cols + * is equivalent to calling the regular patch extraction code with parameters + * d, patch_planes, patch_rows, patch_cols, and 1 for all the additional + * dimensions. + */ +namespace internal { + +template +struct traits > : public traits +{ + typedef typename internal::remove_const::type Scalar; + typedef traits XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions + 1; + static const int Layout = XprTraits::Layout; + typedef typename XprTraits::PointerType PointerType; + +}; + +template +struct eval, Eigen::Dense> +{ + typedef const TensorVolumePatchOp& type; +}; + +template +struct nested, 1, typename eval >::type> +{ + typedef TensorVolumePatchOp type; +}; + +} // end namespace internal + +template +class TensorVolumePatchOp : public TensorBase, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits::Scalar Scalar; + typedef typename Eigen::NumTraits::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested::type Nested; + typedef typename Eigen::internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::traits::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorVolumePatchOp(const XprType& expr, DenseIndex patch_planes, DenseIndex patch_rows, DenseIndex patch_cols, + DenseIndex plane_strides, DenseIndex row_strides, DenseIndex col_strides, + DenseIndex in_plane_strides, DenseIndex in_row_strides, DenseIndex in_col_strides, + DenseIndex plane_inflate_strides, DenseIndex 
row_inflate_strides, DenseIndex col_inflate_strides, + PaddingType padding_type, Scalar padding_value) + : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols), + m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides), + m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides), + m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides), + m_padding_explicit(false), m_padding_top_z(0), m_padding_bottom_z(0), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0), + m_padding_type(padding_type), m_padding_value(padding_value) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorVolumePatchOp(const XprType& expr, DenseIndex patch_planes, DenseIndex patch_rows, DenseIndex patch_cols, + DenseIndex plane_strides, DenseIndex row_strides, DenseIndex col_strides, + DenseIndex in_plane_strides, DenseIndex in_row_strides, DenseIndex in_col_strides, + DenseIndex plane_inflate_strides, DenseIndex row_inflate_strides, DenseIndex col_inflate_strides, + DenseIndex padding_top_z, DenseIndex padding_bottom_z, + DenseIndex padding_top, DenseIndex padding_bottom, + DenseIndex padding_left, DenseIndex padding_right, + Scalar padding_value) + : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols), + m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides), + m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides), + m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides), + m_padding_explicit(true), m_padding_top_z(padding_top_z), m_padding_bottom_z(padding_bottom_z), m_padding_top(padding_top), m_padding_bottom(padding_bottom), + m_padding_left(padding_left), 
m_padding_right(padding_right), + m_padding_type(PADDING_VALID), m_padding_value(padding_value) {} + + EIGEN_DEVICE_FUNC + DenseIndex patch_planes() const { return m_patch_planes; } + EIGEN_DEVICE_FUNC + DenseIndex patch_rows() const { return m_patch_rows; } + EIGEN_DEVICE_FUNC + DenseIndex patch_cols() const { return m_patch_cols; } + EIGEN_DEVICE_FUNC + DenseIndex plane_strides() const { return m_plane_strides; } + EIGEN_DEVICE_FUNC + DenseIndex row_strides() const { return m_row_strides; } + EIGEN_DEVICE_FUNC + DenseIndex col_strides() const { return m_col_strides; } + EIGEN_DEVICE_FUNC + DenseIndex in_plane_strides() const { return m_in_plane_strides; } + EIGEN_DEVICE_FUNC + DenseIndex in_row_strides() const { return m_in_row_strides; } + EIGEN_DEVICE_FUNC + DenseIndex in_col_strides() const { return m_in_col_strides; } + EIGEN_DEVICE_FUNC + DenseIndex plane_inflate_strides() const { return m_plane_inflate_strides; } + EIGEN_DEVICE_FUNC + DenseIndex row_inflate_strides() const { return m_row_inflate_strides; } + EIGEN_DEVICE_FUNC + DenseIndex col_inflate_strides() const { return m_col_inflate_strides; } + EIGEN_DEVICE_FUNC + bool padding_explicit() const { return m_padding_explicit; } + EIGEN_DEVICE_FUNC + DenseIndex padding_top_z() const { return m_padding_top_z; } + EIGEN_DEVICE_FUNC + DenseIndex padding_bottom_z() const { return m_padding_bottom_z; } + EIGEN_DEVICE_FUNC + DenseIndex padding_top() const { return m_padding_top; } + EIGEN_DEVICE_FUNC + DenseIndex padding_bottom() const { return m_padding_bottom; } + EIGEN_DEVICE_FUNC + DenseIndex padding_left() const { return m_padding_left; } + EIGEN_DEVICE_FUNC + DenseIndex padding_right() const { return m_padding_right; } + EIGEN_DEVICE_FUNC + PaddingType padding_type() const { return m_padding_type; } + EIGEN_DEVICE_FUNC + Scalar padding_value() const { return m_padding_value; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all::type& + expression() const { return m_xpr; } + + protected: + 
typename XprType::Nested m_xpr; + const DenseIndex m_patch_planes; + const DenseIndex m_patch_rows; + const DenseIndex m_patch_cols; + const DenseIndex m_plane_strides; + const DenseIndex m_row_strides; + const DenseIndex m_col_strides; + const DenseIndex m_in_plane_strides; + const DenseIndex m_in_row_strides; + const DenseIndex m_in_col_strides; + const DenseIndex m_plane_inflate_strides; + const DenseIndex m_row_inflate_strides; + const DenseIndex m_col_inflate_strides; + const bool m_padding_explicit; + const DenseIndex m_padding_top_z; + const DenseIndex m_padding_bottom_z; + const DenseIndex m_padding_top; + const DenseIndex m_padding_bottom; + const DenseIndex m_padding_left; + const DenseIndex m_padding_right; + const PaddingType m_padding_type; + const Scalar m_padding_value; +}; + + +// Eval as rvalue +template +struct TensorEvaluator, Device> +{ + typedef TensorVolumePatchOp XprType; + typedef typename XprType::Index Index; + static const int NumInputDims = internal::array_size::Dimensions>::value; + static const int NumDims = NumInputDims + 1; + typedef DSizes Dimensions; + typedef typename internal::remove_const::type Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + static const int PacketSize = PacketType::size; + typedef StorageMemory Storage; + typedef typename Storage::Type EvaluatorPointerType; + + enum { + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator::PreferBlockAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : + m_impl(op.expression(), device) + { 
+ EIGEN_STATIC_ASSERT((NumDims >= 5), YOU_MADE_A_PROGRAMMING_MISTAKE); + + m_paddingValue = op.padding_value(); + + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); + + // Cache a few variables. + if (static_cast(Layout) == static_cast(ColMajor)) { + m_inputDepth = input_dims[0]; + m_inputPlanes = input_dims[1]; + m_inputRows = input_dims[2]; + m_inputCols = input_dims[3]; + } else { + m_inputDepth = input_dims[NumInputDims-1]; + m_inputPlanes = input_dims[NumInputDims-2]; + m_inputRows = input_dims[NumInputDims-3]; + m_inputCols = input_dims[NumInputDims-4]; + } + + m_plane_strides = op.plane_strides(); + m_row_strides = op.row_strides(); + m_col_strides = op.col_strides(); + + // Input strides and effective input/patch size + m_in_plane_strides = op.in_plane_strides(); + m_in_row_strides = op.in_row_strides(); + m_in_col_strides = op.in_col_strides(); + m_plane_inflate_strides = op.plane_inflate_strides(); + m_row_inflate_strides = op.row_inflate_strides(); + m_col_inflate_strides = op.col_inflate_strides(); + + // The "effective" spatial size after inflating data with zeros. 
+ m_input_planes_eff = (m_inputPlanes - 1) * m_plane_inflate_strides + 1; + m_input_rows_eff = (m_inputRows - 1) * m_row_inflate_strides + 1; + m_input_cols_eff = (m_inputCols - 1) * m_col_inflate_strides + 1; + m_patch_planes_eff = op.patch_planes() + (op.patch_planes() - 1) * (m_in_plane_strides - 1); + m_patch_rows_eff = op.patch_rows() + (op.patch_rows() - 1) * (m_in_row_strides - 1); + m_patch_cols_eff = op.patch_cols() + (op.patch_cols() - 1) * (m_in_col_strides - 1); + + if (op.padding_explicit()) { + m_outputPlanes = numext::ceil((m_input_planes_eff + op.padding_top_z() + op.padding_bottom_z() - m_patch_planes_eff + 1.f) / static_cast(m_plane_strides)); + m_outputRows = numext::ceil((m_input_rows_eff + op.padding_top() + op.padding_bottom() - m_patch_rows_eff + 1.f) / static_cast(m_row_strides)); + m_outputCols = numext::ceil((m_input_cols_eff + op.padding_left() + op.padding_right() - m_patch_cols_eff + 1.f) / static_cast(m_col_strides)); + m_planePaddingTop = op.padding_top_z(); + m_rowPaddingTop = op.padding_top(); + m_colPaddingLeft = op.padding_left(); + } else { + // Computing padding from the type + switch (op.padding_type()) { + case PADDING_VALID: + m_outputPlanes = numext::ceil((m_input_planes_eff - m_patch_planes_eff + 1.f) / static_cast(m_plane_strides)); + m_outputRows = numext::ceil((m_input_rows_eff - m_patch_rows_eff + 1.f) / static_cast(m_row_strides)); + m_outputCols = numext::ceil((m_input_cols_eff - m_patch_cols_eff + 1.f) / static_cast(m_col_strides)); + m_planePaddingTop = 0; + m_rowPaddingTop = 0; + m_colPaddingLeft = 0; + break; + case PADDING_SAME: { + m_outputPlanes = numext::ceil(m_input_planes_eff / static_cast(m_plane_strides)); + m_outputRows = numext::ceil(m_input_rows_eff / static_cast(m_row_strides)); + m_outputCols = numext::ceil(m_input_cols_eff / static_cast(m_col_strides)); + const Index dz = (m_outputPlanes - 1) * m_plane_strides + m_patch_planes_eff - m_input_planes_eff; + const Index dy = (m_outputRows - 1) * 
m_row_strides + m_patch_rows_eff - m_input_rows_eff; + const Index dx = (m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff; + m_planePaddingTop = dz / 2; + m_rowPaddingTop = dy / 2; + m_colPaddingLeft = dx / 2; + break; + } + default: + eigen_assert(false && "unexpected padding"); + } + } + eigen_assert(m_outputRows > 0); + eigen_assert(m_outputCols > 0); + eigen_assert(m_outputPlanes > 0); + + // Dimensions for result of extraction. + if (static_cast(Layout) == static_cast(ColMajor)) { + // ColMajor + // 0: depth + // 1: patch_planes + // 2: patch_rows + // 3: patch_cols + // 4: number of patches + // 5 and beyond: anything else (such as batch). + m_dimensions[0] = input_dims[0]; + m_dimensions[1] = op.patch_planes(); + m_dimensions[2] = op.patch_rows(); + m_dimensions[3] = op.patch_cols(); + m_dimensions[4] = m_outputPlanes * m_outputRows * m_outputCols; + for (int i = 5; i < NumDims; ++i) { + m_dimensions[i] = input_dims[i-1]; + } + } else { + // RowMajor + // NumDims-1: depth + // NumDims-2: patch_planes + // NumDims-3: patch_rows + // NumDims-4: patch_cols + // NumDims-5: number of patches + // NumDims-6 and beyond: anything else (such as batch). + m_dimensions[NumDims-1] = input_dims[NumInputDims-1]; + m_dimensions[NumDims-2] = op.patch_planes(); + m_dimensions[NumDims-3] = op.patch_rows(); + m_dimensions[NumDims-4] = op.patch_cols(); + m_dimensions[NumDims-5] = m_outputPlanes * m_outputRows * m_outputCols; + for (int i = NumDims-6; i >= 0; --i) { + m_dimensions[i] = input_dims[i]; + } + } + + // Strides for the output tensor. 
+ if (static_cast(Layout) == static_cast(ColMajor)) { + m_rowStride = m_dimensions[1]; + m_colStride = m_dimensions[2] * m_rowStride; + m_patchStride = m_colStride * m_dimensions[3] * m_dimensions[0]; + m_otherStride = m_patchStride * m_dimensions[4]; + } else { + m_rowStride = m_dimensions[NumDims-2]; + m_colStride = m_dimensions[NumDims-3] * m_rowStride; + m_patchStride = m_colStride * m_dimensions[NumDims-4] * m_dimensions[NumDims-1]; + m_otherStride = m_patchStride * m_dimensions[NumDims-5]; + } + + // Strides for navigating through the input tensor. + m_planeInputStride = m_inputDepth; + m_rowInputStride = m_inputDepth * m_inputPlanes; + m_colInputStride = m_inputDepth * m_inputRows * m_inputPlanes; + m_otherInputStride = m_inputDepth * m_inputRows * m_inputCols * m_inputPlanes; + + m_outputPlanesRows = m_outputPlanes * m_outputRows; + + // Fast representations of different variables. + m_fastOtherStride = internal::TensorIntDivisor(m_otherStride); + + m_fastPatchStride = internal::TensorIntDivisor(m_patchStride); + m_fastColStride = internal::TensorIntDivisor(m_colStride); + m_fastRowStride = internal::TensorIntDivisor(m_rowStride); + m_fastInputRowStride = internal::TensorIntDivisor(m_row_inflate_strides); + m_fastInputColStride = internal::TensorIntDivisor(m_col_inflate_strides); + m_fastInputPlaneStride = internal::TensorIntDivisor(m_plane_inflate_strides); + m_fastInputColsEff = internal::TensorIntDivisor(m_input_cols_eff); + m_fastOutputPlanes = internal::TensorIntDivisor(m_outputPlanes); + m_fastOutputPlanesRows = internal::TensorIntDivisor(m_outputPlanesRows); + + if (static_cast(Layout) == static_cast(ColMajor)) { + m_fastOutputDepth = internal::TensorIntDivisor(m_dimensions[0]); + } else { + m_fastOutputDepth = internal::TensorIntDivisor(m_dimensions[NumDims-1]); + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType 
/*data*/) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + // Patch index corresponding to the passed in index. + const Index patchIndex = index / m_fastPatchStride; + + // Spatial offset within the patch. This has to be translated into 3D + // coordinates within the patch. + const Index patchOffset = (index - patchIndex * m_patchStride) / m_fastOutputDepth; + + // Batch, etc. + const Index otherIndex = (NumDims == 5) ? 0 : index / m_fastOtherStride; + const Index patch3DIndex = (NumDims == 5) ? patchIndex : (index - otherIndex * m_otherStride) / m_fastPatchStride; + + // Calculate column index in the input original tensor. + const Index colIndex = patch3DIndex / m_fastOutputPlanesRows; + const Index colOffset = patchOffset / m_fastColStride; + const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft; + const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0); + if (inputCol < 0 || inputCol >= m_input_cols_eff || + ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) { + return Scalar(m_paddingValue); + } + + // Calculate row index in the original input tensor. + const Index rowIndex = (patch3DIndex - colIndex * m_outputPlanesRows) / m_fastOutputPlanes; + const Index rowOffset = (patchOffset - colOffset * m_colStride) / m_fastRowStride; + const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop; + const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? 
(inputRow / m_fastInputRowStride) : 0); + if (inputRow < 0 || inputRow >= m_input_rows_eff || + ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) { + return Scalar(m_paddingValue); + } + + // Calculate plane index in the original input tensor. + const Index planeIndex = (patch3DIndex - m_outputPlanes * (colIndex * m_outputRows + rowIndex)); + const Index planeOffset = patchOffset - colOffset * m_colStride - rowOffset * m_rowStride; + const Index inputPlane = planeIndex * m_plane_strides + planeOffset * m_in_plane_strides - m_planePaddingTop; + const Index origInputPlane = (m_plane_inflate_strides == 1) ? inputPlane : ((inputPlane >= 0) ? (inputPlane / m_fastInputPlaneStride) : 0); + if (inputPlane < 0 || inputPlane >= m_input_planes_eff || + ((m_plane_inflate_strides != 1) && (inputPlane != origInputPlane * m_plane_inflate_strides))) { + return Scalar(m_paddingValue); + } + + const int depth_index = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - 1; + const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index]; + + const Index inputIndex = depth + + origInputRow * m_rowInputStride + + origInputCol * m_colInputStride + + origInputPlane * m_planeInputStride + + otherIndex * m_otherInputStride; + + return m_impl.coeff(inputIndex); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + + if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1 || + m_in_plane_strides != 1 || m_plane_inflate_strides != 1) { + return packetWithPossibleZero(index); + } + + const Index indices[2] = {index, index + PacketSize - 1}; + const Index patchIndex = indices[0] / m_fastPatchStride; + if (patchIndex != indices[1] / m_fastPatchStride) { + return packetWithPossibleZero(index); + 
} + const Index otherIndex = (NumDims == 5) ? 0 : indices[0] / m_fastOtherStride; + eigen_assert(otherIndex == indices[1] / m_fastOtherStride); + + // Find the offset of the element wrt the location of the first element. + const Index patchOffsets[2] = {(indices[0] - patchIndex * m_patchStride) / m_fastOutputDepth, + (indices[1] - patchIndex * m_patchStride) / m_fastOutputDepth}; + + const Index patch3DIndex = (NumDims == 5) ? patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride; + eigen_assert(patch3DIndex == (indices[1] - otherIndex * m_otherStride) / m_fastPatchStride); + + const Index colIndex = patch3DIndex / m_fastOutputPlanesRows; + const Index colOffsets[2] = { + patchOffsets[0] / m_fastColStride, + patchOffsets[1] / m_fastColStride}; + + // Calculate col indices in the original input tensor. + const Index inputCols[2] = { + colIndex * m_col_strides + colOffsets[0] - m_colPaddingLeft, + colIndex * m_col_strides + colOffsets[1] - m_colPaddingLeft}; + if (inputCols[1] < 0 || inputCols[0] >= m_inputCols) { + return internal::pset1(Scalar(m_paddingValue)); + } + + if (inputCols[0] != inputCols[1]) { + return packetWithPossibleZero(index); + } + + const Index rowIndex = (patch3DIndex - colIndex * m_outputPlanesRows) / m_fastOutputPlanes; + const Index rowOffsets[2] = { + (patchOffsets[0] - colOffsets[0] * m_colStride) / m_fastRowStride, + (patchOffsets[1] - colOffsets[1] * m_colStride) / m_fastRowStride}; + eigen_assert(rowOffsets[0] <= rowOffsets[1]); + // Calculate col indices in the original input tensor. 
+ const Index inputRows[2] = { + rowIndex * m_row_strides + rowOffsets[0] - m_rowPaddingTop, + rowIndex * m_row_strides + rowOffsets[1] - m_rowPaddingTop}; + + if (inputRows[1] < 0 || inputRows[0] >= m_inputRows) { + return internal::pset1(Scalar(m_paddingValue)); + } + + if (inputRows[0] != inputRows[1]) { + return packetWithPossibleZero(index); + } + + const Index planeIndex = (patch3DIndex - m_outputPlanes * (colIndex * m_outputRows + rowIndex)); + const Index planeOffsets[2] = { + patchOffsets[0] - colOffsets[0] * m_colStride - rowOffsets[0] * m_rowStride, + patchOffsets[1] - colOffsets[1] * m_colStride - rowOffsets[1] * m_rowStride}; + eigen_assert(planeOffsets[0] <= planeOffsets[1]); + const Index inputPlanes[2] = { + planeIndex * m_plane_strides + planeOffsets[0] - m_planePaddingTop, + planeIndex * m_plane_strides + planeOffsets[1] - m_planePaddingTop}; + + if (inputPlanes[1] < 0 || inputPlanes[0] >= m_inputPlanes) { + return internal::pset1(Scalar(m_paddingValue)); + } + + if (inputPlanes[0] >= 0 && inputPlanes[1] < m_inputPlanes) { + // no padding + const int depth_index = static_cast(Layout) == static_cast(ColMajor) ? 
0 : NumDims - 1; + const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index]; + const Index inputIndex = depth + + inputRows[0] * m_rowInputStride + + inputCols[0] * m_colInputStride + + m_planeInputStride * inputPlanes[0] + + otherIndex * m_otherInputStride; + return m_impl.template packet(inputIndex); + } + + return packetWithPossibleZero(index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + const double compute_cost = + 10 * TensorOpCost::DivCost() + 21 * TensorOpCost::MulCost() + + 8 * TensorOpCost::AddCost(); + return TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + + const TensorEvaluator& impl() const { return m_impl; } + + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index planePaddingTop() const { return m_planePaddingTop; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowPaddingTop() const { return m_rowPaddingTop; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colPaddingLeft() const { return m_colPaddingLeft; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputPlanes() const { return m_outputPlanes; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputRows() const { return m_outputRows; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputCols() const { return m_outputCols; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userPlaneStride() const { return m_plane_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userRowStride() const { return m_row_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userColStride() const { return m_col_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInPlaneStride() const { return m_in_plane_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInRowStride() const { return m_in_row_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInColStride() const { return m_in_col_strides; } + EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE Index planeInflateStride() const { return m_plane_inflate_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowInflateStride() const { return m_row_inflate_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colInflateStride() const { return m_col_inflate_strides; } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_impl.bind(cgh); + } +#endif + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const + { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + EIGEN_UNROLL_LOOP + for (int i = 0; i < PacketSize; ++i) { + values[i] = coeff(index+i); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + + Dimensions m_dimensions; + + // Parameters passed to the constructor. + Index m_plane_strides; + Index m_row_strides; + Index m_col_strides; + + Index m_outputPlanes; + Index m_outputRows; + Index m_outputCols; + + Index m_planePaddingTop; + Index m_rowPaddingTop; + Index m_colPaddingLeft; + + Index m_in_plane_strides; + Index m_in_row_strides; + Index m_in_col_strides; + + Index m_plane_inflate_strides; + Index m_row_inflate_strides; + Index m_col_inflate_strides; + + // Cached input size. + Index m_inputDepth; + Index m_inputPlanes; + Index m_inputRows; + Index m_inputCols; + + // Other cached variables. + Index m_outputPlanesRows; + + // Effective input/patch post-inflation size. + Index m_input_planes_eff; + Index m_input_rows_eff; + Index m_input_cols_eff; + Index m_patch_planes_eff; + Index m_patch_rows_eff; + Index m_patch_cols_eff; + + // Strides for the output tensor. + Index m_otherStride; + Index m_patchStride; + Index m_rowStride; + Index m_colStride; + + // Strides for the input tensor. 
+ Index m_planeInputStride; + Index m_rowInputStride; + Index m_colInputStride; + Index m_otherInputStride; + + internal::TensorIntDivisor m_fastOtherStride; + internal::TensorIntDivisor m_fastPatchStride; + internal::TensorIntDivisor m_fastColStride; + internal::TensorIntDivisor m_fastRowStride; + internal::TensorIntDivisor m_fastInputPlaneStride; + internal::TensorIntDivisor m_fastInputRowStride; + internal::TensorIntDivisor m_fastInputColStride; + internal::TensorIntDivisor m_fastInputColsEff; + internal::TensorIntDivisor m_fastOutputPlanesRows; + internal::TensorIntDivisor m_fastOutputPlanes; + internal::TensorIntDivisor m_fastOutputDepth; + + Scalar m_paddingValue; + + TensorEvaluator m_impl; + + +}; + + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H diff --git a/src/EigenUnsupported/CXX11/src/TensorSymmetry/DynamicSymmetry.h b/src/EigenUnsupported/CXX11/src/TensorSymmetry/DynamicSymmetry.h new file mode 100644 index 0000000..bc4f202 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/TensorSymmetry/DynamicSymmetry.h @@ -0,0 +1,293 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_CXX11_TENSORSYMMETRY_DYNAMICSYMMETRY_H +#define EIGEN_CXX11_TENSORSYMMETRY_DYNAMICSYMMETRY_H + +namespace Eigen { + +class DynamicSGroup +{ + public: + inline explicit DynamicSGroup() : m_numIndices(1), m_elements(), m_generators(), m_globalFlags(0) { m_elements.push_back(ge(Generator(0, 0, 0))); } + inline DynamicSGroup(const DynamicSGroup& o) : m_numIndices(o.m_numIndices), m_elements(o.m_elements), m_generators(o.m_generators), m_globalFlags(o.m_globalFlags) { } + inline DynamicSGroup(DynamicSGroup&& o) : m_numIndices(o.m_numIndices), m_elements(), m_generators(o.m_generators), m_globalFlags(o.m_globalFlags) { std::swap(m_elements, o.m_elements); } + inline DynamicSGroup& operator=(const DynamicSGroup& o) { m_numIndices = o.m_numIndices; m_elements = o.m_elements; m_generators = o.m_generators; m_globalFlags = o.m_globalFlags; return *this; } + inline DynamicSGroup& operator=(DynamicSGroup&& o) { m_numIndices = o.m_numIndices; std::swap(m_elements, o.m_elements); m_generators = o.m_generators; m_globalFlags = o.m_globalFlags; return *this; } + + void add(int one, int two, int flags = 0); + + template + inline void add(Gen_) { add(Gen_::One, Gen_::Two, Gen_::Flags); } + inline void addSymmetry(int one, int two) { add(one, two, 0); } + inline void addAntiSymmetry(int one, int two) { add(one, two, NegationFlag); } + inline void addHermiticity(int one, int two) { add(one, two, ConjugationFlag); } + inline void addAntiHermiticity(int one, int two) { add(one, two, NegationFlag | ConjugationFlag); } + + template + inline RV apply(const std::array& idx, RV initial, Args&&... 
args) const + { + eigen_assert(N >= m_numIndices && "Can only apply symmetry group to objects that have at least the required amount of indices."); + for (std::size_t i = 0; i < size(); i++) + initial = Op::run(h_permute(i, idx, typename internal::gen_numeric_list::type()), m_elements[i].flags, initial, std::forward(args)...); + return initial; + } + + template + inline RV apply(const std::vector& idx, RV initial, Args&&... args) const + { + eigen_assert(idx.size() >= m_numIndices && "Can only apply symmetry group to objects that have at least the required amount of indices."); + for (std::size_t i = 0; i < size(); i++) + initial = Op::run(h_permute(i, idx), m_elements[i].flags, initial, std::forward(args)...); + return initial; + } + + inline int globalFlags() const { return m_globalFlags; } + inline std::size_t size() const { return m_elements.size(); } + + template + inline internal::tensor_symmetry_value_setter operator()(Tensor_& tensor, typename Tensor_::Index firstIndex, IndexTypes... 
otherIndices) const + { + static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + return operator()(tensor, std::array{{firstIndex, otherIndices...}}); + } + + template + inline internal::tensor_symmetry_value_setter operator()(Tensor_& tensor, std::array const& indices) const + { + return internal::tensor_symmetry_value_setter(tensor, *this, indices); + } + private: + struct GroupElement { + std::vector representation; + int flags; + bool isId() const + { + for (std::size_t i = 0; i < representation.size(); i++) + if (i != (size_t)representation[i]) + return false; + return true; + } + }; + struct Generator { + int one; + int two; + int flags; + constexpr inline Generator(int one_, int two_, int flags_) : one(one_), two(two_), flags(flags_) {} + }; + + std::size_t m_numIndices; + std::vector m_elements; + std::vector m_generators; + int m_globalFlags; + + template + inline std::array h_permute(std::size_t which, const std::array& idx, internal::numeric_list) const + { + return std::array{{ idx[n >= m_numIndices ? n : m_elements[which].representation[n]]... 
}}; + } + + template + inline std::vector h_permute(std::size_t which, std::vector idx) const + { + std::vector result; + result.reserve(idx.size()); + for (auto k : m_elements[which].representation) + result.push_back(idx[k]); + for (std::size_t i = m_numIndices; i < idx.size(); i++) + result.push_back(idx[i]); + return result; + } + + inline GroupElement ge(Generator const& g) const + { + GroupElement result; + result.representation.reserve(m_numIndices); + result.flags = g.flags; + for (std::size_t k = 0; k < m_numIndices; k++) { + if (k == (std::size_t)g.one) + result.representation.push_back(g.two); + else if (k == (std::size_t)g.two) + result.representation.push_back(g.one); + else + result.representation.push_back(int(k)); + } + return result; + } + + GroupElement mul(GroupElement, GroupElement) const; + inline GroupElement mul(Generator g1, GroupElement g2) const + { + return mul(ge(g1), g2); + } + + inline GroupElement mul(GroupElement g1, Generator g2) const + { + return mul(g1, ge(g2)); + } + + inline GroupElement mul(Generator g1, Generator g2) const + { + return mul(ge(g1), ge(g2)); + } + + inline int findElement(GroupElement e) const + { + for (auto ee : m_elements) { + if (ee.representation == e.representation) + return ee.flags ^ e.flags; + } + return -1; + } + + void updateGlobalFlags(int flagDiffOfSameGenerator); +}; + +// dynamic symmetry group that auto-adds the template parameters in the constructor +template +class DynamicSGroupFromTemplateArgs : public DynamicSGroup +{ + public: + inline DynamicSGroupFromTemplateArgs() : DynamicSGroup() + { + add_all(internal::type_list()); + } + inline DynamicSGroupFromTemplateArgs(DynamicSGroupFromTemplateArgs const& other) : DynamicSGroup(other) { } + inline DynamicSGroupFromTemplateArgs(DynamicSGroupFromTemplateArgs&& other) : DynamicSGroup(other) { } + inline DynamicSGroupFromTemplateArgs& operator=(const DynamicSGroupFromTemplateArgs& o) { DynamicSGroup::operator=(o); return *this; } + inline 
DynamicSGroupFromTemplateArgs& operator=(DynamicSGroupFromTemplateArgs&& o) { DynamicSGroup::operator=(o); return *this; } + + private: + template + inline void add_all(internal::type_list) + { + add(Gen1()); + add_all(internal::type_list()); + } + + inline void add_all(internal::type_list<>) + { + } +}; + +inline DynamicSGroup::GroupElement DynamicSGroup::mul(GroupElement g1, GroupElement g2) const +{ + eigen_internal_assert(g1.representation.size() == m_numIndices); + eigen_internal_assert(g2.representation.size() == m_numIndices); + + GroupElement result; + result.representation.reserve(m_numIndices); + for (std::size_t i = 0; i < m_numIndices; i++) { + int v = g2.representation[g1.representation[i]]; + eigen_assert(v >= 0); + result.representation.push_back(v); + } + result.flags = g1.flags ^ g2.flags; + return result; +} + +inline void DynamicSGroup::add(int one, int two, int flags) +{ + eigen_assert(one >= 0); + eigen_assert(two >= 0); + eigen_assert(one != two); + + if ((std::size_t)one >= m_numIndices || (std::size_t)two >= m_numIndices) { + std::size_t newNumIndices = (one > two) ? 
one : two + 1; + for (auto& gelem : m_elements) { + gelem.representation.reserve(newNumIndices); + for (std::size_t i = m_numIndices; i < newNumIndices; i++) + gelem.representation.push_back(i); + } + m_numIndices = newNumIndices; + } + + Generator g{one, two, flags}; + GroupElement e = ge(g); + + /* special case for first generator */ + if (m_elements.size() == 1) { + while (!e.isId()) { + m_elements.push_back(e); + e = mul(e, g); + } + + if (e.flags > 0) + updateGlobalFlags(e.flags); + + // only add in case we didn't have identity + if (m_elements.size() > 1) + m_generators.push_back(g); + return; + } + + int p = findElement(e); + if (p >= 0) { + updateGlobalFlags(p); + return; + } + + std::size_t coset_order = m_elements.size(); + m_elements.push_back(e); + for (std::size_t i = 1; i < coset_order; i++) + m_elements.push_back(mul(m_elements[i], e)); + m_generators.push_back(g); + + std::size_t coset_rep = coset_order; + do { + for (auto g : m_generators) { + e = mul(m_elements[coset_rep], g); + p = findElement(e); + if (p < 0) { + // element not yet in group + m_elements.push_back(e); + for (std::size_t i = 1; i < coset_order; i++) + m_elements.push_back(mul(m_elements[i], e)); + } else if (p > 0) { + updateGlobalFlags(p); + } + } + coset_rep += coset_order; + } while (coset_rep < m_elements.size()); +} + +inline void DynamicSGroup::updateGlobalFlags(int flagDiffOfSameGenerator) +{ + switch (flagDiffOfSameGenerator) { + case 0: + default: + // nothing happened + break; + case NegationFlag: + // every element is it's own negative => whole tensor is zero + m_globalFlags |= GlobalZeroFlag; + break; + case ConjugationFlag: + // every element is it's own conjugate => whole tensor is real + m_globalFlags |= GlobalRealFlag; + break; + case (NegationFlag | ConjugationFlag): + // every element is it's own negative conjugate => whole tensor is imaginary + m_globalFlags |= GlobalImagFlag; + break; + /* NOTE: + * since GlobalZeroFlag == GlobalRealFlag | GlobalImagFlag, if 
one generator + * causes the tensor to be real and the next one to be imaginary, this will + * trivially give the correct result + */ + } +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSORSYMMETRY_DYNAMICSYMMETRY_H + +/* + * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; + */ diff --git a/src/EigenUnsupported/CXX11/src/TensorSymmetry/StaticSymmetry.h b/src/EigenUnsupported/CXX11/src/TensorSymmetry/StaticSymmetry.h new file mode 100644 index 0000000..942293b --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/TensorSymmetry/StaticSymmetry.h @@ -0,0 +1,236 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSORSYMMETRY_STATICSYMMETRY_H +#define EIGEN_CXX11_TENSORSYMMETRY_STATICSYMMETRY_H + +namespace Eigen { + +namespace internal { + +template struct tensor_static_symgroup_permutate; + +template +struct tensor_static_symgroup_permutate> +{ + constexpr static std::size_t N = sizeof...(nn); + + template + constexpr static inline std::array run(const std::array& indices) + { + return {{indices[nn]...}}; + } +}; + +template +struct tensor_static_symgroup_element +{ + typedef indices_ indices; + constexpr static int flags = flags_; +}; + +template +struct tensor_static_symgroup_element_ctor +{ + typedef tensor_static_symgroup_element< + typename gen_numeric_list_swapped_pair::type, + Gen::Flags + > type; +}; + +template +struct tensor_static_symgroup_identity_ctor +{ + typedef tensor_static_symgroup_element< + typename gen_numeric_list::type, + 0 + > type; +}; + +template +struct tensor_static_symgroup_multiply_helper +{ + template + constexpr static inline numeric_list::value...> helper(numeric_list) { + return 
numeric_list::value...>(); + } +}; + +template +struct tensor_static_symgroup_multiply +{ + private: + typedef typename A::indices iia; + typedef typename B::indices iib; + constexpr static int ffa = A::flags; + constexpr static int ffb = B::flags; + + public: + static_assert(iia::count == iib::count, "Cannot multiply symmetry elements with different number of indices."); + + typedef tensor_static_symgroup_element< + decltype(tensor_static_symgroup_multiply_helper::helper(iia())), + ffa ^ ffb + > type; +}; + +template +struct tensor_static_symgroup_equality +{ + typedef typename A::indices iia; + typedef typename B::indices iib; + constexpr static int ffa = A::flags; + constexpr static int ffb = B::flags; + static_assert(iia::count == iib::count, "Cannot compare symmetry elements with different number of indices."); + + constexpr static bool value = is_same::value; + + private: + /* this should be zero if they are identical, or else the tensor + * will be forced to be pure real, pure imaginary or even pure zero + */ + constexpr static int flags_cmp_ = ffa ^ ffb; + + /* either they are not equal, then we don't care whether the flags + * match, or they are equal, and then we have to check + */ + constexpr static bool is_zero = value && flags_cmp_ == NegationFlag; + constexpr static bool is_real = value && flags_cmp_ == ConjugationFlag; + constexpr static bool is_imag = value && flags_cmp_ == (NegationFlag | ConjugationFlag); + + public: + constexpr static int global_flags = + (is_real ? GlobalRealFlag : 0) | + (is_imag ? GlobalImagFlag : 0) | + (is_zero ? GlobalZeroFlag : 0); +}; + +template +struct tensor_static_symgroup +{ + typedef StaticSGroup type; + constexpr static std::size_t size = type::static_size; +}; + +template +constexpr static inline std::array tensor_static_symgroup_index_permute(std::array idx, internal::numeric_list, internal::numeric_list) +{ + return {{ idx[ii]..., idx[jj]... 
}}; +} + +template +static inline std::vector tensor_static_symgroup_index_permute(std::vector idx, internal::numeric_list) +{ + std::vector result{{ idx[ii]... }}; + std::size_t target_size = idx.size(); + for (std::size_t i = result.size(); i < target_size; i++) + result.push_back(idx[i]); + return result; +} + +template struct tensor_static_symgroup_do_apply; + +template +struct tensor_static_symgroup_do_apply> +{ + template + static inline RV run(const std::array& idx, RV initial, Args&&... args) + { + static_assert(NumIndices >= SGNumIndices, "Can only apply symmetry group to objects that have at least the required amount of indices."); + typedef typename internal::gen_numeric_list::type remaining_indices; + initial = Op::run(tensor_static_symgroup_index_permute(idx, typename first::indices(), remaining_indices()), first::flags, initial, std::forward(args)...); + return tensor_static_symgroup_do_apply>::template run(idx, initial, args...); + } + + template + static inline RV run(const std::vector& idx, RV initial, Args&&... args) + { + eigen_assert(idx.size() >= SGNumIndices && "Can only apply symmetry group to objects that have at least the required amount of indices."); + initial = Op::run(tensor_static_symgroup_index_permute(idx, typename first::indices()), first::flags, initial, std::forward(args)...); + return tensor_static_symgroup_do_apply>::template run(idx, initial, args...); + } +}; + +template +struct tensor_static_symgroup_do_apply> +{ + template + static inline RV run(const std::array&, RV initial, Args&&...) + { + // do nothing + return initial; + } + + template + static inline RV run(const std::vector&, RV initial, Args&&...) 
+ { + // do nothing + return initial; + } +}; + +} // end namespace internal + +template +class StaticSGroup +{ + constexpr static std::size_t NumIndices = internal::tensor_symmetry_num_indices::value; + typedef internal::group_theory::enumerate_group_elements< + internal::tensor_static_symgroup_multiply, + internal::tensor_static_symgroup_equality, + typename internal::tensor_static_symgroup_identity_ctor::type, + internal::type_list::type...> + > group_elements; + typedef typename group_elements::type ge; + public: + constexpr inline StaticSGroup() {} + constexpr inline StaticSGroup(const StaticSGroup&) {} + constexpr inline StaticSGroup(StaticSGroup&&) {} + + template + static inline RV apply(const std::array& idx, RV initial, Args&&... args) + { + return internal::tensor_static_symgroup_do_apply::template run(idx, initial, args...); + } + + template + static inline RV apply(const std::vector& idx, RV initial, Args&&... args) + { + eigen_assert(idx.size() == NumIndices); + return internal::tensor_static_symgroup_do_apply::template run(idx, initial, args...); + } + + constexpr static std::size_t static_size = ge::count; + + constexpr static inline std::size_t size() { + return ge::count; + } + constexpr static inline int globalFlags() { return group_elements::global_flags; } + + template + inline internal::tensor_symmetry_value_setter> operator()(Tensor_& tensor, typename Tensor_::Index firstIndex, IndexTypes... 
otherIndices) const + { + static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); + return operator()(tensor, std::array{{firstIndex, otherIndices...}}); + } + + template + inline internal::tensor_symmetry_value_setter> operator()(Tensor_& tensor, std::array const& indices) const + { + return internal::tensor_symmetry_value_setter>(tensor, *this, indices); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSORSYMMETRY_STATICSYMMETRY_H + +/* + * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; + */ diff --git a/src/EigenUnsupported/CXX11/src/TensorSymmetry/Symmetry.h b/src/EigenUnsupported/CXX11/src/TensorSymmetry/Symmetry.h new file mode 100644 index 0000000..879d6cd --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/TensorSymmetry/Symmetry.h @@ -0,0 +1,338 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_CXX11_TENSORSYMMETRY_SYMMETRY_H +#define EIGEN_CXX11_TENSORSYMMETRY_SYMMETRY_H + +namespace Eigen { + +enum { + NegationFlag = 0x01, + ConjugationFlag = 0x02 +}; + +enum { + GlobalRealFlag = 0x01, + GlobalImagFlag = 0x02, + GlobalZeroFlag = 0x03 +}; + +namespace internal { + +template struct tensor_symmetry_pre_analysis; +template struct tensor_static_symgroup; +template struct tensor_static_symgroup_if; +template struct tensor_symmetry_calculate_flags; +template struct tensor_symmetry_assign_value; +template struct tensor_symmetry_num_indices; + +} // end namespace internal + +template +struct Symmetry +{ + static_assert(One_ != Two_, "Symmetries must cover distinct indices."); + constexpr static int One = One_; + constexpr static int Two = Two_; + constexpr static int Flags = 0; +}; + +template +struct AntiSymmetry +{ + static_assert(One_ != Two_, "Symmetries must cover distinct indices."); + constexpr static int One = One_; + constexpr static int Two = Two_; + constexpr static int Flags = NegationFlag; +}; + +template +struct Hermiticity +{ + static_assert(One_ != Two_, "Symmetries must cover distinct indices."); + constexpr static int One = One_; + constexpr static int Two = Two_; + constexpr static int Flags = ConjugationFlag; +}; + +template +struct AntiHermiticity +{ + static_assert(One_ != Two_, "Symmetries must cover distinct indices."); + constexpr static int One = One_; + constexpr static int Two = Two_; + constexpr static int Flags = ConjugationFlag | NegationFlag; +}; + +/** \class DynamicSGroup + * \ingroup TensorSymmetry_Module + * + * \brief Dynamic symmetry group + * + * The %DynamicSGroup class represents a symmetry group that need not be known at + * compile time. 
It is useful if one wants to support arbitrary run-time definable + * symmetries for tensors, but it is also instantiated if a symmetry group is defined + * at compile time that would be either too large for the compiler to reasonably + * generate (using templates to calculate this at compile time is very inefficient) + * or that the compiler could generate the group but that it wouldn't make sense to + * unroll the loop for setting coefficients anymore. + */ +class DynamicSGroup; + +/** \internal + * + * \class DynamicSGroupFromTemplateArgs + * \ingroup TensorSymmetry_Module + * + * \brief Dynamic symmetry group, initialized from template arguments + * + * This class is a child class of DynamicSGroup. It uses the template arguments + * specified to initialize itself. + */ +template +class DynamicSGroupFromTemplateArgs; + +/** \class StaticSGroup + * \ingroup TensorSymmetry_Module + * + * \brief Static symmetry group + * + * This class represents a symmetry group that is known and resolved completely + * at compile time. Ideally, no run-time penalty is incurred compared to the + * manual unrolling of the symmetry. + * + * CAUTION: + * + * Do not use this class directly for large symmetry groups. The compiler + * may run into a limit, or segfault or at the very least will take a very, + * very, very long time to compile the code. Use the SGroup class instead + * if you want a static group. That class contains logic that will + * automatically select the DynamicSGroup class instead if the symmetry + * group becomes too large. (In that case, unrolling may not even be + * beneficial.) + */ +template +class StaticSGroup; + +/** \class SGroup + * \ingroup TensorSymmetry_Module + * + * \brief Symmetry group, initialized from template arguments + * + * This class represents a symmetry group whose generators are already + * known at compile time. It may or may not be resolved at compile time, + * depending on the estimated size of the group.
+ * + * \sa StaticSGroup + * \sa DynamicSGroup + */ +template +class SGroup : public internal::tensor_symmetry_pre_analysis::value, Gen...>::root_type +{ + public: + constexpr static std::size_t NumIndices = internal::tensor_symmetry_num_indices::value; + typedef typename internal::tensor_symmetry_pre_analysis::root_type Base; + + // make standard constructors + assignment operators public + inline SGroup() : Base() { } + inline SGroup(const SGroup& other) : Base(other) { } + inline SGroup(SGroup&& other) : Base(other) { } + inline SGroup& operator=(const SGroup& other) { Base::operator=(other); return *this; } + inline SGroup& operator=(SGroup&& other) { Base::operator=(other); return *this; } + + // all else is defined in the base class +}; + +namespace internal { + +template struct tensor_symmetry_num_indices +{ + constexpr static std::size_t value = 1; +}; + +template struct tensor_symmetry_num_indices, Sym...> +{ +private: + constexpr static std::size_t One = static_cast(One_); + constexpr static std::size_t Two = static_cast(Two_); + constexpr static std::size_t Three = tensor_symmetry_num_indices::value; + + // don't use std::max, since it's not constexpr until C++14... + constexpr static std::size_t maxOneTwoPlusOne = ((One > Two) ? One : Two) + 1; +public: + constexpr static std::size_t value = (maxOneTwoPlusOne > Three) ? 
maxOneTwoPlusOne : Three; +}; + +template struct tensor_symmetry_num_indices, Sym...> + : public tensor_symmetry_num_indices, Sym...> {}; +template struct tensor_symmetry_num_indices, Sym...> + : public tensor_symmetry_num_indices, Sym...> {}; +template struct tensor_symmetry_num_indices, Sym...> + : public tensor_symmetry_num_indices, Sym...> {}; + +/** \internal + * + * \class tensor_symmetry_pre_analysis + * \ingroup TensorSymmetry_Module + * + * \brief Pre-select whether to use a static or dynamic symmetry group + * + * When a symmetry group could in principle be determined at compile time, + * this template implements the logic whether to actually do that or whether + * to rather defer that to runtime. + * + * The logic is as follows: + *
+ *
No generators (trivial symmetry):
+ *
Use a trivial static group. Ideally, this has no performance impact + * compared to not using symmetry at all. In practice, this might not + * be the case.
+ *
More than 4 generators:
+ *
Calculate the group at run time, it is likely far too large for the + * compiler to be able to properly generate it in a realistic time.
+ *
Up to and including 4 generators:
+ *
Actually enumerate all group elements, but then check how many there + * are. If there are more than 16, it is unlikely that unrolling the + * loop (as is done in the static compile-time case) is sensible, so + * use a dynamic group instead. If there are at most 16 elements, actually + * use that static group. Note that the largest group with 4 generators + * still compiles with reasonable resources.
+ *
+ * + * Note: Example compile time performance with g++-4.6 on an Intel Core i5-3470 + * with 16 GiB RAM (all generators non-redundant and the subgroups don't + * factorize): + * + * # Generators -O0 -ggdb -O2 + * ------------------------------------------------------------------- + * 1 0.5 s / 250 MiB 0.45s / 230 MiB + * 2 0.5 s / 260 MiB 0.5 s / 250 MiB + * 3 0.65s / 310 MiB 0.62s / 310 MiB + * 4 2.2 s / 860 MiB 1.7 s / 770 MiB + * 5 130 s / 13000 MiB 120 s / 11000 MiB + * + * It is clear that everything is still very efficient up to 4 generators, then + * the memory and CPU requirements become unreasonable. Thus we only instantiate + * the template group theory logic if the number of generators supplied is 4 or + * lower, otherwise this will be forced to be done during runtime, where the + * algorithm is reasonably fast. + */ +template +struct tensor_symmetry_pre_analysis +{ + typedef StaticSGroup<> root_type; +}; + +template +struct tensor_symmetry_pre_analysis +{ + constexpr static std::size_t max_static_generators = 4; + constexpr static std::size_t max_static_elements = 16; + typedef tensor_static_symgroup_if<(sizeof...(Gens_) + 1 <= max_static_generators), NumIndices, Gen_, Gens_...> helper; + constexpr static std::size_t possible_size = helper::size; + + typedef typename conditional< + possible_size == 0 || possible_size >= max_static_elements, + DynamicSGroupFromTemplateArgs, + typename helper::type + >::type root_type; +}; + +template +struct tensor_static_symgroup_if +{ + constexpr static std::size_t size = 0; + typedef void type; +}; + +template +struct tensor_static_symgroup_if : tensor_static_symgroup {}; + +template +struct tensor_symmetry_assign_value +{ + typedef typename Tensor_::Index Index; + typedef typename Tensor_::Scalar Scalar; + constexpr static std::size_t NumIndices = Tensor_::NumIndices; + + static inline int run(const std::array& transformed_indices, int transformation_flags, int dummy, Tensor_& tensor, const Scalar& value_) + { + 
Scalar value(value_); + if (transformation_flags & ConjugationFlag) + value = numext::conj(value); + if (transformation_flags & NegationFlag) + value = -value; + tensor.coeffRef(transformed_indices) = value; + return dummy; + } +}; + +template +struct tensor_symmetry_calculate_flags +{ + typedef typename Tensor_::Index Index; + constexpr static std::size_t NumIndices = Tensor_::NumIndices; + + static inline int run(const std::array& transformed_indices, int transform_flags, int current_flags, const std::array& orig_indices) + { + if (transformed_indices == orig_indices) { + if (transform_flags & (ConjugationFlag | NegationFlag)) + return current_flags | GlobalImagFlag; // anti-hermitian diagonal + else if (transform_flags & ConjugationFlag) + return current_flags | GlobalRealFlag; // hermitian diagonal + else if (transform_flags & NegationFlag) + return current_flags | GlobalZeroFlag; // anti-symmetric diagonal + } + return current_flags; + } +}; + +template +class tensor_symmetry_value_setter +{ + public: + typedef typename Tensor_::Index Index; + typedef typename Tensor_::Scalar Scalar; + constexpr static std::size_t NumIndices = Tensor_::NumIndices; + + inline tensor_symmetry_value_setter(Tensor_& tensor, Symmetry_ const& symmetry, std::array const& indices) + : m_tensor(tensor), m_symmetry(symmetry), m_indices(indices) { } + + inline tensor_symmetry_value_setter& operator=(Scalar const& value) + { + doAssign(value); + return *this; + } + private: + Tensor_& m_tensor; + Symmetry_ m_symmetry; + std::array m_indices; + + inline void doAssign(Scalar const& value) + { + #ifdef EIGEN_TENSOR_SYMMETRY_CHECK_VALUES + int value_flags = m_symmetry.template apply, int>(m_indices, m_symmetry.globalFlags(), m_indices); + if (value_flags & GlobalRealFlag) + eigen_assert(numext::imag(value) == 0); + if (value_flags & GlobalImagFlag) + eigen_assert(numext::real(value) == 0); + #endif + m_symmetry.template apply, int>(m_indices, 0, m_tensor, value); + } +}; + +} // end namespace 
internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSORSYMMETRY_SYMMETRY_H + +/* + * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; + */ diff --git a/src/EigenUnsupported/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h b/src/EigenUnsupported/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h new file mode 100644 index 0000000..54bf9db --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h @@ -0,0 +1,669 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSORSYMMETRY_TEMPLATEGROUPTHEORY_H +#define EIGEN_CXX11_TENSORSYMMETRY_TEMPLATEGROUPTHEORY_H + +namespace Eigen { + +namespace internal { + +namespace group_theory { + +/** \internal + * \file CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h + * This file contains C++ templates that implement group theory algorithms. + * + * The algorithms allow for a compile-time analysis of finite groups. + * + * Currently only Dimino's algorithm is implemented, which returns a list + * of all elements in a group given a set of (possibly redundant) generators. + * (One could also do that with the so-called orbital algorithm, but that + * is much more expensive and usually has no advantages.) + */ + +/********************************************************************** + * "Ok kid, here is where it gets complicated." 
+ * - Amelia Pond in the "Doctor Who" episode + * "The Big Bang" + * + * Dimino's algorithm + * ================== + * + * The following is Dimino's algorithm in sequential form: + * + * Input: identity element, list of generators, equality check, + * multiplication operation + * Output: list of group elements + * + * 1. add identity element + * 2. remove identities from list of generators + * 3. add all powers of first generator that aren't the + * identity element + * 4. go through all remaining generators: + * a. if generator is already in the list of elements + * -> do nothing + * b. otherwise + * i. remember current # of elements + * (i.e. the size of the current subgroup) + * ii. add all current elements (which includes + * the identity) each multiplied from right + * with the current generator to the group + * iii. add all remaining cosets that are generated + * by products of the new generator with itself + * and all other generators seen so far + * + * In functional form, this is implemented as a long set of recursive + * templates that have a complicated relationship. + * + * The main interface for Dimino's algorithm is the template + * enumerate_group_elements. All lists are implemented as variadic + * type_list and numeric_list + * templates. + * + * 'Calling' templates is usually done via typedefs. + * + * This algorithm is an extended version of the basic version. The + * extension consists in the fact that each group element has a set + * of flags associated with it. Multiplication of two group elements + * with each other results in a group element whose flags are the + * XOR of the flags of the previous elements. Each time the algorithm + * notices that a group element it just calculated is already in the + * list of current elements, the flags of both will be compared and + * added to the so-called 'global flags' of the group. 
+ * + * The rationale behind this extension is that this allows not only + * for the description of symmetries between tensor indices, but + * also allows for the description of hermiticity, antisymmetry and + * antihermiticity. Negation and conjugation each are a specific bit + * in the flags value and if two different ways to reach a group + * element lead to two different flags, this poses a constraint on + * the allowed values of the resulting tensor. For example, if a + * group element is reached both with and without the conjugation + * flag, it is clear that the resulting tensor has to be real. + * + * Note that this flag mechanism is quite generic and may have other + * uses beyond tensor properties. + * + * IMPORTANT: + * This algorithm assumes the group to be finite. If you try to + * run it with a group that's infinite, the algorithm will only + * terminate once you hit a compiler limit (max template depth). + * Also note that trying to use this implementation to create a + * very large group will probably either make you hit the same + * limit, cause the compiler to segfault or at the very least + * take a *really* long time (hours, days, weeks - sic!) to + * compile. It is not recommended to plug in more than 4 + * generators, unless they are independent of each other. + */ + +/** \internal + * + * \class strip_identities + * \ingroup CXX11_TensorSymmetry_Module + * + * \brief Cleanse a list of group elements of the identity element + * + * This template is used to make a first pass through all initial + * generators of Dimino's algorithm and remove the identity + * elements. + * + * \sa enumerate_group_elements + */ +template class Equality, typename id, typename L> struct strip_identities; + +template< + template class Equality, + typename id, + typename t, + typename... 
ts +> +struct strip_identities> +{ + typedef typename conditional< + Equality::value, + typename strip_identities>::type, + typename concat, typename strip_identities>::type>::type + >::type type; + constexpr static int global_flags = Equality::global_flags | strip_identities>::global_flags; +}; + +template< + template class Equality, + typename id + EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, ts) +> +struct strip_identities> +{ + typedef type_list<> type; + constexpr static int global_flags = 0; +}; + +/** \internal + * + * \class dimino_first_step_elements_helper + * \ingroup CXX11_TensorSymmetry_Module + * + * \brief Recursive template that adds powers of the first generator to the list of group elements + * + * This template calls itself recursively to add powers of the first + * generator to the list of group elements. It stops if it reaches + * the identity element again. + * + * \sa enumerate_group_elements, dimino_first_step_elements + */ +template< + template class Multiply, + template class Equality, + typename id, + typename g, + typename current_element, + typename elements, + bool dont_add_current_element // = false +> +struct dimino_first_step_elements_helper +#ifndef EIGEN_PARSED_BY_DOXYGEN + : // recursive inheritance is too difficult for Doxygen + public dimino_first_step_elements_helper< + Multiply, + Equality, + id, + g, + typename Multiply::type, + typename concat>::type, + Equality::type, id>::value + > {}; + +template< + template class Multiply, + template class Equality, + typename id, + typename g, + typename current_element, + typename elements +> +struct dimino_first_step_elements_helper +#endif // EIGEN_PARSED_BY_DOXYGEN +{ + typedef elements type; + constexpr static int global_flags = Equality::global_flags; +}; + +/** \internal + * + * \class dimino_first_step_elements + * \ingroup CXX11_TensorSymmetry_Module + * + * \brief Add all powers of the first generator to the list of group elements + * + * This template takes the first non-identity 
generator and generates the initial + * list of elements which consists of all powers of that generator. For a group + * with just one generator, it would be fully enumerated after this. + * + * \sa enumerate_group_elements + */ +template< + template class Multiply, + template class Equality, + typename id, + typename generators +> +struct dimino_first_step_elements +{ + typedef typename get<0, generators>::type first_generator; + typedef typename skip<1, generators>::type next_generators; + typedef type_list generators_done; + + typedef dimino_first_step_elements_helper< + Multiply, + Equality, + id, + first_generator, + first_generator, + type_list, + false + > helper; + typedef typename helper::type type; + constexpr static int global_flags = helper::global_flags; +}; + +/** \internal + * + * \class dimino_get_coset_elements + * \ingroup CXX11_TensorSymmetry_Module + * + * \brief Generate all elements of a specific coset + * + * This template generates all the elements of a specific coset by + * multiplying all elements in the given subgroup with the new + * coset representative. Note that the first element of the + * subgroup is always the identity element, so the first element of + * the result of this template is going to be the coset + * representative itself. + * + * Note that this template accepts an additional boolean parameter + * that specifies whether to actually generate the coset (true) or + * just return an empty list (false). 
+ * + * \sa enumerate_group_elements, dimino_add_cosets_for_rep + */ +template< + template class Multiply, + typename sub_group_elements, + typename new_coset_rep, + bool generate_coset // = true +> +struct dimino_get_coset_elements +{ + typedef typename apply_op_from_right::type type; +}; + +template< + template class Multiply, + typename sub_group_elements, + typename new_coset_rep +> +struct dimino_get_coset_elements +{ + typedef type_list<> type; +}; + +/** \internal + * + * \class dimino_add_cosets_for_rep + * \ingroup CXX11_TensorSymmetry_Module + * + * \brief Recursive template for adding coset spaces + * + * This template multiplies the coset representative with a generator + * from the list of previous generators. If the new element is not in + * the group already, it adds the corresponding coset. Finally it + * proceeds to call itself with the next generator from the list. + * + * \sa enumerate_group_elements, dimino_add_all_coset_spaces + */ +template< + template class Multiply, + template class Equality, + typename id, + typename sub_group_elements, + typename elements, + typename generators, + typename rep_element, + int sub_group_size +> +struct dimino_add_cosets_for_rep; + +template< + template class Multiply, + template class Equality, + typename id, + typename sub_group_elements, + typename elements, + typename g, + typename... 
gs, + typename rep_element, + int sub_group_size +> +struct dimino_add_cosets_for_rep, rep_element, sub_group_size> +{ + typedef typename Multiply::type new_coset_rep; + typedef contained_in_list_gf _cil; + constexpr static bool add_coset = !_cil::value; + + typedef typename dimino_get_coset_elements< + Multiply, + sub_group_elements, + new_coset_rep, + add_coset + >::type coset_elements; + + typedef dimino_add_cosets_for_rep< + Multiply, + Equality, + id, + sub_group_elements, + typename concat::type, + type_list, + rep_element, + sub_group_size + > _helper; + + typedef typename _helper::type type; + constexpr static int global_flags = _cil::global_flags | _helper::global_flags; + + /* Note that we don't have to update global flags here, since + * we will only add these elements if they are not part of + * the group already. But that only happens if the coset rep + * is not already in the group, so the check for the coset rep + * will catch this. + */ +}; + +template< + template class Multiply, + template class Equality, + typename id, + typename sub_group_elements, + typename elements + EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, empty), + typename rep_element, + int sub_group_size +> +struct dimino_add_cosets_for_rep, rep_element, sub_group_size> +{ + typedef elements type; + constexpr static int global_flags = 0; +}; + +/** \internal + * + * \class dimino_add_all_coset_spaces + * \ingroup CXX11_TensorSymmetry_Module + * + * \brief Recursive template for adding all coset spaces for a new generator + * + * This template tries to go through the list of generators (with + * the help of the dimino_add_cosets_for_rep template) as long as + * it still finds elements that are not part of the group and add + * the corresponding cosets. 
+ * + * \sa enumerate_group_elements, dimino_add_cosets_for_rep + */ +template< + template class Multiply, + template class Equality, + typename id, + typename sub_group_elements, + typename elements, + typename generators, + int sub_group_size, + int rep_pos, + bool stop_condition // = false +> +struct dimino_add_all_coset_spaces +{ + typedef typename get::type rep_element; + typedef dimino_add_cosets_for_rep< + Multiply, + Equality, + id, + sub_group_elements, + elements, + generators, + rep_element, + sub_group_elements::count + > _ac4r; + typedef typename _ac4r::type new_elements; + + constexpr static int new_rep_pos = rep_pos + sub_group_elements::count; + constexpr static bool new_stop_condition = new_rep_pos >= new_elements::count; + + typedef dimino_add_all_coset_spaces< + Multiply, + Equality, + id, + sub_group_elements, + new_elements, + generators, + sub_group_size, + new_rep_pos, + new_stop_condition + > _helper; + + typedef typename _helper::type type; + constexpr static int global_flags = _helper::global_flags | _ac4r::global_flags; +}; + +template< + template class Multiply, + template class Equality, + typename id, + typename sub_group_elements, + typename elements, + typename generators, + int sub_group_size, + int rep_pos +> +struct dimino_add_all_coset_spaces +{ + typedef elements type; + constexpr static int global_flags = 0; +}; + +/** \internal + * + * \class dimino_add_generator + * \ingroup CXX11_TensorSymmetry_Module + * + * \brief Enlarge the group by adding a new generator. + * + * It accepts a boolean parameter that determines if the generator is redundant, + * i.e. was already seen in the group. In that case, it reduces to a no-op. 
+ * + * \sa enumerate_group_elements, dimino_add_all_coset_spaces + */ +template< + template class Multiply, + template class Equality, + typename id, + typename elements, + typename generators_done, + typename current_generator, + bool redundant // = false +> +struct dimino_add_generator +{ + /* this template is only called if the generator is not redundant + * => all elements of the group multiplied with the new generator + * are going to be new elements of the most trivial coset space + */ + typedef typename apply_op_from_right::type multiplied_elements; + typedef typename concat::type new_elements; + + constexpr static int rep_pos = elements::count; + + typedef dimino_add_all_coset_spaces< + Multiply, + Equality, + id, + elements, // elements of previous subgroup + new_elements, + typename concat>::type, + elements::count, // size of previous subgroup + rep_pos, + false // don't stop (because rep_pos >= new_elements::count is always false at this point) + > _helper; + typedef typename _helper::type type; + constexpr static int global_flags = _helper::global_flags; +}; + +template< + template class Multiply, + template class Equality, + typename id, + typename elements, + typename generators_done, + typename current_generator +> +struct dimino_add_generator +{ + // redundant case + typedef elements type; + constexpr static int global_flags = 0; +}; + +/** \internal + * + * \class dimino_add_remaining_generators + * \ingroup CXX11_TensorSymmetry_Module + * + * \brief Recursive template that adds all remaining generators to a group + * + * Loop through the list of generators that remain and successively + * add them to the group. 
+ * + * \sa enumerate_group_elements, dimino_add_generator + */ +template< + template class Multiply, + template class Equality, + typename id, + typename generators_done, + typename remaining_generators, + typename elements +> +struct dimino_add_remaining_generators +{ + typedef typename get<0, remaining_generators>::type first_generator; + typedef typename skip<1, remaining_generators>::type next_generators; + + typedef contained_in_list_gf _cil; + + typedef dimino_add_generator< + Multiply, + Equality, + id, + elements, + generators_done, + first_generator, + _cil::value + > _helper; + + typedef typename _helper::type new_elements; + + typedef dimino_add_remaining_generators< + Multiply, + Equality, + id, + typename concat>::type, + next_generators, + new_elements + > _next_iter; + + typedef typename _next_iter::type type; + constexpr static int global_flags = + _cil::global_flags | + _helper::global_flags | + _next_iter::global_flags; +}; + +template< + template class Multiply, + template class Equality, + typename id, + typename generators_done, + typename elements +> +struct dimino_add_remaining_generators, elements> +{ + typedef elements type; + constexpr static int global_flags = 0; +}; + +/** \internal + * + * \class enumerate_group_elements_noid + * \ingroup CXX11_TensorSymmetry_Module + * + * \brief Helper template that implements group element enumeration + * + * This is a helper template that implements the actual enumeration + * of group elements. This has been split so that the list of + * generators can be cleansed of the identity element before + * performing the actual operation. 
+ * + * \sa enumerate_group_elements + */ +template< + template class Multiply, + template class Equality, + typename id, + typename generators, + int initial_global_flags = 0 +> +struct enumerate_group_elements_noid +{ + typedef dimino_first_step_elements first_step; + typedef typename first_step::type first_step_elements; + + typedef dimino_add_remaining_generators< + Multiply, + Equality, + id, + typename first_step::generators_done, + typename first_step::next_generators, // remaining_generators + typename first_step::type // first_step elements + > _helper; + + typedef typename _helper::type type; + constexpr static int global_flags = + initial_global_flags | + first_step::global_flags | + _helper::global_flags; +}; + +// in case when no generators are specified +template< + template class Multiply, + template class Equality, + typename id, + int initial_global_flags +> +struct enumerate_group_elements_noid, initial_global_flags> +{ + typedef type_list type; + constexpr static int global_flags = initial_global_flags; +}; + +/** \internal + * + * \class enumerate_group_elements + * \ingroup CXX11_TensorSymmetry_Module + * + * \brief Enumerate all elements in a finite group + * + * This template enumerates all elements in a finite group. It accepts + * the following template parameters: + * + * \tparam Multiply The multiplication operation that multiplies two group elements + * with each other. + * \tparam Equality The equality check operation that checks if two group elements + * are equal to another. 
+ * \tparam id The identity element + * \tparam _generators A list of (possibly redundant) generators of the group + */ +template< + template class Multiply, + template class Equality, + typename id, + typename _generators +> +struct enumerate_group_elements + : public enumerate_group_elements_noid< + Multiply, + Equality, + id, + typename strip_identities::type, + strip_identities::global_flags + > +{ +}; + +} // end namespace group_theory + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSORSYMMETRY_TEMPLATEGROUPTHEORY_H + +/* + * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; + */ diff --git a/src/EigenUnsupported/CXX11/src/ThreadPool/Barrier.h b/src/EigenUnsupported/CXX11/src/ThreadPool/Barrier.h new file mode 100644 index 0000000..e4c59dc --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/ThreadPool/Barrier.h @@ -0,0 +1,67 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2018 Rasmus Munk Larsen +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +// Barrier is an object that allows one or more threads to wait until +// Notify has been called a specified number of times. + +#ifndef EIGEN_CXX11_THREADPOOL_BARRIER_H +#define EIGEN_CXX11_THREADPOOL_BARRIER_H + +namespace Eigen { + +class Barrier { + public: + Barrier(unsigned int count) : state_(count << 1), notified_(false) { + eigen_plain_assert(((count << 1) >> 1) == count); + } + ~Barrier() { eigen_plain_assert((state_ >> 1) == 0); } + + void Notify() { + unsigned int v = state_.fetch_sub(2, std::memory_order_acq_rel) - 2; + if (v != 1) { + // Clear the lowest bit (waiter flag) and check that the original state + // value was not zero. If it was zero, it means that notify was called + // more times than the original count. 
+ eigen_plain_assert(((v + 2) & ~1) != 0); + return; // either count has not dropped to 0, or waiter is not waiting + } + std::unique_lock l(mu_); + eigen_plain_assert(!notified_); + notified_ = true; + cv_.notify_all(); + } + + void Wait() { + unsigned int v = state_.fetch_or(1, std::memory_order_acq_rel); + if ((v >> 1) == 0) return; + std::unique_lock l(mu_); + while (!notified_) { + cv_.wait(l); + } + } + + private: + std::mutex mu_; + std::condition_variable cv_; + std::atomic state_; // low bit is waiter flag + bool notified_; +}; + +// Notification is an object that allows a user to wait for another +// thread to signal a notification that an event has occurred. +// +// Multiple threads can wait on the same Notification object, +// but only one caller must call Notify() on the object. +struct Notification : Barrier { + Notification() : Barrier(1){}; +}; + +} // namespace Eigen + +#endif // EIGEN_CXX11_THREADPOOL_BARRIER_H diff --git a/src/EigenUnsupported/CXX11/src/ThreadPool/EventCount.h b/src/EigenUnsupported/CXX11/src/ThreadPool/EventCount.h new file mode 100644 index 0000000..4549aa0 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/ThreadPool/EventCount.h @@ -0,0 +1,249 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Dmitry Vyukov +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H_ +#define EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H_ + +namespace Eigen { + +// EventCount allows waiting for arbitrary predicates in non-blocking +// algorithms. Think of condition variable, but wait predicate does not need to +// be protected by a mutex.
Usage: +// Waiting thread does: +// +// if (predicate) +// return act(); +// EventCount::Waiter& w = waiters[my_index]; +// ec.Prewait(); +// if (predicate) { +// ec.CancelWait(); +// return act(); +// } +// ec.CommitWait(&w); +// +// Notifying thread does: +// +// predicate = true; +// ec.Notify(true); +// +// Notify is cheap if there are no waiting threads. Prewait/CommitWait are not +// cheap, but they are executed only if the preceding predicate check has +// failed. +// +// Algorithm outline: +// There are two main variables: predicate (managed by user) and state_. +// Operation closely resembles Dekker's mutual exclusion algorithm: +// https://en.wikipedia.org/wiki/Dekker%27s_algorithm +// Waiting thread sets state_ then checks predicate, Notifying thread sets +// predicate then checks state_. Due to seq_cst fences in between these +// operations it is guaranteed that either waiter will see predicate change +// and won't block, or notifying thread will see state_ change and will unblock +// the waiter, or both. But it can't happen that both threads don't see each +// other's changes, which would lead to deadlock. +class EventCount { + public: + class Waiter; + + EventCount(MaxSizeVector& waiters) + : state_(kStackMask), waiters_(waiters) { + eigen_plain_assert(waiters.size() < (1 << kWaiterBits) - 1); + } + + ~EventCount() { + // Ensure there are no waiters. + eigen_plain_assert(state_.load() == kStackMask); + } + + // Prewait prepares for waiting. + // After calling Prewait, the thread must re-check the wait predicate + // and then call either CancelWait or CommitWait. + void Prewait() { + uint64_t state = state_.load(std::memory_order_relaxed); + for (;;) { + CheckState(state); + uint64_t newstate = state + kWaiterInc; + CheckState(newstate); + if (state_.compare_exchange_weak(state, newstate, + std::memory_order_seq_cst)) + return; + } + } + + // CommitWait commits waiting after Prewait.
+ void CommitWait(Waiter* w) { + eigen_plain_assert((w->epoch & ~kEpochMask) == 0); + w->state = Waiter::kNotSignaled; + const uint64_t me = (w - &waiters_[0]) | w->epoch; + uint64_t state = state_.load(std::memory_order_seq_cst); + for (;;) { + CheckState(state, true); + uint64_t newstate; + if ((state & kSignalMask) != 0) { + // Consume the signal and return immediately. + newstate = state - kWaiterInc - kSignalInc; + } else { + // Remove this thread from pre-wait counter and add to the waiter stack. + newstate = ((state & kWaiterMask) - kWaiterInc) | me; + w->next.store(state & (kStackMask | kEpochMask), + std::memory_order_relaxed); + } + CheckState(newstate); + if (state_.compare_exchange_weak(state, newstate, + std::memory_order_acq_rel)) { + if ((state & kSignalMask) == 0) { + w->epoch += kEpochInc; + Park(w); + } + return; + } + } + } + + // CancelWait cancels effects of the previous Prewait call. + void CancelWait() { + uint64_t state = state_.load(std::memory_order_relaxed); + for (;;) { + CheckState(state, true); + uint64_t newstate = state - kWaiterInc; + // We don't know if the thread was also notified or not, + // so we should not consume a signal unconditionally. + // Only if number of waiters is equal to number of signals, + // we know that the thread was notified and we must take away the signal. + if (((state & kWaiterMask) >> kWaiterShift) == + ((state & kSignalMask) >> kSignalShift)) + newstate -= kSignalInc; + CheckState(newstate); + if (state_.compare_exchange_weak(state, newstate, + std::memory_order_acq_rel)) + return; + } + } + + // Notify wakes one or all waiting threads. + // Must be called after changing the associated wait predicate.
+ void Notify(bool notifyAll) { + std::atomic_thread_fence(std::memory_order_seq_cst); + uint64_t state = state_.load(std::memory_order_acquire); + for (;;) { + CheckState(state); + const uint64_t waiters = (state & kWaiterMask) >> kWaiterShift; + const uint64_t signals = (state & kSignalMask) >> kSignalShift; + // Easy case: no waiters. + if ((state & kStackMask) == kStackMask && waiters == signals) return; + uint64_t newstate; + if (notifyAll) { + // Empty wait stack and set signal to number of pre-wait threads. + newstate = + (state & kWaiterMask) | (waiters << kSignalShift) | kStackMask; + } else if (signals < waiters) { + // There is a thread in pre-wait state, unblock it. + newstate = state + kSignalInc; + } else { + // Pop a waiter from list and unpark it. + Waiter* w = &waiters_[state & kStackMask]; + uint64_t next = w->next.load(std::memory_order_relaxed); + newstate = (state & (kWaiterMask | kSignalMask)) | next; + } + CheckState(newstate); + if (state_.compare_exchange_weak(state, newstate, + std::memory_order_acq_rel)) { + if (!notifyAll && (signals < waiters)) + return; // unblocked pre-wait thread + if ((state & kStackMask) == kStackMask) return; + Waiter* w = &waiters_[state & kStackMask]; + if (!notifyAll) w->next.store(kStackMask, std::memory_order_relaxed); + Unpark(w); + return; + } + } + } + + class Waiter { + friend class EventCount; + // Align to 128 byte boundary to prevent false sharing with other Waiter + // objects in the same vector. + EIGEN_ALIGN_TO_BOUNDARY(128) std::atomic next; + std::mutex mu; + std::condition_variable cv; + uint64_t epoch = 0; + unsigned state = kNotSignaled; + enum { + kNotSignaled, + kWaiting, + kSignaled, + }; + }; + + private: + // State_ layout: + // - low kWaiterBits is a stack of waiters committed wait + // (indexes in waiters_ array are used as stack elements, + // kStackMask means empty stack). + // - next kWaiterBits is count of waiters in prewait state. + // - next kWaiterBits is count of pending signals. 
+ // - remaining bits are ABA counter for the stack. + // (stored in Waiter node and incremented on push). + static const uint64_t kWaiterBits = 14; + static const uint64_t kStackMask = (1ull << kWaiterBits) - 1; + static const uint64_t kWaiterShift = kWaiterBits; + static const uint64_t kWaiterMask = ((1ull << kWaiterBits) - 1) + << kWaiterShift; + static const uint64_t kWaiterInc = 1ull << kWaiterShift; + static const uint64_t kSignalShift = 2 * kWaiterBits; + static const uint64_t kSignalMask = ((1ull << kWaiterBits) - 1) + << kSignalShift; + static const uint64_t kSignalInc = 1ull << kSignalShift; + static const uint64_t kEpochShift = 3 * kWaiterBits; + static const uint64_t kEpochBits = 64 - kEpochShift; + static const uint64_t kEpochMask = ((1ull << kEpochBits) - 1) << kEpochShift; + static const uint64_t kEpochInc = 1ull << kEpochShift; + std::atomic state_; + MaxSizeVector& waiters_; + + static void CheckState(uint64_t state, bool waiter = false) { + static_assert(kEpochBits >= 20, "not enough bits to prevent ABA problem"); + const uint64_t waiters = (state & kWaiterMask) >> kWaiterShift; + const uint64_t signals = (state & kSignalMask) >> kSignalShift; + eigen_plain_assert(waiters >= signals); + eigen_plain_assert(waiters < (1 << kWaiterBits) - 1); + eigen_plain_assert(!waiter || waiters > 0); + (void)waiters; + (void)signals; + } + + void Park(Waiter* w) { + std::unique_lock lock(w->mu); + while (w->state != Waiter::kSignaled) { + w->state = Waiter::kWaiting; + w->cv.wait(lock); + } + } + + void Unpark(Waiter* w) { + for (Waiter* next; w; w = next) { + uint64_t wnext = w->next.load(std::memory_order_relaxed) & kStackMask; + next = wnext == kStackMask ? nullptr : &waiters_[wnext]; + unsigned state; + { + std::unique_lock lock(w->mu); + state = w->state; + w->state = Waiter::kSignaled; + } + // Avoid notifying if it wasn't waiting. 
+ if (state == Waiter::kWaiting) w->cv.notify_one(); + } + } + + EventCount(const EventCount&) = delete; + void operator=(const EventCount&) = delete; +}; + +} // namespace Eigen + +#endif // EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H_ diff --git a/src/EigenUnsupported/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/src/EigenUnsupported/CXX11/src/ThreadPool/NonBlockingThreadPool.h new file mode 100644 index 0000000..23a2b54 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/ThreadPool/NonBlockingThreadPool.h @@ -0,0 +1,486 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Dmitry Vyukov +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H +#define EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H + +namespace Eigen { + +template +class ThreadPoolTempl : public Eigen::ThreadPoolInterface { + public: + typedef typename Environment::Task Task; + typedef RunQueue Queue; + + ThreadPoolTempl(int num_threads, Environment env = Environment()) + : ThreadPoolTempl(num_threads, true, env) {} + + ThreadPoolTempl(int num_threads, bool allow_spinning, + Environment env = Environment()) + : env_(env), + num_threads_(num_threads), + allow_spinning_(allow_spinning), + thread_data_(num_threads), + all_coprimes_(num_threads), + waiters_(num_threads), + global_steal_partition_(EncodePartition(0, num_threads_)), + blocked_(0), + spinning_(0), + done_(false), + cancelled_(false), + ec_(waiters_) { + waiters_.resize(num_threads_); + // Calculate coprimes of all numbers [1, num_threads]. + // Coprimes are used for random walks over all threads in Steal + // and NonEmptyQueueIndex. 
Iteration is based on the fact that if we take + // a random starting thread index t and calculate num_threads - 1 subsequent + // indices as (t + coprime) % num_threads, we will cover all threads without + // repetitions (effectively getting a pseudo-random permutation of thread + // indices). + eigen_plain_assert(num_threads_ < kMaxThreads); + for (int i = 1; i <= num_threads_; ++i) { + all_coprimes_.emplace_back(i); + ComputeCoprimes(i, &all_coprimes_.back()); + } +#ifndef EIGEN_THREAD_LOCAL + init_barrier_.reset(new Barrier(num_threads_)); +#endif + thread_data_.resize(num_threads_); + for (int i = 0; i < num_threads_; i++) { + SetStealPartition(i, EncodePartition(0, num_threads_)); + thread_data_[i].thread.reset( + env_.CreateThread([this, i]() { WorkerLoop(i); })); + } +#ifndef EIGEN_THREAD_LOCAL + // Wait for workers to initialize per_thread_map_. Otherwise we might race + // with them in Schedule or CurrentThreadId. + init_barrier_->Wait(); +#endif + } + + ~ThreadPoolTempl() { + done_ = true; + + // Now if all threads block without work, they will start exiting. + // But note that threads can continue to work arbitrarily long, + // block, submit new work, unblock and otherwise live full life. + if (!cancelled_) { + ec_.Notify(true); + } else { + // Since we were cancelled, there might be entries in the queues. + // Empty them to prevent their destructor from asserting. + for (size_t i = 0; i < thread_data_.size(); i++) { + thread_data_[i].queue.Flush(); + } + } + // Join threads explicitly (by destroying) to avoid destruction order within + // this class. + for (size_t i = 0; i < thread_data_.size(); ++i) + thread_data_[i].thread.reset(); + } + + void SetStealPartitions(const std::vector>& partitions) { + eigen_plain_assert(partitions.size() == static_cast(num_threads_)); + + // Pass this information to each thread queue.
+ for (int i = 0; i < num_threads_; i++) { + const auto& pair = partitions[i]; + unsigned start = pair.first, end = pair.second; + AssertBounds(start, end); + unsigned val = EncodePartition(start, end); + SetStealPartition(i, val); + } + } + + void Schedule(std::function fn) EIGEN_OVERRIDE { + ScheduleWithHint(std::move(fn), 0, num_threads_); + } + + void ScheduleWithHint(std::function fn, int start, + int limit) override { + Task t = env_.CreateTask(std::move(fn)); + PerThread* pt = GetPerThread(); + if (pt->pool == this) { + // Worker thread of this pool, push onto the thread's queue. + Queue& q = thread_data_[pt->thread_id].queue; + t = q.PushFront(std::move(t)); + } else { + // A free-standing thread (or worker of another pool), push onto a random + // queue. + eigen_plain_assert(start < limit); + eigen_plain_assert(limit <= num_threads_); + int num_queues = limit - start; + int rnd = Rand(&pt->rand) % num_queues; + eigen_plain_assert(start + rnd < limit); + Queue& q = thread_data_[start + rnd].queue; + t = q.PushBack(std::move(t)); + } + // Note: below we touch this after making w available to worker threads. + // Strictly speaking, this can lead to a racy-use-after-free. Consider that + // Schedule is called from a thread that is neither main thread nor a worker + // thread of this pool. Then, execution of w directly or indirectly + // completes overall computations, which in turn leads to destruction of + // this. We expect that such scenario is prevented by program, that is, + // this is kept alive while any threads can potentially be in Schedule. + if (!t.f) { + ec_.Notify(false); + } else { + env_.ExecuteTask(t); // Push failed, execute directly. + } + } + + void Cancel() EIGEN_OVERRIDE { + cancelled_ = true; + done_ = true; + + // Let each thread know it's been cancelled. 
+#ifdef EIGEN_THREAD_ENV_SUPPORTS_CANCELLATION + for (size_t i = 0; i < thread_data_.size(); i++) { + thread_data_[i].thread->OnCancel(); + } +#endif + + // Wake up the threads without work to let them exit on their own. + ec_.Notify(true); + } + + int NumThreads() const EIGEN_FINAL { return num_threads_; } + + int CurrentThreadId() const EIGEN_FINAL { + const PerThread* pt = const_cast(this)->GetPerThread(); + if (pt->pool == this) { + return pt->thread_id; + } else { + return -1; + } + } + + private: + // Create a single atomic that encodes start and limit information for + // each thread. + // We expect num_threads_ < 65536, so we can store them in a single + // std::atomic. + // Exposed publicly as static functions so that external callers can reuse + // this encode/decode logic for maintaining their own thread-safe copies of + // scheduling and steal domain(s). + static const int kMaxPartitionBits = 16; + static const int kMaxThreads = 1 << kMaxPartitionBits; + + inline unsigned EncodePartition(unsigned start, unsigned limit) { + return (start << kMaxPartitionBits) | limit; + } + + inline void DecodePartition(unsigned val, unsigned* start, unsigned* limit) { + *limit = val & (kMaxThreads - 1); + val >>= kMaxPartitionBits; + *start = val; + } + + void AssertBounds(int start, int end) { + eigen_plain_assert(start >= 0); + eigen_plain_assert(start < end); // non-zero sized partition + eigen_plain_assert(end <= num_threads_); + } + + inline void SetStealPartition(size_t i, unsigned val) { + thread_data_[i].steal_partition.store(val, std::memory_order_relaxed); + } + + inline unsigned GetStealPartition(int i) { + return thread_data_[i].steal_partition.load(std::memory_order_relaxed); + } + + void ComputeCoprimes(int N, MaxSizeVector* coprimes) { + for (int i = 1; i <= N; i++) { + unsigned a = i; + unsigned b = N; + // If GCD(a, b) == 1, then a and b are coprimes. 
+ while (b != 0) { + unsigned tmp = a; + a = b; + b = tmp % b; + } + if (a == 1) { + coprimes->push_back(i); + } + } + } + + typedef typename Environment::EnvThread Thread; + + struct PerThread { + constexpr PerThread() : pool(NULL), rand(0), thread_id(-1) {} + ThreadPoolTempl* pool; // Parent pool, or null for normal threads. + uint64_t rand; // Random generator state. + int thread_id; // Worker thread index in pool. +#ifndef EIGEN_THREAD_LOCAL + // Prevent false sharing. + char pad_[128]; +#endif + }; + + struct ThreadData { + constexpr ThreadData() : thread(), steal_partition(0), queue() {} + std::unique_ptr thread; + std::atomic steal_partition; + Queue queue; + }; + + Environment env_; + const int num_threads_; + const bool allow_spinning_; + MaxSizeVector thread_data_; + MaxSizeVector> all_coprimes_; + MaxSizeVector waiters_; + unsigned global_steal_partition_; + std::atomic blocked_; + std::atomic spinning_; + std::atomic done_; + std::atomic cancelled_; + EventCount ec_; +#ifndef EIGEN_THREAD_LOCAL + std::unique_ptr init_barrier_; + std::mutex per_thread_map_mutex_; // Protects per_thread_map_. + std::unordered_map> per_thread_map_; +#endif + + // Main worker thread loop. 
+ void WorkerLoop(int thread_id) { +#ifndef EIGEN_THREAD_LOCAL + std::unique_ptr new_pt(new PerThread()); + per_thread_map_mutex_.lock(); + bool insertOK = per_thread_map_.emplace(GlobalThreadIdHash(), std::move(new_pt)).second; + eigen_plain_assert(insertOK); + EIGEN_UNUSED_VARIABLE(insertOK); + per_thread_map_mutex_.unlock(); + init_barrier_->Notify(); + init_barrier_->Wait(); +#endif + PerThread* pt = GetPerThread(); + pt->pool = this; + pt->rand = GlobalThreadIdHash(); + pt->thread_id = thread_id; + Queue& q = thread_data_[thread_id].queue; + EventCount::Waiter* waiter = &waiters_[thread_id]; + // TODO(dvyukov,rmlarsen): The time spent in NonEmptyQueueIndex() is + // proportional to num_threads_ and we assume that new work is scheduled at + // a constant rate, so we set spin_count to 5000 / num_threads_. The + // constant was picked based on a fair dice roll, tune it. + const int spin_count = + allow_spinning_ && num_threads_ > 0 ? 5000 / num_threads_ : 0; + if (num_threads_ == 1) { + // For num_threads_ == 1 there is no point in going through the expensive + // steal loop. Moreover, since NonEmptyQueueIndex() calls PopBack() on the + // victim queues it might reverse the order in which ops are executed + // compared to the order in which they are scheduled, which tends to be + // counter-productive for the types of I/O workloads the single thread + // pools tend to be used for. + while (!cancelled_) { + Task t = q.PopFront(); + for (int i = 0; i < spin_count && !t.f; i++) { + if (!cancelled_.load(std::memory_order_relaxed)) { + t = q.PopFront(); + } + } + if (!t.f) { + if (!WaitForWork(waiter, &t)) { + return; + } + } + if (t.f) { + env_.ExecuteTask(t); + } + } + } else { + while (!cancelled_) { + Task t = q.PopFront(); + if (!t.f) { + t = LocalSteal(); + if (!t.f) { + t = GlobalSteal(); + if (!t.f) { + // Leave one thread spinning. This reduces latency. 
+ if (allow_spinning_ && !spinning_ && !spinning_.exchange(true)) { + for (int i = 0; i < spin_count && !t.f; i++) { + if (!cancelled_.load(std::memory_order_relaxed)) { + t = GlobalSteal(); + } else { + return; + } + } + spinning_ = false; + } + if (!t.f) { + if (!WaitForWork(waiter, &t)) { + return; + } + } + } + } + } + if (t.f) { + env_.ExecuteTask(t); + } + } + } + } + + // Steal tries to steal work from other worker threads in the range [start, + // limit) in best-effort manner. + Task Steal(unsigned start, unsigned limit) { + PerThread* pt = GetPerThread(); + const size_t size = limit - start; + unsigned r = Rand(&pt->rand); + // Reduce r into [0, size) range, this utilizes trick from + // https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ + eigen_plain_assert(all_coprimes_[size - 1].size() < (1<<30)); + unsigned victim = ((uint64_t)r * (uint64_t)size) >> 32; + unsigned index = ((uint64_t) all_coprimes_[size - 1].size() * (uint64_t)r) >> 32; + unsigned inc = all_coprimes_[size - 1][index]; + + for (unsigned i = 0; i < size; i++) { + eigen_plain_assert(start + victim < limit); + Task t = thread_data_[start + victim].queue.PopBack(); + if (t.f) { + return t; + } + victim += inc; + if (victim >= size) { + victim -= size; + } + } + return Task(); + } + + // Steals work within threads belonging to the partition. + Task LocalSteal() { + PerThread* pt = GetPerThread(); + unsigned partition = GetStealPartition(pt->thread_id); + // If thread steal partition is the same as global partition, there is no + // need to go through the steal loop twice. + if (global_steal_partition_ == partition) return Task(); + unsigned start, limit; + DecodePartition(partition, &start, &limit); + AssertBounds(start, limit); + + return Steal(start, limit); + } + + // Steals work from any other thread in the pool. 
+ Task GlobalSteal() { + return Steal(0, num_threads_); + } + + + // WaitForWork blocks until new work is available (returns true), or if it is + // time to exit (returns false). Can optionally return a task to execute in t + // (in such case t.f != nullptr on return). + bool WaitForWork(EventCount::Waiter* waiter, Task* t) { + eigen_plain_assert(!t->f); + // We already did best-effort emptiness check in Steal, so prepare for + // blocking. + ec_.Prewait(); + // Now do a reliable emptiness check. + int victim = NonEmptyQueueIndex(); + if (victim != -1) { + ec_.CancelWait(); + if (cancelled_) { + return false; + } else { + *t = thread_data_[victim].queue.PopBack(); + return true; + } + } + // Number of blocked threads is used as termination condition. + // If we are shutting down and all worker threads blocked without work, + // then we are done. + blocked_++; + // TODO is blocked_ required to be unsigned? + if (done_ && blocked_ == static_cast(num_threads_)) { + ec_.CancelWait(); + // Almost done, but need to re-check queues. + // Consider that all queues are empty and all worker threads are preempted + // right after incrementing blocked_ above. Now a free-standing thread + // submits work and calls destructor (which sets done_). If we don't + // re-check queues, we will exit leaving the work unexecuted. + if (NonEmptyQueueIndex() != -1) { + // Note: we must not pop from queues before we decrement blocked_, + // otherwise the following scenario is possible. Consider that instead + // of checking for emptiness we popped the only element from queues. + // Now other worker threads can start exiting, which is bad if the + // work item submits other work. So we just check emptiness here, + // which ensures that all worker threads exit at the same time. + blocked_--; + return true; + } + // Reached stable termination state.
+ ec_.Notify(true); + return false; + } + ec_.CommitWait(waiter); + blocked_--; + return true; + } + + int NonEmptyQueueIndex() { + PerThread* pt = GetPerThread(); + // We intentionally design NonEmptyQueueIndex to steal work from + // anywhere in the queue so threads don't block in WaitForWork() forever + // when all threads in their partition go to sleep. Steal is still local. + const size_t size = thread_data_.size(); + unsigned r = Rand(&pt->rand); + unsigned inc = all_coprimes_[size - 1][r % all_coprimes_[size - 1].size()]; + unsigned victim = r % size; + for (unsigned i = 0; i < size; i++) { + if (!thread_data_[victim].queue.Empty()) { + return victim; + } + victim += inc; + if (victim >= size) { + victim -= size; + } + } + return -1; + } + + static EIGEN_STRONG_INLINE uint64_t GlobalThreadIdHash() { + return std::hash()(std::this_thread::get_id()); + } + + EIGEN_STRONG_INLINE PerThread* GetPerThread() { +#ifndef EIGEN_THREAD_LOCAL + static PerThread dummy; + auto it = per_thread_map_.find(GlobalThreadIdHash()); + if (it == per_thread_map_.end()) { + return &dummy; + } else { + return it->second.get(); + } +#else + EIGEN_THREAD_LOCAL PerThread per_thread_; + PerThread* pt = &per_thread_; + return pt; +#endif + } + + static EIGEN_STRONG_INLINE unsigned Rand(uint64_t* state) { + uint64_t current = *state; + // Update the internal state + *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL; + // Generate the random output (using the PCG-XSH-RS scheme) + return static_cast((current ^ (current >> 22)) >> + (22 + (current >> 61))); + } +}; + +typedef ThreadPoolTempl ThreadPool; + +} // namespace Eigen + +#endif // EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H diff --git a/src/EigenUnsupported/CXX11/src/ThreadPool/RunQueue.h b/src/EigenUnsupported/CXX11/src/ThreadPool/RunQueue.h new file mode 100644 index 0000000..b572ebc --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/ThreadPool/RunQueue.h @@ -0,0 +1,236 @@ +// This file is part of Eigen, a 
lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Dmitry Vyukov +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_ +#define EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_ + +namespace Eigen { + +// RunQueue is a fixed-size, partially non-blocking deque of Work items. +// Operations on front of the queue must be done by a single thread (owner), +// operations on back of the queue can be done by multiple threads concurrently. +// +// Algorithm outline: +// All remote threads operating on the queue back are serialized by a mutex. +// This ensures that at most two threads access state: owner and one remote +// thread (Size aside). The algorithm ensures that the occupied region of the +// underlying array is logically continuous (can wraparound, but no stray +// occupied elements). Owner operates on one end of this region, remote thread +// operates on the other end. Synchronization between these threads +// (potential consumption of the last element and take up of the last empty +// element) happens by means of state variable in each element. States are: +// empty, busy (in process of insertion or removal) and ready. Threads claim +// elements (empty->busy and ready->busy transitions) by means of a CAS +// operation. The finishing transitions (busy->empty and busy->ready) are done +// with plain store as the element is exclusively owned by the current thread. +// +// Note: we could permit only pointers as elements, then we would not need +// separate state variable as null/non-null pointer value would serve as state, +// but that would require malloc/free per operation for large, complex values +// (and this is designed to store std::function<()>).
+template +class RunQueue { + public: + RunQueue() : front_(0), back_(0) { + // require power-of-two for fast masking + eigen_plain_assert((kSize & (kSize - 1)) == 0); + eigen_plain_assert(kSize > 2); // why would you do this? + eigen_plain_assert(kSize <= (64 << 10)); // leave enough space for counter + for (unsigned i = 0; i < kSize; i++) + array_[i].state.store(kEmpty, std::memory_order_relaxed); + } + + ~RunQueue() { eigen_plain_assert(Size() == 0); } + + // PushFront inserts w at the beginning of the queue. + // If queue is full returns w, otherwise returns default-constructed Work. + Work PushFront(Work w) { + unsigned front = front_.load(std::memory_order_relaxed); + Elem* e = &array_[front & kMask]; + uint8_t s = e->state.load(std::memory_order_relaxed); + if (s != kEmpty || + !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire)) + return w; + front_.store(front + 1 + (kSize << 1), std::memory_order_relaxed); + e->w = std::move(w); + e->state.store(kReady, std::memory_order_release); + return Work(); + } + + // PopFront removes and returns the first element in the queue. + // If the queue was empty returns default-constructed Work. + Work PopFront() { + unsigned front = front_.load(std::memory_order_relaxed); + Elem* e = &array_[(front - 1) & kMask]; + uint8_t s = e->state.load(std::memory_order_relaxed); + if (s != kReady || + !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire)) + return Work(); + Work w = std::move(e->w); + e->state.store(kEmpty, std::memory_order_release); + front = ((front - 1) & kMask2) | (front & ~kMask2); + front_.store(front, std::memory_order_relaxed); + return w; + } + + // PushBack adds w at the end of the queue. + // If queue is full returns w, otherwise returns default-constructed Work. 
+ Work PushBack(Work w) { + std::unique_lock lock(mutex_); + unsigned back = back_.load(std::memory_order_relaxed); + Elem* e = &array_[(back - 1) & kMask]; + uint8_t s = e->state.load(std::memory_order_relaxed); + if (s != kEmpty || + !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire)) + return w; + back = ((back - 1) & kMask2) | (back & ~kMask2); + back_.store(back, std::memory_order_relaxed); + e->w = std::move(w); + e->state.store(kReady, std::memory_order_release); + return Work(); + } + + // PopBack removes and returns the last element in the queue. + Work PopBack() { + if (Empty()) return Work(); + std::unique_lock lock(mutex_); + unsigned back = back_.load(std::memory_order_relaxed); + Elem* e = &array_[back & kMask]; + uint8_t s = e->state.load(std::memory_order_relaxed); + if (s != kReady || + !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire)) + return Work(); + Work w = std::move(e->w); + e->state.store(kEmpty, std::memory_order_release); + back_.store(back + 1 + (kSize << 1), std::memory_order_relaxed); + return w; + } + + // PopBackHalf removes and returns the last half of the elements in the queue. + // Returns number of elements removed. + unsigned PopBackHalf(std::vector* result) { + if (Empty()) return 0; + std::unique_lock lock(mutex_); + unsigned back = back_.load(std::memory_order_relaxed); + unsigned size = Size(); + unsigned mid = back; + if (size > 1) mid = back + (size - 1) / 2; + unsigned n = 0; + unsigned start = 0; + for (; static_cast(mid - back) >= 0; mid--) { + Elem* e = &array_[mid & kMask]; + uint8_t s = e->state.load(std::memory_order_relaxed); + if (n == 0) { + if (s != kReady || !e->state.compare_exchange_strong( + s, kBusy, std::memory_order_acquire)) + continue; + start = mid; + } else { + // Note: no need to store temporary kBusy, we exclusively own these + // elements.
+ eigen_plain_assert(s == kReady); + } + result->push_back(std::move(e->w)); + e->state.store(kEmpty, std::memory_order_release); + n++; + } + if (n != 0) + back_.store(start + 1 + (kSize << 1), std::memory_order_relaxed); + return n; + } + + // Size returns current queue size. + // Can be called by any thread at any time. + unsigned Size() const { return SizeOrNotEmpty(); } + + // Empty tests whether container is empty. + // Can be called by any thread at any time. + bool Empty() const { return SizeOrNotEmpty() == 0; } + + // Delete all the elements from the queue. + void Flush() { + while (!Empty()) { + PopFront(); + } + } + + private: + static const unsigned kMask = kSize - 1; + static const unsigned kMask2 = (kSize << 1) - 1; + struct Elem { + std::atomic state; + Work w; + }; + enum { + kEmpty, + kBusy, + kReady, + }; + std::mutex mutex_; + // Low log(kSize) + 1 bits in front_ and back_ contain rolling index of + // front/back, respectively. The remaining bits contain modification counters + // that are incremented on Push operations. This allows us to (1) distinguish + // between empty and full conditions (if we would use log(kSize) bits for + // position, these conditions would be indistinguishable); (2) obtain + // consistent snapshot of front_/back_ for Size operation using the + // modification counters. + std::atomic front_; + std::atomic back_; + Elem array_[kSize]; + + // SizeOrNotEmpty returns current queue size; if NeedSizeEstimate is false, + // only whether the size is 0 is guaranteed to be correct. + // Can be called by any thread at any time. + template + unsigned SizeOrNotEmpty() const { + // Emptiness plays critical role in thread pool blocking. So we go to great + // effort to not produce false positives (claim non-empty queue as empty). + unsigned front = front_.load(std::memory_order_acquire); + for (;;) { + // Capture a consistent snapshot of front/tail. 
+ unsigned back = back_.load(std::memory_order_acquire); + unsigned front1 = front_.load(std::memory_order_relaxed); + if (front != front1) { + front = front1; + std::atomic_thread_fence(std::memory_order_acquire); + continue; + } + if (NeedSizeEstimate) { + return CalculateSize(front, back); + } else { + // This value will be 0 if the queue is empty, and undefined otherwise. + unsigned maybe_zero = ((front ^ back) & kMask2); + // Queue size estimate must agree with maybe zero check on the queue + // empty/non-empty state. + eigen_assert((CalculateSize(front, back) == 0) == (maybe_zero == 0)); + return maybe_zero; + } + } + } + + EIGEN_ALWAYS_INLINE + unsigned CalculateSize(unsigned front, unsigned back) const { + int size = (front & kMask2) - (back & kMask2); + // Fix overflow. + if (size < 0) size += 2 * kSize; + // Order of modification in push/pop is crafted to make the queue look + // larger than it is during concurrent modifications. E.g. push can + // increment size before the corresponding pop has decremented it. + // So the computed size can be up to kSize + 1, fix it. + if (size > static_cast(kSize)) size = kSize; + return static_cast(size); + } + + RunQueue(const RunQueue&) = delete; + void operator=(const RunQueue&) = delete; +}; + +} // namespace Eigen + +#endif // EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_ diff --git a/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadCancel.h b/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadCancel.h new file mode 100644 index 0000000..a05685f --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadCancel.h @@ -0,0 +1,23 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H +#define EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H + +// Try to come up with a portable way to cancel a thread +#if EIGEN_OS_GNULINUX + #define EIGEN_THREAD_CANCEL(t) \ + pthread_cancel(t.native_handle()); + #define EIGEN_SUPPORTS_THREAD_CANCELLATION 1 +#else +#define EIGEN_THREAD_CANCEL(t) +#endif + + +#endif // EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H diff --git a/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadEnvironment.h b/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadEnvironment.h new file mode 100644 index 0000000..d94a064 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadEnvironment.h @@ -0,0 +1,40 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H +#define EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H + +namespace Eigen { + +struct StlThreadEnvironment { + struct Task { + std::function f; + }; + + // EnvThread constructor must start the thread, + // destructor must join the thread. + class EnvThread { + public: + EnvThread(std::function f) : thr_(std::move(f)) {} + ~EnvThread() { thr_.join(); } + // This function is called when the threadpool is cancelled. 
+ void OnCancel() { } + + private: + std::thread thr_; + }; + + EnvThread* CreateThread(std::function f) { return new EnvThread(std::move(f)); } + Task CreateTask(std::function f) { return Task{std::move(f)}; } + void ExecuteTask(const Task& t) { t.f(); } +}; + +} // namespace Eigen + +#endif // EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H diff --git a/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadLocal.h b/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadLocal.h new file mode 100644 index 0000000..4e68474 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadLocal.h @@ -0,0 +1,301 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H +#define EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H + +#ifdef EIGEN_AVOID_THREAD_LOCAL + +#ifdef EIGEN_THREAD_LOCAL +#undef EIGEN_THREAD_LOCAL +#endif + +#else + +#if EIGEN_MAX_CPP_VER >= 11 && \ + ((EIGEN_COMP_GNUC && EIGEN_GNUC_AT_LEAST(4, 8)) || \ + __has_feature(cxx_thread_local) || \ + (EIGEN_COMP_MSVC >= 1900) ) +#define EIGEN_THREAD_LOCAL static thread_local +#endif + +// Disable TLS for Apple and Android builds with older toolchains. +#if defined(__APPLE__) +// Included for TARGET_OS_IPHONE, __IPHONE_OS_VERSION_MIN_REQUIRED, +// __IPHONE_8_0. +#include +#include +#endif +// Checks whether C++11's `thread_local` storage duration specifier is +// supported. +#if defined(__apple_build_version__) && \ + ((__apple_build_version__ < 8000042) || \ + (TARGET_OS_IPHONE && __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_9_0)) +// Notes: Xcode's clang did not support `thread_local` until version +// 8, and even then not for all iOS < 9.0. 
+#undef EIGEN_THREAD_LOCAL + +#elif defined(__ANDROID__) && EIGEN_COMP_CLANG +// There are platforms for which TLS should not be used even though the compiler +// makes it seem like it's supported (Android NDK < r12b for example). +// This is primarily because of linker problems and toolchain misconfiguration: +// TLS isn't supported until NDK r12b per +// https://developer.android.com/ndk/downloads/revision_history.html +// Since NDK r16, `__NDK_MAJOR__` and `__NDK_MINOR__` are defined in +// . For NDK < r16, users should define these macros, +// e.g. `-D__NDK_MAJOR__=11 -D__NKD_MINOR__=0` for NDK r11. +#if __has_include() +#include +#endif // __has_include() +#if defined(__ANDROID__) && defined(__clang__) && defined(__NDK_MAJOR__) && \ + defined(__NDK_MINOR__) && \ + ((__NDK_MAJOR__ < 12) || ((__NDK_MAJOR__ == 12) && (__NDK_MINOR__ < 1))) +#undef EIGEN_THREAD_LOCAL +#endif +#endif // defined(__ANDROID__) && defined(__clang__) + +#endif // EIGEN_AVOID_THREAD_LOCAL + +namespace Eigen { + +namespace internal { +template +struct ThreadLocalNoOpInitialize { + void operator()(T&) const {} +}; + +template +struct ThreadLocalNoOpRelease { + void operator()(T&) const {} +}; + +} // namespace internal + +// Thread local container for elements of type T, that does not use thread local +// storage. As long as the number of unique threads accessing this storage +// is smaller than `capacity_`, it is lock-free and wait-free. Otherwise it will +// use a mutex for synchronization. +// +// Type `T` has to be default constructible, and by default each thread will get +// a default constructed value. It is possible to specify custom `initialize` +// callable, that will be called lazily from each thread accessing this object, +// and will be passed a default initialized object of type `T`. Also it's +// possible to pass a custom `release` callable, that will be invoked before +// calling ~T(). 
+// +// Example: +// +// struct Counter { +// int value = 0; +// } +// +// Eigen::ThreadLocal counter(10); +// +// // Each thread will have access to it's own counter object. +// Counter& cnt = counter.local(); +// cnt++; +// +// WARNING: Eigen::ThreadLocal uses the OS-specific value returned by +// std::this_thread::get_id() to identify threads. This value is not guaranteed +// to be unique except for the life of the thread. A newly created thread may +// get an OS-specific ID equal to that of an already destroyed thread. +// +// Somewhat similar to TBB thread local storage, with similar restrictions: +// https://www.threadingbuildingblocks.org/docs/help/reference/thread_local_storage/enumerable_thread_specific_cls.html +// +template , + typename Release = internal::ThreadLocalNoOpRelease> +class ThreadLocal { + // We preallocate default constructed elements in MaxSizedVector. + static_assert(std::is_default_constructible::value, + "ThreadLocal data type must be default constructible"); + + public: + explicit ThreadLocal(int capacity) + : ThreadLocal(capacity, internal::ThreadLocalNoOpInitialize(), + internal::ThreadLocalNoOpRelease()) {} + + ThreadLocal(int capacity, Initialize initialize) + : ThreadLocal(capacity, std::move(initialize), + internal::ThreadLocalNoOpRelease()) {} + + ThreadLocal(int capacity, Initialize initialize, Release release) + : initialize_(std::move(initialize)), + release_(std::move(release)), + capacity_(capacity), + data_(capacity_), + ptr_(capacity_), + filled_records_(0) { + eigen_assert(capacity_ >= 0); + data_.resize(capacity_); + for (int i = 0; i < capacity_; ++i) { + ptr_.emplace_back(nullptr); + } + } + + T& local() { + std::thread::id this_thread = std::this_thread::get_id(); + if (capacity_ == 0) return SpilledLocal(this_thread); + + std::size_t h = std::hash()(this_thread); + const int start_idx = h % capacity_; + + // NOTE: From the definition of `std::this_thread::get_id()` it is + // guaranteed that we never can have 
concurrent insertions with the same key + // to our hash-map like data structure. If we didn't find an element during + // the initial traversal, it's guaranteed that no one else could have + // inserted it while we are in this function. This allows to massively + // simplify out lock-free insert-only hash map. + + // Check if we already have an element for `this_thread`. + int idx = start_idx; + while (ptr_[idx].load() != nullptr) { + ThreadIdAndValue& record = *(ptr_[idx].load()); + if (record.thread_id == this_thread) return record.value; + + idx += 1; + if (idx >= capacity_) idx -= capacity_; + if (idx == start_idx) break; + } + + // If we are here, it means that we found an insertion point in lookup + // table at `idx`, or we did a full traversal and table is full. + + // If lock-free storage is full, fallback on mutex. + if (filled_records_.load() >= capacity_) return SpilledLocal(this_thread); + + // We double check that we still have space to insert an element into a lock + // free storage. If old value in `filled_records_` is larger than the + // records capacity, it means that some other thread added an element while + // we were traversing lookup table. + int insertion_index = + filled_records_.fetch_add(1, std::memory_order_relaxed); + if (insertion_index >= capacity_) return SpilledLocal(this_thread); + + // At this point it's guaranteed that we can access to + // data_[insertion_index_] without a data race. + data_[insertion_index].thread_id = this_thread; + initialize_(data_[insertion_index].value); + + // That's the pointer we'll put into the lookup table. + ThreadIdAndValue* inserted = &data_[insertion_index]; + + // We'll use nullptr pointer to ThreadIdAndValue in a compare-and-swap loop. + ThreadIdAndValue* empty = nullptr; + + // Now we have to find an insertion point into the lookup table. 
We start + // from the `idx` that was identified as an insertion point above, it's + // guaranteed that we will have an empty record somewhere in a lookup table + // (because we created a record in the `data_`). + const int insertion_idx = idx; + + do { + // Always start search from the original insertion candidate. + idx = insertion_idx; + while (ptr_[idx].load() != nullptr) { + idx += 1; + if (idx >= capacity_) idx -= capacity_; + // If we did a full loop, it means that we don't have any free entries + // in the lookup table, and this means that something is terribly wrong. + eigen_assert(idx != insertion_idx); + } + // Atomic CAS of the pointer guarantees that any other thread, that will + // follow this pointer will see all the mutations in the `data_`. + } while (!ptr_[idx].compare_exchange_weak(empty, inserted)); + + return inserted->value; + } + + // WARN: It's not thread safe to call it concurrently with `local()`. + void ForEach(std::function f) { + // Reading directly from `data_` is unsafe, because only CAS to the + // record in `ptr_` makes all changes visible to other threads. + for (auto& ptr : ptr_) { + ThreadIdAndValue* record = ptr.load(); + if (record == nullptr) continue; + f(record->thread_id, record->value); + } + + // We did not spill into the map based storage. + if (filled_records_.load(std::memory_order_relaxed) < capacity_) return; + + // Adds a happens before edge from the last call to SpilledLocal(). + std::unique_lock lock(mu_); + for (auto& kv : per_thread_map_) { + f(kv.first, kv.second); + } + } + + // WARN: It's not thread safe to call it concurrently with `local()`. + ~ThreadLocal() { + // Reading directly from `data_` is unsafe, because only CAS to the record + // in `ptr_` makes all changes visible to other threads. + for (auto& ptr : ptr_) { + ThreadIdAndValue* record = ptr.load(); + if (record == nullptr) continue; + release_(record->value); + } + + // We did not spill into the map based storage. 
+ if (filled_records_.load(std::memory_order_relaxed) < capacity_) return; + + // Adds a happens before edge from the last call to SpilledLocal(). + std::unique_lock lock(mu_); + for (auto& kv : per_thread_map_) { + release_(kv.second); + } + } + + private: + struct ThreadIdAndValue { + std::thread::id thread_id; + T value; + }; + + // Use unordered map guarded by a mutex when lock free storage is full. + T& SpilledLocal(std::thread::id this_thread) { + std::unique_lock lock(mu_); + + auto it = per_thread_map_.find(this_thread); + if (it == per_thread_map_.end()) { + auto result = per_thread_map_.emplace(this_thread, T()); + eigen_assert(result.second); + initialize_((*result.first).second); + return (*result.first).second; + } else { + return it->second; + } + } + + Initialize initialize_; + Release release_; + const int capacity_; + + // Storage that backs lock-free lookup table `ptr_`. Records stored in this + // storage contiguously starting from index 0. + MaxSizeVector data_; + + // Atomic pointers to the data stored in `data_`. Used as a lookup table for + // linear probing hash map (https://en.wikipedia.org/wiki/Linear_probing). + MaxSizeVector> ptr_; + + // Number of records stored in the `data_`. + std::atomic filled_records_; + + // We fallback on per thread map if lock-free storage is full. In practice + // this should never happen, if `capacity_` is a reasonable estimate of the + // number of threads running in a system. + std::mutex mu_; // Protects per_thread_map_. 
+ std::unordered_map per_thread_map_; +}; + +} // namespace Eigen + +#endif // EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H diff --git a/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadPoolInterface.h b/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadPoolInterface.h new file mode 100644 index 0000000..25030dc --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadPoolInterface.h @@ -0,0 +1,48 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H +#define EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H + +namespace Eigen { + +// This defines an interface that ThreadPoolDevice can take to use +// custom thread pools underneath. +class ThreadPoolInterface { + public: + // Submits a closure to be run by a thread in the pool. + virtual void Schedule(std::function fn) = 0; + + // Submits a closure to be run by threads in the range [start, end) in the + // pool. + virtual void ScheduleWithHint(std::function fn, int /*start*/, + int /*end*/) { + // Just defer to Schedule in case sub-classes aren't interested in + // overriding this functionality. + Schedule(fn); + } + + // If implemented, stop processing the closures that have been enqueued. + // Currently running closures may still be processed. + // If not implemented, does nothing. + virtual void Cancel() {} + + // Returns the number of threads in the pool. + virtual int NumThreads() const = 0; + + // Returns a logical thread index between 0 and NumThreads() - 1 if called + // from one of the threads in the pool. Returns -1 otherwise. 
+ virtual int CurrentThreadId() const = 0; + + virtual ~ThreadPoolInterface() {} +}; + +} // namespace Eigen + +#endif // EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H diff --git a/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadYield.h b/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadYield.h new file mode 100644 index 0000000..a859c7b --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadYield.h @@ -0,0 +1,20 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H +#define EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H + +// Try to come up with a portable way to yield +#if EIGEN_COMP_GNUC && EIGEN_GNUC_AT_MOST(4, 7) +#define EIGEN_THREAD_YIELD() sched_yield() +#else +#define EIGEN_THREAD_YIELD() std::this_thread::yield() +#endif + +#endif // EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H diff --git a/src/EigenUnsupported/CXX11/src/util/CXX11Meta.h b/src/EigenUnsupported/CXX11/src/util/CXX11Meta.h new file mode 100644 index 0000000..149ceaf --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/util/CXX11Meta.h @@ -0,0 +1,537 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_CXX11META_H +#define EIGEN_CXX11META_H + +#include +#include "EmulateArray.h" + +#include "CXX11Workarounds.h" + +namespace Eigen { + +namespace internal { + +/** \internal + * \file CXX11/util/CXX11Meta.h + * This file contains generic metaprogramming classes which are not specifically related to Eigen. + * This file expands upon Core/util/Meta.h and adds support for C++11 specific features. + */ + +template +struct type_list { constexpr static int count = sizeof...(tt); }; + +template +struct type_list { constexpr static int count = sizeof...(tt) + 1; typedef t first_type; }; + +template +struct numeric_list { constexpr static std::size_t count = sizeof...(nn); }; + +template +struct numeric_list { static const std::size_t count = sizeof...(nn) + 1; const static T first_value = n; }; + +#ifndef EIGEN_PARSED_BY_DOXYGEN +/* numeric list constructors + * + * equivalencies: + * constructor result + * typename gen_numeric_list::type numeric_list + * typename gen_numeric_list_reversed::type numeric_list + * typename gen_numeric_list_swapped_pair::type numeric_list + * typename gen_numeric_list_repeated::type numeric_list + */ + +template struct gen_numeric_list : gen_numeric_list {}; +template struct gen_numeric_list { typedef numeric_list type; }; + +template struct gen_numeric_list_reversed : gen_numeric_list_reversed {}; +template struct gen_numeric_list_reversed { typedef numeric_list type; }; + +template struct gen_numeric_list_swapped_pair : gen_numeric_list_swapped_pair {}; +template struct gen_numeric_list_swapped_pair { typedef numeric_list type; }; + +template struct gen_numeric_list_repeated : gen_numeric_list_repeated {}; +template struct gen_numeric_list_repeated { typedef numeric_list type; }; + +/* list manipulation: concatenate */ + +template struct concat; + +template struct concat, type_list> { typedef type_list type; }; +template struct concat, numeric_list > { typedef numeric_list type; }; + +template struct mconcat; +template 
struct mconcat
{ typedef a type; }; +template struct mconcat : concat {}; +template struct mconcat : concat::type> {}; + +/* list manipulation: extract slices */ + +template struct take; +template struct take> : concat, typename take>::type> {}; +template struct take> { typedef type_list<> type; }; +template struct take<0, type_list> { typedef type_list<> type; }; +template<> struct take<0, type_list<>> { typedef type_list<> type; }; + +template struct take> : concat, typename take>::type> {}; +template struct take> { typedef numeric_list type; }; +template struct take<0, numeric_list> { typedef numeric_list type; }; +template struct take<0, numeric_list> { typedef numeric_list type; }; + +template struct h_skip_helper_numeric; +template struct h_skip_helper_numeric : h_skip_helper_numeric {}; +template struct h_skip_helper_numeric { typedef numeric_list type; }; +template struct h_skip_helper_numeric { typedef numeric_list type; }; +template struct h_skip_helper_numeric { typedef numeric_list type; }; + +template struct h_skip_helper_type; +template struct h_skip_helper_type : h_skip_helper_type {}; +template struct h_skip_helper_type<0, t, tt...> { typedef type_list type; }; +template struct h_skip_helper_type { typedef type_list<> type; }; +template<> struct h_skip_helper_type<0> { typedef type_list<> type; }; +#endif //not EIGEN_PARSED_BY_DOXYGEN + +template +struct h_skip { + template + constexpr static EIGEN_STRONG_INLINE typename h_skip_helper_numeric::type helper(numeric_list) { return typename h_skip_helper_numeric::type(); } + template + constexpr static EIGEN_STRONG_INLINE typename h_skip_helper_type::type helper(type_list) { return typename h_skip_helper_type::type(); } +}; + +template struct skip { typedef decltype(h_skip::helper(a())) type; }; + +template struct slice : take::type> {}; + +/* list manipulation: retrieve single element from list */ + +template struct get; + +template struct get> : get> {}; +template struct get<0, type_list> { typedef a type; }; + 
+template struct get> : get> {}; +template struct get<0, numeric_list> { constexpr static T value = a; }; + +template constexpr T array_get(const numeric_list&) { + return get<(int)n, numeric_list>::value; +} + +/* always get type, regardless of dummy; good for parameter pack expansion */ + +template struct id_numeric { typedef t type; }; +template struct id_type { typedef t type; }; + +/* equality checking, flagged version */ + +template struct is_same_gf : is_same { constexpr static int global_flags = 0; }; + +/* apply_op to list */ + +template< + bool from_left, // false + template class op, + typename additional_param, + typename... values +> +struct h_apply_op_helper { typedef type_list::type...> type; }; +template< + template class op, + typename additional_param, + typename... values +> +struct h_apply_op_helper { typedef type_list::type...> type; }; + +template< + bool from_left, + template class op, + typename additional_param +> +struct h_apply_op +{ + template + constexpr static typename h_apply_op_helper::type helper(type_list) + { return typename h_apply_op_helper::type(); } +}; + +template< + template class op, + typename additional_param, + typename a +> +struct apply_op_from_left { typedef decltype(h_apply_op::helper(a())) type; }; + +template< + template class op, + typename additional_param, + typename a +> +struct apply_op_from_right { typedef decltype(h_apply_op::helper(a())) type; }; + +/* see if an element is in a list */ + +template< + template class test, + typename check_against, + typename h_list, + bool last_check_positive = false +> +struct contained_in_list; + +template< + template class test, + typename check_against, + typename h_list +> +struct contained_in_list +{ + constexpr static bool value = true; +}; + +template< + template class test, + typename check_against, + typename a, + typename... 
as +> +struct contained_in_list, false> : contained_in_list, test::value> {}; + +template< + template class test, + typename check_against + EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, empty) +> +struct contained_in_list, false> { constexpr static bool value = false; }; + +/* see if an element is in a list and check for global flags */ + +template< + template class test, + typename check_against, + typename h_list, + int default_flags = 0, + bool last_check_positive = false, + int last_check_flags = default_flags +> +struct contained_in_list_gf; + +template< + template class test, + typename check_against, + typename h_list, + int default_flags, + int last_check_flags +> +struct contained_in_list_gf +{ + constexpr static bool value = true; + constexpr static int global_flags = last_check_flags; +}; + +template< + template class test, + typename check_against, + typename a, + typename... as, + int default_flags, + int last_check_flags +> +struct contained_in_list_gf, default_flags, false, last_check_flags> : contained_in_list_gf, default_flags, test::value, test::global_flags> {}; + +template< + template class test, + typename check_against + EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, empty), + int default_flags, + int last_check_flags +> +struct contained_in_list_gf, default_flags, false, last_check_flags> { constexpr static bool value = false; constexpr static int global_flags = default_flags; }; + +/* generic reductions */ + +template< + typename Reducer, + typename... Ts +> struct reduce; + +template< + typename Reducer +> struct reduce +{ + EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE int run() { return Reducer::Identity; } +}; + +template< + typename Reducer, + typename A +> struct reduce +{ + EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE A run(A a) { return a; } +}; + +template< + typename Reducer, + typename A, + typename... Ts +> struct reduce +{ + EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, Ts... 
ts) -> decltype(Reducer::run(a, reduce::run(ts...))) { + return Reducer::run(a, reduce::run(ts...)); + } +}; + +/* generic binary operations */ + +struct sum_op { + template EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a + b) { return a + b; } + static constexpr int Identity = 0; +}; +struct product_op { + template EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a * b) { return a * b; } + static constexpr int Identity = 1; +}; + +struct logical_and_op { template constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a && b) { return a && b; } }; +struct logical_or_op { template constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a || b) { return a || b; } }; + +struct equal_op { template constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a == b) { return a == b; } }; +struct not_equal_op { template constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a != b) { return a != b; } }; +struct lesser_op { template constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a < b) { return a < b; } }; +struct lesser_equal_op { template constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a <= b) { return a <= b; } }; +struct greater_op { template constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a > b) { return a > b; } }; +struct greater_equal_op { template constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a >= b) { return a >= b; } }; + +/* generic unary operations */ + +struct not_op { template constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(!a) { return !a; } }; +struct negation_op { template constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(-a) { return -a; } }; +struct greater_equal_zero_op { template constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(a >= 0) { return a >= 0; } }; + + +/* reductions for lists */ + +// using auto -> 
return value spec makes ICC 13.0 and 13.1 crash here, so we have to hack it +// together in front... (13.0 doesn't work with array_prod/array_reduce/... anyway, but 13.1 +// does... +template +EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE decltype(reduce::run((*((Ts*)0))...)) arg_prod(Ts... ts) +{ + return reduce::run(ts...); +} + +template +constexpr EIGEN_STRONG_INLINE decltype(reduce::run((*((Ts*)0))...)) arg_sum(Ts... ts) +{ + return reduce::run(ts...); +} + +/* reverse arrays */ + +template +constexpr EIGEN_STRONG_INLINE Array h_array_reverse(Array arr, numeric_list) +{ + return {{array_get(arr)...}}; +} + +template +constexpr EIGEN_STRONG_INLINE array array_reverse(array arr) +{ + return h_array_reverse(arr, typename gen_numeric_list::type()); +} + + +/* generic array reductions */ + +// can't reuse standard reduce() interface above because Intel's Compiler +// *really* doesn't like it, so we just reimplement the stuff +// (start from N - 1 and work down to 0 because specialization for +// n == N - 1 also doesn't work in Intel's compiler, so it goes into +// an infinite loop) +template +struct h_array_reduce { + EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(array arr, T identity) -> decltype(Reducer::run(h_array_reduce::run(arr, identity), array_get(arr))) + { + return Reducer::run(h_array_reduce::run(arr, identity), array_get(arr)); + } +}; + +template +struct h_array_reduce +{ + EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE T run(const array& arr, T) + { + return array_get<0>(arr); + } +}; + +template +struct h_array_reduce +{ + EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE T run(const array&, T identity) + { + return identity; + } +}; + +template +EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_reduce(const array& arr, T identity) -> decltype(h_array_reduce::run(arr, identity)) +{ + return h_array_reduce::run(arr, identity); +} + +/* standard array reductions */ + +template +EIGEN_DEVICE_FUNC constexpr 
EIGEN_STRONG_INLINE auto array_sum(const array& arr) -> decltype(array_reduce(arr, static_cast(0))) +{ + return array_reduce(arr, static_cast(0)); +} + +template +EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_prod(const array& arr) -> decltype(array_reduce(arr, static_cast(1))) +{ + return array_reduce(arr, static_cast(1)); +} + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector& a) { + eigen_assert(a.size() > 0); + t prod = 1; + for (size_t i = 0; i < a.size(); ++i) { prod *= a[i]; } + return prod; +} + +/* zip an array */ + +template +constexpr EIGEN_STRONG_INLINE array h_array_zip(array a, array b, numeric_list) +{ + return array{{ Op::run(array_get(a), array_get(b))... }}; +} + +template +constexpr EIGEN_STRONG_INLINE array array_zip(array a, array b) +{ + return h_array_zip(a, b, typename gen_numeric_list::type()); +} + +/* zip an array and reduce the result */ + +template +constexpr EIGEN_STRONG_INLINE auto h_array_zip_and_reduce(array a, array b, numeric_list) -> decltype(reduce::type...>::run(Op::run(array_get(a), array_get(b))...)) +{ + return reduce::type...>::run(Op::run(array_get(a), array_get(b))...); +} + +template +constexpr EIGEN_STRONG_INLINE auto array_zip_and_reduce(array a, array b) -> decltype(h_array_zip_and_reduce(a, b, typename gen_numeric_list::type())) +{ + return h_array_zip_and_reduce(a, b, typename gen_numeric_list::type()); +} + +/* apply stuff to an array */ + +template +constexpr EIGEN_STRONG_INLINE array h_array_apply(array a, numeric_list) +{ + return array{{ Op::run(array_get(a))... 
}}; +} + +template +constexpr EIGEN_STRONG_INLINE array array_apply(array a) +{ + return h_array_apply(a, typename gen_numeric_list::type()); +} + +/* apply stuff to an array and reduce */ + +template +constexpr EIGEN_STRONG_INLINE auto h_array_apply_and_reduce(array arr, numeric_list) -> decltype(reduce::type...>::run(Op::run(array_get(arr))...)) +{ + return reduce::type...>::run(Op::run(array_get(arr))...); +} + +template +constexpr EIGEN_STRONG_INLINE auto array_apply_and_reduce(array a) -> decltype(h_array_apply_and_reduce(a, typename gen_numeric_list::type())) +{ + return h_array_apply_and_reduce(a, typename gen_numeric_list::type()); +} + +/* repeat a value n times (and make an array out of it + * usage: + * array = repeat<16>(42); + */ + +template +struct h_repeat +{ + template + constexpr static EIGEN_STRONG_INLINE array run(t v, numeric_list) + { + return {{ typename id_numeric::type(v)... }}; + } +}; + +template +constexpr array repeat(t v) { return h_repeat::run(v, typename gen_numeric_list::type()); } + +/* instantiate a class by a C-style array */ +template +struct h_instantiate_by_c_array; + +template +struct h_instantiate_by_c_array +{ + static InstType run(ArrType* arr, Ps... args) + { + return h_instantiate_by_c_array::run(arr + 1, args..., arr[0]); + } +}; + +template +struct h_instantiate_by_c_array +{ + static InstType run(ArrType* arr, Ps... args) + { + return h_instantiate_by_c_array::run(arr + 1, arr[0], args...); + } +}; + +template +struct h_instantiate_by_c_array +{ + static InstType run(ArrType* arr, Ps... args) + { + (void)arr; + return InstType(args...); + } +}; + +template +struct h_instantiate_by_c_array +{ + static InstType run(ArrType* arr, Ps... 
args) + { + (void)arr; + return InstType(args...); + } +}; + +template +InstType instantiate_by_c_array(ArrType* arr) +{ + return h_instantiate_by_c_array::run(arr); +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11META_H diff --git a/src/EigenUnsupported/CXX11/src/util/CXX11Workarounds.h b/src/EigenUnsupported/CXX11/src/util/CXX11Workarounds.h new file mode 100644 index 0000000..056736c --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/util/CXX11Workarounds.h @@ -0,0 +1,88 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Christian Seiler +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11WORKAROUNDS_H +#define EIGEN_CXX11WORKAROUNDS_H + +/* COMPATIBILITY CHECKS + * (so users of compilers that are too old get some realistic error messages) + */ +#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1310) +#error Intel Compiler only supports required C++ features since version 13.1. +// note that most stuff in principle works with 13.0 but when combining +// some features, at some point 13.0 will just fail with an internal assertion +#elif defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6)) +// G++ < 4.6 by default will continue processing the source files - even if we use #error to make +// it error out. For this reason, we use the pragma to make sure G++ aborts at the first error +// it sees. Unfortunately, that is still not our #error directive, but at least the output is +// short enough the user has a chance to see that the compiler version is not sufficient for +// the funky template mojo we use. 
+#pragma GCC diagnostic error "-Wfatal-errors" +#error GNU C++ Compiler (g++) only supports required C++ features since version 4.6. +#endif + +/* Check that the compiler at least claims to support C++11. It might not be sufficient + * because the compiler may not implement it correctly, but at least we'll know. + * On the other hand, visual studio still doesn't claim to support C++11 although it's + * compliant enugh for our purpose. + */ +#if (EIGEN_COMP_CXXVER < 11) +#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) +#pragma GCC diagnostic error "-Wfatal-errors" +#endif +#error This library needs at least a C++11 compliant compiler. If you use g++/clang, please enable the -std=c++11 compiler flag. (-std=c++0x on older versions.) +#endif + +namespace Eigen { + +namespace internal { + +/* std::get is only constexpr in C++14, not yet in C++11 + */ + + +template constexpr inline T& array_get(std::vector& a) { return a[I_]; } +template constexpr inline T&& array_get(std::vector&& a) { return a[I_]; } +template constexpr inline T const& array_get(std::vector const& a) { return a[I_]; } + +/* Suppose you have a template of the form + * template struct X; + * And you want to specialize it in such a way: + * template struct X> { ::: }; + * template<> struct X> { ::: }; + * This will work in Intel's compiler 13.0, but only to some extent in g++ 4.6, since + * g++ can only match templates called with parameter packs if the number of template + * arguments is not a fixed size (so inside the first specialization, referencing + * X> will fail in g++). On the other hand, g++ will accept the following: + * template struct X> { ::: }: + * as an additional (!) specialization, which will then only match the empty case. + * But Intel's compiler 13.0 won't accept that, it will only accept the empty syntax, + * so we have to create a workaround for this. + */ +#if defined(__GNUC__) && !defined(__INTEL_COMPILER) +#define EIGEN_TPL_PP_SPEC_HACK_DEF(mt, n) mt... 
n +#define EIGEN_TPL_PP_SPEC_HACK_DEFC(mt, n) , EIGEN_TPL_PP_SPEC_HACK_DEF(mt, n) +#define EIGEN_TPL_PP_SPEC_HACK_USE(n) n... +#define EIGEN_TPL_PP_SPEC_HACK_USEC(n) , n... +#else +#define EIGEN_TPL_PP_SPEC_HACK_DEF(mt, n) +#define EIGEN_TPL_PP_SPEC_HACK_DEFC(mt, n) +#define EIGEN_TPL_PP_SPEC_HACK_USE(n) +#define EIGEN_TPL_PP_SPEC_HACK_USEC(n) +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11WORKAROUNDS_H + +/* + * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle; + */ diff --git a/src/EigenUnsupported/CXX11/src/util/EmulateArray.h b/src/EigenUnsupported/CXX11/src/util/EmulateArray.h new file mode 100644 index 0000000..834b20b --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/util/EmulateArray.h @@ -0,0 +1,261 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_EMULATE_ARRAY_H +#define EIGEN_EMULATE_ARRAY_H + + + +// The array class is only available starting with cxx11. Emulate our own here +// if needed. Beware, msvc still doesn't advertise itself as a c++11 compiler! +// Moreover, CUDA doesn't support the STL containers, so we use our own instead. 
+#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(EIGEN_GPUCC) || defined(EIGEN_AVOID_STL_ARRAY) + +namespace Eigen { +template class array { + public: + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T& operator[] (size_t index) { eigen_internal_assert(index < size()); return values[index]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { eigen_internal_assert(index < size()); return values[index]; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T& at(size_t index) { eigen_assert(index < size()); return values[index]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T& at(size_t index) const { eigen_assert(index < size()); return values[index]; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T& front() { return values[0]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T& front() const { return values[0]; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T& back() { return values[n-1]; } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T& back() const { return values[n-1]; } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + static std::size_t size() { return n; } + + T values[n]; + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array() { } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v) { + EIGEN_STATIC_ASSERT(n==1, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2) { + EIGEN_STATIC_ASSERT(n==2, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3) { + EIGEN_STATIC_ASSERT(n==3, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, + const T& v4) { + EIGEN_STATIC_ASSERT(n==4, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE 
array(const T& v1, const T& v2, const T& v3, const T& v4, + const T& v5) { + EIGEN_STATIC_ASSERT(n==5, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + values[4] = v5; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, + const T& v5, const T& v6) { + EIGEN_STATIC_ASSERT(n==6, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + values[4] = v5; + values[5] = v6; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, + const T& v5, const T& v6, const T& v7) { + EIGEN_STATIC_ASSERT(n==7, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + values[4] = v5; + values[5] = v6; + values[6] = v7; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array( + const T& v1, const T& v2, const T& v3, const T& v4, + const T& v5, const T& v6, const T& v7, const T& v8) { + EIGEN_STATIC_ASSERT(n==8, YOU_MADE_A_PROGRAMMING_MISTAKE) + values[0] = v1; + values[1] = v2; + values[2] = v3; + values[3] = v4; + values[4] = v5; + values[5] = v6; + values[6] = v7; + values[7] = v8; + } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array(std::initializer_list l) { + eigen_assert(l.size() == n); + internal::smart_copy(l.begin(), l.end(), values); + } +#endif +}; + + +// Specialize array for zero size +template class array { + public: + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T& operator[] (size_t) { + eigen_assert(false && "Can't index a zero size array"); + return dummy; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T& operator[] (size_t) const { + eigen_assert(false && "Can't index a zero size array"); + return dummy; + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T& front() { + eigen_assert(false && "Can't index a zero size array"); + return dummy; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const 
T& front() const { + eigen_assert(false && "Can't index a zero size array"); + return dummy; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE T& back() { + eigen_assert(false && "Can't index a zero size array"); + return dummy; + } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const T& back() const { + eigen_assert(false && "Can't index a zero size array"); + return dummy; + } + + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::size_t size() { return 0; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE array() : dummy() { } + +#if EIGEN_HAS_VARIADIC_TEMPLATES + EIGEN_DEVICE_FUNC array(std::initializer_list l) : dummy() { + EIGEN_UNUSED_VARIABLE(l); + eigen_assert(l.size() == 0); + } +#endif + + private: + T dummy; +}; + +// Comparison operator +// Todo: implement !=, <, <=, >, and >= +template +EIGEN_DEVICE_FUNC bool operator==(const array& lhs, const array& rhs) { + for (std::size_t i = 0; i < N; ++i) { + if (lhs[i] != rhs[i]) { + return false; + } + } + return true; +} + + +namespace internal { +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(array& a) { + return a[I_]; +} +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const array& a) { + return a[I_]; +} + +template struct array_size > { + enum { value = N }; +}; +template struct array_size& > { + enum { value = N }; +}; +template struct array_size > { + enum { value = N }; +}; +template struct array_size& > { + enum { value = N }; +}; + +} // end namespace internal +} // end namespace Eigen + +#else + +// The compiler supports c++11, and we're not targeting cuda: use std::array as Eigen::array +#include +namespace Eigen { + +template using array = std::array; + +namespace internal { +/* std::get is only constexpr in C++14, not yet in C++11 + * - libstdc++ from version 4.7 onwards has it nevertheless, + * so use that + * - libstdc++ older versions: use _M_instance directly + * - libc++ all versions so far: use __elems_ directly + * - all other libs: use std::get to be portable, but + * 
this may not be constexpr + */ +#if defined(__GLIBCXX__) && __GLIBCXX__ < 20120322 +#define STD_GET_ARR_HACK a._M_instance[I_] +#elif defined(_LIBCPP_VERSION) +#define STD_GET_ARR_HACK a.__elems_[I_] +#else +#define STD_GET_ARR_HACK std::template get(a) +#endif + +template constexpr inline T& array_get(std::array& a) { return (T&) STD_GET_ARR_HACK; } +template constexpr inline T&& array_get(std::array&& a) { return (T&&) STD_GET_ARR_HACK; } +template constexpr inline T const& array_get(std::array const& a) { return (T const&) STD_GET_ARR_HACK; } + +#undef STD_GET_ARR_HACK + +} // end namespace internal +} // end namespace Eigen + +#endif + +#endif // EIGEN_EMULATE_ARRAY_H diff --git a/src/EigenUnsupported/CXX11/src/util/MaxSizeVector.h b/src/EigenUnsupported/CXX11/src/util/MaxSizeVector.h new file mode 100644 index 0000000..277ab14 --- /dev/null +++ b/src/EigenUnsupported/CXX11/src/util/MaxSizeVector.h @@ -0,0 +1,158 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_FIXEDSIZEVECTOR_H +#define EIGEN_FIXEDSIZEVECTOR_H + +namespace Eigen { + +/** \class MaxSizeVector + * \ingroup Core + * + * \brief The MaxSizeVector class. + * + * The %MaxSizeVector provides a subset of std::vector functionality. + * + * The goal is to provide basic std::vector operations when using + * std::vector is not an option (e.g. on GPU or when compiling using + * FMA/AVX, as this can cause either compilation failures or illegal + * instruction failures). + * + * Beware: The constructors are not API compatible with these of + * std::vector. 
+ */ +template +class MaxSizeVector { + static const size_t alignment = EIGEN_PLAIN_ENUM_MAX(EIGEN_ALIGNOF(T), sizeof(void*)); + public: + // Construct a new MaxSizeVector, reserve n elements. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + explicit MaxSizeVector(size_t n) + : reserve_(n), size_(0), + data_(static_cast(internal::handmade_aligned_malloc(n * sizeof(T), alignment))) { + } + + // Construct a new MaxSizeVector, reserve and resize to n. + // Copy the init value to all elements. + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + MaxSizeVector(size_t n, const T& init) + : reserve_(n), size_(n), + data_(static_cast(internal::handmade_aligned_malloc(n * sizeof(T), alignment))) { + size_t i = 0; + EIGEN_TRY + { + for(; i < size_; ++i) { new (&data_[i]) T(init); } + } + EIGEN_CATCH(...) + { + // Construction failed: only elements [0, i) were built, destruct exactly those in reverse order: + for(; i > 0; --i) { data_[i-1].~T(); } + internal::handmade_aligned_free(data_); + EIGEN_THROW; + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + ~MaxSizeVector() { + for (size_t i = size_; i > 0; --i) { + data_[i-1].~T(); + } + internal::handmade_aligned_free(data_); + } + + void resize(size_t n) { + eigen_assert(n <= reserve_); + for (; size_ < n; ++size_) { + new (&data_[size_]) T; + } + for (; size_ > n; --size_) { + data_[size_-1].~T(); + } + eigen_assert(size_ == n); + } + + // Append new elements (up to reserved size).
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void push_back(const T& t) { + eigen_assert(size_ < reserve_); + new (&data_[size_++]) T(t); + } + + // For C++03 compatibility this only takes one argument + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void emplace_back(const X& x) { + eigen_assert(size_ < reserve_); + new (&data_[size_++]) T(x); + } + + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const T& operator[] (size_t i) const { + eigen_assert(i < size_); + return data_[i]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + T& operator[] (size_t i) { + eigen_assert(i < size_); + return data_[i]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + T& back() { + eigen_assert(size_ > 0); + return data_[size_ - 1]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const T& back() const { + eigen_assert(size_ > 0); + return data_[size_ - 1]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void pop_back() { + eigen_assert(size_ > 0); + data_[--size_].~T(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + size_t size() const { return size_; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + bool empty() const { return size_ == 0; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + T* data() { return data_; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const T* data() const { return data_; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + T* begin() { return data_; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + T* end() { return data_ + size_; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const T* begin() const { return data_; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const T* end() const { return data_ + size_; } + + private: + size_t reserve_; + size_t size_; + T* data_; +}; + +} // namespace Eigen + +#endif // EIGEN_FIXEDSIZEVECTOR_H diff --git a/src/EigenUnsupported/EulerAngles b/src/EigenUnsupported/EulerAngles new file mode 100644 index 0000000..f8f1c5d --- /dev/null +++ b/src/EigenUnsupported/EulerAngles @@ -0,0 +1,43 @@ +// This file is part of Eigen, a lightweight C++ template 
library +// for linear algebra. +// +// Copyright (C) 2015 Tal Hadad +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_EULERANGLES_MODULE_H +#define EIGEN_EULERANGLES_MODULE_H + + +#include "../../Eigen/Core" +#include "../../Eigen/Geometry" + +#include "../../Eigen/src/Core/util/DisableStupidWarnings.h" + +namespace Eigen { + +/** + * \defgroup EulerAngles_Module EulerAngles module + * \brief This module provides generic euler angles rotation. + * + * Euler angles are a way to represent 3D rotation. + * + * In order to use this module in your code, include this header: + * \code + * #include + * \endcode + * + * See \ref EulerAngles for more information. + * + */ + +} + +#include "src/EulerAngles/EulerSystem.h" +#include "src/EulerAngles/EulerAngles.h" + +#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h" + +#endif // EIGEN_EULERANGLES_MODULE_H diff --git a/src/EigenUnsupported/FFT b/src/EigenUnsupported/FFT new file mode 100644 index 0000000..c8c311a --- /dev/null +++ b/src/EigenUnsupported/FFT @@ -0,0 +1,419 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009 Mark Borgerding mark a borgerding net +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_FFT_H +#define EIGEN_FFT_H + +#include +#include +#include +#include "../../Eigen/Core" + + +/** + * \defgroup FFT_Module Fast Fourier Transform module + * + * \code + * #include + * \endcode + * + * This module provides Fast Fourier transformation, with a configurable backend + * implementation. + * + * The default implementation is based on kissfft. 
It is a small, free, and + * reasonably efficient default. + * + * There are currently two alternative implementation backends: + * + * - fftw (http://www.fftw.org) : faster, GPL -- incompatible with Eigen in LGPL form, bigger code size. + * - MKL (http://en.wikipedia.org/wiki/Math_Kernel_Library) : fastest, commercial -- may be incompatible with Eigen in GPL form. + * + * \section FFTDesign Design + * + * The following design decisions were made concerning scaling and + * half-spectrum for real FFT. + * + * The intent is to facilitate generic programming and ease migrating code + * from Matlab/octave. + * We think the default behavior of Eigen/FFT should favor correctness and + * generality over speed. Of course, the caller should be able to "opt-out" from this + * behavior and get the speed increase if they want it. + * + * 1) %Scaling: + * Other libraries (FFTW,IMKL,KISSFFT) do not perform scaling, so there + * is a constant gain incurred after the forward and inverse transforms, so + * IFFT(FFT(x)) = Kx; this is done to avoid a vector-by-value multiply. + * The downside is that algorithms that worked correctly in Matlab/octave + * don't behave the same way once implemented in C++. + * + * How Eigen/FFT differs: invertible scaling is performed so IFFT( FFT(x) ) = x. + * + * 2) Real FFT half-spectrum + * Other libraries use only half the frequency spectrum (plus one extra + * sample for the Nyquist bin) for a real FFT; the other half is the + * conjugate-symmetric of the first half. This saves them a copy and some + * memory. The downside is the caller needs to have special logic for the + * number of bins in complex vs real. + * + * How Eigen/FFT differs: The full spectrum is returned from the forward + * transform. This facilitates generic template programming by obviating + * separate specializations for real vs complex. On the inverse + * transform, only half the spectrum is actually used if the output type is real.
+ */ + + +#include "../../Eigen/src/Core/util/DisableStupidWarnings.h" + +#ifdef EIGEN_FFTW_DEFAULT +// FFTW: faster, GPL -- incompatible with Eigen in LGPL form, bigger code size +# include +# include "src/FFT/ei_fftw_impl.h" + namespace Eigen { + //template typedef struct internal::fftw_impl default_fft_impl; this does not work + template struct default_fft_impl : public internal::fftw_impl {}; + } +#elif defined EIGEN_MKL_DEFAULT +// TODO +// intel Math Kernel Library: fastest, commercial -- may be incompatible with Eigen in GPL form +# include "src/FFT/ei_imklfft_impl.h" + namespace Eigen { + template struct default_fft_impl : public internal::imklfft_impl {}; + } +#else +// internal::kissfft_impl: small, free, reasonably efficient default, derived from kissfft +// +# include "src/FFT/ei_kissfft_impl.h" + namespace Eigen { + template + struct default_fft_impl : public internal::kissfft_impl {}; + } +#endif + +namespace Eigen { + + +// +template struct fft_fwd_proxy; +template struct fft_inv_proxy; + +namespace internal { +template +struct traits< fft_fwd_proxy > +{ + typedef typename T_SrcMat::PlainObject ReturnType; +}; +template +struct traits< fft_inv_proxy > +{ + typedef typename T_SrcMat::PlainObject ReturnType; +}; +} + +template +struct fft_fwd_proxy + : public ReturnByValue > +{ + typedef DenseIndex Index; + + fft_fwd_proxy(const T_SrcMat& src,T_FftIfc & fft, Index nfft) : m_src(src),m_ifc(fft), m_nfft(nfft) {} + + template void evalTo(T_DestMat& dst) const; + + Index rows() const { return m_src.rows(); } + Index cols() const { return m_src.cols(); } +protected: + const T_SrcMat & m_src; + T_FftIfc & m_ifc; + Index m_nfft; +}; + +template +struct fft_inv_proxy + : public ReturnByValue > +{ + typedef DenseIndex Index; + + fft_inv_proxy(const T_SrcMat& src,T_FftIfc & fft, Index nfft) : m_src(src),m_ifc(fft), m_nfft(nfft) {} + + template void evalTo(T_DestMat& dst) const; + + Index rows() const { return m_src.rows(); } + Index cols() const { return 
m_src.cols(); } +protected: + const T_SrcMat & m_src; + T_FftIfc & m_ifc; + Index m_nfft; +}; + + +template > +class FFT +{ + public: + typedef T_Impl impl_type; + typedef DenseIndex Index; + typedef typename impl_type::Scalar Scalar; + typedef typename impl_type::Complex Complex; + + enum Flag { + Default=0, // goof proof + Unscaled=1, + HalfSpectrum=2, + // SomeOtherSpeedOptimization=4 + Speedy=32767 + }; + + FFT( const impl_type & impl=impl_type() , Flag flags=Default ) :m_impl(impl),m_flag(flags) { } + + inline + bool HasFlag(Flag f) const { return (m_flag & (int)f) == f;} + + inline + void SetFlag(Flag f) { m_flag |= (int)f;} + + inline + void ClearFlag(Flag f) { m_flag &= (~(int)f);} + + inline + void fwd( Complex * dst, const Scalar * src, Index nfft) + { + m_impl.fwd(dst,src,static_cast(nfft)); + if ( HasFlag(HalfSpectrum) == false) + ReflectSpectrum(dst,nfft); + } + + inline + void fwd( Complex * dst, const Complex * src, Index nfft) + { + m_impl.fwd(dst,src,static_cast(nfft)); + } + + /* + inline + void fwd2(Complex * dst, const Complex * src, int n0,int n1) + { + m_impl.fwd2(dst,src,n0,n1); + } + */ + + template + inline + void fwd( std::vector & dst, const std::vector<_Input> & src) + { + if ( NumTraits<_Input>::IsComplex == 0 && HasFlag(HalfSpectrum) ) + dst.resize( (src.size()>>1)+1); // half the bins + Nyquist bin + else + dst.resize(src.size()); + fwd(&dst[0],&src[0],src.size()); + } + + template + inline + void fwd( MatrixBase & dst, const MatrixBase & src, Index nfft=-1) + { + typedef typename ComplexDerived::Scalar dst_type; + typedef typename InputDerived::Scalar src_type; + EIGEN_STATIC_ASSERT_VECTOR_ONLY(InputDerived) + EIGEN_STATIC_ASSERT_VECTOR_ONLY(ComplexDerived) + EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(ComplexDerived,InputDerived) // size at compile-time + EIGEN_STATIC_ASSERT((internal::is_same::value), + YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) + 
EIGEN_STATIC_ASSERT(int(InputDerived::Flags)&int(ComplexDerived::Flags)&DirectAccessBit, + THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES) + + if (nfft<1) + nfft = src.size(); + + if ( NumTraits< src_type >::IsComplex == 0 && HasFlag(HalfSpectrum) ) + dst.derived().resize( (nfft>>1)+1); + else + dst.derived().resize(nfft); + + if ( src.innerStride() != 1 || src.size() < nfft ) { + Matrix tmp; + if (src.size() + inline + fft_fwd_proxy< MatrixBase, FFT > + fwd( const MatrixBase & src, Index nfft=-1) + { + return fft_fwd_proxy< MatrixBase ,FFT >( src, *this,nfft ); + } + + template + inline + fft_inv_proxy< MatrixBase, FFT > + inv( const MatrixBase & src, Index nfft=-1) + { + return fft_inv_proxy< MatrixBase ,FFT >( src, *this,nfft ); + } + + inline + void inv( Complex * dst, const Complex * src, Index nfft) + { + m_impl.inv( dst,src,static_cast(nfft) ); + if ( HasFlag( Unscaled ) == false) + scale(dst,Scalar(1./nfft),nfft); // scale the time series + } + + inline + void inv( Scalar * dst, const Complex * src, Index nfft) + { + m_impl.inv( dst,src,static_cast(nfft) ); + if ( HasFlag( Unscaled ) == false) + scale(dst,Scalar(1./nfft),nfft); // scale the time series + } + + template + inline + void inv( MatrixBase & dst, const MatrixBase & src, Index nfft=-1) + { + typedef typename ComplexDerived::Scalar src_type; + typedef typename ComplexDerived::RealScalar real_type; + typedef typename OutputDerived::Scalar dst_type; + const bool realfft= (NumTraits::IsComplex == 0); + EIGEN_STATIC_ASSERT_VECTOR_ONLY(OutputDerived) + EIGEN_STATIC_ASSERT_VECTOR_ONLY(ComplexDerived) + EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(ComplexDerived,OutputDerived) // size at compile-time + EIGEN_STATIC_ASSERT((internal::is_same::value), + YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) + EIGEN_STATIC_ASSERT(int(OutputDerived::Flags)&int(ComplexDerived::Flags)&DirectAccessBit, + 
THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES) + + if (nfft<1) { //automatic FFT size determination + if ( realfft && HasFlag(HalfSpectrum) ) + nfft = 2*(src.size()-1); //assume even fft size + else + nfft = src.size(); + } + dst.derived().resize( nfft ); + + // check for nfft that does not fit the input data size + Index resize_input= ( realfft && HasFlag(HalfSpectrum) ) + ? ( (nfft/2+1) - src.size() ) + : ( nfft - src.size() ); + + if ( src.innerStride() != 1 || resize_input ) { + // if the vector is strided, then we need to copy it to a packed temporary + Matrix tmp; + if ( resize_input ) { + size_t ncopy = (std::min)(src.size(),src.size() + resize_input); + tmp.setZero(src.size() + resize_input); + if ( realfft && HasFlag(HalfSpectrum) ) { + // pad at the Nyquist bin + tmp.head(ncopy) = src.head(ncopy); + tmp(ncopy-1) = real(tmp(ncopy-1)); // enforce real-only Nyquist bin + }else{ + size_t nhead,ntail; + nhead = 1+ncopy/2-1; // range [0:pi) + ntail = ncopy/2-1; // range (-pi:0) + tmp.head(nhead) = src.head(nhead); + tmp.tail(ntail) = src.tail(ntail); + if (resize_input<0) { //shrinking -- create the Nyquist bin as the average of the two bins that fold into it + tmp(nhead) = ( src(nfft/2) + src( src.size() - nfft/2 ) )*real_type(.5); + }else{ // expanding -- split the old Nyquist bin into two halves + tmp(nhead) = src(nhead) * real_type(.5); + tmp(tmp.size()-nhead) = tmp(nhead); + } + } + }else{ + tmp = src; + } + inv( &dst[0],&tmp[0], nfft); + }else{ + inv( &dst[0],&src[0], nfft); + } + } + + template + inline + void inv( std::vector<_Output> & dst, const std::vector & src,Index nfft=-1) + { + if (nfft<1) + nfft = ( NumTraits<_Output>::IsComplex == 0 && HasFlag(HalfSpectrum) ) ? 
2*(src.size()-1) : src.size(); + dst.resize( nfft ); + inv( &dst[0],&src[0],nfft); + } + + + /* + // TODO: multi-dimensional FFTs + inline + void inv2(Complex * dst, const Complex * src, int n0,int n1) + { + m_impl.inv2(dst,src,n0,n1); + if ( HasFlag( Unscaled ) == false) + scale(dst,1./(n0*n1),n0*n1); + } + */ + + inline + impl_type & impl() {return m_impl;} + private: + + template + inline + void scale(T_Data * x,Scalar s,Index nx) + { +#if 1 + for (int k=0;k::Map(x,nx) *= s; + else + Matrix::MapAligned(x,nx) *= s; + //Matrix::Map(x,nx) * s; +#endif + } + + inline + void ReflectSpectrum(Complex * freq, Index nfft) + { + // create the implicit right-half spectrum (conjugate-mirror of the left-half) + Index nhbins=(nfft>>1)+1; + for (Index k=nhbins;k < nfft; ++k ) + freq[k] = conj(freq[nfft-k]); + } + + impl_type m_impl; + int m_flag; +}; + +template +template inline +void fft_fwd_proxy::evalTo(T_DestMat& dst) const +{ + m_ifc.fwd( dst, m_src, m_nfft); +} + +template +template inline +void fft_inv_proxy::evalTo(T_DestMat& dst) const +{ + m_ifc.inv( dst, m_src, m_nfft); +} + +} + +#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h" + +#endif diff --git a/src/EigenUnsupported/IterativeSolvers b/src/EigenUnsupported/IterativeSolvers new file mode 100644 index 0000000..a3f58d6 --- /dev/null +++ b/src/EigenUnsupported/IterativeSolvers @@ -0,0 +1,51 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2009 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_ITERATIVE_SOLVERS_MODULE_H +#define EIGEN_ITERATIVE_SOLVERS_MODULE_H + +#include "../../Eigen/Sparse" +#include "../../Eigen/Jacobi" +#include "../../Eigen/Householder" + + +/** + * \defgroup IterativeLinearSolvers_Module Iterative solvers module + * This module aims to provide various iterative linear and non linear solver algorithms. + * It currently provides: + * - a constrained conjugate gradient + * - a Householder GMRES implementation + * - an IDR(s) implementation + * - a DGMRES implementation + * - a MINRES implementation + * + * \code + * #include + * \endcode + */ + + +#include "../../Eigen/src/Core/util/DisableStupidWarnings.h" + +#ifndef EIGEN_MPL2_ONLY +#include "src/IterativeSolvers/IterationController.h" +#include "src/IterativeSolvers/ConstrainedConjGrad.h" +#endif + +#include "src/IterativeSolvers/IncompleteLU.h" +#include "src/IterativeSolvers/GMRES.h" +#include "src/IterativeSolvers/DGMRES.h" +//#include "src/IterativeSolvers/SSORPreconditioner.h" +#include "src/IterativeSolvers/MINRES.h" +#include "src/IterativeSolvers/IDRS.h" + +#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h" + + +#endif // EIGEN_ITERATIVE_SOLVERS_MODULE_H diff --git a/src/EigenUnsupported/KroneckerProduct b/src/EigenUnsupported/KroneckerProduct new file mode 100644 index 0000000..5f5afb8 --- /dev/null +++ b/src/EigenUnsupported/KroneckerProduct @@ -0,0 +1,36 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_KRONECKER_PRODUCT_MODULE_H +#define EIGEN_KRONECKER_PRODUCT_MODULE_H + +#include "../../Eigen/Core" + +#include "../../Eigen/src/Core/util/DisableStupidWarnings.h" + +#include "../../Eigen/src/SparseCore/SparseUtil.h" + +namespace Eigen { + +/** + * \defgroup KroneckerProduct_Module KroneckerProduct module + * + * This module contains an experimental Kronecker product implementation. + * + * \code + * #include + * \endcode + */ + +} // namespace Eigen + +#include "src/KroneckerProduct/KroneckerTensorProduct.h" + +#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h" + +#endif // EIGEN_KRONECKER_PRODUCT_MODULE_H diff --git a/src/EigenUnsupported/LevenbergMarquardt b/src/EigenUnsupported/LevenbergMarquardt new file mode 100644 index 0000000..1090505 --- /dev/null +++ b/src/EigenUnsupported/LevenbergMarquardt @@ -0,0 +1,49 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009 Thomas Capricelli +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_LEVENBERGMARQUARDT_MODULE +#define EIGEN_LEVENBERGMARQUARDT_MODULE + +// #include + +#include "../../Eigen/Core" +#include "../../Eigen/Jacobi" +#include "../../Eigen/QR" +#include "NumericalDiff" + +#include "../../Eigen/SparseQR" + +/** + * \defgroup LevenbergMarquardt_Module Levenberg-Marquardt module + * + * \code + * #include + * \endcode + * + * + */ + +#include "../../Eigen/SparseCore" + +#include "../../Eigen/src/Core/util/DisableStupidWarnings.h" + +#ifndef EIGEN_PARSED_BY_DOXYGEN + +#include "src/LevenbergMarquardt/LMqrsolv.h" +#include "src/LevenbergMarquardt/LMcovar.h" +#include "src/LevenbergMarquardt/LMpar.h" + +#endif + +#include "src/LevenbergMarquardt/LevenbergMarquardt.h" +#include "src/LevenbergMarquardt/LMonestep.h" + +#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h" + +#endif // EIGEN_LEVENBERGMARQUARDT_MODULE diff --git a/src/EigenUnsupported/MPRealSupport b/src/EigenUnsupported/MPRealSupport new file mode 100644 index 0000000..c4ea4ec --- /dev/null +++ b/src/EigenUnsupported/MPRealSupport @@ -0,0 +1,213 @@ +// This file is part of a joint effort between Eigen, a lightweight C++ template library +// for linear algebra, and MPFR C++, a C++ interface to MPFR library (http://www.holoborodko.com/pavel/) +// +// Copyright (C) 2010-2012 Pavel Holoborodko +// Copyright (C) 2010 Konstantin Holoborodko +// Copyright (C) 2010 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_MPREALSUPPORT_MODULE_H +#define EIGEN_MPREALSUPPORT_MODULE_H + +#include "../../Eigen/Core" +#include + +namespace Eigen { + +/** + * \defgroup MPRealSupport_Module MPFRC++ Support module + * \code + * #include + * \endcode + * + * This module provides support for multi precision floating point numbers + * via the MPFR C++ + * library which itself is built upon MPFR/GMP. + * + * \warning MPFR C++ is licensed under the GPL. + * + * You can find a copy of MPFR C++ that is known to be compatible in the unsupported/test/mpreal folder. + * + * Here is an example: + * +\code +#include +#include +#include +using namespace mpfr; +using namespace Eigen; +int main() +{ + // set precision to 256 bits (double has only 53 bits) + mpreal::set_default_prec(256); + // Declare matrix and vector types with multi-precision scalar type + typedef Matrix MatrixXmp; + typedef Matrix VectorXmp; + + MatrixXmp A = MatrixXmp::Random(100,100); + VectorXmp b = VectorXmp::Random(100); + + // Solve Ax=b using LU + VectorXmp x = A.lu().solve(b); + std::cout << "relative error: " << (A*x - b).norm() / b.norm() << std::endl; + return 0; +} +\endcode + * + */ + + template<> struct NumTraits + : GenericNumTraits + { + enum { + IsInteger = 0, + IsSigned = 1, + IsComplex = 0, + RequireInitialization = 1, + ReadCost = HugeCost, + AddCost = HugeCost, + MulCost = HugeCost + }; + + typedef mpfr::mpreal Real; + typedef mpfr::mpreal NonInteger; + + static inline Real highest (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::maxval(Precision); } + static inline Real lowest (long Precision = mpfr::mpreal::get_default_prec()) { return -mpfr::maxval(Precision); } + + // Constants + static inline Real Pi (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_pi(Precision); } + static inline Real Euler (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_euler(Precision); } + static inline Real Log2 (long Precision = 
mpfr::mpreal::get_default_prec()) { return mpfr::const_log2(Precision); } + static inline Real Catalan (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_catalan(Precision); } + + static inline Real epsilon (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::machine_epsilon(Precision); } + static inline Real epsilon (const Real& x) { return mpfr::machine_epsilon(x); } + +#ifdef MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS + static inline int digits10 (long Precision = mpfr::mpreal::get_default_prec()) { return std::numeric_limits::digits10(Precision); } + static inline int digits10 (const Real& x) { return std::numeric_limits::digits10(x); } + + static inline int digits () { return std::numeric_limits::digits(); } + static inline int digits (const Real& x) { return std::numeric_limits::digits(x); } +#endif + + static inline Real dummy_precision() + { + mpfr_prec_t weak_prec = ((mpfr::mpreal::get_default_prec()-1) * 90) / 100; + return mpfr::machine_epsilon(weak_prec); + } + }; + + namespace internal { + + template<> inline mpfr::mpreal random() + { + return mpfr::random(); + } + + template<> inline mpfr::mpreal random(const mpfr::mpreal& a, const mpfr::mpreal& b) + { + return a + (b-a) * random(); + } + + inline bool isMuchSmallerThan(const mpfr::mpreal& a, const mpfr::mpreal& b, const mpfr::mpreal& eps) + { + return mpfr::abs(a) <= mpfr::abs(b) * eps; + } + + inline bool isApprox(const mpfr::mpreal& a, const mpfr::mpreal& b, const mpfr::mpreal& eps) + { + return mpfr::isEqualFuzzy(a,b,eps); + } + + inline bool isApproxOrLessThan(const mpfr::mpreal& a, const mpfr::mpreal& b, const mpfr::mpreal& eps) + { + return a <= b || mpfr::isEqualFuzzy(a,b,eps); + } + + template<> inline long double cast(const mpfr::mpreal& x) + { return x.toLDouble(); } + + template<> inline double cast(const mpfr::mpreal& x) + { return x.toDouble(); } + + template<> inline long cast(const mpfr::mpreal& x) + { return x.toLong(); } + + template<> inline int 
cast(const mpfr::mpreal& x) + { return int(x.toLong()); } + + // Specialize GEBP kernel and traits for mpreal (no need for peeling, nor complicated stuff) + // This also permits to directly call mpfr's routines and avoid many temporaries produced by mpreal + template<> + class gebp_traits + { + public: + typedef mpfr::mpreal ResScalar; + enum { + Vectorizable = false, + LhsPacketSize = 1, + RhsPacketSize = 1, + ResPacketSize = 1, + NumberOfRegisters = 1, + nr = 1, + mr = 1, + LhsProgress = 1, + RhsProgress = 1 + }; + typedef ResScalar LhsPacket; + typedef ResScalar RhsPacket; + typedef ResScalar ResPacket; + typedef LhsPacket LhsPacket4Packing; + + }; + + + + template + struct gebp_kernel + { + typedef mpfr::mpreal mpreal; + + EIGEN_DONT_INLINE + void operator()(const DataMapper& res, const mpreal* blockA, const mpreal* blockB, + Index rows, Index depth, Index cols, const mpreal& alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0) + { + if(rows==0 || cols==0 || depth==0) + return; + + mpreal acc1(0,mpfr_get_prec(blockA[0].mpfr_srcptr())), + tmp (0,mpfr_get_prec(blockA[0].mpfr_srcptr())); + + if(strideA==-1) strideA = depth; + if(strideB==-1) strideB = depth; + + for(Index i=0; i +// Copyright (C) 2012 Chen-Pang He +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_MATRIX_FUNCTIONS +#define EIGEN_MATRIX_FUNCTIONS + +#include +#include + +#include "../../Eigen/Core" +#include "../../Eigen/LU" +#include "../../Eigen/Eigenvalues" + +/** + * \defgroup MatrixFunctions_Module Matrix functions module + * \brief This module aims to provide various methods for the computation of + * matrix functions. + * + * To use this module, add + * \code + * #include + * \endcode + * at the start of your source file. + * + * This module defines the following MatrixBase methods. 
+ * - \ref matrixbase_cos "MatrixBase::cos()", for computing the matrix cosine + * - \ref matrixbase_cosh "MatrixBase::cosh()", for computing the matrix hyperbolic cosine + * - \ref matrixbase_exp "MatrixBase::exp()", for computing the matrix exponential + * - \ref matrixbase_log "MatrixBase::log()", for computing the matrix logarithm + * - \ref matrixbase_pow "MatrixBase::pow()", for computing the matrix power + * - \ref matrixbase_matrixfunction "MatrixBase::matrixFunction()", for computing general matrix functions + * - \ref matrixbase_sin "MatrixBase::sin()", for computing the matrix sine + * - \ref matrixbase_sinh "MatrixBase::sinh()", for computing the matrix hyperbolic sine + * - \ref matrixbase_sqrt "MatrixBase::sqrt()", for computing the matrix square root + * + * These methods are the main entry points to this module. + * + * %Matrix functions are defined as follows. Suppose that \f$ f \f$ + * is an entire function (that is, a function on the complex plane + * that is everywhere complex differentiable). Then its Taylor + * series + * \f[ f(0) + f'(0) x + \frac{f''(0)}{2} x^2 + \frac{f'''(0)}{3!} x^3 + \cdots \f] + * converges to \f$ f(x) \f$. In this case, we can define the matrix + * function by the same series: + * \f[ f(M) = f(0) + f'(0) M + \frac{f''(0)}{2} M^2 + \frac{f'''(0)}{3!} M^3 + \cdots \f] + * + */ + +#include "../../Eigen/src/Core/util/DisableStupidWarnings.h" + +#include "src/MatrixFunctions/MatrixExponential.h" +#include "src/MatrixFunctions/MatrixFunction.h" +#include "src/MatrixFunctions/MatrixSquareRoot.h" +#include "src/MatrixFunctions/MatrixLogarithm.h" +#include "src/MatrixFunctions/MatrixPower.h" + +#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h" + + +/** +\page matrixbaseextra_page +\ingroup MatrixFunctions_Module + +\section matrixbaseextra MatrixBase methods defined in the MatrixFunctions module + +The remainder of the page documents the following MatrixBase methods +which are defined in the MatrixFunctions module. 
+ + +\subsection matrixbase_cos MatrixBase::cos() + +Compute the matrix cosine. + +\code +const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::cos() const +\endcode + +\param[in] M a square matrix. +\returns expression representing \f$ \cos(M) \f$. + +This function computes the matrix cosine. Use ArrayBase::cos() for computing the entry-wise cosine. + +The implementation calls \ref matrixbase_matrixfunction "matrixFunction()" with StdStemFunctions::cos(). + +\sa \ref matrixbase_sin "sin()" for an example. + + + +\subsection matrixbase_cosh MatrixBase::cosh() + +Compute the matrix hyperbolic cosine. + +\code +const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::cosh() const +\endcode + +\param[in] M a square matrix. +\returns expression representing \f$ \cosh(M) \f$ + +This function calls \ref matrixbase_matrixfunction "matrixFunction()" with StdStemFunctions::cosh(). + +\sa \ref matrixbase_sinh "sinh()" for an example. + + + +\subsection matrixbase_exp MatrixBase::exp() + +Compute the matrix exponential. + +\code +const MatrixExponentialReturnValue<Derived> MatrixBase<Derived>::exp() const +\endcode + +\param[in] M matrix whose exponential is to be computed. +\returns expression representing the matrix exponential of \p M. + +The matrix exponential of \f$ M \f$ is defined by +\f[ \exp(M) = \sum_{k=0}^\infty \frac{M^k}{k!}. \f] +The matrix exponential can be used to solve linear ordinary +differential equations: the solution of \f$ y' = My \f$ with the +initial condition \f$ y(0) = y_0 \f$ is given by +\f$ y(t) = \exp(tM) y_0 \f$. + +The matrix exponential is different from applying the exp function to all the entries in the matrix. +Use ArrayBase::exp() if you want to do the latter. + +The cost of the computation is approximately \f$ 20 n^3 \f$ for +matrices of size \f$ n \f$. The number 20 depends weakly on the +norm of the matrix. + +The matrix exponential is computed using the scaling-and-squaring +method combined with Padé approximation.
The matrix is first +rescaled, then the exponential of the reduced matrix is computed +using a Padé approximant, and then the rescaling is undone by repeated +squaring. The degree of the Padé approximant is chosen such +that the approximation error is less than the round-off +error. However, errors may accumulate during the squaring phase. + +Details of the algorithm can be found in: Nicholas J. Higham, "The +scaling and squaring method for the matrix exponential revisited," +SIAM J. %Matrix Anal. Applic., 26:1179–1193, +2005. + +Example: The following program checks that +\f[ \exp \left[ \begin{array}{ccc} + 0 & \frac14\pi & 0 \\ + -\frac14\pi & 0 & 0 \\ + 0 & 0 & 0 + \end{array} \right] = \left[ \begin{array}{ccc} + \frac12\sqrt2 & -\frac12\sqrt2 & 0 \\ + \frac12\sqrt2 & \frac12\sqrt2 & 0 \\ + 0 & 0 & 1 + \end{array} \right]. \f] +This corresponds to a rotation of \f$ \frac14\pi \f$ radians around +the z-axis. + +\include MatrixExponential.cpp +Output: \verbinclude MatrixExponential.out + +\note \p M has to be a matrix of \c float, \c double, `long double`, +\c complex<float>, \c complex<double>, or `complex<long double>` . + + +\subsection matrixbase_log MatrixBase::log() + +Compute the matrix logarithm. + +\code +const MatrixLogarithmReturnValue<Derived> MatrixBase<Derived>::log() const +\endcode + +\param[in] M invertible matrix whose logarithm is to be computed. +\returns expression representing the matrix logarithm of \p M. + +The matrix logarithm of \f$ M \f$ is a matrix \f$ X \f$ such that +\f$ \exp(X) = M \f$ where exp denotes the matrix exponential. As for +the scalar logarithm, the equation \f$ \exp(X) = M \f$ may have +multiple solutions; this function returns a matrix whose eigenvalues +have imaginary part in the interval \f$ (-\pi,\pi] \f$. + +The matrix logarithm is different from applying the log function to all the entries in the matrix. +Use ArrayBase::log() if you want to do the latter.
+ +In the real case, the matrix \f$ M \f$ should be invertible and +it should have no eigenvalues which are real and negative (pairs of +complex conjugate eigenvalues are allowed). In the complex case, it +only needs to be invertible. + +This function computes the matrix logarithm using the Schur-Parlett +algorithm as implemented by MatrixBase::matrixFunction(). The +logarithm of an atomic block is computed by MatrixLogarithmAtomic, +which uses direct computation for 1-by-1 and 2-by-2 blocks and an +inverse scaling-and-squaring algorithm for bigger blocks, with the +square roots computed by MatrixBase::sqrt(). + +Details of the algorithm can be found in Section 11.6.2 of: +Nicholas J. Higham, +Functions of Matrices: Theory and Computation, +SIAM 2008. ISBN 978-0-898716-46-7. + +Example: The following program checks that +\f[ \log \left[ \begin{array}{ccc} + \frac12\sqrt2 & -\frac12\sqrt2 & 0 \\ + \frac12\sqrt2 & \frac12\sqrt2 & 0 \\ + 0 & 0 & 1 + \end{array} \right] = \left[ \begin{array}{ccc} + 0 & \frac14\pi & 0 \\ + -\frac14\pi & 0 & 0 \\ + 0 & 0 & 0 + \end{array} \right]. \f] +This corresponds to a rotation of \f$ \frac14\pi \f$ radians around +the z-axis. This is the inverse of the example used in the +documentation of \ref matrixbase_exp "exp()". + +\include MatrixLogarithm.cpp +Output: \verbinclude MatrixLogarithm.out + +\note \p M has to be a matrix of \c float, \c double, `long +double`, \c complex, \c complex, or `complex`. + +\sa MatrixBase::exp(), MatrixBase::matrixFunction(), + class MatrixLogarithmAtomic, MatrixBase::sqrt(). + + +\subsection matrixbase_pow MatrixBase::pow() + +Compute the matrix raised to arbitrary real power. + +\code +const MatrixPowerReturnValue MatrixBase::pow(RealScalar p) const +\endcode + +\param[in] M base of the matrix power, should be a square matrix. +\param[in] p exponent of the matrix power. 
+ +The matrix power \f$ M^p \f$ is defined as \f$ \exp(p \log(M)) \f$, +where exp denotes the matrix exponential, and log denotes the matrix +logarithm. This is different from raising all the entries in the matrix +to the p-th power. Use ArrayBase::pow() if you want to do the latter. + +If \p p is complex, the scalar type of \p M should be the type of \p +p . \f$ M^p \f$ simply evaluates into \f$ \exp(p \log(M)) \f$. +Therefore, the matrix \f$ M \f$ should meet the conditions to be an +argument of matrix logarithm. + +If \p p is real, it is cast into the real scalar type of \p M. Then +this function computes the matrix power using the Schur-Padé +algorithm as implemented by class MatrixPower. The exponent is split +into integral part and fractional part, where the fractional part is +in the interval \f$ (-1, 1) \f$. The main diagonal and the first +super-diagonal are directly computed. + +If \p M is singular with a semisimple zero eigenvalue and \p p is +positive, the Schur factor \f$ T \f$ is reordered with Givens +rotations, i.e. + +\f[ T = \left[ \begin{array}{cc} + T_1 & T_2 \\ + 0 & 0 + \end{array} \right] \f] + +where \f$ T_1 \f$ is invertible. Then \f$ T^p \f$ is given by + +\f[ T^p = \left[ \begin{array}{cc} + T_1^p & T_1^{-1} T_1^p T_2 \\ + 0 & 0 + \end{array} \right]. \f] + +\warning Fractional power of a matrix with a non-semisimple zero +eigenvalue is not well-defined. We introduce an assertion failure +against inaccurate result, e.g. \code +#include <unsupported/Eigen/MatrixFunctions> +#include <iostream> + +int main() +{ + Eigen::Matrix4d A; + A << 0, 0, 2, 3, + 0, 0, 4, 5, + 0, 0, 6, 7, + 0, 0, 8, 9; + std::cout << A.pow(0.37) << std::endl; + + // The 1 makes eigenvalue 0 non-semisimple. + A.coeffRef(0, 1) = 1; + + // This fails if EIGEN_NO_DEBUG is undefined. + std::cout << A.pow(0.37) << std::endl; + + return 0; +} +\endcode + +Details of the algorithm can be found in: Nicholas J. Higham and +Lijing Lin, "A Schur-Padé algorithm for fractional powers of a +matrix," SIAM J. %Matrix Anal.
Applic., +32(3):1056–1078, 2011. + +Example: The following program checks that +\f[ \left[ \begin{array}{ccc} + \cos1 & -\sin1 & 0 \\ + \sin1 & \cos1 & 0 \\ + 0 & 0 & 1 + \end{array} \right]^{\frac14\pi} = \left[ \begin{array}{ccc} + \frac12\sqrt2 & -\frac12\sqrt2 & 0 \\ + \frac12\sqrt2 & \frac12\sqrt2 & 0 \\ + 0 & 0 & 1 + \end{array} \right]. \f] +This corresponds to \f$ \frac14\pi \f$ rotations of 1 radian around +the z-axis. + +\include MatrixPower.cpp +Output: \verbinclude MatrixPower.out + +MatrixBase::pow() is user-friendly. However, there are some +circumstances under which you should use class MatrixPower directly. +MatrixPower can save the result of Schur decomposition, so it's +better for computing various powers for the same matrix. + +Example: +\include MatrixPower_optimal.cpp +Output: \verbinclude MatrixPower_optimal.out + +\note \p M has to be a matrix of \c float, \c double, `long +double`, \c complex, \c complex, or +\c complex . + +\sa MatrixBase::exp(), MatrixBase::log(), class MatrixPower. + + +\subsection matrixbase_matrixfunction MatrixBase::matrixFunction() + +Compute a matrix function. + +\code +const MatrixFunctionReturnValue MatrixBase::matrixFunction(typename internal::stem_function::Scalar>::type f) const +\endcode + +\param[in] M argument of matrix function, should be a square matrix. +\param[in] f an entire function; \c f(x,n) should compute the n-th +derivative of f at x. +\returns expression representing \p f applied to \p M. + +Suppose that \p M is a matrix whose entries have type \c Scalar. +Then, the second argument, \p f, should be a function with prototype +\code +ComplexScalar f(ComplexScalar, int) +\endcode +where \c ComplexScalar = \c std::complex if \c Scalar is +real (e.g., \c float or \c double) and \c ComplexScalar = +\c Scalar if \c Scalar is complex. The return value of \c f(x,n) +should be \f$ f^{(n)}(x) \f$, the n-th derivative of f at x. + +This routine uses the algorithm described in: +Philip Davies and Nicholas J. 
Higham, +"A Schur-Parlett algorithm for computing matrix functions", +SIAM J. %Matrix Anal. Applic., 25:464–485, 2003. + +The actual work is done by the MatrixFunction class. + +Example: The following program checks that +\f[ \exp \left[ \begin{array}{ccc} + 0 & \frac14\pi & 0 \\ + -\frac14\pi & 0 & 0 \\ + 0 & 0 & 0 + \end{array} \right] = \left[ \begin{array}{ccc} + \frac12\sqrt2 & -\frac12\sqrt2 & 0 \\ + \frac12\sqrt2 & \frac12\sqrt2 & 0 \\ + 0 & 0 & 1 + \end{array} \right]. \f] +This corresponds to a rotation of \f$ \frac14\pi \f$ radians around +the z-axis. This is the same example as used in the documentation +of \ref matrixbase_exp "exp()". + +\include MatrixFunction.cpp +Output: \verbinclude MatrixFunction.out + +Note that the function \c expfn is defined for complex numbers +\c x, even though the matrix \c A is over the reals. Instead of +\c expfn, we could also have used StdStemFunctions::exp: +\code +A.matrixFunction(StdStemFunctions >::exp, &B); +\endcode + + + +\subsection matrixbase_sin MatrixBase::sin() + +Compute the matrix sine. + +\code +const MatrixFunctionReturnValue MatrixBase::sin() const +\endcode + +\param[in] M a square matrix. +\returns expression representing \f$ \sin(M) \f$. + +This function computes the matrix sine. Use ArrayBase::sin() for computing the entry-wise sine. + +The implementation calls \ref matrixbase_matrixfunction "matrixFunction()" with StdStemFunctions::sin(). + +Example: \include MatrixSine.cpp +Output: \verbinclude MatrixSine.out + + + +\subsection matrixbase_sinh MatrixBase::sinh() + +Compute the matrix hyperbolic sine. + +\code +MatrixFunctionReturnValue MatrixBase::sinh() const +\endcode + +\param[in] M a square matrix. +\returns expression representing \f$ \sinh(M) \f$ + +This function calls \ref matrixbase_matrixfunction "matrixFunction()" with StdStemFunctions::sinh(). 
+ +Example: \include MatrixSinh.cpp +Output: \verbinclude MatrixSinh.out + + +\subsection matrixbase_sqrt MatrixBase::sqrt() + +Compute the matrix square root. + +\code +const MatrixSquareRootReturnValue<Derived> MatrixBase<Derived>::sqrt() const +\endcode + +\param[in] M invertible matrix whose square root is to be computed. +\returns expression representing the matrix square root of \p M. + +The matrix square root of \f$ M \f$ is the matrix \f$ M^{1/2} \f$ +whose square is the original matrix; so if \f$ S = M^{1/2} \f$ then +\f$ S^2 = M \f$. This is different from taking the square root of all +the entries in the matrix; use ArrayBase::sqrt() if you want to do the +latter. + +In the real case, the matrix \f$ M \f$ should be invertible and +it should have no eigenvalues which are real and negative (pairs of +complex conjugate eigenvalues are allowed). In that case, the matrix +has a square root which is also real, and this is the square root +computed by this function. + +The matrix square root is computed by first reducing the matrix to +quasi-triangular form with the real Schur decomposition. The square +root of the quasi-triangular matrix can then be computed directly. The +cost is approximately \f$ 25 n^3 \f$ real flops for the real Schur +decomposition and \f$ 3\frac13 n^3 \f$ real flops for the remainder +(though the computation time in practice is likely more than this +indicates). + +Details of the algorithm can be found in: Nicholas J. Higham, +"Computing real square roots of a real matrix", Linear Algebra +Appl., 88/89:405–430, 1987. + +If the matrix is positive-definite symmetric, then the square +root is also positive-definite symmetric. In this case, it is best to +use SelfAdjointEigenSolver::operatorSqrt() to compute it. + +In the complex case, the matrix \f$ M \f$ should be invertible; +this is a restriction of the algorithm. The square root computed by +this algorithm is the one whose eigenvalues have an argument in the +interval \f$ (-\frac12\pi, \frac12\pi] \f$.
This is the usual branch +cut. + +The computation is the same as in the real case, except that the +complex Schur decomposition is used to reduce the matrix to a +triangular matrix. The theoretical cost is the same. Details are in: +Åke Björck and Sven Hammarling, "A Schur method for the +square root of a matrix", Linear Algebra Appl., +52/53:127–140, 1983. + +Example: The following program checks that the square root of +\f[ \left[ \begin{array}{cc} + \cos(\frac13\pi) & -\sin(\frac13\pi) \\ + \sin(\frac13\pi) & \cos(\frac13\pi) + \end{array} \right], \f] +corresponding to a rotation over 60 degrees, is a rotation over 30 degrees: +\f[ \left[ \begin{array}{cc} + \cos(\frac16\pi) & -\sin(\frac16\pi) \\ + \sin(\frac16\pi) & \cos(\frac16\pi) + \end{array} \right]. \f] + +\include MatrixSquareRoot.cpp +Output: \verbinclude MatrixSquareRoot.out + +\sa class RealSchur, class ComplexSchur, class MatrixSquareRoot, + SelfAdjointEigenSolver::operatorSqrt(). + +*/ + +#endif // EIGEN_MATRIX_FUNCTIONS + diff --git a/src/EigenUnsupported/MoreVectorization b/src/EigenUnsupported/MoreVectorization new file mode 100644 index 0000000..7662b47 --- /dev/null +++ b/src/EigenUnsupported/MoreVectorization @@ -0,0 +1,24 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_MOREVECTORIZATION_MODULE_H +#define EIGEN_MOREVECTORIZATION_MODULE_H + +#include "../../Eigen/Core" + +namespace Eigen { + +/** + * \defgroup MoreVectorization More vectorization module + */ + +} + +#include "src/MoreVectorization/MathFunctions.h" + +#endif // EIGEN_MOREVECTORIZATION_MODULE_H diff --git a/src/EigenUnsupported/NonLinearOptimization b/src/EigenUnsupported/NonLinearOptimization new file mode 100644 index 0000000..961f192 --- /dev/null +++ b/src/EigenUnsupported/NonLinearOptimization @@ -0,0 +1,140 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009 Thomas Capricelli +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_NONLINEAROPTIMIZATION_MODULE +#define EIGEN_NONLINEAROPTIMIZATION_MODULE + +#include + +#include "../../Eigen/Core" +#include "../../Eigen/Jacobi" +#include "../../Eigen/QR" +#include "NumericalDiff" + +/** + * \defgroup NonLinearOptimization_Module Non linear optimization module + * + * \code + * #include + * \endcode + * + * This module provides implementation of two important algorithms in non linear + * optimization. In both cases, we consider a system of non linear functions. Of + * course, this should work, and even work very well if those functions are + * actually linear. But if this is so, you should probably better use other + * methods more fitted to this special case. + * + * One algorithm allows to find a least-squares solution of such a system + * (Levenberg-Marquardt algorithm) and the second one is used to find + * a zero for the system (Powell hybrid "dogleg" method). + * + * This code is a port of minpack (http://en.wikipedia.org/wiki/MINPACK). + * Minpack is a very famous, old, robust and well renowned package, written in + * fortran. 
Those implementations have been carefully tuned, tested, and used + * for several decades. + * + * The original fortran code was automatically translated using f2c (http://en.wikipedia.org/wiki/F2c) in C, + * then c++, and then cleaned by several different authors. + * The last one of those cleanings being our starting point : + * http://devernay.free.fr/hacks/cminpack.html + * + * Finally, we ported this code to Eigen, creating classes and API + * coherent with Eigen. When possible, we switched to Eigen + * implementation, such as most linear algebra (vectors, matrices, stable norms). + * + * Doing so, we were very careful to check the tests we set up at the very + * beginning, which ensure that the same results are found. + * + * \section Tests Tests + * + * The tests are placed in the file unsupported/test/NonLinearOptimization.cpp. + * + * There are two kinds of tests : those that come from examples bundled with cminpack. + * They guarantee we get the same results as the original algorithms (value for 'x', + * for the number of evaluations of the function, and for the number of evaluations + * of the Jacobian if ever). + * + * Other tests were added by myself at the very beginning of the + * process and check the results for Levenberg-Marquardt using the reference data + * on http://www.itl.nist.gov/div898/strd/nls/nls_main.shtml. Since then I've + * carefully checked that the same results were obtained when modifying the + * code. Please note that we do not always get the exact same decimals as they do, + * but this is ok : they use 128-bit floats, and we do the tests using the C type 'double', + * which is 64 bits on most platforms (x86 and amd64, at least). + * I've performed those tests on several other implementations of Levenberg-Marquardt, and + * (c)minpack performs VERY well compared to those, both in accuracy and speed.
+ * + * The documentation for running the tests is on the wiki + * http://eigen.tuxfamily.org/index.php?title=Tests + * + * \section API API: overview of methods + * + * Both algorithms need a functor computing the Jacobian. It can be computed by + * hand, using auto-differentiation (see \ref AutoDiff_Module), or using numerical + * differences (see \ref NumericalDiff_Module). For instance: + *\code + * MyFunc func; + * NumericalDiff<MyFunc> func_with_num_diff(func); + * LevenbergMarquardt<NumericalDiff<MyFunc> > lm(func_with_num_diff); + * \endcode + * For HybridNonLinearSolver, the method solveNumericalDiff() does the above wrapping for + * you. + * + * The methods LevenbergMarquardt.lmder1()/lmdif1()/lmstr1() and + * HybridNonLinearSolver.hybrj1()/hybrd1() are specific methods from the original + * minpack package that you probably should NOT use unless you are porting a code that + * was previously using minpack. They just define a 'simple' API with default values + * for some parameters. + * + * All algorithms are provided using two APIs : + * - one where the user inits the algorithm, and uses '*OneStep()' as much as he wants : + * this way the caller has control over the steps + * - one where the user just calls a method (minimize() or solve()) which will + * handle the loop: init + loop until a stop condition is met. Those are provided for + * convenience. + * + * As an example, the method LevenbergMarquardt::minimize() is + * implemented as follows: + * \code + * Status LevenbergMarquardt<FunctorType,Scalar>::minimize(FVectorType &x, const int mode) + * { + * Status status = minimizeInit(x, mode); + * do { + * status = minimizeOneStep(x, mode); + * } while (status==Running); + * return status; + * } + * \endcode + * + * \section examples Examples + * + * The easiest way to understand how to use this module is by looking at the many examples in the file + * unsupported/test/NonLinearOptimization.cpp.
+ */ + +#ifndef EIGEN_PARSED_BY_DOXYGEN + +#include "src/NonLinearOptimization/qrsolv.h" +#include "src/NonLinearOptimization/r1updt.h" +#include "src/NonLinearOptimization/r1mpyq.h" +#include "src/NonLinearOptimization/rwupdt.h" +#include "src/NonLinearOptimization/fdjac1.h" +#include "src/NonLinearOptimization/lmpar.h" +#include "src/NonLinearOptimization/dogleg.h" +#include "src/NonLinearOptimization/covar.h" + +#include "src/NonLinearOptimization/chkder.h" + +#endif + +#include "src/NonLinearOptimization/HybridNonLinearSolver.h" +#include "src/NonLinearOptimization/LevenbergMarquardt.h" + + +#endif // EIGEN_NONLINEAROPTIMIZATION_MODULE diff --git a/src/EigenUnsupported/NumericalDiff b/src/EigenUnsupported/NumericalDiff new file mode 100644 index 0000000..0668f96 --- /dev/null +++ b/src/EigenUnsupported/NumericalDiff @@ -0,0 +1,56 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009 Thomas Capricelli +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_NUMERICALDIFF_MODULE +#define EIGEN_NUMERICALDIFF_MODULE + +#include "../../Eigen/Core" + +namespace Eigen { + +/** + * \defgroup NumericalDiff_Module Numerical differentiation module + * + * \code + * #include + * \endcode + * + * See http://en.wikipedia.org/wiki/Numerical_differentiation + * + * Warning : this should NOT be confused with automatic differentiation, which + * is a different method and has its own module in Eigen : \ref + * AutoDiff_Module. + * + * Currently only "Forward" and "Central" schemes are implemented. Those + * are basic methods, and there exist some more elaborated way of + * computing such approximates. They are implemented using both + * proprietary and free software, and usually requires linking to an + * external library. 
It is very easy for you to write a functor + * using such software, and the purpose is quite orthogonal to what we + * want to achieve with Eigen. + * + * This is why we will not provide wrappers for every great numerical + * differentiation software that exist, but should rather stick with those + * basic ones, that still are useful for testing. + * + * Also, the \ref NonLinearOptimization_Module needs this in order to + * provide full features compatibility with the original (c)minpack + * package. + * + */ +} + +//@{ + +#include "src/NumericalDiff/NumericalDiff.h" + +//@} + + +#endif // EIGEN_NUMERICALDIFF_MODULE diff --git a/src/EigenUnsupported/OpenGLSupport b/src/EigenUnsupported/OpenGLSupport new file mode 100644 index 0000000..f8c2130 --- /dev/null +++ b/src/EigenUnsupported/OpenGLSupport @@ -0,0 +1,322 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2010 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_OPENGL_MODULE +#define EIGEN_OPENGL_MODULE + +#include "../../Eigen/Geometry" + +#if defined(__APPLE_CC__) + #include +#else + #include +#endif + +namespace Eigen { + +/** + * \defgroup OpenGLSUpport_Module OpenGL Support module + * + * This module provides wrapper functions for a couple of OpenGL functions + * which simplify the way to pass Eigen's object to openGL. + * Here is an example: + * + * \code + * // You need to add path_to_eigen/unsupported to your include path. + * #include + * // ... + * Vector3f x, y; + * Matrix3f rot; + * + * glVertex(y + x * rot); + * + * Quaternion q; + * glRotate(q); + * + * // ... 
+ * \endcode + * + */ +//@{ + +#define EIGEN_GL_FUNC_DECLARATION(FUNC) \ +namespace internal { \ + template< typename XprType, \ + typename Scalar = typename XprType::Scalar, \ + int Rows = XprType::RowsAtCompileTime, \ + int Cols = XprType::ColsAtCompileTime, \ + bool IsGLCompatible = bool(internal::evaluator::Flags&LinearAccessBit) \ + && bool(XprType::Flags&DirectAccessBit) \ + && (XprType::IsVectorAtCompileTime || (XprType::Flags&RowMajorBit)==0)> \ + struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl); \ + \ + template \ + struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl) { \ + inline static void run(const XprType& p) { \ + EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)::type>::run(p); } \ + }; \ +} \ + \ +template inline void FUNC(const Eigen::DenseBase& p) { \ + EIGEN_CAT(EIGEN_CAT(internal::gl_,FUNC),_impl)::run(p.derived()); \ +} + + +#define EIGEN_GL_FUNC_SPECIALIZATION_MAT(FUNC,SCALAR,ROWS,COLS,SUFFIX) \ +namespace internal { \ + template< typename XprType> struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl) { \ + inline static void run(const XprType& p) { FUNC##SUFFIX(p.data()); } \ + }; \ +} + + +#define EIGEN_GL_FUNC_SPECIALIZATION_VEC(FUNC,SCALAR,SIZE,SUFFIX) \ +namespace internal { \ + template< typename XprType> struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl) { \ + inline static void run(const XprType& p) { FUNC##SUFFIX(p.data()); } \ + }; \ + template< typename XprType> struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl) { \ + inline static void run(const XprType& p) { FUNC##SUFFIX(p.data()); } \ + }; \ +} + + +EIGEN_GL_FUNC_DECLARATION (glVertex) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,int, 2,2iv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,short, 2,2sv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,float, 2,2fv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,double, 2,2dv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,int, 3,3iv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,short, 3,3sv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,float, 3,3fv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,double, 
3,3dv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,int, 4,4iv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,short, 4,4sv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,float, 4,4fv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,double, 4,4dv) + +EIGEN_GL_FUNC_DECLARATION (glTexCoord) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,int, 2,2iv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,short, 2,2sv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,float, 2,2fv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,double, 2,2dv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,int, 3,3iv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,short, 3,3sv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,float, 3,3fv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,double, 3,3dv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,int, 4,4iv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,short, 4,4sv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,float, 4,4fv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,double, 4,4dv) + +EIGEN_GL_FUNC_DECLARATION (glColor) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,int, 2,2iv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,short, 2,2sv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,float, 2,2fv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,double, 2,2dv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,int, 3,3iv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,short, 3,3sv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,float, 3,3fv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,double, 3,3dv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,int, 4,4iv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,short, 4,4sv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,float, 4,4fv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,double, 4,4dv) + +EIGEN_GL_FUNC_DECLARATION (glNormal) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glNormal,int, 3,3iv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glNormal,short, 3,3sv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glNormal,float, 3,3fv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glNormal,double, 3,3dv) + +inline void glScale2fv(const float* v) { 
glScalef(v[0], v[1], 1.f); } +inline void glScale2dv(const double* v) { glScaled(v[0], v[1], 1.0); } +inline void glScale3fv(const float* v) { glScalef(v[0], v[1], v[2]); } +inline void glScale3dv(const double* v) { glScaled(v[0], v[1], v[2]); } + +EIGEN_GL_FUNC_DECLARATION (glScale) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glScale,float, 2,2fv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glScale,double, 2,2dv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glScale,float, 3,3fv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glScale,double, 3,3dv) + +template void glScale(const UniformScaling& s) { glScale(Matrix::Constant(s.factor())); } + +inline void glTranslate2fv(const float* v) { glTranslatef(v[0], v[1], 0.f); } +inline void glTranslate2dv(const double* v) { glTranslated(v[0], v[1], 0.0); } +inline void glTranslate3fv(const float* v) { glTranslatef(v[0], v[1], v[2]); } +inline void glTranslate3dv(const double* v) { glTranslated(v[0], v[1], v[2]); } + +EIGEN_GL_FUNC_DECLARATION (glTranslate) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTranslate,float, 2,2fv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTranslate,double, 2,2dv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTranslate,float, 3,3fv) +EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTranslate,double, 3,3dv) + +template void glTranslate(const Translation& t) { glTranslate(t.vector()); } +template void glTranslate(const Translation& t) { glTranslate(t.vector()); } + +EIGEN_GL_FUNC_DECLARATION (glMultMatrix) +EIGEN_GL_FUNC_SPECIALIZATION_MAT(glMultMatrix,float, 4,4,f) +EIGEN_GL_FUNC_SPECIALIZATION_MAT(glMultMatrix,double, 4,4,d) + +template void glMultMatrix(const Transform& t) { glMultMatrix(t.matrix()); } +template void glMultMatrix(const Transform& t) { glMultMatrix(t.matrix()); } +template void glMultMatrix(const Transform& t) { glMultMatrix(Transform(t).matrix()); } + +EIGEN_GL_FUNC_DECLARATION (glLoadMatrix) +EIGEN_GL_FUNC_SPECIALIZATION_MAT(glLoadMatrix,float, 4,4,f) +EIGEN_GL_FUNC_SPECIALIZATION_MAT(glLoadMatrix,double, 4,4,d) + +template void glLoadMatrix(const Transform& t) 
{ glLoadMatrix(t.matrix()); } +template void glLoadMatrix(const Transform& t) { glLoadMatrix(t.matrix()); } +template void glLoadMatrix(const Transform& t) { glLoadMatrix(Transform(t).matrix()); } + +inline void glRotate(const Rotation2D& rot) +{ + glRotatef(rot.angle()*180.f/float(EIGEN_PI), 0.f, 0.f, 1.f); +} +inline void glRotate(const Rotation2D& rot) +{ + glRotated(rot.angle()*180.0/double(EIGEN_PI), 0.0, 0.0, 1.0); +} + +template void glRotate(const RotationBase& rot) +{ + Transform tr(rot); + glMultMatrix(tr.matrix()); +} + +#define EIGEN_GL_MAKE_CONST_const const +#define EIGEN_GL_MAKE_CONST__ +#define EIGEN_GL_EVAL(X) X + +#define EIGEN_GL_FUNC1_DECLARATION(FUNC,ARG1,CONST) \ +namespace internal { \ + template< typename XprType, \ + typename Scalar = typename XprType::Scalar, \ + int Rows = XprType::RowsAtCompileTime, \ + int Cols = XprType::ColsAtCompileTime, \ + bool IsGLCompatible = bool(internal::evaluator::Flags&LinearAccessBit) \ + && bool(XprType::Flags&DirectAccessBit) \ + && (XprType::IsVectorAtCompileTime || (XprType::Flags&RowMajorBit)==0)> \ + struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl); \ + \ + template \ + struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl) { \ + inline static void run(ARG1 a,EIGEN_GL_EVAL(EIGEN_GL_MAKE_CONST_##CONST) XprType& p) { \ + EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)::type>::run(a,p); } \ + }; \ +} \ + \ +template inline void FUNC(ARG1 a,EIGEN_GL_EVAL(EIGEN_GL_MAKE_CONST_##CONST) Eigen::DenseBase& p) { \ + EIGEN_CAT(EIGEN_CAT(internal::gl_,FUNC),_impl)::run(a,p.derived()); \ +} + + +#define EIGEN_GL_FUNC1_SPECIALIZATION_MAT(FUNC,ARG1,CONST,SCALAR,ROWS,COLS,SUFFIX) \ +namespace internal { \ + template< typename XprType> struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl) { \ + inline static void run(ARG1 a, EIGEN_GL_EVAL(EIGEN_GL_MAKE_CONST_##CONST) XprType& p) { FUNC##SUFFIX(a,p.data()); } \ + }; \ +} + + +#define EIGEN_GL_FUNC1_SPECIALIZATION_VEC(FUNC,ARG1,CONST,SCALAR,SIZE,SUFFIX) \ +namespace internal { \ + template< typename 
XprType> struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl) { \ + inline static void run(ARG1 a, EIGEN_GL_EVAL(EIGEN_GL_MAKE_CONST_##CONST) XprType& p) { FUNC##SUFFIX(a,p.data()); } \ + }; \ + template< typename XprType> struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl) { \ + inline static void run(ARG1 a, EIGEN_GL_EVAL(EIGEN_GL_MAKE_CONST_##CONST) XprType& p) { FUNC##SUFFIX(a,p.data()); } \ + }; \ +} + +EIGEN_GL_FUNC1_DECLARATION (glGet,GLenum,_) +EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glGet,GLenum,_,float, 4,4,Floatv) +EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glGet,GLenum,_,double, 4,4,Doublev) + +// glUniform API + +#ifdef GL_VERSION_2_0 + +inline void glUniform2fv_ei (GLint loc, const float* v) { glUniform2fv(loc,1,v); } +inline void glUniform2iv_ei (GLint loc, const int* v) { glUniform2iv(loc,1,v); } + +inline void glUniform3fv_ei (GLint loc, const float* v) { glUniform3fv(loc,1,v); } +inline void glUniform3iv_ei (GLint loc, const int* v) { glUniform3iv(loc,1,v); } + +inline void glUniform4fv_ei (GLint loc, const float* v) { glUniform4fv(loc,1,v); } +inline void glUniform4iv_ei (GLint loc, const int* v) { glUniform4iv(loc,1,v); } + +inline void glUniformMatrix2fv_ei (GLint loc, const float* v) { glUniformMatrix2fv(loc,1,false,v); } +inline void glUniformMatrix3fv_ei (GLint loc, const float* v) { glUniformMatrix3fv(loc,1,false,v); } +inline void glUniformMatrix4fv_ei (GLint loc, const float* v) { glUniformMatrix4fv(loc,1,false,v); } + + +EIGEN_GL_FUNC1_DECLARATION (glUniform,GLint,const) +EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,float, 2,2fv_ei) +EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,int, 2,2iv_ei) +EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,float, 3,3fv_ei) +EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,int, 3,3iv_ei) +EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,float, 4,4fv_ei) +EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,int, 4,4iv_ei) + +EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float, 
2,2,Matrix2fv_ei) +EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float, 3,3,Matrix3fv_ei) +EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float, 4,4,Matrix4fv_ei) + +#endif + +#ifdef GL_VERSION_2_1 + +inline void glUniformMatrix2x3fv_ei(GLint loc, const float* v) { glUniformMatrix2x3fv(loc,1,false,v); } +inline void glUniformMatrix3x2fv_ei(GLint loc, const float* v) { glUniformMatrix3x2fv(loc,1,false,v); } +inline void glUniformMatrix2x4fv_ei(GLint loc, const float* v) { glUniformMatrix2x4fv(loc,1,false,v); } +inline void glUniformMatrix4x2fv_ei(GLint loc, const float* v) { glUniformMatrix4x2fv(loc,1,false,v); } +inline void glUniformMatrix3x4fv_ei(GLint loc, const float* v) { glUniformMatrix3x4fv(loc,1,false,v); } +inline void glUniformMatrix4x3fv_ei(GLint loc, const float* v) { glUniformMatrix4x3fv(loc,1,false,v); } + +EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float, 2,3,Matrix2x3fv_ei) +EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float, 3,2,Matrix3x2fv_ei) +EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float, 2,4,Matrix2x4fv_ei) +EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float, 4,2,Matrix4x2fv_ei) +EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float, 3,4,Matrix3x4fv_ei) +EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float, 4,3,Matrix4x3fv_ei) + +#endif + +#ifdef GL_VERSION_3_0 + +inline void glUniform2uiv_ei (GLint loc, const unsigned int* v) { glUniform2uiv(loc,1,v); } +inline void glUniform3uiv_ei (GLint loc, const unsigned int* v) { glUniform3uiv(loc,1,v); } +inline void glUniform4uiv_ei (GLint loc, const unsigned int* v) { glUniform4uiv(loc,1,v); } + +EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,unsigned int, 2,2uiv_ei) +EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,unsigned int, 3,3uiv_ei) +EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,unsigned int, 4,4uiv_ei) + +#endif + +#ifdef GL_ARB_gpu_shader_fp64 +inline void glUniform2dv_ei (GLint 
loc, const double* v) { glUniform2dv(loc,1,v); } +inline void glUniform3dv_ei (GLint loc, const double* v) { glUniform3dv(loc,1,v); } +inline void glUniform4dv_ei (GLint loc, const double* v) { glUniform4dv(loc,1,v); } + +EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,double, 2,2dv_ei) +EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,double, 3,3dv_ei) +EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,double, 4,4dv_ei) +#endif + + +//@} + +} + +#endif // EIGEN_OPENGL_MODULE diff --git a/src/EigenUnsupported/Polynomials b/src/EigenUnsupported/Polynomials new file mode 100644 index 0000000..32ce2a2 --- /dev/null +++ b/src/EigenUnsupported/Polynomials @@ -0,0 +1,137 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_POLYNOMIALS_MODULE_H +#define EIGEN_POLYNOMIALS_MODULE_H + +#include "../../Eigen/Core" + +#include "../../Eigen/Eigenvalues" + +#include "../../Eigen/src/Core/util/DisableStupidWarnings.h" + +// Note that EIGEN_HIDE_HEAVY_CODE has to be defined per module +#if (defined EIGEN_EXTERN_INSTANTIATIONS) && (EIGEN_EXTERN_INSTANTIATIONS>=2) + #ifndef EIGEN_HIDE_HEAVY_CODE + #define EIGEN_HIDE_HEAVY_CODE + #endif +#elif defined EIGEN_HIDE_HEAVY_CODE + #undef EIGEN_HIDE_HEAVY_CODE +#endif + +/** + * \defgroup Polynomials_Module Polynomials module + * \brief This module provides a QR based polynomial solver. + * + * To use this module, add + * \code + * #include + * \endcode + * at the start of your source file. + */ + +#include "src/Polynomials/PolynomialUtils.h" +#include "src/Polynomials/Companion.h" +#include "src/Polynomials/PolynomialSolver.h" + +/** + \page polynomials Polynomials defines functions for dealing with polynomials + and a QR based polynomial solver. 
+ \ingroup Polynomials_Module + + The remainder of the page first documents the functions for evaluating and computing + polynomials and for computing estimates about polynomials, and then the QR based polynomial + solver. + + \section polynomialUtils convenient functions to deal with polynomials + \subsection roots_to_monicPolynomial + The function + \code + void roots_to_monicPolynomial( const RootVector& rv, Polynomial& poly ) + \endcode + computes the coefficients \f$ a_i \f$ of + + \f$ p(x) = a_0 + a_{1}x + ... + a_{n-1}x^{n-1} + x^n \f$ + + where \f$ p \f$ is known through its roots i.e. \f$ p(x) = (x-r_1)(x-r_2)...(x-r_n) \f$. + + \subsection poly_eval + The function + \code + T poly_eval( const Polynomials& poly, const T& x ) + \endcode + evaluates a polynomial at a given point using a stabilized Horner method. + + The following code first computes the coefficients in the monomial basis of the monic polynomial that has the provided roots; + then, it evaluates the computed polynomial, using a stabilized Horner method. + + \include PolynomialUtils1.cpp + Output: \verbinclude PolynomialUtils1.out + + \subsection Cauchy bounds + The function + \code + Real cauchy_max_bound( const Polynomial& poly ) + \endcode + provides a maximum bound (the Cauchy one: \f$C(p)\f$) for the absolute value of a root of the given polynomial i.e. + \f$ \forall r_i \f$ root of \f$ p(x) = \sum_{k=0}^d a_k x^k \f$, + \f$ |r_i| \le C(p) = \sum_{k=0}^{d} \left | \frac{a_k}{a_d} \right | \f$ + The leading coefficient of \f$ p \f$ should be non zero: \f$a_d \neq 0\f$. + + + The function + \code + Real cauchy_min_bound( const Polynomial& poly ) + \endcode + provides a minimum bound (the Cauchy one: \f$c(p)\f$) for the absolute value of a non zero root of the given polynomial i.e.
+ \f$ \forall r_i \neq 0 \f$ root of \f$ p(x) = \sum_{k=0}^d a_k x^k \f$, + \f$ |r_i| \ge c(p) = \left( \sum_{k=0}^{d} \left | \frac{a_k}{a_0} \right | \right)^{-1} \f$ + + + + + \section QR polynomial solver class + Computes the complex roots of a polynomial by computing the eigenvalues of the associated companion matrix with the QR algorithm. + + The roots of \f$ p(x) = a_0 + a_1 x + a_2 x^2 + a_{3} x^3 + x^4 \f$ are the eigenvalues of + \f$ + \left [ + \begin{array}{cccc} + 0 & 0 & 0 & -a_0 \\ + 1 & 0 & 0 & -a_1 \\ + 0 & 1 & 0 & -a_2 \\ + 0 & 0 & 1 & -a_3 + \end{array} \right ] + \f$ + + However, the QR algorithm is not guaranteed to converge when there are several eigenvalues with the same modulus. + + Therefore the current polynomial solver is guaranteed to provide a correct result only when the complex roots \f$r_1,r_2,...,r_d\f$ have distinct moduli i.e. + + \f$ \forall i,j \in [1;d],~ i \neq j,~ \| r_i \| \neq \| r_j \| \f$. + + With 32bit (float) floating types this problem shows up frequently. + However, almost always, correct accuracy is reached even in these cases for 64bit + (double) floating types and small polynomial degree (<20). + + \include PolynomialSolver1.cpp + + In the above example: + + -# a simple use of the polynomial solver is shown; + -# the accuracy problem with the QR algorithm is presented: a polynomial with almost conjugate roots is provided to the solver. + Those roots have almost the same modulus, therefore the QR algorithm fails to converge: the accuracy + of the last root is bad; + -# a simple way to circumvent the problem is shown: use doubles instead of floats.
+ + Output: \verbinclude PolynomialSolver1.out +*/ + +#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h" + +#endif // EIGEN_POLYNOMIALS_MODULE_H diff --git a/src/EigenUnsupported/Skyline b/src/EigenUnsupported/Skyline new file mode 100644 index 0000000..ebdf143 --- /dev/null +++ b/src/EigenUnsupported/Skyline @@ -0,0 +1,39 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SKYLINE_MODULE_H +#define EIGEN_SKYLINE_MODULE_H + + +#include "../../Eigen/Core" + +#include "../../Eigen/src/Core/util/DisableStupidWarnings.h" + +#include +#include +#include +#include + +/** + * \defgroup Skyline_Module Skyline module + * + * + * + * + */ + +#include "src/Skyline/SkylineUtil.h" +#include "src/Skyline/SkylineMatrixBase.h" +#include "src/Skyline/SkylineStorage.h" +#include "src/Skyline/SkylineMatrix.h" +#include "src/Skyline/SkylineInplaceLU.h" +#include "src/Skyline/SkylineProduct.h" + +#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h" + +#endif // EIGEN_SKYLINE_MODULE_H diff --git a/src/EigenUnsupported/SparseExtra b/src/EigenUnsupported/SparseExtra new file mode 100644 index 0000000..ba5cbd6 --- /dev/null +++ b/src/EigenUnsupported/SparseExtra @@ -0,0 +1,54 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2009 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_SPARSE_EXTRA_MODULE_H +#define EIGEN_SPARSE_EXTRA_MODULE_H + +#include "../../Eigen/Sparse" + +#include "../../Eigen/src/Core/util/DisableStupidWarnings.h" + +#include +#include +#include +#include +#include +#include +#include + +#ifdef EIGEN_GOOGLEHASH_SUPPORT + #include + #include +#endif + +/** + * \defgroup SparseExtra_Module SparseExtra module + * + * This module contains some experimental features extending the sparse module. + * + * \code + * #include + * \endcode + */ + + +#include "src/SparseExtra/DynamicSparseMatrix.h" +#include "src/SparseExtra/BlockOfDynamicSparseMatrix.h" +#include "src/SparseExtra/RandomSetter.h" + +#include "src/SparseExtra/MarketIO.h" + +#if !defined(_WIN32) +#include +#include "src/SparseExtra/MatrixMarketIterator.h" +#endif + +#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h" + +#endif // EIGEN_SPARSE_EXTRA_MODULE_H diff --git a/src/EigenUnsupported/SpecialFunctions b/src/EigenUnsupported/SpecialFunctions new file mode 100644 index 0000000..f6a2460 --- /dev/null +++ b/src/EigenUnsupported/SpecialFunctions @@ -0,0 +1,103 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SPECIALFUNCTIONS_MODULE +#define EIGEN_SPECIALFUNCTIONS_MODULE + +#include + +#include "../../Eigen/Core" + +#include "../../Eigen/src/Core/util/DisableStupidWarnings.h" + +namespace Eigen { + +/** + * \defgroup SpecialFunctions_Module Special math functions module + * + * This module features additional coefficient-wise math functions available + * within the numext:: namespace for the scalar version, and as method and/or free + * functions of Array. 
Those include: + * + * - erf + * - erfc + * - lgamma + * - igamma + * - igamma_der_a + * - gamma_sample_der_alpha + * - igammac + * - digamma + * - ndtri + * - polygamma + * - zeta + * - betainc + * + * Bessel Functions + * - bessel_i0 + * - bessel_i0e + * - bessel_i1 + * - bessel_i1e + * - bessel_j0 + * - bessel_j1 + * - bessel_k0 + * - bessel_k0e + * - bessel_k1 + * - bessel_k1e + * - bessel_y0 + * - bessel_y1 + * + * \code + * #include + * \endcode + */ +//@{ + +} + +#include "src/SpecialFunctions/BesselFunctionsImpl.h" +#include "src/SpecialFunctions/BesselFunctionsBFloat16.h" +#include "src/SpecialFunctions/BesselFunctionsHalf.h" +#include "src/SpecialFunctions/BesselFunctionsPacketMath.h" +#include "src/SpecialFunctions/BesselFunctionsFunctors.h" +#include "src/SpecialFunctions/BesselFunctionsArrayAPI.h" +#include "src/SpecialFunctions/SpecialFunctionsImpl.h" +#if defined(EIGEN_HIPCC) +#include "src/SpecialFunctions/HipVectorCompatibility.h" +#endif +#include "src/SpecialFunctions/SpecialFunctionsBFloat16.h" +#include "src/SpecialFunctions/SpecialFunctionsHalf.h" +#include "src/SpecialFunctions/SpecialFunctionsPacketMath.h" +#include "src/SpecialFunctions/SpecialFunctionsFunctors.h" +#include "src/SpecialFunctions/SpecialFunctionsArrayAPI.h" + +#if defined EIGEN_VECTORIZE_AVX512 + #include "src/SpecialFunctions/arch/AVX/BesselFunctions.h" + #include "src/SpecialFunctions/arch/AVX/SpecialFunctions.h" + #include "src/SpecialFunctions/arch/AVX512/BesselFunctions.h" + #include "src/SpecialFunctions/arch/AVX512/SpecialFunctions.h" +#elif defined EIGEN_VECTORIZE_AVX + #include "src/SpecialFunctions/arch/AVX/BesselFunctions.h" + #include "src/SpecialFunctions/arch/AVX/SpecialFunctions.h" +#elif defined EIGEN_VECTORIZE_NEON + #include "src/SpecialFunctions/arch/NEON/BesselFunctions.h" + #include "src/SpecialFunctions/arch/NEON/SpecialFunctions.h" +#endif + +#if defined EIGEN_VECTORIZE_GPU + #include "src/SpecialFunctions/arch/GPU/SpecialFunctions.h" +#endif + 
+namespace Eigen { +//@} +} + + +#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h" + +#endif // EIGEN_SPECIALFUNCTIONS_MODULE diff --git a/src/EigenUnsupported/Splines b/src/EigenUnsupported/Splines new file mode 100644 index 0000000..2ca5813 --- /dev/null +++ b/src/EigenUnsupported/Splines @@ -0,0 +1,35 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 20010-2011 Hauke Heibel +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SPLINES_MODULE_H +#define EIGEN_SPLINES_MODULE_H + +namespace Eigen +{ +/** + * \defgroup Splines_Module Spline and spline fitting module + * + * This module provides a simple multi-dimensional spline class while + * offering most basic functionality to fit a spline to point sets. + * + * \code + * #include + * \endcode + */ +} + +#include "../../Eigen/src/Core/util/DisableStupidWarnings.h" + +#include "src/Splines/SplineFwd.h" +#include "src/Splines/Spline.h" +#include "src/Splines/SplineFitting.h" + +#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h" + +#endif // EIGEN_SPLINES_MODULE_H diff --git a/src/EigenUnsupported/src/AutoDiff/AutoDiffJacobian.h b/src/EigenUnsupported/src/AutoDiff/AutoDiffJacobian.h new file mode 100644 index 0000000..33b6c39 --- /dev/null +++ b/src/EigenUnsupported/src/AutoDiff/AutoDiffJacobian.h @@ -0,0 +1,108 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_AUTODIFF_JACOBIAN_H +#define EIGEN_AUTODIFF_JACOBIAN_H + +namespace Eigen +{ + +template class AutoDiffJacobian : public Functor +{ +public: + AutoDiffJacobian() : Functor() {} + AutoDiffJacobian(const Functor& f) : Functor(f) {} + + // forward constructors +#if EIGEN_HAS_VARIADIC_TEMPLATES + template + AutoDiffJacobian(const T& ...Values) : Functor(Values...) {} +#else + template + AutoDiffJacobian(const T0& a0) : Functor(a0) {} + template + AutoDiffJacobian(const T0& a0, const T1& a1) : Functor(a0, a1) {} + template + AutoDiffJacobian(const T0& a0, const T1& a1, const T2& a2) : Functor(a0, a1, a2) {} +#endif + + typedef typename Functor::InputType InputType; + typedef typename Functor::ValueType ValueType; + typedef typename ValueType::Scalar Scalar; + + enum { + InputsAtCompileTime = InputType::RowsAtCompileTime, + ValuesAtCompileTime = ValueType::RowsAtCompileTime + }; + + typedef Matrix JacobianType; + typedef typename JacobianType::Index Index; + + typedef Matrix DerivativeType; + typedef AutoDiffScalar ActiveScalar; + + typedef Matrix ActiveInput; + typedef Matrix ActiveValue; + +#if EIGEN_HAS_VARIADIC_TEMPLATES + // Some compilers don't accept variadic parameters after a default parameter, + // i.e., we can't just write _jac=0 but we need to overload operator(): + EIGEN_STRONG_INLINE + void operator() (const InputType& x, ValueType* v) const + { + this->operator()(x, v, 0); + } + template + void operator() (const InputType& x, ValueType* v, JacobianType* _jac, + const ParamsType&... 
Params) const +#else + void operator() (const InputType& x, ValueType* v, JacobianType* _jac=0) const +#endif + { + eigen_assert(v!=0); + + if (!_jac) + { +#if EIGEN_HAS_VARIADIC_TEMPLATES + Functor::operator()(x, v, Params...); +#else + Functor::operator()(x, v); +#endif + return; + } + + JacobianType& jac = *_jac; + + ActiveInput ax = x.template cast(); + ActiveValue av(jac.rows()); + + if(InputsAtCompileTime==Dynamic) + for (Index j=0; j +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_AUTODIFF_SCALAR_H +#define EIGEN_AUTODIFF_SCALAR_H + +namespace Eigen { + +namespace internal { + +template +struct make_coherent_impl { + static void run(A&, B&) {} +}; + +// resize a to match b is a.size()==0, and conversely. +template +void make_coherent(const A& a, const B&b) +{ + make_coherent_impl::run(a.const_cast_derived(), b.const_cast_derived()); +} + +template struct auto_diff_special_op; + +} // end namespace internal + +template class AutoDiffScalar; + +template +inline AutoDiffScalar MakeAutoDiffScalar(const typename NewDerType::Scalar& value, const NewDerType &der) { + return AutoDiffScalar(value,der); +} + +/** \class AutoDiffScalar + * \brief A scalar type replacement with automatic differentiation capability + * + * \param DerivativeType the vector type used to store/represent the derivatives. The base scalar type + * as well as the number of derivatives to compute are determined from this type. + * Typical choices include, e.g., \c Vector4f for 4 derivatives, or \c VectorXf + * if the number of derivatives is not known at compile time, and/or, the number + * of derivatives is large. + * Note that DerivativeType can also be a reference (e.g., \c VectorXf&) to wrap a + * existing vector into an AutoDiffScalar. + * Finally, DerivativeType can also be any Eigen compatible expression. 
+ * + * This class represents a scalar value while tracking its respective derivatives using Eigen's expression + * template mechanism. + * + * It supports the following list of global math function: + * - std::abs, std::sqrt, std::pow, std::exp, std::log, std::sin, std::cos, + * - internal::abs, internal::sqrt, numext::pow, internal::exp, internal::log, internal::sin, internal::cos, + * - internal::conj, internal::real, internal::imag, numext::abs2. + * + * AutoDiffScalar can be used as the scalar type of an Eigen::Matrix object. However, + * in that case, the expression template mechanism only occurs at the top Matrix level, + * while derivatives are computed right away. + * + */ + +template +class AutoDiffScalar + : public internal::auto_diff_special_op + ::type>::Scalar, + typename NumTraits::type>::Scalar>::Real>::value> +{ + public: + typedef internal::auto_diff_special_op + ::type>::Scalar, + typename NumTraits::type>::Scalar>::Real>::value> Base; + typedef typename internal::remove_all::type DerType; + typedef typename internal::traits::Scalar Scalar; + typedef typename NumTraits::Real Real; + + using Base::operator+; + using Base::operator*; + + /** Default constructor without any initialization. */ + AutoDiffScalar() {} + + /** Constructs an active scalar from its \a value, + and initializes the \a nbDer derivatives such that it corresponds to the \a derNumber -th variable */ + AutoDiffScalar(const Scalar& value, int nbDer, int derNumber) + : m_value(value), m_derivatives(DerType::Zero(nbDer)) + { + m_derivatives.coeffRef(derNumber) = Scalar(1); + } + + /** Conversion from a scalar constant to an active scalar. + * The derivatives are set to zero. 
*/ + /*explicit*/ AutoDiffScalar(const Real& value) + : m_value(value) + { + if(m_derivatives.size()>0) + m_derivatives.setZero(); + } + + /** Constructs an active scalar from its \a value and derivatives \a der */ + AutoDiffScalar(const Scalar& value, const DerType& der) + : m_value(value), m_derivatives(der) + {} + + template + AutoDiffScalar(const AutoDiffScalar& other +#ifndef EIGEN_PARSED_BY_DOXYGEN + , typename internal::enable_if< + internal::is_same::type>::Scalar>::value + && internal::is_convertible::value , void*>::type = 0 +#endif + ) + : m_value(other.value()), m_derivatives(other.derivatives()) + {} + + friend std::ostream & operator << (std::ostream & s, const AutoDiffScalar& a) + { + return s << a.value(); + } + + AutoDiffScalar(const AutoDiffScalar& other) + : m_value(other.value()), m_derivatives(other.derivatives()) + {} + + template + inline AutoDiffScalar& operator=(const AutoDiffScalar& other) + { + m_value = other.value(); + m_derivatives = other.derivatives(); + return *this; + } + + inline AutoDiffScalar& operator=(const AutoDiffScalar& other) + { + m_value = other.value(); + m_derivatives = other.derivatives(); + return *this; + } + + inline AutoDiffScalar& operator=(const Scalar& other) + { + m_value = other; + if(m_derivatives.size()>0) + m_derivatives.setZero(); + return *this; + } + +// inline operator const Scalar& () const { return m_value; } +// inline operator Scalar& () { return m_value; } + + inline const Scalar& value() const { return m_value; } + inline Scalar& value() { return m_value; } + + inline const DerType& derivatives() const { return m_derivatives; } + inline DerType& derivatives() { return m_derivatives; } + + inline bool operator< (const Scalar& other) const { return m_value < other; } + inline bool operator<=(const Scalar& other) const { return m_value <= other; } + inline bool operator> (const Scalar& other) const { return m_value > other; } + inline bool operator>=(const Scalar& other) const { return m_value >= 
other; } + inline bool operator==(const Scalar& other) const { return m_value == other; } + inline bool operator!=(const Scalar& other) const { return m_value != other; } + + friend inline bool operator< (const Scalar& a, const AutoDiffScalar& b) { return a < b.value(); } + friend inline bool operator<=(const Scalar& a, const AutoDiffScalar& b) { return a <= b.value(); } + friend inline bool operator> (const Scalar& a, const AutoDiffScalar& b) { return a > b.value(); } + friend inline bool operator>=(const Scalar& a, const AutoDiffScalar& b) { return a >= b.value(); } + friend inline bool operator==(const Scalar& a, const AutoDiffScalar& b) { return a == b.value(); } + friend inline bool operator!=(const Scalar& a, const AutoDiffScalar& b) { return a != b.value(); } + + template inline bool operator< (const AutoDiffScalar& b) const { return m_value < b.value(); } + template inline bool operator<=(const AutoDiffScalar& b) const { return m_value <= b.value(); } + template inline bool operator> (const AutoDiffScalar& b) const { return m_value > b.value(); } + template inline bool operator>=(const AutoDiffScalar& b) const { return m_value >= b.value(); } + template inline bool operator==(const AutoDiffScalar& b) const { return m_value == b.value(); } + template inline bool operator!=(const AutoDiffScalar& b) const { return m_value != b.value(); } + + inline const AutoDiffScalar operator+(const Scalar& other) const + { + return AutoDiffScalar(m_value + other, m_derivatives); + } + + friend inline const AutoDiffScalar operator+(const Scalar& a, const AutoDiffScalar& b) + { + return AutoDiffScalar(a + b.value(), b.derivatives()); + } + +// inline const AutoDiffScalar operator+(const Real& other) const +// { +// return AutoDiffScalar(m_value + other, m_derivatives); +// } + +// friend inline const AutoDiffScalar operator+(const Real& a, const AutoDiffScalar& b) +// { +// return AutoDiffScalar(a + b.value(), b.derivatives()); +// } + + inline AutoDiffScalar& 
operator+=(const Scalar& other) + { + value() += other; + return *this; + } + + template + inline const AutoDiffScalar,const DerType,const typename internal::remove_all::type> > + operator+(const AutoDiffScalar& other) const + { + internal::make_coherent(m_derivatives, other.derivatives()); + return AutoDiffScalar,const DerType,const typename internal::remove_all::type> >( + m_value + other.value(), + m_derivatives + other.derivatives()); + } + + template + inline AutoDiffScalar& + operator+=(const AutoDiffScalar& other) + { + (*this) = (*this) + other; + return *this; + } + + inline const AutoDiffScalar operator-(const Scalar& b) const + { + return AutoDiffScalar(m_value - b, m_derivatives); + } + + friend inline const AutoDiffScalar, const DerType> > + operator-(const Scalar& a, const AutoDiffScalar& b) + { + return AutoDiffScalar, const DerType> > + (a - b.value(), -b.derivatives()); + } + + inline AutoDiffScalar& operator-=(const Scalar& other) + { + value() -= other; + return *this; + } + + template + inline const AutoDiffScalar, const DerType,const typename internal::remove_all::type> > + operator-(const AutoDiffScalar& other) const + { + internal::make_coherent(m_derivatives, other.derivatives()); + return AutoDiffScalar, const DerType,const typename internal::remove_all::type> >( + m_value - other.value(), + m_derivatives - other.derivatives()); + } + + template + inline AutoDiffScalar& + operator-=(const AutoDiffScalar& other) + { + *this = *this - other; + return *this; + } + + inline const AutoDiffScalar, const DerType> > + operator-() const + { + return AutoDiffScalar, const DerType> >( + -m_value, + -m_derivatives); + } + + inline const AutoDiffScalar + operator*(const Scalar& other) const + { + return MakeAutoDiffScalar(m_value * other, m_derivatives * other); + } + + friend inline const AutoDiffScalar + operator*(const Scalar& other, const AutoDiffScalar& a) + { + return MakeAutoDiffScalar(a.value() * other, a.derivatives() * other); + } + +// inline 
const AutoDiffScalar, DerType>::Type > +// operator*(const Real& other) const +// { +// return AutoDiffScalar, DerType>::Type >( +// m_value * other, +// (m_derivatives * other)); +// } +// +// friend inline const AutoDiffScalar, DerType>::Type > +// operator*(const Real& other, const AutoDiffScalar& a) +// { +// return AutoDiffScalar, DerType>::Type >( +// a.value() * other, +// a.derivatives() * other); +// } + + inline const AutoDiffScalar + operator/(const Scalar& other) const + { + return MakeAutoDiffScalar(m_value / other, (m_derivatives * (Scalar(1)/other))); + } + + friend inline const AutoDiffScalar + operator/(const Scalar& other, const AutoDiffScalar& a) + { + return MakeAutoDiffScalar(other / a.value(), a.derivatives() * (Scalar(-other) / (a.value()*a.value()))); + } + +// inline const AutoDiffScalar, DerType>::Type > +// operator/(const Real& other) const +// { +// return AutoDiffScalar, DerType>::Type >( +// m_value / other, +// (m_derivatives * (Real(1)/other))); +// } +// +// friend inline const AutoDiffScalar, DerType>::Type > +// operator/(const Real& other, const AutoDiffScalar& a) +// { +// return AutoDiffScalar, DerType>::Type >( +// other / a.value(), +// a.derivatives() * (-Real(1)/other)); +// } + + template + inline const AutoDiffScalar EIGEN_COMMA + const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) EIGEN_COMMA + const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename internal::remove_all::type,Scalar,product) >,Scalar,product) > + operator/(const AutoDiffScalar& other) const + { + internal::make_coherent(m_derivatives, other.derivatives()); + return MakeAutoDiffScalar( + m_value / other.value(), + ((m_derivatives * other.value()) - (other.derivatives() * m_value)) + * (Scalar(1)/(other.value()*other.value()))); + } + + template + inline const AutoDiffScalar, + const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product), + const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename 
internal::remove_all::type,Scalar,product) > > + operator*(const AutoDiffScalar& other) const + { + internal::make_coherent(m_derivatives, other.derivatives()); + return MakeAutoDiffScalar( + m_value * other.value(), + (m_derivatives * other.value()) + (other.derivatives() * m_value)); + } + + inline AutoDiffScalar& operator*=(const Scalar& other) + { + *this = *this * other; + return *this; + } + + template + inline AutoDiffScalar& operator*=(const AutoDiffScalar& other) + { + *this = *this * other; + return *this; + } + + inline AutoDiffScalar& operator/=(const Scalar& other) + { + *this = *this / other; + return *this; + } + + template + inline AutoDiffScalar& operator/=(const AutoDiffScalar& other) + { + *this = *this / other; + return *this; + } + + protected: + Scalar m_value; + DerType m_derivatives; + +}; + +namespace internal { + +template +struct auto_diff_special_op +// : auto_diff_scalar_op::Real, +// is_same::Real>::value> +{ + typedef typename remove_all::type DerType; + typedef typename traits::Scalar Scalar; + typedef typename NumTraits::Real Real; + +// typedef auto_diff_scalar_op::Real, +// is_same::Real>::value> Base; + +// using Base::operator+; +// using Base::operator+=; +// using Base::operator-; +// using Base::operator-=; +// using Base::operator*; +// using Base::operator*=; + + const AutoDiffScalar& derived() const { return *static_cast*>(this); } + AutoDiffScalar& derived() { return *static_cast*>(this); } + + + inline const AutoDiffScalar operator+(const Real& other) const + { + return AutoDiffScalar(derived().value() + other, derived().derivatives()); + } + + friend inline const AutoDiffScalar operator+(const Real& a, const AutoDiffScalar& b) + { + return AutoDiffScalar(a + b.value(), b.derivatives()); + } + + inline AutoDiffScalar& operator+=(const Real& other) + { + derived().value() += other; + return derived(); + } + + + inline const AutoDiffScalar >, DerType>::Type > + operator*(const Real& other) const + { + return AutoDiffScalar 
>, DerType>::Type >( + derived().value() * other, + derived().derivatives() * other); + } + + friend inline const AutoDiffScalar >, DerType>::Type > + operator*(const Real& other, const AutoDiffScalar& a) + { + return AutoDiffScalar >, DerType>::Type >( + a.value() * other, + a.derivatives() * other); + } + + inline AutoDiffScalar& operator*=(const Scalar& other) + { + *this = *this * other; + return derived(); + } +}; + +template +struct auto_diff_special_op +{ + void operator*() const; + void operator-() const; + void operator+() const; +}; + +template +void make_coherent_expression(CwiseBinaryOp xpr, const RefType &ref) +{ + make_coherent(xpr.const_cast_derived().lhs(), ref); + make_coherent(xpr.const_cast_derived().rhs(), ref); +} + +template +void make_coherent_expression(const CwiseUnaryOp &xpr, const RefType &ref) +{ + make_coherent(xpr.nestedExpression().const_cast_derived(), ref); +} + +// needed for compilation only +template +void make_coherent_expression(const CwiseNullaryOp &, const RefType &) +{} + +template +struct make_coherent_impl, B> { + typedef Matrix A; + static void run(A& a, B& b) { + if((A_Rows==Dynamic || A_Cols==Dynamic) && (a.size()==0)) + { + a.resize(b.size()); + a.setZero(); + } + else if (B::SizeAtCompileTime==Dynamic && a.size()!=0 && b.size()==0) + { + make_coherent_expression(b,a); + } + } +}; + +template +struct make_coherent_impl > { + typedef Matrix B; + static void run(A& a, B& b) { + if((B_Rows==Dynamic || B_Cols==Dynamic) && (b.size()==0)) + { + b.resize(a.size()); + b.setZero(); + } + else if (A::SizeAtCompileTime==Dynamic && b.size()!=0 && a.size()==0) + { + make_coherent_expression(a,b); + } + } +}; + +template +struct make_coherent_impl, + Matrix > { + typedef Matrix A; + typedef Matrix B; + static void run(A& a, B& b) { + if((A_Rows==Dynamic || A_Cols==Dynamic) && (a.size()==0)) + { + a.resize(b.size()); + a.setZero(); + } + else if((B_Rows==Dynamic || B_Cols==Dynamic) && (b.size()==0)) + { + b.resize(a.size()); + 
b.setZero(); + } + } +}; + +} // end namespace internal + +template +struct ScalarBinaryOpTraits,typename DerType::Scalar,BinOp> +{ + typedef AutoDiffScalar ReturnType; +}; + +template +struct ScalarBinaryOpTraits, BinOp> +{ + typedef AutoDiffScalar ReturnType; +}; + + +// The following is an attempt to let Eigen's known about expression template, but that's more tricky! + +// template +// struct ScalarBinaryOpTraits,AutoDiffScalar, BinOp> +// { +// enum { Defined = 1 }; +// typedef AutoDiffScalar ReturnType; +// }; +// +// template +// struct ScalarBinaryOpTraits,AutoDiffScalar, BinOp> +// { +// enum { Defined = 1 };//internal::is_same::value }; +// typedef AutoDiffScalar ReturnType; +// }; + +#define EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(FUNC,CODE) \ + template \ + inline const Eigen::AutoDiffScalar< \ + EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename Eigen::internal::remove_all::type, typename Eigen::internal::traits::type>::Scalar, product) > \ + FUNC(const Eigen::AutoDiffScalar& x) { \ + using namespace Eigen; \ + typedef typename Eigen::internal::traits::type>::Scalar Scalar; \ + EIGEN_UNUSED_VARIABLE(sizeof(Scalar)); \ + CODE; \ + } + +template +struct CleanedUpDerType { + typedef AutoDiffScalar::type::PlainObject> type; +}; + +template +inline const AutoDiffScalar& conj(const AutoDiffScalar& x) { return x; } +template +inline const AutoDiffScalar& real(const AutoDiffScalar& x) { return x; } +template +inline typename DerType::Scalar imag(const AutoDiffScalar&) { return 0.; } +template +inline typename CleanedUpDerType::type (min)(const AutoDiffScalar& x, const T& y) { + typedef typename CleanedUpDerType::type ADS; + return (x <= y ? ADS(x) : ADS(y)); +} +template +inline typename CleanedUpDerType::type (max)(const AutoDiffScalar& x, const T& y) { + typedef typename CleanedUpDerType::type ADS; + return (x >= y ? 
ADS(x) : ADS(y)); +} +template +inline typename CleanedUpDerType::type (min)(const T& x, const AutoDiffScalar& y) { + typedef typename CleanedUpDerType::type ADS; + return (x < y ? ADS(x) : ADS(y)); +} +template +inline typename CleanedUpDerType::type (max)(const T& x, const AutoDiffScalar& y) { + typedef typename CleanedUpDerType::type ADS; + return (x > y ? ADS(x) : ADS(y)); +} +template +inline typename CleanedUpDerType::type (min)(const AutoDiffScalar& x, const AutoDiffScalar& y) { + return (x.value() < y.value() ? x : y); +} +template +inline typename CleanedUpDerType::type (max)(const AutoDiffScalar& x, const AutoDiffScalar& y) { + return (x.value() >= y.value() ? x : y); +} + + +EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(abs, + using std::abs; + return Eigen::MakeAutoDiffScalar(abs(x.value()), x.derivatives() * (x.value()<0 ? -1 : 1) );) + +EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(abs2, + using numext::abs2; + return Eigen::MakeAutoDiffScalar(abs2(x.value()), x.derivatives() * (Scalar(2)*x.value()));) + +EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(sqrt, + using std::sqrt; + Scalar sqrtx = sqrt(x.value()); + return Eigen::MakeAutoDiffScalar(sqrtx,x.derivatives() * (Scalar(0.5) / sqrtx));) + +EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(cos, + using std::cos; + using std::sin; + return Eigen::MakeAutoDiffScalar(cos(x.value()), x.derivatives() * (-sin(x.value())));) + +EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(sin, + using std::sin; + using std::cos; + return Eigen::MakeAutoDiffScalar(sin(x.value()),x.derivatives() * cos(x.value()));) + +EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(exp, + using std::exp; + Scalar expx = exp(x.value()); + return Eigen::MakeAutoDiffScalar(expx,x.derivatives() * expx);) + +EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(log, + using std::log; + return Eigen::MakeAutoDiffScalar(log(x.value()),x.derivatives() * (Scalar(1)/x.value()));) + +template +inline const Eigen::AutoDiffScalar< +EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename internal::remove_all::type,typename 
internal::traits::type>::Scalar,product) > +pow(const Eigen::AutoDiffScalar &x, const typename internal::traits::type>::Scalar &y) +{ + using namespace Eigen; + using std::pow; + return Eigen::MakeAutoDiffScalar(pow(x.value(),y), x.derivatives() * (y * pow(x.value(),y-1))); +} + + +template +inline const AutoDiffScalar::type>::Scalar,Dynamic,1> > +atan2(const AutoDiffScalar& a, const AutoDiffScalar& b) +{ + using std::atan2; + typedef typename internal::traits::type>::Scalar Scalar; + typedef AutoDiffScalar > PlainADS; + PlainADS ret; + ret.value() = atan2(a.value(), b.value()); + + Scalar squared_hypot = a.value() * a.value() + b.value() * b.value(); + + // if (squared_hypot==0) the derivation is undefined and the following results in a NaN: + ret.derivatives() = (a.derivatives() * b.value() - a.value() * b.derivatives()) / squared_hypot; + + return ret; +} + +EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(tan, + using std::tan; + using std::cos; + return Eigen::MakeAutoDiffScalar(tan(x.value()),x.derivatives() * (Scalar(1)/numext::abs2(cos(x.value()))));) + +EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(asin, + using std::sqrt; + using std::asin; + return Eigen::MakeAutoDiffScalar(asin(x.value()),x.derivatives() * (Scalar(1)/sqrt(1-numext::abs2(x.value()))));) + +EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(acos, + using std::sqrt; + using std::acos; + return Eigen::MakeAutoDiffScalar(acos(x.value()),x.derivatives() * (Scalar(-1)/sqrt(1-numext::abs2(x.value()))));) + +EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(tanh, + using std::cosh; + using std::tanh; + return Eigen::MakeAutoDiffScalar(tanh(x.value()),x.derivatives() * (Scalar(1)/numext::abs2(cosh(x.value()))));) + +EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(sinh, + using std::sinh; + using std::cosh; + return Eigen::MakeAutoDiffScalar(sinh(x.value()),x.derivatives() * cosh(x.value()));) + +EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(cosh, + using std::sinh; + using std::cosh; + return Eigen::MakeAutoDiffScalar(cosh(x.value()),x.derivatives() * 
sinh(x.value()));) + +#undef EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY + +template struct NumTraits > + : NumTraits< typename NumTraits::type::Scalar>::Real > +{ + typedef typename internal::remove_all::type DerTypeCleaned; + typedef AutoDiffScalar::Real,DerTypeCleaned::RowsAtCompileTime,DerTypeCleaned::ColsAtCompileTime, + 0, DerTypeCleaned::MaxRowsAtCompileTime, DerTypeCleaned::MaxColsAtCompileTime> > Real; + typedef AutoDiffScalar NonInteger; + typedef AutoDiffScalar Nested; + typedef typename NumTraits::Literal Literal; + enum{ + RequireInitialization = 1 + }; +}; + +} + +namespace std { + +template +class numeric_limits > + : public numeric_limits {}; + +template +class numeric_limits > + : public numeric_limits {}; + +} // namespace std + +#endif // EIGEN_AUTODIFF_SCALAR_H diff --git a/src/EigenUnsupported/src/AutoDiff/AutoDiffVector.h b/src/EigenUnsupported/src/AutoDiff/AutoDiffVector.h new file mode 100644 index 0000000..8c2d048 --- /dev/null +++ b/src/EigenUnsupported/src/AutoDiff/AutoDiffVector.h @@ -0,0 +1,220 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_AUTODIFF_VECTOR_H +#define EIGEN_AUTODIFF_VECTOR_H + +namespace Eigen { + +/* \class AutoDiffScalar + * \brief A scalar type replacement with automatic differentation capability + * + * \param DerType the vector type used to store/represent the derivatives (e.g. Vector3f) + * + * This class represents a scalar value while tracking its respective derivatives. 
+ * + * It supports the following list of global math function: + * - std::abs, std::sqrt, std::pow, std::exp, std::log, std::sin, std::cos, + * - internal::abs, internal::sqrt, numext::pow, internal::exp, internal::log, internal::sin, internal::cos, + * - internal::conj, internal::real, internal::imag, numext::abs2. + * + * AutoDiffScalar can be used as the scalar type of an Eigen::Matrix object. However, + * in that case, the expression template mechanism only occurs at the top Matrix level, + * while derivatives are computed right away. + * + */ +template +class AutoDiffVector +{ + public: + //typedef typename internal::traits::Scalar Scalar; + typedef typename internal::traits::Scalar BaseScalar; + typedef AutoDiffScalar > ActiveScalar; + typedef ActiveScalar Scalar; + typedef AutoDiffScalar CoeffType; + typedef typename JacobianType::Index Index; + + inline AutoDiffVector() {} + + inline AutoDiffVector(const ValueType& values) + : m_values(values) + { + m_jacobian.setZero(); + } + + + CoeffType operator[] (Index i) { return CoeffType(m_values[i], m_jacobian.col(i)); } + const CoeffType operator[] (Index i) const { return CoeffType(m_values[i], m_jacobian.col(i)); } + + CoeffType operator() (Index i) { return CoeffType(m_values[i], m_jacobian.col(i)); } + const CoeffType operator() (Index i) const { return CoeffType(m_values[i], m_jacobian.col(i)); } + + CoeffType coeffRef(Index i) { return CoeffType(m_values[i], m_jacobian.col(i)); } + const CoeffType coeffRef(Index i) const { return CoeffType(m_values[i], m_jacobian.col(i)); } + + Index size() const { return m_values.size(); } + + // FIXME here we could return an expression of the sum + Scalar sum() const { /*std::cerr << "sum \n\n";*/ /*std::cerr << m_jacobian.rowwise().sum() << "\n\n";*/ return Scalar(m_values.sum(), m_jacobian.rowwise().sum()); } + + + inline AutoDiffVector(const ValueType& values, const JacobianType& jac) + : m_values(values), m_jacobian(jac) + {} + + template + inline 
AutoDiffVector(const AutoDiffVector& other) + : m_values(other.values()), m_jacobian(other.jacobian()) + {} + + inline AutoDiffVector(const AutoDiffVector& other) + : m_values(other.values()), m_jacobian(other.jacobian()) + {} + + template + inline AutoDiffVector& operator=(const AutoDiffVector& other) + { + m_values = other.values(); + m_jacobian = other.jacobian(); + return *this; + } + + inline AutoDiffVector& operator=(const AutoDiffVector& other) + { + m_values = other.values(); + m_jacobian = other.jacobian(); + return *this; + } + + inline const ValueType& values() const { return m_values; } + inline ValueType& values() { return m_values; } + + inline const JacobianType& jacobian() const { return m_jacobian; } + inline JacobianType& jacobian() { return m_jacobian; } + + template + inline const AutoDiffVector< + typename MakeCwiseBinaryOp,ValueType,OtherValueType>::Type, + typename MakeCwiseBinaryOp,JacobianType,OtherJacobianType>::Type > + operator+(const AutoDiffVector& other) const + { + return AutoDiffVector< + typename MakeCwiseBinaryOp,ValueType,OtherValueType>::Type, + typename MakeCwiseBinaryOp,JacobianType,OtherJacobianType>::Type >( + m_values + other.values(), + m_jacobian + other.jacobian()); + } + + template + inline AutoDiffVector& + operator+=(const AutoDiffVector& other) + { + m_values += other.values(); + m_jacobian += other.jacobian(); + return *this; + } + + template + inline const AutoDiffVector< + typename MakeCwiseBinaryOp,ValueType,OtherValueType>::Type, + typename MakeCwiseBinaryOp,JacobianType,OtherJacobianType>::Type > + operator-(const AutoDiffVector& other) const + { + return AutoDiffVector< + typename MakeCwiseBinaryOp,ValueType,OtherValueType>::Type, + typename MakeCwiseBinaryOp,JacobianType,OtherJacobianType>::Type >( + m_values - other.values(), + m_jacobian - other.jacobian()); + } + + template + inline AutoDiffVector& + operator-=(const AutoDiffVector& other) + { + m_values -= other.values(); + m_jacobian -= other.jacobian(); 
+ return *this; + } + + inline const AutoDiffVector< + typename MakeCwiseUnaryOp, ValueType>::Type, + typename MakeCwiseUnaryOp, JacobianType>::Type > + operator-() const + { + return AutoDiffVector< + typename MakeCwiseUnaryOp, ValueType>::Type, + typename MakeCwiseUnaryOp, JacobianType>::Type >( + -m_values, + -m_jacobian); + } + + inline const AutoDiffVector< + typename MakeCwiseUnaryOp, ValueType>::Type, + typename MakeCwiseUnaryOp, JacobianType>::Type> + operator*(const BaseScalar& other) const + { + return AutoDiffVector< + typename MakeCwiseUnaryOp, ValueType>::Type, + typename MakeCwiseUnaryOp, JacobianType>::Type >( + m_values * other, + m_jacobian * other); + } + + friend inline const AutoDiffVector< + typename MakeCwiseUnaryOp, ValueType>::Type, + typename MakeCwiseUnaryOp, JacobianType>::Type > + operator*(const Scalar& other, const AutoDiffVector& v) + { + return AutoDiffVector< + typename MakeCwiseUnaryOp, ValueType>::Type, + typename MakeCwiseUnaryOp, JacobianType>::Type >( + v.values() * other, + v.jacobian() * other); + } + +// template +// inline const AutoDiffVector< +// CwiseBinaryOp, ValueType, OtherValueType> +// CwiseBinaryOp, +// CwiseUnaryOp, JacobianType>, +// CwiseUnaryOp, OtherJacobianType> > > +// operator*(const AutoDiffVector& other) const +// { +// return AutoDiffVector< +// CwiseBinaryOp, ValueType, OtherValueType> +// CwiseBinaryOp, +// CwiseUnaryOp, JacobianType>, +// CwiseUnaryOp, OtherJacobianType> > >( +// m_values.cwise() * other.values(), +// (m_jacobian * other.values()) + (m_values * other.jacobian())); +// } + + inline AutoDiffVector& operator*=(const Scalar& other) + { + m_values *= other; + m_jacobian *= other; + return *this; + } + + template + inline AutoDiffVector& operator*=(const AutoDiffVector& other) + { + *this = *this * other; + return *this; + } + + protected: + ValueType m_values; + JacobianType m_jacobian; + +}; + +} + +#endif // EIGEN_AUTODIFF_VECTOR_H diff --git a/src/EigenUnsupported/src/BVH/BVAlgorithms.h 
b/src/EigenUnsupported/src/BVH/BVAlgorithms.h new file mode 100644 index 0000000..994c8af --- /dev/null +++ b/src/EigenUnsupported/src/BVH/BVAlgorithms.h @@ -0,0 +1,293 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009 Ilya Baran +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_BVALGORITHMS_H +#define EIGEN_BVALGORITHMS_H + +namespace Eigen { + +namespace internal { + +#ifndef EIGEN_PARSED_BY_DOXYGEN +template +bool intersect_helper(const BVH &tree, Intersector &intersector, typename BVH::Index root) +{ + typedef typename BVH::Index Index; + typedef typename BVH::VolumeIterator VolIter; + typedef typename BVH::ObjectIterator ObjIter; + + VolIter vBegin = VolIter(), vEnd = VolIter(); + ObjIter oBegin = ObjIter(), oEnd = ObjIter(); + + std::vector todo(1, root); + + while(!todo.empty()) { + tree.getChildren(todo.back(), vBegin, vEnd, oBegin, oEnd); + todo.pop_back(); + + for(; vBegin != vEnd; ++vBegin) //go through child volumes + if(intersector.intersectVolume(tree.getVolume(*vBegin))) + todo.push_back(*vBegin); + + for(; oBegin != oEnd; ++oBegin) //go through child objects + if(intersector.intersectObject(*oBegin)) + return true; //intersector said to stop query + } + return false; +} +#endif //not EIGEN_PARSED_BY_DOXYGEN + +template +struct intersector_helper1 +{ + intersector_helper1(const Object2 &inStored, Intersector &in) : stored(inStored), intersector(in) {} + bool intersectVolume(const Volume1 &vol) { return intersector.intersectVolumeObject(vol, stored); } + bool intersectObject(const Object1 &obj) { return intersector.intersectObjectObject(obj, stored); } + Object2 stored; + Intersector &intersector; +private: + intersector_helper1& operator=(const intersector_helper1&); +}; + +template +struct 
intersector_helper2 +{ + intersector_helper2(const Object1 &inStored, Intersector &in) : stored(inStored), intersector(in) {} + bool intersectVolume(const Volume2 &vol) { return intersector.intersectObjectVolume(stored, vol); } + bool intersectObject(const Object2 &obj) { return intersector.intersectObjectObject(stored, obj); } + Object1 stored; + Intersector &intersector; +private: + intersector_helper2& operator=(const intersector_helper2&); +}; + +} // end namespace internal + +/** Given a BVH, runs the query encapsulated by \a intersector. + * The Intersector type must provide the following members: \code + bool intersectVolume(const BVH::Volume &volume) //returns true if volume intersects the query + bool intersectObject(const BVH::Object &object) //returns true if the search should terminate immediately + \endcode + */ +template +void BVIntersect(const BVH &tree, Intersector &intersector) +{ + internal::intersect_helper(tree, intersector, tree.getRootIndex()); +} + +/** Given two BVH's, runs the query on their Cartesian product encapsulated by \a intersector. 
+ * The Intersector type must provide the following members: \code + bool intersectVolumeVolume(const BVH1::Volume &v1, const BVH2::Volume &v2) //returns true if product of volumes intersects the query + bool intersectVolumeObject(const BVH1::Volume &v1, const BVH2::Object &o2) //returns true if the volume-object product intersects the query + bool intersectObjectVolume(const BVH1::Object &o1, const BVH2::Volume &v2) //returns true if the volume-object product intersects the query + bool intersectObjectObject(const BVH1::Object &o1, const BVH2::Object &o2) //returns true if the search should terminate immediately + \endcode + */ +template +void BVIntersect(const BVH1 &tree1, const BVH2 &tree2, Intersector &intersector) //TODO: tandem descent when it makes sense +{ + typedef typename BVH1::Index Index1; + typedef typename BVH2::Index Index2; + typedef internal::intersector_helper1 Helper1; + typedef internal::intersector_helper2 Helper2; + typedef typename BVH1::VolumeIterator VolIter1; + typedef typename BVH1::ObjectIterator ObjIter1; + typedef typename BVH2::VolumeIterator VolIter2; + typedef typename BVH2::ObjectIterator ObjIter2; + + VolIter1 vBegin1 = VolIter1(), vEnd1 = VolIter1(); + ObjIter1 oBegin1 = ObjIter1(), oEnd1 = ObjIter1(); + VolIter2 vBegin2 = VolIter2(), vEnd2 = VolIter2(), vCur2 = VolIter2(); + ObjIter2 oBegin2 = ObjIter2(), oEnd2 = ObjIter2(), oCur2 = ObjIter2(); + + std::vector > todo(1, std::make_pair(tree1.getRootIndex(), tree2.getRootIndex())); + + while(!todo.empty()) { + tree1.getChildren(todo.back().first, vBegin1, vEnd1, oBegin1, oEnd1); + tree2.getChildren(todo.back().second, vBegin2, vEnd2, oBegin2, oEnd2); + todo.pop_back(); + + for(; vBegin1 != vEnd1; ++vBegin1) { //go through child volumes of first tree + const typename BVH1::Volume &vol1 = tree1.getVolume(*vBegin1); + for(vCur2 = vBegin2; vCur2 != vEnd2; ++vCur2) { //go through child volumes of second tree + if(intersector.intersectVolumeVolume(vol1, tree2.getVolume(*vCur2))) + 
todo.push_back(std::make_pair(*vBegin1, *vCur2)); + } + + for(oCur2 = oBegin2; oCur2 != oEnd2; ++oCur2) {//go through child objects of second tree + Helper1 helper(*oCur2, intersector); + if(internal::intersect_helper(tree1, helper, *vBegin1)) + return; //intersector said to stop query + } + } + + for(; oBegin1 != oEnd1; ++oBegin1) { //go through child objects of first tree + for(vCur2 = vBegin2; vCur2 != vEnd2; ++vCur2) { //go through child volumes of second tree + Helper2 helper(*oBegin1, intersector); + if(internal::intersect_helper(tree2, helper, *vCur2)) + return; //intersector said to stop query + } + + for(oCur2 = oBegin2; oCur2 != oEnd2; ++oCur2) {//go through child objects of second tree + if(intersector.intersectObjectObject(*oBegin1, *oCur2)) + return; //intersector said to stop query + } + } + } +} + +namespace internal { + +#ifndef EIGEN_PARSED_BY_DOXYGEN +template +typename Minimizer::Scalar minimize_helper(const BVH &tree, Minimizer &minimizer, typename BVH::Index root, typename Minimizer::Scalar minimum) +{ + typedef typename Minimizer::Scalar Scalar; + typedef typename BVH::Index Index; + typedef std::pair QueueElement; //first element is priority + typedef typename BVH::VolumeIterator VolIter; + typedef typename BVH::ObjectIterator ObjIter; + + VolIter vBegin = VolIter(), vEnd = VolIter(); + ObjIter oBegin = ObjIter(), oEnd = ObjIter(); + std::priority_queue, std::greater > todo; //smallest is at the top + + todo.push(std::make_pair(Scalar(), root)); + + while(!todo.empty()) { + tree.getChildren(todo.top().second, vBegin, vEnd, oBegin, oEnd); + todo.pop(); + + for(; oBegin != oEnd; ++oBegin) //go through child objects + minimum = (std::min)(minimum, minimizer.minimumOnObject(*oBegin)); + + for(; vBegin != vEnd; ++vBegin) { //go through child volumes + Scalar val = minimizer.minimumOnVolume(tree.getVolume(*vBegin)); + if(val < minimum) + todo.push(std::make_pair(val, *vBegin)); + } + } + + return minimum; +} +#endif //not EIGEN_PARSED_BY_DOXYGEN + 
+
+template<typename Volume1, typename Object1, typename Object2, typename Minimizer>
+struct minimizer_helper1 //one-tree minimizer over the first tree: forwards to the two-tree Minimizer with a stored Object2 as second argument
+{
+  typedef typename Minimizer::Scalar Scalar;
+  minimizer_helper1(const Object2 &inStored, Minimizer &m) : stored(inStored), minimizer(m) {}
+  Scalar minimumOnVolume(const Volume1 &vol) { return minimizer.minimumOnVolumeObject(vol, stored); }
+  Scalar minimumOnObject(const Object1 &obj) { return minimizer.minimumOnObjectObject(obj, stored); }
+  Object2 stored; //copy of the second-tree object every query is paired with
+  Minimizer &minimizer; //the wrapped two-tree minimizer, held by reference
+private:
+  minimizer_helper1& operator=(const minimizer_helper1&); //private, declaration only: assignment is disabled
+};
+
+template<typename Volume2, typename Object2, typename Object1, typename Minimizer>
+struct minimizer_helper2 //one-tree minimizer over the second tree: forwards to the two-tree Minimizer with a stored Object1 as first argument
+{
+  typedef typename Minimizer::Scalar Scalar;
+  minimizer_helper2(const Object1 &inStored, Minimizer &m) : stored(inStored), minimizer(m) {}
+  Scalar minimumOnVolume(const Volume2 &vol) { return minimizer.minimumOnObjectVolume(stored, vol); }
+  Scalar minimumOnObject(const Object2 &obj) { return minimizer.minimumOnObjectObject(stored, obj); }
+  Object1 stored; //copy of the first-tree object every query is paired with
+  Minimizer &minimizer; //the wrapped two-tree minimizer, held by reference
+private:
+  minimizer_helper2& operator=(const minimizer_helper2&); //private, declaration only: assignment is disabled
+};
+
+} // end namespace internal
+
+/** Given a BVH, runs the query encapsulated by \a minimizer.
+  * \returns the minimum value, or the largest finite Minimizer::Scalar if no object lowers it.
+  * The Minimizer type must provide the following members: \code
+     typedef Scalar //the numeric type of what is being minimized--not necessarily the Scalar type of the BVH (if it has one)
+     Scalar minimumOnVolume(const BVH::Volume &volume)
+     Scalar minimumOnObject(const BVH::Object &object)
+  \endcode
+  */
+template<typename BVH, typename Minimizer>
+typename Minimizer::Scalar BVMinimize(const BVH &tree, Minimizer &minimizer)
+{
+  return internal::minimize_helper(tree, minimizer, tree.getRootIndex(), (std::numeric_limits<typename Minimizer::Scalar>::max)());
+}
+
+/** Given two BVHs, runs the query on their Cartesian product encapsulated by \a minimizer.
+  * \returns the minimum value, or the largest finite Minimizer::Scalar if no object pair lowers it.
+  * The Minimizer type must provide the following members: \code
+     typedef Scalar //the numeric type of what is being minimized--not necessarily the Scalar type of the BVH (if it has one)
+     Scalar minimumOnVolumeVolume(const BVH1::Volume &v1, const BVH2::Volume &v2)
+     Scalar minimumOnVolumeObject(const BVH1::Volume &v1, const BVH2::Object &o2)
+     Scalar minimumOnObjectVolume(const BVH1::Object &o1, const BVH2::Volume &v2)
+     Scalar minimumOnObjectObject(const BVH1::Object &o1, const BVH2::Object &o2)
+  \endcode
+  */
+template<typename BVH1, typename BVH2, typename Minimizer>
+typename Minimizer::Scalar BVMinimize(const BVH1 &tree1, const BVH2 &tree2, Minimizer &minimizer)
+{
+  typedef typename Minimizer::Scalar Scalar;
+  typedef typename BVH1::Index Index1;
+  typedef typename BVH2::Index Index2;
+  typedef internal::minimizer_helper1<typename BVH1::Volume, typename BVH1::Object, typename BVH2::Object, Minimizer> Helper1;
+  typedef internal::minimizer_helper2<typename BVH2::Volume, typename BVH2::Object, typename BVH1::Object, Minimizer> Helper2;
+  typedef std::pair<Scalar, std::pair<Index1, Index2> > QueueElement; //first element is priority
+  typedef typename BVH1::VolumeIterator VolIter1;
+  typedef typename BVH1::ObjectIterator ObjIter1;
+  typedef typename BVH2::VolumeIterator VolIter2;
+  typedef typename BVH2::ObjectIterator ObjIter2;
+
+  VolIter1 vBegin1 = VolIter1(), vEnd1 = VolIter1();
+  ObjIter1 oBegin1 = ObjIter1(), oEnd1 = ObjIter1();
+  VolIter2 vBegin2 = VolIter2(), vEnd2 = VolIter2(), vCur2 = VolIter2();
+  ObjIter2 oBegin2 = ObjIter2(), oEnd2 = ObjIter2(), oCur2 = ObjIter2();
+  std::priority_queue<QueueElement, std::vector<QueueElement>, std::greater<QueueElement> > todo; //smallest is at the top
+
+  Scalar minimum = (std::numeric_limits<Scalar>::max)();
+  todo.push(std::make_pair(Scalar(), std::make_pair(tree1.getRootIndex(), tree2.getRootIndex())));
+
+  while(!todo.empty()) {
+    tree1.getChildren(todo.top().second.first, vBegin1, vEnd1, oBegin1, oEnd1);
+    tree2.getChildren(todo.top().second.second, vBegin2, vEnd2, oBegin2, oEnd2);
+    todo.pop();
+
+    for(; oBegin1 != oEnd1; ++oBegin1) { //go through child objects of first tree
+      for(oCur2 = oBegin2; oCur2 != oEnd2; ++oCur2) {//go through child objects of second tree
+        minimum = (std::min)(minimum, minimizer.minimumOnObjectObject(*oBegin1, *oCur2));
+      }
+
+      for(vCur2 = vBegin2; vCur2 != vEnd2; ++vCur2) { //go through child volumes of second tree
+        Helper2 helper(*oBegin1, minimizer);
+        minimum = (std::min)(minimum, internal::minimize_helper(tree2, helper, *vCur2, minimum)); //one-tree search of the subtree, seeded with the current minimum
+      }
+    }
+
+    for(; vBegin1 != vEnd1; ++vBegin1) { //go through child volumes of first tree
+      const typename BVH1::Volume &vol1 = tree1.getVolume(*vBegin1);
+
+      for(oCur2 = oBegin2; oCur2 != oEnd2; ++oCur2) {//go through child objects of second tree
+        Helper1 helper(*oCur2, minimizer);
+        minimum = (std::min)(minimum, internal::minimize_helper(tree1, helper, *vBegin1, minimum)); //one-tree search of the subtree, seeded with the current minimum
+      }
+
+      for(vCur2 = vBegin2; vCur2 != vEnd2; ++vCur2) { //go through child volumes of second tree
+        Scalar val = minimizer.minimumOnVolumeVolume(vol1, tree2.getVolume(*vCur2));
+        if(val < minimum) //only queue volume pairs whose value is below the current minimum
+          todo.push(std::make_pair(val, std::make_pair(*vBegin1, *vCur2)));
+      }
+    }
+  }
+  return minimum;
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_BVALGORITHMS_H
diff --git a/src/EigenUnsupported/src/BVH/KdBVH.h b/src/EigenUnsupported/src/BVH/KdBVH.h
new file mode 100644
index 0000000..2d5b76a
--- /dev/null
+++ b/src/EigenUnsupported/src/BVH/KdBVH.h
@@ -0,0 +1,223 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Ilya Baran
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef KDBVH_H_INCLUDED
+#define KDBVH_H_INCLUDED
+
+namespace Eigen {
+
+namespace internal {
+
+//internal pair class for the BVH--used instead of std::pair because of alignment
+template<typename Scalar, int Dim>
+struct vector_int_pair
+{
+EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar, Dim)
+  typedef Matrix<Scalar, Dim, 1> VectorType;
+
+  vector_int_pair(const VectorType &v, int i) : first(v), second(i) {}
+
+  VectorType first; //KdBVH::init stores the center of an object's bounding box here
+  int second; //KdBVH::init stores the index of that object here
+};
+
+//these templates help the tree initializer get the bounding boxes either from a provided
+//iterator range or using bounding_box in a unified way
+template<typename ObjectList, typename VolumeList, typename BoxIter>
+struct get_boxes_helper {
+  void operator()(const ObjectList &objects, BoxIter boxBegin, BoxIter boxEnd, VolumeList &outBoxes)
+  {
+    outBoxes.insert(outBoxes.end(), boxBegin, boxEnd);
+    eigen_assert(outBoxes.size() == objects.size()); //one box per object is required
+    EIGEN_ONLY_USED_FOR_DEBUG(objects);
+  }
+};
+
+template<typename ObjectList, typename VolumeList>
+struct get_boxes_helper<ObjectList, VolumeList, int> { //selected when the box "iterators" are the int placeholders passed by the two-argument init
+  void operator()(const ObjectList &objects, int, int, VolumeList &outBoxes)
+  {
+    outBoxes.reserve(objects.size());
+    for(int i = 0; i < (int)objects.size(); ++i)
+      outBoxes.push_back(bounding_box(objects[i]));
+  }
+};
+
+} // end namespace internal
+
+
+/** \class KdBVH
+ *  \brief A simple bounding volume hierarchy based on AlignedBox
+ *
+ *  \param _Scalar The underlying scalar type of the bounding boxes
+ *  \param _Dim The dimension of the space in which the hierarchy lives
+ *  \param _Object The object type that lives in the hierarchy.  It must have value semantics.  Either bounding_box(_Object) must
+ *                 be defined and return an AlignedBox<_Scalar, _Dim> or bounding boxes must be provided to the tree initializer.
+ *
+ *  This class provides a simple (as opposed to optimized) implementation of a bounding volume hierarchy analogous to a Kd-tree.
+ *  Given a sequence of objects, it computes their bounding boxes, constructs a Kd-tree of their centers
+ *  and builds a BVH with the structure of that Kd-tree.  When the elements of the tree are too expensive to be copied around,
+ *  it is useful for _Object to be a pointer.
+ */
+template<typename _Scalar, int _Dim, typename _Object> class KdBVH
+{
+public:
+  enum { Dim = _Dim };
+  typedef _Object Object;
+  typedef std::vector<Object, aligned_allocator<Object> > ObjectList;
+  typedef _Scalar Scalar;
+  typedef AlignedBox<Scalar, Dim> Volume;
+  typedef std::vector<Volume, aligned_allocator<Volume> > VolumeList;
+  typedef int Index;
+  typedef const int *VolumeIterator; //the iterators are just pointers into the tree's vectors
+  typedef const Object *ObjectIterator;
+
+  KdBVH() {}
+
+  /** Given an iterator range over \a Object references, constructs the BVH.  Requires that bounding_box(Object) return a Volume. */
+  template<typename Iter> KdBVH(Iter begin, Iter end) { init(begin, end, 0, 0); } //int is recognized by init as not being an iterator type
+
+  /** Given an iterator range over \a Object references and an iterator range over their bounding boxes, constructs the BVH */
+  template<typename OIter, typename BIter> KdBVH(OIter begin, OIter end, BIter boxBegin, BIter boxEnd) { init(begin, end, boxBegin, boxEnd); }
+
+  /** Given an iterator range over \a Object references, constructs the BVH, overwriting whatever is in there currently.
+    * Requires that bounding_box(Object) return a Volume. */
+  template<typename Iter> void init(Iter begin, Iter end) { init(begin, end, 0, 0); }
+
+  /** Given an iterator range over \a Object references and an iterator range over their bounding boxes,
+    * constructs the BVH, overwriting whatever is in there currently. */
+  template<typename OIter, typename BIter> void init(OIter begin, OIter end, BIter boxBegin, BIter boxEnd)
+  {
+    objects.clear();
+    boxes.clear();
+    children.clear();
+
+    objects.insert(objects.end(), begin, end);
+    int n = static_cast<int>(objects.size());
+
+    if(n < 2)
+      return; //if we have at most one object, we don't need any internal nodes
+
+    VolumeList objBoxes;
+    VIPairList objCenters;
+
+    //compute the bounding boxes depending on BIter type
+    internal::get_boxes_helper<ObjectList, VolumeList, BIter>()(objects, boxBegin, boxEnd, objBoxes);
+
+    objCenters.reserve(n);
+    boxes.reserve(n - 1);
+    children.reserve(2 * n - 2);
+
+    for(int i = 0; i < n; ++i)
+      objCenters.push_back(VIPair(objBoxes[i].center(), i));
+
+    build(objCenters, 0, n, objBoxes, 0); //the recursive part of the algorithm
+
+    ObjectList tmp(n);
+    tmp.swap(objects);
+    for(int i = 0; i < n; ++i)
+      objects[i] = tmp[objCenters[i].second]; //reorder the objects to follow the order build left objCenters in
+  }
+
+  /** \returns the index of the root of the hierarchy; this is -1 when there are no internal nodes (fewer than two objects) */
+  inline Index getRootIndex() const { return (int)boxes.size() - 1; }
+
+  /** Given an \a index of a node, on exit, \a outVBegin and \a outVEnd range over the indices of the volume children of the node
+    * and \a outOBegin and \a outOEnd range over the object children of the node */
+  EIGEN_STRONG_INLINE void getChildren(Index index, VolumeIterator &outVBegin, VolumeIterator &outVEnd,
+                                       ObjectIterator &outOBegin, ObjectIterator &outOEnd) const
+  { //inlining this function should open lots of optimization opportunities to the compiler
+    if(index < 0) {
+      outVBegin = outVEnd; //empty volume range, built from the caller's incoming outVEnd
+      if(!objects.empty())
+        outOBegin = &(objects[0]);
+      outOEnd = outOBegin + objects.size(); //output all objects--necessary when the tree has only one object
+      return;
+    }
+
+    int numBoxes = static_cast<int>(boxes.size());
+
+    int idx = index * 2;
+    if(children[idx + 1] < numBoxes) { //second index is always bigger
+      outVBegin = &(children[idx]);
+      outVEnd = outVBegin + 2;
+      outOBegin = outOEnd; //empty object range, built from the caller's incoming outOEnd
+    }
+    else if(children[idx] >= numBoxes) { //if both children are objects
+      outVBegin = outVEnd;
+      outOBegin = &(objects[children[idx] - numBoxes]);
+      outOEnd = outOBegin + 2;
+    } else { //if the first child is a volume and the second is an object
+      outVBegin = &(children[idx]);
+      outVEnd = outVBegin + 1;
+      outOBegin = &(objects[children[idx + 1] - numBoxes]);
+      outOEnd = outOBegin + 1;
+    }
+  }
+
+  /** \returns the bounding box of the node at \a index */
+  inline const Volume &getVolume(Index index) const
+  {
+    return boxes[index];
+  }
+
+private:
+  typedef internal::vector_int_pair<Scalar, Dim> VIPair;
+  typedef std::vector<VIPair, aligned_allocator<VIPair> > VIPairList;
+  typedef Matrix<Scalar, Dim, 1> VectorType;
+  struct VectorComparator //compares vectors, or more specifically, VIPairs along a particular dimension
+  {
+    VectorComparator(int inDim) : dim(inDim) {}
+    inline bool operator()(const VIPair &v1, const VIPair &v2) const { return v1.first[dim] < v2.first[dim]; }
+    int dim;
+  };
+
+  //Build the part of the tree between objects[from] and objects[to] (not including objects[to]).
+  //This routine partitions the objCenters in [from, to) along the dimension dim, recursively constructs
+  //the two halves, and adds their parent node.  TODO: a cache-friendlier layout
+  void build(VIPairList &objCenters, int from, int to, const VolumeList &objBoxes, int dim)
+  {
+    eigen_assert(to - from > 1);
+    if(to - from == 2) {
+      boxes.push_back(objBoxes[objCenters[from].second].merged(objBoxes[objCenters[from + 1].second]));
+      children.push_back(from + (int)objects.size() - 1); //there are objects.size() - 1 tree nodes
+      children.push_back(from + (int)objects.size());
+    }
+    else if(to - from == 3) {
+      int mid = from + 2;
+      std::nth_element(objCenters.begin() + from, objCenters.begin() + mid,
+                        objCenters.begin() + to, VectorComparator(dim)); //partition
+      build(objCenters, from, mid, objBoxes, (dim + 1) % Dim);
+      int idx1 = (int)boxes.size() - 1;
+      boxes.push_back(boxes[idx1].merged(objBoxes[objCenters[mid].second]));
+      children.push_back(idx1);
+      children.push_back(mid + (int)objects.size() - 1);
+    }
+    else {
+      int mid = from + (to - from) / 2;
+      std::nth_element(objCenters.begin() + from, objCenters.begin() + mid,
+                       objCenters.begin() + to, VectorComparator(dim)); //partition; std:: qualified like the 3-element branch so the call does not rely on ADL
+      build(objCenters, from, mid, objBoxes, (dim + 1) % Dim);
+      int idx1 = (int)boxes.size() - 1;
+      build(objCenters, mid, to, objBoxes, (dim + 1) % Dim);
+      int idx2 = (int)boxes.size() - 1;
+      boxes.push_back(boxes[idx1].merged(boxes[idx2]));
+      children.push_back(idx1);
+      children.push_back(idx2);
+    }
+  }
+
+  std::vector<int> children; //children of x are children[2x] and children[2x+1]; an entry c >= boxes.size() refers to objects[c - boxes.size()]
+  VolumeList boxes;
+  ObjectList objects;
+};
+
+} // end namespace Eigen
+
+#endif //KDBVH_H_INCLUDED
diff --git a/src/EigenUnsupported/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h b/src/EigenUnsupported/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h
new file mode 100644
index 0000000..0fbd847
--- /dev/null
+++ b/src/EigenUnsupported/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h
@@ -0,0 +1,790 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+// +// Copyright (C) 2012 David Harmon +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_ARPACKGENERALIZEDSELFADJOINTEIGENSOLVER_H +#define EIGEN_ARPACKGENERALIZEDSELFADJOINTEIGENSOLVER_H + +#include "../../../../Eigen/Dense" + +namespace Eigen { + +namespace internal { + template struct arpack_wrapper; + template struct OP; +} + + + +template, bool BisSPD=false> +class ArpackGeneralizedSelfAdjointEigenSolver +{ +public: + //typedef typename MatrixSolver::MatrixType MatrixType; + + /** \brief Scalar type for matrices of type \p MatrixType. */ + typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::Index Index; + + /** \brief Real scalar type for \p MatrixType. + * + * This is just \c Scalar if #Scalar is real (e.g., \c float or + * \c Scalar), and the type of the real part of \c Scalar if #Scalar is + * complex. + */ + typedef typename NumTraits::Real RealScalar; + + /** \brief Type for vector of eigenvalues as returned by eigenvalues(). + * + * This is a column vector with entries of type #RealScalar. + * The length of the vector is the size of \p nbrEigenvalues. + */ + typedef typename internal::plain_col_type::type RealVectorType; + + /** \brief Default constructor. + * + * The default constructor is for cases in which the user intends to + * perform decompositions via compute(). + * + */ + ArpackGeneralizedSelfAdjointEigenSolver() + : m_eivec(), + m_eivalues(), + m_isInitialized(false), + m_eigenvectorsOk(false), + m_nbrConverged(0), + m_nbrIterations(0) + { } + + /** \brief Constructor; computes generalized eigenvalues of given matrix with respect to another matrix. + * + * \param[in] A Self-adjoint matrix whose eigenvalues / eigenvectors will + * computed. By default, the upper triangular part is used, but can be changed + * through the template parameter. 
+ * \param[in] B Self-adjoint matrix for the generalized eigenvalue problem. + * \param[in] nbrEigenvalues The number of eigenvalues / eigenvectors to compute. + * Must be less than the size of the input matrix, or an error is returned. + * \param[in] eigs_sigma String containing either "LM", "SM", "LA", or "SA", with + * respective meanings to find the largest magnitude , smallest magnitude, + * largest algebraic, or smallest algebraic eigenvalues. Alternatively, this + * value can contain floating point value in string form, in which case the + * eigenvalues closest to this value will be found. + * \param[in] options Can be #ComputeEigenvectors (default) or #EigenvaluesOnly. + * \param[in] tol What tolerance to find the eigenvalues to. Default is 0, which + * means machine precision. + * + * This constructor calls compute(const MatrixType&, const MatrixType&, Index, string, int, RealScalar) + * to compute the eigenvalues of the matrix \p A with respect to \p B. The eigenvectors are computed if + * \p options equals #ComputeEigenvectors. + * + */ + ArpackGeneralizedSelfAdjointEigenSolver(const MatrixType& A, const MatrixType& B, + Index nbrEigenvalues, std::string eigs_sigma="LM", + int options=ComputeEigenvectors, RealScalar tol=0.0) + : m_eivec(), + m_eivalues(), + m_isInitialized(false), + m_eigenvectorsOk(false), + m_nbrConverged(0), + m_nbrIterations(0) + { + compute(A, B, nbrEigenvalues, eigs_sigma, options, tol); + } + + /** \brief Constructor; computes eigenvalues of given matrix. + * + * \param[in] A Self-adjoint matrix whose eigenvalues / eigenvectors will + * computed. By default, the upper triangular part is used, but can be changed + * through the template parameter. + * \param[in] nbrEigenvalues The number of eigenvalues / eigenvectors to compute. + * Must be less than the size of the input matrix, or an error is returned. 
+ * \param[in] eigs_sigma String containing either "LM", "SM", "LA", or "SA", with + * respective meanings to find the largest magnitude , smallest magnitude, + * largest algebraic, or smallest algebraic eigenvalues. Alternatively, this + * value can contain floating point value in string form, in which case the + * eigenvalues closest to this value will be found. + * \param[in] options Can be #ComputeEigenvectors (default) or #EigenvaluesOnly. + * \param[in] tol What tolerance to find the eigenvalues to. Default is 0, which + * means machine precision. + * + * This constructor calls compute(const MatrixType&, Index, string, int, RealScalar) + * to compute the eigenvalues of the matrix \p A. The eigenvectors are computed if + * \p options equals #ComputeEigenvectors. + * + */ + + ArpackGeneralizedSelfAdjointEigenSolver(const MatrixType& A, + Index nbrEigenvalues, std::string eigs_sigma="LM", + int options=ComputeEigenvectors, RealScalar tol=0.0) + : m_eivec(), + m_eivalues(), + m_isInitialized(false), + m_eigenvectorsOk(false), + m_nbrConverged(0), + m_nbrIterations(0) + { + compute(A, nbrEigenvalues, eigs_sigma, options, tol); + } + + + /** \brief Computes generalized eigenvalues / eigenvectors of given matrix using the external ARPACK library. + * + * \param[in] A Selfadjoint matrix whose eigendecomposition is to be computed. + * \param[in] B Selfadjoint matrix for generalized eigenvalues. + * \param[in] nbrEigenvalues The number of eigenvalues / eigenvectors to compute. + * Must be less than the size of the input matrix, or an error is returned. + * \param[in] eigs_sigma String containing either "LM", "SM", "LA", or "SA", with + * respective meanings to find the largest magnitude , smallest magnitude, + * largest algebraic, or smallest algebraic eigenvalues. Alternatively, this + * value can contain floating point value in string form, in which case the + * eigenvalues closest to this value will be found. 
+ * \param[in] options Can be #ComputeEigenvectors (default) or #EigenvaluesOnly. + * \param[in] tol What tolerance to find the eigenvalues to. Default is 0, which + * means machine precision. + * + * \returns Reference to \c *this + * + * This function computes the generalized eigenvalues of \p A with respect to \p B using ARPACK. The eigenvalues() + * function can be used to retrieve them. If \p options equals #ComputeEigenvectors, + * then the eigenvectors are also computed and can be retrieved by + * calling eigenvectors(). + * + */ + ArpackGeneralizedSelfAdjointEigenSolver& compute(const MatrixType& A, const MatrixType& B, + Index nbrEigenvalues, std::string eigs_sigma="LM", + int options=ComputeEigenvectors, RealScalar tol=0.0); + + /** \brief Computes eigenvalues / eigenvectors of given matrix using the external ARPACK library. + * + * \param[in] A Selfadjoint matrix whose eigendecomposition is to be computed. + * \param[in] nbrEigenvalues The number of eigenvalues / eigenvectors to compute. + * Must be less than the size of the input matrix, or an error is returned. + * \param[in] eigs_sigma String containing either "LM", "SM", "LA", or "SA", with + * respective meanings to find the largest magnitude , smallest magnitude, + * largest algebraic, or smallest algebraic eigenvalues. Alternatively, this + * value can contain floating point value in string form, in which case the + * eigenvalues closest to this value will be found. + * \param[in] options Can be #ComputeEigenvectors (default) or #EigenvaluesOnly. + * \param[in] tol What tolerance to find the eigenvalues to. Default is 0, which + * means machine precision. + * + * \returns Reference to \c *this + * + * This function computes the eigenvalues of \p A using ARPACK. The eigenvalues() + * function can be used to retrieve them. If \p options equals #ComputeEigenvectors, + * then the eigenvectors are also computed and can be retrieved by + * calling eigenvectors(). 
+ * + */ + ArpackGeneralizedSelfAdjointEigenSolver& compute(const MatrixType& A, + Index nbrEigenvalues, std::string eigs_sigma="LM", + int options=ComputeEigenvectors, RealScalar tol=0.0); + + + /** \brief Returns the eigenvectors of given matrix. + * + * \returns A const reference to the matrix whose columns are the eigenvectors. + * + * \pre The eigenvectors have been computed before. + * + * Column \f$ k \f$ of the returned matrix is an eigenvector corresponding + * to eigenvalue number \f$ k \f$ as returned by eigenvalues(). The + * eigenvectors are normalized to have (Euclidean) norm equal to one. If + * this object was used to solve the eigenproblem for the selfadjoint + * matrix \f$ A \f$, then the matrix returned by this function is the + * matrix \f$ V \f$ in the eigendecomposition \f$ A V = D V \f$. + * For the generalized eigenproblem, the matrix returned is the solution \f$ A V = D B V \f$ + * + * Example: \include SelfAdjointEigenSolver_eigenvectors.cpp + * Output: \verbinclude SelfAdjointEigenSolver_eigenvectors.out + * + * \sa eigenvalues() + */ + const Matrix& eigenvectors() const + { + eigen_assert(m_isInitialized && "ArpackGeneralizedSelfAdjointEigenSolver is not initialized."); + eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues."); + return m_eivec; + } + + /** \brief Returns the eigenvalues of given matrix. + * + * \returns A const reference to the column vector containing the eigenvalues. + * + * \pre The eigenvalues have been computed before. + * + * The eigenvalues are repeated according to their algebraic multiplicity, + * so there are as many eigenvalues as rows in the matrix. The eigenvalues + * are sorted in increasing order. 
+ * + * Example: \include SelfAdjointEigenSolver_eigenvalues.cpp + * Output: \verbinclude SelfAdjointEigenSolver_eigenvalues.out + * + * \sa eigenvectors(), MatrixBase::eigenvalues() + */ + const Matrix& eigenvalues() const + { + eigen_assert(m_isInitialized && "ArpackGeneralizedSelfAdjointEigenSolver is not initialized."); + return m_eivalues; + } + + /** \brief Computes the positive-definite square root of the matrix. + * + * \returns the positive-definite square root of the matrix + * + * \pre The eigenvalues and eigenvectors of a positive-definite matrix + * have been computed before. + * + * The square root of a positive-definite matrix \f$ A \f$ is the + * positive-definite matrix whose square equals \f$ A \f$. This function + * uses the eigendecomposition \f$ A = V D V^{-1} \f$ to compute the + * square root as \f$ A^{1/2} = V D^{1/2} V^{-1} \f$. + * + * Example: \include SelfAdjointEigenSolver_operatorSqrt.cpp + * Output: \verbinclude SelfAdjointEigenSolver_operatorSqrt.out + * + * \sa operatorInverseSqrt(), + * \ref MatrixFunctions_Module "MatrixFunctions Module" + */ + Matrix operatorSqrt() const + { + eigen_assert(m_isInitialized && "SelfAdjointEigenSolver is not initialized."); + eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues."); + return m_eivec * m_eivalues.cwiseSqrt().asDiagonal() * m_eivec.adjoint(); + } + + /** \brief Computes the inverse square root of the matrix. + * + * \returns the inverse positive-definite square root of the matrix + * + * \pre The eigenvalues and eigenvectors of a positive-definite matrix + * have been computed before. + * + * This function uses the eigendecomposition \f$ A = V D V^{-1} \f$ to + * compute the inverse square root as \f$ V D^{-1/2} V^{-1} \f$. This is + * cheaper than first computing the square root with operatorSqrt() and + * then its inverse with MatrixBase::inverse(). 
+ * + * Example: \include SelfAdjointEigenSolver_operatorInverseSqrt.cpp + * Output: \verbinclude SelfAdjointEigenSolver_operatorInverseSqrt.out + * + * \sa operatorSqrt(), MatrixBase::inverse(), + * \ref MatrixFunctions_Module "MatrixFunctions Module" + */ + Matrix operatorInverseSqrt() const + { + eigen_assert(m_isInitialized && "SelfAdjointEigenSolver is not initialized."); + eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues."); + return m_eivec * m_eivalues.cwiseInverse().cwiseSqrt().asDiagonal() * m_eivec.adjoint(); + } + + /** \brief Reports whether previous computation was successful. + * + * \returns \c Success if computation was successful, \c NoConvergence otherwise. + */ + ComputationInfo info() const + { + eigen_assert(m_isInitialized && "ArpackGeneralizedSelfAdjointEigenSolver is not initialized."); + return m_info; + } + + size_t getNbrConvergedEigenValues() const + { return m_nbrConverged; } + + size_t getNbrIterations() const + { return m_nbrIterations; } + +protected: + Matrix m_eivec; + Matrix m_eivalues; + ComputationInfo m_info; + bool m_isInitialized; + bool m_eigenvectorsOk; + + size_t m_nbrConverged; + size_t m_nbrIterations; +}; + + + + + +template +ArpackGeneralizedSelfAdjointEigenSolver& + ArpackGeneralizedSelfAdjointEigenSolver +::compute(const MatrixType& A, Index nbrEigenvalues, + std::string eigs_sigma, int options, RealScalar tol) +{ + MatrixType B(0,0); + compute(A, B, nbrEigenvalues, eigs_sigma, options, tol); + + return *this; +} + + +template +ArpackGeneralizedSelfAdjointEigenSolver& + ArpackGeneralizedSelfAdjointEigenSolver +::compute(const MatrixType& A, const MatrixType& B, Index nbrEigenvalues, + std::string eigs_sigma, int options, RealScalar tol) +{ + eigen_assert(A.cols() == A.rows()); + eigen_assert(B.cols() == B.rows()); + eigen_assert(B.rows() == 0 || A.cols() == B.rows()); + eigen_assert((options &~ (EigVecMask | GenEigMask)) == 0 + && (options & EigVecMask) != 
EigVecMask + && "invalid option parameter"); + + bool isBempty = (B.rows() == 0) || (B.cols() == 0); + + // For clarity, all parameters match their ARPACK name + // + // Always 0 on the first call + // + int ido = 0; + + int n = (int)A.cols(); + + // User options: "LA", "SA", "SM", "LM", "BE" + // + char whch[3] = "LM"; + + // Specifies the shift if iparam[6] = { 3, 4, 5 }, not used if iparam[6] = { 1, 2 } + // + RealScalar sigma = 0.0; + + if (eigs_sigma.length() >= 2 && isalpha(eigs_sigma[0]) && isalpha(eigs_sigma[1])) + { + eigs_sigma[0] = toupper(eigs_sigma[0]); + eigs_sigma[1] = toupper(eigs_sigma[1]); + + // In the following special case we're going to invert the problem, since solving + // for larger magnitude is much much faster + // i.e., if 'SM' is specified, we're going to really use 'LM', the default + // + if (eigs_sigma.substr(0,2) != "SM") + { + whch[0] = eigs_sigma[0]; + whch[1] = eigs_sigma[1]; + } + } + else + { + eigen_assert(false && "Specifying clustered eigenvalues is not yet supported!"); + + // If it's not scalar values, then the user may be explicitly + // specifying the sigma value to cluster the evs around + // + sigma = atof(eigs_sigma.c_str()); + + // If atof fails, it returns 0.0, which is a fine default + // + } + + // "I" means normal eigenvalue problem, "G" means generalized + // + char bmat[2] = "I"; + if (eigs_sigma.substr(0,2) == "SM" || !(isalpha(eigs_sigma[0]) && isalpha(eigs_sigma[1])) || (!isBempty && !BisSPD)) + bmat[0] = 'G'; + + // Now we determine the mode to use + // + int mode = (bmat[0] == 'G') + 1; + if (eigs_sigma.substr(0,2) == "SM" || !(isalpha(eigs_sigma[0]) && isalpha(eigs_sigma[1]))) + { + // We're going to use shift-and-invert mode, and basically find + // the largest eigenvalues of the inverse operator + // + mode = 3; + } + + // The user-specified number of eigenvalues/vectors to compute + // + int nev = (int)nbrEigenvalues; + + // Allocate space for ARPACK to store the residual + // + Scalar *resid = new 
Scalar[n]; + + // Number of Lanczos vectors, must satisfy nev < ncv <= n + // Note that this indicates that nev != n, and we cannot compute + // all eigenvalues of a mtrix + // + int ncv = std::min(std::max(2*nev, 20), n); + + // The working n x ncv matrix, also store the final eigenvectors (if computed) + // + Scalar *v = new Scalar[n*ncv]; + int ldv = n; + + // Working space + // + Scalar *workd = new Scalar[3*n]; + int lworkl = ncv*ncv+8*ncv; // Must be at least this length + Scalar *workl = new Scalar[lworkl]; + + int *iparam= new int[11]; + iparam[0] = 1; // 1 means we let ARPACK perform the shifts, 0 means we'd have to do it + iparam[2] = std::max(300, (int)std::ceil(2*n/std::max(ncv,1))); + iparam[6] = mode; // The mode, 1 is standard ev problem, 2 for generalized ev, 3 for shift-and-invert + + // Used during reverse communicate to notify where arrays start + // + int *ipntr = new int[11]; + + // Error codes are returned in here, initial value of 0 indicates a random initial + // residual vector is used, any other values means resid contains the initial residual + // vector, possibly from a previous run + // + int info = 0; + + Scalar scale = 1.0; + //if (!isBempty) + //{ + //Scalar scale = B.norm() / std::sqrt(n); + //scale = std::pow(2, std::floor(std::log(scale+1))); + ////M /= scale; + //for (size_t i=0; i<(size_t)B.outerSize(); i++) + // for (typename MatrixType::InnerIterator it(B, i); it; ++it) + // it.valueRef() /= scale; + //} + + MatrixSolver OP; + if (mode == 1 || mode == 2) + { + if (!isBempty) + OP.compute(B); + } + else if (mode == 3) + { + if (sigma == 0.0) + { + OP.compute(A); + } + else + { + // Note: We will never enter here because sigma must be 0.0 + // + if (isBempty) + { + MatrixType AminusSigmaB(A); + for (Index i=0; i::saupd(&ido, bmat, &n, whch, &nev, &tol, resid, + &ncv, v, &ldv, iparam, ipntr, workd, workl, + &lworkl, &info); + + if (ido == -1 || ido == 1) + { + Scalar *in = workd + ipntr[0] - 1; + Scalar *out = workd + ipntr[1] - 
1; + + if (ido == 1 && mode != 2) + { + Scalar *out2 = workd + ipntr[2] - 1; + if (isBempty || mode == 1) + Matrix::Map(out2, n) = Matrix::Map(in, n); + else + Matrix::Map(out2, n) = B * Matrix::Map(in, n); + + in = workd + ipntr[2] - 1; + } + + if (mode == 1) + { + if (isBempty) + { + // OP = A + // + Matrix::Map(out, n) = A * Matrix::Map(in, n); + } + else + { + // OP = L^{-1}AL^{-T} + // + internal::OP::applyOP(OP, A, n, in, out); + } + } + else if (mode == 2) + { + if (ido == 1) + Matrix::Map(in, n) = A * Matrix::Map(in, n); + + // OP = B^{-1} A + // + Matrix::Map(out, n) = OP.solve(Matrix::Map(in, n)); + } + else if (mode == 3) + { + // OP = (A-\sigmaB)B (\sigma could be 0, and B could be I) + // The B * in is already computed and stored at in if ido == 1 + // + if (ido == 1 || isBempty) + Matrix::Map(out, n) = OP.solve(Matrix::Map(in, n)); + else + Matrix::Map(out, n) = OP.solve(B * Matrix::Map(in, n)); + } + } + else if (ido == 2) + { + Scalar *in = workd + ipntr[0] - 1; + Scalar *out = workd + ipntr[1] - 1; + + if (isBempty || mode == 1) + Matrix::Map(out, n) = Matrix::Map(in, n); + else + Matrix::Map(out, n) = B * Matrix::Map(in, n); + } + } while (ido != 99); + + if (info == 1) + m_info = NoConvergence; + else if (info == 3) + m_info = NumericalIssue; + else if (info < 0) + m_info = InvalidInput; + else if (info != 0) + eigen_assert(false && "Unknown ARPACK return value!"); + else + { + // Do we compute eigenvectors or not? 
+ // + int rvec = (options & ComputeEigenvectors) == ComputeEigenvectors; + + // "A" means "All", use "S" to choose specific eigenvalues (not yet supported in ARPACK)) + // + char howmny[2] = "A"; + + // if howmny == "S", specifies the eigenvalues to compute (not implemented in ARPACK) + // + int *select = new int[ncv]; + + // Final eigenvalues + // + m_eivalues.resize(nev, 1); + + internal::arpack_wrapper::seupd(&rvec, howmny, select, m_eivalues.data(), v, &ldv, + &sigma, bmat, &n, whch, &nev, &tol, resid, &ncv, + v, &ldv, iparam, ipntr, workd, workl, &lworkl, &info); + + if (info == -14) + m_info = NoConvergence; + else if (info != 0) + m_info = InvalidInput; + else + { + if (rvec) + { + m_eivec.resize(A.rows(), nev); + for (int i=0; i::project(OP, n, nev, m_eivec.data()); + + m_eigenvectorsOk = true; + } + + m_nbrIterations = iparam[2]; + m_nbrConverged = iparam[4]; + + m_info = Success; + } + + delete[] select; + } + + delete[] v; + delete[] iparam; + delete[] ipntr; + delete[] workd; + delete[] workl; + delete[] resid; + + m_isInitialized = true; + + return *this; +} + + +// Single precision +// +extern "C" void ssaupd_(int *ido, char *bmat, int *n, char *which, + int *nev, float *tol, float *resid, int *ncv, + float *v, int *ldv, int *iparam, int *ipntr, + float *workd, float *workl, int *lworkl, + int *info); + +extern "C" void sseupd_(int *rvec, char *All, int *select, float *d, + float *z, int *ldz, float *sigma, + char *bmat, int *n, char *which, int *nev, + float *tol, float *resid, int *ncv, float *v, + int *ldv, int *iparam, int *ipntr, float *workd, + float *workl, int *lworkl, int *ierr); + +// Double precision +// +extern "C" void dsaupd_(int *ido, char *bmat, int *n, char *which, + int *nev, double *tol, double *resid, int *ncv, + double *v, int *ldv, int *iparam, int *ipntr, + double *workd, double *workl, int *lworkl, + int *info); + +extern "C" void dseupd_(int *rvec, char *All, int *select, double *d, + double *z, int *ldz, double *sigma, + 
char *bmat, int *n, char *which, int *nev, + double *tol, double *resid, int *ncv, double *v, + int *ldv, int *iparam, int *ipntr, double *workd, + double *workl, int *lworkl, int *ierr); + + +namespace internal { + +template struct arpack_wrapper +{ + static inline void saupd(int *ido, char *bmat, int *n, char *which, + int *nev, RealScalar *tol, Scalar *resid, int *ncv, + Scalar *v, int *ldv, int *iparam, int *ipntr, + Scalar *workd, Scalar *workl, int *lworkl, int *info) + { + EIGEN_STATIC_ASSERT(!NumTraits::IsComplex, NUMERIC_TYPE_MUST_BE_REAL) + } + + static inline void seupd(int *rvec, char *All, int *select, Scalar *d, + Scalar *z, int *ldz, RealScalar *sigma, + char *bmat, int *n, char *which, int *nev, + RealScalar *tol, Scalar *resid, int *ncv, Scalar *v, + int *ldv, int *iparam, int *ipntr, Scalar *workd, + Scalar *workl, int *lworkl, int *ierr) + { + EIGEN_STATIC_ASSERT(!NumTraits::IsComplex, NUMERIC_TYPE_MUST_BE_REAL) + } +}; + +template <> struct arpack_wrapper +{ + static inline void saupd(int *ido, char *bmat, int *n, char *which, + int *nev, float *tol, float *resid, int *ncv, + float *v, int *ldv, int *iparam, int *ipntr, + float *workd, float *workl, int *lworkl, int *info) + { + ssaupd_(ido, bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, ipntr, workd, workl, lworkl, info); + } + + static inline void seupd(int *rvec, char *All, int *select, float *d, + float *z, int *ldz, float *sigma, + char *bmat, int *n, char *which, int *nev, + float *tol, float *resid, int *ncv, float *v, + int *ldv, int *iparam, int *ipntr, float *workd, + float *workl, int *lworkl, int *ierr) + { + sseupd_(rvec, All, select, d, z, ldz, sigma, bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, ipntr, + workd, workl, lworkl, ierr); + } +}; + +template <> struct arpack_wrapper +{ + static inline void saupd(int *ido, char *bmat, int *n, char *which, + int *nev, double *tol, double *resid, int *ncv, + double *v, int *ldv, int *iparam, int *ipntr, + double *workd, 
double *workl, int *lworkl, int *info) + { + dsaupd_(ido, bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, ipntr, workd, workl, lworkl, info); + } + + static inline void seupd(int *rvec, char *All, int *select, double *d, + double *z, int *ldz, double *sigma, + char *bmat, int *n, char *which, int *nev, + double *tol, double *resid, int *ncv, double *v, + int *ldv, int *iparam, int *ipntr, double *workd, + double *workl, int *lworkl, int *ierr) + { + dseupd_(rvec, All, select, d, v, ldv, sigma, bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, ipntr, + workd, workl, lworkl, ierr); + } +}; + + +template +struct OP +{ + static inline void applyOP(MatrixSolver &OP, const MatrixType &A, int n, Scalar *in, Scalar *out); + static inline void project(MatrixSolver &OP, int n, int k, Scalar *vecs); +}; + +template +struct OP +{ + static inline void applyOP(MatrixSolver &OP, const MatrixType &A, int n, Scalar *in, Scalar *out) +{ + // OP = L^{-1} A L^{-T} (B = LL^T) + // + // First solve L^T out = in + // + Matrix::Map(out, n) = OP.matrixU().solve(Matrix::Map(in, n)); + Matrix::Map(out, n) = OP.permutationPinv() * Matrix::Map(out, n); + + // Then compute out = A out + // + Matrix::Map(out, n) = A * Matrix::Map(out, n); + + // Then solve L out = out + // + Matrix::Map(out, n) = OP.permutationP() * Matrix::Map(out, n); + Matrix::Map(out, n) = OP.matrixL().solve(Matrix::Map(out, n)); +} + + static inline void project(MatrixSolver &OP, int n, int k, Scalar *vecs) +{ + // Solve L^T out = in + // + Matrix::Map(vecs, n, k) = OP.matrixU().solve(Matrix::Map(vecs, n, k)); + Matrix::Map(vecs, n, k) = OP.permutationPinv() * Matrix::Map(vecs, n, k); +} + +}; + +template +struct OP +{ + static inline void applyOP(MatrixSolver &OP, const MatrixType &A, int n, Scalar *in, Scalar *out) +{ + eigen_assert(false && "Should never be in here..."); +} + + static inline void project(MatrixSolver &OP, int n, int k, Scalar *vecs) +{ + eigen_assert(false && "Should never be in here..."); +} + 
+}; + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_ARPACKSELFADJOINTEIGENSOLVER_H + diff --git a/src/EigenUnsupported/src/EulerAngles/CMakeLists.txt b/src/EigenUnsupported/src/EulerAngles/CMakeLists.txt new file mode 100644 index 0000000..22088eb --- /dev/null +++ b/src/EigenUnsupported/src/EulerAngles/CMakeLists.txt @@ -0,0 +1,6 @@ +file(GLOB Eigen_EulerAngles_SRCS "*.h") + +install(FILES + ${Eigen_EulerAngles_SRCS} + DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/EulerAngles COMPONENT Devel + ) diff --git a/src/EigenUnsupported/src/EulerAngles/EulerAngles.h b/src/EigenUnsupported/src/EulerAngles/EulerAngles.h new file mode 100644 index 0000000..e43cdb7 --- /dev/null +++ b/src/EigenUnsupported/src/EulerAngles/EulerAngles.h @@ -0,0 +1,355 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Tal Hadad +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_EULERANGLESCLASS_H// TODO: Fix previous "EIGEN_EULERANGLES_H" definition? +#define EIGEN_EULERANGLESCLASS_H + +namespace Eigen +{ + /** \class EulerAngles + * + * \ingroup EulerAngles_Module + * + * \brief Represents a rotation in a 3 dimensional space as three Euler angles. + * + * Euler rotation is a set of three rotation of three angles over three fixed axes, defined by the EulerSystem given as a template parameter. 
+ * + * Here is how intrinsic Euler angles work: + * - first, rotate the axes system over the alpha axis in angle alpha + * - then, rotate the axes system over the beta axis (which was rotated in the first stage) in angle beta + * - then, rotate the axes system over the gamma axis (which was rotated in the two stages above) in angle gamma + * + * \note This class supports only intrinsic Euler angles for simplicity, + * see EulerSystem for how to easily overcome this for extrinsic systems. + * + * ### Rotation representation and conversions ### + * + * It has been proved (see Wikipedia link below) that every rotation can be represented + * by Euler angles, but there is no single representation (e.g. unlike rotation matrices). + * Therefore, you can convert Eigen rotations to Euler angles and back + * (including rotation matrices, which are not called "rotations" by Eigen design). + * + * Euler angles are usually used for: + * - convenient human representation of rotation, especially in interactive GUI. + * - gimbal systems and robotics + * - efficient encoding (i.e. 3 floats only) of rotation for network protocols. + * + * However, Euler angles are slow compared to quaternions or matrices, + * because of their unnatural math definition, although it's simple for humans. + * To overcome this, this class provides easy movement from the math friendly representation + * to the human friendly representation, and vice versa. + * + * All the user needs to do is a safe simple C++ type conversion, + * and this class takes care of the math. + * Additionally, some axes related computation is done at compile time. + * + * #### Euler angles ranges in conversions #### + * The representation of a rotation as EulerAngles is not unique (unlike matrices), + * and a rotation even has infinitely many EulerAngles representations.
+ * For example, add or subtract 2*PI from any angle of EulerAngles + * and you'll get the same rotation. + * This is the general reason for infinite representation, + * but it's not the only general reason for not having a single representation. + * + * When converting a rotation to EulerAngles, this class converts it to specific ranges. + * When converting some rotation to EulerAngles, the rules for ranges are as follows: + * - If the rotation we are converting from is an EulerAngles + * (even when it is represented as RotationBase explicitly), angles ranges are __undefined__. + * - otherwise, alpha and gamma angles will be in the range [-PI, PI].
+ * As for Beta angle: + * - If the system is Tait-Bryan, the beta angle will be in the range [-PI/2, PI/2]. + * - otherwise: + * - If the beta axis is positive, the beta angle will be in the range [0, PI] + * - If the beta axis is negative, the beta angle will be in the range [-PI, 0] + * + * \sa EulerAngles(const MatrixBase&) + * \sa EulerAngles(const RotationBase&) + * + * ### Convenient user typedefs ### + * + * Convenient typedefs for EulerAngles exist for float and double scalar, + * in a form of EulerAngles{A}{B}{C}{scalar}, + * e.g. \ref EulerAnglesXYZd, \ref EulerAnglesZYZf. + * + * Only for positive axes{+x,+y,+z} Euler systems are have convenient typedef. + * If you need negative axes{-x,-y,-z}, it is recommended to create you own typedef with + * a word that represent what you need. + * + * ### Example ### + * + * \include EulerAngles.cpp + * Output: \verbinclude EulerAngles.out + * + * ### Additional reading ### + * + * If you're want to get more idea about how Euler system work in Eigen see EulerSystem. + * + * More information about Euler angles: https://en.wikipedia.org/wiki/Euler_angles + * + * \tparam _Scalar the scalar type, i.e. the type of the angles. + * + * \tparam _System the EulerSystem to use, which represents the axes of rotation. + */ + template + class EulerAngles : public RotationBase, 3> + { + public: + typedef RotationBase, 3> Base; + + /** the scalar type of the angles */ + typedef _Scalar Scalar; + typedef typename NumTraits::Real RealScalar; + + /** the EulerSystem to use, which represents the axes of rotation. 
*/ + typedef _System System; + + typedef Matrix Matrix3; /*!< the equivalent rotation matrix type */ + typedef Matrix Vector3; /*!< the equivalent 3 dimension vector type */ + typedef Quaternion QuaternionType; /*!< the equivalent quaternion type */ + typedef AngleAxis AngleAxisType; /*!< the equivalent angle-axis type */ + + /** \returns the axis vector of the first (alpha) rotation */ + static Vector3 AlphaAxisVector() { + const Vector3& u = Vector3::Unit(System::AlphaAxisAbs - 1); + return System::IsAlphaOpposite ? -u : u; + } + + /** \returns the axis vector of the second (beta) rotation */ + static Vector3 BetaAxisVector() { + const Vector3& u = Vector3::Unit(System::BetaAxisAbs - 1); + return System::IsBetaOpposite ? -u : u; + } + + /** \returns the axis vector of the third (gamma) rotation */ + static Vector3 GammaAxisVector() { + const Vector3& u = Vector3::Unit(System::GammaAxisAbs - 1); + return System::IsGammaOpposite ? -u : u; + } + + private: + Vector3 m_angles; + + public: + /** Default constructor without initialization. */ + EulerAngles() {} + /** Constructs and initialize an EulerAngles (\p alpha, \p beta, \p gamma). */ + EulerAngles(const Scalar& alpha, const Scalar& beta, const Scalar& gamma) : + m_angles(alpha, beta, gamma) {} + + // TODO: Test this constructor + /** Constructs and initialize an EulerAngles from the array data {alpha, beta, gamma} */ + explicit EulerAngles(const Scalar* data) : m_angles(data) {} + + /** Constructs and initializes an EulerAngles from either: + * - a 3x3 rotation matrix expression(i.e. pure orthogonal matrix with determinant of +1), + * - a 3D vector expression representing Euler angles. + * + * \note If \p other is a 3x3 rotation matrix, the angles range rules will be as follow:
+ * Alpha and gamma angles will be in the range [-PI, PI].
+ * As for Beta angle: + * - If the system is Tait-Bryan, the beta angle will be in the range [-PI/2, PI/2]. + * - otherwise: + * - If the beta axis is positive, the beta angle will be in the range [0, PI] + * - If the beta axis is negative, the beta angle will be in the range [-PI, 0] + */ + template + explicit EulerAngles(const MatrixBase& other) { *this = other; } + + /** Constructs and initialize Euler angles from a rotation \p rot. + * + * \note If \p rot is an EulerAngles (even when it represented as RotationBase explicitly), + * angles ranges are __undefined__. + * Otherwise, alpha and gamma angles will be in the range [-PI, PI].
+ * As for Beta angle: + * - If the system is Tait-Bryan, the beta angle will be in the range [-PI/2, PI/2]. + * - otherwise: + * - If the beta axis is positive, the beta angle will be in the range [0, PI] + * - If the beta axis is negative, the beta angle will be in the range [-PI, 0] + */ + template + EulerAngles(const RotationBase& rot) { System::CalcEulerAngles(*this, rot.toRotationMatrix()); } + + /*EulerAngles(const QuaternionType& q) + { + // TODO: Implement it in a faster way for quaternions + // According to http://www.euclideanspace.com/maths/geometry/rotations/conversions/quaternionToEuler/ + // we can compute only the needed matrix cells and then convert to euler angles. (see ZYX example below) + // Currently we compute all matrix cells from quaternion. + + // Special case only for ZYX + //Scalar y2 = q.y() * q.y(); + //m_angles[0] = std::atan2(2*(q.w()*q.z() + q.x()*q.y()), (1 - 2*(y2 + q.z()*q.z()))); + //m_angles[1] = std::asin( 2*(q.w()*q.y() - q.z()*q.x())); + //m_angles[2] = std::atan2(2*(q.w()*q.x() + q.y()*q.z()), (1 - 2*(q.x()*q.x() + y2))); + }*/ + + /** \returns The angle values stored in a vector (alpha, beta, gamma). */ + const Vector3& angles() const { return m_angles; } + /** \returns A read-write reference to the angle values stored in a vector (alpha, beta, gamma). */ + Vector3& angles() { return m_angles; } + + /** \returns The value of the first angle. */ + Scalar alpha() const { return m_angles[0]; } + /** \returns A read-write reference to the angle of the first angle. */ + Scalar& alpha() { return m_angles[0]; } + + /** \returns The value of the second angle. */ + Scalar beta() const { return m_angles[1]; } + /** \returns A read-write reference to the angle of the second angle. */ + Scalar& beta() { return m_angles[1]; } + + /** \returns The value of the third angle. */ + Scalar gamma() const { return m_angles[2]; } + /** \returns A read-write reference to the angle of the third angle. 
*/ + Scalar& gamma() { return m_angles[2]; } + + /** \returns The Euler angles rotation inverse (which is as same as the negative), + * (-alpha, -beta, -gamma). + */ + EulerAngles inverse() const + { + EulerAngles res; + res.m_angles = -m_angles; + return res; + } + + /** \returns The Euler angles rotation negative (which is as same as the inverse), + * (-alpha, -beta, -gamma). + */ + EulerAngles operator -() const + { + return inverse(); + } + + /** Set \c *this from either: + * - a 3x3 rotation matrix expression(i.e. pure orthogonal matrix with determinant of +1), + * - a 3D vector expression representing Euler angles. + * + * See EulerAngles(const MatrixBase&) for more information about + * angles ranges output. + */ + template + EulerAngles& operator=(const MatrixBase& other) + { + EIGEN_STATIC_ASSERT((internal::is_same::value), + YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) + + internal::eulerangles_assign_impl::run(*this, other.derived()); + return *this; + } + + // TODO: Assign and construct from another EulerAngles (with different system) + + /** Set \c *this from a rotation. + * + * See EulerAngles(const RotationBase&) for more information about + * angles ranges output. + */ + template + EulerAngles& operator=(const RotationBase& rot) { + System::CalcEulerAngles(*this, rot.toRotationMatrix()); + return *this; + } + + /** \returns \c true if \c *this is approximately equal to \a other, within the precision + * determined by \a prec. + * + * \sa MatrixBase::isApprox() */ + bool isApprox(const EulerAngles& other, + const RealScalar& prec = NumTraits::dummy_precision()) const + { return angles().isApprox(other.angles(), prec); } + + /** \returns an equivalent 3x3 rotation matrix. */ + Matrix3 toRotationMatrix() const + { + // TODO: Calc it faster + return static_cast(*this).toRotationMatrix(); + } + + /** Convert the Euler angles to quaternion. 
*/ + operator QuaternionType() const + { + return + AngleAxisType(alpha(), AlphaAxisVector()) * + AngleAxisType(beta(), BetaAxisVector()) * + AngleAxisType(gamma(), GammaAxisVector()); + } + + friend std::ostream& operator<<(std::ostream& s, const EulerAngles& eulerAngles) + { + s << eulerAngles.angles().transpose(); + return s; + } + + /** \returns \c *this with scalar type casted to \a NewScalarType */ + template + EulerAngles cast() const + { + EulerAngles e; + e.angles() = angles().template cast(); + return e; + } + }; + +#define EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(AXES, SCALAR_TYPE, SCALAR_POSTFIX) \ + /** \ingroup EulerAngles_Module */ \ + typedef EulerAngles EulerAngles##AXES##SCALAR_POSTFIX; + +#define EIGEN_EULER_ANGLES_TYPEDEFS(SCALAR_TYPE, SCALAR_POSTFIX) \ + EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(XYZ, SCALAR_TYPE, SCALAR_POSTFIX) \ + EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(XYX, SCALAR_TYPE, SCALAR_POSTFIX) \ + EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(XZY, SCALAR_TYPE, SCALAR_POSTFIX) \ + EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(XZX, SCALAR_TYPE, SCALAR_POSTFIX) \ + \ + EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(YZX, SCALAR_TYPE, SCALAR_POSTFIX) \ + EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(YZY, SCALAR_TYPE, SCALAR_POSTFIX) \ + EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(YXZ, SCALAR_TYPE, SCALAR_POSTFIX) \ + EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(YXY, SCALAR_TYPE, SCALAR_POSTFIX) \ + \ + EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(ZXY, SCALAR_TYPE, SCALAR_POSTFIX) \ + EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(ZXZ, SCALAR_TYPE, SCALAR_POSTFIX) \ + EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(ZYX, SCALAR_TYPE, SCALAR_POSTFIX) \ + EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(ZYZ, SCALAR_TYPE, SCALAR_POSTFIX) + +EIGEN_EULER_ANGLES_TYPEDEFS(float, f) +EIGEN_EULER_ANGLES_TYPEDEFS(double, d) + + namespace internal + { + template + struct traits > + { + typedef _Scalar Scalar; + }; + + // set from a rotation matrix + template + struct eulerangles_assign_impl + { + typedef typename Other::Scalar Scalar; + static void run(EulerAngles& e, const Other& m) + { + 
System::CalcEulerAngles(e, m); + } + }; + + // set from a vector of Euler angles + template + struct eulerangles_assign_impl + { + typedef typename Other::Scalar Scalar; + static void run(EulerAngles& e, const Other& vec) + { + e.angles() = vec; + } + }; + } +} + +#endif // EIGEN_EULERANGLESCLASS_H diff --git a/src/EigenUnsupported/src/EulerAngles/EulerSystem.h b/src/EigenUnsupported/src/EulerAngles/EulerSystem.h new file mode 100644 index 0000000..2a833b0 --- /dev/null +++ b/src/EigenUnsupported/src/EulerAngles/EulerSystem.h @@ -0,0 +1,305 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Tal Hadad +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_EULERSYSTEM_H +#define EIGEN_EULERSYSTEM_H + +namespace Eigen +{ + // Forward declarations + template + class EulerAngles; + + namespace internal + { + // TODO: Add this trait to the Eigen internal API? + template 0)> + struct Abs + { + enum { value = Num }; + }; + + template + struct Abs + { + enum { value = -Num }; + }; + + template + struct IsValidAxis + { + enum { value = Axis != 0 && Abs::value <= 3 }; + }; + + template + struct eulerangles_assign_impl; + } + + #define EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT(COND,MSG) typedef char static_assertion_##MSG[(COND)?1:-1] + + /** \brief Representation of a fixed signed rotation axis for EulerSystem. + * + * \ingroup EulerAngles_Module + * + * Values here represent: + * - The axis of the rotation: X, Y or Z. + * - The sign (i.e. direction of the rotation along the axis): positive(+) or negative(-) + * + * Therefore, this could express all the axes {+X,+Y,+Z,-X,-Y,-Z} + * + * For positive axis, use +EULER_{axis}, and for negative axis use -EULER_{axis}. 
+ */ + enum EulerAxis + { + EULER_X = 1, /*!< the X axis */ + EULER_Y = 2, /*!< the Y axis */ + EULER_Z = 3 /*!< the Z axis */ + }; + + /** \class EulerSystem + * + * \ingroup EulerAngles_Module + * + * \brief Represents a fixed Euler rotation system. + * + * The goal of this meta-class is to represent the Euler system at compile time, for EulerAngles. + * + * You can use this class to get two things: + * - Build an Euler system, and then pass it as a template parameter to EulerAngles. + * - Query some compile time data about an Euler system. (e.g. Whether it's Tait-Bryan) + * + * Euler rotation is a set of three rotations on fixed axes. (see \ref EulerAngles) + * This meta-class stores those signed axes as compile-time constants. (see \ref EulerAxis) + * + * ### Types of Euler systems ### + * + * All and only valid 3 dimension Euler rotations over standard + * signed axes{+X,+Y,+Z,-X,-Y,-Z} are supported: + * - all axes X, Y, Z in each valid order (see below what order is valid) + * - rotation over the axis is supported both over the positive and negative directions. + * - both Tait-Bryan and proper/classic Euler angles (i.e. the opposite). + * + * Since EulerSystem supports both positive and negative directions, + * you may call this rotation distinction in other names: + * - _right handed_ or _left handed_ + * - _counterclockwise_ or _clockwise_ + * + * Notice that not all axes combinations are valid; an invalid one triggers a static assertion. + * Same unsigned axes can't be neighbors, e.g. {X,X,Y} is invalid. + * This yields two and only two classes: + * - _Tait-Bryan_ - all unsigned axes are distinct, e.g. {X,Y,Z} + * - _proper/classic Euler angles_ - The first and the third unsigned axes are equal, + * and the second is different, e.g. {X,Y,X} + * + * ### Intrinsic vs extrinsic Euler systems ### + * + * Only intrinsic Euler systems are supported for simplicity. + * If you want to use extrinsic Euler systems, + * just use the equal intrinsic opposite order for axes and angles.
+ * I.e axes (A,B,C) becomes (C,B,A), and angles (a,b,c) becomes (c,b,a). + * + * ### Convenient user typedefs ### + * + * Convenient typedefs for EulerSystem exist (only for positive axes Euler systems), + * in a form of EulerSystem{A}{B}{C}, e.g. \ref EulerSystemXYZ. + * + * ### Additional reading ### + * + * More information about Euler angles: https://en.wikipedia.org/wiki/Euler_angles + * + * \tparam _AlphaAxis the first fixed EulerAxis + * + * \tparam _BetaAxis the second fixed EulerAxis + * + * \tparam _GammaAxis the third fixed EulerAxis + */ + template + class EulerSystem + { + public: + // It's defined this way and not as enum, because I think + // that enum is not guerantee to support negative numbers + + /** The first rotation axis */ + static const int AlphaAxis = _AlphaAxis; + + /** The second rotation axis */ + static const int BetaAxis = _BetaAxis; + + /** The third rotation axis */ + static const int GammaAxis = _GammaAxis; + + enum + { + AlphaAxisAbs = internal::Abs::value, /*!< the first rotation axis unsigned */ + BetaAxisAbs = internal::Abs::value, /*!< the second rotation axis unsigned */ + GammaAxisAbs = internal::Abs::value, /*!< the third rotation axis unsigned */ + + IsAlphaOpposite = (AlphaAxis < 0) ? 1 : 0, /*!< whether alpha axis is negative */ + IsBetaOpposite = (BetaAxis < 0) ? 1 : 0, /*!< whether beta axis is negative */ + IsGammaOpposite = (GammaAxis < 0) ? 1 : 0, /*!< whether gamma axis is negative */ + + // Parity is even if alpha axis X is followed by beta axis Y, or Y is followed + // by Z, or Z is followed by X; otherwise it is odd. + IsOdd = ((AlphaAxisAbs)%3 == (BetaAxisAbs - 1)%3) ? 0 : 1, /*!< whether the Euler system is odd */ + IsEven = IsOdd ? 0 : 1, /*!< whether the Euler system is even */ + + IsTaitBryan = ((unsigned)AlphaAxisAbs != (unsigned)GammaAxisAbs) ? 
1 : 0 /*!< whether the Euler system is Tait-Bryan */ + }; + + private: + + EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT(internal::IsValidAxis::value, + ALPHA_AXIS_IS_INVALID); + + EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT(internal::IsValidAxis::value, + BETA_AXIS_IS_INVALID); + + EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT(internal::IsValidAxis::value, + GAMMA_AXIS_IS_INVALID); + + EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT((unsigned)AlphaAxisAbs != (unsigned)BetaAxisAbs, + ALPHA_AXIS_CANT_BE_EQUAL_TO_BETA_AXIS); + + EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT((unsigned)BetaAxisAbs != (unsigned)GammaAxisAbs, + BETA_AXIS_CANT_BE_EQUAL_TO_GAMMA_AXIS); + + static const int + // I, J, K are the pivot indexes permutation for the rotation matrix, that match this Euler system. + // They are used in this class converters. + // They are always different from each other, and their possible values are: 0, 1, or 2. + I_ = AlphaAxisAbs - 1, + J_ = (AlphaAxisAbs - 1 + 1 + IsOdd)%3, + K_ = (AlphaAxisAbs - 1 + 2 - IsOdd)%3 + ; + + // TODO: Get @mat parameter in form that avoids double evaluation. + template + static void CalcEulerAngles_imp(Matrix::Scalar, 3, 1>& res, const MatrixBase& mat, internal::true_type /*isTaitBryan*/) + { + using std::atan2; + using std::sqrt; + + typedef typename Derived::Scalar Scalar; + + const Scalar plusMinus = IsEven? 1 : -1; + const Scalar minusPlus = IsOdd? 
1 : -1; + + const Scalar Rsum = sqrt((mat(I_,I_) * mat(I_,I_) + mat(I_,J_) * mat(I_,J_) + mat(J_,K_) * mat(J_,K_) + mat(K_,K_) * mat(K_,K_))/2); + res[1] = atan2(plusMinus * mat(I_,K_), Rsum); + + // There is a singularity when cos(beta) == 0 + if(Rsum > 4 * NumTraits::epsilon()) {// cos(beta) != 0 + res[0] = atan2(minusPlus * mat(J_, K_), mat(K_, K_)); + res[2] = atan2(minusPlus * mat(I_, J_), mat(I_, I_)); + } + else if(plusMinus * mat(I_, K_) > 0) {// cos(beta) == 0 and sin(beta) == 1 + Scalar spos = mat(J_, I_) + plusMinus * mat(K_, J_); // 2*sin(alpha + plusMinus * gamma + Scalar cpos = mat(J_, J_) + minusPlus * mat(K_, I_); // 2*cos(alpha + plusMinus * gamma) + Scalar alphaPlusMinusGamma = atan2(spos, cpos); + res[0] = alphaPlusMinusGamma; + res[2] = 0; + } + else {// cos(beta) == 0 and sin(beta) == -1 + Scalar sneg = plusMinus * (mat(K_, J_) + minusPlus * mat(J_, I_)); // 2*sin(alpha + minusPlus*gamma) + Scalar cneg = mat(J_, J_) + plusMinus * mat(K_, I_); // 2*cos(alpha + minusPlus*gamma) + Scalar alphaMinusPlusBeta = atan2(sneg, cneg); + res[0] = alphaMinusPlusBeta; + res[2] = 0; + } + } + + template + static void CalcEulerAngles_imp(Matrix::Scalar,3,1>& res, + const MatrixBase& mat, internal::false_type /*isTaitBryan*/) + { + using std::atan2; + using std::sqrt; + + typedef typename Derived::Scalar Scalar; + + const Scalar plusMinus = IsEven? 1 : -1; + const Scalar minusPlus = IsOdd? 
1 : -1; + + const Scalar Rsum = sqrt((mat(I_, J_) * mat(I_, J_) + mat(I_, K_) * mat(I_, K_) + mat(J_, I_) * mat(J_, I_) + mat(K_, I_) * mat(K_, I_)) / 2); + + res[1] = atan2(Rsum, mat(I_, I_)); + + // There is a singularity when sin(beta) == 0 + if(Rsum > 4 * NumTraits::epsilon()) {// sin(beta) != 0 + res[0] = atan2(mat(J_, I_), minusPlus * mat(K_, I_)); + res[2] = atan2(mat(I_, J_), plusMinus * mat(I_, K_)); + } + else if(mat(I_, I_) > 0) {// sin(beta) == 0 and cos(beta) == 1 + Scalar spos = plusMinus * mat(K_, J_) + minusPlus * mat(J_, K_); // 2*sin(alpha + gamma) + Scalar cpos = mat(J_, J_) + mat(K_, K_); // 2*cos(alpha + gamma) + res[0] = atan2(spos, cpos); + res[2] = 0; + } + else {// sin(beta) == 0 and cos(beta) == -1 + Scalar sneg = plusMinus * mat(K_, J_) + plusMinus * mat(J_, K_); // 2*sin(alpha - gamma) + Scalar cneg = mat(J_, J_) - mat(K_, K_); // 2*cos(alpha - gamma) + res[0] = atan2(sneg, cneg); + res[2] = 0; + } + } + + template + static void CalcEulerAngles( + EulerAngles& res, + const typename EulerAngles::Matrix3& mat) + { + CalcEulerAngles_imp( + res.angles(), mat, + typename internal::conditional::type()); + + if (IsAlphaOpposite) + res.alpha() = -res.alpha(); + + if (IsBetaOpposite) + res.beta() = -res.beta(); + + if (IsGammaOpposite) + res.gamma() = -res.gamma(); + } + + template + friend class Eigen::EulerAngles; + + template + friend struct internal::eulerangles_assign_impl; + }; + +#define EIGEN_EULER_SYSTEM_TYPEDEF(A, B, C) \ + /** \ingroup EulerAngles_Module */ \ + typedef EulerSystem EulerSystem##A##B##C; + + EIGEN_EULER_SYSTEM_TYPEDEF(X,Y,Z) + EIGEN_EULER_SYSTEM_TYPEDEF(X,Y,X) + EIGEN_EULER_SYSTEM_TYPEDEF(X,Z,Y) + EIGEN_EULER_SYSTEM_TYPEDEF(X,Z,X) + + EIGEN_EULER_SYSTEM_TYPEDEF(Y,Z,X) + EIGEN_EULER_SYSTEM_TYPEDEF(Y,Z,Y) + EIGEN_EULER_SYSTEM_TYPEDEF(Y,X,Z) + EIGEN_EULER_SYSTEM_TYPEDEF(Y,X,Y) + + EIGEN_EULER_SYSTEM_TYPEDEF(Z,X,Y) + EIGEN_EULER_SYSTEM_TYPEDEF(Z,X,Z) + EIGEN_EULER_SYSTEM_TYPEDEF(Z,Y,X) + EIGEN_EULER_SYSTEM_TYPEDEF(Z,Y,Z) +} 
+ +#endif // EIGEN_EULERSYSTEM_H diff --git a/src/EigenUnsupported/src/FFT/ei_fftw_impl.h b/src/EigenUnsupported/src/FFT/ei_fftw_impl.h new file mode 100644 index 0000000..1c2cd24 --- /dev/null +++ b/src/EigenUnsupported/src/FFT/ei_fftw_impl.h @@ -0,0 +1,261 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009 Mark Borgerding mark a borgerding net +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +namespace Eigen { + +namespace internal { + + // FFTW uses non-const arguments + // so we must use ugly const_cast calls for all the args it uses + // + // This should be safe as long as + // 1. we use FFTW_ESTIMATE for all our planning + // see the FFTW docs section 4.3.2 "Planner Flags" + // 2. fftw_complex is compatible with std::complex + // This assumes std::complex layout is array of size 2 with real,imag + template + inline + T * fftw_cast(const T* p) + { + return const_cast( p); + } + + inline + fftw_complex * fftw_cast( const std::complex * p) + { + return const_cast( reinterpret_cast(p) ); + } + + inline + fftwf_complex * fftw_cast( const std::complex * p) + { + return const_cast( reinterpret_cast(p) ); + } + + inline + fftwl_complex * fftw_cast( const std::complex * p) + { + return const_cast( reinterpret_cast(p) ); + } + + template + struct fftw_plan {}; + + template <> + struct fftw_plan + { + typedef float scalar_type; + typedef fftwf_complex complex_type; + fftwf_plan m_plan; + fftw_plan() :m_plan(NULL) {} + ~fftw_plan() {if (m_plan) fftwf_destroy_plan(m_plan);} + + inline + void fwd(complex_type * dst,complex_type * src,int nfft) { + if (m_plan==NULL) m_plan = fftwf_plan_dft_1d(nfft,src,dst, FFTW_FORWARD, FFTW_ESTIMATE|FFTW_PRESERVE_INPUT); + fftwf_execute_dft( m_plan, src,dst); + } + inline + void inv(complex_type * 
dst,complex_type * src,int nfft) { + if (m_plan==NULL) m_plan = fftwf_plan_dft_1d(nfft,src,dst, FFTW_BACKWARD , FFTW_ESTIMATE|FFTW_PRESERVE_INPUT); + fftwf_execute_dft( m_plan, src,dst); + } + inline + void fwd(complex_type * dst,scalar_type * src,int nfft) { + if (m_plan==NULL) m_plan = fftwf_plan_dft_r2c_1d(nfft,src,dst,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT); + fftwf_execute_dft_r2c( m_plan,src,dst); + } + inline + void inv(scalar_type * dst,complex_type * src,int nfft) { + if (m_plan==NULL) + m_plan = fftwf_plan_dft_c2r_1d(nfft,src,dst,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT); + fftwf_execute_dft_c2r( m_plan, src,dst); + } + + inline + void fwd2( complex_type * dst,complex_type * src,int n0,int n1) { + if (m_plan==NULL) m_plan = fftwf_plan_dft_2d(n0,n1,src,dst,FFTW_FORWARD,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT); + fftwf_execute_dft( m_plan, src,dst); + } + inline + void inv2( complex_type * dst,complex_type * src,int n0,int n1) { + if (m_plan==NULL) m_plan = fftwf_plan_dft_2d(n0,n1,src,dst,FFTW_BACKWARD,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT); + fftwf_execute_dft( m_plan, src,dst); + } + + }; + template <> + struct fftw_plan + { + typedef double scalar_type; + typedef fftw_complex complex_type; + ::fftw_plan m_plan; + fftw_plan() :m_plan(NULL) {} + ~fftw_plan() {if (m_plan) fftw_destroy_plan(m_plan);} + + inline + void fwd(complex_type * dst,complex_type * src,int nfft) { + if (m_plan==NULL) m_plan = fftw_plan_dft_1d(nfft,src,dst, FFTW_FORWARD, FFTW_ESTIMATE|FFTW_PRESERVE_INPUT); + fftw_execute_dft( m_plan, src,dst); + } + inline + void inv(complex_type * dst,complex_type * src,int nfft) { + if (m_plan==NULL) m_plan = fftw_plan_dft_1d(nfft,src,dst, FFTW_BACKWARD , FFTW_ESTIMATE|FFTW_PRESERVE_INPUT); + fftw_execute_dft( m_plan, src,dst); + } + inline + void fwd(complex_type * dst,scalar_type * src,int nfft) { + if (m_plan==NULL) m_plan = fftw_plan_dft_r2c_1d(nfft,src,dst,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT); + fftw_execute_dft_r2c( m_plan,src,dst); + } + inline + void inv(scalar_type 
* dst,complex_type * src,int nfft) { + if (m_plan==NULL) + m_plan = fftw_plan_dft_c2r_1d(nfft,src,dst,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT); + fftw_execute_dft_c2r( m_plan, src,dst); + } + inline + void fwd2( complex_type * dst,complex_type * src,int n0,int n1) { + if (m_plan==NULL) m_plan = fftw_plan_dft_2d(n0,n1,src,dst,FFTW_FORWARD,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT); + fftw_execute_dft( m_plan, src,dst); + } + inline + void inv2( complex_type * dst,complex_type * src,int n0,int n1) { + if (m_plan==NULL) m_plan = fftw_plan_dft_2d(n0,n1,src,dst,FFTW_BACKWARD,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT); + fftw_execute_dft( m_plan, src,dst); + } + }; + template <> + struct fftw_plan + { + typedef long double scalar_type; + typedef fftwl_complex complex_type; + fftwl_plan m_plan; + fftw_plan() :m_plan(NULL) {} + ~fftw_plan() {if (m_plan) fftwl_destroy_plan(m_plan);} + + inline + void fwd(complex_type * dst,complex_type * src,int nfft) { + if (m_plan==NULL) m_plan = fftwl_plan_dft_1d(nfft,src,dst, FFTW_FORWARD, FFTW_ESTIMATE|FFTW_PRESERVE_INPUT); + fftwl_execute_dft( m_plan, src,dst); + } + inline + void inv(complex_type * dst,complex_type * src,int nfft) { + if (m_plan==NULL) m_plan = fftwl_plan_dft_1d(nfft,src,dst, FFTW_BACKWARD , FFTW_ESTIMATE|FFTW_PRESERVE_INPUT); + fftwl_execute_dft( m_plan, src,dst); + } + inline + void fwd(complex_type * dst,scalar_type * src,int nfft) { + if (m_plan==NULL) m_plan = fftwl_plan_dft_r2c_1d(nfft,src,dst,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT); + fftwl_execute_dft_r2c( m_plan,src,dst); + } + inline + void inv(scalar_type * dst,complex_type * src,int nfft) { + if (m_plan==NULL) + m_plan = fftwl_plan_dft_c2r_1d(nfft,src,dst,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT); + fftwl_execute_dft_c2r( m_plan, src,dst); + } + inline + void fwd2( complex_type * dst,complex_type * src,int n0,int n1) { + if (m_plan==NULL) m_plan = fftwl_plan_dft_2d(n0,n1,src,dst,FFTW_FORWARD,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT); + fftwl_execute_dft( m_plan, src,dst); + } + inline + void 
inv2( complex_type * dst,complex_type * src,int n0,int n1) { + if (m_plan==NULL) m_plan = fftwl_plan_dft_2d(n0,n1,src,dst,FFTW_BACKWARD,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT); + fftwl_execute_dft( m_plan, src,dst); + } + }; + + template + struct fftw_impl + { + typedef _Scalar Scalar; + typedef std::complex Complex; + + inline + void clear() + { + m_plans.clear(); + } + + // complex-to-complex forward FFT + inline + void fwd( Complex * dst,const Complex *src,int nfft) + { + get_plan(nfft,false,dst,src).fwd(fftw_cast(dst), fftw_cast(src),nfft ); + } + + // real-to-complex forward FFT + inline + void fwd( Complex * dst,const Scalar * src,int nfft) + { + get_plan(nfft,false,dst,src).fwd(fftw_cast(dst), fftw_cast(src) ,nfft); + } + + // 2-d complex-to-complex + inline + void fwd2(Complex * dst, const Complex * src, int n0,int n1) + { + get_plan(n0,n1,false,dst,src).fwd2(fftw_cast(dst), fftw_cast(src) ,n0,n1); + } + + // inverse complex-to-complex + inline + void inv(Complex * dst,const Complex *src,int nfft) + { + get_plan(nfft,true,dst,src).inv(fftw_cast(dst), fftw_cast(src),nfft ); + } + + // half-complex to scalar + inline + void inv( Scalar * dst,const Complex * src,int nfft) + { + get_plan(nfft,true,dst,src).inv(fftw_cast(dst), fftw_cast(src),nfft ); + } + + // 2-d complex-to-complex + inline + void inv2(Complex * dst, const Complex * src, int n0,int n1) + { + get_plan(n0,n1,true,dst,src).inv2(fftw_cast(dst), fftw_cast(src) ,n0,n1); + } + + + protected: + typedef fftw_plan PlanData; + + typedef Eigen::numext::int64_t int64_t; + + typedef std::map PlanMap; + + PlanMap m_plans; + + inline + PlanData & get_plan(int nfft,bool inverse,void * dst,const void * src) + { + bool inplace = (dst==src); + bool aligned = ( (reinterpret_cast(src)&15) | (reinterpret_cast(dst)&15) ) == 0; + int64_t key = ( (nfft<<3 ) | (inverse<<2) | (inplace<<1) | aligned ) << 1; + return m_plans[key]; + } + + inline + PlanData & get_plan(int n0,int n1,bool inverse,void * dst,const void * src) + { + 
bool inplace = (dst==src); + bool aligned = ( (reinterpret_cast(src)&15) | (reinterpret_cast(dst)&15) ) == 0; + int64_t key = ( ( (((int64_t)n0) << 30)|(n1<<3 ) | (inverse<<2) | (inplace<<1) | aligned ) << 1 ) + 1; + return m_plans[key]; + } + }; + +} // end namespace internal + +} // end namespace Eigen diff --git a/src/EigenUnsupported/src/FFT/ei_kissfft_impl.h b/src/EigenUnsupported/src/FFT/ei_kissfft_impl.h new file mode 100644 index 0000000..430953a --- /dev/null +++ b/src/EigenUnsupported/src/FFT/ei_kissfft_impl.h @@ -0,0 +1,449 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009 Mark Borgerding mark a borgerding net +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +namespace Eigen { + +namespace internal { + + // This FFT implementation was derived from kissfft http:sourceforge.net/projects/kissfft + // Copyright 2003-2009 Mark Borgerding + +template +struct kiss_cpx_fft +{ + typedef _Scalar Scalar; + typedef std::complex Complex; + std::vector m_twiddles; + std::vector m_stageRadix; + std::vector m_stageRemainder; + std::vector m_scratchBuf; + bool m_inverse; + + inline void make_twiddles(int nfft, bool inverse) + { + using numext::sin; + using numext::cos; + m_inverse = inverse; + m_twiddles.resize(nfft); + double phinc = 0.25 * double(EIGEN_PI) / nfft; + Scalar flip = inverse ? 
Scalar(1) : Scalar(-1); + m_twiddles[0] = Complex(Scalar(1), Scalar(0)); + if ((nfft&1)==0) + m_twiddles[nfft/2] = Complex(Scalar(-1), Scalar(0)); + int i=1; + for (;i*8n) + p=n;// impossible to have a factor > sqrt(n) + } + n /= p; + m_stageRadix.push_back(p); + m_stageRemainder.push_back(n); + if ( p > 5 ) + m_scratchBuf.resize(p); // scratchbuf will be needed in bfly_generic + }while(n>1); + } + + template + inline + void work( int stage,Complex * xout, const _Src * xin, size_t fstride,size_t in_stride) + { + int p = m_stageRadix[stage]; + int m = m_stageRemainder[stage]; + Complex * Fout_beg = xout; + Complex * Fout_end = xout + p*m; + + if (m>1) { + do{ + // recursive call: + // DFT of size m*p performed by doing + // p instances of smaller DFTs of size m, + // each one takes a decimated version of the input + work(stage+1, xout , xin, fstride*p,in_stride); + xin += fstride*in_stride; + }while( (xout += m) != Fout_end ); + }else{ + do{ + *xout = *xin; + xin += fstride*in_stride; + }while(++xout != Fout_end ); + } + xout=Fout_beg; + + // recombine the p smaller DFTs + switch (p) { + case 2: bfly2(xout,fstride,m); break; + case 3: bfly3(xout,fstride,m); break; + case 4: bfly4(xout,fstride,m); break; + case 5: bfly5(xout,fstride,m); break; + default: bfly_generic(xout,fstride,m,p); break; + } + } + + inline + void bfly2( Complex * Fout, const size_t fstride, int m) + { + for (int k=0;kreal() - Scalar(.5)*scratch[3].real() , Fout->imag() - Scalar(.5)*scratch[3].imag() ); + scratch[0] *= epi3.imag(); + *Fout += scratch[3]; + Fout[m2] = Complex( Fout[m].real() + scratch[0].imag() , Fout[m].imag() - scratch[0].real() ); + Fout[m] += Complex( -scratch[0].imag(),scratch[0].real() ); + ++Fout; + }while(--k); + } + + inline + void bfly5( Complex * Fout, const size_t fstride, const size_t m) + { + Complex *Fout0,*Fout1,*Fout2,*Fout3,*Fout4; + size_t u; + Complex scratch[13]; + Complex * twiddles = &m_twiddles[0]; + Complex *tw; + Complex ya,yb; + ya = twiddles[fstride*m]; 
+ yb = twiddles[fstride*2*m]; + + Fout0=Fout; + Fout1=Fout0+m; + Fout2=Fout0+2*m; + Fout3=Fout0+3*m; + Fout4=Fout0+4*m; + + tw=twiddles; + for ( u=0; u(m_twiddles.size()); + Complex * scratchbuf = &m_scratchBuf[0]; + + for ( u=0; u(fstride) * k; + if (twidx>=Norig) twidx-=Norig; + t=scratchbuf[q] * twiddles[twidx]; + Fout[ k ] += t; + } + k += m; + } + } + } +}; + +template +struct kissfft_impl +{ + typedef _Scalar Scalar; + typedef std::complex Complex; + + void clear() + { + m_plans.clear(); + m_realTwiddles.clear(); + } + + inline + void fwd( Complex * dst,const Complex *src,int nfft) + { + get_plan(nfft,false).work(0, dst, src, 1,1); + } + + inline + void fwd2( Complex * dst,const Complex *src,int n0,int n1) + { + EIGEN_UNUSED_VARIABLE(dst); + EIGEN_UNUSED_VARIABLE(src); + EIGEN_UNUSED_VARIABLE(n0); + EIGEN_UNUSED_VARIABLE(n1); + } + + inline + void inv2( Complex * dst,const Complex *src,int n0,int n1) + { + EIGEN_UNUSED_VARIABLE(dst); + EIGEN_UNUSED_VARIABLE(src); + EIGEN_UNUSED_VARIABLE(n0); + EIGEN_UNUSED_VARIABLE(n1); + } + + // real-to-complex forward FFT + // perform two FFTs of src even and src odd + // then twiddle to recombine them into the half-spectrum format + // then fill in the conjugate symmetric half + inline + void fwd( Complex * dst,const Scalar * src,int nfft) + { + if ( nfft&3 ) { + // use generic mode for odd + m_tmpBuf1.resize(nfft); + get_plan(nfft,false).work(0, &m_tmpBuf1[0], src, 1,1); + std::copy(m_tmpBuf1.begin(),m_tmpBuf1.begin()+(nfft>>1)+1,dst ); + }else{ + int ncfft = nfft>>1; + int ncfft2 = nfft>>2; + Complex * rtw = real_twiddles(ncfft2); + + // use optimized mode for even real + fwd( dst, reinterpret_cast (src), ncfft); + Complex dc(dst[0].real() + dst[0].imag()); + Complex nyquist(dst[0].real() - dst[0].imag()); + int k; + for ( k=1;k <= ncfft2 ; ++k ) { + Complex fpk = dst[k]; + Complex fpnk = conj(dst[ncfft-k]); + Complex f1k = fpk + fpnk; + Complex f2k = fpk - fpnk; + Complex tw= f2k * rtw[k-1]; + dst[k] = (f1k + tw) * 
Scalar(.5); + dst[ncfft-k] = conj(f1k -tw)*Scalar(.5); + } + dst[0] = dc; + dst[ncfft] = nyquist; + } + } + + // inverse complex-to-complex + inline + void inv(Complex * dst,const Complex *src,int nfft) + { + get_plan(nfft,true).work(0, dst, src, 1,1); + } + + // half-complex to scalar + inline + void inv( Scalar * dst,const Complex * src,int nfft) + { + if (nfft&3) { + m_tmpBuf1.resize(nfft); + m_tmpBuf2.resize(nfft); + std::copy(src,src+(nfft>>1)+1,m_tmpBuf1.begin() ); + for (int k=1;k<(nfft>>1)+1;++k) + m_tmpBuf1[nfft-k] = conj(m_tmpBuf1[k]); + inv(&m_tmpBuf2[0],&m_tmpBuf1[0],nfft); + for (int k=0;k>1; + int ncfft2 = nfft>>2; + Complex * rtw = real_twiddles(ncfft2); + m_tmpBuf1.resize(ncfft); + m_tmpBuf1[0] = Complex( src[0].real() + src[ncfft].real(), src[0].real() - src[ncfft].real() ); + for (int k = 1; k <= ncfft / 2; ++k) { + Complex fk = src[k]; + Complex fnkc = conj(src[ncfft-k]); + Complex fek = fk + fnkc; + Complex tmp = fk - fnkc; + Complex fok = tmp * conj(rtw[k-1]); + m_tmpBuf1[k] = fek + fok; + m_tmpBuf1[ncfft-k] = conj(fek - fok); + } + get_plan(ncfft,true).work(0, reinterpret_cast(dst), &m_tmpBuf1[0], 1,1); + } + } + + protected: + typedef kiss_cpx_fft PlanData; + typedef std::map PlanMap; + + PlanMap m_plans; + std::map > m_realTwiddles; + std::vector m_tmpBuf1; + std::vector m_tmpBuf2; + + inline + int PlanKey(int nfft, bool isinverse) const { return (nfft<<1) | int(isinverse); } + + inline + PlanData & get_plan(int nfft, bool inverse) + { + // TODO look for PlanKey(nfft, ! 
inverse) and conjugate the twiddles + PlanData & pd = m_plans[ PlanKey(nfft,inverse) ]; + if ( pd.m_twiddles.size() == 0 ) { + pd.make_twiddles(nfft,inverse); + pd.factorize(nfft); + } + return pd; + } + + inline + Complex * real_twiddles(int ncfft2) + { + using std::acos; + std::vector & twidref = m_realTwiddles[ncfft2];// creates new if not there + if ( (int)twidref.size() != ncfft2 ) { + twidref.resize(ncfft2); + int ncfft= ncfft2<<1; + Scalar pi = acos( Scalar(-1) ); + for (int k=1;k<=ncfft2;++k) + twidref[k-1] = exp( Complex(0,-pi * (Scalar(k) / ncfft + Scalar(.5)) ) ); + } + return &twidref[0]; + } +}; + +} // end namespace internal + +} // end namespace Eigen diff --git a/src/EigenUnsupported/src/IterativeSolvers/ConstrainedConjGrad.h b/src/EigenUnsupported/src/IterativeSolvers/ConstrainedConjGrad.h new file mode 100644 index 0000000..e7d70f3 --- /dev/null +++ b/src/EigenUnsupported/src/IterativeSolvers/ConstrainedConjGrad.h @@ -0,0 +1,187 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008 Gael Guennebaud + +/* NOTE The functions of this file have been adapted from the GMM++ library */ + +//======================================================================== +// +// Copyright (C) 2002-2007 Yves Renard +// +// This file is a part of GETFEM++ +// +// Getfem++ is free software; you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as +// published by the Free Software Foundation; version 2.1 of the License. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. 
+// You should have received a copy of the GNU Lesser General Public +// License along with this program; if not, write to the Free Software +// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, +// USA. +// +//======================================================================== + +#include "../../../../Eigen/src/Core/util/NonMPL2.h" + +#ifndef EIGEN_CONSTRAINEDCG_H +#define EIGEN_CONSTRAINEDCG_H + +#include "../../../../Eigen/Core" + +namespace Eigen { + +namespace internal { + +/** \ingroup IterativeLinearSolvers_Module + * Compute the pseudo inverse of the non-square matrix C such that + * \f$ CINV = (C * C^T)^{-1} * C \f$ based on a conjugate gradient method. + * + * This function is internally used by constrained_cg. + */ +template +void pseudo_inverse(const CMatrix &C, CINVMatrix &CINV) +{ + // optimisable : copie de la ligne, precalcul de C * trans(C). + typedef typename CMatrix::Scalar Scalar; + typedef typename CMatrix::Index Index; + // FIXME use sparse vectors ? 
+ typedef Matrix TmpVec; + + Index rows = C.rows(), cols = C.cols(); + + TmpVec d(rows), e(rows), l(cols), p(rows), q(rows), r(rows); + Scalar rho, rho_1, alpha; + d.setZero(); + + typedef Triplet T; + std::vector tripletList; + + for (Index i = 0; i < rows; ++i) + { + d[i] = 1.0; + rho = 1.0; + e.setZero(); + r = d; + p = d; + + while (rho >= 1e-38) + { /* conjugate gradient to compute e */ + /* which is the i-th row of inv(C * trans(C)) */ + l = C.transpose() * p; + q = C * l; + alpha = rho / p.dot(q); + e += alpha * p; + r += -alpha * q; + rho_1 = rho; + rho = r.dot(r); + p = (rho/rho_1) * p + r; + } + + l = C.transpose() * e; // l is the i-th row of CINV + // FIXME add a generic "prune/filter" expression for both dense and sparse object to sparse + for (Index j=0; j +void constrained_cg(const TMatrix& A, const CMatrix& C, VectorX& x, + const VectorB& b, const VectorF& f, IterationController &iter) +{ + using std::sqrt; + typedef typename TMatrix::Scalar Scalar; + typedef typename TMatrix::Index Index; + typedef Matrix TmpVec; + + Scalar rho = 1.0, rho_1, lambda, gamma; + Index xSize = x.size(); + TmpVec p(xSize), q(xSize), q2(xSize), + r(xSize), old_z(xSize), z(xSize), + memox(xSize); + std::vector satured(C.rows()); + p.setZero(); + iter.setRhsNorm(sqrt(b.dot(b))); // gael vect_sp(PS, b, b) + if (iter.rhsNorm() == 0.0) iter.setRhsNorm(1.0); + + SparseMatrix CINV(C.rows(), C.cols()); + pseudo_inverse(C, CINV); + + while(true) + { + // computation of residual + old_z = z; + memox = x; + r = b; + r += A * -x; + z = r; + bool transition = false; + for (Index i = 0; i < C.rows(); ++i) + { + Scalar al = C.row(i).dot(x) - f.coeff(i); + if (al >= -1.0E-15) + { + if (!satured[i]) + { + satured[i] = true; + transition = true; + } + Scalar bb = CINV.row(i).dot(z); + if (bb > 0.0) + // FIXME: we should allow that: z += -bb * C.row(i); + for (typename CMatrix::InnerIterator it(C,i); it; ++it) + z.coeffRef(it.index()) -= bb*it.value(); + } + else + satured[i] = false; + } + 
+ // descent direction + rho_1 = rho; + rho = r.dot(z); + + if (iter.finished(rho)) break; + if (transition || iter.first()) gamma = 0.0; + else gamma = (std::max)(0.0, (rho - old_z.dot(z)) / rho_1); + p = z + gamma*p; + + ++iter; + // one dimensionnal optimization + q = A * p; + lambda = rho / q.dot(p); + for (Index i = 0; i < C.rows(); ++i) + { + if (!satured[i]) + { + Scalar bb = C.row(i).dot(p) - f[i]; + if (bb > 0.0) + lambda = (std::min)(lambda, (f.coeff(i)-C.row(i).dot(x)) / bb); + } + } + x += lambda * p; + memox -= x; + } +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CONSTRAINEDCG_H diff --git a/src/EigenUnsupported/src/IterativeSolvers/DGMRES.h b/src/EigenUnsupported/src/IterativeSolvers/DGMRES.h new file mode 100644 index 0000000..5ae011b --- /dev/null +++ b/src/EigenUnsupported/src/IterativeSolvers/DGMRES.h @@ -0,0 +1,511 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2012 Désiré Nuentsa-Wakam +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_DGMRES_H +#define EIGEN_DGMRES_H + +#include "../../../../Eigen/Eigenvalues" + +namespace Eigen { + +template< typename _MatrixType, + typename _Preconditioner = DiagonalPreconditioner > +class DGMRES; + +namespace internal { + +template< typename _MatrixType, typename _Preconditioner> +struct traits > +{ + typedef _MatrixType MatrixType; + typedef _Preconditioner Preconditioner; +}; + +/** \brief Computes a permutation vector to have a sorted sequence + * \param vec The vector to reorder. + * \param perm gives the sorted sequence on output. 
Must be initialized with 0..n-1 + * \param ncut Put the ncut smallest elements at the end of the vector + * WARNING This is an expensive sort, so should be used only + * for small size vectors + * TODO Use modified QuickSplit or std::nth_element to get the smallest values + */ +template +void sortWithPermutation (VectorType& vec, IndexType& perm, typename IndexType::Scalar& ncut) +{ + eigen_assert(vec.size() == perm.size()); + bool flag; + for (Index k = 0; k < ncut; k++) + { + flag = false; + for (Index j = 0; j < vec.size()-1; j++) + { + if ( vec(perm(j)) < vec(perm(j+1)) ) + { + std::swap(perm(j),perm(j+1)); + flag = true; + } + if (!flag) break; // The vector is in sorted order + } + } +} + +} +/** + * \ingroup IterativeLinearSolvers_Module + * \brief A Restarted GMRES with deflation. + * This class implements a modification of the GMRES solver for + * sparse linear systems. The basis is built with modified + * Gram-Schmidt. At each restart, a few approximated eigenvectors + * corresponding to the smallest eigenvalues are used to build a + * preconditioner for the next cycle. This preconditioner + * for deflation can be combined with any other preconditioner, + * the IncompleteLUT for instance. The preconditioner is applied + * at right of the matrix and the combination is multiplicative. + * + * \tparam _MatrixType the type of the sparse matrix A, can be a dense or a sparse matrix. + * \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner + * Typical usage : + * \code + * SparseMatrix A; + * VectorXd x, b; + * //Fill A and b ... + * DGMRES > solver; + * solver.set_restart(30); // Set restarting value + * solver.setEigenv(1); // Set the number of eigenvalues to deflate + * solver.compute(A); + * x = solver.solve(b); + * \endcode + * + * DGMRES can also be used in a matrix-free context, see the following \link MatrixfreeSolverExample example \endlink. + * + * References : + * [1] D. NUENTSA WAKAM and F. 
PACULL, Memory Efficient Hybrid + * Algebraic Solvers for Linear Systems Arising from Compressible + * Flows, Computers and Fluids, In Press, + * https://doi.org/10.1016/j.compfluid.2012.03.023 + * [2] K. Burrage and J. Erhel, On the performance of various + * adaptive preconditioned GMRES strategies, 5(1998), 101-121. + * [3] J. Erhel, K. Burrage and B. Pohl, Restarted GMRES + * preconditioned by deflation,J. Computational and Applied + * Mathematics, 69(1996), 303-318. + + * + */ +template< typename _MatrixType, typename _Preconditioner> +class DGMRES : public IterativeSolverBase > +{ + typedef IterativeSolverBase Base; + using Base::matrix; + using Base::m_error; + using Base::m_iterations; + using Base::m_info; + using Base::m_isInitialized; + using Base::m_tolerance; + public: + using Base::_solve_impl; + using Base::_solve_with_guess_impl; + typedef _MatrixType MatrixType; + typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::StorageIndex StorageIndex; + typedef typename MatrixType::RealScalar RealScalar; + typedef _Preconditioner Preconditioner; + typedef Matrix DenseMatrix; + typedef Matrix DenseRealMatrix; + typedef Matrix DenseVector; + typedef Matrix DenseRealVector; + typedef Matrix, Dynamic, 1> ComplexVector; + + + /** Default constructor. */ + DGMRES() : Base(),m_restart(30),m_neig(0),m_r(0),m_maxNeig(5),m_isDeflAllocated(false),m_isDeflInitialized(false) {} + + /** Initialize the solver with matrix \a A for further \c Ax=b solving. + * + * This constructor is a shortcut for the default constructor followed + * by a call to compute(). + * + * \warning this class stores a reference to the matrix A as well as some + * precomputed values that depend on it. Therefore, if \a A is changed + * this class becomes invalid. Call compute() to update it with the new + * matrix A, or modify a copy of A. 
+ */ + template + explicit DGMRES(const EigenBase& A) : Base(A.derived()), m_restart(30),m_neig(0),m_r(0),m_maxNeig(5),m_isDeflAllocated(false),m_isDeflInitialized(false) {} + + ~DGMRES() {} + + /** \internal */ + template + void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const + { + EIGEN_STATIC_ASSERT(Rhs::ColsAtCompileTime==1 || Dest::ColsAtCompileTime==1, YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX); + + m_iterations = Base::maxIterations(); + m_error = Base::m_tolerance; + + dgmres(matrix(), b, x, Base::m_preconditioner); + } + + /** + * Get the restart value + */ + Index restart() { return m_restart; } + + /** + * Set the restart value (default is 30) + */ + void set_restart(const Index restart) { m_restart=restart; } + + /** + * Set the number of eigenvalues to deflate at each restart + */ + void setEigenv(const Index neig) + { + m_neig = neig; + if (neig+1 > m_maxNeig) m_maxNeig = neig+1; // To allow for complex conjugates + } + + /** + * Get the size of the deflation subspace size + */ + Index deflSize() {return m_r; } + + /** + * Set the maximum size of the deflation subspace + */ + void setMaxEigenv(const Index maxNeig) { m_maxNeig = maxNeig; } + + protected: + // DGMRES algorithm + template + void dgmres(const MatrixType& mat,const Rhs& rhs, Dest& x, const Preconditioner& precond) const; + // Perform one cycle of GMRES + template + Index dgmresCycle(const MatrixType& mat, const Preconditioner& precond, Dest& x, DenseVector& r0, RealScalar& beta, const RealScalar& normRhs, Index& nbIts) const; + // Compute data to use for deflation + Index dgmresComputeDeflationData(const MatrixType& mat, const Preconditioner& precond, const Index& it, StorageIndex& neig) const; + // Apply deflation to a vector + template + Index dgmresApplyDeflation(const RhsType& In, DestType& Out) const; + ComplexVector schurValues(const ComplexSchur& schurofH) const; + ComplexVector schurValues(const RealSchur& schurofH) const; + // Init data for deflation + void 
dgmresInitDeflation(Index& rows) const; + mutable DenseMatrix m_V; // Krylov basis vectors + mutable DenseMatrix m_H; // Hessenberg matrix + mutable DenseMatrix m_Hes; // Initial hessenberg matrix without Givens rotations applied + mutable Index m_restart; // Maximum size of the Krylov subspace + mutable DenseMatrix m_U; // Vectors that form the basis of the invariant subspace + mutable DenseMatrix m_MU; // matrix operator applied to m_U (for next cycles) + mutable DenseMatrix m_T; /* T=U^T*M^{-1}*A*U */ + mutable PartialPivLU m_luT; // LU factorization of m_T + mutable StorageIndex m_neig; //Number of eigenvalues to extract at each restart + mutable Index m_r; // Current number of deflated eigenvalues, size of m_U + mutable Index m_maxNeig; // Maximum number of eigenvalues to deflate + mutable RealScalar m_lambdaN; //Modulus of the largest eigenvalue of A + mutable bool m_isDeflAllocated; + mutable bool m_isDeflInitialized; + + //Adaptive strategy + mutable RealScalar m_smv; // Smaller multiple of the remaining number of steps allowed + mutable bool m_force; // Force the use of deflation at each restart + +}; +/** + * \brief Perform several cycles of restarted GMRES with modified Gram Schmidt, + * + * A right preconditioner is used combined with deflation. 
+ * + */ +template< typename _MatrixType, typename _Preconditioner> +template +void DGMRES<_MatrixType, _Preconditioner>::dgmres(const MatrixType& mat,const Rhs& rhs, Dest& x, + const Preconditioner& precond) const +{ + const RealScalar considerAsZero = (std::numeric_limits::min)(); + + RealScalar normRhs = rhs.norm(); + if(normRhs <= considerAsZero) + { + x.setZero(); + m_error = 0; + return; + } + + //Initialization + m_isDeflInitialized = false; + Index n = mat.rows(); + DenseVector r0(n); + Index nbIts = 0; + m_H.resize(m_restart+1, m_restart); + m_Hes.resize(m_restart, m_restart); + m_V.resize(n,m_restart+1); + //Initial residual vector and initial norm + if(x.squaredNorm()==0) + x = precond.solve(rhs); + r0 = rhs - mat * x; + RealScalar beta = r0.norm(); + + m_error = beta/normRhs; + if(m_error < m_tolerance) + m_info = Success; + else + m_info = NoConvergence; + + // Iterative process + while (nbIts < m_iterations && m_info == NoConvergence) + { + dgmresCycle(mat, precond, x, r0, beta, normRhs, nbIts); + + // Compute the new residual vector for the restart + if (nbIts < m_iterations && m_info == NoConvergence) { + r0 = rhs - mat * x; + beta = r0.norm(); + } + } +} + +/** + * \brief Perform one restart cycle of DGMRES + * \param mat The coefficient matrix + * \param precond The preconditioner + * \param x the new approximated solution + * \param r0 The initial residual vector + * \param beta The norm of the residual computed so far + * \param normRhs The norm of the right hand side vector + * \param nbIts The number of iterations + */ +template< typename _MatrixType, typename _Preconditioner> +template +Index DGMRES<_MatrixType, _Preconditioner>::dgmresCycle(const MatrixType& mat, const Preconditioner& precond, Dest& x, DenseVector& r0, RealScalar& beta, const RealScalar& normRhs, Index& nbIts) const +{ + //Initialization + DenseVector g(m_restart+1); // Right hand side of the least square problem + g.setZero(); + g(0) = Scalar(beta); + m_V.col(0) = r0/beta; 
+ m_info = NoConvergence; + std::vector >gr(m_restart); // Givens rotations + Index it = 0; // Number of inner iterations + Index n = mat.rows(); + DenseVector tv1(n), tv2(n); //Temporary vectors + while (m_info == NoConvergence && it < m_restart && nbIts < m_iterations) + { + // Apply preconditioner(s) at right + if (m_isDeflInitialized ) + { + dgmresApplyDeflation(m_V.col(it), tv1); // Deflation + tv2 = precond.solve(tv1); + } + else + { + tv2 = precond.solve(m_V.col(it)); // User's selected preconditioner + } + tv1 = mat * tv2; + + // Orthogonalize it with the previous basis in the basis using modified Gram-Schmidt + Scalar coef; + for (Index i = 0; i <= it; ++i) + { + coef = tv1.dot(m_V.col(i)); + tv1 = tv1 - coef * m_V.col(i); + m_H(i,it) = coef; + m_Hes(i,it) = coef; + } + // Normalize the vector + coef = tv1.norm(); + m_V.col(it+1) = tv1/coef; + m_H(it+1, it) = coef; +// m_Hes(it+1,it) = coef; + + // FIXME Check for happy breakdown + + // Update Hessenberg matrix with Givens rotations + for (Index i = 1; i <= it; ++i) + { + m_H.col(it).applyOnTheLeft(i-1,i,gr[i-1].adjoint()); + } + // Compute the new plane rotation + gr[it].makeGivens(m_H(it, it), m_H(it+1,it)); + // Apply the new rotation + m_H.col(it).applyOnTheLeft(it,it+1,gr[it].adjoint()); + g.applyOnTheLeft(it,it+1, gr[it].adjoint()); + + beta = std::abs(g(it+1)); + m_error = beta/normRhs; + // std::cerr << nbIts << " Relative Residual Norm " << m_error << std::endl; + it++; nbIts++; + + if (m_error < m_tolerance) + { + // The method has converged + m_info = Success; + break; + } + } + + // Compute the new coefficients by solving the least square problem +// it++; + //FIXME Check first if the matrix is singular ... 
zero diagonal + DenseVector nrs(m_restart); + nrs = m_H.topLeftCorner(it,it).template triangularView().solve(g.head(it)); + + // Form the new solution + if (m_isDeflInitialized) + { + tv1 = m_V.leftCols(it) * nrs; + dgmresApplyDeflation(tv1, tv2); + x = x + precond.solve(tv2); + } + else + x = x + precond.solve(m_V.leftCols(it) * nrs); + + // Go for a new cycle and compute data for deflation + if(nbIts < m_iterations && m_info == NoConvergence && m_neig > 0 && (m_r+m_neig) < m_maxNeig) + dgmresComputeDeflationData(mat, precond, it, m_neig); + return 0; + +} + + +template< typename _MatrixType, typename _Preconditioner> +void DGMRES<_MatrixType, _Preconditioner>::dgmresInitDeflation(Index& rows) const +{ + m_U.resize(rows, m_maxNeig); + m_MU.resize(rows, m_maxNeig); + m_T.resize(m_maxNeig, m_maxNeig); + m_lambdaN = 0.0; + m_isDeflAllocated = true; +} + +template< typename _MatrixType, typename _Preconditioner> +inline typename DGMRES<_MatrixType, _Preconditioner>::ComplexVector DGMRES<_MatrixType, _Preconditioner>::schurValues(const ComplexSchur& schurofH) const +{ + return schurofH.matrixT().diagonal(); +} + +template< typename _MatrixType, typename _Preconditioner> +inline typename DGMRES<_MatrixType, _Preconditioner>::ComplexVector DGMRES<_MatrixType, _Preconditioner>::schurValues(const RealSchur& schurofH) const +{ + const DenseMatrix& T = schurofH.matrixT(); + Index it = T.rows(); + ComplexVector eig(it); + Index j = 0; + while (j < it-1) + { + if (T(j+1,j) ==Scalar(0)) + { + eig(j) = std::complex(T(j,j),RealScalar(0)); + j++; + } + else + { + eig(j) = std::complex(T(j,j),T(j+1,j)); + eig(j+1) = std::complex(T(j,j+1),T(j+1,j+1)); + j++; + } + } + if (j < it-1) eig(j) = std::complex(T(j,j),RealScalar(0)); + return eig; +} + +template< typename _MatrixType, typename _Preconditioner> +Index DGMRES<_MatrixType, _Preconditioner>::dgmresComputeDeflationData(const MatrixType& mat, const Preconditioner& precond, const Index& it, StorageIndex& neig) const +{ + // First, 
find the Schur form of the Hessenberg matrix H + typename internal::conditional::IsComplex, ComplexSchur, RealSchur >::type schurofH; + bool computeU = true; + DenseMatrix matrixQ(it,it); + matrixQ.setIdentity(); + schurofH.computeFromHessenberg(m_Hes.topLeftCorner(it,it), matrixQ, computeU); + + ComplexVector eig(it); + Matrixperm(it); + eig = this->schurValues(schurofH); + + // Reorder the absolute values of Schur values + DenseRealVector modulEig(it); + for (Index j=0; j(it-1)); + internal::sortWithPermutation(modulEig, perm, neig); + + if (!m_lambdaN) + { + m_lambdaN = (std::max)(modulEig.maxCoeff(), m_lambdaN); + } + //Count the real number of extracted eigenvalues (with complex conjugates) + Index nbrEig = 0; + while (nbrEig < neig) + { + if(eig(perm(it-nbrEig-1)).imag() == RealScalar(0)) nbrEig++; + else nbrEig += 2; + } + // Extract the Schur vectors corresponding to the smallest Ritz values + DenseMatrix Sr(it, nbrEig); + Sr.setZero(); + for (Index j = 0; j < nbrEig; j++) + { + Sr.col(j) = schurofH.matrixU().col(perm(it-j-1)); + } + + // Form the Schur vectors of the initial matrix using the Krylov basis + DenseMatrix X; + X = m_V.leftCols(it) * Sr; + if (m_r) + { + // Orthogonalize X against m_U using modified Gram-Schmidt + for (Index j = 0; j < nbrEig; j++) + for (Index k =0; k < m_r; k++) + X.col(j) = X.col(j) - (m_U.col(k).dot(X.col(j)))*m_U.col(k); + } + + // Compute m_MX = A * M^-1 * X + Index m = m_V.rows(); + if (!m_isDeflAllocated) + dgmresInitDeflation(m); + DenseMatrix MX(m, nbrEig); + DenseVector tv1(m); + for (Index j = 0; j < nbrEig; j++) + { + tv1 = mat * X.col(j); + MX.col(j) = precond.solve(tv1); + } + + //Update m_T = [U'MU U'MX; X'MU X'MX] + m_T.block(m_r, m_r, nbrEig, nbrEig) = X.transpose() * MX; + if(m_r) + { + m_T.block(0, m_r, m_r, nbrEig) = m_U.leftCols(m_r).transpose() * MX; + m_T.block(m_r, 0, nbrEig, m_r) = X.transpose() * m_MU.leftCols(m_r); + } + + // Save X into m_U and m_MX in m_MU + for (Index j = 0; j < nbrEig; j++) 
m_U.col(m_r+j) = X.col(j); + for (Index j = 0; j < nbrEig; j++) m_MU.col(m_r+j) = MX.col(j); + // Increase the size of the invariant subspace + m_r += nbrEig; + + // Factorize m_T into m_luT + m_luT.compute(m_T.topLeftCorner(m_r, m_r)); + + //FIXME CHeck if the factorization was correctly done (nonsingular matrix) + m_isDeflInitialized = true; + return 0; +} +template +template +Index DGMRES<_MatrixType, _Preconditioner>::dgmresApplyDeflation(const RhsType &x, DestType &y) const +{ + DenseVector x1 = m_U.leftCols(m_r).transpose() * x; + y = x + m_U.leftCols(m_r) * ( m_lambdaN * m_luT.solve(x1) - x1); + return 0; +} + +} // end namespace Eigen +#endif diff --git a/src/EigenUnsupported/src/IterativeSolvers/GMRES.h b/src/EigenUnsupported/src/IterativeSolvers/GMRES.h new file mode 100644 index 0000000..ff91209 --- /dev/null +++ b/src/EigenUnsupported/src/IterativeSolvers/GMRES.h @@ -0,0 +1,335 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2011 Gael Guennebaud +// Copyright (C) 2012, 2014 Kolja Brix +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_GMRES_H +#define EIGEN_GMRES_H + +namespace Eigen { + +namespace internal { + +/** +* Generalized Minimal Residual Algorithm based on the +* Arnoldi algorithm implemented with Householder reflections. 
+* +* Parameters: +* \param mat matrix of linear system of equations +* \param rhs right hand side vector of linear system of equations +* \param x on input: initial guess, on output: solution +* \param precond preconditioner used +* \param iters on input: maximum number of iterations to perform +* on output: number of iterations performed +* \param restart number of iterations for a restart +* \param tol_error on input: relative residual tolerance +* on output: residuum achieved +* +* \sa IterativeMethods::bicgstab() +* +* +* For references, please see: +* +* Saad, Y. and Schultz, M. H. +* GMRES: A Generalized Minimal Residual Algorithm for Solving Nonsymmetric Linear Systems. +* SIAM J.Sci.Stat.Comp. 7, 1986, pp. 856 - 869. +* +* Saad, Y. +* Iterative Methods for Sparse Linear Systems. +* Society for Industrial and Applied Mathematics, Philadelphia, 2003. +* +* Walker, H. F. +* Implementations of the GMRES method. +* Comput.Phys.Comm. 53, 1989, pp. 311 - 320. +* +* Walker, H. F. +* Implementation of the GMRES Method using Householder Transformations. +* SIAM J.Sci.Stat.Comp. 9, 1988, pp. 152 - 163. 
+* +*/ +template +bool gmres(const MatrixType & mat, const Rhs & rhs, Dest & x, const Preconditioner & precond, + Index &iters, const Index &restart, typename Dest::RealScalar & tol_error) { + + using std::sqrt; + using std::abs; + + typedef typename Dest::RealScalar RealScalar; + typedef typename Dest::Scalar Scalar; + typedef Matrix < Scalar, Dynamic, 1 > VectorType; + typedef Matrix < Scalar, Dynamic, Dynamic, ColMajor> FMatrixType; + + const RealScalar considerAsZero = (std::numeric_limits::min)(); + + if(rhs.norm() <= considerAsZero) + { + x.setZero(); + tol_error = 0; + return true; + } + + RealScalar tol = tol_error; + const Index maxIters = iters; + iters = 0; + + const Index m = mat.rows(); + + // residual and preconditioned residual + VectorType p0 = rhs - mat*x; + VectorType r0 = precond.solve(p0); + + const RealScalar r0Norm = r0.norm(); + + // is initial guess already good enough? + if(r0Norm == 0) + { + tol_error = 0; + return true; + } + + // storage for Hessenberg matrix and Householder data + FMatrixType H = FMatrixType::Zero(m, restart + 1); + VectorType w = VectorType::Zero(restart + 1); + VectorType tau = VectorType::Zero(restart + 1); + + // storage for Jacobi rotations + std::vector < JacobiRotation < Scalar > > G(restart); + + // storage for temporaries + VectorType t(m), v(m), workspace(m), x_new(m); + + // generate first Householder vector + Ref H0_tail = H.col(0).tail(m - 1); + RealScalar beta; + r0.makeHouseholder(H0_tail, tau.coeffRef(0), beta); + w(0) = Scalar(beta); + + for (Index k = 1; k <= restart; ++k) + { + ++iters; + + v = VectorType::Unit(m, k - 1); + + // apply Householder reflections H_{1} ... H_{k-1} to v + // TODO: use a HouseholderSequence + for (Index i = k - 1; i >= 0; --i) { + v.tail(m - i).applyHouseholderOnTheLeft(H.col(i).tail(m - i - 1), tau.coeffRef(i), workspace.data()); + } + + // apply matrix M to v: v = mat * v; + t.noalias() = mat * v; + v = precond.solve(t); + + // apply Householder reflections H_{k-1} ... 
H_{1} to v + // TODO: use a HouseholderSequence + for (Index i = 0; i < k; ++i) { + v.tail(m - i).applyHouseholderOnTheLeft(H.col(i).tail(m - i - 1), tau.coeffRef(i), workspace.data()); + } + + if (v.tail(m - k).norm() != 0.0) + { + if (k <= restart) + { + // generate new Householder vector + Ref Hk_tail = H.col(k).tail(m - k - 1); + v.tail(m - k).makeHouseholder(Hk_tail, tau.coeffRef(k), beta); + + // apply Householder reflection H_{k} to v + v.tail(m - k).applyHouseholderOnTheLeft(Hk_tail, tau.coeffRef(k), workspace.data()); + } + } + + if (k > 1) + { + for (Index i = 0; i < k - 1; ++i) + { + // apply old Givens rotations to v + v.applyOnTheLeft(i, i + 1, G[i].adjoint()); + } + } + + if (k y = w.head(k); + H.topLeftCorner(k, k).template triangularView ().solveInPlace(y); + + // use Horner-like scheme to calculate solution vector + x_new.setZero(); + for (Index i = k - 1; i >= 0; --i) + { + x_new(i) += y(i); + // apply Householder reflection H_{i} to x_new + x_new.tail(m - i).applyHouseholderOnTheLeft(H.col(i).tail(m - i - 1), tau.coeffRef(i), workspace.data()); + } + + x += x_new; + + if(stop) + { + return true; + } + else + { + k=0; + + // reset data for restart + p0.noalias() = rhs - mat*x; + r0 = precond.solve(p0); + + // clear Hessenberg matrix and Householder data + H.setZero(); + w.setZero(); + tau.setZero(); + + // generate first Householder vector + r0.makeHouseholder(H0_tail, tau.coeffRef(0), beta); + w(0) = Scalar(beta); + } + } + } + + return false; + +} + +} + +template< typename _MatrixType, + typename _Preconditioner = DiagonalPreconditioner > +class GMRES; + +namespace internal { + +template< typename _MatrixType, typename _Preconditioner> +struct traits > +{ + typedef _MatrixType MatrixType; + typedef _Preconditioner Preconditioner; +}; + +} + +/** \ingroup IterativeLinearSolvers_Module + * \brief A GMRES solver for sparse square problems + * + * This class allows to solve for A.x = b sparse linear problems using a generalized minimal + * residual 
method. The vectors x and b can be either dense or sparse. + * + * \tparam _MatrixType the type of the sparse matrix A, can be a dense or a sparse matrix. + * \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner + * + * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations() + * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations + * and NumTraits::epsilon() for the tolerance. + * + * This class can be used as the direct solver classes. Here is a typical usage example: + * \code + * int n = 10000; + * VectorXd x(n), b(n); + * SparseMatrix A(n,n); + * // fill A and b + * GMRES > solver(A); + * x = solver.solve(b); + * std::cout << "#iterations: " << solver.iterations() << std::endl; + * std::cout << "estimated error: " << solver.error() << std::endl; + * // update b, and solve again + * x = solver.solve(b); + * \endcode + * + * By default the iterations start with x=0 as an initial guess of the solution. + * One can control the start using the solveWithGuess() method. + * + * GMRES can also be used in a matrix-free context, see the following \link MatrixfreeSolverExample example \endlink. + * + * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner + */ +template< typename _MatrixType, typename _Preconditioner> +class GMRES : public IterativeSolverBase > +{ + typedef IterativeSolverBase Base; + using Base::matrix; + using Base::m_error; + using Base::m_iterations; + using Base::m_info; + using Base::m_isInitialized; + +private: + Index m_restart; + +public: + using Base::_solve_impl; + typedef _MatrixType MatrixType; + typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::RealScalar RealScalar; + typedef _Preconditioner Preconditioner; + +public: + + /** Default constructor. */ + GMRES() : Base(), m_restart(30) {} + + /** Initialize the solver with matrix \a A for further \c Ax=b solving. 
+ * + * This constructor is a shortcut for the default constructor followed + * by a call to compute(). + * + * \warning this class stores a reference to the matrix A as well as some + * precomputed values that depend on it. Therefore, if \a A is changed + * this class becomes invalid. Call compute() to update it with the new + * matrix A, or modify a copy of A. + */ + template + explicit GMRES(const EigenBase& A) : Base(A.derived()), m_restart(30) {} + + ~GMRES() {} + + /** Get the number of iterations after which a restart is performed. + */ + Index get_restart() { return m_restart; } + + /** Set the number of iterations after which a restart is performed. + * \param restart number of iterations for a restart, default is 30. + */ + void set_restart(const Index restart) { m_restart=restart; } + + /** \internal */ + template + void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const + { + m_iterations = Base::maxIterations(); + m_error = Base::m_tolerance; + bool ret = internal::gmres(matrix(), b, x, Base::m_preconditioner, m_iterations, m_restart, m_error); + m_info = (!ret) ? NumericalIssue + : m_error <= Base::m_tolerance ? Success + : NoConvergence; + } + +protected: + +}; + +} // end namespace Eigen + +#endif // EIGEN_GMRES_H diff --git a/src/EigenUnsupported/src/IterativeSolvers/IDRS.h b/src/EigenUnsupported/src/IterativeSolvers/IDRS.h new file mode 100755 index 0000000..90d20fa --- /dev/null +++ b/src/EigenUnsupported/src/IterativeSolvers/IDRS.h @@ -0,0 +1,436 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2020 Chris Schoutrop +// Copyright (C) 2020 Jens Wehner +// Copyright (C) 2020 Jan van Dijk +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ + +#ifndef EIGEN_IDRS_H +#define EIGEN_IDRS_H + +namespace Eigen +{ + + namespace internal + { + /** \internal Low-level Induced Dimension Reduction algorithm + \param A The matrix A + \param b The right hand side vector b + \param x On input an initial solution, on output the computed solution. + \param precond A preconditioner being able to efficiently solve for an + approximation of Ax=b (regardless of b) + \param iter On input the max number of iterations, on output the number of performed iterations. + \param relres On input the tolerance error, on output an estimation of the relative error. + \param S On input the dimension of the shadow space. + \param smoothing switches residual smoothing on. + \param angle small omega leads to faster convergence at the expense of numerical stability + \param replacement switches on a residual replacement strategy to increase accuracy of residual at the expense of more Mat*vec products + \return false in the case of numerical issue, for example a breakdown of IDRS.
+ */ + template + typename Vector::Scalar omega(const Vector& t, const Vector& s, RealScalar angle) + { + using numext::abs; + typedef typename Vector::Scalar Scalar; + const RealScalar ns = s.norm(); + const RealScalar nt = t.norm(); + const Scalar ts = t.dot(s); + const RealScalar rho = abs(ts / (nt * ns)); + + if (rho < angle) { + if (ts == Scalar(0)) { + return Scalar(0); + } + // Original relation for om is given by + // om = om * angle / rho; + // To alleviate potential (near) division by zero this can be rewritten as + // om = angle * (ns / nt) * (ts / abs(ts)) = angle * (ns / nt) * sgn(ts) + return angle * (ns / nt) * (ts / abs(ts)); + } + return ts / (nt * nt); + } + + template + bool idrs(const MatrixType& A, const Rhs& b, Dest& x, const Preconditioner& precond, + Index& iter, + typename Dest::RealScalar& relres, Index S, bool smoothing, typename Dest::RealScalar angle, bool replacement) + { + typedef typename Dest::RealScalar RealScalar; + typedef typename Dest::Scalar Scalar; + typedef Matrix VectorType; + typedef Matrix DenseMatrixType; + const Index N = b.size(); + S = S < x.rows() ? S : x.rows(); + const RealScalar tol = relres; + const Index maxit = iter; + + Index replacements = 0; + bool trueres = false; + + FullPivLU lu_solver; + + DenseMatrixType P; + { + HouseholderQR qr(DenseMatrixType::Random(N, S)); + P = (qr.householderQ() * DenseMatrixType::Identity(N, S)); + } + + const RealScalar normb = b.norm(); + + if (internal::isApprox(normb, RealScalar(0))) + { + //Solution is the zero vector + x.setZero(); + iter = 0; + relres = 0; + return true; + } + // from http://homepage.tudelft.nl/1w5b5/IDRS/manual.pdf + // A peak in the residual is considered dangerously high if‖ri‖/‖b‖> C(tol/epsilon). + // With epsilon the + // relative machine precision. 
The factor tol/epsilon corresponds to the size of a + // finite precision number that is so large that the absolute round-off error in + // this number, when propagated through the process, makes it impossible to + // achieve the required accuracy.The factor C accounts for the accumulation of + // round-off errors. This parameter has beenset to 10−3. + // mp is epsilon/C + // 10^3 * eps is very conservative, so normally no residual replacements will take place. + // It only happens if things go very wrong. Too many restarts may ruin the convergence. + const RealScalar mp = RealScalar(1e3) * NumTraits::epsilon(); + + + + //Compute initial residual + const RealScalar tolb = tol * normb; //Relative tolerance + VectorType r = b - A * x; + + VectorType x_s, r_s; + + if (smoothing) + { + x_s = x; + r_s = r; + } + + RealScalar normr = r.norm(); + + if (normr <= tolb) + { + //Initial guess is a good enough solution + iter = 0; + relres = normr / normb; + return true; + } + + DenseMatrixType G = DenseMatrixType::Zero(N, S); + DenseMatrixType U = DenseMatrixType::Zero(N, S); + DenseMatrixType M = DenseMatrixType::Identity(S, S); + VectorType t(N), v(N); + Scalar om = 1.; + + //Main iteration loop, guild G-spaces: + iter = 0; + + while (normr > tolb && iter < maxit) + { + //New right hand size for small system: + VectorType f = (r.adjoint() * P).adjoint(); + + for (Index k = 0; k < S; ++k) + { + //Solve small system and make v orthogonal to P: + //c = M(k:s,k:s)\f(k:s); + lu_solver.compute(M.block(k , k , S -k, S - k )); + VectorType c = lu_solver.solve(f.segment(k , S - k )); + //v = r - G(:,k:s)*c; + v = r - G.rightCols(S - k ) * c; + //Preconditioning + v = precond.solve(v); + + //Compute new U(:,k) and G(:,k), G(:,k) is in space G_j + U.col(k) = U.rightCols(S - k ) * c + om * v; + G.col(k) = A * U.col(k ); + + //Bi-Orthogonalise the new basis vectors: + for (Index i = 0; i < k-1 ; ++i) + { + //alpha = ( P(:,i)'*G(:,k) )/M(i,i); + Scalar alpha = P.col(i ).dot(G.col(k )) / 
M(i, i ); + G.col(k ) = G.col(k ) - alpha * G.col(i ); + U.col(k ) = U.col(k ) - alpha * U.col(i ); + } + + //New column of M = P'*G (first k-1 entries are zero) + //M(k:s,k) = (G(:,k)'*P(:,k:s))'; + M.block(k , k , S - k , 1) = (G.col(k ).adjoint() * P.rightCols(S - k )).adjoint(); + + if (internal::isApprox(M(k,k), Scalar(0))) + { + return false; + } + + //Make r orthogonal to q_i, i = 0..k-1 + Scalar beta = f(k ) / M(k , k ); + r = r - beta * G.col(k ); + x = x + beta * U.col(k ); + normr = r.norm(); + + if (replacement && normr > tolb / mp) + { + trueres = true; + } + + //Smoothing: + if (smoothing) + { + t = r_s - r; + //gamma is a Scalar, but the conversion is not allowed + Scalar gamma = t.dot(r_s) / t.norm(); + r_s = r_s - gamma * t; + x_s = x_s - gamma * (x_s - x); + normr = r_s.norm(); + } + + if (normr < tolb || iter == maxit) + { + break; + } + + //New f = P'*r (first k components are zero) + if (k < S-1) + { + f.segment(k + 1, S - (k + 1) ) = f.segment(k + 1 , S - (k + 1)) - beta * M.block(k + 1 , k , S - (k + 1), 1); + } + }//end for + + if (normr < tolb || iter == maxit) + { + break; + } + + //Now we have sufficient vectors in G_j to compute residual in G_j+1 + //Note: r is already perpendicular to P so v = r + //Preconditioning + v = r; + v = precond.solve(v); + + //Matrix-vector multiplication: + t = A * v; + + //Computation of a new omega + om = internal::omega(t, r, angle); + + if (om == RealScalar(0.0)) + { + return false; + } + + r = r - om * t; + x = x + om * v; + normr = r.norm(); + + if (replacement && normr > tolb / mp) + { + trueres = true; + } + + //Residual replacement? 
+ if (trueres && normr < normb) + { + r = b - A * x; + trueres = false; + replacements++; + } + + //Smoothing: + if (smoothing) + { + t = r_s - r; + Scalar gamma = t.dot(r_s) /t.norm(); + r_s = r_s - gamma * t; + x_s = x_s - gamma * (x_s - x); + normr = r_s.norm(); + } + + iter++; + + }//end while + + if (smoothing) + { + x = x_s; + } + relres=normr/normb; + return true; + } + + } // namespace internal + + template > + class IDRS; + + namespace internal + { + + template + struct traits > + { + typedef _MatrixType MatrixType; + typedef _Preconditioner Preconditioner; + }; + + } // namespace internal + + +/** \ingroup IterativeLinearSolvers_Module + * \brief The Induced Dimension Reduction method (IDR(s)) is a short-recurrences Krylov method for sparse square problems. + * + * This class allows to solve for A.x = b sparse linear problems. The vectors x and b can be either dense or sparse. + * he Induced Dimension Reduction method, IDR(), is a robust and efficient short-recurrence Krylov subspace method for + * solving large nonsymmetric systems of linear equations. + * + * For indefinite systems IDR(S) outperforms both BiCGStab and BiCGStab(L). Additionally, IDR(S) can handle matrices + * with complex eigenvalues more efficiently than BiCGStab. + * + * Many problems that do not converge for BiCGSTAB converge for IDR(s) (for larger values of s). And if both methods + * converge the convergence for IDR(s) is typically much faster for difficult systems (for example indefinite problems). + * + * IDR(s) is a limited memory finite termination method. In exact arithmetic it converges in at most N+N/s iterations, + * with N the system size. It uses a fixed number of 4+3s vector. In comparison, BiCGSTAB terminates in 2N iterations + * and uses 7 vectors. GMRES terminates in at most N iterations, and uses I+3 vectors, with I the number of iterations. + * Restarting GMRES limits the memory consumption, but destroys the finite termination property. 
+ * + * \tparam _MatrixType the type of the sparse matrix A, can be a dense or a sparse matrix. + * \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner + * + * \implsparsesolverconcept + * + * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations() + * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations + * and NumTraits::epsilon() for the tolerance. + * + * The tolerance corresponds to the relative residual error: |Ax-b|/|b| + * + * \b Performance: when using sparse matrices, best performance is achieved for a row-major sparse matrix format. + * Moreover, in this case multi-threading can be exploited if the user code is compiled with OpenMP enabled. + * See \ref TopicMultiThreading for details. + * + * By default the iterations start with x=0 as an initial guess of the solution. + * One can control the start using the solveWithGuess() method. + * + * IDR(s) can also be used in a matrix-free context, see the following \link MatrixfreeSolverExample example \endlink. + * + * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner + */ + template + class IDRS : public IterativeSolverBase > + { + + public: + typedef _MatrixType MatrixType; + typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::RealScalar RealScalar; + typedef _Preconditioner Preconditioner; + + private: + typedef IterativeSolverBase Base; + using Base::m_error; + using Base::m_info; + using Base::m_isInitialized; + using Base::m_iterations; + using Base::matrix; + Index m_S; + bool m_smoothing; + RealScalar m_angle; + bool m_residual; + + public: + /** Default constructor. */ + IDRS(): m_S(4), m_smoothing(false), m_angle(RealScalar(0.7)), m_residual(false) {} + + /** Initialize the solver with matrix \a A for further \c Ax=b solving. + + This constructor is a shortcut for the default constructor followed + by a call to compute().
+ + \warning this class stores a reference to the matrix A as well as some + precomputed values that depend on it. Therefore, if \a A is changed + this class becomes invalid. Call compute() to update it with the new + matrix A, or modify a copy of A. + */ + template + explicit IDRS(const EigenBase& A) : Base(A.derived()), m_S(4), m_smoothing(false), + m_angle(RealScalar(0.7)), m_residual(false) {} + + + /** \internal */ + /** Loops over the number of columns of b and does the following: + 1. sets the tolerence and maxIterations + 2. Calls the function that has the core solver routine + */ + template + void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const + { + m_iterations = Base::maxIterations(); + m_error = Base::m_tolerance; + + bool ret = internal::idrs(matrix(), b, x, Base::m_preconditioner, m_iterations, m_error, m_S,m_smoothing,m_angle,m_residual); + + m_info = (!ret) ? NumericalIssue : m_error <= Base::m_tolerance ? Success : NoConvergence; + } + + /** Sets the parameter S, indicating the dimension of the shadow space. Default is 4*/ + void setS(Index S) + { + if (S < 1) + { + S = 4; + } + + m_S = S; + } + + /** Switches off and on smoothing. + Residual smoothing results in monotonically decreasing residual norms at + the expense of two extra vectors of storage and a few extra vector + operations. Although monotonic decrease of the residual norms is a + desirable property, the rate of convergence of the unsmoothed process and + the smoothed process is basically the same. Default is off */ + void setSmoothing(bool smoothing) + { + m_smoothing=smoothing; + } + + /** The angle must be a real scalar. In IDR(s), a value for the + iteration parameter omega must be chosen in every s+1th step. The most + natural choice is to select a value to minimize the norm of the next residual. + This corresponds to the parameter omega = 0. 
In practice, this may lead to + values of omega that are so small that the other iteration parameters + cannot be computed with sufficient accuracy. In such cases it is better to + increase the value of omega sufficiently such that a compromise is reached + between accurate computations and reduction of the residual norm. The + parameter angle =0.7 (”maintaining the convergence strategy”) + results in such a compromise. */ + void setAngle(RealScalar angle) + { + m_angle=angle; + } + + /** The parameter replace is a logical that determines whether a + residual replacement strategy is employed to increase the accuracy of the + solution. */ + void setResidualUpdate(bool update) + { + m_residual=update; + } + + }; + +} // namespace Eigen + +#endif /* EIGEN_IDRS_H */ diff --git a/src/EigenUnsupported/src/IterativeSolvers/IncompleteLU.h b/src/EigenUnsupported/src/IterativeSolvers/IncompleteLU.h new file mode 100644 index 0000000..7d08c35 --- /dev/null +++ b/src/EigenUnsupported/src/IterativeSolvers/IncompleteLU.h @@ -0,0 +1,90 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2011 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_INCOMPLETE_LU_H +#define EIGEN_INCOMPLETE_LU_H + +namespace Eigen { + +template +class IncompleteLU : public SparseSolverBase > +{ + protected: + typedef SparseSolverBase > Base; + using Base::m_isInitialized; + + typedef _Scalar Scalar; + typedef Matrix Vector; + typedef typename Vector::Index Index; + typedef SparseMatrix FactorType; + + public: + typedef Matrix MatrixType; + + IncompleteLU() {} + + template + IncompleteLU(const MatrixType& mat) + { + compute(mat); + } + + Index rows() const { return m_lu.rows(); } + Index cols() const { return m_lu.cols(); } + + template + IncompleteLU& compute(const MatrixType& mat) + { + m_lu = mat; + int size = mat.cols(); + Vector diag(size); + for(int i=0; i + void _solve_impl(const Rhs& b, Dest& x) const + { + x = m_lu.template triangularView().solve(b); + x = m_lu.template triangularView().solve(x); + } + + protected: + FactorType m_lu; +}; + +} // end namespace Eigen + +#endif // EIGEN_INCOMPLETE_LU_H diff --git a/src/EigenUnsupported/src/IterativeSolvers/IterationController.h b/src/EigenUnsupported/src/IterativeSolvers/IterationController.h new file mode 100644 index 0000000..a116e09 --- /dev/null +++ b/src/EigenUnsupported/src/IterativeSolvers/IterationController.h @@ -0,0 +1,154 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2009 Gael Guennebaud + +/* NOTE The class IterationController has been adapted from the iteration + * class of the GMM++ and ITL libraries. + */ + +//======================================================================= +// Copyright (C) 1997-2001 +// Authors: Andrew Lumsdaine +// Lie-Quan Lee +// +// This file is part of the Iterative Template Library +// +// You should have received a copy of the License Agreement for the +// Iterative Template Library along with the software; see the +// file LICENSE. 
+// +// Permission to modify the code and to distribute modified code is +// granted, provided the text of this NOTICE is retained, a notice that +// the code was modified is included with the above COPYRIGHT NOTICE and +// with the COPYRIGHT NOTICE in the LICENSE file, and that the LICENSE +// file is distributed with the modified code. +// +// LICENSOR MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. +// By way of example, but not limitation, Licensor MAKES NO +// REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY +// PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE COMPONENTS +// OR DOCUMENTATION WILL NOT INFRINGE ANY PATENTS, COPYRIGHTS, TRADEMARKS +// OR OTHER RIGHTS. +//======================================================================= + +//======================================================================== +// +// Copyright (C) 2002-2007 Yves Renard +// +// This file is a part of GETFEM++ +// +// Getfem++ is free software; you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as +// published by the Free Software Foundation; version 2.1 of the License. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// You should have received a copy of the GNU Lesser General Public +// License along with this program; if not, write to the Free Software +// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, +// USA. 
+// +//======================================================================== + +#include "../../../../Eigen/src/Core/util/NonMPL2.h" + +#ifndef EIGEN_ITERATION_CONTROLLER_H +#define EIGEN_ITERATION_CONTROLLER_H + +namespace Eigen { + +/** \ingroup IterativeLinearSolvers_Module + * \class IterationController + * + * \brief Controls the iterations of the iterative solvers + * + * This class has been adapted from the iteration class of GMM++ and ITL libraries. + * + */ +class IterationController +{ + protected : + double m_rhsn; ///< Right hand side norm + size_t m_maxiter; ///< Max. number of iterations + int m_noise; ///< if noise > 0 iterations are printed + double m_resmax; ///< maximum residual + double m_resminreach, m_resadd; + size_t m_nit; ///< iteration number + double m_res; ///< last computed residual + bool m_written; + void (*m_callback)(const IterationController&); + public : + + void init() + { + m_nit = 0; m_res = 0.0; m_written = false; + m_resminreach = 1E50; m_resadd = 0.0; + m_callback = 0; + } + + IterationController(double r = 1.0E-8, int noi = 0, size_t mit = size_t(-1)) + : m_rhsn(1.0), m_maxiter(mit), m_noise(noi), m_resmax(r) { init(); } + + void operator ++(int) { m_nit++; m_written = false; m_resadd += m_res; } + void operator ++() { (*this)++; } + + bool first() { return m_nit == 0; } + + /* get/set the "noisyness" (verbosity) of the solvers */ + int noiseLevel() const { return m_noise; } + void setNoiseLevel(int n) { m_noise = n; } + void reduceNoiseLevel() { if (m_noise > 0) m_noise--; } + + double maxResidual() const { return m_resmax; } + void setMaxResidual(double r) { m_resmax = r; } + + double residual() const { return m_res; } + + /* change the user-definable callback, called after each iteration */ + void setCallback(void (*t)(const IterationController&)) + { + m_callback = t; + } + + size_t iteration() const { return m_nit; } + void setIteration(size_t i) { m_nit = i; } + + size_t maxIterarions() const { return m_maxiter; } + 
void setMaxIterations(size_t i) { m_maxiter = i; } + + double rhsNorm() const { return m_rhsn; } + void setRhsNorm(double r) { m_rhsn = r; } + + bool converged() const { return m_res <= m_rhsn * m_resmax; } + bool converged(double nr) + { + using std::abs; + m_res = abs(nr); + m_resminreach = (std::min)(m_resminreach, m_res); + return converged(); + } + template bool converged(const VectorType &v) + { return converged(v.squaredNorm()); } + + bool finished(double nr) + { + if (m_callback) m_callback(*this); + if (m_noise > 0 && !m_written) + { + converged(nr); + m_written = true; + } + return (m_nit >= m_maxiter || converged(nr)); + } + template + bool finished(const MatrixBase &v) + { return finished(double(v.squaredNorm())); } + +}; + +} // end namespace Eigen + +#endif // EIGEN_ITERATION_CONTROLLER_H diff --git a/src/EigenUnsupported/src/IterativeSolvers/MINRES.h b/src/EigenUnsupported/src/IterativeSolvers/MINRES.h new file mode 100644 index 0000000..5db454d --- /dev/null +++ b/src/EigenUnsupported/src/IterativeSolvers/MINRES.h @@ -0,0 +1,267 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2012 Giacomo Po +// Copyright (C) 2011-2014 Gael Guennebaud +// Copyright (C) 2018 David Hyde +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +#ifndef EIGEN_MINRES_H_ +#define EIGEN_MINRES_H_ + + +namespace Eigen { + + namespace internal { + + /** \internal Low-level MINRES algorithm + * \param mat The matrix A + * \param rhs The right hand side vector b + * \param x On input an initial solution, on output the computed solution.
+ * \param precond A right preconditioner being able to efficiently solve for an + * approximation of Ax=b (regardless of b) + * \param iters On input the max number of iteration, on output the number of performed iterations. + * \param tol_error On input the tolerance error, on output an estimation of the relative error. + */ + template + EIGEN_DONT_INLINE + void minres(const MatrixType& mat, const Rhs& rhs, Dest& x, + const Preconditioner& precond, Index& iters, + typename Dest::RealScalar& tol_error) + { + using std::sqrt; + typedef typename Dest::RealScalar RealScalar; + typedef typename Dest::Scalar Scalar; + typedef Matrix VectorType; + + // Check for zero rhs + const RealScalar rhsNorm2(rhs.squaredNorm()); + if(rhsNorm2 == 0) + { + x.setZero(); + iters = 0; + tol_error = 0; + return; + } + + // initialize + const Index maxIters(iters); // initialize maxIters to iters + const Index N(mat.cols()); // the size of the matrix + const RealScalar threshold2(tol_error*tol_error*rhsNorm2); // convergence threshold (compared to residualNorm2) + + // Initialize preconditioned Lanczos + VectorType v_old(N); // will be initialized inside loop + VectorType v( VectorType::Zero(N) ); //initialize v + VectorType v_new(rhs-mat*x); //initialize v_new + RealScalar residualNorm2(v_new.squaredNorm()); + VectorType w(N); // will be initialized inside loop + VectorType w_new(precond.solve(v_new)); // initialize w_new +// RealScalar beta; // will be initialized inside loop + RealScalar beta_new2(v_new.dot(w_new)); + eigen_assert(beta_new2 >= 0.0 && "PRECONDITIONER IS NOT POSITIVE DEFINITE"); + RealScalar beta_new(sqrt(beta_new2)); + const RealScalar beta_one(beta_new); + // Initialize other variables + RealScalar c(1.0); // the cosine of the Givens rotation + RealScalar c_old(1.0); + RealScalar s(0.0); // the sine of the Givens rotation + RealScalar s_old(0.0); // the sine of the Givens rotation + VectorType p_oold(N); // will be initialized in loop + VectorType 
p_old(VectorType::Zero(N)); // initialize p_old=0 + VectorType p(p_old); // initialize p=0 + RealScalar eta(1.0); + + iters = 0; // reset iters + while ( iters < maxIters ) + { + // Preconditioned Lanczos + /* Note that there are 4 variants on the Lanczos algorithm. These are + * described in Paige, C. C. (1972). Computational variants of + * the Lanczos method for the eigenproblem. IMA Journal of Applied + * Mathematics, 10(3), 373-381. The current implementation corresponds + * to the case A(2,7) in the paper. It also corresponds to + * algorithm 6.14 in Y. Saad, Iterative Methods for Sparse Linear + * Systems, 2003 p.173. For the preconditioned version see + * A. Greenbaum, Iterative Methods for Solving Linear Systems, SIAM (1987). + */ + const RealScalar beta(beta_new); + v_old = v; // update: at first time step, this makes v_old = 0 so value of beta doesn't matter + v_new /= beta_new; // overwrite v_new for next iteration + w_new /= beta_new; // overwrite w_new for next iteration + v = v_new; // update + w = w_new; // update + v_new.noalias() = mat*w - beta*v_old; // compute v_new + const RealScalar alpha = v_new.dot(w); + v_new -= alpha*v; // overwrite v_new + w_new = precond.solve(v_new); // overwrite w_new + beta_new2 = v_new.dot(w_new); // compute beta_new + eigen_assert(beta_new2 >= 0.0 && "PRECONDITIONER IS NOT POSITIVE DEFINITE"); + beta_new = sqrt(beta_new2); // compute beta_new + + // Givens rotation + const RealScalar r2 =s*alpha+c*c_old*beta; // s, s_old, c and c_old are still from previous iteration + const RealScalar r3 =s_old*beta; // s, s_old, c and c_old are still from previous iteration + const RealScalar r1_hat=c*alpha-c_old*s*beta; + const RealScalar r1 =sqrt( std::pow(r1_hat,2) + std::pow(beta_new,2) ); + c_old = c; // store for next iteration + s_old = s; // store for next iteration + c=r1_hat/r1; // new cosine + s=beta_new/r1; // new sine + + // Update solution + p_oold = p_old; + p_old = p; + p.noalias()=(w-r2*p_old-r3*p_oold) /r1; // IS 
NOALIAS REQUIRED? + x += beta_one*c*eta*p; + + /* Update the squared residual. Note that this is the estimated residual. + The real residual |Ax-b|^2 may be slightly larger */ + residualNorm2 *= s*s; + + if ( residualNorm2 < threshold2) + { + break; + } + + eta=-s*eta; // update eta + iters++; // increment iteration number (for output purposes) + } + + /* Compute error. Note that this is the estimated error. The real + error |Ax-b|/|b| may be slightly larger */ + tol_error = std::sqrt(residualNorm2 / rhsNorm2); + } + + } + + template< typename _MatrixType, int _UpLo=Lower, + typename _Preconditioner = IdentityPreconditioner> + class MINRES; + + namespace internal { + + template< typename _MatrixType, int _UpLo, typename _Preconditioner> + struct traits > + { + typedef _MatrixType MatrixType; + typedef _Preconditioner Preconditioner; + }; + + } + + /** \ingroup IterativeLinearSolvers_Module + * \brief A minimal residual solver for sparse symmetric problems + * + * This class allows to solve for A.x = b sparse linear problems using the MINRES algorithm + * of Paige and Saunders (1975). The sparse matrix A must be symmetric (possibly indefinite). + * The vectors x and b can be either dense or sparse. + * + * \tparam _MatrixType the type of the sparse matrix A, can be a dense or a sparse matrix. + * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower, + * Upper, or Lower|Upper in which the full matrix entries will be considered. Default is Lower. + * \tparam _Preconditioner the type of the preconditioner. Default is IdentityPreconditioner + * + * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations() + * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations + * and NumTraits::epsilon() for the tolerance. + * + * This class can be used as the direct solver classes.
Here is a typical usage example: + * \code + * int n = 10000; + * VectorXd x(n), b(n); + * SparseMatrix A(n,n); + * // fill A and b + * MINRES > mr; + * mr.compute(A); + * x = mr.solve(b); + * std::cout << "#iterations: " << mr.iterations() << std::endl; + * std::cout << "estimated error: " << mr.error() << std::endl; + * // update b, and solve again + * x = mr.solve(b); + * \endcode + * + * By default the iterations start with x=0 as an initial guess of the solution. + * One can control the start using the solveWithGuess() method. + * + * MINRES can also be used in a matrix-free context, see the following \link MatrixfreeSolverExample example \endlink. + * + * \sa class ConjugateGradient, BiCGSTAB, SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner + */ + template< typename _MatrixType, int _UpLo, typename _Preconditioner> + class MINRES : public IterativeSolverBase > + { + + typedef IterativeSolverBase Base; + using Base::matrix; + using Base::m_error; + using Base::m_iterations; + using Base::m_info; + using Base::m_isInitialized; + public: + using Base::_solve_impl; + typedef _MatrixType MatrixType; + typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::RealScalar RealScalar; + typedef _Preconditioner Preconditioner; + + enum {UpLo = _UpLo}; + + public: + + /** Default constructor. */ + MINRES() : Base() {} + + /** Initialize the solver with matrix \a A for further \c Ax=b solving. + * + * This constructor is a shortcut for the default constructor followed + * by a call to compute(). + * + * \warning this class stores a reference to the matrix A as well as some + * precomputed values that depend on it. Therefore, if \a A is changed + * this class becomes invalid. Call compute() to update it with the new + * matrix A, or modify a copy of A. + */ + template + explicit MINRES(const EigenBase& A) : Base(A.derived()) {} + + /** Destructor. 
*/ + ~MINRES(){} + + /** \internal */ + template + void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const + { + typedef typename Base::MatrixWrapper MatrixWrapper; + typedef typename Base::ActualMatrixType ActualMatrixType; + enum { + TransposeInput = (!MatrixWrapper::MatrixFree) + && (UpLo==(Lower|Upper)) + && (!MatrixType::IsRowMajor) + && (!NumTraits::IsComplex) + }; + typedef typename internal::conditional, ActualMatrixType const&>::type RowMajorWrapper; + EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(MatrixWrapper::MatrixFree,UpLo==(Lower|Upper)),MATRIX_FREE_CONJUGATE_GRADIENT_IS_COMPATIBLE_WITH_UPPER_UNION_LOWER_MODE_ONLY); + typedef typename internal::conditional::Type + >::type SelfAdjointWrapper; + + m_iterations = Base::maxIterations(); + m_error = Base::m_tolerance; + RowMajorWrapper row_mat(matrix()); + internal::minres(SelfAdjointWrapper(row_mat), b, x, + Base::m_preconditioner, m_iterations, m_error); + m_info = m_error <= Base::m_tolerance ? Success : NoConvergence; + } + + protected: + + }; + +} // end namespace Eigen + +#endif // EIGEN_MINRES_H diff --git a/src/EigenUnsupported/src/IterativeSolvers/Scaling.h b/src/EigenUnsupported/src/IterativeSolvers/Scaling.h new file mode 100644 index 0000000..9b3eb53 --- /dev/null +++ b/src/EigenUnsupported/src/IterativeSolvers/Scaling.h @@ -0,0 +1,193 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2012 Desire NUENTSA WAKAM A; + * // fill A and b; + * IterScaling > scal; + * // Compute the left and right scaling vectors. The matrix is equilibrated at output + * scal.computeRef(A); + * // Scale the right hand side + * b = scal.LeftScaling().cwiseProduct(b); + * // Now, solve the equilibrated linear system with any available solver + * + * // Scale back the computed solution + * x = scal.RightScaling().cwiseProduct(x); + * \endcode + * + * \tparam _MatrixType the type of the matrix. It should be a real square sparsematrix + * + * References : D. 
Ruiz and B. Ucar, A Symmetry Preserving Algorithm for Matrix Scaling, INRIA Research report RR-7552 + * + * \sa \ref IncompleteLUT + */ +template +class IterScaling +{ + public: + typedef _MatrixType MatrixType; + typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::Index Index; + + public: + IterScaling() { init(); } + + IterScaling(const MatrixType& matrix) + { + init(); + compute(matrix); + } + + ~IterScaling() { } + + /** + * Compute the left and right diagonal matrices to scale the input matrix @p mat + * + * FIXME This algorithm will be modified such that the diagonal elements are permuted on the diagonal. + * + * \sa LeftScaling() RightScaling() + */ + void compute (const MatrixType& mat) + { + using std::abs; + int m = mat.rows(); + int n = mat.cols(); + eigen_assert((m>0 && m == n) && "Please give a non - empty matrix"); + m_left.resize(m); + m_right.resize(n); + m_left.setOnes(); + m_right.setOnes(); + m_matrix = mat; + VectorXd Dr, Dc, DrRes, DcRes; // Temporary Left and right scaling vectors + Dr.resize(m); Dc.resize(n); + DrRes.resize(m); DcRes.resize(n); + double EpsRow = 1.0, EpsCol = 1.0; + int its = 0; + do + { // Iterate until the infinite norm of each row and column is approximately 1 + // Get the maximum value in each row and column + Dr.setZero(); Dc.setZero(); + for (int k=0; km_tol || EpsCol > m_tol) && (its < m_maxits) ); + m_isInitialized = true; + } + /** Compute the left and right vectors to scale the vectors + * the input matrix is scaled with the computed vectors at output + * + * \sa compute() + */ + void computeRef (MatrixType& mat) + { + compute (mat); + mat = m_matrix; + } + /** Get the vector to scale the rows of the matrix + */ + VectorXd& LeftScaling() + { + return m_left; + } + + /** Get the vector to scale the columns of the matrix + */ + VectorXd& RightScaling() + { + return m_right; + } + + /** Set the tolerance for the convergence of the iterative scaling algorithm + */ + void setTolerance(double tol) + 
{ + m_tol = tol; + } + + protected: + + void init() + { + m_tol = 1e-10; + m_maxits = 5; + m_isInitialized = false; + } + + MatrixType m_matrix; + mutable ComputationInfo m_info; + bool m_isInitialized; + VectorXd m_left; // Left scaling vector + VectorXd m_right; // m_right scaling vector + double m_tol; + int m_maxits; // Maximum number of iterations allowed +}; +} +#endif diff --git a/src/EigenUnsupported/src/KroneckerProduct/KroneckerTensorProduct.h b/src/EigenUnsupported/src/KroneckerProduct/KroneckerTensorProduct.h new file mode 100644 index 0000000..6a9b0be --- /dev/null +++ b/src/EigenUnsupported/src/KroneckerProduct/KroneckerTensorProduct.h @@ -0,0 +1,305 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2011 Kolja Brix +// Copyright (C) 2011 Andreas Platen +// Copyright (C) 2012 Chen-Pang He +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef KRONECKER_TENSOR_PRODUCT_H +#define KRONECKER_TENSOR_PRODUCT_H + +namespace Eigen { + +/*! + * \ingroup KroneckerProduct_Module + * + * \brief The base class of dense and sparse Kronecker product. + * + * \tparam Derived is the derived type. + */ +template +class KroneckerProductBase : public ReturnByValue +{ + private: + typedef typename internal::traits Traits; + typedef typename Traits::Scalar Scalar; + + protected: + typedef typename Traits::Lhs Lhs; + typedef typename Traits::Rhs Rhs; + + public: + /*! \brief Constructor. */ + KroneckerProductBase(const Lhs& A, const Rhs& B) + : m_A(A), m_B(B) + {} + + inline Index rows() const { return m_A.rows() * m_B.rows(); } + inline Index cols() const { return m_A.cols() * m_B.cols(); } + + /*! + * This overrides ReturnByValue::coeff because this function is + * efficient enough. 
+ */ + Scalar coeff(Index row, Index col) const + { + return m_A.coeff(row / m_B.rows(), col / m_B.cols()) * + m_B.coeff(row % m_B.rows(), col % m_B.cols()); + } + + /*! + * Vector-only linear-index overload of ReturnByValue::coeff: entry i of the + * product is A(i / B.size()) * B(i % B.size()), matching coeff(row, col). + */ + Scalar coeff(Index i) const + { + EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived); + return m_A.coeff(i / m_B.size()) * m_B.coeff(i % m_B.size()); + } + + protected: + typename Lhs::Nested m_A; + typename Rhs::Nested m_B; +}; + +/*! + * \ingroup KroneckerProduct_Module + * + * \brief Kronecker tensor product helper class for dense matrices + * + * This class is the return value of kroneckerProduct(MatrixBase, + * MatrixBase). Use the function rather than construct this class + * directly to avoid specifying template parameters. + * + * \tparam Lhs Type of the left-hand side, a matrix expression. + * \tparam Rhs Type of the right-hand side, a matrix expression. + */ +template +class KroneckerProduct : public KroneckerProductBase > +{ + private: + typedef KroneckerProductBase Base; + using Base::m_A; + using Base::m_B; + + public: + /*! \brief Constructor. */ + KroneckerProduct(const Lhs& A, const Rhs& B) + : Base(A, B) + {} + + /*! \brief Evaluate the Kronecker tensor product. */ + template void evalTo(Dest& dst) const; +}; + +/*! + * \ingroup KroneckerProduct_Module + * + * \brief Kronecker tensor product helper class for sparse matrices + * + * If at least one of the operands is a sparse matrix expression, + * then this class is returned and evaluates into a sparse matrix. + * + * This class is the return value of kroneckerProduct(EigenBase, + * EigenBase). Use the function rather than construct this class + * directly to avoid specifying template parameters. + * + * \tparam Lhs Type of the left-hand side, a matrix expression. + * \tparam Rhs Type of the right-hand side, a matrix expression.
+ */ +template +class KroneckerProductSparse : public KroneckerProductBase > +{ + private: + typedef KroneckerProductBase Base; + using Base::m_A; + using Base::m_B; + + public: + /*! \brief Constructor. */ + KroneckerProductSparse(const Lhs& A, const Rhs& B) + : Base(A, B) + {} + + /*! \brief Evaluate the Kronecker tensor product. */ + template void evalTo(Dest& dst) const; +}; + +template +template +void KroneckerProduct::evalTo(Dest& dst) const +{ + const int BlockRows = Rhs::RowsAtCompileTime, + BlockCols = Rhs::ColsAtCompileTime; + const Index Br = m_B.rows(), + Bc = m_B.cols(); + for (Index i=0; i < m_A.rows(); ++i) + for (Index j=0; j < m_A.cols(); ++j) + Block(dst,i*Br,j*Bc,Br,Bc) = m_A.coeff(i,j) * m_B; +} + +template +template +void KroneckerProductSparse::evalTo(Dest& dst) const +{ + Index Br = m_B.rows(), Bc = m_B.cols(); + dst.resize(this->rows(), this->cols()); + dst.resizeNonZeros(0); + + // 1 - evaluate the operands if needed: + typedef typename internal::nested_eval::type Lhs1; + typedef typename internal::remove_all::type Lhs1Cleaned; + const Lhs1 lhs1(m_A); + typedef typename internal::nested_eval::type Rhs1; + typedef typename internal::remove_all::type Rhs1Cleaned; + const Rhs1 rhs1(m_B); + + // 2 - construct respective iterators + typedef Eigen::InnerIterator LhsInnerIterator; + typedef Eigen::InnerIterator RhsInnerIterator; + + // compute number of non-zeros per innervectors of dst + { + // TODO VectorXi is not necessarily big enough! + VectorXi nnzA = VectorXi::Zero(Dest::IsRowMajor ? m_A.rows() : m_A.cols()); + for (Index kA=0; kA < m_A.outerSize(); ++kA) + for (LhsInnerIterator itA(lhs1,kA); itA; ++itA) + nnzA(Dest::IsRowMajor ? itA.row() : itA.col())++; + + VectorXi nnzB = VectorXi::Zero(Dest::IsRowMajor ? m_B.rows() : m_B.cols()); + for (Index kB=0; kB < m_B.outerSize(); ++kB) + for (RhsInnerIterator itB(rhs1,kB); itB; ++itB) + nnzB(Dest::IsRowMajor ? 
itB.row() : itB.col())++; + + Matrix nnzAB = nnzB * nnzA.transpose(); + dst.reserve(VectorXi::Map(nnzAB.data(), nnzAB.size())); + } + + for (Index kA=0; kA < m_A.outerSize(); ++kA) + { + for (Index kB=0; kB < m_B.outerSize(); ++kB) + { + for (LhsInnerIterator itA(lhs1,kA); itA; ++itA) + { + for (RhsInnerIterator itB(rhs1,kB); itB; ++itB) + { + Index i = itA.row() * Br + itB.row(), + j = itA.col() * Bc + itB.col(); + dst.insert(i,j) = itA.value() * itB.value(); + } + } + } + } +} + +namespace internal { + +template +struct traits > +{ + typedef typename remove_all<_Lhs>::type Lhs; + typedef typename remove_all<_Rhs>::type Rhs; + typedef typename ScalarBinaryOpTraits::ReturnType Scalar; + typedef typename promote_index_type::type StorageIndex; + + enum { + Rows = size_at_compile_time::RowsAtCompileTime, traits::RowsAtCompileTime>::ret, + Cols = size_at_compile_time::ColsAtCompileTime, traits::ColsAtCompileTime>::ret, + MaxRows = size_at_compile_time::MaxRowsAtCompileTime, traits::MaxRowsAtCompileTime>::ret, + MaxCols = size_at_compile_time::MaxColsAtCompileTime, traits::MaxColsAtCompileTime>::ret + }; + + typedef Matrix ReturnType; +}; + +template +struct traits > +{ + typedef MatrixXpr XprKind; + typedef typename remove_all<_Lhs>::type Lhs; + typedef typename remove_all<_Rhs>::type Rhs; + typedef typename ScalarBinaryOpTraits::ReturnType Scalar; + typedef typename cwise_promote_storage_type::StorageKind, typename traits::StorageKind, scalar_product_op >::ret StorageKind; + typedef typename promote_index_type::type StorageIndex; + + enum { + LhsFlags = Lhs::Flags, + RhsFlags = Rhs::Flags, + + RowsAtCompileTime = size_at_compile_time::RowsAtCompileTime, traits::RowsAtCompileTime>::ret, + ColsAtCompileTime = size_at_compile_time::ColsAtCompileTime, traits::ColsAtCompileTime>::ret, + MaxRowsAtCompileTime = size_at_compile_time::MaxRowsAtCompileTime, traits::MaxRowsAtCompileTime>::ret, + MaxColsAtCompileTime = size_at_compile_time::MaxColsAtCompileTime, 
traits::MaxColsAtCompileTime>::ret, + + EvalToRowMajor = (int(LhsFlags) & int(RhsFlags) & RowMajorBit), + RemovedBits = ~(EvalToRowMajor ? 0 : RowMajorBit), + + Flags = ((int(LhsFlags) | int(RhsFlags)) & HereditaryBits & RemovedBits) + | EvalBeforeNestingBit, + CoeffReadCost = HugeCost + }; + + typedef SparseMatrix ReturnType; +}; + +} // end namespace internal + +/*! + * \ingroup KroneckerProduct_Module + * + * Computes Kronecker tensor product of two dense matrices + * + * \warning If you want to replace a matrix by its Kronecker product + * with some matrix, do \b NOT do this: + * \code + * A = kroneckerProduct(A,B); // bug!!! caused by aliasing effect + * \endcode + * instead, use eval() to work around this: + * \code + * A = kroneckerProduct(A,B).eval(); + * \endcode + * + * \param a Dense matrix a + * \param b Dense matrix b + * \return Kronecker tensor product of a and b + */ +template +KroneckerProduct kroneckerProduct(const MatrixBase& a, const MatrixBase& b) +{ + return KroneckerProduct(a.derived(), b.derived()); +} + +/*! + * \ingroup KroneckerProduct_Module + * + * Computes Kronecker tensor product of two matrices, at least one of + * which is sparse + * + * \warning If you want to replace a matrix by its Kronecker product + * with some matrix, do \b NOT do this: + * \code + * A = kroneckerProduct(A,B); // bug!!! 
caused by aliasing effect + * \endcode + * instead, use eval() to work around this: + * \code + * A = kroneckerProduct(A,B).eval(); + * \endcode + * + * \param a Dense/sparse matrix a + * \param b Dense/sparse matrix b + * \return Kronecker tensor product of a and b, stored in a sparse + * matrix + */ +template +KroneckerProductSparse kroneckerProduct(const EigenBase& a, const EigenBase& b) +{ + return KroneckerProductSparse(a.derived(), b.derived()); +} + +} // end namespace Eigen + +#endif // KRONECKER_TENSOR_PRODUCT_H diff --git a/src/EigenUnsupported/src/LevenbergMarquardt/CopyrightMINPACK.txt b/src/EigenUnsupported/src/LevenbergMarquardt/CopyrightMINPACK.txt new file mode 100644 index 0000000..ae7984d --- /dev/null +++ b/src/EigenUnsupported/src/LevenbergMarquardt/CopyrightMINPACK.txt @@ -0,0 +1,52 @@ +Minpack Copyright Notice (1999) University of Chicago. All rights reserved + +Redistribution and use in source and binary forms, with or +without modification, are permitted provided that the +following conditions are met: + +1. Redistributions of source code must retain the above +copyright notice, this list of conditions and the following +disclaimer. + +2. Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following +disclaimer in the documentation and/or other materials +provided with the distribution. + +3. The end-user documentation included with the +redistribution, if any, must include the following +acknowledgment: + + "This product includes software developed by the + University of Chicago, as Operator of Argonne National + Laboratory. + +Alternately, this acknowledgment may appear in the software +itself, if and wherever such third-party acknowledgments +normally appear. + +4. WARRANTY DISCLAIMER. THE SOFTWARE IS SUPPLIED "AS IS" +WITHOUT WARRANTY OF ANY KIND. 
THE COPYRIGHT HOLDER, THE +UNITED STATES, THE UNITED STATES DEPARTMENT OF ENERGY, AND +THEIR EMPLOYEES: (1) DISCLAIM ANY WARRANTIES, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO ANY IMPLIED WARRANTIES +OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE +OR NON-INFRINGEMENT, (2) DO NOT ASSUME ANY LEGAL LIABILITY +OR RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR +USEFULNESS OF THE SOFTWARE, (3) DO NOT REPRESENT THAT USE OF +THE SOFTWARE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS, (4) +DO NOT WARRANT THAT THE SOFTWARE WILL FUNCTION +UNINTERRUPTED, THAT IT IS ERROR-FREE OR THAT ANY ERRORS WILL +BE CORRECTED. + +5. LIMITATION OF LIABILITY. IN NO EVENT WILL THE COPYRIGHT +HOLDER, THE UNITED STATES, THE UNITED STATES DEPARTMENT OF +ENERGY, OR THEIR EMPLOYEES: BE LIABLE FOR ANY INDIRECT, +INCIDENTAL, CONSEQUENTIAL, SPECIAL OR PUNITIVE DAMAGES OF +ANY KIND OR NATURE, INCLUDING BUT NOT LIMITED TO LOSS OF +PROFITS OR LOSS OF DATA, FOR ANY REASON WHATSOEVER, WHETHER +SUCH LIABILITY IS ASSERTED ON THE BASIS OF CONTRACT, TORT +(INCLUDING NEGLIGENCE OR STRICT LIABILITY), OR OTHERWISE, +EVEN IF ANY OF SAID PARTIES HAS BEEN WARNED OF THE +POSSIBILITY OF SUCH LOSS OR DAMAGES. + diff --git a/src/EigenUnsupported/src/LevenbergMarquardt/LMcovar.h b/src/EigenUnsupported/src/LevenbergMarquardt/LMcovar.h new file mode 100644 index 0000000..b75bea2 --- /dev/null +++ b/src/EigenUnsupported/src/LevenbergMarquardt/LMcovar.h @@ -0,0 +1,84 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This code initially comes from MINPACK whose original authors are: +// Copyright Jorge More - Argonne National Laboratory +// Copyright Burt Garbow - Argonne National Laboratory +// Copyright Ken Hillstrom - Argonne National Laboratory +// +// This Source Code Form is subject to the terms of the Minpack license +// (a BSD-like license) described in the campaigned CopyrightMINPACK.txt file. 
+ +#ifndef EIGEN_LMCOVAR_H +#define EIGEN_LMCOVAR_H + +namespace Eigen { + +namespace internal { + +template +void covar( + Matrix< Scalar, Dynamic, Dynamic > &r, + const VectorXi& ipvt, + Scalar tol = std::sqrt(NumTraits::epsilon()) ) +{ + using std::abs; + /* Local variables */ + Index i, j, k, l, ii, jj; + bool sing; + Scalar temp; + + /* Function Body */ + const Index n = r.cols(); + const Scalar tolr = tol * abs(r(0,0)); + Matrix< Scalar, Dynamic, 1 > wa(n); + eigen_assert(ipvt.size()==n); + + /* form the inverse of r in the full upper triangle of r. */ + l = -1; + for (k = 0; k < n; ++k) + if (abs(r(k,k)) > tolr) { + r(k,k) = 1. / r(k,k); + for (j = 0; j <= k-1; ++j) { + temp = r(k,k) * r(j,k); + r(j,k) = 0.; + r.col(k).head(j+1) -= r.col(j).head(j+1) * temp; + } + l = k; + } + + /* form the full upper triangle of the inverse of (r transpose)*r */ + /* in the full upper triangle of r. */ + for (k = 0; k <= l; ++k) { + for (j = 0; j <= k-1; ++j) + r.col(j).head(j+1) += r.col(k).head(j+1) * r(j,k); + r.col(k).head(k+1) *= r(k,k); + } + + /* form the full lower triangle of the covariance matrix */ + /* in the strict lower triangle of r and in wa. */ + for (j = 0; j < n; ++j) { + jj = ipvt[j]; + sing = j > l; + for (i = 0; i <= j; ++i) { + if (sing) + r(i,j) = 0.; + ii = ipvt[i]; + if (ii > jj) + r(ii,jj) = r(i,j); + if (ii < jj) + r(jj,ii) = r(i,j); + } + wa[jj] = r(j,j); + } + + /* symmetrize the covariance matrix in r. 
*/ + r.topLeftCorner(n,n).template triangularView() = r.topLeftCorner(n,n).transpose(); + r.diagonal() = wa; +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_LMCOVAR_H diff --git a/src/EigenUnsupported/src/LevenbergMarquardt/LMonestep.h b/src/EigenUnsupported/src/LevenbergMarquardt/LMonestep.h new file mode 100644 index 0000000..25b32ec --- /dev/null +++ b/src/EigenUnsupported/src/LevenbergMarquardt/LMonestep.h @@ -0,0 +1,202 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009 Thomas Capricelli +// +// This code initially comes from MINPACK whose original authors are: +// Copyright Jorge More - Argonne National Laboratory +// Copyright Burt Garbow - Argonne National Laboratory +// Copyright Ken Hillstrom - Argonne National Laboratory +// +// This Source Code Form is subject to the terms of the Minpack license +// (a BSD-like license) described in the campaigned CopyrightMINPACK.txt file. + +#ifndef EIGEN_LMONESTEP_H +#define EIGEN_LMONESTEP_H + +namespace Eigen { + +template +LevenbergMarquardtSpace::Status +LevenbergMarquardt::minimizeOneStep(FVectorType &x) +{ + using std::abs; + using std::sqrt; + RealScalar temp, temp1,temp2; + RealScalar ratio; + RealScalar pnorm, xnorm, fnorm1, actred, dirder, prered; + eigen_assert(x.size()==n); // check the caller is not cheating us + + temp = 0.0; xnorm = 0.0; + /* calculate the jacobian matrix. */ + Index df_ret = m_functor.df(x, m_fjac); + if (df_ret<0) + return LevenbergMarquardtSpace::UserAsked; + if (df_ret>0) + // numerical diff, we evaluated the function df_ret times + m_nfev += df_ret; + else m_njev++; + + /* compute the qr factorization of the jacobian. 
*/ + for (int j = 0; j < x.size(); ++j) + m_wa2(j) = m_fjac.col(j).blueNorm(); + QRSolver qrfac(m_fjac); + if(qrfac.info() != Success) { + m_info = NumericalIssue; + return LevenbergMarquardtSpace::ImproperInputParameters; + } + // Make a copy of the first factor with the associated permutation + m_rfactor = qrfac.matrixR(); + m_permutation = (qrfac.colsPermutation()); + + /* on the first iteration and if external scaling is not used, scale according */ + /* to the norms of the columns of the initial jacobian. */ + if (m_iter == 1) { + if (!m_useExternalScaling) + for (Index j = 0; j < n; ++j) + m_diag[j] = (m_wa2[j]==0.)? 1. : m_wa2[j]; + + /* on the first iteration, calculate the norm of the scaled x */ + /* and initialize the step bound m_delta. */ + xnorm = m_diag.cwiseProduct(x).stableNorm(); + m_delta = m_factor * xnorm; + if (m_delta == 0.) + m_delta = m_factor; + } + + /* form (q transpose)*m_fvec and store the first n components in */ + /* m_qtf. */ + m_wa4 = m_fvec; + m_wa4 = qrfac.matrixQ().adjoint() * m_fvec; + m_qtf = m_wa4.head(n); + + /* compute the norm of the scaled gradient. */ + m_gnorm = 0.; + if (m_fnorm != 0.) + for (Index j = 0; j < n; ++j) + if (m_wa2[m_permutation.indices()[j]] != 0.) + m_gnorm = (std::max)(m_gnorm, abs( m_rfactor.col(j).head(j+1).dot(m_qtf.head(j+1)/m_fnorm) / m_wa2[m_permutation.indices()[j]])); + + /* test for convergence of the gradient norm. */ + if (m_gnorm <= m_gtol) { + m_info = Success; + return LevenbergMarquardtSpace::CosinusTooSmall; + } + + /* rescale if necessary. */ + if (!m_useExternalScaling) + m_diag = m_diag.cwiseMax(m_wa2); + + do { + /* determine the levenberg-marquardt parameter. */ + internal::lmpar2(qrfac, m_diag, m_qtf, m_delta, m_par, m_wa1); + + /* store the direction p and x + p. calculate the norm of p. */ + m_wa1 = -m_wa1; + m_wa2 = x + m_wa1; + pnorm = m_diag.cwiseProduct(m_wa1).stableNorm(); + + /* on the first iteration, adjust the initial step bound. 
*/ + if (m_iter == 1) + m_delta = (std::min)(m_delta,pnorm); + + /* evaluate the function at x + p and calculate its norm. */ + if ( m_functor(m_wa2, m_wa4) < 0) + return LevenbergMarquardtSpace::UserAsked; + ++m_nfev; + fnorm1 = m_wa4.stableNorm(); + + /* compute the scaled actual reduction. */ + actred = -1.; + if (Scalar(.1) * fnorm1 < m_fnorm) + actred = 1. - numext::abs2(fnorm1 / m_fnorm); + + /* compute the scaled predicted reduction and */ + /* the scaled directional derivative. */ + m_wa3 = m_rfactor.template triangularView() * (m_permutation.inverse() *m_wa1); + temp1 = numext::abs2(m_wa3.stableNorm() / m_fnorm); + temp2 = numext::abs2(sqrt(m_par) * pnorm / m_fnorm); + prered = temp1 + temp2 / Scalar(.5); + dirder = -(temp1 + temp2); + + /* compute the ratio of the actual to the predicted */ + /* reduction. */ + ratio = 0.; + if (prered != 0.) + ratio = actred / prered; + + /* update the step bound. */ + if (ratio <= Scalar(.25)) { + if (actred >= 0.) + temp = RealScalar(.5); + if (actred < 0.) + temp = RealScalar(.5) * dirder / (dirder + RealScalar(.5) * actred); + if (RealScalar(.1) * fnorm1 >= m_fnorm || temp < RealScalar(.1)) + temp = Scalar(.1); + /* Computing MIN */ + m_delta = temp * (std::min)(m_delta, pnorm / RealScalar(.1)); + m_par /= temp; + } else if (!(m_par != 0. && ratio < RealScalar(.75))) { + m_delta = pnorm / RealScalar(.5); + m_par = RealScalar(.5) * m_par; + } + + /* test for successful iteration. */ + if (ratio >= RealScalar(1e-4)) { + /* successful iteration. update x, m_fvec, and their norms. */ + x = m_wa2; + m_wa2 = m_diag.cwiseProduct(x); + m_fvec = m_wa4; + xnorm = m_wa2.stableNorm(); + m_fnorm = fnorm1; + ++m_iter; + } + + /* tests for convergence. */ + if (abs(actred) <= m_ftol && prered <= m_ftol && Scalar(.5) * ratio <= 1. && m_delta <= m_xtol * xnorm) + { + m_info = Success; + return LevenbergMarquardtSpace::RelativeErrorAndReductionTooSmall; + } + if (abs(actred) <= m_ftol && prered <= m_ftol && Scalar(.5) * ratio <= 1.) 
+ { + m_info = Success; + return LevenbergMarquardtSpace::RelativeReductionTooSmall; + } + if (m_delta <= m_xtol * xnorm) + { + m_info = Success; + return LevenbergMarquardtSpace::RelativeErrorTooSmall; + } + + /* tests for termination and stringent tolerances. */ + if (m_nfev >= m_maxfev) + { + m_info = NoConvergence; + return LevenbergMarquardtSpace::TooManyFunctionEvaluation; + } + if (abs(actred) <= NumTraits::epsilon() && prered <= NumTraits::epsilon() && Scalar(.5) * ratio <= 1.) + { + m_info = Success; + return LevenbergMarquardtSpace::FtolTooSmall; + } + if (m_delta <= NumTraits::epsilon() * xnorm) + { + m_info = Success; + return LevenbergMarquardtSpace::XtolTooSmall; + } + if (m_gnorm <= NumTraits::epsilon()) + { + m_info = Success; + return LevenbergMarquardtSpace::GtolTooSmall; + } + + } while (ratio < Scalar(1e-4)); + + return LevenbergMarquardtSpace::Running; +} + + +} // end namespace Eigen + +#endif // EIGEN_LMONESTEP_H diff --git a/src/EigenUnsupported/src/LevenbergMarquardt/LMpar.h b/src/EigenUnsupported/src/LevenbergMarquardt/LMpar.h new file mode 100644 index 0000000..9a48365 --- /dev/null +++ b/src/EigenUnsupported/src/LevenbergMarquardt/LMpar.h @@ -0,0 +1,160 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This code initially comes from MINPACK whose original authors are: +// Copyright Jorge More - Argonne National Laboratory +// Copyright Burt Garbow - Argonne National Laboratory +// Copyright Ken Hillstrom - Argonne National Laboratory +// +// This Source Code Form is subject to the terms of the Minpack license +// (a BSD-like license) described in the campaigned CopyrightMINPACK.txt file. 
+ +#ifndef EIGEN_LMPAR_H +#define EIGEN_LMPAR_H + +namespace Eigen { + +namespace internal { + + template + void lmpar2( + const QRSolver &qr, + const VectorType &diag, + const VectorType &qtb, + typename VectorType::Scalar m_delta, + typename VectorType::Scalar &par, + VectorType &x) + + { + using std::sqrt; + using std::abs; + typedef typename QRSolver::MatrixType MatrixType; + typedef typename QRSolver::Scalar Scalar; +// typedef typename QRSolver::StorageIndex StorageIndex; + + /* Local variables */ + Index j; + Scalar fp; + Scalar parc, parl; + Index iter; + Scalar temp, paru; + Scalar gnorm; + Scalar dxnorm; + + // Make a copy of the triangular factor. + // This copy is modified during call the qrsolv + MatrixType s; + s = qr.matrixR(); + + /* Function Body */ + const Scalar dwarf = (std::numeric_limits::min)(); + const Index n = qr.matrixR().cols(); + eigen_assert(n==diag.size()); + eigen_assert(n==qtb.size()); + + VectorType wa1, wa2; + + /* compute and store in x the gauss-newton direction. if the */ + /* jacobian is rank-deficient, obtain a least squares solution. */ + + // const Index rank = qr.nonzeroPivots(); // exactly double(0.) + const Index rank = qr.rank(); // use a threshold + wa1 = qtb; + wa1.tail(n-rank).setZero(); + //FIXME There is no solve in place for sparse triangularView + wa1.head(rank) = s.topLeftCorner(rank,rank).template triangularView().solve(qtb.head(rank)); + + x = qr.colsPermutation()*wa1; + + /* initialize the iteration counter. */ + /* evaluate the function at the origin, and test */ + /* for acceptance of the gauss-newton direction. */ + iter = 0; + wa2 = diag.cwiseProduct(x); + dxnorm = wa2.blueNorm(); + fp = dxnorm - m_delta; + if (fp <= Scalar(0.1) * m_delta) { + par = 0; + return; + } + + /* if the jacobian is not rank deficient, the newton */ + /* step provides a lower bound, parl, for the zero of */ + /* the function. otherwise set this bound to zero. 
*/ + parl = 0.; + if (rank==n) { + wa1 = qr.colsPermutation().inverse() * diag.cwiseProduct(wa2)/dxnorm; + s.topLeftCorner(n,n).transpose().template triangularView().solveInPlace(wa1); + temp = wa1.blueNorm(); + parl = fp / m_delta / temp / temp; + } + + /* calculate an upper bound, paru, for the zero of the function. */ + for (j = 0; j < n; ++j) + wa1[j] = s.col(j).head(j+1).dot(qtb.head(j+1)) / diag[qr.colsPermutation().indices()(j)]; + + gnorm = wa1.stableNorm(); + paru = gnorm / m_delta; + if (paru == 0.) + paru = dwarf / (std::min)(m_delta,Scalar(0.1)); + + /* if the input par lies outside of the interval (parl,paru), */ + /* set par to the closer endpoint. */ + par = (std::max)(par,parl); + par = (std::min)(par,paru); + if (par == 0.) + par = gnorm / dxnorm; + + /* beginning of an iteration. */ + while (true) { + ++iter; + + /* evaluate the function at the current value of par. */ + if (par == 0.) + par = (std::max)(dwarf,Scalar(.001) * paru); /* Computing MAX */ + wa1 = sqrt(par)* diag; + + VectorType sdiag(n); + lmqrsolv(s, qr.colsPermutation(), wa1, qtb, x, sdiag); + + wa2 = diag.cwiseProduct(x); + dxnorm = wa2.blueNorm(); + temp = fp; + fp = dxnorm - m_delta; + + /* if the function is small enough, accept the current value */ + /* of par. also test for the exceptional cases where parl */ + /* is zero or the number of iterations has reached 10. */ + if (abs(fp) <= Scalar(0.1) * m_delta || (parl == 0. && fp <= temp && temp < 0.) || iter == 10) + break; + + /* compute the newton correction. */ + wa1 = qr.colsPermutation().inverse() * diag.cwiseProduct(wa2/dxnorm); + // we could almost use this here, but the diagonal is outside qr, in sdiag[] + for (j = 0; j < n; ++j) { + wa1[j] /= sdiag[j]; + temp = wa1[j]; + for (Index i = j+1; i < n; ++i) + wa1[i] -= s.coeff(i,j) * temp; + } + temp = wa1.blueNorm(); + parc = fp / m_delta / temp / temp; + + /* depending on the sign of the function, update parl or paru. */ + if (fp > 0.) 
+ parl = (std::max)(parl,par); + if (fp < 0.) + paru = (std::min)(paru,par); + + /* compute an improved estimate for par. */ + par = (std::max)(parl,par+parc); + } + if (iter == 0) + par = 0.; + return; + } +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_LMPAR_H diff --git a/src/EigenUnsupported/src/LevenbergMarquardt/LMqrsolv.h b/src/EigenUnsupported/src/LevenbergMarquardt/LMqrsolv.h new file mode 100644 index 0000000..1234858 --- /dev/null +++ b/src/EigenUnsupported/src/LevenbergMarquardt/LMqrsolv.h @@ -0,0 +1,188 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009 Thomas Capricelli +// Copyright (C) 2012 Desire Nuentsa +// +// This code initially comes from MINPACK whose original authors are: +// Copyright Jorge More - Argonne National Laboratory +// Copyright Burt Garbow - Argonne National Laboratory +// Copyright Ken Hillstrom - Argonne National Laboratory +// +// This Source Code Form is subject to the terms of the Minpack license +// (a BSD-like license) described in the campaigned CopyrightMINPACK.txt file. + +#ifndef EIGEN_LMQRSOLV_H +#define EIGEN_LMQRSOLV_H + +namespace Eigen { + +namespace internal { + +template +void lmqrsolv( + Matrix &s, + const PermutationMatrix &iPerm, + const Matrix &diag, + const Matrix &qtb, + Matrix &x, + Matrix &sdiag) +{ + /* Local variables */ + Index i, j, k; + Scalar temp; + Index n = s.cols(); + Matrix wa(n); + JacobiRotation givens; + + /* Function Body */ + // the following will only change the lower triangular part of s, including + // the diagonal, though the diagonal is restored afterward + + /* copy r and (q transpose)*b to preserve input and initialize s. */ + /* in particular, save the diagonal elements of r in x. */ + x = s.diagonal(); + wa = qtb; + + + s.topLeftCorner(n,n).template triangularView() = s.topLeftCorner(n,n).transpose(); + /* eliminate the diagonal matrix d using a givens rotation. 
*/ + for (j = 0; j < n; ++j) { + + /* prepare the row of d to be eliminated, locating the */ + /* diagonal element using p from the qr factorization. */ + const PermIndex l = iPerm.indices()(j); + if (diag[l] == 0.) + break; + sdiag.tail(n-j).setZero(); + sdiag[j] = diag[l]; + + /* the transformations to eliminate the row of d */ + /* modify only a single element of (q transpose)*b */ + /* beyond the first n, which is initially zero. */ + Scalar qtbpj = 0.; + for (k = j; k < n; ++k) { + /* determine a givens rotation which eliminates the */ + /* appropriate element in the current row of d. */ + givens.makeGivens(-s(k,k), sdiag[k]); + + /* compute the modified diagonal element of r and */ + /* the modified element of ((q transpose)*b,0). */ + s(k,k) = givens.c() * s(k,k) + givens.s() * sdiag[k]; + temp = givens.c() * wa[k] + givens.s() * qtbpj; + qtbpj = -givens.s() * wa[k] + givens.c() * qtbpj; + wa[k] = temp; + + /* accumulate the transformation in the row of s. */ + for (i = k+1; i().solveInPlace(wa.head(nsing)); + + // restore + sdiag = s.diagonal(); + s.diagonal() = x; + + /* permute the components of z back to components of x. */ + x = iPerm * wa; +} + +template +void lmqrsolv( + SparseMatrix &s, + const PermutationMatrix &iPerm, + const Matrix &diag, + const Matrix &qtb, + Matrix &x, + Matrix &sdiag) +{ + /* Local variables */ + typedef SparseMatrix FactorType; + Index i, j, k, l; + Scalar temp; + Index n = s.cols(); + Matrix wa(n); + JacobiRotation givens; + + /* Function Body */ + // the following will only change the lower triangular part of s, including + // the diagonal, though the diagonal is restored afterward + + /* copy r and (q transpose)*b to preserve input and initialize R. 
*/ + wa = qtb; + FactorType R(s); + // Eliminate the diagonal matrix d using a givens rotation + for (j = 0; j < n; ++j) + { + // Prepare the row of d to be eliminated, locating the + // diagonal element using p from the qr factorization + l = iPerm.indices()(j); + if (diag(l) == Scalar(0)) + break; + sdiag.tail(n-j).setZero(); + sdiag[j] = diag[l]; + // the transformations to eliminate the row of d + // modify only a single element of (q transpose)*b + // beyond the first n, which is initially zero. + + Scalar qtbpj = 0; + // Browse the nonzero elements of row j of the upper triangular s + for (k = j; k < n; ++k) + { + typename FactorType::InnerIterator itk(R,k); + for (; itk; ++itk){ + if (itk.index() < k) continue; + else break; + } + //At this point, we have the diagonal element R(k,k) + // Determine a givens rotation which eliminates + // the appropriate element in the current row of d + givens.makeGivens(-itk.value(), sdiag(k)); + + // Compute the modified diagonal element of r and + // the modified element of ((q transpose)*b,0). + itk.valueRef() = givens.c() * itk.value() + givens.s() * sdiag(k); + temp = givens.c() * wa(k) + givens.s() * qtbpj; + qtbpj = -givens.s() * wa(k) + givens.c() * qtbpj; + wa(k) = temp; + + // Accumulate the transformation in the remaining k row/column of R + for (++itk; itk; ++itk) + { + i = itk.index(); + temp = givens.c() * itk.value() + givens.s() * sdiag(i); + sdiag(i) = -givens.s() * itk.value() + givens.c() * sdiag(i); + itk.valueRef() = temp; + } + } + } + + // Solve the triangular system for z. 
If the system is + // singular, then obtain a least squares solution + Index nsing; + for(nsing = 0; nsing().solve/*InPlace*/(wa.head(nsing)); + + sdiag = R.diagonal(); + // Permute the components of z back to components of x + x = iPerm * wa; +} +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_LMQRSOLV_H diff --git a/src/EigenUnsupported/src/LevenbergMarquardt/LevenbergMarquardt.h b/src/EigenUnsupported/src/LevenbergMarquardt/LevenbergMarquardt.h new file mode 100644 index 0000000..62561da --- /dev/null +++ b/src/EigenUnsupported/src/LevenbergMarquardt/LevenbergMarquardt.h @@ -0,0 +1,396 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009 Thomas Capricelli +// Copyright (C) 2012 Desire Nuentsa +// +// The algorithm of this class initially comes from MINPACK whose original authors are: +// Copyright Jorge More - Argonne National Laboratory +// Copyright Burt Garbow - Argonne National Laboratory +// Copyright Ken Hillstrom - Argonne National Laboratory +// +// This Source Code Form is subject to the terms of the Minpack license +// (a BSD-like license) described in the campaigned CopyrightMINPACK.txt file. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_LEVENBERGMARQUARDT_H +#define EIGEN_LEVENBERGMARQUARDT_H + + +namespace Eigen { +namespace LevenbergMarquardtSpace { + enum Status { + NotStarted = -2, + Running = -1, + ImproperInputParameters = 0, + RelativeReductionTooSmall = 1, + RelativeErrorTooSmall = 2, + RelativeErrorAndReductionTooSmall = 3, + CosinusTooSmall = 4, + TooManyFunctionEvaluation = 5, + FtolTooSmall = 6, + XtolTooSmall = 7, + GtolTooSmall = 8, + UserAsked = 9 + }; +} + +template +struct DenseFunctor +{ + typedef _Scalar Scalar; + enum { + InputsAtCompileTime = NX, + ValuesAtCompileTime = NY + }; + typedef Matrix InputType; + typedef Matrix ValueType; + typedef Matrix JacobianType; + typedef ColPivHouseholderQR QRSolver; + const int m_inputs, m_values; + + DenseFunctor() : m_inputs(InputsAtCompileTime), m_values(ValuesAtCompileTime) {} + DenseFunctor(int inputs, int values) : m_inputs(inputs), m_values(values) {} + + int inputs() const { return m_inputs; } + int values() const { return m_values; } + + //int operator()(const InputType &x, ValueType& fvec) { } + // should be defined in derived classes + + //int df(const InputType &x, JacobianType& fjac) { } + // should be defined in derived classes +}; + +template +struct SparseFunctor +{ + typedef _Scalar Scalar; + typedef _Index Index; + typedef Matrix InputType; + typedef Matrix ValueType; + typedef SparseMatrix JacobianType; + typedef SparseQR > QRSolver; + enum { + InputsAtCompileTime = Dynamic, + ValuesAtCompileTime = Dynamic + }; + + SparseFunctor(int inputs, int values) : m_inputs(inputs), m_values(values) {} + + int inputs() const { return m_inputs; } + int values() const { return m_values; } + + const int m_inputs, m_values; + //int operator()(const InputType &x, ValueType& fvec) { } + // to be defined in the functor + + //int df(const InputType &x, JacobianType& fjac) { } + // to be defined in the functor if no automatic differentiation + +}; +namespace internal { +template +void lmpar2(const QRSolver &qr, const 
VectorType &diag, const VectorType &qtb, + typename VectorType::Scalar m_delta, typename VectorType::Scalar &par, + VectorType &x); + } +/** + * \ingroup NonLinearOptimization_Module + * \brief Performs non linear optimization over a non-linear function, + * using a variant of the Levenberg Marquardt algorithm. + * + * Check wikipedia for more information. + * http://en.wikipedia.org/wiki/Levenberg%E2%80%93Marquardt_algorithm + */ +template +class LevenbergMarquardt : internal::no_assignment_operator +{ + public: + typedef _FunctorType FunctorType; + typedef typename FunctorType::QRSolver QRSolver; + typedef typename FunctorType::JacobianType JacobianType; + typedef typename JacobianType::Scalar Scalar; + typedef typename JacobianType::RealScalar RealScalar; + typedef typename QRSolver::StorageIndex PermIndex; + typedef Matrix FVectorType; + typedef PermutationMatrix PermutationType; + public: + LevenbergMarquardt(FunctorType& functor) + : m_functor(functor),m_nfev(0),m_njev(0),m_fnorm(0.0),m_gnorm(0), + m_isInitialized(false),m_info(InvalidInput) + { + resetParameters(); + m_useExternalScaling=false; + } + + LevenbergMarquardtSpace::Status minimize(FVectorType &x); + LevenbergMarquardtSpace::Status minimizeInit(FVectorType &x); + LevenbergMarquardtSpace::Status minimizeOneStep(FVectorType &x); + LevenbergMarquardtSpace::Status lmder1( + FVectorType &x, + const Scalar tol = std::sqrt(NumTraits::epsilon()) + ); + static LevenbergMarquardtSpace::Status lmdif1( + FunctorType &functor, + FVectorType &x, + Index *nfev, + const Scalar tol = std::sqrt(NumTraits::epsilon()) + ); + + /** Sets the default parameters */ + void resetParameters() + { + using std::sqrt; + + m_factor = 100.; + m_maxfev = 400; + m_ftol = sqrt(NumTraits::epsilon()); + m_xtol = sqrt(NumTraits::epsilon()); + m_gtol = 0. ; + m_epsfcn = 0. 
; + } + + /** Sets the tolerance for the norm of the solution vector*/ + void setXtol(RealScalar xtol) { m_xtol = xtol; } + + /** Sets the tolerance for the norm of the vector function*/ + void setFtol(RealScalar ftol) { m_ftol = ftol; } + + /** Sets the tolerance for the norm of the gradient of the error vector*/ + void setGtol(RealScalar gtol) { m_gtol = gtol; } + + /** Sets the step bound for the diagonal shift */ + void setFactor(RealScalar factor) { m_factor = factor; } + + /** Sets the error precision */ + void setEpsilon (RealScalar epsfcn) { m_epsfcn = epsfcn; } + + /** Sets the maximum number of function evaluation */ + void setMaxfev(Index maxfev) {m_maxfev = maxfev; } + + /** Use an external Scaling. If set to true, pass a nonzero diagonal to diag() */ + void setExternalScaling(bool value) {m_useExternalScaling = value; } + + /** \returns the tolerance for the norm of the solution vector */ + RealScalar xtol() const {return m_xtol; } + + /** \returns the tolerance for the norm of the vector function */ + RealScalar ftol() const {return m_ftol; } + + /** \returns the tolerance for the norm of the gradient of the error vector */ + RealScalar gtol() const {return m_gtol; } + + /** \returns the step bound for the diagonal shift */ + RealScalar factor() const {return m_factor; } + + /** \returns the error precision */ + RealScalar epsilon() const {return m_epsfcn; } + + /** \returns the maximum number of function evaluation */ + Index maxfev() const {return m_maxfev; } + + /** \returns a reference to the diagonal of the jacobian */ + FVectorType& diag() {return m_diag; } + + /** \returns the number of iterations performed */ + Index iterations() { return m_iter; } + + /** \returns the number of functions evaluation */ + Index nfev() { return m_nfev; } + + /** \returns the number of jacobian evaluation */ + Index njev() { return m_njev; } + + /** \returns the norm of current vector function */ + RealScalar fnorm() {return m_fnorm; } + + /** \returns the norm 
of the gradient of the error */ + RealScalar gnorm() {return m_gnorm; } + + /** \returns the LevenbergMarquardt parameter */ + RealScalar lm_param(void) { return m_par; } + + /** \returns a reference to the current vector function + */ + FVectorType& fvec() {return m_fvec; } + + /** \returns a reference to the matrix where the current Jacobian matrix is stored + */ + JacobianType& jacobian() {return m_fjac; } + + /** \returns a reference to the triangular matrix R from the QR of the jacobian matrix. + * \sa jacobian() + */ + JacobianType& matrixR() {return m_rfactor; } + + /** the permutation used in the QR factorization + */ + PermutationType permutation() {return m_permutation; } + + /** + * \brief Reports whether the minimization was successful + * \returns \c Success if the minimization was successful, + * \c NumericalIssue if a numerical problem arises during the + * minimization process, for example during the QR factorization + * \c NoConvergence if the minimization did not converge after + * the maximum number of function evaluation allowed + * \c InvalidInput if the input matrix is invalid + */ + ComputationInfo info() const + { + + return m_info; + } + private: + JacobianType m_fjac; + JacobianType m_rfactor; // The triangular matrix R from the QR of the jacobian matrix m_fjac + FunctorType &m_functor; + FVectorType m_fvec, m_qtf, m_diag; + Index n; + Index m; + Index m_nfev; + Index m_njev; + RealScalar m_fnorm; // Norm of the current vector function + RealScalar m_gnorm; //Norm of the gradient of the error + RealScalar m_factor; // + Index m_maxfev; // Maximum number of function evaluation + RealScalar m_ftol; //Tolerance in the norm of the vector function + RealScalar m_xtol; // + RealScalar m_gtol; //tolerance of the norm of the error gradient + RealScalar m_epsfcn; // + Index m_iter; // Number of iterations performed + RealScalar m_delta; + bool m_useExternalScaling; + PermutationType m_permutation; + FVectorType m_wa1, m_wa2, m_wa3, m_wa4; 
//Temporary vectors + RealScalar m_par; + bool m_isInitialized; // Check whether the minimization step has been called + ComputationInfo m_info; +}; + +template +LevenbergMarquardtSpace::Status +LevenbergMarquardt::minimize(FVectorType &x) +{ + LevenbergMarquardtSpace::Status status = minimizeInit(x); + if (status==LevenbergMarquardtSpace::ImproperInputParameters) { + m_isInitialized = true; + return status; + } + do { +// std::cout << " uv " << x.transpose() << "\n"; + status = minimizeOneStep(x); + } while (status==LevenbergMarquardtSpace::Running); + m_isInitialized = true; + return status; +} + +template +LevenbergMarquardtSpace::Status +LevenbergMarquardt::minimizeInit(FVectorType &x) +{ + n = x.size(); + m = m_functor.values(); + + m_wa1.resize(n); m_wa2.resize(n); m_wa3.resize(n); + m_wa4.resize(m); + m_fvec.resize(m); + //FIXME Sparse Case : Allocate space for the jacobian + m_fjac.resize(m, n); +// m_fjac.reserve(VectorXi::Constant(n,5)); // FIXME Find a better alternative + if (!m_useExternalScaling) + m_diag.resize(n); + eigen_assert( (!m_useExternalScaling || m_diag.size()==n) && "When m_useExternalScaling is set, the caller must provide a valid 'm_diag'"); + m_qtf.resize(n); + + /* Function Body */ + m_nfev = 0; + m_njev = 0; + + /* check the input parameters for errors. */ + if (n <= 0 || m < n || m_ftol < 0. || m_xtol < 0. || m_gtol < 0. || m_maxfev <= 0 || m_factor <= 0.){ + m_info = InvalidInput; + return LevenbergMarquardtSpace::ImproperInputParameters; + } + + if (m_useExternalScaling) + for (Index j = 0; j < n; ++j) + if (m_diag[j] <= 0.) + { + m_info = InvalidInput; + return LevenbergMarquardtSpace::ImproperInputParameters; + } + + /* evaluate the function at the starting point */ + /* and calculate its norm. */ + m_nfev = 1; + if ( m_functor(x, m_fvec) < 0) + return LevenbergMarquardtSpace::UserAsked; + m_fnorm = m_fvec.stableNorm(); + + /* initialize levenberg-marquardt parameter and iteration counter. 
*/ + m_par = 0.; + m_iter = 1; + + return LevenbergMarquardtSpace::NotStarted; +} + +template +LevenbergMarquardtSpace::Status +LevenbergMarquardt::lmder1( + FVectorType &x, + const Scalar tol + ) +{ + n = x.size(); + m = m_functor.values(); + + /* check the input parameters for errors. */ + if (n <= 0 || m < n || tol < 0.) + return LevenbergMarquardtSpace::ImproperInputParameters; + + resetParameters(); + m_ftol = tol; + m_xtol = tol; + m_maxfev = 100*(n+1); + + return minimize(x); +} + + +template +LevenbergMarquardtSpace::Status +LevenbergMarquardt::lmdif1( + FunctorType &functor, + FVectorType &x, + Index *nfev, + const Scalar tol + ) +{ + Index n = x.size(); + Index m = functor.values(); + + /* check the input parameters for errors. */ + if (n <= 0 || m < n || tol < 0.) + return LevenbergMarquardtSpace::ImproperInputParameters; + + NumericalDiff numDiff(functor); + // embedded LevenbergMarquardt + LevenbergMarquardt > lm(numDiff); + lm.setFtol(tol); + lm.setXtol(tol); + lm.setMaxfev(200*(n+1)); + + LevenbergMarquardtSpace::Status info = LevenbergMarquardtSpace::Status(lm.minimize(x)); + if (nfev) + * nfev = lm.nfev(); + return info; +} + +} // end namespace Eigen + +#endif // EIGEN_LEVENBERGMARQUARDT_H diff --git a/src/EigenUnsupported/src/MatrixFunctions/MatrixExponential.h b/src/EigenUnsupported/src/MatrixFunctions/MatrixExponential.h new file mode 100644 index 0000000..02284b0 --- /dev/null +++ b/src/EigenUnsupported/src/MatrixFunctions/MatrixExponential.h @@ -0,0 +1,441 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009, 2010, 2013 Jitse Niesen +// Copyright (C) 2011, 2013 Chen-Pang He +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_MATRIX_EXPONENTIAL +#define EIGEN_MATRIX_EXPONENTIAL + +#include "StemFunction.h" + +namespace Eigen { +namespace internal { + +/** \brief Scaling operator. + * + * This struct is used by CwiseUnaryOp to scale a matrix by \f$ 2^{-s} \f$. + */ +template +struct MatrixExponentialScalingOp +{ + /** \brief Constructor. + * + * \param[in] squarings The integer \f$ s \f$ in this document. + */ + MatrixExponentialScalingOp(int squarings) : m_squarings(squarings) { } + + + /** \brief Scale a matrix coefficient. + * + * \param[in,out] x The scalar to be scaled, becoming \f$ 2^{-s} x \f$. + */ + inline const RealScalar operator() (const RealScalar& x) const + { + using std::ldexp; + return ldexp(x, -m_squarings); + } + + typedef std::complex ComplexScalar; + + /** \brief Scale a matrix coefficient. + * + * \param[in,out] x The scalar to be scaled, becoming \f$ 2^{-s} x \f$. + */ + inline const ComplexScalar operator() (const ComplexScalar& x) const + { + using std::ldexp; + return ComplexScalar(ldexp(x.real(), -m_squarings), ldexp(x.imag(), -m_squarings)); + } + + private: + int m_squarings; +}; + +/** \brief Compute the (3,3)-Padé approximant to the exponential. + * + * After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Padé + * approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$. + */ +template +void matrix_exp_pade3(const MatA& A, MatU& U, MatV& V) +{ + typedef typename MatA::PlainObject MatrixType; + typedef typename NumTraits::Scalar>::Real RealScalar; + const RealScalar b[] = {120.L, 60.L, 12.L, 1.L}; + const MatrixType A2 = A * A; + const MatrixType tmp = b[3] * A2 + b[1] * MatrixType::Identity(A.rows(), A.cols()); + U.noalias() = A * tmp; + V = b[2] * A2 + b[0] * MatrixType::Identity(A.rows(), A.cols()); +} + +/** \brief Compute the (5,5)-Padé approximant to the exponential. + * + * After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Padé + * approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$. 
+ */ +template +void matrix_exp_pade5(const MatA& A, MatU& U, MatV& V) +{ + typedef typename MatA::PlainObject MatrixType; + typedef typename NumTraits::Scalar>::Real RealScalar; + const RealScalar b[] = {30240.L, 15120.L, 3360.L, 420.L, 30.L, 1.L}; + const MatrixType A2 = A * A; + const MatrixType A4 = A2 * A2; + const MatrixType tmp = b[5] * A4 + b[3] * A2 + b[1] * MatrixType::Identity(A.rows(), A.cols()); + U.noalias() = A * tmp; + V = b[4] * A4 + b[2] * A2 + b[0] * MatrixType::Identity(A.rows(), A.cols()); +} + +/** \brief Compute the (7,7)-Padé approximant to the exponential. + * + * After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Padé + * approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$. + */ +template +void matrix_exp_pade7(const MatA& A, MatU& U, MatV& V) +{ + typedef typename MatA::PlainObject MatrixType; + typedef typename NumTraits::Scalar>::Real RealScalar; + const RealScalar b[] = {17297280.L, 8648640.L, 1995840.L, 277200.L, 25200.L, 1512.L, 56.L, 1.L}; + const MatrixType A2 = A * A; + const MatrixType A4 = A2 * A2; + const MatrixType A6 = A4 * A2; + const MatrixType tmp = b[7] * A6 + b[5] * A4 + b[3] * A2 + + b[1] * MatrixType::Identity(A.rows(), A.cols()); + U.noalias() = A * tmp; + V = b[6] * A6 + b[4] * A4 + b[2] * A2 + b[0] * MatrixType::Identity(A.rows(), A.cols()); + +} + +/** \brief Compute the (9,9)-Padé approximant to the exponential. + * + * After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Padé + * approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$. 
+ */ +template +void matrix_exp_pade9(const MatA& A, MatU& U, MatV& V) +{ + typedef typename MatA::PlainObject MatrixType; + typedef typename NumTraits::Scalar>::Real RealScalar; + const RealScalar b[] = {17643225600.L, 8821612800.L, 2075673600.L, 302702400.L, 30270240.L, + 2162160.L, 110880.L, 3960.L, 90.L, 1.L}; + const MatrixType A2 = A * A; + const MatrixType A4 = A2 * A2; + const MatrixType A6 = A4 * A2; + const MatrixType A8 = A6 * A2; + const MatrixType tmp = b[9] * A8 + b[7] * A6 + b[5] * A4 + b[3] * A2 + + b[1] * MatrixType::Identity(A.rows(), A.cols()); + U.noalias() = A * tmp; + V = b[8] * A8 + b[6] * A6 + b[4] * A4 + b[2] * A2 + b[0] * MatrixType::Identity(A.rows(), A.cols()); +} + +/** \brief Compute the (13,13)-Padé approximant to the exponential. + * + * After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Padé + * approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$. + */ +template +void matrix_exp_pade13(const MatA& A, MatU& U, MatV& V) +{ + typedef typename MatA::PlainObject MatrixType; + typedef typename NumTraits::Scalar>::Real RealScalar; + const RealScalar b[] = {64764752532480000.L, 32382376266240000.L, 7771770303897600.L, + 1187353796428800.L, 129060195264000.L, 10559470521600.L, 670442572800.L, + 33522128640.L, 1323241920.L, 40840800.L, 960960.L, 16380.L, 182.L, 1.L}; + const MatrixType A2 = A * A; + const MatrixType A4 = A2 * A2; + const MatrixType A6 = A4 * A2; + V = b[13] * A6 + b[11] * A4 + b[9] * A2; // used for temporary storage + MatrixType tmp = A6 * V; + tmp += b[7] * A6 + b[5] * A4 + b[3] * A2 + b[1] * MatrixType::Identity(A.rows(), A.cols()); + U.noalias() = A * tmp; + tmp = b[12] * A6 + b[10] * A4 + b[8] * A2; + V.noalias() = A6 * tmp; + V += b[6] * A6 + b[4] * A4 + b[2] * A2 + b[0] * MatrixType::Identity(A.rows(), A.cols()); +} + +/** \brief Compute the (17,17)-Padé approximant to the exponential. + * + * After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Padé + * approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$. 
+ * + * This function activates only if your long double is double-double or quadruple. + */ +#if LDBL_MANT_DIG > 64 +template +void matrix_exp_pade17(const MatA& A, MatU& U, MatV& V) +{ + typedef typename MatA::PlainObject MatrixType; + typedef typename NumTraits::Scalar>::Real RealScalar; + const RealScalar b[] = {830034394580628357120000.L, 415017197290314178560000.L, + 100610229646136770560000.L, 15720348382208870400000.L, + 1774878043152614400000.L, 153822763739893248000.L, 10608466464820224000.L, + 595373117923584000.L, 27563570274240000.L, 1060137318240000.L, + 33924394183680.L, 899510451840.L, 19554575040.L, 341863200.L, 4651200.L, + 46512.L, 306.L, 1.L}; + const MatrixType A2 = A * A; + const MatrixType A4 = A2 * A2; + const MatrixType A6 = A4 * A2; + const MatrixType A8 = A4 * A4; + V = b[17] * A8 + b[15] * A6 + b[13] * A4 + b[11] * A2; // used for temporary storage + MatrixType tmp = A8 * V; + tmp += b[9] * A8 + b[7] * A6 + b[5] * A4 + b[3] * A2 + + b[1] * MatrixType::Identity(A.rows(), A.cols()); + U.noalias() = A * tmp; + tmp = b[16] * A8 + b[14] * A6 + b[12] * A4 + b[10] * A2; + V.noalias() = tmp * A8; + V += b[8] * A8 + b[6] * A6 + b[4] * A4 + b[2] * A2 + + b[0] * MatrixType::Identity(A.rows(), A.cols()); +} +#endif + +template ::Scalar>::Real> +struct matrix_exp_computeUV +{ + /** \brief Compute Padé approximant to the exponential. + * + * Computes \c U, \c V and \c squarings such that \f$ (V+U)(V-U)^{-1} \f$ is a Padé + * approximant of \f$ \exp(2^{-\mbox{squarings}}M) \f$ around \f$ M = 0 \f$, where \f$ M \f$ + * denotes the matrix \c arg. The degree of the Padé approximant and the value of squarings + * are chosen such that the approximation error is no more than the round-off error. 
+ */ + static void run(const MatrixType& arg, MatrixType& U, MatrixType& V, int& squarings); +}; + +template +struct matrix_exp_computeUV +{ + template + static void run(const ArgType& arg, MatrixType& U, MatrixType& V, int& squarings) + { + using std::frexp; + using std::pow; + const float l1norm = arg.cwiseAbs().colwise().sum().maxCoeff(); + squarings = 0; + if (l1norm < 4.258730016922831e-001f) { + matrix_exp_pade3(arg, U, V); + } else if (l1norm < 1.880152677804762e+000f) { + matrix_exp_pade5(arg, U, V); + } else { + const float maxnorm = 3.925724783138660f; + frexp(l1norm / maxnorm, &squarings); + if (squarings < 0) squarings = 0; + MatrixType A = arg.unaryExpr(MatrixExponentialScalingOp(squarings)); + matrix_exp_pade7(A, U, V); + } + } +}; + +template +struct matrix_exp_computeUV +{ + typedef typename NumTraits::Scalar>::Real RealScalar; + template + static void run(const ArgType& arg, MatrixType& U, MatrixType& V, int& squarings) + { + using std::frexp; + using std::pow; + const RealScalar l1norm = arg.cwiseAbs().colwise().sum().maxCoeff(); + squarings = 0; + if (l1norm < 1.495585217958292e-002) { + matrix_exp_pade3(arg, U, V); + } else if (l1norm < 2.539398330063230e-001) { + matrix_exp_pade5(arg, U, V); + } else if (l1norm < 9.504178996162932e-001) { + matrix_exp_pade7(arg, U, V); + } else if (l1norm < 2.097847961257068e+000) { + matrix_exp_pade9(arg, U, V); + } else { + const RealScalar maxnorm = 5.371920351148152; + frexp(l1norm / maxnorm, &squarings); + if (squarings < 0) squarings = 0; + MatrixType A = arg.unaryExpr(MatrixExponentialScalingOp(squarings)); + matrix_exp_pade13(A, U, V); + } + } +}; + +template +struct matrix_exp_computeUV +{ + template + static void run(const ArgType& arg, MatrixType& U, MatrixType& V, int& squarings) + { +#if LDBL_MANT_DIG == 53 // double precision + matrix_exp_computeUV::run(arg, U, V, squarings); + +#else + + using std::frexp; + using std::pow; + const long double l1norm = arg.cwiseAbs().colwise().sum().maxCoeff(); + 
squarings = 0; + +#if LDBL_MANT_DIG <= 64 // extended precision + + if (l1norm < 4.1968497232266989671e-003L) { + matrix_exp_pade3(arg, U, V); + } else if (l1norm < 1.1848116734693823091e-001L) { + matrix_exp_pade5(arg, U, V); + } else if (l1norm < 5.5170388480686700274e-001L) { + matrix_exp_pade7(arg, U, V); + } else if (l1norm < 1.3759868875587845383e+000L) { + matrix_exp_pade9(arg, U, V); + } else { + const long double maxnorm = 4.0246098906697353063L; + frexp(l1norm / maxnorm, &squarings); + if (squarings < 0) squarings = 0; + MatrixType A = arg.unaryExpr(MatrixExponentialScalingOp(squarings)); + matrix_exp_pade13(A, U, V); + } + +#elif LDBL_MANT_DIG <= 106 // double-double + + if (l1norm < 3.2787892205607026992947488108213e-005L) { + matrix_exp_pade3(arg, U, V); + } else if (l1norm < 6.4467025060072760084130906076332e-003L) { + matrix_exp_pade5(arg, U, V); + } else if (l1norm < 6.8988028496595374751374122881143e-002L) { + matrix_exp_pade7(arg, U, V); + } else if (l1norm < 2.7339737518502231741495857201670e-001L) { + matrix_exp_pade9(arg, U, V); + } else if (l1norm < 1.3203382096514474905666448850278e+000L) { + matrix_exp_pade13(arg, U, V); + } else { + const long double maxnorm = 3.2579440895405400856599663723517L; + frexp(l1norm / maxnorm, &squarings); + if (squarings < 0) squarings = 0; + MatrixType A = arg.unaryExpr(MatrixExponentialScalingOp(squarings)); + matrix_exp_pade17(A, U, V); + } + +#elif LDBL_MANT_DIG <= 113 // quadruple precision + + if (l1norm < 1.639394610288918690547467954466970e-005L) { + matrix_exp_pade3(arg, U, V); + } else if (l1norm < 4.253237712165275566025884344433009e-003L) { + matrix_exp_pade5(arg, U, V); + } else if (l1norm < 5.125804063165764409885122032933142e-002L) { + matrix_exp_pade7(arg, U, V); + } else if (l1norm < 2.170000765161155195453205651889853e-001L) { + matrix_exp_pade9(arg, U, V); + } else if (l1norm < 1.125358383453143065081397882891878e+000L) { + matrix_exp_pade13(arg, U, V); + } else { + const long double maxnorm = 
2.884233277829519311757165057717815L; + frexp(l1norm / maxnorm, &squarings); + if (squarings < 0) squarings = 0; + MatrixType A = arg.unaryExpr(MatrixExponentialScalingOp(squarings)); + matrix_exp_pade17(A, U, V); + } + +#else + + // this case should be handled in compute() + eigen_assert(false && "Bug in MatrixExponential"); + +#endif +#endif // LDBL_MANT_DIG + } +}; + +template struct is_exp_known_type : false_type {}; +template<> struct is_exp_known_type : true_type {}; +template<> struct is_exp_known_type : true_type {}; +#if LDBL_MANT_DIG <= 113 +template<> struct is_exp_known_type : true_type {}; +#endif + +template +void matrix_exp_compute(const ArgType& arg, ResultType &result, true_type) // natively supported scalar type +{ + typedef typename ArgType::PlainObject MatrixType; + MatrixType U, V; + int squarings; + matrix_exp_computeUV::run(arg, U, V, squarings); // Pade approximant is (U+V) / (-U+V) + MatrixType numer = U + V; + MatrixType denom = -U + V; + result = denom.partialPivLu().solve(numer); + for (int i=0; i +void matrix_exp_compute(const ArgType& arg, ResultType &result, false_type) // default +{ + typedef typename ArgType::PlainObject MatrixType; + typedef typename traits::Scalar Scalar; + typedef typename NumTraits::Real RealScalar; + typedef typename std::complex ComplexScalar; + result = arg.matrixFunction(internal::stem_function_exp); +} + +} // end namespace Eigen::internal + +/** \ingroup MatrixFunctions_Module + * + * \brief Proxy for the matrix exponential of some matrix (expression). + * + * \tparam Derived Type of the argument to the matrix exponential. + * + * This class holds the argument to the matrix exponential until it is assigned or evaluated for + * some other reason (so the argument should not be changed in the meantime). It is the return type + * of MatrixBase::exp() and most of the time this is the only way it is used. 
+ */ +template struct MatrixExponentialReturnValue +: public ReturnByValue > +{ + public: + /** \brief Constructor. + * + * \param src %Matrix (expression) forming the argument of the matrix exponential. + */ + MatrixExponentialReturnValue(const Derived& src) : m_src(src) { } + + /** \brief Compute the matrix exponential. + * + * \param result the matrix exponential of \p src in the constructor. + */ + template + inline void evalTo(ResultType& result) const + { + const typename internal::nested_eval::type tmp(m_src); + internal::matrix_exp_compute(tmp, result, internal::is_exp_known_type()); + } + + Index rows() const { return m_src.rows(); } + Index cols() const { return m_src.cols(); } + + protected: + const typename internal::ref_selector::type m_src; +}; + +namespace internal { +template +struct traits > +{ + typedef typename Derived::PlainObject ReturnType; +}; +} + +template +const MatrixExponentialReturnValue MatrixBase::exp() const +{ + eigen_assert(rows() == cols()); + return MatrixExponentialReturnValue(derived()); +} + +} // end namespace Eigen + +#endif // EIGEN_MATRIX_EXPONENTIAL diff --git a/src/EigenUnsupported/src/MatrixFunctions/MatrixFunction.h b/src/EigenUnsupported/src/MatrixFunctions/MatrixFunction.h new file mode 100644 index 0000000..cc12ab6 --- /dev/null +++ b/src/EigenUnsupported/src/MatrixFunctions/MatrixFunction.h @@ -0,0 +1,569 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009-2011, 2013 Jitse Niesen +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_MATRIX_FUNCTION_H +#define EIGEN_MATRIX_FUNCTION_H + +#include "StemFunction.h" + + +namespace Eigen { + +namespace internal { + +/** \brief Maximum distance allowed between eigenvalues to be considered "close". 
*/ +static const float matrix_function_separation = 0.1f; + +/** \ingroup MatrixFunctions_Module + * \class MatrixFunctionAtomic + * \brief Helper class for computing matrix functions of atomic matrices. + * + * Here, an atomic matrix is a triangular matrix whose diagonal entries are close to each other. + */ +template +class MatrixFunctionAtomic +{ + public: + + typedef typename MatrixType::Scalar Scalar; + typedef typename stem_function::type StemFunction; + + /** \brief Constructor + * \param[in] f matrix function to compute. + */ + MatrixFunctionAtomic(StemFunction f) : m_f(f) { } + + /** \brief Compute matrix function of atomic matrix + * \param[in] A argument of matrix function, should be upper triangular and atomic + * \returns f(A), the matrix function evaluated at the given matrix + */ + MatrixType compute(const MatrixType& A); + + private: + StemFunction* m_f; +}; + +template +typename NumTraits::Real matrix_function_compute_mu(const MatrixType& A) +{ + typedef typename plain_col_type::type VectorType; + Index rows = A.rows(); + const MatrixType N = MatrixType::Identity(rows, rows) - A; + VectorType e = VectorType::Ones(rows); + N.template triangularView().solveInPlace(e); + return e.cwiseAbs().maxCoeff(); +} + +template +MatrixType MatrixFunctionAtomic::compute(const MatrixType& A) +{ + // TODO: Use that A is upper triangular + typedef typename NumTraits::Real RealScalar; + Index rows = A.rows(); + Scalar avgEival = A.trace() / Scalar(RealScalar(rows)); + MatrixType Ashifted = A - avgEival * MatrixType::Identity(rows, rows); + RealScalar mu = matrix_function_compute_mu(Ashifted); + MatrixType F = m_f(avgEival, 0) * MatrixType::Identity(rows, rows); + MatrixType P = Ashifted; + MatrixType Fincr; + for (Index s = 1; double(s) < 1.1 * double(rows) + 10.0; s++) { // upper limit is fairly arbitrary + Fincr = m_f(avgEival, static_cast(s)) * P; + F += Fincr; + P = Scalar(RealScalar(1)/RealScalar(s + 1)) * P * Ashifted; + + // test whether Taylor series 
converged + const RealScalar F_norm = F.cwiseAbs().rowwise().sum().maxCoeff(); + const RealScalar Fincr_norm = Fincr.cwiseAbs().rowwise().sum().maxCoeff(); + if (Fincr_norm < NumTraits::epsilon() * F_norm) { + RealScalar delta = 0; + RealScalar rfactorial = 1; + for (Index r = 0; r < rows; r++) { + RealScalar mx = 0; + for (Index i = 0; i < rows; i++) + mx = (std::max)(mx, std::abs(m_f(Ashifted(i, i) + avgEival, static_cast(s+r)))); + if (r != 0) + rfactorial *= RealScalar(r); + delta = (std::max)(delta, mx / rfactorial); + } + const RealScalar P_norm = P.cwiseAbs().rowwise().sum().maxCoeff(); + if (mu * delta * P_norm < NumTraits::epsilon() * F_norm) // series converged + break; + } + } + return F; +} + +/** \brief Find cluster in \p clusters containing some value + * \param[in] key Value to find + * \returns Iterator to cluster containing \p key, or \c clusters.end() if no cluster in \p clusters + * contains \p key. + */ +template +typename ListOfClusters::iterator matrix_function_find_cluster(Index key, ListOfClusters& clusters) +{ + typename std::list::iterator j; + for (typename ListOfClusters::iterator i = clusters.begin(); i != clusters.end(); ++i) { + j = std::find(i->begin(), i->end(), key); + if (j != i->end()) + return i; + } + return clusters.end(); +} + +/** \brief Partition eigenvalues into clusters of ei'vals close to each other + * + * \param[in] eivals Eigenvalues + * \param[out] clusters Resulting partition of eigenvalues + * + * The partition satisfies the following two properties: + * # Any eigenvalue in a certain cluster is at most matrix_function_separation away from another eigenvalue + * in the same cluster. + * # The distance between two eigenvalues in different clusters is more than matrix_function_separation. + * The implementation follows Algorithm 4.1 in the paper of Davies and Higham.
+ */ +template +void matrix_function_partition_eigenvalues(const EivalsType& eivals, std::list& clusters) +{ + typedef typename EivalsType::RealScalar RealScalar; + for (Index i=0; i::iterator qi = matrix_function_find_cluster(i, clusters); + if (qi == clusters.end()) { + Cluster l; + l.push_back(i); + clusters.push_back(l); + qi = clusters.end(); + --qi; + } + + // Look for other element to add to the set + for (Index j=i+1; jbegin(), qi->end(), j) == qi->end()) { + typename std::list::iterator qj = matrix_function_find_cluster(j, clusters); + if (qj == clusters.end()) { + qi->push_back(j); + } else { + qi->insert(qi->end(), qj->begin(), qj->end()); + clusters.erase(qj); + } + } + } + } +} + +/** \brief Compute size of each cluster given a partitioning */ +template +void matrix_function_compute_cluster_size(const ListOfClusters& clusters, Matrix& clusterSize) +{ + const Index numClusters = static_cast(clusters.size()); + clusterSize.setZero(numClusters); + Index clusterIndex = 0; + for (typename ListOfClusters::const_iterator cluster = clusters.begin(); cluster != clusters.end(); ++cluster) { + clusterSize[clusterIndex] = cluster->size(); + ++clusterIndex; + } +} + +/** \brief Compute start of each block using clusterSize */ +template +void matrix_function_compute_block_start(const VectorType& clusterSize, VectorType& blockStart) +{ + blockStart.resize(clusterSize.rows()); + blockStart(0) = 0; + for (Index i = 1; i < clusterSize.rows(); i++) { + blockStart(i) = blockStart(i-1) + clusterSize(i-1); + } +} + +/** \brief Compute mapping of eigenvalue indices to cluster indices */ +template +void matrix_function_compute_map(const EivalsType& eivals, const ListOfClusters& clusters, VectorType& eivalToCluster) +{ + eivalToCluster.resize(eivals.rows()); + Index clusterIndex = 0; + for (typename ListOfClusters::const_iterator cluster = clusters.begin(); cluster != clusters.end(); ++cluster) { + for (Index i = 0; i < eivals.rows(); ++i) { + if (std::find(cluster->begin(), 
cluster->end(), i) != cluster->end()) { + eivalToCluster[i] = clusterIndex; + } + } + ++clusterIndex; + } +} + +/** \brief Compute permutation which groups ei'vals in same cluster together */ +template +void matrix_function_compute_permutation(const DynVectorType& blockStart, const DynVectorType& eivalToCluster, VectorType& permutation) +{ + DynVectorType indexNextEntry = blockStart; + permutation.resize(eivalToCluster.rows()); + for (Index i = 0; i < eivalToCluster.rows(); i++) { + Index cluster = eivalToCluster[i]; + permutation[i] = indexNextEntry[cluster]; + ++indexNextEntry[cluster]; + } +} + +/** \brief Permute Schur decomposition in U and T according to permutation */ +template +void matrix_function_permute_schur(VectorType& permutation, MatrixType& U, MatrixType& T) +{ + for (Index i = 0; i < permutation.rows() - 1; i++) { + Index j; + for (j = i; j < permutation.rows(); j++) { + if (permutation(j) == i) break; + } + eigen_assert(permutation(j) == i); + for (Index k = j-1; k >= i; k--) { + JacobiRotation rotation; + rotation.makeGivens(T(k, k+1), T(k+1, k+1) - T(k, k)); + T.applyOnTheLeft(k, k+1, rotation.adjoint()); + T.applyOnTheRight(k, k+1, rotation); + U.applyOnTheRight(k, k+1, rotation); + std::swap(permutation.coeffRef(k), permutation.coeffRef(k+1)); + } + } +} + +/** \brief Compute block diagonal part of matrix function. + * + * This routine computes the matrix function applied to the block diagonal part of \p T (which should be + * upper triangular), with the blocking given by \p blockStart and \p clusterSize. The matrix function of + * each diagonal block is computed by \p atomic. The off-diagonal parts of \p fT are set to zero. 
+ */ +template +void matrix_function_compute_block_atomic(const MatrixType& T, AtomicType& atomic, const VectorType& blockStart, const VectorType& clusterSize, MatrixType& fT) +{ + fT.setZero(T.rows(), T.cols()); + for (Index i = 0; i < clusterSize.rows(); ++i) { + fT.block(blockStart(i), blockStart(i), clusterSize(i), clusterSize(i)) + = atomic.compute(T.block(blockStart(i), blockStart(i), clusterSize(i), clusterSize(i))); + } +} + +/** \brief Solve a triangular Sylvester equation AX + XB = C + * + * \param[in] A the matrix A; should be square and upper triangular + * \param[in] B the matrix B; should be square and upper triangular + * \param[in] C the matrix C; should have correct size. + * + * \returns the solution X. + * + * If A is m-by-m and B is n-by-n, then both C and X are m-by-n. The (i,j)-th component of the Sylvester + * equation is + * \f[ + * \sum_{k=i}^m A_{ik} X_{kj} + \sum_{k=1}^j X_{ik} B_{kj} = C_{ij}. + * \f] + * This can be re-arranged to yield: + * \f[ + * X_{ij} = \frac{1}{A_{ii} + B_{jj}} \Bigl( C_{ij} + * - \sum_{k=i+1}^m A_{ik} X_{kj} - \sum_{k=1}^{j-1} X_{ik} B_{kj} \Bigr). + * \f] + * It is assumed that A and B are such that the denominator is never zero (otherwise the Sylvester equation + * does not have a unique solution). In that case, these equations can be evaluated in the order + * \f$ i=m,\ldots,1 \f$ and \f$ j=1,\ldots,n \f$.
+ */ +template +MatrixType matrix_function_solve_triangular_sylvester(const MatrixType& A, const MatrixType& B, const MatrixType& C) +{ + eigen_assert(A.rows() == A.cols()); + eigen_assert(A.isUpperTriangular()); + eigen_assert(B.rows() == B.cols()); + eigen_assert(B.isUpperTriangular()); + eigen_assert(C.rows() == A.rows()); + eigen_assert(C.cols() == B.rows()); + + typedef typename MatrixType::Scalar Scalar; + + Index m = A.rows(); + Index n = B.rows(); + MatrixType X(m, n); + + for (Index i = m - 1; i >= 0; --i) { + for (Index j = 0; j < n; ++j) { + + // Compute AX = \sum_{k=i+1}^m A_{ik} X_{kj} + Scalar AX; + if (i == m - 1) { + AX = 0; + } else { + Matrix AXmatrix = A.row(i).tail(m-1-i) * X.col(j).tail(m-1-i); + AX = AXmatrix(0,0); + } + + // Compute XB = \sum_{k=1}^{j-1} X_{ik} B_{kj} + Scalar XB; + if (j == 0) { + XB = 0; + } else { + Matrix XBmatrix = X.row(i).head(j) * B.col(j).head(j); + XB = XBmatrix(0,0); + } + + X(i,j) = (C(i,j) - AX - XB) / (A(i,i) + B(j,j)); + } + } + return X; +} + +/** \brief Compute part of matrix function above block diagonal. + * + * This routine completes the computation of \p fT, denoting a matrix function applied to the triangular + * matrix \p T. It assumes that the block diagonal part of \p fT has already been computed. The part below + * the diagonal is zero, because \p T is upper triangular. 
+ */ +template +void matrix_function_compute_above_diagonal(const MatrixType& T, const VectorType& blockStart, const VectorType& clusterSize, MatrixType& fT) +{ + typedef internal::traits Traits; + typedef typename MatrixType::Scalar Scalar; + static const int Options = MatrixType::Options; + typedef Matrix DynMatrixType; + + for (Index k = 1; k < clusterSize.rows(); k++) { + for (Index i = 0; i < clusterSize.rows() - k; i++) { + // compute (i, i+k) block + DynMatrixType A = T.block(blockStart(i), blockStart(i), clusterSize(i), clusterSize(i)); + DynMatrixType B = -T.block(blockStart(i+k), blockStart(i+k), clusterSize(i+k), clusterSize(i+k)); + DynMatrixType C = fT.block(blockStart(i), blockStart(i), clusterSize(i), clusterSize(i)) + * T.block(blockStart(i), blockStart(i+k), clusterSize(i), clusterSize(i+k)); + C -= T.block(blockStart(i), blockStart(i+k), clusterSize(i), clusterSize(i+k)) + * fT.block(blockStart(i+k), blockStart(i+k), clusterSize(i+k), clusterSize(i+k)); + for (Index m = i + 1; m < i + k; m++) { + C += fT.block(blockStart(i), blockStart(m), clusterSize(i), clusterSize(m)) + * T.block(blockStart(m), blockStart(i+k), clusterSize(m), clusterSize(i+k)); + C -= T.block(blockStart(i), blockStart(m), clusterSize(i), clusterSize(m)) + * fT.block(blockStart(m), blockStart(i+k), clusterSize(m), clusterSize(i+k)); + } + fT.block(blockStart(i), blockStart(i+k), clusterSize(i), clusterSize(i+k)) + = matrix_function_solve_triangular_sylvester(A, B, C); + } + } +} + +/** \ingroup MatrixFunctions_Module + * \brief Class for computing matrix functions. + * \tparam MatrixType type of the argument of the matrix function, + * expected to be an instantiation of the Matrix class template. + * \tparam AtomicType type for computing matrix function of atomic blocks. + * \tparam IsComplex used internally to select correct specialization. + * + * This class implements the Schur-Parlett algorithm for computing matrix functions. 
The spectrum of the + * matrix is divided into clusters of eigenvalues that lie close together. This class delegates the + * computation of the matrix function on every block corresponding to these clusters to an object of type + * \p AtomicType and uses these results to compute the matrix function of the whole matrix. The class + * \p AtomicType should have a \p compute() member function for computing the matrix function of a block. + * + * \sa class MatrixFunctionAtomic, class MatrixLogarithmAtomic + */ +template ::Scalar>::IsComplex> +struct matrix_function_compute +{ + /** \brief Compute the matrix function. + * + * \param[in] A argument of matrix function, should be a square matrix. + * \param[in] atomic class for computing matrix function of atomic blocks. + * \param[out] result the function \p f applied to \p A, as + * specified in the constructor. + * + * See MatrixBase::matrixFunction() for details on how this computation + * is implemented. + */ + template + static void run(const MatrixType& A, AtomicType& atomic, ResultType &result); +}; + +/** \internal \ingroup MatrixFunctions_Module + * \brief Partial specialization of MatrixFunction for real matrices + * + * This converts the real matrix to a complex matrix, computes the matrix function of that matrix, and then + * converts the result back to a real matrix.
+ */ +template +struct matrix_function_compute +{ + template + static void run(const MatA& A, AtomicType& atomic, ResultType &result) + { + typedef internal::traits Traits; + typedef typename Traits::Scalar Scalar; + static const int Rows = Traits::RowsAtCompileTime, Cols = Traits::ColsAtCompileTime; + static const int MaxRows = Traits::MaxRowsAtCompileTime, MaxCols = Traits::MaxColsAtCompileTime; + + typedef std::complex ComplexScalar; + typedef Matrix ComplexMatrix; + + ComplexMatrix CA = A.template cast(); + ComplexMatrix Cresult; + matrix_function_compute::run(CA, atomic, Cresult); + result = Cresult.real(); + } +}; + +/** \internal \ingroup MatrixFunctions_Module + * \brief Partial specialization of MatrixFunction for complex matrices + */ +template +struct matrix_function_compute +{ + template + static void run(const MatA& A, AtomicType& atomic, ResultType &result) + { + typedef internal::traits Traits; + + // compute Schur decomposition of A + const ComplexSchur schurOfA(A); + eigen_assert(schurOfA.info()==Success); + MatrixType T = schurOfA.matrixT(); + MatrixType U = schurOfA.matrixU(); + + // partition eigenvalues into clusters of ei'vals "close" to each other + std::list > clusters; + matrix_function_partition_eigenvalues(T.diagonal(), clusters); + + // compute size of each cluster + Matrix clusterSize; + matrix_function_compute_cluster_size(clusters, clusterSize); + + // blockStart[i] is row index at which block corresponding to i-th cluster starts + Matrix blockStart; + matrix_function_compute_block_start(clusterSize, blockStart); + + // compute map so that eivalToCluster[i] = j means that i-th ei'val is in j-th cluster + Matrix eivalToCluster; + matrix_function_compute_map(T.diagonal(), clusters, eivalToCluster); + + // compute permutation which groups ei'vals in same cluster together + Matrix permutation; + matrix_function_compute_permutation(blockStart, eivalToCluster, permutation); + + // permute Schur decomposition + 
matrix_function_permute_schur(permutation, U, T); + + // compute result + MatrixType fT; // matrix function applied to T + matrix_function_compute_block_atomic(T, atomic, blockStart, clusterSize, fT); + matrix_function_compute_above_diagonal(T, blockStart, clusterSize, fT); + result = U * (fT.template triangularView() * U.adjoint()); + } +}; + +} // end of namespace internal + +/** \ingroup MatrixFunctions_Module + * + * \brief Proxy for the matrix function of some matrix (expression). + * + * \tparam Derived Type of the argument to the matrix function. + * + * This class holds the argument to the matrix function until it is assigned or evaluated for some other + * reason (so the argument should not be changed in the meantime). It is the return type of + * MatrixBase::matrixFunction() and related functions and most of the time this is the only way it is used. + */ +template class MatrixFunctionReturnValue +: public ReturnByValue > +{ + public: + typedef typename Derived::Scalar Scalar; + typedef typename internal::stem_function::type StemFunction; + + protected: + typedef typename internal::ref_selector::type DerivedNested; + + public: + + /** \brief Constructor. + * + * \param[in] A %Matrix (expression) forming the argument of the matrix function. + * \param[in] f Stem function for matrix function under consideration. + */ + MatrixFunctionReturnValue(const Derived& A, StemFunction f) : m_A(A), m_f(f) { } + + /** \brief Compute the matrix function. + * + * \param[out] result \p f applied to \p A, where \p f and \p A are as in the constructor.
+ */ + template + inline void evalTo(ResultType& result) const + { + typedef typename internal::nested_eval::type NestedEvalType; + typedef typename internal::remove_all::type NestedEvalTypeClean; + typedef internal::traits Traits; + typedef std::complex::Real> ComplexScalar; + typedef Matrix DynMatrixType; + + typedef internal::MatrixFunctionAtomic AtomicType; + AtomicType atomic(m_f); + + internal::matrix_function_compute::run(m_A, atomic, result); + } + + Index rows() const { return m_A.rows(); } + Index cols() const { return m_A.cols(); } + + private: + const DerivedNested m_A; + StemFunction *m_f; +}; + +namespace internal { +template +struct traits > +{ + typedef typename Derived::PlainObject ReturnType; +}; +} + + +/********** MatrixBase methods **********/ + + +template +const MatrixFunctionReturnValue MatrixBase::matrixFunction(typename internal::stem_function::Scalar>::type f) const +{ + eigen_assert(rows() == cols()); + return MatrixFunctionReturnValue(derived(), f); +} + +template +const MatrixFunctionReturnValue MatrixBase::sin() const +{ + eigen_assert(rows() == cols()); + typedef typename internal::stem_function::ComplexScalar ComplexScalar; + return MatrixFunctionReturnValue(derived(), internal::stem_function_sin); +} + +template +const MatrixFunctionReturnValue MatrixBase::cos() const +{ + eigen_assert(rows() == cols()); + typedef typename internal::stem_function::ComplexScalar ComplexScalar; + return MatrixFunctionReturnValue(derived(), internal::stem_function_cos); +} + +template +const MatrixFunctionReturnValue MatrixBase::sinh() const +{ + eigen_assert(rows() == cols()); + typedef typename internal::stem_function::ComplexScalar ComplexScalar; + return MatrixFunctionReturnValue(derived(), internal::stem_function_sinh); +} + +template +const MatrixFunctionReturnValue MatrixBase::cosh() const +{ + eigen_assert(rows() == cols()); + typedef typename internal::stem_function::ComplexScalar ComplexScalar; + return MatrixFunctionReturnValue(derived(), 
internal::stem_function_cosh); +} + +} // end namespace Eigen + +#endif // EIGEN_MATRIX_FUNCTION_H diff --git a/src/EigenUnsupported/src/MatrixFunctions/MatrixLogarithm.h b/src/EigenUnsupported/src/MatrixFunctions/MatrixLogarithm.h new file mode 100644 index 0000000..e917013 --- /dev/null +++ b/src/EigenUnsupported/src/MatrixFunctions/MatrixLogarithm.h @@ -0,0 +1,373 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2011, 2013 Jitse Niesen +// Copyright (C) 2011 Chen-Pang He +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_MATRIX_LOGARITHM +#define EIGEN_MATRIX_LOGARITHM + +namespace Eigen { + +namespace internal { + +template +struct matrix_log_min_pade_degree +{ + static const int value = 3; +}; + +template +struct matrix_log_max_pade_degree +{ + typedef typename NumTraits::Real RealScalar; + static const int value = std::numeric_limits::digits<= 24? 5: // single precision + std::numeric_limits::digits<= 53? 7: // double precision + std::numeric_limits::digits<= 64? 8: // extended precision + std::numeric_limits::digits<=106? 10: // double-double + 11; // quadruple precision +}; + +/** \brief Compute logarithm of 2x2 triangular matrix. 
*/ +template +void matrix_log_compute_2x2(const MatrixType& A, MatrixType& result) +{ + typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::RealScalar RealScalar; + using std::abs; + using std::ceil; + using std::imag; + using std::log; + + Scalar logA00 = log(A(0,0)); + Scalar logA11 = log(A(1,1)); + + result(0,0) = logA00; + result(1,0) = Scalar(0); + result(1,1) = logA11; + + Scalar y = A(1,1) - A(0,0); + if (y==Scalar(0)) + { + result(0,1) = A(0,1) / A(0,0); + } + else if ((abs(A(0,0)) < RealScalar(0.5)*abs(A(1,1))) || (abs(A(0,0)) > 2*abs(A(1,1)))) + { + result(0,1) = A(0,1) * (logA11 - logA00) / y; + } + else + { + // computation in previous branch is inaccurate if A(1,1) \approx A(0,0) + RealScalar unwindingNumber = ceil((imag(logA11 - logA00) - RealScalar(EIGEN_PI)) / RealScalar(2*EIGEN_PI)); + result(0,1) = A(0,1) * (numext::log1p(y/A(0,0)) + Scalar(0,RealScalar(2*EIGEN_PI)*unwindingNumber)) / y; + } +} + +/* \brief Get suitable degree for Pade approximation. (specialized for RealScalar = float) */ +inline int matrix_log_get_pade_degree(float normTminusI) +{ + const float maxNormForPade[] = { 2.5111573934555054e-1 /* degree = 3 */ , 4.0535837411880493e-1, + 5.3149729967117310e-1 }; + const int minPadeDegree = matrix_log_min_pade_degree::value; + const int maxPadeDegree = matrix_log_max_pade_degree::value; + int degree = minPadeDegree; + for (; degree <= maxPadeDegree; ++degree) + if (normTminusI <= maxNormForPade[degree - minPadeDegree]) + break; + return degree; +} + +/* \brief Get suitable degree for Pade approximation. 
(specialized for RealScalar = double) */ +inline int matrix_log_get_pade_degree(double normTminusI) +{ + const double maxNormForPade[] = { 1.6206284795015624e-2 /* degree = 3 */ , 5.3873532631381171e-2, + 1.1352802267628681e-1, 1.8662860613541288e-1, 2.642960831111435e-1 }; + const int minPadeDegree = matrix_log_min_pade_degree::value; + const int maxPadeDegree = matrix_log_max_pade_degree::value; + int degree = minPadeDegree; + for (; degree <= maxPadeDegree; ++degree) + if (normTminusI <= maxNormForPade[degree - minPadeDegree]) + break; + return degree; +} + +/* \brief Get suitable degree for Pade approximation. (specialized for RealScalar = long double) */ +inline int matrix_log_get_pade_degree(long double normTminusI) +{ +#if LDBL_MANT_DIG == 53 // double precision + const long double maxNormForPade[] = { 1.6206284795015624e-2L /* degree = 3 */ , 5.3873532631381171e-2L, + 1.1352802267628681e-1L, 1.8662860613541288e-1L, 2.642960831111435e-1L }; +#elif LDBL_MANT_DIG <= 64 // extended precision + const long double maxNormForPade[] = { 5.48256690357782863103e-3L /* degree = 3 */, 2.34559162387971167321e-2L, + 5.84603923897347449857e-2L, 1.08486423756725170223e-1L, 1.68385767881294446649e-1L, + 2.32777776523703892094e-1L }; +#elif LDBL_MANT_DIG <= 106 // double-double + const long double maxNormForPade[] = { 8.58970550342939562202529664318890e-5L /* degree = 3 */, + 9.34074328446359654039446552677759e-4L, 4.26117194647672175773064114582860e-3L, + 1.21546224740281848743149666560464e-2L, 2.61100544998339436713088248557444e-2L, + 4.66170074627052749243018566390567e-2L, 7.32585144444135027565872014932387e-2L, + 1.05026503471351080481093652651105e-1L }; +#else // quadruple precision + const long double maxNormForPade[] = { 4.7419931187193005048501568167858103e-5L /* degree = 3 */, + 5.8853168473544560470387769480192666e-4L, 2.9216120366601315391789493628113520e-3L, + 8.8415758124319434347116734705174308e-3L, 1.9850836029449446668518049562565291e-2L, + 
3.6688019729653446926585242192447447e-2L, 5.9290962294020186998954055264528393e-2L, + 8.6998436081634343903250580992127677e-2L, 1.1880960220216759245467951592883642e-1L }; +#endif + const int minPadeDegree = matrix_log_min_pade_degree::value; + const int maxPadeDegree = matrix_log_max_pade_degree::value; + int degree = minPadeDegree; + for (; degree <= maxPadeDegree; ++degree) + if (normTminusI <= maxNormForPade[degree - minPadeDegree]) + break; + return degree; +} + +/* \brief Compute Pade approximation to matrix logarithm */ +template +void matrix_log_compute_pade(MatrixType& result, const MatrixType& T, int degree) +{ + typedef typename NumTraits::Real RealScalar; + const int minPadeDegree = 3; + const int maxPadeDegree = 11; + assert(degree >= minPadeDegree && degree <= maxPadeDegree); + // FIXME this creates float-conversion-warnings if these are enabled. + // Either manually convert each value, or disable the warning locally + const RealScalar nodes[][maxPadeDegree] = { + { 0.1127016653792583114820734600217600L, 0.5000000000000000000000000000000000L, // degree 3 + 0.8872983346207416885179265399782400L }, + { 0.0694318442029737123880267555535953L, 0.3300094782075718675986671204483777L, // degree 4 + 0.6699905217924281324013328795516223L, 0.9305681557970262876119732444464048L }, + { 0.0469100770306680036011865608503035L, 0.2307653449471584544818427896498956L, // degree 5 + 0.5000000000000000000000000000000000L, 0.7692346550528415455181572103501044L, + 0.9530899229693319963988134391496965L }, + { 0.0337652428984239860938492227530027L, 0.1693953067668677431693002024900473L, // degree 6 + 0.3806904069584015456847491391596440L, 0.6193095930415984543152508608403560L, + 0.8306046932331322568306997975099527L, 0.9662347571015760139061507772469973L }, + { 0.0254460438286207377369051579760744L, 0.1292344072003027800680676133596058L, // degree 7 + 0.2970774243113014165466967939615193L, 0.5000000000000000000000000000000000L, + 0.7029225756886985834533032060384807L, 
0.8707655927996972199319323866403942L, + 0.9745539561713792622630948420239256L }, + { 0.0198550717512318841582195657152635L, 0.1016667612931866302042230317620848L, // degree 8 + 0.2372337950418355070911304754053768L, 0.4082826787521750975302619288199080L, + 0.5917173212478249024697380711800920L, 0.7627662049581644929088695245946232L, + 0.8983332387068133697957769682379152L, 0.9801449282487681158417804342847365L }, + { 0.0159198802461869550822118985481636L, 0.0819844463366821028502851059651326L, // degree 9 + 0.1933142836497048013456489803292629L, 0.3378732882980955354807309926783317L, + 0.5000000000000000000000000000000000L, 0.6621267117019044645192690073216683L, + 0.8066857163502951986543510196707371L, 0.9180155536633178971497148940348674L, + 0.9840801197538130449177881014518364L }, + { 0.0130467357414141399610179939577740L, 0.0674683166555077446339516557882535L, // degree 10 + 0.1602952158504877968828363174425632L, 0.2833023029353764046003670284171079L, + 0.4255628305091843945575869994351400L, 0.5744371694908156054424130005648600L, + 0.7166976970646235953996329715828921L, 0.8397047841495122031171636825574368L, + 0.9325316833444922553660483442117465L, 0.9869532642585858600389820060422260L }, + { 0.0108856709269715035980309994385713L, 0.0564687001159523504624211153480364L, // degree 11 + 0.1349239972129753379532918739844233L, 0.2404519353965940920371371652706952L, + 0.3652284220238275138342340072995692L, 0.5000000000000000000000000000000000L, + 0.6347715779761724861657659927004308L, 0.7595480646034059079628628347293048L, + 0.8650760027870246620467081260155767L, 0.9435312998840476495375788846519636L, + 0.9891143290730284964019690005614287L } }; + + const RealScalar weights[][maxPadeDegree] = { + { 0.2777777777777777777777777777777778L, 0.4444444444444444444444444444444444L, // degree 3 + 0.2777777777777777777777777777777778L }, + { 0.1739274225687269286865319746109997L, 0.3260725774312730713134680253890003L, // degree 4 + 0.3260725774312730713134680253890003L, 
0.1739274225687269286865319746109997L }, + { 0.1184634425280945437571320203599587L, 0.2393143352496832340206457574178191L, // degree 5 + 0.2844444444444444444444444444444444L, 0.2393143352496832340206457574178191L, + 0.1184634425280945437571320203599587L }, + { 0.0856622461895851725201480710863665L, 0.1803807865240693037849167569188581L, // degree 6 + 0.2339569672863455236949351719947755L, 0.2339569672863455236949351719947755L, + 0.1803807865240693037849167569188581L, 0.0856622461895851725201480710863665L }, + { 0.0647424830844348466353057163395410L, 0.1398526957446383339507338857118898L, // degree 7 + 0.1909150252525594724751848877444876L, 0.2089795918367346938775510204081633L, + 0.1909150252525594724751848877444876L, 0.1398526957446383339507338857118898L, + 0.0647424830844348466353057163395410L }, + { 0.0506142681451881295762656771549811L, 0.1111905172266872352721779972131204L, // degree 8 + 0.1568533229389436436689811009933007L, 0.1813418916891809914825752246385978L, + 0.1813418916891809914825752246385978L, 0.1568533229389436436689811009933007L, + 0.1111905172266872352721779972131204L, 0.0506142681451881295762656771549811L }, + { 0.0406371941807872059859460790552618L, 0.0903240803474287020292360156214564L, // degree 9 + 0.1303053482014677311593714347093164L, 0.1561735385200014200343152032922218L, + 0.1651196775006298815822625346434870L, 0.1561735385200014200343152032922218L, + 0.1303053482014677311593714347093164L, 0.0903240803474287020292360156214564L, + 0.0406371941807872059859460790552618L }, + { 0.0333356721543440687967844049466659L, 0.0747256745752902965728881698288487L, // degree 10 + 0.1095431812579910219977674671140816L, 0.1346333596549981775456134607847347L, + 0.1477621123573764350869464973256692L, 0.1477621123573764350869464973256692L, + 0.1346333596549981775456134607847347L, 0.1095431812579910219977674671140816L, + 0.0747256745752902965728881698288487L, 0.0333356721543440687967844049466659L }, + { 0.0278342835580868332413768602212743L, 
0.0627901847324523123173471496119701L, // degree 11 + 0.0931451054638671257130488207158280L, 0.1165968822959952399592618524215876L, + 0.1314022722551233310903444349452546L, 0.1364625433889503153572417641681711L, + 0.1314022722551233310903444349452546L, 0.1165968822959952399592618524215876L, + 0.0931451054638671257130488207158280L, 0.0627901847324523123173471496119701L, + 0.0278342835580868332413768602212743L } }; + + MatrixType TminusI = T - MatrixType::Identity(T.rows(), T.rows()); + result.setZero(T.rows(), T.rows()); + for (int k = 0; k < degree; ++k) { + RealScalar weight = weights[degree-minPadeDegree][k]; + RealScalar node = nodes[degree-minPadeDegree][k]; + result += weight * (MatrixType::Identity(T.rows(), T.rows()) + node * TminusI) + .template triangularView().solve(TminusI); + } +} + +/** \brief Compute logarithm of triangular matrices with size > 2. + * \details This uses a inverse scale-and-square algorithm. */ +template +void matrix_log_compute_big(const MatrixType& A, MatrixType& result) +{ + typedef typename MatrixType::Scalar Scalar; + typedef typename NumTraits::Real RealScalar; + using std::pow; + + int numberOfSquareRoots = 0; + int numberOfExtraSquareRoots = 0; + int degree; + MatrixType T = A, sqrtT; + + const int maxPadeDegree = matrix_log_max_pade_degree::value; + const RealScalar maxNormForPade = RealScalar( + maxPadeDegree<= 5? 5.3149729967117310e-1L: // single precision + maxPadeDegree<= 7? 2.6429608311114350e-1L: // double precision + maxPadeDegree<= 8? 2.32777776523703892094e-1L: // extended precision + maxPadeDegree<=10? 
1.05026503471351080481093652651105e-1L: // double-double + 1.1880960220216759245467951592883642e-1L); // quadruple precision + + while (true) { + RealScalar normTminusI = (T - MatrixType::Identity(T.rows(), T.rows())).cwiseAbs().colwise().sum().maxCoeff(); + if (normTminusI < maxNormForPade) { + degree = matrix_log_get_pade_degree(normTminusI); + int degree2 = matrix_log_get_pade_degree(normTminusI / RealScalar(2)); + if ((degree - degree2 <= 1) || (numberOfExtraSquareRoots == 1)) + break; + ++numberOfExtraSquareRoots; + } + matrix_sqrt_triangular(T, sqrtT); + T = sqrtT.template triangularView(); + ++numberOfSquareRoots; + } + + matrix_log_compute_pade(result, T, degree); + result *= pow(RealScalar(2), RealScalar(numberOfSquareRoots)); // TODO replace by bitshift if possible +} + +/** \ingroup MatrixFunctions_Module + * \class MatrixLogarithmAtomic + * \brief Helper class for computing matrix logarithm of atomic matrices. + * + * Here, an atomic matrix is a triangular matrix whose diagonal entries are close to each other. + * + * \sa class MatrixFunctionAtomic, MatrixBase::log() + */ +template +class MatrixLogarithmAtomic +{ +public: + /** \brief Compute matrix logarithm of atomic matrix + * \param[in] A argument of matrix logarithm, should be upper triangular and atomic + * \returns The logarithm of \p A. + */ + MatrixType compute(const MatrixType& A); +}; + +template +MatrixType MatrixLogarithmAtomic::compute(const MatrixType& A) +{ + using std::log; + MatrixType result(A.rows(), A.rows()); + if (A.rows() == 1) + result(0,0) = log(A(0,0)); + else if (A.rows() == 2) + matrix_log_compute_2x2(A, result); + else + matrix_log_compute_big(A, result); + return result; +} + +} // end of namespace internal + +/** \ingroup MatrixFunctions_Module + * + * \brief Proxy for the matrix logarithm of some matrix (expression). + * + * \tparam Derived Type of the argument to the matrix function. 
+ * + * This class holds the argument to the matrix function until it is + * assigned or evaluated for some other reason (so the argument + * should not be changed in the meantime). It is the return type of + * MatrixBase::log() and most of the time this is the only way it + * is used. + */ +template class MatrixLogarithmReturnValue +: public ReturnByValue > +{ +public: + typedef typename Derived::Scalar Scalar; + typedef typename Derived::Index Index; + +protected: + typedef typename internal::ref_selector::type DerivedNested; + +public: + + /** \brief Constructor. + * + * \param[in] A %Matrix (expression) forming the argument of the matrix logarithm. + */ + explicit MatrixLogarithmReturnValue(const Derived& A) : m_A(A) { } + + /** \brief Compute the matrix logarithm. + * + * \param[out] result Logarithm of \c A, where \c A is as specified in the constructor. + */ + template + inline void evalTo(ResultType& result) const + { + typedef typename internal::nested_eval::type DerivedEvalType; + typedef typename internal::remove_all::type DerivedEvalTypeClean; + typedef internal::traits Traits; + typedef std::complex::Real> ComplexScalar; + typedef Matrix DynMatrixType; + typedef internal::MatrixLogarithmAtomic AtomicType; + AtomicType atomic; + + internal::matrix_function_compute::run(m_A, atomic, result); + } + + Index rows() const { return m_A.rows(); } + Index cols() const { return m_A.cols(); } + +private: + const DerivedNested m_A; +}; + +namespace internal { + template + struct traits > + { + typedef typename Derived::PlainObject ReturnType; + }; +} + + +/********** MatrixBase method **********/ + + +template +const MatrixLogarithmReturnValue MatrixBase::log() const +{ + eigen_assert(rows() == cols()); + return MatrixLogarithmReturnValue(derived()); +} + +} // end namespace Eigen + +#endif // EIGEN_MATRIX_LOGARITHM diff --git a/src/EigenUnsupported/src/MatrixFunctions/MatrixPower.h b/src/EigenUnsupported/src/MatrixFunctions/MatrixPower.h new file mode 100644 
index 0000000..d7672d7 --- /dev/null +++ b/src/EigenUnsupported/src/MatrixFunctions/MatrixPower.h @@ -0,0 +1,705 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2012, 2013 Chen-Pang He +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_MATRIX_POWER +#define EIGEN_MATRIX_POWER + +namespace Eigen { + +template class MatrixPower; + +/** + * \ingroup MatrixFunctions_Module + * + * \brief Proxy for the matrix power of some matrix. + * + * \tparam MatrixType type of the base, a matrix. + * + * This class holds the arguments to the matrix power until it is + * assigned or evaluated for some other reason (so the argument + * should not be changed in the meantime). It is the return type of + * MatrixPower::operator() and related functions and most of the + * time this is the only way it is used. + */ +/* TODO This class is only used by MatrixPower, so it should be nested + * into MatrixPower, like MatrixPower::ReturnValue. However, my + * compiler complained about unused template parameter in the + * following declaration in namespace internal. + * + * template + * struct traits::ReturnValue>; + */ +template +class MatrixPowerParenthesesReturnValue : public ReturnByValue< MatrixPowerParenthesesReturnValue > +{ + public: + typedef typename MatrixType::RealScalar RealScalar; + + /** + * \brief Constructor. + * + * \param[in] pow %MatrixPower storing the base. + * \param[in] p scalar, the exponent of the matrix power. + */ + MatrixPowerParenthesesReturnValue(MatrixPower& pow, RealScalar p) : m_pow(pow), m_p(p) + { } + + /** + * \brief Compute the matrix power. 
+ * + * \param[out] result + */ + template + inline void evalTo(ResultType& result) const + { m_pow.compute(result, m_p); } + + Index rows() const { return m_pow.rows(); } + Index cols() const { return m_pow.cols(); } + + private: + MatrixPower& m_pow; + const RealScalar m_p; +}; + +/** + * \ingroup MatrixFunctions_Module + * + * \brief Class for computing matrix powers. + * + * \tparam MatrixType type of the base, expected to be an instantiation + * of the Matrix class template. + * + * This class is capable of computing triangular real/complex matrices + * raised to a power in the interval \f$ (-1, 1) \f$. + * + * \note Currently this class is only used by MatrixPower. One may + * insist that this be nested into MatrixPower. This class is here to + * facilitate future development of triangular matrix functions. + */ +template +class MatrixPowerAtomic : internal::noncopyable +{ + private: + enum { + RowsAtCompileTime = MatrixType::RowsAtCompileTime, + MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime + }; + typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::RealScalar RealScalar; + typedef std::complex ComplexScalar; + typedef Block ResultType; + + const MatrixType& m_A; + RealScalar m_p; + + void computePade(int degree, const MatrixType& IminusT, ResultType& res) const; + void compute2x2(ResultType& res, RealScalar p) const; + void computeBig(ResultType& res) const; + static int getPadeDegree(float normIminusT); + static int getPadeDegree(double normIminusT); + static int getPadeDegree(long double normIminusT); + static ComplexScalar computeSuperDiag(const ComplexScalar&, const ComplexScalar&, RealScalar p); + static RealScalar computeSuperDiag(RealScalar, RealScalar, RealScalar p); + + public: + /** + * \brief Constructor. + * + * \param[in] T the base of the matrix power. + * \param[in] p the exponent of the matrix power, should be in + * \f$ (-1, 1) \f$. 
+ * + * The class stores a reference to T, so it should not be changed + * (or destroyed) before evaluation. Only the upper triangular + * part of T is read. + */ + MatrixPowerAtomic(const MatrixType& T, RealScalar p); + + /** + * \brief Compute the matrix power. + * + * \param[out] res \f$ A^p \f$ where A and p are specified in the + * constructor. + */ + void compute(ResultType& res) const; +}; + +template +MatrixPowerAtomic::MatrixPowerAtomic(const MatrixType& T, RealScalar p) : + m_A(T), m_p(p) +{ + eigen_assert(T.rows() == T.cols()); + eigen_assert(p > -1 && p < 1); +} + +template +void MatrixPowerAtomic::compute(ResultType& res) const +{ + using std::pow; + switch (m_A.rows()) { + case 0: + break; + case 1: + res(0,0) = pow(m_A(0,0), m_p); + break; + case 2: + compute2x2(res, m_p); + break; + default: + computeBig(res); + } +} + +template +void MatrixPowerAtomic::computePade(int degree, const MatrixType& IminusT, ResultType& res) const +{ + int i = 2*degree; + res = (m_p-RealScalar(degree)) / RealScalar(2*i-2) * IminusT; + + for (--i; i; --i) { + res = (MatrixType::Identity(IminusT.rows(), IminusT.cols()) + res).template triangularView() + .solve((i==1 ? -m_p : i&1 ? 
(-m_p-RealScalar(i/2))/RealScalar(2*i) : (m_p-RealScalar(i/2))/RealScalar(2*i-2)) * IminusT).eval(); + } + res += MatrixType::Identity(IminusT.rows(), IminusT.cols()); +} + +// This function assumes that res has the correct size (see bug 614) +template +void MatrixPowerAtomic::compute2x2(ResultType& res, RealScalar p) const +{ + using std::abs; + using std::pow; + res.coeffRef(0,0) = pow(m_A.coeff(0,0), p); + + for (Index i=1; i < m_A.cols(); ++i) { + res.coeffRef(i,i) = pow(m_A.coeff(i,i), p); + if (m_A.coeff(i-1,i-1) == m_A.coeff(i,i)) + res.coeffRef(i-1,i) = p * pow(m_A.coeff(i,i), p-1); + else if (2*abs(m_A.coeff(i-1,i-1)) < abs(m_A.coeff(i,i)) || 2*abs(m_A.coeff(i,i)) < abs(m_A.coeff(i-1,i-1))) + res.coeffRef(i-1,i) = (res.coeff(i,i)-res.coeff(i-1,i-1)) / (m_A.coeff(i,i)-m_A.coeff(i-1,i-1)); + else + res.coeffRef(i-1,i) = computeSuperDiag(m_A.coeff(i,i), m_A.coeff(i-1,i-1), p); + res.coeffRef(i-1,i) *= m_A.coeff(i-1,i); + } +} + +template +void MatrixPowerAtomic::computeBig(ResultType& res) const +{ + using std::ldexp; + const int digits = std::numeric_limits::digits; + const RealScalar maxNormForPade = RealScalar( + digits <= 24? 4.3386528e-1L // single precision + : digits <= 53? 2.789358995219730e-1L // double precision + : digits <= 64? 2.4471944416607995472e-1L // extended precision + : digits <= 106? 
1.1016843812851143391275867258512e-1L // double-double + : 9.134603732914548552537150753385375e-2L); // quadruple precision + MatrixType IminusT, sqrtT, T = m_A.template triangularView(); + RealScalar normIminusT; + int degree, degree2, numberOfSquareRoots = 0; + bool hasExtraSquareRoot = false; + + for (Index i=0; i < m_A.cols(); ++i) + eigen_assert(m_A(i,i) != RealScalar(0)); + + while (true) { + IminusT = MatrixType::Identity(m_A.rows(), m_A.cols()) - T; + normIminusT = IminusT.cwiseAbs().colwise().sum().maxCoeff(); + if (normIminusT < maxNormForPade) { + degree = getPadeDegree(normIminusT); + degree2 = getPadeDegree(normIminusT/2); + if (degree - degree2 <= 1 || hasExtraSquareRoot) + break; + hasExtraSquareRoot = true; + } + matrix_sqrt_triangular(T, sqrtT); + T = sqrtT.template triangularView(); + ++numberOfSquareRoots; + } + computePade(degree, IminusT, res); + + for (; numberOfSquareRoots; --numberOfSquareRoots) { + compute2x2(res, ldexp(m_p, -numberOfSquareRoots)); + res = res.template triangularView() * res; + } + compute2x2(res, m_p); +} + +template +inline int MatrixPowerAtomic::getPadeDegree(float normIminusT) +{ + const float maxNormForPade[] = { 2.8064004e-1f /* degree = 3 */ , 4.3386528e-1f }; + int degree = 3; + for (; degree <= 4; ++degree) + if (normIminusT <= maxNormForPade[degree - 3]) + break; + return degree; +} + +template +inline int MatrixPowerAtomic::getPadeDegree(double normIminusT) +{ + const double maxNormForPade[] = { 1.884160592658218e-2 /* degree = 3 */ , 6.038881904059573e-2, 1.239917516308172e-1, + 1.999045567181744e-1, 2.789358995219730e-1 }; + int degree = 3; + for (; degree <= 7; ++degree) + if (normIminusT <= maxNormForPade[degree - 3]) + break; + return degree; +} + +template +inline int MatrixPowerAtomic::getPadeDegree(long double normIminusT) +{ +#if LDBL_MANT_DIG == 53 + const int maxPadeDegree = 7; + const double maxNormForPade[] = { 1.884160592658218e-2L /* degree = 3 */ , 6.038881904059573e-2L, 1.239917516308172e-1L, + 
1.999045567181744e-1L, 2.789358995219730e-1L }; +#elif LDBL_MANT_DIG <= 64 + const int maxPadeDegree = 8; + const long double maxNormForPade[] = { 6.3854693117491799460e-3L /* degree = 3 */ , 2.6394893435456973676e-2L, + 6.4216043030404063729e-2L, 1.1701165502926694307e-1L, 1.7904284231268670284e-1L, 2.4471944416607995472e-1L }; +#elif LDBL_MANT_DIG <= 106 + const int maxPadeDegree = 10; + const double maxNormForPade[] = { 1.0007161601787493236741409687186e-4L /* degree = 3 */ , + 1.0007161601787493236741409687186e-3L, 4.7069769360887572939882574746264e-3L, 1.3220386624169159689406653101695e-2L, + 2.8063482381631737920612944054906e-2L, 4.9625993951953473052385361085058e-2L, 7.7367040706027886224557538328171e-2L, + 1.1016843812851143391275867258512e-1L }; +#else + const int maxPadeDegree = 10; + const double maxNormForPade[] = { 5.524506147036624377378713555116378e-5L /* degree = 3 */ , + 6.640600568157479679823602193345995e-4L, 3.227716520106894279249709728084626e-3L, + 9.619593944683432960546978734646284e-3L, 2.134595382433742403911124458161147e-2L, + 3.908166513900489428442993794761185e-2L, 6.266780814639442865832535460550138e-2L, + 9.134603732914548552537150753385375e-2L }; +#endif + int degree = 3; + for (; degree <= maxPadeDegree; ++degree) + if (normIminusT <= maxNormForPade[degree - 3]) + break; + return degree; +} + +template +inline typename MatrixPowerAtomic::ComplexScalar +MatrixPowerAtomic::computeSuperDiag(const ComplexScalar& curr, const ComplexScalar& prev, RealScalar p) +{ + using std::ceil; + using std::exp; + using std::log; + using std::sinh; + + ComplexScalar logCurr = log(curr); + ComplexScalar logPrev = log(prev); + RealScalar unwindingNumber = ceil((numext::imag(logCurr - logPrev) - RealScalar(EIGEN_PI)) / RealScalar(2*EIGEN_PI)); + ComplexScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2) + ComplexScalar(0, RealScalar(EIGEN_PI)*unwindingNumber); + return RealScalar(2) * exp(RealScalar(0.5) * p * (logCurr + logPrev)) * sinh(p * w) / 
(curr - prev); +} + +template +inline typename MatrixPowerAtomic::RealScalar +MatrixPowerAtomic::computeSuperDiag(RealScalar curr, RealScalar prev, RealScalar p) +{ + using std::exp; + using std::log; + using std::sinh; + + RealScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2); + return 2 * exp(p * (log(curr) + log(prev)) / 2) * sinh(p * w) / (curr - prev); +} + +/** + * \ingroup MatrixFunctions_Module + * + * \brief Class for computing matrix powers. + * + * \tparam MatrixType type of the base, expected to be an instantiation + * of the Matrix class template. + * + * This class is capable of computing real/complex matrices raised to + * an arbitrary real power. Meanwhile, it saves the result of Schur + * decomposition if an non-integral power has even been calculated. + * Therefore, if you want to compute multiple (>= 2) matrix powers + * for the same matrix, using the class directly is more efficient than + * calling MatrixBase::pow(). + * + * Example: + * \include MatrixPower_optimal.cpp + * Output: \verbinclude MatrixPower_optimal.out + */ +template +class MatrixPower : internal::noncopyable +{ + private: + typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::RealScalar RealScalar; + + public: + /** + * \brief Constructor. + * + * \param[in] A the base of the matrix power. + * + * The class stores a reference to A, so it should not be changed + * (or destroyed) before evaluation. + */ + explicit MatrixPower(const MatrixType& A) : + m_A(A), + m_conditionNumber(0), + m_rank(A.cols()), + m_nulls(0) + { eigen_assert(A.rows() == A.cols()); } + + /** + * \brief Returns the matrix power. + * + * \param[in] p exponent, a real scalar. + * \return The expression \f$ A^p \f$, where A is specified in the + * constructor. + */ + const MatrixPowerParenthesesReturnValue operator()(RealScalar p) + { return MatrixPowerParenthesesReturnValue(*this, p); } + + /** + * \brief Compute the matrix power. + * + * \param[in] p exponent, a real scalar. 
+ * \param[out] res \f$ A^p \f$ where A is specified in the + * constructor. + */ + template + void compute(ResultType& res, RealScalar p); + + Index rows() const { return m_A.rows(); } + Index cols() const { return m_A.cols(); } + + private: + typedef std::complex ComplexScalar; + typedef Matrix ComplexMatrix; + + /** \brief Reference to the base of matrix power. */ + typename MatrixType::Nested m_A; + + /** \brief Temporary storage. */ + MatrixType m_tmp; + + /** \brief Store the result of Schur decomposition. */ + ComplexMatrix m_T, m_U; + + /** \brief Store fractional power of m_T. */ + ComplexMatrix m_fT; + + /** + * \brief Condition number of m_A. + * + * It is initialized as 0 to avoid performing unnecessary Schur + * decomposition, which is the bottleneck. + */ + RealScalar m_conditionNumber; + + /** \brief Rank of m_A. */ + Index m_rank; + + /** \brief Rank deficiency of m_A. */ + Index m_nulls; + + /** + * \brief Split p into integral part and fractional part. + * + * \param[in] p The exponent. + * \param[out] p The fractional part ranging in \f$ (-1, 1) \f$. + * \param[out] intpart The integral part. + * + * Only if the fractional part is nonzero, it calls initialize(). + */ + void split(RealScalar& p, RealScalar& intpart); + + /** \brief Perform Schur decomposition for fractional power. 
*/ + void initialize(); + + template + void computeIntPower(ResultType& res, RealScalar p); + + template + void computeFracPower(ResultType& res, RealScalar p); + + template + static void revertSchur( + Matrix& res, + const ComplexMatrix& T, + const ComplexMatrix& U); + + template + static void revertSchur( + Matrix& res, + const ComplexMatrix& T, + const ComplexMatrix& U); +}; + +template +template +void MatrixPower::compute(ResultType& res, RealScalar p) +{ + using std::pow; + switch (cols()) { + case 0: + break; + case 1: + res(0,0) = pow(m_A.coeff(0,0), p); + break; + default: + RealScalar intpart; + split(p, intpart); + + res = MatrixType::Identity(rows(), cols()); + computeIntPower(res, intpart); + if (p) computeFracPower(res, p); + } +} + +template +void MatrixPower::split(RealScalar& p, RealScalar& intpart) +{ + using std::floor; + using std::pow; + + intpart = floor(p); + p -= intpart; + + // Perform Schur decomposition if it is not yet performed and the power is + // not an integer. + if (!m_conditionNumber && p) + initialize(); + + // Choose the more stable of intpart = floor(p) and intpart = ceil(p). + if (p > RealScalar(0.5) && p > (1-p) * pow(m_conditionNumber, p)) { + --p; + ++intpart; + } +} + +template +void MatrixPower::initialize() +{ + const ComplexSchur schurOfA(m_A); + JacobiRotation rot; + ComplexScalar eigenvalue; + + m_fT.resizeLike(m_A); + m_T = schurOfA.matrixT(); + m_U = schurOfA.matrixU(); + m_conditionNumber = m_T.diagonal().array().abs().maxCoeff() / m_T.diagonal().array().abs().minCoeff(); + + // Move zero eigenvalues to the bottom right corner. 
+ for (Index i = cols()-1; i>=0; --i) { + if (m_rank <= 2) + return; + if (m_T.coeff(i,i) == RealScalar(0)) { + for (Index j=i+1; j < m_rank; ++j) { + eigenvalue = m_T.coeff(j,j); + rot.makeGivens(m_T.coeff(j-1,j), eigenvalue); + m_T.applyOnTheRight(j-1, j, rot); + m_T.applyOnTheLeft(j-1, j, rot.adjoint()); + m_T.coeffRef(j-1,j-1) = eigenvalue; + m_T.coeffRef(j,j) = RealScalar(0); + m_U.applyOnTheRight(j-1, j, rot); + } + --m_rank; + } + } + + m_nulls = rows() - m_rank; + if (m_nulls) { + eigen_assert(m_T.bottomRightCorner(m_nulls, m_nulls).isZero() + && "Base of matrix power should be invertible or with a semisimple zero eigenvalue."); + m_fT.bottomRows(m_nulls).fill(RealScalar(0)); + } +} + +template +template +void MatrixPower::computeIntPower(ResultType& res, RealScalar p) +{ + using std::abs; + using std::fmod; + RealScalar pp = abs(p); + + if (p<0) + m_tmp = m_A.inverse(); + else + m_tmp = m_A; + + while (true) { + if (fmod(pp, 2) >= 1) + res = m_tmp * res; + pp /= 2; + if (pp < 1) + break; + m_tmp *= m_tmp; + } +} + +template +template +void MatrixPower::computeFracPower(ResultType& res, RealScalar p) +{ + Block blockTp(m_fT, 0, 0, m_rank, m_rank); + eigen_assert(m_conditionNumber); + eigen_assert(m_rank + m_nulls == rows()); + + MatrixPowerAtomic(m_T.topLeftCorner(m_rank, m_rank), p).compute(blockTp); + if (m_nulls) { + m_fT.topRightCorner(m_rank, m_nulls) = m_T.topLeftCorner(m_rank, m_rank).template triangularView() + .solve(blockTp * m_T.topRightCorner(m_rank, m_nulls)); + } + revertSchur(m_tmp, m_fT, m_U); + res = m_tmp * res; +} + +template +template +inline void MatrixPower::revertSchur( + Matrix& res, + const ComplexMatrix& T, + const ComplexMatrix& U) +{ res.noalias() = U * (T.template triangularView() * U.adjoint()); } + +template +template +inline void MatrixPower::revertSchur( + Matrix& res, + const ComplexMatrix& T, + const ComplexMatrix& U) +{ res.noalias() = (U * (T.template triangularView() * U.adjoint())).real(); } + +/** + * \ingroup 
MatrixFunctions_Module + * + * \brief Proxy for the matrix power of some matrix (expression). + * + * \tparam Derived type of the base, a matrix (expression). + * + * This class holds the arguments to the matrix power until it is + * assigned or evaluated for some other reason (so the argument + * should not be changed in the meantime). It is the return type of + * MatrixBase::pow() and related functions and most of the + * time this is the only way it is used. + */ +template +class MatrixPowerReturnValue : public ReturnByValue< MatrixPowerReturnValue > +{ + public: + typedef typename Derived::PlainObject PlainObject; + typedef typename Derived::RealScalar RealScalar; + + /** + * \brief Constructor. + * + * \param[in] A %Matrix (expression), the base of the matrix power. + * \param[in] p real scalar, the exponent of the matrix power. + */ + MatrixPowerReturnValue(const Derived& A, RealScalar p) : m_A(A), m_p(p) + { } + + /** + * \brief Compute the matrix power. + * + * \param[out] result \f$ A^p \f$ where \p A and \p p are as in the + * constructor. + */ + template + inline void evalTo(ResultType& result) const + { MatrixPower(m_A.eval()).compute(result, m_p); } + + Index rows() const { return m_A.rows(); } + Index cols() const { return m_A.cols(); } + + private: + const Derived& m_A; + const RealScalar m_p; +}; + +/** + * \ingroup MatrixFunctions_Module + * + * \brief Proxy for the matrix power of some matrix (expression). + * + * \tparam Derived type of the base, a matrix (expression). + * + * This class holds the arguments to the matrix power until it is + * assigned or evaluated for some other reason (so the argument + * should not be changed in the meantime). It is the return type of + * MatrixBase::pow() and related functions and most of the + * time this is the only way it is used. 
+ */ +template +class MatrixComplexPowerReturnValue : public ReturnByValue< MatrixComplexPowerReturnValue > +{ + public: + typedef typename Derived::PlainObject PlainObject; + typedef typename std::complex ComplexScalar; + + /** + * \brief Constructor. + * + * \param[in] A %Matrix (expression), the base of the matrix power. + * \param[in] p complex scalar, the exponent of the matrix power. + */ + MatrixComplexPowerReturnValue(const Derived& A, const ComplexScalar& p) : m_A(A), m_p(p) + { } + + /** + * \brief Compute the matrix power. + * + * Because \p p is complex, \f$ A^p \f$ is simply evaluated as \f$ + * \exp(p \log(A)) \f$. + * + * \param[out] result \f$ A^p \f$ where \p A and \p p are as in the + * constructor. + */ + template + inline void evalTo(ResultType& result) const + { result = (m_p * m_A.log()).exp(); } + + Index rows() const { return m_A.rows(); } + Index cols() const { return m_A.cols(); } + + private: + const Derived& m_A; + const ComplexScalar m_p; +}; + +namespace internal { + +template +struct traits< MatrixPowerParenthesesReturnValue > +{ typedef typename MatrixPowerType::PlainObject ReturnType; }; + +template +struct traits< MatrixPowerReturnValue > +{ typedef typename Derived::PlainObject ReturnType; }; + +template +struct traits< MatrixComplexPowerReturnValue > +{ typedef typename Derived::PlainObject ReturnType; }; + +} + +template +const MatrixPowerReturnValue MatrixBase::pow(const RealScalar& p) const +{ return MatrixPowerReturnValue(derived(), p); } + +template +const MatrixComplexPowerReturnValue MatrixBase::pow(const std::complex& p) const +{ return MatrixComplexPowerReturnValue(derived(), p); } + +} // namespace Eigen + +#endif // EIGEN_MATRIX_POWER diff --git a/src/EigenUnsupported/src/MatrixFunctions/MatrixSquareRoot.h b/src/EigenUnsupported/src/MatrixFunctions/MatrixSquareRoot.h new file mode 100644 index 0000000..e363e77 --- /dev/null +++ b/src/EigenUnsupported/src/MatrixFunctions/MatrixSquareRoot.h @@ -0,0 +1,368 @@ +// This 
file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2011, 2013 Jitse Niesen +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_MATRIX_SQUARE_ROOT +#define EIGEN_MATRIX_SQUARE_ROOT + +namespace Eigen { + +namespace internal { + +// pre: T.block(i,i,2,2) has complex conjugate eigenvalues +// post: sqrtT.block(i,i,2,2) is square root of T.block(i,i,2,2) +template +void matrix_sqrt_quasi_triangular_2x2_diagonal_block(const MatrixType& T, Index i, ResultType& sqrtT) +{ + // TODO: This case (2-by-2 blocks with complex conjugate eigenvalues) is probably hidden somewhere + // in EigenSolver. If we expose it, we could call it directly from here. + typedef typename traits::Scalar Scalar; + Matrix block = T.template block<2,2>(i,i); + EigenSolver > es(block); + sqrtT.template block<2,2>(i,i) + = (es.eigenvectors() * es.eigenvalues().cwiseSqrt().asDiagonal() * es.eigenvectors().inverse()).real(); +} + +// pre: block structure of T is such that (i,j) is a 1x1 block, +// all blocks of sqrtT to left of and below (i,j) are correct +// post: sqrtT(i,j) has the correct value +template +void matrix_sqrt_quasi_triangular_1x1_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT) +{ + typedef typename traits::Scalar Scalar; + Scalar tmp = (sqrtT.row(i).segment(i+1,j-i-1) * sqrtT.col(j).segment(i+1,j-i-1)).value(); + sqrtT.coeffRef(i,j) = (T.coeff(i,j) - tmp) / (sqrtT.coeff(i,i) + sqrtT.coeff(j,j)); +} + +// similar to compute1x1offDiagonalBlock() +template +void matrix_sqrt_quasi_triangular_1x2_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT) +{ + typedef typename traits::Scalar Scalar; + Matrix rhs = T.template block<1,2>(i,j); + if (j-i > 1) + rhs -= sqrtT.block(i, i+1, 1, j-i-1) * sqrtT.block(i+1, j, j-i-1, 2); 
+ Matrix A = sqrtT.coeff(i,i) * Matrix::Identity(); + A += sqrtT.template block<2,2>(j,j).transpose(); + sqrtT.template block<1,2>(i,j).transpose() = A.fullPivLu().solve(rhs.transpose()); +} + +// similar to compute1x1offDiagonalBlock() +template +void matrix_sqrt_quasi_triangular_2x1_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT) +{ + typedef typename traits::Scalar Scalar; + Matrix rhs = T.template block<2,1>(i,j); + if (j-i > 2) + rhs -= sqrtT.block(i, i+2, 2, j-i-2) * sqrtT.block(i+2, j, j-i-2, 1); + Matrix A = sqrtT.coeff(j,j) * Matrix::Identity(); + A += sqrtT.template block<2,2>(i,i); + sqrtT.template block<2,1>(i,j) = A.fullPivLu().solve(rhs); +} + +// solves the equation A X + X B = C where all matrices are 2-by-2 +template +void matrix_sqrt_quasi_triangular_solve_auxiliary_equation(MatrixType& X, const MatrixType& A, const MatrixType& B, const MatrixType& C) +{ + typedef typename traits::Scalar Scalar; + Matrix coeffMatrix = Matrix::Zero(); + coeffMatrix.coeffRef(0,0) = A.coeff(0,0) + B.coeff(0,0); + coeffMatrix.coeffRef(1,1) = A.coeff(0,0) + B.coeff(1,1); + coeffMatrix.coeffRef(2,2) = A.coeff(1,1) + B.coeff(0,0); + coeffMatrix.coeffRef(3,3) = A.coeff(1,1) + B.coeff(1,1); + coeffMatrix.coeffRef(0,1) = B.coeff(1,0); + coeffMatrix.coeffRef(0,2) = A.coeff(0,1); + coeffMatrix.coeffRef(1,0) = B.coeff(0,1); + coeffMatrix.coeffRef(1,3) = A.coeff(0,1); + coeffMatrix.coeffRef(2,0) = A.coeff(1,0); + coeffMatrix.coeffRef(2,3) = B.coeff(1,0); + coeffMatrix.coeffRef(3,1) = A.coeff(1,0); + coeffMatrix.coeffRef(3,2) = B.coeff(0,1); + + Matrix rhs; + rhs.coeffRef(0) = C.coeff(0,0); + rhs.coeffRef(1) = C.coeff(0,1); + rhs.coeffRef(2) = C.coeff(1,0); + rhs.coeffRef(3) = C.coeff(1,1); + + Matrix result; + result = coeffMatrix.fullPivLu().solve(rhs); + + X.coeffRef(0,0) = result.coeff(0); + X.coeffRef(0,1) = result.coeff(1); + X.coeffRef(1,0) = result.coeff(2); + X.coeffRef(1,1) = result.coeff(3); +} + +// similar to compute1x1offDiagonalBlock() 
+template +void matrix_sqrt_quasi_triangular_2x2_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT) +{ + typedef typename traits::Scalar Scalar; + Matrix A = sqrtT.template block<2,2>(i,i); + Matrix B = sqrtT.template block<2,2>(j,j); + Matrix C = T.template block<2,2>(i,j); + if (j-i > 2) + C -= sqrtT.block(i, i+2, 2, j-i-2) * sqrtT.block(i+2, j, j-i-2, 2); + Matrix X; + matrix_sqrt_quasi_triangular_solve_auxiliary_equation(X, A, B, C); + sqrtT.template block<2,2>(i,j) = X; +} + +// pre: T is quasi-upper-triangular and sqrtT is a zero matrix of the same size +// post: the diagonal blocks of sqrtT are the square roots of the diagonal blocks of T +template +void matrix_sqrt_quasi_triangular_diagonal(const MatrixType& T, ResultType& sqrtT) +{ + using std::sqrt; + const Index size = T.rows(); + for (Index i = 0; i < size; i++) { + if (i == size - 1 || T.coeff(i+1, i) == 0) { + eigen_assert(T(i,i) >= 0); + sqrtT.coeffRef(i,i) = sqrt(T.coeff(i,i)); + } + else { + matrix_sqrt_quasi_triangular_2x2_diagonal_block(T, i, sqrtT); + ++i; + } + } +} + +// pre: T is quasi-upper-triangular and diagonal blocks of sqrtT are square root of diagonal blocks of T. +// post: sqrtT is the square root of T. 
+template +void matrix_sqrt_quasi_triangular_off_diagonal(const MatrixType& T, ResultType& sqrtT) +{ + const Index size = T.rows(); + for (Index j = 1; j < size; j++) { + if (T.coeff(j, j-1) != 0) // if T(j-1:j, j-1:j) is a 2-by-2 block + continue; + for (Index i = j-1; i >= 0; i--) { + if (i > 0 && T.coeff(i, i-1) != 0) // if T(i-1:i, i-1:i) is a 2-by-2 block + continue; + bool iBlockIs2x2 = (i < size - 1) && (T.coeff(i+1, i) != 0); + bool jBlockIs2x2 = (j < size - 1) && (T.coeff(j+1, j) != 0); + if (iBlockIs2x2 && jBlockIs2x2) + matrix_sqrt_quasi_triangular_2x2_off_diagonal_block(T, i, j, sqrtT); + else if (iBlockIs2x2 && !jBlockIs2x2) + matrix_sqrt_quasi_triangular_2x1_off_diagonal_block(T, i, j, sqrtT); + else if (!iBlockIs2x2 && jBlockIs2x2) + matrix_sqrt_quasi_triangular_1x2_off_diagonal_block(T, i, j, sqrtT); + else if (!iBlockIs2x2 && !jBlockIs2x2) + matrix_sqrt_quasi_triangular_1x1_off_diagonal_block(T, i, j, sqrtT); + } + } +} + +} // end of namespace internal + +/** \ingroup MatrixFunctions_Module + * \brief Compute matrix square root of quasi-triangular matrix. + * + * \tparam MatrixType type of \p arg, the argument of matrix square root, + * expected to be an instantiation of the Matrix class template. + * \tparam ResultType type of \p result, where result is to be stored. + * \param[in] arg argument of matrix square root. + * \param[out] result matrix square root of upper Hessenberg part of \p arg. + * + * This function computes the square root of the upper quasi-triangular matrix stored in the upper + * Hessenberg part of \p arg. Only the upper Hessenberg part of \p result is updated, the rest is + * not touched. See MatrixBase::sqrt() for details on how this computation is implemented. 
+ * + * \sa MatrixSquareRoot, MatrixSquareRootQuasiTriangular + */ +template +void matrix_sqrt_quasi_triangular(const MatrixType &arg, ResultType &result) +{ + eigen_assert(arg.rows() == arg.cols()); + result.resize(arg.rows(), arg.cols()); + internal::matrix_sqrt_quasi_triangular_diagonal(arg, result); + internal::matrix_sqrt_quasi_triangular_off_diagonal(arg, result); +} + + +/** \ingroup MatrixFunctions_Module + * \brief Compute matrix square root of triangular matrix. + * + * \tparam MatrixType type of \p arg, the argument of matrix square root, + * expected to be an instantiation of the Matrix class template. + * \tparam ResultType type of \p result, where result is to be stored. + * \param[in] arg argument of matrix square root. + * \param[out] result matrix square root of upper triangular part of \p arg. + * + * Only the upper triangular part (including the diagonal) of \p result is updated, the rest is not + * touched. See MatrixBase::sqrt() for details on how this computation is implemented. + * + * \sa MatrixSquareRoot, MatrixSquareRootQuasiTriangular + */ +template +void matrix_sqrt_triangular(const MatrixType &arg, ResultType &result) +{ + using std::sqrt; + typedef typename MatrixType::Scalar Scalar; + + eigen_assert(arg.rows() == arg.cols()); + + // Compute square root of arg and store it in upper triangular part of result + // This uses that the square root of triangular matrices can be computed directly. 
+ result.resize(arg.rows(), arg.cols()); + for (Index i = 0; i < arg.rows(); i++) { + result.coeffRef(i,i) = sqrt(arg.coeff(i,i)); + } + for (Index j = 1; j < arg.cols(); j++) { + for (Index i = j-1; i >= 0; i--) { + // if i = j-1, then segment has length 0 so tmp = 0 + Scalar tmp = (result.row(i).segment(i+1,j-i-1) * result.col(j).segment(i+1,j-i-1)).value(); + // denominator may be zero if original matrix is singular + result.coeffRef(i,j) = (arg.coeff(i,j) - tmp) / (result.coeff(i,i) + result.coeff(j,j)); + } + } +} + + +namespace internal { + +/** \ingroup MatrixFunctions_Module + * \brief Helper struct for computing matrix square roots of general matrices. + * \tparam MatrixType type of the argument of the matrix square root, + * expected to be an instantiation of the Matrix class template. + * + * \sa MatrixSquareRootTriangular, MatrixSquareRootQuasiTriangular, MatrixBase::sqrt() + */ +template ::Scalar>::IsComplex> +struct matrix_sqrt_compute +{ + /** \brief Compute the matrix square root + * + * \param[in] arg matrix whose square root is to be computed. + * \param[out] result square root of \p arg. + * + * See MatrixBase::sqrt() for details on how this computation is implemented. 
+ */ + template static void run(const MatrixType &arg, ResultType &result); +}; + + +// ********** Partial specialization for real matrices ********** + +template +struct matrix_sqrt_compute +{ + typedef typename MatrixType::PlainObject PlainType; + template + static void run(const MatrixType &arg, ResultType &result) + { + eigen_assert(arg.rows() == arg.cols()); + + // Compute Schur decomposition of arg + const RealSchur schurOfA(arg); + const PlainType& T = schurOfA.matrixT(); + const PlainType& U = schurOfA.matrixU(); + + // Compute square root of T + PlainType sqrtT = PlainType::Zero(arg.rows(), arg.cols()); + matrix_sqrt_quasi_triangular(T, sqrtT); + + // Compute square root of arg + result = U * sqrtT * U.adjoint(); + } +}; + + +// ********** Partial specialization for complex matrices ********** + +template +struct matrix_sqrt_compute +{ + typedef typename MatrixType::PlainObject PlainType; + template + static void run(const MatrixType &arg, ResultType &result) + { + eigen_assert(arg.rows() == arg.cols()); + + // Compute Schur decomposition of arg + const ComplexSchur schurOfA(arg); + const PlainType& T = schurOfA.matrixT(); + const PlainType& U = schurOfA.matrixU(); + + // Compute square root of T + PlainType sqrtT; + matrix_sqrt_triangular(T, sqrtT); + + // Compute square root of arg + result = U * (sqrtT.template triangularView() * U.adjoint()); + } +}; + +} // end namespace internal + +/** \ingroup MatrixFunctions_Module + * + * \brief Proxy for the matrix square root of some matrix (expression). + * + * \tparam Derived Type of the argument to the matrix square root. + * + * This class holds the argument to the matrix square root until it + * is assigned or evaluated for some other reason (so the argument + * should not be changed in the meantime). It is the return type of + * MatrixBase::sqrt() and most of the time this is the only way it is + * used. 
+ */ +template class MatrixSquareRootReturnValue +: public ReturnByValue > +{ + protected: + typedef typename internal::ref_selector::type DerivedNested; + + public: + /** \brief Constructor. + * + * \param[in] src %Matrix (expression) forming the argument of the + * matrix square root. + */ + explicit MatrixSquareRootReturnValue(const Derived& src) : m_src(src) { } + + /** \brief Compute the matrix square root. + * + * \param[out] result the matrix square root of \p src in the + * constructor. + */ + template + inline void evalTo(ResultType& result) const + { + typedef typename internal::nested_eval::type DerivedEvalType; + typedef typename internal::remove_all::type DerivedEvalTypeClean; + DerivedEvalType tmp(m_src); + internal::matrix_sqrt_compute::run(tmp, result); + } + + Index rows() const { return m_src.rows(); } + Index cols() const { return m_src.cols(); } + + protected: + const DerivedNested m_src; +}; + +namespace internal { +template +struct traits > +{ + typedef typename Derived::PlainObject ReturnType; +}; +} + +template +const MatrixSquareRootReturnValue MatrixBase::sqrt() const +{ + eigen_assert(rows() == cols()); + return MatrixSquareRootReturnValue(derived()); +} + +} // end namespace Eigen + +#endif // EIGEN_MATRIX_FUNCTION diff --git a/src/EigenUnsupported/src/MatrixFunctions/StemFunction.h b/src/EigenUnsupported/src/MatrixFunctions/StemFunction.h new file mode 100644 index 0000000..7604df9 --- /dev/null +++ b/src/EigenUnsupported/src/MatrixFunctions/StemFunction.h @@ -0,0 +1,117 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2010, 2013 Jitse Niesen +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_STEM_FUNCTION +#define EIGEN_STEM_FUNCTION + +namespace Eigen { + +namespace internal { + +/** \brief The exponential function (and its derivatives). */ +template +Scalar stem_function_exp(Scalar x, int) +{ + using std::exp; + return exp(x); +} + +/** \brief Cosine (and its derivatives). */ +template +Scalar stem_function_cos(Scalar x, int n) +{ + using std::cos; + using std::sin; + Scalar res; + + switch (n % 4) { + case 0: + res = std::cos(x); + break; + case 1: + res = -std::sin(x); + break; + case 2: + res = -std::cos(x); + break; + case 3: + res = std::sin(x); + break; + } + return res; +} + +/** \brief Sine (and its derivatives). */ +template +Scalar stem_function_sin(Scalar x, int n) +{ + using std::cos; + using std::sin; + Scalar res; + + switch (n % 4) { + case 0: + res = std::sin(x); + break; + case 1: + res = std::cos(x); + break; + case 2: + res = -std::sin(x); + break; + case 3: + res = -std::cos(x); + break; + } + return res; +} + +/** \brief Hyperbolic cosine (and its derivatives). */ +template +Scalar stem_function_cosh(Scalar x, int n) +{ + using std::cosh; + using std::sinh; + Scalar res; + + switch (n % 2) { + case 0: + res = std::cosh(x); + break; + case 1: + res = std::sinh(x); + break; + } + return res; +} + +/** \brief Hyperbolic sine (and its derivatives). 
*/ +template +Scalar stem_function_sinh(Scalar x, int n) +{ + using std::cosh; + using std::sinh; + Scalar res; + + switch (n % 2) { + case 0: + res = std::sinh(x); + break; + case 1: + res = std::cosh(x); + break; + } + return res; +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_STEM_FUNCTION diff --git a/src/EigenUnsupported/src/MoreVectorization/MathFunctions.h b/src/EigenUnsupported/src/MoreVectorization/MathFunctions.h new file mode 100644 index 0000000..63cb28d --- /dev/null +++ b/src/EigenUnsupported/src/MoreVectorization/MathFunctions.h @@ -0,0 +1,95 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009 Rohit Garg +// Copyright (C) 2009 Benoit Jacob +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_MOREVECTORIZATION_MATHFUNCTIONS_H +#define EIGEN_MOREVECTORIZATION_MATHFUNCTIONS_H + +namespace Eigen { + +namespace internal { + +/** \internal \returns the arcsin of \a a (coeff-wise) */ +template inline static Packet pasin(Packet a) { return std::asin(a); } + +#ifdef EIGEN_VECTORIZE_SSE + +template<> EIGEN_DONT_INLINE Packet4f pasin(Packet4f x) +{ + _EIGEN_DECLARE_CONST_Packet4f(half, 0.5); + _EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5); + _EIGEN_DECLARE_CONST_Packet4f(3half, 1.5); + + _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000); + + _EIGEN_DECLARE_CONST_Packet4f(pi, 3.141592654); + _EIGEN_DECLARE_CONST_Packet4f(pi_over_2, 3.141592654*0.5); + + _EIGEN_DECLARE_CONST_Packet4f(asin1, 4.2163199048E-2); + _EIGEN_DECLARE_CONST_Packet4f(asin2, 2.4181311049E-2); + _EIGEN_DECLARE_CONST_Packet4f(asin3, 4.5470025998E-2); + _EIGEN_DECLARE_CONST_Packet4f(asin4, 7.4953002686E-2); + _EIGEN_DECLARE_CONST_Packet4f(asin5, 1.6666752422E-1); + + Packet4f a = pabs(x);//got the absolute 
value + + Packet4f sign_bit= _mm_and_ps(x, p4f_sign_mask);//extracted the sign bit + + Packet4f z1,z2;//will need them during computation + + +//will compute the two branches for asin +//so first compare with half + + Packet4f branch_mask= _mm_cmpgt_ps(a, p4f_half);//this is to select which branch to take +//both will be taken, and finally results will be merged +//the branch for values >0.5 + + { +//the core series expansion + z1=pmadd(p4f_minus_half,a,p4f_half); + Packet4f x1=psqrt(z1); + Packet4f s1=pmadd(p4f_asin1, z1, p4f_asin2); + Packet4f s2=pmadd(s1, z1, p4f_asin3); + Packet4f s3=pmadd(s2,z1, p4f_asin4); + Packet4f s4=pmadd(s3,z1, p4f_asin5); + Packet4f temp=pmul(s4,z1);//not really a madd but a mul by z so that the next term can be a madd + z1=pmadd(temp,x1,x1); + z1=padd(z1,z1); + z1=psub(p4f_pi_over_2,z1); + } + + { +//the core series expansion + Packet4f x2=a; + z2=pmul(x2,x2); + Packet4f s1=pmadd(p4f_asin1, z2, p4f_asin2); + Packet4f s2=pmadd(s1, z2, p4f_asin3); + Packet4f s3=pmadd(s2,z2, p4f_asin4); + Packet4f s4=pmadd(s3,z2, p4f_asin5); + Packet4f temp=pmul(s4,z2);//not really a madd but a mul by z so that the next term can be a madd + z2=pmadd(temp,x2,x2); + } + +/* select the correct result from the two branch evaluations */ + z1 = _mm_and_ps(branch_mask, z1); + z2 = _mm_andnot_ps(branch_mask, z2); + Packet4f z = _mm_or_ps(z1,z2); + +/* update the sign */ + return _mm_xor_ps(z, sign_bit); +} + +#endif // EIGEN_VECTORIZE_SSE + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_MOREVECTORIZATION_MATHFUNCTIONS_H diff --git a/src/EigenUnsupported/src/NonLinearOptimization/HybridNonLinearSolver.h b/src/EigenUnsupported/src/NonLinearOptimization/HybridNonLinearSolver.h new file mode 100644 index 0000000..07c5ef0 --- /dev/null +++ b/src/EigenUnsupported/src/NonLinearOptimization/HybridNonLinearSolver.h @@ -0,0 +1,601 @@ +// -*- coding: utf-8 +// vim: set fileencoding=utf-8 + +// This file is part of Eigen, a lightweight C++ 
template library +// for linear algebra. +// +// Copyright (C) 2009 Thomas Capricelli +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_HYBRIDNONLINEARSOLVER_H +#define EIGEN_HYBRIDNONLINEARSOLVER_H + +namespace Eigen { + +namespace HybridNonLinearSolverSpace { + enum Status { + Running = -1, + ImproperInputParameters = 0, + RelativeErrorTooSmall = 1, + TooManyFunctionEvaluation = 2, + TolTooSmall = 3, + NotMakingProgressJacobian = 4, + NotMakingProgressIterations = 5, + UserAsked = 6 + }; +} + +/** + * \ingroup NonLinearOptimization_Module + * \brief Finds a zero of a system of n + * nonlinear functions in n variables by a modification of the Powell + * hybrid method ("dogleg"). + * + * The user must provide a subroutine which calculates the + * functions. The Jacobian is either provided by the user, or approximated + * using a forward-difference method. 
+ * + */ +template +class HybridNonLinearSolver +{ +public: + typedef DenseIndex Index; + + HybridNonLinearSolver(FunctorType &_functor) + : functor(_functor) { nfev=njev=iter = 0; fnorm= 0.; useExternalScaling=false;} + + struct Parameters { + Parameters() + : factor(Scalar(100.)) + , maxfev(1000) + , xtol(numext::sqrt(NumTraits::epsilon())) + , nb_of_subdiagonals(-1) + , nb_of_superdiagonals(-1) + , epsfcn(Scalar(0.)) {} + Scalar factor; + Index maxfev; // maximum number of function evaluation + Scalar xtol; + Index nb_of_subdiagonals; + Index nb_of_superdiagonals; + Scalar epsfcn; + }; + typedef Matrix< Scalar, Dynamic, 1 > FVectorType; + typedef Matrix< Scalar, Dynamic, Dynamic > JacobianType; + /* TODO: if eigen provides a triangular storage, use it here */ + typedef Matrix< Scalar, Dynamic, Dynamic > UpperTriangularType; + + HybridNonLinearSolverSpace::Status hybrj1( + FVectorType &x, + const Scalar tol = numext::sqrt(NumTraits::epsilon()) + ); + + HybridNonLinearSolverSpace::Status solveInit(FVectorType &x); + HybridNonLinearSolverSpace::Status solveOneStep(FVectorType &x); + HybridNonLinearSolverSpace::Status solve(FVectorType &x); + + HybridNonLinearSolverSpace::Status hybrd1( + FVectorType &x, + const Scalar tol = numext::sqrt(NumTraits::epsilon()) + ); + + HybridNonLinearSolverSpace::Status solveNumericalDiffInit(FVectorType &x); + HybridNonLinearSolverSpace::Status solveNumericalDiffOneStep(FVectorType &x); + HybridNonLinearSolverSpace::Status solveNumericalDiff(FVectorType &x); + + void resetParameters(void) { parameters = Parameters(); } + Parameters parameters; + FVectorType fvec, qtf, diag; + JacobianType fjac; + UpperTriangularType R; + Index nfev; + Index njev; + Index iter; + Scalar fnorm; + bool useExternalScaling; +private: + FunctorType &functor; + Index n; + Scalar sum; + bool sing; + Scalar temp; + Scalar delta; + bool jeval; + Index ncsuc; + Scalar ratio; + Scalar pnorm, xnorm, fnorm1; + Index nslow1, nslow2; + Index ncfail; + Scalar 
actred, prered; + FVectorType wa1, wa2, wa3, wa4; + + HybridNonLinearSolver& operator=(const HybridNonLinearSolver&); +}; + + + +template +HybridNonLinearSolverSpace::Status +HybridNonLinearSolver::hybrj1( + FVectorType &x, + const Scalar tol + ) +{ + n = x.size(); + + /* check the input parameters for errors. */ + if (n <= 0 || tol < 0.) + return HybridNonLinearSolverSpace::ImproperInputParameters; + + resetParameters(); + parameters.maxfev = 100*(n+1); + parameters.xtol = tol; + diag.setConstant(n, 1.); + useExternalScaling = true; + return solve(x); +} + +template +HybridNonLinearSolverSpace::Status +HybridNonLinearSolver::solveInit(FVectorType &x) +{ + n = x.size(); + + wa1.resize(n); wa2.resize(n); wa3.resize(n); wa4.resize(n); + fvec.resize(n); + qtf.resize(n); + fjac.resize(n, n); + if (!useExternalScaling) + diag.resize(n); + eigen_assert( (!useExternalScaling || diag.size()==n) && "When useExternalScaling is set, the caller must provide a valid 'diag'"); + + /* Function Body */ + nfev = 0; + njev = 0; + + /* check the input parameters for errors. */ + if (n <= 0 || parameters.xtol < 0. || parameters.maxfev <= 0 || parameters.factor <= 0. ) + return HybridNonLinearSolverSpace::ImproperInputParameters; + if (useExternalScaling) + for (Index j = 0; j < n; ++j) + if (diag[j] <= 0.) + return HybridNonLinearSolverSpace::ImproperInputParameters; + + /* evaluate the function at the starting point */ + /* and calculate its norm. */ + nfev = 1; + if ( functor(x, fvec) < 0) + return HybridNonLinearSolverSpace::UserAsked; + fnorm = fvec.stableNorm(); + + /* initialize iteration counter and monitors. 
*/ + iter = 1; + ncsuc = 0; + ncfail = 0; + nslow1 = 0; + nslow2 = 0; + + return HybridNonLinearSolverSpace::Running; +} + +template +HybridNonLinearSolverSpace::Status +HybridNonLinearSolver::solveOneStep(FVectorType &x) +{ + using std::abs; + + eigen_assert(x.size()==n); // check the caller is not cheating us + + Index j; + std::vector > v_givens(n), w_givens(n); + + jeval = true; + + /* calculate the jacobian matrix. */ + if ( functor.df(x, fjac) < 0) + return HybridNonLinearSolverSpace::UserAsked; + ++njev; + + wa2 = fjac.colwise().blueNorm(); + + /* on the first iteration and if external scaling is not used, scale according */ + /* to the norms of the columns of the initial jacobian. */ + if (iter == 1) { + if (!useExternalScaling) + for (j = 0; j < n; ++j) + diag[j] = (wa2[j]==0.) ? 1. : wa2[j]; + + /* on the first iteration, calculate the norm of the scaled x */ + /* and initialize the step bound delta. */ + xnorm = diag.cwiseProduct(x).stableNorm(); + delta = parameters.factor * xnorm; + if (delta == 0.) + delta = parameters.factor; + } + + /* compute the qr factorization of the jacobian. */ + HouseholderQR qrfac(fjac); // no pivoting: + + /* copy the triangular factor of the qr factorization into r. */ + R = qrfac.matrixQR(); + + /* accumulate the orthogonal factor in fjac. */ + fjac = qrfac.householderQ(); + + /* form (q transpose)*fvec and store in qtf. */ + qtf = fjac.transpose() * fvec; + + /* rescale if necessary. */ + if (!useExternalScaling) + diag = diag.cwiseMax(wa2); + + while (true) { + /* determine the direction p. */ + internal::dogleg(R, diag, qtf, delta, wa1); + + /* store the direction p and x + p. calculate the norm of p. */ + wa1 = -wa1; + wa2 = x + wa1; + pnorm = diag.cwiseProduct(wa1).stableNorm(); + + /* on the first iteration, adjust the initial step bound. */ + if (iter == 1) + delta = (std::min)(delta,pnorm); + + /* evaluate the function at x + p and calculate its norm. 
*/ + if ( functor(wa2, wa4) < 0) + return HybridNonLinearSolverSpace::UserAsked; + ++nfev; + fnorm1 = wa4.stableNorm(); + + /* compute the scaled actual reduction. */ + actred = -1.; + if (fnorm1 < fnorm) /* Computing 2nd power */ + actred = 1. - numext::abs2(fnorm1 / fnorm); + + /* compute the scaled predicted reduction. */ + wa3 = R.template triangularView()*wa1 + qtf; + temp = wa3.stableNorm(); + prered = 0.; + if (temp < fnorm) /* Computing 2nd power */ + prered = 1. - numext::abs2(temp / fnorm); + + /* compute the ratio of the actual to the predicted reduction. */ + ratio = 0.; + if (prered > 0.) + ratio = actred / prered; + + /* update the step bound. */ + if (ratio < Scalar(.1)) { + ncsuc = 0; + ++ncfail; + delta = Scalar(.5) * delta; + } else { + ncfail = 0; + ++ncsuc; + if (ratio >= Scalar(.5) || ncsuc > 1) + delta = (std::max)(delta, pnorm / Scalar(.5)); + if (abs(ratio - 1.) <= Scalar(.1)) { + delta = pnorm / Scalar(.5); + } + } + + /* test for successful iteration. */ + if (ratio >= Scalar(1e-4)) { + /* successful iteration. update x, fvec, and their norms. */ + x = wa2; + wa2 = diag.cwiseProduct(x); + fvec = wa4; + xnorm = wa2.stableNorm(); + fnorm = fnorm1; + ++iter; + } + + /* determine the progress of the iteration. */ + ++nslow1; + if (actred >= Scalar(.001)) + nslow1 = 0; + if (jeval) + ++nslow2; + if (actred >= Scalar(.1)) + nslow2 = 0; + + /* test for convergence. */ + if (delta <= parameters.xtol * xnorm || fnorm == 0.) + return HybridNonLinearSolverSpace::RelativeErrorTooSmall; + + /* tests for termination and stringent tolerances. 
*/ + if (nfev >= parameters.maxfev) + return HybridNonLinearSolverSpace::TooManyFunctionEvaluation; + if (Scalar(.1) * (std::max)(Scalar(.1) * delta, pnorm) <= NumTraits::epsilon() * xnorm) + return HybridNonLinearSolverSpace::TolTooSmall; + if (nslow2 == 5) + return HybridNonLinearSolverSpace::NotMakingProgressJacobian; + if (nslow1 == 10) + return HybridNonLinearSolverSpace::NotMakingProgressIterations; + + /* criterion for recalculating jacobian. */ + if (ncfail == 2) + break; // leave inner loop and go for the next outer loop iteration + + /* calculate the rank one modification to the jacobian */ + /* and update qtf if necessary. */ + wa1 = diag.cwiseProduct( diag.cwiseProduct(wa1)/pnorm ); + wa2 = fjac.transpose() * wa4; + if (ratio >= Scalar(1e-4)) + qtf = wa2; + wa2 = (wa2-wa3)/pnorm; + + /* compute the qr factorization of the updated jacobian. */ + internal::r1updt(R, wa1, v_givens, w_givens, wa2, wa3, &sing); + internal::r1mpyq(n, n, fjac.data(), v_givens, w_givens); + internal::r1mpyq(1, n, qtf.data(), v_givens, w_givens); + + jeval = false; + } + return HybridNonLinearSolverSpace::Running; +} + +template +HybridNonLinearSolverSpace::Status +HybridNonLinearSolver::solve(FVectorType &x) +{ + HybridNonLinearSolverSpace::Status status = solveInit(x); + if (status==HybridNonLinearSolverSpace::ImproperInputParameters) + return status; + while (status==HybridNonLinearSolverSpace::Running) + status = solveOneStep(x); + return status; +} + + + +template +HybridNonLinearSolverSpace::Status +HybridNonLinearSolver::hybrd1( + FVectorType &x, + const Scalar tol + ) +{ + n = x.size(); + + /* check the input parameters for errors. */ + if (n <= 0 || tol < 0.) 
+ return HybridNonLinearSolverSpace::ImproperInputParameters; + + resetParameters(); + parameters.maxfev = 200*(n+1); + parameters.xtol = tol; + + diag.setConstant(n, 1.); + useExternalScaling = true; + return solveNumericalDiff(x); +} + +template +HybridNonLinearSolverSpace::Status +HybridNonLinearSolver::solveNumericalDiffInit(FVectorType &x) +{ + n = x.size(); + + if (parameters.nb_of_subdiagonals<0) parameters.nb_of_subdiagonals= n-1; + if (parameters.nb_of_superdiagonals<0) parameters.nb_of_superdiagonals= n-1; + + wa1.resize(n); wa2.resize(n); wa3.resize(n); wa4.resize(n); + qtf.resize(n); + fjac.resize(n, n); + fvec.resize(n); + if (!useExternalScaling) + diag.resize(n); + eigen_assert( (!useExternalScaling || diag.size()==n) && "When useExternalScaling is set, the caller must provide a valid 'diag'"); + + /* Function Body */ + nfev = 0; + njev = 0; + + /* check the input parameters for errors. */ + if (n <= 0 || parameters.xtol < 0. || parameters.maxfev <= 0 || parameters.nb_of_subdiagonals< 0 || parameters.nb_of_superdiagonals< 0 || parameters.factor <= 0. ) + return HybridNonLinearSolverSpace::ImproperInputParameters; + if (useExternalScaling) + for (Index j = 0; j < n; ++j) + if (diag[j] <= 0.) + return HybridNonLinearSolverSpace::ImproperInputParameters; + + /* evaluate the function at the starting point */ + /* and calculate its norm. */ + nfev = 1; + if ( functor(x, fvec) < 0) + return HybridNonLinearSolverSpace::UserAsked; + fnorm = fvec.stableNorm(); + + /* initialize iteration counter and monitors. 
*/ + iter = 1; + ncsuc = 0; + ncfail = 0; + nslow1 = 0; + nslow2 = 0; + + return HybridNonLinearSolverSpace::Running; +} + +template +HybridNonLinearSolverSpace::Status +HybridNonLinearSolver::solveNumericalDiffOneStep(FVectorType &x) +{ + using std::sqrt; + using std::abs; + + assert(x.size()==n); // check the caller is not cheating us + + Index j; + std::vector > v_givens(n), w_givens(n); + + jeval = true; + if (parameters.nb_of_subdiagonals<0) parameters.nb_of_subdiagonals= n-1; + if (parameters.nb_of_superdiagonals<0) parameters.nb_of_superdiagonals= n-1; + + /* calculate the jacobian matrix. */ + if (internal::fdjac1(functor, x, fvec, fjac, parameters.nb_of_subdiagonals, parameters.nb_of_superdiagonals, parameters.epsfcn) <0) + return HybridNonLinearSolverSpace::UserAsked; + nfev += (std::min)(parameters.nb_of_subdiagonals+parameters.nb_of_superdiagonals+ 1, n); + + wa2 = fjac.colwise().blueNorm(); + + /* on the first iteration and if external scaling is not used, scale according */ + /* to the norms of the columns of the initial jacobian. */ + if (iter == 1) { + if (!useExternalScaling) + for (j = 0; j < n; ++j) + diag[j] = (wa2[j]==0.) ? 1. : wa2[j]; + + /* on the first iteration, calculate the norm of the scaled x */ + /* and initialize the step bound delta. */ + xnorm = diag.cwiseProduct(x).stableNorm(); + delta = parameters.factor * xnorm; + if (delta == 0.) + delta = parameters.factor; + } + + /* compute the qr factorization of the jacobian. */ + HouseholderQR qrfac(fjac); // no pivoting: + + /* copy the triangular factor of the qr factorization into r. */ + R = qrfac.matrixQR(); + + /* accumulate the orthogonal factor in fjac. */ + fjac = qrfac.householderQ(); + + /* form (q transpose)*fvec and store in qtf. */ + qtf = fjac.transpose() * fvec; + + /* rescale if necessary. */ + if (!useExternalScaling) + diag = diag.cwiseMax(wa2); + + while (true) { + /* determine the direction p. 
*/ + internal::dogleg(R, diag, qtf, delta, wa1); + + /* store the direction p and x + p. calculate the norm of p. */ + wa1 = -wa1; + wa2 = x + wa1; + pnorm = diag.cwiseProduct(wa1).stableNorm(); + + /* on the first iteration, adjust the initial step bound. */ + if (iter == 1) + delta = (std::min)(delta,pnorm); + + /* evaluate the function at x + p and calculate its norm. */ + if ( functor(wa2, wa4) < 0) + return HybridNonLinearSolverSpace::UserAsked; + ++nfev; + fnorm1 = wa4.stableNorm(); + + /* compute the scaled actual reduction. */ + actred = -1.; + if (fnorm1 < fnorm) /* Computing 2nd power */ + actred = 1. - numext::abs2(fnorm1 / fnorm); + + /* compute the scaled predicted reduction. */ + wa3 = R.template triangularView()*wa1 + qtf; + temp = wa3.stableNorm(); + prered = 0.; + if (temp < fnorm) /* Computing 2nd power */ + prered = 1. - numext::abs2(temp / fnorm); + + /* compute the ratio of the actual to the predicted reduction. */ + ratio = 0.; + if (prered > 0.) + ratio = actred / prered; + + /* update the step bound. */ + if (ratio < Scalar(.1)) { + ncsuc = 0; + ++ncfail; + delta = Scalar(.5) * delta; + } else { + ncfail = 0; + ++ncsuc; + if (ratio >= Scalar(.5) || ncsuc > 1) + delta = (std::max)(delta, pnorm / Scalar(.5)); + if (abs(ratio - 1.) <= Scalar(.1)) { + delta = pnorm / Scalar(.5); + } + } + + /* test for successful iteration. */ + if (ratio >= Scalar(1e-4)) { + /* successful iteration. update x, fvec, and their norms. */ + x = wa2; + wa2 = diag.cwiseProduct(x); + fvec = wa4; + xnorm = wa2.stableNorm(); + fnorm = fnorm1; + ++iter; + } + + /* determine the progress of the iteration. */ + ++nslow1; + if (actred >= Scalar(.001)) + nslow1 = 0; + if (jeval) + ++nslow2; + if (actred >= Scalar(.1)) + nslow2 = 0; + + /* test for convergence. */ + if (delta <= parameters.xtol * xnorm || fnorm == 0.) + return HybridNonLinearSolverSpace::RelativeErrorTooSmall; + + /* tests for termination and stringent tolerances. 
*/ + if (nfev >= parameters.maxfev) + return HybridNonLinearSolverSpace::TooManyFunctionEvaluation; + if (Scalar(.1) * (std::max)(Scalar(.1) * delta, pnorm) <= NumTraits::epsilon() * xnorm) + return HybridNonLinearSolverSpace::TolTooSmall; + if (nslow2 == 5) + return HybridNonLinearSolverSpace::NotMakingProgressJacobian; + if (nslow1 == 10) + return HybridNonLinearSolverSpace::NotMakingProgressIterations; + + /* criterion for recalculating jacobian. */ + if (ncfail == 2) + break; // leave inner loop and go for the next outer loop iteration + + /* calculate the rank one modification to the jacobian */ + /* and update qtf if necessary. */ + wa1 = diag.cwiseProduct( diag.cwiseProduct(wa1)/pnorm ); + wa2 = fjac.transpose() * wa4; + if (ratio >= Scalar(1e-4)) + qtf = wa2; + wa2 = (wa2-wa3)/pnorm; + + /* compute the qr factorization of the updated jacobian. */ + internal::r1updt(R, wa1, v_givens, w_givens, wa2, wa3, &sing); + internal::r1mpyq(n, n, fjac.data(), v_givens, w_givens); + internal::r1mpyq(1, n, qtf.data(), v_givens, w_givens); + + jeval = false; + } + return HybridNonLinearSolverSpace::Running; +} + +template +HybridNonLinearSolverSpace::Status +HybridNonLinearSolver::solveNumericalDiff(FVectorType &x) +{ + HybridNonLinearSolverSpace::Status status = solveNumericalDiffInit(x); + if (status==HybridNonLinearSolverSpace::ImproperInputParameters) + return status; + while (status==HybridNonLinearSolverSpace::Running) + status = solveNumericalDiffOneStep(x); + return status; +} + +} // end namespace Eigen + +#endif // EIGEN_HYBRIDNONLINEARSOLVER_H + +//vim: ai ts=4 sts=4 et sw=4 diff --git a/src/EigenUnsupported/src/NonLinearOptimization/LevenbergMarquardt.h b/src/EigenUnsupported/src/NonLinearOptimization/LevenbergMarquardt.h new file mode 100644 index 0000000..fe3b79c --- /dev/null +++ b/src/EigenUnsupported/src/NonLinearOptimization/LevenbergMarquardt.h @@ -0,0 +1,657 @@ +// -*- coding: utf-8 +// vim: set fileencoding=utf-8 + +// This file is part of Eigen, a 
lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009 Thomas Capricelli +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_LEVENBERGMARQUARDT__H +#define EIGEN_LEVENBERGMARQUARDT__H + +namespace Eigen { + +namespace LevenbergMarquardtSpace { + enum Status { + NotStarted = -2, + Running = -1, + ImproperInputParameters = 0, + RelativeReductionTooSmall = 1, + RelativeErrorTooSmall = 2, + RelativeErrorAndReductionTooSmall = 3, + CosinusTooSmall = 4, + TooManyFunctionEvaluation = 5, + FtolTooSmall = 6, + XtolTooSmall = 7, + GtolTooSmall = 8, + UserAsked = 9 + }; +} + + + +/** + * \ingroup NonLinearOptimization_Module + * \brief Performs non linear optimization over a non-linear function, + * using a variant of the Levenberg Marquardt algorithm. + * + * Check wikipedia for more information. + * http://en.wikipedia.org/wiki/Levenberg%E2%80%93Marquardt_algorithm + */ +template +class LevenbergMarquardt +{ + static Scalar sqrt_epsilon() + { + using std::sqrt; + return sqrt(NumTraits::epsilon()); + } + +public: + LevenbergMarquardt(FunctorType &_functor) + : functor(_functor) { nfev = njev = iter = 0; fnorm = gnorm = 0.; useExternalScaling=false; } + + typedef DenseIndex Index; + + struct Parameters { + Parameters() + : factor(Scalar(100.)) + , maxfev(400) + , ftol(sqrt_epsilon()) + , xtol(sqrt_epsilon()) + , gtol(Scalar(0.)) + , epsfcn(Scalar(0.)) {} + Scalar factor; + Index maxfev; // maximum number of function evaluation + Scalar ftol; + Scalar xtol; + Scalar gtol; + Scalar epsfcn; + }; + + typedef Matrix< Scalar, Dynamic, 1 > FVectorType; + typedef Matrix< Scalar, Dynamic, Dynamic > JacobianType; + + LevenbergMarquardtSpace::Status lmder1( + FVectorType &x, + const Scalar tol = sqrt_epsilon() + ); + + LevenbergMarquardtSpace::Status minimize(FVectorType &x); + 
LevenbergMarquardtSpace::Status minimizeInit(FVectorType &x); + LevenbergMarquardtSpace::Status minimizeOneStep(FVectorType &x); + + static LevenbergMarquardtSpace::Status lmdif1( + FunctorType &functor, + FVectorType &x, + Index *nfev, + const Scalar tol = sqrt_epsilon() + ); + + LevenbergMarquardtSpace::Status lmstr1( + FVectorType &x, + const Scalar tol = sqrt_epsilon() + ); + + LevenbergMarquardtSpace::Status minimizeOptimumStorage(FVectorType &x); + LevenbergMarquardtSpace::Status minimizeOptimumStorageInit(FVectorType &x); + LevenbergMarquardtSpace::Status minimizeOptimumStorageOneStep(FVectorType &x); + + void resetParameters(void) { parameters = Parameters(); } + + Parameters parameters; + FVectorType fvec, qtf, diag; + JacobianType fjac; + PermutationMatrix permutation; + Index nfev; + Index njev; + Index iter; + Scalar fnorm, gnorm; + bool useExternalScaling; + + Scalar lm_param(void) { return par; } +private: + + FunctorType &functor; + Index n; + Index m; + FVectorType wa1, wa2, wa3, wa4; + + Scalar par, sum; + Scalar temp, temp1, temp2; + Scalar delta; + Scalar ratio; + Scalar pnorm, xnorm, fnorm1, actred, dirder, prered; + + LevenbergMarquardt& operator=(const LevenbergMarquardt&); +}; + +template +LevenbergMarquardtSpace::Status +LevenbergMarquardt::lmder1( + FVectorType &x, + const Scalar tol + ) +{ + n = x.size(); + m = functor.values(); + + /* check the input parameters for errors. */ + if (n <= 0 || m < n || tol < 0.) 
+ return LevenbergMarquardtSpace::ImproperInputParameters; + + resetParameters(); + parameters.ftol = tol; + parameters.xtol = tol; + parameters.maxfev = 100*(n+1); + + return minimize(x); +} + + +template +LevenbergMarquardtSpace::Status +LevenbergMarquardt::minimize(FVectorType &x) +{ + LevenbergMarquardtSpace::Status status = minimizeInit(x); + if (status==LevenbergMarquardtSpace::ImproperInputParameters) + return status; + do { + status = minimizeOneStep(x); + } while (status==LevenbergMarquardtSpace::Running); + return status; +} + +template +LevenbergMarquardtSpace::Status +LevenbergMarquardt::minimizeInit(FVectorType &x) +{ + n = x.size(); + m = functor.values(); + + wa1.resize(n); wa2.resize(n); wa3.resize(n); + wa4.resize(m); + fvec.resize(m); + fjac.resize(m, n); + if (!useExternalScaling) + diag.resize(n); + eigen_assert( (!useExternalScaling || diag.size()==n) && "When useExternalScaling is set, the caller must provide a valid 'diag'"); + qtf.resize(n); + + /* Function Body */ + nfev = 0; + njev = 0; + + /* check the input parameters for errors. */ + if (n <= 0 || m < n || parameters.ftol < 0. || parameters.xtol < 0. || parameters.gtol < 0. || parameters.maxfev <= 0 || parameters.factor <= 0.) + return LevenbergMarquardtSpace::ImproperInputParameters; + + if (useExternalScaling) + for (Index j = 0; j < n; ++j) + if (diag[j] <= 0.) + return LevenbergMarquardtSpace::ImproperInputParameters; + + /* evaluate the function at the starting point */ + /* and calculate its norm. */ + nfev = 1; + if ( functor(x, fvec) < 0) + return LevenbergMarquardtSpace::UserAsked; + fnorm = fvec.stableNorm(); + + /* initialize levenberg-marquardt parameter and iteration counter. 
*/ + par = 0.; + iter = 1; + + return LevenbergMarquardtSpace::NotStarted; +} + +template +LevenbergMarquardtSpace::Status +LevenbergMarquardt::minimizeOneStep(FVectorType &x) +{ + using std::abs; + using std::sqrt; + + eigen_assert(x.size()==n); // check the caller is not cheating us + + /* calculate the jacobian matrix. */ + Index df_ret = functor.df(x, fjac); + if (df_ret<0) + return LevenbergMarquardtSpace::UserAsked; + if (df_ret>0) + // numerical diff, we evaluated the function df_ret times + nfev += df_ret; + else njev++; + + /* compute the qr factorization of the jacobian. */ + wa2 = fjac.colwise().blueNorm(); + ColPivHouseholderQR qrfac(fjac); + fjac = qrfac.matrixQR(); + permutation = qrfac.colsPermutation(); + + /* on the first iteration and if external scaling is not used, scale according */ + /* to the norms of the columns of the initial jacobian. */ + if (iter == 1) { + if (!useExternalScaling) + for (Index j = 0; j < n; ++j) + diag[j] = (wa2[j]==0.)? 1. : wa2[j]; + + /* on the first iteration, calculate the norm of the scaled x */ + /* and initialize the step bound delta. */ + xnorm = diag.cwiseProduct(x).stableNorm(); + delta = parameters.factor * xnorm; + if (delta == 0.) + delta = parameters.factor; + } + + /* form (q transpose)*fvec and store the first n components in */ + /* qtf. */ + wa4 = fvec; + wa4.applyOnTheLeft(qrfac.householderQ().adjoint()); + qtf = wa4.head(n); + + /* compute the norm of the scaled gradient. */ + gnorm = 0.; + if (fnorm != 0.) + for (Index j = 0; j < n; ++j) + if (wa2[permutation.indices()[j]] != 0.) + gnorm = (std::max)(gnorm, abs( fjac.col(j).head(j+1).dot(qtf.head(j+1)/fnorm) / wa2[permutation.indices()[j]])); + + /* test for convergence of the gradient norm. */ + if (gnorm <= parameters.gtol) + return LevenbergMarquardtSpace::CosinusTooSmall; + + /* rescale if necessary. */ + if (!useExternalScaling) + diag = diag.cwiseMax(wa2); + + do { + + /* determine the levenberg-marquardt parameter. 
*/ + internal::lmpar2(qrfac, diag, qtf, delta, par, wa1); + + /* store the direction p and x + p. calculate the norm of p. */ + wa1 = -wa1; + wa2 = x + wa1; + pnorm = diag.cwiseProduct(wa1).stableNorm(); + + /* on the first iteration, adjust the initial step bound. */ + if (iter == 1) + delta = (std::min)(delta,pnorm); + + /* evaluate the function at x + p and calculate its norm. */ + if ( functor(wa2, wa4) < 0) + return LevenbergMarquardtSpace::UserAsked; + ++nfev; + fnorm1 = wa4.stableNorm(); + + /* compute the scaled actual reduction. */ + actred = -1.; + if (Scalar(.1) * fnorm1 < fnorm) + actred = 1. - numext::abs2(fnorm1 / fnorm); + + /* compute the scaled predicted reduction and */ + /* the scaled directional derivative. */ + wa3 = fjac.template triangularView() * (qrfac.colsPermutation().inverse() *wa1); + temp1 = numext::abs2(wa3.stableNorm() / fnorm); + temp2 = numext::abs2(sqrt(par) * pnorm / fnorm); + prered = temp1 + temp2 / Scalar(.5); + dirder = -(temp1 + temp2); + + /* compute the ratio of the actual to the predicted */ + /* reduction. */ + ratio = 0.; + if (prered != 0.) + ratio = actred / prered; + + /* update the step bound. */ + if (ratio <= Scalar(.25)) { + if (actred >= 0.) + temp = Scalar(.5); + if (actred < 0.) + temp = Scalar(.5) * dirder / (dirder + Scalar(.5) * actred); + if (Scalar(.1) * fnorm1 >= fnorm || temp < Scalar(.1)) + temp = Scalar(.1); + /* Computing MIN */ + delta = temp * (std::min)(delta, pnorm / Scalar(.1)); + par /= temp; + } else if (!(par != 0. && ratio < Scalar(.75))) { + delta = pnorm / Scalar(.5); + par = Scalar(.5) * par; + } + + /* test for successful iteration. */ + if (ratio >= Scalar(1e-4)) { + /* successful iteration. update x, fvec, and their norms. */ + x = wa2; + wa2 = diag.cwiseProduct(x); + fvec = wa4; + xnorm = wa2.stableNorm(); + fnorm = fnorm1; + ++iter; + } + + /* tests for convergence. */ + if (abs(actred) <= parameters.ftol && prered <= parameters.ftol && Scalar(.5) * ratio <= 1. 
&& delta <= parameters.xtol * xnorm) + return LevenbergMarquardtSpace::RelativeErrorAndReductionTooSmall; + if (abs(actred) <= parameters.ftol && prered <= parameters.ftol && Scalar(.5) * ratio <= 1.) + return LevenbergMarquardtSpace::RelativeReductionTooSmall; + if (delta <= parameters.xtol * xnorm) + return LevenbergMarquardtSpace::RelativeErrorTooSmall; + + /* tests for termination and stringent tolerances. */ + if (nfev >= parameters.maxfev) + return LevenbergMarquardtSpace::TooManyFunctionEvaluation; + if (abs(actred) <= NumTraits::epsilon() && prered <= NumTraits::epsilon() && Scalar(.5) * ratio <= 1.) + return LevenbergMarquardtSpace::FtolTooSmall; + if (delta <= NumTraits::epsilon() * xnorm) + return LevenbergMarquardtSpace::XtolTooSmall; + if (gnorm <= NumTraits::epsilon()) + return LevenbergMarquardtSpace::GtolTooSmall; + + } while (ratio < Scalar(1e-4)); + + return LevenbergMarquardtSpace::Running; +} + +template +LevenbergMarquardtSpace::Status +LevenbergMarquardt::lmstr1( + FVectorType &x, + const Scalar tol + ) +{ + n = x.size(); + m = functor.values(); + + /* check the input parameters for errors. */ + if (n <= 0 || m < n || tol < 0.) + return LevenbergMarquardtSpace::ImproperInputParameters; + + resetParameters(); + parameters.ftol = tol; + parameters.xtol = tol; + parameters.maxfev = 100*(n+1); + + return minimizeOptimumStorage(x); +} + +template +LevenbergMarquardtSpace::Status +LevenbergMarquardt::minimizeOptimumStorageInit(FVectorType &x) +{ + n = x.size(); + m = functor.values(); + + wa1.resize(n); wa2.resize(n); wa3.resize(n); + wa4.resize(m); + fvec.resize(m); + // Only R is stored in fjac. Q is only used to compute 'qtf', which is + // Q.transpose()*rhs. qtf will be updated using givens rotation, + // instead of storing them in Q. 
+ // The purpose it to only use a nxn matrix, instead of mxn here, so + // that we can handle cases where m>>n : + fjac.resize(n, n); + if (!useExternalScaling) + diag.resize(n); + eigen_assert( (!useExternalScaling || diag.size()==n) && "When useExternalScaling is set, the caller must provide a valid 'diag'"); + qtf.resize(n); + + /* Function Body */ + nfev = 0; + njev = 0; + + /* check the input parameters for errors. */ + if (n <= 0 || m < n || parameters.ftol < 0. || parameters.xtol < 0. || parameters.gtol < 0. || parameters.maxfev <= 0 || parameters.factor <= 0.) + return LevenbergMarquardtSpace::ImproperInputParameters; + + if (useExternalScaling) + for (Index j = 0; j < n; ++j) + if (diag[j] <= 0.) + return LevenbergMarquardtSpace::ImproperInputParameters; + + /* evaluate the function at the starting point */ + /* and calculate its norm. */ + nfev = 1; + if ( functor(x, fvec) < 0) + return LevenbergMarquardtSpace::UserAsked; + fnorm = fvec.stableNorm(); + + /* initialize levenberg-marquardt parameter and iteration counter. */ + par = 0.; + iter = 1; + + return LevenbergMarquardtSpace::NotStarted; +} + + +template +LevenbergMarquardtSpace::Status +LevenbergMarquardt::minimizeOptimumStorageOneStep(FVectorType &x) +{ + using std::abs; + using std::sqrt; + + eigen_assert(x.size()==n); // check the caller is not cheating us + + Index i, j; + bool sing; + + /* compute the qr factorization of the jacobian matrix */ + /* calculated one row at a time, while simultaneously */ + /* forming (q transpose)*fvec and storing the first */ + /* n components in qtf. */ + qtf.fill(0.); + fjac.fill(0.); + Index rownb = 2; + for (i = 0; i < m; ++i) { + if (functor.df(x, wa3, rownb) < 0) return LevenbergMarquardtSpace::UserAsked; + internal::rwupdt(fjac, wa3, qtf, fvec[i]); + ++rownb; + } + ++njev; + + /* if the jacobian is rank deficient, call qrfac to */ + /* reorder its columns and update the components of qtf. 
*/ + sing = false; + for (j = 0; j < n; ++j) { + if (fjac(j,j) == 0.) + sing = true; + wa2[j] = fjac.col(j).head(j).stableNorm(); + } + permutation.setIdentity(n); + if (sing) { + wa2 = fjac.colwise().blueNorm(); + // TODO We have no unit test covering this code path, do not modify + // until it is carefully tested + ColPivHouseholderQR qrfac(fjac); + fjac = qrfac.matrixQR(); + wa1 = fjac.diagonal(); + fjac.diagonal() = qrfac.hCoeffs(); + permutation = qrfac.colsPermutation(); + // TODO : avoid this: + for(Index ii=0; ii< fjac.cols(); ii++) fjac.col(ii).segment(ii+1, fjac.rows()-ii-1) *= fjac(ii,ii); // rescale vectors + + for (j = 0; j < n; ++j) { + if (fjac(j,j) != 0.) { + sum = 0.; + for (i = j; i < n; ++i) + sum += fjac(i,j) * qtf[i]; + temp = -sum / fjac(j,j); + for (i = j; i < n; ++i) + qtf[i] += fjac(i,j) * temp; + } + fjac(j,j) = wa1[j]; + } + } + + /* on the first iteration and if external scaling is not used, scale according */ + /* to the norms of the columns of the initial jacobian. */ + if (iter == 1) { + if (!useExternalScaling) + for (j = 0; j < n; ++j) + diag[j] = (wa2[j]==0.)? 1. : wa2[j]; + + /* on the first iteration, calculate the norm of the scaled x */ + /* and initialize the step bound delta. */ + xnorm = diag.cwiseProduct(x).stableNorm(); + delta = parameters.factor * xnorm; + if (delta == 0.) + delta = parameters.factor; + } + + /* compute the norm of the scaled gradient. */ + gnorm = 0.; + if (fnorm != 0.) + for (j = 0; j < n; ++j) + if (wa2[permutation.indices()[j]] != 0.) + gnorm = (std::max)(gnorm, abs( fjac.col(j).head(j+1).dot(qtf.head(j+1)/fnorm) / wa2[permutation.indices()[j]])); + + /* test for convergence of the gradient norm. */ + if (gnorm <= parameters.gtol) + return LevenbergMarquardtSpace::CosinusTooSmall; + + /* rescale if necessary. */ + if (!useExternalScaling) + diag = diag.cwiseMax(wa2); + + do { + + /* determine the levenberg-marquardt parameter. 
*/ + internal::lmpar(fjac, permutation.indices(), diag, qtf, delta, par, wa1); + + /* store the direction p and x + p. calculate the norm of p. */ + wa1 = -wa1; + wa2 = x + wa1; + pnorm = diag.cwiseProduct(wa1).stableNorm(); + + /* on the first iteration, adjust the initial step bound. */ + if (iter == 1) + delta = (std::min)(delta,pnorm); + + /* evaluate the function at x + p and calculate its norm. */ + if ( functor(wa2, wa4) < 0) + return LevenbergMarquardtSpace::UserAsked; + ++nfev; + fnorm1 = wa4.stableNorm(); + + /* compute the scaled actual reduction. */ + actred = -1.; + if (Scalar(.1) * fnorm1 < fnorm) + actred = 1. - numext::abs2(fnorm1 / fnorm); + + /* compute the scaled predicted reduction and */ + /* the scaled directional derivative. */ + wa3 = fjac.topLeftCorner(n,n).template triangularView() * (permutation.inverse() * wa1); + temp1 = numext::abs2(wa3.stableNorm() / fnorm); + temp2 = numext::abs2(sqrt(par) * pnorm / fnorm); + prered = temp1 + temp2 / Scalar(.5); + dirder = -(temp1 + temp2); + + /* compute the ratio of the actual to the predicted */ + /* reduction. */ + ratio = 0.; + if (prered != 0.) + ratio = actred / prered; + + /* update the step bound. */ + if (ratio <= Scalar(.25)) { + if (actred >= 0.) + temp = Scalar(.5); + if (actred < 0.) + temp = Scalar(.5) * dirder / (dirder + Scalar(.5) * actred); + if (Scalar(.1) * fnorm1 >= fnorm || temp < Scalar(.1)) + temp = Scalar(.1); + /* Computing MIN */ + delta = temp * (std::min)(delta, pnorm / Scalar(.1)); + par /= temp; + } else if (!(par != 0. && ratio < Scalar(.75))) { + delta = pnorm / Scalar(.5); + par = Scalar(.5) * par; + } + + /* test for successful iteration. */ + if (ratio >= Scalar(1e-4)) { + /* successful iteration. update x, fvec, and their norms. */ + x = wa2; + wa2 = diag.cwiseProduct(x); + fvec = wa4; + xnorm = wa2.stableNorm(); + fnorm = fnorm1; + ++iter; + } + + /* tests for convergence. 
*/ + if (abs(actred) <= parameters.ftol && prered <= parameters.ftol && Scalar(.5) * ratio <= 1. && delta <= parameters.xtol * xnorm) + return LevenbergMarquardtSpace::RelativeErrorAndReductionTooSmall; + if (abs(actred) <= parameters.ftol && prered <= parameters.ftol && Scalar(.5) * ratio <= 1.) + return LevenbergMarquardtSpace::RelativeReductionTooSmall; + if (delta <= parameters.xtol * xnorm) + return LevenbergMarquardtSpace::RelativeErrorTooSmall; + + /* tests for termination and stringent tolerances. */ + if (nfev >= parameters.maxfev) + return LevenbergMarquardtSpace::TooManyFunctionEvaluation; + if (abs(actred) <= NumTraits::epsilon() && prered <= NumTraits::epsilon() && Scalar(.5) * ratio <= 1.) + return LevenbergMarquardtSpace::FtolTooSmall; + if (delta <= NumTraits::epsilon() * xnorm) + return LevenbergMarquardtSpace::XtolTooSmall; + if (gnorm <= NumTraits::epsilon()) + return LevenbergMarquardtSpace::GtolTooSmall; + + } while (ratio < Scalar(1e-4)); + + return LevenbergMarquardtSpace::Running; +} + +template +LevenbergMarquardtSpace::Status +LevenbergMarquardt::minimizeOptimumStorage(FVectorType &x) +{ + LevenbergMarquardtSpace::Status status = minimizeOptimumStorageInit(x); + if (status==LevenbergMarquardtSpace::ImproperInputParameters) + return status; + do { + status = minimizeOptimumStorageOneStep(x); + } while (status==LevenbergMarquardtSpace::Running); + return status; +} + +template +LevenbergMarquardtSpace::Status +LevenbergMarquardt::lmdif1( + FunctorType &functor, + FVectorType &x, + Index *nfev, + const Scalar tol + ) +{ + Index n = x.size(); + Index m = functor.values(); + + /* check the input parameters for errors. */ + if (n <= 0 || m < n || tol < 0.) 
+ return LevenbergMarquardtSpace::ImproperInputParameters; + + NumericalDiff numDiff(functor); + // embedded LevenbergMarquardt + LevenbergMarquardt, Scalar > lm(numDiff); + lm.parameters.ftol = tol; + lm.parameters.xtol = tol; + lm.parameters.maxfev = 200*(n+1); + + LevenbergMarquardtSpace::Status info = LevenbergMarquardtSpace::Status(lm.minimize(x)); + if (nfev) + * nfev = lm.nfev; + return info; +} + +} // end namespace Eigen + +#endif // EIGEN_LEVENBERGMARQUARDT__H + +//vim: ai ts=4 sts=4 et sw=4 diff --git a/src/EigenUnsupported/src/NonLinearOptimization/chkder.h b/src/EigenUnsupported/src/NonLinearOptimization/chkder.h new file mode 100644 index 0000000..db8ff7d --- /dev/null +++ b/src/EigenUnsupported/src/NonLinearOptimization/chkder.h @@ -0,0 +1,66 @@ +#define chkder_log10e 0.43429448190325182765 +#define chkder_factor 100. + +namespace Eigen { + +namespace internal { + +template +void chkder( + const Matrix< Scalar, Dynamic, 1 > &x, + const Matrix< Scalar, Dynamic, 1 > &fvec, + const Matrix< Scalar, Dynamic, Dynamic > &fjac, + Matrix< Scalar, Dynamic, 1 > &xp, + const Matrix< Scalar, Dynamic, 1 > &fvecp, + int mode, + Matrix< Scalar, Dynamic, 1 > &err + ) +{ + using std::sqrt; + using std::abs; + using std::log; + + typedef DenseIndex Index; + + const Scalar eps = sqrt(NumTraits::epsilon()); + const Scalar epsf = chkder_factor * NumTraits::epsilon(); + const Scalar epslog = chkder_log10e * log(eps); + Scalar temp; + + const Index m = fvec.size(), n = x.size(); + + if (mode != 2) { + /* mode = 1. */ + xp.resize(n); + for (Index j = 0; j < n; ++j) { + temp = eps * abs(x[j]); + if (temp == 0.) + temp = eps; + xp[j] = x[j] + temp; + } + } + else { + /* mode = 2. */ + err.setZero(m); + for (Index j = 0; j < n; ++j) { + temp = abs(x[j]); + if (temp == 0.) + temp = 1.; + err += temp * fjac.col(j); + } + for (Index i = 0; i < m; ++i) { + temp = 1.; + if (fvec[i] != 0. && fvecp[i] != 0. 
&& abs(fvecp[i] - fvec[i]) >= epsf * abs(fvec[i])) + temp = eps * abs((fvecp[i] - fvec[i]) / eps - err[i]) / (abs(fvec[i]) + abs(fvecp[i])); + err[i] = 1.; + if (temp > NumTraits::epsilon() && temp < eps) + err[i] = (chkder_log10e * log(temp) - epslog) / epslog; + if (temp >= eps) + err[i] = 0.; + } + } +} + +} // end namespace internal + +} // end namespace Eigen diff --git a/src/EigenUnsupported/src/NonLinearOptimization/covar.h b/src/EigenUnsupported/src/NonLinearOptimization/covar.h new file mode 100644 index 0000000..68260d1 --- /dev/null +++ b/src/EigenUnsupported/src/NonLinearOptimization/covar.h @@ -0,0 +1,70 @@ +namespace Eigen { + +namespace internal { + +template +void covar( + Matrix< Scalar, Dynamic, Dynamic > &r, + const VectorXi &ipvt, + Scalar tol = std::sqrt(NumTraits::epsilon()) ) +{ + using std::abs; + typedef DenseIndex Index; + + /* Local variables */ + Index i, j, k, l, ii, jj; + bool sing; + Scalar temp; + + /* Function Body */ + const Index n = r.cols(); + const Scalar tolr = tol * abs(r(0,0)); + Matrix< Scalar, Dynamic, 1 > wa(n); + eigen_assert(ipvt.size()==n); + + /* form the inverse of r in the full upper triangle of r. */ + l = -1; + for (k = 0; k < n; ++k) + if (abs(r(k,k)) > tolr) { + r(k,k) = 1. / r(k,k); + for (j = 0; j <= k-1; ++j) { + temp = r(k,k) * r(j,k); + r(j,k) = 0.; + r.col(k).head(j+1) -= r.col(j).head(j+1) * temp; + } + l = k; + } + + /* form the full upper triangle of the inverse of (r transpose)*r */ + /* in the full upper triangle of r. */ + for (k = 0; k <= l; ++k) { + for (j = 0; j <= k-1; ++j) + r.col(j).head(j+1) += r.col(k).head(j+1) * r(j,k); + r.col(k).head(k+1) *= r(k,k); + } + + /* form the full lower triangle of the covariance matrix */ + /* in the strict lower triangle of r and in wa. 
*/ + for (j = 0; j < n; ++j) { + jj = ipvt[j]; + sing = j > l; + for (i = 0; i <= j; ++i) { + if (sing) + r(i,j) = 0.; + ii = ipvt[i]; + if (ii > jj) + r(ii,jj) = r(i,j); + if (ii < jj) + r(jj,ii) = r(i,j); + } + wa[jj] = r(j,j); + } + + /* symmetrize the covariance matrix in r. */ + r.topLeftCorner(n,n).template triangularView() = r.topLeftCorner(n,n).transpose(); + r.diagonal() = wa; +} + +} // end namespace internal + +} // end namespace Eigen diff --git a/src/EigenUnsupported/src/NonLinearOptimization/dogleg.h b/src/EigenUnsupported/src/NonLinearOptimization/dogleg.h new file mode 100644 index 0000000..80c5d27 --- /dev/null +++ b/src/EigenUnsupported/src/NonLinearOptimization/dogleg.h @@ -0,0 +1,107 @@ +namespace Eigen { + +namespace internal { + +template +void dogleg( + const Matrix< Scalar, Dynamic, Dynamic > &qrfac, + const Matrix< Scalar, Dynamic, 1 > &diag, + const Matrix< Scalar, Dynamic, 1 > &qtb, + Scalar delta, + Matrix< Scalar, Dynamic, 1 > &x) +{ + using std::abs; + using std::sqrt; + + typedef DenseIndex Index; + + /* Local variables */ + Index i, j; + Scalar sum, temp, alpha, bnorm; + Scalar gnorm, qnorm; + Scalar sgnorm; + + /* Function Body */ + const Scalar epsmch = NumTraits::epsilon(); + const Index n = qrfac.cols(); + eigen_assert(n==qtb.size()); + eigen_assert(n==x.size()); + eigen_assert(n==diag.size()); + Matrix< Scalar, Dynamic, 1 > wa1(n), wa2(n); + + /* first, calculate the gauss-newton direction. */ + for (j = n-1; j >=0; --j) { + temp = qrfac(j,j); + if (temp == 0.) { + temp = epsmch * qrfac.col(j).head(j+1).maxCoeff(); + if (temp == 0.) + temp = epsmch; + } + if (j==n-1) + x[j] = qtb[j] / temp; + else + x[j] = (qtb[j] - qrfac.row(j).tail(n-j-1).dot(x.tail(n-j-1))) / temp; + } + + /* test whether the gauss-newton direction is acceptable. */ + qnorm = diag.cwiseProduct(x).stableNorm(); + if (qnorm <= delta) + return; + + // TODO : this path is not tested by Eigen unit tests + + /* the gauss-newton direction is not acceptable. 
*/ + /* next, calculate the scaled gradient direction. */ + + wa1.fill(0.); + for (j = 0; j < n; ++j) { + wa1.tail(n-j) += qrfac.row(j).tail(n-j) * qtb[j]; + wa1[j] /= diag[j]; + } + + /* calculate the norm of the scaled gradient and test for */ + /* the special case in which the scaled gradient is zero. */ + gnorm = wa1.stableNorm(); + sgnorm = 0.; + alpha = delta / qnorm; + if (gnorm == 0.) + goto algo_end; + + /* calculate the point along the scaled gradient */ + /* at which the quadratic is minimized. */ + wa1.array() /= (diag*gnorm).array(); + // TODO : once unit tests cover this part,: + // wa2 = qrfac.template triangularView() * wa1; + for (j = 0; j < n; ++j) { + sum = 0.; + for (i = j; i < n; ++i) { + sum += qrfac(j,i) * wa1[i]; + } + wa2[j] = sum; + } + temp = wa2.stableNorm(); + sgnorm = gnorm / temp / temp; + + /* test whether the scaled gradient direction is acceptable. */ + alpha = 0.; + if (sgnorm >= delta) + goto algo_end; + + /* the scaled gradient direction is not acceptable. */ + /* finally, calculate the point along the dogleg */ + /* at which the quadratic is minimized. */ + bnorm = qtb.stableNorm(); + temp = bnorm / gnorm * (bnorm / qnorm) * (sgnorm / delta); + temp = temp - delta / qnorm * numext::abs2(sgnorm / delta) + sqrt(numext::abs2(temp - delta / qnorm) + (1.-numext::abs2(delta / qnorm)) * (1.-numext::abs2(sgnorm / delta))); + alpha = delta / qnorm * (1. - numext::abs2(sgnorm / delta)) / temp; +algo_end: + + /* form appropriate convex combination of the gauss-newton */ + /* direction and the scaled gradient direction. 
*/ + temp = (1.-alpha) * (std::min)(sgnorm,delta); + x = temp * wa1 + alpha * x; +} + +} // end namespace internal + +} // end namespace Eigen diff --git a/src/EigenUnsupported/src/NonLinearOptimization/fdjac1.h b/src/EigenUnsupported/src/NonLinearOptimization/fdjac1.h new file mode 100644 index 0000000..bb7cf26 --- /dev/null +++ b/src/EigenUnsupported/src/NonLinearOptimization/fdjac1.h @@ -0,0 +1,79 @@ +namespace Eigen { + +namespace internal { + +template +DenseIndex fdjac1( + const FunctorType &Functor, + Matrix< Scalar, Dynamic, 1 > &x, + Matrix< Scalar, Dynamic, 1 > &fvec, + Matrix< Scalar, Dynamic, Dynamic > &fjac, + DenseIndex ml, DenseIndex mu, + Scalar epsfcn) +{ + using std::sqrt; + using std::abs; + + typedef DenseIndex Index; + + /* Local variables */ + Scalar h; + Index j, k; + Scalar eps, temp; + Index msum; + int iflag; + Index start, length; + + /* Function Body */ + const Scalar epsmch = NumTraits::epsilon(); + const Index n = x.size(); + eigen_assert(fvec.size()==n); + Matrix< Scalar, Dynamic, 1 > wa1(n); + Matrix< Scalar, Dynamic, 1 > wa2(n); + + eps = sqrt((std::max)(epsfcn,epsmch)); + msum = ml + mu + 1; + if (msum >= n) { + /* computation of dense approximate jacobian. */ + for (j = 0; j < n; ++j) { + temp = x[j]; + h = eps * abs(temp); + if (h == 0.) + h = eps; + x[j] = temp + h; + iflag = Functor(x, wa1); + if (iflag < 0) + return iflag; + x[j] = temp; + fjac.col(j) = (wa1-fvec)/h; + } + + }else { + /* computation of banded approximate jacobian. */ + for (k = 0; k < msum; ++k) { + for (j = k; (msum<0) ? 
(j>n): (jn): (j(0,j-mu); + length = (std::min)(n-1, j+ml) - start + 1; + fjac.col(j).segment(start, length) = ( wa1.segment(start, length)-fvec.segment(start, length))/h; + } + } + } + return 0; +} + +} // end namespace internal + +} // end namespace Eigen diff --git a/src/EigenUnsupported/src/NonLinearOptimization/lmpar.h b/src/EigenUnsupported/src/NonLinearOptimization/lmpar.h new file mode 100644 index 0000000..4c17d4c --- /dev/null +++ b/src/EigenUnsupported/src/NonLinearOptimization/lmpar.h @@ -0,0 +1,298 @@ +namespace Eigen { + +namespace internal { + +template +void lmpar( + Matrix< Scalar, Dynamic, Dynamic > &r, + const VectorXi &ipvt, + const Matrix< Scalar, Dynamic, 1 > &diag, + const Matrix< Scalar, Dynamic, 1 > &qtb, + Scalar delta, + Scalar &par, + Matrix< Scalar, Dynamic, 1 > &x) +{ + using std::abs; + using std::sqrt; + typedef DenseIndex Index; + + /* Local variables */ + Index i, j, l; + Scalar fp; + Scalar parc, parl; + Index iter; + Scalar temp, paru; + Scalar gnorm; + Scalar dxnorm; + + + /* Function Body */ + const Scalar dwarf = (std::numeric_limits::min)(); + const Index n = r.cols(); + eigen_assert(n==diag.size()); + eigen_assert(n==qtb.size()); + eigen_assert(n==x.size()); + + Matrix< Scalar, Dynamic, 1 > wa1, wa2; + + /* compute and store in x the gauss-newton direction. if the */ + /* jacobian is rank-deficient, obtain a least squares solution. */ + Index nsing = n-1; + wa1 = qtb; + for (j = 0; j < n; ++j) { + if (r(j,j) == 0. && nsing == n-1) + nsing = j - 1; + if (nsing < n-1) + wa1[j] = 0.; + } + for (j = nsing; j>=0; --j) { + wa1[j] /= r(j,j); + temp = wa1[j]; + for (i = 0; i < j ; ++i) + wa1[i] -= r(i,j) * temp; + } + + for (j = 0; j < n; ++j) + x[ipvt[j]] = wa1[j]; + + /* initialize the iteration counter. */ + /* evaluate the function at the origin, and test */ + /* for acceptance of the gauss-newton direction. 
*/ + iter = 0; + wa2 = diag.cwiseProduct(x); + dxnorm = wa2.blueNorm(); + fp = dxnorm - delta; + if (fp <= Scalar(0.1) * delta) { + par = 0; + return; + } + + /* if the jacobian is not rank deficient, the newton */ + /* step provides a lower bound, parl, for the zero of */ + /* the function. otherwise set this bound to zero. */ + parl = 0.; + if (nsing >= n-1) { + for (j = 0; j < n; ++j) { + l = ipvt[j]; + wa1[j] = diag[l] * (wa2[l] / dxnorm); + } + // it's actually a triangularView.solveInplace(), though in a weird + // way: + for (j = 0; j < n; ++j) { + Scalar sum = 0.; + for (i = 0; i < j; ++i) + sum += r(i,j) * wa1[i]; + wa1[j] = (wa1[j] - sum) / r(j,j); + } + temp = wa1.blueNorm(); + parl = fp / delta / temp / temp; + } + + /* calculate an upper bound, paru, for the zero of the function. */ + for (j = 0; j < n; ++j) + wa1[j] = r.col(j).head(j+1).dot(qtb.head(j+1)) / diag[ipvt[j]]; + + gnorm = wa1.stableNorm(); + paru = gnorm / delta; + if (paru == 0.) + paru = dwarf / (std::min)(delta,Scalar(0.1)); + + /* if the input par lies outside of the interval (parl,paru), */ + /* set par to the closer endpoint. */ + par = (std::max)(par,parl); + par = (std::min)(par,paru); + if (par == 0.) + par = gnorm / dxnorm; + + /* beginning of an iteration. */ + while (true) { + ++iter; + + /* evaluate the function at the current value of par. */ + if (par == 0.) + par = (std::max)(dwarf,Scalar(.001) * paru); /* Computing MAX */ + wa1 = sqrt(par)* diag; + + Matrix< Scalar, Dynamic, 1 > sdiag(n); + qrsolv(r, ipvt, wa1, qtb, x, sdiag); + + wa2 = diag.cwiseProduct(x); + dxnorm = wa2.blueNorm(); + temp = fp; + fp = dxnorm - delta; + + /* if the function is small enough, accept the current value */ + /* of par. also test for the exceptional cases where parl */ + /* is zero or the number of iterations has reached 10. */ + if (abs(fp) <= Scalar(0.1) * delta || (parl == 0. && fp <= temp && temp < 0.) || iter == 10) + break; + + /* compute the newton correction. 
*/ + for (j = 0; j < n; ++j) { + l = ipvt[j]; + wa1[j] = diag[l] * (wa2[l] / dxnorm); + } + for (j = 0; j < n; ++j) { + wa1[j] /= sdiag[j]; + temp = wa1[j]; + for (i = j+1; i < n; ++i) + wa1[i] -= r(i,j) * temp; + } + temp = wa1.blueNorm(); + parc = fp / delta / temp / temp; + + /* depending on the sign of the function, update parl or paru. */ + if (fp > 0.) + parl = (std::max)(parl,par); + if (fp < 0.) + paru = (std::min)(paru,par); + + /* compute an improved estimate for par. */ + /* Computing MAX */ + par = (std::max)(parl,par+parc); + + /* end of an iteration. */ + } + + /* termination. */ + if (iter == 0) + par = 0.; + return; +} + +template +void lmpar2( + const ColPivHouseholderQR > &qr, + const Matrix< Scalar, Dynamic, 1 > &diag, + const Matrix< Scalar, Dynamic, 1 > &qtb, + Scalar delta, + Scalar &par, + Matrix< Scalar, Dynamic, 1 > &x) + +{ + using std::sqrt; + using std::abs; + typedef DenseIndex Index; + + /* Local variables */ + Index j; + Scalar fp; + Scalar parc, parl; + Index iter; + Scalar temp, paru; + Scalar gnorm; + Scalar dxnorm; + + + /* Function Body */ + const Scalar dwarf = (std::numeric_limits::min)(); + const Index n = qr.matrixQR().cols(); + eigen_assert(n==diag.size()); + eigen_assert(n==qtb.size()); + + Matrix< Scalar, Dynamic, 1 > wa1, wa2; + + /* compute and store in x the gauss-newton direction. if the */ + /* jacobian is rank-deficient, obtain a least squares solution. */ + +// const Index rank = qr.nonzeroPivots(); // exactly double(0.) + const Index rank = qr.rank(); // use a threshold + wa1 = qtb; + wa1.tail(n-rank).setZero(); + qr.matrixQR().topLeftCorner(rank, rank).template triangularView().solveInPlace(wa1.head(rank)); + + x = qr.colsPermutation()*wa1; + + /* initialize the iteration counter. */ + /* evaluate the function at the origin, and test */ + /* for acceptance of the gauss-newton direction. 
*/ + iter = 0; + wa2 = diag.cwiseProduct(x); + dxnorm = wa2.blueNorm(); + fp = dxnorm - delta; + if (fp <= Scalar(0.1) * delta) { + par = 0; + return; + } + + /* if the jacobian is not rank deficient, the newton */ + /* step provides a lower bound, parl, for the zero of */ + /* the function. otherwise set this bound to zero. */ + parl = 0.; + if (rank==n) { + wa1 = qr.colsPermutation().inverse() * diag.cwiseProduct(wa2)/dxnorm; + qr.matrixQR().topLeftCorner(n, n).transpose().template triangularView().solveInPlace(wa1); + temp = wa1.blueNorm(); + parl = fp / delta / temp / temp; + } + + /* calculate an upper bound, paru, for the zero of the function. */ + for (j = 0; j < n; ++j) + wa1[j] = qr.matrixQR().col(j).head(j+1).dot(qtb.head(j+1)) / diag[qr.colsPermutation().indices()(j)]; + + gnorm = wa1.stableNorm(); + paru = gnorm / delta; + if (paru == 0.) + paru = dwarf / (std::min)(delta,Scalar(0.1)); + + /* if the input par lies outside of the interval (parl,paru), */ + /* set par to the closer endpoint. */ + par = (std::max)(par,parl); + par = (std::min)(par,paru); + if (par == 0.) + par = gnorm / dxnorm; + + /* beginning of an iteration. */ + Matrix< Scalar, Dynamic, Dynamic > s = qr.matrixQR(); + while (true) { + ++iter; + + /* evaluate the function at the current value of par. */ + if (par == 0.) + par = (std::max)(dwarf,Scalar(.001) * paru); /* Computing MAX */ + wa1 = sqrt(par)* diag; + + Matrix< Scalar, Dynamic, 1 > sdiag(n); + qrsolv(s, qr.colsPermutation().indices(), wa1, qtb, x, sdiag); + + wa2 = diag.cwiseProduct(x); + dxnorm = wa2.blueNorm(); + temp = fp; + fp = dxnorm - delta; + + /* if the function is small enough, accept the current value */ + /* of par. also test for the exceptional cases where parl */ + /* is zero or the number of iterations has reached 10. */ + if (abs(fp) <= Scalar(0.1) * delta || (parl == 0. && fp <= temp && temp < 0.) || iter == 10) + break; + + /* compute the newton correction. 
*/ + wa1 = qr.colsPermutation().inverse() * diag.cwiseProduct(wa2/dxnorm); + // we could almost use this here, but the diagonal is outside qr, in sdiag[] + // qr.matrixQR().topLeftCorner(n, n).transpose().template triangularView().solveInPlace(wa1); + for (j = 0; j < n; ++j) { + wa1[j] /= sdiag[j]; + temp = wa1[j]; + for (Index i = j+1; i < n; ++i) + wa1[i] -= s(i,j) * temp; + } + temp = wa1.blueNorm(); + parc = fp / delta / temp / temp; + + /* depending on the sign of the function, update parl or paru. */ + if (fp > 0.) + parl = (std::max)(parl,par); + if (fp < 0.) + paru = (std::min)(paru,par); + + /* compute an improved estimate for par. */ + par = (std::max)(parl,par+parc); + } + if (iter == 0) + par = 0.; + return; +} + +} // end namespace internal + +} // end namespace Eigen diff --git a/src/EigenUnsupported/src/NonLinearOptimization/qrsolv.h b/src/EigenUnsupported/src/NonLinearOptimization/qrsolv.h new file mode 100644 index 0000000..4f2f560 --- /dev/null +++ b/src/EigenUnsupported/src/NonLinearOptimization/qrsolv.h @@ -0,0 +1,91 @@ +namespace Eigen { + +namespace internal { + +// TODO : once qrsolv2 is removed, use ColPivHouseholderQR or PermutationMatrix instead of ipvt +template +void qrsolv( + Matrix< Scalar, Dynamic, Dynamic > &s, + // TODO : use a PermutationMatrix once lmpar is no more: + const VectorXi &ipvt, + const Matrix< Scalar, Dynamic, 1 > &diag, + const Matrix< Scalar, Dynamic, 1 > &qtb, + Matrix< Scalar, Dynamic, 1 > &x, + Matrix< Scalar, Dynamic, 1 > &sdiag) + +{ + typedef DenseIndex Index; + + /* Local variables */ + Index i, j, k, l; + Scalar temp; + Index n = s.cols(); + Matrix< Scalar, Dynamic, 1 > wa(n); + JacobiRotation givens; + + /* Function Body */ + // the following will only change the lower triangular part of s, including + // the diagonal, though the diagonal is restored afterward + + /* copy r and (q transpose)*b to preserve input and initialize s. */ + /* in particular, save the diagonal elements of r in x. 
*/ + x = s.diagonal(); + wa = qtb; + + s.topLeftCorner(n,n).template triangularView() = s.topLeftCorner(n,n).transpose(); + + /* eliminate the diagonal matrix d using a givens rotation. */ + for (j = 0; j < n; ++j) { + + /* prepare the row of d to be eliminated, locating the */ + /* diagonal element using p from the qr factorization. */ + l = ipvt[j]; + if (diag[l] == 0.) + break; + sdiag.tail(n-j).setZero(); + sdiag[j] = diag[l]; + + /* the transformations to eliminate the row of d */ + /* modify only a single element of (q transpose)*b */ + /* beyond the first n, which is initially zero. */ + Scalar qtbpj = 0.; + for (k = j; k < n; ++k) { + /* determine a givens rotation which eliminates the */ + /* appropriate element in the current row of d. */ + givens.makeGivens(-s(k,k), sdiag[k]); + + /* compute the modified diagonal element of r and */ + /* the modified element of ((q transpose)*b,0). */ + s(k,k) = givens.c() * s(k,k) + givens.s() * sdiag[k]; + temp = givens.c() * wa[k] + givens.s() * qtbpj; + qtbpj = -givens.s() * wa[k] + givens.c() * qtbpj; + wa[k] = temp; + + /* accumulate the transformation in the row of s. */ + for (i = k+1; i().solveInPlace(wa.head(nsing)); + + // restore + sdiag = s.diagonal(); + s.diagonal() = x; + + /* permute the components of z back to components of x. 
*/ + for (j = 0; j < n; ++j) x[ipvt[j]] = wa[j]; +} + +} // end namespace internal + +} // end namespace Eigen diff --git a/src/EigenUnsupported/src/NonLinearOptimization/r1mpyq.h b/src/EigenUnsupported/src/NonLinearOptimization/r1mpyq.h new file mode 100644 index 0000000..36ff700 --- /dev/null +++ b/src/EigenUnsupported/src/NonLinearOptimization/r1mpyq.h @@ -0,0 +1,30 @@ +namespace Eigen { + +namespace internal { + +// TODO : move this to GivensQR once there's such a thing in Eigen + +template +void r1mpyq(DenseIndex m, DenseIndex n, Scalar *a, const std::vector > &v_givens, const std::vector > &w_givens) +{ + typedef DenseIndex Index; + + /* apply the first set of givens rotations to a. */ + for (Index j = n-2; j>=0; --j) + for (Index i = 0; i +void r1updt( + Matrix< Scalar, Dynamic, Dynamic > &s, + const Matrix< Scalar, Dynamic, 1> &u, + std::vector > &v_givens, + std::vector > &w_givens, + Matrix< Scalar, Dynamic, 1> &v, + Matrix< Scalar, Dynamic, 1> &w, + bool *sing) +{ + typedef DenseIndex Index; + const JacobiRotation IdentityRotation = JacobiRotation(1,0); + + /* Local variables */ + const Index m = s.rows(); + const Index n = s.cols(); + Index i, j=1; + Scalar temp; + JacobiRotation givens; + + // r1updt had a broader usecase, but we don't use it here. And, more + // importantly, we can not test it. + eigen_assert(m==n); + eigen_assert(u.size()==m); + eigen_assert(v.size()==n); + eigen_assert(w.size()==n); + + /* move the nontrivial part of the last column of s into w. */ + w[n-1] = s(n-1,n-1); + + /* rotate the vector v into a multiple of the n-th unit vector */ + /* in such a way that a spike is introduced into w. */ + for (j=n-2; j>=0; --j) { + w[j] = 0.; + if (v[j] != 0.) { + /* determine a givens rotation which eliminates the */ + /* j-th element of v. */ + givens.makeGivens(-v[n-1], v[j]); + + /* apply the transformation to v and store the information */ + /* necessary to recover the givens rotation. 
*/ + v[n-1] = givens.s() * v[j] + givens.c() * v[n-1]; + v_givens[j] = givens; + + /* apply the transformation to s and extend the spike in w. */ + for (i = j; i < m; ++i) { + temp = givens.c() * s(j,i) - givens.s() * w[i]; + w[i] = givens.s() * s(j,i) + givens.c() * w[i]; + s(j,i) = temp; + } + } else + v_givens[j] = IdentityRotation; + } + + /* add the spike from the rank 1 update to w. */ + w += v[n-1] * u; + + /* eliminate the spike. */ + *sing = false; + for (j = 0; j < n-1; ++j) { + if (w[j] != 0.) { + /* determine a givens rotation which eliminates the */ + /* j-th element of the spike. */ + givens.makeGivens(-s(j,j), w[j]); + + /* apply the transformation to s and reduce the spike in w. */ + for (i = j; i < m; ++i) { + temp = givens.c() * s(j,i) + givens.s() * w[i]; + w[i] = -givens.s() * s(j,i) + givens.c() * w[i]; + s(j,i) = temp; + } + + /* store the information necessary to recover the */ + /* givens rotation. */ + w_givens[j] = givens; + } else + v_givens[j] = IdentityRotation; + + /* test for zero diagonal elements in the output s. */ + if (s(j,j) == 0.) { + *sing = true; + } + } + /* move w back into the last column of the output s. */ + s(n-1,n-1) = w[n-1]; + + if (s(j,j) == 0.) 
{ + *sing = true; + } + return; +} + +} // end namespace internal + +} // end namespace Eigen diff --git a/src/EigenUnsupported/src/NonLinearOptimization/rwupdt.h b/src/EigenUnsupported/src/NonLinearOptimization/rwupdt.h new file mode 100644 index 0000000..6ebf856 --- /dev/null +++ b/src/EigenUnsupported/src/NonLinearOptimization/rwupdt.h @@ -0,0 +1,49 @@ +namespace Eigen { + +namespace internal { + +template +void rwupdt( + Matrix< Scalar, Dynamic, Dynamic > &r, + const Matrix< Scalar, Dynamic, 1> &w, + Matrix< Scalar, Dynamic, 1> &b, + Scalar alpha) +{ + typedef DenseIndex Index; + + const Index n = r.cols(); + eigen_assert(r.rows()>=n); + std::vector > givens(n); + + /* Local variables */ + Scalar temp, rowj; + + /* Function Body */ + for (Index j = 0; j < n; ++j) { + rowj = w[j]; + + /* apply the previous transformations to */ + /* r(i,j), i=0,1,...,j-1, and to w(j). */ + for (Index i = 0; i < j; ++i) { + temp = givens[i].c() * r(i,j) + givens[i].s() * rowj; + rowj = -givens[i].s() * r(i,j) + givens[i].c() * rowj; + r(i,j) = temp; + } + + /* determine a givens rotation which eliminates w(j). */ + givens[j].makeGivens(-r(j,j), rowj); + + if (rowj == 0.) + continue; // givens[j] is identity + + /* apply the current transformation to r(j,j), b(j), and alpha. */ + r(j,j) = givens[j].c() * r(j,j) + givens[j].s() * rowj; + temp = givens[j].c() * b[j] + givens[j].s() * alpha; + alpha = -givens[j].s() * b[j] + givens[j].c() * alpha; + b[j] = temp; + } +} + +} // end namespace internal + +} // end namespace Eigen diff --git a/src/EigenUnsupported/src/NumericalDiff/NumericalDiff.h b/src/EigenUnsupported/src/NumericalDiff/NumericalDiff.h new file mode 100644 index 0000000..ea5d8bc --- /dev/null +++ b/src/EigenUnsupported/src/NumericalDiff/NumericalDiff.h @@ -0,0 +1,130 @@ +// -*- coding: utf-8 +// vim: set fileencoding=utf-8 + +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2009 Thomas Capricelli +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_NUMERICAL_DIFF_H +#define EIGEN_NUMERICAL_DIFF_H + +namespace Eigen { + +enum NumericalDiffMode { + Forward, + Central +}; + + +/** + * This class allows you to add a method df() to your functor, which will + * use numerical differentiation to compute an approximate of the + * derivative for the functor. Of course, if you have an analytical form + * for the derivative, you should rather implement df() by yourself. + * + * More information on + * http://en.wikipedia.org/wiki/Numerical_differentiation + * + * Currently only "Forward" and "Central" scheme are implemented. + */ +template +class NumericalDiff : public _Functor +{ +public: + typedef _Functor Functor; + typedef typename Functor::Scalar Scalar; + typedef typename Functor::InputType InputType; + typedef typename Functor::ValueType ValueType; + typedef typename Functor::JacobianType JacobianType; + + NumericalDiff(Scalar _epsfcn=0.) : Functor(), epsfcn(_epsfcn) {} + NumericalDiff(const Functor& f, Scalar _epsfcn=0.) 
: Functor(f), epsfcn(_epsfcn) {} + + // forward constructors + template + NumericalDiff(const T0& a0) : Functor(a0), epsfcn(0) {} + template + NumericalDiff(const T0& a0, const T1& a1) : Functor(a0, a1), epsfcn(0) {} + template + NumericalDiff(const T0& a0, const T1& a1, const T2& a2) : Functor(a0, a1, a2), epsfcn(0) {} + + enum { + InputsAtCompileTime = Functor::InputsAtCompileTime, + ValuesAtCompileTime = Functor::ValuesAtCompileTime + }; + + /** + * return the number of evaluation of functor + */ + int df(const InputType& _x, JacobianType &jac) const + { + using std::sqrt; + using std::abs; + /* Local variables */ + Scalar h; + int nfev=0; + const typename InputType::Index n = _x.size(); + const Scalar eps = sqrt(((std::max)(epsfcn,NumTraits::epsilon() ))); + ValueType val1, val2; + InputType x = _x; + // TODO : we should do this only if the size is not already known + val1.resize(Functor::values()); + val2.resize(Functor::values()); + + // initialization + switch(mode) { + case Forward: + // compute f(x) + Functor::operator()(x, val1); nfev++; + break; + case Central: + // do nothing + break; + default: + eigen_assert(false); + }; + + // Function Body + for (int j = 0; j < n; ++j) { + h = eps * abs(x[j]); + if (h == 0.) 
{ + h = eps; + } + switch(mode) { + case Forward: + x[j] += h; + Functor::operator()(x, val2); + nfev++; + x[j] = _x[j]; + jac.col(j) = (val2-val1)/h; + break; + case Central: + x[j] += h; + Functor::operator()(x, val2); nfev++; + x[j] -= 2*h; + Functor::operator()(x, val1); nfev++; + x[j] = _x[j]; + jac.col(j) = (val2-val1)/(2*h); + break; + default: + eigen_assert(false); + }; + } + return nfev; + } +private: + Scalar epsfcn; + + NumericalDiff& operator=(const NumericalDiff&); +}; + +} // end namespace Eigen + +//vim: ai ts=4 sts=4 et sw=4 +#endif // EIGEN_NUMERICAL_DIFF_H + diff --git a/src/EigenUnsupported/src/Polynomials/Companion.h b/src/EigenUnsupported/src/Polynomials/Companion.h new file mode 100644 index 0000000..59a15b0 --- /dev/null +++ b/src/EigenUnsupported/src/Polynomials/Companion.h @@ -0,0 +1,280 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2010 Manuel Yguel +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_COMPANION_H +#define EIGEN_COMPANION_H + +// This file requires the user to include +// * Eigen/Core +// * Eigen/src/PolynomialSolver.h + +namespace Eigen { + +namespace internal { + +#ifndef EIGEN_PARSED_BY_DOXYGEN + +template +struct decrement_if_fixed_size +{ + enum { + ret = (Size == Dynamic) ? Dynamic : Size-1 }; +}; + +#endif + +template< typename _Scalar, int _Deg > +class companion +{ + public: + EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_Deg==Dynamic ? 
Dynamic : _Deg) + + enum { + Deg = _Deg, + Deg_1=decrement_if_fixed_size::ret + }; + + typedef _Scalar Scalar; + typedef typename NumTraits::Real RealScalar; + typedef Matrix RightColumn; + //typedef DiagonalMatrix< Scalar, Deg_1, Deg_1 > BottomLeftDiagonal; + typedef Matrix BottomLeftDiagonal; + + typedef Matrix DenseCompanionMatrixType; + typedef Matrix< Scalar, _Deg, Deg_1 > LeftBlock; + typedef Matrix< Scalar, Deg_1, Deg_1 > BottomLeftBlock; + typedef Matrix< Scalar, 1, Deg_1 > LeftBlockFirstRow; + + typedef DenseIndex Index; + + public: + /** \returns the coefficient (row,col) of the companion matrix, laid out as in denseMatrix(): + * m_monic fills the last column, m_bl_diag sits on the sub-diagonal and every other entry is zero. + */ + EIGEN_STRONG_INLINE const _Scalar operator()(Index row, Index col ) const + { + if( m_bl_diag.rows() > col ) + { + // left block: only the sub-diagonal entry (col+1,col) is non-zero + if( row == col+1 ){ return m_bl_diag[col]; } + else{ return 0; } + } + else{ return m_monic[row]; } + } + + public: + /** Stores the monic form of \a poly: m_monic = -poly.head(deg)/poly[deg], and resets m_bl_diag to ones. */ + template + void setPolynomial( const VectorType& poly ) + { + const Index deg = poly.size()-1; + m_monic = -poly.head(deg)/poly[deg]; + m_bl_diag.setOnes(deg-1); + } + + template + companion( const VectorType& poly ){ + setPolynomial( poly ); } + + public: + /** \returns the companion matrix as a dense deg x deg matrix: a zero first row over diag(m_bl_diag) on the left, m_monic as the last column. */ + DenseCompanionMatrixType denseMatrix() const + { + const Index deg = m_monic.size(); + const Index deg_1 = deg-1; + DenseCompanionMatrixType companMat(deg,deg); + companMat << + ( LeftBlock(deg,deg_1) + << LeftBlockFirstRow::Zero(1,deg_1), + BottomLeftBlock::Identity(deg-1,deg-1)*m_bl_diag.asDiagonal() ).finished() + , m_monic; + return companMat; + } + + + + protected: + /** Helper function for the balancing algorithm. + * \returns true if the row and the column, having colNorm and rowNorm + * as norms, are balanced, false otherwise. + * colB and rowB are respectively the multipliers for + * the column and the row in order to balance them. + * */ + bool balanced( RealScalar colNorm, RealScalar rowNorm, + bool& isBalanced, RealScalar& colB, RealScalar& rowB ); + + /** Helper function for the balancing algorithm. + * \returns true if the row and the column, having colNorm and rowNorm + * as norms, are balanced, false otherwise. 
+ * colB and rowB are respectively the multipliers for + * the column and the row in order to balance them. + * */ + bool balancedR( RealScalar colNorm, RealScalar rowNorm, + bool& isBalanced, RealScalar& colB, RealScalar& rowB ); + + public: + /** + * Balancing algorithm from B. N. PARLETT and C. REINSCH (1969) + * "Balancing a matrix for calculation of eigenvalues and eigenvectors" + * adapted to the case of companion matrices. + * A matrix with non zero row and non zero column is balanced + * for a certain norm if the i-th row and the i-th column + * have same norm for all i. + */ + void balance(); + + protected: + RightColumn m_monic; + BottomLeftDiagonal m_bl_diag; +}; + + + +template< typename _Scalar, int _Deg > +inline +bool companion<_Scalar,_Deg>::balanced( RealScalar colNorm, RealScalar rowNorm, + bool& isBalanced, RealScalar& colB, RealScalar& rowB ) +{ + if( RealScalar(0) == colNorm || RealScalar(0) == rowNorm + || !(numext::isfinite)(colNorm) || !(numext::isfinite)(rowNorm)){ + return true; + } + else + { + //To find the balancing coefficients, if the radix is 2, + //one finds \f$ \sigma \f$ such that + // \f$ 2^{2\sigma-1} < rowNorm / colNorm \le 2^{2\sigma+1} \f$ + // then the balancing coefficient for the row is \f$ 1/2^{\sigma} \f$ + // and the balancing coefficient for the column is \f$ 2^{\sigma} \f$ + const RealScalar radix = RealScalar(2); + const RealScalar radix2 = RealScalar(4); + + rowB = rowNorm / radix; + colB = RealScalar(1); + const RealScalar s = colNorm + rowNorm; + + // Find sigma s.t. rowNorm / 2 <= 2^(2*sigma) * colNorm + RealScalar scout = colNorm; + while (scout < rowB) + { + colB *= radix; + scout *= radix2; + } + + // We now have an upper-bound for sigma, try to lower it. + // Find sigma s.t. 2^(2*sigma) * colNorm / 2 < rowNorm + scout = colNorm * (colB / radix) * colB; // Avoid overflow. + while (scout >= rowNorm) + { + colB /= radix; + scout /= radix2; + } + + // This line is used to avoid insubstantial balancing. 
+ if ((rowNorm + radix * scout) < RealScalar(0.95) * s * colB) + { + isBalanced = false; + rowB = RealScalar(1) / colB; + return false; + } + else + { + return true; + } + } +} + +template< typename _Scalar, int _Deg > +inline +bool companion<_Scalar,_Deg>::balancedR( RealScalar colNorm, RealScalar rowNorm, + bool& isBalanced, RealScalar& colB, RealScalar& rowB ) +{ + if( RealScalar(0) == colNorm || RealScalar(0) == rowNorm ){ return true; } + else + { + /** + * Set the norm of the column and the row to the geometric mean + * of the row and column norm + */ + const RealScalar q = colNorm/rowNorm; + if( !isApprox( q, _Scalar(1) ) ) + { + rowB = sqrt( colNorm/rowNorm ); + colB = RealScalar(1)/rowB; + + isBalanced = false; + return false; + } + else{ + return true; } + } +} + + +template< typename _Scalar, int _Deg > +void companion<_Scalar,_Deg>::balance() +{ + using std::abs; + EIGEN_STATIC_ASSERT( Deg == Dynamic || 1 < Deg, YOU_MADE_A_PROGRAMMING_MISTAKE ); + const Index deg = m_monic.size(); + const Index deg_1 = deg-1; + + bool hasConverged=false; + while( !hasConverged ) + { + hasConverged = true; + RealScalar colNorm,rowNorm; + RealScalar colB,rowB; + + //First row, first column excluding the diagonal + //============================================== + colNorm = abs(m_bl_diag[0]); + rowNorm = abs(m_monic[0]); + + //Compute balancing of the row and the column + if( !balanced( colNorm, rowNorm, hasConverged, colB, rowB ) ) + { + m_bl_diag[0] *= colB; + m_monic[0] *= rowB; + } + + //Middle rows and columns excluding the diagonal + //============================================== + for( Index i=1; i headMonic( m_monic, 0, deg_1 ); + colNorm = headMonic.array().abs().sum(); + rowNorm = abs( m_bl_diag[ebl] ); + + //Compute balancing of the row and the column + if( !balanced( colNorm, rowNorm, hasConverged, colB, rowB ) ) + { + headMonic *= colB; + m_bl_diag[ebl] *= rowB; + } + } +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // 
EIGEN_COMPANION_H diff --git a/src/EigenUnsupported/src/Polynomials/PolynomialSolver.h b/src/EigenUnsupported/src/Polynomials/PolynomialSolver.h new file mode 100644 index 0000000..5e0ecbb --- /dev/null +++ b/src/EigenUnsupported/src/Polynomials/PolynomialSolver.h @@ -0,0 +1,428 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2010 Manuel Yguel +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_POLYNOMIAL_SOLVER_H +#define EIGEN_POLYNOMIAL_SOLVER_H + +namespace Eigen { + +/** \ingroup Polynomials_Module + * \class PolynomialSolverBase. + * + * \brief Defined to be inherited by polynomial solvers: it provides + * convenient methods such as + * - real roots, + * - greatest, smallest complex roots, + * - real roots with greatest, smallest absolute real value, + * - greatest, smallest real roots. + * + * It stores the set of roots as a vector of complexes. + * + */ +template< typename _Scalar, int _Deg > +class PolynomialSolverBase +{ + public: + EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_Deg==Dynamic ? 
Dynamic : _Deg) + + typedef _Scalar Scalar; + typedef typename NumTraits::Real RealScalar; + typedef std::complex RootType; + typedef Matrix RootsType; + + typedef DenseIndex Index; + + protected: + /** Sizes the root storage for \a poly: one root per degree, i.e. poly.size()-1 entries. */ + template< typename OtherPolynomial > + inline void setPolynomial( const OtherPolynomial& poly ){ + m_roots.resize(poly.size()-1); } + + public: + /** Allocates the root storage for \a poly; the roots themselves are filled in by the derived solver. */ + template< typename OtherPolynomial > + inline PolynomialSolverBase( const OtherPolynomial& poly ){ + // pass the coefficient vector itself: setPolynomial() only needs poly.size() + setPolynomial( poly ); } + + inline PolynomialSolverBase(){} + + public: + /** \returns the complex roots of the polynomial */ + inline const RootsType& roots() const { return m_roots; } + + public: + /** Clear and fills the back insertion sequence with the real roots of the polynomial + * i.e. the real part of the complex roots that have an imaginary part which + * absolute value is smaller than absImaginaryThreshold. + * absImaginaryThreshold takes the dummy_precision associated + * with the _Scalar template parameter of the PolynomialSolver class as the default value. + * + * \param[out] bi_seq : the back insertion sequence (stl concept) + * \param[in] absImaginaryThreshold : the maximum bound of the imaginary part of a complex + * number that is considered as real. + * */ + template + inline void realRoots( Stl_back_insertion_sequence& bi_seq, + const RealScalar& absImaginaryThreshold = NumTraits::dummy_precision() ) const + { + using std::abs; + bi_seq.clear(); + for(Index i=0; i + inline const RootType& selectComplexRoot_withRespectToNorm( squaredNormBinaryPredicate& pred ) const + { + Index res=0; + RealScalar norm2 = numext::abs2( m_roots[0] ); + for( Index i=1; i greater; + return selectComplexRoot_withRespectToNorm( greater ); + } + + /** + * \returns the complex root with smallest norm. 
+ */ + inline const RootType& smallestRoot() const + { + std::less less; + return selectComplexRoot_withRespectToNorm( less ); + } + + protected: + template + inline const RealScalar& selectRealRoot_withRespectToAbsRealPart( + squaredRealPartBinaryPredicate& pred, + bool& hasArealRoot, + const RealScalar& absImaginaryThreshold = NumTraits::dummy_precision() ) const + { + using std::abs; + hasArealRoot = false; + Index res=0; + RealScalar abs2(0); + + for( Index i=0; i + inline const RealScalar& selectRealRoot_withRespectToRealPart( + RealPartBinaryPredicate& pred, + bool& hasArealRoot, + const RealScalar& absImaginaryThreshold = NumTraits::dummy_precision() ) const + { + using std::abs; + hasArealRoot = false; + Index res=0; + RealScalar val(0); + + for( Index i=0; i::dummy_precision() ) const + { + std::greater greater; + return selectRealRoot_withRespectToAbsRealPart( greater, hasArealRoot, absImaginaryThreshold ); + } + + + /** + * \returns a real root with smallest absolute magnitude. + * A real root is defined as the real part of a complex root with absolute imaginary + * part smallest than absImaginaryThreshold. + * absImaginaryThreshold takes the dummy_precision associated + * with the _Scalar template parameter of the PolynomialSolver class as the default value. + * If no real root is found the boolean hasArealRoot is set to false and the real part of + * the root with smallest absolute imaginary part is returned instead. + * + * \param[out] hasArealRoot : boolean true if a real root is found according to the + * absImaginaryThreshold criterion, false otherwise. + * \param[in] absImaginaryThreshold : threshold on the absolute imaginary part to decide + * whether or not a root is real. 
+ */ + inline const RealScalar& absSmallestRealRoot( + bool& hasArealRoot, + const RealScalar& absImaginaryThreshold = NumTraits::dummy_precision() ) const + { + std::less less; + return selectRealRoot_withRespectToAbsRealPart( less, hasArealRoot, absImaginaryThreshold ); + } + + + /** + * \returns the real root with greatest value. + * A real root is defined as the real part of a complex root with absolute imaginary + * part smallest than absImaginaryThreshold. + * absImaginaryThreshold takes the dummy_precision associated + * with the _Scalar template parameter of the PolynomialSolver class as the default value. + * If no real root is found the boolean hasArealRoot is set to false and the real part of + * the root with smallest absolute imaginary part is returned instead. + * + * \param[out] hasArealRoot : boolean true if a real root is found according to the + * absImaginaryThreshold criterion, false otherwise. + * \param[in] absImaginaryThreshold : threshold on the absolute imaginary part to decide + * whether or not a root is real. + */ + inline const RealScalar& greatestRealRoot( + bool& hasArealRoot, + const RealScalar& absImaginaryThreshold = NumTraits::dummy_precision() ) const + { + std::greater greater; + return selectRealRoot_withRespectToRealPart( greater, hasArealRoot, absImaginaryThreshold ); + } + + + /** + * \returns the real root with smallest value. + * A real root is defined as the real part of a complex root with absolute imaginary + * part smallest than absImaginaryThreshold. + * absImaginaryThreshold takes the dummy_precision associated + * with the _Scalar template parameter of the PolynomialSolver class as the default value. + * If no real root is found the boolean hasArealRoot is set to false and the real part of + * the root with smallest absolute imaginary part is returned instead. + * + * \param[out] hasArealRoot : boolean true if a real root is found according to the + * absImaginaryThreshold criterion, false otherwise. 
+ * \param[in] absImaginaryThreshold : threshold on the absolute imaginary part to decide + * whether or not a root is real. + */ + inline const RealScalar& smallestRealRoot( + bool& hasArealRoot, + const RealScalar& absImaginaryThreshold = NumTraits::dummy_precision() ) const + { + std::less less; + return selectRealRoot_withRespectToRealPart( less, hasArealRoot, absImaginaryThreshold ); + } + + protected: + RootsType m_roots; +}; + +#define EIGEN_POLYNOMIAL_SOLVER_BASE_INHERITED_TYPES( BASE ) \ + typedef typename BASE::Scalar Scalar; \ + typedef typename BASE::RealScalar RealScalar; \ + typedef typename BASE::RootType RootType; \ + typedef typename BASE::RootsType RootsType; + + + +/** \ingroup Polynomials_Module + * + * \class PolynomialSolver + * + * \brief A polynomial solver + * + * Computes the complex roots of a real polynomial. + * + * \param _Scalar the scalar type, i.e., the type of the polynomial coefficients + * \param _Deg the degree of the polynomial, can be a compile time value or Dynamic. + * Notice that the number of polynomial coefficients is _Deg+1. + * + * This class implements a polynomial solver and provides convenient methods such as + * - real roots, + * - greatest, smallest complex roots, + * - real roots with greatest, smallest absolute real value. + * - greatest, smallest real roots. + * + * WARNING: this polynomial solver is experimental, part of the unsupported Eigen modules. + * + * + * Currently a QR algorithm is used to compute the eigenvalues of the companion matrix of + * the polynomial to compute its roots. + * This supposes that the complex moduli of the roots are all distinct: e.g. there should + * be no multiple roots or conjugate roots for instance. + * With 32bit (float) floating types this problem shows up frequently. + * However, almost always, correct accuracy is reached even in these cases for 64bit + * (double) floating types and small polynomial degree (<20). 
+ */ +template +class PolynomialSolver : public PolynomialSolverBase<_Scalar,_Deg> +{ + public: + EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_Deg==Dynamic ? Dynamic : _Deg) + + typedef PolynomialSolverBase<_Scalar,_Deg> PS_Base; + EIGEN_POLYNOMIAL_SOLVER_BASE_INHERITED_TYPES( PS_Base ) + + typedef Matrix CompanionMatrixType; + typedef typename internal::conditional::IsComplex, + ComplexEigenSolver, + EigenSolver >::type EigenSolverType; + typedef typename internal::conditional::IsComplex, Scalar, std::complex >::type ComplexScalar; + + public: + /** Computes the complex roots of a new polynomial. */ + template< typename OtherPolynomial > + void compute( const OtherPolynomial& poly ) + { + eigen_assert( Scalar(0) != poly[poly.size()-1] ); + eigen_assert( poly.size() > 1 ); + if(poly.size() > 2 ) + { + internal::companion companion( poly ); + companion.balance(); + m_eigenSolver.compute( companion.denseMatrix() ); + m_roots = m_eigenSolver.eigenvalues(); + // cleanup noise in imaginary part of real roots: + // if the imaginary part is rather small compared to the real part + // and that cancelling the imaginary part yield a smaller evaluation, + // then it's safe to keep the real part only. + RealScalar coarse_prec = RealScalar(std::pow(4,poly.size()+1))*NumTraits::epsilon(); + for(Index i = 0; i + inline PolynomialSolver( const OtherPolynomial& poly ){ + compute( poly ); } + + inline PolynomialSolver(){} + + protected: + using PS_Base::m_roots; + EigenSolverType m_eigenSolver; +}; + + +template< typename _Scalar > +class PolynomialSolver<_Scalar,1> : public PolynomialSolverBase<_Scalar,1> +{ + public: + typedef PolynomialSolverBase<_Scalar,1> PS_Base; + EIGEN_POLYNOMIAL_SOLVER_BASE_INHERITED_TYPES( PS_Base ) + + public: + /** Computes the complex roots of a new polynomial. 
*/ + template< typename OtherPolynomial > + void compute( const OtherPolynomial& poly ) + { + eigen_assert( poly.size() == 2 ); + eigen_assert( Scalar(0) != poly[1] ); + m_roots[0] = -poly[0]/poly[1]; + } + + public: + template< typename OtherPolynomial > + inline PolynomialSolver( const OtherPolynomial& poly ){ + compute( poly ); } + + inline PolynomialSolver(){} + + protected: + using PS_Base::m_roots; +}; + +} // end namespace Eigen + +#endif // EIGEN_POLYNOMIAL_SOLVER_H diff --git a/src/EigenUnsupported/src/Polynomials/PolynomialUtils.h b/src/EigenUnsupported/src/Polynomials/PolynomialUtils.h new file mode 100644 index 0000000..394e857 --- /dev/null +++ b/src/EigenUnsupported/src/Polynomials/PolynomialUtils.h @@ -0,0 +1,143 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2010 Manuel Yguel +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_POLYNOMIAL_UTILS_H +#define EIGEN_POLYNOMIAL_UTILS_H + +namespace Eigen { + +/** \ingroup Polynomials_Module + * \returns the evaluation of the polynomial at x using Horner algorithm. + * + * \param[in] poly : the vector of coefficients of the polynomial ordered + * by degrees i.e. poly[i] is the coefficient of degree i of the polynomial + * e.g. \f$ 1 + 3x^2 \f$ is stored as a vector \f$ [ 1, 0, 3 ] \f$. + * \param[in] x : the value to evaluate the polynomial at. + * + * \note for stability: + * \f$ |x| \le 1 \f$ + */ +template +inline +T poly_eval_horner( const Polynomials& poly, const T& x ) +{ + T val=poly[poly.size()-1]; + for(DenseIndex i=poly.size()-2; i>=0; --i ){ + val = val*x + poly[i]; } + return val; +} + +/** \ingroup Polynomials_Module + * \returns the evaluation of the polynomial at x using stabilized Horner algorithm. 
+ * + * \param[in] poly : the vector of coefficients of the polynomial ordered + * by degrees i.e. poly[i] is the coefficient of degree i of the polynomial + * e.g. \f$ 1 + 3x^2 \f$ is stored as a vector \f$ [ 1, 0, 3 ] \f$. + * \param[in] x : the value to evaluate the polynomial at. + */ +template +inline +T poly_eval( const Polynomials& poly, const T& x ) +{ + typedef typename NumTraits::Real Real; + + if( numext::abs2( x ) <= Real(1) ){ + return poly_eval_horner( poly, x ); } + else + { + T val=poly[0]; + T inv_x = T(1)/x; + for( DenseIndex i=1; i +inline +typename NumTraits::Real cauchy_max_bound( const Polynomial& poly ) +{ + using std::abs; + typedef typename Polynomial::Scalar Scalar; + typedef typename NumTraits::Real Real; + + eigen_assert( Scalar(0) != poly[poly.size()-1] ); + const Scalar inv_leading_coeff = Scalar(1)/poly[poly.size()-1]; + Real cb(0); + + for( DenseIndex i=0; i +inline +typename NumTraits::Real cauchy_min_bound( const Polynomial& poly ) +{ + using std::abs; + typedef typename Polynomial::Scalar Scalar; + typedef typename NumTraits::Real Real; + + DenseIndex i=0; + while( i +void roots_to_monicPolynomial( const RootVector& rv, Polynomial& poly ) +{ + + typedef typename Polynomial::Scalar Scalar; + + poly.setZero( rv.size()+1 ); + poly[0] = -rv[0]; poly[1] = Scalar(1); + for( DenseIndex i=1; i< rv.size(); ++i ) + { + for( DenseIndex j=i+1; j>0; --j ){ poly[j] = poly[j-1] - rv[i]*poly[j]; } + poly[0] = -rv[i]*poly[0]; + } +} + +} // end namespace Eigen + +#endif // EIGEN_POLYNOMIAL_UTILS_H diff --git a/src/EigenUnsupported/src/Skyline/SkylineInplaceLU.h b/src/EigenUnsupported/src/Skyline/SkylineInplaceLU.h new file mode 100644 index 0000000..6d0370d --- /dev/null +++ b/src/EigenUnsupported/src/Skyline/SkylineInplaceLU.h @@ -0,0 +1,352 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2008 Guillaume Saupin +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SKYLINEINPLACELU_H +#define EIGEN_SKYLINEINPLACELU_H + +namespace Eigen { + +/** \ingroup Skyline_Module + * + * \class SkylineInplaceLU + * + * \brief Inplace LU decomposition of a skyline matrix and associated features + * + * \param MatrixType the type of the matrix of which we are computing the LU factorization + * + */ +template +class SkylineInplaceLU { +protected: + typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::Index Index; + + typedef typename NumTraits::Real RealScalar; + +public: + + /** Creates a LU object and compute the respective factorization of \a matrix using + * flags \a flags. */ + SkylineInplaceLU(MatrixType& matrix, int flags = 0) + : /*m_matrix(matrix.rows(), matrix.cols()),*/ m_flags(flags), m_status(0), m_lu(matrix) { + m_precision = RealScalar(0.1) * Eigen::dummy_precision (); + m_lu.IsRowMajor ? computeRowMajor() : compute(); + } + + /** Sets the relative threshold value used to prune zero coefficients during the decomposition. + * + * Setting a value greater than zero speeds up computation, and yields to an incomplete + * factorization with fewer non zero coefficients. Such approximate factors are especially + * useful to initialize an iterative solver. + * + * Note that the exact meaning of this parameter might depends on the actual + * backend. Moreover, not all backends support this feature. + * + * \sa precision() */ + void setPrecision(RealScalar v) { + m_precision = v; + } + + /** \returns the current precision. + * + * \sa setPrecision() */ + RealScalar precision() const { + return m_precision; + } + + /** Sets the flags. 
Possible values are: + * - CompleteFactorization + * - IncompleteFactorization + * - MemoryEfficient + * - one of the ordering methods + * - etc... + * + * \sa flags() */ + void setFlags(int f) { + m_flags = f; + } + + /** \returns the current flags */ + int flags() const { + return m_flags; + } + + void setOrderingMethod(int m) { + m_flags = m; + } + + int orderingMethod() const { + return m_flags; + } + + /** Computes/re-computes the LU factorization */ + void compute(); + void computeRowMajor(); + + /** \returns the lower triangular matrix L */ + //inline const MatrixType& matrixL() const { return m_matrixL; } + + /** \returns the upper triangular matrix U */ + //inline const MatrixType& matrixU() const { return m_matrixU; } + + template + bool solve(const MatrixBase &b, MatrixBase* x, + const int transposed = 0) const; + + /** \returns true if the factorization succeeded */ + inline bool succeeded(void) const { + return m_succeeded; + } + +protected: + RealScalar m_precision; + int m_flags; + mutable int m_status; + bool m_succeeded; + MatrixType& m_lu; +}; + +/** Computes / recomputes the in place LU decomposition of the SkylineInplaceLU. + * using the default algorithm. 
+ */ +template +//template +void SkylineInplaceLU::compute() { + const size_t rows = m_lu.rows(); + const size_t cols = m_lu.cols(); + + eigen_assert(rows == cols && "We do not (yet) support rectangular LU."); + eigen_assert(!m_lu.IsRowMajor && "LU decomposition does not work with rowMajor Storage"); + + for (Index row = 0; row < rows; row++) { + const double pivot = m_lu.coeffDiag(row); + + //Lower matrix Columns update + const Index& col = row; + for (typename MatrixType::InnerLowerIterator lIt(m_lu, col); lIt; ++lIt) { + lIt.valueRef() /= pivot; + } + + //Upper matrix update -> contiguous memory access + typename MatrixType::InnerLowerIterator lIt(m_lu, col); + for (Index rrow = row + 1; rrow < m_lu.rows(); rrow++) { + typename MatrixType::InnerUpperIterator uItPivot(m_lu, row); + typename MatrixType::InnerUpperIterator uIt(m_lu, rrow); + const double coef = lIt.value(); + + uItPivot += (rrow - row - 1); + + //update upper part -> contiguous memory access + for (++uItPivot; uIt && uItPivot;) { + uIt.valueRef() -= uItPivot.value() * coef; + + ++uIt; + ++uItPivot; + } + ++lIt; + } + + //Upper matrix update -> non contiguous memory access + typename MatrixType::InnerLowerIterator lIt3(m_lu, col); + for (Index rrow = row + 1; rrow < m_lu.rows(); rrow++) { + typename MatrixType::InnerUpperIterator uItPivot(m_lu, row); + const double coef = lIt3.value(); + + //update lower part -> non contiguous memory access + for (Index i = 0; i < rrow - row - 1; i++) { + m_lu.coeffRefLower(rrow, row + i + 1) -= uItPivot.value() * coef; + ++uItPivot; + } + ++lIt3; + } + //update diag -> contiguous + typename MatrixType::InnerLowerIterator lIt2(m_lu, col); + for (Index rrow = row + 1; rrow < m_lu.rows(); rrow++) { + + typename MatrixType::InnerUpperIterator uItPivot(m_lu, row); + typename MatrixType::InnerUpperIterator uIt(m_lu, rrow); + const double coef = lIt2.value(); + + uItPivot += (rrow - row - 1); + m_lu.coeffRefDiag(rrow) -= uItPivot.value() * coef; + ++lIt2; + } + } +} + 
+template +void SkylineInplaceLU::computeRowMajor() { + const size_t rows = m_lu.rows(); + const size_t cols = m_lu.cols(); + + eigen_assert(rows == cols && "We do not (yet) support rectangular LU."); + eigen_assert(m_lu.IsRowMajor && "You're trying to apply rowMajor decomposition on a ColMajor matrix !"); + + for (Index row = 0; row < rows; row++) { + typename MatrixType::InnerLowerIterator llIt(m_lu, row); + + + for (Index col = llIt.col(); col < row; col++) { + if (m_lu.coeffExistLower(row, col)) { + const double diag = m_lu.coeffDiag(col); + + typename MatrixType::InnerLowerIterator lIt(m_lu, row); + typename MatrixType::InnerUpperIterator uIt(m_lu, col); + + + const Index offset = lIt.col() - uIt.row(); + + + Index stop = offset > 0 ? col - lIt.col() : col - uIt.row(); + + //#define VECTORIZE +#ifdef VECTORIZE + Map rowVal(lIt.valuePtr() + (offset > 0 ? 0 : -offset), stop); + Map colVal(uIt.valuePtr() + (offset > 0 ? offset : 0), stop); + + + Scalar newCoeff = m_lu.coeffLower(row, col) - rowVal.dot(colVal); +#else + if (offset > 0) //Skip zero value of lIt + uIt += offset; + else //Skip zero values of uIt + lIt += -offset; + Scalar newCoeff = m_lu.coeffLower(row, col); + + for (Index k = 0; k < stop; ++k) { + const Scalar tmp = newCoeff; + newCoeff = tmp - lIt.value() * uIt.value(); + ++lIt; + ++uIt; + } +#endif + + m_lu.coeffRefLower(row, col) = newCoeff / diag; + } + } + + //Upper matrix update + const Index col = row; + typename MatrixType::InnerUpperIterator uuIt(m_lu, col); + for (Index rrow = uuIt.row(); rrow < col; rrow++) { + + typename MatrixType::InnerLowerIterator lIt(m_lu, rrow); + typename MatrixType::InnerUpperIterator uIt(m_lu, col); + const Index offset = lIt.col() - uIt.row(); + + Index stop = offset > 0 ? rrow - lIt.col() : rrow - uIt.row(); + +#ifdef VECTORIZE + Map rowVal(lIt.valuePtr() + (offset > 0 ? 0 : -offset), stop); + Map colVal(uIt.valuePtr() + (offset > 0 ? 
offset : 0), stop); + + Scalar newCoeff = m_lu.coeffUpper(rrow, col) - rowVal.dot(colVal); +#else + if (offset > 0) //Skip zero value of lIt + uIt += offset; + else //Skip zero values of uIt + lIt += -offset; + Scalar newCoeff = m_lu.coeffUpper(rrow, col); + for (Index k = 0; k < stop; ++k) { + const Scalar tmp = newCoeff; + newCoeff = tmp - lIt.value() * uIt.value(); + + ++lIt; + ++uIt; + } +#endif + m_lu.coeffRefUpper(rrow, col) = newCoeff; + } + + + //Diag matrix update + typename MatrixType::InnerLowerIterator lIt(m_lu, row); + typename MatrixType::InnerUpperIterator uIt(m_lu, row); + + const Index offset = lIt.col() - uIt.row(); + + + Index stop = offset > 0 ? lIt.size() : uIt.size(); +#ifdef VECTORIZE + Map rowVal(lIt.valuePtr() + (offset > 0 ? 0 : -offset), stop); + Map colVal(uIt.valuePtr() + (offset > 0 ? offset : 0), stop); + Scalar newCoeff = m_lu.coeffDiag(row) - rowVal.dot(colVal); +#else + if (offset > 0) //Skip zero value of lIt + uIt += offset; + else //Skip zero values of uIt + lIt += -offset; + Scalar newCoeff = m_lu.coeffDiag(row); + for (Index k = 0; k < stop; ++k) { + const Scalar tmp = newCoeff; + newCoeff = tmp - lIt.value() * uIt.value(); + ++lIt; + ++uIt; + } +#endif + m_lu.coeffRefDiag(row) = newCoeff; + } +} + +/** Computes *x = U^-1 L^-1 b + * + * If \a transpose is set to SvTranspose or SvAdjoint, the solution + * of the transposed/adjoint system is computed instead. + * + * Not all backends implement the solution of the transposed or + * adjoint system. 
+ */ +template +template +bool SkylineInplaceLU::solve(const MatrixBase &b, MatrixBase* x, const int transposed) const { + const size_t rows = m_lu.rows(); + const size_t cols = m_lu.cols(); + + + for (Index row = 0; row < rows; row++) { + x->coeffRef(row) = b.coeff(row); + Scalar newVal = x->coeff(row); + typename MatrixType::InnerLowerIterator lIt(m_lu, row); + + Index col = lIt.col(); + while (lIt.col() < row) { + + newVal -= x->coeff(col++) * lIt.value(); + ++lIt; + } + + x->coeffRef(row) = newVal; + } + + + for (Index col = rows - 1; col > 0; col--) { + x->coeffRef(col) = x->coeff(col) / m_lu.coeffDiag(col); + + const Scalar x_col = x->coeff(col); + + typename MatrixType::InnerUpperIterator uIt(m_lu, col); + uIt += uIt.size()-1; + + + while (uIt) { + x->coeffRef(uIt.row()) -= x_col * uIt.value(); + //TODO : introduce --operator + uIt += -1; + } + + + } + x->coeffRef(0) = x->coeff(0) / m_lu.coeffDiag(0); + + return true; +} + +} // end namespace Eigen + +#endif // EIGEN_SKYLINEINPLACELU_H diff --git a/src/EigenUnsupported/src/Skyline/SkylineMatrix.h b/src/EigenUnsupported/src/Skyline/SkylineMatrix.h new file mode 100644 index 0000000..7c7eace --- /dev/null +++ b/src/EigenUnsupported/src/Skyline/SkylineMatrix.h @@ -0,0 +1,862 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2009 Guillaume Saupin +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SKYLINEMATRIX_H +#define EIGEN_SKYLINEMATRIX_H + +#include "SkylineStorage.h" +#include "SkylineMatrixBase.h" + +namespace Eigen { + +/** \ingroup Skyline_Module + * + * \class SkylineMatrix + * + * \brief The main skyline matrix class + * + * This class implements a skyline matrix using the very uncommon storage + * scheme. + * + * \param _Scalar the scalar type, i.e. 
the type of the coefficients + * \param _Options Union of bit flags controlling the storage scheme. Currently the only possibility + * is RowMajor. The default is 0 which means column-major. + * + * + */ +namespace internal { +template +struct traits > { + typedef _Scalar Scalar; + typedef Sparse StorageKind; + + enum { + RowsAtCompileTime = Dynamic, + ColsAtCompileTime = Dynamic, + MaxRowsAtCompileTime = Dynamic, + MaxColsAtCompileTime = Dynamic, + Flags = SkylineBit | _Options, + CoeffReadCost = NumTraits::ReadCost, + }; +}; +} + +template +class SkylineMatrix +: public SkylineMatrixBase > { +public: + EIGEN_SKYLINE_GENERIC_PUBLIC_INTERFACE(SkylineMatrix) + EIGEN_SKYLINE_INHERIT_ASSIGNMENT_OPERATOR(SkylineMatrix, +=) + EIGEN_SKYLINE_INHERIT_ASSIGNMENT_OPERATOR(SkylineMatrix, -=) + + using Base::IsRowMajor; + +protected: + + typedef SkylineMatrix TransposedSkylineMatrix; + + Index m_outerSize; + Index m_innerSize; + +public: + Index* m_colStartIndex; + Index* m_rowStartIndex; + SkylineStorage m_data; + +public: + + inline Index rows() const { + return IsRowMajor ? m_outerSize : m_innerSize; + } + + inline Index cols() const { + return IsRowMajor ? 
m_innerSize : m_outerSize; + } + + inline Index innerSize() const { + return m_innerSize; + } + + inline Index outerSize() const { + return m_outerSize; + } + + inline Index upperNonZeros() const { + return m_data.upperSize(); + } + + inline Index lowerNonZeros() const { + return m_data.lowerSize(); + } + + inline Index upperNonZeros(Index j) const { + return m_colStartIndex[j + 1] - m_colStartIndex[j]; + } + + inline Index lowerNonZeros(Index j) const { + return m_rowStartIndex[j + 1] - m_rowStartIndex[j]; + } + + inline const Scalar* _diagPtr() const { + return &m_data.diag(0); + } + + inline Scalar* _diagPtr() { + return &m_data.diag(0); + } + + inline const Scalar* _upperPtr() const { + return &m_data.upper(0); + } + + inline Scalar* _upperPtr() { + return &m_data.upper(0); + } + + inline const Scalar* _lowerPtr() const { + return &m_data.lower(0); + } + + inline Scalar* _lowerPtr() { + return &m_data.lower(0); + } + + inline const Index* _upperProfilePtr() const { + return &m_data.upperProfile(0); + } + + inline Index* _upperProfilePtr() { + return &m_data.upperProfile(0); + } + + inline const Index* _lowerProfilePtr() const { + return &m_data.lowerProfile(0); + } + + inline Index* _lowerProfilePtr() { + return &m_data.lowerProfile(0); + } + + inline Scalar coeff(Index row, Index col) const { + const Index outer = IsRowMajor ? row : col; + const Index inner = IsRowMajor ? 
col : row; + + eigen_assert(outer < outerSize()); + eigen_assert(inner < innerSize()); + + if (outer == inner) + return this->m_data.diag(outer); + + if (IsRowMajor) { + if (inner > outer) //upper matrix + { + const Index minOuterIndex = inner - m_data.upperProfile(inner); + if (outer >= minOuterIndex) + return this->m_data.upper(m_colStartIndex[inner] + outer - (inner - m_data.upperProfile(inner))); + else + return Scalar(0); + } + if (inner < outer) //lower matrix + { + const Index minInnerIndex = outer - m_data.lowerProfile(outer); + if (inner >= minInnerIndex) + return this->m_data.lower(m_rowStartIndex[outer] + inner - (outer - m_data.lowerProfile(outer))); + else + return Scalar(0); + } + return m_data.upper(m_colStartIndex[inner] + outer - inner); + } else { + if (outer > inner) //upper matrix + { + const Index maxOuterIndex = inner + m_data.upperProfile(inner); + if (outer <= maxOuterIndex) + return this->m_data.upper(m_colStartIndex[inner] + (outer - inner)); + else + return Scalar(0); + } + if (outer < inner) //lower matrix + { + const Index maxInnerIndex = outer + m_data.lowerProfile(outer); + + if (inner <= maxInnerIndex) + return this->m_data.lower(m_rowStartIndex[outer] + (inner - outer)); + else + return Scalar(0); + } + } + } + + inline Scalar& coeffRef(Index row, Index col) { + const Index outer = IsRowMajor ? row : col; + const Index inner = IsRowMajor ? 
col : row; + + eigen_assert(outer < outerSize()); + eigen_assert(inner < innerSize()); + + if (outer == inner) + return this->m_data.diag(outer); + + if (IsRowMajor) { + if (col > row) //upper matrix + { + const Index minOuterIndex = inner - m_data.upperProfile(inner); + eigen_assert(outer >= minOuterIndex && "You tried to access a coeff that does not exist in the storage"); + return this->m_data.upper(m_colStartIndex[inner] + outer - (inner - m_data.upperProfile(inner))); + } + if (col < row) //lower matrix + { + const Index minInnerIndex = outer - m_data.lowerProfile(outer); + eigen_assert(inner >= minInnerIndex && "You tried to access a coeff that does not exist in the storage"); + return this->m_data.lower(m_rowStartIndex[outer] + inner - (outer - m_data.lowerProfile(outer))); + } + } else { + if (outer > inner) //upper matrix + { + const Index maxOuterIndex = inner + m_data.upperProfile(inner); + eigen_assert(outer <= maxOuterIndex && "You tried to access a coeff that does not exist in the storage"); + return this->m_data.upper(m_colStartIndex[inner] + (outer - inner)); + } + if (outer < inner) //lower matrix + { + const Index maxInnerIndex = outer + m_data.lowerProfile(outer); + eigen_assert(inner <= maxInnerIndex && "You tried to access a coeff that does not exist in the storage"); + return this->m_data.lower(m_rowStartIndex[outer] + (inner - outer)); + } + } + } + + inline Scalar coeffDiag(Index idx) const { + eigen_assert(idx < outerSize()); + eigen_assert(idx < innerSize()); + return this->m_data.diag(idx); + } + + inline Scalar coeffLower(Index row, Index col) const { + const Index outer = IsRowMajor ? row : col; + const Index inner = IsRowMajor ? 
col : row; + + eigen_assert(outer < outerSize()); + eigen_assert(inner < innerSize()); + eigen_assert(inner != outer); + + if (IsRowMajor) { + const Index minInnerIndex = outer - m_data.lowerProfile(outer); + if (inner >= minInnerIndex) + return this->m_data.lower(m_rowStartIndex[outer] + inner - (outer - m_data.lowerProfile(outer))); + else + return Scalar(0); + + } else { + const Index maxInnerIndex = outer + m_data.lowerProfile(outer); + if (inner <= maxInnerIndex) + return this->m_data.lower(m_rowStartIndex[outer] + (inner - outer)); + else + return Scalar(0); + } + } + + inline Scalar coeffUpper(Index row, Index col) const { + const Index outer = IsRowMajor ? row : col; + const Index inner = IsRowMajor ? col : row; + + eigen_assert(outer < outerSize()); + eigen_assert(inner < innerSize()); + eigen_assert(inner != outer); + + if (IsRowMajor) { + const Index minOuterIndex = inner - m_data.upperProfile(inner); + if (outer >= minOuterIndex) + return this->m_data.upper(m_colStartIndex[inner] + outer - (inner - m_data.upperProfile(inner))); + else + return Scalar(0); + } else { + const Index maxOuterIndex = inner + m_data.upperProfile(inner); + if (outer <= maxOuterIndex) + return this->m_data.upper(m_colStartIndex[inner] + (outer - inner)); + else + return Scalar(0); + } + } + + inline Scalar& coeffRefDiag(Index idx) { + eigen_assert(idx < outerSize()); + eigen_assert(idx < innerSize()); + return this->m_data.diag(idx); + } + + inline Scalar& coeffRefLower(Index row, Index col) { + const Index outer = IsRowMajor ? row : col; + const Index inner = IsRowMajor ? 
col : row; + + eigen_assert(outer < outerSize()); + eigen_assert(inner < innerSize()); + eigen_assert(inner != outer); + + if (IsRowMajor) { + const Index minInnerIndex = outer - m_data.lowerProfile(outer); + eigen_assert(inner >= minInnerIndex && "You tried to access a coeff that does not exist in the storage"); + return this->m_data.lower(m_rowStartIndex[outer] + inner - (outer - m_data.lowerProfile(outer))); + } else { + const Index maxInnerIndex = outer + m_data.lowerProfile(outer); + eigen_assert(inner <= maxInnerIndex && "You tried to access a coeff that does not exist in the storage"); + return this->m_data.lower(m_rowStartIndex[outer] + (inner - outer)); + } + } + + inline bool coeffExistLower(Index row, Index col) { + const Index outer = IsRowMajor ? row : col; + const Index inner = IsRowMajor ? col : row; + + eigen_assert(outer < outerSize()); + eigen_assert(inner < innerSize()); + eigen_assert(inner != outer); + + if (IsRowMajor) { + const Index minInnerIndex = outer - m_data.lowerProfile(outer); + return inner >= minInnerIndex; + } else { + const Index maxInnerIndex = outer + m_data.lowerProfile(outer); + return inner <= maxInnerIndex; + } + } + + inline Scalar& coeffRefUpper(Index row, Index col) { + const Index outer = IsRowMajor ? row : col; + const Index inner = IsRowMajor ? 
col : row; + + eigen_assert(outer < outerSize()); + eigen_assert(inner < innerSize()); + eigen_assert(inner != outer); + + if (IsRowMajor) { + const Index minOuterIndex = inner - m_data.upperProfile(inner); + eigen_assert(outer >= minOuterIndex && "You tried to access a coeff that does not exist in the storage"); + return this->m_data.upper(m_colStartIndex[inner] + outer - (inner - m_data.upperProfile(inner))); + } else { + const Index maxOuterIndex = inner + m_data.upperProfile(inner); + eigen_assert(outer <= maxOuterIndex && "You tried to access a coeff that does not exist in the storage"); + return this->m_data.upper(m_colStartIndex[inner] + (outer - inner)); + } + } + + inline bool coeffExistUpper(Index row, Index col) { + const Index outer = IsRowMajor ? row : col; + const Index inner = IsRowMajor ? col : row; + + eigen_assert(outer < outerSize()); + eigen_assert(inner < innerSize()); + eigen_assert(inner != outer); + + if (IsRowMajor) { + const Index minOuterIndex = inner - m_data.upperProfile(inner); + return outer >= minOuterIndex; + } else { + const Index maxOuterIndex = inner + m_data.upperProfile(inner); + return outer <= maxOuterIndex; + } + } + + +protected: + +public: + class InnerUpperIterator; + class InnerLowerIterator; + + class OuterUpperIterator; + class OuterLowerIterator; + + /** Removes all non zeros */ + inline void setZero() { + m_data.clear(); + memset(m_colStartIndex, 0, (m_outerSize + 1) * sizeof (Index)); + memset(m_rowStartIndex, 0, (m_outerSize + 1) * sizeof (Index)); + } + + /** \returns the number of non zero coefficients */ + inline Index nonZeros() const { + return m_data.diagSize() + m_data.upperSize() + m_data.lowerSize(); + } + + /** Preallocates \a reserveSize non zeros */ + inline void reserve(Index reserveSize, Index reserveUpperSize, Index reserveLowerSize) { + m_data.reserve(reserveSize, reserveUpperSize, reserveLowerSize); + } + + /** \returns a reference to a novel non zero coefficient with coordinates \a row x \a col. 
+ + * + * \warning This function can be extremely slow if the non zero coefficients + * are not inserted in a coherent order. + * + * After an insertion session, you should call the finalize() function. + */ + EIGEN_DONT_INLINE Scalar & insert(Index row, Index col) { + const Index outer = IsRowMajor ? row : col; + const Index inner = IsRowMajor ? col : row; + + eigen_assert(outer < outerSize()); + eigen_assert(inner < innerSize()); + + if (outer == inner) + return m_data.diag(col); + + if (IsRowMajor) { + if (outer < inner) //upper matrix + { + Index minOuterIndex = 0; + minOuterIndex = inner - m_data.upperProfile(inner); + + if (outer < minOuterIndex) //The value does not yet exist + { + const Index previousProfile = m_data.upperProfile(inner); + + m_data.upperProfile(inner) = inner - outer; + + + const Index bandIncrement = m_data.upperProfile(inner) - previousProfile; + //shift data stored after this new one + const Index stop = m_colStartIndex[cols()]; + const Index start = m_colStartIndex[inner]; + + + for (Index innerIdx = stop; innerIdx >= start; innerIdx--) { + m_data.upper(innerIdx + bandIncrement) = m_data.upper(innerIdx); + } + + for (Index innerIdx = cols(); innerIdx > inner; innerIdx--) { + m_colStartIndex[innerIdx] += bandIncrement; + } + + //zeros new data + memset(this->_upperPtr() + start, 0, (bandIncrement - 1) * sizeof (Scalar)); + + return m_data.upper(m_colStartIndex[inner]); + } else { + return m_data.upper(m_colStartIndex[inner] + outer - (inner - m_data.upperProfile(inner))); + } + } + + if (outer > inner) //lower matrix + { + const Index minInnerIndex = outer - m_data.lowerProfile(outer); + if (inner < minInnerIndex) //The value does not yet exist + { + const Index previousProfile = m_data.lowerProfile(outer); + m_data.lowerProfile(outer) = outer - inner; + + const Index bandIncrement = m_data.lowerProfile(outer) - previousProfile; + //shift data stored after this new one + const Index stop = m_rowStartIndex[rows()]; + const Index start = 
m_rowStartIndex[outer]; + + + for (Index innerIdx = stop; innerIdx >= start; innerIdx--) { + m_data.lower(innerIdx + bandIncrement) = m_data.lower(innerIdx); + } + + for (Index innerIdx = rows(); innerIdx > outer; innerIdx--) { + m_rowStartIndex[innerIdx] += bandIncrement; + } + + //zeros new data + memset(this->_lowerPtr() + start, 0, (bandIncrement - 1) * sizeof (Scalar)); + return m_data.lower(m_rowStartIndex[outer]); + } else { + return m_data.lower(m_rowStartIndex[outer] + inner - (outer - m_data.lowerProfile(outer))); + } + } + } else { + if (outer > inner) //upper matrix + { + const Index maxOuterIndex = inner + m_data.upperProfile(inner); + if (outer > maxOuterIndex) //The value does not yet exist + { + const Index previousProfile = m_data.upperProfile(inner); + m_data.upperProfile(inner) = outer - inner; + + const Index bandIncrement = m_data.upperProfile(inner) - previousProfile; + //shift data stored after this new one + const Index stop = m_rowStartIndex[rows()]; + const Index start = m_rowStartIndex[inner + 1]; + + for (Index innerIdx = stop; innerIdx >= start; innerIdx--) { + m_data.upper(innerIdx + bandIncrement) = m_data.upper(innerIdx); + } + + for (Index innerIdx = inner + 1; innerIdx < outerSize() + 1; innerIdx++) { + m_rowStartIndex[innerIdx] += bandIncrement; + } + memset(this->_upperPtr() + m_rowStartIndex[inner] + previousProfile + 1, 0, (bandIncrement - 1) * sizeof (Scalar)); + return m_data.upper(m_rowStartIndex[inner] + m_data.upperProfile(inner)); + } else { + return m_data.upper(m_rowStartIndex[inner] + (outer - inner)); + } + } + + if (outer < inner) //lower matrix + { + const Index maxInnerIndex = outer + m_data.lowerProfile(outer); + if (inner > maxInnerIndex) //The value does not yet exist + { + const Index previousProfile = m_data.lowerProfile(outer); + m_data.lowerProfile(outer) = inner - outer; + + const Index bandIncrement = m_data.lowerProfile(outer) - previousProfile; + //shift data stored after this new one + const Index stop 
= m_colStartIndex[cols()]; + const Index start = m_colStartIndex[outer + 1]; + + for (Index innerIdx = stop; innerIdx >= start; innerIdx--) { + m_data.lower(innerIdx + bandIncrement) = m_data.lower(innerIdx); + } + + for (Index innerIdx = outer + 1; innerIdx < outerSize() + 1; innerIdx++) { + m_colStartIndex[innerIdx] += bandIncrement; + } + memset(this->_lowerPtr() + m_colStartIndex[outer] + previousProfile + 1, 0, (bandIncrement - 1) * sizeof (Scalar)); + return m_data.lower(m_colStartIndex[outer] + m_data.lowerProfile(outer)); + } else { + return m_data.lower(m_colStartIndex[outer] + (inner - outer)); + } + } + } + } + + /** Must be called after inserting a set of non zero entries. + */ + inline void finalize() { + if (IsRowMajor) { + if (rows() > cols()) + m_data.resize(cols(), cols(), rows(), m_colStartIndex[cols()] + 1, m_rowStartIndex[rows()] + 1); + else + m_data.resize(rows(), cols(), rows(), m_colStartIndex[cols()] + 1, m_rowStartIndex[rows()] + 1); + + // eigen_assert(rows() == cols() && "memory reorganisatrion only works with suare matrix"); + // + // Scalar* newArray = new Scalar[m_colStartIndex[cols()] + 1 + m_rowStartIndex[rows()] + 1]; + // Index dataIdx = 0; + // for (Index row = 0; row < rows(); row++) { + // + // const Index nbLowerElts = m_rowStartIndex[row + 1] - m_rowStartIndex[row]; + // // std::cout << "nbLowerElts" << nbLowerElts << std::endl; + // memcpy(newArray + dataIdx, m_data.m_lower + m_rowStartIndex[row], nbLowerElts * sizeof (Scalar)); + // m_rowStartIndex[row] = dataIdx; + // dataIdx += nbLowerElts; + // + // const Index nbUpperElts = m_colStartIndex[row + 1] - m_colStartIndex[row]; + // memcpy(newArray + dataIdx, m_data.m_upper + m_colStartIndex[row], nbUpperElts * sizeof (Scalar)); + // m_colStartIndex[row] = dataIdx; + // dataIdx += nbUpperElts; + // + // + // } + // //todo : don't access m_data profile directly : add an accessor from SkylineMatrix + // m_rowStartIndex[rows()] = m_rowStartIndex[rows()-1] + 
m_data.lowerProfile(rows()-1); + // m_colStartIndex[cols()] = m_colStartIndex[cols()-1] + m_data.upperProfile(cols()-1); + // + // delete[] m_data.m_lower; + // delete[] m_data.m_upper; + // + // m_data.m_lower = newArray; + // m_data.m_upper = newArray; + } else { + if (rows() > cols()) + m_data.resize(cols(), rows(), cols(), m_rowStartIndex[cols()] + 1, m_colStartIndex[cols()] + 1); + else + m_data.resize(rows(), rows(), cols(), m_rowStartIndex[rows()] + 1, m_colStartIndex[rows()] + 1); + } + } + + inline void squeeze() { + finalize(); + m_data.squeeze(); + } + + void prune(Scalar reference, RealScalar epsilon = dummy_precision ()) { + //TODO + } + + /** Resizes the matrix to a \a rows x \a cols matrix and initializes it to zero + * \sa resizeNonZeros(Index), reserve(), setZero() + */ + void resize(size_t rows, size_t cols) { + const Index diagSize = rows > cols ? cols : rows; + m_innerSize = IsRowMajor ? cols : rows; + + eigen_assert(rows == cols && "Skyline matrix must be square matrix"); + + if (diagSize % 2) { // diagSize is odd + const Index k = (diagSize - 1) / 2; + + m_data.resize(diagSize, IsRowMajor ? cols : rows, IsRowMajor ? rows : cols, + 2 * k * k + k + 1, + 2 * k * k + k + 1); + + } else // diagSize is even + { + const Index k = diagSize / 2; + m_data.resize(diagSize, IsRowMajor ? cols : rows, IsRowMajor ? 
rows : cols, + 2 * k * k - k + 1, + 2 * k * k - k + 1); + } + + if (m_colStartIndex && m_rowStartIndex) { + delete[] m_colStartIndex; + delete[] m_rowStartIndex; + } + m_colStartIndex = new Index [cols + 1]; + m_rowStartIndex = new Index [rows + 1]; + m_outerSize = diagSize; + + m_data.reset(); + m_data.clear(); + + m_outerSize = diagSize; + memset(m_colStartIndex, 0, (cols + 1) * sizeof (Index)); + memset(m_rowStartIndex, 0, (rows + 1) * sizeof (Index)); + } + + void resizeNonZeros(Index size) { + m_data.resize(size); + } + + inline SkylineMatrix() + : m_outerSize(-1), m_innerSize(0), m_colStartIndex(0), m_rowStartIndex(0) { + resize(0, 0); + } + + inline SkylineMatrix(size_t rows, size_t cols) + : m_outerSize(0), m_innerSize(0), m_colStartIndex(0), m_rowStartIndex(0) { + resize(rows, cols); + } + + template + inline SkylineMatrix(const SkylineMatrixBase& other) + : m_outerSize(0), m_innerSize(0), m_colStartIndex(0), m_rowStartIndex(0) { + *this = other.derived(); + } + + inline SkylineMatrix(const SkylineMatrix & other) + : Base(), m_outerSize(0), m_innerSize(0), m_colStartIndex(0), m_rowStartIndex(0) { + *this = other.derived(); + } + + inline void swap(SkylineMatrix & other) { + //EIGEN_DBG_SKYLINE(std::cout << "SkylineMatrix:: swap\n"); + std::swap(m_colStartIndex, other.m_colStartIndex); + std::swap(m_rowStartIndex, other.m_rowStartIndex); + std::swap(m_innerSize, other.m_innerSize); + std::swap(m_outerSize, other.m_outerSize); + m_data.swap(other.m_data); + } + + inline SkylineMatrix & operator=(const SkylineMatrix & other) { + std::cout << "SkylineMatrix& operator=(const SkylineMatrix& other)\n"; + if (other.isRValue()) { + swap(other.const_cast_derived()); + } else { + resize(other.rows(), other.cols()); + memcpy(m_colStartIndex, other.m_colStartIndex, (m_outerSize + 1) * sizeof (Index)); + memcpy(m_rowStartIndex, other.m_rowStartIndex, (m_outerSize + 1) * sizeof (Index)); + m_data = other.m_data; + } + return *this; + } + + template + inline SkylineMatrix 
& operator=(const SkylineMatrixBase& other) { + const bool needToTranspose = (Flags & RowMajorBit) != (OtherDerived::Flags & RowMajorBit); + if (needToTranspose) { + // TODO + // return *this; + } else { + // there is no special optimization + return SkylineMatrixBase::operator=(other.derived()); + } + } + + friend std::ostream & operator <<(std::ostream & s, const SkylineMatrix & m) { + + EIGEN_DBG_SKYLINE( + std::cout << "upper elements : " << std::endl; + for (Index i = 0; i < m.m_data.upperSize(); i++) + std::cout << m.m_data.upper(i) << "\t"; + std::cout << std::endl; + std::cout << "upper profile : " << std::endl; + for (Index i = 0; i < m.m_data.upperProfileSize(); i++) + std::cout << m.m_data.upperProfile(i) << "\t"; + std::cout << std::endl; + std::cout << "lower startIdx : " << std::endl; + for (Index i = 0; i < m.m_data.upperProfileSize(); i++) + std::cout << (IsRowMajor ? m.m_colStartIndex[i] : m.m_rowStartIndex[i]) << "\t"; + std::cout << std::endl; + + + std::cout << "lower elements : " << std::endl; + for (Index i = 0; i < m.m_data.lowerSize(); i++) + std::cout << m.m_data.lower(i) << "\t"; + std::cout << std::endl; + std::cout << "lower profile : " << std::endl; + for (Index i = 0; i < m.m_data.lowerProfileSize(); i++) + std::cout << m.m_data.lowerProfile(i) << "\t"; + std::cout << std::endl; + std::cout << "lower startIdx : " << std::endl; + for (Index i = 0; i < m.m_data.lowerProfileSize(); i++) + std::cout << (IsRowMajor ? 
m.m_rowStartIndex[i] : m.m_colStartIndex[i]) << "\t"; + std::cout << std::endl; + ); + for (Index rowIdx = 0; rowIdx < m.rows(); rowIdx++) { + for (Index colIdx = 0; colIdx < m.cols(); colIdx++) { + s << m.coeff(rowIdx, colIdx) << "\t"; + } + s << std::endl; + } + return s; + } + + /** Destructor */ + inline ~SkylineMatrix() { + delete[] m_colStartIndex; + delete[] m_rowStartIndex; + } + + /** Overloaded for performance */ + Scalar sum() const; +}; + +template +class SkylineMatrix::InnerUpperIterator { +public: + + InnerUpperIterator(const SkylineMatrix& mat, Index outer) + : m_matrix(mat), m_outer(outer), + m_id(_Options == RowMajor ? mat.m_colStartIndex[outer] : mat.m_rowStartIndex[outer] + 1), + m_start(m_id), + m_end(_Options == RowMajor ? mat.m_colStartIndex[outer + 1] : mat.m_rowStartIndex[outer + 1] + 1) { + } + + inline InnerUpperIterator & operator++() { + m_id++; + return *this; + } + + inline InnerUpperIterator & operator+=(Index shift) { + m_id += shift; + return *this; + } + + inline Scalar value() const { + return m_matrix.m_data.upper(m_id); + } + + inline Scalar* valuePtr() { + return const_cast (&(m_matrix.m_data.upper(m_id))); + } + + inline Scalar& valueRef() { + return const_cast (m_matrix.m_data.upper(m_id)); + } + + inline Index index() const { + return IsRowMajor ? m_outer - m_matrix.m_data.upperProfile(m_outer) + (m_id - m_start) : + m_outer + (m_id - m_start) + 1; + } + + inline Index row() const { + return IsRowMajor ? index() : m_outer; + } + + inline Index col() const { + return IsRowMajor ? 
m_outer : index(); + } + + inline size_t size() const { + return m_matrix.m_data.upperProfile(m_outer); + } + + inline operator bool() const { + return (m_id < m_end) && (m_id >= m_start); + } + +protected: + const SkylineMatrix& m_matrix; + const Index m_outer; + Index m_id; + const Index m_start; + const Index m_end; +}; + +template +class SkylineMatrix::InnerLowerIterator { +public: + + InnerLowerIterator(const SkylineMatrix& mat, Index outer) + : m_matrix(mat), + m_outer(outer), + m_id(_Options == RowMajor ? mat.m_rowStartIndex[outer] : mat.m_colStartIndex[outer] + 1), + m_start(m_id), + m_end(_Options == RowMajor ? mat.m_rowStartIndex[outer + 1] : mat.m_colStartIndex[outer + 1] + 1) { + } + + inline InnerLowerIterator & operator++() { + m_id++; + return *this; + } + + inline InnerLowerIterator & operator+=(Index shift) { + m_id += shift; + return *this; + } + + inline Scalar value() const { + return m_matrix.m_data.lower(m_id); + } + + inline Scalar* valuePtr() { + return const_cast (&(m_matrix.m_data.lower(m_id))); + } + + inline Scalar& valueRef() { + return const_cast (m_matrix.m_data.lower(m_id)); + } + + inline Index index() const { + return IsRowMajor ? m_outer - m_matrix.m_data.lowerProfile(m_outer) + (m_id - m_start) : + m_outer + (m_id - m_start) + 1; + ; + } + + inline Index row() const { + return IsRowMajor ? m_outer : index(); + } + + inline Index col() const { + return IsRowMajor ? 
index() : m_outer; + } + + inline size_t size() const { + return m_matrix.m_data.lowerProfile(m_outer); + } + + inline operator bool() const { + return (m_id < m_end) && (m_id >= m_start); + } + +protected: + const SkylineMatrix& m_matrix; + const Index m_outer; + Index m_id; + const Index m_start; + const Index m_end; +}; + +} // end namespace Eigen + +#endif // EIGEN_SKYLINEMATRIX_H diff --git a/src/EigenUnsupported/src/Skyline/SkylineMatrixBase.h b/src/EigenUnsupported/src/Skyline/SkylineMatrixBase.h new file mode 100644 index 0000000..b0d5e10 --- /dev/null +++ b/src/EigenUnsupported/src/Skyline/SkylineMatrixBase.h @@ -0,0 +1,212 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2009 Guillaume Saupin +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SKYLINEMATRIXBASE_H +#define EIGEN_SKYLINEMATRIXBASE_H + +#include "SkylineUtil.h" + +namespace Eigen { + +/** \ingroup Skyline_Module + * + * \class SkylineMatrixBase + * + * \brief Base class of any skyline matrices or skyline expressions + * + * \param Derived + * + */ +template class SkylineMatrixBase : public EigenBase { +public: + + typedef typename internal::traits::Scalar Scalar; + typedef typename internal::traits::StorageKind StorageKind; + typedef typename internal::index::type Index; + + enum { + RowsAtCompileTime = internal::traits::RowsAtCompileTime, + /**< The number of rows at compile-time. This is just a copy of the value provided + * by the \a Derived type. If a value is not known at compile-time, + * it is set to the \a Dynamic constant. + * \sa MatrixBase::rows(), MatrixBase::cols(), ColsAtCompileTime, SizeAtCompileTime */ + + ColsAtCompileTime = internal::traits::ColsAtCompileTime, + /**< The number of columns at compile-time. 
This is just a copy of the value provided + * by the \a Derived type. If a value is not known at compile-time, + * it is set to the \a Dynamic constant. + * \sa MatrixBase::rows(), MatrixBase::cols(), RowsAtCompileTime, SizeAtCompileTime */ + + + SizeAtCompileTime = (internal::size_at_compile_time::RowsAtCompileTime, + internal::traits::ColsAtCompileTime>::ret), + /**< This is equal to the number of coefficients, i.e. the number of + * rows times the number of columns, or to \a Dynamic if this is not + * known at compile-time. \sa RowsAtCompileTime, ColsAtCompileTime */ + + MaxRowsAtCompileTime = RowsAtCompileTime, + MaxColsAtCompileTime = ColsAtCompileTime, + + MaxSizeAtCompileTime = (internal::size_at_compile_time::ret), + + IsVectorAtCompileTime = RowsAtCompileTime == 1 || ColsAtCompileTime == 1, + /**< This is set to true if either the number of rows or the number of + * columns is known at compile-time to be equal to 1. Indeed, in that case, + * we are dealing with a column-vector (if there is only one column) or with + * a row-vector (if there is only one row). */ + + Flags = internal::traits::Flags, + /**< This stores expression \ref flags flags which may or may not be inherited by new expressions + * constructed from this one. See the \ref flags "list of flags". + */ + + CoeffReadCost = internal::traits::CoeffReadCost, + /**< This is a rough measure of how expensive it is to read one coefficient from + * this expression. + */ + + IsRowMajor = Flags & RowMajorBit ? 1 : 0 + }; + +#ifndef EIGEN_PARSED_BY_DOXYGEN + /** This is the "real scalar" type; if the \a Scalar type is already real numbers + * (e.g. int, float or double) then \a RealScalar is just the same as \a Scalar. If + * \a Scalar is \a std::complex then RealScalar is \a T. 
+ * + * \sa class NumTraits + */ + typedef typename NumTraits::Real RealScalar; + + /** type of the equivalent square matrix */ + typedef Matrix SquareMatrixType; + + inline const Derived& derived() const { + return *static_cast (this); + } + + inline Derived& derived() { + return *static_cast (this); + } + + inline Derived& const_cast_derived() const { + return *static_cast (const_cast (this)); + } +#endif // not EIGEN_PARSED_BY_DOXYGEN + + /** \returns the number of rows. \sa cols(), RowsAtCompileTime */ + inline EIGEN_CONSTEXPR Index rows() const EIGEN_NOEXCEPT { + return derived().rows(); + } + + /** \returns the number of columns. \sa rows(), ColsAtCompileTime*/ + inline EIGEN_CONSTEXPR Index cols() const EIGEN_NOEXCEPT { + return derived().cols(); + } + + /** \returns the number of coefficients, which is \a rows()*cols(). + * \sa rows(), cols(), SizeAtCompileTime. */ + inline EIGEN_CONSTEXPR Index size() const EIGEN_NOEXCEPT { + return rows() * cols(); + } + + /** \returns the number of nonzero coefficients which is in practice the number + * of stored coefficients. */ + inline Index nonZeros() const { + return derived().nonZeros(); + } + + /** \returns the size of the storage major dimension, + * i.e., the number of columns for a columns major matrix, and the number of rows otherwise */ + Index outerSize() const { + return (int(Flags) & RowMajorBit) ? this->rows() : this->cols(); + } + + /** \returns the size of the inner dimension according to the storage order, + * i.e., the number of rows for a columns major matrix, and the number of cols otherwise */ + Index innerSize() const { + return (int(Flags) & RowMajorBit) ? 
this->cols() : this->rows(); + } + + bool isRValue() const { + return m_isRValue; + } + + Derived& markAsRValue() { + m_isRValue = true; + return derived(); + } + + SkylineMatrixBase() : m_isRValue(false) { + /* TODO check flags */ + } + + inline Derived & operator=(const Derived& other) { + this->operator= (other); + return derived(); + } + + template + inline void assignGeneric(const OtherDerived& other) { + derived().resize(other.rows(), other.cols()); + for (Index row = 0; row < rows(); row++) + for (Index col = 0; col < cols(); col++) { + if (other.coeff(row, col) != Scalar(0)) + derived().insert(row, col) = other.coeff(row, col); + } + derived().finalize(); + } + + template + inline Derived & operator=(const SkylineMatrixBase& other) { + //TODO + } + + template + inline Derived & operator=(const SkylineProduct& product); + + friend std::ostream & operator <<(std::ostream & s, const SkylineMatrixBase& m) { + s << m.derived(); + return s; + } + + template + const typename SkylineProductReturnType::Type + operator*(const MatrixBase &other) const; + + /** \internal use operator= */ + template + void evalTo(MatrixBase& dst) const { + dst.setZero(); + for (Index i = 0; i < rows(); i++) + for (Index j = 0; j < rows(); j++) + dst(i, j) = derived().coeff(i, j); + } + + Matrix toDense() const { + return derived(); + } + + /** \returns the matrix or vector obtained by evaluating this expression. + * + * Notice that in the case of a plain matrix or vector (not an expression) this function just returns + * a const reference, in order to avoid a useless copy. 
+ */ + EIGEN_STRONG_INLINE const typename internal::eval::type eval() const { + return typename internal::eval::type(derived()); + } + +protected: + bool m_isRValue; +}; + +} // end namespace Eigen + +#endif // EIGEN_SKYLINEMATRIXBASE_H diff --git a/src/EigenUnsupported/src/Skyline/SkylineProduct.h b/src/EigenUnsupported/src/Skyline/SkylineProduct.h new file mode 100644 index 0000000..d9eb814 --- /dev/null +++ b/src/EigenUnsupported/src/Skyline/SkylineProduct.h @@ -0,0 +1,295 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2009 Guillaume Saupin +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SKYLINEPRODUCT_H +#define EIGEN_SKYLINEPRODUCT_H + +namespace Eigen { + +template +struct SkylineProductReturnType { + typedef const typename internal::nested_eval::type LhsNested; + typedef const typename internal::nested_eval::type RhsNested; + + typedef SkylineProduct Type; +}; + +template +struct internal::traits > { + // clean the nested types: + typedef typename internal::remove_all::type _LhsNested; + typedef typename internal::remove_all::type _RhsNested; + typedef typename _LhsNested::Scalar Scalar; + + enum { + LhsCoeffReadCost = _LhsNested::CoeffReadCost, + RhsCoeffReadCost = _RhsNested::CoeffReadCost, + LhsFlags = _LhsNested::Flags, + RhsFlags = _RhsNested::Flags, + + RowsAtCompileTime = _LhsNested::RowsAtCompileTime, + ColsAtCompileTime = _RhsNested::ColsAtCompileTime, + InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(_LhsNested::ColsAtCompileTime, _RhsNested::RowsAtCompileTime), + + MaxRowsAtCompileTime = _LhsNested::MaxRowsAtCompileTime, + MaxColsAtCompileTime = _RhsNested::MaxColsAtCompileTime, + + EvalToRowMajor = (RhsFlags & LhsFlags & RowMajorBit), + ResultIsSkyline = ProductMode == SkylineTimeSkylineProduct, + + 
RemovedBits = ~((EvalToRowMajor ? 0 : RowMajorBit) | (ResultIsSkyline ? 0 : SkylineBit)), + + Flags = (int(LhsFlags | RhsFlags) & HereditaryBits & RemovedBits) + | EvalBeforeAssigningBit + | EvalBeforeNestingBit, + + CoeffReadCost = HugeCost + }; + + typedef typename internal::conditional >, + MatrixBase > >::type Base; +}; + +namespace internal { +template +class SkylineProduct : no_assignment_operator, +public traits >::Base { +public: + + EIGEN_GENERIC_PUBLIC_INTERFACE(SkylineProduct) + +private: + + typedef typename traits::_LhsNested _LhsNested; + typedef typename traits::_RhsNested _RhsNested; + +public: + + template + EIGEN_STRONG_INLINE SkylineProduct(const Lhs& lhs, const Rhs& rhs) + : m_lhs(lhs), m_rhs(rhs) { + eigen_assert(lhs.cols() == rhs.rows()); + + enum { + ProductIsValid = _LhsNested::ColsAtCompileTime == Dynamic + || _RhsNested::RowsAtCompileTime == Dynamic + || int(_LhsNested::ColsAtCompileTime) == int(_RhsNested::RowsAtCompileTime), + AreVectors = _LhsNested::IsVectorAtCompileTime && _RhsNested::IsVectorAtCompileTime, + SameSizes = EIGEN_PREDICATE_SAME_MATRIX_SIZE(_LhsNested, _RhsNested) + }; + // note to the lost user: + // * for a dot product use: v1.dot(v2) + // * for a coeff-wise product use: v1.cwise()*v2 + EIGEN_STATIC_ASSERT(ProductIsValid || !(AreVectors && SameSizes), + INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS) + EIGEN_STATIC_ASSERT(ProductIsValid || !(SameSizes && !AreVectors), + INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION) + EIGEN_STATIC_ASSERT(ProductIsValid || SameSizes, INVALID_MATRIX_PRODUCT) + } + + EIGEN_STRONG_INLINE Index rows() const { + return m_lhs.rows(); + } + + EIGEN_STRONG_INLINE Index cols() const { + return m_rhs.cols(); + } + + EIGEN_STRONG_INLINE const _LhsNested& lhs() const { + return m_lhs; + } + + EIGEN_STRONG_INLINE const _RhsNested& rhs() const { + return m_rhs; + } + +protected: + LhsNested 
m_lhs; + RhsNested m_rhs; +}; + +// dense = skyline * dense +// Note that here we force no inlining and separate the setZero() because GCC messes up otherwise + +template +EIGEN_DONT_INLINE void skyline_row_major_time_dense_product(const Lhs& lhs, const Rhs& rhs, Dest& dst) { + typedef typename remove_all::type _Lhs; + typedef typename remove_all::type _Rhs; + typedef typename traits::Scalar Scalar; + + enum { + LhsIsRowMajor = (_Lhs::Flags & RowMajorBit) == RowMajorBit, + LhsIsSelfAdjoint = (_Lhs::Flags & SelfAdjointBit) == SelfAdjointBit, + ProcessFirstHalf = LhsIsSelfAdjoint + && (((_Lhs::Flags & (UpperTriangularBit | LowerTriangularBit)) == 0) + || ((_Lhs::Flags & UpperTriangularBit) && !LhsIsRowMajor) + || ((_Lhs::Flags & LowerTriangularBit) && LhsIsRowMajor)), + ProcessSecondHalf = LhsIsSelfAdjoint && (!ProcessFirstHalf) + }; + + //Use matrix diagonal part <- Improvement : use inner iterator on dense matrix. + for (Index col = 0; col < rhs.cols(); col++) { + for (Index row = 0; row < lhs.rows(); row++) { + dst(row, col) = lhs.coeffDiag(row) * rhs(row, col); + } + } + //Use matrix lower triangular part + for (Index row = 0; row < lhs.rows(); row++) { + typename _Lhs::InnerLowerIterator lIt(lhs, row); + const Index stop = lIt.col() + lIt.size(); + for (Index col = 0; col < rhs.cols(); col++) { + + Index k = lIt.col(); + Scalar tmp = 0; + while (k < stop) { + tmp += + lIt.value() * + rhs(k++, col); + ++lIt; + } + dst(row, col) += tmp; + lIt += -lIt.size(); + } + + } + + //Use matrix upper triangular part + for (Index lhscol = 0; lhscol < lhs.cols(); lhscol++) { + typename _Lhs::InnerUpperIterator uIt(lhs, lhscol); + const Index stop = uIt.size() + uIt.row(); + for (Index rhscol = 0; rhscol < rhs.cols(); rhscol++) { + + + const Scalar rhsCoeff = rhs.coeff(lhscol, rhscol); + Index k = uIt.row(); + while (k < stop) { + dst(k++, rhscol) += + uIt.value() * + rhsCoeff; + ++uIt; + } + uIt += -uIt.size(); + } + } + +} + +template +EIGEN_DONT_INLINE void 
skyline_col_major_time_dense_product(const Lhs& lhs, const Rhs& rhs, Dest& dst) { + typedef typename remove_all::type _Lhs; + typedef typename remove_all::type _Rhs; + typedef typename traits::Scalar Scalar; + + enum { + LhsIsRowMajor = (_Lhs::Flags & RowMajorBit) == RowMajorBit, + LhsIsSelfAdjoint = (_Lhs::Flags & SelfAdjointBit) == SelfAdjointBit, + ProcessFirstHalf = LhsIsSelfAdjoint + && (((_Lhs::Flags & (UpperTriangularBit | LowerTriangularBit)) == 0) + || ((_Lhs::Flags & UpperTriangularBit) && !LhsIsRowMajor) + || ((_Lhs::Flags & LowerTriangularBit) && LhsIsRowMajor)), + ProcessSecondHalf = LhsIsSelfAdjoint && (!ProcessFirstHalf) + }; + + //Use matrix diagonal part <- Improvement : use inner iterator on dense matrix. + for (Index col = 0; col < rhs.cols(); col++) { + for (Index row = 0; row < lhs.rows(); row++) { + dst(row, col) = lhs.coeffDiag(row) * rhs(row, col); + } + } + + //Use matrix upper triangular part + for (Index row = 0; row < lhs.rows(); row++) { + typename _Lhs::InnerUpperIterator uIt(lhs, row); + const Index stop = uIt.col() + uIt.size(); + for (Index col = 0; col < rhs.cols(); col++) { + + Index k = uIt.col(); + Scalar tmp = 0; + while (k < stop) { + tmp += + uIt.value() * + rhs(k++, col); + ++uIt; + } + + + dst(row, col) += tmp; + uIt += -uIt.size(); + } + } + + //Use matrix lower triangular part + for (Index lhscol = 0; lhscol < lhs.cols(); lhscol++) { + typename _Lhs::InnerLowerIterator lIt(lhs, lhscol); + const Index stop = lIt.size() + lIt.row(); + for (Index rhscol = 0; rhscol < rhs.cols(); rhscol++) { + + const Scalar rhsCoeff = rhs.coeff(lhscol, rhscol); + Index k = lIt.row(); + while (k < stop) { + dst(k++, rhscol) += + lIt.value() * + rhsCoeff; + ++lIt; + } + lIt += -lIt.size(); + } + } + +} + +template::Flags&RowMajorBit> + struct skyline_product_selector; + +template +struct skyline_product_selector { + typedef typename traits::type>::Scalar Scalar; + + static void run(const Lhs& lhs, const Rhs& rhs, ResultType & res) { + 
skyline_row_major_time_dense_product (lhs, rhs, res); + } +}; + +template +struct skyline_product_selector { + typedef typename traits::type>::Scalar Scalar; + + static void run(const Lhs& lhs, const Rhs& rhs, ResultType & res) { + skyline_col_major_time_dense_product (lhs, rhs, res); + } +}; + +} // end namespace internal + +// template +// template +// Derived & MatrixBase::lazyAssign(const SkylineProduct& product) { +// typedef typename internal::remove_all::type _Lhs; +// internal::skyline_product_selector::type, +// typename internal::remove_all::type, +// Derived>::run(product.lhs(), product.rhs(), derived()); +// +// return derived(); +// } + +// skyline * dense + +template +template +EIGEN_STRONG_INLINE const typename SkylineProductReturnType::Type +SkylineMatrixBase::operator*(const MatrixBase &other) const { + + return typename SkylineProductReturnType::Type(derived(), other.derived()); +} + +} // end namespace Eigen + +#endif // EIGEN_SKYLINEPRODUCT_H diff --git a/src/EigenUnsupported/src/Skyline/SkylineStorage.h b/src/EigenUnsupported/src/Skyline/SkylineStorage.h new file mode 100644 index 0000000..cc7514f --- /dev/null +++ b/src/EigenUnsupported/src/Skyline/SkylineStorage.h @@ -0,0 +1,259 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2009 Guillaume Saupin +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_SKYLINE_STORAGE_H +#define EIGEN_SKYLINE_STORAGE_H + +namespace Eigen { + +/** Stores a skyline set of values in three structures : + * The diagonal elements + * The upper elements + * The lower elements + * + */ +template +class SkylineStorage { + typedef typename NumTraits::Real RealScalar; + typedef SparseIndex Index; +public: + + SkylineStorage() + : m_diag(0), + m_lower(0), + m_upper(0), + m_lowerProfile(0), + m_upperProfile(0), + m_diagSize(0), + m_upperSize(0), + m_lowerSize(0), + m_upperProfileSize(0), + m_lowerProfileSize(0), + m_allocatedSize(0) { + } + + SkylineStorage(const SkylineStorage& other) + : m_diag(0), + m_lower(0), + m_upper(0), + m_lowerProfile(0), + m_upperProfile(0), + m_diagSize(0), + m_upperSize(0), + m_lowerSize(0), + m_upperProfileSize(0), + m_lowerProfileSize(0), + m_allocatedSize(0) { + *this = other; + } + + SkylineStorage & operator=(const SkylineStorage& other) { + resize(other.diagSize(), other.m_upperProfileSize, other.m_lowerProfileSize, other.upperSize(), other.lowerSize()); + memcpy(m_diag, other.m_diag, m_diagSize * sizeof (Scalar)); + memcpy(m_upper, other.m_upper, other.upperSize() * sizeof (Scalar)); + memcpy(m_lower, other.m_lower, other.lowerSize() * sizeof (Scalar)); + memcpy(m_upperProfile, other.m_upperProfile, m_upperProfileSize * sizeof (Index)); + memcpy(m_lowerProfile, other.m_lowerProfile, m_lowerProfileSize * sizeof (Index)); + return *this; + } + + void swap(SkylineStorage& other) { + std::swap(m_diag, other.m_diag); + std::swap(m_upper, other.m_upper); + std::swap(m_lower, other.m_lower); + std::swap(m_upperProfile, other.m_upperProfile); + std::swap(m_lowerProfile, other.m_lowerProfile); + std::swap(m_diagSize, other.m_diagSize); + std::swap(m_upperSize, other.m_upperSize); + std::swap(m_lowerSize, other.m_lowerSize); + std::swap(m_allocatedSize, other.m_allocatedSize); + } + + ~SkylineStorage() { + delete[] m_diag; + delete[] m_upper; + if (m_upper != m_lower) + delete[] m_lower; + delete[] 
m_upperProfile; + delete[] m_lowerProfile; + } + + void reserve(Index size, Index upperProfileSize, Index lowerProfileSize, Index upperSize, Index lowerSize) { + Index newAllocatedSize = size + upperSize + lowerSize; + if (newAllocatedSize > m_allocatedSize) + reallocate(size, upperProfileSize, lowerProfileSize, upperSize, lowerSize); + } + + void squeeze() { + if (m_allocatedSize > m_diagSize + m_upperSize + m_lowerSize) + reallocate(m_diagSize, m_upperProfileSize, m_lowerProfileSize, m_upperSize, m_lowerSize); + } + + void resize(Index diagSize, Index upperProfileSize, Index lowerProfileSize, Index upperSize, Index lowerSize, float reserveSizeFactor = 0) { + if (m_allocatedSize < diagSize + upperSize + lowerSize) + reallocate(diagSize, upperProfileSize, lowerProfileSize, upperSize + Index(reserveSizeFactor * upperSize), lowerSize + Index(reserveSizeFactor * lowerSize)); + m_diagSize = diagSize; + m_upperSize = upperSize; + m_lowerSize = lowerSize; + m_upperProfileSize = upperProfileSize; + m_lowerProfileSize = lowerProfileSize; + } + + inline Index diagSize() const { + return m_diagSize; + } + + inline Index upperSize() const { + return m_upperSize; + } + + inline Index lowerSize() const { + return m_lowerSize; + } + + inline Index upperProfileSize() const { + return m_upperProfileSize; + } + + inline Index lowerProfileSize() const { + return m_lowerProfileSize; + } + + inline Index allocatedSize() const { + return m_allocatedSize; + } + + inline void clear() { + m_diagSize = 0; + } + + inline Scalar& diag(Index i) { + return m_diag[i]; + } + + inline const Scalar& diag(Index i) const { + return m_diag[i]; + } + + inline Scalar& upper(Index i) { + return m_upper[i]; + } + + inline const Scalar& upper(Index i) const { + return m_upper[i]; + } + + inline Scalar& lower(Index i) { + return m_lower[i]; + } + + inline const Scalar& lower(Index i) const { + return m_lower[i]; + } + + inline Index& upperProfile(Index i) { + return m_upperProfile[i]; + } + + inline const 
Index& upperProfile(Index i) const { + return m_upperProfile[i]; + } + + inline Index& lowerProfile(Index i) { + return m_lowerProfile[i]; + } + + inline const Index& lowerProfile(Index i) const { + return m_lowerProfile[i]; + } + + static SkylineStorage Map(Index* upperProfile, Index* lowerProfile, Scalar* diag, Scalar* upper, Scalar* lower, Index size, Index upperSize, Index lowerSize) { + SkylineStorage res; + res.m_upperProfile = upperProfile; + res.m_lowerProfile = lowerProfile; + res.m_diag = diag; + res.m_upper = upper; + res.m_lower = lower; + res.m_allocatedSize = res.m_diagSize = size; + res.m_upperSize = upperSize; + res.m_lowerSize = lowerSize; + return res; + } + + inline void reset() { + memset(m_diag, 0, m_diagSize * sizeof (Scalar)); + memset(m_upper, 0, m_upperSize * sizeof (Scalar)); + memset(m_lower, 0, m_lowerSize * sizeof (Scalar)); + memset(m_upperProfile, 0, m_diagSize * sizeof (Index)); + memset(m_lowerProfile, 0, m_diagSize * sizeof (Index)); + } + + void prune(Scalar reference, RealScalar epsilon = dummy_precision()) { + //TODO + } + +protected: + + inline void reallocate(Index diagSize, Index upperProfileSize, Index lowerProfileSize, Index upperSize, Index lowerSize) { + + Scalar* diag = new Scalar[diagSize]; + Scalar* upper = new Scalar[upperSize]; + Scalar* lower = new Scalar[lowerSize]; + Index* upperProfile = new Index[upperProfileSize]; + Index* lowerProfile = new Index[lowerProfileSize]; + + Index copyDiagSize = (std::min)(diagSize, m_diagSize); + Index copyUpperSize = (std::min)(upperSize, m_upperSize); + Index copyLowerSize = (std::min)(lowerSize, m_lowerSize); + Index copyUpperProfileSize = (std::min)(upperProfileSize, m_upperProfileSize); + Index copyLowerProfileSize = (std::min)(lowerProfileSize, m_lowerProfileSize); + + // copy + memcpy(diag, m_diag, copyDiagSize * sizeof (Scalar)); + memcpy(upper, m_upper, copyUpperSize * sizeof (Scalar)); + memcpy(lower, m_lower, copyLowerSize * sizeof (Scalar)); + memcpy(upperProfile, 
m_upperProfile, copyUpperProfileSize * sizeof (Index)); + memcpy(lowerProfile, m_lowerProfile, copyLowerProfileSize * sizeof (Index)); + + + + // delete old stuff + delete[] m_diag; + delete[] m_upper; + delete[] m_lower; + delete[] m_upperProfile; + delete[] m_lowerProfile; + m_diag = diag; + m_upper = upper; + m_lower = lower; + m_upperProfile = upperProfile; + m_lowerProfile = lowerProfile; + m_allocatedSize = diagSize + upperSize + lowerSize; + m_upperSize = upperSize; + m_lowerSize = lowerSize; + } + +public: + Scalar* m_diag; + Scalar* m_upper; + Scalar* m_lower; + Index* m_upperProfile; + Index* m_lowerProfile; + Index m_diagSize; + Index m_upperSize; + Index m_lowerSize; + Index m_upperProfileSize; + Index m_lowerProfileSize; + Index m_allocatedSize; + +}; + +} // end namespace Eigen + +#endif // EIGEN_SKYLINE_STORAGE_H diff --git a/src/EigenUnsupported/src/Skyline/SkylineUtil.h b/src/EigenUnsupported/src/Skyline/SkylineUtil.h new file mode 100644 index 0000000..75eb612 --- /dev/null +++ b/src/EigenUnsupported/src/Skyline/SkylineUtil.h @@ -0,0 +1,89 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2009 Guillaume Saupin +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_SKYLINEUTIL_H +#define EIGEN_SKYLINEUTIL_H + +namespace Eigen { + +#ifdef NDEBUG +#define EIGEN_DBG_SKYLINE(X) +#else +#define EIGEN_DBG_SKYLINE(X) X +#endif + +const unsigned int SkylineBit = 0x1200; +template class SkylineProduct; +enum AdditionalProductEvaluationMode {SkylineTimeDenseProduct, SkylineTimeSkylineProduct, DenseTimeSkylineProduct}; +enum {IsSkyline = SkylineBit}; + + +#define EIGEN_SKYLINE_INHERIT_ASSIGNMENT_OPERATOR(Derived, Op) \ +template \ +EIGEN_STRONG_INLINE Derived& operator Op(const Eigen::SkylineMatrixBase& other) \ +{ \ + return Base::operator Op(other.derived()); \ +} \ +EIGEN_STRONG_INLINE Derived& operator Op(const Derived& other) \ +{ \ + return Base::operator Op(other); \ +} + +#define EIGEN_SKYLINE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, Op) \ +template \ +EIGEN_STRONG_INLINE Derived& operator Op(const Other& scalar) \ +{ \ + return Base::operator Op(scalar); \ +} + +#define EIGEN_SKYLINE_INHERIT_ASSIGNMENT_OPERATORS(Derived) \ + EIGEN_SKYLINE_INHERIT_ASSIGNMENT_OPERATOR(Derived, =) \ + EIGEN_SKYLINE_INHERIT_ASSIGNMENT_OPERATOR(Derived, +=) \ + EIGEN_SKYLINE_INHERIT_ASSIGNMENT_OPERATOR(Derived, -=) \ + EIGEN_SKYLINE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, *=) \ + EIGEN_SKYLINE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, /=) + +#define _EIGEN_SKYLINE_GENERIC_PUBLIC_INTERFACE(Derived, BaseClass) \ + typedef BaseClass Base; \ + typedef typename Eigen::internal::traits::Scalar Scalar; \ + typedef typename Eigen::NumTraits::Real RealScalar; \ + typedef typename Eigen::internal::traits::StorageKind StorageKind; \ + typedef typename Eigen::internal::index::type Index; \ + enum { Flags = Eigen::internal::traits::Flags, }; + +#define EIGEN_SKYLINE_GENERIC_PUBLIC_INTERFACE(Derived) \ + _EIGEN_SKYLINE_GENERIC_PUBLIC_INTERFACE(Derived, Eigen::SkylineMatrixBase) + +template class SkylineMatrixBase; +template class SkylineMatrix; +template class DynamicSkylineMatrix; +template class SkylineVector; +template class 
MappedSkylineMatrix; + +namespace internal { + +template struct skyline_product_mode; +template::value> struct SkylineProductReturnType; + +template class eval +{ + typedef typename traits::Scalar _Scalar; + enum { + _Flags = traits::Flags + }; + + public: + typedef SkylineMatrix<_Scalar, _Flags> type; +}; + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_SKYLINEUTIL_H diff --git a/src/EigenUnsupported/src/SparseExtra/BlockOfDynamicSparseMatrix.h b/src/EigenUnsupported/src/SparseExtra/BlockOfDynamicSparseMatrix.h new file mode 100644 index 0000000..e9ec746 --- /dev/null +++ b/src/EigenUnsupported/src/SparseExtra/BlockOfDynamicSparseMatrix.h @@ -0,0 +1,122 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2009 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SPARSE_BLOCKFORDYNAMICMATRIX_H +#define EIGEN_SPARSE_BLOCKFORDYNAMICMATRIX_H + +namespace Eigen { + +#if 0 + +// NOTE Have to be reimplemented as a specialization of BlockImpl< DynamicSparseMatrix<_Scalar, _Options, _Index>, ... 
> +// See SparseBlock.h for an example + + +/*************************************************************************** +* specialisation for DynamicSparseMatrix +***************************************************************************/ + +template +class SparseInnerVectorSet, Size> + : public SparseMatrixBase, Size> > +{ + typedef DynamicSparseMatrix<_Scalar, _Options, _Index> MatrixType; + public: + + enum { IsRowMajor = internal::traits::IsRowMajor }; + + EIGEN_SPARSE_PUBLIC_INTERFACE(SparseInnerVectorSet) + class InnerIterator: public MatrixType::InnerIterator + { + public: + inline InnerIterator(const SparseInnerVectorSet& xpr, Index outer) + : MatrixType::InnerIterator(xpr.m_matrix, xpr.m_outerStart + outer), m_outer(outer) + {} + inline Index row() const { return IsRowMajor ? m_outer : this->index(); } + inline Index col() const { return IsRowMajor ? this->index() : m_outer; } + protected: + Index m_outer; + }; + + inline SparseInnerVectorSet(const MatrixType& matrix, Index outerStart, Index outerSize) + : m_matrix(matrix), m_outerStart(outerStart), m_outerSize(outerSize) + { + eigen_assert( (outerStart>=0) && ((outerStart+outerSize)<=matrix.outerSize()) ); + } + + inline SparseInnerVectorSet(const MatrixType& matrix, Index outer) + : m_matrix(matrix), m_outerStart(outer), m_outerSize(Size) + { + eigen_assert(Size!=Dynamic); + eigen_assert( (outer>=0) && (outer + inline SparseInnerVectorSet& operator=(const SparseMatrixBase& other) + { + if (IsRowMajor != ((OtherDerived::Flags&RowMajorBit)==RowMajorBit)) + { + // need to transpose => perform a block evaluation followed by a big swap + DynamicSparseMatrix aux(other); + *this = aux.markAsRValue(); + } + else + { + // evaluate/copy vector per vector + for (Index j=0; j aux(other.innerVector(j)); + m_matrix.const_cast_derived()._data()[m_outerStart+j].swap(aux._data()); + } + } + return *this; + } + + inline SparseInnerVectorSet& operator=(const SparseInnerVectorSet& other) + { + return operator=(other); + } 
+ + Index nonZeros() const + { + Index count = 0; + for (Index j=0; j0); + return m_matrix.data()[m_outerStart].vale(m_matrix.data()[m_outerStart].size()-1); + } + +// template +// inline SparseInnerVectorSet& operator=(const SparseMatrixBase& other) +// { +// return *this; +// } + + EIGEN_STRONG_INLINE Index rows() const { return IsRowMajor ? m_outerSize.value() : m_matrix.rows(); } + EIGEN_STRONG_INLINE Index cols() const { return IsRowMajor ? m_matrix.cols() : m_outerSize.value(); } + + protected: + + const typename MatrixType::Nested m_matrix; + Index m_outerStart; + const internal::variable_if_dynamic m_outerSize; + +}; + +#endif + +} // end namespace Eigen + +#endif // EIGEN_SPARSE_BLOCKFORDYNAMICMATRIX_H diff --git a/src/EigenUnsupported/src/SparseExtra/BlockSparseMatrix.h b/src/EigenUnsupported/src/SparseExtra/BlockSparseMatrix.h new file mode 100644 index 0000000..536a0c3 --- /dev/null +++ b/src/EigenUnsupported/src/SparseExtra/BlockSparseMatrix.h @@ -0,0 +1,1079 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2013 Desire Nuentsa +// Copyright (C) 2013 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SPARSEBLOCKMATRIX_H +#define EIGEN_SPARSEBLOCKMATRIX_H + +namespace Eigen { +/** \ingroup SparseCore_Module + * + * \class BlockSparseMatrix + * + * \brief A versatile sparse matrix representation where each element is a block + * + * This class provides routines to manipulate block sparse matrices stored in a + * BSR-like representation. There are two main types : + * + * 1. All blocks have the same number of rows and columns, called block size + * in the following. 
In this case, if this block size is known at compile time, + * it can be given as a template parameter like + * \code + * BlockSparseMatrix bmat(b_rows, b_cols); + * \endcode + * Here, bmat is a b_rows x b_cols block sparse matrix + * where each coefficient is a 3x3 dense matrix. + * If the block size is fixed but will be given at runtime, + * \code + * BlockSparseMatrix bmat(b_rows, b_cols); + * bmat.setBlockSize(block_size); + * \endcode + * + * 2. The second case is for variable-block sparse matrices. + * Here each block has its own dimensions. The only restriction is that all the blocks + * in a row (resp. a column) should have the same number of rows (resp. of columns). + * It is thus required in this case to describe the layout of the matrix by calling + * setBlockLayout(rowBlocks, colBlocks). + * + * In any of the previous case, the matrix can be filled by calling setFromTriplets(). + * A regular sparse matrix can be converted to a block sparse matrix and vice versa. + * It is obviously required to describe the block layout beforehand by calling either + * setBlockSize() for fixed-size blocks or setBlockLayout for variable-size blocks. + * + * \tparam _Scalar The Scalar type + * \tparam _BlockAtCompileTime The block layout option. It takes the following values + * Dynamic : block size known at runtime + * a numeric number : fixed-size block known at compile time + */ +template class BlockSparseMatrix; + +template class BlockSparseMatrixView; + +namespace internal { +template +struct traits > +{ + typedef _Scalar Scalar; + typedef _Index Index; + typedef Sparse StorageKind; // FIXME Where is it used ?? 
+ typedef MatrixXpr XprKind; + enum { + RowsAtCompileTime = Dynamic, + ColsAtCompileTime = Dynamic, + MaxRowsAtCompileTime = Dynamic, + MaxColsAtCompileTime = Dynamic, + BlockSize = _BlockAtCompileTime, + Flags = _Options | NestByRefBit | LvalueBit, + CoeffReadCost = NumTraits::ReadCost, + SupportedAccessPatterns = InnerRandomAccessPattern + }; +}; +template +struct traits > +{ + typedef Ref > Scalar; + typedef Ref > RealScalar; + +}; + +// Function object to sort a triplet list +template +struct TripletComp +{ + typedef typename Iterator::value_type Triplet; + bool operator()(const Triplet& a, const Triplet& b) + { if(IsColMajor) + return ((a.col() == b.col() && a.row() < b.row()) || (a.col() < b.col())); + else + return ((a.row() == b.row() && a.col() < b.col()) || (a.row() < b.row())); + } +}; +} // end namespace internal + + +/* Proxy to view the block sparse matrix as a regular sparse matrix */ +template +class BlockSparseMatrixView : public SparseMatrixBase +{ + public: + typedef Ref Scalar; + typedef Ref RealScalar; + typedef typename BlockSparseMatrixT::Index Index; + typedef BlockSparseMatrixT Nested; + enum { + Flags = BlockSparseMatrixT::Options, + Options = BlockSparseMatrixT::Options, + RowsAtCompileTime = BlockSparseMatrixT::RowsAtCompileTime, + ColsAtCompileTime = BlockSparseMatrixT::ColsAtCompileTime, + MaxColsAtCompileTime = BlockSparseMatrixT::MaxColsAtCompileTime, + MaxRowsAtCompileTime = BlockSparseMatrixT::MaxRowsAtCompileTime + }; + public: + BlockSparseMatrixView(const BlockSparseMatrixT& spblockmat) + : m_spblockmat(spblockmat) + {} + + Index outerSize() const + { + return (Flags&RowMajorBit) == 1 ? 
this->rows() : this->cols(); + } + Index cols() const + { + return m_spblockmat.blockCols(); + } + Index rows() const + { + return m_spblockmat.blockRows(); + } + Scalar coeff(Index row, Index col) + { + return m_spblockmat.coeff(row, col); + } + Scalar coeffRef(Index row, Index col) + { + return m_spblockmat.coeffRef(row, col); + } + // Wrapper to iterate over all blocks + class InnerIterator : public BlockSparseMatrixT::BlockInnerIterator + { + public: + InnerIterator(const BlockSparseMatrixView& mat, Index outer) + : BlockSparseMatrixT::BlockInnerIterator(mat.m_spblockmat, outer) + {} + + }; + + protected: + const BlockSparseMatrixT& m_spblockmat; +}; + +// Proxy to view a regular vector as a block vector +template +class BlockVectorView +{ + public: + enum { + BlockSize = BlockSparseMatrixT::BlockSize, + ColsAtCompileTime = VectorType::ColsAtCompileTime, + RowsAtCompileTime = VectorType::RowsAtCompileTime, + Flags = VectorType::Flags + }; + typedef Ref >Scalar; + typedef typename BlockSparseMatrixT::Index Index; + public: + BlockVectorView(const BlockSparseMatrixT& spblockmat, const VectorType& vec) + : m_spblockmat(spblockmat),m_vec(vec) + { } + inline Index cols() const + { + return m_vec.cols(); + } + inline Index size() const + { + return m_spblockmat.blockRows(); + } + inline Scalar coeff(Index bi) const + { + Index startRow = m_spblockmat.blockRowsIndex(bi); + Index rowSize = m_spblockmat.blockRowsIndex(bi+1) - startRow; + return m_vec.middleRows(startRow, rowSize); + } + inline Scalar coeff(Index bi, Index j) const + { + Index startRow = m_spblockmat.blockRowsIndex(bi); + Index rowSize = m_spblockmat.blockRowsIndex(bi+1) - startRow; + return m_vec.block(startRow, j, rowSize, 1); + } + protected: + const BlockSparseMatrixT& m_spblockmat; + const VectorType& m_vec; +}; + +template class BlockVectorReturn; + + +// Proxy to view a regular vector as a block vector +template +class BlockVectorReturn +{ + public: + enum { + ColsAtCompileTime = 
VectorType::ColsAtCompileTime, + RowsAtCompileTime = VectorType::RowsAtCompileTime, + Flags = VectorType::Flags + }; + typedef Ref > Scalar; + typedef typename BlockSparseMatrixT::Index Index; + public: + BlockVectorReturn(const BlockSparseMatrixT& spblockmat, VectorType& vec) + : m_spblockmat(spblockmat),m_vec(vec) + { } + inline Index size() const + { + return m_spblockmat.blockRows(); + } + inline Scalar coeffRef(Index bi) + { + Index startRow = m_spblockmat.blockRowsIndex(bi); + Index rowSize = m_spblockmat.blockRowsIndex(bi+1) - startRow; + return m_vec.middleRows(startRow, rowSize); + } + inline Scalar coeffRef(Index bi, Index j) + { + Index startRow = m_spblockmat.blockRowsIndex(bi); + Index rowSize = m_spblockmat.blockRowsIndex(bi+1) - startRow; + return m_vec.block(startRow, j, rowSize, 1); + } + + protected: + const BlockSparseMatrixT& m_spblockmat; + VectorType& m_vec; +}; + +// Block version of the sparse dense product +template +class BlockSparseTimeDenseProduct; + +namespace internal { + +template +struct traits > +{ + typedef Dense StorageKind; + typedef MatrixXpr XprKind; + typedef typename BlockSparseMatrixT::Scalar Scalar; + typedef typename BlockSparseMatrixT::Index Index; + enum { + RowsAtCompileTime = Dynamic, + ColsAtCompileTime = Dynamic, + MaxRowsAtCompileTime = Dynamic, + MaxColsAtCompileTime = Dynamic, + Flags = 0, + CoeffReadCost = internal::traits::CoeffReadCost + }; +}; +} // end namespace internal + +template +class BlockSparseTimeDenseProduct + : public ProductBase, Lhs, Rhs> +{ + public: + EIGEN_PRODUCT_PUBLIC_INTERFACE(BlockSparseTimeDenseProduct) + + BlockSparseTimeDenseProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) + {} + + template void scaleAndAddTo(Dest& dest, const typename Rhs::Scalar& alpha) const + { + BlockVectorReturn tmpDest(m_lhs, dest); + internal::sparse_time_dense_product( BlockSparseMatrixView(m_lhs), BlockVectorView(m_lhs, m_rhs), tmpDest, alpha); + } + + private: + BlockSparseTimeDenseProduct& 
operator=(const BlockSparseTimeDenseProduct&); +}; + +template +class BlockSparseMatrix : public SparseMatrixBase > +{ + public: + typedef _Scalar Scalar; + typedef typename NumTraits::Real RealScalar; + typedef _StorageIndex StorageIndex; + typedef typename internal::ref_selector >::type Nested; + + enum { + Options = _Options, + Flags = Options, + BlockSize=_BlockAtCompileTime, + RowsAtCompileTime = Dynamic, + ColsAtCompileTime = Dynamic, + MaxRowsAtCompileTime = Dynamic, + MaxColsAtCompileTime = Dynamic, + IsVectorAtCompileTime = 0, + IsColMajor = Flags&RowMajorBit ? 0 : 1 + }; + typedef Matrix BlockScalar; + typedef Matrix BlockRealScalar; + typedef typename internal::conditional<_BlockAtCompileTime==Dynamic, Scalar, BlockScalar>::type BlockScalarReturnType; + typedef BlockSparseMatrix PlainObject; + public: + // Default constructor + BlockSparseMatrix() + : m_innerBSize(0),m_outerBSize(0),m_innerOffset(0),m_outerOffset(0), + m_nonzerosblocks(0),m_values(0),m_blockPtr(0),m_indices(0), + m_outerIndex(0),m_blockSize(BlockSize) + { } + + + /** + * \brief Construct and resize + * + */ + BlockSparseMatrix(Index brow, Index bcol) + : m_innerBSize(IsColMajor ? brow : bcol), + m_outerBSize(IsColMajor ? bcol : brow), + m_innerOffset(0),m_outerOffset(0),m_nonzerosblocks(0), + m_values(0),m_blockPtr(0),m_indices(0), + m_outerIndex(0),m_blockSize(BlockSize) + { } + + /** + * \brief Copy-constructor + */ + BlockSparseMatrix(const BlockSparseMatrix& other) + : m_innerBSize(other.m_innerBSize),m_outerBSize(other.m_outerBSize), + m_nonzerosblocks(other.m_nonzerosblocks),m_nonzeros(other.m_nonzeros), + m_blockPtr(0),m_blockSize(other.m_blockSize) + { + // should we allow copying between variable-size blocks and fixed-size blocks ?? 
+ eigen_assert(m_blockSize == BlockSize && " CAN NOT COPY BETWEEN FIXED-SIZE AND VARIABLE-SIZE BLOCKS"); + + std::copy(other.m_innerOffset, other.m_innerOffset+m_innerBSize+1, m_innerOffset); + std::copy(other.m_outerOffset, other.m_outerOffset+m_outerBSize+1, m_outerOffset); + std::copy(other.m_values, other.m_values+m_nonzeros, m_values); + + if(m_blockSize != Dynamic) + std::copy(other.m_blockPtr, other.m_blockPtr+m_nonzerosblocks, m_blockPtr); + + std::copy(other.m_indices, other.m_indices+m_nonzerosblocks, m_indices); + std::copy(other.m_outerIndex, other.m_outerIndex+m_outerBSize, m_outerIndex); + } + + friend void swap(BlockSparseMatrix& first, BlockSparseMatrix& second) + { + std::swap(first.m_innerBSize, second.m_innerBSize); + std::swap(first.m_outerBSize, second.m_outerBSize); + std::swap(first.m_innerOffset, second.m_innerOffset); + std::swap(first.m_outerOffset, second.m_outerOffset); + std::swap(first.m_nonzerosblocks, second.m_nonzerosblocks); + std::swap(first.m_nonzeros, second.m_nonzeros); + std::swap(first.m_values, second.m_values); + std::swap(first.m_blockPtr, second.m_blockPtr); + std::swap(first.m_indices, second.m_indices); + std::swap(first.m_outerIndex, second.m_outerIndex); + std::swap(first.m_BlockSize, second.m_blockSize); + } + + BlockSparseMatrix& operator=(BlockSparseMatrix other) + { + //Copy-and-swap paradigm ... 
avoid leaked data if thrown + swap(*this, other); + return *this; + } + + // Destructor + ~BlockSparseMatrix() + { + delete[] m_outerIndex; + delete[] m_innerOffset; + delete[] m_outerOffset; + delete[] m_indices; + delete[] m_blockPtr; + delete[] m_values; + } + + + /** + * \brief Constructor from a sparse matrix + * + */ + template + inline BlockSparseMatrix(const MatrixType& spmat) : m_blockSize(BlockSize) + { + EIGEN_STATIC_ASSERT((m_blockSize != Dynamic), THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE); + + *this = spmat; + } + + /** + * \brief Assignment from a sparse matrix with the same storage order + * + * Convert from a sparse matrix to block sparse matrix. + * \warning Before calling this function, tt is necessary to call + * either setBlockLayout() (matrices with variable-size blocks) + * or setBlockSize() (for fixed-size blocks). + */ + template + inline BlockSparseMatrix& operator=(const MatrixType& spmat) + { + eigen_assert((m_innerBSize != 0 && m_outerBSize != 0) + && "Trying to assign to a zero-size matrix, call resize() first"); + eigen_assert(((MatrixType::Options&RowMajorBit) != IsColMajor) && "Wrong storage order"); + typedef SparseMatrix MatrixPatternType; + MatrixPatternType blockPattern(blockRows(), blockCols()); + m_nonzeros = 0; + + // First, compute the number of nonzero blocks and their locations + for(StorageIndex bj = 0; bj < m_outerBSize; ++bj) + { + // Browse each outer block and compute the structure + std::vector nzblocksFlag(m_innerBSize,false); // Record the existing blocks + blockPattern.startVec(bj); + for(StorageIndex j = blockOuterIndex(bj); j < blockOuterIndex(bj+1); ++j) + { + typename MatrixType::InnerIterator it_spmat(spmat, j); + for(; it_spmat; ++it_spmat) + { + StorageIndex bi = innerToBlock(it_spmat.index()); // Index of the current nonzero block + if(!nzblocksFlag[bi]) + { + // Save the index of this nonzero block + nzblocksFlag[bi] = true; + blockPattern.insertBackByOuterInnerUnordered(bj, bi) = true; + // Compute the total 
number of nonzeros (including explicit zeros in blocks) + m_nonzeros += blockOuterSize(bj) * blockInnerSize(bi); + } + } + } // end current outer block + } + blockPattern.finalize(); + + // Allocate the internal arrays + setBlockStructure(blockPattern); + + for(StorageIndex nz = 0; nz < m_nonzeros; ++nz) m_values[nz] = Scalar(0); + for(StorageIndex bj = 0; bj < m_outerBSize; ++bj) + { + // Now copy the values + for(StorageIndex j = blockOuterIndex(bj); j < blockOuterIndex(bj+1); ++j) + { + // Browse the outer block column by column (for column-major matrices) + typename MatrixType::InnerIterator it_spmat(spmat, j); + for(; it_spmat; ++it_spmat) + { + StorageIndex idx = 0; // Position of this block in the column block + StorageIndex bi = innerToBlock(it_spmat.index()); // Index of the current nonzero block + // Go to the inner block where this element belongs to + while(bi > m_indices[m_outerIndex[bj]+idx]) ++idx; // Not expensive for ordered blocks + StorageIndex idxVal;// Get the right position in the array of values for this element + if(m_blockSize == Dynamic) + { + // Offset from all blocks before ... + idxVal = m_blockPtr[m_outerIndex[bj]+idx]; + // ... and offset inside the block + idxVal += (j - blockOuterIndex(bj)) * blockOuterSize(bj) + it_spmat.index() - m_innerOffset[bi]; + } + else + { + // All blocks before + idxVal = (m_outerIndex[bj] + idx) * m_blockSize * m_blockSize; + // inside the block + idxVal += (j - blockOuterIndex(bj)) * m_blockSize + (it_spmat.index()%m_blockSize); + } + // Insert the value + m_values[idxVal] = it_spmat.value(); + } // end of this column + } // end of this block + } // end of this outer block + + return *this; + } + + /** + * \brief Set the nonzero block pattern of the matrix + * + * Given a sparse matrix describing the nonzero block pattern, + * this function prepares the internal pointers for values. + * After calling this function, any *nonzero* block (bi, bj) can be set + * with a simple call to coeffRef(bi,bj). 
+ * + * + * \warning Before calling this function, tt is necessary to call + * either setBlockLayout() (matrices with variable-size blocks) + * or setBlockSize() (for fixed-size blocks). + * + * \param blockPattern Sparse matrix of boolean elements describing the block structure + * + * \sa setBlockLayout() \sa setBlockSize() + */ + template + void setBlockStructure(const MatrixType& blockPattern) + { + resize(blockPattern.rows(), blockPattern.cols()); + reserve(blockPattern.nonZeros()); + + // Browse the block pattern and set up the various pointers + m_outerIndex[0] = 0; + if(m_blockSize == Dynamic) m_blockPtr[0] = 0; + for(StorageIndex nz = 0; nz < m_nonzeros; ++nz) m_values[nz] = Scalar(0); + for(StorageIndex bj = 0; bj < m_outerBSize; ++bj) + { + //Browse each outer block + + //First, copy and save the indices of nonzero blocks + //FIXME : find a way to avoid this ... + std::vector nzBlockIdx; + typename MatrixType::InnerIterator it(blockPattern, bj); + for(; it; ++it) + { + nzBlockIdx.push_back(it.index()); + } + std::sort(nzBlockIdx.begin(), nzBlockIdx.end()); + + // Now, fill block indices and (eventually) pointers to blocks + for(StorageIndex idx = 0; idx < nzBlockIdx.size(); ++idx) + { + StorageIndex offset = m_outerIndex[bj]+idx; // offset in m_indices + m_indices[offset] = nzBlockIdx[idx]; + if(m_blockSize == Dynamic) + m_blockPtr[offset] = m_blockPtr[offset-1] + blockInnerSize(nzBlockIdx[idx]) * blockOuterSize(bj); + // There is no blockPtr for fixed-size blocks... not needed !??? + } + // Save the pointer to the next outer block + m_outerIndex[bj+1] = m_outerIndex[bj] + nzBlockIdx.size(); + } + } + + /** + * \brief Set the number of rows and columns blocks + */ + inline void resize(Index brow, Index bcol) + { + m_innerBSize = IsColMajor ? brow : bcol; + m_outerBSize = IsColMajor ? 
bcol : brow; + } + + /** + * \brief set the block size at runtime for fixed-size block layout + * + * Call this only for fixed-size blocks + */ + inline void setBlockSize(Index blockSize) + { + m_blockSize = blockSize; + } + + /** + * \brief Set the row and column block layouts, + * + * This function set the size of each row and column block. + * So this function should be used only for blocks with variable size. + * \param rowBlocks : Number of rows per row block + * \param colBlocks : Number of columns per column block + * \sa resize(), setBlockSize() + */ + inline void setBlockLayout(const VectorXi& rowBlocks, const VectorXi& colBlocks) + { + const VectorXi& innerBlocks = IsColMajor ? rowBlocks : colBlocks; + const VectorXi& outerBlocks = IsColMajor ? colBlocks : rowBlocks; + eigen_assert(m_innerBSize == innerBlocks.size() && "CHECK THE NUMBER OF ROW OR COLUMN BLOCKS"); + eigen_assert(m_outerBSize == outerBlocks.size() && "CHECK THE NUMBER OF ROW OR COLUMN BLOCKS"); + m_outerBSize = outerBlocks.size(); + // starting index of blocks... cumulative sums + m_innerOffset = new StorageIndex[m_innerBSize+1]; + m_outerOffset = new StorageIndex[m_outerBSize+1]; + m_innerOffset[0] = 0; + m_outerOffset[0] = 0; + std::partial_sum(&innerBlocks[0], &innerBlocks[m_innerBSize-1]+1, &m_innerOffset[1]); + std::partial_sum(&outerBlocks[0], &outerBlocks[m_outerBSize-1]+1, &m_outerOffset[1]); + + // Compute the total number of nonzeros + m_nonzeros = 0; + for(StorageIndex bj = 0; bj < m_outerBSize; ++bj) + for(StorageIndex bi = 0; bi < m_innerBSize; ++bi) + m_nonzeros += outerBlocks[bj] * innerBlocks[bi]; + + } + + /** + * \brief Allocate the internal array of pointers to blocks and their inner indices + * + * \note For fixed-size blocks, call setBlockSize() to set the block. + * And For variable-size blocks, call setBlockLayout() before using this function + * + * \param nonzerosblocks Number of nonzero blocks. 
The total number of nonzeros is + * is computed in setBlockLayout() for variable-size blocks + * \sa setBlockSize() + */ + inline void reserve(const Index nonzerosblocks) + { + eigen_assert((m_innerBSize != 0 && m_outerBSize != 0) && + "TRYING TO RESERVE ZERO-SIZE MATRICES, CALL resize() first"); + + //FIXME Should free if already allocated + m_outerIndex = new StorageIndex[m_outerBSize+1]; + + m_nonzerosblocks = nonzerosblocks; + if(m_blockSize != Dynamic) + { + m_nonzeros = nonzerosblocks * (m_blockSize * m_blockSize); + m_blockPtr = 0; + } + else + { + // m_nonzeros is already computed in setBlockLayout() + m_blockPtr = new StorageIndex[m_nonzerosblocks+1]; + } + m_indices = new StorageIndex[m_nonzerosblocks+1]; + m_values = new Scalar[m_nonzeros]; + } + + + /** + * \brief Fill values in a matrix from a triplet list. + * + * Each triplet item has a block stored in an Eigen dense matrix. + * The InputIterator class should provide the functions row(), col() and value() + * + * \note For fixed-size blocks, call setBlockSize() before this function. 
+ * + * FIXME Do not accept duplicates + */ + template + void setFromTriplets(const InputIterator& begin, const InputIterator& end) + { + eigen_assert((m_innerBSize!=0 && m_outerBSize !=0) && "ZERO BLOCKS, PLEASE CALL resize() before"); + + /* First, sort the triplet list + * FIXME This can be unnecessarily expensive since only the inner indices have to be sorted + * The best approach is like in SparseMatrix::setFromTriplets() + */ + internal::TripletComp tripletcomp; + std::sort(begin, end, tripletcomp); + + /* Count the number of rows and column blocks, + * and the number of nonzero blocks per outer dimension + */ + VectorXi rowBlocks(m_innerBSize); // Size of each block row + VectorXi colBlocks(m_outerBSize); // Size of each block column + rowBlocks.setZero(); colBlocks.setZero(); + VectorXi nzblock_outer(m_outerBSize); // Number of nz blocks per outer vector + VectorXi nz_outer(m_outerBSize); // Number of nz per outer vector...for variable-size blocks + nzblock_outer.setZero(); + nz_outer.setZero(); + for(InputIterator it(begin); it !=end; ++it) + { + eigen_assert(it->row() >= 0 && it->row() < this->blockRows() && it->col() >= 0 && it->col() < this->blockCols()); + eigen_assert((it->value().rows() == it->value().cols() && (it->value().rows() == m_blockSize)) + || (m_blockSize == Dynamic)); + + if(m_blockSize == Dynamic) + { + eigen_assert((rowBlocks[it->row()] == 0 || rowBlocks[it->row()] == it->value().rows()) && + "NON CORRESPONDING SIZES FOR ROW BLOCKS"); + eigen_assert((colBlocks[it->col()] == 0 || colBlocks[it->col()] == it->value().cols()) && + "NON CORRESPONDING SIZES FOR COLUMN BLOCKS"); + rowBlocks[it->row()] =it->value().rows(); + colBlocks[it->col()] = it->value().cols(); + } + nz_outer(IsColMajor ? it->col() : it->row()) += it->value().rows() * it->value().cols(); + nzblock_outer(IsColMajor ? 
it->col() : it->row())++; + } + // Allocate member arrays + if(m_blockSize == Dynamic) setBlockLayout(rowBlocks, colBlocks); + StorageIndex nzblocks = nzblock_outer.sum(); + reserve(nzblocks); + + // Temporary markers + VectorXi block_id(m_outerBSize); // To be used as a block marker during insertion + + // Setup outer index pointers and markers + m_outerIndex[0] = 0; + if (m_blockSize == Dynamic) m_blockPtr[0] = 0; + for(StorageIndex bj = 0; bj < m_outerBSize; ++bj) + { + m_outerIndex[bj+1] = m_outerIndex[bj] + nzblock_outer(bj); + block_id(bj) = m_outerIndex[bj]; + if(m_blockSize==Dynamic) + { + m_blockPtr[m_outerIndex[bj+1]] = m_blockPtr[m_outerIndex[bj]] + nz_outer(bj); + } + } + + // Fill the matrix + for(InputIterator it(begin); it!=end; ++it) + { + StorageIndex outer = IsColMajor ? it->col() : it->row(); + StorageIndex inner = IsColMajor ? it->row() : it->col(); + m_indices[block_id(outer)] = inner; + StorageIndex block_size = it->value().rows()*it->value().cols(); + StorageIndex nz_marker = blockPtr(block_id[outer]); + memcpy(&(m_values[nz_marker]), it->value().data(), block_size * sizeof(Scalar)); + if(m_blockSize == Dynamic) + { + m_blockPtr[block_id(outer)+1] = m_blockPtr[block_id(outer)] + block_size; + } + block_id(outer)++; + } + + // An alternative when the outer indices are sorted...no need to use an array of markers +// for(Index bcol = 0; bcol < m_outerBSize; ++bcol) +// { +// Index id = 0, id_nz = 0, id_nzblock = 0; +// for(InputIterator it(begin); it!=end; ++it) +// { +// while (idvalue().rows()*it->value().cols(); +// m_blockPtr[id_nzblock+1] = m_blockPtr[id_nzblock] + block_size; +// id_nzblock++; +// memcpy(&(m_values[id_nz]),it->value().data(), block_size*sizeof(Scalar)); +// id_nz += block_size; +// } +// while(id < m_outerBSize-1) // Empty columns at the end +// { +// id++; +// m_outerIndex[id+1]=m_outerIndex[id]; +// } +// } + } + + + /** + * \returns the number of rows + */ + inline Index rows() const + { +// return blockRows(); + return 
(IsColMajor ? innerSize() : outerSize()); + } + + /** + * \returns the number of cols + */ + inline Index cols() const + { +// return blockCols(); + return (IsColMajor ? outerSize() : innerSize()); + } + + inline Index innerSize() const + { + if(m_blockSize == Dynamic) return m_innerOffset[m_innerBSize]; + else return (m_innerBSize * m_blockSize) ; + } + + inline Index outerSize() const + { + if(m_blockSize == Dynamic) return m_outerOffset[m_outerBSize]; + else return (m_outerBSize * m_blockSize) ; + } + /** \returns the number of rows grouped by blocks */ + inline Index blockRows() const + { + return (IsColMajor ? m_innerBSize : m_outerBSize); + } + /** \returns the number of columns grouped by blocks */ + inline Index blockCols() const + { + return (IsColMajor ? m_outerBSize : m_innerBSize); + } + + inline Index outerBlocks() const { return m_outerBSize; } + inline Index innerBlocks() const { return m_innerBSize; } + + /** \returns the block index where outer belongs to */ + inline Index outerToBlock(Index outer) const + { + eigen_assert(outer < outerSize() && "OUTER INDEX OUT OF BOUNDS"); + + if(m_blockSize != Dynamic) + return (outer / m_blockSize); // Integer division + + StorageIndex b_outer = 0; + while(m_outerOffset[b_outer] <= outer) ++b_outer; + return b_outer - 1; + } + /** \returns the block index where inner belongs to */ + inline Index innerToBlock(Index inner) const + { + eigen_assert(inner < innerSize() && "OUTER INDEX OUT OF BOUNDS"); + + if(m_blockSize != Dynamic) + return (inner / m_blockSize); // Integer division + + StorageIndex b_inner = 0; + while(m_innerOffset[b_inner] <= inner) ++b_inner; + return b_inner - 1; + } + + /** + *\returns a reference to the (i,j) block as an Eigen Dense Matrix + */ + Ref coeffRef(Index brow, Index bcol) + { + eigen_assert(brow < blockRows() && "BLOCK ROW INDEX OUT OF BOUNDS"); + eigen_assert(bcol < blockCols() && "BLOCK nzblocksFlagCOLUMN OUT OF BOUNDS"); + + StorageIndex rsize = IsColMajor ? 
blockInnerSize(brow): blockOuterSize(bcol); + StorageIndex csize = IsColMajor ? blockOuterSize(bcol) : blockInnerSize(brow); + StorageIndex inner = IsColMajor ? brow : bcol; + StorageIndex outer = IsColMajor ? bcol : brow; + StorageIndex offset = m_outerIndex[outer]; + while(offset < m_outerIndex[outer+1] && m_indices[offset] != inner) + offset++; + if(m_indices[offset] == inner) + { + return Map(&(m_values[blockPtr(offset)]), rsize, csize); + } + else + { + //FIXME the block does not exist, Insert it !!!!!!!!! + eigen_assert("DYNAMIC INSERTION IS NOT YET SUPPORTED"); + } + } + + /** + * \returns the value of the (i,j) block as an Eigen Dense Matrix + */ + Map coeff(Index brow, Index bcol) const + { + eigen_assert(brow < blockRows() && "BLOCK ROW INDEX OUT OF BOUNDS"); + eigen_assert(bcol < blockCols() && "BLOCK COLUMN OUT OF BOUNDS"); + + StorageIndex rsize = IsColMajor ? blockInnerSize(brow): blockOuterSize(bcol); + StorageIndex csize = IsColMajor ? blockOuterSize(bcol) : blockInnerSize(brow); + StorageIndex inner = IsColMajor ? brow : bcol; + StorageIndex outer = IsColMajor ? 
bcol : brow; + StorageIndex offset = m_outerIndex[outer]; + while(offset < m_outerIndex[outer+1] && m_indices[offset] != inner) offset++; + if(m_indices[offset] == inner) + { + return Map (&(m_values[blockPtr(offset)]), rsize, csize); + } + else +// return BlockScalar::Zero(rsize, csize); + eigen_assert("NOT YET SUPPORTED"); + } + + // Block Matrix times vector product + template + BlockSparseTimeDenseProduct operator*(const VecType& lhs) const + { + return BlockSparseTimeDenseProduct(*this, lhs); + } + + /** \returns the number of nonzero blocks */ + inline Index nonZerosBlocks() const { return m_nonzerosblocks; } + /** \returns the total number of nonzero elements, including eventual explicit zeros in blocks */ + inline Index nonZeros() const { return m_nonzeros; } + + inline BlockScalarReturnType *valuePtr() {return static_cast(m_values);} +// inline Scalar *valuePtr(){ return m_values; } + inline StorageIndex *innerIndexPtr() {return m_indices; } + inline const StorageIndex *innerIndexPtr() const {return m_indices; } + inline StorageIndex *outerIndexPtr() {return m_outerIndex; } + inline const StorageIndex* outerIndexPtr() const {return m_outerIndex; } + + /** \brief for compatibility purposes with the SparseMatrix class */ + inline bool isCompressed() const {return true;} + /** + * \returns the starting index of the bi row block + */ + inline Index blockRowsIndex(Index bi) const + { + return IsColMajor ? blockInnerIndex(bi) : blockOuterIndex(bi); + } + + /** + * \returns the starting index of the bj col block + */ + inline Index blockColsIndex(Index bj) const + { + return IsColMajor ? blockOuterIndex(bj) : blockInnerIndex(bj); + } + + inline Index blockOuterIndex(Index bj) const + { + return (m_blockSize == Dynamic) ? m_outerOffset[bj] : (bj * m_blockSize); + } + inline Index blockInnerIndex(Index bi) const + { + return (m_blockSize == Dynamic) ? m_innerOffset[bi] : (bi * m_blockSize); + } + + // Not needed ??? 
+ inline Index blockInnerSize(Index bi) const + { + return (m_blockSize == Dynamic) ? (m_innerOffset[bi+1] - m_innerOffset[bi]) : m_blockSize; + } + inline Index blockOuterSize(Index bj) const + { + return (m_blockSize == Dynamic) ? (m_outerOffset[bj+1]- m_outerOffset[bj]) : m_blockSize; + } + + /** + * \brief Browse the matrix by outer index + */ + class InnerIterator; // Browse column by column + + /** + * \brief Browse the matrix by block outer index + */ + class BlockInnerIterator; // Browse block by block + + friend std::ostream & operator << (std::ostream & s, const BlockSparseMatrix& m) + { + for (StorageIndex j = 0; j < m.outerBlocks(); ++j) + { + BlockInnerIterator itb(m, j); + for(; itb; ++itb) + { + s << "("<::type()); + } + + + protected: +// inline Index blockDynIdx(Index id, internal::true_type) const +// { +// return m_blockPtr[id]; +// } +// inline Index blockDynIdx(Index id, internal::false_type) const +// { +// return id * BlockSize * BlockSize; +// } + + // To be implemented + // Insert a block at a particular location... need to make a room for that + Map insert(Index brow, Index bcol); + + Index m_innerBSize; // Number of block rows + Index m_outerBSize; // Number of block columns + StorageIndex *m_innerOffset; // Starting index of each inner block (size m_innerBSize+1) + StorageIndex *m_outerOffset; // Starting index of each outer block (size m_outerBSize+1) + Index m_nonzerosblocks; // Total nonzeros blocks (lower than m_innerBSize x m_outerBSize) + Index m_nonzeros; // Total nonzeros elements + Scalar *m_values; //Values stored block column after block column (size m_nonzeros) + StorageIndex *m_blockPtr; // Pointer to the beginning of each block in m_values, size m_nonzeroblocks ... null for fixed-size blocks + StorageIndex *m_indices; //Inner block indices, size m_nonzerosblocks ... OK + StorageIndex *m_outerIndex; // Starting pointer of each block column in m_indices (size m_outerBSize)... 
OK + Index m_blockSize; // Size of a block for fixed-size blocks, otherwise -1 +}; + +template +class BlockSparseMatrix<_Scalar, _BlockAtCompileTime, _Options, _StorageIndex>::BlockInnerIterator +{ + public: + + enum{ + Flags = _Options + }; + + BlockInnerIterator(const BlockSparseMatrix& mat, const Index outer) + : m_mat(mat),m_outer(outer), + m_id(mat.m_outerIndex[outer]), + m_end(mat.m_outerIndex[outer+1]) + { + } + + inline BlockInnerIterator& operator++() {m_id++; return *this; } + + inline const Map value() const + { + return Map(&(m_mat.m_values[m_mat.blockPtr(m_id)]), + rows(),cols()); + } + inline Map valueRef() + { + return Map(&(m_mat.m_values[m_mat.blockPtr(m_id)]), + rows(),cols()); + } + // Block inner index + inline Index index() const {return m_mat.m_indices[m_id]; } + inline Index outer() const { return m_outer; } + // block row index + inline Index row() const {return index(); } + // block column index + inline Index col() const {return outer(); } + // FIXME Number of rows in the current block + inline Index rows() const { return (m_mat.m_blockSize==Dynamic) ? (m_mat.m_innerOffset[index()+1] - m_mat.m_innerOffset[index()]) : m_mat.m_blockSize; } + // Number of columns in the current block ... + inline Index cols() const { return (m_mat.m_blockSize==Dynamic) ? 
(m_mat.m_outerOffset[m_outer+1]-m_mat.m_outerOffset[m_outer]) : m_mat.m_blockSize;} + inline operator bool() const { return (m_id < m_end); } + + protected: + const BlockSparseMatrix<_Scalar, _BlockAtCompileTime, _Options, StorageIndex>& m_mat; + const Index m_outer; + Index m_id; + Index m_end; +}; + +template +class BlockSparseMatrix<_Scalar, _BlockAtCompileTime, _Options, _StorageIndex>::InnerIterator +{ + public: + InnerIterator(const BlockSparseMatrix& mat, Index outer) + : m_mat(mat),m_outerB(mat.outerToBlock(outer)),m_outer(outer), + itb(mat, mat.outerToBlock(outer)), + m_offset(outer - mat.blockOuterIndex(m_outerB)) + { + if (itb) + { + m_id = m_mat.blockInnerIndex(itb.index()); + m_start = m_id; + m_end = m_mat.blockInnerIndex(itb.index()+1); + } + } + inline InnerIterator& operator++() + { + m_id++; + if (m_id >= m_end) + { + ++itb; + if (itb) + { + m_id = m_mat.blockInnerIndex(itb.index()); + m_start = m_id; + m_end = m_mat.blockInnerIndex(itb.index()+1); + } + } + return *this; + } + inline const Scalar& value() const + { + return itb.value().coeff(m_id - m_start, m_offset); + } + inline Scalar& valueRef() + { + return itb.valueRef().coeff(m_id - m_start, m_offset); + } + inline Index index() const { return m_id; } + inline Index outer() const {return m_outer; } + inline Index col() const {return outer(); } + inline Index row() const { return index();} + inline operator bool() const + { + return itb; + } + protected: + const BlockSparseMatrix& m_mat; + const Index m_outer; + const Index m_outerB; + BlockInnerIterator itb; // Iterator through the blocks + const Index m_offset; // Position of this column in the block + Index m_start; // starting inner index of this block + Index m_id; // current inner index in the block + Index m_end; // starting inner index of the next block + +}; +} // end namespace Eigen + +#endif // EIGEN_SPARSEBLOCKMATRIX_H diff --git a/src/EigenUnsupported/src/SparseExtra/DynamicSparseMatrix.h 
b/src/EigenUnsupported/src/SparseExtra/DynamicSparseMatrix.h new file mode 100644 index 0000000..42c99e4 --- /dev/null +++ b/src/EigenUnsupported/src/SparseExtra/DynamicSparseMatrix.h @@ -0,0 +1,404 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2009 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_DYNAMIC_SPARSEMATRIX_H +#define EIGEN_DYNAMIC_SPARSEMATRIX_H + +namespace Eigen { + +/** \deprecated use a SparseMatrix in an uncompressed mode + * + * \class DynamicSparseMatrix + * + * \brief A sparse matrix class designed for matrix assembly purpose + * + * \param _Scalar the scalar type, i.e. the type of the coefficients + * + * Unlike SparseMatrix, this class provides a much higher degree of flexibility. In particular, it allows + * random read/write accesses in log(rho*outer_size) where \c rho is the probability that a coefficient is + * nonzero and outer_size is the number of columns if the matrix is column-major and the number of rows + * otherwise. + * + * Internally, the data are stored as a std::vector of compressed vector. The performances of random writes might + * decrease as the number of nonzeros per inner-vector increase. In practice, we observed very good performance + * till about 100 nonzeros/vector, and the performance remains relatively good till 500 nonzeros/vectors. 
+ * + * \see SparseMatrix + */ + +namespace internal { +template +struct traits > +{ + typedef _Scalar Scalar; + typedef _StorageIndex StorageIndex; + typedef Sparse StorageKind; + typedef MatrixXpr XprKind; + enum { + RowsAtCompileTime = Dynamic, + ColsAtCompileTime = Dynamic, + MaxRowsAtCompileTime = Dynamic, + MaxColsAtCompileTime = Dynamic, + Flags = _Options | NestByRefBit | LvalueBit, + CoeffReadCost = NumTraits::ReadCost, + SupportedAccessPatterns = OuterRandomAccessPattern + }; +}; +} + +template + class DynamicSparseMatrix + : public SparseMatrixBase > +{ + typedef SparseMatrixBase Base; + using Base::convert_index; + public: + EIGEN_SPARSE_PUBLIC_INTERFACE(DynamicSparseMatrix) + // FIXME: why are these operator already alvailable ??? + // EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(DynamicSparseMatrix, +=) + // EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(DynamicSparseMatrix, -=) + typedef MappedSparseMatrix Map; + using Base::IsRowMajor; + using Base::operator=; + enum { + Options = _Options + }; + + protected: + + typedef DynamicSparseMatrix TransposedSparseMatrix; + + Index m_innerSize; + std::vector > m_data; + + public: + + inline Index rows() const { return IsRowMajor ? outerSize() : m_innerSize; } + inline Index cols() const { return IsRowMajor ? m_innerSize : outerSize(); } + inline Index innerSize() const { return m_innerSize; } + inline Index outerSize() const { return convert_index(m_data.size()); } + inline Index innerNonZeros(Index j) const { return m_data[j].size(); } + + std::vector >& _data() { return m_data; } + const std::vector >& _data() const { return m_data; } + + /** \returns the coefficient value at given position \a row, \a col + * This operation involes a log(rho*outer_size) binary search. + */ + inline Scalar coeff(Index row, Index col) const + { + const Index outer = IsRowMajor ? row : col; + const Index inner = IsRowMajor ? 
col : row; + return m_data[outer].at(inner); + } + + /** \returns a reference to the coefficient value at given position \a row, \a col + * This operation involes a log(rho*outer_size) binary search. If the coefficient does not + * exist yet, then a sorted insertion into a sequential buffer is performed. + */ + inline Scalar& coeffRef(Index row, Index col) + { + const Index outer = IsRowMajor ? row : col; + const Index inner = IsRowMajor ? col : row; + return m_data[outer].atWithInsertion(inner); + } + + class InnerIterator; + class ReverseInnerIterator; + + void setZero() + { + for (Index j=0; j0) + { + Index reserveSizePerVector = (std::max)(reserveSize/outerSize(),Index(4)); + for (Index j=0; j(m_data[outer].size()) - 1; + m_data[outer].resize(id+2,1); + + while ( (id >= startId) && (m_data[outer].index(id) > inner) ) + { + m_data[outer].index(id+1) = m_data[outer].index(id); + m_data[outer].value(id+1) = m_data[outer].value(id); + --id; + } + m_data[outer].index(id+1) = inner; + m_data[outer].value(id+1) = 0; + return m_data[outer].value(id+1); + } + + /** Does nothing: provided for compatibility with SparseMatrix */ + inline void finalize() {} + + /** Suppress all nonzeros which are smaller than \a reference under the tolerance \a epsilon */ + void prune(Scalar reference, RealScalar epsilon = NumTraits::dummy_precision()) + { + for (Index j=0; jinnerSize) + { + // remove all coefficients with innerCoord>=innerSize + // TODO + //std::cerr << "not implemented yet\n"; + exit(2); + } + if (m_data.size() != outerSize) + { + m_data.resize(outerSize); + } + } + + /** The class DynamicSparseMatrix is deprecated */ + EIGEN_DEPRECATED inline DynamicSparseMatrix() + : m_innerSize(0), m_data(0) + { + #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN + EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN + #endif + eigen_assert(innerSize()==0 && outerSize()==0); + } + + /** The class DynamicSparseMatrix is deprecated */ + EIGEN_DEPRECATED inline DynamicSparseMatrix(Index rows, Index cols) + : 
m_innerSize(0) + { + #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN + EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN + #endif + resize(rows, cols); + } + + /** The class DynamicSparseMatrix is deprecated */ + template + EIGEN_DEPRECATED explicit inline DynamicSparseMatrix(const SparseMatrixBase& other) + : m_innerSize(0) + { + #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN + EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN + #endif + Base::operator=(other.derived()); + } + + inline DynamicSparseMatrix(const DynamicSparseMatrix& other) + : Base(), m_innerSize(0) + { + #ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN + EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN + #endif + *this = other.derived(); + } + + inline void swap(DynamicSparseMatrix& other) + { + //EIGEN_DBG_SPARSE(std::cout << "SparseMatrix:: swap\n"); + std::swap(m_innerSize, other.m_innerSize); + //std::swap(m_outerSize, other.m_outerSize); + m_data.swap(other.m_data); + } + + inline DynamicSparseMatrix& operator=(const DynamicSparseMatrix& other) + { + if (other.isRValue()) + { + swap(other.const_cast_derived()); + } + else + { + resize(other.rows(), other.cols()); + m_data = other.m_data; + } + return *this; + } + + /** Destructor */ + inline ~DynamicSparseMatrix() {} + + public: + + /** \deprecated + * Set the matrix to zero and reserve the memory for \a reserveSize nonzero coefficients. */ + EIGEN_DEPRECATED void startFill(Index reserveSize = 1000) + { + setZero(); + reserve(reserveSize); + } + + /** \deprecated use insert() + * inserts a nonzero coefficient at given coordinates \a row, \a col and returns its reference assuming that: + * 1 - the coefficient does not exist yet + * 2 - this the coefficient with greater inner coordinate for the given outer coordinate. + * In other words, assuming \c *this is column-major, then there must not exists any nonzero coefficient of coordinates + * \c i \c x \a col such that \c i >= \a row. Otherwise the matrix is invalid. 
+ * + * \see fillrand(), coeffRef() + */ + EIGEN_DEPRECATED Scalar& fill(Index row, Index col) + { + const Index outer = IsRowMajor ? row : col; + const Index inner = IsRowMajor ? col : row; + return insertBack(outer,inner); + } + + /** \deprecated use insert() + * Like fill() but with random inner coordinates. + * Compared to the generic coeffRef(), the unique limitation is that we assume + * the coefficient does not exist yet. + */ + EIGEN_DEPRECATED Scalar& fillrand(Index row, Index col) + { + return insert(row,col); + } + + /** \deprecated use finalize() + * Does nothing. Provided for compatibility with SparseMatrix. */ + EIGEN_DEPRECATED void endFill() {} + +# ifdef EIGEN_DYNAMICSPARSEMATRIX_PLUGIN +# include EIGEN_DYNAMICSPARSEMATRIX_PLUGIN +# endif + }; + +template +class DynamicSparseMatrix::InnerIterator : public SparseVector::InnerIterator +{ + typedef typename SparseVector::InnerIterator Base; + public: + InnerIterator(const DynamicSparseMatrix& mat, Index outer) + : Base(mat.m_data[outer]), m_outer(outer) + {} + + inline Index row() const { return IsRowMajor ? m_outer : Base::index(); } + inline Index col() const { return IsRowMajor ? Base::index() : m_outer; } + inline Index outer() const { return m_outer; } + + protected: + const Index m_outer; +}; + +template +class DynamicSparseMatrix::ReverseInnerIterator : public SparseVector::ReverseInnerIterator +{ + typedef typename SparseVector::ReverseInnerIterator Base; + public: + ReverseInnerIterator(const DynamicSparseMatrix& mat, Index outer) + : Base(mat.m_data[outer]), m_outer(outer) + {} + + inline Index row() const { return IsRowMajor ? m_outer : Base::index(); } + inline Index col() const { return IsRowMajor ? 
Base::index() : m_outer; } + inline Index outer() const { return m_outer; } + + protected: + const Index m_outer; +}; + +namespace internal { + +template +struct evaluator > + : evaluator_base > +{ + typedef _Scalar Scalar; + typedef DynamicSparseMatrix<_Scalar,_Options,_StorageIndex> SparseMatrixType; + typedef typename SparseMatrixType::InnerIterator InnerIterator; + typedef typename SparseMatrixType::ReverseInnerIterator ReverseInnerIterator; + + enum { + CoeffReadCost = NumTraits<_Scalar>::ReadCost, + Flags = SparseMatrixType::Flags + }; + + evaluator() : m_matrix(0) {} + evaluator(const SparseMatrixType &mat) : m_matrix(&mat) {} + + operator SparseMatrixType&() { return m_matrix->const_cast_derived(); } + operator const SparseMatrixType&() const { return *m_matrix; } + + Scalar coeff(Index row, Index col) const { return m_matrix->coeff(row,col); } + + Index nonZerosEstimate() const { return m_matrix->nonZeros(); } + + const SparseMatrixType *m_matrix; +}; + +} + +} // end namespace Eigen + +#endif // EIGEN_DYNAMIC_SPARSEMATRIX_H diff --git a/src/EigenUnsupported/src/SparseExtra/MarketIO.h b/src/EigenUnsupported/src/SparseExtra/MarketIO.h new file mode 100644 index 0000000..dd786d5 --- /dev/null +++ b/src/EigenUnsupported/src/SparseExtra/MarketIO.h @@ -0,0 +1,282 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2011 Gael Guennebaud +// Copyright (C) 2012 Desire NUENTSA WAKAM +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_SPARSE_MARKET_IO_H +#define EIGEN_SPARSE_MARKET_IO_H + +#include +#include + +namespace Eigen { + +namespace internal +{ + template + inline void GetMarketLine (const char* line, StorageIndex& i, StorageIndex& j, Scalar& value) + { + std::stringstream sline(line); + sline >> i >> j >> value; + } + + template<> inline void GetMarketLine (const char* line, int& i, int& j, float& value) + { std::sscanf(line, "%d %d %g", &i, &j, &value); } + + template<> inline void GetMarketLine (const char* line, int& i, int& j, double& value) + { std::sscanf(line, "%d %d %lg", &i, &j, &value); } + + template<> inline void GetMarketLine (const char* line, int& i, int& j, std::complex& value) + { std::sscanf(line, "%d %d %g %g", &i, &j, &numext::real_ref(value), &numext::imag_ref(value)); } + + template<> inline void GetMarketLine (const char* line, int& i, int& j, std::complex& value) + { std::sscanf(line, "%d %d %lg %lg", &i, &j, &numext::real_ref(value), &numext::imag_ref(value)); } + + template + inline void GetMarketLine (const char* line, StorageIndex& i, StorageIndex& j, std::complex& value) + { + std::stringstream sline(line); + Scalar valR, valI; + sline >> i >> j >> valR >> valI; + value = std::complex(valR,valI); + } + + template + inline void GetVectorElt (const std::string& line, RealScalar& val) + { + std::istringstream newline(line); + newline >> val; + } + + template + inline void GetVectorElt (const std::string& line, std::complex& val) + { + RealScalar valR, valI; + std::istringstream newline(line); + newline >> valR >> valI; + val = std::complex(valR, valI); + } + + template + inline void putMarketHeader(std::string& header,int sym) + { + header= "%%MatrixMarket matrix coordinate "; + if(internal::is_same >::value || internal::is_same >::value) + { + header += " complex"; + if(sym == Symmetric) header += " symmetric"; + else if (sym == SelfAdjoint) header += " Hermitian"; + else header += " general"; + } + else + { + header += " real"; + if(sym == 
Symmetric) header += " symmetric"; + else header += " general"; + } + } + + template + inline void PutMatrixElt(Scalar value, StorageIndex row, StorageIndex col, std::ofstream& out) + { + out << row << " "<< col << " " << value << "\n"; + } + template + inline void PutMatrixElt(std::complex value, StorageIndex row, StorageIndex col, std::ofstream& out) + { + out << row << " " << col << " " << value.real() << " " << value.imag() << "\n"; + } + + + template + inline void putVectorElt(Scalar value, std::ofstream& out) + { + out << value << "\n"; + } + template + inline void putVectorElt(std::complex value, std::ofstream& out) + { + out << value.real() << " " << value.imag()<< "\n"; + } + +} // end namespace internal + +inline bool getMarketHeader(const std::string& filename, int& sym, bool& iscomplex, bool& isvector) +{ + sym = 0; + iscomplex = false; + isvector = false; + std::ifstream in(filename.c_str(),std::ios::in); + if(!in) + return false; + + std::string line; + // The matrix header is always the first line in the file + std::getline(in, line); eigen_assert(in.good()); + + std::stringstream fmtline(line); + std::string substr[5]; + fmtline>> substr[0] >> substr[1] >> substr[2] >> substr[3] >> substr[4]; + if(substr[2].compare("array") == 0) isvector = true; + if(substr[3].compare("complex") == 0) iscomplex = true; + if(substr[4].compare("symmetric") == 0) sym = Symmetric; + else if (substr[4].compare("Hermitian") == 0) sym = SelfAdjoint; + + return true; +} + +template +bool loadMarket(SparseMatrixType& mat, const std::string& filename) +{ + typedef typename SparseMatrixType::Scalar Scalar; + typedef typename SparseMatrixType::StorageIndex StorageIndex; + std::ifstream input(filename.c_str(),std::ios::in); + if(!input) + return false; + + char rdbuffer[4096]; + input.rdbuf()->pubsetbuf(rdbuffer, 4096); + + const int maxBuffersize = 2048; + char buffer[maxBuffersize]; + + bool readsizes = false; + + typedef Triplet T; + std::vector elements; + + Index M(-1), 
N(-1), NNZ(-1); + Index count = 0; + while(input.getline(buffer, maxBuffersize)) + { + // skip comments + //NOTE An appropriate test should be done on the header to get the symmetry + if(buffer[0]=='%') + continue; + + if(!readsizes) + { + std::stringstream line(buffer); + line >> M >> N >> NNZ; + if(M > 0 && N > 0) + { + readsizes = true; + mat.resize(M,N); + mat.reserve(NNZ); + } + } + else + { + StorageIndex i(-1), j(-1); + Scalar value; + internal::GetMarketLine(buffer, i, j, value); + + i--; + j--; + if(i>=0 && j>=0 && i +bool loadMarketVector(VectorType& vec, const std::string& filename) +{ + typedef typename VectorType::Scalar Scalar; + std::ifstream in(filename.c_str(), std::ios::in); + if(!in) + return false; + + std::string line; + int n(0), col(0); + do + { // Skip comments + std::getline(in, line); eigen_assert(in.good()); + } while (line[0] == '%'); + std::istringstream newline(line); + newline >> n >> col; + eigen_assert(n>0 && col>0); + vec.resize(n); + int i = 0; + Scalar value; + while ( std::getline(in, line) && (i < n) ){ + internal::GetVectorElt(line, value); + vec(i++) = value; + } + in.close(); + if (i!=n){ + std::cerr<< "Unable to read all elements from file " << filename << "\n"; + return false; + } + return true; +} + +template +bool saveMarket(const SparseMatrixType& mat, const std::string& filename, int sym = 0) +{ + typedef typename SparseMatrixType::Scalar Scalar; + typedef typename SparseMatrixType::RealScalar RealScalar; + std::ofstream out(filename.c_str(),std::ios::out); + if(!out) + return false; + + out.flags(std::ios_base::scientific); + out.precision(std::numeric_limits::digits10 + 2); + std::string header; + internal::putMarketHeader(header, sym); + out << header << std::endl; + out << mat.rows() << " " << mat.cols() << " " << mat.nonZeros() << "\n"; + int count = 0; + for(int j=0; j +bool saveMarketVector (const VectorType& vec, const std::string& filename) +{ + typedef typename VectorType::Scalar Scalar; + typedef typename 
VectorType::RealScalar RealScalar; + std::ofstream out(filename.c_str(),std::ios::out); + if(!out) + return false; + + out.flags(std::ios_base::scientific); + out.precision(std::numeric_limits::digits10 + 2); + if(internal::is_same >::value || internal::is_same >::value) + out << "%%MatrixMarket matrix array complex general\n"; + else + out << "%%MatrixMarket matrix array real general\n"; + out << vec.size() << " "<< 1 << "\n"; + for (int i=0; i < vec.size(); i++){ + internal::putVectorElt(vec(i), out); + } + out.close(); + return true; +} + +} // end namespace Eigen + +#endif // EIGEN_SPARSE_MARKET_IO_H diff --git a/src/EigenUnsupported/src/SparseExtra/MatrixMarketIterator.h b/src/EigenUnsupported/src/SparseExtra/MatrixMarketIterator.h new file mode 100644 index 0000000..02916ea --- /dev/null +++ b/src/EigenUnsupported/src/SparseExtra/MatrixMarketIterator.h @@ -0,0 +1,247 @@ + +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2012 Desire NUENTSA WAKAM +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_BROWSE_MATRICES_H +#define EIGEN_BROWSE_MATRICES_H + +namespace Eigen { + +enum { + SPD = 0x100, + NonSymmetric = 0x0 +}; + +/** + * @brief Iterator to browse matrices from a specified folder + * + * This is used to load all the matrices from a folder. + * The matrices should be in Matrix Market format + * It is assumed that the matrices are named as matname.mtx + * and matname_SPD.mtx if the matrix is Symmetric and positive definite (or Hermitian) + * The right hand side vectors are loaded as well, if they exist. + * They should be named as matname_b.mtx. + * Note that the right hand side for a SPD matrix is named as matname_SPD_b.mtx + * + * Sometimes a reference solution is available. 
In this case, it should be named as matname_x.mtx + * + * Sample code + * \code + * + * \endcode + * + * \tparam Scalar The scalar type + */ +template +class MatrixMarketIterator +{ + typedef typename NumTraits::Real RealScalar; + public: + typedef Matrix VectorType; + typedef SparseMatrix MatrixType; + + public: + MatrixMarketIterator(const std::string &folder) + : m_sym(0), m_isvalid(false), m_matIsLoaded(false), m_hasRhs(false), m_hasrefX(false), m_folder(folder) + { + m_folder_id = opendir(folder.c_str()); + if(m_folder_id) + Getnextvalidmatrix(); + } + + ~MatrixMarketIterator() + { + if (m_folder_id) closedir(m_folder_id); + } + + inline MatrixMarketIterator& operator++() + { + m_matIsLoaded = false; + m_hasrefX = false; + m_hasRhs = false; + Getnextvalidmatrix(); + return *this; + } + inline operator bool() const { return m_isvalid;} + + /** Return the sparse matrix corresponding to the current file */ + inline MatrixType& matrix() + { + // Read the matrix + if (m_matIsLoaded) return m_mat; + + std::string matrix_file = m_folder + "/" + m_matname + ".mtx"; + if ( !loadMarket(m_mat, matrix_file)) + { + std::cerr << "Warning loadMarket failed when loading \"" << matrix_file << "\"" << std::endl; + m_matIsLoaded = false; + return m_mat; + } + m_matIsLoaded = true; + + if (m_sym != NonSymmetric) + { + // Check whether we need to restore a full matrix: + RealScalar diag_norm = m_mat.diagonal().norm(); + RealScalar lower_norm = m_mat.template triangularView().norm(); + RealScalar upper_norm = m_mat.template triangularView().norm(); + if(lower_norm>diag_norm && upper_norm==diag_norm) + { + // only the lower part is stored + MatrixType tmp(m_mat); + m_mat = tmp.template selfadjointView(); + } + else if(upper_norm>diag_norm && lower_norm==diag_norm) + { + // only the upper part is stored + MatrixType tmp(m_mat); + m_mat = tmp.template selfadjointView(); + } + } + return m_mat; + } + + /** Return the right hand side corresponding to the current matrix. 
+ * If the rhs file is not provided, a random rhs is generated + */ + inline VectorType& rhs() + { + // Get the right hand side + if (m_hasRhs) return m_rhs; + + std::string rhs_file; + rhs_file = m_folder + "/" + m_matname + "_b.mtx"; // The pattern is matname_b.mtx + m_hasRhs = Fileexists(rhs_file); + if (m_hasRhs) + { + m_rhs.resize(m_mat.cols()); + m_hasRhs = loadMarketVector(m_rhs, rhs_file); + } + if (!m_hasRhs) + { + // Generate a random right hand side + if (!m_matIsLoaded) this->matrix(); + m_refX.resize(m_mat.cols()); + m_refX.setRandom(); + m_rhs = m_mat * m_refX; + m_hasrefX = true; + m_hasRhs = true; + } + return m_rhs; + } + + /** Return a reference solution + * If it is not provided and if the right hand side is not available + * then refX is randomly generated such that A*refX = b + * where A and b are the matrix and the rhs. + * Note that when a rhs is provided, refX is not available + */ + inline VectorType& refX() + { + // Check if a reference solution is provided + if (m_hasrefX) return m_refX; + + std::string lhs_file; + lhs_file = m_folder + "/" + m_matname + "_x.mtx"; + m_hasrefX = Fileexists(lhs_file); + if (m_hasrefX) + { + m_refX.resize(m_mat.cols()); + m_hasrefX = loadMarketVector(m_refX, lhs_file); + } + else + m_refX.resize(0); + return m_refX; + } + + inline std::string& matname() { return m_matname; } + + inline int sym() { return m_sym; } + + bool hasRhs() {return m_hasRhs; } + bool hasrefX() {return m_hasrefX; } + bool isFolderValid() { return bool(m_folder_id); } + + protected: + + inline bool Fileexists(std::string file) + { + std::ifstream file_id(file.c_str()); + if (!file_id.good() ) + { + return false; + } + else + { + file_id.close(); + return true; + } + } + + void Getnextvalidmatrix( ) + { + m_isvalid = false; + // Here, we return with the next valid matrix in the folder + while ( (m_curs_id = readdir(m_folder_id)) != NULL) { + m_isvalid = false; + std::string curfile; + curfile = m_folder + "/" + m_curs_id->d_name; + // 
Discard if it is a folder + if (m_curs_id->d_type == DT_DIR) continue; //FIXME This may not be available on non BSD systems +// struct stat st_buf; +// stat (curfile.c_str(), &st_buf); +// if (S_ISDIR(st_buf.st_mode)) continue; + + // Determine from the header if it is a matrix or a right hand side + bool isvector,iscomplex=false; + if(!getMarketHeader(curfile,m_sym,iscomplex,isvector)) continue; + if(isvector) continue; + if (!iscomplex) + { + if(internal::is_same >::value || internal::is_same >::value) + continue; + } + if (iscomplex) + { + if(internal::is_same::value || internal::is_same::value) + continue; + } + + + // Get the matrix name + std::string filename = m_curs_id->d_name; + m_matname = filename.substr(0, filename.length()-4); + + // Find if the matrix is SPD + size_t found = m_matname.find("SPD"); + if( (found!=std::string::npos) && (m_sym != NonSymmetric) ) + m_sym = SPD; + + m_isvalid = true; + break; + } + } + int m_sym; // Symmetry of the matrix + MatrixType m_mat; // Current matrix + VectorType m_rhs; // Current vector + VectorType m_refX; // The reference solution, if exists + std::string m_matname; // Matrix Name + bool m_isvalid; + bool m_matIsLoaded; // Determine if the matrix has already been loaded from the file + bool m_hasRhs; // The right hand side exists + bool m_hasrefX; // A reference solution is provided + std::string m_folder; + DIR * m_folder_id; + struct dirent *m_curs_id; + +}; + +} // end namespace Eigen + +#endif diff --git a/src/EigenUnsupported/src/SparseExtra/RandomSetter.h b/src/EigenUnsupported/src/SparseExtra/RandomSetter.h new file mode 100644 index 0000000..985702b --- /dev/null +++ b/src/EigenUnsupported/src/SparseExtra/RandomSetter.h @@ -0,0 +1,349 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_RANDOMSETTER_H +#define EIGEN_RANDOMSETTER_H + +#if defined(EIGEN_GOOGLEHASH_SUPPORT) +// Ensure the ::google namespace exists, required for checking existence of +// ::google::dense_hash_map and ::google::sparse_hash_map. +namespace google {} +#endif + +namespace Eigen { + +/** Represents a std::map + * + * \see RandomSetter + */ +template struct StdMapTraits +{ + typedef int KeyType; + typedef std::map Type; + enum { + IsSorted = 1 + }; + + static void setInvalidKey(Type&, const KeyType&) {} +}; + +#ifdef EIGEN_UNORDERED_MAP_SUPPORT +/** Represents a std::unordered_map + * + * To use it you need to both define EIGEN_UNORDERED_MAP_SUPPORT and include the unordered_map header file + * yourself making sure that unordered_map is defined in the std namespace. + * + * For instance, with current version of gcc you can either enable C++0x standard (-std=c++0x) or do: + * \code + * #include + * #define EIGEN_UNORDERED_MAP_SUPPORT + * namespace std { + * using std::tr1::unordered_map; + * } + * \endcode + * + * \see RandomSetter + */ +template struct StdUnorderedMapTraits +{ + typedef int KeyType; + typedef std::unordered_map Type; + enum { + IsSorted = 0 + }; + + static void setInvalidKey(Type&, const KeyType&) {} +}; +#endif // EIGEN_UNORDERED_MAP_SUPPORT + +#if defined(EIGEN_GOOGLEHASH_SUPPORT) + +namespace google { + +// Namespace work-around, since sometimes dense_hash_map and sparse_hash_map +// are in the global namespace, and other times they are under ::google. 
+using namespace ::google; + +template +struct DenseHashMap { + typedef dense_hash_map type; +}; + +template +struct SparseHashMap { + typedef sparse_hash_map type; +}; + +} // namespace google + +/** Represents a google::dense_hash_map + * + * \see RandomSetter + */ +template struct GoogleDenseHashMapTraits +{ + typedef int KeyType; + typedef typename google::DenseHashMap::type Type; + enum { + IsSorted = 0 + }; + + static void setInvalidKey(Type& map, const KeyType& k) + { map.set_empty_key(k); } +}; + +/** Represents a google::sparse_hash_map + * + * \see RandomSetter + */ +template struct GoogleSparseHashMapTraits +{ + typedef int KeyType; + typedef typename google::SparseHashMap::type Type; + enum { + IsSorted = 0 + }; + + static void setInvalidKey(Type&, const KeyType&) {} +}; +#endif + +/** \class RandomSetter + * + * \brief The RandomSetter is a wrapper object allowing to set/update a sparse matrix with random access + * + * \tparam SparseMatrixType the type of the sparse matrix we are updating + * \tparam MapTraits a traits class representing the map implementation used for the temporary sparse storage. + * Its default value depends on the system. + * \tparam OuterPacketBits defines the number of rows (or columns) manage by a single map object + * as a power of two exponent. + * + * This class temporarily represents a sparse matrix object using a generic map implementation allowing for + * efficient random access. The conversion from the compressed representation to a hash_map object is performed + * in the RandomSetter constructor, while the sparse matrix is updated back at destruction time. This strategy + * suggest the use of nested blocks as in this example: + * + * \code + * SparseMatrix m(rows,cols); + * { + * RandomSetter > w(m); + * // don't use m but w instead with read/write random access to the coefficients: + * for(;;) + * w(rand(),rand()) = rand; + * } + * // when w is deleted, the data are copied back to m + * // and m is ready to use. 
+ * \endcode + * + * Since hash_map objects are not fully sorted, representing a full matrix as a single hash_map would + * involve a big and costly sort to update the compressed matrix back. To overcome this issue, a RandomSetter + * use multiple hash_map, each representing 2^OuterPacketBits columns or rows according to the storage order. + * To reach optimal performance, this value should be adjusted according to the average number of nonzeros + * per rows/columns. + * + * The possible values for the template parameter MapTraits are: + * - \b StdMapTraits: corresponds to std::map. (does not perform very well) + * - \b GnuHashMapTraits: corresponds to __gnu_cxx::hash_map (available only with GCC) + * - \b GoogleDenseHashMapTraits: corresponds to google::dense_hash_map (best efficiency, reasonable memory consumption) + * - \b GoogleSparseHashMapTraits: corresponds to google::sparse_hash_map (best memory consumption, relatively good performance) + * + * The default map implementation depends on the availability, and the preferred order is: + * GoogleSparseHashMapTraits, GnuHashMapTraits, and finally StdMapTraits. + * + * For performance and memory consumption reasons it is highly recommended to use one of + * Google's hash_map implementations. To enable the support for them, you must define + * EIGEN_GOOGLEHASH_SUPPORT. This will include both and + * for you. 
+ * + * \see https://github.com/sparsehash/sparsehash + */ +template class MapTraits = +#if defined(EIGEN_GOOGLEHASH_SUPPORT) + GoogleDenseHashMapTraits +#elif defined(_HASH_MAP) + GnuHashMapTraits +#else + StdMapTraits +#endif + ,int OuterPacketBits = 6> +class RandomSetter +{ + typedef typename SparseMatrixType::Scalar Scalar; + typedef typename SparseMatrixType::StorageIndex StorageIndex; + + struct ScalarWrapper + { + ScalarWrapper() : value(0) {} + Scalar value; + }; + typedef typename MapTraits::KeyType KeyType; + typedef typename MapTraits::Type HashMapType; + static const int OuterPacketMask = (1 << OuterPacketBits) - 1; + enum { + SwapStorage = 1 - MapTraits::IsSorted, + TargetRowMajor = (SparseMatrixType::Flags & RowMajorBit) ? 1 : 0, + SetterRowMajor = SwapStorage ? 1-TargetRowMajor : TargetRowMajor + }; + + public: + + /** Constructs a random setter object from the sparse matrix \a target + * + * Note that the initial value of \a target are imported. If you want to re-set + * a sparse matrix from scratch, then you must set it to zero first using the + * setZero() function. + */ + inline RandomSetter(SparseMatrixType& target) + : mp_target(&target) + { + const Index outerSize = SwapStorage ? target.innerSize() : target.outerSize(); + const Index innerSize = SwapStorage ? 
target.outerSize() : target.innerSize(); + m_outerPackets = outerSize >> OuterPacketBits; + if (outerSize&OuterPacketMask) + m_outerPackets += 1; + m_hashmaps = new HashMapType[m_outerPackets]; + // compute number of bits needed to store inner indices + Index aux = innerSize - 1; + m_keyBitsOffset = 0; + while (aux) + { + ++m_keyBitsOffset; + aux = aux >> 1; + } + KeyType ik = (1<<(OuterPacketBits+m_keyBitsOffset)); + for (Index k=0; k::setInvalidKey(m_hashmaps[k],ik); + + // insert current coeffs + for (Index j=0; jouterSize(); ++j) + for (typename SparseMatrixType::InnerIterator it(*mp_target,j); it; ++it) + (*this)(TargetRowMajor?j:it.index(), TargetRowMajor?it.index():j) = it.value(); + } + + /** Destructor updating back the sparse matrix target */ + ~RandomSetter() + { + KeyType keyBitsMask = (1<setZero(); + mp_target->makeCompressed(); + mp_target->reserve(nonZeros()); + Index prevOuter = -1; + for (Index k=0; kfirst >> m_keyBitsOffset) + outerOffset; + const Index inner = it->first & keyBitsMask; + if (prevOuter!=outer) + { + for (Index j=prevOuter+1;j<=outer;++j) + mp_target->startVec(j); + prevOuter = outer; + } + mp_target->insertBackByOuterInner(outer, inner) = it->second.value; + } + } + mp_target->finalize(); + } + else + { + VectorXi positions(mp_target->outerSize()); + positions.setZero(); + // pass 1 + for (Index k=0; kfirst & keyBitsMask; + ++positions[outer]; + } + } + // prefix sum + StorageIndex count = 0; + for (Index j=0; jouterSize(); ++j) + { + StorageIndex tmp = positions[j]; + mp_target->outerIndexPtr()[j] = count; + positions[j] = count; + count += tmp; + } + mp_target->makeCompressed(); + mp_target->outerIndexPtr()[mp_target->outerSize()] = count; + mp_target->resizeNonZeros(count); + // pass 2 + for (Index k=0; kfirst >> m_keyBitsOffset) + outerOffset; + const Index outer = it->first & keyBitsMask; + // sorted insertion + // Note that we have to deal with at most 2^OuterPacketBits unsorted coefficients, + // moreover those 
2^OuterPacketBits coeffs are likely to be sparse, an so only a + // small fraction of them have to be sorted, whence the following simple procedure: + Index posStart = mp_target->outerIndexPtr()[outer]; + Index i = (positions[outer]++) - 1; + while ( (i >= posStart) && (mp_target->innerIndexPtr()[i] > inner) ) + { + mp_target->valuePtr()[i+1] = mp_target->valuePtr()[i]; + mp_target->innerIndexPtr()[i+1] = mp_target->innerIndexPtr()[i]; + --i; + } + mp_target->innerIndexPtr()[i+1] = internal::convert_index(inner); + mp_target->valuePtr()[i+1] = it->second.value; + } + } + } + delete[] m_hashmaps; + } + + /** \returns a reference to the coefficient at given coordinates \a row, \a col */ + Scalar& operator() (Index row, Index col) + { + const Index outer = SetterRowMajor ? row : col; + const Index inner = SetterRowMajor ? col : row; + const Index outerMajor = outer >> OuterPacketBits; // index of the packet/map + const Index outerMinor = outer & OuterPacketMask; // index of the inner vector in the packet + const KeyType key = internal::convert_index((outerMinor<(m_hashmaps[k].size()); + return nz; + } + + + protected: + + HashMapType* m_hashmaps; + SparseMatrixType* mp_target; + Index m_outerPackets; + unsigned char m_keyBitsOffset; +}; + +} // end namespace Eigen + +#endif // EIGEN_RANDOMSETTER_H diff --git a/src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsArrayAPI.h b/src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsArrayAPI.h new file mode 100644 index 0000000..41d2bf6 --- /dev/null +++ b/src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsArrayAPI.h @@ -0,0 +1,286 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ + +#ifndef EIGEN_BESSELFUNCTIONS_ARRAYAPI_H +#define EIGEN_BESSELFUNCTIONS_ARRAYAPI_H + +namespace Eigen { + +/** \returns an expression of the coefficient-wise i0(\a x) to the given + * arrays. + * + * It returns the modified Bessel function of the first kind of order zero. + * + * \param x is the argument + * + * \note This function supports only float and double scalar types. To support + * other scalar types, the user has to provide implementations of i0(T) for + * any scalar type T to be supported. + * + * \sa ArrayBase::bessel_i0() + */ +template +EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp< + Eigen::internal::scalar_bessel_i0_op, const Derived> +bessel_i0(const Eigen::ArrayBase& x) { + return Eigen::CwiseUnaryOp< + Eigen::internal::scalar_bessel_i0_op, + const Derived>(x.derived()); +} + +/** \returns an expression of the coefficient-wise i0e(\a x) to the given + * arrays. + * + * It returns the exponentially scaled modified Bessel + * function of the first kind of order zero. + * + * \param x is the argument + * + * \note This function supports only float and double scalar types. To support + * other scalar types, the user has to provide implementations of i0e(T) for + * any scalar type T to be supported. + * + * \sa ArrayBase::bessel_i0e() + */ +template +EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp< + Eigen::internal::scalar_bessel_i0e_op, const Derived> +bessel_i0e(const Eigen::ArrayBase& x) { + return Eigen::CwiseUnaryOp< + Eigen::internal::scalar_bessel_i0e_op, + const Derived>(x.derived()); +} + +/** \returns an expression of the coefficient-wise i1(\a x) to the given + * arrays. + * + * It returns the modified Bessel function of the first kind of order one. + * + * \param x is the argument + * + * \note This function supports only float and double scalar types. To support + * other scalar types, the user has to provide implementations of i1(T) for + * any scalar type T to be supported. 
+ * + * \sa ArrayBase::bessel_i1() + */ +template +EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp< + Eigen::internal::scalar_bessel_i1_op, const Derived> +bessel_i1(const Eigen::ArrayBase& x) { + return Eigen::CwiseUnaryOp< + Eigen::internal::scalar_bessel_i1_op, + const Derived>(x.derived()); +} + +/** \returns an expression of the coefficient-wise i1e(\a x) to the given + * arrays. + * + * It returns the exponentially scaled modified Bessel + * function of the first kind of order one. + * + * \param x is the argument + * + * \note This function supports only float and double scalar types. To support + * other scalar types, the user has to provide implementations of i1e(T) for + * any scalar type T to be supported. + * + * \sa ArrayBase::bessel_i1e() + */ +template +EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp< + Eigen::internal::scalar_bessel_i1e_op, const Derived> +bessel_i1e(const Eigen::ArrayBase& x) { + return Eigen::CwiseUnaryOp< + Eigen::internal::scalar_bessel_i1e_op, + const Derived>(x.derived()); +} + +/** \returns an expression of the coefficient-wise k0(\a x) to the given + * arrays. + * + * It returns the modified Bessel function of the second kind of order zero. + * + * \param x is the argument + * + * \note This function supports only float and double scalar types. To support + * other scalar types, the user has to provide implementations of k0(T) for + * any scalar type T to be supported. + * + * \sa ArrayBase::bessel_k0() + */ +template +EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp< + Eigen::internal::scalar_bessel_k0_op, const Derived> +bessel_k0(const Eigen::ArrayBase& x) { + return Eigen::CwiseUnaryOp< + Eigen::internal::scalar_bessel_k0_op, + const Derived>(x.derived()); +} + +/** \returns an expression of the coefficient-wise k0e(\a x) to the given + * arrays. + * + * It returns the exponentially scaled modified Bessel + * function of the second kind of order zero. 
+ * + * \param x is the argument + * + * \note This function supports only float and double scalar types. To support + * other scalar types, the user has to provide implementations of k0e(T) for + * any scalar type T to be supported. + * + * \sa ArrayBase::bessel_k0e() + */ +template +EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp< + Eigen::internal::scalar_bessel_k0e_op, const Derived> +bessel_k0e(const Eigen::ArrayBase& x) { + return Eigen::CwiseUnaryOp< + Eigen::internal::scalar_bessel_k0e_op, + const Derived>(x.derived()); +} + +/** \returns an expression of the coefficient-wise k1(\a x) to the given + * arrays. + * + * It returns the modified Bessel function of the second kind of order one. + * + * \param x is the argument + * + * \note This function supports only float and double scalar types. To support + * other scalar types, the user has to provide implementations of k1(T) for + * any scalar type T to be supported. + * + * \sa ArrayBase::bessel_k1() + */ +template +EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp< + Eigen::internal::scalar_bessel_k1_op, const Derived> +bessel_k1(const Eigen::ArrayBase& x) { + return Eigen::CwiseUnaryOp< + Eigen::internal::scalar_bessel_k1_op, + const Derived>(x.derived()); +} + +/** \returns an expression of the coefficient-wise k1e(\a x) to the given + * arrays. + * + * It returns the exponentially scaled modified Bessel + * function of the second kind of order one. + * + * \param x is the argument + * + * \note This function supports only float and double scalar types. To support + * other scalar types, the user has to provide implementations of k1e(T) for + * any scalar type T to be supported. 
+ * + * \sa ArrayBase::bessel_k1e() + */ +template +EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp< + Eigen::internal::scalar_bessel_k1e_op, const Derived> +bessel_k1e(const Eigen::ArrayBase& x) { + return Eigen::CwiseUnaryOp< + Eigen::internal::scalar_bessel_k1e_op, + const Derived>(x.derived()); +} + +/** \returns an expression of the coefficient-wise j0(\a x) to the given + * arrays. + * + * It returns the Bessel function of the first kind of order zero. + * + * \param x is the argument + * + * \note This function supports only float and double scalar types. To support + * other scalar types, the user has to provide implementations of j0(T) for + * any scalar type T to be supported. + * + * \sa ArrayBase::bessel_j0() + */ +template +EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp< + Eigen::internal::scalar_bessel_j0_op, const Derived> +bessel_j0(const Eigen::ArrayBase& x) { + return Eigen::CwiseUnaryOp< + Eigen::internal::scalar_bessel_j0_op, + const Derived>(x.derived()); +} + +/** \returns an expression of the coefficient-wise y0(\a x) to the given + * arrays. + * + * It returns the Bessel function of the second kind of order zero. + * + * \param x is the argument + * + * \note This function supports only float and double scalar types. To support + * other scalar types, the user has to provide implementations of y0(T) for + * any scalar type T to be supported. + * + * \sa ArrayBase::bessel_y0() + */ +template +EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp< + Eigen::internal::scalar_bessel_y0_op, const Derived> +bessel_y0(const Eigen::ArrayBase& x) { + return Eigen::CwiseUnaryOp< + Eigen::internal::scalar_bessel_y0_op, + const Derived>(x.derived()); +} + +/** \returns an expression of the coefficient-wise j1(\a x) to the given + * arrays. + * + * It returns the Bessel function of the first kind of order one. + * + * \param x is the argument + * + * \note This function supports only float and double scalar types. 
To support + * other scalar types, the user has to provide implementations of j1(T) for + * any scalar type T to be supported. + * + * \sa ArrayBase::bessel_j1() + */ +template +EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp< + Eigen::internal::scalar_bessel_j1_op, const Derived> +bessel_j1(const Eigen::ArrayBase& x) { + return Eigen::CwiseUnaryOp< + Eigen::internal::scalar_bessel_j1_op, + const Derived>(x.derived()); +} + +/** \returns an expression of the coefficient-wise y1(\a x) to the given + * arrays. + * + * It returns the Bessel function of the second kind of order one. + * + * \param x is the argument + * + * \note This function supports only float and double scalar types. To support + * other scalar types, the user has to provide implementations of y1(T) for + * any scalar type T to be supported. + * + * \sa ArrayBase::bessel_y1() + */ +template +EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp< + Eigen::internal::scalar_bessel_y1_op, const Derived> +bessel_y1(const Eigen::ArrayBase& x) { + return Eigen::CwiseUnaryOp< + Eigen::internal::scalar_bessel_y1_op, + const Derived>(x.derived()); +} + +} // end namespace Eigen + +#endif // EIGEN_BESSELFUNCTIONS_ARRAYAPI_H diff --git a/src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsBFloat16.h b/src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsBFloat16.h new file mode 100644 index 0000000..6049cc2 --- /dev/null +++ b/src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsBFloat16.h @@ -0,0 +1,68 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_BESSELFUNCTIONS_BFLOAT16_H +#define EIGEN_BESSELFUNCTIONS_BFLOAT16_H + +namespace Eigen { +namespace numext { + +#if EIGEN_HAS_C99_MATH +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_i0(const Eigen::bfloat16& x) { + return Eigen::bfloat16(Eigen::numext::bessel_i0(static_cast(x))); +} +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_i0e(const Eigen::bfloat16& x) { + return Eigen::bfloat16(Eigen::numext::bessel_i0e(static_cast(x))); +} +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_i1(const Eigen::bfloat16& x) { + return Eigen::bfloat16(Eigen::numext::bessel_i1(static_cast(x))); +} +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_i1e(const Eigen::bfloat16& x) { + return Eigen::bfloat16(Eigen::numext::bessel_i1e(static_cast(x))); +} +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_j0(const Eigen::bfloat16& x) { + return Eigen::bfloat16(Eigen::numext::bessel_j0(static_cast(x))); +} +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_j1(const Eigen::bfloat16& x) { + return Eigen::bfloat16(Eigen::numext::bessel_j1(static_cast(x))); +} +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_y0(const Eigen::bfloat16& x) { + return Eigen::bfloat16(Eigen::numext::bessel_y0(static_cast(x))); +} +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_y1(const Eigen::bfloat16& x) { + return Eigen::bfloat16(Eigen::numext::bessel_y1(static_cast(x))); +} +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_k0(const Eigen::bfloat16& x) { + return Eigen::bfloat16(Eigen::numext::bessel_k0(static_cast(x))); +} +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_k0e(const Eigen::bfloat16& x) { + return Eigen::bfloat16(Eigen::numext::bessel_k0e(static_cast(x))); +} +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 
bessel_k1(const Eigen::bfloat16& x) { + return Eigen::bfloat16(Eigen::numext::bessel_k1(static_cast(x))); +} +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_k1e(const Eigen::bfloat16& x) { + return Eigen::bfloat16(Eigen::numext::bessel_k1e(static_cast(x))); +} +#endif + +} // end namespace numext +} // end namespace Eigen + +#endif // EIGEN_BESSELFUNCTIONS_BFLOAT16_H diff --git a/src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsFunctors.h b/src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsFunctors.h new file mode 100644 index 0000000..8606a9f --- /dev/null +++ b/src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsFunctors.h @@ -0,0 +1,357 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Eugene Brevdo +// Copyright (C) 2016 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_BESSELFUNCTIONS_FUNCTORS_H +#define EIGEN_BESSELFUNCTIONS_FUNCTORS_H + +namespace Eigen { + +namespace internal { + +/** \internal + * \brief Template functor to compute the modified Bessel function of the first + * kind of order zero. + * \sa class CwiseUnaryOp, Cwise::bessel_i0() + */ +template +struct scalar_bessel_i0_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_i0_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const { + using numext::bessel_i0; + return bessel_i0(x); + } + typedef typename packet_traits::type Packet; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const { + return internal::pbessel_i0(x); + } +}; +template +struct functor_traits > { + enum { + // On average, a Chebyshev polynomial of order N=20 is computed. + // The cost is N multiplications and 2N additions. 
We also add + // the cost of an additional exp over i0e. + Cost = 28 * NumTraits::MulCost + 48 * NumTraits::AddCost, + PacketAccess = packet_traits::HasBessel + }; +}; + +/** \internal + * \brief Template functor to compute the exponentially scaled modified Bessel + * function of the first kind of order zero + * \sa class CwiseUnaryOp, Cwise::bessel_i0e() + */ +template +struct scalar_bessel_i0e_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_i0e_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const { + using numext::bessel_i0e; + return bessel_i0e(x); + } + typedef typename packet_traits::type Packet; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const { + return internal::pbessel_i0e(x); + } +}; +template +struct functor_traits > { + enum { + // On average, a Chebyshev polynomial of order N=20 is computed. + // The cost is N multiplications and 2N additions. + Cost = 20 * NumTraits::MulCost + 40 * NumTraits::AddCost, + PacketAccess = packet_traits::HasBessel + }; +}; + +/** \internal + * \brief Template functor to compute the modified Bessel function of the first + * kind of order one + * \sa class CwiseUnaryOp, Cwise::bessel_i1() + */ +template +struct scalar_bessel_i1_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_i1_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const { + using numext::bessel_i1; + return bessel_i1(x); + } + typedef typename packet_traits::type Packet; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const { + return internal::pbessel_i1(x); + } +}; +template +struct functor_traits > { + enum { + // On average, a Chebyshev polynomial of order N=20 is computed. + // The cost is N multiplications and 2N additions. We also add + // the cost of an additional exp over i1e. 
+ Cost = 28 * NumTraits::MulCost + 48 * NumTraits::AddCost, + PacketAccess = packet_traits::HasBessel + }; +}; + +/** \internal + * \brief Template functor to compute the exponentially scaled modified Bessel + * function of the first kind of order one + * \sa class CwiseUnaryOp, Cwise::bessel_i1e() + */ +template +struct scalar_bessel_i1e_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_i1e_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const { + using numext::bessel_i1e; + return bessel_i1e(x); + } + typedef typename packet_traits::type Packet; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const { + return internal::pbessel_i1e(x); + } +}; +template +struct functor_traits > { + enum { + // On average, a Chebyshev polynomial of order N=20 is computed. + // The cost is N multiplications and 2N additions. + Cost = 20 * NumTraits::MulCost + 40 * NumTraits::AddCost, + PacketAccess = packet_traits::HasBessel + }; +}; + +/** \internal + * \brief Template functor to compute the Bessel function of the first kind of + * order zero + * \sa class CwiseUnaryOp, Cwise::bessel_j0() + */ +template +struct scalar_bessel_j0_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_j0_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const { + using numext::bessel_j0; + return bessel_j0(x); + } + typedef typename packet_traits::type Packet; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const { + return internal::pbessel_j0(x); + } +}; +template +struct functor_traits > { + enum { + // 6 polynomial of order ~N=8 is computed. + // The cost is N multiplications and N additions each, along with a + // sine, cosine and rsqrt cost. 
+ Cost = 63 * NumTraits::MulCost + 48 * NumTraits::AddCost, + PacketAccess = packet_traits::HasBessel + }; +}; + +/** \internal + * \brief Template functor to compute the Bessel function of the second kind of + * order zero + * \sa class CwiseUnaryOp, Cwise::bessel_y0() + */ +template +struct scalar_bessel_y0_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_y0_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const { + using numext::bessel_y0; + return bessel_y0(x); + } + typedef typename packet_traits::type Packet; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const { + return internal::pbessel_y0(x); + } +}; +template +struct functor_traits > { + enum { + // 6 polynomial of order ~N=8 is computed. + // The cost is N multiplications and N additions each, along with a + // sine, cosine, rsqrt and j0 cost. + Cost = 126 * NumTraits::MulCost + 96 * NumTraits::AddCost, + PacketAccess = packet_traits::HasBessel + }; +}; + +/** \internal + * \brief Template functor to compute the Bessel function of the first kind of + * order one + * \sa class CwiseUnaryOp, Cwise::bessel_j1() + */ +template +struct scalar_bessel_j1_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_j1_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const { + using numext::bessel_j1; + return bessel_j1(x); + } + typedef typename packet_traits::type Packet; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const { + return internal::pbessel_j1(x); + } +}; +template +struct functor_traits > { + enum { + // 6 polynomial of order ~N=8 is computed. + // The cost is N multiplications and N additions each, along with a + // sine, cosine and rsqrt cost. 
+ Cost = 63 * NumTraits::MulCost + 48 * NumTraits::AddCost, + PacketAccess = packet_traits::HasBessel + }; +}; + +/** \internal + * \brief Template functor to compute the Bessel function of the second kind of + * order one + * \sa class CwiseUnaryOp, Cwise::bessel_y1() + */ +template +struct scalar_bessel_y1_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_y1_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const { + using numext::bessel_y1; + return bessel_y1(x); + } + typedef typename packet_traits::type Packet; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const { + return internal::pbessel_y1(x); + } +}; +template +struct functor_traits > { + enum { + // 6 polynomial of order ~N=8 is computed. + // The cost is N multiplications and N additions each, along with a + // sine, cosine, rsqrt and j1 cost. + Cost = 126 * NumTraits::MulCost + 96 * NumTraits::AddCost, + PacketAccess = packet_traits::HasBessel + }; +}; + +/** \internal + * \brief Template functor to compute the modified Bessel function of the second + * kind of order zero + * \sa class CwiseUnaryOp, Cwise::bessel_k0() + */ +template +struct scalar_bessel_k0_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_k0_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const { + using numext::bessel_k0; + return bessel_k0(x); + } + typedef typename packet_traits::type Packet; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const { + return internal::pbessel_k0(x); + } +}; +template +struct functor_traits > { + enum { + // On average, a Chebyshev polynomial of order N=10 is computed. + // The cost is N multiplications and 2N additions. In addition we compute + // i0, a log, exp and prsqrt and sin and cos. 
+ Cost = 68 * NumTraits::MulCost + 88 * NumTraits::AddCost, + PacketAccess = packet_traits::HasBessel + }; +}; + +/** \internal + * \brief Template functor to compute the exponentially scaled modified Bessel + * function of the second kind of order zero + * \sa class CwiseUnaryOp, Cwise::bessel_k0e() + */ +template +struct scalar_bessel_k0e_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_k0e_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const { + using numext::bessel_k0e; + return bessel_k0e(x); + } + typedef typename packet_traits::type Packet; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const { + return internal::pbessel_k0e(x); + } +}; +template +struct functor_traits > { + enum { + // On average, a Chebyshev polynomial of order N=10 is computed. + // The cost is N multiplications and 2N additions. In addition we compute + // i0, a log, exp and prsqrt and sin and cos. + Cost = 68 * NumTraits::MulCost + 88 * NumTraits::AddCost, + PacketAccess = packet_traits::HasBessel + }; +}; + +/** \internal + * \brief Template functor to compute the modified Bessel function of the + * second kind of order one + * \sa class CwiseUnaryOp, Cwise::bessel_k1() + */ +template +struct scalar_bessel_k1_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_k1_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const { + using numext::bessel_k1; + return bessel_k1(x); + } + typedef typename packet_traits::type Packet; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const { + return internal::pbessel_k1(x); + } +}; +template +struct functor_traits > { + enum { + // On average, a Chebyshev polynomial of order N=10 is computed. + // The cost is N multiplications and 2N additions. In addition we compute + // i1, a log, exp and prsqrt and sin and cos. 
+ Cost = 68 * NumTraits::MulCost + 88 * NumTraits::AddCost, + PacketAccess = packet_traits::HasBessel + }; +}; + +/** \internal + * \brief Template functor to compute the exponentially scaled modified Bessel + * function of the second kind of order one + * \sa class CwiseUnaryOp, Cwise::bessel_k1e() + */ +template +struct scalar_bessel_k1e_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_k1e_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const { + using numext::bessel_k1e; + return bessel_k1e(x); + } + typedef typename packet_traits::type Packet; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const { + return internal::pbessel_k1e(x); + } +}; +template +struct functor_traits > { + enum { + // On average, a Chebyshev polynomial of order N=10 is computed. + // The cost is N multiplications and 2N additions. In addition we compute + // i1, a log, exp and prsqrt and sin and cos. + Cost = 68 * NumTraits::MulCost + 88 * NumTraits::AddCost, + PacketAccess = packet_traits::HasBessel + }; +}; + + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_BESSELFUNCTIONS_FUNCTORS_H diff --git a/src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsHalf.h b/src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsHalf.h new file mode 100644 index 0000000..8930d1a --- /dev/null +++ b/src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsHalf.h @@ -0,0 +1,66 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_BESSELFUNCTIONS_HALF_H +#define EIGEN_BESSELFUNCTIONS_HALF_H + +namespace Eigen { +namespace numext { + +#if EIGEN_HAS_C99_MATH +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_i0(const Eigen::half& x) { + return Eigen::half(Eigen::numext::bessel_i0(static_cast(x))); +} +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_i0e(const Eigen::half& x) { + return Eigen::half(Eigen::numext::bessel_i0e(static_cast(x))); +} +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_i1(const Eigen::half& x) { + return Eigen::half(Eigen::numext::bessel_i1(static_cast(x))); +} +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_i1e(const Eigen::half& x) { + return Eigen::half(Eigen::numext::bessel_i1e(static_cast(x))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_j0(const Eigen::half& x) { + return Eigen::half(Eigen::numext::bessel_j0(static_cast(x))); +} +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_j1(const Eigen::half& x) { + return Eigen::half(Eigen::numext::bessel_j1(static_cast(x))); +} +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_y0(const Eigen::half& x) { + return Eigen::half(Eigen::numext::bessel_y0(static_cast(x))); +} +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_y1(const Eigen::half& x) { + return Eigen::half(Eigen::numext::bessel_y1(static_cast(x))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_k0(const Eigen::half& x) { + return Eigen::half(Eigen::numext::bessel_k0(static_cast(x))); +} +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_k0e(const Eigen::half& x) { + return Eigen::half(Eigen::numext::bessel_k0e(static_cast(x))); +} +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_k1(const Eigen::half& x) { + return Eigen::half(Eigen::numext::bessel_k1(static_cast(x))); +} +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC 
Eigen::half bessel_k1e(const Eigen::half& x) { + return Eigen::half(Eigen::numext::bessel_k1e(static_cast(x))); +} +#endif + +} // end namespace numext +} // end namespace Eigen + +#endif // EIGEN_BESSELFUNCTIONS_HALF_H diff --git a/src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsImpl.h b/src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsImpl.h new file mode 100644 index 0000000..24812be --- /dev/null +++ b/src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsImpl.h @@ -0,0 +1,1959 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Eugene Brevdo +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_BESSEL_FUNCTIONS_H +#define EIGEN_BESSEL_FUNCTIONS_H + +namespace Eigen { +namespace internal { + +// Parts of this code are based on the Cephes Math Library. +// +// Cephes Math Library Release 2.8: June, 2000 +// Copyright 1984, 1987, 1992, 2000 by Stephen L. Moshier +// +// Permission has been kindly provided by the original author +// to incorporate the Cephes software into the Eigen codebase: +// +// From: Stephen Moshier +// To: Eugene Brevdo +// Subject: Re: Permission to wrap several cephes functions in Eigen +// +// Hello Eugene, +// +// Thank you for writing. +// +// If your licensing is similar to BSD, the formal way that has been +// handled is simply to add a statement to the effect that you are incorporating +// the Cephes software by permission of the author. 
+// +// Good luck with your project, +// Steve + + +/**************************************************************************** + * Implementation of Bessel function, based on Cephes * + ****************************************************************************/ + +template +struct bessel_i0e_retval { + typedef Scalar type; +}; + +template ::type> +struct generic_i0e { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T&) { + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return ScalarType(0); + } +}; + +template +struct generic_i0e { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T& x) { + /* i0ef.c + * + * Modified Bessel function of order zero, + * exponentially scaled + * + * + * + * SYNOPSIS: + * + * float x, y, i0ef(); + * + * y = i0ef( x ); + * + * + * + * DESCRIPTION: + * + * Returns exponentially scaled modified Bessel function + * of order zero of the argument. + * + * The function is defined as i0e(x) = exp(-|x|) j0( ix ). + * + * + * + * ACCURACY: + * + * Relative error: + * arithmetic domain # trials peak rms + * IEEE 0,30 100000 3.7e-7 7.0e-8 + * See i0f(). 
+ * + */ + + const float A[] = {-1.30002500998624804212E-8f, 6.04699502254191894932E-8f, + -2.67079385394061173391E-7f, 1.11738753912010371815E-6f, + -4.41673835845875056359E-6f, 1.64484480707288970893E-5f, + -5.75419501008210370398E-5f, 1.88502885095841655729E-4f, + -5.76375574538582365885E-4f, 1.63947561694133579842E-3f, + -4.32430999505057594430E-3f, 1.05464603945949983183E-2f, + -2.37374148058994688156E-2f, 4.93052842396707084878E-2f, + -9.49010970480476444210E-2f, 1.71620901522208775349E-1f, + -3.04682672343198398683E-1f, 6.76795274409476084995E-1f}; + + const float B[] = {3.39623202570838634515E-9f, 2.26666899049817806459E-8f, + 2.04891858946906374183E-7f, 2.89137052083475648297E-6f, + 6.88975834691682398426E-5f, 3.36911647825569408990E-3f, + 8.04490411014108831608E-1f}; + T y = pabs(x); + T y_le_eight = internal::pchebevl::run( + pmadd(pset1(0.5f), y, pset1(-2.0f)), A); + T y_gt_eight = pmul( + internal::pchebevl::run( + psub(pdiv(pset1(32.0f), y), pset1(2.0f)), B), + prsqrt(y)); + // TODO: Perhaps instead check whether all packet elements are in + // [-8, 8] and evaluate a branch based off of that. It's possible + // in practice most elements are in this region. + return pselect(pcmp_le(y, pset1(8.0f)), y_le_eight, y_gt_eight); + } +}; + +template +struct generic_i0e { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T& x) { + /* i0e.c + * + * Modified Bessel function of order zero, + * exponentially scaled + * + * + * + * SYNOPSIS: + * + * double x, y, i0e(); + * + * y = i0e( x ); + * + * + * + * DESCRIPTION: + * + * Returns exponentially scaled modified Bessel function + * of order zero of the argument. + * + * The function is defined as i0e(x) = exp(-|x|) j0( ix ). + * + * + * + * ACCURACY: + * + * Relative error: + * arithmetic domain # trials peak rms + * IEEE 0,30 30000 5.4e-16 1.2e-16 + * See i0(). 
+ * + */ + + const double A[] = {-4.41534164647933937950E-18, 3.33079451882223809783E-17, + -2.43127984654795469359E-16, 1.71539128555513303061E-15, + -1.16853328779934516808E-14, 7.67618549860493561688E-14, + -4.85644678311192946090E-13, 2.95505266312963983461E-12, + -1.72682629144155570723E-11, 9.67580903537323691224E-11, + -5.18979560163526290666E-10, 2.65982372468238665035E-9, + -1.30002500998624804212E-8, 6.04699502254191894932E-8, + -2.67079385394061173391E-7, 1.11738753912010371815E-6, + -4.41673835845875056359E-6, 1.64484480707288970893E-5, + -5.75419501008210370398E-5, 1.88502885095841655729E-4, + -5.76375574538582365885E-4, 1.63947561694133579842E-3, + -4.32430999505057594430E-3, 1.05464603945949983183E-2, + -2.37374148058994688156E-2, 4.93052842396707084878E-2, + -9.49010970480476444210E-2, 1.71620901522208775349E-1, + -3.04682672343198398683E-1, 6.76795274409476084995E-1}; + const double B[] = { + -7.23318048787475395456E-18, -4.83050448594418207126E-18, + 4.46562142029675999901E-17, 3.46122286769746109310E-17, + -2.82762398051658348494E-16, -3.42548561967721913462E-16, + 1.77256013305652638360E-15, 3.81168066935262242075E-15, + -9.55484669882830764870E-15, -4.15056934728722208663E-14, + 1.54008621752140982691E-14, 3.85277838274214270114E-13, + 7.18012445138366623367E-13, -1.79417853150680611778E-12, + -1.32158118404477131188E-11, -3.14991652796324136454E-11, + 1.18891471078464383424E-11, 4.94060238822496958910E-10, + 3.39623202570838634515E-9, 2.26666899049817806459E-8, + 2.04891858946906374183E-7, 2.89137052083475648297E-6, + 6.88975834691682398426E-5, 3.36911647825569408990E-3, + 8.04490411014108831608E-1}; + T y = pabs(x); + T y_le_eight = internal::pchebevl::run( + pmadd(pset1(0.5), y, pset1(-2.0)), A); + T y_gt_eight = pmul( + internal::pchebevl::run( + psub(pdiv(pset1(32.0), y), pset1(2.0)), B), + prsqrt(y)); + // TODO: Perhaps instead check whether all packet elements are in + // [-8, 8] and evaluate a branch based off of that. 
It's possible + // in practice most elements are in this region. + return pselect(pcmp_le(y, pset1(8.0)), y_le_eight, y_gt_eight); + } +}; + +template +struct bessel_i0e_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T x) { + return generic_i0e::run(x); + } +}; + +template +struct bessel_i0_retval { + typedef Scalar type; +}; + +template ::type> +struct generic_i0 { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T& x) { + return pmul( + pexp(pabs(x)), + generic_i0e::run(x)); + } +}; + +template +struct bessel_i0_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T x) { + return generic_i0::run(x); + } +}; + +template +struct bessel_i1e_retval { + typedef Scalar type; +}; + +template ::type > +struct generic_i1e { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T&) { + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return ScalarType(0); + } +}; + +template +struct generic_i1e { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T& x) { + /* i1ef.c + * + * Modified Bessel function of order one, + * exponentially scaled + * + * + * + * SYNOPSIS: + * + * float x, y, i1ef(); + * + * y = i1ef( x ); + * + * + * + * DESCRIPTION: + * + * Returns exponentially scaled modified Bessel function + * of order one of the argument. + * + * The function is defined as i1(x) = -i exp(-|x|) j1( ix ). + * + * + * + * ACCURACY: + * + * Relative error: + * arithmetic domain # trials peak rms + * IEEE 0, 30 30000 1.5e-6 1.5e-7 + * See i1(). 
+ * + */ + const float A[] = {9.38153738649577178388E-9f, -4.44505912879632808065E-8f, + 2.00329475355213526229E-7f, -8.56872026469545474066E-7f, + 3.47025130813767847674E-6f, -1.32731636560394358279E-5f, + 4.78156510755005422638E-5f, -1.61760815825896745588E-4f, + 5.12285956168575772895E-4f, -1.51357245063125314899E-3f, + 4.15642294431288815669E-3f, -1.05640848946261981558E-2f, + 2.47264490306265168283E-2f, -5.29459812080949914269E-2f, + 1.02643658689847095384E-1f, -1.76416518357834055153E-1f, + 2.52587186443633654823E-1f}; + + const float B[] = {-3.83538038596423702205E-9f, -2.63146884688951950684E-8f, + -2.51223623787020892529E-7f, -3.88256480887769039346E-6f, + -1.10588938762623716291E-4f, -9.76109749136146840777E-3f, + 7.78576235018280120474E-1f}; + + + T y = pabs(x); + T y_le_eight = pmul(y, internal::pchebevl::run( + pmadd(pset1(0.5f), y, pset1(-2.0f)), A)); + T y_gt_eight = pmul( + internal::pchebevl::run( + psub(pdiv(pset1(32.0f), y), + pset1(2.0f)), B), + prsqrt(y)); + // TODO: Perhaps instead check whether all packet elements are in + // [-8, 8] and evaluate a branch based off of that. It's possible + // in practice most elements are in this region. + y = pselect(pcmp_le(y, pset1(8.0f)), y_le_eight, y_gt_eight); + return pselect(pcmp_lt(x, pset1(0.0f)), pnegate(y), y); + } +}; + +template +struct generic_i1e { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T& x) { + /* i1e.c + * + * Modified Bessel function of order one, + * exponentially scaled + * + * + * + * SYNOPSIS: + * + * double x, y, i1e(); + * + * y = i1e( x ); + * + * + * + * DESCRIPTION: + * + * Returns exponentially scaled modified Bessel function + * of order one of the argument. + * + * The function is defined as i1(x) = -i exp(-|x|) j1( ix ). + * + * + * + * ACCURACY: + * + * Relative error: + * arithmetic domain # trials peak rms + * IEEE 0, 30 30000 2.0e-15 2.0e-16 + * See i1(). 
+ * + */ + const double A[] = {2.77791411276104639959E-18, -2.11142121435816608115E-17, + 1.55363195773620046921E-16, -1.10559694773538630805E-15, + 7.60068429473540693410E-15, -5.04218550472791168711E-14, + 3.22379336594557470981E-13, -1.98397439776494371520E-12, + 1.17361862988909016308E-11, -6.66348972350202774223E-11, + 3.62559028155211703701E-10, -1.88724975172282928790E-9, + 9.38153738649577178388E-9, -4.44505912879632808065E-8, + 2.00329475355213526229E-7, -8.56872026469545474066E-7, + 3.47025130813767847674E-6, -1.32731636560394358279E-5, + 4.78156510755005422638E-5, -1.61760815825896745588E-4, + 5.12285956168575772895E-4, -1.51357245063125314899E-3, + 4.15642294431288815669E-3, -1.05640848946261981558E-2, + 2.47264490306265168283E-2, -5.29459812080949914269E-2, + 1.02643658689847095384E-1, -1.76416518357834055153E-1, + 2.52587186443633654823E-1}; + const double B[] = { + 7.51729631084210481353E-18, 4.41434832307170791151E-18, + -4.65030536848935832153E-17, -3.20952592199342395980E-17, + 2.96262899764595013876E-16, 3.30820231092092828324E-16, + -1.88035477551078244854E-15, -3.81440307243700780478E-15, + 1.04202769841288027642E-14, 4.27244001671195135429E-14, + -2.10154184277266431302E-14, -4.08355111109219731823E-13, + -7.19855177624590851209E-13, 2.03562854414708950722E-12, + 1.41258074366137813316E-11, 3.25260358301548823856E-11, + -1.89749581235054123450E-11, -5.58974346219658380687E-10, + -3.83538038596423702205E-9, -2.63146884688951950684E-8, + -2.51223623787020892529E-7, -3.88256480887769039346E-6, + -1.10588938762623716291E-4, -9.76109749136146840777E-3, + 7.78576235018280120474E-1}; + T y = pabs(x); + T y_le_eight = pmul(y, internal::pchebevl::run( + pmadd(pset1(0.5), y, pset1(-2.0)), A)); + T y_gt_eight = pmul( + internal::pchebevl::run( + psub(pdiv(pset1(32.0), y), + pset1(2.0)), B), + prsqrt(y)); + // TODO: Perhaps instead check whether all packet elements are in + // [-8, 8] and evaluate a branch based off of that. 
It's possible + // in practice most elements are in this region. + y = pselect(pcmp_le(y, pset1(8.0)), y_le_eight, y_gt_eight); + return pselect(pcmp_lt(x, pset1(0.0)), pnegate(y), y); + } +}; + +template +struct bessel_i1e_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T x) { + return generic_i1e::run(x); + } +}; + +template +struct bessel_i1_retval { + typedef T type; +}; + +template ::type> +struct generic_i1 { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T& x) { + return pmul( + pexp(pabs(x)), + generic_i1e::run(x)); + } +}; + +template +struct bessel_i1_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T x) { + return generic_i1::run(x); + } +}; + +template +struct bessel_k0e_retval { + typedef T type; +}; + +template ::type> +struct generic_k0e { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T&) { + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return ScalarType(0); + } +}; + +template +struct generic_k0e { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T& x) { + /* k0ef.c + * Modified Bessel function, third kind, order zero, + * exponentially scaled + * + * + * + * SYNOPSIS: + * + * float x, y, k0ef(); + * + * y = k0ef( x ); + * + * + * + * DESCRIPTION: + * + * Returns exponentially scaled modified Bessel function + * of the third kind of order zero of the argument. + * + * + * + * ACCURACY: + * + * Relative error: + * arithmetic domain # trials peak rms + * IEEE 0, 30 30000 8.1e-7 7.8e-8 + * See k0(). 
+ * + */ + + const float A[] = {1.90451637722020886025E-9f, 2.53479107902614945675E-7f, + 2.28621210311945178607E-5f, 1.26461541144692592338E-3f, + 3.59799365153615016266E-2f, 3.44289899924628486886E-1f, + -5.35327393233902768720E-1f}; + + const float B[] = {-1.69753450938905987466E-9f, 8.57403401741422608519E-9f, + -4.66048989768794782956E-8f, 2.76681363944501510342E-7f, + -1.83175552271911948767E-6f, 1.39498137188764993662E-5f, + -1.28495495816278026384E-4f, 1.56988388573005337491E-3f, + -3.14481013119645005427E-2f, 2.44030308206595545468E0f}; + const T MAXNUM = pset1(NumTraits::infinity()); + const T two = pset1(2.0); + T x_le_two = internal::pchebevl::run( + pmadd(x, x, pset1(-2.0)), A); + x_le_two = pmadd( + generic_i0::run(x), pnegate( + plog(pmul(pset1(0.5), x))), x_le_two); + x_le_two = pmul(pexp(x), x_le_two); + T x_gt_two = pmul( + internal::pchebevl::run( + psub(pdiv(pset1(8.0), x), two), B), + prsqrt(x)); + return pselect( + pcmp_le(x, pset1(0.0)), + MAXNUM, + pselect(pcmp_le(x, two), x_le_two, x_gt_two)); + } +}; + +template +struct generic_k0e { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T& x) { + /* k0e.c + * Modified Bessel function, third kind, order zero, + * exponentially scaled + * + * + * + * SYNOPSIS: + * + * double x, y, k0e(); + * + * y = k0e( x ); + * + * + * + * DESCRIPTION: + * + * Returns exponentially scaled modified Bessel function + * of the third kind of order zero of the argument. + * + * + * + * ACCURACY: + * + * Relative error: + * arithmetic domain # trials peak rms + * IEEE 0, 30 30000 1.4e-15 1.4e-16 + * See k0(). 
+ * + */ + + const double A[] = { + 1.37446543561352307156E-16, + 4.25981614279661018399E-14, + 1.03496952576338420167E-11, + 1.90451637722020886025E-9, + 2.53479107902614945675E-7, + 2.28621210311945178607E-5, + 1.26461541144692592338E-3, + 3.59799365153615016266E-2, + 3.44289899924628486886E-1, + -5.35327393233902768720E-1}; + const double B[] = { + 5.30043377268626276149E-18, -1.64758043015242134646E-17, + 5.21039150503902756861E-17, -1.67823109680541210385E-16, + 5.51205597852431940784E-16, -1.84859337734377901440E-15, + 6.34007647740507060557E-15, -2.22751332699166985548E-14, + 8.03289077536357521100E-14, -2.98009692317273043925E-13, + 1.14034058820847496303E-12, -4.51459788337394416547E-12, + 1.85594911495471785253E-11, -7.95748924447710747776E-11, + 3.57739728140030116597E-10, -1.69753450938905987466E-9, + 8.57403401741422608519E-9, -4.66048989768794782956E-8, + 2.76681363944501510342E-7, -1.83175552271911948767E-6, + 1.39498137188764993662E-5, -1.28495495816278026384E-4, + 1.56988388573005337491E-3, -3.14481013119645005427E-2, + 2.44030308206595545468E0 + }; + const T MAXNUM = pset1(NumTraits::infinity()); + const T two = pset1(2.0); + T x_le_two = internal::pchebevl::run( + pmadd(x, x, pset1(-2.0)), A); + x_le_two = pmadd( + generic_i0::run(x), pmul( + pset1(-1.0), plog(pmul(pset1(0.5), x))), x_le_two); + x_le_two = pmul(pexp(x), x_le_two); + x_le_two = pselect(pcmp_le(x, pset1(0.0)), MAXNUM, x_le_two); + T x_gt_two = pmul( + internal::pchebevl::run( + psub(pdiv(pset1(8.0), x), two), B), + prsqrt(x)); + return pselect(pcmp_le(x, two), x_le_two, x_gt_two); + } +}; + +template +struct bessel_k0e_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T x) { + return generic_k0e::run(x); + } +}; + +template +struct bessel_k0_retval { + typedef T type; +}; + +template ::type> +struct generic_k0 { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T&) { + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + 
THIS_TYPE_IS_NOT_SUPPORTED); + return ScalarType(0); + } +}; + +template +struct generic_k0 { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T& x) { + /* k0f.c + * Modified Bessel function, third kind, order zero + * + * + * + * SYNOPSIS: + * + * float x, y, k0f(); + * + * y = k0f( x ); + * + * + * + * DESCRIPTION: + * + * Returns modified Bessel function of the third kind + * of order zero of the argument. + * + * The range is partitioned into the two intervals [0,8] and + * (8, infinity). Chebyshev polynomial expansions are employed + * in each interval. + * + * + * + * ACCURACY: + * + * Tested at 2000 random points between 0 and 8. Peak absolute + * error (relative when K0 > 1) was 1.46e-14; rms, 4.26e-15. + * Relative error: + * arithmetic domain # trials peak rms + * IEEE 0, 30 30000 7.8e-7 8.5e-8 + * + * ERROR MESSAGES: + * + * message condition value returned + * K0 domain x <= 0 MAXNUM + * + */ + + const float A[] = {1.90451637722020886025E-9f, 2.53479107902614945675E-7f, + 2.28621210311945178607E-5f, 1.26461541144692592338E-3f, + 3.59799365153615016266E-2f, 3.44289899924628486886E-1f, + -5.35327393233902768720E-1f}; + + const float B[] = {-1.69753450938905987466E-9f, 8.57403401741422608519E-9f, + -4.66048989768794782956E-8f, 2.76681363944501510342E-7f, + -1.83175552271911948767E-6f, 1.39498137188764993662E-5f, + -1.28495495816278026384E-4f, 1.56988388573005337491E-3f, + -3.14481013119645005427E-2f, 2.44030308206595545468E0f}; + const T MAXNUM = pset1(NumTraits::infinity()); + const T two = pset1(2.0); + T x_le_two = internal::pchebevl::run( + pmadd(x, x, pset1(-2.0)), A); + x_le_two = pmadd( + generic_i0::run(x), pnegate( + plog(pmul(pset1(0.5), x))), x_le_two); + x_le_two = pselect(pcmp_le(x, pset1(0.0)), MAXNUM, x_le_two); + T x_gt_two = pmul( + pmul( + pexp(pnegate(x)), + internal::pchebevl::run( + psub(pdiv(pset1(8.0), x), two), B)), + prsqrt(x)); + return pselect(pcmp_le(x, two), x_le_two, x_gt_two); + } +}; + +template +struct 
generic_k0 { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T& x) { + /* + * + * Modified Bessel function, third kind, order zero, + * exponentially scaled + * + * + * + * SYNOPSIS: + * + * double x, y, k0(); + * + * y = k0( x ); + * + * + * + * DESCRIPTION: + * + * Returns exponentially scaled modified Bessel function + * of the third kind of order zero of the argument. + * + * + * + * ACCURACY: + * + * Relative error: + * arithmetic domain # trials peak rms + * IEEE 0, 30 30000 1.4e-15 1.4e-16 + * See k0(). + * + */ + const double A[] = { + 1.37446543561352307156E-16, + 4.25981614279661018399E-14, + 1.03496952576338420167E-11, + 1.90451637722020886025E-9, + 2.53479107902614945675E-7, + 2.28621210311945178607E-5, + 1.26461541144692592338E-3, + 3.59799365153615016266E-2, + 3.44289899924628486886E-1, + -5.35327393233902768720E-1}; + const double B[] = { + 5.30043377268626276149E-18, -1.64758043015242134646E-17, + 5.21039150503902756861E-17, -1.67823109680541210385E-16, + 5.51205597852431940784E-16, -1.84859337734377901440E-15, + 6.34007647740507060557E-15, -2.22751332699166985548E-14, + 8.03289077536357521100E-14, -2.98009692317273043925E-13, + 1.14034058820847496303E-12, -4.51459788337394416547E-12, + 1.85594911495471785253E-11, -7.95748924447710747776E-11, + 3.57739728140030116597E-10, -1.69753450938905987466E-9, + 8.57403401741422608519E-9, -4.66048989768794782956E-8, + 2.76681363944501510342E-7, -1.83175552271911948767E-6, + 1.39498137188764993662E-5, -1.28495495816278026384E-4, + 1.56988388573005337491E-3, -3.14481013119645005427E-2, + 2.44030308206595545468E0 + }; + const T MAXNUM = pset1(NumTraits::infinity()); + const T two = pset1(2.0); + T x_le_two = internal::pchebevl::run( + pmadd(x, x, pset1(-2.0)), A); + x_le_two = pmadd( + generic_i0::run(x), pnegate( + plog(pmul(pset1(0.5), x))), x_le_two); + x_le_two = pselect(pcmp_le(x, pset1(0.0)), MAXNUM, x_le_two); + T x_gt_two = pmul( + pmul( + pexp(-x), + internal::pchebevl::run( + 
psub(pdiv(pset1(8.0), x), two), B)), + prsqrt(x)); + return pselect(pcmp_le(x, two), x_le_two, x_gt_two); + } +}; + +template +struct bessel_k0_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T x) { + return generic_k0::run(x); + } +}; + +template +struct bessel_k1e_retval { + typedef T type; +}; + +template ::type> +struct generic_k1e { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T&) { + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return ScalarType(0); + } +}; + +template +struct generic_k1e { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T& x) { + /* k1ef.c + * + * Modified Bessel function, third kind, order one, + * exponentially scaled + * + * + * + * SYNOPSIS: + * + * float x, y, k1ef(); + * + * y = k1ef( x ); + * + * + * + * DESCRIPTION: + * + * Returns exponentially scaled modified Bessel function + * of the third kind of order one of the argument: + * + * k1e(x) = exp(x) * k1(x). + * + * + * + * ACCURACY: + * + * Relative error: + * arithmetic domain # trials peak rms + * IEEE 0, 30 30000 4.9e-7 6.7e-8 + * See k1(). 
+ * + */ + + const float A[] = {-2.21338763073472585583E-8f, -2.43340614156596823496E-6f, + -1.73028895751305206302E-4f, -6.97572385963986435018E-3f, + -1.22611180822657148235E-1f, -3.53155960776544875667E-1f, + 1.52530022733894777053E0f}; + const float B[] = {2.01504975519703286596E-9f, -1.03457624656780970260E-8f, + 5.74108412545004946722E-8f, -3.50196060308781257119E-7f, + 2.40648494783721712015E-6f, -1.93619797416608296024E-5f, + 1.95215518471351631108E-4f, -2.85781685962277938680E-3f, + 1.03923736576817238437E-1f, 2.72062619048444266945E0f}; + const T MAXNUM = pset1(NumTraits::infinity()); + const T two = pset1(2.0); + T x_le_two = pdiv(internal::pchebevl::run( + pmadd(x, x, pset1(-2.0)), A), x); + x_le_two = pmadd( + generic_i1::run(x), plog(pmul(pset1(0.5), x)), x_le_two); + x_le_two = pmul(x_le_two, pexp(x)); + x_le_two = pselect(pcmp_le(x, pset1(0.0)), MAXNUM, x_le_two); + T x_gt_two = pmul( + internal::pchebevl::run( + psub(pdiv(pset1(8.0), x), two), B), + prsqrt(x)); + return pselect(pcmp_le(x, two), x_le_two, x_gt_two); + } +}; + +template +struct generic_k1e { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T& x) { + /* k1e.c + * + * Modified Bessel function, third kind, order one, + * exponentially scaled + * + * + * + * SYNOPSIS: + * + * double x, y, k1e(); + * + * y = k1e( x ); + * + * + * + * DESCRIPTION: + * + * Returns exponentially scaled modified Bessel function + * of the third kind of order one of the argument: + * + * k1e(x) = exp(x) * k1(x). + * + * + * + * ACCURACY: + * + * Relative error: + * arithmetic domain # trials peak rms + * IEEE 0, 30 30000 7.8e-16 1.2e-16 + * See k1(). 
+ * + */ + const double A[] = {-7.02386347938628759343E-18, -2.42744985051936593393E-15, + -6.66690169419932900609E-13, -1.41148839263352776110E-10, + -2.21338763073472585583E-8, -2.43340614156596823496E-6, + -1.73028895751305206302E-4, -6.97572385963986435018E-3, + -1.22611180822657148235E-1, -3.53155960776544875667E-1, + 1.52530022733894777053E0}; + const double B[] = {-5.75674448366501715755E-18, 1.79405087314755922667E-17, + -5.68946255844285935196E-17, 1.83809354436663880070E-16, + -6.05704724837331885336E-16, 2.03870316562433424052E-15, + -7.01983709041831346144E-15, 2.47715442448130437068E-14, + -8.97670518232499435011E-14, 3.34841966607842919884E-13, + -1.28917396095102890680E-12, 5.13963967348173025100E-12, + -2.12996783842756842877E-11, 9.21831518760500529508E-11, + -4.19035475934189648750E-10, 2.01504975519703286596E-9, + -1.03457624656780970260E-8, 5.74108412545004946722E-8, + -3.50196060308781257119E-7, 2.40648494783721712015E-6, + -1.93619797416608296024E-5, 1.95215518471351631108E-4, + -2.85781685962277938680E-3, 1.03923736576817238437E-1, + 2.72062619048444266945E0}; + const T MAXNUM = pset1(NumTraits::infinity()); + const T two = pset1(2.0); + T x_le_two = pdiv(internal::pchebevl::run( + pmadd(x, x, pset1(-2.0)), A), x); + x_le_two = pmadd( + generic_i1::run(x), plog(pmul(pset1(0.5), x)), x_le_two); + x_le_two = pmul(x_le_two, pexp(x)); + x_le_two = pselect(pcmp_le(x, pset1(0.0)), MAXNUM, x_le_two); + T x_gt_two = pmul( + internal::pchebevl::run( + psub(pdiv(pset1(8.0), x), two), B), + prsqrt(x)); + return pselect(pcmp_le(x, two), x_le_two, x_gt_two); + } +}; + +template +struct bessel_k1e_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T x) { + return generic_k1e::run(x); + } +}; + +template +struct bessel_k1_retval { + typedef T type; +}; + +template ::type> +struct generic_k1 { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T&) { + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + 
THIS_TYPE_IS_NOT_SUPPORTED); + return ScalarType(0); + } +}; + +template +struct generic_k1 { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T& x) { + /* k1f.c + * Modified Bessel function, third kind, order one + * + * + * + * SYNOPSIS: + * + * float x, y, k1f(); + * + * y = k1f( x ); + * + * + * + * DESCRIPTION: + * + * Computes the modified Bessel function of the third kind + * of order one of the argument. + * + * The range is partitioned into the two intervals [0,2] and + * (2, infinity). Chebyshev polynomial expansions are employed + * in each interval. + * + * + * + * ACCURACY: + * + * Relative error: + * arithmetic domain # trials peak rms + * IEEE 0, 30 30000 4.6e-7 7.6e-8 + * + * ERROR MESSAGES: + * + * message condition value returned + * k1 domain x <= 0 MAXNUM + * + */ + + const float A[] = {-2.21338763073472585583E-8f, -2.43340614156596823496E-6f, + -1.73028895751305206302E-4f, -6.97572385963986435018E-3f, + -1.22611180822657148235E-1f, -3.53155960776544875667E-1f, + 1.52530022733894777053E0f}; + const float B[] = {2.01504975519703286596E-9f, -1.03457624656780970260E-8f, + 5.74108412545004946722E-8f, -3.50196060308781257119E-7f, + 2.40648494783721712015E-6f, -1.93619797416608296024E-5f, + 1.95215518471351631108E-4f, -2.85781685962277938680E-3f, + 1.03923736576817238437E-1f, 2.72062619048444266945E0f}; + const T MAXNUM = pset1(NumTraits::infinity()); + const T two = pset1(2.0); + T x_le_two = pdiv(internal::pchebevl::run( + pmadd(x, x, pset1(-2.0)), A), x); + x_le_two = pmadd( + generic_i1::run(x), plog(pmul(pset1(0.5), x)), x_le_two); + x_le_two = pselect(pcmp_le(x, pset1(0.0)), MAXNUM, x_le_two); + T x_gt_two = pmul( + pexp(pnegate(x)), + pmul( + internal::pchebevl::run( + psub(pdiv(pset1(8.0), x), two), B), + prsqrt(x))); + return pselect(pcmp_le(x, two), x_le_two, x_gt_two); + } +}; + +template +struct generic_k1 { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T& x) { + /* k1.c + * Modified Bessel function, third kind, 
order one + * + * + * + * SYNOPSIS: + * + * float x, y, k1f(); + * + * y = k1f( x ); + * + * + * + * DESCRIPTION: + * + * Computes the modified Bessel function of the third kind + * of order one of the argument. + * + * The range is partitioned into the two intervals [0,2] and + * (2, infinity). Chebyshev polynomial expansions are employed + * in each interval. + * + * + * + * ACCURACY: + * + * Relative error: + * arithmetic domain # trials peak rms + * IEEE 0, 30 30000 4.6e-7 7.6e-8 + * + * ERROR MESSAGES: + * + * message condition value returned + * k1 domain x <= 0 MAXNUM + * + */ + const double A[] = {-7.02386347938628759343E-18, -2.42744985051936593393E-15, + -6.66690169419932900609E-13, -1.41148839263352776110E-10, + -2.21338763073472585583E-8, -2.43340614156596823496E-6, + -1.73028895751305206302E-4, -6.97572385963986435018E-3, + -1.22611180822657148235E-1, -3.53155960776544875667E-1, + 1.52530022733894777053E0}; + const double B[] = {-5.75674448366501715755E-18, 1.79405087314755922667E-17, + -5.68946255844285935196E-17, 1.83809354436663880070E-16, + -6.05704724837331885336E-16, 2.03870316562433424052E-15, + -7.01983709041831346144E-15, 2.47715442448130437068E-14, + -8.97670518232499435011E-14, 3.34841966607842919884E-13, + -1.28917396095102890680E-12, 5.13963967348173025100E-12, + -2.12996783842756842877E-11, 9.21831518760500529508E-11, + -4.19035475934189648750E-10, 2.01504975519703286596E-9, + -1.03457624656780970260E-8, 5.74108412545004946722E-8, + -3.50196060308781257119E-7, 2.40648494783721712015E-6, + -1.93619797416608296024E-5, 1.95215518471351631108E-4, + -2.85781685962277938680E-3, 1.03923736576817238437E-1, + 2.72062619048444266945E0}; + const T MAXNUM = pset1(NumTraits::infinity()); + const T two = pset1(2.0); + T x_le_two = pdiv(internal::pchebevl::run( + pmadd(x, x, pset1(-2.0)), A), x); + x_le_two = pmadd( + generic_i1::run(x), plog(pmul(pset1(0.5), x)), x_le_two); + x_le_two = pselect(pcmp_le(x, pset1(0.0)), MAXNUM, x_le_two); + T x_gt_two = 
pmul( + pexp(-x), + pmul( + internal::pchebevl::run( + psub(pdiv(pset1(8.0), x), two), B), + prsqrt(x))); + return pselect(pcmp_le(x, two), x_le_two, x_gt_two); + } +}; + +template +struct bessel_k1_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T x) { + return generic_k1::run(x); + } +}; + +template +struct bessel_j0_retval { + typedef T type; +}; + +template ::type> +struct generic_j0 { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T&) { + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return ScalarType(0); + } +}; + +template +struct generic_j0 { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T& x) { + /* j0f.c + * Bessel function of order zero + * + * + * + * SYNOPSIS: + * + * float x, y, j0f(); + * + * y = j0f( x ); + * + * + * + * DESCRIPTION: + * + * Returns Bessel function of order zero of the argument. + * + * The domain is divided into the intervals [0, 2] and + * (2, infinity). In the first interval the following polynomial + * approximation is used: + * + * + * 2 2 2 + * (w - r ) (w - r ) (w - r ) P(w) + * 1 2 3 + * + * 2 + * where w = x and the three r's are zeros of the function. + * + * In the second interval, the modulus and phase are approximated + * by polynomials of the form Modulus(x) = sqrt(1/x) Q(1/x) + * and Phase(x) = x + 1/x R(1/x^2) - pi/4. The function is + * + * j0(x) = Modulus(x) cos( Phase(x) ). 
+ * + * + * + * ACCURACY: + * + * Absolute error: + * arithmetic domain # trials peak rms + * IEEE 0, 2 100000 1.3e-7 3.6e-8 + * IEEE 2, 32 100000 1.9e-7 5.4e-8 + * + */ + + const float JP[] = {-6.068350350393235E-008f, 6.388945720783375E-006f, + -3.969646342510940E-004f, 1.332913422519003E-002f, + -1.729150680240724E-001f}; + const float MO[] = {-6.838999669318810E-002f, 1.864949361379502E-001f, + -2.145007480346739E-001f, 1.197549369473540E-001f, + -3.560281861530129E-003f, -4.969382655296620E-002f, + -3.355424622293709E-006f, 7.978845717621440E-001f}; + const float PH[] = {3.242077816988247E+001f, -3.630592630518434E+001f, + 1.756221482109099E+001f, -4.974978466280903E+000f, + 1.001973420681837E+000f, -1.939906941791308E-001f, + 6.490598792654666E-002f, -1.249992184872738E-001f}; + const T DR1 = pset1(5.78318596294678452118f); + const T NEG_PIO4F = pset1(-0.7853981633974483096f); /* -pi / 4 */ + T y = pabs(x); + T z = pmul(y, y); + T y_le_two = pselect( + pcmp_lt(y, pset1(1.0e-3f)), + pmadd(z, pset1(-0.25f), pset1(1.0f)), + pmul(psub(z, DR1), internal::ppolevl::run(z, JP))); + T q = pdiv(pset1(1.0f), y); + T w = prsqrt(y); + T p = pmul(w, internal::ppolevl::run(q, MO)); + w = pmul(q, q); + T yn = pmadd(q, internal::ppolevl::run(w, PH), NEG_PIO4F); + T y_gt_two = pmul(p, pcos(padd(yn, y))); + return pselect(pcmp_le(y, pset1(2.0)), y_le_two, y_gt_two); + } +}; + +template +struct generic_j0 { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T& x) { + /* j0.c + * Bessel function of order zero + * + * + * + * SYNOPSIS: + * + * double x, y, j0(); + * + * y = j0( x ); + * + * + * + * DESCRIPTION: + * + * Returns Bessel function of order zero of the argument. + * + * The domain is divided into the intervals [0, 5] and + * (5, infinity). In the first interval the following rational + * approximation is used: + * + * + * 2 2 + * (w - r ) (w - r ) P (w) / Q (w) + * 1 2 3 8 + * + * 2 + * where w = x and the two r's are zeros of the function. 
+ * + * In the second interval, the Hankel asymptotic expansion + * is employed with two rational functions of degree 6/6 + * and 7/7. + * + * + * + * ACCURACY: + * + * Absolute error: + * arithmetic domain # trials peak rms + * DEC 0, 30 10000 4.4e-17 6.3e-18 + * IEEE 0, 30 60000 4.2e-16 1.1e-16 + * + */ + const double PP[] = {7.96936729297347051624E-4, 8.28352392107440799803E-2, + 1.23953371646414299388E0, 5.44725003058768775090E0, + 8.74716500199817011941E0, 5.30324038235394892183E0, + 9.99999999999999997821E-1}; + const double PQ[] = {9.24408810558863637013E-4, 8.56288474354474431428E-2, + 1.25352743901058953537E0, 5.47097740330417105182E0, + 8.76190883237069594232E0, 5.30605288235394617618E0, + 1.00000000000000000218E0}; + const double QP[] = {-1.13663838898469149931E-2, -1.28252718670509318512E0, + -1.95539544257735972385E1, -9.32060152123768231369E1, + -1.77681167980488050595E2, -1.47077505154951170175E2, + -5.14105326766599330220E1, -6.05014350600728481186E0}; + const double QQ[] = {1.00000000000000000000E0, 6.43178256118178023184E1, + 8.56430025976980587198E2, 3.88240183605401609683E3, + 7.24046774195652478189E3, 5.93072701187316984827E3, + 2.06209331660327847417E3, 2.42005740240291393179E2}; + const double RP[] = {-4.79443220978201773821E9, 1.95617491946556577543E12, + -2.49248344360967716204E14, 9.70862251047306323952E15}; + const double RQ[] = {1.00000000000000000000E0, 4.99563147152651017219E2, + 1.73785401676374683123E5, 4.84409658339962045305E7, + 1.11855537045356834862E10, 2.11277520115489217587E12, + 3.10518229857422583814E14, 3.18121955943204943306E16, + 1.71086294081043136091E18}; + const T DR1 = pset1(5.78318596294678452118E0); + const T DR2 = pset1(3.04712623436620863991E1); + const T SQ2OPI = pset1(7.9788456080286535587989E-1); /* sqrt(2 / pi) */ + const T NEG_PIO4 = pset1(-0.7853981633974483096); /* pi / 4 */ + + T y = pabs(x); + T z = pmul(y, y); + T y_le_five = pselect( + pcmp_lt(y, pset1(1.0e-5)), + pmadd(z, pset1(-0.25), pset1(1.0)), + 
pmul(pmul(psub(z, DR1), psub(z, DR2)), + pdiv(internal::ppolevl::run(z, RP), + internal::ppolevl::run(z, RQ)))); + T s = pdiv(pset1(25.0), z); + T p = pdiv( + internal::ppolevl::run(s, PP), + internal::ppolevl::run(s, PQ)); + T q = pdiv( + internal::ppolevl::run(s, QP), + internal::ppolevl::run(s, QQ)); + T yn = padd(y, NEG_PIO4); + T w = pdiv(pset1(-5.0), y); + p = pmadd(p, pcos(yn), pmul(w, pmul(q, psin(yn)))); + T y_gt_five = pmul(p, pmul(SQ2OPI, prsqrt(y))); + return pselect(pcmp_le(y, pset1(5.0)), y_le_five, y_gt_five); + } +}; + +template +struct bessel_j0_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T x) { + return generic_j0::run(x); + } +}; + +template +struct bessel_y0_retval { + typedef T type; +}; + +template ::type> +struct generic_y0 { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T&) { + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return ScalarType(0); + } +}; + +template +struct generic_y0 { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T& x) { + /* j0f.c + * Bessel function of the second kind, order zero + * + * + * + * SYNOPSIS: + * + * float x, y, y0f(); + * + * y = y0f( x ); + * + * + * + * DESCRIPTION: + * + * Returns Bessel function of the second kind, of order + * zero, of the argument. + * + * The domain is divided into the intervals [0, 2] and + * (2, infinity). In the first interval a rational approximation + * R(x) is employed to compute + * + * 2 2 2 + * y0(x) = (w - r ) (w - r ) (w - r ) R(x) + 2/pi ln(x) j0(x). + * 1 2 3 + * + * Thus a call to j0() is required. The three zeros are removed + * from R(x) to improve its numerical stability. + * + * In the second interval, the modulus and phase are approximated + * by polynomials of the form Modulus(x) = sqrt(1/x) Q(1/x) + * and Phase(x) = x + 1/x S(1/x^2) - pi/4. Then the function is + * + * y0(x) = Modulus(x) sin( Phase(x) ). 
+ * + * + * + * + * ACCURACY: + * + * Absolute error, when y0(x) < 1; else relative error: + * + * arithmetic domain # trials peak rms + * IEEE 0, 2 100000 2.4e-7 3.4e-8 + * IEEE 2, 32 100000 1.8e-7 5.3e-8 + * + */ + + const float YP[] = {9.454583683980369E-008f, -9.413212653797057E-006f, + 5.344486707214273E-004f, -1.584289289821316E-002f, + 1.707584643733568E-001f}; + const float MO[] = {-6.838999669318810E-002f, 1.864949361379502E-001f, + -2.145007480346739E-001f, 1.197549369473540E-001f, + -3.560281861530129E-003f, -4.969382655296620E-002f, + -3.355424622293709E-006f, 7.978845717621440E-001f}; + const float PH[] = {3.242077816988247E+001f, -3.630592630518434E+001f, + 1.756221482109099E+001f, -4.974978466280903E+000f, + 1.001973420681837E+000f, -1.939906941791308E-001f, + 6.490598792654666E-002f, -1.249992184872738E-001f}; + const T YZ1 = pset1(0.43221455686510834878f); + const T TWOOPI = pset1(0.636619772367581343075535f); /* 2 / pi */ + const T NEG_PIO4F = pset1(-0.7853981633974483096f); /* -pi / 4 */ + const T NEG_MAXNUM = pset1(-NumTraits::infinity()); + T z = pmul(x, x); + T x_le_two = pmul(TWOOPI, pmul(plog(x), generic_j0::run(x))); + x_le_two = pmadd( + psub(z, YZ1), internal::ppolevl::run(z, YP), x_le_two); + x_le_two = pselect(pcmp_le(x, pset1(0.0)), NEG_MAXNUM, x_le_two); + T q = pdiv(pset1(1.0), x); + T w = prsqrt(x); + T p = pmul(w, internal::ppolevl::run(q, MO)); + T u = pmul(q, q); + T xn = pmadd(q, internal::ppolevl::run(u, PH), NEG_PIO4F); + T x_gt_two = pmul(p, psin(padd(xn, x))); + return pselect(pcmp_le(x, pset1(2.0)), x_le_two, x_gt_two); + } +}; + +template +struct generic_y0 { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T& x) { + /* j0.c + * Bessel function of the second kind, order zero + * + * + * + * SYNOPSIS: + * + * double x, y, y0(); + * + * y = y0( x ); + * + * + * + * DESCRIPTION: + * + * Returns Bessel function of the second kind, of order + * zero, of the argument. 
+ * + * The domain is divided into the intervals [0, 5] and + * (5, infinity). In the first interval a rational approximation + * R(x) is employed to compute + * y0(x) = R(x) + 2 * log(x) * j0(x) / PI. + * Thus a call to j0() is required. + * + * In the second interval, the Hankel asymptotic expansion + * is employed with two rational functions of degree 6/6 + * and 7/7. + * + * + * + * ACCURACY: + * + * Absolute error, when y0(x) < 1; else relative error: + * + * arithmetic domain # trials peak rms + * DEC 0, 30 9400 7.0e-17 7.9e-18 + * IEEE 0, 30 30000 1.3e-15 1.6e-16 + * + */ + const double PP[] = {7.96936729297347051624E-4, 8.28352392107440799803E-2, + 1.23953371646414299388E0, 5.44725003058768775090E0, + 8.74716500199817011941E0, 5.30324038235394892183E0, + 9.99999999999999997821E-1}; + const double PQ[] = {9.24408810558863637013E-4, 8.56288474354474431428E-2, + 1.25352743901058953537E0, 5.47097740330417105182E0, + 8.76190883237069594232E0, 5.30605288235394617618E0, + 1.00000000000000000218E0}; + const double QP[] = {-1.13663838898469149931E-2, -1.28252718670509318512E0, + -1.95539544257735972385E1, -9.32060152123768231369E1, + -1.77681167980488050595E2, -1.47077505154951170175E2, + -5.14105326766599330220E1, -6.05014350600728481186E0}; + const double QQ[] = {1.00000000000000000000E0, 6.43178256118178023184E1, + 8.56430025976980587198E2, 3.88240183605401609683E3, + 7.24046774195652478189E3, 5.93072701187316984827E3, + 2.06209331660327847417E3, 2.42005740240291393179E2}; + const double YP[] = {1.55924367855235737965E4, -1.46639295903971606143E7, + 5.43526477051876500413E9, -9.82136065717911466409E11, + 8.75906394395366999549E13, -3.46628303384729719441E15, + 4.42733268572569800351E16, -1.84950800436986690637E16}; + const double YQ[] = {1.00000000000000000000E0, 1.04128353664259848412E3, + 6.26107330137134956842E5, 2.68919633393814121987E8, + 8.64002487103935000337E10, 2.02979612750105546709E13, + 3.17157752842975028269E15, 2.50596256172653059228E17}; + const T 
SQ2OPI = pset1(7.9788456080286535587989E-1); /* sqrt(2 / pi) */ + const T TWOOPI = pset1(0.636619772367581343075535); /* 2 / pi */ + const T NEG_PIO4 = pset1(-0.7853981633974483096); /* -pi / 4 */ + const T NEG_MAXNUM = pset1(-NumTraits::infinity()); + + T z = pmul(x, x); + T x_le_five = pdiv(internal::ppolevl::run(z, YP), + internal::ppolevl::run(z, YQ)); + x_le_five = pmadd( + pmul(TWOOPI, plog(x)), generic_j0::run(x), x_le_five); + x_le_five = pselect(pcmp_le(x, pset1(0.0)), NEG_MAXNUM, x_le_five); + T s = pdiv(pset1(25.0), z); + T p = pdiv( + internal::ppolevl::run(s, PP), + internal::ppolevl::run(s, PQ)); + T q = pdiv( + internal::ppolevl::run(s, QP), + internal::ppolevl::run(s, QQ)); + T xn = padd(x, NEG_PIO4); + T w = pdiv(pset1(5.0), x); + p = pmadd(p, psin(xn), pmul(w, pmul(q, pcos(xn)))); + T x_gt_five = pmul(p, pmul(SQ2OPI, prsqrt(x))); + return pselect(pcmp_le(x, pset1(5.0)), x_le_five, x_gt_five); + } +}; + +template +struct bessel_y0_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T x) { + return generic_y0::run(x); + } +}; + +template +struct bessel_j1_retval { + typedef T type; +}; + +template ::type> +struct generic_j1 { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T&) { + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return ScalarType(0); + } +}; + +template +struct generic_j1 { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T& x) { + /* j1f.c + * Bessel function of order one + * + * + * + * SYNOPSIS: + * + * float x, y, j1f(); + * + * y = j1f( x ); + * + * + * + * DESCRIPTION: + * + * Returns Bessel function of order one of the argument. + * + * The domain is divided into the intervals [0, 2] and + * (2, infinity). In the first interval a polynomial approximation + * 2 + * (w - r ) x P(w) + * 1 + * 2 + * is used, where w = x and r is the first zero of the function. 
+ * + * In the second interval, the modulus and phase are approximated + * by polynomials of the form Modulus(x) = sqrt(1/x) Q(1/x) + * and Phase(x) = x + 1/x R(1/x^2) - 3pi/4. The function is + * + * j0(x) = Modulus(x) cos( Phase(x) ). + * + * + * + * ACCURACY: + * + * Absolute error: + * arithmetic domain # trials peak rms + * IEEE 0, 2 100000 1.2e-7 2.5e-8 + * IEEE 2, 32 100000 2.0e-7 5.3e-8 + * + * + */ + + const float JP[] = {-4.878788132172128E-009f, 6.009061827883699E-007f, + -4.541343896997497E-005f, 1.937383947804541E-003f, + -3.405537384615824E-002f}; + const float MO1[] = {6.913942741265801E-002f, -2.284801500053359E-001f, + 3.138238455499697E-001f, -2.102302420403875E-001f, + 5.435364690523026E-003f, 1.493389585089498E-001f, + 4.976029650847191E-006f, 7.978845453073848E-001f}; + const float PH1[] = {-4.497014141919556E+001f, 5.073465654089319E+001f, + -2.485774108720340E+001f, 7.222973196770240E+000f, + -1.544842782180211E+000f, 3.503787691653334E-001f, + -1.637986776941202E-001f, 3.749989509080821E-001f}; + const T Z1 = pset1(1.46819706421238932572E1f); + const T NEG_THPIO4F = pset1(-2.35619449019234492885f); /* -3*pi/4 */ + + T y = pabs(x); + T z = pmul(y, y); + T y_le_two = pmul( + psub(z, Z1), + pmul(x, internal::ppolevl::run(z, JP))); + T q = pdiv(pset1(1.0f), y); + T w = prsqrt(y); + T p = pmul(w, internal::ppolevl::run(q, MO1)); + w = pmul(q, q); + T yn = pmadd(q, internal::ppolevl::run(w, PH1), NEG_THPIO4F); + T y_gt_two = pmul(p, pcos(padd(yn, y))); + // j1 is an odd function. This implementation differs from cephes to + // take this fact in to account. Cephes returns -j1(x) for y > 2 range. 
+ y_gt_two = pselect( + pcmp_lt(x, pset1(0.0f)), pnegate(y_gt_two), y_gt_two); + return pselect(pcmp_le(y, pset1(2.0f)), y_le_two, y_gt_two); + } +}; + +template +struct generic_j1 { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T& x) { + /* j1.c + * Bessel function of order one + * + * + * + * SYNOPSIS: + * + * double x, y, j1(); + * + * y = j1( x ); + * + * + * + * DESCRIPTION: + * + * Returns Bessel function of order one of the argument. + * + * The domain is divided into the intervals [0, 8] and + * (8, infinity). In the first interval a 24 term Chebyshev + * expansion is used. In the second, the asymptotic + * trigonometric representation is employed using two + * rational functions of degree 5/5. + * + * + * + * ACCURACY: + * + * Absolute error: + * arithmetic domain # trials peak rms + * DEC 0, 30 10000 4.0e-17 1.1e-17 + * IEEE 0, 30 30000 2.6e-16 1.1e-16 + * + */ + const double PP[] = {7.62125616208173112003E-4, 7.31397056940917570436E-2, + 1.12719608129684925192E0, 5.11207951146807644818E0, + 8.42404590141772420927E0, 5.21451598682361504063E0, + 1.00000000000000000254E0}; + const double PQ[] = {5.71323128072548699714E-4, 6.88455908754495404082E-2, + 1.10514232634061696926E0, 5.07386386128601488557E0, + 8.39985554327604159757E0, 5.20982848682361821619E0, + 9.99999999999999997461E-1}; + const double QP[] = {5.10862594750176621635E-2, 4.98213872951233449420E0, + 7.58238284132545283818E1, 3.66779609360150777800E2, + 7.10856304998926107277E2, 5.97489612400613639965E2, + 2.11688757100572135698E2, 2.52070205858023719784E1}; + const double QQ[] = {1.00000000000000000000E0, 7.42373277035675149943E1, + 1.05644886038262816351E3, 4.98641058337653607651E3, + 9.56231892404756170795E3, 7.99704160447350683650E3, + 2.82619278517639096600E3, 3.36093607810698293419E2}; + const double RP[] = {-8.99971225705559398224E8, 4.52228297998194034323E11, + -7.27494245221818276015E13, 3.68295732863852883286E15}; + const double RQ[] = {1.00000000000000000000E0, 
6.20836478118054335476E2, + 2.56987256757748830383E5, 8.35146791431949253037E7, + 2.21511595479792499675E10, 4.74914122079991414898E12, + 7.84369607876235854894E14, 8.95222336184627338078E16, + 5.32278620332680085395E18}; + const T Z1 = pset1(1.46819706421238932572E1); + const T Z2 = pset1(4.92184563216946036703E1); + const T NEG_THPIO4 = pset1(-2.35619449019234492885); /* -3*pi/4 */ + const T SQ2OPI = pset1(7.9788456080286535587989E-1); /* sqrt(2 / pi) */ + T y = pabs(x); + T z = pmul(y, y); + T y_le_five = pdiv(internal::ppolevl::run(z, RP), + internal::ppolevl::run(z, RQ)); + y_le_five = pmul(pmul(pmul(y_le_five, x), psub(z, Z1)), psub(z, Z2)); + T s = pdiv(pset1(25.0), z); + T p = pdiv( + internal::ppolevl::run(s, PP), + internal::ppolevl::run(s, PQ)); + T q = pdiv( + internal::ppolevl::run(s, QP), + internal::ppolevl::run(s, QQ)); + T yn = padd(y, NEG_THPIO4); + T w = pdiv(pset1(-5.0), y); + p = pmadd(p, pcos(yn), pmul(w, pmul(q, psin(yn)))); + T y_gt_five = pmul(p, pmul(SQ2OPI, prsqrt(y))); + // j1 is an odd function. This implementation differs from cephes to + // take this fact in to account. Cephes returns -j1(x) for y > 5 range. 
+ y_gt_five = pselect( + pcmp_lt(x, pset1(0.0)), pnegate(y_gt_five), y_gt_five); + return pselect(pcmp_le(y, pset1(5.0)), y_le_five, y_gt_five); + } +}; + +template +struct bessel_j1_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T x) { + return generic_j1::run(x); + } +}; + +template +struct bessel_y1_retval { + typedef T type; +}; + +template ::type> +struct generic_y1 { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T&) { + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return ScalarType(0); + } +}; + +template +struct generic_y1 { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T& x) { + /* j1f.c + * Bessel function of second kind of order one + * + * + * + * SYNOPSIS: + * + * float x, y, y1f(); + * + * y = y1f( x ); + * + * + * + * DESCRIPTION: + * + * Returns Bessel function of the second kind of order one + * of the argument. + * + * The domain is divided into the intervals [0, 2] and + * (2, infinity). In the first interval a rational approximation + * R(x) is employed to compute + * + * 2 + * y1(x) = (w - r ) x R(x^2) + 2/pi (ln(x) j1(x) - 1/x) . + * 1 + * + * Thus a call to j1() is required. + * + * In the second interval, the modulus and phase are approximated + * by polynomials of the form Modulus(x) = sqrt(1/x) Q(1/x) + * and Phase(x) = x + 1/x S(1/x^2) - 3pi/4. Then the function is + * + * y1(x) = Modulus(x) sin( Phase(x) ). + * + * + * + * + * ACCURACY: + * + * Absolute error: + * arithmetic domain # trials peak rms + * IEEE 0, 2 100000 2.2e-7 4.6e-8 + * IEEE 2, 32 100000 1.9e-7 5.3e-8 + * + * (error criterion relative when |y1| > 1). 
+ * + */ + + const float YP[] = {8.061978323326852E-009f, -9.496460629917016E-007f, + 6.719543806674249E-005f, -2.641785726447862E-003f, + 4.202369946500099E-002f}; + const float MO1[] = {6.913942741265801E-002f, -2.284801500053359E-001f, + 3.138238455499697E-001f, -2.102302420403875E-001f, + 5.435364690523026E-003f, 1.493389585089498E-001f, + 4.976029650847191E-006f, 7.978845453073848E-001f}; + const float PH1[] = {-4.497014141919556E+001f, 5.073465654089319E+001f, + -2.485774108720340E+001f, 7.222973196770240E+000f, + -1.544842782180211E+000f, 3.503787691653334E-001f, + -1.637986776941202E-001f, 3.749989509080821E-001f}; + const T YO1 = pset1(4.66539330185668857532f); + const T NEG_THPIO4F = pset1(-2.35619449019234492885f); /* -3*pi/4 */ + const T TWOOPI = pset1(0.636619772367581343075535f); /* 2/pi */ + const T NEG_MAXNUM = pset1(-NumTraits::infinity()); + + T z = pmul(x, x); + T x_le_two = pmul(psub(z, YO1), internal::ppolevl::run(z, YP)); + x_le_two = pmadd( + x_le_two, x, + pmul(TWOOPI, pmadd( + generic_j1::run(x), plog(x), + pdiv(pset1(-1.0f), x)))); + x_le_two = pselect(pcmp_lt(x, pset1(0.0f)), NEG_MAXNUM, x_le_two); + + T q = pdiv(pset1(1.0), x); + T w = prsqrt(x); + T p = pmul(w, internal::ppolevl::run(q, MO1)); + w = pmul(q, q); + T xn = pmadd(q, internal::ppolevl::run(w, PH1), NEG_THPIO4F); + T x_gt_two = pmul(p, psin(padd(xn, x))); + return pselect(pcmp_le(x, pset1(2.0)), x_le_two, x_gt_two); + } +}; + +template +struct generic_y1 { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T& x) { + /* j1.c + * Bessel function of second kind of order one + * + * + * + * SYNOPSIS: + * + * double x, y, y1(); + * + * y = y1( x ); + * + * + * + * DESCRIPTION: + * + * Returns Bessel function of the second kind of order one + * of the argument. + * + * The domain is divided into the intervals [0, 8] and + * (8, infinity). In the first interval a 25 term Chebyshev + * expansion is used, and a call to j1() is required. 
+ * In the second, the asymptotic trigonometric representation + * is employed using two rational functions of degree 5/5. + * + * + * + * ACCURACY: + * + * Absolute error: + * arithmetic domain # trials peak rms + * DEC 0, 30 10000 8.6e-17 1.3e-17 + * IEEE 0, 30 30000 1.0e-15 1.3e-16 + * + * (error criterion relative when |y1| > 1). + * + */ + const double PP[] = {7.62125616208173112003E-4, 7.31397056940917570436E-2, + 1.12719608129684925192E0, 5.11207951146807644818E0, + 8.42404590141772420927E0, 5.21451598682361504063E0, + 1.00000000000000000254E0}; + const double PQ[] = {5.71323128072548699714E-4, 6.88455908754495404082E-2, + 1.10514232634061696926E0, 5.07386386128601488557E0, + 8.39985554327604159757E0, 5.20982848682361821619E0, + 9.99999999999999997461E-1}; + const double QP[] = {5.10862594750176621635E-2, 4.98213872951233449420E0, + 7.58238284132545283818E1, 3.66779609360150777800E2, + 7.10856304998926107277E2, 5.97489612400613639965E2, + 2.11688757100572135698E2, 2.52070205858023719784E1}; + const double QQ[] = {1.00000000000000000000E0, 7.42373277035675149943E1, + 1.05644886038262816351E3, 4.98641058337653607651E3, + 9.56231892404756170795E3, 7.99704160447350683650E3, + 2.82619278517639096600E3, 3.36093607810698293419E2}; + const double YP[] = {1.26320474790178026440E9, -6.47355876379160291031E11, + 1.14509511541823727583E14, -8.12770255501325109621E15, + 2.02439475713594898196E17, -7.78877196265950026825E17}; + const double YQ[] = {1.00000000000000000000E0, 5.94301592346128195359E2, + 2.35564092943068577943E5, 7.34811944459721705660E7, + 1.87601316108706159478E10, 3.88231277496238566008E12, + 6.20557727146953693363E14, 6.87141087355300489866E16, + 3.97270608116560655612E18}; + const T SQ2OPI = pset1(.79788456080286535588); + const T NEG_THPIO4 = pset1(-2.35619449019234492885); /* -3*pi/4 */ + const T TWOOPI = pset1(0.636619772367581343075535); /* 2/pi */ + const T NEG_MAXNUM = pset1(-NumTraits::infinity()); + + T z = pmul(x, x); + T x_le_five = 
pdiv(internal::ppolevl::run(z, YP), + internal::ppolevl::run(z, YQ)); + x_le_five = pmadd( + x_le_five, x, pmul( + TWOOPI, pmadd(generic_j1::run(x), plog(x), + pdiv(pset1(-1.0), x)))); + + x_le_five = pselect(pcmp_le(x, pset1(0.0)), NEG_MAXNUM, x_le_five); + T s = pdiv(pset1(25.0), z); + T p = pdiv( + internal::ppolevl::run(s, PP), + internal::ppolevl::run(s, PQ)); + T q = pdiv( + internal::ppolevl::run(s, QP), + internal::ppolevl::run(s, QQ)); + T xn = padd(x, NEG_THPIO4); + T w = pdiv(pset1(5.0), x); + p = pmadd(p, psin(xn), pmul(w, pmul(q, pcos(xn)))); + T x_gt_five = pmul(p, pmul(SQ2OPI, prsqrt(x))); + return pselect(pcmp_le(x, pset1(5.0)), x_le_five, x_gt_five); + } +}; + +template +struct bessel_y1_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T x) { + return generic_y1::run(x); + } +}; + +} // end namespace internal + +namespace numext { + +template +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_i0, Scalar) + bessel_i0(const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(bessel_i0, Scalar)::run(x); +} + +template +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_i0e, Scalar) + bessel_i0e(const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(bessel_i0e, Scalar)::run(x); +} + +template +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_i1, Scalar) + bessel_i1(const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(bessel_i1, Scalar)::run(x); +} + +template +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_i1e, Scalar) + bessel_i1e(const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(bessel_i1e, Scalar)::run(x); +} + +template +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_k0, Scalar) + bessel_k0(const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(bessel_k0, Scalar)::run(x); +} + +template +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_k0e, Scalar) + bessel_k0e(const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(bessel_k0e, Scalar)::run(x); +} + +template +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_k1, Scalar) + 
bessel_k1(const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(bessel_k1, Scalar)::run(x); +} + +template +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_k1e, Scalar) + bessel_k1e(const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(bessel_k1e, Scalar)::run(x); +} + +template +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_j0, Scalar) + bessel_j0(const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(bessel_j0, Scalar)::run(x); +} + +template +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_y0, Scalar) + bessel_y0(const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(bessel_y0, Scalar)::run(x); +} + +template +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_j1, Scalar) + bessel_j1(const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(bessel_j1, Scalar)::run(x); +} + +template +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_y1, Scalar) + bessel_y1(const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(bessel_y1, Scalar)::run(x); +} + +} // end namespace numext + +} // end namespace Eigen + +#endif // EIGEN_BESSEL_FUNCTIONS_H diff --git a/src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsPacketMath.h b/src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsPacketMath.h new file mode 100644 index 0000000..943d10f --- /dev/null +++ b/src/EigenUnsupported/src/SpecialFunctions/BesselFunctionsPacketMath.h @@ -0,0 +1,118 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_BESSELFUNCTIONS_PACKETMATH_H +#define EIGEN_BESSELFUNCTIONS_PACKETMATH_H + +namespace Eigen { + +namespace internal { + +/** \internal \returns the modified Bessel function of + * order zero i0(\a x) (coeff-wise) */ +template +EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet pbessel_i0(const Packet& x) { + return numext::bessel_i0(x); +} + +/** \internal \returns the exponentially scaled modified Bessel function of + * order zero i0e(\a x) (coeff-wise) */ +template +EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet pbessel_i0e(const Packet& x) { + return numext::bessel_i0e(x); +} + +/** \internal \returns the modified Bessel function of + * order one i1(\a x) (coeff-wise) */ +template +EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet pbessel_i1(const Packet& x) { + return numext::bessel_i1(x); +} + +/** \internal \returns the exponentially scaled modified Bessel function of + * order one i1e(\a x) (coeff-wise) */ +template +EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet pbessel_i1e(const Packet& x) { + return numext::bessel_i1e(x); +} + +/** \internal \returns the Bessel function of the first kind of + * order zero j0(\a x) (coeff-wise) */ +template +EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet pbessel_j0(const Packet& x) { + return numext::bessel_j0(x); +} + +/** \internal \returns the Bessel function of the first kind of + * order one j1(\a x) (coeff-wise) */ +template +EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet pbessel_j1(const Packet& x) { + return numext::bessel_j1(x); +} + +/** \internal \returns the Bessel function of the second kind of + * order zero y0(\a x) (coeff-wise) */ +template +EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet 
pbessel_y0(const Packet& x) { + return numext::bessel_y0(x); +} + +/** \internal \returns the Bessel function of the second kind of + * order one y1(\a x) (coeff-wise) */ +template +EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet pbessel_y1(const Packet& x) { + return numext::bessel_y1(x); +} + +/** \internal \returns the modified Bessel function of + * order zero k0(\a x) (coeff-wise) */ +template +EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet pbessel_k0(const Packet& x) { + return numext::bessel_k0(x); +} + +/** \internal \returns the exponentially scaled modified Bessel function of + * order zero k0e(\a x) (coeff-wise) */ +template +EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet pbessel_k0e(const Packet& x) { + return numext::bessel_k0e(x); +} + +/** \internal \returns the modified Bessel function of + * order one k1(\a x) (coeff-wise) */ +template +EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet pbessel_k1(const Packet& x) { + return numext::bessel_k1(x); +} + +/** \internal \returns the exponentially scaled modified Bessel function of + * order one k1e(\a x) (coeff-wise) */ +template +EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet pbessel_k1e(const Packet& x) { + return numext::bessel_k1e(x); +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_BESSELFUNCTIONS_PACKETMATH_H + diff --git a/src/EigenUnsupported/src/SpecialFunctions/HipVectorCompatibility.h b/src/EigenUnsupported/src/SpecialFunctions/HipVectorCompatibility.h new file mode 100644 index 0000000..d7b231a --- /dev/null +++ b/src/EigenUnsupported/src/SpecialFunctions/HipVectorCompatibility.h @@ -0,0 +1,67 @@ +#ifndef HIP_VECTOR_COMPATIBILITY_H +#define HIP_VECTOR_COMPATIBILITY_H + +namespace hip_impl { + template struct Scalar_accessor; +} // end namespace hip_impl 
+ +namespace Eigen { +namespace internal { + +#define HIP_SCALAR_ACCESSOR_BUILDER(NAME) \ +template \ +struct NAME > : NAME {}; + +#define HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(NAME) \ +template \ +struct NAME##_impl > : NAME##_impl {}; \ +template \ +struct NAME##_retval > : NAME##_retval {}; + +#define HIP_SCALAR_ACCESSOR_BUILDER_IGAMMA(NAME) \ +template \ +struct NAME , mode> : NAME {}; + +#if EIGEN_HAS_C99_MATH +HIP_SCALAR_ACCESSOR_BUILDER(betainc_helper) +HIP_SCALAR_ACCESSOR_BUILDER(incbeta_cfe) + +HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(erf) +HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(erfc) +HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(igammac) +HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(lgamma) +HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(ndtri) +HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(polygamma) + +HIP_SCALAR_ACCESSOR_BUILDER_IGAMMA(igamma_generic_impl) +#endif + +HIP_SCALAR_ACCESSOR_BUILDER(digamma_impl_maybe_poly) +HIP_SCALAR_ACCESSOR_BUILDER(zeta_impl_series) + +HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_i0) +HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_i0e) +HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_i1) +HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_i1e) +HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_j0) +HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_j1) +HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_k0) +HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_k0e) +HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_k1) +HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_k1e) +HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_y0) +HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_y1) +HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(betainc) +HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(digamma) +HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(gamma_sample_der_alpha) +HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(igamma_der_a) +HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(igamma) +HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(zeta) + +HIP_SCALAR_ACCESSOR_BUILDER_IGAMMA(igamma_series_impl) +HIP_SCALAR_ACCESSOR_BUILDER_IGAMMA(igammac_cf_impl) + +} // end namespace internal +} // end namespace Eigen + +#endif // 
HIP_VECTOR_COMPATIBILITY_H diff --git a/src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsArrayAPI.h b/src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsArrayAPI.h new file mode 100644 index 0000000..691ff4d --- /dev/null +++ b/src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsArrayAPI.h @@ -0,0 +1,167 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +#ifndef EIGEN_SPECIALFUNCTIONS_ARRAYAPI_H +#define EIGEN_SPECIALFUNCTIONS_ARRAYAPI_H + +namespace Eigen { + +/** \cpp11 \returns an expression of the coefficient-wise igamma(\a a, \a x) to the given arrays. + * + * This function computes the coefficient-wise incomplete gamma function. + * + * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types, + * or float/double in non c++11 mode, the user has to provide implementations of igamma(T,T) for any scalar + * type T to be supported. + * + * \sa Eigen::igammac(), Eigen::lgamma() + */ +template +EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp, const Derived, const ExponentDerived> +igamma(const Eigen::ArrayBase& a, const Eigen::ArrayBase& x) +{ + return Eigen::CwiseBinaryOp, const Derived, const ExponentDerived>( + a.derived(), + x.derived() + ); +} + +/** \cpp11 \returns an expression of the coefficient-wise igamma_der_a(\a a, \a x) to the given arrays. + * + * This function computes the coefficient-wise derivative of the incomplete + * gamma function with respect to the parameter a. + * + * \note This function supports only float and double scalar types in c++11 + * mode. 
To support other scalar types, + * or float/double in non c++11 mode, the user has to provide implementations + * of igamma_der_a(T,T) for any scalar + * type T to be supported. + * + * \sa Eigen::igamma(), Eigen::lgamma() + */ +template +EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp, const Derived, const ExponentDerived> +igamma_der_a(const Eigen::ArrayBase& a, const Eigen::ArrayBase& x) { + return Eigen::CwiseBinaryOp, const Derived, const ExponentDerived>( + a.derived(), + x.derived()); +} + +/** \cpp11 \returns an expression of the coefficient-wise gamma_sample_der_alpha(\a alpha, \a sample) to the given arrays. + * + * This function computes the coefficient-wise derivative of the sample + * of a Gamma(alpha, 1) random variable with respect to the parameter alpha. + * + * \note This function supports only float and double scalar types in c++11 + * mode. To support other scalar types, + * or float/double in non c++11 mode, the user has to provide implementations + * of gamma_sample_der_alpha(T,T) for any scalar + * type T to be supported. + * + * \sa Eigen::igamma(), Eigen::lgamma() + */ +template +EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp, const AlphaDerived, const SampleDerived> +gamma_sample_der_alpha(const Eigen::ArrayBase& alpha, const Eigen::ArrayBase& sample) { + return Eigen::CwiseBinaryOp, const AlphaDerived, const SampleDerived>( + alpha.derived(), + sample.derived()); +} + +/** \cpp11 \returns an expression of the coefficient-wise igammac(\a a, \a x) to the given arrays. + * + * This function computes the coefficient-wise complementary incomplete gamma function. + * + * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types, + * or float/double in non c++11 mode, the user has to provide implementations of igammac(T,T) for any scalar + * type T to be supported. 
+ * + * \sa Eigen::igamma(), Eigen::lgamma() + */ +template +EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp, const Derived, const ExponentDerived> +igammac(const Eigen::ArrayBase& a, const Eigen::ArrayBase& x) +{ + return Eigen::CwiseBinaryOp, const Derived, const ExponentDerived>( + a.derived(), + x.derived() + ); +} + +/** \cpp11 \returns an expression of the coefficient-wise polygamma(\a n, \a x) to the given arrays. + * + * It returns the \a n -th derivative of the digamma(psi) evaluated at \c x. + * + * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types, + * or float/double in non c++11 mode, the user has to provide implementations of polygamma(T,T) for any scalar + * type T to be supported. + * + * \sa Eigen::digamma() + */ +// * \warning Be careful with the order of the parameters: x.polygamma(n) is equivalent to polygamma(n,x) +// * \sa ArrayBase::polygamma() +template +EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp, const DerivedN, const DerivedX> +polygamma(const Eigen::ArrayBase& n, const Eigen::ArrayBase& x) +{ + return Eigen::CwiseBinaryOp, const DerivedN, const DerivedX>( + n.derived(), + x.derived() + ); +} + +/** \cpp11 \returns an expression of the coefficient-wise betainc(\a a, \a b, \a x) to the given arrays. + * + * This function computes the regularized incomplete beta function (integral). + * + * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types, + * or float/double in non c++11 mode, the user has to provide implementations of betainc(T,T,T) for any scalar + * type T to be supported. 
+ * + * \sa Eigen::betainc(), Eigen::lgamma() + */ +template +EIGEN_STRONG_INLINE const Eigen::CwiseTernaryOp, const ArgADerived, const ArgBDerived, const ArgXDerived> +betainc(const Eigen::ArrayBase& a, const Eigen::ArrayBase& b, const Eigen::ArrayBase& x) +{ + return Eigen::CwiseTernaryOp, const ArgADerived, const ArgBDerived, const ArgXDerived>( + a.derived(), + b.derived(), + x.derived() + ); +} + + +/** \returns an expression of the coefficient-wise zeta(\a x, \a q) to the given arrays. + * + * It returns the Riemann zeta function of two arguments \a x and \a q: + * + * \param x is the exponent, it must be > 1 + * \param q is the shift, it must be > 0 + * + * \note This function supports only float and double scalar types. To support other scalar types, the user has + * to provide implementations of zeta(T,T) for any scalar type T to be supported. + * + * \sa ArrayBase::zeta() + */ +template +EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp, const DerivedX, const DerivedQ> +zeta(const Eigen::ArrayBase& x, const Eigen::ArrayBase& q) +{ + return Eigen::CwiseBinaryOp, const DerivedX, const DerivedQ>( + x.derived(), + q.derived() + ); +} + + +} // end namespace Eigen + +#endif // EIGEN_SPECIALFUNCTIONS_ARRAYAPI_H diff --git a/src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsBFloat16.h b/src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsBFloat16.h new file mode 100644 index 0000000..2d94231 --- /dev/null +++ b/src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsBFloat16.h @@ -0,0 +1,58 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_SPECIALFUNCTIONS_BFLOAT16_H +#define EIGEN_SPECIALFUNCTIONS_BFLOAT16_H + +namespace Eigen { +namespace numext { + +#if EIGEN_HAS_C99_MATH +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 lgamma(const Eigen::bfloat16& a) { + return Eigen::bfloat16(Eigen::numext::lgamma(static_cast(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 digamma(const Eigen::bfloat16& a) { + return Eigen::bfloat16(Eigen::numext::digamma(static_cast(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 zeta(const Eigen::bfloat16& x, const Eigen::bfloat16& q) { + return Eigen::bfloat16(Eigen::numext::zeta(static_cast(x), static_cast(q))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 polygamma(const Eigen::bfloat16& n, const Eigen::bfloat16& x) { + return Eigen::bfloat16(Eigen::numext::polygamma(static_cast(n), static_cast(x))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 erf(const Eigen::bfloat16& a) { + return Eigen::bfloat16(Eigen::numext::erf(static_cast(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 erfc(const Eigen::bfloat16& a) { + return Eigen::bfloat16(Eigen::numext::erfc(static_cast(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 ndtri(const Eigen::bfloat16& a) { + return Eigen::bfloat16(Eigen::numext::ndtri(static_cast(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 igamma(const Eigen::bfloat16& a, const Eigen::bfloat16& x) { + return Eigen::bfloat16(Eigen::numext::igamma(static_cast(a), static_cast(x))); +} +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 igamma_der_a(const Eigen::bfloat16& a, const Eigen::bfloat16& x) { + return Eigen::bfloat16(Eigen::numext::igamma_der_a(static_cast(a), static_cast(x))); +} +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 gamma_sample_der_alpha(const Eigen::bfloat16& alpha, const Eigen::bfloat16& 
sample) { + return Eigen::bfloat16(Eigen::numext::gamma_sample_der_alpha(static_cast(alpha), static_cast(sample))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 igammac(const Eigen::bfloat16& a, const Eigen::bfloat16& x) { + return Eigen::bfloat16(Eigen::numext::igammac(static_cast(a), static_cast(x))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 betainc(const Eigen::bfloat16& a, const Eigen::bfloat16& b, const Eigen::bfloat16& x) { + return Eigen::bfloat16(Eigen::numext::betainc(static_cast(a), static_cast(b), static_cast(x))); +} +#endif + +} // end namespace numext +} // end namespace Eigen + +#endif // EIGEN_SPECIALFUNCTIONS_BFLOAT16_H diff --git a/src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsFunctors.h b/src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsFunctors.h new file mode 100644 index 0000000..abefe99 --- /dev/null +++ b/src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsFunctors.h @@ -0,0 +1,330 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Eugene Brevdo +// Copyright (C) 2016 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_SPECIALFUNCTIONS_FUNCTORS_H +#define EIGEN_SPECIALFUNCTIONS_FUNCTORS_H + +namespace Eigen { + +namespace internal { + + +/** \internal + * \brief Template functor to compute the incomplete gamma function igamma(a, x) + * + * \sa class CwiseBinaryOp, Cwise::igamma + */ +template struct scalar_igamma_op : binary_op_base +{ + EIGEN_EMPTY_STRUCT_CTOR(scalar_igamma_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& x) const { + using numext::igamma; return igamma(a, x); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const { + return internal::pigamma(a, x); + } +}; +template +struct functor_traits > { + enum { + // Guesstimate + Cost = 20 * NumTraits::MulCost + 10 * NumTraits::AddCost, + PacketAccess = packet_traits::HasIGamma + }; +}; + +/** \internal + * \brief Template functor to compute the derivative of the incomplete gamma + * function igamma_der_a(a, x) + * + * \sa class CwiseBinaryOp, Cwise::igamma_der_a + */ +template +struct scalar_igamma_der_a_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_igamma_der_a_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a, const Scalar& x) const { + using numext::igamma_der_a; + return igamma_der_a(a, x); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const { + return internal::pigamma_der_a(a, x); + } +}; +template +struct functor_traits > { + enum { + // 2x the cost of igamma + Cost = 40 * NumTraits::MulCost + 20 * NumTraits::AddCost, + PacketAccess = packet_traits::HasIGammaDerA + }; +}; + +/** \internal + * \brief Template functor to compute the derivative of the sample + * of a Gamma(alpha, 1) random variable with respect to the parameter alpha + * gamma_sample_der_alpha(alpha, sample) + * + * \sa class CwiseBinaryOp, Cwise::gamma_sample_der_alpha + */ +template +struct scalar_gamma_sample_der_alpha_op { + 
EIGEN_EMPTY_STRUCT_CTOR(scalar_gamma_sample_der_alpha_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& alpha, const Scalar& sample) const { + using numext::gamma_sample_der_alpha; + return gamma_sample_der_alpha(alpha, sample); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& alpha, const Packet& sample) const { + return internal::pgamma_sample_der_alpha(alpha, sample); + } +}; +template +struct functor_traits > { + enum { + // 2x the cost of igamma, minus the lgamma cost (the lgamma cancels out) + Cost = 30 * NumTraits::MulCost + 15 * NumTraits::AddCost, + PacketAccess = packet_traits::HasGammaSampleDerAlpha + }; +}; + +/** \internal + * \brief Template functor to compute the complementary incomplete gamma function igammac(a, x) + * + * \sa class CwiseBinaryOp, Cwise::igammac + */ +template struct scalar_igammac_op : binary_op_base +{ + EIGEN_EMPTY_STRUCT_CTOR(scalar_igammac_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& x) const { + using numext::igammac; return igammac(a, x); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const + { + return internal::pigammac(a, x); + } +}; +template +struct functor_traits > { + enum { + // Guesstimate + Cost = 20 * NumTraits::MulCost + 10 * NumTraits::AddCost, + PacketAccess = packet_traits::HasIGammac + }; +}; + + +/** \internal + * \brief Template functor to compute the incomplete beta integral betainc(a, b, x) + * + */ +template struct scalar_betainc_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_betainc_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& x, const Scalar& a, const Scalar& b) const { + using numext::betainc; return betainc(x, a, b); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& x, const Packet& a, const Packet& b) const + { + return internal::pbetainc(x, 
a, b); + } +}; +template +struct functor_traits > { + enum { + // Guesstimate + Cost = 400 * NumTraits::MulCost + 400 * NumTraits::AddCost, + PacketAccess = packet_traits::HasBetaInc + }; +}; + + +/** \internal + * \brief Template functor to compute the natural log of the absolute + * value of Gamma of a scalar + * \sa class CwiseUnaryOp, Cwise::lgamma() + */ +template struct scalar_lgamma_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_lgamma_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { + using numext::lgamma; return lgamma(a); + } + typedef typename packet_traits::type Packet; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const { return internal::plgamma(a); } +}; +template +struct functor_traits > +{ + enum { + // Guesstimate + Cost = 10 * NumTraits::MulCost + 5 * NumTraits::AddCost, + PacketAccess = packet_traits::HasLGamma + }; +}; + +/** \internal + * \brief Template functor to compute psi, the derivative of lgamma of a scalar. + * \sa class CwiseUnaryOp, Cwise::digamma() + */ +template struct scalar_digamma_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_digamma_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { + using numext::digamma; return digamma(a); + } + typedef typename packet_traits::type Packet; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const { return internal::pdigamma(a); } +}; +template +struct functor_traits > +{ + enum { + // Guesstimate + Cost = 10 * NumTraits::MulCost + 5 * NumTraits::AddCost, + PacketAccess = packet_traits::HasDiGamma + }; +}; + +/** \internal + * \brief Template functor to compute the Riemann Zeta function of two arguments. 
+ * \sa class CwiseBinaryOp, Cwise::zeta() + */ +template struct scalar_zeta_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_zeta_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& x, const Scalar& q) const { + using numext::zeta; return zeta(x, q); + } + typedef typename packet_traits::type Packet; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x, const Packet& q) const { return internal::pzeta(x, q); } +}; +template +struct functor_traits > +{ + enum { + // Guesstimate + Cost = 10 * NumTraits::MulCost + 5 * NumTraits::AddCost, + PacketAccess = packet_traits::HasZeta + }; +}; + +/** \internal + * \brief Template functor to compute the polygamma function. + * \sa class CwiseBinaryOp, Cwise::polygamma() + */ +template struct scalar_polygamma_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_polygamma_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& n, const Scalar& x) const { + using numext::polygamma; return polygamma(n, x); + } + typedef typename packet_traits::type Packet; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& n, const Packet& x) const { return internal::ppolygamma(n, x); } +}; +template +struct functor_traits > +{ + enum { + // Guesstimate + Cost = 10 * NumTraits::MulCost + 5 * NumTraits::AddCost, + PacketAccess = packet_traits::HasPolygamma + }; +}; + +/** \internal + * \brief Template functor to compute the error function of a scalar + * \sa class CwiseUnaryOp, ArrayBase::erf() + */ +template struct scalar_erf_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_erf_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar + operator()(const Scalar& a) const { + return numext::erf(a); + } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const { + return perf(x); + } +}; +template +struct functor_traits > { + enum { + PacketAccess = packet_traits::HasErf, + Cost = + (PacketAccess +#ifdef EIGEN_VECTORIZE_FMA + // TODO(rmlarsen): Move the FMA cost 
model to a central location. + // Haswell can issue 2 add/mul/madd per cycle. + // 10 pmadd, 2 pmul, 1 div, 2 other + ? (2 * NumTraits::AddCost + + 7 * NumTraits::MulCost + + scalar_div_cost::HasDiv>::value) +#else + ? (12 * NumTraits::AddCost + + 12 * NumTraits::MulCost + + scalar_div_cost::HasDiv>::value) +#endif + // Assume for simplicity that this is as expensive as an exp(). + : (functor_traits >::Cost)) + }; +}; + +/** \internal + * \brief Template functor to compute the Complementary Error Function + * of a scalar + * \sa class CwiseUnaryOp, Cwise::erfc() + */ +template struct scalar_erfc_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_erfc_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { + using numext::erfc; return erfc(a); + } + typedef typename packet_traits::type Packet; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const { return internal::perfc(a); } +}; +template +struct functor_traits > +{ + enum { + // Guesstimate + Cost = 10 * NumTraits::MulCost + 5 * NumTraits::AddCost, + PacketAccess = packet_traits::HasErfc + }; +}; + +/** \internal + * \brief Template functor to compute the Inverse of the normal distribution + * function of a scalar + * \sa class CwiseUnaryOp, Cwise::ndtri() + */ +template struct scalar_ndtri_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_ndtri_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { + using numext::ndtri; return ndtri(a); + } + typedef typename packet_traits::type Packet; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const { return internal::pndtri(a); } +}; +template +struct functor_traits > +{ + enum { + // On average, We are evaluating rational functions with degree N=9 in the + // numerator and denominator. This results in 2*N additions and 2*N + // multiplications. 
+ Cost = 18 * NumTraits::MulCost + 18 * NumTraits::AddCost, + PacketAccess = packet_traits::HasNdtri + }; +}; + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_SPECIALFUNCTIONS_FUNCTORS_H diff --git a/src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsHalf.h b/src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsHalf.h new file mode 100644 index 0000000..2a3a531 --- /dev/null +++ b/src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsHalf.h @@ -0,0 +1,58 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SPECIALFUNCTIONS_HALF_H +#define EIGEN_SPECIALFUNCTIONS_HALF_H + +namespace Eigen { +namespace numext { + +#if EIGEN_HAS_C99_MATH +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half lgamma(const Eigen::half& a) { + return Eigen::half(Eigen::numext::lgamma(static_cast(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half digamma(const Eigen::half& a) { + return Eigen::half(Eigen::numext::digamma(static_cast(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half zeta(const Eigen::half& x, const Eigen::half& q) { + return Eigen::half(Eigen::numext::zeta(static_cast(x), static_cast(q))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half polygamma(const Eigen::half& n, const Eigen::half& x) { + return Eigen::half(Eigen::numext::polygamma(static_cast(n), static_cast(x))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erf(const Eigen::half& a) { + return Eigen::half(Eigen::numext::erf(static_cast(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erfc(const Eigen::half& a) { + return Eigen::half(Eigen::numext::erfc(static_cast(a))); +} +template<> EIGEN_STRONG_INLINE 
EIGEN_DEVICE_FUNC Eigen::half ndtri(const Eigen::half& a) { + return Eigen::half(Eigen::numext::ndtri(static_cast(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igamma(const Eigen::half& a, const Eigen::half& x) { + return Eigen::half(Eigen::numext::igamma(static_cast(a), static_cast(x))); +} +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igamma_der_a(const Eigen::half& a, const Eigen::half& x) { + return Eigen::half(Eigen::numext::igamma_der_a(static_cast(a), static_cast(x))); +} +template <> +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half gamma_sample_der_alpha(const Eigen::half& alpha, const Eigen::half& sample) { + return Eigen::half(Eigen::numext::gamma_sample_der_alpha(static_cast(alpha), static_cast(sample))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igammac(const Eigen::half& a, const Eigen::half& x) { + return Eigen::half(Eigen::numext::igammac(static_cast(a), static_cast(x))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half betainc(const Eigen::half& a, const Eigen::half& b, const Eigen::half& x) { + return Eigen::half(Eigen::numext::betainc(static_cast(a), static_cast(b), static_cast(x))); +} +#endif + +} // end namespace numext +} // end namespace Eigen + +#endif // EIGEN_SPECIALFUNCTIONS_HALF_H diff --git a/src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsImpl.h b/src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsImpl.h new file mode 100644 index 0000000..f1c260e --- /dev/null +++ b/src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsImpl.h @@ -0,0 +1,2045 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Eugene Brevdo +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_SPECIAL_FUNCTIONS_H +#define EIGEN_SPECIAL_FUNCTIONS_H + +namespace Eigen { +namespace internal { + +// Parts of this code are based on the Cephes Math Library. +// +// Cephes Math Library Release 2.8: June, 2000 +// Copyright 1984, 1987, 1992, 2000 by Stephen L. Moshier +// +// Permission has been kindly provided by the original author +// to incorporate the Cephes software into the Eigen codebase: +// +// From: Stephen Moshier +// To: Eugene Brevdo +// Subject: Re: Permission to wrap several cephes functions in Eigen +// +// Hello Eugene, +// +// Thank you for writing. +// +// If your licensing is similar to BSD, the formal way that has been +// handled is simply to add a statement to the effect that you are incorporating +// the Cephes software by permission of the author. +// +// Good luck with your project, +// Steve + + +/**************************************************************************** + * Implementation of lgamma, requires C++11/C99 * + ****************************************************************************/ + +template +struct lgamma_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar) { + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +template +struct lgamma_retval { + typedef Scalar type; +}; + +#if EIGEN_HAS_C99_MATH +// Since glibc 2.19 +#if defined(__GLIBC__) && ((__GLIBC__>=2 && __GLIBC_MINOR__ >= 19) || __GLIBC__>2) \ + && (defined(_DEFAULT_SOURCE) || defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) +#define EIGEN_HAS_LGAMMA_R +#endif + +// Glibc versions before 2.19 +#if defined(__GLIBC__) && ((__GLIBC__==2 && __GLIBC_MINOR__ < 19) || __GLIBC__<2) \ + && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) +#define EIGEN_HAS_LGAMMA_R +#endif + +template <> +struct lgamma_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float run(float x) { +#if !defined(EIGEN_GPU_COMPILE_PHASE) && defined (EIGEN_HAS_LGAMMA_R) && 
!defined(__APPLE__) + int dummy; + return ::lgammaf_r(x, &dummy); +#elif defined(SYCL_DEVICE_ONLY) + return cl::sycl::lgamma(x); +#else + return ::lgammaf(x); +#endif + } +}; + +template <> +struct lgamma_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double run(double x) { +#if !defined(EIGEN_GPU_COMPILE_PHASE) && defined(EIGEN_HAS_LGAMMA_R) && !defined(__APPLE__) + int dummy; + return ::lgamma_r(x, &dummy); +#elif defined(SYCL_DEVICE_ONLY) + return cl::sycl::lgamma(x); +#else + return ::lgamma(x); +#endif + } +}; + +#undef EIGEN_HAS_LGAMMA_R +#endif + +/**************************************************************************** + * Implementation of digamma (psi), based on Cephes * + ****************************************************************************/ + +template +struct digamma_retval { + typedef Scalar type; +}; + +/* + * + * Polynomial evaluation helper for the Psi (digamma) function. + * + * digamma_impl_maybe_poly::run(s) evaluates the asymptotic Psi expansion for + * input Scalar s, assuming s is above 10.0. + * + * If s is above a certain threshold for the given Scalar type, zero + * is returned. Otherwise the polynomial is evaluated with enough + * coefficients for results matching Scalar machine precision. 
+ * + * + */ +template +struct digamma_impl_maybe_poly { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar) { + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + + +template <> +struct digamma_impl_maybe_poly { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float run(const float s) { + const float A[] = { + -4.16666666666666666667E-3f, + 3.96825396825396825397E-3f, + -8.33333333333333333333E-3f, + 8.33333333333333333333E-2f + }; + + float z; + if (s < 1.0e8f) { + z = 1.0f / (s * s); + return z * internal::ppolevl::run(z, A); + } else return 0.0f; + } +}; + +template <> +struct digamma_impl_maybe_poly { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double run(const double s) { + const double A[] = { + 8.33333333333333333333E-2, + -2.10927960927960927961E-2, + 7.57575757575757575758E-3, + -4.16666666666666666667E-3, + 3.96825396825396825397E-3, + -8.33333333333333333333E-3, + 8.33333333333333333333E-2 + }; + + double z; + if (s < 1.0e17) { + z = 1.0 / (s * s); + return z * internal::ppolevl::run(z, A); + } + else return 0.0; + } +}; + +template +struct digamma_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar x) { + /* + * + * Psi (digamma) function (modified for Eigen) + * + * + * SYNOPSIS: + * + * double x, y, psi(); + * + * y = psi( x ); + * + * + * DESCRIPTION: + * + * d - + * psi(x) = -- ln | (x) + * dx + * + * is the logarithmic derivative of the gamma function. + * For integer x, + * n-1 + * - + * psi(n) = -EUL + > 1/k. + * - + * k=1 + * + * If x is negative, it is transformed to a positive argument by the + * reflection formula psi(1-x) = psi(x) + pi cot(pi x). + * For general positive x, the argument is made greater than 10 + * using the recurrence psi(x+1) = psi(x) + 1/x. + * Then the following asymptotic expansion is applied: + * + * inf. B + * - 2k + * psi(x) = log(x) - 1/2x - > ------- + * - 2k + * k=1 2k x + * + * where the B2k are Bernoulli numbers. 
+ * + * ACCURACY (double): + * Relative error (except absolute when |psi| < 1): + * arithmetic domain # trials peak rms + * IEEE 0,30 30000 1.3e-15 1.4e-16 + * IEEE -30,0 40000 1.5e-15 2.2e-16 + * + * ACCURACY (float): + * Absolute error, relative when |psi| > 1 : + * arithmetic domain # trials peak rms + * IEEE -33,0 30000 8.2e-7 1.2e-7 + * IEEE 0,33 100000 7.3e-7 7.7e-8 + * + * ERROR MESSAGES: + * message condition value returned + * psi singularity x integer <=0 NaN + */ + + Scalar p, q, nz, s, w, y; + bool negative = false; + + const Scalar nan = NumTraits::quiet_NaN(); + const Scalar m_pi = Scalar(EIGEN_PI); + + const Scalar zero = Scalar(0); + const Scalar one = Scalar(1); + const Scalar half = Scalar(0.5); + nz = zero; + + if (x <= zero) { + negative = true; + q = x; + p = numext::floor(q); + if (p == q) { + return nan; + } + /* Remove the zeros of tan(m_pi x) + * by subtracting the nearest integer from x + */ + nz = q - p; + if (nz != half) { + if (nz > half) { + p += one; + nz = q - p; + } + nz = m_pi / numext::tan(m_pi * nz); + } + else { + nz = zero; + } + x = one - x; + } + + /* use the recurrence psi(x+1) = psi(x) + 1/x. */ + s = x; + w = zero; + while (s < Scalar(10)) { + w += one / s; + s += one; + } + + y = digamma_impl_maybe_poly::run(s); + + y = numext::log(s) - (half / s) - y - w; + + return (negative) ? y - nz : y; + } +}; + +/**************************************************************************** + * Implementation of erf, requires C++11/C99 * + ****************************************************************************/ + +/** \internal \returns the error function of \a a (coeff-wise) + Doesn't do anything fancy, just a 13/8-degree rational interpolant which + is accurate up to a couple of ulp in the range [-4, 4], outside of which + fl(erf(x)) = +/-1. + + This implementation works on both scalars and packets. 
+*/ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erf_float(const T& a_x) { + // Clamp the inputs to the range [-4, 4] since anything outside + // this range is +/-1.0f in single-precision. + const T plus_4 = pset1(4.f); + const T minus_4 = pset1(-4.f); + const T x = pmax(pmin(a_x, plus_4), minus_4); + // The monomial coefficients of the numerator polynomial (odd). + const T alpha_1 = pset1(-1.60960333262415e-02f); + const T alpha_3 = pset1(-2.95459980854025e-03f); + const T alpha_5 = pset1(-7.34990630326855e-04f); + const T alpha_7 = pset1(-5.69250639462346e-05f); + const T alpha_9 = pset1(-2.10102402082508e-06f); + const T alpha_11 = pset1(2.77068142495902e-08f); + const T alpha_13 = pset1(-2.72614225801306e-10f); + + // The monomial coefficients of the denominator polynomial (even). + const T beta_0 = pset1(-1.42647390514189e-02f); + const T beta_2 = pset1(-7.37332916720468e-03f); + const T beta_4 = pset1(-1.68282697438203e-03f); + const T beta_6 = pset1(-2.13374055278905e-04f); + const T beta_8 = pset1(-1.45660718464996e-05f); + + // Since the polynomials are odd/even, we need x^2. + const T x2 = pmul(x, x); + + // Evaluate the numerator polynomial p. + T p = pmadd(x2, alpha_13, alpha_11); + p = pmadd(x2, p, alpha_9); + p = pmadd(x2, p, alpha_7); + p = pmadd(x2, p, alpha_5); + p = pmadd(x2, p, alpha_3); + p = pmadd(x2, p, alpha_1); + p = pmul(x, p); + + // Evaluate the denominator polynomial q. + T q = pmadd(x2, beta_8, beta_6); + q = pmadd(x2, q, beta_4); + q = pmadd(x2, q, beta_2); + q = pmadd(x2, q, beta_0); + + // Divide the numerator by the denominator. 
+ return pdiv(p, q); +} + +template +struct erf_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE T run(const T& x) { + return generic_fast_erf_float(x); + } +}; + +template +struct erf_retval { + typedef Scalar type; +}; + +#if EIGEN_HAS_C99_MATH +template <> +struct erf_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float run(float x) { +#if defined(SYCL_DEVICE_ONLY) + return cl::sycl::erf(x); +#else + return generic_fast_erf_float(x); +#endif + } +}; + +template <> +struct erf_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double run(double x) { +#if defined(SYCL_DEVICE_ONLY) + return cl::sycl::erf(x); +#else + return ::erf(x); +#endif + } +}; +#endif // EIGEN_HAS_C99_MATH + +/*************************************************************************** +* Implementation of erfc, requires C++11/C99 * +****************************************************************************/ + +template +struct erfc_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar) { + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +template +struct erfc_retval { + typedef Scalar type; +}; + +#if EIGEN_HAS_C99_MATH +template <> +struct erfc_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float run(const float x) { +#if defined(SYCL_DEVICE_ONLY) + return cl::sycl::erfc(x); +#else + return ::erfcf(x); +#endif + } +}; + +template <> +struct erfc_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double run(const double x) { +#if defined(SYCL_DEVICE_ONLY) + return cl::sycl::erfc(x); +#else + return ::erfc(x); +#endif + } +}; +#endif // EIGEN_HAS_C99_MATH + + +/*************************************************************************** +* Implementation of ndtri. * +****************************************************************************/ + +/* Inverse of Normal distribution function (modified for Eigen). 
+ * + * + * SYNOPSIS: + * + * double x, y, ndtri(); + * + * x = ndtri( y ); + * + * + * + * DESCRIPTION: + * + * Returns the argument, x, for which the area under the + * Gaussian probability density function (integrated from + * minus infinity to x) is equal to y. + * + * + * For small arguments 0 < y < exp(-2), the program computes + * z = sqrt( -2.0 * log(y) ); then the approximation is + * x = z - log(z)/z - (1/z) P(1/z) / Q(1/z). + * There are two rational functions P/Q, one for 0 < y < exp(-32) + * and the other for y up to exp(-2). For larger arguments, + * w = y - 0.5, and x/sqrt(2pi) = w + w**3 R(w**2)/S(w**2)). + * + * + * ACCURACY: + * + * Relative error: + * arithmetic domain # trials peak rms + * DEC 0.125, 1 5500 9.5e-17 2.1e-17 + * DEC 6e-39, 0.135 3500 5.7e-17 1.3e-17 + * IEEE 0.125, 1 20000 7.2e-16 1.3e-16 + * IEEE 3e-308, 0.135 50000 4.6e-16 9.8e-17 + * + * + * ERROR MESSAGES: + * + * message condition value returned + * ndtri domain x <= 0 -MAXNUM + * ndtri domain x >= 1 MAXNUM + * + */ + /* + Cephes Math Library Release 2.2: June, 1992 + Copyright 1985, 1987, 1992 by Stephen L. Moshier + Direct inquiries to 30 Frost Street, Cambridge, MA 02140 + */ + + +// TODO: Add a cheaper approximation for float. + + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T flipsign( + const T& should_flipsign, const T& x) { + typedef typename unpacket_traits::type Scalar; + const T sign_mask = pset1(Scalar(-0.0)); + T sign_bit = pand(should_flipsign, sign_mask); + return pxor(sign_bit, x); +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double flipsign( + const double& should_flipsign, const double& x) { + return should_flipsign == 0 ? x : -x; +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float flipsign( + const float& should_flipsign, const float& x) { + return should_flipsign == 0 ? 
x : -x; +} + +// We split this computation in to two so that in the scalar path +// only one branch is evaluated (due to our template specialization of pselect +// being an if statement.) + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_ndtri_gt_exp_neg_two(const T& b) { + const ScalarType p0[] = { + ScalarType(-5.99633501014107895267e1), + ScalarType(9.80010754185999661536e1), + ScalarType(-5.66762857469070293439e1), + ScalarType(1.39312609387279679503e1), + ScalarType(-1.23916583867381258016e0) + }; + const ScalarType q0[] = { + ScalarType(1.0), + ScalarType(1.95448858338141759834e0), + ScalarType(4.67627912898881538453e0), + ScalarType(8.63602421390890590575e1), + ScalarType(-2.25462687854119370527e2), + ScalarType(2.00260212380060660359e2), + ScalarType(-8.20372256168333339912e1), + ScalarType(1.59056225126211695515e1), + ScalarType(-1.18331621121330003142e0) + }; + const T sqrt2pi = pset1(ScalarType(2.50662827463100050242e0)); + const T half = pset1(ScalarType(0.5)); + T c, c2, ndtri_gt_exp_neg_two; + + c = psub(b, half); + c2 = pmul(c, c); + ndtri_gt_exp_neg_two = pmadd(c, pmul( + c2, pdiv( + internal::ppolevl::run(c2, p0), + internal::ppolevl::run(c2, q0))), c); + return pmul(ndtri_gt_exp_neg_two, sqrt2pi); +} + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_ndtri_lt_exp_neg_two( + const T& b, const T& should_flipsign) { + /* Approximation for interval z = sqrt(-2 log a ) between 2 and 8 + * i.e., a between exp(-2) = .135 and exp(-32) = 1.27e-14. 
+ */ + const ScalarType p1[] = { + ScalarType(4.05544892305962419923e0), + ScalarType(3.15251094599893866154e1), + ScalarType(5.71628192246421288162e1), + ScalarType(4.40805073893200834700e1), + ScalarType(1.46849561928858024014e1), + ScalarType(2.18663306850790267539e0), + ScalarType(-1.40256079171354495875e-1), + ScalarType(-3.50424626827848203418e-2), + ScalarType(-8.57456785154685413611e-4) + }; + const ScalarType q1[] = { + ScalarType(1.0), + ScalarType(1.57799883256466749731e1), + ScalarType(4.53907635128879210584e1), + ScalarType(4.13172038254672030440e1), + ScalarType(1.50425385692907503408e1), + ScalarType(2.50464946208309415979e0), + ScalarType(-1.42182922854787788574e-1), + ScalarType(-3.80806407691578277194e-2), + ScalarType(-9.33259480895457427372e-4) + }; + /* Approximation for interval z = sqrt(-2 log a ) between 8 and 64 + * i.e., a between exp(-32) = 1.27e-14 and exp(-2048) = 3.67e-890. + */ + const ScalarType p2[] = { + ScalarType(3.23774891776946035970e0), + ScalarType(6.91522889068984211695e0), + ScalarType(3.93881025292474443415e0), + ScalarType(1.33303460815807542389e0), + ScalarType(2.01485389549179081538e-1), + ScalarType(1.23716634817820021358e-2), + ScalarType(3.01581553508235416007e-4), + ScalarType(2.65806974686737550832e-6), + ScalarType(6.23974539184983293730e-9) + }; + const ScalarType q2[] = { + ScalarType(1.0), + ScalarType(6.02427039364742014255e0), + ScalarType(3.67983563856160859403e0), + ScalarType(1.37702099489081330271e0), + ScalarType(2.16236993594496635890e-1), + ScalarType(1.34204006088543189037e-2), + ScalarType(3.28014464682127739104e-4), + ScalarType(2.89247864745380683936e-6), + ScalarType(6.79019408009981274425e-9) + }; + const T eight = pset1(ScalarType(8.0)); + const T one = pset1(ScalarType(1)); + const T neg_two = pset1(ScalarType(-2)); + T x, x0, x1, z; + + x = psqrt(pmul(neg_two, plog(b))); + x0 = psub(x, pdiv(plog(x), x)); + z = pdiv(one, x); + x1 = pmul( + z, pselect( + pcmp_lt(x, eight), + 
pdiv(internal::ppolevl::run(z, p1), + internal::ppolevl::run(z, q1)), + pdiv(internal::ppolevl::run(z, p2), + internal::ppolevl::run(z, q2)))); + return flipsign(should_flipsign, psub(x0, x1)); +} + +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +T generic_ndtri(const T& a) { + const T maxnum = pset1(NumTraits::infinity()); + const T neg_maxnum = pset1(-NumTraits::infinity()); + + const T zero = pset1(ScalarType(0)); + const T one = pset1(ScalarType(1)); + // exp(-2) + const T exp_neg_two = pset1(ScalarType(0.13533528323661269189)); + T b, ndtri, should_flipsign; + + should_flipsign = pcmp_le(a, psub(one, exp_neg_two)); + b = pselect(should_flipsign, a, psub(one, a)); + + ndtri = pselect( + pcmp_lt(exp_neg_two, b), + generic_ndtri_gt_exp_neg_two(b), + generic_ndtri_lt_exp_neg_two(b, should_flipsign)); + + return pselect( + pcmp_le(a, zero), neg_maxnum, + pselect(pcmp_le(one, a), maxnum, ndtri)); +} + +template +struct ndtri_retval { + typedef Scalar type; +}; + +#if !EIGEN_HAS_C99_MATH + +template +struct ndtri_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar) { + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +# else + +template +struct ndtri_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar x) { + return generic_ndtri(x); + } +}; + +#endif // EIGEN_HAS_C99_MATH + + +/************************************************************************************************************** + * Implementation of igammac (complemented incomplete gamma integral), based on Cephes but requires C++11/C99 * + **************************************************************************************************************/ + +template +struct igammac_retval { + typedef Scalar type; +}; + +// NOTE: cephes_helper is also used to implement zeta +template +struct cephes_helper { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar machep() { assert(false && 
"machep not supported for this type"); return 0.0; } + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar big() { assert(false && "big not supported for this type"); return 0.0; } + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar biginv() { assert(false && "biginv not supported for this type"); return 0.0; } +}; + +template <> +struct cephes_helper { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float machep() { + return NumTraits::epsilon() / 2; // 1.0 - machep == 1.0 + } + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float big() { + // 1 / epsneg, with epsneg = epsilon / 2 (1.0 - epsneg == 1.0) + return 1.0f / (NumTraits::epsilon() / 2); + } + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float biginv() { + // epsneg, i.e. the inverse of big() + return machep(); + } +}; + +template <> +struct cephes_helper { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double machep() { + return NumTraits::epsilon() / 2; // 1.0 - machep == 1.0 + } + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double big() { + return 1.0 / NumTraits::epsilon(); + } + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double biginv() { + // inverse of big(), i.e. epsilon itself + return NumTraits::epsilon(); + } +}; + +enum IgammaComputationMode { VALUE, DERIVATIVE, SAMPLE_DERIVATIVE }; + +template +EIGEN_DEVICE_FUNC +static EIGEN_STRONG_INLINE Scalar main_igamma_term(Scalar a, Scalar x) { + /* Compute x**a * exp(-x) / gamma(a) */ + Scalar logax = a * numext::log(x) - x - lgamma_impl::run(a); + if (logax < -numext::log(NumTraits::highest()) || + // Assuming x and a aren't NaN. + (numext::isnan)(logax)) { + return Scalar(0); + } + return numext::exp(logax); +} + +template +EIGEN_DEVICE_FUNC +int igamma_num_iterations() { + /* Returns the maximum number of internal iterations for igamma computation. 
+ */ + if (mode == VALUE) { + return 2000; + } + + if (internal::is_same::value) { + return 200; + } else if (internal::is_same::value) { + return 500; + } else { + return 2000; + } +} + +template +struct igammac_cf_impl { + /* Computes igamc(a, x) or derivative (depending on the mode) + * using the continued fraction expansion of the complementary + * incomplete Gamma function. + * + * Preconditions: + * a > 0 + * x >= 1 + * x >= a + */ + EIGEN_DEVICE_FUNC + static Scalar run(Scalar a, Scalar x) { + const Scalar zero = 0; + const Scalar one = 1; + const Scalar two = 2; + const Scalar machep = cephes_helper::machep(); + const Scalar big = cephes_helper::big(); + const Scalar biginv = cephes_helper::biginv(); + + if ((numext::isinf)(x)) { + return zero; + } + + Scalar ax = main_igamma_term(a, x); + // This is independent of mode. If this value is zero, + // then the function value is zero. If the function value is zero, + // then we are in a neighborhood where the function value evaluates to zero, + // so the derivative is zero. 
+ if (ax == zero) { + return zero; + } + + // continued fraction + Scalar y = one - a; + Scalar z = x + y + one; + Scalar c = zero; + Scalar pkm2 = one; + Scalar qkm2 = x; + Scalar pkm1 = x + one; + Scalar qkm1 = z * x; + Scalar ans = pkm1 / qkm1; + + Scalar dpkm2_da = zero; + Scalar dqkm2_da = zero; + Scalar dpkm1_da = zero; + Scalar dqkm1_da = -x; + Scalar dans_da = (dpkm1_da - ans * dqkm1_da) / qkm1; + + for (int i = 0; i < igamma_num_iterations(); i++) { + c += one; + y += one; + z += two; + + Scalar yc = y * c; + Scalar pk = pkm1 * z - pkm2 * yc; + Scalar qk = qkm1 * z - qkm2 * yc; + + Scalar dpk_da = dpkm1_da * z - pkm1 - dpkm2_da * yc + pkm2 * c; + Scalar dqk_da = dqkm1_da * z - qkm1 - dqkm2_da * yc + qkm2 * c; + + if (qk != zero) { + Scalar ans_prev = ans; + ans = pk / qk; + + Scalar dans_da_prev = dans_da; + dans_da = (dpk_da - ans * dqk_da) / qk; + + if (mode == VALUE) { + if (numext::abs(ans_prev - ans) <= machep * numext::abs(ans)) { + break; + } + } else { + if (numext::abs(dans_da - dans_da_prev) <= machep) { + break; + } + } + } + + pkm2 = pkm1; + pkm1 = pk; + qkm2 = qkm1; + qkm1 = qk; + + dpkm2_da = dpkm1_da; + dpkm1_da = dpk_da; + dqkm2_da = dqkm1_da; + dqkm1_da = dqk_da; + + if (numext::abs(pk) > big) { + pkm2 *= biginv; + pkm1 *= biginv; + qkm2 *= biginv; + qkm1 *= biginv; + + dpkm2_da *= biginv; + dpkm1_da *= biginv; + dqkm2_da *= biginv; + dqkm1_da *= biginv; + } + } + + /* Derivative with respect to a of the term ax = x**a * exp(-x) / gamma(a) */ + Scalar dlogax_da = numext::log(x) - digamma_impl::run(a); + Scalar dax_da = ax * dlogax_da; + + switch (mode) { + case VALUE: + return ans * ax; + case DERIVATIVE: + return ans * dax_da + dans_da * ax; + case SAMPLE_DERIVATIVE: + default: // this is needed to suppress clang warning + return -(dans_da + ans * dlogax_da) * x; + } + } +}; + +template +struct igamma_series_impl { + /* Computes igam(a, x) or its derivative (depending on the mode) + * using the series expansion of the incomplete Gamma function. 
+ * + * Preconditions: + * x > 0 + * a > 0 + * !(x > 1 && x > a) + */ + EIGEN_DEVICE_FUNC + static Scalar run(Scalar a, Scalar x) { + const Scalar zero = 0; + const Scalar one = 1; + const Scalar machep = cephes_helper::machep(); + + Scalar ax = main_igamma_term(a, x); + + // This is independent of mode. If this value is zero, + // then the function value is zero. If the function value is zero, + // then we are in a neighborhood where the function value evaluates to zero, + // so the derivative is zero. + if (ax == zero) { + return zero; + } + + ax /= a; + + /* power series */ + Scalar r = a; + Scalar c = one; + Scalar ans = one; + + Scalar dc_da = zero; + Scalar dans_da = zero; + + for (int i = 0; i < igamma_num_iterations(); i++) { + r += one; + Scalar term = x / r; + Scalar dterm_da = -x / (r * r); + dc_da = term * dc_da + dterm_da * c; + dans_da += dc_da; + c *= term; + ans += c; + + if (mode == VALUE) { + if (c <= machep * ans) { + break; + } + } else { + if (numext::abs(dc_da) <= machep * numext::abs(dans_da)) { + break; + } + } + } + + Scalar dlogax_da = numext::log(x) - digamma_impl::run(a + one); + Scalar dax_da = ax * dlogax_da; + + switch (mode) { + case VALUE: + return ans * ax; + case DERIVATIVE: + return ans * dax_da + dans_da * ax; + case SAMPLE_DERIVATIVE: + default: // this is needed to suppress clang warning + return -(dans_da + ans * dlogax_da) * x / a; + } + } +}; + +#if !EIGEN_HAS_C99_MATH + +template +struct igammac_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar a, Scalar x) { + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +#else + +template +struct igammac_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar a, Scalar x) { + /* igamc() + * + * Incomplete gamma integral (modified for Eigen) + * + * + * + * SYNOPSIS: + * + * double a, x, y, igamc(); + * + * y = igamc( a, x ); + * + * DESCRIPTION: + * + * The function is defined by + * + * + * igamc(a,x) = 1 - 
igam(a,x) + * + * inf. + * - + * 1 | | -t a-1 + * = ----- | e t dt. + * - | | + * | (a) - + * x + * + * + * In this implementation both arguments must be positive. + * The integral is evaluated by either a power series or + * continued fraction expansion, depending on the relative + * values of a and x. + * + * ACCURACY (float): + * + * Relative error: + * arithmetic domain # trials peak rms + * IEEE 0,30 30000 7.8e-6 5.9e-7 + * + * + * ACCURACY (double): + * + * Tested at random a, x. + * a x Relative error: + * arithmetic domain domain # trials peak rms + * IEEE 0.5,100 0,100 200000 1.9e-14 1.7e-15 + * IEEE 0.01,0.5 0,100 200000 1.4e-13 1.6e-15 + * + */ + /* + Cephes Math Library Release 2.2: June, 1992 + Copyright 1985, 1987, 1992 by Stephen L. Moshier + Direct inquiries to 30 Frost Street, Cambridge, MA 02140 + */ + const Scalar zero = 0; + const Scalar one = 1; + const Scalar nan = NumTraits::quiet_NaN(); + + if ((x < zero) || (a <= zero)) { + // domain error + return nan; + } + + if ((numext::isnan)(a) || (numext::isnan)(x)) { // propagate nans + return nan; + } + + if ((x < one) || (x < a)) { + return (one - igamma_series_impl::run(a, x)); + } + + return igammac_cf_impl::run(a, x); + } +}; + +#endif // EIGEN_HAS_C99_MATH + +/************************************************************************************************ + * Implementation of igamma (incomplete gamma integral), based on Cephes but requires C++11/C99 * + ************************************************************************************************/ + +#if !EIGEN_HAS_C99_MATH + +template +struct igamma_generic_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(Scalar a, Scalar x) { + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +#else + +template +struct igamma_generic_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar a, Scalar x) { + /* Depending on the mode, returns + * - VALUE: incomplete Gamma 
function igamma(a, x) + * - DERIVATIVE: derivative of incomplete Gamma function d/da igamma(a, x) + * - SAMPLE_DERIVATIVE: implicit derivative of a Gamma random variable + * x ~ Gamma(x | a, 1), dx/da = -1 / Gamma(x | a, 1) * d igamma(a, x) / dx + * + * Derivatives are implemented by forward-mode differentiation. + */ + const Scalar zero = 0; + const Scalar one = 1; + const Scalar nan = NumTraits::quiet_NaN(); + + if (x == zero) return zero; + + if ((x < zero) || (a <= zero)) { // domain error + return nan; + } + + if ((numext::isnan)(a) || (numext::isnan)(x)) { // propagate nans + return nan; + } + + if ((x > one) && (x > a)) { + Scalar ret = igammac_cf_impl::run(a, x); + if (mode == VALUE) { + return one - ret; + } else { + return -ret; + } + } + + return igamma_series_impl::run(a, x); + } +}; + +#endif // EIGEN_HAS_C99_MATH + +template +struct igamma_retval { + typedef Scalar type; +}; + +template +struct igamma_impl : igamma_generic_impl { + /* igam() + * Incomplete gamma integral. + * + * The CDF of Gamma(a, 1) random variable at the point x. + * + * Accuracy estimation. For each a in [10^-2, 10^-1...10^3] we sample + * 50 Gamma random variables x ~ Gamma(x | a, 1), a total of 300 points. + * The ground truth is computed by mpmath. Mean absolute error: + * float: 1.26713e-05 + * double: 2.33606e-12 + * + * Cephes documentation below. + * + * SYNOPSIS: + * + * double a, x, y, igam(); + * + * y = igam( a, x ); + * + * DESCRIPTION: + * + * The function is defined by + * + * x + * - + * 1 | | -t a-1 + * igam(a,x) = ----- | e t dt. + * - | | + * | (a) - + * 0 + * + * + * In this implementation both arguments must be positive. + * The integral is evaluated by either a power series or + * continued fraction expansion, depending on the relative + * values of a and x. 
+ * + * ACCURACY (double): + * + * Relative error: + * arithmetic domain # trials peak rms + * IEEE 0,30 200000 3.6e-14 2.9e-15 + * IEEE 0,100 300000 9.9e-14 1.5e-14 + * + * + * ACCURACY (float): + * + * Relative error: + * arithmetic domain # trials peak rms + * IEEE 0,30 20000 7.8e-6 5.9e-7 + * + */ + /* + Cephes Math Library Release 2.2: June, 1992 + Copyright 1985, 1987, 1992 by Stephen L. Moshier + Direct inquiries to 30 Frost Street, Cambridge, MA 02140 + */ + + /* left tail of incomplete gamma function: + * + * inf. k + * a -x - x + * x e > ---------- + * - - + * k=0 | (a+k+1) + * + */ +}; + +template +struct igamma_der_a_retval : igamma_retval {}; + +template +struct igamma_der_a_impl : igamma_generic_impl { + /* Derivative of the incomplete Gamma function with respect to a. + * + * Computes d/da igamma(a, x) by forward differentiation of the igamma code. + * + * Accuracy estimation. For each a in [10^-2, 10^-1...10^3] we sample + * 50 Gamma random variables x ~ Gamma(x | a, 1), a total of 300 points. + * The ground truth is computed by mpmath. Mean absolute error: + * float: 6.17992e-07 + * double: 4.60453e-12 + * + * Reference: + * R. Moore. "Algorithm AS 187: Derivatives of the incomplete gamma + * integral". Journal of the Royal Statistical Society. 1982 + */ +}; + +template +struct gamma_sample_der_alpha_retval : igamma_retval {}; + +template +struct gamma_sample_der_alpha_impl + : igamma_generic_impl { + /* Derivative of a Gamma random variable sample with respect to alpha. + * + * Consider a sample of a Gamma random variable with the concentration + * parameter alpha: sample ~ Gamma(alpha, 1). The reparameterization + * derivative that we want to compute is dsample / dalpha = + * d igammainv(alpha, u) / dalpha, where u = igamma(alpha, sample). + * However, this formula is numerically unstable and expensive, so instead + * we use implicit differentiation: + * + * igamma(alpha, sample) = u, where u ~ Uniform(0, 1). 
+ * Apply d / dalpha to both sides: + * d igamma(alpha, sample) / dalpha + * + d igamma(alpha, sample) / dsample * dsample/dalpha = 0 + * d igamma(alpha, sample) / dalpha + * + Gamma(sample | alpha, 1) dsample / dalpha = 0 + * dsample/dalpha = - (d igamma(alpha, sample) / dalpha) + * / Gamma(sample | alpha, 1) + * + * Here Gamma(sample | alpha, 1) is the PDF of the Gamma distribution + * (note that the derivative of the CDF w.r.t. sample is the PDF). + * See the reference below for more details. + * + * The derivative of igamma(alpha, sample) is computed by forward + * differentiation of the igamma code. Division by the Gamma PDF is performed + * in the same code, increasing the accuracy and speed due to cancellation + * of some terms. + * + * Accuracy estimation. For each alpha in [10^-2, 10^-1...10^3] we sample + * 50 Gamma random variables sample ~ Gamma(sample | alpha, 1), a total of 300 + * points. The ground truth is computed by mpmath. Mean absolute error: + * float: 2.1686e-06 + * double: 1.4774e-12 + * + * Reference: + * M. Figurnov, S. Mohamed, A. Mnih "Implicit Reparameterization Gradients". 
+ * 2018 + */ +}; + +/***************************************************************************** + * Implementation of Riemann zeta function of two arguments, based on Cephes * + *****************************************************************************/ + +template +struct zeta_retval { + typedef Scalar type; +}; + +template +struct zeta_impl_series { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar) { + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +template <> +struct zeta_impl_series { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE bool run(float& a, float& b, float& s, const float x, const float machep) { + int i = 0; + while(i < 9) + { + i += 1; + a += 1.0f; + b = numext::pow( a, -x ); + s += b; + if( numext::abs(b/s) < machep ) + return true; + } + + //Return whether we are done + return false; + } +}; + +template <> +struct zeta_impl_series { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE bool run(double& a, double& b, double& s, const double x, const double machep) { + int i = 0; + while( (i < 9) || (a <= 9.0) ) + { + i += 1; + a += 1.0; + b = numext::pow( a, -x ); + s += b; + if( numext::abs(b/s) < machep ) + return true; + } + + //Return whether we are done + return false; + } +}; + +template +struct zeta_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar x, Scalar q) { + /* zeta.c + * + * Riemann zeta function of two arguments + * + * + * + * SYNOPSIS: + * + * double x, q, y, zeta(); + * + * y = zeta( x, q ); + * + * + * + * DESCRIPTION: + * + * + * + * inf. + * - -x + * zeta(x,q) = > (k+q) + * - + * k=0 + * + * where x > 1 and q is not a negative integer or zero. + * The Euler-Maclaurin summation formula is used to obtain + * the expansion + * + * n + * - -x + * zeta(x,q) = > (k+q) + * - + * k=1 + * + * 1-x inf. B x(x+1)...(x+2j) + * (n+q) 1 - 2j + * + --------- - ------- + > -------------------- + * x-1 x - x+2j+1 + * 2(n+q) j=1 (2j)! 
(n+q) + * + * where the B2j are Bernoulli numbers. Note that (see zetac.c) + * zeta(x,1) = zetac(x) + 1. + * + * + * + * ACCURACY: + * + * Relative error for single precision: + * arithmetic domain # trials peak rms + * IEEE 0,25 10000 6.9e-7 1.0e-7 + * + * Large arguments may produce underflow in powf(), in which + * case the results are inaccurate. + * + * REFERENCE: + * + * Gradshteyn, I. S., and I. M. Ryzhik, Tables of Integrals, + * Series, and Products, p. 1073; Academic Press, 1980. + * + */ + + int i; + Scalar p, r, a, b, k, s, t, w; + + const Scalar A[] = { + Scalar(12.0), + Scalar(-720.0), + Scalar(30240.0), + Scalar(-1209600.0), + Scalar(47900160.0), + Scalar(-1.8924375803183791606e9), /*1.307674368e12/691*/ + Scalar(7.47242496e10), + Scalar(-2.950130727918164224e12), /*1.067062284288e16/3617*/ + Scalar(1.1646782814350067249e14), /*5.109094217170944e18/43867*/ + Scalar(-4.5979787224074726105e15), /*8.028576626982912e20/174611*/ + Scalar(1.8152105401943546773e17), /*1.5511210043330985984e23/854513*/ + Scalar(-7.1661652561756670113e18) /*1.6938241367317436694528e27/236364091*/ + }; + + const Scalar maxnum = NumTraits::infinity(); + const Scalar zero = 0.0, half = 0.5, one = 1.0; + const Scalar machep = cephes_helper::machep(); + const Scalar nan = NumTraits::quiet_NaN(); + + if( x == one ) + return maxnum; + + if( x < one ) + { + return nan; + } + + if( q <= zero ) + { + if(q == numext::floor(q)) + { + if (x == numext::floor(x) && long(x) % 2 == 0) { + return maxnum; + } + else { + return nan; + } + } + p = x; + r = numext::floor(p); + if (p != r) + return nan; + } + + /* Permit negative q but continue sum until n+q > +9 . + * This case should be handled by a reflection formula. + * If q<0 and x is an integer, there is a relation to + * the polygamma function. 
+ */ + s = numext::pow( q, -x ); + a = q; + b = zero; + // Run the summation in a helper function that is specific to the floating precision + if (zeta_impl_series::run(a, b, s, x, machep)) { + return s; + } + + w = a; + s += b*w/(x-one); + s -= half * b; + a = one; + k = zero; + for( i=0; i<12; i++ ) + { + a *= x + k; + b /= w; + t = a*b/A[i]; + s = s + t; + t = numext::abs(t/s); + if( t < machep ) { + break; + } + k += one; + a *= x + k; + b /= w; + k += one; + } + return s; + } +}; + +/**************************************************************************** + * Implementation of polygamma function, requires C++11/C99 * + ****************************************************************************/ + +template +struct polygamma_retval { + typedef Scalar type; +}; + +#if !EIGEN_HAS_C99_MATH + +template +struct polygamma_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(Scalar n, Scalar x) { + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +#else + +template +struct polygamma_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar n, Scalar x) { + Scalar zero = 0.0, one = 1.0; + Scalar nplus = n + one; + const Scalar nan = NumTraits::quiet_NaN(); + + // Check that n is a non-negative integer + if (numext::floor(n) != n || n < zero) { + return nan; + } + // Just return the digamma function for n = 0 + else if (n == zero) { + return digamma_impl::run(x); + } + // Use the same implementation as scipy + else { + Scalar factorial = numext::exp(lgamma_impl::run(nplus)); + return numext::pow(-one, nplus) * factorial * zeta_impl::run(nplus, x); + } + } +}; + +#endif // EIGEN_HAS_C99_MATH + +/************************************************************************************************ + * Implementation of betainc (incomplete beta integral), based on Cephes but requires C++11/C99 * + ************************************************************************************************/ + 
+template +struct betainc_retval { + typedef Scalar type; +}; + +#if !EIGEN_HAS_C99_MATH + +template +struct betainc_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(Scalar a, Scalar b, Scalar x) { + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +#else + +template +struct betainc_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(Scalar, Scalar, Scalar) { + /* betaincf.c + * + * Incomplete beta integral + * + * + * SYNOPSIS: + * + * float a, b, x, y, betaincf(); + * + * y = betaincf( a, b, x ); + * + * + * DESCRIPTION: + * + * Returns incomplete beta integral of the arguments, evaluated + * from zero to x. The function is defined as + * + * x + * - - + * | (a+b) | | a-1 b-1 + * ----------- | t (1-t) dt. + * - - | | + * | (a) | (b) - + * 0 + * + * The domain of definition is 0 <= x <= 1. In this + * implementation a and b are restricted to positive values. + * The integral from x to 1 may be obtained by the symmetry + * relation + * + * 1 - betainc( a, b, x ) = betainc( b, a, 1-x ). + * + * The integral is evaluated by a continued fraction expansion. + * If a < 1, the function calls itself recursively after a + * transformation to increase a to a+1. + * + * ACCURACY (float): + * + * Tested at random points (a,b,x) with a and b in the indicated + * interval and x between 0 and 1. + * + * arithmetic domain # trials peak rms + * Relative error: + * IEEE 0,30 10000 3.7e-5 5.1e-6 + * IEEE 0,100 10000 1.7e-4 2.5e-5 + * The useful domain for relative error is limited by underflow + * of the single precision exponential function. + * Absolute error: + * IEEE 0,30 100000 2.2e-5 9.6e-7 + * IEEE 0,100 10000 6.5e-5 3.7e-6 + * + * Larger errors may occur for extreme ratios of a and b. 
+ * + * ACCURACY (double): + * arithmetic domain # trials peak rms + * IEEE 0,5 10000 6.9e-15 4.5e-16 + * IEEE 0,85 250000 2.2e-13 1.7e-14 + * IEEE 0,1000 30000 5.3e-12 6.3e-13 + * IEEE 0,10000 250000 9.3e-11 7.1e-12 + * IEEE 0,100000 10000 8.7e-10 4.8e-11 + * Outputs smaller than the IEEE gradual underflow threshold + * were excluded from these statistics. + * + * ERROR MESSAGES: + * message condition value returned + * incbet domain x<0, x>1 nan + * incbet underflow nan + */ + + EIGEN_STATIC_ASSERT((internal::is_same::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +/* Continued fraction expansion #1 for incomplete beta integral (small_branch = True) + * Continued fraction expansion #2 for incomplete beta integral (small_branch = False) + */ +template +struct incbeta_cfe { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(Scalar a, Scalar b, Scalar x, bool small_branch) { + EIGEN_STATIC_ASSERT((internal::is_same::value || + internal::is_same::value), + THIS_TYPE_IS_NOT_SUPPORTED); + const Scalar big = cephes_helper::big(); + const Scalar machep = cephes_helper::machep(); + const Scalar biginv = cephes_helper::biginv(); + + const Scalar zero = 0; + const Scalar one = 1; + const Scalar two = 2; + + Scalar xk, pk, pkm1, pkm2, qk, qkm1, qkm2; + Scalar k1, k2, k3, k4, k5, k6, k7, k8, k26update; + Scalar ans; + int n; + + const int num_iters = (internal::is_same::value) ? 100 : 300; + const Scalar thresh = + (internal::is_same::value) ? machep : Scalar(3) * machep; + Scalar r = (internal::is_same::value) ? 
zero : one; + + if (small_branch) { + k1 = a; + k2 = a + b; + k3 = a; + k4 = a + one; + k5 = one; + k6 = b - one; + k7 = k4; + k8 = a + two; + k26update = one; + } else { + k1 = a; + k2 = b - one; + k3 = a; + k4 = a + one; + k5 = one; + k6 = a + b; + k7 = a + one; + k8 = a + two; + k26update = -one; + x = x / (one - x); + } + + pkm2 = zero; + qkm2 = one; + pkm1 = one; + qkm1 = one; + ans = one; + n = 0; + + do { + xk = -(x * k1 * k2) / (k3 * k4); + pk = pkm1 + pkm2 * xk; + qk = qkm1 + qkm2 * xk; + pkm2 = pkm1; + pkm1 = pk; + qkm2 = qkm1; + qkm1 = qk; + + xk = (x * k5 * k6) / (k7 * k8); + pk = pkm1 + pkm2 * xk; + qk = qkm1 + qkm2 * xk; + pkm2 = pkm1; + pkm1 = pk; + qkm2 = qkm1; + qkm1 = qk; + + if (qk != zero) { + r = pk / qk; + if (numext::abs(ans - r) < numext::abs(r) * thresh) { + return r; + } + ans = r; + } + + k1 += one; + k2 += k26update; + k3 += two; + k4 += two; + k5 += one; + k6 -= k26update; + k7 += two; + k8 += two; + + if ((numext::abs(qk) + numext::abs(pk)) > big) { + pkm2 *= biginv; + pkm1 *= biginv; + qkm2 *= biginv; + qkm1 *= biginv; + } + if ((numext::abs(qk) < biginv) || (numext::abs(pk) < biginv)) { + pkm2 *= big; + pkm1 *= big; + qkm2 *= big; + qkm1 *= big; + } + } while (++n < num_iters); + + return ans; + } +}; + +/* Helper functions depending on the Scalar type */ +template +struct betainc_helper {}; + +template <> +struct betainc_helper { + /* Core implementation, assumes a large (> 1.0) */ + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float incbsa(float aa, float bb, + float xx) { + float ans, a, b, t, x, onemx; + bool reversed_a_b = false; + + onemx = 1.0f - xx; + + /* see if x is greater than the mean */ + if (xx > (aa / (aa + bb))) { + reversed_a_b = true; + a = bb; + b = aa; + t = xx; + x = onemx; + } else { + a = aa; + b = bb; + t = onemx; + x = xx; + } + + /* Choose expansion for optimal convergence */ + if (b > 10.0f) { + if (numext::abs(b * x / a) < 0.3f) { + t = betainc_helper::incbps(a, b, x); + if (reversed_a_b) t = 1.0f - t; + 
return t; + } + } + + ans = x * (a + b - 2.0f) / (a - 1.0f); + if (ans < 1.0f) { + ans = incbeta_cfe::run(a, b, x, true /* small_branch */); + t = b * numext::log(t); + } else { + ans = incbeta_cfe::run(a, b, x, false /* small_branch */); + t = (b - 1.0f) * numext::log(t); + } + + t += a * numext::log(x) + lgamma_impl::run(a + b) - + lgamma_impl::run(a) - lgamma_impl::run(b); + t += numext::log(ans / a); + t = numext::exp(t); + + if (reversed_a_b) t = 1.0f - t; + return t; + } + + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float incbps(float a, float b, float x) { + float t, u, y, s; + const float machep = cephes_helper::machep(); + + y = a * numext::log(x) + (b - 1.0f) * numext::log1p(-x) - numext::log(a); + y -= lgamma_impl::run(a) + lgamma_impl::run(b); + y += lgamma_impl::run(a + b); + + t = x / (1.0f - x); + s = 0.0f; + u = 1.0f; + do { + b -= 1.0f; + if (b == 0.0f) { + break; + } + a += 1.0f; + u *= t * b / a; + s += u; + } while (numext::abs(u) > machep); + + return numext::exp(y) * (1.0f + s); + } +}; + +template <> +struct betainc_impl { + EIGEN_DEVICE_FUNC + static float run(float a, float b, float x) { + const float nan = NumTraits::quiet_NaN(); + float ans, t; + + if (a <= 0.0f) return nan; + if (b <= 0.0f) return nan; + if ((x <= 0.0f) || (x >= 1.0f)) { + if (x == 0.0f) return 0.0f; + if (x == 1.0f) return 1.0f; + // mtherr("betaincf", DOMAIN); + return nan; + } + + /* transformation for small aa */ + if (a <= 1.0f) { + ans = betainc_helper::incbsa(a + 1.0f, b, x); + t = a * numext::log(x) + b * numext::log1p(-x) + + lgamma_impl::run(a + b) - lgamma_impl::run(a + 1.0f) - + lgamma_impl::run(b); + return (ans + numext::exp(t)); + } else { + return betainc_helper::incbsa(a, b, x); + } + } +}; + +template <> +struct betainc_helper { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double incbps(double a, double b, double x) { + const double machep = cephes_helper::machep(); + + double s, t, u, v, n, t1, z, ai; + + ai = 1.0 / a; + u = (1.0 - b) * x; + v 
= u / (a + 1.0); + t1 = v; + t = u; + n = 2.0; + s = 0.0; + z = machep * ai; + while (numext::abs(v) > z) { + u = (n - b) * x / n; + t *= u; + v = t / (a + n); + s += v; + n += 1.0; + } + s += t1; + s += ai; + + u = a * numext::log(x); + // TODO: gamma() is not directly implemented in Eigen. + /* + if ((a + b) < maxgam && numext::abs(u) < maxlog) { + t = gamma(a + b) / (gamma(a) * gamma(b)); + s = s * t * pow(x, a); + } + */ + t = lgamma_impl::run(a + b) - lgamma_impl::run(a) - + lgamma_impl::run(b) + u + numext::log(s); + return s = numext::exp(t); + } +}; + +template <> +struct betainc_impl { + EIGEN_DEVICE_FUNC + static double run(double aa, double bb, double xx) { + const double nan = NumTraits::quiet_NaN(); + const double machep = cephes_helper::machep(); + // const double maxgam = 171.624376956302725; + + double a, b, t, x, xc, w, y; + bool reversed_a_b = false; + + if (aa <= 0.0 || bb <= 0.0) { + return nan; // goto domerr; + } + + if ((xx <= 0.0) || (xx >= 1.0)) { + if (xx == 0.0) return (0.0); + if (xx == 1.0) return (1.0); + // mtherr("incbet", DOMAIN); + return nan; + } + + if ((bb * xx) <= 1.0 && xx <= 0.95) { + return betainc_helper::incbps(aa, bb, xx); + } + + w = 1.0 - xx; + + /* Reverse a and b if x is greater than the mean. */ + if (xx > (aa / (aa + bb))) { + reversed_a_b = true; + a = bb; + b = aa; + xc = xx; + x = w; + } else { + a = aa; + b = bb; + xc = w; + x = xx; + } + + if (reversed_a_b && (b * x) <= 1.0 && x <= 0.95) { + t = betainc_helper::incbps(a, b, x); + if (t <= machep) { + t = 1.0 - machep; + } else { + t = 1.0 - t; + } + return t; + } + + /* Choose expansion for better convergence. */ + y = x * (a + b - 2.0) - (a - 1.0); + if (y < 0.0) { + w = incbeta_cfe::run(a, b, x, true /* small_branch */); + } else { + w = incbeta_cfe::run(a, b, x, false /* small_branch */) / xc; + } + + /* Multiply w by the factor + a b _ _ _ + x (1-x) | (a+b) / ( a | (a) | (b) ) . 
*/ + + y = a * numext::log(x); + t = b * numext::log(xc); + // TODO: gamma is not directly implemented in Eigen. + /* + if ((a + b) < maxgam && numext::abs(y) < maxlog && numext::abs(t) < maxlog) + { + t = pow(xc, b); + t *= pow(x, a); + t /= a; + t *= w; + t *= gamma(a + b) / (gamma(a) * gamma(b)); + } else { + */ + /* Resort to logarithms. */ + y += t + lgamma_impl::run(a + b) - lgamma_impl::run(a) - + lgamma_impl::run(b); + y += numext::log(w / a); + t = numext::exp(y); + + /* } */ + // done: + + if (reversed_a_b) { + if (t <= machep) { + t = 1.0 - machep; + } else { + t = 1.0 - t; + } + } + return t; + } +}; + +#endif // EIGEN_HAS_C99_MATH + +} // end namespace internal + +namespace numext { + +template +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(lgamma, Scalar) + lgamma(const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(lgamma, Scalar)::run(x); +} + +template +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(digamma, Scalar) + digamma(const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(digamma, Scalar)::run(x); +} + +template +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(zeta, Scalar) +zeta(const Scalar& x, const Scalar& q) { + return EIGEN_MATHFUNC_IMPL(zeta, Scalar)::run(x, q); +} + +template +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(polygamma, Scalar) +polygamma(const Scalar& n, const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(polygamma, Scalar)::run(n, x); +} + +template +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(erf, Scalar) + erf(const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(erf, Scalar)::run(x); +} + +template +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(erfc, Scalar) + erfc(const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(erfc, Scalar)::run(x); +} + +template +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(ndtri, Scalar) + ndtri(const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(ndtri, Scalar)::run(x); +} + +template +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igamma, Scalar) + igamma(const Scalar& a, const Scalar& x) { + return 
EIGEN_MATHFUNC_IMPL(igamma, Scalar)::run(a, x); +} + +template +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igamma_der_a, Scalar) + igamma_der_a(const Scalar& a, const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(igamma_der_a, Scalar)::run(a, x); +} + +template +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(gamma_sample_der_alpha, Scalar) + gamma_sample_der_alpha(const Scalar& a, const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(gamma_sample_der_alpha, Scalar)::run(a, x); +} + +template +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igammac, Scalar) + igammac(const Scalar& a, const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(igammac, Scalar)::run(a, x); +} + +template +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(betainc, Scalar) + betainc(const Scalar& a, const Scalar& b, const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(betainc, Scalar)::run(a, b, x); +} + +} // end namespace numext +} // end namespace Eigen + +#endif // EIGEN_SPECIAL_FUNCTIONS_H diff --git a/src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsPacketMath.h b/src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsPacketMath.h new file mode 100644 index 0000000..2bb0179 --- /dev/null +++ b/src/EigenUnsupported/src/SpecialFunctions/SpecialFunctionsPacketMath.h @@ -0,0 +1,79 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_SPECIALFUNCTIONS_PACKETMATH_H +#define EIGEN_SPECIALFUNCTIONS_PACKETMATH_H + +namespace Eigen { + +namespace internal { + +/** \internal \returns the ln(|gamma(\a a)|) (coeff-wise) */ +template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet plgamma(const Packet& a) { using numext::lgamma; return lgamma(a); } + +/** \internal \returns the derivative of lgamma, psi(\a a) (coeff-wise) */ +template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet pdigamma(const Packet& a) { using numext::digamma; return digamma(a); } + +/** \internal \returns the zeta function of two arguments (coeff-wise) */ +template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet pzeta(const Packet& x, const Packet& q) { using numext::zeta; return zeta(x, q); } + +/** \internal \returns the polygamma function (coeff-wise) */ +template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet ppolygamma(const Packet& n, const Packet& x) { using numext::polygamma; return polygamma(n, x); } + +/** \internal \returns the erf(\a a) (coeff-wise) */ +template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet perf(const Packet& a) { using numext::erf; return erf(a); } + +/** \internal \returns the erfc(\a a) (coeff-wise) */ +template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet perfc(const Packet& a) { using numext::erfc; return erfc(a); } + +/** \internal \returns the ndtri(\a a) (coeff-wise) */ +template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet pndtri(const Packet& a) { + typedef typename unpacket_traits::type ScalarType; + using internal::generic_ndtri; return generic_ndtri(a); +} + +/** \internal \returns the incomplete gamma function igamma(\a a, \a x) */ +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +Packet pigamma(const Packet& a, const Packet& x) { using numext::igamma; return igamma(a, x); } + +/** \internal \returns the derivative of the incomplete gamma function + * igamma_der_a(\a a, \a x) 
*/ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pigamma_der_a(const Packet& a, const Packet& x) { + using numext::igamma_der_a; return igamma_der_a(a, x); +} + +/** \internal \returns the derivative of a sample + * of a Gamma(alpha, 1) random variable with respect to the parameter alpha, + * gamma_sample_der_alpha(\a alpha, \a sample) */ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pgamma_sample_der_alpha(const Packet& alpha, const Packet& sample) { + using numext::gamma_sample_der_alpha; return gamma_sample_der_alpha(alpha, sample); +} + +/** \internal \returns the complementary incomplete gamma function igammac(\a a, \a x) */ +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +Packet pigammac(const Packet& a, const Packet& x) { using numext::igammac; return igammac(a, x); } + +/** \internal \returns the incomplete beta function betainc(\a a, \a b, \a x) */ +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +Packet pbetainc(const Packet& a, const Packet& b,const Packet& x) { using numext::betainc; return betainc(a, b, x); } + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_SPECIALFUNCTIONS_PACKETMATH_H diff --git a/src/EigenUnsupported/src/SpecialFunctions/arch/AVX/BesselFunctions.h b/src/EigenUnsupported/src/SpecialFunctions/arch/AVX/BesselFunctions.h new file mode 100644 index 0000000..2d76692 --- /dev/null +++ b/src/EigenUnsupported/src/SpecialFunctions/arch/AVX/BesselFunctions.h @@ -0,0 +1,46 @@ +#ifndef EIGEN_AVX_BESSELFUNCTIONS_H +#define EIGEN_AVX_BESSELFUNCTIONS_H + +namespace Eigen { +namespace internal { + +F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_i0) +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_i0) + +F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_i0e) +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_i0e) + +F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_i1) +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_i1) + +F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_i1e)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_i1e) + +F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_j0) +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_j0) + +F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_j1) +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_j1) + +F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_k0) +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_k0) + +F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_k0e) +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_k0e) + +F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_k1) +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_k1) + +F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_k1e) +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_k1e) + +F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_y0) +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_y0) + +F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_y1) +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_y1) + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_AVX_BESSELFUNCTIONS_H diff --git a/src/EigenUnsupported/src/SpecialFunctions/arch/AVX/SpecialFunctions.h b/src/EigenUnsupported/src/SpecialFunctions/arch/AVX/SpecialFunctions.h new file mode 100644 index 0000000..35e62a8 --- /dev/null +++ b/src/EigenUnsupported/src/SpecialFunctions/arch/AVX/SpecialFunctions.h @@ -0,0 +1,16 @@ +#ifndef EIGEN_AVX_SPECIALFUNCTIONS_H +#define EIGEN_AVX_SPECIALFUNCTIONS_H + +namespace Eigen { +namespace internal { + +F16_PACKET_FUNCTION(Packet8f, Packet8h, perf) +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, perf) + +F16_PACKET_FUNCTION(Packet8f, Packet8h, pndtri) +BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pndtri) + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_AVX_SPECIALFUNCTIONS_H diff --git a/src/EigenUnsupported/src/SpecialFunctions/arch/AVX512/BesselFunctions.h b/src/EigenUnsupported/src/SpecialFunctions/arch/AVX512/BesselFunctions.h new file mode 100644 index 0000000..7dd3c3e --- /dev/null +++
b/src/EigenUnsupported/src/SpecialFunctions/arch/AVX512/BesselFunctions.h @@ -0,0 +1,46 @@ +#ifndef EIGEN_AVX512_BESSELFUNCTIONS_H +#define EIGEN_AVX512_BESSELFUNCTIONS_H + +namespace Eigen { +namespace internal { + +F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_i0) +BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_i0) + +F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_i0e) +BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_i0e) + +F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_i1) +BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_i1) + +F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_i1e) +BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_i1e) + +F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_j0) +BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_j0) + +F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_j1) +BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_j1) + +F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_k0) +BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_k0) + +F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_k0e) +BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_k0e) + +F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_k1) +BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_k1) + +F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_k1e) +BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_k1e) + +F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_y0) +BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_y0) + +F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_y1) +BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_y1) + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_AVX512_BESSELFUNCTIONS_H diff --git a/src/EigenUnsupported/src/SpecialFunctions/arch/AVX512/SpecialFunctions.h b/src/EigenUnsupported/src/SpecialFunctions/arch/AVX512/SpecialFunctions.h new file mode 100644 index 0000000..79878f2 --- /dev/null +++ b/src/EigenUnsupported/src/SpecialFunctions/arch/AVX512/SpecialFunctions.h @@ -0,0 +1,16 @@ 
+#ifndef EIGEN_AVX512_SPECIALFUNCTIONS_H +#define EIGEN_AVX512_SPECIALFUNCTIONS_H + +namespace Eigen { +namespace internal { + +F16_PACKET_FUNCTION(Packet16f, Packet16h, perf) +BF16_PACKET_FUNCTION(Packet16f, Packet16bf, perf) + +F16_PACKET_FUNCTION(Packet16f, Packet16h, pndtri) +BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pndtri) + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_AVX512_SPECIALFUNCTIONS_H diff --git a/src/EigenUnsupported/src/SpecialFunctions/arch/GPU/SpecialFunctions.h b/src/EigenUnsupported/src/SpecialFunctions/arch/GPU/SpecialFunctions.h new file mode 100644 index 0000000..dd3bf4d --- /dev/null +++ b/src/EigenUnsupported/src/SpecialFunctions/arch/GPU/SpecialFunctions.h @@ -0,0 +1,369 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_GPU_SPECIALFUNCTIONS_H +#define EIGEN_GPU_SPECIALFUNCTIONS_H + +namespace Eigen { + +namespace internal { + +// Make sure this is only available when targeting a GPU: we don't want to +// introduce conflicts between these packet_traits definitions and the ones +// we'll use on the host side (SSE, AVX, ...)
+#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU) + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 plgamma(const float4& a) +{ + return make_float4(lgammaf(a.x), lgammaf(a.y), lgammaf(a.z), lgammaf(a.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 plgamma(const double2& a) +{ + using numext::lgamma; + return make_double2(lgamma(a.x), lgamma(a.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 pdigamma(const float4& a) +{ + using numext::digamma; + return make_float4(digamma(a.x), digamma(a.y), digamma(a.z), digamma(a.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 pdigamma(const double2& a) +{ + using numext::digamma; + return make_double2(digamma(a.x), digamma(a.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 pzeta(const float4& x, const float4& q) +{ + using numext::zeta; + return make_float4(zeta(x.x, q.x), zeta(x.y, q.y), zeta(x.z, q.z), zeta(x.w, q.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 pzeta(const double2& x, const double2& q) +{ + using numext::zeta; + return make_double2(zeta(x.x, q.x), zeta(x.y, q.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 ppolygamma(const float4& n, const float4& x) +{ + using numext::polygamma; + return make_float4(polygamma(n.x, x.x), polygamma(n.y, x.y), polygamma(n.z, x.z), polygamma(n.w, x.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 ppolygamma(const double2& n, const double2& x) +{ + using numext::polygamma; + return make_double2(polygamma(n.x, x.x), polygamma(n.y, x.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 perf(const float4& a) +{ + return make_float4(erff(a.x), erff(a.y), erff(a.z), erff(a.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 perf(const double2& a) +{ + using numext::erf; + return make_double2(erf(a.x), erf(a.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 perfc(const float4& a) +{ + 
using numext::erfc; + return make_float4(erfc(a.x), erfc(a.y), erfc(a.z), erfc(a.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 perfc(const double2& a) +{ + using numext::erfc; + return make_double2(erfc(a.x), erfc(a.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 pndtri(const float4& a) +{ + using numext::ndtri; + return make_float4(ndtri(a.x), ndtri(a.y), ndtri(a.z), ndtri(a.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 pndtri(const double2& a) +{ + using numext::ndtri; + return make_double2(ndtri(a.x), ndtri(a.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 pigamma(const float4& a, const float4& x) +{ + using numext::igamma; + return make_float4( + igamma(a.x, x.x), + igamma(a.y, x.y), + igamma(a.z, x.z), + igamma(a.w, x.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 pigamma(const double2& a, const double2& x) +{ + using numext::igamma; + return make_double2(igamma(a.x, x.x), igamma(a.y, x.y)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pigamma_der_a( + const float4& a, const float4& x) { + using numext::igamma_der_a; + return make_float4(igamma_der_a(a.x, x.x), igamma_der_a(a.y, x.y), + igamma_der_a(a.z, x.z), igamma_der_a(a.w, x.w)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 +pigamma_der_a(const double2& a, const double2& x) { + using numext::igamma_der_a; + return make_double2(igamma_der_a(a.x, x.x), igamma_der_a(a.y, x.y)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pgamma_sample_der_alpha( + const float4& alpha, const float4& sample) { + using numext::gamma_sample_der_alpha; + return make_float4( + gamma_sample_der_alpha(alpha.x, sample.x), + gamma_sample_der_alpha(alpha.y, sample.y), + gamma_sample_der_alpha(alpha.z, sample.z), + gamma_sample_der_alpha(alpha.w, sample.w)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 +pgamma_sample_der_alpha(const double2& alpha, 
const double2& sample) { + using numext::gamma_sample_der_alpha; + return make_double2( + gamma_sample_der_alpha(alpha.x, sample.x), + gamma_sample_der_alpha(alpha.y, sample.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 pigammac(const float4& a, const float4& x) +{ + using numext::igammac; + return make_float4( + igammac(a.x, x.x), + igammac(a.y, x.y), + igammac(a.z, x.z), + igammac(a.w, x.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 pigammac(const double2& a, const double2& x) +{ + using numext::igammac; + return make_double2(igammac(a.x, x.x), igammac(a.y, x.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 pbetainc(const float4& a, const float4& b, const float4& x) +{ + using numext::betainc; + return make_float4( + betainc(a.x, b.x, x.x), + betainc(a.y, b.y, x.y), + betainc(a.z, b.z, x.z), + betainc(a.w, b.w, x.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 pbetainc(const double2& a, const double2& b, const double2& x) +{ + using numext::betainc; + return make_double2(betainc(a.x, b.x, x.x), betainc(a.y, b.y, x.y)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_i0e(const float4& x) { + using numext::bessel_i0e; + return make_float4(bessel_i0e(x.x), bessel_i0e(x.y), bessel_i0e(x.z), bessel_i0e(x.w)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 +pbessel_i0e(const double2& x) { + using numext::bessel_i0e; + return make_double2(bessel_i0e(x.x), bessel_i0e(x.y)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_i0(const float4& x) { + using numext::bessel_i0; + return make_float4(bessel_i0(x.x), bessel_i0(x.y), bessel_i0(x.z), bessel_i0(x.w)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 +pbessel_i0(const double2& x) { + using numext::bessel_i0; + return make_double2(bessel_i0(x.x), bessel_i0(x.y)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_i1e(const float4& x) { 
+ using numext::bessel_i1e; + return make_float4(bessel_i1e(x.x), bessel_i1e(x.y), bessel_i1e(x.z), bessel_i1e(x.w)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 +pbessel_i1e(const double2& x) { + using numext::bessel_i1e; + return make_double2(bessel_i1e(x.x), bessel_i1e(x.y)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_i1(const float4& x) { + using numext::bessel_i1; + return make_float4(bessel_i1(x.x), bessel_i1(x.y), bessel_i1(x.z), bessel_i1(x.w)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 +pbessel_i1(const double2& x) { + using numext::bessel_i1; + return make_double2(bessel_i1(x.x), bessel_i1(x.y)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_k0e(const float4& x) { + using numext::bessel_k0e; + return make_float4(bessel_k0e(x.x), bessel_k0e(x.y), bessel_k0e(x.z), bessel_k0e(x.w)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 +pbessel_k0e(const double2& x) { + using numext::bessel_k0e; + return make_double2(bessel_k0e(x.x), bessel_k0e(x.y)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_k0(const float4& x) { + using numext::bessel_k0; + return make_float4(bessel_k0(x.x), bessel_k0(x.y), bessel_k0(x.z), bessel_k0(x.w)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 +pbessel_k0(const double2& x) { + using numext::bessel_k0; + return make_double2(bessel_k0(x.x), bessel_k0(x.y)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_k1e(const float4& x) { + using numext::bessel_k1e; + return make_float4(bessel_k1e(x.x), bessel_k1e(x.y), bessel_k1e(x.z), bessel_k1e(x.w)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 +pbessel_k1e(const double2& x) { + using numext::bessel_k1e; + return make_double2(bessel_k1e(x.x), bessel_k1e(x.y)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_k1(const float4& x) { + using numext::bessel_k1; + return 
make_float4(bessel_k1(x.x), bessel_k1(x.y), bessel_k1(x.z), bessel_k1(x.w)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 +pbessel_k1(const double2& x) { + using numext::bessel_k1; + return make_double2(bessel_k1(x.x), bessel_k1(x.y)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_j0(const float4& x) { + using numext::bessel_j0; + return make_float4(bessel_j0(x.x), bessel_j0(x.y), bessel_j0(x.z), bessel_j0(x.w)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 +pbessel_j0(const double2& x) { + using numext::bessel_j0; + return make_double2(bessel_j0(x.x), bessel_j0(x.y)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_j1(const float4& x) { + using numext::bessel_j1; + return make_float4(bessel_j1(x.x), bessel_j1(x.y), bessel_j1(x.z), bessel_j1(x.w)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 +pbessel_j1(const double2& x) { + using numext::bessel_j1; + return make_double2(bessel_j1(x.x), bessel_j1(x.y)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_y0(const float4& x) { + using numext::bessel_y0; + return make_float4(bessel_y0(x.x), bessel_y0(x.y), bessel_y0(x.z), bessel_y0(x.w)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 +pbessel_y0(const double2& x) { + using numext::bessel_y0; + return make_double2(bessel_y0(x.x), bessel_y0(x.y)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_y1(const float4& x) { + using numext::bessel_y1; + return make_float4(bessel_y1(x.x), bessel_y1(x.y), bessel_y1(x.z), bessel_y1(x.w)); +} + +template <> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 +pbessel_y1(const double2& x) { + using numext::bessel_y1; + return make_double2(bessel_y1(x.x), bessel_y1(x.y)); +} + +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_GPU_SPECIALFUNCTIONS_H diff --git a/src/EigenUnsupported/src/SpecialFunctions/arch/NEON/BesselFunctions.h 
b/src/EigenUnsupported/src/SpecialFunctions/arch/NEON/BesselFunctions.h new file mode 100644 index 0000000..67433b0 --- /dev/null +++ b/src/EigenUnsupported/src/SpecialFunctions/arch/NEON/BesselFunctions.h @@ -0,0 +1,54 @@ +#ifndef EIGEN_NEON_BESSELFUNCTIONS_H +#define EIGEN_NEON_BESSELFUNCTIONS_H + +namespace Eigen { +namespace internal { + +#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC + +#define NEON_HALF_TO_FLOAT_FUNCTIONS(METHOD) \ +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +Packet8hf METHOD(const Packet8hf& x) { \ + const Packet4f lo = METHOD(vcvt_f32_f16(vget_low_f16(x))); \ + const Packet4f hi = METHOD(vcvt_f32_f16(vget_high_f16(x))); \ + return vcombine_f16(vcvt_f16_f32(lo), vcvt_f16_f32(hi)); \ +} \ + \ +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +Packet4hf METHOD(const Packet4hf& x) { \ + return vcvt_f16_f32(METHOD(vcvt_f32_f16(x))); \ +} + +NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_i0) +NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_i0e) +NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_i1) +NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_i1e) +NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_j0) +NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_j1) +NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_k0) +NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_k0e) +NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_k1) +NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_k1e) +NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_y0) +NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_y1) + +#undef NEON_HALF_TO_FLOAT_FUNCTIONS +#endif + +BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_i0) +BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_i0e) +BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_i1) +BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_i1e) +BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_j0) +BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_j1) +BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_k0) +BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_k0e) +BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_k1) +BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_k1e) 
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_y0) +BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_y1) + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_NEON_BESSELFUNCTIONS_H diff --git a/src/EigenUnsupported/src/SpecialFunctions/arch/NEON/SpecialFunctions.h b/src/EigenUnsupported/src/SpecialFunctions/arch/NEON/SpecialFunctions.h new file mode 100644 index 0000000..ec92951 --- /dev/null +++ b/src/EigenUnsupported/src/SpecialFunctions/arch/NEON/SpecialFunctions.h @@ -0,0 +1,34 @@ +#ifndef EIGEN_NEON_SPECIALFUNCTIONS_H +#define EIGEN_NEON_SPECIALFUNCTIONS_H + +namespace Eigen { +namespace internal { + +#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC + +#define NEON_HALF_TO_FLOAT_FUNCTIONS(METHOD) \ +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +Packet8hf METHOD(const Packet8hf& x) { \ + const Packet4f lo = METHOD(vcvt_f32_f16(vget_low_f16(x))); \ + const Packet4f hi = METHOD(vcvt_f32_f16(vget_high_f16(x))); \ + return vcombine_f16(vcvt_f16_f32(lo), vcvt_f16_f32(hi)); \ +} \ + \ +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ +Packet4hf METHOD(const Packet4hf& x) { \ + return vcvt_f16_f32(METHOD(vcvt_f32_f16(x))); \ +} + +NEON_HALF_TO_FLOAT_FUNCTIONS(perf) +NEON_HALF_TO_FLOAT_FUNCTIONS(pndtri) + +#undef NEON_HALF_TO_FLOAT_FUNCTIONS +#endif + +BF16_PACKET_FUNCTION(Packet4f, Packet4bf, perf) +BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pndtri) + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_NEON_SPECIALFUNCTIONS_H diff --git a/src/EigenUnsupported/src/Splines/Spline.h b/src/EigenUnsupported/src/Splines/Spline.h new file mode 100644 index 0000000..79edd52 --- /dev/null +++ b/src/EigenUnsupported/src/Splines/Spline.h @@ -0,0 +1,507 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 20010-2011 Hauke Heibel +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SPLINE_H +#define EIGEN_SPLINE_H + +#include "SplineFwd.h" + +namespace Eigen +{ + /** + * \ingroup Splines_Module + * \class Spline + * \brief A class representing multi-dimensional spline curves. + * + * The class represents B-splines with non-uniform knot vectors. Each control + * point of the B-spline is associated with a basis function + * \f{align*} + * C(u) & = \sum_{i=0}^{n}N_{i,p}(u)P_i + * \f} + * + * \tparam _Scalar The underlying data type (typically float or double) + * \tparam _Dim The curve dimension (e.g. 2 or 3) + * \tparam _Degree Per default set to Dynamic; could be set to the actual desired + * degree for optimization purposes (would result in stack allocation + * of several temporary variables). + **/ + template + class Spline + { + public: + typedef _Scalar Scalar; /*!< The spline curve's scalar type. */ + enum { Dimension = _Dim /*!< The spline curve's dimension. */ }; + enum { Degree = _Degree /*!< The spline curve's degree. */ }; + + /** \brief The point type the spline is representing. */ + typedef typename SplineTraits::PointType PointType; + + /** \brief The data type used to store knot vectors. */ + typedef typename SplineTraits::KnotVectorType KnotVectorType; + + /** \brief The data type used to store parameter vectors. */ + typedef typename SplineTraits::ParameterVectorType ParameterVectorType; + + /** \brief The data type used to store non-zero basis functions. */ + typedef typename SplineTraits::BasisVectorType BasisVectorType; + + /** \brief The data type used to store the values of the basis function derivatives. */ + typedef typename SplineTraits::BasisDerivativeType BasisDerivativeType; + + /** \brief The data type representing the spline's control points. */ + typedef typename SplineTraits::ControlPointVectorType ControlPointVectorType; + + /** + * \brief Creates a (constant) zero spline. 
+ * For Splines with dynamic degree, the resulting degree will be 0. + **/ + Spline() + : m_knots(1, (Degree==Dynamic ? 2 : 2*Degree+2)) + , m_ctrls(ControlPointVectorType::Zero(Dimension,(Degree==Dynamic ? 1 : Degree+1))) + { + // in theory this code can go to the initializer list but it will get pretty + // much unreadable ... + enum { MinDegree = (Degree==Dynamic ? 0 : Degree) }; + m_knots.template segment(0) = Array::Zero(); + m_knots.template segment(MinDegree+1) = Array::Ones(); + } + + /** + * \brief Creates a spline from a knot vector and control points. + * \param knots The spline's knot vector. + * \param ctrls The spline's control point vector. + **/ + template + Spline(const OtherVectorType& knots, const OtherArrayType& ctrls) : m_knots(knots), m_ctrls(ctrls) {} + + /** + * \brief Copy constructor for splines. + * \param spline The input spline. + **/ + template + Spline(const Spline& spline) : + m_knots(spline.knots()), m_ctrls(spline.ctrls()) {} + + /** + * \brief Returns the knots of the underlying spline. + **/ + const KnotVectorType& knots() const { return m_knots; } + + /** + * \brief Returns the ctrls of the underlying spline. + **/ + const ControlPointVectorType& ctrls() const { return m_ctrls; } + + /** + * \brief Returns the spline value at a given site \f$u\f$. + * + * The function returns + * \f{align*} + * C(u) & = \sum_{i=0}^{n}N_{i,p}P_i + * \f} + * + * \param u Parameter \f$u \in [0;1]\f$ at which the spline is evaluated. + * \return The spline value at the given location \f$u\f$. + **/ + PointType operator()(Scalar u) const; + + /** + * \brief Evaluation of spline derivatives of up-to given order. + * + * The function returns + * \f{align*} + * \frac{d^i}{du^i}C(u) & = \sum_{i=0}^{n} \frac{d^i}{du^i} N_{i,p}(u)P_i + * \f} + * for i ranging between 0 and order. + * + * \param u Parameter \f$u \in [0;1]\f$ at which the spline derivative is evaluated. + * \param order The order up to which the derivatives are computed. 
+ **/ + typename SplineTraits::DerivativeType + derivatives(Scalar u, DenseIndex order) const; + + /** + * \copydoc Spline::derivatives + * Using the template version of this function is more efficient since + * temporary objects are allocated on the stack whenever this is possible. + **/ + template + typename SplineTraits::DerivativeType + derivatives(Scalar u, DenseIndex order = DerivativeOrder) const; + + /** + * \brief Computes the non-zero basis functions at the given site. + * + * Splines have local support and a point from their image is defined + * by exactly \f$p+1\f$ control points \f$P_i\f$ where \f$p\f$ is the + * spline degree. + * + * This function computes the \f$p+1\f$ non-zero basis function values + * for a given parameter value \f$u\f$. It returns + * \f{align*}{ + * N_{i,p}(u), \hdots, N_{i+p+1,p}(u) + * \f} + * + * \param u Parameter \f$u \in [0;1]\f$ at which the non-zero basis functions + * are computed. + **/ + typename SplineTraits::BasisVectorType + basisFunctions(Scalar u) const; + + /** + * \brief Computes the non-zero spline basis function derivatives up to given order. + * + * The function computes + * \f{align*}{ + * \frac{d^i}{du^i} N_{i,p}(u), \hdots, \frac{d^i}{du^i} N_{i+p+1,p}(u) + * \f} + * with i ranging from 0 up to the specified order. + * + * \param u Parameter \f$u \in [0;1]\f$ at which the non-zero basis function + * derivatives are computed. + * \param order The order up to which the basis function derivatives are computed. + **/ + typename SplineTraits::BasisDerivativeType + basisFunctionDerivatives(Scalar u, DenseIndex order) const; + + /** + * \copydoc Spline::basisFunctionDerivatives + * Using the template version of this function is more efficient since + * temporary objects are allocated on the stack whenever this is possible. + **/ + template + typename SplineTraits::BasisDerivativeType + basisFunctionDerivatives(Scalar u, DenseIndex order = DerivativeOrder) const; + + /** + * \brief Returns the spline degree.
+ **/ + DenseIndex degree() const; + + /** + * \brief Returns the span within the knot vector in which u is falling. + * \param u The site for which the span is determined. + **/ + DenseIndex span(Scalar u) const; + + /** + * \brief Computes the span within the provided knot vector in which u is falling. + **/ + static DenseIndex Span(typename SplineTraits::Scalar u, DenseIndex degree, const typename SplineTraits::KnotVectorType& knots); + + /** + * \brief Returns the spline's non-zero basis functions. + * + * The function computes and returns + * \f{align*}{ + * N_{i,p}(u), \hdots, N_{i+p+1,p}(u) + * \f} + * + * \param u The site at which the basis functions are computed. + * \param degree The degree of the underlying spline. + * \param knots The underlying spline's knot vector. + **/ + static BasisVectorType BasisFunctions(Scalar u, DenseIndex degree, const KnotVectorType& knots); + + /** + * \copydoc Spline::basisFunctionDerivatives + * \param degree The degree of the underlying spline + * \param knots The underlying spline's knot vector. + **/ + static BasisDerivativeType BasisFunctionDerivatives( + const Scalar u, const DenseIndex order, const DenseIndex degree, const KnotVectorType& knots); + + private: + KnotVectorType m_knots; /*!< Knot vector. */ + ControlPointVectorType m_ctrls; /*!< Control points. */ + + template + static void BasisFunctionDerivativesImpl( + const typename Spline<_Scalar, _Dim, _Degree>::Scalar u, + const DenseIndex order, + const DenseIndex p, + const typename Spline<_Scalar, _Dim, _Degree>::KnotVectorType& U, + DerivativeType& N_); + }; + + template + DenseIndex Spline<_Scalar, _Dim, _Degree>::Span( + typename SplineTraits< Spline<_Scalar, _Dim, _Degree> >::Scalar u, + DenseIndex degree, + const typename SplineTraits< Spline<_Scalar, _Dim, _Degree> >::KnotVectorType& knots) + { + // Piegl & Tiller, "The NURBS Book", A2.1 (p. 
68) + if (u <= knots(0)) return degree; + const Scalar* pos = std::upper_bound(knots.data()+degree-1, knots.data()+knots.size()-degree-1, u); + return static_cast( std::distance(knots.data(), pos) - 1 ); + } + + template + typename Spline<_Scalar, _Dim, _Degree>::BasisVectorType + Spline<_Scalar, _Dim, _Degree>::BasisFunctions( + typename Spline<_Scalar, _Dim, _Degree>::Scalar u, + DenseIndex degree, + const typename Spline<_Scalar, _Dim, _Degree>::KnotVectorType& knots) + { + const DenseIndex p = degree; + const DenseIndex i = Spline::Span(u, degree, knots); + + const KnotVectorType& U = knots; + + BasisVectorType left(p+1); left(0) = Scalar(0); + BasisVectorType right(p+1); right(0) = Scalar(0); + + VectorBlock(left,1,p) = u - VectorBlock(U,i+1-p,p).reverse(); + VectorBlock(right,1,p) = VectorBlock(U,i+1,p) - u; + + BasisVectorType N(1,p+1); + N(0) = Scalar(1); + for (DenseIndex j=1; j<=p; ++j) + { + Scalar saved = Scalar(0); + for (DenseIndex r=0; r + DenseIndex Spline<_Scalar, _Dim, _Degree>::degree() const + { + if (_Degree == Dynamic) + return m_knots.size() - m_ctrls.cols() - 1; + else + return _Degree; + } + + template + DenseIndex Spline<_Scalar, _Dim, _Degree>::span(Scalar u) const + { + return Spline::Span(u, degree(), knots()); + } + + template + typename Spline<_Scalar, _Dim, _Degree>::PointType Spline<_Scalar, _Dim, _Degree>::operator()(Scalar u) const + { + enum { Order = SplineTraits::OrderAtCompileTime }; + + const DenseIndex span = this->span(u); + const DenseIndex p = degree(); + const BasisVectorType basis_funcs = basisFunctions(u); + + const Replicate ctrl_weights(basis_funcs); + const Block ctrl_pts(ctrls(),0,span-p,Dimension,p+1); + return (ctrl_weights * ctrl_pts).rowwise().sum(); + } + + /* --------------------------------------------------------------------------------------------- */ + + template + void derivativesImpl(const SplineType& spline, typename SplineType::Scalar u, DenseIndex order, DerivativeType& der) + { + enum { Dimension = 
SplineTraits::Dimension }; + enum { Order = SplineTraits::OrderAtCompileTime }; + enum { DerivativeOrder = DerivativeType::ColsAtCompileTime }; + + typedef typename SplineTraits::ControlPointVectorType ControlPointVectorType; + typedef typename SplineTraits::BasisDerivativeType BasisDerivativeType; + typedef typename BasisDerivativeType::ConstRowXpr BasisDerivativeRowXpr; + + const DenseIndex p = spline.degree(); + const DenseIndex span = spline.span(u); + + const DenseIndex n = (std::min)(p, order); + + der.resize(Dimension,n+1); + + // Retrieve the basis function derivatives up to the desired order... + const BasisDerivativeType basis_func_ders = spline.template basisFunctionDerivatives(u, n+1); + + // ... and perform the linear combinations of the control points. + for (DenseIndex der_order=0; der_order ctrl_weights( basis_func_ders.row(der_order) ); + const Block ctrl_pts(spline.ctrls(),0,span-p,Dimension,p+1); + der.col(der_order) = (ctrl_weights * ctrl_pts).rowwise().sum(); + } + } + + template + typename SplineTraits< Spline<_Scalar, _Dim, _Degree> >::DerivativeType + Spline<_Scalar, _Dim, _Degree>::derivatives(Scalar u, DenseIndex order) const + { + typename SplineTraits< Spline >::DerivativeType res; + derivativesImpl(*this, u, order, res); + return res; + } + + template + template + typename SplineTraits< Spline<_Scalar, _Dim, _Degree>, DerivativeOrder >::DerivativeType + Spline<_Scalar, _Dim, _Degree>::derivatives(Scalar u, DenseIndex order) const + { + typename SplineTraits< Spline, DerivativeOrder >::DerivativeType res; + derivativesImpl(*this, u, order, res); + return res; + } + + template + typename SplineTraits< Spline<_Scalar, _Dim, _Degree> >::BasisVectorType + Spline<_Scalar, _Dim, _Degree>::basisFunctions(Scalar u) const + { + return Spline::BasisFunctions(u, degree(), knots()); + } + + /* --------------------------------------------------------------------------------------------- */ + + + template + template + void Spline<_Scalar, _Dim, 
_Degree>::BasisFunctionDerivativesImpl( + const typename Spline<_Scalar, _Dim, _Degree>::Scalar u, + const DenseIndex order, + const DenseIndex p, + const typename Spline<_Scalar, _Dim, _Degree>::KnotVectorType& U, + DerivativeType& N_) + { + typedef Spline<_Scalar, _Dim, _Degree> SplineType; + enum { Order = SplineTraits::OrderAtCompileTime }; + + const DenseIndex span = SplineType::Span(u, p, U); + + const DenseIndex n = (std::min)(p, order); + + N_.resize(n+1, p+1); + + BasisVectorType left = BasisVectorType::Zero(p+1); + BasisVectorType right = BasisVectorType::Zero(p+1); + + Matrix ndu(p+1,p+1); + + Scalar saved, temp; // FIXME These were double instead of Scalar. Was there a reason for that? + + ndu(0,0) = 1.0; + + DenseIndex j; + for (j=1; j<=p; ++j) + { + left[j] = u-U[span+1-j]; + right[j] = U[span+j]-u; + saved = 0.0; + + for (DenseIndex r=0; r(saved+right[r+1] * temp); + saved = left[j-r] * temp; + } + + ndu(j,j) = static_cast(saved); + } + + for (j = p; j>=0; --j) + N_(0,j) = ndu(j,p); + + // Compute the derivatives + DerivativeType a(n+1,p+1); + DenseIndex r=0; + for (; r<=p; ++r) + { + DenseIndex s1,s2; + s1 = 0; s2 = 1; // alternate rows in array a + a(0,0) = 1.0; + + // Compute the k-th derivative + for (DenseIndex k=1; k<=static_cast(n); ++k) + { + Scalar d = 0.0; + DenseIndex rk,pk,j1,j2; + rk = r-k; pk = p-k; + + if (r>=k) + { + a(s2,0) = a(s1,0)/ndu(pk+1,rk); + d = a(s2,0)*ndu(rk,pk); + } + + if (rk>=-1) j1 = 1; + else j1 = -rk; + + if (r-1 <= pk) j2 = k-1; + else j2 = p-r; + + for (j=j1; j<=j2; ++j) + { + a(s2,j) = (a(s1,j)-a(s1,j-1))/ndu(pk+1,rk+j); + d += a(s2,j)*ndu(rk+j,pk); + } + + if (r<=pk) + { + a(s2,k) = -a(s1,k-1)/ndu(pk+1,r); + d += a(s2,k)*ndu(r,pk); + } + + N_(k,r) = static_cast(d); + j = s1; s1 = s2; s2 = j; // Switch rows + } + } + + /* Multiply through by the correct factors */ + /* (Eq. 
[2.9]) */ + r = p; + for (DenseIndex k=1; k<=static_cast(n); ++k) + { + for (j=p; j>=0; --j) N_(k,j) *= r; + r *= p-k; + } + } + + template + typename SplineTraits< Spline<_Scalar, _Dim, _Degree> >::BasisDerivativeType + Spline<_Scalar, _Dim, _Degree>::basisFunctionDerivatives(Scalar u, DenseIndex order) const + { + typename SplineTraits >::BasisDerivativeType der; + BasisFunctionDerivativesImpl(u, order, degree(), knots(), der); + return der; + } + + template + template + typename SplineTraits< Spline<_Scalar, _Dim, _Degree>, DerivativeOrder >::BasisDerivativeType + Spline<_Scalar, _Dim, _Degree>::basisFunctionDerivatives(Scalar u, DenseIndex order) const + { + typename SplineTraits< Spline<_Scalar, _Dim, _Degree>, DerivativeOrder >::BasisDerivativeType der; + BasisFunctionDerivativesImpl(u, order, degree(), knots(), der); + return der; + } + + template + typename SplineTraits >::BasisDerivativeType + Spline<_Scalar, _Dim, _Degree>::BasisFunctionDerivatives( + const typename Spline<_Scalar, _Dim, _Degree>::Scalar u, + const DenseIndex order, + const DenseIndex degree, + const typename Spline<_Scalar, _Dim, _Degree>::KnotVectorType& knots) + { + typename SplineTraits::BasisDerivativeType der; + BasisFunctionDerivativesImpl(u, order, degree, knots, der); + return der; + } +} + +#endif // EIGEN_SPLINE_H diff --git a/src/EigenUnsupported/src/Splines/SplineFitting.h b/src/EigenUnsupported/src/Splines/SplineFitting.h new file mode 100644 index 0000000..9f6e8af --- /dev/null +++ b/src/EigenUnsupported/src/Splines/SplineFitting.h @@ -0,0 +1,431 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 20010-2011 Hauke Heibel +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_SPLINE_FITTING_H +#define EIGEN_SPLINE_FITTING_H + +#include +#include +#include +#include + +#include "SplineFwd.h" + +#include "../../../../Eigen/LU" +#include "../../../../Eigen/QR" + +namespace Eigen +{ + /** + * \brief Computes knot averages. + * \ingroup Splines_Module + * + * The knots are computed as + * \f{align*} + * u_0 & = \hdots = u_p = 0 \\ + * u_{m-p} & = \hdots = u_{m} = 1 \\ + * u_{j+p} & = \frac{1}{p}\sum_{i=j}^{j+p-1}\bar{u}_i \quad\quad j=1,\hdots,n-p + * \f} + * where \f$p\f$ is the degree and \f$m+1\f$ the number knots + * of the desired interpolating spline. + * + * \param[in] parameters The input parameters. During interpolation one for each data point. + * \param[in] degree The spline degree which is used during the interpolation. + * \param[out] knots The output knot vector. + * + * \sa Les Piegl and Wayne Tiller, The NURBS book (2nd ed.), 1997, 9.2.1 Global Curve Interpolation to Point Data + **/ + template + void KnotAveraging(const KnotVectorType& parameters, DenseIndex degree, KnotVectorType& knots) + { + knots.resize(parameters.size()+degree+1); + + for (DenseIndex j=1; j + void KnotAveragingWithDerivatives(const ParameterVectorType& parameters, + const unsigned int degree, + const IndexArray& derivativeIndices, + KnotVectorType& knots) + { + typedef typename ParameterVectorType::Scalar Scalar; + + DenseIndex numParameters = parameters.size(); + DenseIndex numDerivatives = derivativeIndices.size(); + + if (numDerivatives < 1) + { + KnotAveraging(parameters, degree, knots); + return; + } + + DenseIndex startIndex; + DenseIndex endIndex; + + DenseIndex numInternalDerivatives = numDerivatives; + + if (derivativeIndices[0] == 0) + { + startIndex = 0; + --numInternalDerivatives; + } + else + { + startIndex = 1; + } + if (derivativeIndices[numDerivatives - 1] == numParameters - 1) + { + endIndex = numParameters - degree; + --numInternalDerivatives; + } + else + { + endIndex = numParameters - degree - 1; + } + + // There are 
(endIndex - startIndex + 1) knots obtained from the averaging + // and 2 for the first and last parameters. + DenseIndex numAverageKnots = endIndex - startIndex + 3; + KnotVectorType averageKnots(numAverageKnots); + averageKnots[0] = parameters[0]; + + int newKnotIndex = 0; + for (DenseIndex i = startIndex; i <= endIndex; ++i) + averageKnots[++newKnotIndex] = parameters.segment(i, degree).mean(); + averageKnots[++newKnotIndex] = parameters[numParameters - 1]; + + newKnotIndex = -1; + + ParameterVectorType temporaryParameters(numParameters + 1); + KnotVectorType derivativeKnots(numInternalDerivatives); + for (DenseIndex i = 0; i < numAverageKnots - 1; ++i) + { + temporaryParameters[0] = averageKnots[i]; + ParameterVectorType parameterIndices(numParameters); + int temporaryParameterIndex = 1; + for (DenseIndex j = 0; j < numParameters; ++j) + { + Scalar parameter = parameters[j]; + if (parameter >= averageKnots[i] && parameter < averageKnots[i + 1]) + { + parameterIndices[temporaryParameterIndex] = j; + temporaryParameters[temporaryParameterIndex++] = parameter; + } + } + temporaryParameters[temporaryParameterIndex] = averageKnots[i + 1]; + + for (int j = 0; j <= temporaryParameterIndex - 2; ++j) + { + for (DenseIndex k = 0; k < derivativeIndices.size(); ++k) + { + if (parameterIndices[j + 1] == derivativeIndices[k] + && parameterIndices[j + 1] != 0 + && parameterIndices[j + 1] != numParameters - 1) + { + derivativeKnots[++newKnotIndex] = temporaryParameters.segment(j, 3).mean(); + break; + } + } + } + } + + KnotVectorType temporaryKnots(averageKnots.size() + derivativeKnots.size()); + + std::merge(averageKnots.data(), averageKnots.data() + averageKnots.size(), + derivativeKnots.data(), derivativeKnots.data() + derivativeKnots.size(), + temporaryKnots.data()); + + // Number of knots (one for each point and derivative) plus spline order. 
+ DenseIndex numKnots = numParameters + numDerivatives + degree + 1; + knots.resize(numKnots); + + knots.head(degree).fill(temporaryKnots[0]); + knots.tail(degree).fill(temporaryKnots.template tail<1>()[0]); + knots.segment(degree, temporaryKnots.size()) = temporaryKnots; + } + + /** + * \brief Computes chord length parameters which are required for spline interpolation. + * \ingroup Splines_Module + * + * \param[in] pts The data points to which a spline should be fit. + * \param[out] chord_lengths The resulting chord length vector. + * + * \sa Les Piegl and Wayne Tiller, The NURBS book (2nd ed.), 1997, 9.2.1 Global Curve Interpolation to Point Data + **/ + template + void ChordLengths(const PointArrayType& pts, KnotVectorType& chord_lengths) + { + typedef typename KnotVectorType::Scalar Scalar; + + const DenseIndex n = pts.cols(); + + // 1. compute the column-wise norms + chord_lengths.resize(pts.cols()); + chord_lengths[0] = 0; + chord_lengths.rightCols(n-1) = (pts.array().leftCols(n-1) - pts.array().rightCols(n-1)).matrix().colwise().norm(); + + // 2. compute the partial sums + std::partial_sum(chord_lengths.data(), chord_lengths.data()+n, chord_lengths.data()); + + // 3. normalize the data + chord_lengths /= chord_lengths(n-1); + chord_lengths(n-1) = Scalar(1); + } + + /** + * \brief Spline fitting methods. + * \ingroup Splines_Module + **/ + template + struct SplineFitting + { + typedef typename SplineType::KnotVectorType KnotVectorType; + typedef typename SplineType::ParameterVectorType ParameterVectorType; + + /** + * \brief Fits an interpolating Spline to the given data points. + * + * \param pts The points for which an interpolating spline will be computed. + * \param degree The degree of the interpolating spline. + * + * \returns A spline interpolating the initially provided points. + **/ + template + static SplineType Interpolate(const PointArrayType& pts, DenseIndex degree); + + /** + * \brief Fits an interpolating Spline to the given data points. 
+ * + * \param pts The points for which an interpolating spline will be computed. + * \param degree The degree of the interpolating spline. + * \param knot_parameters The knot parameters for the interpolation. + * + * \returns A spline interpolating the initially provided points. + **/ + template + static SplineType Interpolate(const PointArrayType& pts, DenseIndex degree, const KnotVectorType& knot_parameters); + + /** + * \brief Fits an interpolating spline to the given data points and + * derivatives. + * + * \param points The points for which an interpolating spline will be computed. + * \param derivatives The desired derivatives of the interpolating spline at interpolation + * points. + * \param derivativeIndices An array indicating which point each derivative belongs to. This + * must be the same size as @a derivatives. + * \param degree The degree of the interpolating spline. + * + * \returns A spline interpolating @a points with @a derivatives at those points. + * + * \sa Les A. Piegl, Khairan Rajab, Volha Smarodzinana. 2008. + * Curve interpolation with directional constraints for engineering design. + * Engineering with Computers + **/ + template + static SplineType InterpolateWithDerivatives(const PointArrayType& points, + const PointArrayType& derivatives, + const IndexArray& derivativeIndices, + const unsigned int degree); + + /** + * \brief Fits an interpolating spline to the given data points and derivatives. + * + * \param points The points for which an interpolating spline will be computed. + * \param derivatives The desired derivatives of the interpolating spline at interpolation points. + * \param derivativeIndices An array indicating which point each derivative belongs to. This + * must be the same size as @a derivatives. + * \param degree The degree of the interpolating spline. + * \param parameters The parameters corresponding to the interpolation points. + * + * \returns A spline interpolating @a points with @a derivatives at those points. 
+ * + * \sa Les A. Piegl, Khairan Rajab, Volha Smarodzinana. 2008. + * Curve interpolation with directional constraints for engineering design. + * Engineering with Computers + */ + template + static SplineType InterpolateWithDerivatives(const PointArrayType& points, + const PointArrayType& derivatives, + const IndexArray& derivativeIndices, + const unsigned int degree, + const ParameterVectorType& parameters); + }; + + template + template + SplineType SplineFitting::Interpolate(const PointArrayType& pts, DenseIndex degree, const KnotVectorType& knot_parameters) + { + typedef typename SplineType::KnotVectorType::Scalar Scalar; + typedef typename SplineType::ControlPointVectorType ControlPointVectorType; + + typedef Matrix MatrixType; + + KnotVectorType knots; + KnotAveraging(knot_parameters, degree, knots); + + DenseIndex n = pts.cols(); + MatrixType A = MatrixType::Zero(n,n); + for (DenseIndex i=1; i qr(A); + + // Here, we are creating a temporary due to an Eigen issue. + ControlPointVectorType ctrls = qr.solve(MatrixType(pts.transpose())).transpose(); + + return SplineType(knots, ctrls); + } + + template + template + SplineType SplineFitting::Interpolate(const PointArrayType& pts, DenseIndex degree) + { + KnotVectorType chord_lengths; // knot parameters + ChordLengths(pts, chord_lengths); + return Interpolate(pts, degree, chord_lengths); + } + + template + template + SplineType + SplineFitting::InterpolateWithDerivatives(const PointArrayType& points, + const PointArrayType& derivatives, + const IndexArray& derivativeIndices, + const unsigned int degree, + const ParameterVectorType& parameters) + { + typedef typename SplineType::KnotVectorType::Scalar Scalar; + typedef typename SplineType::ControlPointVectorType ControlPointVectorType; + + typedef Matrix MatrixType; + + const DenseIndex n = points.cols() + derivatives.cols(); + + KnotVectorType knots; + + KnotAveragingWithDerivatives(parameters, degree, derivativeIndices, knots); + + // fill matrix + MatrixType A 
= MatrixType::Zero(n, n); + + // Use these dimensions for quicker populating, then transpose for solving. + MatrixType b(points.rows(), n); + + DenseIndex startRow; + DenseIndex derivativeStart; + + // End derivatives. + if (derivativeIndices[0] == 0) + { + A.template block<1, 2>(1, 0) << -1, 1; + + Scalar y = (knots(degree + 1) - knots(0)) / degree; + b.col(1) = y*derivatives.col(0); + + startRow = 2; + derivativeStart = 1; + } + else + { + startRow = 1; + derivativeStart = 0; + } + if (derivativeIndices[derivatives.cols() - 1] == points.cols() - 1) + { + A.template block<1, 2>(n - 2, n - 2) << -1, 1; + + Scalar y = (knots(knots.size() - 1) - knots(knots.size() - (degree + 2))) / degree; + b.col(b.cols() - 2) = y*derivatives.col(derivatives.cols() - 1); + } + + DenseIndex row = startRow; + DenseIndex derivativeIndex = derivativeStart; + for (DenseIndex i = 1; i < parameters.size() - 1; ++i) + { + const DenseIndex span = SplineType::Span(parameters[i], degree, knots); + + if (derivativeIndex < derivativeIndices.size() && derivativeIndices[derivativeIndex] == i) + { + A.block(row, span - degree, 2, degree + 1) + = SplineType::BasisFunctionDerivatives(parameters[i], 1, degree, knots); + + b.col(row++) = points.col(i); + b.col(row++) = derivatives.col(derivativeIndex++); + } + else + { + A.row(row).segment(span - degree, degree + 1) + = SplineType::BasisFunctions(parameters[i], degree, knots); + b.col(row++) = points.col(i); + } + } + b.col(0) = points.col(0); + b.col(b.cols() - 1) = points.col(points.cols() - 1); + A(0,0) = 1; + A(n - 1, n - 1) = 1; + + // Solve + FullPivLU lu(A); + ControlPointVectorType controlPoints = lu.solve(MatrixType(b.transpose())).transpose(); + + SplineType spline(knots, controlPoints); + + return spline; + } + + template + template + SplineType + SplineFitting::InterpolateWithDerivatives(const PointArrayType& points, + const PointArrayType& derivatives, + const IndexArray& derivativeIndices, + const unsigned int degree) + { + 
ParameterVectorType parameters; + ChordLengths(points, parameters); + return InterpolateWithDerivatives(points, derivatives, derivativeIndices, degree, parameters); + } +} + +#endif // EIGEN_SPLINE_FITTING_H diff --git a/src/EigenUnsupported/src/Splines/SplineFwd.h b/src/EigenUnsupported/src/Splines/SplineFwd.h new file mode 100644 index 0000000..00d6b49 --- /dev/null +++ b/src/EigenUnsupported/src/Splines/SplineFwd.h @@ -0,0 +1,93 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 20010-2011 Hauke Heibel +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SPLINES_FWD_H +#define EIGEN_SPLINES_FWD_H + +#include "../../../../Eigen/Core" + +namespace Eigen +{ + template class Spline; + + template < typename SplineType, int DerivativeOrder = Dynamic > struct SplineTraits {}; + + /** + * \ingroup Splines_Module + * \brief Compile-time attributes of the Spline class for Dynamic degree. + **/ + template + struct SplineTraits< Spline<_Scalar, _Dim, _Degree>, Dynamic > + { + typedef _Scalar Scalar; /*!< The spline curve's scalar type. */ + enum { Dimension = _Dim /*!< The spline curve's dimension. */ }; + enum { Degree = _Degree /*!< The spline curve's degree. */ }; + + enum { OrderAtCompileTime = _Degree==Dynamic ? Dynamic : _Degree+1 /*!< The spline curve's order at compile-time. */ }; + enum { NumOfDerivativesAtCompileTime = OrderAtCompileTime /*!< The number of derivatives defined for the current spline. */ }; + + enum { DerivativeMemoryLayout = Dimension==1 ? RowMajor : ColMajor /*!< The derivative type's memory layout. */ }; + + /** \brief The data type used to store non-zero basis functions. */ + typedef Array BasisVectorType; + + /** \brief The data type used to store the values of the basis function derivatives. 
*/ + typedef Array BasisDerivativeType; + + /** \brief The data type used to store the spline's derivative values. */ + typedef Array DerivativeType; + + /** \brief The point type the spline is representing. */ + typedef Array PointType; + + /** \brief The data type used to store knot vectors. */ + typedef Array KnotVectorType; + + /** \brief The data type used to store parameter vectors. */ + typedef Array ParameterVectorType; + + /** \brief The data type representing the spline's control points. */ + typedef Array ControlPointVectorType; + }; + + /** + * \ingroup Splines_Module + * \brief Compile-time attributes of the Spline class for fixed degree. + * + * The traits class inherits all attributes from the SplineTraits of Dynamic degree. + **/ + template < typename _Scalar, int _Dim, int _Degree, int _DerivativeOrder > + struct SplineTraits< Spline<_Scalar, _Dim, _Degree>, _DerivativeOrder > : public SplineTraits< Spline<_Scalar, _Dim, _Degree> > + { + enum { OrderAtCompileTime = _Degree==Dynamic ? Dynamic : _Degree+1 /*!< The spline curve's order at compile-time. */ }; + enum { NumOfDerivativesAtCompileTime = _DerivativeOrder==Dynamic ? Dynamic : _DerivativeOrder+1 /*!< The number of derivatives defined for the current spline. */ }; + + enum { DerivativeMemoryLayout = _Dim==1 ? RowMajor : ColMajor /*!< The derivative type's memory layout. */ }; + + /** \brief The data type used to store the values of the basis function derivatives. */ + typedef Array<_Scalar,Dynamic,Dynamic,RowMajor,NumOfDerivativesAtCompileTime,OrderAtCompileTime> BasisDerivativeType; + + /** \brief The data type used to store the spline's derivative values. */ + typedef Array<_Scalar,_Dim,Dynamic,DerivativeMemoryLayout,_Dim,NumOfDerivativesAtCompileTime> DerivativeType; + }; + + /** \brief 2D float B-spline with dynamic degree. */ + typedef Spline Spline2f; + + /** \brief 3D float B-spline with dynamic degree. */ + typedef Spline Spline3f; + + /** \brief 2D double B-spline with dynamic degree. 
*/ + typedef Spline Spline2d; + + /** \brief 3D double B-spline with dynamic degree. */ + typedef Spline Spline3d; +} + +#endif // EIGEN_SPLINES_FWD_H -- cgit v1.2.1