path: root/src/EigenUnsupported/CXX11
Diffstat
-rw-r--r--  src/EigenUnsupported/CXX11/CMakeLists.txt  8
-rw-r--r--  src/EigenUnsupported/CXX11/Tensor  137
-rw-r--r--  src/EigenUnsupported/CXX11/TensorSymmetry  42
-rw-r--r--  src/EigenUnsupported/CXX11/ThreadPool  74
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/README.md  1815
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/Tensor.h  554
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorArgMax.h  329
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorAssign.h  247
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorBase.h  1176
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorBlock.h  1559
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorBroadcasting.h  1093
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorChipping.h  518
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorConcatenation.h  377
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorContraction.h  1023
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorContractionBlocking.h  73
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorContractionCuda.h  6
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorContractionGpu.h  1413
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorContractionMapper.h  575
-rwxr-xr-x  src/EigenUnsupported/CXX11/src/Tensor/TensorContractionSycl.h  1650
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorContractionThreadPool.h  1679
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorConversion.h  456
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorConvolution.h  1132
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorConvolutionSycl.h  544
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorCostModel.h  214
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorCustomOp.h  347
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorDevice.h  137
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceCuda.h  6
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceDefault.h  104
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceGpu.h  389
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceSycl.h  1048
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceThreadPool.h  409
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorDimensionList.h  236
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorDimensions.h  490
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorEvalTo.h  236
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorEvaluator.h  983
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorExecutor.h  703
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorExpr.h  388
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorFFT.h  669
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorFixedSize.h  379
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorForcedEval.h  237
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorForwardDeclarations.h  191
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorFunctors.h  488
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorGenerator.h  302
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorGlobalFunctions.h  33
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorGpuHipCudaDefines.h  99
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h  44
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorIO.h  79
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorImagePatch.h  603
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorIndexList.h  738
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorInflation.h  247
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorInitializer.h  82
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorIntDiv.h  263
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorLayoutSwap.h  216
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorMacros.h  98
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorMap.h  327
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorMeta.h  311
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorMorphing.h  1102
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorPadding.h  708
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorPatch.h  291
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorRandom.h  322
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorReduction.h  998
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorReductionCuda.h  6
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorReductionGpu.h  966
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorReductionSycl.h  582
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorRef.h  454
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorReverse.h  465
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorScan.h  528
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorScanSycl.h  513
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorShuffling.h  471
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorStorage.h  161
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorStriding.h  346
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorTrace.h  303
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorTraits.h  264
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorUInt128.h  249
-rw-r--r--  src/EigenUnsupported/CXX11/src/Tensor/TensorVolumePatch.h  629
-rw-r--r--  src/EigenUnsupported/CXX11/src/TensorSymmetry/DynamicSymmetry.h  293
-rw-r--r--  src/EigenUnsupported/CXX11/src/TensorSymmetry/StaticSymmetry.h  236
-rw-r--r--  src/EigenUnsupported/CXX11/src/TensorSymmetry/Symmetry.h  338
-rw-r--r--  src/EigenUnsupported/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h  669
-rw-r--r--  src/EigenUnsupported/CXX11/src/ThreadPool/Barrier.h  67
-rw-r--r--  src/EigenUnsupported/CXX11/src/ThreadPool/EventCount.h  249
-rw-r--r--  src/EigenUnsupported/CXX11/src/ThreadPool/NonBlockingThreadPool.h  486
-rw-r--r--  src/EigenUnsupported/CXX11/src/ThreadPool/RunQueue.h  236
-rw-r--r--  src/EigenUnsupported/CXX11/src/ThreadPool/ThreadCancel.h  23
-rw-r--r--  src/EigenUnsupported/CXX11/src/ThreadPool/ThreadEnvironment.h  40
-rw-r--r--  src/EigenUnsupported/CXX11/src/ThreadPool/ThreadLocal.h  301
-rw-r--r--  src/EigenUnsupported/CXX11/src/ThreadPool/ThreadPoolInterface.h  48
-rw-r--r--  src/EigenUnsupported/CXX11/src/ThreadPool/ThreadYield.h  20
-rw-r--r--  src/EigenUnsupported/CXX11/src/util/CXX11Meta.h  537
-rw-r--r--  src/EigenUnsupported/CXX11/src/util/CXX11Workarounds.h  88
-rw-r--r--  src/EigenUnsupported/CXX11/src/util/EmulateArray.h  261
-rw-r--r--  src/EigenUnsupported/CXX11/src/util/MaxSizeVector.h  158
92 files changed, 0 insertions, 40984 deletions
diff --git a/src/EigenUnsupported/CXX11/CMakeLists.txt b/src/EigenUnsupported/CXX11/CMakeLists.txt
deleted file mode 100644
index 385ed24..0000000
--- a/src/EigenUnsupported/CXX11/CMakeLists.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-set(Eigen_CXX11_HEADERS Tensor TensorSymmetry ThreadPool)
-
-install(FILES
- ${Eigen_CXX11_HEADERS}
- DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel
- )
-
-install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel FILES_MATCHING PATTERN "*.h")
diff --git a/src/EigenUnsupported/CXX11/Tensor b/src/EigenUnsupported/CXX11/Tensor
deleted file mode 100644
index 0938bb5..0000000
--- a/src/EigenUnsupported/CXX11/Tensor
+++ /dev/null
@@ -1,137 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-//#ifndef EIGEN_CXX11_TENSOR_MODULE
-//#define EIGEN_CXX11_TENSOR_MODULE
-
-#include "../../../Eigen/Core"
-
-#if EIGEN_HAS_CXX11
-
-#include "../SpecialFunctions"
-
-#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h"
-#include "src/util/CXX11Meta.h"
-#include "src/util/MaxSizeVector.h"
-
-/** \defgroup CXX11_Tensor_Module Tensor Module
- *
- * This module provides a Tensor class for storing arbitrarily indexed
- * objects.
- *
- * \code
- * #include <Eigen/CXX11/Tensor>
- * \endcode
- *
- * Much of the documentation can be found \ref eigen_tensors "here".
- */
-
-#include <atomic>
-#include <chrono>
-#include <cmath>
-#include <cstddef>
-#include <cstring>
-#include <random>
-#include <thread>
-
-#if defined(EIGEN_USE_THREADS) || defined(EIGEN_USE_SYCL)
-#include "ThreadPool"
-#endif
-
-#ifdef EIGEN_USE_GPU
- #include <iostream>
- #if defined(EIGEN_USE_HIP)
- #include <hip/hip_runtime.h>
- #else
- #include <cuda_runtime.h>
- #endif
-#endif
-
-#include "src/Tensor/TensorMacros.h"
-#include "src/Tensor/TensorForwardDeclarations.h"
-#include "src/Tensor/TensorMeta.h"
-#include "src/Tensor/TensorFunctors.h"
-#include "src/Tensor/TensorCostModel.h"
-#include "src/Tensor/TensorDeviceDefault.h"
-#include "src/Tensor/TensorDeviceThreadPool.h"
-#include "src/Tensor/TensorDeviceGpu.h"
-#ifndef gpu_assert
-#define gpu_assert(x)
-#endif
-#include "src/Tensor/TensorDeviceSycl.h"
-#include "src/Tensor/TensorIndexList.h"
-#include "src/Tensor/TensorDimensionList.h"
-#include "src/Tensor/TensorDimensions.h"
-#include "src/Tensor/TensorInitializer.h"
-#include "src/Tensor/TensorTraits.h"
-#include "src/Tensor/TensorRandom.h"
-#include "src/Tensor/TensorUInt128.h"
-#include "src/Tensor/TensorIntDiv.h"
-#include "src/Tensor/TensorGlobalFunctions.h"
-
-#include "src/Tensor/TensorBase.h"
-#include "src/Tensor/TensorBlock.h"
-
-#include "src/Tensor/TensorEvaluator.h"
-#include "src/Tensor/TensorExpr.h"
-#include "src/Tensor/TensorReduction.h"
-#include "src/Tensor/TensorReductionGpu.h"
-#include "src/Tensor/TensorArgMax.h"
-#include "src/Tensor/TensorConcatenation.h"
-#include "src/Tensor/TensorContractionMapper.h"
-#include "src/Tensor/TensorContractionBlocking.h"
-#include "src/Tensor/TensorContraction.h"
-#include "src/Tensor/TensorContractionThreadPool.h"
-#include "src/Tensor/TensorContractionGpu.h"
-#include "src/Tensor/TensorConversion.h"
-#include "src/Tensor/TensorConvolution.h"
-#include "src/Tensor/TensorFFT.h"
-#include "src/Tensor/TensorPatch.h"
-#include "src/Tensor/TensorImagePatch.h"
-#include "src/Tensor/TensorVolumePatch.h"
-#include "src/Tensor/TensorBroadcasting.h"
-#include "src/Tensor/TensorChipping.h"
-#include "src/Tensor/TensorInflation.h"
-#include "src/Tensor/TensorLayoutSwap.h"
-#include "src/Tensor/TensorMorphing.h"
-#include "src/Tensor/TensorPadding.h"
-#include "src/Tensor/TensorReverse.h"
-#include "src/Tensor/TensorShuffling.h"
-#include "src/Tensor/TensorStriding.h"
-#include "src/Tensor/TensorCustomOp.h"
-#include "src/Tensor/TensorEvalTo.h"
-#include "src/Tensor/TensorForcedEval.h"
-#include "src/Tensor/TensorGenerator.h"
-#include "src/Tensor/TensorAssign.h"
-#include "src/Tensor/TensorScan.h"
-#include "src/Tensor/TensorTrace.h"
-
-#ifdef EIGEN_USE_SYCL
-#include "src/Tensor/TensorReductionSycl.h"
-#include "src/Tensor/TensorConvolutionSycl.h"
-#include "src/Tensor/TensorContractionSycl.h"
-#include "src/Tensor/TensorScanSycl.h"
-#endif
-
-#include "src/Tensor/TensorExecutor.h"
-#include "src/Tensor/TensorDevice.h"
-
-#include "src/Tensor/TensorStorage.h"
-#include "src/Tensor/Tensor.h"
-#include "src/Tensor/TensorFixedSize.h"
-#include "src/Tensor/TensorMap.h"
-#include "src/Tensor/TensorRef.h"
-
-#include "src/Tensor/TensorIO.h"
-
-#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h"
-
-#endif // EIGEN_HAS_CXX11
-//#endif // EIGEN_CXX11_TENSOR_MODULE
diff --git a/src/EigenUnsupported/CXX11/TensorSymmetry b/src/EigenUnsupported/CXX11/TensorSymmetry
deleted file mode 100644
index b09c5e4..0000000
--- a/src/EigenUnsupported/CXX11/TensorSymmetry
+++ /dev/null
@@ -1,42 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSORSYMMETRY_MODULE
-#define EIGEN_CXX11_TENSORSYMMETRY_MODULE
-
-#include "Tensor"
-
-#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h"
-
-#include "src/util/CXX11Meta.h"
-
-/** \defgroup CXX11_TensorSymmetry_Module Tensor Symmetry Module
- *
- * This module provides classes that allow for the definition of
- * symmetries w.r.t. tensor indices.
- *
- * Including this module will implicitly include the Tensor module.
- *
- * \code
- * #include <Eigen/TensorSymmetry>
- * \endcode
- */
-
-#include "src/TensorSymmetry/util/TemplateGroupTheory.h"
-#include "src/TensorSymmetry/Symmetry.h"
-#include "src/TensorSymmetry/StaticSymmetry.h"
-#include "src/TensorSymmetry/DynamicSymmetry.h"
-
-#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h"
-
-#endif // EIGEN_CXX11_TENSORSYMMETRY_MODULE
-
-/*
- * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
- */
diff --git a/src/EigenUnsupported/CXX11/ThreadPool b/src/EigenUnsupported/CXX11/ThreadPool
deleted file mode 100644
index c5cafb2..0000000
--- a/src/EigenUnsupported/CXX11/ThreadPool
+++ /dev/null
@@ -1,74 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_THREADPOOL_MODULE
-#define EIGEN_CXX11_THREADPOOL_MODULE
-
-#include "../../../Eigen/Core"
-
-#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h"
-
-/** \defgroup CXX11_ThreadPool_Module C++11 ThreadPool Module
- *
- * This module provides two thread pool implementations:
- *  - a simple reference implementation
- *  - a faster non-blocking implementation
- *
- * This module requires C++11.
- *
- * \code
- * #include <Eigen/CXX11/ThreadPool>
- * \endcode
- */
-
-
-// The code depends on CXX11, so only include the module if the
-// compiler supports it.
-#if (EIGEN_COMP_CXXVER >= 11)
-#include <cstddef>
-#include <cstring>
-#include <time.h>
-
-#include <vector>
-#include <atomic>
-#include <condition_variable>
-#include <deque>
-#include <mutex>
-#include <thread>
-#include <functional>
-#include <memory>
-#include <utility>
-
-// There are non-parenthesized calls to "max" in the <unordered_map> header,
-// which trigger a check in test/main.h causing compilation to fail.
-// We work around the check here by removing the check for max in
-// the case where we have to emulate thread_local.
-#ifdef max
-#undef max
-#endif
-#include <unordered_map>
-
-#include "src/util/CXX11Meta.h"
-#include "src/util/MaxSizeVector.h"
-
-#include "src/ThreadPool/ThreadLocal.h"
-#include "src/ThreadPool/ThreadYield.h"
-#include "src/ThreadPool/ThreadCancel.h"
-#include "src/ThreadPool/EventCount.h"
-#include "src/ThreadPool/RunQueue.h"
-#include "src/ThreadPool/ThreadPoolInterface.h"
-#include "src/ThreadPool/ThreadEnvironment.h"
-#include "src/ThreadPool/Barrier.h"
-#include "src/ThreadPool/NonBlockingThreadPool.h"
-
-#endif
-
-#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h"
-
-#endif // EIGEN_CXX11_THREADPOOL_MODULE
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/README.md b/src/EigenUnsupported/CXX11/src/Tensor/README.md
deleted file mode 100644
index 2f65b1b..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/README.md
+++ /dev/null
@@ -1,1815 +0,0 @@
-# Eigen Tensors {#eigen_tensors}
-
-Tensors are multidimensional arrays of elements. Elements are typically scalars,
-but more complex types such as strings are also supported.
-
-## Tensor Classes
-
-You can manipulate a tensor with one of the following classes. They are all in
-the namespace `::Eigen`.
-
-
-### Class Tensor<data_type, rank>
-
-This is the class to use to create a tensor and allocate memory for it. The
-class is templatized with the tensor datatype, such as float or int, and the
-tensor rank. The rank is the number of dimensions, for example rank 2 is a
-matrix.
-
-Tensors of this class are resizable. For example, if you assign a tensor of a
-different size to a Tensor, that tensor is resized to match its new value.
-
-#### Constructor Tensor<data_type, rank>(size0, size1, ...)
-
-Constructor for a Tensor. The constructor must be passed `rank` integers
-indicating the sizes of the instance along each of the `rank`
-dimensions.
-
- // Create a tensor of rank 3 of sizes 2, 3, 4. This tensor owns
- // memory to hold 24 floating point values (24 = 2 x 3 x 4).
- Tensor<float, 3> t_3d(2, 3, 4);
-
- // Resize t_3d by assigning a tensor of different sizes, but same rank.
- t_3d = Tensor<float, 3>(3, 4, 3);
-
-#### Constructor Tensor<data_type, rank>(size_array)
-
-Constructor where the sizes are specified as an array of values instead of an
-explicit list of parameters. The array type to use is
-`Eigen::array<Eigen::Index>`. The array can be constructed automatically
-from an initializer list.
-
- // Create a tensor of strings of rank 2 with sizes 5, 7.
- Tensor<string, 2> t_2d({5, 7});
-
-
-### Class TensorFixedSize<data_type, Sizes<size0, size1, ...>>
-
-Class to use for tensors of fixed size, where the size is known at compile
-time. Fixed sized tensors can provide very fast computations because all their
-dimensions are known by the compiler. FixedSize tensors are not resizable.
-
-If the total number of elements in a fixed size tensor is small enough, the
-tensor data is held on the stack and does not require heap allocation and
-deallocation.
-
- // Create a 4 x 3 tensor of floats.
- TensorFixedSize<float, Sizes<4, 3>> t_4x3;
-
-### Class TensorMap<Tensor<data_type, rank>>
-
-This is the class to use to create a tensor on top of memory allocated and
-owned by another part of your code. It lets you view any piece of allocated
-memory as a Tensor. Instances of this class do not own the memory where the
-data are stored.
-
-A TensorMap is not resizable because it does not own the memory where its data
-are stored.
-
-#### Constructor TensorMap<Tensor<data_type, rank>>(data, size0, size1, ...)
-
-Constructor for a Tensor. The constructor must be passed a pointer to the
-storage for the data, and "rank" size attributes. The storage has to be
-large enough to hold all the data.
-
- // Map a tensor of ints on top of stack-allocated storage.
- int storage[128]; // 2 x 4 x 2 x 8 = 128
- TensorMap<Tensor<int, 4>> t_4d(storage, 2, 4, 2, 8);
-
- // The same storage can be viewed as a different tensor.
- // You can also pass the sizes as an array.
- TensorMap<Tensor<int, 2>> t_2d(storage, 16, 8);
-
- // You can also map fixed-size tensors. Here we get a 1d view of
- // the 2d fixed-size tensor.
- TensorFixedSize<float, Sizes<4, 3>> t_4x3;
- TensorMap<Tensor<float, 1>> t_12(t_4x3.data(), 12);
-
-
-#### Class TensorRef
-
-See Assigning to a TensorRef below.
-
-## Accessing Tensor Elements
-
-#### <data_type> tensor(index0, index1...)
-
-Return the element at position `(index0, index1...)` in tensor
-`tensor`. You must pass as many parameters as the rank of `tensor`.
-The expression can be used as an l-value to set the value of the element at the
-specified position. The value returned is of the datatype of the tensor.
-
- // Set the value of the element at position (0, 1, 0);
- Tensor<float, 3> t_3d(2, 3, 4);
- t_3d(0, 1, 0) = 12.0f;
-
- // Initialize all elements to random values.
- for (int i = 0; i < 2; ++i) {
- for (int j = 0; j < 3; ++j) {
- for (int k = 0; k < 4; ++k) {
- t_3d(i, j, k) = ...some random value...;
- }
- }
- }
-
- // Print elements of a tensor.
- for (int i = 0; i < 2; ++i) {
- LOG(INFO) << t_3d(i, 0, 0);
- }
-
-
-## TensorLayout
-
-The tensor library supports 2 layouts: `ColMajor` (the default) and
-`RowMajor`. Only the default column major layout is currently fully
-supported, and it is therefore not recommended to attempt to use the row major
-layout at the moment.
-
-The layout of a tensor is optionally specified as part of its type. If not
-explicitly specified, column major is assumed.
-
- Tensor<float, 3, ColMajor> col_major; // equivalent to Tensor<float, 3>
- TensorMap<Tensor<float, 3, RowMajor> > row_major(data, ...);
-
-All the arguments to an expression must use the same layout. Attempting to mix
-different layouts will result in a compilation error.
-
-It is possible to change the layout of a tensor or an expression using the
-`swap_layout()` method. Note that this will also reverse the order of the
-dimensions.
-
- Tensor<float, 2, ColMajor> col_major(2, 4);
- Tensor<float, 2, RowMajor> row_major(2, 4);
-
- Tensor<float, 2> col_major_result = col_major; // ok, layouts match
-    // Tensor<float, 2> col_major_result = row_major;  // will not compile: layouts differ
-
- // Simple layout swap
- col_major_result = row_major.swap_layout();
- eigen_assert(col_major_result.dimension(0) == 4);
- eigen_assert(col_major_result.dimension(1) == 2);
-
- // Swap the layout and preserve the order of the dimensions
-    array<int, 2> shuffle{1, 0};
- col_major_result = row_major.swap_layout().shuffle(shuffle);
- eigen_assert(col_major_result.dimension(0) == 2);
- eigen_assert(col_major_result.dimension(1) == 4);
-
-
-## Tensor Operations
-
-The Eigen Tensor library provides a vast library of operations on Tensors:
-numerical operations such as addition and multiplication, geometry operations
-such as slicing and shuffling, etc. These operations are available as methods
-of the Tensor classes, and in some cases as operator overloads. For example
-the following code computes the elementwise addition of two tensors:
-
- Tensor<float, 3> t1(2, 3, 4);
- ...set some values in t1...
- Tensor<float, 3> t2(2, 3, 4);
- ...set some values in t2...
- // Set t3 to the element wise sum of t1 and t2
- Tensor<float, 3> t3 = t1 + t2;
-
-While the code above looks easy enough, it is important to understand that the
-expression `t1 + t2` is not actually adding the values of the tensors. The
-expression instead constructs a "tensor operator" object of the class
-TensorCwiseBinaryOp<scalar_sum>, which has references to the tensors
-`t1` and `t2`. This is a small C++ object that knows how to add
-`t1` and `t2`. It is only when the value of the expression is assigned
-to the tensor `t3` that the addition is actually performed. Technically,
-this happens through the overloading of `operator=()` in the Tensor class.
-
-This mechanism for computing tensor expressions allows for lazy evaluation and
-optimizations which are what make the tensor library very fast.
-
-Of course, the tensor operators do nest, and the expression `t1 + t2 * 0.3f`
-is actually represented with the (approximate) tree of operators:
-
- TensorCwiseBinaryOp<scalar_sum>(t1, TensorCwiseUnaryOp<scalar_mul>(t2, 0.3f))
-
-
-### Tensor Operations and C++ "auto"
-
-Because Tensor operations create tensor operators, the C++ `auto` keyword
-does not have its intuitive meaning. Consider these 2 lines of code:
-
- Tensor<float, 3> t3 = t1 + t2;
- auto t4 = t1 + t2;
-
-In the first line we allocate the tensor `t3` and it will contain the
-result of the addition of `t1` and `t2`. In the second line, `t4`
-is actually the tree of tensor operators that will compute the addition of
-`t1` and `t2`. In fact, `t4` is *not* a tensor and you cannot get
-the values of its elements:
-
- Tensor<float, 3> t3 = t1 + t2;
- cout << t3(0, 0, 0); // OK prints the value of t1(0, 0, 0) + t2(0, 0, 0)
-
- auto t4 = t1 + t2;
- cout << t4(0, 0, 0); // Compilation error!
-
-When you use `auto` you do not get a Tensor as a result but instead a
-non-evaluated expression. So only use `auto` to delay evaluation.
-
-Unfortunately, there is no single underlying concrete type for holding
-non-evaluated expressions, hence you have to use `auto` when you do want to
-hold a non-evaluated expression.
-
-When you need the results of a set of tensor computations, you have to assign the
-result to a Tensor that will be capable of holding onto them. This can be
-either a normal Tensor, a fixed size Tensor, or a TensorMap on an existing
-piece of memory. All the following will work:
-
- auto t4 = t1 + t2;
-
- Tensor<float, 3> result = t4; // Could also be: result(t4);
- cout << result(0, 0, 0);
-
-    TensorMap<Tensor<float, 3>> result(<a float* with enough space>, <size0>, ...);
-    result = t4;
- cout << result(0, 0, 0);
-
- TensorFixedSize<float, Sizes<size0, ...>> result = t4;
- cout << result(0, 0, 0);
-
-Until you need the results, you can keep the operation around, and even reuse
-it for additional operations. As long as you keep the expression as an
-operation, no computation is performed.
-
- // One way to compute exp((t1 + t2) * 0.2f);
- auto t3 = t1 + t2;
- auto t4 = t3 * 0.2f;
- auto t5 = t4.exp();
- Tensor<float, 3> result = t5;
-
- // Another way, exactly as efficient as the previous one:
- Tensor<float, 3> result = ((t1 + t2) * 0.2f).exp();
-
-### Controlling When Expression are Evaluated
-
-There are several ways to control when expressions are evaluated:
-
-* Assignment to a Tensor, TensorFixedSize, or TensorMap.
-* Use of the eval() method.
-* Assignment to a TensorRef.
-
-#### Assigning to a Tensor, TensorFixedSize, or TensorMap.
-
-The most common way to evaluate an expression is to assign it to a Tensor. In
-the example below, the `auto` declarations make the intermediate values
-"Operations", not Tensors, and do not cause the expressions to be evaluated.
-The assignment to the Tensor `result` causes the evaluation of all the
-operations.
-
- auto t3 = t1 + t2; // t3 is an Operation.
- auto t4 = t3 * 0.2f; // t4 is an Operation.
- auto t5 = t4.exp(); // t5 is an Operation.
- Tensor<float, 3> result = t5; // The operations are evaluated.
-
-If you know the ranks and sizes of the Operation value you can assign the
-Operation to a TensorFixedSize instead of a Tensor, which is a bit more
-efficient.
-
- // We know that the result is a 4x4x2 tensor!
- TensorFixedSize<float, Sizes<4, 4, 2>> result = t5;
-
-Similarly, assigning an expression to a TensorMap causes its evaluation. Like
-tensors of type TensorFixedSize, TensorMaps cannot be resized, so they have to
-have the rank and sizes of the expression that is assigned to them.
-
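-For example (a minimal sketch, reusing the rank-3 tensors `t1` and `t2` from
-above), assigning to a TensorMap evaluates the expression directly into the
-mapped buffer:
-
-    float buffer[2 * 3 * 4];
-    TensorMap<Tensor<float, 3>> result(buffer, 2, 3, 4);
-    result = t1 + t2;  // evaluated straight into "buffer"
-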
-#### Calling eval().
-
-When you compute large composite expressions, you sometimes want to tell Eigen
-that an intermediate value in the expression tree is worth evaluating ahead of
-time. This is done by inserting a call to the `eval()` method of the
-expression Operation.
-
- // The previous example could have been written:
- Tensor<float, 3> result = ((t1 + t2) * 0.2f).exp();
-
- // If you want to compute (t1 + t2) once ahead of time you can write:
- Tensor<float, 3> result = ((t1 + t2).eval() * 0.2f).exp();
-
-Semantically, calling `eval()` is equivalent to materializing the value of
-the expression in a temporary Tensor of the right size. The code above in
-effect does:
-
- // .eval() knows the size!
- TensorFixedSize<float, Sizes<4, 4, 2>> tmp = t1 + t2;
- Tensor<float, 3> result = (tmp * 0.2f).exp();
-
-Note that the return value of `eval()` is itself an Operation, so the
-following code does not do what you may think:
-
- // Here t3 is an evaluation Operation. t3 has not been evaluated yet.
- auto t3 = (t1 + t2).eval();
-
- // You can use t3 in another expression. Still no evaluation.
- auto t4 = (t3 * 0.2f).exp();
-
- // The value is evaluated when you assign the Operation to a Tensor, using
-    // an intermediate tensor to represent t3.
- Tensor<float, 3> result = t4;
-
-While in the examples above calling `eval()` does not make a difference in
-performance, in other cases it can make a huge difference. In the expression
-below the `broadcast()` expression causes the `X.maximum()` expression
-to be evaluated many times:
-
- Tensor<...> X ...;
- Tensor<...> Y = ((X - X.maximum(depth_dim).reshape(dims2d).broadcast(bcast))
- * beta).exp();
-
-Inserting a call to `eval()` between the `maximum()` and
-`reshape()` calls guarantees that maximum() is only computed once and
-greatly speeds-up execution:
-
- Tensor<...> Y =
- ((X - X.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast))
- * beta).exp();
-
-In the other example below, the tensor `Y` is used both in the expression
-and in its assignment. This is an aliasing problem: if the evaluation is not
-done in the right order Y will be updated incrementally during the evaluation
-resulting in bogus results:
-
- Tensor<...> Y ...;
- Y = Y / (Y.sum(depth_dim).reshape(dims2d).broadcast(bcast));
-
-Inserting a call to `eval()` between the `sum()` and `reshape()`
-expressions ensures that the sum is computed before any updates to `Y` are
-done.
-
- Y = Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast));
-
-Note that an eval around the full right hand side expression is not needed
-because the generated code has to compute the i-th value of the right hand side
-before assigning it to the left hand side.
-
-However, if you were assigning the expression value to a shuffle of `Y`
-then you would need to force an eval for correctness by adding an `eval()`
-call for the right hand side:
-
- Y.shuffle(...) =
- (Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast))).eval();
-
-
-#### Assigning to a TensorRef.
-
-If you need to access only a few elements from the value of an expression you
-can avoid materializing the value in a full tensor by using a TensorRef.
-
-A TensorRef is a small wrapper class for any Eigen Operation. It provides
-overloads for the `()` operator that let you access individual values in
-the expression. TensorRef is convenient, because the Operations themselves do
-not provide a way to access individual elements.
-
- // Create a TensorRef for the expression. The expression is not
- // evaluated yet.
- TensorRef<Tensor<float, 3> > ref = ((t1 + t2) * 0.2f).exp();
-
- // Use "ref" to access individual elements. The expression is evaluated
- // on the fly.
- float at_0 = ref(0, 0, 0);
- cout << ref(0, 1, 0);
-
-Only use TensorRef when you need a subset of the values of the expression.
-TensorRef only computes the values you access. However note that if you are
-going to access all the values it will be much faster to materialize the
-results in a Tensor first.
-
-In some cases, if the full Tensor result would be very large, you may save
-memory by accessing it as a TensorRef. But not always. So don't count on it.
-
-
-### Controlling How Expressions Are Evaluated
-
-The tensor library provides several implementations of the various operations
-such as contractions and convolutions. The implementations are optimized for
-different environments: single threaded on CPU, multi threaded on CPU, or on a
-GPU using CUDA. Additional implementations may be added later.
-
-You can choose which implementation to use with the `device()` call. If
-you do not choose an implementation explicitly the default implementation that
-uses a single thread on the CPU is used.
-
-The default implementation has been optimized for recent Intel CPUs, taking
-advantage of SSE, AVX, and FMA instructions. Work is ongoing to tune the
-library on ARM CPUs. Note that you need to pass compiler-dependent flags
-to enable the use of SSE, AVX, and other instructions.
-
-For example, the following code adds two tensors using the default
-single-threaded CPU implementation:
-
- Tensor<float, 2> a(30, 40);
- Tensor<float, 2> b(30, 40);
- Tensor<float, 2> c = a + b;
-
-To choose a different implementation you have to insert a `device()` call
-before the assignment of the result. For technical C++ reasons this requires
-that the Tensor for the result be declared on its own. This means that you
-have to know the size of the result.
-
- Eigen::Tensor<float, 2> c(30, 40);
- c.device(...) = a + b;
-
-The call to `device()` must be the last call on the left of the operator=.
-
-You must pass to the `device()` call an Eigen device object. There are
-presently three devices you can use: DefaultDevice, ThreadPoolDevice and
-GpuDevice.
-
-
-#### Evaluating With the DefaultDevice
-
-This is exactly the same as not inserting a `device()` call.
-
- DefaultDevice my_device;
- c.device(my_device) = a + b;
-
-#### Evaluating with a Thread Pool
-
- // Create the Eigen ThreadPool
-    Eigen::ThreadPool pool(8 /* number of threads in pool */);
-
- // Create the Eigen ThreadPoolDevice.
- Eigen::ThreadPoolDevice my_device(&pool, 4 /* number of threads to use */);
-
- // Now just use the device when evaluating expressions.
- Eigen::Tensor<float, 2> c(30, 50);
- c.device(my_device) = a.contract(b, dot_product_dims);
-
-
-#### Evaluating On GPU
-
-This is presently a bit more complicated than just using a thread pool device.
-You need to create a GPU device, but you also need to explicitly allocate
-device memory for the tensors with CUDA.
-
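-A rough sketch of what this looks like, assuming `EIGEN_USE_GPU` is defined
-and using the `GpuStreamDevice` and `GpuDevice` types from TensorDeviceGpu.h:
-
-    Eigen::GpuStreamDevice stream;
-    Eigen::GpuDevice gpu_device(&stream);
-
-    // Allocate device memory through the device object.
-    std::size_t bytes = 30 * 40 * sizeof(float);
-    float* gpu_a = static_cast<float*>(gpu_device.allocate(bytes));
-    float* gpu_b = static_cast<float*>(gpu_device.allocate(bytes));
-    float* gpu_c = static_cast<float*>(gpu_device.allocate(bytes));
-
-    // View the device buffers as tensors and evaluate on the GPU.
-    Eigen::TensorMap<Eigen::Tensor<float, 2>> A(gpu_a, 30, 40);
-    Eigen::TensorMap<Eigen::Tensor<float, 2>> B(gpu_b, 30, 40);
-    Eigen::TensorMap<Eigen::Tensor<float, 2>> C(gpu_c, 30, 40);
-    C.device(gpu_device) = A + B;
-
-    gpu_device.deallocate(gpu_a);
-    gpu_device.deallocate(gpu_b);
-    gpu_device.deallocate(gpu_c);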
-
-## API Reference
-
-### Datatypes
-
-In the documentation of the tensor methods and Operations we mention datatypes
-that are tensor-type specific:
-
-#### <Tensor-Type>::Dimensions
-
-Acts like an array of ints. Has an `int size` attribute, and can be
-indexed like an array to access individual values. Used to represent the
-dimensions of a tensor. See `dimensions()`.
-
-#### <Tensor-Type>::Index
-
-Acts like an `int`. Used for indexing tensors along their dimensions. See
-`operator()`, `dimension()`, and `size()`.
-
-#### <Tensor-Type>::Scalar
-
-Represents the datatype of individual tensor elements. For example, for a
-`Tensor<float>`, `Scalar` is the type `float`. See
-`setConstant()`.
-
-#### <Operation>
-
-We use this pseudo type to indicate that a tensor Operation is returned by a
-method. We indicate in the text the type and dimensions of the tensor that the
-Operation returns after evaluation.
-
-The Operation will have to be evaluated, for example by assigning it to a
-tensor, before you can access the values of the resulting tensor. You can also
-access the values through a TensorRef.
-
-
-## Built-in Tensor Methods
-
-These are regular C++ methods that act on tensors immediately. They are not
-Operations which provide delayed evaluation of their results. Unless specified
-otherwise, all the methods listed below are available on all tensor classes:
-Tensor, TensorFixedSize, and TensorMap.
-
-## Metadata
-
-### int NumDimensions
-
-Constant value indicating the number of dimensions of a Tensor. This is also
-known as the tensor "rank".
-
- Eigen::Tensor<float, 2> a(3, 4);
- cout << "Dims " << a.NumDimensions;
- => Dims 2
-
-### Dimensions dimensions()
-
-Returns an array-like object representing the dimensions of the tensor.
-The actual type of the `dimensions()` result is `<Tensor-Type>::Dimensions`.
-
- Eigen::Tensor<float, 2> a(3, 4);
- const Eigen::Tensor<float, 2>::Dimensions& d = a.dimensions();
- cout << "Dim size: " << d.size << ", dim 0: " << d[0]
- << ", dim 1: " << d[1];
- => Dim size: 2, dim 0: 3, dim 1: 4
-
-If you use a C++11 compiler, you can use `auto` to simplify the code:
-
- const auto& d = a.dimensions();
- cout << "Dim size: " << d.size << ", dim 0: " << d[0]
- << ", dim 1: " << d[1];
- => Dim size: 2, dim 0: 3, dim 1: 4
-
-### Index dimension(Index n)
-
-Returns the n-th dimension of the tensor. The actual type of the
-`dimension()` result is `<Tensor-Type>::Index`, but you can
-always use it like an int.
-
- Eigen::Tensor<float, 2> a(3, 4);
- int dim1 = a.dimension(1);
- cout << "Dim 1: " << dim1;
- => Dim 1: 4
-
-### Index size()
-
-Returns the total number of elements in the tensor. This is the product of all
-the tensor dimensions. The actual type of the `size()` result is
-`<Tensor-Type>::Index`, but you can always use it like an int.
-
- Eigen::Tensor<float, 2> a(3, 4);
- cout << "Size: " << a.size();
- => Size: 12
-
-
-### Getting Dimensions From An Operation
-
-A few operations provide `dimensions()` directly,
-e.g. `TensorReslicingOp`. Most operations defer calculating dimensions
-until the operation is being evaluated. If you need access to the dimensions
-of a deferred operation, you can wrap it in a TensorRef (see Assigning to a
-TensorRef above), which provides `dimensions()` and `dimension()` as
-above.
-
-TensorRef can also wrap the plain Tensor types, so this is a useful idiom in
-templated contexts where the underlying object could be either a raw Tensor
-or some deferred operation (e.g. a slice of a Tensor). In this case, the
-template code can wrap the object in a TensorRef and reason about its
-dimensionality while remaining agnostic to the underlying type.
-
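-For example (a minimal sketch):
-
-    // Inspect the dimensions of a deferred operation without evaluating it.
-    Eigen::Tensor<float, 3> a(2, 3, 4);
-    TensorRef<Tensor<float, 3>> ref = a.exp();  // not evaluated yet
-    cout << ref.dimension(0) << "x" << ref.dimension(1) << "x" << ref.dimension(2);
-    => 2x3x4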
-
-## Constructors
-
-### Tensor
-
-Creates a tensor of the specified size. The number of arguments must be equal
-to the rank of the tensor. The content of the tensor is not initialized.
-
- Eigen::Tensor<float, 2> a(3, 4);
- cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl;
- => NumRows: 3 NumCols: 4
-
-### TensorFixedSize
-
-Creates a tensor of the specified size. The number of arguments in the Sizes<>
-template parameter determines the rank of the tensor. The content of the tensor
-is not initialized.
-
- Eigen::TensorFixedSize<float, Sizes<3, 4>> a;
- cout << "Rank: " << a.rank() << endl;
- => Rank: 2
- cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl;
- => NumRows: 3 NumCols: 4
-
-### TensorMap
-
-Creates a tensor mapping an existing array of data. The data must not be freed
-until the TensorMap is discarded, and the size of the data must be large enough
-to accommodate the coefficients of the tensor.
-
- float data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
- Eigen::TensorMap<Tensor<float, 2>> a(data, 3, 4);
- cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl;
- => NumRows: 3 NumCols: 4
- cout << "a(1, 2): " << a(1, 2) << endl;
- => a(1, 2): 7
-
-
-## Contents Initialization
-
-When a new Tensor or a new TensorFixedSize is created, memory is allocated to
-hold all the tensor elements, but the memory is not initialized. Similarly,
-when a new TensorMap is created on top of non-initialized memory, its
-contents are not initialized.
-
-You can use one of the methods below to initialize the tensor memory. These
-have an immediate effect on the tensor and return the tensor itself as a
-result. These are not tensor Operations which delay evaluation.
-
-### <Tensor-Type> setConstant(const Scalar& val)
-
-Sets all elements of the tensor to the constant value `val`. `Scalar`
-is the type of data stored in the tensor. You can pass any value that is
-convertible to that type.
-
-Returns the tensor itself in case you want to chain another call.
-
- a.setConstant(12.3f);
- cout << "Constant: " << endl << a << endl << endl;
- =>
- Constant:
- 12.3 12.3 12.3 12.3
- 12.3 12.3 12.3 12.3
- 12.3 12.3 12.3 12.3
-
-Note that `setConstant()` can be used on any tensor where the element type
-has a copy constructor and an `operator=()`:
-
- Eigen::Tensor<string, 2> a(2, 3);
- a.setConstant("yolo");
- cout << "String tensor: " << endl << a << endl << endl;
- =>
- String tensor:
- yolo yolo yolo
- yolo yolo yolo
-
-
-### <Tensor-Type> setZero()
-
-Fills the tensor with zeros. Equivalent to `setConstant(Scalar(0))`.
-Returns the tensor itself in case you want to chain another call.
-
- a.setZero();
- cout << "Zeros: " << endl << a << endl << endl;
- =>
- Zeros:
- 0 0 0 0
- 0 0 0 0
- 0 0 0 0
-
-
-### <Tensor-Type> setValues({..initializer_list})
-
-Fills the tensor with explicit values specified in a std::initializer_list.
-The type of the initializer list depends on the type and rank of the tensor.
-
-If the tensor has rank N, the initializer list must be nested N times. The
-most deeply nested lists must contain P scalars of the Tensor type where P is
-the size of the last dimension of the Tensor.
-
-For example, for a `TensorFixedSize<float, Sizes<2, 3>>` the initializer list
-must contain 2 lists of 3 floats each.
-
-`setValues()` returns the tensor itself in case you want to chain another
-call.
-
- Eigen::Tensor<float, 2> a(2, 3);
- a.setValues({{0.0f, 1.0f, 2.0f}, {3.0f, 4.0f, 5.0f}});
- cout << "a" << endl << a << endl << endl;
- =>
- a
- 0 1 2
- 3 4 5
-
-If a list is too short, the corresponding elements of the tensor will not be
-changed. This is valid at each level of nesting. For example the following
-code only sets the values of the first row of the tensor.
-
- Eigen::Tensor<int, 2> a(2, 3);
- a.setConstant(1000);
- a.setValues({{10, 20, 30}});
- cout << "a" << endl << a << endl << endl;
- =>
- a
- 10 20 30
- 1000 1000 1000
-
-### <Tensor-Type> setRandom()
-
-Fills the tensor with random values. Returns the tensor itself in case you
-want to chain another call.
-
- a.setRandom();
- cout << "Random: " << endl << a << endl << endl;
- =>
- Random:
- 0.680375 0.59688 -0.329554 0.10794
- -0.211234 0.823295 0.536459 -0.0452059
- 0.566198 -0.604897 -0.444451 0.257742
-
-You can customize `setRandom()` by providing your own random number
-generator as a template argument:
-
- a.setRandom<MyRandomGenerator>();
-
-Here, `MyRandomGenerator` must be a struct with the following member
-functions, where Scalar and Index are the same as `<Tensor-Type>::Scalar`
-and `<Tensor-Type>::Index`.
-
-See `struct UniformRandomGenerator` in TensorFunctors.h for an example.
-
- // Custom number generator for use with setRandom().
- struct MyRandomGenerator {
- // Default and copy constructors. Both are needed
- MyRandomGenerator() { }
- MyRandomGenerator(const MyRandomGenerator& ) { }
-
- // Return a random value to be used. "element_location" is the
- // location of the entry to set in the tensor, it can typically
- // be ignored.
- Scalar operator()(Eigen::DenseIndex element_location,
- Eigen::DenseIndex /*unused*/ = 0) const {
-        return <randomly generated value of type Scalar>;
- }
-
- // Same as above but generates several numbers at a time.
- typename internal::packet_traits<Scalar>::type packetOp(
- Eigen::DenseIndex packet_location, Eigen::DenseIndex /*unused*/ = 0) const {
- return <a packet of randomly generated values>;
- }
- };
-
-You can also use one of the 2 random number generators that are part of the
-tensor library:
-* UniformRandomGenerator
-* NormalRandomGenerator
-
-
-## Data Access
-
-The Tensor, TensorFixedSize, and TensorRef classes provide the following
-accessors to access the tensor coefficients:
-
- const Scalar& operator()(const array<Index, NumIndices>& indices)
- const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices)
- Scalar& operator()(const array<Index, NumIndices>& indices)
- Scalar& operator()(Index firstIndex, IndexTypes... otherIndices)
-
-The number of indices must be equal to the rank of the tensor. Moreover, these
-accessors are not available on tensor expressions. In order to access the
-values of a tensor expression, the expression must either be evaluated or
-wrapped in a TensorRef.
-
-
-### Scalar* data() and const Scalar* data() const
-
-Returns a pointer to the storage for the tensor. The pointer is const if the
-tensor was const. This allows direct access to the data. The layout of the
-data depends on the tensor layout: RowMajor or ColMajor.
-
-This access is usually only needed for special cases, for example when mixing
-Eigen Tensor code with other libraries.
-
-Scalar is the type of data stored in the tensor.
-
- Eigen::Tensor<float, 2> a(3, 4);
- float* a_data = a.data();
- a_data[0] = 123.45f;
- cout << "a(0, 0): " << a(0, 0);
- => a(0, 0): 123.45
-
-
-## Tensor Operations
-
-All the methods documented below return non-evaluated tensor `Operations`.
-These can be chained: you can apply another Tensor Operation to the value
-returned by the method.
-
-The chain of Operations is evaluated lazily, typically when it is assigned to a
-tensor. See "Controlling when Expression are Evaluated" for more details about
-their evaluation.
-
-### <Operation> constant(const Scalar& val)
-
-Returns a tensor of the same type and dimensions as the original tensor but
-where all elements have the value `val`.
-
-This is useful, for example, when you want to add or subtract a constant from a
-tensor, or multiply every element of a tensor by a scalar.
-
- Eigen::Tensor<float, 2> a(2, 3);
- a.setConstant(1.0f);
- Eigen::Tensor<float, 2> b = a + a.constant(2.0f);
- Eigen::Tensor<float, 2> c = b * b.constant(0.2f);
- cout << "a" << endl << a << endl << endl;
- cout << "b" << endl << b << endl << endl;
- cout << "c" << endl << c << endl << endl;
- =>
- a
- 1 1 1
- 1 1 1
-
- b
- 3 3 3
- 3 3 3
-
- c
- 0.6 0.6 0.6
- 0.6 0.6 0.6
-
-### <Operation> random()
-
-Returns a tensor of the same type and dimensions as the current tensor
-but where all elements have random values.
-
-This is for example useful to add random values to an existing tensor.
-The generation of random values can be customized in the same manner
-as for `setRandom()`.
-
- Eigen::Tensor<float, 2> a(2, 3);
- a.setConstant(1.0f);
- Eigen::Tensor<float, 2> b = a + a.random();
- cout << "a" << endl << a << endl << endl;
- cout << "b" << endl << b << endl << endl;
- =>
- a
- 1 1 1
- 1 1 1
-
- b
- 1.68038 1.5662 1.82329
- 0.788766 1.59688 0.395103
-
-
-## Unary Element Wise Operations
-
-All these operations take a single input tensor as argument and return a tensor
-of the same type and dimensions as the tensor to which they are applied. The
-requested operations are applied to each element independently.
-
-### <Operation> operator-()
-
-Returns a tensor of the same type and dimensions as the original tensor
-containing the opposite values of the original tensor.
-
- Eigen::Tensor<float, 2> a(2, 3);
- a.setConstant(1.0f);
- Eigen::Tensor<float, 2> b = -a;
- cout << "a" << endl << a << endl << endl;
- cout << "b" << endl << b << endl << endl;
- =>
- a
- 1 1 1
- 1 1 1
-
- b
- -1 -1 -1
- -1 -1 -1
-
-### <Operation> sqrt()
-
-Returns a tensor of the same type and dimensions as the original tensor
-containing the square roots of the original tensor.
-
-### <Operation> rsqrt()
-
-Returns a tensor of the same type and dimensions as the original tensor
-containing the inverse square roots of the original tensor.
-
-### <Operation> square()
-
-Returns a tensor of the same type and dimensions as the original tensor
-containing the squares of the original tensor values.
-
-### <Operation> inverse()
-
-Returns a tensor of the same type and dimensions as the original tensor
-containing the inverse of the original tensor values.
-
-### <Operation> exp()
-
-Returns a tensor of the same type and dimensions as the original tensor
-containing the exponential of the original tensor.
-
-### <Operation> log()
-
-Returns a tensor of the same type and dimensions as the original tensor
-containing the natural logarithms of the original tensor.
-
-### <Operation> abs()
-
-Returns a tensor of the same type and dimensions as the original tensor
-containing the absolute values of the original tensor.
-
-### <Operation> pow(Scalar exponent)
-
-Returns a tensor of the same type and dimensions as the original tensor
-containing the coefficients of the original tensor to the power of the
-exponent.
-
-The type of the exponent, Scalar, is always the same as the type of the
-tensor coefficients. For example, only integer exponents can be used in
-conjunction with tensors of integer values.
-
-You can use cast() to lift this restriction. For example, this computes
-cube roots of an int Tensor:
-
- Eigen::Tensor<int, 2> a(2, 3);
- a.setValues({{0, 1, 8}, {27, 64, 125}});
- Eigen::Tensor<double, 2> b = a.cast<double>().pow(1.0 / 3.0);
- cout << "a" << endl << a << endl << endl;
- cout << "b" << endl << b << endl << endl;
- =>
- a
- 0 1 8
- 27 64 125
-
- b
- 0 1 2
- 3 4 5
-
-### <Operation> operator * (Scalar scale)
-
-Multiplies all the coefficients of the input tensor by the provided scale.
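-
-For example, scaling every coefficient by 3:
-
-    Eigen::Tensor<float, 2> a(2, 3);
-    a.setConstant(2.0f);
-    Eigen::Tensor<float, 2> b = a * 3.0f;  // every coefficient is now 6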
-
-### <Operation> cwiseMax(Scalar threshold)
-TODO
-
-### <Operation> cwiseMin(Scalar threshold)
-TODO
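-
-As a rough sketch, both clip each coefficient against a scalar threshold:
-
-    Eigen::Tensor<float, 2> a(2, 3);
-    a.setValues({{-2.0f, 0.5f, 3.0f}, {1.0f, -1.0f, 2.0f}});
-    Eigen::Tensor<float, 2> clipped_low = a.cwiseMax(0.0f);   // negative values become 0
-    Eigen::Tensor<float, 2> clipped_high = a.cwiseMin(1.0f);  // values above 1 become 1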
-
-### <Operation> unaryExpr(const CustomUnaryOp& func)
-TODO
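-
-As a rough sketch (the `Clamp01` functor is purely illustrative),
-`unaryExpr()` applies a user-supplied functor to each coefficient:
-
-    // Hypothetical functor that clamps each coefficient to [0, 1].
-    struct Clamp01 {
-      float operator()(float x) const {
-        return x < 0.0f ? 0.0f : (x > 1.0f ? 1.0f : x);
-      }
-    };
-
-    Eigen::Tensor<float, 2> a(2, 3);
-    a.setRandom();
-    Eigen::Tensor<float, 2> b = a.unaryExpr(Clamp01());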
-
-
-## Binary Element Wise Operations
-
-These operations take two input tensors as arguments. The 2 input tensors should
-be of the same type and dimensions. The result is a tensor of the same
-dimensions as the tensors to which they are applied, and unless otherwise
-specified it is also of the same type. The requested operations are applied to
-each pair of elements independently.
-
-### <Operation> operator+(const OtherDerived& other)
-
-Returns a tensor of the same type and dimensions as the input tensors
-containing the coefficient wise sums of the inputs.
-
-### <Operation> operator-(const OtherDerived& other)
-
-Returns a tensor of the same type and dimensions as the input tensors
-containing the coefficient wise differences of the inputs.
-
-### <Operation> operator*(const OtherDerived& other)
-
-Returns a tensor of the same type and dimensions as the input tensors
-containing the coefficient wise products of the inputs.
-
-### <Operation> operator/(const OtherDerived& other)
-
-Returns a tensor of the same type and dimensions as the input tensors
-containing the coefficient wise quotients of the inputs.
-
-This operator is not supported for integer types.
-
-### <Operation> cwiseMax(const OtherDerived& other)
-
-Returns a tensor of the same type and dimensions as the input tensors
-containing the coefficient wise maximums of the inputs.
-
-### <Operation> cwiseMin(const OtherDerived& other)
-
-Returns a tensor of the same type and dimensions as the input tensors
-containing the coefficient wise minimums of the inputs.
-
-### <Operation> Logical operators
-
-The following logical operators are supported as well:
-
-* operator&&(const OtherDerived& other)
-* operator||(const OtherDerived& other)
-* operator<(const OtherDerived& other)
-* operator<=(const OtherDerived& other)
-* operator>(const OtherDerived& other)
-* operator>=(const OtherDerived& other)
-* operator==(const OtherDerived& other)
-* operator!=(const OtherDerived& other)
-
-They all return a tensor of boolean values.
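-
-For example, a comparison produces a boolean tensor that can then be used
-with `select()` (described below):
-
-    Eigen::Tensor<float, 2> a(2, 3), b(2, 3);
-    a.setRandom();
-    b.setRandom();
-    Eigen::Tensor<bool, 2> mask = a > b;  // true where a exceeds b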
-
-
-## Selection (select(const ThenDerived& thenTensor, const ElseDerived& elseTensor))
-
-Selection is a coefficient-wise ternary operator that is the tensor equivalent
-to the if-then-else operation.
-
-    Tensor<bool, 3> if_tensor = ...;
-    Tensor<float, 3> then_tensor = ...;
-    Tensor<float, 3> else_tensor = ...;
-    Tensor<float, 3> result = if_tensor.select(then_tensor, else_tensor);
-
-The 3 arguments must have the same dimensions, which will also be the
-dimensions of the result. The 'if' tensor must be of type boolean; the 'then'
-and the 'else' tensors must be of the same type, which will also be the type
-of the result.
-
-Each coefficient in the result is equal to the corresponding coefficient in the
-'then' tensor if the corresponding value in the 'if' tensor is true. If not, the
-resulting coefficient will come from the 'else' tensor.
-
-
-## Contraction
-
-Tensor *contractions* are a generalization of the matrix product to the
-multidimensional case.
-
- // Create 2 matrices using tensors of rank 2
- Eigen::Tensor<int, 2> a(2, 3);
- a.setValues({{1, 2, 3}, {6, 5, 4}});
- Eigen::Tensor<int, 2> b(3, 2);
- b.setValues({{1, 2}, {4, 5}, {5, 6}});
-
- // Compute the traditional matrix product
- Eigen::array<Eigen::IndexPair<int>, 1> product_dims = { Eigen::IndexPair<int>(1, 0) };
- Eigen::Tensor<int, 2> AB = a.contract(b, product_dims);
-
- // Compute the product of the transpose of the matrices
- Eigen::array<Eigen::IndexPair<int>, 1> transposed_product_dims = { Eigen::IndexPair<int>(0, 1) };
- Eigen::Tensor<int, 2> AtBt = a.contract(b, transposed_product_dims);
-
- // Contraction to scalar value using a double contraction.
-    // The first coordinates of both tensors are contracted, as are the second coordinates, i.e., this computes the sum of the squares of the elements.
- Eigen::array<Eigen::IndexPair<int>, 2> double_contraction_product_dims = { Eigen::IndexPair<int>(0, 0), Eigen::IndexPair<int>(1, 1) };
- Eigen::Tensor<int, 0> AdoubleContractedA = a.contract(a, double_contraction_product_dims);
-
- // Extracting the scalar value of the tensor contraction for further usage
- int value = AdoubleContractedA(0);
-
-## Reduction Operations
-
-A *Reduction* operation returns a tensor with fewer dimensions than the
-original tensor. The values in the returned tensor are computed by applying a
-*reduction operator* to slices of values from the original tensor. You specify
-the dimensions along which the slices are made.
-
-The Eigen Tensor library provides a set of predefined reduction operators such
-as `maximum()` and `sum()` and lets you define additional operators by
-implementing a few methods from a reductor template.
-
-### Reduction Dimensions
-
-All reduction operations take a single parameter of type
-`<TensorType>::Dimensions`, which can always be specified as an array of
-ints. These are called the "reduction dimensions." The values are the indices
-of the dimensions of the input tensor over which the reduction is done. The
-parameter can have at most as many elements as the rank of the input tensor;
-each element must be less than the tensor rank, as it indicates one of the
-dimensions to reduce.
-
-Each dimension of the input tensor should occur at most once in the reduction
-dimensions as the implementation does not remove duplicates.
-
-The order of the values in the reduction dimensions does not affect the
-results, but the code may execute faster if you list the dimensions in
-increasing order.
-
-Example: Reduction along one dimension.
-
- // Create a tensor of 2 dimensions
- Eigen::Tensor<int, 2> a(2, 3);
- a.setValues({{1, 2, 3}, {6, 5, 4}});
- // Reduce it along the second dimension (1)...
- Eigen::array<int, 1> dims({1 /* dimension to reduce */});
- // ...using the "maximum" operator.
- // The result is a tensor with one dimension. The size of
- // that dimension is the same as the first (non-reduced) dimension of a.
- Eigen::Tensor<int, 1> b = a.maximum(dims);
- cout << "a" << endl << a << endl << endl;
- cout << "b" << endl << b << endl << endl;
- =>
- a
- 1 2 3
- 6 5 4
-
- b
- 3
- 6
-
-Example: Reduction along two dimensions.
-
- Eigen::Tensor<float, 3, Eigen::ColMajor> a(2, 3, 4);
- a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f},
- {7.0f, 6.0f, 5.0f, 4.0f},
- {8.0f, 9.0f, 10.0f, 11.0f}},
- {{12.0f, 13.0f, 14.0f, 15.0f},
- {19.0f, 18.0f, 17.0f, 16.0f},
- {20.0f, 21.0f, 22.0f, 23.0f}}});
- // The tensor a has 3 dimensions. We reduce along the
- // first 2, resulting in a tensor with a single dimension
- // of size 4 (the last dimension of a.)
- // Note that we pass the array of reduction dimensions
- // directly to the maximum() call.
- Eigen::Tensor<float, 1, Eigen::ColMajor> b =
- a.maximum(Eigen::array<int, 2>({0, 1}));
- cout << "b" << endl << b << endl << endl;
- =>
- b
- 20
- 21
- 22
- 23
-
-#### Reduction along all dimensions
-
-As a special case, if you pass no parameter to a reduction operation the
-original tensor is reduced along *all* its dimensions. The result is a
-scalar, represented as a zero-dimension tensor.
-
- Eigen::Tensor<float, 3> a(2, 3, 4);
- a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f},
- {7.0f, 6.0f, 5.0f, 4.0f},
- {8.0f, 9.0f, 10.0f, 11.0f}},
- {{12.0f, 13.0f, 14.0f, 15.0f},
- {19.0f, 18.0f, 17.0f, 16.0f},
- {20.0f, 21.0f, 22.0f, 23.0f}}});
- // Reduce along all dimensions using the sum() operator.
- Eigen::Tensor<float, 0> b = a.sum();
- cout << "b" << endl << b << endl << endl;
- =>
- b
- 276
-
-
-### <Operation> sum(const Dimensions& new_dims)
-### <Operation> sum()
-
-Reduce a tensor using the sum() operator. The resulting values
-are the sum of the reduced values.
-
-### <Operation> mean(const Dimensions& new_dims)
-### <Operation> mean()
-
-Reduce a tensor using the mean() operator. The resulting values
-are the mean of the reduced values.
-
-### <Operation> maximum(const Dimensions& new_dims)
-### <Operation> maximum()
-
-Reduce a tensor using the maximum() operator. The resulting values are the
-largest of the reduced values.
-
-### <Operation> minimum(const Dimensions& new_dims)
-### <Operation> minimum()
-
-Reduce a tensor using the minimum() operator. The resulting values
-are the smallest of the reduced values.
-
-### <Operation> prod(const Dimensions& new_dims)
-### <Operation> prod()
-
-Reduce a tensor using the prod() operator. The resulting values
-are the product of the reduced values.
-
-### <Operation> all(const Dimensions& new_dims)
-### <Operation> all()
-Reduce a tensor using the all() operator. Casts tensor to bool and then checks
-whether all elements are true. Runs through all elements rather than
-short-circuiting, so it may be significantly less efficient.
-
-### <Operation> any(const Dimensions& new_dims)
-### <Operation> any()
-Reduce a tensor using the any() operator. Casts the tensor to bool and then
-checks whether any element is true. This visits every element rather than
-short-circuiting, so it may be significantly slower than a short-circuiting
-implementation.
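-
-A short sketch (illustrative values) covering both operators:
-
-    Eigen::Tensor<int, 2> a(2, 3);
-    a.setValues({{0, 1, 2}, {3, 4, 5}});
-    Eigen::Tensor<bool, 0> all_nonzero = a.all();  // false, since a(0, 0) == 0
-    Eigen::Tensor<bool, 0> any_nonzero = a.any();  // true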
-
-
-### <Operation> reduce(const Dimensions& new_dims, const Reducer& reducer)
-
-Reduce a tensor using a user-defined reduction operator. See `SumReducer`
-in TensorFunctors.h for information on how to implement a reduction operator.
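-
-As a rough sketch of the reducer interface (modeled on `SumReducer`; the name
-`MaxAbsReducer` is made up here, and the packet-access methods that a
-vectorized reducer would also provide are omitted):
-
-    template <typename T> struct MaxAbsReducer {
-      EIGEN_DEVICE_FUNC T initialize() const { return T(0); }
-      EIGEN_DEVICE_FUNC void reduce(const T t, T* accum) const {
-        *accum = Eigen::numext::maxi(*accum, Eigen::numext::abs(t));
-      }
-      EIGEN_DEVICE_FUNC T finalize(const T accum) const { return accum; }
-    };
-
-    // Reduce along dimension 0 with the custom reducer.
-    Eigen::Tensor<float, 2> a(2, 3);
-    a.setRandom();
-    Eigen::Tensor<float, 1> b =
-        a.reduce(Eigen::array<int, 1>({0}), MaxAbsReducer<float>());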
-
-
-## Trace
-
-A *Trace* operation returns a tensor with fewer dimensions than the original
-tensor. It returns a tensor whose elements are the sum of the elements of the
-original tensor along the main diagonal for a list of specified dimensions, the
-"trace dimensions". Similar to the `Reduction Dimensions`, the trace dimensions
-are passed as an input parameter to the operation, are of type `<TensorType>::``Dimensions`
-, and have the same requirements when passed as an input parameter. In addition,
-the trace dimensions must have the same size.
-
-Example: Trace along 2 dimensions.
-
- // Create a tensor of 3 dimensions
- Eigen::Tensor<int, 3> a(2, 2, 3);
- a.setValues({{{1, 2, 3}, {4, 5, 6}}, {{7, 8, 9}, {10, 11, 12}}});
- // Specify the dimensions along which the trace will be computed.
- // In this example, the trace can only be computed along the dimensions
- // with indices 0 and 1
- Eigen::array<int, 2> dims({0, 1});
- // The output tensor contains all but the trace dimensions.
- Tensor<int, 1> a_trace = a.trace(dims);
- cout << "a_trace:" << endl;
- cout << a_trace << endl;
- =>
- a_trace:
- 11
- 13
- 15
-
-
-### <Operation> trace(const Dimensions& new_dims)
-### <Operation> trace()
-
-As a special case, if no parameter is passed to the operation, trace is computed
-along *all* dimensions of the input tensor.
-
-Example: Trace along all dimensions.
-
- // Create a tensor of 3 dimensions, with all dimensions having the same size.
- Eigen::Tensor<int, 3> a(3, 3, 3);
- a.setValues({{{1, 2, 3}, {4, 5, 6}, {7, 8, 9}},
- {{10, 11, 12}, {13, 14, 15}, {16, 17, 18}},
- {{19, 20, 21}, {22, 23, 24}, {25, 26, 27}}});
- // Result is a zero dimension tensor
- Tensor<int, 0> a_trace = a.trace();
- cout<<"a_trace:"<<endl;
- cout<<a_trace<<endl;
- =>
- a_trace:
- 42
-
-
-## Scan Operations
-
-A *Scan* operation returns a tensor with the same dimensions as the original
-tensor. The operation performs an inclusive scan along the specified
-axis, which means it computes a running total along the axis for a given
-reduction operation.
-If the reduction operation corresponds to summation, then this computes the
-prefix sum of the tensor along the given axis.
-
-Example:
-
- // Create a tensor of 2 dimensions
- Eigen::Tensor<int, 2> a(2, 3);
- a.setValues({{1, 2, 3}, {4, 5, 6}});
- // Scan it along the second dimension (1) using summation
- Eigen::Tensor<int, 2> b = a.cumsum(1);
- // The result is a tensor with the same size as the input
- cout << "a" << endl << a << endl << endl;
- cout << "b" << endl << b << endl << endl;
- =>
- a
- 1 2 3
- 4 5 6
-
- b
- 1 3 6
- 4 9 15
-
-### <Operation> cumsum(const Index& axis)
-
-Perform a scan by summing consecutive entries.
-
-### <Operation> cumprod(const Index& axis)
-
-Perform a scan by multiplying consecutive entries.
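-
-A short sketch (illustrative values):
-
-    Eigen::Tensor<int, 2> a(2, 3);
-    a.setValues({{1, 2, 3}, {4, 5, 6}});
-    Eigen::Tensor<int, 2> b = a.cumprod(1);
-    // b is:
-    //   1  2   6
-    //   4 20 120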
-
-
-## Convolutions
-
-### <Operation> convolve(const Kernel& kernel, const Dimensions& dims)
-
-Returns a tensor that is the output of the convolution of the input tensor with
-the kernel, along the specified dimensions of the input tensor. For each
-dimension that takes part in the convolution, the output size is given by
-output_dim_size = input_dim_size - kernel_dim_size + 1 (which requires
-input_dim_size >= kernel_dim_size); for example, convolving a dimension of
-size 7 with a kernel of size 2 yields a dimension of size 6. Dimensions that
-do not take part in the convolution keep their sizes. Performance can depend
-on the strides of the input tensor dimensions along which the convolution is
-computed: the first dimension has the shortest stride for ColMajor, whereas
-for RowMajor the shortest stride belongs to the last dimension.
-
- // Compute convolution along the second and third dimension.
- Tensor<float, 4, DataLayout> input(3, 3, 7, 11);
- Tensor<float, 2, DataLayout> kernel(2, 2);
- Tensor<float, 4, DataLayout> output(3, 2, 6, 11);
- input.setRandom();
- kernel.setRandom();
-
- Eigen::array<ptrdiff_t, 2> dims({1, 2}); // Specify second and third dimension for convolution.
- output = input.convolve(kernel, dims);
-
- for (int i = 0; i < 3; ++i) {
- for (int j = 0; j < 2; ++j) {
- for (int k = 0; k < 6; ++k) {
- for (int l = 0; l < 11; ++l) {
- const float result = output(i,j,k,l);
- const float expected = input(i,j+0,k+0,l) * kernel(0,0) +
- input(i,j+1,k+0,l) * kernel(1,0) +
- input(i,j+0,k+1,l) * kernel(0,1) +
- input(i,j+1,k+1,l) * kernel(1,1);
- VERIFY_IS_APPROX(result, expected);
- }
- }
- }
- }
-
-
-## Geometrical Operations
-
-These operations return a Tensor with different dimensions than the original
-Tensor. They can be used to access slices of tensors, see them with different
-dimensions, or pad tensors with additional data.
-
-### <Operation> reshape(const Dimensions& new_dims)
-
-Returns a view of the input tensor that has been reshaped to the specified
-new dimensions. The argument new_dims is an array of Index values. The
-rank of the resulting tensor is equal to the number of elements in new_dims.
-
-The product of all the sizes in the new dimension array must be equal to
-the number of elements in the input tensor.
-
-    // Increase the rank of the input tensor by introducing a new dimension
-    // of size 1.
-    Tensor<float, 2> input(7, 11);
-    array<int, 3> three_dims{{7, 11, 1}};
-    Tensor<float, 3> expanded = input.reshape(three_dims);
-
-    // Decrease the rank of the input tensor by merging 2 dimensions.
-    array<int, 1> one_dim{{7 * 11}};
-    Tensor<float, 1> flattened = input.reshape(one_dim);
-
-This operation does not move any data in the input tensor, so the resulting
-contents of a reshaped Tensor depend on the data layout of the original Tensor.
-
-For example this is what happens when you `reshape()` a 2D ColMajor tensor
-to one dimension:
-
- Eigen::Tensor<float, 2, Eigen::ColMajor> a(2, 3);
- a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}});
- Eigen::array<Eigen::DenseIndex, 1> one_dim({3 * 2});
- Eigen::Tensor<float, 1, Eigen::ColMajor> b = a.reshape(one_dim);
- cout << "b" << endl << b << endl;
- =>
- b
- 0
- 300
- 100
- 400
- 200
- 500
-
-This is what happens when the 2D Tensor is RowMajor:
-
- Eigen::Tensor<float, 2, Eigen::RowMajor> a(2, 3);
- a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}});
- Eigen::array<Eigen::DenseIndex, 1> one_dim({3 * 2});
- Eigen::Tensor<float, 1, Eigen::RowMajor> b = a.reshape(one_dim);
- cout << "b" << endl << b << endl;
- =>
- b
- 0
- 100
- 200
- 300
- 400
- 500
-
-The reshape operation is an lvalue. In other words, it can be used on the left
-side of the assignment operator.
-
-The previous example can be rewritten as follows:
-
- Eigen::Tensor<float, 2, Eigen::ColMajor> a(2, 3);
- a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}});
- Eigen::array<Eigen::DenseIndex, 2> two_dim({2, 3});
- Eigen::Tensor<float, 1, Eigen::ColMajor> b(6);
- b.reshape(two_dim) = a;
- cout << "b" << endl << b << endl;
- =>
- b
- 0
- 300
- 100
- 400
- 200
- 500
-
-Note that "b" itself was not reshaped but that instead the assignment is done to
-the reshape view of b.
-
-
-### <Operation> shuffle(const Shuffle& shuffle)
-
-Returns a copy of the input tensor whose dimensions have been
-reordered according to the specified permutation. The argument shuffle
-is an array of Index values. Its size is the rank of the input
-tensor. It must contain a permutation of 0, 1, ..., rank - 1. The i-th
-dimension of the output tensor equals to the size of the shuffle[i]-th
-dimension of the input tensor. For example:
-
- // Shuffle all dimensions to the left by 1.
- Tensor<float, 3> input(20, 30, 50);
- // ... set some values in input.
- Tensor<float, 3> output = input.shuffle({1, 2, 0})
-
- eigen_assert(output.dimension(0) == 30);
- eigen_assert(output.dimension(1) == 50);
- eigen_assert(output.dimension(2) == 20);
-
-Indices into the output tensor are shuffled accordingly to formulate
-indices into the input tensor. For example, one can assert in the above
-code snippet that:
-
- eigen_assert(output(3, 7, 11) == input(11, 3, 7));
-
-In general, one can assert that
-
- eigen_assert(output(..., indices[shuffle[i]], ...) ==
- input(..., indices[i], ...))
-
-The shuffle operation results in an lvalue, which means that it can be used on
-the left side of the assignment operator.
-
-Let's rewrite the previous example to take advantage of this feature:
-
- // Shuffle all dimensions to the left by 1.
- Tensor<float, 3> input(20, 30, 50);
- // ... set some values in input.
- Tensor<float, 3> output(30, 50, 20);
- output.shuffle({2, 0, 1}) = input;
-
-
-### <Operation> stride(const Strides& strides)
-
-Returns a view of the input tensor that strides (skips stride-1
-elements) along each of the dimensions. The argument strides is an
-array of Index values. The dimensions of the resulting tensor are
-ceil(input_dimensions[i] / strides[i]).
-
-For example this is what happens when you `stride()` a 2D tensor:
-
- Eigen::Tensor<int, 2> a(4, 3);
- a.setValues({{0, 100, 200}, {300, 400, 500}, {600, 700, 800}, {900, 1000, 1100}});
- Eigen::array<Eigen::DenseIndex, 2> strides({3, 2});
- Eigen::Tensor<int, 2> b = a.stride(strides);
- cout << "b" << endl << b << endl;
- =>
- b
- 0 200
- 900 1100
-
-It is possible to assign a tensor to a stride:
-
- Tensor<float, 3> input(20, 30, 50);
- // ... set some values in input.
- Tensor<float, 3> output(40, 90, 200);
- output.stride({2, 3, 4}) = input;
-
-
-### <Operation> slice(const StartIndices& offsets, const Sizes& extents)
-
-Returns a sub-tensor of the given tensor. For each dimension i, the slice is
-made of the extents[i] coefficients stored between offsets[i] and
-offsets[i] + extents[i] - 1 (inclusive) in the input tensor.
-
- Eigen::Tensor<int, 2> a(4, 3);
- a.setValues({{0, 100, 200}, {300, 400, 500},
- {600, 700, 800}, {900, 1000, 1100}});
- Eigen::array<int, 2> offsets = {1, 0};
- Eigen::array<int, 2> extents = {2, 2};
- Eigen::Tensor<int, 1> slice = a.slice(offsets, extents);
- cout << "a" << endl << a << endl;
- =>
- a
- 0 100 200
- 300 400 500
- 600 700 800
- 900 1000 1100
- cout << "slice" << endl << slice << endl;
- =>
- slice
- 300 400
- 600 700
-
-
-### <Operation> chip(const Index offset, const Index dim)
-
-A chip is a special kind of slice. It is the subtensor at the given offset in
-the dimension dim. The returned tensor has one fewer dimension than the input
-tensor: the dimension dim is removed.
-
-For example, a matrix chip would be either a row or a column of the input
-matrix.
-
- Eigen::Tensor<int, 2> a(4, 3);
- a.setValues({{0, 100, 200}, {300, 400, 500},
- {600, 700, 800}, {900, 1000, 1100}});
- Eigen::Tensor<int, 1> row_3 = a.chip(2, 0);
- Eigen::Tensor<int, 1> col_2 = a.chip(1, 1);
- cout << "a" << endl << a << endl;
- =>
- a
- 0 100 200
- 300 400 500
- 600 700 800
- 900 1000 1100
- cout << "row_3" << endl << row_3 << endl;
- =>
- row_3
- 600 700 800
- cout << "col_2" << endl << col_2 << endl;
- =>
- col_2
- 100 400 700 1000
-
-It is possible to assign values to a tensor chip since the chip operation is a
-lvalue. For example:
-
- Eigen::Tensor<int, 1> a(3);
-    a.setValues({100, 200, 300});
- Eigen::Tensor<int, 2> b(2, 3);
- b.setZero();
- b.chip(0, 0) = a;
- cout << "a" << endl << a << endl;
- =>
- a
- 100
- 200
- 300
- cout << "b" << endl << b << endl;
- =>
- b
- 100 200 300
- 0 0 0
-
-
-### <Operation> reverse(const ReverseDimensions& reverse)
-
-Returns a view of the input tensor that reverses the order of the coefficients
-along a subset of the dimensions. The argument reverse is an array of boolean
-values that indicates whether or not the order of the coefficients should be
-reversed along each of the dimensions. This operation preserves the dimensions
-of the input tensor.
-
-For example this is what happens when you `reverse()` the first dimension
-of a 2D tensor:
-
- Eigen::Tensor<int, 2> a(4, 3);
- a.setValues({{0, 100, 200}, {300, 400, 500},
- {600, 700, 800}, {900, 1000, 1100}});
- Eigen::array<bool, 2> reverse({true, false});
- Eigen::Tensor<int, 2> b = a.reverse(reverse);
- cout << "a" << endl << a << endl << "b" << endl << b << endl;
- =>
- a
- 0 100 200
- 300 400 500
- 600 700 800
- 900 1000 1100
- b
- 900 1000 1100
- 600 700 800
- 300 400 500
- 0 100 200
-
-
-### <Operation> broadcast(const Broadcast& broadcast)
-
-Returns a view of the input tensor in which the input is replicated one to many
-times.
-The broadcast argument specifies how many copies of the input tensor need to be
-made in each of the dimensions.
-
- Eigen::Tensor<int, 2> a(2, 3);
- a.setValues({{0, 100, 200}, {300, 400, 500}});
- Eigen::array<int, 2> bcast({3, 2});
- Eigen::Tensor<int, 2> b = a.broadcast(bcast);
- cout << "a" << endl << a << endl << "b" << endl << b << endl;
- =>
- a
- 0 100 200
- 300 400 500
- b
- 0 100 200 0 100 200
- 300 400 500 300 400 500
- 0 100 200 0 100 200
- 300 400 500 300 400 500
- 0 100 200 0 100 200
- 300 400 500 300 400 500
-
-### <Operation> concatenate(const OtherDerived& other, Axis axis)
-
-TODO
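-
-Pending proper documentation, a rough sketch of the expected usage (the two
-tensors must agree on every dimension except the concatenation axis):
-
-    Eigen::Tensor<int, 2> a(2, 3);
-    Eigen::Tensor<int, 2> b(2, 3);
-    a.setValues({{1, 2, 3}, {4, 5, 6}});
-    b.setValues({{7, 8, 9}, {10, 11, 12}});
-    Eigen::Tensor<int, 2> c = a.concatenate(b, 0);  // shape (4, 3)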
-
-### <Operation> pad(const PaddingDimensions& padding)
-
-Returns a view of the input tensor in which the input is padded with zeros.
-
- Eigen::Tensor<int, 2> a(2, 3);
- a.setValues({{0, 100, 200}, {300, 400, 500}});
- Eigen::array<pair<int, int>, 2> paddings;
-    paddings[0] = make_pair(2, 3);
-    paddings[1] = make_pair(0, 1);
- Eigen::Tensor<int, 2> b = a.pad(paddings);
- cout << "a" << endl << a << endl << "b" << endl << b << endl;
- =>
- a
- 0 100 200
- 300 400 500
- b
- 0 0 0 0
- 0 0 0 0
- 0 100 200 0
- 300 400 500 0
- 0 0 0 0
- 0 0 0 0
- 0 0 0 0
-
-
-### <Operation> extract_patches(const PatchDims& patch_dims)
-
-Returns a tensor of coefficient patches extracted from the input tensor, where
-each patch is of dimension specified by 'patch_dims'. The returned tensor has
-one greater dimension than the input tensor, which is used to index each patch.
-The patch index in the output tensor depends on the data layout of the input
-tensor: the patch index is the last dimension in ColMajor layout, and the first
-dimension in RowMajor layout.
-
-For example, given the following input tensor:
-
- Eigen::Tensor<float, 2, DataLayout> tensor(3,4);
- tensor.setValues({{0.0f, 1.0f, 2.0f, 3.0f},
- {4.0f, 5.0f, 6.0f, 7.0f},
- {8.0f, 9.0f, 10.0f, 11.0f}});
-
- cout << "tensor: " << endl << tensor << endl;
- =>
- tensor:
- 0 1 2 3
- 4 5 6 7
- 8 9 10 11
-
-Six 2x2 patches can be extracted and indexed using the following code:
-
- Eigen::Tensor<float, 3, DataLayout> patch;
- Eigen::array<ptrdiff_t, 2> patch_dims;
- patch_dims[0] = 2;
- patch_dims[1] = 2;
- patch = tensor.extract_patches(patch_dims);
- for (int k = 0; k < 6; ++k) {
- cout << "patch index: " << k << endl;
- for (int i = 0; i < 2; ++i) {
- for (int j = 0; j < 2; ++j) {
- if (DataLayout == ColMajor) {
- cout << patch(i, j, k) << " ";
- } else {
- cout << patch(k, i, j) << " ";
- }
- }
- cout << endl;
- }
- }
-
-This code results in the following output when the data layout is ColMajor:
-
- patch index: 0
- 0 1
- 4 5
- patch index: 1
- 4 5
- 8 9
- patch index: 2
- 1 2
- 5 6
- patch index: 3
- 5 6
- 9 10
- patch index: 4
- 2 3
- 6 7
- patch index: 5
- 6 7
- 10 11
-
-This code results in the following output when the data layout is RowMajor:
-(NOTE: the set of patches is the same as in ColMajor, but the patches are indexed differently).
-
- patch index: 0
- 0 1
- 4 5
- patch index: 1
- 1 2
- 5 6
- patch index: 2
- 2 3
- 6 7
- patch index: 3
- 4 5
- 8 9
- patch index: 4
- 5 6
- 9 10
- patch index: 5
- 6 7
- 10 11
-
-### <Operation> extract_image_patches(const Index patch_rows, const Index patch_cols, const Index row_stride, const Index col_stride, const PaddingType padding_type)
-
-Returns a tensor of coefficient image patches extracted from the input tensor,
-which is expected to have dimensions ordered as follows (depending on the data
-layout of the input tensor, and the number of additional dimensions 'N'):
-
-*) ColMajor
-1st dimension: channels (of size d)
-2nd dimension: rows (of size r)
-3rd dimension: columns (of size c)
-4th-Nth dimension: time (for video) or batch (for bulk processing).
-
-*) RowMajor (reverse order of ColMajor)
-1st-Nth dimension: time (for video) or batch (for bulk processing).
-N+1'th dimension: columns (of size c)
-N+2'th dimension: rows (of size r)
-N+3'th dimension: channels (of size d)
-
-The returned tensor has one greater dimension than the input tensor, which is
-used to index each patch. The patch index in the output tensor depends on the
-data layout of the input tensor: the patch index is the 4'th dimension in
-ColMajor layout, and the 4'th from the last dimension in RowMajor layout.
-
-For example, given the following input tensor with the following dimension
-sizes:
- *) depth: 2
- *) rows: 3
- *) columns: 5
- *) batch: 7
-
- Tensor<float, 4> tensor(2,3,5,7);
- Tensor<float, 4, RowMajor> tensor_row_major = tensor.swap_layout();
-
-2x2 image patches can be extracted and indexed using the following code:
-
-*) 2D patch: ColMajor (patch indexed by second-to-last dimension)
-
- Tensor<float, 5> twod_patch;
- twod_patch = tensor.extract_image_patches<2, 2>();
- // twod_patch.dimension(0) == 2
- // twod_patch.dimension(1) == 2
- // twod_patch.dimension(2) == 2
- // twod_patch.dimension(3) == 3*5
- // twod_patch.dimension(4) == 7
-
-*) 2D patch: RowMajor (patch indexed by the second dimension)
-
- Tensor<float, 5, RowMajor> twod_patch_row_major;
- twod_patch_row_major = tensor_row_major.extract_image_patches<2, 2>();
- // twod_patch_row_major.dimension(0) == 7
- // twod_patch_row_major.dimension(1) == 3*5
- // twod_patch_row_major.dimension(2) == 2
- // twod_patch_row_major.dimension(3) == 2
- // twod_patch_row_major.dimension(4) == 2
-
-## Special Operations
-
-### <Operation> cast<T>()
-
-Returns a tensor of type T with the same dimensions as the original tensor.
-The returned tensor contains the values of the original tensor converted to
-type T.
-
- Eigen::Tensor<float, 2> a(2, 3);
- Eigen::Tensor<int, 2> b = a.cast<int>();
-
-This can be useful for example if you need to do element-wise division of
-Tensors of integers. This is not currently supported by the Tensor library
-but you can easily cast the tensors to floats to do the division:
-
- Eigen::Tensor<int, 2> a(2, 3);
- a.setValues({{0, 1, 2}, {3, 4, 5}});
- Eigen::Tensor<int, 2> b =
- (a.cast<float>() / a.constant(2).cast<float>()).cast<int>();
- cout << "a" << endl << a << endl << endl;
- cout << "b" << endl << b << endl << endl;
- =>
- a
- 0 1 2
- 3 4 5
-
- b
- 0 0 1
- 1 2 2
-
-
-### <Operation> eval()
-
-TODO
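-
-Pending proper documentation, a rough sketch: calling `eval()` on a
-subexpression forces it to be evaluated into a temporary buffer before the
-enclosing expression runs, instead of being re-evaluated coefficient by
-coefficient wherever the enclosing expression uses it:
-
-    Eigen::Tensor<float, 2> a(2, 3);
-    a.setRandom();
-    // The elementwise square a * a is materialized once up front; without
-    // eval() it would be fused into the outer expression and recomputed
-    // per coefficient access.
-    Eigen::Tensor<float, 2> b = (a * a).eval() + a;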
-
-
-## Representation of scalar values
-
-Scalar values are often represented by tensors of size 1 and rank 0. For
-example, Tensor<T, N>::maximum() currently returns a Tensor<T, 0>. Similarly,
-the inner product of two 1d tensors (through contractions) returns a 0d tensor.
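-
-The scalar held by a 0d tensor can be read back through the nullary call
-operator (or through data()), for example:
-
-    Eigen::Tensor<float, 3> a(2, 3, 4);
-    a.setRandom();
-    Eigen::Tensor<float, 0> s = a.sum();
-    float total = s();  // equivalently: *s.data()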
-
-## Limitations
-
-* The number of tensor dimensions is currently limited to 250 when using a
- compiler that supports cxx11. It is limited to only 5 for older compilers.
-* The IndexList class requires a cxx11 compliant compiler. You can use an
- array of indices instead if you don't have access to a modern compiler.
-* On GPUs only floating point values are properly tested and optimized for.
-* Complex and integer values are known to be broken on GPUs. If you try to use
- them you'll most likely end up triggering a static assertion failure such as
- EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
-
-
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/Tensor.h b/src/EigenUnsupported/CXX11/src/Tensor/Tensor.h
deleted file mode 100644
index 8cac2bb..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/Tensor.h
+++ /dev/null
@@ -1,554 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_H
-#define EIGEN_CXX11_TENSOR_TENSOR_H
-
-namespace Eigen {
-
-/** \class Tensor
- * \ingroup CXX11_Tensor_Module
- *
- * \brief The tensor class.
- *
- * The %Tensor class is the work-horse for all \em dense tensors within Eigen.
- *
- * The %Tensor class encompasses only dynamic-size objects so far.
- *
- * The first two template parameters are required:
- * \tparam Scalar_ Numeric type, e.g. float, double, int or `std::complex<float>`.
- * User defined scalar types are supported as well (see \ref user_defined_scalars "here").
- * \tparam NumIndices_ Number of indices (i.e. rank of the tensor)
- *
- * The remaining template parameters are optional -- in most cases you don't have to worry about them.
- * \tparam Options_ A combination of either \b #RowMajor or \b #ColMajor, and of either
- * \b #AutoAlign or \b #DontAlign.
- * The former controls \ref TopicStorageOrders "storage order", and defaults to column-major. The latter controls alignment, which is required
- * for vectorization. It defaults to aligning tensors. Note that tensors currently do not support any operations that profit from vectorization.
- * Support for such operations (i.e. adding two tensors etc.) is planned.
- *
- * You can access elements of tensors using normal subscripting:
- *
- * \code
- * Eigen::Tensor<double, 4> t(10, 10, 10, 10);
- * t(0, 1, 2, 3) = 42.0;
- * \endcode
- *
- * This class can be extended with the help of the plugin mechanism described on the page
- * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_TENSOR_PLUGIN.
- *
- * <i><b>Some notes:</b></i>
- *
- * <dl>
- * <dt><b>Relation to other parts of Eigen:</b></dt>
- * <dd>The midterm development goal for this class is to have a similar hierarchy as Eigen uses for matrices, so that
- * taking blocks or using tensors in expressions is easily possible, including an interface with the vector/matrix code
- * by providing .asMatrix() and .asVector() (or similar) methods for rank 2 and 1 tensors. However, currently, the %Tensor
- * class does not provide any of these features and is only available as a stand-alone class that just allows for
- * coefficient access. Also, when fixed-size tensors are implemented, the number of template arguments is likely to
- * change dramatically.</dd>
- * </dl>
- *
- * \ref TopicStorageOrders
- */
-
-template<typename Scalar_, int NumIndices_, int Options_, typename IndexType_>
-class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
-{
- public:
- typedef Tensor<Scalar_, NumIndices_, Options_, IndexType_> Self;
- typedef TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexType_> > Base;
- typedef typename Eigen::internal::nested<Self>::type Nested;
- typedef typename internal::traits<Self>::StorageKind StorageKind;
- typedef typename internal::traits<Self>::Index Index;
- typedef Scalar_ Scalar;
- typedef typename NumTraits<Scalar>::Real RealScalar;
- typedef typename Base::CoeffReturnType CoeffReturnType;
-
- enum {
- IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0) & !(Options_&DontAlign),
- Layout = Options_ & RowMajor ? RowMajor : ColMajor,
- CoordAccess = true,
- RawAccess = true
- };
-
- static const int Options = Options_;
- static const int NumIndices = NumIndices_;
- typedef DSizes<Index, NumIndices_> Dimensions;
-
- protected:
- TensorStorage<Scalar, Dimensions, Options> m_storage;
-
-#ifdef EIGEN_HAS_SFINAE
- template<typename CustomIndices>
- struct isOfNormalIndex{
- static const bool is_array = internal::is_base_of<array<Index, NumIndices>, CustomIndices>::value;
- static const bool is_int = NumTraits<CustomIndices>::IsInteger;
- static const bool value = is_array | is_int;
- };
-#endif
-
- public:
- // Metadata
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); }
-
- // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
- // work, because that uses base().coeffRef() - and we don't yet
- // implement a similar class hierarchy
- inline Self& base() { return *this; }
- inline const Self& base() const { return *this; }
-
-#if EIGEN_HAS_VARIADIC_TEMPLATES
- template<typename... IndexTypes>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
- {
- // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
- EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
- return coeff(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
- }
-#endif
-
- // normal indices
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(const array<Index, NumIndices>& indices) const
- {
- eigen_internal_assert(checkIndexRange(indices));
- return m_storage.data()[linearizedIndex(indices)];
- }
-
- // custom indices
-#ifdef EIGEN_HAS_SFINAE
- template<typename CustomIndices,
- EIGEN_SFINAE_ENABLE_IF( !(isOfNormalIndex<CustomIndices>::value) )
- >
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(CustomIndices& indices) const
- {
- return coeff(internal::customIndices2Array<Index,NumIndices>(indices));
- }
-#endif
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff() const
- {
- EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
- return m_storage.data()[0];
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const
- {
- eigen_internal_assert(index >= 0 && index < size());
- return m_storage.data()[index];
- }
-
-#if EIGEN_HAS_VARIADIC_TEMPLATES
- template<typename... IndexTypes>
- inline Scalar& coeffRef(Index firstIndex, Index secondIndex, IndexTypes... otherIndices)
- {
- // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
- EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
- return coeffRef(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
- }
-#endif
-
- // normal indices
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices)
- {
- eigen_internal_assert(checkIndexRange(indices));
- return m_storage.data()[linearizedIndex(indices)];
- }
-
- // custom indices
-#ifdef EIGEN_HAS_SFINAE
- template<typename CustomIndices,
- EIGEN_SFINAE_ENABLE_IF( !(isOfNormalIndex<CustomIndices>::value) )
- >
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(CustomIndices& indices)
- {
- return coeffRef(internal::customIndices2Array<Index,NumIndices>(indices));
- }
-#endif
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef()
- {
- EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
- return m_storage.data()[0];
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
- {
- eigen_internal_assert(index >= 0 && index < size());
- return m_storage.data()[index];
- }
-
-#if EIGEN_HAS_VARIADIC_TEMPLATES
- template<typename... IndexTypes>
- inline const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
- {
- // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
- EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
- return this->operator()(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
- }
-#else
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const
- {
- return coeff(array<Index, 2>(i0, i1));
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const
- {
- return coeff(array<Index, 3>(i0, i1, i2));
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const
- {
- return coeff(array<Index, 4>(i0, i1, i2, i3));
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const
- {
- return coeff(array<Index, 5>(i0, i1, i2, i3, i4));
- }
-#endif
-
- // custom indices
-#ifdef EIGEN_HAS_SFINAE
- template<typename CustomIndices,
- EIGEN_SFINAE_ENABLE_IF( !(isOfNormalIndex<CustomIndices>::value) )
- >
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(CustomIndices& indices) const
- {
- return coeff(internal::customIndices2Array<Index,NumIndices>(indices));
- }
-#endif
-
- // normal indices
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const
- {
- return coeff(indices);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const
- {
- eigen_internal_assert(index >= 0 && index < size());
- return coeff(index);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()() const
- {
- EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
- return coeff();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const
- {
- // The bracket operator is only for vectors, use the parenthesis operator instead.
- EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE);
- return coeff(index);
- }
-
-#if EIGEN_HAS_VARIADIC_TEMPLATES
- template<typename... IndexTypes>
- inline Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices)
- {
- // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
- EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
- return operator()(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
- }
-#else
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1)
- {
- return coeffRef(array<Index, 2>(i0, i1));
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2)
- {
- return coeffRef(array<Index, 3>(i0, i1, i2));
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3)
- {
- return coeffRef(array<Index, 4>(i0, i1, i2, i3));
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4)
- {
- return coeffRef(array<Index, 5>(i0, i1, i2, i3, i4));
- }
-#endif
-
- // normal indices
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices)
- {
- return coeffRef(indices);
- }
-
- // custom indices
-#ifdef EIGEN_HAS_SFINAE
- template<typename CustomIndices,
- EIGEN_SFINAE_ENABLE_IF( !(isOfNormalIndex<CustomIndices>::value) )
- >
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(CustomIndices& indices)
- {
- return coeffRef(internal::customIndices2Array<Index,NumIndices>(indices));
- }
-#endif
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index index)
- {
- eigen_assert(index >= 0 && index < size());
- return coeffRef(index);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()()
- {
- EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
- return coeffRef();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator[](Index index)
- {
- // The bracket operator is only for vectors, use the parenthesis operator instead
- EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
- return coeffRef(index);
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Tensor()
- : m_storage()
- {
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Tensor(const Self& other)
- : m_storage(other.m_storage)
- {
- }
-
-#if EIGEN_HAS_VARIADIC_TEMPLATES
- template<typename... IndexTypes>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index firstDimension, IndexTypes... otherDimensions)
- : m_storage(firstDimension, otherDimensions...)
- {
- // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
- EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
- }
-#else
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1)
- : m_storage(dim1, array<Index, 1>(dim1))
- {
- EIGEN_STATIC_ASSERT(1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2)
- : m_storage(dim1*dim2, array<Index, 2>(dim1, dim2))
- {
- EIGEN_STATIC_ASSERT(2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3)
- : m_storage(dim1*dim2*dim3, array<Index, 3>(dim1, dim2, dim3))
- {
- EIGEN_STATIC_ASSERT(3 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3, Index dim4)
- : m_storage(dim1*dim2*dim3*dim4, array<Index, 4>(dim1, dim2, dim3, dim4))
- {
- EIGEN_STATIC_ASSERT(4 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3, Index dim4, Index dim5)
- : m_storage(dim1*dim2*dim3*dim4*dim5, array<Index, 5>(dim1, dim2, dim3, dim4, dim5))
- {
- EIGEN_STATIC_ASSERT(5 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
- }
-#endif
-
- /** Normal Dimension */
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(const array<Index, NumIndices>& dimensions)
- : m_storage(internal::array_prod(dimensions), dimensions)
- {
- EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
- }
-
- template<typename OtherDerived>
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Tensor(const TensorBase<OtherDerived, ReadOnlyAccessors>& other)
- {
- typedef TensorAssignOp<Tensor, const OtherDerived> Assign;
- Assign assign(*this, other.derived());
- resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
- internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
- }
-
- template<typename OtherDerived>
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Tensor(const TensorBase<OtherDerived, WriteAccessors>& other)
- {
- typedef TensorAssignOp<Tensor, const OtherDerived> Assign;
- Assign assign(*this, other.derived());
- resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
- internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
- }
-
- #if EIGEN_HAS_RVALUE_REFERENCES
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Tensor(Self&& other)
- : m_storage(std::move(other.m_storage))
- {
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Tensor& operator=(Self&& other)
- {
- m_storage = std::move(other.m_storage);
- return *this;
- }
- #endif
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Tensor& operator=(const Tensor& other)
- {
- typedef TensorAssignOp<Tensor, const Tensor> Assign;
- Assign assign(*this, other);
- resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
- internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
- return *this;
- }
- template<typename OtherDerived>
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Tensor& operator=(const OtherDerived& other)
- {
- typedef TensorAssignOp<Tensor, const OtherDerived> Assign;
- Assign assign(*this, other);
- resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
- internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
- return *this;
- }
-
-#if EIGEN_HAS_VARIADIC_TEMPLATES
- template<typename... IndexTypes> EIGEN_DEVICE_FUNC
- void resize(Index firstDimension, IndexTypes... otherDimensions)
- {
- // The number of dimensions used to resize a tensor must be equal to the rank of the tensor.
- EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
- resize(array<Index, NumIndices>{{firstDimension, otherDimensions...}});
- }
-#endif
-
- /** Normal Dimension */
- EIGEN_DEVICE_FUNC void resize(const array<Index, NumIndices>& dimensions)
- {
- int i;
- Index size = Index(1);
- for (i = 0; i < NumIndices; i++) {
- internal::check_rows_cols_for_overflow<Dynamic>::run(size, dimensions[i]);
- size *= dimensions[i];
- }
- #ifdef EIGEN_INITIALIZE_COEFFS
- bool size_changed = size != this->size();
- m_storage.resize(size, dimensions);
- if(size_changed) EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
- #else
- m_storage.resize(size, dimensions);
- #endif
- }
-
-  // TODO: why is this overload needed, given that DSizes derives from array? //
- EIGEN_DEVICE_FUNC void resize(const DSizes<Index, NumIndices>& dimensions) {
- array<Index, NumIndices> dims;
- for (int i = 0; i < NumIndices; ++i) {
- dims[i] = dimensions[i];
- }
- resize(dims);
- }
-
- EIGEN_DEVICE_FUNC
- void resize()
- {
- EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
- // Nothing to do: rank 0 tensors have fixed size
- }
-
-#ifdef EIGEN_HAS_INDEX_LIST
- template <typename FirstType, typename... OtherTypes>
- EIGEN_DEVICE_FUNC
- void resize(const Eigen::IndexList<FirstType, OtherTypes...>& dimensions) {
- array<Index, NumIndices> dims;
- for (int i = 0; i < NumIndices; ++i) {
- dims[i] = static_cast<Index>(dimensions[i]);
- }
- resize(dims);
- }
-#endif
-
- /** Custom Dimension */
-#ifdef EIGEN_HAS_SFINAE
- template<typename CustomDimension,
- EIGEN_SFINAE_ENABLE_IF( !(isOfNormalIndex<CustomDimension>::value) )
- >
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize(CustomDimension& dimensions)
- {
- resize(internal::customIndices2Array<Index,NumIndices>(dimensions));
- }
-#endif
-
-#ifndef EIGEN_EMULATE_CXX11_META_H
- template <typename std::ptrdiff_t... Indices>
- EIGEN_DEVICE_FUNC
- void resize(const Sizes<Indices...>& dimensions) {
- array<Index, NumIndices> dims;
- for (int i = 0; i < NumIndices; ++i) {
- dims[i] = static_cast<Index>(dimensions[i]);
- }
- resize(dims);
- }
-#else
- template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5>
- EIGEN_DEVICE_FUNC
- void resize(const Sizes<V1, V2, V3, V4, V5>& dimensions) {
- array<Index, NumIndices> dims;
- for (int i = 0; i < NumIndices; ++i) {
- dims[i] = static_cast<Index>(dimensions[i]);
- }
- resize(dims);
- }
-#endif
-
- protected:
-
- bool checkIndexRange(const array<Index, NumIndices>& indices) const
- {
- using internal::array_apply_and_reduce;
- using internal::array_zip_and_reduce;
- using internal::greater_equal_zero_op;
- using internal::logical_and_op;
- using internal::lesser_op;
-
- return
- // check whether the indices are all >= 0
- array_apply_and_reduce<logical_and_op, greater_equal_zero_op>(indices) &&
- // check whether the indices fit in the dimensions
- array_zip_and_reduce<logical_and_op, lesser_op>(indices, m_storage.dimensions());
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index linearizedIndex(const array<Index, NumIndices>& indices) const
- {
- if (Options&RowMajor) {
- return m_storage.dimensions().IndexOfRowMajor(indices);
- } else {
- return m_storage.dimensions().IndexOfColMajor(indices);
- }
- }
-};
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorArgMax.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorArgMax.h
deleted file mode 100644
index 8b8fb92..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorArgMax.h
+++ /dev/null
@@ -1,329 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2015 Eugene Brevdo <ebrevdo@gmail.com>
-// Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H
-#define EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H
-
-namespace Eigen {
-namespace internal {
-
-/** \class TensorIndexTuple
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Tensor + Index Tuple class.
- *
- *
- */
-template<typename XprType>
-struct traits<TensorIndexTupleOp<XprType> > : public traits<XprType>
-{
- typedef traits<XprType> XprTraits;
- typedef typename XprTraits::StorageKind StorageKind;
- typedef typename XprTraits::Index Index;
- typedef Tuple<Index, typename XprTraits::Scalar> Scalar;
- typedef typename XprType::Nested Nested;
- typedef typename remove_reference<Nested>::type _Nested;
- static const int NumDimensions = XprTraits::NumDimensions;
- static const int Layout = XprTraits::Layout;
-};
-
-template<typename XprType>
-struct eval<TensorIndexTupleOp<XprType>, Eigen::Dense>
-{
- typedef const TensorIndexTupleOp<XprType>EIGEN_DEVICE_REF type;
-};
-
-template<typename XprType>
-struct nested<TensorIndexTupleOp<XprType>, 1,
- typename eval<TensorIndexTupleOp<XprType> >::type>
-{
- typedef TensorIndexTupleOp<XprType> type;
-};
-
-} // end namespace internal
-
-template<typename XprType>
-class TensorIndexTupleOp : public TensorBase<TensorIndexTupleOp<XprType>, ReadOnlyAccessors>
-{
- public:
- typedef typename Eigen::internal::traits<TensorIndexTupleOp>::Scalar Scalar;
- typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
- typedef typename Eigen::internal::nested<TensorIndexTupleOp>::type Nested;
- typedef typename Eigen::internal::traits<TensorIndexTupleOp>::StorageKind StorageKind;
- typedef typename Eigen::internal::traits<TensorIndexTupleOp>::Index Index;
- typedef Tuple<Index, typename XprType::CoeffReturnType> CoeffReturnType;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIndexTupleOp(const XprType& expr)
- : m_xpr(expr) {}
-
- EIGEN_DEVICE_FUNC
- const typename internal::remove_all<typename XprType::Nested>::type&
- expression() const { return m_xpr; }
-
- protected:
- typename XprType::Nested m_xpr;
-};
-
-// Eval as rvalue
-template<typename ArgType, typename Device>
-struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device>
-{
- typedef TensorIndexTupleOp<ArgType> XprType;
- typedef typename XprType::Index Index;
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
-
- typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
- static const int NumDims = internal::array_size<Dimensions>::value;
- typedef StorageMemory<CoeffReturnType, Device> Storage;
- typedef typename Storage::Type EvaluatorPointerType;
-
- enum {
- IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
- PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
- BlockAccess = false,
- PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = false
- };
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlock;
- //===--------------------------------------------------------------------===//
-
- EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_impl(op.expression(), device) { }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
- return m_impl.dimensions();
- }
-
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
- m_impl.evalSubExprsIfNeeded(NULL);
- return true;
- }
- EIGEN_STRONG_INLINE void cleanup() {
- m_impl.cleanup();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
- {
- return CoeffReturnType(index, m_impl.coeff(index));
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
- costPerCoeff(bool vectorized) const {
- return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, 1);
- }
-
- EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
-
-#ifdef EIGEN_USE_SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
- m_impl.bind(cgh);
- }
-#endif
-
- protected:
- TensorEvaluator<ArgType, Device> m_impl;
-};
-
-namespace internal {
-
-/** \class TensorTupleIndex
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Converts to Tensor<Tuple<Index, Scalar> > and reduces to Tensor<Index>.
- *
- */
-template<typename ReduceOp, typename Dims, typename XprType>
-struct traits<TensorTupleReducerOp<ReduceOp, Dims, XprType> > : public traits<XprType>
-{
- typedef traits<XprType> XprTraits;
- typedef typename XprTraits::StorageKind StorageKind;
- typedef typename XprTraits::Index Index;
- typedef Index Scalar;
- typedef typename XprType::Nested Nested;
- typedef typename remove_reference<Nested>::type _Nested;
- static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
- static const int Layout = XprTraits::Layout;
-};
-
-template<typename ReduceOp, typename Dims, typename XprType>
-struct eval<TensorTupleReducerOp<ReduceOp, Dims, XprType>, Eigen::Dense>
-{
-  typedef const TensorTupleReducerOp<ReduceOp, Dims, XprType> EIGEN_DEVICE_REF type;
-};
-
-template<typename ReduceOp, typename Dims, typename XprType>
-struct nested<TensorTupleReducerOp<ReduceOp, Dims, XprType>, 1,
- typename eval<TensorTupleReducerOp<ReduceOp, Dims, XprType> >::type>
-{
- typedef TensorTupleReducerOp<ReduceOp, Dims, XprType> type;
-};
-
-} // end namespace internal
-
-template<typename ReduceOp, typename Dims, typename XprType>
-class TensorTupleReducerOp : public TensorBase<TensorTupleReducerOp<ReduceOp, Dims, XprType>, ReadOnlyAccessors>
-{
- public:
- typedef typename Eigen::internal::traits<TensorTupleReducerOp>::Scalar Scalar;
- typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
- typedef typename Eigen::internal::nested<TensorTupleReducerOp>::type Nested;
- typedef typename Eigen::internal::traits<TensorTupleReducerOp>::StorageKind StorageKind;
- typedef typename Eigen::internal::traits<TensorTupleReducerOp>::Index Index;
- typedef Index CoeffReturnType;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTupleReducerOp(const XprType& expr,
- const ReduceOp& reduce_op,
- const Index return_dim,
- const Dims& reduce_dims)
- : m_xpr(expr), m_reduce_op(reduce_op), m_return_dim(return_dim), m_reduce_dims(reduce_dims) {}
-
- EIGEN_DEVICE_FUNC
- const typename internal::remove_all<typename XprType::Nested>::type&
- expression() const { return m_xpr; }
-
- EIGEN_DEVICE_FUNC
- const ReduceOp& reduce_op() const { return m_reduce_op; }
-
- EIGEN_DEVICE_FUNC
- const Dims& reduce_dims() const { return m_reduce_dims; }
-
- EIGEN_DEVICE_FUNC
- Index return_dim() const { return m_return_dim; }
-
- protected:
- typename XprType::Nested m_xpr;
- const ReduceOp m_reduce_op;
- const Index m_return_dim;
- const Dims m_reduce_dims;
-};
-
-// Eval as rvalue
-template<typename ReduceOp, typename Dims, typename ArgType, typename Device>
-struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Device>
-{
- typedef TensorTupleReducerOp<ReduceOp, Dims, ArgType> XprType;
- typedef typename XprType::Index Index;
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename TensorIndexTupleOp<ArgType>::CoeffReturnType TupleType;
- typedef typename TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device>::Dimensions Dimensions;
- typedef typename TensorEvaluator<const TensorIndexTupleOp<ArgType> , Device>::Dimensions InputDimensions;
- static const int NumDims = internal::array_size<InputDimensions>::value;
- typedef array<Index, NumDims> StrideDims;
- typedef StorageMemory<CoeffReturnType, Device> Storage;
- typedef typename Storage::Type EvaluatorPointerType;
- typedef StorageMemory<TupleType, Device> TupleStorageMem;
-
- enum {
- IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
- PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
- BlockAccess = false,
- PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
- Layout = TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = false
- };
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlock;
- //===--------------------------------------------------------------------===//
-
- EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_orig_impl(op.expression(), device),
- m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device),
- m_return_dim(op.return_dim())
- {
- gen_strides(m_orig_impl.dimensions(), m_strides);
- if (Layout == static_cast<int>(ColMajor)) {
- const Index total_size = internal::array_prod(m_orig_impl.dimensions());
- m_stride_mod = (m_return_dim < NumDims - 1) ? m_strides[m_return_dim + 1] : total_size;
- } else {
- const Index total_size = internal::array_prod(m_orig_impl.dimensions());
- m_stride_mod = (m_return_dim > 0) ? m_strides[m_return_dim - 1] : total_size;
- }
-    // If m_return_dim is not a valid index, fall back to 1; otherwise the
-    // division in coeff() can crash (observed on Windows).
- m_stride_div = ((m_return_dim >= 0) &&
- (m_return_dim < static_cast<Index>(m_strides.size())))
- ? m_strides[m_return_dim] : 1;
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
- return m_impl.dimensions();
- }
-
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
- m_impl.evalSubExprsIfNeeded(NULL);
- return true;
- }
- EIGEN_STRONG_INLINE void cleanup() {
- m_impl.cleanup();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
- const TupleType v = m_impl.coeff(index);
- return (m_return_dim < 0) ? v.first : (v.first % m_stride_mod) / m_stride_div;
- }
-
- EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
-#ifdef EIGEN_USE_SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
- m_impl.bind(cgh);
- m_orig_impl.bind(cgh);
- }
-#endif
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
- costPerCoeff(bool vectorized) const {
- const double compute_cost = 1.0 +
- (m_return_dim < 0 ? 0.0 : (TensorOpCost::ModCost<Index>() + TensorOpCost::DivCost<Index>()));
- return m_orig_impl.costPerCoeff(vectorized) +
- m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost);
- }
-
- private:
- EIGEN_DEVICE_FUNC void gen_strides(const InputDimensions& dims, StrideDims& strides) {
- if (m_return_dim < 0) {
- return; // Won't be using the strides.
- }
- eigen_assert(m_return_dim < NumDims &&
- "Asking to convert index to a dimension outside of the rank");
-
-    // Calculate the strides of the original tensor; the constructor uses
-    // them to derive m_stride_mod and m_stride_div, which convert a linear
-    // index into a coordinate along m_return_dim.
- if (Layout == static_cast<int>(ColMajor)) {
- strides[0] = 1;
- for (int i = 1; i < NumDims; ++i) {
- strides[i] = strides[i-1] * dims[i-1];
- }
- } else {
- strides[NumDims-1] = 1;
- for (int i = NumDims - 2; i >= 0; --i) {
- strides[i] = strides[i+1] * dims[i+1];
- }
- }
- }
-
- protected:
- TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device> m_orig_impl;
- TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device> m_impl;
- const Index m_return_dim;
- StrideDims m_strides;
- Index m_stride_mod;
- Index m_stride_div;
-};
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorAssign.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorAssign.h
deleted file mode 100644
index e5811d6..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorAssign.h
+++ /dev/null
@@ -1,247 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H
-#define EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H
-
-namespace Eigen {
-
-/** \class TensorAssign
- * \ingroup CXX11_Tensor_Module
- *
- * \brief The tensor assignment class.
- *
- * This class represents the assignment of the values resulting from the evaluation of
- * the rhs expression to the memory locations denoted by the lhs expression.
- */
-namespace internal {
-template<typename LhsXprType, typename RhsXprType>
-struct traits<TensorAssignOp<LhsXprType, RhsXprType> >
-{
- typedef typename LhsXprType::Scalar Scalar;
- typedef typename traits<LhsXprType>::StorageKind StorageKind;
- typedef typename promote_index_type<typename traits<LhsXprType>::Index,
- typename traits<RhsXprType>::Index>::type Index;
- typedef typename LhsXprType::Nested LhsNested;
- typedef typename RhsXprType::Nested RhsNested;
- typedef typename remove_reference<LhsNested>::type _LhsNested;
- typedef typename remove_reference<RhsNested>::type _RhsNested;
- static const std::size_t NumDimensions = internal::traits<LhsXprType>::NumDimensions;
- static const int Layout = internal::traits<LhsXprType>::Layout;
- typedef typename traits<LhsXprType>::PointerType PointerType;
-
- enum {
- Flags = 0
- };
-};
-
-template<typename LhsXprType, typename RhsXprType>
-struct eval<TensorAssignOp<LhsXprType, RhsXprType>, Eigen::Dense>
-{
- typedef const TensorAssignOp<LhsXprType, RhsXprType>& type;
-};
-
-template<typename LhsXprType, typename RhsXprType>
-struct nested<TensorAssignOp<LhsXprType, RhsXprType>, 1, typename eval<TensorAssignOp<LhsXprType, RhsXprType> >::type>
-{
- typedef TensorAssignOp<LhsXprType, RhsXprType> type;
-};
-
-} // end namespace internal
-
-
-
-template<typename LhsXprType, typename RhsXprType>
-class TensorAssignOp : public TensorBase<TensorAssignOp<LhsXprType, RhsXprType> >
-{
- public:
- typedef typename Eigen::internal::traits<TensorAssignOp>::Scalar Scalar;
- typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
- typedef typename LhsXprType::CoeffReturnType CoeffReturnType;
- typedef typename Eigen::internal::nested<TensorAssignOp>::type Nested;
- typedef typename Eigen::internal::traits<TensorAssignOp>::StorageKind StorageKind;
- typedef typename Eigen::internal::traits<TensorAssignOp>::Index Index;
-
- static const int NumDims = Eigen::internal::traits<TensorAssignOp>::NumDimensions;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorAssignOp(LhsXprType& lhs, const RhsXprType& rhs)
- : m_lhs_xpr(lhs), m_rhs_xpr(rhs) {}
-
- /** \returns the nested expressions */
- EIGEN_DEVICE_FUNC
- typename internal::remove_all<typename LhsXprType::Nested>::type&
- lhsExpression() const { return *((typename internal::remove_all<typename LhsXprType::Nested>::type*)&m_lhs_xpr); }
-
- EIGEN_DEVICE_FUNC
- const typename internal::remove_all<typename RhsXprType::Nested>::type&
- rhsExpression() const { return m_rhs_xpr; }
-
- protected:
- typename internal::remove_all<typename LhsXprType::Nested>::type& m_lhs_xpr;
- const typename internal::remove_all<typename RhsXprType::Nested>::type& m_rhs_xpr;
-};
-
-
-template<typename LeftArgType, typename RightArgType, typename Device>
-struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
-{
- typedef TensorAssignOp<LeftArgType, RightArgType> XprType;
- typedef typename XprType::Index Index;
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions;
- typedef StorageMemory<CoeffReturnType, Device> Storage;
- typedef typename Storage::Type EvaluatorPointerType;
-
- static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
- static const int NumDims = XprType::NumDims;
-
- enum {
- IsAligned = int(TensorEvaluator<LeftArgType, Device>::IsAligned) &
- int(TensorEvaluator<RightArgType, Device>::IsAligned),
- PacketAccess = int(TensorEvaluator<LeftArgType, Device>::PacketAccess) &
- int(TensorEvaluator<RightArgType, Device>::PacketAccess),
- BlockAccess = int(TensorEvaluator<LeftArgType, Device>::BlockAccess) &
- int(TensorEvaluator<RightArgType, Device>::BlockAccess),
- PreferBlockAccess = int(TensorEvaluator<LeftArgType, Device>::PreferBlockAccess) |
- int(TensorEvaluator<RightArgType, Device>::PreferBlockAccess),
- Layout = TensorEvaluator<LeftArgType, Device>::Layout,
- RawAccess = TensorEvaluator<LeftArgType, Device>::RawAccess
- };
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
- typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
-
- typedef typename TensorEvaluator<const RightArgType, Device>::TensorBlock
- RightTensorBlock;
- //===--------------------------------------------------------------------===//
-
- TensorEvaluator(const XprType& op, const Device& device) :
- m_leftImpl(op.lhsExpression(), device),
- m_rightImpl(op.rhsExpression(), device)
- {
- EIGEN_STATIC_ASSERT(
- (static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) ==
- static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
- YOU_MADE_A_PROGRAMMING_MISTAKE);
- }
-
- EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
- {
- // The dimensions of the lhs and the rhs tensors must be equal in order to
- // prevent out-of-bounds accesses and to ensure the result is fully initialized.
- // TODO: use left impl instead if right impl dimensions are known at compile time.
- return m_rightImpl.dimensions();
- }
-
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
- eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions()));
- m_leftImpl.evalSubExprsIfNeeded(NULL);
- // If the lhs provides raw access to its storage area (i.e. if m_leftImpl.data() returns a
- // non-null value), attempt to evaluate the rhs expression in place. Returns true iff in-place
- // evaluation isn't supported and the caller still needs to manually assign the values generated
- // by the rhs to the lhs.
- return m_rightImpl.evalSubExprsIfNeeded(m_leftImpl.data());
- }
-
-#ifdef EIGEN_USE_THREADS
- template <typename EvalSubExprsCallback>
- EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
- EvaluatorPointerType, EvalSubExprsCallback done) {
- m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) {
- m_rightImpl.evalSubExprsIfNeededAsync(
- m_leftImpl.data(), [done](bool need_assign) { done(need_assign); });
- });
- }
-#endif // EIGEN_USE_THREADS
-
- EIGEN_STRONG_INLINE void cleanup() {
- m_leftImpl.cleanup();
- m_rightImpl.cleanup();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) {
- m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i);
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) {
- const int LhsStoreMode = TensorEvaluator<LeftArgType, Device>::IsAligned ? Aligned : Unaligned;
- const int RhsLoadMode = TensorEvaluator<RightArgType, Device>::IsAligned ? Aligned : Unaligned;
- m_leftImpl.template writePacket<LhsStoreMode>(i, m_rightImpl.template packet<RhsLoadMode>(i));
- }
- EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
- {
- return m_leftImpl.coeff(index);
- }
- template<int LoadMode>
- EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
- {
- return m_leftImpl.template packet<LoadMode>(index);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
- costPerCoeff(bool vectorized) const {
- // We assume that evalPacket or evalScalar is called to perform the
- // assignment and account for the cost of the write here, but reduce the
- // lhs cost by one load because we are using m_leftImpl.coeffRef.
- TensorOpCost left = m_leftImpl.costPerCoeff(vectorized);
- return m_rightImpl.costPerCoeff(vectorized) +
- TensorOpCost(
- numext::maxi(0.0, left.bytes_loaded() - sizeof(CoeffReturnType)),
- left.bytes_stored(), left.compute_cycles()) +
- TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- internal::TensorBlockResourceRequirements getResourceRequirements() const {
- return internal::TensorBlockResourceRequirements::merge(
- m_leftImpl.getResourceRequirements(),
- m_rightImpl.getResourceRequirements());
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(
- TensorBlockDesc& desc, TensorBlockScratch& scratch) {
- if (TensorEvaluator<LeftArgType, Device>::RawAccess &&
- m_leftImpl.data() != NULL) {
- // If destination has raw data access, we pass it as a potential
- // destination for a block descriptor evaluation.
- desc.template AddDestinationBuffer<Layout>(
- /*dst_base=*/m_leftImpl.data() + desc.offset(),
- /*dst_strides=*/internal::strides<Layout>(m_leftImpl.dimensions()));
- }
-
- RightTensorBlock block = m_rightImpl.block(desc, scratch, /*root_of_expr_ast=*/true);
- // If the block was evaluated into a destination buffer, there is no need to do an assignment.
- if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) {
- m_leftImpl.writeBlock(desc, block);
- }
- block.cleanup();
- }
-
-#ifdef EIGEN_USE_SYCL
- // Bind the placeholder accessors to a SYCL command group handler.
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
- m_leftImpl.bind(cgh);
- m_rightImpl.bind(cgh);
- }
-#endif
-
- EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_leftImpl.data(); }
-
- private:
- TensorEvaluator<LeftArgType, Device> m_leftImpl;
- TensorEvaluator<RightArgType, Device> m_rightImpl;
-};
-
-}
-
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H
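
For context, user code never constructs a TensorAssignOp directly: it is created by TensorBase::operator= (see TensorBase.h below) and driven by TensorExecutor, which uses the evaluator above. A minimal sketch of the resulting behavior, assuming the standard <unsupported/Eigen/CXX11/Tensor> include path:

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::Tensor<float, 2> lhs(2, 3);
  Eigen::Tensor<float, 2> rhs(2, 3);
  rhs.setConstant(2.0f);

  // This assignment builds a TensorAssignOp and hands it to TensorExecutor;
  // the evaluator either lets the rhs evaluate straight into lhs's storage
  // (RawAccess) or falls back to the evalScalar/evalPacket loops.
  lhs = rhs * rhs.constant(3.0f) + rhs;  // every coefficient becomes 2*3 + 2 = 8

  std::cout << lhs(0, 0) << "\n";
  return 0;
}
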
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorBase.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorBase.h
deleted file mode 100644
index 35b6458..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorBase.h
+++ /dev/null
@@ -1,1176 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_BASE_H
-#define EIGEN_CXX11_TENSOR_TENSOR_BASE_H
-
-// clang-format off
-
-namespace Eigen {
-
-/** \class TensorBase
- * \ingroup CXX11_Tensor_Module
- *
- * \brief The tensor base class.
- *
- * This class is the common parent of the Tensor and TensorMap class, thus
- * making it possible to use either class interchangeably in expressions.
- */
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-// FIXME Doxygen does not like the inheritance with different template parameters
-// Since there is no doxygen documentation inside, we disable it for now
-template<typename Derived>
-class TensorBase<Derived, ReadOnlyAccessors>
-{
- public:
- typedef internal::traits<Derived> DerivedTraits;
- typedef typename DerivedTraits::Scalar Scalar;
- typedef typename DerivedTraits::Index Index;
- typedef typename internal::remove_const<Scalar>::type CoeffReturnType;
- static const int NumDimensions = DerivedTraits::NumDimensions;
-
- // Generic nullary operation support.
- template <typename CustomNullaryOp> EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<CustomNullaryOp, const Derived>
- nullaryExpr(const CustomNullaryOp& func) const {
- return TensorCwiseNullaryOp<CustomNullaryOp, const Derived>(derived(), func);
- }
-
- // Coefficient-wise nullary operators
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived>
- constant(const Scalar& value) const {
- return nullaryExpr(internal::scalar_constant_op<Scalar>(value));
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<internal::UniformRandomGenerator<Scalar>, const Derived>
- random() const {
- return nullaryExpr(internal::UniformRandomGenerator<Scalar>());
- }
- template <typename RandomGenerator> EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<RandomGenerator, const Derived>
- random(const RandomGenerator& gen = RandomGenerator()) const {
- return nullaryExpr(gen);
- }
-
- // Tensor generation
- template <typename Generator> EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorGeneratorOp<Generator, const Derived>
- generate(const Generator& generator) const {
- return TensorGeneratorOp<Generator, const Derived>(derived(), generator);
- }
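
A short usage sketch of the nullary operators and generate(). IotaGenerator is just an illustrative functor; its coordinate signature (an array holding one index per dimension) follows the one shown in the Tensor module README. Only the dimensions of t matter here, not its uninitialized values:

#include <unsupported/Eigen/CXX11/Tensor>

// Called once per output coefficient with that coefficient's N-dim index.
struct IotaGenerator {
  float operator()(const Eigen::array<Eigen::DenseIndex, 2>& coords) const {
    return static_cast<float>(coords[0] * 10 + coords[1]);
  }
};

int main() {
  Eigen::Tensor<float, 2> t(2, 3);
  Eigen::Tensor<float, 2> ones  = t.constant(1.0f);             // all coefficients 1
  Eigen::Tensor<float, 2> noise = t.random();                   // uniform random values
  Eigen::Tensor<float, 2> iota  = t.generate(IotaGenerator());  // iota(i,j) == 10*i + j
  return 0;
}
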
-
- // Generic unary operation support.
- template <typename CustomUnaryOp> EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<CustomUnaryOp, const Derived>
- unaryExpr(const CustomUnaryOp& func) const {
- return TensorCwiseUnaryOp<CustomUnaryOp, const Derived>(derived(), func);
- }
-
- // Coefficient-wise unary operators
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const Derived>
- operator-() const {
- return unaryExpr(internal::scalar_opposite_op<Scalar>());
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived>
- sqrt() const {
- return unaryExpr(internal::scalar_sqrt_op<Scalar>());
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sign_op<Scalar>, const Derived>
- sign() const {
- return unaryExpr(internal::scalar_sign_op<Scalar>());
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_rsqrt_op<Scalar>, const Derived>
- rsqrt() const {
- return unaryExpr(internal::scalar_rsqrt_op<Scalar>());
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_square_op<Scalar>, const Derived>
- square() const {
- return unaryExpr(internal::scalar_square_op<Scalar>());
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_cube_op<Scalar>, const Derived>
- cube() const {
- return unaryExpr(internal::scalar_cube_op<Scalar>());
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived>
- inverse() const {
- return unaryExpr(internal::scalar_inverse_op<Scalar>());
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_tanh_op<Scalar>, const Derived>
- tanh() const {
- return unaryExpr(internal::scalar_tanh_op<Scalar>());
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_lgamma_op<Scalar>, const Derived>
- lgamma() const {
- return unaryExpr(internal::scalar_lgamma_op<Scalar>());
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_digamma_op<Scalar>, const Derived>
- digamma() const {
- return unaryExpr(internal::scalar_digamma_op<Scalar>());
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_i0_op<Scalar>, const Derived>
- bessel_i0() const {
- return unaryExpr(internal::scalar_bessel_i0_op<Scalar>());
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_i0e_op<Scalar>, const Derived>
- bessel_i0e() const {
- return unaryExpr(internal::scalar_bessel_i0e_op<Scalar>());
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_i1_op<Scalar>, const Derived>
- bessel_i1() const {
- return unaryExpr(internal::scalar_bessel_i1_op<Scalar>());
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_i1e_op<Scalar>, const Derived>
- bessel_i1e() const {
- return unaryExpr(internal::scalar_bessel_i1e_op<Scalar>());
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_j0_op<Scalar>, const Derived>
- bessel_j0() const {
- return unaryExpr(internal::scalar_bessel_j0_op<Scalar>());
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_y0_op<Scalar>, const Derived>
- bessel_y0() const {
- return unaryExpr(internal::scalar_bessel_y0_op<Scalar>());
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_j1_op<Scalar>, const Derived>
- bessel_j1() const {
- return unaryExpr(internal::scalar_bessel_j1_op<Scalar>());
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_y1_op<Scalar>, const Derived>
- bessel_y1() const {
- return unaryExpr(internal::scalar_bessel_y1_op<Scalar>());
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_k0_op<Scalar>, const Derived>
- bessel_k0() const {
- return unaryExpr(internal::scalar_bessel_k0_op<Scalar>());
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_k0e_op<Scalar>, const Derived>
- bessel_k0e() const {
- return unaryExpr(internal::scalar_bessel_k0e_op<Scalar>());
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_k1_op<Scalar>, const Derived>
- bessel_k1() const {
- return unaryExpr(internal::scalar_bessel_k1_op<Scalar>());
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_k1e_op<Scalar>, const Derived>
- bessel_k1e() const {
- return unaryExpr(internal::scalar_bessel_k1e_op<Scalar>());
- }
-
- // igamma(a = this, x = other)
- template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorCwiseBinaryOp<internal::scalar_igamma_op<Scalar>, const Derived, const OtherDerived>
- igamma(const OtherDerived& other) const {
- return binaryExpr(other.derived(), internal::scalar_igamma_op<Scalar>());
- }
-
- // igamma_der_a(a = this, x = other)
- template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorCwiseBinaryOp<internal::scalar_igamma_der_a_op<Scalar>, const Derived, const OtherDerived>
- igamma_der_a(const OtherDerived& other) const {
- return binaryExpr(other.derived(), internal::scalar_igamma_der_a_op<Scalar>());
- }
-
- // gamma_sample_der_alpha(alpha = this, sample = other)
- template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorCwiseBinaryOp<internal::scalar_gamma_sample_der_alpha_op<Scalar>, const Derived, const OtherDerived>
- gamma_sample_der_alpha(const OtherDerived& other) const {
- return binaryExpr(other.derived(), internal::scalar_gamma_sample_der_alpha_op<Scalar>());
- }
-
- // igammac(a = this, x = other)
- template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorCwiseBinaryOp<internal::scalar_igammac_op<Scalar>, const Derived, const OtherDerived>
- igammac(const OtherDerived& other) const {
- return binaryExpr(other.derived(), internal::scalar_igammac_op<Scalar>());
- }
-
- // zeta(x = this, q = other)
- template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorCwiseBinaryOp<internal::scalar_zeta_op<Scalar>, const Derived, const OtherDerived>
- zeta(const OtherDerived& other) const {
- return binaryExpr(other.derived(), internal::scalar_zeta_op<Scalar>());
- }
-
- // polygamma(n = this, x = other)
- template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorCwiseBinaryOp<internal::scalar_polygamma_op<Scalar>, const Derived, const OtherDerived>
- polygamma(const OtherDerived& other) const {
- return binaryExpr(other.derived(), internal::scalar_polygamma_op<Scalar>());
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_erf_op<Scalar>, const Derived>
- erf() const {
- return unaryExpr(internal::scalar_erf_op<Scalar>());
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_erfc_op<Scalar>, const Derived>
- erfc() const {
- return unaryExpr(internal::scalar_erfc_op<Scalar>());
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_ndtri_op<Scalar>, const Derived>
- ndtri() const {
- return unaryExpr(internal::scalar_ndtri_op<Scalar>());
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_logistic_op<Scalar>, const Derived>
- sigmoid() const {
- return unaryExpr(internal::scalar_logistic_op<Scalar>());
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_exp_op<Scalar>, const Derived>
- exp() const {
- return unaryExpr(internal::scalar_exp_op<Scalar>());
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_expm1_op<Scalar>, const Derived>
- expm1() const {
- return unaryExpr(internal::scalar_expm1_op<Scalar>());
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log_op<Scalar>, const Derived>
- log() const {
- return unaryExpr(internal::scalar_log_op<Scalar>());
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log1p_op<Scalar>, const Derived>
- log1p() const {
- return unaryExpr(internal::scalar_log1p_op<Scalar>());
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log2_op<Scalar>, const Derived>
- log2() const {
- return unaryExpr(internal::scalar_log2_op<Scalar>());
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived>
- abs() const {
- return unaryExpr(internal::scalar_abs_op<Scalar>());
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_clamp_op<Scalar>, const Derived>
- clip(Scalar min, Scalar max) const {
- return unaryExpr(internal::scalar_clamp_op<Scalar>(min, max));
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const typename internal::conditional<NumTraits<CoeffReturnType>::IsComplex,
- TensorCwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, const Derived>,
- Derived>::type
- conjugate() const {
- return choose(Cond<NumTraits<CoeffReturnType>::IsComplex>(), unaryExpr(internal::scalar_conjugate_op<Scalar>()), derived());
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_pow_op<Scalar,Scalar> >, const Derived>
- pow(Scalar exponent) const {
- return unaryExpr(internal::bind2nd_op<internal::scalar_pow_op<Scalar,Scalar> >(exponent));
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_real_op<Scalar>, const Derived>
- real() const {
- return unaryExpr(internal::scalar_real_op<Scalar>());
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_imag_op<Scalar>, const Derived>
- imag() const {
- return unaryExpr(internal::scalar_imag_op<Scalar>());
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_sum_op<Scalar,Scalar> >, const Derived>
- operator+ (Scalar rhs) const {
- return unaryExpr(internal::bind2nd_op<internal::scalar_sum_op<Scalar,Scalar> >(rhs));
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE friend
- const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_sum_op<Scalar> >, const Derived>
- operator+ (Scalar lhs, const Derived& rhs) {
- return rhs.unaryExpr(internal::bind1st_op<internal::scalar_sum_op<Scalar> >(lhs));
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_difference_op<Scalar,Scalar> >, const Derived>
- operator- (Scalar rhs) const {
- EIGEN_STATIC_ASSERT((NumTraits<Scalar>::IsSigned || internal::is_same<Scalar, const std::complex<float> >::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
- return unaryExpr(internal::bind2nd_op<internal::scalar_difference_op<Scalar,Scalar> >(rhs));
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE friend
- const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_difference_op<Scalar> >, const Derived>
- operator- (Scalar lhs, const Derived& rhs) {
- return rhs.unaryExpr(internal::bind1st_op<internal::scalar_difference_op<Scalar> >(lhs));
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_product_op<Scalar,Scalar> >, const Derived>
- operator* (Scalar rhs) const {
- return unaryExpr(internal::bind2nd_op<internal::scalar_product_op<Scalar,Scalar> >(rhs));
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE friend
- const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_product_op<Scalar> >, const Derived>
- operator* (Scalar lhs, const Derived& rhs) {
- return rhs.unaryExpr(internal::bind1st_op<internal::scalar_product_op<Scalar> >(lhs));
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_quotient_op<Scalar,Scalar> >, const Derived>
- operator/ (Scalar rhs) const {
- return unaryExpr(internal::bind2nd_op<internal::scalar_quotient_op<Scalar,Scalar> >(rhs));
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE friend
- const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_quotient_op<Scalar> >, const Derived>
- operator/ (Scalar lhs, const Derived& rhs) {
- return rhs.unaryExpr(internal::bind1st_op<internal::scalar_quotient_op<Scalar> >(lhs));
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_mod_op<Scalar>, const Derived>
- operator% (Scalar rhs) const {
- EIGEN_STATIC_ASSERT(NumTraits<Scalar>::IsInteger, YOU_MADE_A_PROGRAMMING_MISTAKE_TRY_MOD);
- return unaryExpr(internal::scalar_mod_op<Scalar>(rhs));
- }
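
The unary and tensor-op-scalar operators above compose into lazy expression trees; nothing is computed until the expression is assigned. A minimal sketch:

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 1> x(4);
  x.setValues({1.0f, 4.0f, 9.0f, 16.0f});

  // y(i) = (2 * sqrt(x(i)) + 1) / 3, evaluated in one pass at assignment time.
  Eigen::Tensor<float, 1> y = (x.sqrt() * 2.0f + 1.0f) / 3.0f;
  Eigen::Tensor<float, 1> z = x.log().exp();  // round-trips back to x (up to rounding)
  return 0;
}
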
-
- template <int NanPropagation=PropagateFast>
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar,NanPropagation>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
- cwiseMax(Scalar threshold) const {
- return cwiseMax<NanPropagation>(constant(threshold));
- }
-
- template <int NanPropagation=PropagateFast>
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar,NanPropagation>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
- cwiseMin(Scalar threshold) const {
- return cwiseMin<NanPropagation>(constant(threshold));
- }
-
- template<typename NewType>
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const typename internal::conditional<internal::is_same<NewType, CoeffReturnType>::value,
- Derived,
- TensorConversionOp<NewType, const Derived> >::type
- cast() const {
- return choose(Cond<internal::is_same<NewType, CoeffReturnType>::value>(), derived(), TensorConversionOp<NewType, const Derived>(derived()));
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_round_op<Scalar>, const Derived>
- round() const {
- return unaryExpr(internal::scalar_round_op<Scalar>());
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_rint_op<Scalar>, const Derived>
- rint() const {
- return unaryExpr(internal::scalar_rint_op<Scalar>());
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_ceil_op<Scalar>, const Derived>
- ceil() const {
- return unaryExpr(internal::scalar_ceil_op<Scalar>());
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_floor_op<Scalar>, const Derived>
- floor() const {
- return unaryExpr(internal::scalar_floor_op<Scalar>());
- }
-
- // Generic binary operation support.
- template <typename CustomBinaryOp, typename OtherDerived> EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived>
- binaryExpr(const OtherDerived& other, const CustomBinaryOp& func) const {
- return TensorCwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived>(derived(), other, func);
- }
-
- // Coefficient-wise binary operators.
- template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const Derived, const OtherDerived>
- operator+(const OtherDerived& other) const {
- return binaryExpr(other.derived(), internal::scalar_sum_op<Scalar>());
- }
-
- template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorCwiseBinaryOp<internal::scalar_difference_op<Scalar>, const Derived, const OtherDerived>
- operator-(const OtherDerived& other) const {
- return binaryExpr(other.derived(), internal::scalar_difference_op<Scalar>());
- }
-
- template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorCwiseBinaryOp<internal::scalar_product_op<Scalar>, const Derived, const OtherDerived>
- operator*(const OtherDerived& other) const {
- return binaryExpr(other.derived(), internal::scalar_product_op<Scalar>());
- }
-
- template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorCwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const Derived, const OtherDerived>
- operator/(const OtherDerived& other) const {
- return binaryExpr(other.derived(), internal::scalar_quotient_op<Scalar>());
- }
-
- template<int NaNPropagation=PropagateFast, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorCwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar, NaNPropagation>, const Derived, const OtherDerived>
- cwiseMax(const OtherDerived& other) const {
- return binaryExpr(other.derived(), internal::scalar_max_op<Scalar,Scalar, NaNPropagation>());
- }
-
- template<int NaNPropagation=PropagateFast, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorCwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar, NaNPropagation>, const Derived, const OtherDerived>
- cwiseMin(const OtherDerived& other) const {
- return binaryExpr(other.derived(), internal::scalar_min_op<Scalar,Scalar, NaNPropagation>());
- }
-
- template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorCwiseBinaryOp<internal::scalar_boolean_and_op, const Derived, const OtherDerived>
- operator&&(const OtherDerived& other) const {
- return binaryExpr(other.derived(), internal::scalar_boolean_and_op());
- }
-
- template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorCwiseBinaryOp<internal::scalar_boolean_or_op, const Derived, const OtherDerived>
- operator||(const OtherDerived& other) const {
- return binaryExpr(other.derived(), internal::scalar_boolean_or_op());
- }
-
- template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorCwiseBinaryOp<internal::scalar_boolean_xor_op, const Derived, const OtherDerived>
- operator^(const OtherDerived& other) const {
- return binaryExpr(other.derived(), internal::scalar_boolean_xor_op());
- }
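
A sketch of the coefficient-wise binary operators; note in particular that operator* here is a per-coefficient product, not a matrix product (use contract() for that):

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> a(2, 2), b(2, 2);
  a.setValues({{1.0f, 2.0f}, {3.0f, 4.0f}});
  b.setValues({{4.0f, 3.0f}, {2.0f, 1.0f}});

  Eigen::Tensor<float, 2> sum  = a + b;          // coefficient-wise sum
  Eigen::Tensor<float, 2> prod = a * b;          // coefficient-wise product
  Eigen::Tensor<float, 2> hi   = a.cwiseMax(b);  // per-coefficient maximum
  return 0;
}
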
-
- // Comparisons and tests.
- template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>, const Derived, const OtherDerived>
- operator<(const OtherDerived& other) const {
- return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>());
- }
- template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>, const Derived, const OtherDerived>
- operator<=(const OtherDerived& other) const {
- return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>());
- }
- template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>, const Derived, const OtherDerived>
- operator>(const OtherDerived& other) const {
- return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>());
- }
- template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>, const Derived, const OtherDerived>
- operator>=(const OtherDerived& other) const {
- return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>());
- }
-
- template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>, const Derived, const OtherDerived>
- operator==(const OtherDerived& other) const {
- return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>());
- }
-
- template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>, const Derived, const OtherDerived>
- operator!=(const OtherDerived& other) const {
- return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>());
- }
-
- // comparisons and tests for Scalars
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
- operator<(Scalar threshold) const {
- return operator<(constant(threshold));
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
- operator<=(Scalar threshold) const {
- return operator<=(constant(threshold));
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
- operator>(Scalar threshold) const {
- return operator>(constant(threshold));
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
- operator>=(Scalar threshold) const {
- return operator>=(constant(threshold));
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
- operator==(Scalar threshold) const {
- return operator==(constant(threshold));
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
- operator!=(Scalar threshold) const {
- return operator!=(constant(threshold));
- }
-
- // Checks
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_isnan_op<Scalar>, const Derived>
- (isnan)() const {
- return unaryExpr(internal::scalar_isnan_op<Scalar>());
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_isinf_op<Scalar>, const Derived>
- (isinf)() const {
- return unaryExpr(internal::scalar_isinf_op<Scalar>());
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_isfinite_op<Scalar>, const Derived>
- (isfinite)() const {
- return unaryExpr(internal::scalar_isfinite_op<Scalar>());
- }
-
- // Coefficient-wise ternary operators.
- template<typename ThenDerived, typename ElseDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorSelectOp<const Derived, const ThenDerived, const ElseDerived>
- select(const ThenDerived& thenTensor, const ElseDerived& elseTensor) const {
- return TensorSelectOp<const Derived, const ThenDerived, const ElseDerived>(derived(), thenTensor.derived(), elseTensor.derived());
- }
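
The comparison operators produce boolean-valued expressions, which combine naturally with select(); for example, a ReLU can be written as the following sketch:

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 1> x(4);
  x.setValues({-2.0f, -1.0f, 1.0f, 2.0f});

  // Per coefficient: pick x where x > 0, otherwise pick 0.
  Eigen::Tensor<float, 1> relu = (x > 0.0f).select(x, x.constant(0.0f));
  return 0;
}
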
-
- // Contractions.
- typedef Eigen::IndexPair<Index> DimensionPair;
-
- template<typename OtherDerived, typename Dimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorContractionOp<const Dimensions, const Derived, const OtherDerived, const NoOpOutputKernel>
- contract(const OtherDerived& other, const Dimensions& dims) const {
- return TensorContractionOp<const Dimensions, const Derived, const OtherDerived, const NoOpOutputKernel>(derived(), other.derived(), dims);
- }
-
- template<typename OtherDerived, typename Dimensions, typename OutputKernel> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorContractionOp<const Dimensions, const Derived, const OtherDerived, const OutputKernel>
- contract(const OtherDerived& other, const Dimensions& dims, const OutputKernel& output_kernel) const {
- return TensorContractionOp<const Dimensions, const Derived, const OtherDerived, const OutputKernel>(derived(), other.derived(), dims, output_kernel);
- }
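
contract() is the Tensor module's generalization of matrix multiplication: each IndexPair names one dimension of this tensor and one dimension of the other to sum over. An ordinary matrix product, for instance:

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> a(2, 3), b(3, 4);
  a.setRandom();
  b.setRandom();

  // Sum a's dimension 1 against b's dimension 0: c(i,k) = sum_j a(i,j) * b(j,k).
  Eigen::array<Eigen::IndexPair<int>, 1> dims = { Eigen::IndexPair<int>(1, 0) };
  Eigen::Tensor<float, 2> c = a.contract(b, dims);  // c is 2 x 4
  return 0;
}
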
-
- // Convolutions.
- template<typename KernelDerived, typename Dimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorConvolutionOp<const Dimensions, const Derived, const KernelDerived>
- convolve(const KernelDerived& kernel, const Dimensions& dims) const {
- return TensorConvolutionOp<const Dimensions, const Derived, const KernelDerived>(derived(), kernel.derived(), dims);
- }
-
- // Fourier transforms
- template <int FFTDataType, int FFTDirection, typename FFT> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorFFTOp<const FFT, const Derived, FFTDataType, FFTDirection>
- fft(const FFT& dims) const {
- return TensorFFTOp<const FFT, const Derived, FFTDataType, FFTDirection>(derived(), dims);
- }
-
- // Scan.
- typedef TensorScanOp<internal::SumReducer<CoeffReturnType>, const Derived> TensorScanSumOp;
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorScanSumOp
- cumsum(const Index& axis, bool exclusive = false) const {
- return TensorScanSumOp(derived(), axis, exclusive);
- }
-
- typedef TensorScanOp<internal::ProdReducer<CoeffReturnType>, const Derived> TensorScanProdOp;
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorScanProdOp
- cumprod(const Index& axis, bool exclusive = false) const {
- return TensorScanProdOp(derived(), axis, exclusive);
- }
-
- template <typename Reducer>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorScanOp<Reducer, const Derived>
- scan(const Index& axis, const Reducer& reducer, bool exclusive = false) const {
- return TensorScanOp<Reducer, const Derived>(derived(), axis, exclusive, reducer);
- }
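
A sketch of the scan operators; the second argument selects inclusive (the default) versus exclusive accumulation:

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<int, 1> x(4);
  x.setValues({1, 2, 3, 4});

  Eigen::Tensor<int, 1> inc = x.cumsum(0);        // inclusive: {1, 3, 6, 10}
  Eigen::Tensor<int, 1> exc = x.cumsum(0, true);  // exclusive: {0, 1, 3, 6}
  Eigen::Tensor<int, 1> cp  = x.cumprod(0);       // {1, 2, 6, 24}
  return 0;
}
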
-
- // Reductions.
- template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorReductionOp<internal::SumReducer<CoeffReturnType>, const Dims, const Derived>
- sum(const Dims& dims) const {
- return TensorReductionOp<internal::SumReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::SumReducer<CoeffReturnType>());
- }
-
- const TensorReductionOp<internal::SumReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>
- sum() const {
- DimensionList<Index, NumDimensions> in_dims;
- return TensorReductionOp<internal::SumReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::SumReducer<CoeffReturnType>());
- }
-
- template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const Dims, const Derived>
- mean(const Dims& dims) const {
- return TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::MeanReducer<CoeffReturnType>());
- }
-
- const TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>
- mean() const {
- DimensionList<Index, NumDimensions> in_dims;
- return TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MeanReducer<CoeffReturnType>());
- }
-
- template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const Dims, const Derived>
- prod(const Dims& dims) const {
- return TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::ProdReducer<CoeffReturnType>());
- }
-
- const TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>
- prod() const {
- DimensionList<Index, NumDimensions> in_dims;
- return TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::ProdReducer<CoeffReturnType>());
- }
-
- template <typename Dims,int NanPropagation=PropagateFast> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorReductionOp<internal::MaxReducer<CoeffReturnType,NanPropagation>, const Dims, const Derived>
- maximum(const Dims& dims) const {
- return TensorReductionOp<internal::MaxReducer<CoeffReturnType,NanPropagation>, const Dims, const Derived>(derived(), dims, internal::MaxReducer<CoeffReturnType,NanPropagation>());
- }
-
- template <int NanPropagation=PropagateFast>
- const TensorReductionOp<internal::MaxReducer<CoeffReturnType,NanPropagation>, const DimensionList<Index, NumDimensions>, const Derived>
- maximum() const {
- DimensionList<Index, NumDimensions> in_dims;
- return TensorReductionOp<internal::MaxReducer<CoeffReturnType,NanPropagation>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MaxReducer<CoeffReturnType,NanPropagation>());
- }
-
- template <typename Dims,int NanPropagation=PropagateFast> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorReductionOp<internal::MinReducer<CoeffReturnType,NanPropagation>, const Dims, const Derived>
- minimum(const Dims& dims) const {
- return TensorReductionOp<internal::MinReducer<CoeffReturnType,NanPropagation>, const Dims, const Derived>(derived(), dims, internal::MinReducer<CoeffReturnType,NanPropagation>());
- }
-
- template <int NanPropagation=PropagateFast>
- const TensorReductionOp<internal::MinReducer<CoeffReturnType,NanPropagation>, const DimensionList<Index, NumDimensions>, const Derived>
- minimum() const {
- DimensionList<Index, NumDimensions> in_dims;
- return TensorReductionOp<internal::MinReducer<CoeffReturnType,NanPropagation>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MinReducer<CoeffReturnType,NanPropagation>());
- }
-
- template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorReductionOp<internal::AndReducer, const Dims, const typename internal::conditional<internal::is_same<bool, CoeffReturnType>::value, Derived, TensorConversionOp<bool, const Derived> >::type >
- all(const Dims& dims) const {
- return cast<bool>().reduce(dims, internal::AndReducer());
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorReductionOp<internal::AndReducer, const DimensionList<Index, NumDimensions>, const typename internal::conditional<internal::is_same<bool, CoeffReturnType>::value, Derived, TensorConversionOp<bool, const Derived> >::type >
- all() const {
- DimensionList<Index, NumDimensions> in_dims;
- return cast<bool>().reduce(in_dims, internal::AndReducer());
- }
-
- template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorReductionOp<internal::OrReducer, const Dims, const typename internal::conditional<internal::is_same<bool, CoeffReturnType>::value, Derived, TensorConversionOp<bool, const Derived> >::type >
- any(const Dims& dims) const {
- return cast<bool>().reduce(dims, internal::OrReducer());
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorReductionOp<internal::OrReducer, const DimensionList<Index, NumDimensions>, const typename internal::conditional<internal::is_same<bool, CoeffReturnType>::value, Derived, TensorConversionOp<bool, const Derived> >::type >
- any() const {
- DimensionList<Index, NumDimensions> in_dims;
- return cast<bool>().reduce(in_dims, internal::OrReducer());
- }
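
Reductions take the set of dimensions to reduce over; the overloads without arguments reduce over every dimension and yield a rank-0 tensor, read back with operator(). A minimal sketch:

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> t(2, 3);
  t.setValues({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}});

  Eigen::array<int, 1> dims = {1};                 // reduce over dimension 1
  Eigen::Tensor<float, 1> row_sums = t.sum(dims);  // {6, 15}

  Eigen::Tensor<float, 0> total = t.sum();         // reduce over all dimensions
  float total_value = total();                     // 21
  (void)total_value;
  return 0;
}
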
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorTupleReducerOp<
- internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >,
- const array<Index, NumDimensions>, const Derived>
- argmax() const {
- array<Index, NumDimensions> in_dims;
- for (Index d = 0; d < NumDimensions; ++d) in_dims[d] = d;
- return TensorTupleReducerOp<
- internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >,
- const array<Index, NumDimensions>,
- const Derived>(derived(), internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >(), -1, in_dims);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorTupleReducerOp<
- internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >,
- const array<Index, NumDimensions>, const Derived>
- argmin() const {
- array<Index, NumDimensions> in_dims;
- for (Index d = 0; d < NumDimensions; ++d) in_dims[d] = d;
- return TensorTupleReducerOp<
- internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >,
- const array<Index, NumDimensions>,
- const Derived>(derived(), internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >(), -1, in_dims);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorTupleReducerOp<
- internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >,
- const array<Index, 1>, const Derived>
- argmax(const Index return_dim) const {
- array<Index, 1> in_dims;
- in_dims[0] = return_dim;
- return TensorTupleReducerOp<
- internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >,
- const array<Index, 1>,
- const Derived>(derived(), internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >(), return_dim, in_dims);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorTupleReducerOp<
- internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >,
- const array<Index, 1>, const Derived>
- argmin(const Index return_dim) const {
- array<Index, 1> in_dims;
- in_dims[0] = return_dim;
- return TensorTupleReducerOp<
- internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >,
- const array<Index, 1>,
- const Derived>(derived(), internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >(), return_dim, in_dims);
- }
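
A sketch of the argmax/argmin reducers, assuming the return types documented in the module README: the no-argument form returns the flattened (layout-dependent) index of the extremal coefficient, while the one-argument form returns one offset per position left after reducing that dimension:

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> t(2, 3);
  t.setValues({{1.0f, 9.0f, 3.0f}, {4.0f, 5.0f, 6.0f}});

  // Flattened index of the single largest coefficient.
  Eigen::Tensor<Eigen::DenseIndex, 0> flat = t.argmax();

  // For each row, the column offset of that row's maximum: {1, 2}.
  Eigen::Tensor<Eigen::DenseIndex, 1> per_row = t.argmax(1);
  return 0;
}
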
-
- template <typename Reducer, typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorReductionOp<Reducer, const Dims, const Derived>
- reduce(const Dims& dims, const Reducer& reducer) const {
- return TensorReductionOp<Reducer, const Dims, const Derived>(derived(), dims, reducer);
- }
-
- template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorTraceOp<const Dims, const Derived>
- trace(const Dims& dims) const {
- return TensorTraceOp<const Dims, const Derived>(derived(), dims);
- }
-
- const TensorTraceOp<const DimensionList<Index, NumDimensions>, const Derived>
- trace() const {
- DimensionList<Index, NumDimensions> in_dims;
- return TensorTraceOp<const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims);
- }
-
- template <typename Broadcast> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorBroadcastingOp<const Broadcast, const Derived>
- broadcast(const Broadcast& bcast) const {
- return TensorBroadcastingOp<const Broadcast, const Derived>(derived(), bcast);
- }
-
- template <typename Axis, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorConcatenationOp<Axis, const Derived, const OtherDerived>
- concatenate(const OtherDerived& other, Axis axis) const {
- return TensorConcatenationOp<Axis, const Derived, const OtherDerived>(derived(), other.derived(), axis);
- }
-
- template <typename PatchDims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorPatchOp<const PatchDims, const Derived>
- extract_patches(const PatchDims& patch_dims) const {
- return TensorPatchOp<const PatchDims, const Derived>(derived(), patch_dims);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorImagePatchOp<Dynamic, Dynamic, const Derived>
- extract_image_patches(const Index patch_rows = 1, const Index patch_cols = 1,
- const Index row_stride = 1, const Index col_stride = 1,
- const Index in_row_stride = 1, const Index in_col_stride = 1,
- const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = Scalar(0)) const {
- return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride,
- in_row_stride, in_col_stride, 1, 1, padding_type, padding_value);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorImagePatchOp<Dynamic, Dynamic, const Derived>
- extract_image_patches(const Index patch_rows, const Index patch_cols,
- const Index row_stride, const Index col_stride,
- const Index in_row_stride, const Index in_col_stride,
- const Index row_inflate_stride, const Index col_inflate_stride,
- const Index padding_top, const Index padding_bottom,
- const Index padding_left,const Index padding_right,
- const Scalar padding_value) const {
- return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride,
- in_row_stride, in_col_stride, row_inflate_stride, col_inflate_stride,
- padding_top, padding_bottom, padding_left, padding_right, padding_value);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived>
- extract_volume_patches(const Index patch_planes, const Index patch_rows, const Index patch_cols,
- const Index plane_stride = 1, const Index row_stride = 1, const Index col_stride = 1,
- const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = Scalar(0)) const {
- return TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived>(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, 1, 1, 1, padding_type, padding_value);
- }
-
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived>
- extract_volume_patches(const Index patch_planes, const Index patch_rows, const Index patch_cols,
- const Index plane_stride, const Index row_stride, const Index col_stride,
- const Index plane_inflate_stride, const Index row_inflate_stride, const Index col_inflate_stride,
- const Index padding_top_z, const Index padding_bottom_z,
- const Index padding_top, const Index padding_bottom,
- const Index padding_left, const Index padding_right, const Scalar padding_value = Scalar(0)) const {
- return TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived>(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, plane_inflate_stride, row_inflate_stride, col_inflate_stride, padding_top_z, padding_bottom_z, padding_top, padding_bottom, padding_left, padding_right, padding_value);
- }
-
- // Morphing operators.
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorLayoutSwapOp<const Derived>
- swap_layout() const {
- return TensorLayoutSwapOp<const Derived>(derived());
- }
- template <typename NewDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorReshapingOp<const NewDimensions, const Derived>
- reshape(const NewDimensions& newDimensions) const {
- return TensorReshapingOp<const NewDimensions, const Derived>(derived(), newDimensions);
- }
- template <typename StartIndices, typename Sizes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorSlicingOp<const StartIndices, const Sizes, const Derived>
- slice(const StartIndices& startIndices, const Sizes& sizes) const {
- return TensorSlicingOp<const StartIndices, const Sizes, const Derived>(derived(), startIndices, sizes);
- }
- template <typename StartIndices, typename StopIndices, typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides, const Derived>
- stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) const {
- return TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides,
- const Derived>(derived(), startIndices, stopIndices, strides);
- }
- template <Index DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorChippingOp<DimId, const Derived>
- chip(const Index offset) const {
- return TensorChippingOp<DimId, const Derived>(derived(), offset, DimId);
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorChippingOp<Dynamic, const Derived>
- chip(const Index offset, const Index dim) const {
- return TensorChippingOp<Dynamic, const Derived>(derived(), offset, dim);
- }
- template <typename ReverseDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorReverseOp<const ReverseDimensions, const Derived>
- reverse(const ReverseDimensions& rev) const {
- return TensorReverseOp<const ReverseDimensions, const Derived>(derived(), rev);
- }
- template <typename PaddingDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorPaddingOp<const PaddingDimensions, const Derived>
- pad(const PaddingDimensions& padding) const {
- return TensorPaddingOp<const PaddingDimensions, const Derived>(derived(), padding, internal::scalar_cast_op<int, Scalar>()(0));
- }
- template <typename PaddingDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorPaddingOp<const PaddingDimensions, const Derived>
- pad(const PaddingDimensions& padding, const Scalar padding_value) const {
- return TensorPaddingOp<const PaddingDimensions, const Derived>(derived(), padding, padding_value);
- }
- template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorShufflingOp<const Shuffle, const Derived>
- shuffle(const Shuffle& shfl) const {
- return TensorShufflingOp<const Shuffle, const Derived>(derived(), shfl);
- }
- template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorStridingOp<const Strides, const Derived>
- stride(const Strides& strides) const {
- return TensorStridingOp<const Strides, const Derived>(derived(), strides);
- }
- template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorInflationOp<const Strides, const Derived>
- inflate(const Strides& strides) const {
- return TensorInflationOp<const Strides, const Derived>(derived(), strides);
- }
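
These morphing operators all return lazily evaluated expressions over the same data; a copy is only materialized when the expression is assigned to a tensor. A minimal sketch of the most common ones:

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> t(4, 6);
  t.setRandom();

  Eigen::array<Eigen::Index, 3> new_dims = {4, 3, 2};   // same 24 coefficients
  Eigen::Tensor<float, 3> r = t.reshape(new_dims);

  Eigen::array<Eigen::Index, 2> offsets = {1, 2};       // 2x2 window at (1, 2)
  Eigen::array<Eigen::Index, 2> extents = {2, 2};
  Eigen::Tensor<float, 2> window = t.slice(offsets, extents);

  Eigen::Tensor<float, 1> row0 = t.chip(0, 0);          // slice of dimension 0 at offset 0
  return 0;
}
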
-
- // Returns a tensor containing index/value tuples
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorIndexTupleOp<const Derived>
- index_tuples() const {
- return TensorIndexTupleOp<const Derived>(derived());
- }
-
- // Support for custom unary and binary operations
- template <typename CustomUnaryFunc>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorCustomUnaryOp<const CustomUnaryFunc, const Derived> customOp(const CustomUnaryFunc& op) const {
- return TensorCustomUnaryOp<const CustomUnaryFunc, const Derived>(derived(), op);
- }
- template <typename OtherDerived, typename CustomBinaryFunc>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorCustomBinaryOp<const CustomBinaryFunc, const Derived, const OtherDerived> customOp(const OtherDerived& other, const CustomBinaryFunc& op) const {
- return TensorCustomBinaryOp<const CustomBinaryFunc, const Derived, const OtherDerived>(derived(), other, op);
- }
-
- // Force the evaluation of the expression.
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorForcedEvalOp<const Derived> eval() const {
- return TensorForcedEvalOp<const Derived>(derived());
- }
-
- protected:
- template <typename Scalar, int NumIndices, int Options, typename IndexType> friend class Tensor;
- template <typename Scalar, typename Dimensions, int Option, typename IndexTypes> friend class TensorFixedSize;
- // the Eigen:: prefix is required to workaround a compilation issue with nvcc 9.0
- template <typename OtherDerived, int AccessLevel> friend class Eigen::TensorBase;
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast<const Derived*>(this); }
-};
-
-template<typename Derived, int AccessLevel = internal::accessors_level<Derived>::value>
-class TensorBase : public TensorBase<Derived, ReadOnlyAccessors> {
- public:
- typedef TensorBase<Derived, ReadOnlyAccessors> Base;
- typedef internal::traits<Derived> DerivedTraits;
- typedef typename DerivedTraits::Scalar Scalar;
- typedef typename DerivedTraits::Index Index;
- typedef Scalar CoeffReturnType;
- static const int NumDimensions = DerivedTraits::NumDimensions;
-
- template <typename Scalar, int NumIndices, int Options, typename IndexType> friend class Tensor;
- template <typename Scalar, typename Dimensions, int Option, typename IndexTypes> friend class TensorFixedSize;
- // the Eigen:: prefix is required to workaround a compilation issue with nvcc 9.0
- template <typename OtherDerived, int OtherAccessLevel> friend class Eigen::TensorBase;
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Derived& setZero() {
- return setConstant(Scalar(0));
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Derived& setConstant(const Scalar& val) {
- return derived() = this->constant(val);
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Derived& setRandom() {
- return derived() = this->random();
- }
- template <typename RandomGenerator> EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Derived& setRandom() {
- return derived() = this->template random<RandomGenerator>();
- }
-
-#if EIGEN_HAS_VARIADIC_TEMPLATES
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Derived& setValues(
- const typename internal::Initializer<Derived, NumDimensions>::InitList& vals) {
- TensorEvaluator<Derived, DefaultDevice> eval(derived(), DefaultDevice());
- internal::initialize_tensor<Derived, NumDimensions>(eval, vals);
- return derived();
- }
-#endif // EIGEN_HAS_VARIADIC_TEMPLATES
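
A sketch of the mutating helpers on the writable base class, assuming a C++11 compiler so that the variadic setValues overload above is available; the outer initializer list runs along the first dimension:

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> t(2, 3);
  t.setZero();                         // all coefficients to 0
  t.setConstant(7.0f);                 // all coefficients to 7
  t.setRandom();                       // uniform random coefficients
  t.setValues({{1.0f, 2.0f, 3.0f},
               {4.0f, 5.0f, 6.0f}});   // t(i, j) == 3*i + j + 1
  return 0;
}
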
-
- template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- Derived& operator+=(const OtherDerived& other) {
- return derived() = derived() + other.derived();
- }
- template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- Derived& operator-=(const OtherDerived& other) {
- return derived() = derived() - other.derived();
- }
- template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- Derived& operator*=(const OtherDerived& other) {
- return derived() = derived() * other.derived();
- }
- template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- Derived& operator/=(const OtherDerived& other) {
- return derived() = derived() / other.derived();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorLayoutSwapOp<const Derived>
- swap_layout() const {
- return TensorLayoutSwapOp<const Derived>(derived());
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- TensorLayoutSwapOp<Derived>
- swap_layout() {
- return TensorLayoutSwapOp<Derived>(derived());
- }
-
- template <typename Axis, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorConcatenationOp<const Axis, const Derived, const OtherDerived>
- concatenate(const OtherDerived& other, const Axis& axis) const {
- return TensorConcatenationOp<const Axis, const Derived, const OtherDerived>(derived(), other, axis);
- }
- template <typename Axis, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- TensorConcatenationOp<const Axis, Derived, OtherDerived>
- concatenate(const OtherDerived& other, const Axis& axis) {
- return TensorConcatenationOp<const Axis, Derived, OtherDerived>(derived(), other, axis);
- }
-
- template <typename NewDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorReshapingOp<const NewDimensions, const Derived>
- reshape(const NewDimensions& newDimensions) const {
- return TensorReshapingOp<const NewDimensions, const Derived>(derived(), newDimensions);
- }
- template <typename NewDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- TensorReshapingOp<const NewDimensions, Derived>
- reshape(const NewDimensions& newDimensions) {
- return TensorReshapingOp<const NewDimensions, Derived>(derived(), newDimensions);
- }
-
- template <typename StartIndices, typename Sizes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorSlicingOp<const StartIndices, const Sizes, const Derived>
- slice(const StartIndices& startIndices, const Sizes& sizes) const {
- return TensorSlicingOp<const StartIndices, const Sizes, const Derived>(derived(), startIndices, sizes);
- }
- template <typename StartIndices, typename Sizes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- TensorSlicingOp<const StartIndices, const Sizes, Derived>
- slice(const StartIndices& startIndices, const Sizes& sizes) {
- return TensorSlicingOp<const StartIndices, const Sizes, Derived>(derived(), startIndices, sizes);
- }
-
- template <typename StartIndices, typename StopIndices, typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides, const Derived>
- stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) const {
- return TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides,
- const Derived>(derived(), startIndices, stopIndices, strides);
- }
- template <typename StartIndices, typename StopIndices, typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides, Derived>
- stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) {
- return TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides,
- Derived>(derived(), startIndices, stopIndices, strides);
- }
-
- template <DenseIndex DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorChippingOp<DimId, const Derived>
- chip(const Index offset) const {
- return TensorChippingOp<DimId, const Derived>(derived(), offset, DimId);
- }
-  template <DenseIndex DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- TensorChippingOp<DimId, Derived>
- chip(const Index offset) {
- return TensorChippingOp<DimId, Derived>(derived(), offset, DimId);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorChippingOp<Dynamic, const Derived>
- chip(const Index offset, const Index dim) const {
- return TensorChippingOp<Dynamic, const Derived>(derived(), offset, dim);
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- TensorChippingOp<Dynamic, Derived>
- chip(const Index offset, const Index dim) {
- return TensorChippingOp<Dynamic, Derived>(derived(), offset, dim);
- }
-
- template <typename ReverseDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorReverseOp<const ReverseDimensions, const Derived>
- reverse(const ReverseDimensions& rev) const {
- return TensorReverseOp<const ReverseDimensions, const Derived>(derived(), rev);
- }
- template <typename ReverseDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- TensorReverseOp<const ReverseDimensions, Derived>
- reverse(const ReverseDimensions& rev) {
- return TensorReverseOp<const ReverseDimensions, Derived>(derived(), rev);
- }
-
- template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorShufflingOp<const Shuffle, const Derived>
- shuffle(const Shuffle& shfl) const {
- return TensorShufflingOp<const Shuffle, const Derived>(derived(), shfl);
- }
- template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- TensorShufflingOp<const Shuffle, Derived>
- shuffle(const Shuffle& shfl) {
- return TensorShufflingOp<const Shuffle, Derived>(derived(), shfl);
- }
-
- template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const TensorStridingOp<const Strides, const Derived>
- stride(const Strides& strides) const {
- return TensorStridingOp<const Strides, const Derived>(derived(), strides);
- }
- template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- TensorStridingOp<const Strides, Derived>
- stride(const Strides& strides) {
- return TensorStridingOp<const Strides, Derived>(derived(), strides);
- }
-
- // Select the device on which to evaluate the expression.
- template <typename DeviceType>
- TensorDevice<Derived, DeviceType> device(const DeviceType& dev) {
- return TensorDevice<Derived, DeviceType>(dev, derived());
- }
-
- // Select the async device on which to evaluate the expression.
- template <typename DeviceType, typename DoneCallback>
- TensorAsyncDevice<Derived, DeviceType, DoneCallback> device(const DeviceType& dev, DoneCallback done) {
- return TensorAsyncDevice<Derived, DeviceType, DoneCallback>(dev, derived(), std::move(done));
- }
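-
-    // A usage sketch (illustrative, not part of the declarations above):
-    // `device()` returns a thin wrapper that is used as an assignment target.
-    // Assuming tensors `a`, `b` and `c` of matching dimensions:
-    //
-    //   Eigen::ThreadPool pool(4);
-    //   Eigen::ThreadPoolDevice my_device(&pool, /*num_cores=*/4);
-    //   c.device(my_device) = a + b;  // evaluate a + b on the thread pool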
-
- protected:
- EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TensorBase)
- EIGEN_DEFAULT_COPY_CONSTRUCTOR(TensorBase)
-
- template<typename OtherDerived> EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Derived& operator=(const OtherDerived& other)
- {
- typedef TensorAssignOp<Derived, const OtherDerived> Assign;
- Assign assign(derived(), other.derived());
- internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
- return derived();
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Derived& derived() { return *static_cast<Derived*>(this); }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast<const Derived*>(this); }
-};
-#endif // EIGEN_PARSED_BY_DOXYGEN
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_BASE_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorBlock.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorBlock.h
deleted file mode 100644
index 1e55d12..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorBlock.h
+++ /dev/null
@@ -1,1559 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
-#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
-
-namespace Eigen {
-namespace internal {
-
-// -------------------------------------------------------------------------- //
-// Forward declarations for templates defined below.
-template <typename Scalar, typename IndexType, int NumDims, int Layout>
-class TensorBlockIO;
-
-// -------------------------------------------------------------------------- //
-// Helper function to compute strides for a densely stored buffer of given
-// dimensions.
-
-// TODO(ezhulenev): We compute strides 1000 times in different evaluators, use
-// this function instead everywhere.
-template <int Layout, typename IndexType, int NumDims>
-EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
- const DSizes<IndexType, NumDims>& dimensions) {
- DSizes<IndexType, NumDims> strides;
- if (NumDims == 0) return strides;
-
- // TODO(ezhulenev): Use templates to unroll this loop (similar to
- // h_array_reduce in CXX11meta.h)? Benchmark it.
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- strides[0] = 1;
- for (int i = 1; i < NumDims; ++i) {
- strides[i] = strides[i - 1] * dimensions[i - 1];
- }
- } else {
- strides[NumDims - 1] = 1;
- for (int i = NumDims - 2; i >= 0; --i) {
- strides[i] = strides[i + 1] * dimensions[i + 1];
- }
- }
-
- return strides;
-}
-
-template <int Layout, typename IndexType, size_t NumDims>
-EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
- const Eigen::array<IndexType, NumDims>& dimensions) {
- return strides<Layout>(DSizes<IndexType, NumDims>(dimensions));
-}
-
-template <int Layout, std::ptrdiff_t... Indices>
-EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides(
- const Sizes<Indices...>& sizes) {
- return strides<Layout>(DSizes<std::ptrdiff_t, sizeof...(Indices)>(sizes));
-}
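-
-// Example (a small illustrative sketch): for dimensions [2, 3, 4] the strides
-// are [1, 2, 6] in ColMajor and [12, 4, 1] in RowMajor:
-//
-//   DSizes<Eigen::Index, 3> dims(2, 3, 4);
-//   DSizes<Eigen::Index, 3> cm = strides<ColMajor>(dims);  // {1, 2, 6}
-//   DSizes<Eigen::Index, 3> rm = strides<RowMajor>(dims);  // {12, 4, 1}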
-
-// -------------------------------------------------------------------------- //
-
-// Tensor block shape type defines the shape preference for the blocks
-// extracted from the larger tensor.
-//
-// Example: blocks of 100 elements from the large 100x100 tensor:
-// - tensor: 100x100
-// - target_block_size: 100
-//
-// TensorBlockShapeType:
-// - kUniformAllDims: 100 blocks of size 10x10
-// - kSkewedInnerDims: 100 blocks of size 100x1 (or 1x100 depending on a column
-// or row major layout)
-enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims };
-
-struct TensorBlockResourceRequirements {
- TensorBlockShapeType shape_type; // target block shape
- size_t size; // target block size
- TensorOpCost cost_per_coeff; // cost of computing a single block element
-
-#ifdef EIGEN_HIPCC
-  // For HIPCC, we need to explicitly declare as a "device function" the
-  // constructor which is implicitly invoked in the "merge" / "any" routines,
-  // else HIPCC errors out complaining about the lack of a matching constructor.
- EIGEN_DEVICE_FUNC
- TensorBlockResourceRequirements(TensorBlockShapeType shape_type_, size_t size_,
- TensorOpCost cost_)
- : shape_type(shape_type_), size(size_), cost_per_coeff(cost_)
- {}
-#endif
-
- template <typename Scalar>
- EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
- TensorBlockShapeType shape_type, size_t size_in_bytes,
- TensorOpCost cost) {
- const size_t size = numext::maxi(size_t(1), size_in_bytes / sizeof(Scalar));
- return {shape_type, size, cost};
- }
-
- template <typename Scalar>
- EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
- TensorBlockShapeType shape_type, size_t size_in_bytes) {
- // This default cost per coefficient is valid for most materialized tensor
- // block evaluation implementations, because they typically just read
- // coefficients from the underlying tensor storage, and write to the tensor
- // block buffer (scratch or destination memory, reads and writes have linear
- // access pattern). We ignore the fixed cost of block evaluation, because in
-    // practice it should be negligible.
- //
- // Lazy block evaluation adds the cost of calling a functor for each
- // coefficient.
- //
- // All non-trivial block evaluation implementations must provide their own
- // cost approximation (e.g. shuffling inner dimension has a much higher cost
- // because it reads memory randomly, although the total number of moved
- // bytes is the same).
- return withShapeAndSize<Scalar>(shape_type, size_in_bytes,
- {/*bytes_loaded=*/sizeof(Scalar),
- /*bytes_stored=*/sizeof(Scalar),
- /*compute_cycles=*/0});
- }
-
- template <typename Scalar>
- EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements skewed(
- size_t size_in_bytes) {
- return withShapeAndSize<Scalar>(TensorBlockShapeType::kSkewedInnerDims,
- size_in_bytes);
- }
-
- template <typename Scalar>
- EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements uniform(
- size_t size_in_bytes) {
- return withShapeAndSize<Scalar>(TensorBlockShapeType::kUniformAllDims,
- size_in_bytes);
- }
-
- EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE TensorBlockResourceRequirements
- merge(const TensorBlockResourceRequirements& lhs,
- const TensorBlockResourceRequirements& rhs) {
- return {merge(lhs.shape_type, rhs.shape_type), // shape_type
- merge(lhs.size, rhs.size), // size
- merge(lhs.cost_per_coeff, rhs.cost_per_coeff)}; // cost_per_coeff
- }
-
- EIGEN_DEVICE_FUNC TensorBlockResourceRequirements& addCostPerCoeff(
- TensorOpCost cost) {
- cost_per_coeff += cost;
- return *this;
- }
-
- // This is a resource requirement that should be returned from expressions
- // that do not have any block evaluation preference (e.g. default tensor
- // expression with raw buffer access).
- EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() {
- return {TensorBlockShapeType::kUniformAllDims, 1, {0, 0, 0}};
- }
-
- private:
- using Requirements = TensorBlockResourceRequirements;
-
- EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) {
- return numext::maxi(lhs_size, rhs_size);
- }
-
- EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE TensorBlockShapeType
- merge(TensorBlockShapeType lhs, TensorBlockShapeType rhs) {
- return (lhs == TensorBlockShapeType::kSkewedInnerDims ||
- rhs == TensorBlockShapeType::kSkewedInnerDims)
- ? TensorBlockShapeType::kSkewedInnerDims
- : TensorBlockShapeType::kUniformAllDims;
- }
-
- EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost,
- TensorOpCost rhs_cost) {
- return lhs_cost + rhs_cost;
- }
-};
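-
-// Example (illustrative values only): an evaluator that prefers ~48 KB blocks
-// skewed towards the inner dimension, with an extra per-coefficient compute
-// cost, could describe its requirements as:
-//
-//   TensorBlockResourceRequirements req =
-//       TensorBlockResourceRequirements::skewed<float>(48 * 1024);
-//   req.addCostPerCoeff({/*bytes_loaded=*/0, /*bytes_stored=*/0,
-//                        /*compute_cycles=*/10});
-//
-// Merging `req` with `any()` keeps the skewed shape and the larger size.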
-
-// -------------------------------------------------------------------------- //
-// TensorBlockDescriptor specifies a block offset within a tensor and the block
-// sizes along each of the tensor dimensions.
-
-template <int NumDims, typename IndexType = Eigen::Index>
-class TensorBlockDescriptor {
- public:
- typedef DSizes<IndexType, NumDims> Dimensions;
-
-  // If we evaluate a Tensor assignment, and the expression on the left already
-  // has a memory buffer, then we might do a performance optimization, and
-  // evaluate the root expression directly into the final output memory.
-  // Sometimes it's possible to reuse it for materializing subexpressions
-  // inside an expression tree, to avoid dynamic memory allocation.
- //
-  // The pointer type of the underlying storage is erased, because passing the
-  // Scalar type through all the expression evaluation layers would require way
-  // too many templates. In practice the destination buffer type should always
-  // match the evaluated expression scalar type.
- class DestinationBuffer {
- public:
- enum DestinationBufferKind : int {
- // The above explicit specification of "int" as the enum basetype is
- // needed to get around a HIPCC link error ("the field type is not
- // amp-compatible")
- // which is issued for class members with the enum type.
- // TODO(rocm):
- // remove the "int" basetype once HIPCC has been fixed to not error out
- // in the above scenario.
-
- // Destination buffer is not defined (`m_data` == nullptr).
- kEmpty,
-
-    // Tensor block defined by an owning tensor block descriptor can fit
-    // contiguously into the destination buffer. In this case it's safe to
-    // materialize the tensor block in the destination buffer, wrap it in a
-    // TensorMap, and use it to build an Eigen expression on top of it.
- kContiguous,
-
-    // Destination buffer strides do not match strides of the contiguously
-    // stored block, and it's impossible to define a TensorMap over this
-    // buffer. However, if we are evaluating the root of an expression tree, we
-    // can still materialize an output into this destination, because we can
-    // guarantee that no one will ever access it through the block API.
- //
- // In theory it is possible to build valid TensorStriding<TensorMap>
- // expression on top of this destination buffer, however it has
- // inefficient coeff/packet access, and defeats the purpose of fast block
- // evaluation API.
- kStrided
- };
-
- template <typename Scalar>
- Scalar* data() const {
- eigen_assert(m_data_type_size == sizeof(Scalar));
- return static_cast<Scalar*>(m_data);
- }
-
- const Dimensions& strides() const { return m_strides; }
- const DestinationBufferKind& kind() const { return m_kind; }
-
- private:
- friend class TensorBlockDescriptor;
-
- DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {}
-
- template <typename Scalar>
- DestinationBuffer(Scalar* data, const Dimensions& strides,
- DestinationBufferKind kind)
- : m_data(static_cast<void*>(data)),
- m_data_type_size(sizeof(Scalar)),
- m_strides(strides),
- m_kind(kind) {}
-
- template <int Layout, typename Scalar>
- static DestinationBuffer make(const TensorBlockDescriptor& desc,
- Scalar* data, const Dimensions& strides) {
- return DestinationBuffer(data, strides, kind<Layout>(desc, strides));
- }
-
- template <int Layout>
- static DestinationBufferKind kind(const TensorBlockDescriptor& desc,
- const Dimensions& strides) {
- const Dimensions& desc_dims = desc.dimensions();
- const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);
- for (int i = 0; i < NumDims; ++i) {
- if (desc_dims[i] == 1) continue;
- if (desc_strides[i] != strides[i]) return kStrided;
- }
- return kContiguous;
- }
-
- // Storage pointer is type erased, to reduce template bloat, but we still
- // keep the size of the underlying element type for error checking.
- void* m_data;
- size_t m_data_type_size;
-
- // Destination buffer dimensions always match the dimensions of a tensor
- // block descriptor it belongs to, however strides might be different.
- Dimensions m_strides;
-
- DestinationBufferKind m_kind;
- };
-
- TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions,
- const DestinationBuffer& destination)
- : m_offset(offset),
- m_dimensions(dimensions),
- m_destination(destination) {}
-
- TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions)
- : m_offset(offset),
- m_dimensions(dimensions),
- m_destination(DestinationBuffer()) {}
-
- IndexType offset() const { return m_offset; }
- const Dimensions& dimensions() const { return m_dimensions; }
- IndexType dimension(int index) const { return m_dimensions[index]; }
- IndexType size() const { return array_prod<IndexType>(m_dimensions); }
-
- const DestinationBuffer& destination() const { return m_destination; }
-
- template <int Layout, typename Scalar>
- void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) {
- eigen_assert(dst_base != NULL);
- m_destination =
- DestinationBuffer::template make<Layout>(*this, dst_base, dst_strides);
- }
-
- template <int Layout, typename Scalar, typename DstStridesIndexType>
- void AddDestinationBuffer(
- Scalar* dst_base,
- const DSizes<DstStridesIndexType, NumDims>& dst_strides) {
- // DSizes constructor will do index type promotion if it's safe.
- AddDestinationBuffer<Layout>(dst_base, Dimensions(dst_strides));
- }
-
- TensorBlockDescriptor& DropDestinationBuffer() {
- m_destination.m_data = NULL;
- m_destination.m_kind = DestinationBuffer::kEmpty;
- return *this;
- }
-
- bool HasDestinationBuffer() const {
- return m_destination.kind() != DestinationBuffer::kEmpty;
- }
-
- // Returns a copy of `*this` with updated offset.
- TensorBlockDescriptor WithOffset(IndexType offset) const {
- return TensorBlockDescriptor(offset, m_dimensions, m_destination);
- }
-
- private:
- // Offset and dimensions are immutable after construction. Block descriptor
- // can only be mutated by adding or dropping destination.
- const IndexType m_offset;
- const Dimensions m_dimensions;
- DestinationBuffer m_destination;
-};
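-
-// Example (a sketch; `out` is an assumed destination buffer): a 10x10 block at
-// linear offset 100, with a ColMajor destination whose strides do not match
-// the contiguous block strides:
-//
-//   TensorBlockDescriptor<2> desc(/*offset=*/100,
-//                                 DSizes<Eigen::Index, 2>(10, 10));
-//   desc.AddDestinationBuffer<ColMajor>(out, DSizes<Eigen::Index, 2>(1, 100));
-//   // desc.destination().kind() == kStrided, because the destination stride
-//   // in the second dimension (100) differs from the contiguous block
-//   // stride (10).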
-
-// -------------------------------------------------------------------------- //
-// TensorBlockMapper is responsible for iterating over the blocks of a tensor.
-
-template <int NumDims, int Layout, typename IndexType = Eigen::Index>
-class TensorBlockMapper {
- typedef TensorBlockDescriptor<NumDims, IndexType> BlockDescriptor;
-
- public:
- typedef DSizes<IndexType, NumDims> Dimensions;
-
- TensorBlockMapper() = default;
- TensorBlockMapper(const DSizes<IndexType, NumDims>& dimensions,
- const TensorBlockResourceRequirements& requirements)
- : m_tensor_dimensions(dimensions), m_requirements(requirements) {
- // Compute block dimensions and the total number of blocks.
- InitializeBlockDimensions();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const {
- return m_total_block_count;
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const {
- return m_block_dimensions.TotalSize();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes<IndexType, NumDims>&
- blockDimensions() const {
- return m_block_dimensions;
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor
- blockDescriptor(IndexType block_index) const {
- static const bool isColMajor = Layout == static_cast<int>(ColMajor);
-
- IndexType offset = 0;
- DSizes<IndexType, NumDims> dimensions;
-
- if (NumDims == 0) return BlockDescriptor(offset, dimensions);
-
- // Iterate outer -> inner dimensions.
- for (int i = NumDims - 1; i >= 0; --i) {
- const int dim = isColMajor ? i : NumDims - i - 1;
-
- const IndexType idx = block_index / m_block_strides[dim];
- block_index -= idx * m_block_strides[dim];
-
- const IndexType coord = idx * m_block_dimensions[dim];
- dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord,
- m_block_dimensions[dim]);
- offset += coord * m_tensor_strides[dim];
- }
-
- return {offset, dimensions};
- }
-
- private:
- void InitializeBlockDimensions() {
- // Requested block shape and size.
- const TensorBlockShapeType shape_type = m_requirements.shape_type;
- IndexType target_block_size =
- numext::maxi<IndexType>(1, static_cast<IndexType>(m_requirements.size));
-
- IndexType tensor_size = m_tensor_dimensions.TotalSize();
-
-    // Corner case: one of the dimensions is zero. The logic below is too
-    // complex to handle this case on a general basis, so we just use a unit
-    // block size.
-    // Note: we must not yield blocks with zero dimensions (a recipe for
-    // overflows/underflows, divisions by zero and NaNs later).
- if (tensor_size == 0) {
- for (int i = 0; i < NumDims; ++i) {
- m_block_dimensions[i] = 1;
- }
- m_total_block_count = 0;
- return;
- }
-
- // If tensor fits into a target block size, evaluate it as a single block.
- if (tensor_size <= target_block_size) {
- m_block_dimensions = m_tensor_dimensions;
- m_total_block_count = 1;
- // The only valid block index is `0`, and in this case we do not need
- // to compute real strides for tensor or blocks (see blockDescriptor).
- for (int i = 0; i < NumDims; ++i) {
- m_tensor_strides[i] = 0;
- m_block_strides[i] = 1;
- }
- return;
- }
-
- static const bool isColMajor = Layout == static_cast<int>(ColMajor);
-
- // Block shape skewed towards inner dimension.
- if (shape_type == TensorBlockShapeType::kSkewedInnerDims) {
- IndexType coeff_to_allocate = target_block_size;
-
- for (int i = 0; i < NumDims; ++i) {
- const int dim = isColMajor ? i : NumDims - i - 1;
- m_block_dimensions[dim] =
- numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]);
- coeff_to_allocate = divup(
- coeff_to_allocate,
- numext::maxi(static_cast<IndexType>(1), m_block_dimensions[dim]));
- }
- eigen_assert(coeff_to_allocate == 1);
-
- } else if (shape_type == TensorBlockShapeType::kUniformAllDims) {
-      // The tensor will not fit within the 'target_block_size' budget:
-      // calculate tensor block dimension sizes based on a "square" dimension
-      // size target.
- const IndexType dim_size_target = convert_index<IndexType>(
- std::pow(static_cast<float>(target_block_size),
- 1.0f / static_cast<float>(m_block_dimensions.rank())));
-
- for (int i = 0; i < NumDims; ++i) {
- // TODO(andydavis) Adjust the inner most 'block_dim_size' to make it
- // a multiple of the packet size. Note that reducing
- // 'block_dim_size' in this manner can increase the number of
- // blocks, and so will amplify any per-block overhead.
- m_block_dimensions[i] =
- numext::mini(dim_size_target, m_tensor_dimensions[i]);
- }
-
- // Add any un-allocated coefficients to inner dimension(s).
- IndexType total_size = m_block_dimensions.TotalSize();
- for (int i = 0; i < NumDims; ++i) {
- const int dim = isColMajor ? i : NumDims - i - 1;
-
- if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) {
- const IndexType total_size_other_dims =
- total_size / m_block_dimensions[dim];
- const IndexType alloc_avail =
- divup<IndexType>(target_block_size, total_size_other_dims);
- if (alloc_avail == m_block_dimensions[dim]) {
- // Insufficient excess coefficients to allocate.
- break;
- }
- m_block_dimensions[dim] =
- numext::mini(m_tensor_dimensions[dim], alloc_avail);
- total_size = total_size_other_dims * m_block_dimensions[dim];
- }
- }
-
- } else {
- eigen_assert(false); // unknown block shape
- }
-
- eigen_assert(m_block_dimensions.TotalSize() >=
- numext::mini<IndexType>(target_block_size,
- m_tensor_dimensions.TotalSize()));
-
- // Calculate block counts by dimension and total block count.
- DSizes<IndexType, NumDims> block_count;
- for (int i = 0; i < NumDims; ++i) {
- block_count[i] = divup(m_tensor_dimensions[i], m_block_dimensions[i]);
- }
- m_total_block_count = array_prod(block_count);
-
- // Calculate block strides (used for enumerating blocks).
- m_tensor_strides = strides<Layout>(m_tensor_dimensions);
- m_block_strides = strides<Layout>(block_count);
- }
-
- DSizes<IndexType, NumDims> m_tensor_dimensions;
- TensorBlockResourceRequirements m_requirements;
-
- DSizes<IndexType, NumDims> m_block_dimensions;
- IndexType m_total_block_count;
-
- DSizes<IndexType, NumDims> m_tensor_strides;
- DSizes<IndexType, NumDims> m_block_strides;
-};
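-
-// Example (illustrative numbers, matching the TensorBlockShapeType comment
-// above): a 100x100 tensor with a uniform target of 100 coefficients is split
-// into 100 blocks of size 10x10:
-//
-//   TensorBlockMapper<2, ColMajor> mapper(
-//       DSizes<Eigen::Index, 2>(100, 100),
-//       TensorBlockResourceRequirements::uniform<float>(100 * sizeof(float)));
-//   // mapper.blockCount() == 100; mapper.blockDimensions() == [10, 10]
-//   TensorBlockDescriptor<2> desc = mapper.blockDescriptor(/*block_index=*/0);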
-
-// -------------------------------------------------------------------------- //
-// TensorBlockScratchAllocator is responsible for allocating temporary buffers
-// for block evaluation (output or input block materialization). Given that
-// Eigen expression traversal order is deterministic, all temporary allocations
-// happen in the same order, and usually have exactly the same size. The
-// scratch allocator keeps track of all dynamic allocations, and after the
-// first block evaluation is completed, we should be able to reuse all the
-// temporary buffers for the next block evaluation.
-
-template <typename Device>
-class TensorBlockScratchAllocator {
- public:
- explicit TensorBlockScratchAllocator(const Device& device)
- : m_device(device), m_allocation_index(0) {}
-
- ~TensorBlockScratchAllocator() {
- for (size_t i = 0; i < m_allocations.size(); ++i) {
- m_device.deallocate(m_allocations[i].ptr);
- }
- }
-
- void* allocate(size_t size) {
- // TODO(ezhulenev): Remove when replaced with inlined vector.
- if (m_allocations.capacity() == 0) m_allocations.reserve(8);
-
-    // Check if we already have an existing allocation at the current index.
- const int num_allocations = static_cast<int>(m_allocations.size());
- const bool has_allocation = m_allocation_index < num_allocations;
-
- // Allocation index can't be larger than the number of allocations.
- eigen_assert(m_allocation_index <= num_allocations);
-
-    // If we have an existing allocation, and its size is larger than or equal
-    // to the requested size, we do nothing.
-
-    // If the current allocation can't fit the requested size, we deallocate it
-    // and replace it with a larger allocation.
- if (has_allocation && m_allocations[m_allocation_index].size < size) {
- m_device.deallocate(m_allocations[m_allocation_index].ptr);
- m_allocations[m_allocation_index].ptr = m_device.allocate(size);
- m_allocations[m_allocation_index].size = size;
- }
-
-    // Make a new allocation if we don't have an existing one.
- if (!has_allocation) {
- Allocation allocation;
- allocation.ptr = m_device.allocate(size);
- allocation.size = size;
- m_allocations.push_back(allocation);
- }
-
- eigen_assert(m_allocations[m_allocation_index].ptr != NULL);
- eigen_assert(m_allocations[m_allocation_index].size >= size);
-
- return m_allocations[m_allocation_index++].ptr;
- }
-
- void reset() { m_allocation_index = 0; }
-
- private:
- struct Allocation {
- void* ptr;
- size_t size;
- };
-
- const Device& m_device;
- int m_allocation_index;
- // TODO(ezhulenev): This should be an inlined vector.
- std::vector<Allocation> m_allocations;
-};
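-
-// Usage sketch (the names `device`, `mapper` and `block_bytes` are assumed):
-//
-//   TensorBlockScratchAllocator<DefaultDevice> scratch(device);
-//   for (Eigen::Index i = 0; i < mapper.blockCount(); ++i) {
-//     void* buf = scratch.allocate(block_bytes);
-//     // ... evaluate block `i` into `buf` ...
-//     scratch.reset();  // rewind so the next block reuses the same buffers
-//   }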
-
-// -------------------------------------------------------------------------- //
-// TensorBlockKind represents all possible block kinds that can be produced by
-// the TensorEvaluator::evalBlock function.
-enum TensorBlockKind {
- // Tensor block that is a lazy expression that must be assigned to a
- // destination using TensorBlockAssign.
- kExpr,
-
- // Tensor block that is a view into a memory buffer owned by an underlying
- // Tensor expression (e.g. it can be a view into a Tensor buffer).
- kView,
-
- // Tensor block that was materialized in a scratch memory buffer, allocated
- // with TensorBlockScratchAllocator. This block must be copied to a
- // destination, similar to a block of `kExpr` type.
- kMaterializedInScratch,
-
- // Tensor block that was materialized directly into the final output memory
- // buffer. For example if the left side of an assignment is a Tensor, we can
- // directly materialize the block in the destination memory.
- //
- // If strides in the output buffer do not match tensor block strides, the
- // Tensor expression will be invalid, and should not be used by
- // TensorBlockAssign or for constructing another block expression.
- kMaterializedInOutput
-};
-
-// -------------------------------------------------------------------------- //
-// TensorBlockNotImplemented should be used to define the TensorBlock typedef
-// in TensorEvaluators that do not support block evaluation.
-
-class TensorBlockNotImplemented {
- public:
- typedef void XprType;
-};
-
-// -------------------------------------------------------------------------- //
-// XprScalar extracts the Scalar type from an Eigen expression (if the
-// expression type is not void). It's required to be able to define a lazy
-// block expression for argument types that do not support block evaluation.
-
-template <typename XprType>
-struct XprScalar {
- typedef typename XprType::Scalar type;
-};
-template <>
-struct XprScalar<void> {
- typedef void type;
-};
-
-// -------------------------------------------------------------------------- //
-// TensorMaterializedBlock is a fully evaluated block of the original tensor,
-// and XprType is just a TensorMap over the data. This block type is typically
-// used to materialize blocks of tensor expressions that can't be efficiently
-// represented as lazy Tensor expressions with fast coeff/packet operations,
-// e.g. we materialize all broadcasts into evaluated blocks.
-
-// TensorMaterializedBlock does not own its memory buffer: it's either a memory
-// buffer that backs the original expression (e.g. the block is just a view
-// into a Tensor), or a memory buffer allocated with the scratch allocator, in
-// which case the scratch allocator will deallocate it at the end of
-// block-based expression execution.
-//
-// If the block was evaluated directly into the output buffer, and strides in
-// the output buffer do not match block strides, the TensorMap expression will
-// be invalid, and should never be used in block assignment or any other tensor
-// expression.
-
-template <typename Scalar, int NumDims, int Layout,
- typename IndexType = Eigen::Index>
-class TensorMaterializedBlock {
- public:
- typedef DSizes<IndexType, NumDims> Dimensions;
- typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType;
-
- TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data,
- const Dimensions& dimensions, bool valid_expr = true)
- : m_kind(kind),
- m_data(data),
- m_dimensions(dimensions),
- m_expr(m_data, m_dimensions),
- m_valid_expr(valid_expr) {
- eigen_assert(m_kind == internal::TensorBlockKind::kView ||
- m_kind == internal::TensorBlockKind::kMaterializedInScratch ||
- m_kind == internal::TensorBlockKind::kMaterializedInOutput);
- }
-
- TensorBlockKind kind() const { return m_kind; }
- // NOTE(ezhulenev): Returning XprType by value like in other block types
- // causes asan failures. The theory is that XprType::Nested doesn't work
- // properly for TensorMap.
- const XprType& expr() const {
- eigen_assert(m_valid_expr);
- return m_expr;
- }
- const Scalar* data() const { return m_data; }
- void cleanup() {}
-
- typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
-
- // TensorMaterializedBlock can be backed by different types of storage:
- //
- // (1) Contiguous block of memory allocated with scratch allocator.
- // (2) Contiguous block of memory reused from tensor block descriptor
- // destination buffer.
- // (3) Strided block of memory reused from tensor block descriptor
- // destination buffer.
- //
- class Storage {
- public:
- Scalar* data() const { return m_data; }
- const Dimensions& dimensions() const { return m_dimensions; }
- const Dimensions& strides() const { return m_strides; }
-
- TensorMaterializedBlock AsTensorMaterializedBlock() const {
- return TensorMaterializedBlock(
- m_materialized_in_output
- ? internal::TensorBlockKind::kMaterializedInOutput
- : internal::TensorBlockKind::kMaterializedInScratch,
- m_data, m_dimensions, !m_strided_storage);
- }
-
- private:
- friend class TensorMaterializedBlock;
-
- Storage(Scalar* data, const Dimensions& dimensions,
- const Dimensions& strides, bool materialized_in_output,
- bool strided_storage)
- : m_data(data),
- m_dimensions(dimensions),
- m_strides(strides),
- m_materialized_in_output(materialized_in_output),
- m_strided_storage(strided_storage) {}
-
- Scalar* m_data;
- Dimensions m_dimensions;
- Dimensions m_strides;
- bool m_materialized_in_output;
- bool m_strided_storage;
- };
-
-  // Creates storage for a materialized block either from the block descriptor's
-  // destination buffer, or allocates a new buffer with the scratch allocator.
- template <typename TensorBlockScratch>
- EIGEN_STRONG_INLINE static Storage prepareStorage(
- TensorBlockDesc& desc, TensorBlockScratch& scratch,
- bool allow_strided_storage = false) {
- // Try to reuse destination as an output block buffer.
- typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer;
-
- if (desc.destination().kind() == DestinationBuffer::kContiguous) {
- Scalar* buffer = desc.destination().template data<Scalar>();
- desc.DropDestinationBuffer();
- return Storage(buffer, desc.dimensions(),
- internal::strides<Layout>(desc.dimensions()),
- /*materialized_in_output=*/true,
- /*strided_storage=*/false);
-
- } else if (desc.destination().kind() == DestinationBuffer::kStrided &&
- allow_strided_storage) {
- Scalar* buffer = desc.destination().template data<Scalar>();
- desc.DropDestinationBuffer();
- return Storage(buffer, desc.dimensions(), desc.destination().strides(),
- /*materialized_in_output=*/true, /*strided_storage=*/true);
-
- } else {
- void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
- return Storage(static_cast<Scalar*>(mem), desc.dimensions(),
- internal::strides<Layout>(desc.dimensions()),
- /*materialized_in_output=*/false,
- /*strided_storage=*/false);
- }
- }
-
- // Creates a materialized block for the given descriptor from a memory buffer.
- template <typename DataDimensions, typename TensorBlockScratch>
- EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize(
- const Scalar* data, const DataDimensions& data_dims,
- TensorBlockDesc& desc, TensorBlockScratch& scratch) {
- eigen_assert(array_size<DataDimensions>::value == desc.dimensions().size());
-
-    // If the tensor block dimensions cover a contiguous block of the
-    // underlying memory, we can skip the block buffer memory allocation, and
-    // construct a block from the existing `data` memory buffer.
- //
- // Example: (RowMajor layout)
- // data_dims: [11, 12, 13, 14]
- // desc.dimensions(): [1, 1, 3, 14]
- //
- // In this case we can construct a TensorBlock starting at
- // `data + desc.offset()`, with a `desc.dimensions()` block sizes.
- static const bool is_col_major = Layout == ColMajor;
-
- // Find out how many inner dimensions have a matching size.
- int num_matching_inner_dims = 0;
- for (int i = 0; i < NumDims; ++i) {
- int dim = is_col_major ? i : NumDims - i - 1;
- if (data_dims[dim] != desc.dimensions()[dim]) break;
- ++num_matching_inner_dims;
- }
-
- // All the outer dimensions must be of size `1`, except a single dimension
- // before the matching inner dimension (`3` in the example above).
- bool can_use_direct_access = true;
- for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) {
- int dim = is_col_major ? i : NumDims - i - 1;
- if (desc.dimension(dim) != 1) {
- can_use_direct_access = false;
- break;
- }
- }
-
- if (can_use_direct_access) {
- const Scalar* block_start = data + desc.offset();
- return TensorMaterializedBlock(internal::TensorBlockKind::kView,
- block_start, desc.dimensions());
-
- } else {
- // Reuse destination buffer or allocate new buffer with scratch allocator.
- const Storage storage = prepareStorage(desc, scratch);
-
- typedef internal::TensorBlockIO<Scalar, IndexType, NumDims, Layout>
- TensorBlockIO;
- typedef typename TensorBlockIO::Dst TensorBlockIODst;
- typedef typename TensorBlockIO::Src TensorBlockIOSrc;
-
- TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)),
- data, desc.offset());
- TensorBlockIODst dst(storage.dimensions(), storage.strides(),
- storage.data());
-
- TensorBlockIO::Copy(dst, src);
- return storage.AsTensorMaterializedBlock();
- }
- }
-
- private:
- TensorBlockKind m_kind;
- const Scalar* m_data;
- Dimensions m_dimensions;
- XprType m_expr;
- bool m_valid_expr;
-};
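-
-// Usage sketch (with `desc` and `scratch` as in prepareStorage above): a
-// typical block evaluator materializes coefficients into the prepared storage
-// and then exposes it as a block:
-//
-//   typedef TensorMaterializedBlock<float, 2, ColMajor> TensorBlock;
-//   TensorBlock::Storage storage = TensorBlock::prepareStorage(desc, scratch);
-//   // ... write evaluated coefficients to storage.data() ...
-//   return storage.AsTensorMaterializedBlock();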
-
-// -------------------------------------------------------------------------- //
-// TensorCwiseUnaryBlock is a lazy tensor expression block that applies UnaryOp
-// functor to the blocks produced by the underlying Tensor expression.
-
-template <typename UnaryOp, typename ArgTensorBlock>
-class TensorCwiseUnaryBlock {
- static const bool NoArgBlockAccess =
- internal::is_void<typename ArgTensorBlock::XprType>::value;
-
- public:
- typedef typename conditional<
- NoArgBlockAccess, void,
- TensorCwiseUnaryOp<UnaryOp, const typename ArgTensorBlock::XprType> >::
- type XprType;
-
- typedef typename XprScalar<XprType>::type Scalar;
-
- TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor)
- : m_arg_block(arg_block), m_functor(functor) {}
-
- TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
-
- XprType expr() const { return XprType(m_arg_block.expr(), m_functor); }
- const Scalar* data() const { return NULL; }
- void cleanup() { m_arg_block.cleanup(); }
-
- private:
- ArgTensorBlock m_arg_block;
- UnaryOp m_functor;
-};
-
-// -------------------------------------------------------------------------- //
-// TensorCwiseBinaryBlock is a lazy tensor expression block that applies a
-// BinaryOp functor to the blocks produced by the two underlying Tensor
-// expressions.
-
-template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>
-class TensorCwiseBinaryBlock {
- static const bool NoArgBlockAccess =
- internal::is_void<typename LhsTensorBlock::XprType>::value ||
- internal::is_void<typename RhsTensorBlock::XprType>::value;
-
- public:
- typedef typename conditional<
- NoArgBlockAccess, void,
- TensorCwiseBinaryOp<BinaryOp, const typename LhsTensorBlock::XprType,
- const typename RhsTensorBlock::XprType> >::type
- XprType;
-
- typedef typename XprScalar<XprType>::type Scalar;
-
- TensorCwiseBinaryBlock(const LhsTensorBlock& left_block,
- const RhsTensorBlock& right_block,
- const BinaryOp& functor)
- : m_left_block(left_block),
- m_right_block(right_block),
- m_functor(functor) {}
-
- TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
-
- XprType expr() const {
- return XprType(m_left_block.expr(), m_right_block.expr(), m_functor);
- }
-
- const Scalar* data() const { return NULL; }
-
- void cleanup() {
- m_left_block.cleanup();
- m_right_block.cleanup();
- }
-
- private:
- LhsTensorBlock m_left_block;
- RhsTensorBlock m_right_block;
- BinaryOp m_functor;
-};
-
-// -------------------------------------------------------------------------- //
-// TensorUnaryExprBlock is a lazy tensor expression block that can construct
-// an arbitrary tensor expression from a block of the underlying type (this is a
-// generalization of the TensorCwiseUnaryBlock for arbitrary expressions).
-
-template <typename BlockFactory, typename ArgTensorBlock>
-class TensorUnaryExprBlock {
- typedef typename ArgTensorBlock::XprType ArgXprType;
- static const bool NoArgBlockAccess = internal::is_void<ArgXprType>::value;
-
- public:
- typedef typename conditional<
- NoArgBlockAccess, void,
- typename BlockFactory::template XprType<ArgXprType>::type>::type XprType;
-
- typedef typename XprScalar<XprType>::type Scalar;
-
- TensorUnaryExprBlock(const ArgTensorBlock& arg_block,
- const BlockFactory& factory)
- : m_arg_block(arg_block), m_factory(factory) {}
-
- TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
- XprType expr() const { return m_factory.expr(m_arg_block.expr()); }
- const Scalar* data() const { return NULL; }
- void cleanup() { m_arg_block.cleanup(); }
-
- private:
- ArgTensorBlock m_arg_block;
- BlockFactory m_factory;
-};
-
-// -------------------------------------------------------------------------- //
-// TensorTernaryExprBlock is a lazy tensor expression block that can construct
-// an arbitrary tensor expression from three blocks of the underlying type.
-
-template <typename BlockFactory, typename Arg1TensorBlock,
- typename Arg2TensorBlock, typename Arg3TensorBlock>
-class TensorTernaryExprBlock {
- typedef typename Arg1TensorBlock::XprType Arg1XprType;
- typedef typename Arg2TensorBlock::XprType Arg2XprType;
- typedef typename Arg3TensorBlock::XprType Arg3XprType;
-
- static const bool NoArgBlockAccess = internal::is_void<Arg1XprType>::value ||
- internal::is_void<Arg2XprType>::value ||
- internal::is_void<Arg3XprType>::value;
-
- public:
- typedef typename conditional<
- NoArgBlockAccess, void,
- typename BlockFactory::template XprType<Arg1XprType, Arg2XprType,
- Arg3XprType>::type>::type XprType;
-
- typedef typename XprScalar<XprType>::type Scalar;
-
- TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block,
- const Arg2TensorBlock& arg2_block,
- const Arg3TensorBlock& arg3_block,
- const BlockFactory& factory)
- : m_arg1_block(arg1_block),
- m_arg2_block(arg2_block),
- m_arg3_block(arg3_block),
- m_factory(factory) {}
-
- TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
- XprType expr() const {
- return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(),
- m_arg3_block.expr());
- }
- const Scalar* data() const { return NULL; }
- void cleanup() {
- m_arg1_block.cleanup();
- m_arg2_block.cleanup();
- m_arg3_block.cleanup();
- }
-
- private:
- Arg1TensorBlock m_arg1_block;
- Arg2TensorBlock m_arg2_block;
- Arg3TensorBlock m_arg3_block;
- BlockFactory m_factory;
-};
-
-// -------------------------------------------------------------------------- //
-// StridedLinearBufferCopy provides a method to copy data between two linear
-// buffers with different strides, with optimized paths for scatter/gather.
-
-template <typename Scalar, typename IndexType>
-class StridedLinearBufferCopy {
- typedef typename packet_traits<Scalar>::type Packet;
- enum {
- Vectorizable = packet_traits<Scalar>::Vectorizable,
- PacketSize = packet_traits<Scalar>::size
- };
-
- public:
-  // Specifying the linear copy kind statically gives a ~30% speedup for small sizes.
- enum class Kind {
- Linear = 0, // src_stride == 1 && dst_stride == 1
- Scatter = 1, // src_stride == 1 && dst_stride != 1
- FillLinear = 2, // src_stride == 0 && dst_stride == 1
- FillScatter = 3, // src_stride == 0 && dst_stride != 1
- Gather = 4, // dst_stride == 1
- Random = 5 // everything else
- };
-
- struct Dst {
- Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {}
-
- IndexType offset;
- IndexType stride;
- Scalar* data;
- };
-
- struct Src {
- Src(IndexType o, IndexType s, const Scalar* d)
- : offset(o), stride(s), data(d) {}
-
- IndexType offset;
- IndexType stride;
- const Scalar* data;
- };
-
- template <typename StridedLinearBufferCopy::Kind kind>
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst,
- const Src& src,
- const size_t count) {
- Run<kind>(count, dst.offset, dst.stride, dst.data, src.offset, src.stride,
- src.data);
- }
-
- private:
- template <typename StridedLinearBufferCopy::Kind kind>
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
- const IndexType count, const IndexType dst_offset,
- const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data,
- const IndexType src_offset, const IndexType src_stride,
- const Scalar* EIGEN_RESTRICT src_data) {
- const Scalar* src = &src_data[src_offset];
- Scalar* dst = &dst_data[dst_offset];
-
- if (!Vectorizable) {
- for (Index i = 0; i < count; ++i) {
- dst[i * dst_stride] = src[i * src_stride];
- }
- return;
- }
-
- const IndexType vectorized_size = count - PacketSize;
- IndexType i = 0;
-
- if (kind == StridedLinearBufferCopy::Kind::Linear) {
- // ******************************************************************** //
- // Linear copy from `src` to `dst`.
- const IndexType unrolled_size = count - 4 * PacketSize;
- eigen_assert(src_stride == 1 && dst_stride == 1);
- for (; i <= unrolled_size; i += 4 * PacketSize) {
- for (int j = 0; j < 4; ++j) {
- Packet p = ploadu<Packet>(src + i + j * PacketSize);
- pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
- }
- }
- for (; i <= vectorized_size; i += PacketSize) {
- Packet p = ploadu<Packet>(src + i);
- pstoreu<Scalar, Packet>(dst + i, p);
- }
- for (; i < count; ++i) {
- dst[i] = src[i];
- }
- // ******************************************************************** //
- } else if (kind == StridedLinearBufferCopy::Kind::Scatter) {
- // Scatter from `src` to `dst`.
- eigen_assert(src_stride == 1 && dst_stride != 1);
- for (; i <= vectorized_size; i += PacketSize) {
- Packet p = ploadu<Packet>(src + i);
- pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
- }
- for (; i < count; ++i) {
- dst[i * dst_stride] = src[i];
- }
- // ******************************************************************** //
- } else if (kind == StridedLinearBufferCopy::Kind::FillLinear) {
- // Fill `dst` with value at `*src`.
- eigen_assert(src_stride == 0 && dst_stride == 1);
- const IndexType unrolled_size = count - 4 * PacketSize;
- Packet p = pload1<Packet>(src);
- for (; i <= unrolled_size; i += 4 * PacketSize) {
- for (int j = 0; j < 4; ++j) {
- pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
- }
- }
- for (; i <= vectorized_size; i += PacketSize) {
- pstoreu<Scalar, Packet>(dst + i, p);
- }
- for (; i < count; ++i) {
- dst[i] = *src;
- }
- // ******************************************************************** //
- } else if (kind == StridedLinearBufferCopy::Kind::FillScatter) {
- // Scatter `*src` into `dst`.
- eigen_assert(src_stride == 0 && dst_stride != 1);
- Packet p = pload1<Packet>(src);
- for (; i <= vectorized_size; i += PacketSize) {
- pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
- }
- for (; i < count; ++i) {
- dst[i * dst_stride] = *src;
- }
- // ******************************************************************** //
- } else if (kind == StridedLinearBufferCopy::Kind::Gather) {
- // Gather from `src` into `dst`.
- eigen_assert(dst_stride == 1);
- for (; i <= vectorized_size; i += PacketSize) {
- Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
- pstoreu<Scalar, Packet>(dst + i, p);
- }
- for (; i < count; ++i) {
- dst[i] = src[i * src_stride];
- }
- // ******************************************************************** //
- } else if (kind == StridedLinearBufferCopy::Kind::Random) {
- // Random.
- for (; i < count; ++i) {
- dst[i * dst_stride] = src[i * src_stride];
- }
- } else {
- eigen_assert(false);
- }
- }
-};
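-
-// Example (a sketch; `from` and `to` are assumed float buffers): copying 256
-// elements from a contiguous source into a destination with stride 3 takes
-// the vectorized scatter path:
-//
-//   typedef StridedLinearBufferCopy<float, Eigen::Index> LinCopy;
-//   LinCopy::Run<LinCopy::Kind::Scatter>(
-//       LinCopy::Dst(/*offset=*/0, /*stride=*/3, to),
-//       LinCopy::Src(/*offset=*/0, /*stride=*/1, from),
-//       /*count=*/256);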
-
-// -------------------------------------------------------------------------- //
-// TensorBlockIO copies data from the `src` tensor block to the `dst` tensor
-// block. It's possible to specify a src->dst dimension mapping for the copy
-// operation. The dimensions of `dst` specify how many elements have to be
-// copied; for the `src` we only need to know the strides to navigate through
-// the source memory buffer.
-
-template <typename Scalar, typename IndexType, int NumDims, int Layout>
-class TensorBlockIO {
- static const bool IsColMajor = (Layout == ColMajor);
-
- typedef StridedLinearBufferCopy<Scalar, IndexType> LinCopy;
-
- public:
- typedef DSizes<IndexType, NumDims> Dimensions;
- typedef DSizes<int, NumDims> DimensionsMap;
-
- struct Dst {
- Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst,
- IndexType dst_offset = 0)
- : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {}
-
- Dimensions dims;
- Dimensions strides;
- Scalar* data;
- IndexType offset;
- };
-
- struct Src {
- Src(const Dimensions& src_strides, const Scalar* src,
- IndexType src_offset = 0)
- : strides(src_strides), data(src), offset(src_offset) {}
-
- Dimensions strides;
- const Scalar* data;
- IndexType offset;
- };
-
- // Copies data to `dst` from `src`, using provided dimensions mapping:
- //
- // src_dimension_index = dst_to_src_dim_map[dst_dimension_index]
- //
- // Returns the number of copied elements.
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy(
- const Dst& dst, const Src& src, const DimensionsMap& dst_to_src_dim_map) {
- // Copy single scalar value from `src` to `dst`.
- if (NumDims == 0) {
- *(dst.data + dst.offset) = *(src.data + src.offset);
- return 1;
- }
-
- // Both `dst` and `src` must have contiguous innermost dimension. We also
- // accept the special case with stride '0', because it's used as a trick to
- // implement broadcasting.
- {
- int inner_dim = IsColMajor ? 0 : NumDims - 1;
- EIGEN_UNUSED_VARIABLE(inner_dim);
- eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0);
- eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0);
- }
-
- // Give a shorter name to `dst_to_src_dim_map`.
- const DimensionsMap& dim_map = dst_to_src_dim_map;
-
- // Do not squeeze reordered inner dimensions.
- int num_squeezable_dims = NumSqueezableInnerDims(dim_map);
-
- // NOTE: We find the innermost dimension (contiguous in memory) in the dst
- // block, and we write data linearly into that dimension, reading it from
- // the src. If dimensions are reordered, we might end up reading data from
- // the src with `stride != 1`.
- //
- // NOTE: Random-Read/Linear-Write can be up to ~2X faster than
- // Linear-Read/Random-Write: https://stackoverflow.com/a/54935680
-
- // Find the innermost dimension in the dst whose size is not 1. This is the
- // effective inner dim.
- int num_size_one_inner_dims = 0;
- for (int i = 0; i < num_squeezable_dims; ++i) {
- const int dst_dim = IsColMajor ? i : NumDims - i - 1;
- if (dst.dims[dst_dim] != 1) break;
- num_size_one_inner_dims++;
- }
-
- // If all dimensions are of size 1, just copy a scalar from `src` to `dst`.
- if (num_size_one_inner_dims == NumDims) {
- *(dst.data + dst.offset) = *(src.data + src.offset);
- return 1;
- }
-
- // Outermost dimension in the dst with `stride == 1` (contiguous in memory).
- const int dst_stride1_dim = IsColMajor
- ? num_size_one_inner_dims
- : NumDims - num_size_one_inner_dims - 1;
-
- // Dimension in the src that corresponds to the dst innermost dimension.
- const int src_dim_for_dst_stride1_dim =
- NumDims == 0 ? 1 : dim_map[dst_stride1_dim];
-
- // Size of the innermost dimension (length of contiguous blocks of memory).
- IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim];
-
-    // Squeeze multiple inner dims into one if they are contiguous in `dst` and
-    // `src` memory, so we can make fewer linear copy calls.
- for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) {
- const int dst_dim = IsColMajor ? i : NumDims - i - 1;
- const IndexType dst_stride = dst.strides[dst_dim];
- const IndexType src_stride = src.strides[dim_map[dst_dim]];
- if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) {
- dst_inner_dim_size *= dst.dims[dst_dim];
- ++num_size_one_inner_dims;
- } else {
- break;
- }
- }
-
- // Setup strides to read data from `src` and write to `dst`.
- IndexType input_offset = src.offset;
- IndexType output_offset = dst.offset;
- IndexType input_stride =
- NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim];
- IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim];
-
- const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
- array<BlockIteratorState, at_least_1_dim> it;
-
- // Initialize block iterator state. Squeeze away any dimension of size 1.
- int idx = 0; // currently initialized iterator state index
- for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
- const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2;
- if (dst.dims[dst_dim] == 1) continue;
-
- it[idx].size = dst.dims[dst_dim];
- it[idx].input_stride = src.strides[dim_map[dst_dim]];
- it[idx].output_stride = dst.strides[dst_dim];
-
- it[idx].input_span = it[idx].input_stride * (it[idx].size - 1);
- it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
-
- idx++;
- }
-
- // Iterate copying data from src to dst.
- const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize();
-
-#define COPY_INNER_DIM(KIND) \
- IndexType num_copied = 0; \
- for (num_copied = 0; num_copied < block_total_size; \
- num_copied += dst_inner_dim_size) { \
- LinCopy::template Run<KIND>( \
- typename LinCopy::Dst(output_offset, output_stride, dst.data), \
- typename LinCopy::Src(input_offset, input_stride, src.data), \
- dst_inner_dim_size); \
- \
- for (int j = 0; j < idx; ++j) { \
- if (++it[j].count < it[j].size) { \
- input_offset += it[j].input_stride; \
- output_offset += it[j].output_stride; \
- break; \
- } \
- it[j].count = 0; \
- input_offset -= it[j].input_span; \
- output_offset -= it[j].output_span; \
- } \
- } \
- return num_copied;
-
- if (input_stride == 1 && output_stride == 1) {
- COPY_INNER_DIM(LinCopy::Kind::Linear);
- } else if (input_stride == 1 && output_stride != 1) {
- COPY_INNER_DIM(LinCopy::Kind::Scatter);
- } else if (input_stride == 0 && output_stride == 1) {
- COPY_INNER_DIM(LinCopy::Kind::FillLinear);
- } else if (input_stride == 0 && output_stride != 1) {
- COPY_INNER_DIM(LinCopy::Kind::FillScatter);
- } else if (output_stride == 1) {
- COPY_INNER_DIM(LinCopy::Kind::Gather);
- } else {
- COPY_INNER_DIM(LinCopy::Kind::Random);
- }
-
-#undef COPY_INNER_DIM
- }
-
- // Copy from `src` to `dst` with an identity src->dst dimension map. Returns
- // the number of copied elements.
- static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst,
- const Src& src) {
- DimensionsMap dst_to_src_map;
- for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i;
- return Copy(dst, src, dst_to_src_map);
- }
-
- private:
- struct BlockIteratorState {
- BlockIteratorState()
- : size(0),
- count(0),
- input_stride(0),
- output_stride(0),
- input_span(0),
- output_span(0) {}
-
- IndexType size;
- IndexType count;
- IndexType input_stride;
- IndexType output_stride;
- IndexType input_span;
- IndexType output_span;
- };
-
-  // Compute how many inner dimensions may be squeezed when doing IO between
-  // two tensor blocks. It's safe to squeeze inner dimensions only if they are
-  // not reordered.
- static int NumSqueezableInnerDims(const DimensionsMap& dim_map) {
- int num_squeezable_dims = 0;
- for (int i = 0; i < NumDims; ++i) {
- const int dim = IsColMajor ? i : NumDims - i - 1;
- if (dim_map[dim] != dim) break;
- num_squeezable_dims++;
- }
- return num_squeezable_dims;
- }
-};
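-
-// Example (a sketch; `src_data` and `dst_data` are assumed buffers): copying a
-// 2x3 ColMajor destination block that is transposed relative to its 3x2
-// source, i.e. dst dimension `i` reads src dimension `dim_map[i]`:
-//
-//   typedef TensorBlockIO<float, Eigen::Index, 2, ColMajor> IO;
-//   IO::DimensionsMap dim_map;
-//   dim_map[0] = 1;
-//   dim_map[1] = 0;
-//   IO::Dst dst(IO::Dimensions(2, 3), IO::Dimensions(1, 2), dst_data);
-//   IO::Src src(IO::Dimensions(1, 3), src_data);
-//   IO::Copy(dst, src, dim_map);  // returns 6, the number of copied elements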
-
-// -------------------------------------------------------------------------- //
-// TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to
-// a Tensor block defined by `desc`, backed by a memory buffer at `target`.
-//
-// Currently there is no way to write from a Tensor expression to a block of
-// memory if dimensions are reordered. If you need to do that, you should
-// materialize a Tensor block expression into a memory buffer, and then use
-// TensorBlockIO to copy data between two memory buffers with a custom
-// `target->src` dimension map (see definition above).
-//
-// Also currently the innermost dimension of `target` must have a stride '1'
-// (contiguous in memory). This restriction could be lifted with a `pscatter`,
-// but in practice it's never needed, and there is a similar TensorBlockIO
-// workaround for that.
-//
-// TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO
-// where `src` is a tensor expression. Explore if it is possible to rewrite IO
-// to use expressions instead of pointers, and after that TensorBlockAssignment
-// will become an alias to IO.
-template <typename Scalar, int NumDims, typename TensorBlockExpr,
- typename IndexType = Eigen::Index>
-class TensorBlockAssignment {
- // We will use coeff/packet path to evaluate block expressions.
- typedef TensorEvaluator<const TensorBlockExpr, DefaultDevice>
- TensorBlockEvaluator;
-
- typedef DSizes<IndexType, NumDims> Dimensions;
-
- enum {
- Vectorizable = packet_traits<Scalar>::Vectorizable,
- PacketSize = packet_traits<Scalar>::size
- };
-
- template <bool Vectorizable, typename Evaluator>
- struct InnerDimAssign {
- EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
- const Evaluator& eval,
- IndexType eval_offset) {
- for (IndexType i = 0; i < count; ++i) {
- target[i] = eval.coeff(eval_offset + i);
- }
- }
- };
-
- template <typename Evaluator>
- struct InnerDimAssign<true, Evaluator> {
- EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
- const Evaluator& eval,
- IndexType eval_offset) {
- typedef typename packet_traits<Scalar>::type Packet;
-
- const IndexType unrolled_size = count - 4 * PacketSize;
- const IndexType vectorized_size = count - PacketSize;
- IndexType i = 0;
-
- for (; i <= unrolled_size; i += 4 * PacketSize) {
- for (int j = 0; j < 4; ++j) {
- const IndexType idx = eval_offset + i + j * PacketSize;
- Packet p = eval.template packet<Unaligned>(idx);
- pstoreu<Scalar>(target + i + j * PacketSize, p);
- }
- }
-
- for (; i <= vectorized_size; i += PacketSize) {
- Packet p = eval.template packet<Unaligned>(eval_offset + i);
- pstoreu<Scalar>(target + i, p);
- }
-
- for (; i < count; ++i) {
- target[i] = eval.coeff(eval_offset + i);
- }
- }
- };
-
- public:
- struct Target {
- Target(const Dimensions& target_dims, const Dimensions& target_strides,
- Scalar* target_data, IndexType target_offset = 0)
- : dims(target_dims),
- strides(target_strides),
- data(target_data),
- offset(target_offset) {}
-
- Dimensions dims;
- Dimensions strides;
- Scalar* data;
- IndexType offset;
- };
-
- static Target target(const Dimensions& target_dims,
- const Dimensions& target_strides, Scalar* target_data,
- IndexType target_offset = 0) {
- return Target(target_dims, target_strides, target_data, target_offset);
- }
-
- template <typename TargetDimsIndexType, typename TargetStridesIndexType>
- static Target target(
- const DSizes<TargetDimsIndexType, NumDims>& target_dims,
- const DSizes<TargetStridesIndexType, NumDims>& target_strides,
- Scalar* target_data, IndexType target_offset = 0) {
- // DSizes constructor will do index type promotion if it's safe.
- return Target(Dimensions(target_dims), Dimensions(target_strides),
- target_data, target_offset);
- }
-
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
- const Target& target, const TensorBlockExpr& expr) {
- // Prepare evaluator for block expression.
- DefaultDevice default_device;
- TensorBlockEvaluator eval(expr, default_device);
-
-    // Tensor block expression dimensions must match the destination dimensions.
- eigen_assert(dimensions_match(target.dims, eval.dimensions()));
-
- static const int Layout = TensorBlockEvaluator::Layout;
- static const bool is_col_major = Layout == ColMajor;
-
-    // Initialize the output inner dimension size based on the layout.
- const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize();
- const int inner_dim_idx = is_col_major ? 0 : NumDims - 1;
- IndexType output_inner_dim_size = target.dims[inner_dim_idx];
-
- // Target inner dimension stride must be '1'.
- eigen_assert(target.strides[inner_dim_idx] == 1);
-
- // Squeeze multiple inner dims into one if they are contiguous in `target`.
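-    // E.g. (col-major, illustrative): dims [4, 2, 3] with strides [1, 4, 16]:
-    // dims 0 and 1 are squeezed into a single inner run of size 8 (stride 4
-    // matches the running inner size 4), while dim 2 breaks the run
-    // (stride 16 != 8).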
- IndexType num_squeezed_dims = 0;
- for (Index i = 1; i < NumDims; ++i) {
- const Index dim = is_col_major ? i : NumDims - i - 1;
- const IndexType target_stride = target.strides[dim];
-
- if (output_inner_dim_size == target_stride) {
- output_inner_dim_size *= target.dims[dim];
- num_squeezed_dims++;
- } else {
- break;
- }
- }
-
-    // Initialize the output block iterator state. Dimensions in this array are
-    // always in innermost -> outermost order (col-major layout).
- array<BlockIteratorState, NumDims> it;
-
- int idx = 0; // currently initialized iterator state index
- for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) {
- const Index dim = is_col_major ? i + 1 : NumDims - i - 2;
-
- it[idx].count = 0;
- it[idx].size = target.dims[dim];
- it[idx].output_stride = target.strides[dim];
- it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
- idx++;
- }
-
-    // We read the block expression from the beginning, and start writing data
-    // to `target` at the given offset.
- IndexType input_offset = 0;
- IndexType output_offset = target.offset;
-
- // Iterate copying data from `eval` to `target`.
- for (IndexType i = 0; i < output_size; i += output_inner_dim_size) {
- // Assign to `target` at current offset.
- InnerDimAssign<Vectorizable && TensorBlockEvaluator::PacketAccess,
- TensorBlockEvaluator>::Run(target.data + output_offset,
- output_inner_dim_size, eval,
- input_offset);
-
- // Move input offset forward by the number of assigned coefficients.
- input_offset += output_inner_dim_size;
-
- // Update index.
- for (int j = 0; j < idx; ++j) {
- if (++it[j].count < it[j].size) {
- output_offset += it[j].output_stride;
- break;
- }
- it[j].count = 0;
- output_offset -= it[j].output_span;
- }
- }
- }
-
- private:
- struct BlockIteratorState {
- BlockIteratorState()
- : count(0), size(0), output_stride(0), output_span(0) {}
-
- IndexType count;
- IndexType size;
- IndexType output_stride;
- IndexType output_span;
- };
-};
-
-// -------------------------------------------------------------------------- //
-
-} // namespace internal
-} // namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorBroadcasting.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorBroadcasting.h
deleted file mode 100644
index a354132..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorBroadcasting.h
+++ /dev/null
@@ -1,1093 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H
-#define EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H
-
-namespace Eigen {
-
-/** \class TensorBroadcasting
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Tensor broadcasting class: replicates an input tensor along each
- * dimension by the given broadcast factors.
- */
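-// A minimal usage sketch (illustrative): each input dimension is repeated by
-// the matching broadcast factor, so the output dimensions are the
-// element-wise products of the input dimensions and the broadcast factors.
-//
-//   Eigen::Tensor<float, 2> input(2, 3);
-//   input.setConstant(1.0f);
-//   Eigen::array<Eigen::Index, 2> bcast = {{2, 2}};
-//   Eigen::Tensor<float, 2> output = input.broadcast(bcast);  // shape [4, 6]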
-namespace internal {
-template<typename Broadcast, typename XprType>
-struct traits<TensorBroadcastingOp<Broadcast, XprType> > : public traits<XprType>
-{
- typedef typename XprType::Scalar Scalar;
- typedef traits<XprType> XprTraits;
- typedef typename XprTraits::StorageKind StorageKind;
- typedef typename XprTraits::Index Index;
- typedef typename XprType::Nested Nested;
- typedef typename remove_reference<Nested>::type _Nested;
- static const int NumDimensions = XprTraits::NumDimensions;
- static const int Layout = XprTraits::Layout;
- typedef typename XprTraits::PointerType PointerType;
-};
-
-template<typename Broadcast, typename XprType>
-struct eval<TensorBroadcastingOp<Broadcast, XprType>, Eigen::Dense>
-{
- typedef const TensorBroadcastingOp<Broadcast, XprType> EIGEN_DEVICE_REF type;
-};
-
-template<typename Broadcast, typename XprType>
-struct nested<TensorBroadcastingOp<Broadcast, XprType>, 1, typename eval<TensorBroadcastingOp<Broadcast, XprType> >::type>
-{
- typedef TensorBroadcastingOp<Broadcast, XprType> type;
-};
-
-template <typename Dims>
-struct is_input_scalar {
- static const bool value = false;
-};
-template <>
-struct is_input_scalar<Sizes<> > {
- static const bool value = true;
-};
-#ifndef EIGEN_EMULATE_CXX11_META_H
-template <typename std::ptrdiff_t... Indices>
-struct is_input_scalar<Sizes<Indices...> > {
- static const bool value = (Sizes<Indices...>::total_size == 1);
-};
-#endif
-
-} // end namespace internal
-
-
-
-template<typename Broadcast, typename XprType>
-class TensorBroadcastingOp : public TensorBase<TensorBroadcastingOp<Broadcast, XprType>, ReadOnlyAccessors>
-{
- public:
- typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Scalar Scalar;
- typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename Eigen::internal::nested<TensorBroadcastingOp>::type Nested;
- typedef typename Eigen::internal::traits<TensorBroadcastingOp>::StorageKind StorageKind;
- typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Index Index;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBroadcastingOp(const XprType& expr, const Broadcast& broadcast)
- : m_xpr(expr), m_broadcast(broadcast) {}
-
- EIGEN_DEVICE_FUNC
- const Broadcast& broadcast() const { return m_broadcast; }
-
- EIGEN_DEVICE_FUNC
- const typename internal::remove_all<typename XprType::Nested>::type&
- expression() const { return m_xpr; }
-
- protected:
- typename XprType::Nested m_xpr;
- const Broadcast m_broadcast;
-};
-
-
-// Eval as rvalue
-template<typename Broadcast, typename ArgType, typename Device>
-struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
-{
- typedef TensorBroadcastingOp<Broadcast, ArgType> XprType;
- typedef typename XprType::Index Index;
- static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
- typedef DSizes<Index, NumDims> Dimensions;
- typedef typename XprType::Scalar Scalar;
- typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
-  protected: // all the non-static fields must have the same access control, otherwise the TensorEvaluator won't be standard layout;
- bool isCopy, nByOne, oneByN;
- public:
- typedef StorageMemory<CoeffReturnType, Device> Storage;
- typedef typename Storage::Type EvaluatorPointerType;
-
- enum {
- IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
- PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
- PreferBlockAccess = true,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- RawAccess = false
- };
-
- typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
-
-  // We do block-based broadcasting using a trick: we double the tensor rank
-  // and use 0 strides for the broadcast dimensions. See the block method
-  // implementation for details.
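-  // E.g. (col-major, illustrative): input dims [2, 3] broadcast by [2, 1]
-  // become 4 virtual dims [2, 2, 3, 1] with input strides [1, 0, 2, 0]; the
-  // 0-stride dims re-read the same input coefficients to produce the copies.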
- typedef DSizes<Index, 2 * NumDims> BroadcastDimensions;
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
- typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
-
- typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
- ArgTensorBlock;
-
- typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
- Layout, Index>
- TensorBlock;
- //===--------------------------------------------------------------------===//
-
- EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : isCopy(false), nByOne(false), oneByN(false),
- m_device(device), m_broadcast(op.broadcast()), m_impl(op.expression(), device)
- {
-
-    // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar
-    // and store the result in a scalar. Instead one should first reshape the scalar into an N-D
-    // tensor (N >= 1) with a single element, and then broadcast.
- EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
- const InputDimensions& input_dims = m_impl.dimensions();
- isCopy = true;
- for (int i = 0; i < NumDims; ++i) {
- eigen_assert(input_dims[i] > 0);
- m_dimensions[i] = input_dims[i] * m_broadcast[i];
- if (m_broadcast[i] != 1) {
- isCopy = false;
- }
- }
-
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- m_inputStrides[0] = 1;
- m_outputStrides[0] = 1;
- for (int i = 1; i < NumDims; ++i) {
- m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
- m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
- }
- } else {
- m_inputStrides[NumDims-1] = 1;
- m_outputStrides[NumDims-1] = 1;
- for (int i = NumDims-2; i >= 0; --i) {
- m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
- m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
- }
- }
-
- if (input_dims[0] == 1) {
- oneByN = true;
- for (int i = 1; i < NumDims; ++i) {
- if (m_broadcast[i] != 1) {
- oneByN = false;
- break;
- }
- }
- } else if (input_dims[NumDims-1] == 1) {
- nByOne = true;
- for (int i = 0; i < NumDims-1; ++i) {
- if (m_broadcast[i] != 1) {
- nByOne = false;
- break;
- }
- }
- }
-
-    // Handle special formats like NCHW, where the input shape is '[1, N..., 1]'
-    // and the broadcast shape is '[N, 1..., N]'.
- if (!oneByN && !nByOne) {
- if (input_dims[0] == 1 && input_dims[NumDims-1] == 1 && NumDims > 2) {
- nByOne = true;
- oneByN = true;
- for (int i = 1; i < NumDims-1; ++i) {
- if (m_broadcast[i] != 1) {
- nByOne = false;
- oneByN = false;
- break;
- }
- }
- }
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
-
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
- m_impl.evalSubExprsIfNeeded(NULL);
- return true;
- }
-
-#ifdef EIGEN_USE_THREADS
- template <typename EvalSubExprsCallback>
- EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
- EvaluatorPointerType, EvalSubExprsCallback done) {
- m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
- }
-#endif // EIGEN_USE_THREADS
-
- EIGEN_STRONG_INLINE void cleanup() {
- m_impl.cleanup();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const
- {
- if (internal::is_input_scalar<typename internal::remove_all<InputDimensions>::type>::value) {
- return m_impl.coeff(0);
- }
-
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- if (isCopy) {
- return m_impl.coeff(index);
- } else {
- return coeffColMajor(index);
- }
- } else {
- if (isCopy) {
- return m_impl.coeff(index);
- } else {
- return coeffRowMajor(index);
- }
- }
- }
-
-  // TODO: attempt to speed this up. The integer divisions and modulo operations are slow.
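-  // Worked example (col-major, illustrative): input dims [2, 3], broadcast
-  // [2, 1], so output dims are [4, 3] with output strides [1, 4]. For output
-  // index 5 (coords (1, 1)): idx = 5 / 4 = 1 in dim 1 contributes 1 * 2 = 2,
-  // the remainder 1 in dim 0 gives 1 % 2 = 1, so inputIndex = 2 + 1 = 3,
-  // i.e. input coords (1, 1).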
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexColMajor(Index index) const {
- Index inputIndex = 0;
- EIGEN_UNROLL_LOOP
- for (int i = NumDims - 1; i > 0; --i) {
- const Index idx = index / m_outputStrides[i];
- if (internal::index_statically_eq<Broadcast>(i, 1)) {
- eigen_assert(idx < m_impl.dimensions()[i]);
- inputIndex += idx * m_inputStrides[i];
- } else {
- if (internal::index_statically_eq<InputDimensions>(i, 1)) {
- eigen_assert(idx % m_impl.dimensions()[i] == 0);
- } else {
- inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
- }
- }
- index -= idx * m_outputStrides[i];
- }
- if (internal::index_statically_eq<Broadcast>(0, 1)) {
- eigen_assert(index < m_impl.dimensions()[0]);
- inputIndex += index;
- } else {
- if (internal::index_statically_eq<InputDimensions>(0, 1)) {
- eigen_assert(index % m_impl.dimensions()[0] == 0);
- } else {
- inputIndex += (index % m_impl.dimensions()[0]);
- }
- }
- return inputIndex;
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const
- {
- return m_impl.coeff(indexColMajor(index));
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexRowMajor(Index index) const {
- Index inputIndex = 0;
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < NumDims - 1; ++i) {
- const Index idx = index / m_outputStrides[i];
- if (internal::index_statically_eq<Broadcast>(i, 1)) {
- eigen_assert(idx < m_impl.dimensions()[i]);
- inputIndex += idx * m_inputStrides[i];
- } else {
- if (internal::index_statically_eq<InputDimensions>(i, 1)) {
- eigen_assert(idx % m_impl.dimensions()[i] == 0);
- } else {
- inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
- }
- }
- index -= idx * m_outputStrides[i];
- }
- if (internal::index_statically_eq<Broadcast>(NumDims - 1, 1)) {
- eigen_assert(index < m_impl.dimensions()[NumDims - 1]);
- inputIndex += index;
- } else {
- if (internal::index_statically_eq<InputDimensions>(NumDims - 1, 1)) {
- eigen_assert(index % m_impl.dimensions()[NumDims - 1] == 0);
- } else {
- inputIndex += (index % m_impl.dimensions()[NumDims - 1]);
- }
- }
- return inputIndex;
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const
- {
- return m_impl.coeff(indexRowMajor(index));
- }
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const
- {
- if (internal::is_input_scalar<typename internal::remove_all<InputDimensions>::type>::value) {
- return internal::pset1<PacketReturnType>(m_impl.coeff(0));
- }
-
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- if (isCopy) {
- #ifdef EIGEN_GPU_COMPILE_PHASE
-          // See PR 437: on NVIDIA P100 and K20m we observed a 3-4x speedup by enforcing
-          // unaligned loads here. The reason is unclear, though.
- return m_impl.template packet<Unaligned>(index);
- #else
- return m_impl.template packet<LoadMode>(index);
- #endif
- } else if (oneByN && !nByOne) {
- return packetNByOne<LoadMode>(index);
- } else if (!oneByN && nByOne) {
- return packetOneByN<LoadMode>(index);
- } else if (oneByN && nByOne) {
- return packetOneByNByOne<LoadMode>(index);
- } else {
- return packetColMajor<LoadMode>(index);
- }
- } else {
- if (isCopy) {
- #ifdef EIGEN_GPU_COMPILE_PHASE
- // See above.
- return m_impl.template packet<Unaligned>(index);
- #else
- return m_impl.template packet<LoadMode>(index);
- #endif
- } else if (oneByN && !nByOne) {
- return packetOneByN<LoadMode>(index);
- } else if (!oneByN && nByOne) {
- return packetNByOne<LoadMode>(index);
- } else if (oneByN && nByOne) {
- return packetOneByNByOne<LoadMode>(index);
- } else {
- return packetRowMajor<LoadMode>(index);
- }
- }
- }
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByNByOne
- (Index index) const
- {
- EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
-
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
- Index startDim, endDim;
- Index inputIndex, outputOffset, batchedIndex;
-
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- startDim = NumDims - 1;
- endDim = 1;
- } else {
- startDim = 0;
- endDim = NumDims - 2;
- }
-
- batchedIndex = index % m_outputStrides[startDim];
- inputIndex = batchedIndex / m_outputStrides[endDim];
- outputOffset = batchedIndex % m_outputStrides[endDim];
-
- if (outputOffset + PacketSize <= m_outputStrides[endDim]) {
- values[0] = m_impl.coeff(inputIndex);
- return internal::pload1<PacketReturnType>(values);
- } else {
- EIGEN_UNROLL_LOOP
- for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) {
- if (outputOffset + cur < m_outputStrides[endDim]) {
- values[i] = m_impl.coeff(inputIndex);
- } else {
- ++inputIndex;
- inputIndex = (inputIndex == m_inputStrides[startDim] ? 0 : inputIndex);
- values[i] = m_impl.coeff(inputIndex);
- outputOffset = 0;
- cur = 0;
- }
- }
- return internal::pload<PacketReturnType>(values);
- }
- }
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByN(Index index) const
- {
- EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
-
- Index dim, inputIndex;
-
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- dim = NumDims - 1;
- } else {
- dim = 0;
- }
-
- inputIndex = index % m_inputStrides[dim];
- if (inputIndex + PacketSize <= m_inputStrides[dim]) {
- return m_impl.template packet<Unaligned>(inputIndex);
- } else {
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < PacketSize; ++i) {
- if (inputIndex > m_inputStrides[dim]-1) {
- inputIndex = 0;
- }
- values[i] = m_impl.coeff(inputIndex++);
- }
- return internal::pload<PacketReturnType>(values);
- }
- }
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetNByOne(Index index) const
- {
- EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
-
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
- Index dim, inputIndex, outputOffset;
-
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- dim = 1;
- } else {
- dim = NumDims - 2;
- }
-
- inputIndex = index / m_outputStrides[dim];
- outputOffset = index % m_outputStrides[dim];
- if (outputOffset + PacketSize <= m_outputStrides[dim]) {
- values[0] = m_impl.coeff(inputIndex);
- return internal::pload1<PacketReturnType>(values);
- } else {
- EIGEN_UNROLL_LOOP
- for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) {
- if (outputOffset + cur < m_outputStrides[dim]) {
- values[i] = m_impl.coeff(inputIndex);
- } else {
- values[i] = m_impl.coeff(++inputIndex);
- outputOffset = 0;
- cur = 0;
- }
- }
- return internal::pload<PacketReturnType>(values);
- }
- }
-
- // Ignore the LoadMode and always use unaligned loads since we can't guarantee
- // the alignment at compile time.
- template<int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
- {
- EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
-
- const Index originalIndex = index;
-
- Index inputIndex = 0;
- EIGEN_UNROLL_LOOP
- for (int i = NumDims - 1; i > 0; --i) {
- const Index idx = index / m_outputStrides[i];
- if (internal::index_statically_eq<Broadcast>(i, 1)) {
- eigen_assert(idx < m_impl.dimensions()[i]);
- inputIndex += idx * m_inputStrides[i];
- } else {
- if (internal::index_statically_eq<InputDimensions>(i, 1)) {
- eigen_assert(idx % m_impl.dimensions()[i] == 0);
- } else {
- inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
- }
- }
- index -= idx * m_outputStrides[i];
- }
- Index innermostLoc;
- if (internal::index_statically_eq<Broadcast>(0, 1)) {
- eigen_assert(index < m_impl.dimensions()[0]);
- innermostLoc = index;
- } else {
- if (internal::index_statically_eq<InputDimensions>(0, 1)) {
- eigen_assert(index % m_impl.dimensions()[0] == 0);
- innermostLoc = 0;
- } else {
- innermostLoc = index % m_impl.dimensions()[0];
- }
- }
- inputIndex += innermostLoc;
-
-    // TODO: this could be extended to the second dimension if we're not
-    // broadcasting alongside the first dimension, and so on.
- if (innermostLoc + PacketSize <= m_impl.dimensions()[0]) {
- return m_impl.template packet<Unaligned>(inputIndex);
- } else {
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
- values[0] = m_impl.coeff(inputIndex);
- EIGEN_UNROLL_LOOP
- for (int i = 1; i < PacketSize; ++i) {
- if (innermostLoc + i < m_impl.dimensions()[0]) {
- values[i] = m_impl.coeff(inputIndex+i);
- } else {
- values[i] = coeffColMajor(originalIndex+i);
- }
- }
- PacketReturnType rslt = internal::pload<PacketReturnType>(values);
- return rslt;
- }
- }
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
- {
- EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
-
- const Index originalIndex = index;
-
- Index inputIndex = 0;
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < NumDims - 1; ++i) {
- const Index idx = index / m_outputStrides[i];
- if (internal::index_statically_eq<Broadcast>(i, 1)) {
- eigen_assert(idx < m_impl.dimensions()[i]);
- inputIndex += idx * m_inputStrides[i];
- } else {
- if (internal::index_statically_eq<InputDimensions>(i, 1)) {
- eigen_assert(idx % m_impl.dimensions()[i] == 0);
- } else {
- inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
- }
- }
- index -= idx * m_outputStrides[i];
- }
- Index innermostLoc;
- if (internal::index_statically_eq<Broadcast>(NumDims-1, 1)) {
- eigen_assert(index < m_impl.dimensions()[NumDims-1]);
- innermostLoc = index;
- } else {
- if (internal::index_statically_eq<InputDimensions>(NumDims-1, 1)) {
- eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0);
- innermostLoc = 0;
- } else {
- innermostLoc = index % m_impl.dimensions()[NumDims-1];
- }
- }
- inputIndex += innermostLoc;
-
-    // TODO: this could be extended to the second dimension if we're not
-    // broadcasting alongside the first dimension, and so on.
- if (innermostLoc + PacketSize <= m_impl.dimensions()[NumDims-1]) {
- return m_impl.template packet<Unaligned>(inputIndex);
- } else {
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
- values[0] = m_impl.coeff(inputIndex);
- EIGEN_UNROLL_LOOP
- for (int i = 1; i < PacketSize; ++i) {
- if (innermostLoc + i < m_impl.dimensions()[NumDims-1]) {
- values[i] = m_impl.coeff(inputIndex+i);
- } else {
- values[i] = coeffRowMajor(originalIndex+i);
- }
- }
- PacketReturnType rslt = internal::pload<PacketReturnType>(values);
- return rslt;
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
- costPerCoeff(bool vectorized) const {
- double compute_cost = TensorOpCost::AddCost<Index>();
- if (!isCopy && NumDims > 0) {
- EIGEN_UNROLL_LOOP
- for (int i = NumDims - 1; i > 0; --i) {
- compute_cost += TensorOpCost::DivCost<Index>();
- if (internal::index_statically_eq<Broadcast>(i, 1)) {
- compute_cost +=
- TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
- } else {
- if (!internal::index_statically_eq<InputDimensions>(i, 1)) {
- compute_cost += TensorOpCost::MulCost<Index>() +
- TensorOpCost::ModCost<Index>() +
- TensorOpCost::AddCost<Index>();
- }
- }
- compute_cost +=
- TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
- }
- }
- return m_impl.costPerCoeff(vectorized) +
- TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- internal::TensorBlockResourceRequirements getResourceRequirements() const {
- // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large
- // tensors. But this might need further tuning.
- const size_t target_size = m_device.firstLevelCacheSize();
- return internal::TensorBlockResourceRequirements::merge(
- m_impl.getResourceRequirements(),
- internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size));
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
- block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
- bool /*root_of_expr_ast*/ = false) const {
- BlockBroadcastingParams params = blockBroadcastingParams(desc);
-
- if (params.inner_dim_size == 0 || params.bcast_dim_size == 0) {
- return emptyBlock();
- }
-
- // Prepare storage for the materialized broadcasting result.
- const typename TensorBlock::Storage block_storage =
- TensorBlock::prepareStorage(desc, scratch);
- ScalarNoConst* materialized_output = block_storage.data();
-
-    // We may need to materialize input blocks.
- size_t materialized_input_size = 0;
- ScalarNoConst* materialized_input = NULL;
-
-    // Initialize the block broadcasting iterator state for outer dimensions
-    // (outer with regard to the bcast dimension). Dimensions in this array are
-    // always in innermost -> outermost order (col-major layout).
- array<BlockBroadcastingIteratorState, NumDims> it;
- int idx = 0;
-
- for (int i = params.inner_dim_count + 1; i < NumDims; ++i) {
- const Index dim = IsColMajor ? i : NumDims - 1 - i;
- it[idx].size = params.output_dims[dim];
- it[idx].count = 0;
- it[idx].output_stride = m_outputStrides[dim];
- it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
- idx++;
- }
-
- // Write output into the beginning of `materialized_output`.
- Index output_offset = 0;
-
- // We will fill output block by broadcasting along the bcast dim, and
- // iterating over outer dimension.
- const Index output_size = NumDims == 0 ? 1 : params.output_dims.TotalSize();
-
- for (Index num_output_coeffs = 0; num_output_coeffs < output_size;) {
- ScalarNoConst* bcast_output = materialized_output + num_output_coeffs;
- Index bcast_offset = desc.offset() + output_offset;
-
- // Broadcast along the bcast dimension.
- num_output_coeffs += BroadcastBlockAlongBcastDim(
- params, bcast_offset, scratch, bcast_output, &materialized_input,
- &materialized_input_size);
-
- // Switch to the next outer dimension.
- for (int j = 0; j < idx; ++j) {
- if (++it[j].count < it[j].size) {
- output_offset += it[j].output_stride;
- break;
- }
- it[j].count = 0;
- output_offset -= it[j].output_span;
- }
- }
-
- return block_storage.AsTensorMaterializedBlock();
- }
-
- EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
-
- const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
-
- Broadcast functor() const { return m_broadcast; }
-#ifdef EIGEN_USE_SYCL
- // binding placeholder accessors to a command group handler for SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(
- cl::sycl::handler& cgh) const {
- m_impl.bind(cgh);
- }
-#endif
- private:
- static const bool IsColMajor =
- static_cast<int>(Layout) == static_cast<int>(ColMajor);
-
-  // We build the general-case block broadcasting on top of a broadcasting
-  // primitive that broadcasts only the inner dimension(s), along the first
-  // dimension whose block size is smaller than the input size (it's called
-  // `bcast_dim`).
- //
- // Example:
- // dim: 0 1 2 (ColMajor)
- // input size: [9, 3, 6]
- // block size: [9, 2, 6]
- //
- // We will compute broadcasted block by iterating over the outer dimensions
- // before `bcast_dim` (only dimension `2` in this example) and computing
- // broadcasts along the `bcast_dim` (dimension `1` in this example).
-
- // BlockBroadcastingParams holds precomputed parameters for broadcasting a
- // single block along the broadcasting dimension. Sizes and strides along the
- // `bcast_dim` might be invalid, they will be adjusted later in
- // `BroadcastBlockAlongBcastDim`.
- struct BlockBroadcastingParams {
- Dimensions input_dims; // input expression dimensions
- Dimensions output_dims; // output block sizes
- Dimensions output_strides; // output block strides
-
- int inner_dim_count; // count inner dimensions matching in size
- int bcast_dim; // broadcasting dimension index
- Index bcast_dim_size; // broadcasting dimension size
- Index inner_dim_size; // inner dimensions size
-
- // Block sizes and strides for the input block where all dimensions before
- // `bcast_dim` are equal to `1`.
- Dimensions input_block_sizes;
- Dimensions input_block_strides;
-
- // Block sizes and strides for blocks with extra dimensions and strides `0`.
- BroadcastDimensions bcast_block_sizes;
- BroadcastDimensions bcast_block_strides;
- BroadcastDimensions bcast_input_strides;
- };
-
- struct BlockBroadcastingIteratorState {
- Index size;
- Index count;
- Index output_stride;
- Index output_span;
- };
-
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlockBroadcastingParams
- blockBroadcastingParams(TensorBlockDesc& desc) const {
- BlockBroadcastingParams params;
-
- params.input_dims = Dimensions(m_impl.dimensions());
-
- // Output block sizes and strides.
- params.output_dims = desc.dimensions();
- params.output_strides = internal::strides<Layout>(params.output_dims);
-
-    // Find the broadcasting dimension (the first dimension with output size
-    // smaller than the input size).
- params.bcast_dim = 0;
- params.bcast_dim_size = 1;
- params.inner_dim_size = 1;
-
- // Count the number of inner dimensions that have the same size in the block
- // and in the broadcast expression.
- params.inner_dim_count = 0;
-
- for (int i = 0; i < NumDims; ++i) {
- const int dim = IsColMajor ? i : NumDims - i - 1;
-
- if (params.output_dims[dim] == m_dimensions[dim]) {
- params.inner_dim_size *= params.output_dims[dim];
- ++params.inner_dim_count;
- continue;
- }
-
- // First non-matching dimension is the broadcasting dimension.
- eigen_assert(params.output_dims[dim] < m_dimensions[dim]);
- params.bcast_dim = dim;
- params.bcast_dim_size = params.output_dims[dim];
- break;
- }
-
- // Calculate the input block size for looking into the input.
- for (int i = 0; i < params.inner_dim_count; ++i) {
- const int dim = IsColMajor ? i : NumDims - i - 1;
- params.input_block_sizes[dim] = params.input_dims[dim];
- }
- for (int i = params.inner_dim_count; i < NumDims; ++i) {
- const int dim = IsColMajor ? i : NumDims - i - 1;
- params.input_block_sizes[dim] = 1;
- }
- params.input_block_strides =
- internal::strides<Layout>(params.input_block_sizes);
-
-    // Broadcast with the 0-stride trick: create one extra dim for each
-    // broadcast dim, and set its input stride to 0.
- //
- // When ColMajor:
- //
- // - bcast_block_sizes:
- // [d_0, b_0, d_1, b_1, ...]
- //
- // - bcast_block_strides:
- // [output_block_strides[0], output_block_strides[0] * d_0,
- // output_block_strides[1], output_block_strides[1] * d_1,
- // ...]
- //
- // - bcast_input_strides:
- // [input_block_strides[0], 0,
- // input_block_strides[1], 0,
- // ...].
- //
- for (int i = 0; i < params.inner_dim_count; ++i) {
- const int dim = IsColMajor ? i : NumDims - i - 1;
-
- const int copy_dim = IsColMajor ? 2 * i : 2 * NumDims - 2 * i - 1;
- const int broadcast_dim = IsColMajor ? copy_dim + 1 : copy_dim - 1;
-
- params.bcast_block_sizes[copy_dim] = params.input_dims[dim];
- params.bcast_block_sizes[broadcast_dim] = m_broadcast[dim];
- params.bcast_block_strides[copy_dim] = params.output_strides[dim];
- params.bcast_block_strides[broadcast_dim] =
- params.output_strides[dim] * params.input_dims[dim];
- params.bcast_input_strides[copy_dim] = params.input_block_strides[dim];
- params.bcast_input_strides[broadcast_dim] = 0;
- }
-
- for (int i = 2 * params.inner_dim_count; i < 2 * NumDims; ++i) {
- const int dim = IsColMajor ? i : 2 * NumDims - i - 1;
- params.bcast_block_sizes[dim] = 1;
- params.bcast_block_strides[dim] = 0;
- params.bcast_input_strides[dim] = 0;
- }
-
- return params;
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock emptyBlock() const {
- DSizes<Index, NumDims> dimensions;
- for (int i = 0; i < NumDims; ++i) dimensions[i] = 0;
- return TensorBlock(internal::TensorBlockKind::kView, NULL, dimensions);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlockAlongBcastDim(
- BlockBroadcastingParams params, Index bcast_offset,
- TensorBlockScratch& scratch, ScalarNoConst* materialized_output,
- ScalarNoConst** materialized_input,
- size_t* materialized_input_size) const {
- if (params.bcast_dim_size == 1) {
-      // We need just one block read, using the parameters already set above.
- return BroadcastBlock(
- params.input_block_sizes, params.input_block_strides,
- params.bcast_block_sizes, params.bcast_block_strides,
- params.bcast_input_strides, bcast_offset, 0, scratch,
- materialized_output, materialized_input, materialized_input_size);
-
- } else if (params.input_dims[params.bcast_dim] == 1) {
- // Broadcast bcast dimension (< NumDims) by bcast_dim_size.
- const int broadcast_bcast_dim =
- IsColMajor ? 2 * params.inner_dim_count + 1
- : 2 * NumDims - 2 * params.inner_dim_count - 2;
-
- params.bcast_block_sizes[broadcast_bcast_dim] = params.bcast_dim_size;
- params.bcast_input_strides[broadcast_bcast_dim] = 0;
- params.bcast_block_strides[broadcast_bcast_dim] =
- params.output_strides[params.bcast_dim];
-
- return BroadcastBlock(
- params.input_block_sizes, params.input_block_strides,
- params.bcast_block_sizes, params.bcast_block_strides,
- params.bcast_input_strides, bcast_offset, 0, scratch,
- materialized_output, materialized_input, materialized_input_size);
-
- } else {
- // Keep track of the total number of the coefficients written to the
- // output block.
- Index num_output_coeffs = 0;
-
- // The general case. Let's denote the output block as
- //
- // x[..., a:a+bcast_dim_size, :, ..., :]
- //
- // where a:a+bcast_dim_size is a slice on the bcast_dim dimension
- // (< NumDims). We need to split the a:a+bcast_dim_size into possibly 3
- // sub-blocks:
- //
- // (1) a:b, where b is the smallest multiple of
- // input_dims[bcast_dim_start] in [a, a+bcast_dim_size].
- //
- // (2) b:c, where c is the largest multiple of input_dims[bcast_dim_start]
- // in [a, a+bcast_dim_size].
- //
- // (3) c:a+bcast_dim_size .
- //
- // Or, when b and c do not exist, we just need to process the whole block
- // together.
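-      // E.g. (illustrative): input_dims[bcast_dim] = 3, a = 2,
-      // bcast_dim_size = 8: then b = 3 and c = 9, so we process the head 2:3
-      // (partial copy), the middle 3:9 (two full input periods), and the
-      // tail 9:10 (partial copy) as three separate broadcasts.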
-
- // Find a.
- const Index bcast_dim_left_index =
- bcast_offset / m_outputStrides[params.bcast_dim];
-
- // Find b and c.
- const Index input_bcast_dim_size = params.input_dims[params.bcast_dim];
-
-      // The first multiple of input_bcast_dim_size at or after a. This is b
-      // when it is <= bcast_dim_left_index + bcast_dim_size.
- const Index first_multiple =
- divup<Index>(bcast_dim_left_index, input_bcast_dim_size) *
- input_bcast_dim_size;
-
- if (first_multiple <= bcast_dim_left_index + params.bcast_dim_size) {
- // b exists, so does c. Find it.
- const Index last_multiple =
- (bcast_dim_left_index + params.bcast_dim_size) /
- input_bcast_dim_size * input_bcast_dim_size;
- const int copy_bcast_dim =
- IsColMajor ? 2 * params.inner_dim_count
- : 2 * NumDims - 2 * params.inner_dim_count - 1;
- const int broadcast_bcast_dim =
- IsColMajor ? 2 * params.inner_dim_count + 1
- : 2 * NumDims - 2 * params.inner_dim_count - 2;
-
- if (first_multiple > bcast_dim_left_index) {
- const Index head_size = first_multiple - bcast_dim_left_index;
- params.input_block_sizes[params.bcast_dim] = head_size;
- params.bcast_block_sizes[copy_bcast_dim] = head_size;
- params.bcast_input_strides[copy_bcast_dim] =
- params.input_block_strides[params.bcast_dim];
- params.bcast_block_strides[copy_bcast_dim] =
- params.output_strides[params.bcast_dim];
- params.bcast_block_sizes[broadcast_bcast_dim] = 1;
- params.bcast_input_strides[broadcast_bcast_dim] = 0;
- params.bcast_block_strides[broadcast_bcast_dim] =
- params.output_strides[params.bcast_dim] *
- params.input_dims[params.bcast_dim];
-
- num_output_coeffs += BroadcastBlock(
- params.input_block_sizes, params.input_block_strides,
- params.bcast_block_sizes, params.bcast_block_strides,
- params.bcast_input_strides, bcast_offset, 0, scratch,
- materialized_output, materialized_input, materialized_input_size);
- }
- if (first_multiple < last_multiple) {
- params.input_block_sizes[params.bcast_dim] = input_bcast_dim_size;
- params.bcast_block_sizes[copy_bcast_dim] = input_bcast_dim_size;
- params.bcast_input_strides[copy_bcast_dim] =
- params.input_block_strides[params.bcast_dim];
- params.bcast_block_strides[copy_bcast_dim] =
- params.output_strides[params.bcast_dim];
- params.bcast_block_sizes[broadcast_bcast_dim] =
- (last_multiple - first_multiple) / input_bcast_dim_size;
- params.bcast_input_strides[broadcast_bcast_dim] = 0;
- params.bcast_block_strides[broadcast_bcast_dim] =
- params.output_strides[params.bcast_dim] *
- params.input_dims[params.bcast_dim];
- const Index offset = (first_multiple - bcast_dim_left_index) *
- m_outputStrides[params.bcast_dim];
-
- num_output_coeffs += BroadcastBlock(
- params.input_block_sizes, params.input_block_strides,
- params.bcast_block_sizes, params.bcast_block_strides,
- params.bcast_input_strides, bcast_offset, offset, scratch,
- materialized_output, materialized_input, materialized_input_size);
- }
- if (last_multiple < bcast_dim_left_index + params.bcast_dim_size) {
- const Index tail_size =
- bcast_dim_left_index + params.bcast_dim_size - last_multiple;
- params.input_block_sizes[params.bcast_dim] = tail_size;
- params.bcast_block_sizes[copy_bcast_dim] = tail_size;
- params.bcast_input_strides[copy_bcast_dim] =
- params.input_block_strides[params.bcast_dim];
- params.bcast_block_strides[copy_bcast_dim] =
- params.output_strides[params.bcast_dim];
- params.bcast_block_sizes[broadcast_bcast_dim] = 1;
- params.bcast_input_strides[broadcast_bcast_dim] = 0;
- params.bcast_block_strides[broadcast_bcast_dim] =
- params.output_strides[params.bcast_dim] *
- params.input_dims[params.bcast_dim];
- const Index offset = (last_multiple - bcast_dim_left_index) *
- m_outputStrides[params.bcast_dim];
-
- num_output_coeffs += BroadcastBlock(
- params.input_block_sizes, params.input_block_strides,
- params.bcast_block_sizes, params.bcast_block_strides,
- params.bcast_input_strides, bcast_offset, offset, scratch,
- materialized_output, materialized_input, materialized_input_size);
- }
- } else {
- // b and c do not exist.
- const int copy_bcast_dim =
- IsColMajor ? 2 * params.inner_dim_count
- : 2 * NumDims - 2 * params.inner_dim_count - 1;
- params.input_block_sizes[params.bcast_dim] = params.bcast_dim_size;
- params.bcast_block_sizes[copy_bcast_dim] = params.bcast_dim_size;
- params.bcast_input_strides[copy_bcast_dim] =
- params.input_block_strides[params.bcast_dim];
- params.bcast_block_strides[copy_bcast_dim] =
- params.output_strides[params.bcast_dim];
-
- num_output_coeffs += BroadcastBlock(
- params.input_block_sizes, params.input_block_strides,
- params.bcast_block_sizes, params.bcast_block_strides,
- params.bcast_input_strides, bcast_offset, 0, scratch,
- materialized_output, materialized_input, materialized_input_size);
- }
-
- return num_output_coeffs;
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlock(
- const Dimensions& input_block_sizes,
- const Dimensions& input_block_strides,
- const BroadcastDimensions& bcast_block_sizes,
- const BroadcastDimensions& bcast_block_strides,
- const BroadcastDimensions& bcast_input_strides, Index bcast_offset,
- Index offset, TensorBlockScratch& scratch,
- ScalarNoConst* materialized_output, ScalarNoConst** materialized_input,
- size_t* materialized_input_size) const {
- // ---------------------------------------------------------------------- //
- // Tensor block descriptor for reading block from the input.
- const Index input_offset = bcast_offset + offset;
- TensorBlockDesc input_desc(
- IsColMajor ? indexColMajor(input_offset) : indexRowMajor(input_offset),
- input_block_sizes);
-
- ArgTensorBlock input_block = m_impl.block(input_desc, scratch);
-
- // ---------------------------------------------------------------------- //
- // Materialize input block into a temporary memory buffer only if it's not
- // already available in the arg block.
- const ScalarNoConst* input_buffer = NULL;
-
- if (input_block.data() != NULL) {
- // Input block already has raw data, there is no need to materialize it.
- input_buffer = input_block.data();
-
- } else {
- // Otherwise we have to do block assignment into a temporary buffer.
-
- // Maybe reuse previously allocated buffer, or allocate a new one with a
- // scratch allocator.
- const size_t input_total_size = input_block_sizes.TotalSize();
- if (*materialized_input == NULL ||
- *materialized_input_size < input_total_size) {
- *materialized_input_size = input_total_size;
- void* mem = scratch.allocate(*materialized_input_size * sizeof(Scalar));
- *materialized_input = static_cast<ScalarNoConst*>(mem);
- }
-
- typedef internal::TensorBlockAssignment<
- ScalarNoConst, NumDims, typename ArgTensorBlock::XprType, Index>
- TensorBlockAssignment;
-
- TensorBlockAssignment::Run(
- TensorBlockAssignment::target(input_block_sizes, input_block_strides,
- *materialized_input),
- input_block.expr());
-
- input_buffer = *materialized_input;
- }
-
- // ---------------------------------------------------------------------- //
- // Copy data from materialized input block to the materialized output, using
- // given broadcast strides (strides with zeroes).
- typedef internal::TensorBlockIO<ScalarNoConst, Index, 2 * NumDims, Layout>
- TensorBlockIO;
-
- typename TensorBlockIO::Src src(bcast_input_strides, input_buffer);
- typename TensorBlockIO::Dst dst(bcast_block_sizes, bcast_block_strides,
- materialized_output + offset);
-
- return TensorBlockIO::Copy(dst, src);
- }
-
-protected:
- const Device EIGEN_DEVICE_REF m_device;
- const typename internal::remove_reference<Broadcast>::type m_broadcast;
- Dimensions m_dimensions;
- array<Index, NumDims> m_outputStrides;
- array<Index, NumDims> m_inputStrides;
- TensorEvaluator<ArgType, Device> m_impl;
-};
-
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorChipping.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorChipping.h
deleted file mode 100644
index 3764573..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorChipping.h
+++ /dev/null
@@ -1,518 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H
-#define EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H
-
-namespace Eigen {
-
-/** \class TensorChippingOp
- * \ingroup CXX11_Tensor_Module
- *
- * \brief A chip is a thin slice of a tensor, corresponding to a column or a
- * row in a 2-d tensor; the chipped dimension is removed from the result.
- */
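-// A minimal usage sketch (illustrative): chipping dimension 2 of a rank-3
-// tensor at offset 1 yields a rank-2 tensor holding that slice.
-//
-//   Eigen::Tensor<float, 3> input(2, 3, 4);
-//   input.setRandom();
-//   Eigen::Tensor<float, 2> slice = input.chip(1, 2);  // offset 1, dim 2
-//   // slice(i, j) == input(i, j, 1); shape [2, 3]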
-
-namespace internal {
-template<DenseIndex DimId, typename XprType>
-struct traits<TensorChippingOp<DimId, XprType> > : public traits<XprType>
-{
- typedef typename XprType::Scalar Scalar;
- typedef traits<XprType> XprTraits;
- typedef typename XprTraits::StorageKind StorageKind;
- typedef typename XprTraits::Index Index;
- typedef typename XprType::Nested Nested;
- typedef typename remove_reference<Nested>::type _Nested;
- static const int NumDimensions = XprTraits::NumDimensions - 1;
- static const int Layout = XprTraits::Layout;
- typedef typename XprTraits::PointerType PointerType;
-};
-
-template<DenseIndex DimId, typename XprType>
-struct eval<TensorChippingOp<DimId, XprType>, Eigen::Dense>
-{
- typedef const TensorChippingOp<DimId, XprType> EIGEN_DEVICE_REF type;
-};
-
-template<DenseIndex DimId, typename XprType>
-struct nested<TensorChippingOp<DimId, XprType>, 1, typename eval<TensorChippingOp<DimId, XprType> >::type>
-{
- typedef TensorChippingOp<DimId, XprType> type;
-};
-
-template <DenseIndex DimId>
-struct DimensionId
-{
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) {
- EIGEN_UNUSED_VARIABLE(dim);
- eigen_assert(dim == DimId);
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const {
- return DimId;
- }
-};
-template <>
-struct DimensionId<Dynamic>
-{
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) : actual_dim(dim) {
- eigen_assert(dim >= 0);
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const {
- return actual_dim;
- }
- private:
- const DenseIndex actual_dim;
-};
-
-
-} // end namespace internal
-
-
-
-template<DenseIndex DimId, typename XprType>
-class TensorChippingOp : public TensorBase<TensorChippingOp<DimId, XprType> >
-{
- public:
- typedef TensorBase<TensorChippingOp<DimId, XprType> > Base;
- typedef typename Eigen::internal::traits<TensorChippingOp>::Scalar Scalar;
- typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename Eigen::internal::nested<TensorChippingOp>::type Nested;
- typedef typename Eigen::internal::traits<TensorChippingOp>::StorageKind StorageKind;
- typedef typename Eigen::internal::traits<TensorChippingOp>::Index Index;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset, const Index dim)
- : m_xpr(expr), m_offset(offset), m_dim(dim) {
- }
-
- EIGEN_DEVICE_FUNC
- const Index offset() const { return m_offset; }
- EIGEN_DEVICE_FUNC
- const Index dim() const { return m_dim.actualDim(); }
-
- EIGEN_DEVICE_FUNC
- const typename internal::remove_all<typename XprType::Nested>::type&
- expression() const { return m_xpr; }
-
- EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorChippingOp)
-
- protected:
- typename XprType::Nested m_xpr;
- const Index m_offset;
- const internal::DimensionId<DimId> m_dim;
-};
-
-
-// Eval as rvalue
-template<DenseIndex DimId, typename ArgType, typename Device>
-struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
-{
- typedef TensorChippingOp<DimId, ArgType> XprType;
- static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
- static const int NumDims = NumInputDims-1;
- typedef typename XprType::Index Index;
- typedef DSizes<Index, NumDims> Dimensions;
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
- typedef StorageMemory<CoeffReturnType, Device> Storage;
- typedef typename Storage::Type EvaluatorPointerType;
-
- enum {
- // Alignment can't be guaranteed at compile time since it depends on the
- // slice offsets.
- IsAligned = false,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
-    // Chipping of the outer-most dimension is a trivial operation, because we
-    // can read and write directly from the underlying tensor using a single offset.
- IsOuterChipping = (static_cast<int>(Layout) == ColMajor && DimId == NumInputDims - 1) ||
- (static_cast<int>(Layout) == RowMajor && DimId == 0),
- // Chipping inner-most dimension.
- IsInnerChipping = (static_cast<int>(Layout) == ColMajor && DimId == 0) ||
- (static_cast<int>(Layout) == RowMajor && DimId == NumInputDims - 1),
- // Prefer block access if the underlying expression prefers it, otherwise
- // only if chipping is not trivial.
- PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess ||
- !IsOuterChipping,
- CoordAccess = false, // to be implemented
- RawAccess = false
- };
-
- typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
- typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
-
- typedef internal::TensorBlockDescriptor<NumInputDims, Index>
- ArgTensorBlockDesc;
- typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
- ArgTensorBlock;
-
- typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
- Layout, Index>
- TensorBlock;
- //===--------------------------------------------------------------------===//
-
- EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device)
- {
- EIGEN_STATIC_ASSERT((NumInputDims >= 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
- eigen_assert(NumInputDims > m_dim.actualDim());
-
- const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
- eigen_assert(op.offset() < input_dims[m_dim.actualDim()]);
-
- int j = 0;
- for (int i = 0; i < NumInputDims; ++i) {
- if (i != m_dim.actualDim()) {
- m_dimensions[j] = input_dims[i];
- ++j;
- }
- }
-
- m_stride = 1;
- m_inputStride = 1;
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- for (int i = 0; i < m_dim.actualDim(); ++i) {
- m_stride *= input_dims[i];
- m_inputStride *= input_dims[i];
- }
- } else {
- for (int i = NumInputDims-1; i > m_dim.actualDim(); --i) {
- m_stride *= input_dims[i];
- m_inputStride *= input_dims[i];
- }
- }
- m_inputStride *= input_dims[m_dim.actualDim()];
- m_inputOffset = m_stride * op.offset();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
-
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
- m_impl.evalSubExprsIfNeeded(NULL);
- return true;
- }
-
- EIGEN_STRONG_INLINE void cleanup() {
- m_impl.cleanup();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
- {
- return m_impl.coeff(srcCoeff(index));
- }
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
- {
- EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
-
- if (isInnerChipping()) {
- // m_stride is equal to 1, so let's avoid the integer division.
- eigen_assert(m_stride == 1);
- Index inputIndex = index * m_inputStride + m_inputOffset;
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < PacketSize; ++i) {
- values[i] = m_impl.coeff(inputIndex);
- inputIndex += m_inputStride;
- }
- PacketReturnType rslt = internal::pload<PacketReturnType>(values);
- return rslt;
- } else if (isOuterChipping()) {
- // m_stride is always greater than index, so let's avoid the integer division.
- eigen_assert(m_stride > index);
- return m_impl.template packet<LoadMode>(index + m_inputOffset);
- } else {
- const Index idx = index / m_stride;
- const Index rem = index - idx * m_stride;
- if (rem + PacketSize <= m_stride) {
- Index inputIndex = idx * m_inputStride + m_inputOffset + rem;
- return m_impl.template packet<LoadMode>(inputIndex);
- } else {
-        // Crossing the stride boundary. Fall back to the slow path.
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < PacketSize; ++i) {
- values[i] = coeff(index);
- ++index;
- }
- PacketReturnType rslt = internal::pload<PacketReturnType>(values);
- return rslt;
- }
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
- costPerCoeff(bool vectorized) const {
- double cost = 0;
- if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) &&
- m_dim.actualDim() == 0) ||
- (static_cast<int>(Layout) == static_cast<int>(RowMajor) &&
- m_dim.actualDim() == NumInputDims - 1)) {
- cost += TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
- } else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) &&
- m_dim.actualDim() == NumInputDims - 1) ||
- (static_cast<int>(Layout) == static_cast<int>(RowMajor) &&
- m_dim.actualDim() == 0)) {
- cost += TensorOpCost::AddCost<Index>();
- } else {
- cost += 3 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>() +
- 3 * TensorOpCost::AddCost<Index>();
- }
-
- return m_impl.costPerCoeff(vectorized) +
- TensorOpCost(0, 0, cost, vectorized, PacketSize);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- internal::TensorBlockResourceRequirements getResourceRequirements() const {
- const size_t target_size = m_device.lastLevelCacheSize();
- return internal::TensorBlockResourceRequirements::merge(
- internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size),
- m_impl.getResourceRequirements());
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
- block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
- bool root_of_expr_ast = false) const {
- const Index chip_dim = m_dim.actualDim();
-
- DSizes<Index, NumInputDims> input_block_dims;
- for (int i = 0; i < NumInputDims; ++i) {
- input_block_dims[i]
- = i < chip_dim ? desc.dimension(i)
- : i > chip_dim ? desc.dimension(i - 1)
- : 1;
- }
-
- ArgTensorBlockDesc arg_desc(srcCoeff(desc.offset()), input_block_dims);
-
- // Try to reuse destination buffer for materializing argument block.
- if (desc.HasDestinationBuffer()) {
- DSizes<Index, NumInputDims> arg_destination_strides;
- for (int i = 0; i < NumInputDims; ++i) {
- arg_destination_strides[i]
- = i < chip_dim ? desc.destination().strides()[i]
- : i > chip_dim ? desc.destination().strides()[i - 1]
-            : 0; // for dimensions of size `1`, the stride should never be used.
- }
-
- arg_desc.template AddDestinationBuffer<Layout>(
- desc.destination().template data<ScalarNoConst>(),
- arg_destination_strides);
- }
-
- ArgTensorBlock arg_block = m_impl.block(arg_desc, scratch, root_of_expr_ast);
- if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer();
-
- if (arg_block.data() != NULL) {
- // Forward argument block buffer if possible.
- return TensorBlock(arg_block.kind(), arg_block.data(),
- desc.dimensions());
-
- } else {
- // Assign argument block expression to a buffer.
-
- // Prepare storage for the materialized chipping result.
- const typename TensorBlock::Storage block_storage =
- TensorBlock::prepareStorage(desc, scratch);
-
- typedef internal::TensorBlockAssignment<
- ScalarNoConst, NumInputDims, typename ArgTensorBlock::XprType, Index>
- TensorBlockAssignment;
-
- TensorBlockAssignment::Run(
- TensorBlockAssignment::target(
- arg_desc.dimensions(),
- internal::strides<Layout>(arg_desc.dimensions()),
- block_storage.data()),
- arg_block.expr());
-
- return block_storage.AsTensorMaterializedBlock();
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const {
- typename Storage::Type result = constCast(m_impl.data());
- if (isOuterChipping() && result) {
- return result + m_inputOffset;
- } else {
- return NULL;
- }
- }
-#ifdef EIGEN_USE_SYCL
- // binding placeholder accessors to a command group handler for SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
- m_impl.bind(cgh);
- }
-#endif
-
- protected:
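-  // Worked example for srcCoeff below (col-major, illustrative): input dims
-  // [2, 3, 4], chip dim 1 at offset 2, so m_stride = 2, m_inputStride = 6,
-  // m_inputOffset = 4. For output index 5: idx = 5 / 2 = 2, so
-  // inputIndex = 2 * 6 + 4 + (5 - 2 * 2) = 17, i.e. input coords (1, 2, 2).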
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
- {
- Index inputIndex;
- if (isInnerChipping()) {
- // m_stride is equal to 1, so let's avoid the integer division.
- eigen_assert(m_stride == 1);
- inputIndex = index * m_inputStride + m_inputOffset;
- } else if (isOuterChipping()) {
- // m_stride is always greater than index, so let's avoid the integer
- // division.
- eigen_assert(m_stride > index);
- inputIndex = index + m_inputOffset;
- } else {
- const Index idx = index / m_stride;
- inputIndex = idx * m_inputStride + m_inputOffset;
- index -= idx * m_stride;
- inputIndex += index;
- }
- return inputIndex;
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isInnerChipping() const {
- return IsInnerChipping ||
- (static_cast<int>(Layout) == ColMajor && m_dim.actualDim() == 0) ||
- (static_cast<int>(Layout) == RowMajor && m_dim.actualDim() == NumInputDims - 1);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isOuterChipping() const {
- return IsOuterChipping ||
- (static_cast<int>(Layout) == ColMajor && m_dim.actualDim() == NumInputDims-1) ||
- (static_cast<int>(Layout) == RowMajor && m_dim.actualDim() == 0);
- }
-
- Dimensions m_dimensions;
- Index m_stride;
- Index m_inputOffset;
- Index m_inputStride;
- TensorEvaluator<ArgType, Device> m_impl;
- const internal::DimensionId<DimId> m_dim;
- const Device EIGEN_DEVICE_REF m_device;
-};
-
-
-// Eval as lvalue
-template<DenseIndex DimId, typename ArgType, typename Device>
-struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
- : public TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
-{
- typedef TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> Base;
- typedef TensorChippingOp<DimId, ArgType> XprType;
- static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
- static const int NumDims = NumInputDims-1;
- typedef typename XprType::Index Index;
- typedef DSizes<Index, NumDims> Dimensions;
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
-
- enum {
- IsAligned = false,
- PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = TensorEvaluator<ArgType, Device>::RawAccess,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- RawAccess = false
- };
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
- //===--------------------------------------------------------------------===//
-
- EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : Base(op, device)
- { }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
- {
- return this->m_impl.coeffRef(this->srcCoeff(index));
- }
-
- template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- void writePacket(Index index, const PacketReturnType& x)
- {
- EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
-
- if (this->isInnerChipping()) {
- // m_stride is equal to 1, so let's avoid the integer division.
- eigen_assert(this->m_stride == 1);
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
- internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
- Index inputIndex = index * this->m_inputStride + this->m_inputOffset;
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < PacketSize; ++i) {
- this->m_impl.coeffRef(inputIndex) = values[i];
- inputIndex += this->m_inputStride;
- }
- } else if (this->isOuterChipping()) {
- // m_stride is always greater than index, so let's avoid the integer division.
- eigen_assert(this->m_stride > index);
- this->m_impl.template writePacket<StoreMode>(index + this->m_inputOffset, x);
- } else {
- const Index idx = index / this->m_stride;
- const Index rem = index - idx * this->m_stride;
- if (rem + PacketSize <= this->m_stride) {
- const Index inputIndex = idx * this->m_inputStride + this->m_inputOffset + rem;
- this->m_impl.template writePacket<StoreMode>(inputIndex, x);
- } else {
- // Cross stride boundary. Fallback to slow path.
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
- internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < PacketSize; ++i) {
- this->coeffRef(index) = values[i];
- ++index;
- }
- }
- }
- }
-
- template <typename TensorBlock>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
- const TensorBlockDesc& desc, const TensorBlock& block) {
- assert(this->m_impl.data() != NULL);
-
- const Index chip_dim = this->m_dim.actualDim();
-
- DSizes<Index, NumInputDims> input_block_dims;
- for (int i = 0; i < NumInputDims; ++i) {
- input_block_dims[i] = i < chip_dim ? desc.dimension(i)
- : i > chip_dim ? desc.dimension(i - 1)
- : 1;
- }
-
- typedef TensorReshapingOp<const DSizes<Index, NumInputDims>,
- const typename TensorBlock::XprType>
- TensorBlockExpr;
-
- typedef internal::TensorBlockAssignment<Scalar, NumInputDims,
- TensorBlockExpr, Index>
- TensorBlockAssign;
-
- TensorBlockAssign::Run(
- TensorBlockAssign::target(
- input_block_dims,
- internal::strides<Layout>(this->m_impl.dimensions()),
- this->m_impl.data(), this->srcCoeff(desc.offset())),
- block.expr().reshape(input_block_dims));
- }
-};
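-
-// A minimal usage sketch (illustrative only, not part of the original
-// header); the tensor names below are hypothetical:
-//   Eigen::Tensor<float, 3> input(4, 5, 6);
-//   input.setRandom();
-//   // Fix dimension 1 at offset 2; the result has dimensions (4, 6).
-//   Eigen::Tensor<float, 2> slice = input.chip(2, /*dim=*/1);
-//   // Chipping an lvalue yields a writable view:
-//   input.chip(0, /*dim=*/0).setZero();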
-
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorConcatenation.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorConcatenation.h
deleted file mode 100644
index 5235a8e..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorConcatenation.h
+++ /dev/null
@@ -1,377 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H
-#define EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H
-
-namespace Eigen {
-
-/** \class TensorConcatenationOp
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Tensor concatenation class.
- */
-namespace internal {
-template<typename Axis, typename LhsXprType, typename RhsXprType>
-struct traits<TensorConcatenationOp<Axis, LhsXprType, RhsXprType> >
-{
- // Type promotion to handle the case where the types of the lhs and the rhs are different.
- typedef typename promote_storage_type<typename LhsXprType::Scalar,
- typename RhsXprType::Scalar>::ret Scalar;
- typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind,
- typename traits<RhsXprType>::StorageKind>::ret StorageKind;
- typedef typename promote_index_type<typename traits<LhsXprType>::Index,
- typename traits<RhsXprType>::Index>::type Index;
- typedef typename LhsXprType::Nested LhsNested;
- typedef typename RhsXprType::Nested RhsNested;
- typedef typename remove_reference<LhsNested>::type _LhsNested;
- typedef typename remove_reference<RhsNested>::type _RhsNested;
- static const int NumDimensions = traits<LhsXprType>::NumDimensions;
- static const int Layout = traits<LhsXprType>::Layout;
- enum { Flags = 0 };
- typedef typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val,
- typename traits<LhsXprType>::PointerType, typename traits<RhsXprType>::PointerType>::type PointerType;
-};
-
-template<typename Axis, typename LhsXprType, typename RhsXprType>
-struct eval<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, Eigen::Dense>
-{
- typedef const TensorConcatenationOp<Axis, LhsXprType, RhsXprType>& type;
-};
-
-template<typename Axis, typename LhsXprType, typename RhsXprType>
-struct nested<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, 1, typename eval<TensorConcatenationOp<Axis, LhsXprType, RhsXprType> >::type>
-{
- typedef TensorConcatenationOp<Axis, LhsXprType, RhsXprType> type;
-};
-
-} // end namespace internal
-
-
-template<typename Axis, typename LhsXprType, typename RhsXprType>
-class TensorConcatenationOp : public TensorBase<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, WriteAccessors>
-{
- public:
- typedef TensorBase<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, WriteAccessors> Base;
- typedef typename internal::traits<TensorConcatenationOp>::Scalar Scalar;
- typedef typename internal::traits<TensorConcatenationOp>::StorageKind StorageKind;
- typedef typename internal::traits<TensorConcatenationOp>::Index Index;
- typedef typename internal::nested<TensorConcatenationOp>::type Nested;
- typedef typename internal::promote_storage_type<typename LhsXprType::CoeffReturnType,
- typename RhsXprType::CoeffReturnType>::ret CoeffReturnType;
- typedef typename NumTraits<Scalar>::Real RealScalar;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConcatenationOp(const LhsXprType& lhs, const RhsXprType& rhs, Axis axis)
- : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_axis(axis) {}
-
- EIGEN_DEVICE_FUNC
- const typename internal::remove_all<typename LhsXprType::Nested>::type&
- lhsExpression() const { return m_lhs_xpr; }
-
- EIGEN_DEVICE_FUNC
- const typename internal::remove_all<typename RhsXprType::Nested>::type&
- rhsExpression() const { return m_rhs_xpr; }
-
- EIGEN_DEVICE_FUNC const Axis& axis() const { return m_axis; }
-
- EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorConcatenationOp)
- protected:
- typename LhsXprType::Nested m_lhs_xpr;
- typename RhsXprType::Nested m_rhs_xpr;
- const Axis m_axis;
-};
-
-
-// Eval as rvalue
-template<typename Axis, typename LeftArgType, typename RightArgType, typename Device>
-struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device>
-{
- typedef TensorConcatenationOp<Axis, LeftArgType, RightArgType> XprType;
- typedef typename XprType::Index Index;
- static const int NumDims = internal::array_size<typename TensorEvaluator<LeftArgType, Device>::Dimensions>::value;
- static const int RightNumDims = internal::array_size<typename TensorEvaluator<RightArgType, Device>::Dimensions>::value;
- typedef DSizes<Index, NumDims> Dimensions;
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- typedef StorageMemory<CoeffReturnType, Device> Storage;
- typedef typename Storage::Type EvaluatorPointerType;
- enum {
- IsAligned = false,
- PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess &&
- TensorEvaluator<RightArgType, Device>::PacketAccess,
- BlockAccess = false,
- PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess ||
- TensorEvaluator<RightArgType, Device>::PreferBlockAccess,
- Layout = TensorEvaluator<LeftArgType, Device>::Layout,
- RawAccess = false
- };
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlock;
- //===--------------------------------------------------------------------===//
-
- EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis())
- {
- EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout) || NumDims == 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
- EIGEN_STATIC_ASSERT((NumDims == RightNumDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
- EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
-
- eigen_assert(0 <= m_axis && m_axis < NumDims);
- const Dimensions& lhs_dims = m_leftImpl.dimensions();
- const Dimensions& rhs_dims = m_rightImpl.dimensions();
- {
- int i = 0;
- for (; i < m_axis; ++i) {
- eigen_assert(lhs_dims[i] > 0);
- eigen_assert(lhs_dims[i] == rhs_dims[i]);
- m_dimensions[i] = lhs_dims[i];
- }
- eigen_assert(lhs_dims[i] > 0); // Now i == m_axis.
- eigen_assert(rhs_dims[i] > 0);
- m_dimensions[i] = lhs_dims[i] + rhs_dims[i];
- for (++i; i < NumDims; ++i) {
- eigen_assert(lhs_dims[i] > 0);
- eigen_assert(lhs_dims[i] == rhs_dims[i]);
- m_dimensions[i] = lhs_dims[i];
- }
- }
-
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- m_leftStrides[0] = 1;
- m_rightStrides[0] = 1;
- m_outputStrides[0] = 1;
-
- for (int j = 1; j < NumDims; ++j) {
- m_leftStrides[j] = m_leftStrides[j-1] * lhs_dims[j-1];
- m_rightStrides[j] = m_rightStrides[j-1] * rhs_dims[j-1];
- m_outputStrides[j] = m_outputStrides[j-1] * m_dimensions[j-1];
- }
- } else {
- m_leftStrides[NumDims - 1] = 1;
- m_rightStrides[NumDims - 1] = 1;
- m_outputStrides[NumDims - 1] = 1;
-
- for (int j = NumDims - 2; j >= 0; --j) {
- m_leftStrides[j] = m_leftStrides[j+1] * lhs_dims[j+1];
- m_rightStrides[j] = m_rightStrides[j+1] * rhs_dims[j+1];
- m_outputStrides[j] = m_outputStrides[j+1] * m_dimensions[j+1];
- }
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
-
- // TODO(phli): Add short-circuit memcpy evaluation if underlying data are linear?
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType)
- {
- m_leftImpl.evalSubExprsIfNeeded(NULL);
- m_rightImpl.evalSubExprsIfNeeded(NULL);
- return true;
- }
-
- EIGEN_STRONG_INLINE void cleanup()
- {
- m_leftImpl.cleanup();
- m_rightImpl.cleanup();
- }
-
- // TODO(phli): attempt to speed this up. The integer divisions and modulo are slow.
- // See CL/76180724 comments for more ideas.
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
- {
- // Collect dimension-wise indices (subs).
- array<Index, NumDims> subs;
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- for (int i = NumDims - 1; i > 0; --i) {
- subs[i] = index / m_outputStrides[i];
- index -= subs[i] * m_outputStrides[i];
- }
- subs[0] = index;
- } else {
- for (int i = 0; i < NumDims - 1; ++i) {
- subs[i] = index / m_outputStrides[i];
- index -= subs[i] * m_outputStrides[i];
- }
- subs[NumDims - 1] = index;
- }
-
- const Dimensions& left_dims = m_leftImpl.dimensions();
- if (subs[m_axis] < left_dims[m_axis]) {
- Index left_index;
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- left_index = subs[0];
- EIGEN_UNROLL_LOOP
- for (int i = 1; i < NumDims; ++i) {
- left_index += (subs[i] % left_dims[i]) * m_leftStrides[i];
- }
- } else {
- left_index = subs[NumDims - 1];
- EIGEN_UNROLL_LOOP
- for (int i = NumDims - 2; i >= 0; --i) {
- left_index += (subs[i] % left_dims[i]) * m_leftStrides[i];
- }
- }
- return m_leftImpl.coeff(left_index);
- } else {
- subs[m_axis] -= left_dims[m_axis];
- const Dimensions& right_dims = m_rightImpl.dimensions();
- Index right_index;
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- right_index = subs[0];
- EIGEN_UNROLL_LOOP
- for (int i = 1; i < NumDims; ++i) {
- right_index += (subs[i] % right_dims[i]) * m_rightStrides[i];
- }
- } else {
- right_index = subs[NumDims - 1];
- EIGEN_UNROLL_LOOP
- for (int i = NumDims - 2; i >= 0; --i) {
- right_index += (subs[i] % right_dims[i]) * m_rightStrides[i];
- }
- }
- return m_rightImpl.coeff(right_index);
- }
- }
-
- // TODO(phli): Add a real vectorization.
- template<int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
- {
- const int packetSize = PacketType<CoeffReturnType, Device>::size;
- EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index + packetSize - 1 < dimensions().TotalSize());
-
- EIGEN_ALIGN_MAX CoeffReturnType values[packetSize];
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < packetSize; ++i) {
- values[i] = coeff(index+i);
- }
- PacketReturnType rslt = internal::pload<PacketReturnType>(values);
- return rslt;
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
- costPerCoeff(bool vectorized) const {
- const double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
- 2 * TensorOpCost::MulCost<Index>() +
- TensorOpCost::DivCost<Index>() +
- TensorOpCost::ModCost<Index>());
- const double lhs_size = m_leftImpl.dimensions().TotalSize();
- const double rhs_size = m_rightImpl.dimensions().TotalSize();
- return (lhs_size / (lhs_size + rhs_size)) *
- m_leftImpl.costPerCoeff(vectorized) +
- (rhs_size / (lhs_size + rhs_size)) *
- m_rightImpl.costPerCoeff(vectorized) +
- TensorOpCost(0, 0, compute_cost);
- }
-
- EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
-
- #ifdef EIGEN_USE_SYCL
- // binding placeholder accessors to a command group handler for SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
- m_leftImpl.bind(cgh);
- m_rightImpl.bind(cgh);
- }
- #endif
-
- protected:
- Dimensions m_dimensions;
- array<Index, NumDims> m_outputStrides;
- array<Index, NumDims> m_leftStrides;
- array<Index, NumDims> m_rightStrides;
- TensorEvaluator<LeftArgType, Device> m_leftImpl;
- TensorEvaluator<RightArgType, Device> m_rightImpl;
- const Axis m_axis;
-};
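-
-// A minimal usage sketch (illustrative only, not part of the original
-// header); the tensor names below are hypothetical:
-//   Eigen::Tensor<float, 2> a(2, 3), b(2, 3);
-//   a.setRandom(); b.setRandom();
-//   // Concatenate along axis 0; the result has dimensions (4, 3).
-//   Eigen::Tensor<float, 2> c = a.concatenate(b, 0);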
-
-// Eval as lvalue
-template<typename Axis, typename LeftArgType, typename RightArgType, typename Device>
- struct TensorEvaluator<TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device>
- : public TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device>
-{
- typedef TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device> Base;
- typedef TensorConcatenationOp<Axis, LeftArgType, RightArgType> XprType;
- typedef typename Base::Dimensions Dimensions;
- enum {
- IsAligned = false,
- PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess &&
- TensorEvaluator<RightArgType, Device>::PacketAccess,
- BlockAccess = false,
- PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess ||
- TensorEvaluator<RightArgType, Device>::PreferBlockAccess,
- Layout = TensorEvaluator<LeftArgType, Device>::Layout,
- RawAccess = false
- };
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlock;
- //===--------------------------------------------------------------------===//
-
- EIGEN_STRONG_INLINE TensorEvaluator(XprType& op, const Device& device)
- : Base(op, device)
- {
- EIGEN_STATIC_ASSERT((static_cast<int>(Layout) == static_cast<int>(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE);
- }
-
- typedef typename XprType::Index Index;
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
- {
- // Collect dimension-wise indices (subs).
- array<Index, Base::NumDims> subs;
- for (int i = Base::NumDims - 1; i > 0; --i) {
- subs[i] = index / this->m_outputStrides[i];
- index -= subs[i] * this->m_outputStrides[i];
- }
- subs[0] = index;
-
- const Dimensions& left_dims = this->m_leftImpl.dimensions();
- if (subs[this->m_axis] < left_dims[this->m_axis]) {
- Index left_index = subs[0];
- for (int i = 1; i < Base::NumDims; ++i) {
- left_index += (subs[i] % left_dims[i]) * this->m_leftStrides[i];
- }
- return this->m_leftImpl.coeffRef(left_index);
- } else {
- subs[this->m_axis] -= left_dims[this->m_axis];
- const Dimensions& right_dims = this->m_rightImpl.dimensions();
- Index right_index = subs[0];
- for (int i = 1; i < Base::NumDims; ++i) {
- right_index += (subs[i] % right_dims[i]) * this->m_rightStrides[i];
- }
- return this->m_rightImpl.coeffRef(right_index);
- }
- }
-
- template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- void writePacket(Index index, const PacketReturnType& x)
- {
- const int packetSize = PacketType<CoeffReturnType, Device>::size;
- EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index + packetSize - 1 < this->dimensions().TotalSize());
-
- EIGEN_ALIGN_MAX CoeffReturnType values[packetSize];
- internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
- for (int i = 0; i < packetSize; ++i) {
- coeffRef(index+i) = values[i];
- }
- }
-};
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorContraction.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorContraction.h
deleted file mode 100644
index 8b35f79..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorContraction.h
+++ /dev/null
@@ -1,1023 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H
-#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H
-
-namespace Eigen {
-
-/** \class TensorContraction
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Tensor contraction class.
- */
-namespace internal {
-
-template<typename Dimensions, typename LhsXprType, typename RhsXprType, typename OutputKernelType>
-struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType> >
-{
- // Type promotion to handle the case where the types of the lhs and the rhs are different.
- typedef typename gebp_traits<typename remove_const<typename LhsXprType::Scalar>::type,
- typename remove_const<typename RhsXprType::Scalar>::type>::ResScalar Scalar;
-
- typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind,
- typename traits<RhsXprType>::StorageKind>::ret StorageKind;
- typedef typename promote_index_type<typename traits<LhsXprType>::Index,
- typename traits<RhsXprType>::Index>::type Index;
- typedef typename LhsXprType::Nested LhsNested;
- typedef typename RhsXprType::Nested RhsNested;
- typedef typename remove_reference<LhsNested>::type _LhsNested;
- typedef typename remove_reference<RhsNested>::type _RhsNested;
-
- // From NumDims below.
- static const int NumDimensions = traits<LhsXprType>::NumDimensions + traits<RhsXprType>::NumDimensions - 2 * array_size<Dimensions>::value;
- static const int Layout = traits<LhsXprType>::Layout;
- typedef typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val,
- typename traits<LhsXprType>::PointerType,
- typename traits<RhsXprType>::PointerType>::type
- PointerType;
-
- enum {
- Flags = 0
- };
-};
-
-template<typename Dimensions, typename LhsXprType, typename RhsXprType, typename OutputKernelType>
-struct eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType>, Eigen::Dense>
-{
- typedef const TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType>& type;
-};
-
-template<typename Dimensions, typename LhsXprType, typename RhsXprType, typename OutputKernelType>
-struct nested<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType>, 1, typename eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType> >::type>
-{
- typedef TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType> type;
-};
-
-template<typename Indices_, typename LeftArgType_, typename RightArgType_, typename OutputKernelType_, typename Device_>
-struct traits<TensorEvaluator<const TensorContractionOp<Indices_, LeftArgType_, RightArgType_, OutputKernelType_>, Device_> > {
- typedef Indices_ Indices;
- typedef LeftArgType_ LeftArgType;
- typedef RightArgType_ RightArgType;
- typedef OutputKernelType_ OutputKernelType;
- typedef Device_ Device;
-
- // From NumDims below.
- static const int NumDimensions = traits<LeftArgType_>::NumDimensions + traits<RightArgType_>::NumDimensions - 2 * array_size<Indices_>::value;
-};
-
-// Helper class to allocate and deallocate temporary memory for packed buffers.
-template <typename LhsScalar, typename RhsScalar>
-struct TensorContractionBlockMemAllocator {
- typedef void* BlockMemHandle;
-
- template <typename Device>
- EIGEN_DEVICE_FUNC static BlockMemHandle allocate(Device& d, const Index bm,
- const Index bk,
- const Index bn,
- LhsScalar** lhs_block,
- RhsScalar** rhs_block) {
- eigen_assert(lhs_block);
- eigen_assert(rhs_block);
- BlockSizes sz = ComputeLhsRhsBlockSizes(bm, bk, bn);
- char* block_mem = static_cast<char*>(d.allocate(sz.lhs_size + sz.rhs_size));
- eigen_assert(block_mem);
- *lhs_block = reinterpret_cast<LhsScalar*>(block_mem);
- *rhs_block = reinterpret_cast<RhsScalar*>(block_mem + sz.lhs_size);
- return block_mem;
- }
-
- template <typename Device>
- EIGEN_DEVICE_FUNC static BlockMemHandle allocateSlices(
- Device& d, const Index bm, const Index bk, const Index bn,
- const Index num_lhs, const Index num_rhs, const Index num_slices,
- std::vector<LhsScalar*>* lhs_blocks,
- std::vector<RhsScalar*>* rhs_blocks) {
- eigen_assert(num_slices > 0);
- eigen_assert(num_lhs >= 0 && num_rhs >= 0);
- eigen_assert(num_lhs == 0 || lhs_blocks);
- eigen_assert(num_rhs == 0 || rhs_blocks);
- BlockSizes sz = ComputeLhsRhsBlockSizes(bm, bk, bn);
- void* block_mem = d.allocate(
- (num_lhs * sz.lhs_size + num_rhs * sz.rhs_size) * num_slices);
- eigen_assert(block_mem);
- char* mem = static_cast<char*>(block_mem);
-
- for (Index x = 0; x < num_slices; x++) {
- if (num_lhs > 0) lhs_blocks[x].resize(num_lhs);
- for (Index m = 0; m < num_lhs; m++) {
- lhs_blocks[x][m] = reinterpret_cast<LhsScalar*>(mem);
- mem += sz.lhs_size;
- }
- if (num_rhs > 0) rhs_blocks[x].resize(num_rhs);
- for (Index n = 0; n < num_rhs; n++) {
- rhs_blocks[x][n] = reinterpret_cast<RhsScalar*>(mem);
- mem += sz.rhs_size;
- }
- }
-
- return block_mem;
- }
-
- template <typename Device>
- EIGEN_DEVICE_FUNC static void deallocate(Device& d, BlockMemHandle handle) {
- d.deallocate(handle);
- }
-
- private:
- struct BlockSizes {
- Index lhs_size;
- Index rhs_size;
- };
- EIGEN_DEVICE_FUNC static BlockSizes ComputeLhsRhsBlockSizes(const Index bm,
- const Index bk,
- const Index bn) {
- Index align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
- BlockSizes sz;
- sz.lhs_size = divup<Index>(bm * bk * sizeof(LhsScalar), align) * align;
- sz.rhs_size = divup<Index>(bn * bk * sizeof(RhsScalar), align) * align;
- return sz;
- }
-};
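-
-// Worked example (illustrative, not from the original source): with
-// LhsScalar = float, bm = 100, bk = 63 and EIGEN_MAX_ALIGN_BYTES = 64,
-// the raw Lhs block needs 100 * 63 * 4 = 25200 bytes, which
-// ComputeLhsRhsBlockSizes rounds up to divup(25200, 64) * 64 = 25216
-// bytes so that the Rhs block placed right after it stays aligned.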
-
-// WARNING: In this code we assume that Lhs and Rhs tensor expressions are in
-// ColMajor storage order. This property is guaranteed by the
-// TensorContractionOp evaluator. TensorContractionKernel specifies how we pack
-// blocks of Lhs and Rhs tensor expressions, and how we invoke matrix
-// multiplication for these blocks. Default tensor contraction uses
-// gemm_pack_rhs, gemm_pack_lhs and gebp_kernel from Eigen Core (see
-// GeneralBlockPanelKernel.h for details).
-//
-// By specializing contraction kernels we can use other low-level libraries to
-// perform matrix multiplication and still rely on the Eigen contraction
-// evaluator. This also includes full support in TensorContractionThreadPool,
-// assuming that the underlying gemm does not use its own threading.
-//
-// - ResScalar/LhsScalar/RhsScalar - scalar type for the result of
-// multiplication, lhs tensor and rhs tensor respectively.
-//
-// - StorageIndex - index type for the tensor expressions. In practice it is
-//   almost always Eigen::Index.
-//
-// - OutputMapper provides access to the memory of the output matrix. In
-//   practice it is always a column major blas_data_mapper (it must be of
-//   ResScalar type).
-//
-// - LhsMapper/RhsMapper, similarly to blas_data_mapper, provide a two
-//   dimensional view into the Lhs/Rhs tensor expressions. In practice this is
-//   TensorContractionInputMapper, or some specialization of it based on the
-//   type of tensor expression (e.g. TensorImagePatchOp has an optimized input
-//   mapper).
-template <typename ResScalar, typename LhsScalar, typename RhsScalar,
- typename StorageIndex, typename OutputMapper, typename LhsMapper,
- typename RhsMapper>
-struct TensorContractionKernel {
- // True if `invoke()` supports `beta` in `C <- alpha * A * B + beta * C`
-  // (otherwise beta should always be equal to 1).
- enum { HasBeta = false };
-
- EIGEN_DEVICE_FUNC
- TensorContractionKernel(StorageIndex m_, StorageIndex k_, StorageIndex n_,
- StorageIndex bm_, StorageIndex bk_, StorageIndex bn_)
- : m(m_), k(k_), n(n_), bm(bm_), bk(bk_), bn(bn_) {}
-
- // Pack blocks of Lhs and Rhs into contiguous blocks in memory.
- typedef LhsScalar* LhsBlock;
- typedef RhsScalar* RhsBlock;
-
- // Packed Lhs/Rhs block memory allocator.
- typedef TensorContractionBlockMemAllocator<LhsScalar, RhsScalar>
- BlockMemAllocator;
- typedef typename BlockMemAllocator::BlockMemHandle BlockMemHandle;
-
- typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
-
- typedef internal::gemm_pack_lhs<
- LhsScalar, StorageIndex, typename LhsMapper::SubMapper, Traits::mr,
- Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor>
- LhsPacker;
-
- typedef internal::gemm_pack_rhs<RhsScalar, StorageIndex,
- typename RhsMapper::SubMapper, Traits::nr,
- ColMajor>
- RhsPacker;
-
- typedef internal::gebp_kernel<LhsScalar, RhsScalar, StorageIndex,
- OutputMapper, Traits::mr, Traits::nr,
- /*ConjugateLhs*/ false, /*ConjugateRhs*/ false>
- GebpKernel;
-
- template <typename Device>
- EIGEN_DEVICE_FUNC BlockMemHandle allocate(Device& d, LhsBlock* lhs_block,
- RhsBlock* rhs_block) {
- return BlockMemAllocator::allocate(d, bm, bk, bn, lhs_block, rhs_block);
- }
-
- template <typename Device>
- EIGEN_DEVICE_FUNC BlockMemHandle allocateSlices(
- Device& d, const StorageIndex num_lhs, const StorageIndex num_rhs,
- const StorageIndex num_slices, std::vector<LhsBlock>* lhs_blocks,
- std::vector<RhsBlock>* rhs_blocks) {
- return BlockMemAllocator::allocateSlices(
- d, bm, bk, bn, num_lhs, num_rhs, num_slices, lhs_blocks, rhs_blocks);
- }
-
- template <typename Device>
- EIGEN_DEVICE_FUNC static void deallocate(Device& d, BlockMemHandle handle) {
- BlockMemAllocator::deallocate(d, handle);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void packLhs(
- LhsBlock* lhsBlock, const typename LhsMapper::SubMapper& data_mapper,
- const StorageIndex depth, const StorageIndex rows) {
- LhsPacker()(*lhsBlock, data_mapper, depth, rows, /*stride*/ 0,
- /*offset*/ 0);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void packRhs(
- RhsBlock* rhsBlock, const typename RhsMapper::SubMapper& data_mapper,
- const StorageIndex depth, const StorageIndex cols) {
- RhsPacker()(*rhsBlock, data_mapper, depth, cols);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void invoke(
- const OutputMapper& output_mapper, const LhsBlock& lhsBlock,
- const RhsBlock& rhsBlock, const StorageIndex rows,
- const StorageIndex depth, const StorageIndex cols,
- const ResScalar alpha, const ResScalar beta) {
- // Default GEBP kernel does not support beta.
- eigen_assert(beta == ResScalar(1));
- static const int kComputeStrideFromBlockDimensions = -1;
- GebpKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha,
- /*strideA*/ kComputeStrideFromBlockDimensions,
- /*strideB*/ kComputeStrideFromBlockDimensions,
- /*offsetA*/ 0, /*offsetB*/ 0);
- }
-
- private:
-  // These are the dimensions of the original Tensors, and the selected block
-  // sizes. The actual block sizes passed to all functions above might be
-  // smaller because of the partial blocks at the end.
- const StorageIndex m;
- const StorageIndex k;
- const StorageIndex n;
- const StorageIndex bm;
- const StorageIndex bk;
- const StorageIndex bn;
-};
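-
-// The per-block call protocol, sketched (variable names are hypothetical;
-// see evalGemmPartial in TensorContraction.h for the real call sites):
-//   LhsBlock lhs_block; RhsBlock rhs_block;
-//   BlockMemHandle mem = kernel.allocate(device, &lhs_block, &rhs_block);
-//   kernel.packLhs(&lhs_block, lhs.getSubMapper(i, k), kc, mc);
-//   kernel.packRhs(&rhs_block, rhs.getSubMapper(k, j), kc, nc);
-//   kernel.invoke(output.getSubMapper(i, j), lhs_block, rhs_block,
-//                 mc, kc, nc, alpha, beta);
-//   kernel.deallocate(device, mem);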
-
-} // end namespace internal
-
-// Tensor contraction params that enable mapping from the output matrix's
-// 2-dimensional coordinates to the output tensor dimensions.
-struct TensorContractionParams {
-  // The TensorContraction evaluator assumes that both tensors are in ColMajor
-  // layout; if the tensors are in RowMajor layout, the evaluator swaps lhs
-  // with rhs.
- bool swapped_arguments;
-};
-
-// An output kernel allows fusing operations into the tensor contraction.
-//
-// Examples:
-// 1. Elementwise Relu transformation following Conv2D.
-// 2. AddBias to the Conv2D output channels dimension.
-//
-// The NoOpOutputKernel implements an output kernel that does absolutely nothing.
-struct NoOpOutputKernel {
- /**
- * Tensor contraction evaluator calls this kernel after finishing each block
- * of output matrix. Output blocks belong to the 2-dimensional output tensor.
- *
- * TensorContractionParams contains contraction dimensions information
- * required to map output 2-d space into the expected output tensor space
- * (potentially higher dimensional).
- *
- * \param[in] output_mapper Access to output tensor memory
- * \param[in] params Tensor contraction parameters
- * \param[in] i Index of the first row available through output_mapper
- * \param[in] j Index of the first column available through output_mapper
- * \param[in] num_rows Number of available rows
- * \param[in] num_cols Number of available columns
- */
- template <typename Index, typename Scalar>
- EIGEN_ALWAYS_INLINE void operator()(
- const internal::blas_data_mapper<Scalar, Index, ColMajor>& output_mapper,
- const TensorContractionParams& params, Index i,
- Index j, Index num_rows, Index num_cols) const {
- EIGEN_UNUSED_VARIABLE(output_mapper);
- EIGEN_UNUSED_VARIABLE(params);
- EIGEN_UNUSED_VARIABLE(i);
- EIGEN_UNUSED_VARIABLE(j);
- EIGEN_UNUSED_VARIABLE(num_rows);
- EIGEN_UNUSED_VARIABLE(num_cols);
- }
-};
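-
-// A sketch of a custom output kernel (illustrative, not part of Eigen):
-// fuse an elementwise ReLU into the contraction by clamping each finished
-// output block in place. The signature mirrors NoOpOutputKernel above, and
-// the usage sketch assumes the output-kernel overload of contract(), e.g.
-//   a.contract(b, dims, ReluOutputKernel());
-struct ReluOutputKernel {
-  template <typename Index, typename Scalar>
-  EIGEN_ALWAYS_INLINE void operator()(
-      const internal::blas_data_mapper<Scalar, Index, ColMajor>& output_mapper,
-      const TensorContractionParams& params, Index i, Index j,
-      Index num_rows, Index num_cols) const {
-    EIGEN_UNUSED_VARIABLE(params);
-    EIGEN_UNUSED_VARIABLE(i);
-    EIGEN_UNUSED_VARIABLE(j);
-    for (Index c = 0; c < num_cols; ++c) {
-      for (Index r = 0; r < num_rows; ++r) {
-        // Clamp each coefficient of the finished block to [0, inf).
-        Scalar& v = output_mapper(r, c);
-        v = numext::maxi(v, Scalar(0));
-      }
-    }
-  }
-};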
-
-template<typename Indices, typename LhsXprType, typename RhsXprType, typename OutputKernelType = const NoOpOutputKernel>
-class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXprType, RhsXprType, OutputKernelType>, ReadOnlyAccessors>
-{
- public:
- typedef typename Eigen::internal::traits<TensorContractionOp>::Scalar Scalar;
- typedef typename internal::gebp_traits<typename LhsXprType::CoeffReturnType,
- typename RhsXprType::CoeffReturnType>::ResScalar CoeffReturnType;
- typedef typename Eigen::internal::nested<TensorContractionOp>::type Nested;
- typedef typename Eigen::internal::traits<TensorContractionOp>::StorageKind StorageKind;
- typedef typename Eigen::internal::traits<TensorContractionOp>::Index Index;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp(
- const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims,
- const OutputKernelType& output_kernel = OutputKernelType())
- : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims),
- m_output_kernel(output_kernel) {}
-
- EIGEN_DEVICE_FUNC
- const Indices& indices() const { return m_indices; }
-
- /** \returns the nested expressions */
- EIGEN_DEVICE_FUNC
- const typename internal::remove_all<typename LhsXprType::Nested>::type&
- lhsExpression() const { return m_lhs_xpr; }
-
- EIGEN_DEVICE_FUNC
- const typename internal::remove_all<typename RhsXprType::Nested>::type&
- rhsExpression() const { return m_rhs_xpr; }
-
- EIGEN_DEVICE_FUNC
- const OutputKernelType& outputKernel() const { return m_output_kernel; }
-
- protected:
- typename LhsXprType::Nested m_lhs_xpr;
- typename RhsXprType::Nested m_rhs_xpr;
- const Indices m_indices;
- const OutputKernelType m_output_kernel;
-};
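-
-// A minimal usage sketch (illustrative only, not part of the original
-// header); the tensor names below are hypothetical:
-//   Eigen::Tensor<float, 2> a(3, 4), b(4, 5);
-//   a.setRandom(); b.setRandom();
-//   // Contract dimension 1 of a against dimension 0 of b, i.e. an
-//   // ordinary matrix product; c has dimensions (3, 5).
-//   Eigen::array<Eigen::IndexPair<int>, 1> dims = {Eigen::IndexPair<int>(1, 0)};
-//   Eigen::Tensor<float, 2> c = a.contract(b, dims);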
-
-
-template<typename Derived>
-struct TensorContractionEvaluatorBase : internal::no_assignment_operator
-{
- typedef typename internal::traits<Derived>::Indices Indices;
- typedef typename internal::traits<Derived>::LeftArgType LeftArgType;
- typedef typename internal::traits<Derived>::RightArgType RightArgType;
- typedef typename internal::traits<Derived>::OutputKernelType OutputKernelType;
- typedef typename internal::traits<Derived>::Device Device;
-
- typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
- typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
- typedef typename XprType::Index Index;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- typedef StorageMemory<Scalar, Device> Storage;
- typedef typename Storage::Type EvaluatorPointerType;
-
- enum {
- IsAligned = true,
- PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
- BlockAccess = false,
- PreferBlockAccess = false,
- Layout = TensorEvaluator<LeftArgType, Device>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = true
- };
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlock;
- //===--------------------------------------------------------------------===//
-
- // Most of the code is assuming that both input tensors are ColMajor. If the
- // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
- // If we want to compute A * B = C, where A is LHS and B is RHS, the code
- // will pretend B is LHS and A is RHS.
- typedef typename internal::conditional<
- static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
- typedef typename internal::conditional<
- static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
-
- typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluatorType;
- typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluatorType;
-
- static const int LDims =
- internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
- static const int RDims =
- internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
- static const int ContractDims = internal::array_size<Indices>::value;
- static const int NumDims = LDims + RDims - 2 * ContractDims;
-
- typedef array<Index, ContractDims> contract_t;
- typedef array<Index, LDims - ContractDims> left_nocontract_t;
- typedef array<Index, RDims - ContractDims> right_nocontract_t;
-
- typedef DSizes<Index, NumDims> Dimensions;
-
- EIGEN_STRONG_INLINE
- TensorContractionEvaluatorBase(const XprType& op, const Device& device)
- : m_leftImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
- op.lhsExpression(), op.rhsExpression()), device),
- m_rightImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
- op.rhsExpression(), op.lhsExpression()), device),
- m_device(device),
- m_output_kernel(op.outputKernel()),
- m_result(NULL) {
- EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) ==
- static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
- YOU_MADE_A_PROGRAMMING_MISTAKE);
-
-
- DSizes<Index, LDims> eval_left_dims;
- DSizes<Index, RDims> eval_right_dims;
- array<IndexPair<Index>, ContractDims> eval_op_indices;
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- // For ColMajor, we keep using the existing dimensions
- for (int i = 0; i < LDims; i++) {
- eval_left_dims[i] = m_leftImpl.dimensions()[i];
- }
- for (int i = 0; i < RDims; i++) {
- eval_right_dims[i] = m_rightImpl.dimensions()[i];
- }
- // We keep the pairs of contracting indices.
- for (int i = 0; i < ContractDims; i++) {
- eval_op_indices[i].first = op.indices()[i].first;
- eval_op_indices[i].second = op.indices()[i].second;
- }
- } else {
- // For RowMajor, we need to reverse the existing dimensions
- for (int i = 0; i < LDims; i++) {
- eval_left_dims[i] = m_leftImpl.dimensions()[LDims - i - 1];
- }
- for (int i = 0; i < RDims; i++) {
- eval_right_dims[i] = m_rightImpl.dimensions()[RDims - i - 1];
- }
- // We need to flip all the pairs of contracting indices as well as
- // reversing the dimensions.
- for (int i = 0; i < ContractDims; i++) {
- eval_op_indices[i].first = LDims - 1 - op.indices()[ContractDims - 1 - i].second;
- eval_op_indices[i].second = RDims - 1 - op.indices()[ContractDims - 1 - i].first;
- }
- }
-
- // Check for duplicate axes and make sure the first index in eval_op_indices
-    // is increasing. Using O(n^2) sorting is OK since ContractDims is small.
- for (int i = 0; i < ContractDims; i++) {
- for (int j = i + 1; j < ContractDims; j++) {
- eigen_assert(eval_op_indices[j].first != eval_op_indices[i].first &&
- eval_op_indices[j].second != eval_op_indices[i].second &&
- "contraction axes should be unique");
- if (eval_op_indices[j].first < eval_op_indices[i].first) {
- numext::swap(eval_op_indices[j], eval_op_indices[i]);
- }
- }
- }
-
- array<Index, LDims> lhs_strides;
- lhs_strides[0] = 1;
- for (int i = 0; i < LDims-1; ++i) {
- lhs_strides[i+1] = lhs_strides[i] * eval_left_dims[i];
- }
-
- array<Index, RDims> rhs_strides;
- rhs_strides[0] = 1;
- for (int i = 0; i < RDims-1; ++i) {
- rhs_strides[i+1] = rhs_strides[i] * eval_right_dims[i];
- }
-
- if (m_i_strides.size() > 0) m_i_strides[0] = 1;
- if (m_j_strides.size() > 0) m_j_strides[0] = 1;
- if (m_k_strides.size() > 0) m_k_strides[0] = 1;
-
- m_i_size = 1;
- m_j_size = 1;
- m_k_size = 1;
-
- // To compute the dimension, we simply concatenate the non-contracting
- // dimensions of the left and then the right tensor. Additionally, we also
- // compute the strides corresponding to the left non-contracting
- // dimensions and right non-contracting dimensions.
- m_lhs_inner_dim_contiguous = true;
- int dim_idx = 0;
- Index nocontract_idx = 0;
-
- for (int i = 0; i < LDims; i++) {
- // find if we are contracting on index i of left tensor
- bool contracting = false;
- for (int j = 0; j < ContractDims; j++) {
- if (eval_op_indices[j].first == i) {
- contracting = true;
- break;
- }
- }
- if (!contracting) {
- // add dimension size to output dimensions
- m_dimensions[dim_idx] = eval_left_dims[i];
- m_left_nocontract_strides[nocontract_idx] = lhs_strides[i];
- if (dim_idx != i) {
- m_lhs_inner_dim_contiguous = false;
- }
- if (nocontract_idx+1 < internal::array_size<left_nocontract_t>::value) {
- m_i_strides[nocontract_idx+1] =
- m_i_strides[nocontract_idx] * eval_left_dims[i];
- } else {
- m_i_size = m_i_strides[nocontract_idx] * eval_left_dims[i];
- }
- dim_idx++;
- nocontract_idx++;
- }
- }
-
- nocontract_idx = 0;
- for (int i = 0; i < RDims; i++) {
- bool contracting = false;
- // find if we are contracting on index i of right tensor
- for (int j = 0; j < ContractDims; j++) {
- if (eval_op_indices[j].second == i) {
- contracting = true;
- break;
- }
- }
- if (!contracting) {
- m_dimensions[dim_idx] = eval_right_dims[i];
- if (nocontract_idx+1 < internal::array_size<right_nocontract_t>::value) {
- m_j_strides[nocontract_idx+1] =
- m_j_strides[nocontract_idx] * eval_right_dims[i];
- } else {
- m_j_size = m_j_strides[nocontract_idx] * eval_right_dims[i];
- }
- m_right_nocontract_strides[nocontract_idx] = rhs_strides[i];
- dim_idx++;
- nocontract_idx++;
- }
- }
-
- // Now compute the strides corresponding to the contracting dimensions. We
- // assumed above that non-contracting axes are represented in the same order
- // in the matrix as they are in the tensor. This is not the case for
- // contracting axes. As the contracting axes must be of the same size in
- // each tensor, we'll only look at the first tensor here.
- m_rhs_inner_dim_contiguous = true;
- m_rhs_inner_dim_reordered = false;
- for (int i = 0; i < ContractDims; i++) {
- Index left = eval_op_indices[i].first;
- Index right = eval_op_indices[i].second;
-
- Index size = eval_left_dims[left];
- eigen_assert(size == eval_right_dims[right] &&
- "Contraction axes must be same size");
-
- if (i+1 < static_cast<int>(internal::array_size<contract_t>::value)) {
- m_k_strides[i+1] = m_k_strides[i] * size;
- } else {
- m_k_size = m_k_strides[i] * size;
- }
- m_left_contracting_strides[i] = lhs_strides[left];
- m_right_contracting_strides[i] = rhs_strides[right];
-
- if (i > 0 && right < eval_op_indices[i-1].second) {
- m_rhs_inner_dim_reordered = true;
- }
- if (right != i) {
- m_rhs_inner_dim_contiguous = false;
- }
- }
-
- // If the layout is RowMajor, we need to reverse the m_dimensions
- if (static_cast<int>(Layout) == static_cast<int>(RowMajor)) {
- for (int i = 0, j = NumDims - 1; i < j; i++, j--) {
- numext::swap(m_dimensions[i], m_dimensions[j]);
- }
- }
-
-    // A set of parameters that will allow the output kernel to map the output
-    // matrix coordinates (i, j) back into the output tensor's dimensions.
- // TODO(ezhulenev): Add parameters required to infer output tensor index for
- // more complex contractions than 2x2 on internal dimension.
- m_tensor_contraction_params.swapped_arguments = static_cast<int>(Layout) == RowMajor;
- }
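-
-  // Worked example (illustrative, not from the original source): for a
-  // ColMajor contraction of a (2, 3, 4) lhs with a (4, 5) rhs over the
-  // index pair {2, 0}, the non-contracting lhs dims (2, 3) flatten into
-  // m_i_size = 6, the rhs dim (5) gives m_j_size = 5, the contracted
-  // extent gives m_k_size = 4, and the output dimensions are (2, 3, 5).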
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
-
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
- m_leftImpl.evalSubExprsIfNeeded(NULL);
- m_rightImpl.evalSubExprsIfNeeded(NULL);
- if (data) {
- evalTo(data);
- return false;
- } else {
- m_result = static_cast<EvaluatorPointerType>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
- evalTo(m_result);
- return true;
- }
- }
-
-#ifdef EIGEN_USE_THREADS
- template <typename EvalSubExprsCallback>
- EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
- EvaluatorPointerType dest, EvalSubExprsCallback done) {
- m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done, dest](bool) {
- m_rightImpl.evalSubExprsIfNeededAsync(nullptr, [this, done, dest](bool) {
- if (dest) {
- evalToAsync(dest, [done]() { done(false); });
- } else {
- m_result = static_cast<EvaluatorPointerType>(
- m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
- evalToAsync(m_result, [done]() { done(true); });
- }
- });
- });
- }
-#endif // EIGEN_USE_THREADS
-
-#ifndef TENSOR_CONTRACTION_DISPATCH
-#define TENSOR_CONTRACTION_DISPATCH(METHOD, ALIGNMENT, ARGS) \
- if (this->m_lhs_inner_dim_contiguous) { \
- if (this->m_rhs_inner_dim_contiguous) { \
- if (this->m_rhs_inner_dim_reordered) { \
- METHOD<true, true, true, ALIGNMENT> ARGS; \
- } else { \
- METHOD<true, true, false, ALIGNMENT> ARGS; \
- } \
- } else { \
- if (this->m_rhs_inner_dim_reordered) { \
- METHOD<true, false, true, ALIGNMENT> ARGS; \
- } else { \
- METHOD<true, false, false, ALIGNMENT> ARGS; \
- } \
- } \
- } else { \
- if (this->m_rhs_inner_dim_contiguous) { \
- if (this->m_rhs_inner_dim_reordered) { \
- METHOD<false, true, true, ALIGNMENT> ARGS; \
- } else { \
- METHOD<false, true, false, ALIGNMENT> ARGS; \
- } \
- } else { \
- if (this->m_rhs_inner_dim_reordered) { \
- METHOD<false, false, true, ALIGNMENT> ARGS; \
- } else { \
- METHOD<false, false, false, ALIGNMENT> ARGS; \
- } \
- } \
- }
-#endif
-
-#ifndef TENSOR_CONTRACTION_ASYNC_DISPATCH
-#define TENSOR_CONTRACTION_ASYNC_DISPATCH(METHOD, DONE, ALIGNMENT, ARGS, FN) \
- if (this->m_lhs_inner_dim_contiguous) { \
- if (this->m_rhs_inner_dim_contiguous) { \
- if (this->m_rhs_inner_dim_reordered) { \
- (new METHOD<DONE, true, true, true, ALIGNMENT> ARGS)->FN; \
- } else { \
- (new METHOD<DONE, true, true, false, ALIGNMENT> ARGS)->FN; \
- } \
- } else { \
- if (this->m_rhs_inner_dim_reordered) { \
- (new METHOD<DONE, true, false, true, ALIGNMENT> ARGS)->FN; \
- } else { \
- (new METHOD<DONE, true, false, false, ALIGNMENT> ARGS)->FN; \
- } \
- } \
- } else { \
- if (this->m_rhs_inner_dim_contiguous) { \
- if (this->m_rhs_inner_dim_reordered) { \
- (new METHOD<DONE, false, true, true, ALIGNMENT> ARGS)->FN; \
- } else { \
- (new METHOD<DONE, false, true, false, ALIGNMENT> ARGS)->FN; \
- } \
- } else { \
- if (this->m_rhs_inner_dim_reordered) { \
- (new METHOD<DONE, false, false, true, ALIGNMENT> ARGS)->FN; \
- } else { \
- (new METHOD<DONE, false, false, false, ALIGNMENT> ARGS)->FN; \
- } \
- } \
- }
-#endif
-
- EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const {
- static_cast<const Derived*>(this)->template evalProduct<Unaligned>(buffer);
- }
-
-#ifdef EIGEN_USE_THREADS
- template <typename EvalToCallback>
- void evalToAsync(Scalar* buffer, EvalToCallback done) const {
- static_cast<const Derived*>(this)
- ->template evalProductAsync<EvalToCallback, Unaligned>(buffer,
- std::move(done));
- }
-#endif // EIGEN_USE_THREADS
-
- template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
- bool rhs_inner_dim_reordered, int Alignment>
- void evalProductSequential(Scalar* buffer) const {
- if (this->m_j_size == 1) {
- this->template evalGemv<lhs_inner_dim_contiguous,
- rhs_inner_dim_contiguous, rhs_inner_dim_reordered,
- Alignment>(buffer);
- } else {
- this->template evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous,
- rhs_inner_dim_reordered, Alignment>(buffer);
- }
- }
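-
-  // Example (illustrative): contracting an (m, k) tensor with a (k) tensor
-  // leaves no non-contracted Rhs dimensions, so m_j_size == 1 and the
-  // matrix*vector path above is taken.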
-
- template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
- #if !defined(EIGEN_HIPCC)
- EIGEN_DEVICE_FUNC
- #endif
- void evalGemv(Scalar* buffer) const {
- const Index rows = m_i_size;
- const Index cols = m_k_size;
-
- typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
- typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
- typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
- typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
- const Index lhs_packet_size = internal::unpacket_traits<typename LeftEvaluator::PacketReturnType>::size;
- const Index rhs_packet_size = internal::unpacket_traits<typename RightEvaluator::PacketReturnType>::size;
- const int lhs_alignment = LeftEvaluator::IsAligned ? Aligned : Unaligned;
- const int rhs_alignment = RightEvaluator::IsAligned ? Aligned : Unaligned;
- typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
- LeftEvaluator, left_nocontract_t,
- contract_t, lhs_packet_size,
- lhs_inner_dim_contiguous,
- false, lhs_alignment> LhsMapper;
-
- typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
- RightEvaluator, right_nocontract_t,
- contract_t, rhs_packet_size,
- rhs_inner_dim_contiguous,
- rhs_inner_dim_reordered, rhs_alignment> RhsMapper;
-
- LhsMapper lhs(m_leftImpl, m_left_nocontract_strides, m_i_strides,
- m_left_contracting_strides, m_k_strides);
- RhsMapper rhs(m_rightImpl, m_right_nocontract_strides, m_j_strides,
- m_right_contracting_strides, m_k_strides);
-
- const Scalar alpha(1);
- const Index resIncr(1);
-
-    // zero out the result buffer (which must be of size at least rows * sizeof(Scalar))
- m_device.memset(buffer, 0, rows * sizeof(Scalar));
-
- internal::general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,false,RhsScalar,RhsMapper,false>::run(
- rows, cols, lhs, rhs,
- buffer, resIncr, alpha);
-
- typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
- m_output_kernel(OutputMapper(buffer, rows), m_tensor_contraction_params,
- static_cast<Index>(0), static_cast<Index>(0), rows,
- static_cast<Index>(1));
- }
-
- template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
- #if !defined(EIGEN_HIPCC)
- EIGEN_DEVICE_FUNC
- #endif
- void evalGemm(Scalar* buffer) const {
- // columns in left side, rows in right side
- const Index k = this->m_k_size;
- this->template evalGemmPartial<lhs_inner_dim_contiguous,
- rhs_inner_dim_contiguous,
- rhs_inner_dim_reordered,
- Alignment, true>(buffer, 0, k, 1);
- }
-
- template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
- bool rhs_inner_dim_reordered, int Alignment>
- EIGEN_DEVICE_FUNC void evalGemmPartialWithoutOutputKernel(
- Scalar* buffer, Index k_start, Index k_end, int num_threads) const {
- evalGemmPartial<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous,
- rhs_inner_dim_reordered, Alignment,
- /*use_output_kernel*/ false>(buffer, k_start, k_end,
- num_threads);
- }
-
- template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment, bool use_output_kernel>
- EIGEN_DEVICE_FUNC void evalGemmPartial(Scalar* buffer, Index k_start, Index k_end, int num_threads) const {
- eigen_assert(k_end >= k_start && k_start >= 0 && k_end <= this->m_k_size);
- // columns in slice on left side, rows on right side
- const Index k_slice = k_end - k_start;
-
- // rows in left side
- const Index m = this->m_i_size;
-
- // columns in right side
- const Index n = this->m_j_size;
-
- // define data mappers for Lhs and Rhs
- typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
- typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
-
- typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
- typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
-
- const Index lhs_packet_size = internal::unpacket_traits<typename LeftEvaluator::PacketReturnType>::size;
- const Index rhs_packet_size = internal::unpacket_traits<typename RightEvaluator::PacketReturnType>::size;
-
- typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
- LeftEvaluator, left_nocontract_t,
- contract_t, lhs_packet_size,
- lhs_inner_dim_contiguous,
- false, Unaligned> LhsMapper;
-
- typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
- RightEvaluator, right_nocontract_t,
- contract_t, rhs_packet_size,
- rhs_inner_dim_contiguous,
- rhs_inner_dim_reordered, Unaligned> RhsMapper;
-
- typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
-
- typedef internal::TensorContractionKernel<
- Scalar, LhsScalar, RhsScalar, Index, OutputMapper, LhsMapper, RhsMapper>
- TensorContractionKernel;
-
- // initialize data mappers
- LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
- this->m_left_contracting_strides, this->m_k_strides);
-
- RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
- this->m_right_contracting_strides, this->m_k_strides);
-
- OutputMapper output(buffer, m);
-
- // Sizes of the blocks to load in cache. See the Goto paper for details.
- internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar,
- Index, internal::ShardByCol>
- blocking(k_slice, m, n, num_threads);
- const Index kc = blocking.kc();
- const Index mc = numext::mini(m, blocking.mc());
- const Index nc = numext::mini(n, blocking.nc());
-
- typedef typename TensorContractionKernel::LhsBlock LhsBlock;
- typedef typename TensorContractionKernel::RhsBlock RhsBlock;
-
- LhsBlock blockA;
- RhsBlock blockB;
-
- TensorContractionKernel kernel(m, k_slice, n, mc, kc, nc);
-
- typedef typename TensorContractionKernel::BlockMemHandle BlockMemHandle;
- const BlockMemHandle packed_mem =
- kernel.allocate(this->m_device, &blockA, &blockB);
-
- // If a contraction kernel does not support beta, explicitly initialize
- // output buffer with zeroes.
- if (!TensorContractionKernel::HasBeta) {
- this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
- }
-
- for(Index i2=0; i2<m; i2+=mc)
- {
- const Index actual_mc = numext::mini(i2+mc,m)-i2;
- for (Index k2 = k_start; k2 < k_end; k2 += kc) {
- // make sure we don't overshoot right edge of left matrix, then pack vertical panel
- const Index actual_kc = numext::mini(k2 + kc, k_end) - k2;
- kernel.packLhs(&blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
-
- // If kernel supports beta, there is no need to initialize output
- // buffer with zeroes.
- const Scalar alpha = Scalar(1);
- const Scalar beta = (TensorContractionKernel::HasBeta && k2 == k_start)
- ? Scalar(0)
- : Scalar(1);
-
- // series of horizontal blocks
- for (Index j2 = 0; j2 < n; j2 += nc) {
- // make sure we don't overshoot right edge of right matrix, then pack block
- const Index actual_nc = numext::mini(j2 + nc, n) - j2;
- kernel.packRhs(&blockB, rhs.getSubMapper(k2, j2), actual_kc,
- actual_nc);
-
- // call gebp (matrix kernel)
- // The parameters here are copied from Eigen's GEMM implementation
- const OutputMapper output_mapper = output.getSubMapper(i2, j2);
- kernel.invoke(output_mapper, blockA, blockB, actual_mc, actual_kc,
- actual_nc, alpha, beta);
-
- // We are done with this [i2, j2] output block.
- if (use_output_kernel && k2 + kc >= k_end) {
- m_output_kernel(output_mapper, m_tensor_contraction_params, i2, j2,
- actual_mc, actual_nc);
- }
- }
- }
- }
-
- kernel.deallocate(this->m_device, packed_mem);
- }
-
- EIGEN_STRONG_INLINE void cleanup() {
- m_leftImpl.cleanup();
- m_rightImpl.cleanup();
-
- if (m_result != NULL) {
- m_device.deallocate(m_result);
- m_result = NULL;
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
- return m_result[index];
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
- return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
- }
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
- return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_result; }
-
-protected:
- Dimensions m_dimensions;
-
- contract_t m_k_strides;
- contract_t m_left_contracting_strides;
- contract_t m_right_contracting_strides;
-
- bool m_lhs_inner_dim_contiguous;
- bool m_rhs_inner_dim_contiguous;
- bool m_rhs_inner_dim_reordered;
-
- left_nocontract_t m_i_strides;
- right_nocontract_t m_j_strides;
- left_nocontract_t m_left_nocontract_strides;
- right_nocontract_t m_right_nocontract_strides;
-
- Index m_i_size;
- Index m_j_size;
- Index m_k_size;
-
- TensorContractionParams m_tensor_contraction_params;
-
- TensorEvaluator<EvalLeftArgType, Device> m_leftImpl;
- TensorEvaluator<EvalRightArgType, Device> m_rightImpl;
- const Device EIGEN_DEVICE_REF m_device;
- OutputKernelType m_output_kernel;
- EvaluatorPointerType m_result;
-};
-
-
-// evaluator for default device
-template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType, typename Device>
-struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> :
- public TensorContractionEvaluatorBase<
- TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> > {
- typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self;
- typedef TensorContractionEvaluatorBase<Self> Base;
-
- typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
- typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
- typedef typename XprType::Index Index;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
- enum {
- Layout = TensorEvaluator<LeftArgType, Device>::Layout
- };
-
- // Most of the code is assuming that both input tensors are ColMajor. If the
- // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
- // If we want to compute A * B = C, where A is LHS and B is RHS, the code
- // will pretend B is LHS and A is RHS.
- typedef typename internal::conditional<
- static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
- typedef typename internal::conditional<
- static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
-
- static const int LDims =
- internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
- static const int RDims =
- internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
- static const int ContractDims = internal::array_size<Indices>::value;
-
- typedef array<Index, ContractDims> contract_t;
- typedef array<Index, LDims - ContractDims> left_nocontract_t;
- typedef array<Index, RDims - ContractDims> right_nocontract_t;
-
- static const int NumDims = LDims + RDims - 2 * ContractDims;
-
- // Could we use NumDimensions here?
- typedef DSizes<Index, NumDims> Dimensions;
-
- TensorEvaluator(const XprType& op, const Device& device) :
- Base(op, device) { }
-
- template <int Alignment>
- void evalProduct(Scalar* buffer) const {
- TENSOR_CONTRACTION_DISPATCH(this->template evalProductSequential, Alignment, (buffer));
- }
-};
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H
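For reference, the evaluator deleted above is what ultimately runs behind the public contraction API on the default (single-threaded CPU) device. A minimal, self-contained usage sketch, with illustrative tensor sizes:

```cpp
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // c(i, j) = sum_k a(i, k) * b(k, j): contract dim 1 of a with dim 0 of b,
  // i.e. an ordinary matrix product expressed as a tensor contraction.
  Eigen::Tensor<float, 2> a(3, 4), b(4, 5);
  a.setRandom();
  b.setRandom();
  Eigen::array<Eigen::IndexPair<int>, 1> dims = {Eigen::IndexPair<int>(1, 0)};
  Eigen::Tensor<float, 2> c = a.contract(b, dims);
  return (c.dimension(0) == 3 && c.dimension(1) == 5) ? 0 : 1;
}
```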
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionBlocking.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionBlocking.h
deleted file mode 100644
index 974feb0..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionBlocking.h
+++ /dev/null
@@ -1,73 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H
-#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H
-
-
-namespace Eigen {
-namespace internal {
-
-enum {
- ShardByRow = 0,
- ShardByCol = 1
-};
-
-
-// Default Blocking Strategy
-template<typename ResScalar, typename LhsScalar, typename RhsScalar, typename StorageIndex, int ShardingType = ShardByCol>
-class TensorContractionBlocking {
- public:
-
- /*
- adding EIGEN_DEVICE_FUNC unconditionally to 'TensorContractionBlocking' constructor in `TensorContractionBlocking.h`
- requires adding EIGEN_DEVICE_FUNC to `computeProductBlockingSizes` in `GeneralBlockPanelKernel.h`
- which in turn, requires adding EIGEN_DEVICE_FUNC to `evaluateProductBlockingSizesHeuristic` in `GeneralBlockPanelKernel.h`
- which in turn, requires adding EIGEN_DEVICE_FUNC to `manage_caching_sizes` in `GeneralBlockPanelKernel.h`
- (else HIPCC will error out)
-
- However adding EIGEN_DEVICE_FUNC to `manage_caching_sizes` in `GeneralBlockPanelKernel.h`
- results in NVCC erroring out with the following error
-
- ../Eigen/src/Core/products/GeneralBlockPanelKernel.h(57): error #2901:
- dynamic initialization is not supported for function-scope static variables within a __device__/__global__ function
- */
-
- #if !defined(EIGEN_HIPCC)
- EIGEN_DEVICE_FUNC
- #endif
- TensorContractionBlocking(StorageIndex k, StorageIndex m, StorageIndex n, StorageIndex num_threads = 1) :
- kc_(k), mc_(m), nc_(n)
- {
- if (ShardingType == ShardByCol) {
- computeProductBlockingSizes<LhsScalar, RhsScalar, 1>(kc_, mc_, nc_, num_threads);
- }
- else {
- computeProductBlockingSizes<LhsScalar, RhsScalar, 1>(kc_, nc_, mc_, num_threads);
- }
-
- const int rhs_packet_size = internal::packet_traits<RhsScalar>::size;
- kc_ = (rhs_packet_size <= 8 || kc_ <= rhs_packet_size) ?
- kc_ : (kc_ / rhs_packet_size) * rhs_packet_size;
- }
-
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex kc() const { return kc_; }
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex mc() const { return mc_; }
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex nc() const { return nc_; }
-
- private:
- StorageIndex kc_;
- StorageIndex mc_;
- StorageIndex nc_;
-};
-
-} // end namespace internal
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H
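The only non-obvious step above is the final adjustment of kc_: once the RHS packet is wider than 8 lanes, the k panel is rounded down to a packet-size multiple so packed panels stay vectorizable. A standalone sketch of just that rule (the sample packet widths are illustrative):

```cpp
#include <cstdio>

// Mirrors the kc_ rounding in TensorContractionBlocking: wide RHS packets
// force kc down to a packet-size multiple; small kc values pass through.
long round_kc(long kc, int rhs_packet_size) {
  return (rhs_packet_size <= 8 || kc <= rhs_packet_size)
             ? kc
             : (kc / rhs_packet_size) * rhs_packet_size;
}

int main() {
  std::printf("%ld\n", round_kc(57, 16));  // 48 : rounded down to 3 * 16
  std::printf("%ld\n", round_kc(57, 4));   // 57 : packet has <= 8 lanes
  std::printf("%ld\n", round_kc(12, 16));  // 12 : kc <= packet size
}
```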
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionCuda.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionCuda.h
deleted file mode 100644
index 3f315fe..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionCuda.h
+++ /dev/null
@@ -1,6 +0,0 @@
-
-#if defined(__clang__) || defined(__GNUC__)
-#warning "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorContractionGpu.h file"
-#endif
-
-#include "TensorContractionGpu.h"
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionGpu.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionGpu.h
deleted file mode 100644
index c818038..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionGpu.h
+++ /dev/null
@@ -1,1413 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014-2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
-// Copyright (C) 2015 Navdeep Jaitly <ndjaitly@google.com>
-// Copyright (C) 2014 Eric Martin <eric@ericmart.in>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H
-#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H
-
-#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
-
-namespace Eigen {
-
-template<typename Scalar, typename Index, typename LhsMapper,
- typename RhsMapper, typename OutputMapper, bool needs_edge_check>
-__device__ EIGEN_STRONG_INLINE void
-EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
- const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem,
- const Index m_size, const Index n_size, const Index k_size) {
-
- const Index m_block_idx = blockIdx.x;
- const Index n_block_idx = blockIdx.y;
-
- const Index base_m = 64 * m_block_idx;
- const Index base_n = 64 * n_block_idx;
-
- // declare and initialize 64 registers for output 8x8 block
-
- // prefetch registers
- Scalar lhs_pf0;
- Scalar lhs_pf1;
- Scalar lhs_pf2;
- Scalar lhs_pf3;
- Scalar lhs_pf4;
- Scalar lhs_pf5;
- Scalar lhs_pf6;
- Scalar lhs_pf7;
-
- Scalar rhs_pf0;
- Scalar rhs_pf1;
- Scalar rhs_pf2;
- Scalar rhs_pf3;
- Scalar rhs_pf4;
- Scalar rhs_pf5;
- Scalar rhs_pf6;
- Scalar rhs_pf7;
-
- // shared memory is formatted
- // (contract idx in block, nocontract idx in block, block idx)
- // where block idx is column major. This transposition limits the number of
- // bank conflicts when reading the LHS. The core idea is that since the
- // contracting index is shared by both sides, it should be mapped to threadIdx.x.
-
- // On the LHS, we pad each row inside of each block with an extra element. This makes
- // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts
- // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks.
-
- // On the RHS we just add 8 padding elements to the end of each block. This gives no bank
- // conflicts on writes and also none on reads.
-
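The bank-conflict claim above is easy to verify by brute force. Assuming the usual 32 four-byte banks and x-fastest warp packing (one warp spans x = 0..7 and four consecutive y values at fixed z), every lane of a warp writes to a distinct bank when rows are padded to 9 floats:

```cpp
#include <cstdio>
#include <set>

int main() {
  // Write index within a padded 8x9 LHS block: y * 72 + x * 9 (+ z, which is
  // constant across the warp and only rotates every bank by the same amount).
  std::set<int> banks;
  for (int y = 0; y < 4; ++y)       // four consecutive y values per warp
    for (int x = 0; x < 8; ++x)     // threadIdx.x runs fastest
      banks.insert((y * 72 + x * 9) % 32);
  std::printf("distinct banks: %zu\n", banks.size());  // 32 -> conflict-free
}
```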
- // storage indices
- const Index lhs_store_idx_base = threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z;
- const Index rhs_store_idx_base = threadIdx.y * 72 + threadIdx.z * 8 + threadIdx.x;
-
- const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0;
- const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1;
- const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2;
- const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3;
- const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4;
- const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5;
- const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6;
- const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7;
-
- const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0;
- const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1;
- const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2;
- const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3;
- const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4;
- const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5;
- const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6;
- const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7;
-
- // in the loading code, the following variables are important:
- // threadIdx.x: the vertical position in an 8x8 block
- // threadIdx.y: the vertical index of the 8x8 block in the grid
- // threadIdx.z: the horizontal position in an 8x8 block
- // k: the horizontal index of the 8x8 block in the grid
- //
- // The k parameter is implicit (it was the loop counter for a loop that went
- // from 0 to <8, but that loop is now unrolled in the code below).
-
- const Index load_idx_vert = threadIdx.x + 8 * threadIdx.y;
- const Index lhs_vert = base_m + load_idx_vert;
-
-#define prefetchIntoRegisters(base_k) \
- { \
- lhs_pf0 = conv(0); \
- lhs_pf1 = conv(0); \
- lhs_pf2 = conv(0); \
- lhs_pf3 = conv(0); \
- lhs_pf4 = conv(0); \
- lhs_pf5 = conv(0); \
- lhs_pf6 = conv(0); \
- lhs_pf7 = conv(0); \
- \
- rhs_pf0 = conv(0); \
- rhs_pf1 = conv(0); \
- rhs_pf2 = conv(0); \
- rhs_pf3 = conv(0); \
- rhs_pf4 = conv(0); \
- rhs_pf5 = conv(0); \
- rhs_pf6 = conv(0); \
- rhs_pf7 = conv(0); \
- \
- if (!needs_edge_check || lhs_vert < m_size) { \
- const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8; \
- const Index lhs_horiz_1 = base_k + threadIdx.z + 1 * 8; \
- const Index lhs_horiz_2 = base_k + threadIdx.z + 2 * 8; \
- const Index lhs_horiz_3 = base_k + threadIdx.z + 3 * 8; \
- const Index lhs_horiz_4 = base_k + threadIdx.z + 4 * 8; \
- const Index lhs_horiz_5 = base_k + threadIdx.z + 5 * 8; \
- const Index lhs_horiz_6 = base_k + threadIdx.z + 6 * 8; \
- const Index lhs_horiz_7 = base_k + threadIdx.z + 7 * 8; \
- \
- if (!needs_edge_check || lhs_horiz_7 < k_size) { \
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
- lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
- lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
- lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
- lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \
- lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \
- lhs_pf7 = lhs(lhs_vert, lhs_horiz_7); \
- } else if (lhs_horiz_6 < k_size) { \
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
- lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
- lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
- lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
- lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \
- lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \
- } else if (lhs_horiz_5 < k_size) { \
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
- lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
- lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
- lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
- lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \
- } else if (lhs_horiz_4 < k_size) { \
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
- lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
- lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
- lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \
- } else if (lhs_horiz_3 < k_size) { \
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
- lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
- lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \
- } else if (lhs_horiz_2 < k_size) { \
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
- lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \
- } else if (lhs_horiz_1 < k_size) { \
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
- lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \
- } else if (lhs_horiz_0 < k_size) { \
- lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \
- } \
- } \
- \
- const Index rhs_vert = base_k + load_idx_vert; \
- if (!needs_edge_check || rhs_vert < k_size) { \
- const Index rhs_horiz_0 = base_n + threadIdx.z + 0 * 8; \
- const Index rhs_horiz_1 = base_n + threadIdx.z + 1 * 8; \
- const Index rhs_horiz_2 = base_n + threadIdx.z + 2 * 8; \
- const Index rhs_horiz_3 = base_n + threadIdx.z + 3 * 8; \
- const Index rhs_horiz_4 = base_n + threadIdx.z + 4 * 8; \
- const Index rhs_horiz_5 = base_n + threadIdx.z + 5 * 8; \
- const Index rhs_horiz_6 = base_n + threadIdx.z + 6 * 8; \
- const Index rhs_horiz_7 = base_n + threadIdx.z + 7 * 8; \
- \
- if (rhs_horiz_7 < n_size) { \
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
- rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
- rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
- rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
- rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \
- rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \
- rhs_pf7 = rhs(rhs_vert, rhs_horiz_7); \
- } else if (rhs_horiz_6 < n_size) { \
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
- rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
- rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
- rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
- rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \
- rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \
- } else if (rhs_horiz_5 < n_size) { \
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
- rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
- rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
- rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
- rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \
- } else if (rhs_horiz_4 < n_size) { \
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
- rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
- rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
- rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \
- } else if (rhs_horiz_3 < n_size) { \
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
- rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
- rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \
- } else if (rhs_horiz_2 < n_size) { \
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
- rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \
- } else if (rhs_horiz_1 < n_size) { \
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
- rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \
- } else if (rhs_horiz_0 < n_size) { \
- rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \
- } \
- } \
- } \
-
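Behaviourally, the descending if/else ladder in prefetchIntoRegisters is a zero-fill followed by a guarded loop over the eight columns; the macro spells out every branch by hand so each prefetched value keeps its own register. A simplified scalar sketch of the LHS half (hypothetical helper, not the shipped code):

```cpp
// Loop-form equivalent of the LHS part of prefetchIntoRegisters.
template <typename Scalar, typename Index, typename LhsMapper>
__device__ void prefetch_lhs_row(const LhsMapper& lhs, Scalar (&pf)[8],
                                 Index lhs_vert, Index base_k,
                                 Index m_size, Index k_size,
                                 bool needs_edge_check) {
  for (int i = 0; i < 8; ++i) pf[i] = Scalar(0);       // zero-fill first
  if (needs_edge_check && lhs_vert >= m_size) return;  // off the m edge
  for (int i = 0; i < 8; ++i) {
    const Index col = base_k + threadIdx.z + i * 8;    // strided columns
    if (!needs_edge_check || col < k_size) pf[i] = lhs(lhs_vert, col);
  }
}
```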
-#define writeRegToShmem(_) \
- lhs_shmem[lhs_store_idx_0] = lhs_pf0; \
- rhs_shmem[rhs_store_idx_0] = rhs_pf0; \
- \
- lhs_shmem[lhs_store_idx_1] = lhs_pf1; \
- rhs_shmem[rhs_store_idx_1] = rhs_pf1; \
- \
- lhs_shmem[lhs_store_idx_2] = lhs_pf2; \
- rhs_shmem[rhs_store_idx_2] = rhs_pf2; \
- \
- lhs_shmem[lhs_store_idx_3] = lhs_pf3; \
- rhs_shmem[rhs_store_idx_3] = rhs_pf3; \
- \
- lhs_shmem[lhs_store_idx_4] = lhs_pf4; \
- rhs_shmem[rhs_store_idx_4] = rhs_pf4; \
- \
- lhs_shmem[lhs_store_idx_5] = lhs_pf5; \
- rhs_shmem[rhs_store_idx_5] = rhs_pf5; \
- \
- lhs_shmem[lhs_store_idx_6] = lhs_pf6; \
- rhs_shmem[rhs_store_idx_6] = rhs_pf6; \
- \
- lhs_shmem[lhs_store_idx_7] = lhs_pf7; \
- rhs_shmem[rhs_store_idx_7] = rhs_pf7; \
-
- // declare and initialize result array
-#define res(i, j) _res_##i##j
-#define initResultRow(i) \
- Scalar res(i, 0) = conv(0); \
- Scalar res(i, 1) = conv(0); \
- Scalar res(i, 2) = conv(0); \
- Scalar res(i, 3) = conv(0); \
- Scalar res(i, 4) = conv(0); \
- Scalar res(i, 5) = conv(0); \
- Scalar res(i, 6) = conv(0); \
- Scalar res(i, 7) = conv(0); \
-
- internal::scalar_cast_op<int, Scalar> conv;
- initResultRow(0);
- initResultRow(1);
- initResultRow(2);
- initResultRow(3);
- initResultRow(4);
- initResultRow(5);
- initResultRow(6);
- initResultRow(7);
-#undef initResultRow
-
- for (Index base_k = 0; base_k < k_size; base_k += 64) {
- // wait for previous iteration to finish with shmem. Despite common sense,
- // the code is a bit faster with this here than at the bottom of the loop.
- __syncthreads();
-
- prefetchIntoRegisters(base_k);
- writeRegToShmem();
-
- #undef prefetchIntoRegisters
- #undef writeRegToShmem
-
- // wait for shared mem packing to be done before starting computation
- __syncthreads();
-
- // compute 8x8 matrix product by outer product. This involves packing one column
- // of LHS and one row of RHS into registers (takes 16 registers).
-
-#define lcol(i) _lcol##i
- Scalar lcol(0);
- Scalar lcol(1);
- Scalar lcol(2);
- Scalar lcol(3);
- Scalar lcol(4);
- Scalar lcol(5);
- Scalar lcol(6);
- Scalar lcol(7);
-
-#define rrow(j) _rrow##j
- Scalar rrow(0);
- Scalar rrow(1);
- Scalar rrow(2);
- Scalar rrow(3);
- Scalar rrow(4);
- Scalar rrow(5);
- Scalar rrow(6);
- Scalar rrow(7);
-
- // Now x corresponds to k, y to m, and z to n
- const Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y];
- const Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z];
-
-#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))]
-#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))]
-
-#define loadData(i, j) \
- lcol(0) = lhs_element(0, j); \
- rrow(0) = rhs_element(i, 0); \
- lcol(1) = lhs_element(1, j); \
- rrow(1) = rhs_element(i, 1); \
- lcol(2) = lhs_element(2, j); \
- rrow(2) = rhs_element(i, 2); \
- lcol(3) = lhs_element(3, j); \
- rrow(3) = rhs_element(i, 3); \
- lcol(4) = lhs_element(4, j); \
- rrow(4) = rhs_element(i, 4); \
- lcol(5) = lhs_element(5, j); \
- rrow(5) = rhs_element(i, 5); \
- lcol(6) = lhs_element(6, j); \
- rrow(6) = rhs_element(i, 6); \
- lcol(7) = lhs_element(7, j); \
- rrow(7) = rhs_element(i, 7); \
-
-#define computeCol(j) \
- res(0, j) += lcol(0) * rrow(j); \
- res(1, j) += lcol(1) * rrow(j); \
- res(2, j) += lcol(2) * rrow(j); \
- res(3, j) += lcol(3) * rrow(j); \
- res(4, j) += lcol(4) * rrow(j); \
- res(5, j) += lcol(5) * rrow(j); \
- res(6, j) += lcol(6) * rrow(j); \
- res(7, j) += lcol(7) * rrow(j); \
-
-#define computePass(i) \
- loadData(i, i); \
- \
- computeCol(0); \
- computeCol(1); \
- computeCol(2); \
- computeCol(3); \
- computeCol(4); \
- computeCol(5); \
- computeCol(6); \
- computeCol(7); \
-
- computePass(0);
- computePass(1);
- computePass(2);
- computePass(3);
- computePass(4);
- computePass(5);
- computePass(6);
- computePass(7);
-
-#undef lcol
-#undef rrow
-#undef lhs_element
-#undef rhs_element
-#undef loadData
-#undef computeCol
-#undef computePass
- } // end loop over k
-
- // we've now iterated over all of the large (i.e. width 64) k blocks and
- // accumulated results in registers. At this point thread (x, y, z) contains
- // the sum, across all big k blocks, of the product of the little k block of
- // index (x, y) with the block of index (y, z). To compute the final output,
- // we need to reduce the 8 threads over y by summation.
-#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000)
-#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
-#else
-#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask)
-#endif
-
-#define reduceRow(i, mask) \
- shuffleInc(i, 0, mask); \
- shuffleInc(i, 1, mask); \
- shuffleInc(i, 2, mask); \
- shuffleInc(i, 3, mask); \
- shuffleInc(i, 4, mask); \
- shuffleInc(i, 5, mask); \
- shuffleInc(i, 6, mask); \
- shuffleInc(i, 7, mask); \
-
-#define reduceMatrix(mask) \
- reduceRow(0, mask); \
- reduceRow(1, mask); \
- reduceRow(2, mask); \
- reduceRow(3, mask); \
- reduceRow(4, mask); \
- reduceRow(5, mask); \
- reduceRow(6, mask); \
- reduceRow(7, mask); \
-
- // actually perform the reduction; afterwards each thread of index (_, y, z)
- // contains the correct values in its registers that belong in the output
- // block
- reduceMatrix(1);
- reduceMatrix(2);
- reduceMatrix(4);
-
-#undef shuffleInc
-#undef reduceRow
-#undef reduceMatrix
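The three reduceMatrix calls form a butterfly reduction: XOR masks 1, 2 and 4 successively fold the eight lanes that share a y group, so afterwards every lane holds the group sum. Isolated to a single value per lane (post-CUDA-9 sync variant), the pattern is:

```cpp
// After the three XOR exchanges, all 8 lanes of a group hold the group sum.
__device__ float butterfly_sum8(float v) {
  v += __shfl_xor_sync(0xFFFFFFFF, v, 1);
  v += __shfl_xor_sync(0xFFFFFFFF, v, 2);
  v += __shfl_xor_sync(0xFFFFFFFF, v, 4);
  return v;
}
```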
-
- // now we need to copy the 64 values into main memory. We can't split the
- // work among threads because all of the values are in registers. There are
- // two ways to do this:
- // (1) have 1 thread do 64 writes from registers into global memory
- // (2) have 1 thread do 64 writes into shared memory, and then 8 threads
- // each do 8 writes into global memory. We can just overwrite the shared
- // memory from the problem we just solved.
- // (2) is slightly faster than (1) due to less branching and more ILP
-
- // TODO: won't yield much gain, but could just use currently unused shared mem
- // and then we won't have to sync
- // wait for shared mem to be out of use
- __syncthreads();
-
-#define writeResultShmem(i, j) \
- lhs_shmem[i + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j] = res(i, j); \
-
-#define writeRow(i) \
- writeResultShmem(i, 0); \
- writeResultShmem(i, 1); \
- writeResultShmem(i, 2); \
- writeResultShmem(i, 3); \
- writeResultShmem(i, 4); \
- writeResultShmem(i, 5); \
- writeResultShmem(i, 6); \
- writeResultShmem(i, 7); \
-
- if (threadIdx.x == 0) {
- writeRow(0);
- writeRow(1);
- writeRow(2);
- writeRow(3);
- writeRow(4);
- writeRow(5);
- writeRow(6);
- writeRow(7);
- }
-#undef writeResultShmem
-#undef writeRow
-
- const int max_i_write = numext::mini((int)((m_size - base_m - threadIdx.y + 7) / 8), 8);
- const int max_j_write = numext::mini((int)((n_size - base_n - threadIdx.z + 7) / 8), 8);
-
- if (threadIdx.x < max_i_write) {
- if (max_j_write == 8) {
- // TODO: can I trade bank conflicts for coalesced writes?
- Scalar val0 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 0];
- Scalar val1 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 1];
- Scalar val2 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 2];
- Scalar val3 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 3];
- Scalar val4 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 4];
- Scalar val5 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 5];
- Scalar val6 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 6];
- Scalar val7 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 7];
-
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 0) = val0;
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 1) = val1;
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 2) = val2;
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 3) = val3;
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 4) = val4;
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 5) = val5;
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 6) = val6;
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 7) = val7;
- } else {
-#pragma unroll 7
- for (int j = 0; j < max_j_write; j++) {
- Scalar val = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j];
- output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * j) = val;
- }
- }
- }
-#undef res
-}
-
-
-template<typename Scalar, typename Index, typename LhsMapper,
- typename RhsMapper, typename OutputMapper>
-__global__ void
-#if defined(EIGEN_HIPCC)
-__launch_bounds__(512, 1)
-#else
-__launch_bounds__(512)
-#endif
-EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
- const OutputMapper output,
- const Index m_size, const Index n_size, const Index k_size) {
- __shared__ Scalar lhs_shmem[72 * 64];
- __shared__ Scalar rhs_shmem[72 * 64];
-
- const Index m_block_idx = blockIdx.x;
- const Index n_block_idx = blockIdx.y;
-
- const Index base_m = 64 * m_block_idx;
- const Index base_n = 64 * n_block_idx;
-
- if (base_m + 63 < m_size && base_n + 63 < n_size) {
- EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
- } else {
- EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
- }
-}
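On the host side this wrapper is launched with one 64x64 output tile per block and an 8x8x8 (512-thread) block, matching the __launch_bounds__ above; only edge tiles pay for bounds checks. A sketch of the grid computation (the real launch goes through LAUNCH_GPU_KERNEL):

```cpp
#include <cuda_runtime.h>

// One block per 64x64 output tile; interior tiles take the unchecked path.
dim3 contraction_grid(long m, long n) {
  return dim3(static_cast<unsigned>((m + 63) / 64),
              static_cast<unsigned>((n + 63) / 64), 1);
}
// usage sketch:
//   EigenContractionKernel<Scalar, Index, LhsMapper, RhsMapper, OutputMapper>
//       <<<contraction_grid(m, n), dim3(8, 8, 8)>>>(lhs, rhs, output, m, n, k);
```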
-
-
-template<typename Index, typename LhsMapper,
- typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
- bool CHECK_RHS_BOUNDARY>
-__device__ __forceinline__ void
-EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs,
- const OutputMapper output, float2 lhs_shmem2[][16],
- float2 rhs_shmem2[][8], const Index m_size,
- const Index n_size, const Index k_size,
- const Index base_m, const Index base_n) {
-
- // prefetch registers
- float4 lhs_pf0, rhs_pf0;
-
- float4 results[4];
- for (int i=0; i < 4; i++) {
- results[i].x = results[i].y = results[i].z = results[i].w = 0;
- }
-
-#define prefetch_lhs(reg, row, col) \
- if (!CHECK_LHS_BOUNDARY) { \
- if (col < k_size) { \
- reg =lhs.template loadPacket<float4,Unaligned>(row, col); \
- } \
- } else { \
- if (col < k_size) { \
- if (row + 3 < m_size) { \
- reg =lhs.template loadPacket<float4,Unaligned>(row, col); \
- } else if (row + 2 < m_size) { \
- reg.x =lhs(row + 0, col); \
- reg.y =lhs(row + 1, col); \
- reg.z =lhs(row + 2, col); \
- } else if (row + 1 < m_size) { \
- reg.x =lhs(row + 0, col); \
- reg.y =lhs(row + 1, col); \
- } else if (row < m_size) { \
- reg.x =lhs(row + 0, col); \
- } \
- } \
- } \
-
- Index lhs_vert = base_m+threadIdx.x*4;
-
- for (Index k = 0; k < k_size; k += 16) {
-
- lhs_pf0 = internal::pset1<float4>(0);
- rhs_pf0 = internal::pset1<float4>(0);
-
- Index lhs_horiz = threadIdx.y+k;
- prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz)
-
- Index rhs_vert = k+(threadIdx.x%4)*4;
- Index rhs_horiz0 = (threadIdx.x>>2)+threadIdx.y*4+base_n;
-
- if (!CHECK_RHS_BOUNDARY) {
- if ((rhs_vert + 3) < k_size) {
- // no n-edge check needed in this branch; only guard the k edge
- rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
- } else if (rhs_vert + 2 < k_size) {
- // no n-edge check needed in this branch; only guard the k edge
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
- rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
- } else if (rhs_vert + 1 < k_size) {
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
- } else if (rhs_vert < k_size) {
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- }
- } else {
- if (rhs_horiz0 < n_size) {
- if ((rhs_vert + 3) < k_size) {
- rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
- } else if ((rhs_vert + 2) < k_size) {
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
- rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
- } else if ((rhs_vert + 1) < k_size) {
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
- } else if (rhs_vert < k_size) {
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- }
- }
- }
- float x1, x2;
- // the following could be a bitwise operation... some day.
- if((threadIdx.x%8) < 4) {
- x1 = rhs_pf0.y;
- x2 = rhs_pf0.w;
- } else {
- x1 = rhs_pf0.x;
- x2 = rhs_pf0.z;
- }
- #if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000)
- x1 = __shfl_xor(x1, 4);
- x2 = __shfl_xor(x2, 4);
- #else
- x1 = __shfl_xor_sync(0xFFFFFFFF, x1, 4);
- x2 = __shfl_xor_sync(0xFFFFFFFF, x2, 4);
- #endif
- if((threadIdx.x%8) < 4) {
- rhs_pf0.y = x1;
- rhs_pf0.w = x2;
- } else {
- rhs_pf0.x = x1;
- rhs_pf0.z = x2;
- }
-
- // We have 64 features.
- // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1.
- // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3.
- // ...
- // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63
- // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1
- // ...
- rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2][threadIdx.x%8] = make_float2(rhs_pf0.x, rhs_pf0.y);
- rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2+32][threadIdx.x%8] = make_float2(rhs_pf0.z, rhs_pf0.w);
-
- // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61)
- // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61)
- // ...
- // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61)
- // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63)
- // ...
-
- lhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(lhs_pf0.x, lhs_pf0.y);
- lhs_shmem2[threadIdx.y+16][threadIdx.x] = make_float2(lhs_pf0.z, lhs_pf0.w);
-
-
-#define add_vals(fl1, fl2, fr1, fr2)\
- results[0].x += fl1.x * fr1.x;\
- results[0].y += fl1.y * fr1.x;\
- results[0].z += fl2.x * fr1.x;\
- results[0].w += fl2.y * fr1.x;\
-\
- results[1].x += fl1.x * fr1.y;\
- results[1].y += fl1.y * fr1.y;\
- results[1].z += fl2.x * fr1.y;\
- results[1].w += fl2.y * fr1.y;\
-\
- results[2].x += fl1.x * fr2.x;\
- results[2].y += fl1.y * fr2.x;\
- results[2].z += fl2.x * fr2.x;\
- results[2].w += fl2.y * fr2.x;\
-\
- results[3].x += fl1.x * fr2.y;\
- results[3].y += fl1.y * fr2.y;\
- results[3].z += fl2.x * fr2.y;\
- results[3].w += fl2.y * fr2.y;\
-
- __syncthreads();
-
- // Do the multiplies.
- #pragma unroll
- for (int koff = 0; koff < 16; koff ++) {
- // 32 x threads.
- float2 fl1 = lhs_shmem2[koff][threadIdx.x];
- float2 fl2 = lhs_shmem2[koff + 16][threadIdx.x];
-
- int start_feature = threadIdx.y * 4;
- float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
- float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
-
- add_vals(fl1, fl2, fr1, fr2)
- }
- __syncthreads();
- }
-
-#undef prefetch_lhs
-#undef add_vals
-
- Index horiz_base = threadIdx.y*4+base_n;
- if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
- for (int i = 0; i < 4; i++) {
- output(lhs_vert, horiz_base + i) = results[i].x;
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
- }
- } else if (!CHECK_RHS_BOUNDARY) {
- // CHECK LHS
- if (lhs_vert + 3 < m_size) {
- for (int i = 0; i < 4; i++) {
- output(lhs_vert, horiz_base + i) = results[i].x;
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
- }
- } else if (lhs_vert + 2 < m_size) {
- for (int i = 0; i < 4; i++) {
- output(lhs_vert, horiz_base + i) = results[i].x;
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
- }
- } else if (lhs_vert + 1 < m_size) {
- for (int i = 0; i < 4; i++) {
- output(lhs_vert, horiz_base + i) = results[i].x;
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
- }
- } else if (lhs_vert < m_size) {
- for (int i = 0; i < 4; i++) {
- output(lhs_vert, horiz_base + i) = results[i].x;
- }
- }
- } else if (!CHECK_LHS_BOUNDARY) {
- // CHECK RHS
- /*
- int ncols_rem = fminf(n_size- horiz_base, 4);
- for (int i = 0; i < ncols_rem; i++) {
- output(lhs_vert, horiz_base + i) = results[i].x;
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
- }*/
- for (int i = 0; i < 4; i++) {
- if (horiz_base+i < n_size) {
- output(lhs_vert, horiz_base + i) = results[i].x;
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
- }
- }
- } else {
- // CHECK both boundaries.
- for (int i = 0; i < 4; i++) {
- if (horiz_base+i < n_size) {
- if (lhs_vert < m_size)
- output(lhs_vert, horiz_base + i) = results[i].x;
- if (lhs_vert + 1 < m_size)
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
- if (lhs_vert + 2 < m_size)
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
- if (lhs_vert + 3 < m_size)
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
- }
- }
- }
-}
-
-
-template<typename Index, typename LhsMapper,
- typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
- bool CHECK_RHS_BOUNDARY>
-__device__ __forceinline__ void
-EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
- const OutputMapper output, float2 lhs_shmem2[][32],
- float2 rhs_shmem2[][8], const Index m_size,
- const Index n_size, const Index k_size,
- const Index base_m, const Index base_n) {
-
- // prefetch registers
- float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3;
- float4 rhs_pf0, rhs_pf1;
-
- float4 results[8];
- for (int i=0; i < 8; i++) {
- results[i].x = results[i].y = results[i].z = results[i].w = 0;
- }
-
- Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32;
- for (Index k = 0; k < k_size; k += 32) {
- lhs_pf0 = internal::pset1<float4>(0);
- lhs_pf1 = internal::pset1<float4>(0);
- lhs_pf2 = internal::pset1<float4>(0);
- lhs_pf3 = internal::pset1<float4>(0);
-
- rhs_pf0 = internal::pset1<float4>(0);
- rhs_pf1 = internal::pset1<float4>(0);
-
- if (!CHECK_LHS_BOUNDARY) {
- if ((threadIdx.y/4+k+24) < k_size) {
- lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
- lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
- lhs_pf3 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
- } else if ((threadIdx.y/4+k+16) < k_size) {
- lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
- lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
- } else if ((threadIdx.y/4+k+8) < k_size) {
- lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
- } else if ((threadIdx.y/4+k) < k_size) {
- lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
- }
- } else {
- // just CHECK_LHS_BOUNDARY
- if (lhs_vert + 3 < m_size) {
- if ((threadIdx.y/4+k+24) < k_size) {
- lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
- lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
- lhs_pf3 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
- } else if ((threadIdx.y/4+k+16) < k_size) {
- lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
- lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
- } else if ((threadIdx.y/4+k+8) < k_size) {
- lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
- lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
- } else if ((threadIdx.y/4+k) < k_size) {
- lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
- }
- } else if (lhs_vert + 2 < m_size) {
- if ((threadIdx.y/4+k+24) < k_size) {
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
- lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
- lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
- lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
- lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
- lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
- lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
- lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16));
- lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
- lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24));
- lhs_pf3.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+24));
- } else if ((threadIdx.y/4+k+16) < k_size) {
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
- lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
- lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
- lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
- lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
- lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
- lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
- lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16));
- } else if ((threadIdx.y/4+k+8) < k_size) {
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
- lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
- lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
- lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
- lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
- } else if ((threadIdx.y/4+k) < k_size) {
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
- lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
- lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
- }
- } else if (lhs_vert + 1 < m_size) {
- if ((threadIdx.y/4+k+24) < k_size) {
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
- lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
- lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
- lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
- lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
- lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
- lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24));
- } else if ((threadIdx.y/4+k+16) < k_size) {
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
- lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
- lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
- lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
- lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
- } else if ((threadIdx.y/4+k+8) < k_size) {
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
- lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
- lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
- } else if ((threadIdx.y/4+k) < k_size) {
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
- lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
- }
- } else if (lhs_vert < m_size) {
- if ((threadIdx.y/4+k+24) < k_size) {
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
- lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
- lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
- } else if ((threadIdx.y/4+k+16) < k_size) {
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
- lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
- } else if ((threadIdx.y/4+k+8) < k_size) {
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
- lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
- } else if ((threadIdx.y/4+k) < k_size) {
- lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
- }
- }
- }
- __syncthreads();
- Index rhs_vert = k+threadIdx.x*4;
- Index rhs_horiz0 = threadIdx.y*2+base_n;
- Index rhs_horiz1 = threadIdx.y*2+1+base_n;
- if (!CHECK_RHS_BOUNDARY) {
- if ((rhs_vert + 3) < k_size) {
- // no n-edge check needed in this branch; only guard the k edge
- rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
- rhs_pf1 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz1);
- } else if (rhs_vert + 2 < k_size) {
- // no n-edge check needed in this branch; only guard the k edge
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
- rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
- rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
- rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
- rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
- } else if (rhs_vert + 1 < k_size) {
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
- rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
- rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
- } else if (rhs_vert < k_size) {
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
- }
- } else {
- if (rhs_horiz1 < n_size) {
- if ((rhs_vert + 3) < k_size) {
- // just CHECK_RHS_BOUNDARY
- rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
- rhs_pf1 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz1);
- } else if (rhs_vert + 2 < k_size) {
- // just CHECK_RHS_BOUNDARY
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
- rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
- rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
- rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
- rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
- } else if (k+threadIdx.x*4 + 1 < k_size) {
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
- rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
- rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
- } else if (k+threadIdx.x*4 < k_size) {
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
- }
- } else if (rhs_horiz0 < n_size) {
- if ((rhs_vert + 3) < k_size) {
- // just CHECK_RHS_BOUNDARY
- rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
- } else if ((rhs_vert + 2) < k_size) {
- // just CHECK_RHS_BOUNDARY
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
- rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
- } else if ((rhs_vert + 1) < k_size) {
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
- } else if (rhs_vert < k_size) {
- rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
- }
- }
- }
- __syncthreads();
- // Loaded. Do computation
- // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1.
- // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3.
- // ..
- // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63
- rhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(rhs_pf0.x, rhs_pf1.x);
- // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1.
- // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3.
- // ..
- rhs_shmem2[threadIdx.y+32][threadIdx.x] = make_float2(rhs_pf0.y, rhs_pf1.y);
- // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1.
- // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3.
- rhs_shmem2[threadIdx.y+64][threadIdx.x] = make_float2(rhs_pf0.z, rhs_pf1.z);
- // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1.
- // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3.
- rhs_shmem2[threadIdx.y+96][threadIdx.x] = make_float2(rhs_pf0.w, rhs_pf1.w);
-
- // LHS.
- // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125)
- // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125)
- // ...
- // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127)
- // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127)
-
-
-#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\
- results[0].x += a_feat1.x * f1.x;\
- results[1].x += a_feat1.x * f1.y;\
- results[2].x += a_feat1.x * f2.x;\
- results[3].x += a_feat1.x * f2.y;\
- results[4].x += a_feat1.x * f3.x;\
- results[5].x += a_feat1.x * f3.y;\
- results[6].x += a_feat1.x * f4.x;\
- results[7].x += a_feat1.x * f4.y;\
-\
- results[0].y += a_feat1.y * f1.x;\
- results[1].y += a_feat1.y * f1.y;\
- results[2].y += a_feat1.y * f2.x;\
- results[3].y += a_feat1.y * f2.y;\
- results[4].y += a_feat1.y * f3.x;\
- results[5].y += a_feat1.y * f3.y;\
- results[6].y += a_feat1.y * f4.x;\
- results[7].y += a_feat1.y * f4.y;\
-\
- results[0].z += a_feat2.x * f1.x;\
- results[1].z += a_feat2.x * f1.y;\
- results[2].z += a_feat2.x * f2.x;\
- results[3].z += a_feat2.x * f2.y;\
- results[4].z += a_feat2.x * f3.x;\
- results[5].z += a_feat2.x * f3.y;\
- results[6].z += a_feat2.x * f4.x;\
- results[7].z += a_feat2.x * f4.y;\
-\
- results[0].w += a_feat2.y * f1.x;\
- results[1].w += a_feat2.y * f1.y;\
- results[2].w += a_feat2.y * f2.x;\
- results[3].w += a_feat2.y * f2.y;\
- results[4].w += a_feat2.y * f3.x;\
- results[5].w += a_feat2.y * f3.y;\
- results[6].w += a_feat2.y * f4.x;\
- results[7].w += a_feat2.y * f4.y;\
-
- lhs_shmem2[threadIdx.y/4][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y);
- lhs_shmem2[threadIdx.y/4+8][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y);
- lhs_shmem2[threadIdx.y/4+16][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.x, lhs_pf2.y);
- lhs_shmem2[threadIdx.y/4+24][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y);
-
- lhs_shmem2[threadIdx.y/4 + 32][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w);
- lhs_shmem2[threadIdx.y/4 + 40][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w);
- lhs_shmem2[threadIdx.y/4 + 48][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w);
- lhs_shmem2[threadIdx.y/4 + 56][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w);
-
- __syncthreads();
-
- // Do the multiplies.
- #pragma unroll
- for (int koff = 0; koff < 32; koff ++) {
- float2 a3 = lhs_shmem2[koff][threadIdx.x + (threadIdx.y % 4) * 8];
- float2 a4 = lhs_shmem2[koff + 32][threadIdx.x + (threadIdx.y % 4) * 8];
-
- // first feature is at (threadIdx.y/4) * 8; the range ends at start + 8.
- int start_feature = (threadIdx.y / 4) * 8;
-
- float2 br1 = rhs_shmem2[start_feature/2 + (koff % 4) * 32][koff/4];
- float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4];
- float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4];
- float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4];
-
- add_vals(a3, a4, br1, br2, br3, br4)
- }
- __syncthreads();
- } // end loop over k
-
- __syncthreads();
- Index horiz_base = (threadIdx.y/4)*8+base_n;
- if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
- for (int i = 0; i < 8; i++) {
- output(lhs_vert, horiz_base + i) = results[i].x;
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
- }
- } else if (!CHECK_RHS_BOUNDARY) {
- if (lhs_vert + 3 < m_size) {
- for (int i = 0; i < 8; i++) {
- output(lhs_vert, horiz_base + i) = results[i].x;
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
- }
- } else if (lhs_vert + 2 < m_size) {
- for (int i = 0; i < 8; i++) {
- output(lhs_vert, horiz_base + i) = results[i].x;
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
- }
- } else if (lhs_vert + 1 < m_size) {
- for (int i = 0; i < 8; i++) {
- output(lhs_vert, horiz_base + i) = results[i].x;
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
- }
- } else if (lhs_vert < m_size) {
- for (int i = 0; i < 8; i++) {
- output(lhs_vert, horiz_base + i) = results[i].x;
- }
- }
- } else if (!CHECK_LHS_BOUNDARY) {
- // CHECK RHS boundary
- for (int i = 0; i < 8; i++) {
- if (horiz_base + i < n_size) {
- output(lhs_vert, horiz_base + i) = results[i].x;
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
- }
- }
- } else {
- // CHECK both boundaries.
- for (int i = 0; i < 8; i++) {
- if (horiz_base + i < n_size) {
- if (lhs_vert < m_size)
- output(lhs_vert, horiz_base + i) = results[i].x;
- if (lhs_vert + 1 < m_size)
- output(lhs_vert + 1, horiz_base + i) = results[i].y;
- if (lhs_vert + 2 < m_size)
- output(lhs_vert + 2, horiz_base + i) = results[i].z;
- if (lhs_vert + 3 < m_size)
- output(lhs_vert + 3, horiz_base + i) = results[i].w;
- }
- }
- }
-}
-
-
-template<typename Index, typename LhsMapper,
- typename RhsMapper, typename OutputMapper>
-__global__ void
-#if defined(EIGEN_HIPCC)
-__launch_bounds__(256, 1)
-#else
-__launch_bounds__(256)
-#endif
-EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
- const OutputMapper output,
- const Index m_size, const Index n_size, const Index k_size) {
- __shared__ float2 lhs_shmem[64*32];
- __shared__ float2 rhs_shmem[128*8];
-
- typedef float2 LHS_MEM[64][32];
- typedef float2 RHS_MEM[128][8];
-
- const Index m_block_idx = blockIdx.x;
- const Index n_block_idx = blockIdx.y;
-
- const Index base_m = 128 * m_block_idx;
- const Index base_n = 64 * n_block_idx;
-
- bool check_rhs = (base_n + 63) >= n_size;
- bool check_lhs128 = (base_m + 127) >= m_size;
-
- if (!check_rhs) {
- if (!check_lhs128) {
- // >= 128 rows left
- EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(
- lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
- } else {
- EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(
- lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
- }
- } else {
- if (!check_lhs128) {
- // >= 128 rows left
- EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(
- lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
- } else {
- EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(
- lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
- }
- }
-}
-
-template<typename Index, typename LhsMapper,
- typename RhsMapper, typename OutputMapper>
-__global__ void
-#if defined(EIGEN_HIPCC)
-__launch_bounds__(256, 1)
-#else
-__launch_bounds__(256)
-#endif
-EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs,
- const OutputMapper output,
- const Index m_size, const Index n_size, const Index k_size) {
- __shared__ float2 lhs_shmem[32][16];
- __shared__ float2 rhs_shmem[64][8];
-
- const Index m_block_idx = blockIdx.x;
- const Index n_block_idx = blockIdx.y;
-
- const Index base_m = 64 * m_block_idx;
- const Index base_n = 64 * n_block_idx;
-
- if (base_m + 63 < m_size) {
- if (base_n + 63 < n_size) {
- EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
- } else {
- EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
- }
- } else {
- if (base_n + 63 < n_size) {
- EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
- } else {
- EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
- }
- }
-}
-
-
-template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
-struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, GpuDevice> :
- public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, GpuDevice> > {
-
- typedef GpuDevice Device;
-
- typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self;
- typedef TensorContractionEvaluatorBase<Self> Base;
-
- typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
- typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
- typedef typename XprType::Index Index;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
-
- enum {
- Layout = TensorEvaluator<LeftArgType, Device>::Layout,
- };
-
- // Most of the code is assuming that both input tensors are ColMajor. If the
- // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
- // If we want to compute A * B = C, where A is LHS and B is RHS, the code
- // will pretend B is LHS and A is RHS.
- typedef typename internal::conditional<
- static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
- typedef typename internal::conditional<
- static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
-
- static const int LDims =
- internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
- static const int RDims =
- internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
- static const int ContractDims = internal::array_size<Indices>::value;
-
- typedef array<Index, LDims> left_dim_mapper_t;
- typedef array<Index, RDims> right_dim_mapper_t;
-
- typedef array<Index, ContractDims> contract_t;
- typedef array<Index, LDims - ContractDims> left_nocontract_t;
- typedef array<Index, RDims - ContractDims> right_nocontract_t;
-
- static const int NumDims = LDims + RDims - 2 * ContractDims;
-
- typedef DSizes<Index, NumDims> Dimensions;
-
- // typedefs needed in evalTo
- typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
- typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
-
- typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
- typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
-
- typedef typename LeftEvaluator::Dimensions LeftDimensions;
- typedef typename RightEvaluator::Dimensions RightDimensions;
-
- TensorEvaluator(const XprType& op, const Device& device) :
- Base(op, device)
- {
- EIGEN_STATIC_ASSERT( (internal::is_same<OutputKernelType, const NoOpOutputKernel>::value),
- GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS);
- }
-
- // We need to redefine this method to make nvcc happy
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
- this->m_leftImpl.evalSubExprsIfNeeded(NULL);
- this->m_rightImpl.evalSubExprsIfNeeded(NULL);
- if (data) {
- evalTo(data);
- return false;
- } else {
- this->m_result = static_cast<Scalar *>(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar)));
- evalTo(this->m_result);
- return true;
- }
- }
-
- void evalTo(Scalar* buffer) const {
- if (this->m_lhs_inner_dim_contiguous) {
- if (this->m_rhs_inner_dim_contiguous) {
- if (this->m_rhs_inner_dim_reordered) {
- evalTyped<true, true, true, Unaligned>(buffer);
- }
- else {
- evalTyped<true, true, false, Unaligned>(buffer);
- }
- }
- else {
- if (this->m_rhs_inner_dim_reordered) {
- evalTyped<true, false, true, Unaligned>(buffer);
- }
- else {
- evalTyped<true, false, false, Unaligned>(buffer);
- }
- }
- }
- else {
- if (this->m_rhs_inner_dim_contiguous) {
- if (this->m_rhs_inner_dim_reordered) {
- evalTyped<false, true, true, Unaligned>(buffer);
- }
- else {
- evalTyped<false, true, false, Unaligned>(buffer);
- }
- }
- else {
- if (this->m_rhs_inner_dim_reordered) {
- evalTyped<false, false, true, Unaligned>(buffer);
- }
- else {
- evalTyped<false, false, false, Unaligned>(buffer);
- }
- }
- }
- }
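evalTo is the usual runtime-to-compile-time dispatch: three flags known only at evaluation time select one of eight evalTyped instantiations, so the input mappers can specialize their load paths statically. Reduced to a single flag (hypothetical names), the shape of the pattern is:

```cpp
#include <cstdio>

// One runtime bool lifted into a template parameter; evalTo above does this
// three levels deep, yielding 2 * 2 * 2 = 8 instantiations.
template <bool Contiguous>
void eval_typed(float* /*buffer*/) {
  std::printf(Contiguous ? "contiguous load path\n" : "strided load path\n");
}

void eval_to(float* buffer, bool contiguous) {
  if (contiguous) eval_typed<true>(buffer);
  else            eval_typed<false>(buffer);
}
```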
-
- template <typename LhsScalar, typename RhsScalar, typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels {
- static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
- const Index m_blocks = (m + 63) / 64;
- const Index n_blocks = (n + 63) / 64;
- const dim3 num_blocks(m_blocks, n_blocks, 1);
- const dim3 block_size(8, 8, 8);
- LAUNCH_GPU_KERNEL((EigenContractionKernel<Scalar, Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
- }
- };
-
- template <typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels<float, float, Index, LhsMapper, RhsMapper, OutputMapper> {
- static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
- if (m < 768 || n < 768) {
- const Index m_blocks = (m + 63) / 64;
- const Index n_blocks = (n + 63) / 64;
- const dim3 num_blocks(m_blocks, n_blocks, 1);
- const dim3 block_size(16, 16, 1);
- LAUNCH_GPU_KERNEL((EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
- } else {
- const Index m_blocks = (m + 127) / 128;
- const Index n_blocks = (n + 63) / 64;
- const dim3 num_blocks(m_blocks, n_blocks, 1);
- const dim3 block_size(8, 32, 1);
- LAUNCH_GPU_KERNEL((EigenFloatContractionKernel<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
- }
- }
- };
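So the float path dispatches on output size: below 768 in either dimension it favors the 16x16-thread kernel on 64x64 tiles, otherwise the float4-vectorized kernel on 128x64 tiles. A condensed view of the decision, with the shapes copied from the branches above:

```cpp
#include <cuda_runtime.h>

struct Launch { dim3 grid, block; };

// Condensed dispatch rule of LaunchKernels<float, float, ...>::Run.
Launch pick_float_kernel(long m, long n) {
  if (m < 768 || n < 768)  // small output: 64x64 tiles, 16x16 threads
    return { dim3(unsigned((m + 63) / 64), unsigned((n + 63) / 64), 1),
             dim3(16, 16, 1) };
  // large output: 128x64 tiles, 8x32 threads, float4 loads
  return { dim3(unsigned((m + 127) / 128), unsigned((n + 63) / 64), 1),
           dim3(8, 32, 1) };
}
```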
-
- template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
- void evalTyped(Scalar* buffer) const {
- // columns in left side, rows in right side
- const Index k = this->m_k_size;
- EIGEN_UNUSED_VARIABLE(k)
-
- // rows in left side
- const Index m = this->m_i_size;
-
- // columns in right side
- const Index n = this->m_j_size;
-
-    // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar))
- this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
-
- typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
- LeftEvaluator, left_nocontract_t,
- contract_t, 4,
- lhs_inner_dim_contiguous,
- false, Unaligned> LhsMapper;
-
- typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
- RightEvaluator, right_nocontract_t,
- contract_t, 4,
- rhs_inner_dim_contiguous,
- rhs_inner_dim_reordered, Unaligned> RhsMapper;
-
- typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
-
-
- // initialize data mappers
- LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
- this->m_left_contracting_strides, this->m_k_strides);
-
- RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
- this->m_right_contracting_strides, this->m_k_strides);
-
- OutputMapper output(buffer, m);
-
-#if defined(EIGEN_USE_HIP)
- setGpuSharedMemConfig(hipSharedMemBankSizeEightByte);
-#else
- setGpuSharedMemConfig(cudaSharedMemBankSizeEightByte);
-#endif
-
- LaunchKernels<LhsScalar, RhsScalar, Index, LhsMapper, RhsMapper, OutputMapper>::Run(lhs, rhs, output, m, n, k, this->m_device);
- }
-};
-
-} // end namespace Eigen
-
-#endif // EIGEN_USE_GPU and EIGEN_GPUCC
-#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionMapper.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionMapper.h
deleted file mode 100644
index 9ab900b..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionMapper.h
+++ /dev/null
@@ -1,575 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H
-#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H
-
-namespace Eigen {
-
-namespace internal {
-
-enum {
- Rhs = 0,
- Lhs = 1
-};
-
-/*
- * Implementation of the Eigen blas_data_mapper class for tensors.
- */
-/// The make-pointer class is used by SYCL in order to build the mapper class on the device. For other platforms the
-/// default MakePointer is used, which is scalar* for CoeffLoader.
-template <typename Tensor, bool HasRawAccess, template <class> class MakePointer_ = MakePointer>
-struct CoeffLoader;
-
-template <typename Scalar, typename Index, int side, typename Tensor,
- typename nocontract_t, typename contract_t, int packet_size,
- bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment,
- template <class> class MakePointer_ = MakePointer>
-class BaseTensorContractionMapper;
-
-template <typename Tensor, bool HasRawAccess, template <class> class MakePointer_>
-struct CoeffLoader {
- enum {
- DirectOffsets = false
- };
-
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_tensor(tensor) { }
-
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index) {
- eigen_assert(false && "unsupported");
- }
-
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename MakePointer_<const typename Tensor::Scalar>::Type
- data() const {
- eigen_assert(false && "unsupported");
- return NULL;
- }
-
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return m_tensor.coeff(index); }
-
- template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- typename Tensor::PacketReturnType packet(typename Tensor::Index index) const
- {
- return m_tensor.template packet<LoadMode>(index);
- }
-
- #ifdef EIGEN_USE_SYCL
-  // The placeholder accessors must be bound to a command group handler for SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
- m_tensor.bind(cgh);
- }
- #endif
-
- private:
- const Tensor m_tensor;
-};
-
-template <typename Tensor, template <class> class MakePointer_>
-struct CoeffLoader<Tensor, true, MakePointer_> {
- enum {
- DirectOffsets = true
- };
-
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_data(tensor.data()) {}
-
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) {
- m_data += offset;
- }
-
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename MakePointer_<const typename Tensor::Scalar>::Type
- data() const {
- return m_data;
- }
-
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return loadConstant(m_data+index); }
-
- template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- typename Tensor::PacketReturnType packet(typename Tensor::Index index) const
- {
- return internal::ploadt_ro<typename Tensor::PacketReturnType, LoadMode>(m_data + index);
- }
-
- #ifdef EIGEN_USE_SYCL
-  // The placeholder accessors must be bound to a command group handler for SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
- m_data.bind(cgh);
- }
- #endif
- private:
- typedef typename Tensor::Scalar Scalar;
-
- typename MakePointer_<const Scalar>::Type m_data;
-};
-
-template<typename Scalar, typename Index, int side,
- typename Tensor,
- typename nocontract_t, typename contract_t,
- int packet_size, bool inner_dim_contiguous, int Alignment, template <class> class MakePointer_ = MakePointer>
-class SimpleTensorContractionMapper {
- public:
- EIGEN_DEVICE_FUNC
- SimpleTensorContractionMapper(const Tensor& tensor,
- const nocontract_t& nocontract_strides,
- const nocontract_t& ij_strides,
- const contract_t& contract_strides,
- const contract_t& k_strides) :
- m_tensor(tensor),
- m_nocontract_strides(nocontract_strides),
- m_ij_strides(ij_strides),
- m_contract_strides(contract_strides),
- m_k_strides(k_strides) { }
-
- enum {
- DirectOffsets = CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>::DirectOffsets
- };
-
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) {
- m_tensor.offsetBuffer(offset);
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE void prefetch(Index /*i*/) { }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Scalar operator()(Index row) const {
- // column major assumption
- return operator()(row, 0);
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Scalar operator()(Index row, Index col) const {
- return m_tensor.coeff(computeIndex(row, col));
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const {
- const bool left = (side == Lhs);
- EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963
- Index nocontract_val = left ? row : col;
- Index linidx = 0;
- EIGEN_UNROLL_LOOP
- for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
- const Index idx = nocontract_val / m_ij_strides[i];
- linidx += idx * m_nocontract_strides[i];
- nocontract_val -= idx * m_ij_strides[i];
- }
- if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
- if (side == Lhs && inner_dim_contiguous) {
- eigen_assert(m_nocontract_strides[0] == 1);
- linidx += nocontract_val;
- } else {
- linidx += nocontract_val * m_nocontract_strides[0];
- }
- }
-
- Index contract_val = left ? col : row;
- if(array_size<contract_t>::value > 0) {
- EIGEN_UNROLL_LOOP
- for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
- const Index idx = contract_val / m_k_strides[i];
- linidx += idx * m_contract_strides[i];
- contract_val -= idx * m_k_strides[i];
- }
-
- if (side == Rhs && inner_dim_contiguous) {
- eigen_assert(m_contract_strides[0] == 1);
- linidx += contract_val;
- } else {
- linidx += contract_val * m_contract_strides[0];
- }
- }
-
- return linidx;
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE IndexPair<Index> computeIndexPair(Index row, Index col, const Index distance) const {
- const bool left = (side == Lhs);
- EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963
- Index nocontract_val[2] = {left ? row : col, left ? row + distance : col};
- Index linidx[2] = {0, 0};
- if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
- EIGEN_UNROLL_LOOP
- for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
- const Index idx0 = nocontract_val[0] / m_ij_strides[i];
- const Index idx1 = nocontract_val[1] / m_ij_strides[i];
- linidx[0] += idx0 * m_nocontract_strides[i];
- linidx[1] += idx1 * m_nocontract_strides[i];
- nocontract_val[0] -= idx0 * m_ij_strides[i];
- nocontract_val[1] -= idx1 * m_ij_strides[i];
- }
- if (side == Lhs && inner_dim_contiguous) {
- eigen_assert(m_nocontract_strides[0] == 1);
- linidx[0] += nocontract_val[0];
- linidx[1] += nocontract_val[1];
- } else {
- linidx[0] += nocontract_val[0] * m_nocontract_strides[0];
- linidx[1] += nocontract_val[1] * m_nocontract_strides[0];
- }
- }
-
- Index contract_val[2] = {left ? col : row, left ? col : row + distance};
- if (array_size<contract_t>::value> 0) {
- EIGEN_UNROLL_LOOP
- for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
- const Index idx0 = contract_val[0] / m_k_strides[i];
- const Index idx1 = contract_val[1] / m_k_strides[i];
- linidx[0] += idx0 * m_contract_strides[i];
- linidx[1] += idx1 * m_contract_strides[i];
- contract_val[0] -= idx0 * m_k_strides[i];
- contract_val[1] -= idx1 * m_k_strides[i];
- }
-
- if (side == Rhs && inner_dim_contiguous) {
- eigen_assert(m_contract_strides[0] == 1);
- linidx[0] += contract_val[0];
- linidx[1] += contract_val[1];
- } else {
- linidx[0] += contract_val[0] * m_contract_strides[0];
- linidx[1] += contract_val[1] * m_contract_strides[0];
- }
- }
- return IndexPair<Index>(linidx[0], linidx[1]);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index firstAligned(Index size) const {
-    // Only claim alignment when we can compute the actual stride (i.e. when we're
-    // dealing with the lhs with inner_dim_contiguous). This is because the
-    // matrix-vector product relies on the stride when dealing with aligned inputs.
- return (Alignment == Aligned) && (side == Lhs) && inner_dim_contiguous ? 0 : size;
- }
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index stride() const {
- return ((side == Lhs) && inner_dim_contiguous && array_size<contract_t>::value > 0) ? m_contract_strides[0] : 1;
- }
-
- #ifdef EIGEN_USE_SYCL
-  // The placeholder accessors must be bound to a command group handler for SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
- m_tensor.bind(cgh);
- }
- #endif
-
- const CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>& tensor() const {
- return m_tensor;
- }
-
- const nocontract_t& nocontract_strides() const {
- return m_nocontract_strides;
- }
- const nocontract_t& ij_strides() const { return m_ij_strides; }
- const contract_t& contract_strides() const { return m_contract_strides; }
- const contract_t& k_strides() const { return m_k_strides; }
-
- protected:
- CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_> m_tensor;
- const nocontract_t m_nocontract_strides;
- const nocontract_t m_ij_strides;
- const contract_t m_contract_strides;
- const contract_t m_k_strides;
-};
-
-template<typename Scalar, typename Index, int side,
- typename Tensor,
- typename nocontract_t, typename contract_t,
- int packet_size, bool inner_dim_contiguous,
- bool inner_dim_reordered, int Alignment, template <class> class MakePointer_>
-class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment, MakePointer_>
-{
- public:
- typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment, MakePointer_> ParentMapper;
-
- EIGEN_DEVICE_FUNC
- BaseTensorContractionMapper(const Tensor& tensor,
- const nocontract_t& nocontract_strides,
- const nocontract_t& ij_strides,
- const contract_t& contract_strides,
- const contract_t& k_strides) :
- ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
-
- template <typename PacketT,int AlignmentType>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- typename internal::enable_if<internal::unpacket_traits<PacketT>::size==packet_size,PacketT>::type
- load(Index i, Index j) const
- {
-    // the whole method makes a column-major assumption
-
-    // no need to add offsets for now (because operator() handles that)
-    // the current code assumes the packet size is a multiple of 2
- EIGEN_STATIC_ASSERT(packet_size % 2 == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
-
- if (Tensor::PacketAccess && inner_dim_contiguous && !inner_dim_reordered) {
- const Index index = this->computeIndex(i, j);
- eigen_assert(this->computeIndex(i+packet_size-1, j) == index + packet_size-1);
- return this->m_tensor.template packet<AlignmentType>(index);
- }
-
- const IndexPair<Index> indexPair = this->computeIndexPair(i, j, packet_size - 1);
- const Index first = indexPair.first;
- const Index lastIdx = indexPair.second;
-
-    // We can always do optimized packet reads from the left hand side right now, because
-    // the vertical matrix dimension on the left hand side is never contracting.
-    // On the right hand side we need to check whether the contracting dimensions may have
-    // been shuffled first.
- if (Tensor::PacketAccess &&
- (side == Lhs || internal::array_size<contract_t>::value <= 1 || !inner_dim_reordered) &&
- (lastIdx - first) == (packet_size - 1)) {
-
- return this->m_tensor.template packet<AlignmentType>(first);
- }
-
- EIGEN_ALIGN_MAX Scalar data[packet_size];
-
- data[0] = this->m_tensor.coeff(first);
- EIGEN_UNROLL_LOOP
- for (Index k = 1; k < packet_size - 1; k += 2) {
- const IndexPair<Index> internal_pair = this->computeIndexPair(i + k, j, 1);
- data[k] = this->m_tensor.coeff(internal_pair.first);
- data[k + 1] = this->m_tensor.coeff(internal_pair.second);
- }
- data[packet_size - 1] = this->m_tensor.coeff(lastIdx);
-
- return pload<PacketT>(data);
- }
-
- template <typename PacketT,int AlignmentType>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- typename internal::enable_if<internal::unpacket_traits<PacketT>::size!=packet_size,PacketT>::type
- load(Index i, Index j) const
- {
- const Index requested_packet_size = internal::unpacket_traits<PacketT>::size;
- EIGEN_ALIGN_MAX Scalar data[requested_packet_size];
-
- const IndexPair<Index> indexPair = this->computeIndexPair(i, j, requested_packet_size - 1);
- const Index first = indexPair.first;
- const Index lastIdx = indexPair.second;
-
- data[0] = this->m_tensor.coeff(first);
- for (Index k = 1; k < requested_packet_size - 1; k += 2) {
- const IndexPair<Index> internal_pair = this->computeIndexPair(i + k, j, 1);
- data[k] = this->m_tensor.coeff(internal_pair.first);
- data[k + 1] = this->m_tensor.coeff(internal_pair.second);
- }
- data[requested_packet_size - 1] = this->m_tensor.coeff(lastIdx);
-
- return pload<PacketT>(data);
- }
-
- template <typename PacketT,int AlignmentType>
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const {
- return this->load<PacketT,AlignmentType>(i,j);
- }
-};
-
-
-template<typename Scalar, typename Index, int side,
- typename Tensor,
- typename nocontract_t, typename contract_t,
- bool inner_dim_contiguous,
- bool inner_dim_reordered, int Alignment, template <class> class MakePointer_>
-class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_>
- : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment, MakePointer_>
-{
- public:
- typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment, MakePointer_> ParentMapper;
-
- EIGEN_DEVICE_FUNC
- BaseTensorContractionMapper(const Tensor& tensor,
- const nocontract_t& nocontract_strides,
- const nocontract_t& ij_strides,
- const contract_t& contract_strides,
- const contract_t& k_strides) :
- ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
-
- template <typename PacketT,int> EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const {
- EIGEN_ALIGN_MAX Scalar data[1];
- data[0] = this->m_tensor.coeff(this->computeIndex(i, j));
- return pload<PacketT>(data);
- }
- template <typename PacketT,int> EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE PacketT load(Index i, Index j) const {
- EIGEN_ALIGN_MAX Scalar data[1];
- data[0] = this->m_tensor.coeff(this->computeIndex(i, j));
- return pload<PacketT>(data);
- }
-};
-
-
-template<typename Scalar, typename Index, int side,
- typename Tensor,
- typename nocontract_t, typename contract_t,
- int packet_size,
- bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, template <class> class MakePointer_=MakePointer>
-class TensorContractionSubMapper {
- public:
-
- typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> ParentMapper;
- typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> Self;
- typedef Self LinearMapper;
-
- enum {
-    // We can use direct offsets iff the parent mapper supports them and we can compute the strides.
- // TODO: we should also enable direct offsets for the Rhs case.
- UseDirectOffsets = ParentMapper::DirectOffsets && (side == Lhs) && inner_dim_contiguous && (array_size<contract_t>::value > 0)
- };
-
- EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset)
- : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) {
- // Bake the offsets into the buffer used by the base mapper whenever possible. This avoids the need to recompute
- // this offset every time we attempt to access a coefficient.
- if (UseDirectOffsets) {
- Index stride = m_base_mapper.stride();
- m_base_mapper.offsetBuffer(vert_offset + horiz_offset * stride);
- }
- }
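-  // Illustrative arithmetic for the offset baking above: with column-major storage the
-  // coefficient at (vert_offset + i, horiz_offset) lives at linear offset
-  //   (vert_offset + i) + horiz_offset * stride,
-  // so adding vert_offset + horiz_offset * stride to the buffer once lets every later access
-  // use the plain index i. E.g. assumed values vert_offset = 8, horiz_offset = 2 and
-  // stride = 128 bake in a fixed offset of 8 + 2 * 128 = 264 scalars.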
-
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const {
- if (UseDirectOffsets) {
- return m_base_mapper(i, 0);
- }
- return m_base_mapper(i + m_vert_offset, m_horiz_offset);
- }
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const {
- if (UseDirectOffsets) {
- return m_base_mapper(i, j);
- }
- return m_base_mapper(i + m_vert_offset, j + m_horiz_offset);
- }
-
- template <typename PacketT>
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i) const {
- if (UseDirectOffsets) {
- return m_base_mapper.template loadPacket<PacketT,Alignment>(i, 0);
- }
- return m_base_mapper.template loadPacket<PacketT,Alignment>(i + m_vert_offset, m_horiz_offset);
- }
-
- template <typename PacketT>
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i, Index j) const {
- if (UseDirectOffsets) {
- return m_base_mapper.template loadPacket<PacketT,Alignment>(i, j);
- }
- return m_base_mapper.template loadPacket<PacketT,Alignment>(i + m_vert_offset, j + m_horiz_offset);
- }
-
- template <typename PacketT, int AlignmentType>
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i, Index j) const {
- if (UseDirectOffsets) {
- return m_base_mapper.template load<PacketT,AlignmentType>(i, j);
- }
- return m_base_mapper.template loadPacket<PacketT,AlignmentType>(i + m_vert_offset, j + m_horiz_offset);
- }
-
- template <typename PacketT>
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketT& p) const {
-    if (UseDirectOffsets) {
-      m_base_mapper.storePacket(i, 0, p);
-      return;
-    }
- m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
- if (UseDirectOffsets) {
- return LinearMapper(m_base_mapper, i, j);
- }
- return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset);
- }
-
- template <typename PacketT, int AlignmentType>
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const {
- EIGEN_STATIC_ASSERT((internal::is_same<PacketT, PacketT>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
- const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ? Aligned : Unaligned;
- if (UseDirectOffsets) {
- return m_base_mapper.template loadPacket<PacketT,ActualAlignment>(i, 0);
- }
- return m_base_mapper.template loadPacket<PacketT,ActualAlignment>(i + m_vert_offset, m_horiz_offset);
- }
-
- template <typename PacketT>
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool aligned(Index) const {
- return false;
- }
-
- #ifdef EIGEN_USE_SYCL
-  // The placeholder accessors must be bound to a command group handler for SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
- m_base_mapper.bind(cgh);
- }
- #endif
-
- const ParentMapper& base_mapper() const { return m_base_mapper; }
- Index vert_offset() const { return m_vert_offset; }
- Index horiz_offset() const { return m_horiz_offset; }
-
- private:
- ParentMapper m_base_mapper;
- const Index m_vert_offset;
- const Index m_horiz_offset;
-};
-
-
-template<typename Scalar_, typename Index, int side,
- typename Tensor,
- typename nocontract_t, typename contract_t,
- int packet_size,
- bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, template <class> class MakePointer_=MakePointer>
-class TensorContractionInputMapper
- : public BaseTensorContractionMapper<Scalar_, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> {
-
- public:
- typedef Scalar_ Scalar;
- typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> Base;
- typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> SubMapper;
- typedef SubMapper VectorMapper;
-
- EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor,
- const nocontract_t& nocontract_strides,
- const nocontract_t& ij_strides,
- const contract_t& contract_strides,
- const contract_t& k_strides)
- : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const {
- return SubMapper(*this, i, j);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const {
- return VectorMapper(*this, i, j);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>& get_tensor() const {
- return Base::m_tensor;
- }
-};
-
-
-template <typename T> struct TensorContractionInputMapperTrait;
-
-template<typename Scalar_, typename Index_, int side_,
- typename Tensor_,
- typename nocontract_t_, typename contract_t_,
- int packet_size_,
- bool inner_dim_contiguous_, bool inner_dim_reordered_, int Alignment_, template <class> class MakePointer_>
-struct TensorContractionInputMapperTrait<TensorContractionInputMapper<Scalar_, Index_, side_, Tensor_,
- nocontract_t_, contract_t_, packet_size_, inner_dim_contiguous_,
- inner_dim_reordered_, Alignment_, MakePointer_> > {
-
- typedef Tensor_ XprType;
- static const bool inner_dim_contiguous = inner_dim_contiguous_;
- static const bool inner_dim_reordered = inner_dim_reordered_;
- };
-
-
-} // end namespace internal
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionSycl.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionSycl.h
deleted file mode 100755
index 473c228..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionSycl.h
+++ /dev/null
@@ -1,1650 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library for linear algebra.
-//
-// Mehdi Goli Codeplay Software Ltd.
-// Ralph Potter Codeplay Software Ltd.
-// Luke Iwanski Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla Public License v. 2.0. If a copy of the MPL was not
-// distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/*****************************************************************
- * TensorContractionSycl.h
- *
- * \brief:
- * TensorContractionSycl.h provides various tensor contraction kernels for the SYCL backend
- *
- *****************************************************************/
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
-#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
-
-namespace Eigen {
-
-namespace TensorSycl {
-namespace internal {
-
-#ifndef EIGEN_SYCL_DISABLE_GEMV
-/*!
- * \brief TVPanelSize, a template class used for setting the panel size required for launching the general
- * tensor-vector contraction kernel on various hardware devices.
- *
- * \tparam Scalar: determines the element type of the tensor/vector
- *
- * \tparam StorageIndex: determines the Index type.
- *
- * \tparam NCWindow: determines the number of non-contracting elements to be processed by each work-group
- *
- * \tparam CFactor: determines the number of contracting elements to be processed by each thread
- *
- * \tparam NCFactor: determines the number of non-contracting elements to be processed by each thread
- */
-template <typename Scalar, typename StorageIndex, StorageIndex NCWindow, StorageIndex CFactor, StorageIndex NCFactor>
-struct TVPanelSize {
-  // LocalThreadSizeC: determines the total number of threads per work-group for the contracting dimension
- static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeC = EIGEN_SYCL_LOCAL_THREAD_DIM0;
-  // LocalThreadSizeNC: determines the total number of threads per work-group for the non-contracting dimension
- static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC = EIGEN_SYCL_LOCAL_THREAD_DIM1;
- // TileSizeDimNC: determines the tile size for the non-contracting dimension
- static EIGEN_CONSTEXPR StorageIndex TileSizeDimNC = NCWindow / NCFactor;
- // TileSizeDimC: determines the tile size for the contracting dimension
- static EIGEN_CONSTEXPR StorageIndex TileSizeDimC = CFactor * LocalThreadSizeNC * LocalThreadSizeC;
- // WorkLoadPerThreadNC : determines workload per thread for loading the non-contracting dimension
- static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC = TileSizeDimNC / LocalThreadSizeNC;
-  // WorkLoadPerThreadC: determines the workload per thread for loading the contracting dimension
- static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadC = TileSizeDimC / LocalThreadSizeC;
-  // BC: determines whether padding to avoid bank conflicts is required
- static EIGEN_CONSTEXPR bool BC = false;
-};
-#endif
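-// Illustrative instantiation of TVPanelSize, assuming EIGEN_SYCL_LOCAL_THREAD_DIM0 ==
-// EIGEN_SYCL_LOCAL_THREAD_DIM1 == 16 (a plausible but device-dependent configuration): with
-// Scalar = float, NCWindow = 64, CFactor = 4 and NCFactor = 2 the derived panel is
-//   TileSizeDimNC       = 64 / 2       = 32
-//   TileSizeDimC        = 4 * 16 * 16  = 1024
-//   WorkLoadPerThreadNC = 32 / 16      = 2
-//   WorkLoadPerThreadC  = 1024 / 16    = 64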
-
-/*!
- * \brief TTPanelSize, a template class used for setting the panel size required for launching the general
- * tensor-tensor contraction kernel on various hardware devices.
- *
- * \tparam Scalar: determines the element type of the tensor
- *
- * \tparam StorageIndex: determines the Index type.
- *
- * \tparam REG_SIZE_M: determines the workload per thread for loading the M dimension. This can be varied based on
- * the registers available on a chosen device (and can be controlled by the EIGEN_SYCL_REG_M macro).
- *
- * \tparam REG_SIZE_N: determines the workload per thread for loading the N dimension. This can be varied based on
- * the registers available on a chosen device (and can be controlled by the EIGEN_SYCL_REG_N macro).
- *
- * \tparam TSDK: determines the tile size for dimension K; it is assumed to account for the packet size.
- */
-
-template <typename Scalar, typename StorageIndex, StorageIndex REG_SIZE_M, StorageIndex REG_SIZE_N, StorageIndex TSDK>
-struct TTPanelSize {
-  // TileSizeDimK: determines the tile size for dimension K; it is assumed to account for the packet size
- static EIGEN_CONSTEXPR StorageIndex TileSizeDimK = TSDK;
-  // WorkLoadPerThreadM: determines the workload per thread for loading the M dimension. This can be varied based on
-  // the registers available on a chosen device (and can be controlled by the EIGEN_SYCL_REG_M macro).
-#ifndef EIGEN_SYCL_REG_M
- static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = REG_SIZE_M;
-#else
- static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = EIGEN_SYCL_REG_M;
-#endif
-// WorkLoadPerThreadN: determines the workload per thread for loading the N dimension. This can be varied based on
-// the registers available on a chosen device (and can be controlled by the EIGEN_SYCL_REG_N macro).
-#ifndef EIGEN_SYCL_REG_N
- static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = REG_SIZE_N;
-#else
- static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = EIGEN_SYCL_REG_N;
-#endif
-  // LocalThreadSizeM: determines the total number of threads per work-group for the m dimension
- static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeM = EIGEN_SYCL_LOCAL_THREAD_DIM0;
-  // LocalThreadSizeN: determines the total number of threads per work-group for the n dimension
- static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeN = EIGEN_SYCL_LOCAL_THREAD_DIM1;
- // TileSizeDimM: determines the tile size for the m dimension
- static EIGEN_CONSTEXPR StorageIndex TileSizeDimM = LocalThreadSizeM * WorkLoadPerThreadM;
- // TileSizeDimN: determines the tile size for the n dimension
- static EIGEN_CONSTEXPR StorageIndex TileSizeDimN = LocalThreadSizeN * WorkLoadPerThreadN;
-  // LoadPerThreadLhs: determines the workload per thread for loading the Lhs tensor. This must be divisible by the packet size
- static EIGEN_CONSTEXPR StorageIndex LoadPerThreadLhs =
- ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimN));
-  // LoadPerThreadRhs: determines the workload per thread for loading the Rhs tensor. This must be divisible by the packet size
- static EIGEN_CONSTEXPR StorageIndex LoadPerThreadRhs =
- ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimM));
-  // BC: determines whether padding to avoid bank conflicts is required
- static EIGEN_CONSTEXPR bool BC = true;
-  // DoubleBuffer: determines whether the double-buffering technique should be used (this can be disabled via the
-  // EIGEN_SYCL_DISABLE_DOUBLE_BUFFER macro when the device does not have sufficient local memory)
- static EIGEN_CONSTEXPR bool DoubleBuffer =
-#ifdef EIGEN_SYCL_DISABLE_DOUBLE_BUFFER
- false;
-#else
- true;
-#endif
-};
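-// Illustrative instantiation of TTPanelSize under the same assumed 16x16 work-group
-// (EIGEN_SYCL_LOCAL_THREAD_DIM0 == EIGEN_SYCL_LOCAL_THREAD_DIM1 == 16) with
-// REG_SIZE_M = REG_SIZE_N = 4 and TSDK = 16, and with neither EIGEN_SYCL_REG_M nor
-// EIGEN_SYCL_REG_N defined:
-//   TileSizeDimM     = 16 * 4            = 64
-//   TileSizeDimN     = 16 * 4            = 64
-//   LoadPerThreadLhs = (16 * 4 * 4) / 64 = 4
-//   LoadPerThreadRhs = (16 * 4 * 4) / 64 = 4
-// so each 256-thread work-group computes a 64x64 output tile, marching over K in steps of 16.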
-
-/*!
- * \brief contraction_type: an enum class representing the Tensor Contraction implementation algorithm. This is used to
- * specialize the contraction algorithm based on device support for dedicated local memory.
- */
-enum class contraction_type { local, no_local };
-/*!
- * \brief data_source: an enum class determining the location of the data in a memory hierarchy (global, local, private).
- */
-enum class data_source { global_mem, local_mem, private_mem };
-
-/*!
- * \brief read, a template function used for loading the data from global memory. This function is used to guarantee
- * coalesced and vectorized loads whenever possible.
- *
- * \tparam PacketLoad: determines whether each element of this tensor block should be loaded in packet mode
- *
- * \tparam is_coalesced_layout: determines whether the tensor data in memory can be accessed in a coalesced and
- * vectorized way when possible. Coalesced memory access is a key factor in kernel performance. When a tensor is 2d
- * and the contracting dimension is 1, it is always possible to access the tensor data in a coalesced and vectorized
- * way. This is the case when the RHS (right-hand side) tensor is transposed or when the LHS (left-hand side) tensor
- * is not transposed.
- *
- * \tparam PacketType: determines the type of packet
- *
- * \tparam TensorMapper: determines the input tensor mapper type
- *
- * \tparam StorageIndex: determines the Index type
- *
- * \param tensorMapper: the input tensor mapper
- *
- * \param NCIndex: the non-contracting dim index
- *
- * \param CIndex: the contracting dim index
- *
- * \param ld: the leading dimension of the flattened tensor
- */
-template <bool PacketLoad, bool is_coalesced_layout, bool, typename PacketType, typename TensorMapper,
- typename StorageIndex>
-static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<PacketLoad, PacketType>::type read(
- const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &ld) {
- const StorageIndex row = (is_coalesced_layout) ? NCIndex : CIndex;
- const StorageIndex col = (is_coalesced_layout) ? CIndex : NCIndex;
- return tensorMapper.get_tensor().template packet<Unaligned>(row + (col * ld));
-}
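-// Illustrative arithmetic for the vectorized read above: in the coalesced layout the
-// non-contracting index is the fast (row) index, so with assumed values ld = 1024,
-// NCIndex = 32 and CIndex = 5 the packet starts at element 32 + 5 * 1024 = 5152 and the
-// next PacketSize - 1 elements (5153, 5154, ...) are contiguous in memory, which is what
-// makes the packet<Unaligned> load legal.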
-
-/*!
- * \brief read, a special overload of the read function for when the read access is not vectorized
- *
- * \tparam PacketLoad: determines whether each element of this tensor block should be loaded in packet mode
- *
- * \tparam is_coalesced_layout: determines whether the tensor data in memory can be accessed in a coalesced and
- * vectorized way when possible. Coalesced memory access is a key factor in kernel performance. When a tensor is 2d
- * and the contracting dimension is 1, it is always possible to access the tensor data in a coalesced and vectorized
- * way. This is the case when the RHS (right-hand side) tensor is transposed or when the LHS (left-hand side) tensor
- * is not transposed.
- *
- * \tparam PacketType: determines the type of packet
- *
- * \tparam TensorMapper: determines the input tensor mapper type
- *
- * \tparam StorageIndex: determines the Index type
- *
- * \param tensorMapper: the input tensor mapper
- *
- * \param NCIndex: the non-contracting dim index
- *
- * \param CIndex: the contracting dim index
- */
-template <bool PacketLoad, bool, bool IsRhs, typename PacketType, typename TensorMapper, typename StorageIndex>
-static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<!PacketLoad, PacketType>::type read(
- const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &) {
- const StorageIndex row = (IsRhs) ? CIndex : NCIndex;
- const StorageIndex col = (IsRhs) ? NCIndex : CIndex;
- return tensorMapper(row, col);
-}
-
-/*!
- * \brief write, a template function used for storing the data to local memory. This function is used to guarantee
- * coalesced and vectorized stores whenever possible.
- *
- * \tparam StorageIndex: determines the Index type
- *
- * \tparam ld: the leading dimension of the local memory; ld is a compile-time value for the local memory
- *
- * \tparam dt: an enum value representing the location of the data in the memory hierarchy.
- *
- * \tparam PacketType: determines the type of packet
- *
- * \tparam DataScalar: determines the output data type
- *
- * \param packet_data: the data to be written to the local memory
- *
- * \param ptr: a pointer to the local memory
- */
-
-template <typename StorageIndex, StorageIndex ld, data_source dt, typename PacketType, typename DataScalar>
-static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- typename ::Eigen::internal::enable_if<dt != data_source::global_mem, void>::type
- write(PacketType &packet_data, DataScalar ptr) {
- EIGEN_CONSTEXPR int PacketSize = Eigen::internal::unpacket_traits<PacketType>::size;
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < PacketSize; i++) {
- *ptr = PacketWrapper<PacketType, PacketSize>::scalarize(i, packet_data);
- ptr += ld;
- }
-}
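-// Illustrative arithmetic for the local-memory write above: the packet is scalarized and
-// scattered with stride ld, so with PacketSize = 4 and an assumed ld = 65 (a 64-wide tile
-// plus one padding element when BC is set) the four lanes land at offsets 0, 65, 130 and
-// 195 from ptr; the padding element helps avoid local-memory bank conflicts.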
-
-/*!
- * \brief Overload of the write function for storing the data to global memory when vectorization is enabled. This
- * function is used to guarantee coalesced and vectorized stores whenever possible.
- *
- * \tparam dt: an enum value representing the location of the data in the memory hierarchy.
- *
- * \tparam PacketType: determines the type of packet
- *
- * \tparam DataScalar: determines the output data type
- *
- * \param packet_data: the data to be written in the local memory
- *
- * \param ptr: a pointer to the local memory
- */
-
-template <data_source dt, typename PacketType, typename DataScalar>
-static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<
- Eigen::internal::unpacket_traits<PacketType>::size != 1 && dt == data_source::global_mem, void>::type
-write(PacketType &packet_data, DataScalar *ptr) {
- ::Eigen::internal::pstoreu<DataScalar, PacketType>(ptr, packet_data);
-}
-
-/*!
- * \brief Overload of the write function for storing the data to global memory when vectorization is disabled.
- *
- * \tparam dt: an enum value representing the location of the data in the memory hierarchy.
- *
- * \tparam PacketType: determines the type of packet
- *
- * \tparam DataScalar: determines the output data type
- *
- * \param packet_data: the data to be written in the local memory
- *
- * \param ptr: a pointer to the local memory
- */
-template <data_source dt, typename PacketType, typename DataScalar>
-static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<
- Eigen::internal::unpacket_traits<PacketType>::size == 1 && dt == data_source::global_mem, void>::type
-write(PacketType &packet_data, DataScalar *ptr) {
- *ptr = packet_data;
-}
-
-/*!
- * \brief check_boundary: is used to check the edge condition for non-internal blocks.
- *
- * \tparam is_internal: determines if the block is internal
- */
-template <bool is_internal>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary(bool) {
- return true;
-}
-
-/*!
- * \brief check_boundary: specialization of the check_boundary for non-internal blocks.
- *
- * \param cond: true when the data is in range. Otherwise false
- */
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary<false>(bool cond) {
- return cond;
-}
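-// Usage sketch for check_boundary: callers pass the block kind as a compile-time flag,
-// e.g. check_boundary<is_internal_block>(m_in_range && n_in_range). For internal blocks the
-// call folds to true and the compiler removes the bounds branch entirely; for edge blocks
-// the runtime condition is kept. (See the store() method further below for an actual call
-// site.)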
-
-/*!
- * \brief BlockProperties is a template class that provides the different characteristics of the block of each tensor
- * processed by each workgroup.
- *
- * \tparam is_transposed: determines whether or not the block of the tensor is transposed
- *
- * \tparam packet_load_: determines whether each element of this tensor block should be loaded in packet mode
- *
- * \tparam PacketType: determines the type of packet
- *
- * \tparam OutType: determines the type of each element of this block of the tensor. If packet load is true, it will
- * be PacketType; otherwise it will be the scalar type
- *
- * \param elements_per_access: determines the number of elements accessed at a time, based on OutType
- *
- * \param is_coalesced_layout: determines whether the tensor data in memory can be accessed in a coalesced and
- * vectorized way when possible. Coalesced memory access is a key factor in kernel performance. When a tensor is 2d
- * and the contracting dimension is 1, it is always possible to access the tensor data in a coalesced and vectorized
- * way. This is the case when the RHS (right-hand side) tensor is transposed or when the LHS (left-hand side) tensor
- * is not transposed.
- *
- * \param nc_stride: determines the stride of the non-contracting dimension to access the next adjacent element within
- * the tensor block for each workgroup
- *
- * \param c_stride: determines the stride of the contracting dimension to access the next adjacent element within the
- * tensor block for each workgroup
- */
-template <bool is_transposed, bool is_rhs_, bool packet_load_, typename PacketType>
-struct BlockProperties {
- static EIGEN_CONSTEXPR bool packet_load = packet_load_;
- typedef typename Eigen::internal::unpacket_traits<PacketType>::type OutScalar;
- static EIGEN_CONSTEXPR bool is_rhs = is_rhs_;
- typedef typename Eigen::internal::conditional<packet_load, PacketType, OutScalar>::type OutType;
- static EIGEN_CONSTEXPR int elements_per_access = Eigen::internal::unpacket_traits<OutType>::size;
- static EIGEN_CONSTEXPR bool is_coalesced_layout = !(is_transposed ^ is_rhs);
- static EIGEN_CONSTEXPR int nc_stride = (is_coalesced_layout ? elements_per_access : 1);
- static EIGEN_CONSTEXPR int c_stride = (is_coalesced_layout ? 1 : elements_per_access);
-};
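-// Worked truth table for is_coalesced_layout = !(is_transposed ^ is_rhs): an untransposed
-// LHS (false, false) and a transposed RHS (true, true) are coalesced, so
-// nc_stride = elements_per_access and reads vectorize along the non-contracting dimension;
-// a transposed LHS (true, false) or an untransposed RHS (false, true) is not coalesced, so
-// c_stride = elements_per_access instead.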
-
-/*!
- * \brief ThreadProperties is a template class that provides each thread's properties within a workgroup. Please see
- * the sycl-1.2.1 specification (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for the definition of
- * work-groups and work-items.
- *
- * \tparam StorageIndex: determines the StorageIndex type
- *
- * \param linearLocalThreadId: determines the linearized location of a thread within a work-group
- *
- * \param kGroupId: determines the logical group id in the k dimension of the flattened tensor. It is non-zero only
- * when the tall/skinny algorithm is used
- *
- * \param mGroupOffset: determines the logical start position of all threads within a workgroup for the m dimension of
- * the flattened tensor.
- *
- * \param nGroupOffset: determines the logical start position of all threads within a workgroup for the n dimension of
- * the flattened tensor.
- *
- * \param kGroupOffset: determines the logical start position of all threads within a workgroup for the k dimension of
- * the flattened tensor. It is non-zero only when the tall/skinny algorithm is used.
- *
- * \param mLocalOffset: determines the logical start position of each thread within a workgroup for the m dimension of
- * a flattened tensor. The position determines the distance of the threads within the workgroup from each other,
- * independent of their global position.
- *
- * \param nLocalOffset: determines the logical start position of each thread within a workgroup for the n dimension of
- * a flattened tensor. The position determines the distance of the threads within the workgroup from each other,
- * independent of their global position.
- *
- * \param mGlobalOffset: determines the logical start position of each thread for the m dimension on the
- * flattened tensor
- *
- * \param nGlobalOffset: determines the logical start position of each thread for the n dimension on the
- * flattened tensor
- *
- * \param kSize: determines the number of k elements of the flattened tensor to be processed by each thread for the
- * given tensor block. This differs from the K dimension of the flattened tensor when the tall/skinny algorithm is
- * used.
- *
- * \param is_internal: determines whether the thread within the work-group computes an internal block of the tensor or
- * an edge block. When the block is internal, there is no need to check the boundaries and all the if statements can
- * be resolved by the compiler.
- */
-template <typename StorageIndex>
-struct ThreadProperties {
- const StorageIndex linearLocalThreadId;
- const StorageIndex kGroupId;
- const StorageIndex mGroupOffset;
- const StorageIndex nGroupOffset;
- const StorageIndex kGroupOffset;
- const StorageIndex mLocalOffset;
- const StorageIndex nLocalOffset;
- const StorageIndex mGlobalOffset;
- const StorageIndex nGlobalOffset;
- StorageIndex kSize;
- const bool is_internal;
- // this is used to adjust the last block
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ThreadProperties(
- const StorageIndex linearLocalThreadId_, const StorageIndex kGroupId_, const StorageIndex mGroupOffset_,
- const StorageIndex nGroupOffset_, const StorageIndex kGroupOffset_, const StorageIndex mLocalOffset_,
- const StorageIndex nLocalOffset_, const StorageIndex mGlobalOffset_, const StorageIndex nGlobalOffset_,
- StorageIndex kSize_, const bool is_internal_)
- : linearLocalThreadId(linearLocalThreadId_),
- kGroupId(kGroupId_),
- mGroupOffset(mGroupOffset_),
- nGroupOffset(nGroupOffset_),
- kGroupOffset(kGroupOffset_),
- mLocalOffset(mLocalOffset_),
- nLocalOffset(nLocalOffset_),
- mGlobalOffset(mGlobalOffset_),
- nGlobalOffset(nGlobalOffset_),
- kSize(kSize_),
- is_internal(is_internal_) {}
-};
-
-/*!
- * \brief TensorContractionKernel is a template class that provides the tensor-tensor contraction operation.
- *
- * \tparam OutScalar: determines the output scalar type
- *
- * \tparam LhsScalar: determines the left-hand-side scalar type
- *
- * \tparam RhsScalar: determines the right-hand-side scalar type
- *
- * \tparam OutAccessor: determines the sycl accessor type for the output (please see the sycl-1.2.1 specification
- (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition)
- *
- * \tparam LhsMapper: determines the tensor contraction mapper type for the left-hand-side matrix
- *
- * \tparam RhsMapper: determines the tensor contraction mapper type for the right-hand-side matrix
- *
- * \tparam StorageIndex: determines the StorageIndex Type
- *
- * \tparam Properties: determines the Contraction Panel properties
- *
- * \tparam TripleDim: determines the M, K, N dimensions for the flattened tensors in order to treat them as matrices
- *
- * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression.
- *
- * \tparam input_mapper_properties: determines whether the input tensors are matrices. If they are, a special memory
- * access pattern is used to guarantee that the memory accesses are always coalesced.
- *
- * \tparam IsFinal: determines whether this is the final kernel. If so, the result is written to the final output.
- * Otherwise, the result of the contraction is written to a temporary buffer. This is the case when the tall/skinny
- * contraction is used, in which case a final reduction step is required to compute the final output.
-
- * \tparam contraction_tp: an enum value representing whether the local-memory or no-local-memory implementation of
- * the algorithm is to be used
- *
- * \param scratch: local memory containing tiles of LHS and RHS tensors for each work-group
- *
- * \param lhs: determines the left-hand-side flattened tensor (tensor mapper)
- *
- * \param rhs: determines the right-hand-side flattened tensor (tensor mapper)
- *
- * \param out_res: determines the output tensor containing the contraction result
- *
- * \param groupSizeM: a logical number determining the number of work-group for m dimension
- *
- * \param groupSizeN: a logical number determining the number of work-group for n dimension
- *
- * \param numTiles: determines total number of tiles on the k dimension
- *
- * \param TripleDim: determines the M, K, N dimensions for the flattened tensors in order to treat them as matrices
- */
-template <typename OutScalar, typename LhsScalar, typename RhsScalar, typename OutAccessor, typename LhsMapper,
- typename RhsMapper, typename StorageIndex, typename Properties, typename TripleDim, bool Vectorizable,
- typename input_mapper_properties, bool IsFinal, contraction_type contraction_tp>
-class TensorContractionKernel {
- public:
- typedef typename Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketReturnType
- PacketReturnType;
- static EIGEN_CONSTEXPR int PacketSize =
- Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketSize;
- static EIGEN_CONSTEXPR bool is_lhs_transposed =
- !::Eigen::internal::TensorContractionInputMapperTrait<LhsMapper>::inner_dim_contiguous;
- static EIGEN_CONSTEXPR bool is_rhs_transposed =
- !::Eigen::internal::TensorContractionInputMapperTrait<RhsMapper>::inner_dim_contiguous;
-
- typedef BlockProperties<is_lhs_transposed, false, input_mapper_properties::is_lhs_matrix && Vectorizable,
- PacketReturnType>
- LHSBlockProperties;
-
- typedef BlockProperties<is_rhs_transposed, true, input_mapper_properties::is_rhs_matrix && Vectorizable,
- PacketReturnType>
- RHSBlockProperties;
-
- static EIGEN_CONSTEXPR StorageIndex NStride =
- contraction_tp == contraction_type::local ? Properties::WorkLoadPerThreadN : RHSBlockProperties::nc_stride;
-
- typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
- typedef cl::sycl::multi_ptr<OutScalar, cl::sycl::access::address_space::local_space> local_ptr;
- typedef OutScalar * /*cl::sycl::multi_ptr<OutScalar, cl::sycl::access::address_space::private_space>*/ private_ptr;
- typedef
- typename ::Eigen::internal::conditional<contraction_tp == contraction_type::local, local_ptr, private_ptr>::type
- tile_ptr;
- static EIGEN_CONSTEXPR StorageIndex LSDL = contraction_tp == contraction_type::local
- ? Properties::TileSizeDimM + Properties::BC
- : Properties::WorkLoadPerThreadM;
- static EIGEN_CONSTEXPR StorageIndex LSDR = contraction_tp == contraction_type::local
- ? Properties::TileSizeDimN + Properties::BC
- : Properties::WorkLoadPerThreadN;
- static EIGEN_CONSTEXPR StorageIndex LocalOffset = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN;
-
- /**
-   * \brief MemHolder is a placeholder struct for creating the memory hierarchy in SYCL. Inside a SYCL kernel it is
-   * not allowed to have dynamic memory allocation. While the local memory is created outside of the kernel and passed
-   * to the kernel as an accessor, the private memory can only be allocated statically. Since we are abstracting the
-   * TiledMemory for both local and private memory, the MemHolder struct is used as a helper to abstract out the
-   * different types of memory needed when the local/no_local memory computation is called.
-   *
-   * \tparam contraction_type: an enum value representing whether the local-memory or no-local-memory implementation
-   * of the algorithm is to be used
-   * \tparam MemSize: the private memory size
-   * \param ptr: the tile memory pointer type
- */
- template <contraction_type, StorageIndex>
- struct MemHolder {
- tile_ptr ptr;
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MemHolder(local_ptr block_start_ptr) : ptr(block_start_ptr) {}
- };
- /**
-   * \brief specialization of the MemHolder class when the no-local-memory kernel is used.
- */
- template <StorageIndex MemSize>
- struct MemHolder<contraction_type::no_local, MemSize> {
- OutScalar ptr[MemSize] = {OutScalar{0}};
- };
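-  // Illustrative use of the two MemHolder forms: in the local-memory path the struct merely
-  // wraps a pointer into the shared scratch accessor, while in the no_local path
-  //   MemHolder<contraction_type::no_local, 64> regs;
-  // embeds a zero-initialized OutScalar regs.ptr[64] directly in each thread's private
-  // (register) memory, which is why the size must be a compile-time constant.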
- /**
-   * \brief TiledMemory: contains the required memory pointers for loading each tile of the TensorContraction panel
-   * from global memory to local/private memory when the local/no_local algorithm is used.
- *
- * \param lhs_scratch_extract : determines the LHS tile memory. It is either private or local memory based on the
- * selected contraction_type.
- *
- * \param rhs_scratch_extract : determines the RHS tile memory. It is either private or local memory based on the
- * selected contraction_type.
- *
-   * \param lhs_extract_index: determines the position of each thread in local memory for the lhs input. When private
-   * memory is used this is set to zero, as it is not applicable in the private-memory case.
-   *
-   * \param rhs_extract_index: determines the position of each thread in local memory for the rhs input. When private
-   * memory is used this is set to zero, as it is not applicable in the private-memory case.
- *
- * \param lhs_scratch_compute : determines the location to load for computation for lhs_local memory. This is the
- * same as lhs_scratch_extract for private memory.
- *
- * \param rhs_scratch_compute : determines the location to load for computation for rhs_local memory. This is the
- * same as rhs_scratch_extract for private memory.
- */
- struct TiledMemory {
- MemHolder<contraction_tp, Properties::WorkLoadPerThreadM * Properties::TileSizeDimK> lhs_scratch_extract;
- MemHolder<contraction_tp, Properties::WorkLoadPerThreadN * Properties::TileSizeDimK> rhs_scratch_extract;
- tile_ptr lhs_scratch_ptr_compute;
- tile_ptr rhs_scratch_ptr_compute;
- const std::pair<StorageIndex, StorageIndex> lhs_extract_index;
- const std::pair<StorageIndex, StorageIndex> rhs_extract_index;
- template <contraction_type tp = contraction_tp>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- TiledMemory(const ThreadProperties<StorageIndex> &, local_ptr,
- typename ::Eigen::internal::enable_if<tp == contraction_type::no_local>::type * = 0)
- : lhs_scratch_extract{},
- rhs_scratch_extract{},
- lhs_scratch_ptr_compute(lhs_scratch_extract.ptr),
- rhs_scratch_ptr_compute(rhs_scratch_extract.ptr),
- lhs_extract_index(std::pair<StorageIndex, StorageIndex>(StorageIndex{0}, StorageIndex{0})),
- rhs_extract_index(std::pair<StorageIndex, StorageIndex>(StorageIndex{0}, StorageIndex{0})) {}
-
- template <contraction_type tp = contraction_tp>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- TiledMemory(const ThreadProperties<StorageIndex> &thread_properties, local_ptr block_start_ptr,
- typename ::Eigen::internal::enable_if<tp == contraction_type::local>::type * = 0)
- : lhs_scratch_extract{block_start_ptr},
- rhs_scratch_extract{lhs_scratch_extract.ptr +
- ((Properties::DoubleBuffer + 1) * LSDL * Properties::TileSizeDimK)},
- lhs_scratch_ptr_compute(lhs_scratch_extract.ptr + thread_properties.mLocalOffset),
- rhs_scratch_ptr_compute(rhs_scratch_extract.ptr + thread_properties.nLocalOffset),
- lhs_extract_index(
- local_id_extract<LHSBlockProperties, Properties::TileSizeDimM>(thread_properties.linearLocalThreadId)),
- rhs_extract_index(
- local_id_extract<RHSBlockProperties, Properties::TileSizeDimN>(thread_properties.linearLocalThreadId)) {}
- };
-
- Scratch scratch;
- const LhsMapper lhs;
- const RhsMapper rhs;
- OutAccessor out_res;
- const StorageIndex groupSizeM;
- const StorageIndex groupSizeN;
- const StorageIndex numTiles;
- const TripleDim triple_dim;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_,
- const RhsMapper rhs_, OutAccessor out_res_,
- const StorageIndex groupSizeM_,
- const StorageIndex groupSizeN_,
- const StorageIndex numTiles_,
- const TripleDim triple_dim_)
- : scratch(scratch_),
- lhs(lhs_),
- rhs(rhs_),
- out_res(out_res_),
- groupSizeM(groupSizeM_),
- groupSizeN(groupSizeN_),
- numTiles(numTiles_),
- triple_dim(triple_dim_) {}
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_,
- const RhsMapper rhs_, OutAccessor out_res_,
- const StorageIndex groupSizeM_,
- const StorageIndex numTiles_,
- const TripleDim triple_dim_)
- : TensorContractionKernel(scratch_, lhs_, rhs_, out_res_, groupSizeM_, 1, numTiles_, triple_dim_) {}
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
- const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
- const StorageIndex nLocalThreadId = linearLocalThreadId / Properties::LocalThreadSizeM;
- const StorageIndex mLocalThreadId = linearLocalThreadId % Properties::LocalThreadSizeM;
- const StorageIndex mGroupId = itemID.get_group(0) % groupSizeM;
- const StorageIndex tmp = itemID.get_group(0) / groupSizeM;
- const StorageIndex nGroupId = IsFinal ? tmp : tmp % groupSizeN;
- const StorageIndex kGroupId = IsFinal ? 0 : tmp / groupSizeN;
- const StorageIndex mGroupOffset = mGroupId * Properties::TileSizeDimM;
- const StorageIndex nGroupOffset = nGroupId * Properties::TileSizeDimN;
- const StorageIndex mLocalOffset = PacketSize * mLocalThreadId;
- const StorageIndex nLocalOffset = NStride * nLocalThreadId;
- const StorageIndex mGlobalOffset = mGroupOffset + mLocalOffset;
- const StorageIndex nGlobalOffset = nGroupOffset + nLocalOffset;
-
- const StorageIndex kSizePerWG = IsFinal ? triple_dim.K : numTiles * Properties::TileSizeDimK;
- StorageIndex kGroupOffset = kGroupId * kSizePerWG;
- const bool is_internal = triple_dim.M - mGroupOffset >= Properties::TileSizeDimM &&
- triple_dim.N - nGroupOffset >= Properties::TileSizeDimN &&
- triple_dim.K - kGroupOffset >= kSizePerWG;
- // this is used to adjust the last block
- StorageIndex kSize = IsFinal ? triple_dim.K : std::min(kSizePerWG, triple_dim.K - kGroupOffset);
-    // This is used to find the last K offset so that kGroupOffset - kSize can compute the offset for loading the
-    // tile
- kGroupOffset += kSize;
-
- auto thread_properties =
- ThreadProperties<StorageIndex>(linearLocalThreadId, kGroupId, mGroupOffset, nGroupOffset, kGroupOffset,
- mLocalOffset, nLocalOffset, mGlobalOffset, nGlobalOffset, kSize, is_internal);
-
- auto out_ptr = out_res.get_pointer() + (IsFinal ? 0 : thread_properties.kGroupId * triple_dim.M * triple_dim.N);
-
- (thread_properties.is_internal) ? compute_panel<true>(itemID, thread_properties, out_ptr)
- : compute_panel<false>(itemID, thread_properties, out_ptr);
- }
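-  // Illustrative decomposition of the flat group index used above: work-groups are laid out
-  // with m fastest, then n, then k, i.e. group = mGroupId + groupSizeM * (nGroupId +
-  // groupSizeN * kGroupId) on the tall/skinny (!IsFinal) path. For instance, with
-  // groupSizeM = 4 and groupSizeN = 3, group 7 yields mGroupId = 7 % 4 = 3, tmp = 7 / 4 = 1,
-  // nGroupId = 1 % 3 = 1 and kGroupId = 1 / 3 = 0.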
-  // The compute block computes the contraction for each thread's private block and stores the result in the
-  // privateRes memory of each computation. The compute-block function is independent of the local and no-local
-  // concepts, as it only computes the block in each thread's private memory space.
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_block_per_tile(OutScalar *lhs_block_ptr, OutScalar *rhs_block_ptr,
- PacketReturnType *privateRes) {
- StorageIndex idx = 0;
- EIGEN_CONSTEXPR StorageIndex lhs_stride =
- contraction_tp == contraction_type::local ? (PacketSize * Properties::LocalThreadSizeM) : 1;
- EIGEN_UNROLL_LOOP
- for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN; wLPTN++) {
- auto rhsPacket = PacketReturnType{*(rhs_block_ptr + wLPTN)};
- StorageIndex lhs_index = 0;
- EIGEN_UNROLL_LOOP
- for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) {
- PacketReturnType lhsPack{};
- Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, PacketSize>::set_packet(lhsPack,
- lhs_block_ptr + lhs_index);
- privateRes[idx] = ::Eigen::internal::pmadd(lhsPack, rhsPacket, privateRes[idx]);
-
- lhs_index += lhs_stride;
- idx++;
- }
- }
- }
- // The store function writes the contraction result computed in the private memory of each thread to the global
- // memory. The store function is independent of the local and no-local concepts so that it can be abstracted out in
- // the base class.
- template <bool is_internal_block, StorageIndex PrivateNStride, typename OutPtr>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void store(OutPtr *out_ptr, PacketReturnType *privateRes,
- StorageIndex mGlobalOffset, StorageIndex nGlobalOffset) {
- auto chk_bound = [&](const StorageIndex &mIndex, const StorageIndex &nIndex) EIGEN_DEVICE_FUNC {
- return (mIndex + PacketSize - 1 < triple_dim.M && nGlobalOffset + nIndex < triple_dim.N);
- };
- // When local memory is not used, M and N are both accessed in a coalesced way. However, when local memory is
- // available, the K*N block is transposed in local memory to N*K; therefore, each block operates on a
- // blockId * WorkLoadPerThreadN slice of N.
- EIGEN_CONSTEXPR StorageIndex GlobalNStride =
- contraction_tp == contraction_type::local ? 1 : Properties::LocalThreadSizeN;
- EIGEN_UNROLL_LOOP
- for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN / PrivateNStride; wLPTN++) {
- // output leading dimension
- StorageIndex outputLD = 0;
- // When local memory is used, PrivateNStride is always 1 because the coalesced access on N is loaded into local
- // memory, and extracting from local to global is the same as in the non-transposed version. However, when local
- // memory is not used and RHS is transposed, we packetise the load for RHS.
- EIGEN_UNROLL_LOOP
- for (StorageIndex nId = 0; nId < PrivateNStride; nId++) {
- StorageIndex globalRow = mGlobalOffset;
- EIGEN_UNROLL_LOOP
- for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) {
- PacketReturnType privateOut = privateRes[wLPTM];
- if (check_boundary<is_internal_block>(chk_bound(globalRow, nId))) {
- // Store the final results in C. The C matrix always has M as the first StorageIndex and N as the second
- // StorageIndex; therefore its layout is always coalesced.
- write<data_source::global_mem>(privateOut, out_ptr + outputLD + globalRow);
- } else {
- EIGEN_UNROLL_LOOP
- for (StorageIndex mId = 0; mId < PacketSize; mId++) {
- StorageIndex mOffset = globalRow + mId;
- if (mOffset < triple_dim.M && (nGlobalOffset + nId < triple_dim.N)) {
- out_ptr[mOffset + outputLD] =
- Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, PacketSize>::scalarize(mId, privateOut);
- }
- }
- }
- globalRow += (PacketSize * Properties::LocalThreadSizeM);
- }
- outputLD += triple_dim.M;
- privateRes += Properties::WorkLoadPerThreadM / PacketSize;
- }
- out_ptr += (GlobalNStride * outputLD);
-
- nGlobalOffset += (PrivateNStride * GlobalNStride);
- }
- }
- // when no local memory is used the following extract_block will be enabled
- template <typename InputBlockProperties, bool is_internal_block, typename Input, typename PrivateReg,
- contraction_type contract_tp = contraction_tp>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- typename ::Eigen::internal::enable_if<contract_tp == contraction_type::no_local>::type
- extract_block(const Input &inpt, PrivateReg private_ptr, const std::pair<StorageIndex, StorageIndex> &,
- const StorageIndex &ncOffset, const StorageIndex cOffset) {
- EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC =
- InputBlockProperties::is_rhs ? Properties::LocalThreadSizeN : Properties::LocalThreadSizeM;
- EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC =
- InputBlockProperties::is_rhs ? Properties::WorkLoadPerThreadN : Properties::WorkLoadPerThreadM;
- const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M;
-
- auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC {
- return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) &&
- (NCIndex + InputBlockProperties::nc_stride - 1 < NC));
- };
- const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC : triple_dim.K;
- StorageIndex cIndex = cOffset;
-
- EIGEN_UNROLL_LOOP
- for (StorageIndex cId = 0; cId < Properties::TileSizeDimK / InputBlockProperties::c_stride; cId++) {
- StorageIndex ncIndex = ncOffset;
- EIGEN_UNROLL_LOOP
- for (StorageIndex ncId = 0; ncId < WorkLoadPerThreadNC / InputBlockProperties::nc_stride; ncId++) {
- if (check_boundary<is_internal_block>(chk_bound(cIndex, ncIndex))) {
- auto val =
- read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout,
- InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, ncIndex, cIndex, ld);
-
- write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : WorkLoadPerThreadNC),
- data_source::private_mem>(val, private_ptr);
- } else {
- EIGEN_UNROLL_LOOP
- for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) {
- const StorageIndex ncInd = ncIndex + (InputBlockProperties::is_coalesced_layout ? i : 0);
- const StorageIndex cInd = cIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i);
- OutScalar val =
- (ncInd < NC && cInd < triple_dim.K)
- ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>(
- inpt, ncInd, cInd, ld)
- : OutScalar(0);
- write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : WorkLoadPerThreadNC),
- data_source::private_mem>(
- val, private_ptr + (InputBlockProperties::is_coalesced_layout ? i : 0) +
- ((InputBlockProperties::is_coalesced_layout ? 0 : i) * WorkLoadPerThreadNC));
- }
- }
-
- // If it is the lhs, we have to load it packetised when the packet size is > 1, because the output is coalesced. So
- // even if M is not accessed in a coalesced mode, we have to load packet_size m elements per thread.
- ncIndex = (!InputBlockProperties::is_rhs && InputBlockProperties::nc_stride == 1 && PacketSize != 1)
- ? ncOffset + (ncId + 1) % PacketSize + ((ncId + 1) / PacketSize) * LocalThreadSizeNC
- : (ncIndex + InputBlockProperties::nc_stride * LocalThreadSizeNC);
- private_ptr += InputBlockProperties::nc_stride;
- }
- // the previous for loop (private_ptr += nc_stride per iteration) has already advanced the pointer by one WorkLoadPerThreadNC
- private_ptr += (InputBlockProperties::c_stride - 1) * WorkLoadPerThreadNC;
- cIndex += InputBlockProperties::c_stride;
- }
- }
- template <typename InputBlockProperties, StorageIndex TileSizeDimNC>
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::pair<StorageIndex, StorageIndex> local_id_extract(
- const StorageIndex &linearLocalThreadId) {
- const StorageIndex localThreadNC =
- (InputBlockProperties::is_coalesced_layout)
- ? linearLocalThreadId % (TileSizeDimNC / InputBlockProperties::nc_stride)
- : linearLocalThreadId / (Properties::TileSizeDimK / InputBlockProperties::c_stride);
- const StorageIndex localThreadC =
- (InputBlockProperties::is_coalesced_layout)
- ? linearLocalThreadId / (TileSizeDimNC / InputBlockProperties::nc_stride)
- : linearLocalThreadId % (Properties::TileSizeDimK / InputBlockProperties::c_stride);
- return std::pair<StorageIndex, StorageIndex>(localThreadNC, localThreadC);
- }
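- // Illustrative mapping (hypothetical sizes): for a coalesced layout with TileSizeDimNC = 16 and nc_stride = 1,
- // linearLocalThreadId = 37 yields localThreadNC = 37 % 16 = 5 and localThreadC = 37 / 16 = 2, i.e. consecutive
- // threads walk the NC dimension first, which keeps their loads coalesced.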
-
- template <bool db = Properties::DoubleBuffer, contraction_type ctp = contraction_tp>
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- typename ::Eigen::internal::enable_if<db && ctp == contraction_type::local>::type
- sync_mem(const cl::sycl::nd_item<1> &, bool &db_offset) noexcept {
- db_offset = !db_offset;
- }
-
- template <bool db = Properties::DoubleBuffer, contraction_type ctp = contraction_tp>
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- typename ::Eigen::internal::enable_if<!db && ctp == contraction_type::local>::type
- sync_mem(const cl::sycl::nd_item<1> &itemID, bool &) noexcept {
- itemID.barrier(cl::sycl::access::fence_space::local_space);
- }
-
- template <contraction_type ctp = contraction_tp>
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- typename ::Eigen::internal::enable_if<ctp == contraction_type::no_local>::type
- sync_mem(const cl::sycl::nd_item<1> &, bool &) noexcept {
- return;
- }
-
- template <bool need_sync, contraction_type ctp = contraction_tp>
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- typename ::Eigen::internal::enable_if<need_sync && ctp == contraction_type::no_local>::type
- sync_thread(const cl::sycl::nd_item<1> &
-#ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION
- itemID
-#endif
- ) noexcept {
-#ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION
- itemID.barrier(cl::sycl::access::fence_space::local_space);
-#else
- return;
-#endif
- }
- template <bool need_sync, contraction_type ctp = contraction_tp>
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- typename ::Eigen::internal::enable_if<need_sync && ctp == contraction_type::local>::type
- sync_thread(const cl::sycl::nd_item<1> &itemID) {
- itemID.barrier(cl::sycl::access::fence_space::local_space);
- }
- template <bool need_sync>
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<!need_sync>::type sync_thread(
- const cl::sycl::nd_item<1> &) {
- return;
- }
-
- template <bool is_internal_block>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_tile_per_panel(const cl::sycl::nd_item<1> &itemID,
- ThreadProperties<StorageIndex> &thread_properties,
- TiledMemory &tiled_input_block,
- PacketReturnType *privateRes, bool &db_offset) {
- // Tiling the Rhs block from global to local memory
- extract_block<RHSBlockProperties, is_internal_block>(
- rhs, tiled_input_block.rhs_scratch_extract.ptr + (db_offset * Properties::TileSizeDimK * LSDR),
- tiled_input_block.rhs_extract_index,
- contraction_tp == contraction_type::local ? thread_properties.nGroupOffset : thread_properties.nGlobalOffset,
- thread_properties.kGroupOffset - thread_properties.kSize);
-
- sync_thread<contraction_tp == contraction_type::no_local>(itemID);
-
- // Tiling the Lhs block from global to local memory
- extract_block<LHSBlockProperties, is_internal_block>(
- lhs, tiled_input_block.lhs_scratch_extract.ptr + (db_offset * LSDL * Properties::TileSizeDimK),
- tiled_input_block.lhs_extract_index,
- contraction_tp == contraction_type::local ? thread_properties.mGroupOffset : thread_properties.mGlobalOffset,
- thread_properties.kGroupOffset - thread_properties.kSize);
-
- // itemID.barrier(cl::sycl::access::fence_space::local_space);
- sync_thread<contraction_tp == contraction_type::local>(itemID);
- // switch to compute mode
- StorageIndex lhs_offset = (db_offset * LSDL * Properties::TileSizeDimK);
- StorageIndex rhs_offset = (db_offset * Properties::TileSizeDimK * LSDR);
- // Loop over the values of a single tile
- for (StorageIndex k = 0; k < Properties::TileSizeDimK; k++) {
- compute_block_per_tile(tiled_input_block.lhs_scratch_ptr_compute + lhs_offset,
- tiled_input_block.rhs_scratch_ptr_compute + rhs_offset, privateRes);
- lhs_offset += LSDL;
- rhs_offset += LSDR;
- }
- // computing the K index for the next tile
- thread_properties.kSize -= Properties::TileSizeDimK;
- sync_mem(itemID, db_offset);
- }
-
- // when local memory is available the following compute_panel will be enabled
- template <bool is_internal_block, typename OutPtr>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(const cl::sycl::nd_item<1> &itemID,
- ThreadProperties<StorageIndex> &thread_properties,
- OutPtr out_ptr) {
- auto tiled_input_block = TiledMemory{thread_properties, scratch.get_pointer()};
- // Allocate register space
- PacketReturnType privateRes[Properties::WorkLoadPerThreadM * Properties::WorkLoadPerThreadN / PacketSize] = {
- PacketReturnType{0}};
- bool db_offset = false;
-
- while (thread_properties.kSize >= Properties::TileSizeDimK) {
- compute_tile_per_panel<is_internal_block>(itemID, thread_properties, tiled_input_block, privateRes, db_offset);
- }
- if (thread_properties.kSize > 0) {
- compute_tile_per_panel<false>(itemID, thread_properties, tiled_input_block, privateRes, db_offset);
- }
-
- // Storing the final results in the output
- store<is_internal_block,
- contraction_tp == contraction_type::local ? static_cast<StorageIndex>(1) : RHSBlockProperties::nc_stride>(
- out_ptr + thread_properties.nGlobalOffset * triple_dim.M, privateRes, thread_properties.mGlobalOffset,
- thread_properties.nGlobalOffset);
- }
- // When local memory is available the following extract_block will be enabled
- template <typename InputBlockProperties, bool is_internal_block, typename Input, typename Local,
- contraction_type contract_tp = contraction_tp>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- typename ::Eigen::internal::enable_if<contract_tp == contraction_type::local>::type
- extract_block(const Input &inpt, Local local_ptr, const std::pair<StorageIndex, StorageIndex>& local_index,
- const StorageIndex &ncOffset, const StorageIndex cOffset) {
- EIGEN_CONSTEXPR StorageIndex TileSizeDimNC =
- InputBlockProperties::is_rhs ? Properties::TileSizeDimN : Properties::TileSizeDimM;
- EIGEN_CONSTEXPR StorageIndex LoadPerThread =
- InputBlockProperties::is_rhs ? Properties::LoadPerThreadRhs : Properties::LoadPerThreadLhs;
- EIGEN_CONSTEXPR StorageIndex LSD = InputBlockProperties::is_rhs ? LSDR : LSDL;
- static_assert(((LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride) == 0) &&
- (LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride) == 0)),
- " LocalOffset must be divisable by stride");
- const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M;
- StorageIndex localThreadNC = local_index.first;
- StorageIndex localThreadC = local_index.second;
- auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC {
- return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) &&
- (NCIndex + InputBlockProperties::nc_stride - 1 < NC));
- };
- EIGEN_UNROLL_LOOP
- for (StorageIndex lPT = 0; lPT < LoadPerThread / InputBlockProperties::elements_per_access; lPT++) {
- const StorageIndex CIndex = cOffset + (InputBlockProperties::c_stride * localThreadC);
- const StorageIndex NCIndex = ncOffset + (InputBlockProperties::nc_stride * localThreadNC);
- const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC : triple_dim.K;
- if (check_boundary<is_internal_block>(chk_bound(CIndex, NCIndex))) {
- auto val =
- read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout,
- InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, NCIndex, CIndex, ld);
- write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : LSD), data_source::local_mem>(
- val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) +
- (InputBlockProperties::c_stride * localThreadC * LSD));
- } else {
- EIGEN_UNROLL_LOOP
- for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) {
- const StorageIndex nCInd = NCIndex + (InputBlockProperties::is_coalesced_layout ? i : 0);
- const StorageIndex cInd = CIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i);
- OutScalar val =
- (nCInd < NC && cInd < triple_dim.K)
- ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>(
- inpt, nCInd, cInd, ld)
- : OutScalar(0);
-
- write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : LSD), data_source::local_mem>(
- val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) +
- (InputBlockProperties::is_coalesced_layout ? i : 0) +
- ((InputBlockProperties::c_stride * localThreadC +
- (InputBlockProperties::is_coalesced_layout ? 0 : i)) *
- LSD));
- }
- }
- localThreadNC += (InputBlockProperties::is_coalesced_layout)
- ? LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride)
- : LocalOffset / (Properties::TileSizeDimK / InputBlockProperties::c_stride);
- localThreadC += (InputBlockProperties::is_coalesced_layout)
- ? LocalOffset / (TileSizeDimNC / InputBlockProperties::nc_stride)
- : LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride);
- }
- }
-};
-
-#ifndef EIGEN_SYCL_DISABLE_GEMV
-
-/*!
- * \brief GeneralVectorTensor is a template class that provides the tensor-vector contraction operation, which is a
- * special case of the tensor-tensor contraction.
- *
- * \tparam OutScalar: determines the output scalar type
- *
- * \tparam OutAccessor: determines the sycl accessor type for the output (please see the sycl-1.2.1 specification
- * (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition)
- *
- * \tparam VectorMapper: determines the tensor contraction mapper for the vector input (can be lhs or rhs)
- *
- * \tparam TensorMapper: determines the tensor contraction mapper for the tensor input (can be lhs or rhs)
- *
- * \tparam StorageIndex: determines the StorageIndex Type
- *
- * \tparam Properties: determines the Contraction Panel properties
- *
- * \tparam KFactor: determines the number of elements in K dimension in a Tile
- *
- * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression.
- *
- * \tparam is_lhs_vec: determines whether lhs is a vector or rhs is a vector
- *
- * \tparam IsFinal: determines whether this is the final kernel. If so, the result will be written to the final output.
- * Otherwise, the result of the contraction will be written to a temporary buffer.
- *
- * \param scratch: determines the local memory containing the vector block for each work-group
- *
- * \param vec: determines the vector input (tensor mapper)
- *
- * \param mat: determines the tensor input (tensor mapper)
- *
- * \param out_res: determines the output vector containing the contraction result
- *
- * \param nonContractGroupSize: a logical number determining the number of work-groups for the non-contracting dimension
- *
- * \param nonContractDim: determines the size of the non-contracting dimension of the flattened tensor
- *
- * \param contractDim: determines the size of the contracting dimension of the flattened tensor
- *
- */
-template <typename OutScalar, typename OutAccessor, typename VectorMapper, typename TensorMapper, typename StorageIndex,
- typename Properties, StorageIndex KFactor, bool Vectorizable, bool is_lhs_vec, bool IsFinal>
-struct GeneralVectorTensor {
- typedef typename Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketReturnType
- PacketReturnType;
- static EIGEN_CONSTEXPR int PacketSize =
- Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketSize;
- typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
-
- static EIGEN_CONSTEXPR StorageIndex OutScratchOffset =
- KFactor * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC;
-
- // Since the access layout for a vector can always be coalesced, when LHS is a vector we pass false and false to make
- // sure that the !^ is true. When RHS is a vector, we pass true and true to make sure that the !^ is true.
- typedef BlockProperties<is_lhs_vec ? false : true, is_lhs_vec ? false : true, Vectorizable, PacketReturnType>
- VecBlockProperties;
-
- Scratch scratch;
- const VectorMapper vec;
- const TensorMapper mat;
- OutAccessor out_res;
- const StorageIndex nonContractGroupSize;
- const StorageIndex nonContractDim;
- const StorageIndex contractDim;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE GeneralVectorTensor(Scratch scratch_, const VectorMapper vec_,
- const TensorMapper mat_, OutAccessor out_res_,
- const StorageIndex nonContractGroupSize_,
- const StorageIndex nonContractDim_,
- const StorageIndex contractDim_)
- : scratch(scratch_),
- vec(vec_),
- mat(mat_),
- out_res(out_res_),
- nonContractGroupSize(nonContractGroupSize_),
- nonContractDim(nonContractDim_),
- contractDim(contractDim_) {}
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
- auto scratch_ptr = scratch.get_pointer();
- const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
- StorageIndex nonContractId = is_lhs_vec ? linearLocalThreadId / Properties::LocalThreadSizeC
- : linearLocalThreadId % Properties::LocalThreadSizeNC;
- StorageIndex contractId = is_lhs_vec ? linearLocalThreadId % Properties::LocalThreadSizeC
- : linearLocalThreadId / Properties::LocalThreadSizeNC;
- const StorageIndex cGroupSize = itemID.get_group_range(0) / nonContractGroupSize;
- const StorageIndex nonContractGroupId =
- is_lhs_vec ? itemID.get_group(0) / cGroupSize : itemID.get_group(0) % nonContractGroupSize;
- const StorageIndex contractGroupId =
- is_lhs_vec ? itemID.get_group(0) % cGroupSize : itemID.get_group(0) / nonContractGroupSize;
- auto out_ptr = out_res.get_pointer() + (IsFinal ? 0 : contractGroupId * nonContractDim);
-
- const StorageIndex nonContractGroupOffset = nonContractGroupId * Properties::TileSizeDimNC;
- const StorageIndex contractGroupOffset = contractGroupId * Properties::TileSizeDimC;
- auto outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC;
- const StorageIndex globalNonContractDimOffset = nonContractGroupOffset + nonContractId;
- const StorageIndex globalContractDimOffset = contractGroupOffset + contractId;
- auto local_output = scratch_ptr + OutScratchOffset;
- const bool is_internal = nonContractDim - nonContractGroupOffset >= Properties::TileSizeDimNC &&
- contractDim - contractGroupOffset >= Properties::TileSizeDimC;
- is_internal
- ? compute_panel<true>(itemID, vec, mat, local_output, out_ptr,
-#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
- scratch_ptr, contractGroupOffset,
-#endif
- nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId,
- nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex)
- : compute_panel<false>(itemID, vec, mat, local_output, out_ptr,
-#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
- scratch_ptr, contractGroupOffset,
-#endif
- nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId,
- nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex);
- }
- template <bool is_internal_block, typename OutPtr>
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(
- const cl::sycl::nd_item<1> &itemID, const VectorMapper &vec, const TensorMapper &mat, OutScalar *local_output,
- OutPtr out_ptr,
-#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
- OutScalar *scratch_ptr, const StorageIndex contractGroupOffset,
-#endif
- const StorageIndex nonContractGroupOffset, const StorageIndex linearLocalThreadId, StorageIndex contractDim,
- StorageIndex nonContractDim, StorageIndex contractId, StorageIndex nonContractId,
- StorageIndex globalContractDimOffset, StorageIndex globalNonContractDimOffset, StorageIndex outScratchIndex) {
- OutScalar outScalar[Properties::WorkLoadPerThreadNC] = {OutScalar(0)};
- // Reading the vector
-#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
- const StorageIndex vectorOffset = contractGroupOffset + linearLocalThreadId;
- extract_block<VecBlockProperties, is_internal_block, KFactor,
- Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC>(vec, scratch_ptr, linearLocalThreadId,
- vectorOffset, contractDim);
-
- itemID.barrier(cl::sycl::access::fence_space::local_space);
- auto in_scratch_ptr = scratch_ptr + contractId;
-#endif
-
- StorageIndex privateOffsetC = 0;
- EIGEN_UNROLL_LOOP
- for (StorageIndex i = 0; i < Properties::WorkLoadPerThreadC; i++) {
- StorageIndex privateOffsetNC = 0;
- bool contract_conds = ((globalContractDimOffset + privateOffsetC) < contractDim);
-#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
- auto vecScalar = *in_scratch_ptr;
-#else
- auto vecScalar = (check_boundary<is_internal_block>(contract_conds))
- ? vec(is_lhs_vec ? StorageIndex(0) : globalContractDimOffset + privateOffsetC,
- is_lhs_vec ? globalContractDimOffset + privateOffsetC : StorageIndex(0))
- : OutScalar(0);
-#endif
- EIGEN_UNROLL_LOOP
- for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
- auto matScalar = (check_boundary<is_internal_block>(
- contract_conds && ((globalNonContractDimOffset + privateOffsetNC) < nonContractDim)))
- ? mat(is_lhs_vec ? globalContractDimOffset + privateOffsetC
- : globalNonContractDimOffset + privateOffsetNC,
- is_lhs_vec ? globalNonContractDimOffset + privateOffsetNC
- : globalContractDimOffset + privateOffsetC)
- : OutScalar(0);
-
- outScalar[j] = cl::sycl::mad(matScalar, vecScalar, outScalar[j]);
- privateOffsetNC += Properties::LocalThreadSizeNC;
- }
- privateOffsetC += Properties::LocalThreadSizeC;
-#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
- in_scratch_ptr += Properties::LocalThreadSizeC;
-#endif
- }
-
- auto out_scratch_ptr = local_output + outScratchIndex;
- // Each block of 16*16 elements in shared memory should reduce to 16*1
- EIGEN_UNROLL_LOOP
- for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
- *out_scratch_ptr = outScalar[j];
-
- out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
- }
- if (is_lhs_vec) {
- nonContractId = linearLocalThreadId % Properties::LocalThreadSizeNC;
- contractId = linearLocalThreadId / Properties::LocalThreadSizeNC;
- outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC;
- }
-
- out_scratch_ptr = local_output + outScratchIndex;
- EIGEN_UNROLL_LOOP
- for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
- EIGEN_UNROLL_LOOP
- for (StorageIndex offset = Properties::LocalThreadSizeC >> 1; offset > 0; offset >>= 1) {
- itemID.barrier(cl::sycl::access::fence_space::local_space);
- if (contractId < offset) {
- StorageIndex myNeighbourId = (Properties::LocalThreadSizeNC * offset);
- *out_scratch_ptr += out_scratch_ptr[myNeighbourId];
- }
- }
- // moving to the next 16 by 16 block
- out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
- }
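- // As a sketch of the tree reduction above (hypothetical size): with Properties::LocalThreadSizeC = 16, the offsets
- // are 8, 4, 2 and 1, so each 16-element column of the scratch block collapses to a single partial sum in
- // log2(16) = 4 barrier-separated steps.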
-
- if (contractId == 0) {
- out_scratch_ptr = local_output + nonContractId;
- StorageIndex global_final_offset = nonContractGroupOffset + nonContractId;
- out_ptr += global_final_offset;
- EIGEN_UNROLL_LOOP
- for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
- if (check_boundary<is_internal_block>(global_final_offset < nonContractDim)) {
- auto res = *out_scratch_ptr;
-
- *out_ptr = res;
- out_ptr += Properties::LocalThreadSizeNC;
- }
- // moving to the next 16 by 16 block to get the next 16 reduced elements
- out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
- if (!(is_internal_block)) global_final_offset += Properties::LocalThreadSizeNC;
- }
- }
- }
-
- template <typename InputBlockProperties, bool is_internal_block, int CFactor, int GroupSize, typename Input,
- typename Local>
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void extract_block(const Input &inpt, Local *local_ptr,
- const StorageIndex &linearLocalThreadId,
- const StorageIndex &cOffset, const StorageIndex &C) {
- local_ptr += InputBlockProperties::c_stride * linearLocalThreadId;
- StorageIndex cIndex = cOffset;
- for (StorageIndex cId = 0; cId < CFactor / InputBlockProperties::c_stride; cId++) {
- if (check_boundary<is_internal_block>(cIndex + InputBlockProperties::c_stride - 1 < C)) {
- auto val = read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout,
- InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, StorageIndex(0),
- cIndex, StorageIndex(1));
- write<StorageIndex, 1, data_source::local_mem>(val, local_ptr);
- } else {
- EIGEN_UNROLL_LOOP
- for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) {
- OutScalar val =
- (cIndex + i < C)
- ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>(
- inpt, StorageIndex(0), cIndex + i, StorageIndex(1))
- : OutScalar(0);
- write<StorageIndex, 1, data_source::local_mem>(val, local_ptr + i);
- }
- }
- local_ptr += InputBlockProperties::c_stride * GroupSize;
- cIndex += InputBlockProperties::c_stride * GroupSize;
- }
- }
-};
-#endif
-
-#ifndef EIGEN_SYCL_DISABLE_SCALAR
-
-/*!
- * \brief GeneralScalarContraction is a template class that provides the scalar result of the tensor-tensor contraction
- * operation when all the dimensions are contracting dimensions. This kernel reduces two tensors to a scalar.
- *
- * \tparam OutScalar: determines the output scalar type
- *
- * \tparam LhsScalar: determines the left-hand-side scalar type
- *
- * \tparam RhsScalar: determines the right-hand-side scalar type
- *
- * \tparam OutAccessor: determines the sycl accessor type for the output (please see the sycl-1.2.1 specification
- * (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition)
- *
- * \tparam LhsMapper: determines the tensor contraction mapper type for left-hand-side matrix
- *
- * \tparam RhsMapper: determines the tensor contraction mapper type for right-hand-side matrix
- *
- * \tparam StorageIndex: determines the StorageIndex Type
- *
- * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression.
- *
- * \param scratch: local memory containing tiles of LHS and RHS tensors for each work-group
- *
- * \param lhs: determines the left-hand-side flattened tensor (tensor mapper)
- *
- * \param rhs: determines the right-hand-side flattened tensor (tensor mapper)
- *
- * \param out_res: determines the output tensor containing the contraction result
- *
- * \param rng: determines the total input data size
- */
-template <typename OutScalar, typename LhsScalar, typename RhsScalar, typename OutAccessor, typename LhsMapper,
- typename RhsMapper, typename StorageIndex, bool Vectorizable>
-struct GeneralScalarContraction {
- typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
- Scratch scratch;
- const LhsMapper lhs;
- const RhsMapper rhs;
- OutAccessor out_res;
- const StorageIndex rng;
-
- EIGEN_DEVICE_FUNC
- GeneralScalarContraction(Scratch scratch_, const LhsMapper lhs_, const RhsMapper rhs_, OutAccessor out_res_,
- const StorageIndex rng_)
- : scratch(scratch_), lhs(lhs_), rhs(rhs_), out_res(out_res_), rng(rng_) {}
-
- EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) {
- auto out_ptr = out_res.get_pointer();
- auto scratch_ptr = scratch.get_pointer().get();
-
- StorageIndex globalid = itemID.get_global_id(0);
- StorageIndex localid = itemID.get_local_id(0);
- OutScalar accumulator = OutScalar(0);
- for (StorageIndex i = globalid; i < rng; i += itemID.get_global_range(0)) {
- accumulator = cl::sycl::mad(lhs(0, i), rhs(i, 0), accumulator);
- }
- auto out_scratch_ptr = scratch_ptr + localid;
- *out_scratch_ptr = accumulator;
- for (StorageIndex offset = itemID.get_local_range(0) >> 1; offset > 0; offset >>= 1) {
- itemID.barrier(cl::sycl::access::fence_space::local_space);
- if (localid < offset) {
- *out_scratch_ptr = (accumulator += out_scratch_ptr[offset]);
- }
- }
- if (localid == 0) {
- out_ptr[itemID.get_group(0)] = accumulator;
- }
- }
-};
-#endif
-
-} // namespace internal
-} // namespace TensorSycl
-
-template <typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
-struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>,
- Eigen::SyclDevice>
- : public TensorContractionEvaluatorBase<TensorEvaluator<
- const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Eigen::SyclDevice>> {
- static_assert(std::is_same<OutputKernelType, const NoOpOutputKernel>::value,
- "SYCL tensor contraction does not support output kernels.");
-
- typedef Eigen::SyclDevice Device;
-
- typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self;
- typedef TensorContractionEvaluatorBase<Self> Base;
- typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
- typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
- typedef typename XprType::Index StorageIndex;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- typedef typename Base::Storage Storage;
- typedef typename Base::EvaluatorPointerType EvaluatorPointerType;
- struct TripleDim {
- const StorageIndex M;
- const StorageIndex N;
- const StorageIndex K;
- TripleDim(const StorageIndex M_, const StorageIndex N_, const StorageIndex K_) : M(M_), N(N_), K(K_) {}
- };
- enum {
- Layout = TensorEvaluator<LeftArgType, Device>::Layout,
- PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
- BlockAccess = false,
- };
-
- static EIGEN_CONSTEXPR int LDims = Base::LDims;
- static EIGEN_CONSTEXPR int RDims = Base::RDims;
- static EIGEN_CONSTEXPR int ContractDims = Base::ContractDims;
-
- typedef array<StorageIndex, LDims> left_dim_mapper_t;
- typedef array<StorageIndex, RDims> right_dim_mapper_t;
-
- typedef array<StorageIndex, ContractDims> contract_t;
- typedef array<StorageIndex, LDims - ContractDims> left_nocontract_t;
- typedef array<StorageIndex, RDims - ContractDims> right_nocontract_t;
-
- static const int NumDims = LDims + RDims - 2 * ContractDims;
-
- typedef DSizes<StorageIndex, NumDims> Dimensions;
-
- typedef TensorEvaluator<typename Base::EvalLeftArgType, Device> LeftEvaluator;
- typedef TensorEvaluator<typename Base::EvalRightArgType, Device> RightEvaluator;
- typedef typename Eigen::internal::remove_const<typename LeftEvaluator::CoeffReturnType>::type LhsScalar;
- typedef typename Eigen::internal::remove_const<typename RightEvaluator::CoeffReturnType>::type RhsScalar;
-
- typedef typename LeftEvaluator::Dimensions LeftDimensions;
- typedef typename RightEvaluator::Dimensions RightDimensions;
-
- template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered>
- struct input_mapper_properties {
- static EIGEN_CONSTEXPR bool is_lhs_matrix = (LDims == 2 && ContractDims == 1) || lhs_inner_dim_contiguous;
- static EIGEN_CONSTEXPR bool is_rhs_matrix =
- (RDims == 2 && ContractDims == 1) || (rhs_inner_dim_contiguous && !rhs_inner_dim_reordered);
- };
-
- TensorEvaluator(const XprType &op, const Device &device) : Base(op, device) {}
-
- // We need to redefine this method to make nvcc happy
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(typename Base::EvaluatorPointerType data) {
- this->m_leftImpl.evalSubExprsIfNeeded(NULL);
- this->m_rightImpl.evalSubExprsIfNeeded(NULL);
- if (!data) {
- this->m_result = this->m_device.get(
- static_cast<Scalar *>(this->m_device.allocate_temp(this->dimensions().TotalSize() * sizeof(Scalar))));
- data = this->m_result;
- }
- evalToSycl(data);
- return (this->m_result != NULL);
- }
- const Eigen::SyclDevice &device() const { return this->m_device; }
- void evalToSycl(typename Base::EvaluatorPointerType buffer) const {
- if (this->m_lhs_inner_dim_contiguous) {
- if (this->m_rhs_inner_dim_contiguous) {
- if (this->m_rhs_inner_dim_reordered) {
- evalTyped<true, true, true, Unaligned>(buffer);
- } else {
- evalTyped<true, true, false, Unaligned>(buffer);
- }
- } else {
- if (this->m_rhs_inner_dim_reordered) {
- evalTyped<true, false, true, Unaligned>(buffer);
- } else {
- evalTyped<true, false, false, Unaligned>(buffer);
- }
- }
- } else {
- if (this->m_rhs_inner_dim_contiguous) {
- if (this->m_rhs_inner_dim_reordered) {
- evalTyped<false, true, true, Unaligned>(buffer);
- } else {
- evalTyped<false, true, false, Unaligned>(buffer);
- }
- } else {
- if (this->m_rhs_inner_dim_reordered) {
- evalTyped<false, false, true, Unaligned>(buffer);
- } else {
- evalTyped<false, false, false, Unaligned>(buffer);
- }
- }
- }
- }
-
- template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
- void evalTyped(typename Base::EvaluatorPointerType buffer) const {
- const auto triple_dim = TripleDim{this->m_i_size, this->m_j_size, this->m_k_size};
- typedef internal::TensorContractionInputMapper<
- LhsScalar, StorageIndex, internal::Lhs, LeftEvaluator, left_nocontract_t, contract_t,
- PacketType<CoeffReturnType, Device>::size, lhs_inner_dim_contiguous, false, Unaligned, MakeSYCLPointer>
- LhsMapper;
-
- typedef internal::TensorContractionInputMapper<RhsScalar, StorageIndex, internal::Rhs, RightEvaluator,
- right_nocontract_t, contract_t,
- PacketType<CoeffReturnType, Device>::size, rhs_inner_dim_contiguous,
- rhs_inner_dim_reordered, Unaligned, MakeSYCLPointer>
- RhsMapper;
-
- // initialize data mappers
- LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
- this->m_left_contracting_strides, this->m_k_strides);
-
- RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
- this->m_right_contracting_strides, this->m_k_strides);
-
-#ifndef EIGEN_SYCL_DISABLE_SCALAR
- if (triple_dim.M == 1 && triple_dim.N == 1) {
- launchSC(buffer, lhs, rhs, triple_dim.K);
- } else
-#endif
-#ifndef EIGEN_SYCL_DISABLE_GEMV
- if (triple_dim.M != 1 && triple_dim.N == 1) {
- LaunchVT<false>(buffer, rhs, lhs, triple_dim.M, triple_dim.K);
- } else if (triple_dim.M == 1 && triple_dim.N != 1) {
- LaunchVT<true>(buffer, lhs, rhs, triple_dim.N, triple_dim.K);
- } else // This is equivalent to if (m != 1 && n != 1)
-#endif
- {
- typedef input_mapper_properties<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered>
- inpt_mapper_properties;
-#ifndef EIGEN_SYCL_DISABLE_SKINNY
- bool skinny = false;
- auto platform_name = this->device().getPlatformName();
- // This is based on empirical calculation for AMD r9-nano and Fiji
- if (platform_name.find("AMD") == 0) {
- skinny = (triple_dim.M < triple_dim.K || triple_dim.N < triple_dim.K) &&
- ((triple_dim.M < 1024 && triple_dim.N < 1024) ||
- (uint64_t(triple_dim.M * triple_dim.N) < uint64_t(triple_dim.K)));
- } else {
- skinny = (((std::max(triple_dim.K, triple_dim.N) / std::min(triple_dim.K, triple_dim.N)) > 100) ||
- ((std::max(triple_dim.K, triple_dim.M) / std::min(triple_dim.K, triple_dim.M)) > 100) ||
- ((std::max(triple_dim.N, triple_dim.M) / std::min(triple_dim.N, triple_dim.M)) > 100));
- }
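- // For example (hypothetical shape): M = 4, N = 4, K = 102400 gives max(K, N) / min(K, N) = 25600 > 100 on a
- // non-AMD platform, so the contraction is treated as skinny and extra work-groups are spawned along K.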
- if (skinny)
- adjustTT<true, inpt_mapper_properties>(buffer, lhs, rhs, triple_dim);
- else
-#endif // EIGEN_SYCL_DISABLE_SKINNY
- adjustTT<false, inpt_mapper_properties>(buffer, lhs, rhs, triple_dim);
- }
- }
-
- template <bool skinny, typename input_mapper_properties, typename LhsMapper, typename RhsMapper>
- void EIGEN_ALWAYS_INLINE adjustTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs,
- const TripleDim &triple_dim) const {
-#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
- if (device().has_local_memory()) {
- typedef TensorSycl::internal::TTPanelSize<CoeffReturnType, StorageIndex, 4, 4, 16> PanelParameters;
- launchTT<TensorSycl::internal::contraction_type::local, skinny, input_mapper_properties, PanelParameters>(
- buffer, lhs, rhs, triple_dim);
- }
-#endif
-#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF
- if (!(device().has_local_memory())) {
- typedef TensorSycl::internal::TTPanelSize<CoeffReturnType, StorageIndex, 4, 4, 4> PanelParameters;
- launchTT<TensorSycl::internal::contraction_type::no_local, skinny, input_mapper_properties, PanelParameters>(
- buffer, lhs, rhs, triple_dim);
- }
-#endif
- }
-
- template <TensorSycl::internal::contraction_type ct, bool skinny, typename input_mapper_properties,
- typename Properties, typename LhsMapper, typename RhsMapper>
- void launchTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs,
- const TripleDim &triple_dim) const {
- const StorageIndex roundUpM = Eigen::TensorSycl::internal::roundUp(triple_dim.M, Properties::TileSizeDimM);
- const StorageIndex roundUpN = Eigen::TensorSycl::internal::roundUp(triple_dim.N, Properties::TileSizeDimN);
- const StorageIndex groupSizeM = roundUpM / Properties::TileSizeDimM;
- const StorageIndex groupSizeN = roundUpN / Properties::TileSizeDimN;
-
- const StorageIndex roundUpK = Eigen::TensorSycl::internal::roundUp(triple_dim.K, Properties::TileSizeDimK);
- StorageIndex totalTilesK = roundUpK / Properties::TileSizeDimK;
- StorageIndex groupSizeK =
- skinny
- ? std::max(std::min(totalTilesK,
- (StorageIndex)(device().getPowerOfTwo(device().getNumSyclMultiProcessors(), true) * 4) /
- (groupSizeM * groupSizeN)),
- StorageIndex(1))
- : StorageIndex(1);
-
- const StorageIndex numTilesPerGroup = Eigen::TensorSycl::internal::roundUp(totalTilesK, groupSizeK) / groupSizeK;
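- // Sketch (hypothetical device, assuming getPowerOfTwo rounds to a power of two): with totalTilesK = 32,
- // groupSizeM * groupSizeN = 4 and 16 compute units, a skinny contraction gets
- // groupSizeK = min(32, (16 * 4) / 4) = 16 and numTilesPerGroup = roundUp(32, 16) / 16 = 2 K-tiles per work-group.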
-
- const StorageIndex totalGroupSize = groupSizeM * groupSizeN * groupSizeK;
-
- const StorageIndex localRange = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN;
- const StorageIndex globalRange = totalGroupSize * localRange;
-
- const StorageIndex scratchSize = (ct == TensorSycl::internal::contraction_type::local)
- ? ((Properties::DoubleBuffer + 1) *
- (Properties::TileSizeDimM + Properties::BC) * (Properties::TileSizeDimK)) +
- ((Properties::DoubleBuffer + 1) * (Properties::TileSizeDimK) *
- (Properties::TileSizeDimN + Properties::BC))
- : StorageIndex(1);
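- // Sketch with hypothetical panel parameters: TileSizeDimM = TileSizeDimN = 64, TileSizeDimK = 16, BC = 1 and
- // DoubleBuffer = 1 give scratchSize = 2 * (64 + 1) * 16 + 2 * 16 * (64 + 1) = 4160 elements of local memory per
- // work-group; in the no_local case only a single dummy element is reserved.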
-
- auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
- if (groupSizeK == 1) {
- typedef TensorSycl::internal::TensorContractionKernel<CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType,
- LhsMapper, RhsMapper, StorageIndex, Properties, TripleDim,
- PacketAccess, input_mapper_properties, true, ct>
- ContractKernelName;
- device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
- lhs, rhs, buffer, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, triple_dim);
- } else {
- typedef TensorSycl::internal::TensorContractionKernel<CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType,
- LhsMapper, RhsMapper, StorageIndex, Properties, TripleDim,
- PacketAccess, input_mapper_properties, false, ct>
- ContractKernelName;
- CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
- device().allocate_temp(triple_dim.M * triple_dim.N * groupSizeK * sizeof(CoeffReturnType)));
- EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
-
- device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
- lhs, rhs, tmp_global_accessor, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup,
- triple_dim);
-
- typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
- auto op = Op();
- typedef TensorSycl::internal::SecondStepPartialReduction<CoeffReturnType, StorageIndex, EvaluatorPointerType,
- EvaluatorPointerType, Op>
- ReductionKernel;
-
- device().template unary_kernel_launcher<CoeffReturnType, ReductionKernel>(
- tmp_global_accessor, buffer,
- cl::sycl::nd_range<1>(cl::sycl::range<1>(StorageIndex(
- Eigen::TensorSycl::internal::roundUp(triple_dim.M * triple_dim.N, localRange))),
- cl::sycl::range<1>(localRange)),
- StorageIndex(1), op, StorageIndex(triple_dim.M * triple_dim.N), groupSizeK);
-
- device().deallocate_temp(temp_pointer);
- }
- }
-
-#ifndef EIGEN_SYCL_DISABLE_GEMV
- template <bool is_lhs_vec, typename VectorMapper, typename TensorMapper, typename StorageIndex>
- void EIGEN_ALWAYS_INLINE LaunchVT(EvaluatorPointerType buffer, const VectorMapper &vec, const TensorMapper &mat,
- StorageIndex NC, StorageIndex C) const {
- const StorageIndex nonContractDim = NC;
- EIGEN_CONSTEXPR StorageIndex NCFactor = 1;
- EIGEN_CONSTEXPR StorageIndex CFactor = 1;
- EIGEN_CONSTEXPR StorageIndex NCWindow = 16;
- typedef Eigen::TensorSycl::internal::TVPanelSize<CoeffReturnType, StorageIndex, NCWindow, CFactor, NCFactor>
- Properties;
- const StorageIndex roundUpC = Eigen::TensorSycl::internal::roundUp(C, Properties::TileSizeDimC);
- const StorageIndex cNumGroups = roundUpC / (Properties::LocalThreadSizeC * Properties::WorkLoadPerThreadC);
- const StorageIndex roundUpNC = Eigen::TensorSycl::internal::roundUp(nonContractDim, Properties::TileSizeDimNC);
- const StorageIndex nCNumGroups = roundUpNC / (Properties::LocalThreadSizeNC * Properties::WorkLoadPerThreadNC);
- const StorageIndex globalRange =
- (roundUpNC / (Properties::WorkLoadPerThreadNC)) * (roundUpC / (Properties::WorkLoadPerThreadC));
- const StorageIndex localRange = Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC;
- const StorageIndex scratchSize =
- (Properties::WorkLoadPerThreadNC + CFactor) * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC;
- auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
- if (cNumGroups > 1) {
- typedef Eigen::TensorSycl::internal::GeneralVectorTensor<CoeffReturnType, EvaluatorPointerType, VectorMapper,
- TensorMapper, StorageIndex, Properties, CFactor, false,
- is_lhs_vec, false>
- ContractKernelName;
- CoeffReturnType *temp_pointer =
- static_cast<CoeffReturnType *>(device().allocate_temp(nonContractDim * cNumGroups * sizeof(CoeffReturnType)));
- EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
-
- device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
- vec, mat, tmp_global_accessor, thread_range, scratchSize, nCNumGroups, nonContractDim, C);
-
- typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
- typedef TensorSycl::internal::SecondStepPartialReduction<CoeffReturnType, StorageIndex, EvaluatorPointerType,
- EvaluatorPointerType, Op>
- ReductionKernel;
-
- device().template unary_kernel_launcher<CoeffReturnType, ReductionKernel>(
- tmp_global_accessor, buffer,
- cl::sycl::nd_range<1>(cl::sycl::range<1>(Eigen::TensorSycl::internal::roundUp(nonContractDim, localRange)),
- cl::sycl::range<1>(localRange)),
- StorageIndex(1), Op(), nonContractDim, cNumGroups);
-
- device().deallocate_temp(temp_pointer);
- } else {
- typedef Eigen::TensorSycl::internal::GeneralVectorTensor<CoeffReturnType, EvaluatorPointerType, VectorMapper,
- TensorMapper, StorageIndex, Properties, CFactor, false,
- is_lhs_vec, true>
- ContractKernelName;
- device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
- vec, mat, buffer, thread_range, scratchSize, nCNumGroups, nonContractDim, C);
- }
- }
-#endif
-
-#ifndef EIGEN_SYCL_DISABLE_SCALAR
- template <typename LhsMapper, typename RhsMapper>
- EIGEN_ALWAYS_INLINE void launchSC(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs,
- StorageIndex K) const {
- EIGEN_STATIC_ASSERT(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
- (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
- "The Local thread size must be a power of 2 for the reduction "
- "operation");
- EIGEN_CONSTEXPR StorageIndex local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;
-
- // Here we force the code not to be more than a 2-step reduction: our empirical research shows that if each thread
- // reduces at least 512 elements individually, we get better performance.
- const StorageIndex num_work_group = ((K + (512 * local_range - 1)) / (512 * local_range) > 1 ? local_range : 1);
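- // For instance (hypothetical sizes): with local_range = 256 and K = 2^20, ceil(K / (512 * 256)) = 8 > 1, so
- // num_work_group = 256 and a second kernel combines the 256 partial sums; for K = 4096 the ratio is 1 and a
- // single work-group reduces everything in one step.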
- const StorageIndex global_range = num_work_group * local_range;
-
- typedef Eigen::TensorSycl::internal::GeneralScalarContraction<
- CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType, LhsMapper, RhsMapper, StorageIndex, false>
- ContractKernelName;
- auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
- if (num_work_group > 1) {
- CoeffReturnType *temp_pointer =
- static_cast<CoeffReturnType *>(device().allocate_temp(num_work_group * sizeof(CoeffReturnType)));
- EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
- device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(lhs, rhs, tmp_global_accessor,
- thread_range, local_range, K);
- typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
- typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
- EvaluatorPointerType, StorageIndex, local_range>
- GenericRKernel;
- device().template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
- tmp_global_accessor, buffer,
- cl::sycl::nd_range<1>(cl::sycl::range<1>(local_range), cl::sycl::range<1>(local_range)), local_range, Op());
-
- device().deallocate_temp(temp_pointer);
- } else {
- device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(lhs, rhs, buffer, thread_range,
- local_range, K);
- }
- }
-#endif
-
- EIGEN_STRONG_INLINE void cleanup() {
- this->m_leftImpl.cleanup();
- this->m_rightImpl.cleanup();
-
- if (this->m_result) {
- this->m_device.deallocate_temp(this->m_result);
- this->m_result = NULL;
- }
- }
- // The placeholder accessors must be bound to a command group handler for SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
- this->m_leftImpl.bind(cgh);
- this->m_rightImpl.bind(cgh);
- this->m_result.bind(cgh);
- }
-};
-} // namespace Eigen
-#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionThreadPool.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionThreadPool.h
deleted file mode 100644
index 21be6ea..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ /dev/null
@@ -1,1679 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H
-#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H
-
-// evaluator for thread pool device
-#ifdef EIGEN_USE_THREADS
-
-namespace Eigen {
-
-template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
-struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, ThreadPoolDevice> :
- public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, ThreadPoolDevice> > {
-
- typedef ThreadPoolDevice Device;
-
- typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self;
- typedef TensorContractionEvaluatorBase<Self> Base;
-
- typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
- typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
- typedef typename XprType::Index Index;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
- enum {
- Layout = TensorEvaluator<LeftArgType, Device>::Layout,
- };
-
- // Most of the code is assuming that both input tensors are ColMajor. If the
- // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
- // If we want to compute A * B = C, where A is LHS and B is RHS, the code
- // will pretend B is LHS and A is RHS.
- typedef typename internal::conditional<
- static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
- typedef typename internal::conditional<
- static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
-
- static const int LDims =
- internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
- static const int RDims =
- internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
- static const int ContractDims = internal::array_size<Indices>::value;
-
- typedef array<Index, LDims> left_dim_mapper_t;
- typedef array<Index, RDims> right_dim_mapper_t;
-
- typedef array<Index, ContractDims> contract_t;
- typedef array<Index, LDims - ContractDims> left_nocontract_t;
- typedef array<Index, RDims - ContractDims> right_nocontract_t;
-
- static const int NumDims = LDims + RDims - 2 * ContractDims;
-
- typedef DSizes<Index, NumDims> Dimensions;
-
- // typedefs needed in evalTo
- typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
- typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
- typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
-
- typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
- typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
-
- TensorEvaluator(const XprType& op, const Device& device) :
- Base(op, device) {}
-
- template <int Alignment>
- void evalProduct(Scalar* buffer) const {
- evalProductImpl<NoCallback, Alignment>(buffer, NoCallback());
- }
-
- template <typename EvalToCallback, int Alignment>
- void evalProductAsync(Scalar* buffer, EvalToCallback done) const {
- evalProductImpl<EvalToCallback, Alignment>(buffer, std::move(done));
- }
-
- template <typename DoneCallback, int Alignment>
- void evalProductImpl(Scalar* buffer, DoneCallback done) const {
- // This function computes a lot of heuristics in multiple steps, and it
- // also has multiple exit points. To keep it sane, readable and all in one
- // place, sync/async execution decision is made at runtime at the very end.
- //
- // (1) In sync mode we allocate Context on the stack, submit computations
- // to the device thread pool, and block on a barrier until it is
- // completed.
- //
- // (2) In async mode we allocate the Context on the heap, and after all
- // tasks are finished, we call the provided done callback and delete
- // the context from the heap.
- //
- // (*) EvalParallelContext & EvalShardedByInnerDimContext own all the state
- // and temporary buffers required for executing the tensor contraction.
- // They are responsible for cleaning it up after the contraction is done.
- static const bool IsEvalInSyncMode =
- std::is_same<DoneCallback, NoCallback>::value;
-
- const Index m = this->m_i_size;
- const Index n = this->m_j_size;
- const Index k = this->m_k_size;
- if (m == 0 || n == 0 || k == 0) return;
-
- // Compute a set of algorithm parameters:
- // - kernel block sizes (bm, bn, bk)
- // - task grain sizes (number of kernels executed per task: gm, gn)
- // - number of threads
- // - sharding by row/column
- // - parallel packing or first lhs then rhs
- // and some derived parameters:
- // - number of tasks (nm, nn, nk)
- // - number of kernels (nm0, nn0)
- // Unfortunately, all these parameters are tightly interdependent.
- // So in some cases we first compute approximate values, then compute other
- // values based on these approximations and then refine the approximations.
-
- // There are lots of heuristics here. There is some reasoning behind them,
- // but ultimately they are just tuned on contraction benchmarks for
- // different input configurations, thread counts and instruction sets.
- // So feel free to question any of them.
-
- // Compute whether we want to shard by row or by column.
- // This is a first approximation; it will be refined later. Since we don't
- // know the number of threads yet we use 2, because what we are most
- // interested in at this point is whether it makes sense to use
- // parallelization at all or not.
- bool shard_by_col = shardByCol(m, n, 2);
-
- // First approximation of kernel blocking sizes.
- // Again, we don't know number of threads yet, so we use 2.
- Index bm, bn, bk;
- if (shard_by_col) {
- internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index,
- internal::ShardByCol>
- blocking(k, m, n, 2);
- bm = blocking.mc();
- bn = blocking.nc();
- bk = blocking.kc();
- } else {
- internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index,
- internal::ShardByRow>
- blocking(k, m, n, 2);
- bm = blocking.mc();
- bn = blocking.nc();
- bk = blocking.kc();
- }
-
- // Compute optimal number of threads.
- // Note: we use bk instead of k here because we are interested in amount of
- // _parallelizable_ computations, and computations are not parallelizable
- // across k dimension.
- const TensorOpCost cost =
- contractionCost(m, n, bm, bn, bk, shard_by_col, false);
- int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
- static_cast<double>(n) * m, cost, this->m_device.numThreads());
- int num_threads_by_k = numThreadsInnerDim(m, n, k);
- if (shardByInnerDim(m, n, k, num_threads, num_threads_by_k)) {
- // We are in the scenario where it is more effective to shard by the
- // inner dimension.
- if (IsEvalInSyncMode) {
- EvalShardedByInnerDimContext<DoneCallback> ctx(
- this, num_threads_by_k, buffer, m, n, k, std::move(done));
- ctx.template run<Alignment>();
- } else {
- auto* ctx = new EvalShardedByInnerDimContext<DoneCallback>(
- this, num_threads_by_k, buffer, m, n, k, std::move(done));
- ctx->template runAsync<Alignment>();
- }
-
- return;
- }
-
- // TODO(dvyukov): this is a stop-gap to prevent regressions while the cost
- // model is not tuned. Remove this when the cost model is tuned.
- if (n == 1) num_threads = 1;
-
- if (num_threads == 1) {
- TENSOR_CONTRACTION_DISPATCH(this->template evalProductSequential,
- Unaligned, (buffer));
- if (!IsEvalInSyncMode) done();
- return;
- }
-
- // Now that we know number of threads, recalculate sharding and blocking.
- shard_by_col = shardByCol(m, n, num_threads);
- if (shard_by_col) {
- internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index,
- internal::ShardByCol>
- blocking(k, m, n, num_threads);
- bm = blocking.mc();
- bn = blocking.nc();
- bk = blocking.kc();
- } else {
- internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index,
- internal::ShardByRow>
- blocking(k, m, n, num_threads);
- bm = blocking.mc();
- bn = blocking.nc();
- bk = blocking.kc();
- }
-
- // Number of kernels for each dimension.
- Index nm0 = divup(m, bm);
- Index nn0 = divup(n, bn);
- Index nk = divup(k, bk);
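- // To make the arithmetic concrete, a purely hypothetical example:
- // m = 100, n = 60, k = 500 with bm = 32, bn = 16, bk = 64 gives
- // nm0 = divup(100, 32) = 4, nn0 = divup(60, 16) = 4 and
- // nk = divup(500, 64) = 8.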
-
- // Calculate task grain size (number of kernels executed per task).
- // This task size coarsening serves two purposes:
- // 1. It reduces per-task overheads, including synchronization overheads.
- // 2. It allows better cache reuse (the same packed rhs is reused by
- //    several consecutive kernels).
- Index gm = 1;
- Index gn = 1;
- // If we are sharding by column, then we prefer to reduce rows first.
- if (shard_by_col) {
- gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col);
- gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col);
- } else {
- gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col);
- gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col);
- }
- // Number of tasks in each dimension.
- Index nm = divup(nm0, gm);
- Index nn = divup(nn0, gn);
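- // Continuing the hypothetical example above: with gm = 2 and gn = 1 we
- // get nm = divup(4, 2) = 2 and nn = divup(4, 1) = 4, i.e. 2 x 4 = 8
- // coarsened tasks per k slice instead of 4 x 4 = 16 kernels.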
-
- // If there is enough concurrency in the sharding dimension, we choose not
- // to parallelize by the other dimension, and execute all kernels in sync
- // mode. This reduces parallelism from nm x nn down to nn
- // (shard_by_col==true) or nm (shard_by_col==false).
- const Index sharding_dim_tasks = shard_by_col ? nn : nm;
- const int num_worker_threads = this->m_device.numThreadsInPool();
-
- // With a small number of threads we want to make sure that we do not
- // reduce parallelism too much. With a large number of threads we trade
- // maximum parallelism for better memory locality.
- const float oversharding_factor =
- num_worker_threads <= 4 ? 8.0 :
- num_worker_threads <= 8 ? 4.0 :
- num_worker_threads <= 16 ? 2.0 :
- num_worker_threads <= 32 ? 1.0 :
- num_worker_threads <= 64 ? 0.8 : /* num_worker_threads > 64 */ 0.6;
-
- const bool parallelize_by_sharding_dim_only =
- sharding_dim_tasks >= oversharding_factor * num_worker_threads;
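- // For example (hypothetical numbers): with 16 worker threads the
- // oversharding factor is 2.0, so we parallelize by the sharding dimension
- // only when it has at least 2.0 * 16 = 32 tasks (e.g. nn >= 32 when
- // sharding by column).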
-
- // Last but not least, decide whether we want to issue both lhs and rhs
- // packing in parallel; or issue lhs packing first, and then issue rhs
- // packing when lhs packing completes (for !shard_by_col lhs and rhs are
- // swapped). Parallel packing allows more parallelism (for both packing and
- // kernels), while sequential packing provides better locality (once
- // a thread finishes rhs packing it proceeds to kernels with that rhs).
- // First, we are interested in parallel packing if there are few tasks.
- bool parallel_pack = num_threads >= nm * nn;
- // Also do parallel packing if all data fits into L2$.
- if (m * bk * Index(sizeof(LhsScalar)) + n * bk * Index(sizeof(RhsScalar)) <=
- l2CacheSize() * num_threads)
- parallel_pack = true;
- // But don't do it if we will use each rhs only once. Locality seems to be
- // more important in this case.
- if ((shard_by_col ? nm : nn) == 1) parallel_pack = false;
- // Also don't get in the way of parallelize_by_sharding_dim_only
- // optimization.
- if (parallelize_by_sharding_dim_only) parallel_pack = false;
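- // To illustrate with hypothetical numbers: for num_threads = 8 and
- // nm x nn = 2 x 4 tasks, num_threads >= nm * nn holds, so we start with
- // parallel_pack = true; it then survives only if the non-sharding
- // dimension has more than one task and the sharding-dim-only optimization
- // is disabled.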
-
- // TODO(ezhulenev): With if constexpr we don't need SyncEvalParallelContext.
- if (IsEvalInSyncMode) {
-#define CONTEXT_ARGS \
- (this, num_threads, buffer, m, n, k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, \
- nn0, shard_by_col, parallel_pack, parallelize_by_sharding_dim_only, \
- NoCallback()) \
- .run()
- TENSOR_CONTRACTION_DISPATCH(SyncEvalParallelContext, Alignment,
- CONTEXT_ARGS);
-#undef CONTEXT_ARGS
-
- } else {
-#define CONTEXT_ARGS \
- (this, num_threads, buffer, m, n, k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, \
- nn0, shard_by_col, parallel_pack, parallelize_by_sharding_dim_only, \
- std::move(done))
- TENSOR_CONTRACTION_ASYNC_DISPATCH(EvalParallelContext, DoneCallback,
- Alignment, CONTEXT_ARGS, run());
-#undef CONTEXT_ARGS
- }
- }
-
- // ------------------------------------------------------------------------ //
-
- // Dummy struct to represent an empty DoneCallback.
-
- struct NoCallback {
- void operator()() {
- eigen_assert(false && "NoCallback should never be called");
- }
- };
-
- // ------------------------------------------------------------------------ //
-
- template <typename DoneCallback, typename Context>
- class EvalParallelNotification;
-
- // Synchronous evaluation notification that blocks the caller thread in Wait().
- template <typename Context>
- class EvalParallelNotification<NoCallback, Context> {
- public:
- EvalParallelNotification(Context*, NoCallback) {}
- void Notify() { done_.Notify(); }
- void Wait() { done_.Wait(); }
- private:
- Eigen::Notification done_;
- };
-
- // Asynchronous evaluation notification that does not block in Wait().
- template <typename DoneCallback, typename Context>
- class EvalParallelNotification {
- public:
- EvalParallelNotification(Context* ctx, DoneCallback done)
- : ctx_(ctx), done_(std::move(done)) {}
-
- void Notify() {
- // Make a copy of the done callback, because it will be destroyed when we
- // delete the context on the next line (EvalParallelNotification is a
- // data member of the EvalParallelContext class).
- DoneCallback done_copy = std::move(done_);
-
- // Delete parallel evaluation context.
- delete ctx_;
-
- // Now safely call the done callback.
- done_copy();
- }
-
- void Wait() {}
-
- private:
- Context* ctx_;
- DoneCallback done_;
- };
-
- // Context orchestrates sync/async parallel contraction evaluation. When it is
- // executed in asynchronous mode, it owns all the shared state that might be
- // accessible by block packing and kernel tasks.
-
- template <typename DoneCallback, bool lhs_inner_dim_contiguous,
- bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered,
- int Alignment>
- class EvalParallelContext {
- public:
- typedef internal::TensorContractionInputMapper<
- LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t,
- contract_t, internal::packet_traits<LhsScalar>::size,
- lhs_inner_dim_contiguous, false, Unaligned>
- LhsMapper;
- typedef internal::TensorContractionInputMapper<
- RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t,
- contract_t, internal::packet_traits<RhsScalar>::size,
- rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned>
- RhsMapper;
-
- typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
-
- typedef internal::TensorContractionKernel<
- Scalar, LhsScalar, RhsScalar, Index, OutputMapper, LhsMapper, RhsMapper>
- TensorContractionKernel;
-
- typedef typename TensorContractionKernel::LhsBlock LhsBlock;
- typedef typename TensorContractionKernel::RhsBlock RhsBlock;
- typedef typename TensorContractionKernel::BlockMemHandle BlockMemHandle;
-
- EvalParallelContext(const Self* self, int num_threads, Scalar* buffer,
- Index tm, Index tn, Index tk, Index bm, Index bn,
- Index bk, Index nm, Index nn, Index nk, Index gm,
- Index gn, Index nm0, Index nn0, bool shard_by_col,
- bool parallel_pack,
- bool parallelize_by_sharding_dim_only,
- DoneCallback done)
- : created_by_thread_id_(std::this_thread::get_id()),
- done_(this, std::move(done)),
- device_(self->m_device),
- lhs_(self->m_leftImpl, self->m_left_nocontract_strides,
- self->m_i_strides, self->m_left_contracting_strides,
- self->m_k_strides),
- rhs_(self->m_rightImpl, self->m_right_nocontract_strides,
- self->m_j_strides, self->m_right_contracting_strides,
- self->m_k_strides),
- buffer_(buffer),
- output_(buffer, tm),
- output_kernel_(self->m_output_kernel),
- tensor_contraction_params_(self->m_tensor_contraction_params),
- num_threads_(num_threads),
- shard_by_col_(shard_by_col),
- parallel_pack_(parallel_pack),
- parallelize_by_sharding_dim_only_(parallelize_by_sharding_dim_only),
- m_(tm),
- n_(tn),
- k_(tk),
- bm_(bm),
- bn_(bn),
- bk_(bk),
- nm_(nm),
- nn_(nn),
- nk_(nk),
- gm_(gm),
- gn_(gn),
- nm0_(nm0),
- nn0_(nn0),
- kernel_(m_, k_, n_, bm_, bk_, bn_),
- num_thread_local_allocations_(0),
- // We reserve 2X more capacity for thread local values than the number
- // of threads in the pool, to efficiently handle task stealing by
- // threads that are not managed by the pool.
- thread_local_capacity(2 * (parallelize_by_sharding_dim_only_
- ? device_.numThreadsInPool()
- : 0)),
- // We will use only one of the Lhs/Rhs thread local storages, depending
- // on the shard_by_col value, and only when we parallelize by the
- // sharding dim.
- lhs_thread_local_blocks_(shard_by_col_ ? 0 : thread_local_capacity,
- {*this}, {*this}),
- rhs_thread_local_blocks_(shard_by_col_ ? thread_local_capacity : 0,
- {*this}, {*this}) {
- // These two options are mutually exclusive.
- eigen_assert(!(parallel_pack && parallelize_by_sharding_dim_only));
-
- for (Index x = 0; x < P; x++) {
- // The normal number of notifications for a k slice switch is
- // nm_ + nn_ + nm_ * nn_. However, the first P - 1 slices will receive
- // only nm_ + nn_ notifications, because they will not receive
- // notifications from preceding kernels.
- state_switch_[x] =
- x == 0
- ? 1
- : (parallel_pack_ ? nn_ + nm_ : (shard_by_col_ ? nn_ : nm_)) +
- (x == P - 1 ? nm_ * nn_ : 0);
- state_packing_ready_[x] =
- parallel_pack_ ? 0 : (shard_by_col_ ? nm_ : nn_);
- state_kernel_[x] = new std::atomic<uint8_t>*[nm_];
- for (Index m = 0; m < nm_; m++) {
- state_kernel_[x][m] = new std::atomic<uint8_t>[nn_];
- // Kernels generally receive 3 notifications (previous kernel + 2
- // packing), but the first slice won't get notifications from previous
- // kernels.
- for (Index n = 0; n < nn_; n++)
- state_kernel_[x][m][n].store(
- (x == 0 ? 0 : 1) + (parallel_pack_ ? 2 : 1),
- std::memory_order_relaxed);
- }
- }
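-
- // A small worked example of the counters above (hypothetical sizes):
- // with parallel_pack_ = false, shard_by_col_ = true, nm_ = 2, nn_ = 3 and
- // P = 3, state_switch_ becomes {1, 3, 3 + 2 * 3 = 9},
- // state_packing_ready_[x] = nm_ = 2, each state_kernel_[0][m][n] starts
- // at 1 (packing only) and each state_kernel_[1..2][m][n] starts at 2
- // (previous kernel + packing).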
-
- // Allocate memory for packed rhs/lhs matrices.
- packed_mem_ = kernel_.allocateSlices( //
- device_, //
- /*num_lhs=*/nm0_, //
- /*num_rhs=*/nn0_, //
- /*num_slices=*/std::min<Index>(nk_, P - 1), //
- packed_lhs_, packed_rhs_);
-
- if (parallelize_by_sharding_dim_only_) {
- const int num_worker_threads = device_.numThreadsInPool();
-
- if (shard_by_col) {
- can_use_thread_local_packed_ = new std::atomic<bool>[nn_];
- for (int i = 0; i < nn_; ++i)
- can_use_thread_local_packed_[i].store(true,
- std::memory_order_relaxed);
-
- Index num_blocks = num_worker_threads * gn_;
- thread_local_pre_allocated_mem_ = kernel_.allocateSlices( //
- device_, //
- /*num_lhs=*/0, //
- /*num_rhs=*/num_blocks, //
- /*num_slices=*/1, //
- /*lhs_blocks=*/nullptr, &rhs_thread_local_pre_allocated_);
-
- } else {
- can_use_thread_local_packed_ = new std::atomic<bool>[nm_];
- for (int i = 0; i < nm_; ++i)
- can_use_thread_local_packed_[i].store(true,
- std::memory_order_relaxed);
-
- Index num_blocks = num_worker_threads * gm_;
- thread_local_pre_allocated_mem_ = kernel_.allocateSlices( //
- device_, //
- /*num_lhs=*/num_blocks, //
- /*num_rhs=*/0, //
- /*num_slices=*/1, &lhs_thread_local_pre_allocated_, //
- /*rhs_blocks=*/nullptr);
- }
- }
- }
-
- ~EvalParallelContext() {
- for (Index x = 0; x < P; x++) {
- for (Index m = 0; m < nm_; m++) delete[] state_kernel_[x][m];
- delete[] state_kernel_[x];
- }
- kernel_.deallocate(device_, packed_mem_);
- if (parallelize_by_sharding_dim_only_) {
- kernel_.deallocate(device_, thread_local_pre_allocated_mem_);
- delete[] can_use_thread_local_packed_;
- }
- }
-
- void run() {
- // Kick off packing of the first slice.
- signal_switch(0, 1);
-
- // Wait for overall completion.
- //
- // If parallel evaluation is executed in async mode, this is a no-op, and
- // Wait() will return immediately. In synchronous mode it will block the
- // caller thread until it receives a notification from the last task.
- //
- // In async mode, the last task, when completed, will call the done
- // callback from the same thread, and will delete this context.
- //
- // TODO(dvyukov): This wait can lead to deadlock if contraction is
- // evaluated in synchronous mode. If nthreads contractions are
- // concurrently submitted from worker threads, this wait will block all
- // worker threads and the system will deadlock.
- done_.Wait();
- }
-
- private:
- std::thread::id created_by_thread_id_;
-
- // This notification is specialized on the type of DoneCallback and can be
- // blocking or non-blocking.
- EvalParallelNotification<DoneCallback, EvalParallelContext> done_;
-
- const Device& device_;
- LhsMapper lhs_;
- RhsMapper rhs_;
- Scalar* const buffer_;
- OutputMapper output_;
- OutputKernelType output_kernel_;
- TensorContractionParams tensor_contraction_params_;
- const int num_threads_;
- const bool shard_by_col_;
- const bool parallel_pack_;
- const bool parallelize_by_sharding_dim_only_;
- // Matrix sizes.
- const Index m_;
- const Index n_;
- const Index k_;
- // Block sizes.
- const Index bm_;
- const Index bn_;
- const Index bk_;
- // Number of tasks.
- const Index nm_;
- const Index nn_;
- const Index nk_;
- // Task grain sizes (number of kernels executed per task).
- const Index gm_;
- const Index gn_;
- // Number of blocks (this is different from nm_/nn_ because of task size
- // coarsening).
- const Index nm0_;
- const Index nn0_;
- // Tensor contraction kernel.
- TensorContractionKernel kernel_;
-
- // Parallelization strategy.
- //
- // Blocks related to the same k block can run in parallel because they
- // write to different output blocks. So we parallelize within k slices;
- // this gives us a parallelism level of m x n. Before we can start any
- // kernels related to the k-th slice, we need to issue m lhs packing tasks
- // and n rhs packing tasks.
- //
- // However, there is a bottleneck when we are finishing kernels for the
- // k-th slice (at the very end there is only 1 runnable kernel). To
- // mitigate this bottleneck we allow kernels from the k-th and (k+1)-th
- // slices to run in parallel. Note that (m, n, k) and (m, n, k+1) kernels
- // write to the same output block, so they must not run in parallel.
- //
- // This gives us the following dependency graph.
- // On each k slice we have m x n kernel tasks, m lhs packing tasks and
- // n rhs packing tasks.
- // Kernel (m, n, k) can start when:
- // - kernel (m, n, k-1) has finished
- // - lhs packing (m, k) has finished
- // - rhs packing (n, k) has finished
- // Lhs/rhs packing can start when:
- // - all k-1 packing has finished (artificially imposed to limit the
- //   amount of parallel packing)
- //
- // On top of that we limit runnable tasks to two consecutive k slices.
- // This is done to limit the amount of memory we need for packed lhs/rhs
- // (for each k slice we need m*bk + n*bk memory in packed_lhs_/packed_rhs_).
- //
- // state_switch_ tracks when we are ready to switch to the next k slice.
- // state_kernel_[m][n] tracks when we are ready to kick off kernel (m, n).
- // These variables roll over 3 consecutive k slices: the first two we are
- // actively executing, plus one to track completion of kernels in the
- // second slice.
- static const Index P = 3;
-
- // Handle to the allocated temporary storage for Lhs/Rhs blocks.
- BlockMemHandle packed_mem_;
- std::vector<LhsBlock> packed_lhs_[P - 1];
- std::vector<RhsBlock> packed_rhs_[P - 1];
-
- // If we choose to parallelize only by the sharding dimension, each thread
- // will have its own "thread local" (not a C++ thread local storage) memory
- // for packed_lhs or packed_rhs (for shard_by_col == false or true,
- // respectively). This memory can't be passed to a kernel that might
- // execute on a different thread.
- //
- // In practice, when we are ready to pack memory for the sharding dimension
- // (rhs if shard_by_col==true) of the K-th slice, all kernels for the K-1
- // slice have already completed (99% of the time), and we can pack data
- // into the thread local storage and guarantee that all the kernels will
- // be executed
- // immediately in the same thread. This significantly increases L1 cache hit
- // ratio and reduces pressure on the memory bus.
- //
- // It's still possible that kernel for the K-th slice will be ready before
- // completion of the K-1 kernel, so we have to allocate "global" packed_lhs_
- // and packed_rhs_ to allow kernels to be executed later on a thread
- // different from the thread that was used for packing.
-
- // Handle for pre-allocated thread local memory buffers.
- BlockMemHandle thread_local_pre_allocated_mem_;
-
- // Only one of these will be initialized depending on shard_by_col value
- // (the size will be `num_worker_threads * num_grains_in_the_sharding_dim`).
- std::vector<LhsBlock> lhs_thread_local_pre_allocated_;
- std::vector<RhsBlock> rhs_thread_local_pre_allocated_;
-
- // How many thread local blocks were already allocated.
- std::atomic<int> num_thread_local_allocations_;
- const int thread_local_capacity;
-
- // We will use the pre-allocated Lhs/Rhs blocks defined above if the
- // number of unique threads in a system is below or equal to the number of
- // threads in the thread pool. We will fall back on dynamic memory
- // allocation after that.
-
- // ThreadLocalBlocks is a container for Lhs or Rhs thread local buffers. Its
- // size is equal to the grain size in Lhs/Rhs sharding dimension.
- template <typename BlockType>
- class ThreadLocalBlocks {
- public:
- ThreadLocalBlocks() = default;
-
- ThreadLocalBlocks(BlockType* base, size_t grain_size)
- : is_pre_allocated_(true),
- thread_local_pre_allocated_base_(base),
- grain_size_(grain_size) {}
-
- ThreadLocalBlocks(BlockMemHandle mem_handle,
- std::vector<BlockType> blocks)
- : is_pre_allocated_(false),
- mem_handle_(std::move(mem_handle)),
- blocks_(std::move(blocks)) {}
-
- BlockType& block(int grain_index) {
- eigen_assert(grain_index >= 0);
- eigen_assert(static_cast<size_t>(grain_index) < size());
- return is_pre_allocated_ ? thread_local_pre_allocated_base_[grain_index]
- : blocks_[grain_index];
- }
-
- void Release(EvalParallelContext& ctx) const {
- if (!is_pre_allocated_) {
- ctx.kernel_.deallocate(ctx.device_, mem_handle_);
- }
- }
-
- size_t size() const {
- return is_pre_allocated_ ? grain_size_ : blocks_.size();
- }
-
- private:
- bool is_pre_allocated_;
-
- // Reuse pre-allocated thread local buffers.
- BlockType* thread_local_pre_allocated_base_ = nullptr;
- size_t grain_size_ = 0;
-
- // These will be initialized only if `is_pre_allocated_ == false`.
- BlockMemHandle mem_handle_{};
- std::vector<BlockType> blocks_;
- };
-
- // The ThreadLocalBlocksInitialize callable does custom thread local block
- // initialization: it will reuse pre-allocated buffers if possible, or
- // dynamically allocate new memory otherwise.
- //
- // Lhs/Rhs blocks might be of the same type, so we have to pass explicitly
- // for which side we plan to do block allocation.
- template <typename BlockType, bool is_rhs>
- class ThreadLocalBlocksInitialize {
- static constexpr bool kIsLhs =
- !is_rhs && std::is_same<BlockType, LhsBlock>::value;
- static const bool kIsRhs =
- is_rhs && std::is_same<BlockType, RhsBlock>::value;
- static_assert(kIsLhs || kIsRhs, "Unknown block type");
-
- using Blocks = ThreadLocalBlocks<BlockType>;
-
- public:
- ThreadLocalBlocksInitialize(EvalParallelContext& ctx)
- : ctx_(ctx),
- num_worker_threads_(ctx_.device_.numThreadsInPool()) {}
-
- void operator()(Blocks& blocks) {
- const int n = ctx_.num_thread_local_allocations_.fetch_add(
- 1, std::memory_order_relaxed);
-
- if (n >= num_worker_threads_) {
- ThreadLocalBlocksAllocator<is_rhs>::allocate(ctx_, blocks);
- } else {
- ThreadLocalBlocksAllocator<is_rhs>::reuse(ctx_, n, blocks);
- }
- }
-
- private:
- // NOTE(ezhulenev): Without 'if constexpr' we have to put calls to
- // TensorContractionKernel::allocateSlices into template specializations.
- // Also, explicit specializations are not allowed at class scope in C++03;
- // the EvalCtx type parameter is just a workaround for that limitation.
- template <bool pack_rhs, typename EvalCtx = EvalParallelContext>
- struct ThreadLocalBlocksAllocator;
-
- template <typename EvalCtx>
- struct ThreadLocalBlocksAllocator</*pack_rhs=*/true, EvalCtx> {
- static void allocate(EvalCtx& ctx, Blocks& blocks) {
- std::vector<RhsBlock> rhs_blocks;
- BlockMemHandle mem_handle = ctx.kernel_.allocateSlices(
- ctx.device_,
- /*num_lhs=*/0,
- /*num_rhs=*/ctx.gn_,
- /*num_slices=*/1,
- /*lhs_blocks=*/nullptr, /*rhs_blocks=*/&rhs_blocks);
-
- blocks = ThreadLocalBlocks<RhsBlock>(std::move(mem_handle),
- std::move(rhs_blocks));
- }
-
- static void reuse(EvalCtx& ctx, int index, Blocks& blocks) {
- RhsBlock* ptr = &ctx.rhs_thread_local_pre_allocated_[ctx.gn_ * index];
- blocks = ThreadLocalBlocks<RhsBlock>(ptr, ctx.gn_);
- }
- };
-
- template <typename EvalCtx>
- struct ThreadLocalBlocksAllocator</*pack_rhs=*/false, EvalCtx> {
- static void allocate(EvalCtx& ctx, Blocks& blocks) {
- std::vector<LhsBlock> lhs_blocks;
- BlockMemHandle mem_handle = ctx.kernel_.allocateSlices(
- ctx.device_,
- /*num_lhs=*/ctx.gm_,
- /*num_rhs=*/0,
- /*num_slices=*/1,
- /*lhs_blocks=*/&lhs_blocks, /*rhs_blocks=*/nullptr);
-
- blocks = ThreadLocalBlocks<LhsBlock>(std::move(mem_handle),
- std::move(lhs_blocks));
- }
-
- static void reuse(EvalCtx& ctx, int index, Blocks& blocks) {
- LhsBlock* ptr = &ctx.lhs_thread_local_pre_allocated_[ctx.gm_ * index];
- blocks = ThreadLocalBlocks<LhsBlock>(ptr, ctx.gm_);
- }
- };
-
- EvalParallelContext& ctx_;
- const int num_worker_threads_;
- };
-
- template <typename BlockType>
- class ThreadLocalBlocksRelease {
- public:
- using Blocks = ThreadLocalBlocks<BlockType>;
- ThreadLocalBlocksRelease(EvalParallelContext& ctx) : ctx_(ctx) {}
- void operator()(Blocks& blocks) { blocks.Release(ctx_); }
-
- private:
- EvalParallelContext& ctx_;
- };
-
- // ThreadLocalBlocks initialization callables.
- using ThreadLocalLhsInit =
- ThreadLocalBlocksInitialize<LhsBlock, /*is_rhs=*/false>;
- using ThreadLocalRhsInit =
- ThreadLocalBlocksInitialize<RhsBlock, /*is_rhs=*/true>;
-
- // ThreadLocalBlocks release callables.
- using ThreadLocalLhsRelease = ThreadLocalBlocksRelease<LhsBlock>;
- using ThreadLocalRhsRelease = ThreadLocalBlocksRelease<RhsBlock>;
-
- // Thread local containers for Lhs/Rhs block packs. In practice only one of
- // them will be used, depending on the shard_by_col value.
- Eigen::ThreadLocal<ThreadLocalBlocks<LhsBlock>, ThreadLocalLhsInit,
- ThreadLocalLhsRelease>
- lhs_thread_local_blocks_;
- Eigen::ThreadLocal<ThreadLocalBlocks<RhsBlock>, ThreadLocalRhsInit,
- ThreadLocalRhsRelease>
- rhs_thread_local_blocks_;
-
- // After a particular shard of the K-th slice misses its thread local
- // execution opportunity (the K-1 slice didn't complete kernel execution),
- // we can no longer schedule the K+1 and following slices in thread local
- // mode, because there is no longer a guarantee that previous kernels were
- // executed sequentially in the same thread (the array size is nn_ or nm_).
- std::atomic<bool>* can_use_thread_local_packed_;
-
- std::atomic<uint8_t>** state_kernel_[P];
- // state_switch_ is frequently modified by worker threads, while other
- // fields are read-only after constructor. Let's move it to a separate cache
- // line to reduce cache-coherency traffic.
- char pad_[128];
- std::atomic<Index> state_packing_ready_[P];
- std::atomic<Index> state_switch_[P];
-
- LhsBlock& packed_lhs(Index m, Index k, Index m1, bool use_thread_local) {
- if (use_thread_local) {
- eigen_assert(!shard_by_col_);
- ThreadLocalBlocks<LhsBlock>& blocks = lhs_thread_local_blocks_.local();
-
- Index grain_index = m1 - m * gm_;
- return blocks.block(internal::convert_index<int>(grain_index)); // FIXME better make ThreadLocalBlocks use Eigen::Index?
- } else {
- return packed_lhs_[k % (P - 1)][m1];
- }
- }
-
- RhsBlock& packed_rhs(Index n, Index k, Index n1, bool use_thread_local) {
- if (use_thread_local) {
- eigen_assert(shard_by_col_);
- ThreadLocalBlocks<RhsBlock>& blocks = rhs_thread_local_blocks_.local();
-
- Index grain_index = n1 - n * gn_;
- return blocks.block(internal::convert_index<int>(grain_index)); // FIXME better make ThreadLocalBlocks use Eigen::Index?
- } else {
- return packed_rhs_[k % (P - 1)][n1];
- }
- }
-
- // In the following two methods (pack_lhs and pack_rhs), if we know for
- // sure that we'll be able to call a kernel with the packed data
- // immediately, without submitting it to the thread pool, we can use
- // thread local memory for the packed data.
- //
- // We can only reliably check it if we are running all kernels in sync mode
- // (parallelize only by sharding dim). If kernel for m==0 (n==0) is ready to
- // run, it's guaranteed that all kernels with larger values of m (n) are
- // also ready, because we execute them in the same order for all K slices.
-
- void pack_lhs(Index m, Index k) {
- bool use_thread_local = false;
-
- if (parallelize_by_sharding_dim_only_ && !shard_by_col_ &&
- can_use_thread_local_packed_[m].load(std::memory_order_relaxed)) {
- if (state_kernel_[k % P][m][0].load(std::memory_order_relaxed) == 1) {
- use_thread_local = true;
- } else {
- // If we can't guarantee that all kernels in the `k` slice will be
- // executed sequentially in the current thread, it's no longer safe to use
- // thread local memory in following slices along the k dimension.
- eigen_assert(k > 0);
- can_use_thread_local_packed_[m].store(false,
- std::memory_order_relaxed);
- }
- }
-
- const Index mend = m * gm_ + gm(m);
- for (Index m1 = m * gm_; m1 < mend; m1++)
- kernel_.packLhs(&packed_lhs(m, k, m1, use_thread_local),
- lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1));
-
- if (!parallel_pack_ && shard_by_col_) {
- assert(!use_thread_local);
- signal_packing(k);
- } else {
- signal_switch(k + 1);
- for (Index n = nn_ - 1; n >= 0; n--) {
- bool sync = parallelize_by_sharding_dim_only_ || n == 0;
- signal_kernel(m, n, k, sync, use_thread_local);
- }
- }
- }
-
- void pack_rhs(Index n, Index k) {
- bool use_thread_local = false;
-
- if (parallelize_by_sharding_dim_only_ && shard_by_col_ &&
- can_use_thread_local_packed_[n].load(std::memory_order_relaxed)) {
- if (state_kernel_[k % P][0][n].load(std::memory_order_relaxed) == 1) {
- use_thread_local = true;
- } else {
- // If we can't guarantee that all kernels in the `k` slice will be
- // executed sequentially in the current thread, it's no longer safe to use
- // thread local memory in following slices along the k dimension.
- eigen_assert(k > 0);
- can_use_thread_local_packed_[n].store(false,
- std::memory_order_relaxed);
- }
- }
-
- const Index nend = n * gn_ + gn(n);
- for (Index n1 = n * gn_; n1 < nend; n1++) {
- if (!TensorContractionKernel::HasBeta && k == 0) {
- // Zero the output memory in parallel, only if contraction kernel does
- // not support `beta`. Otherwise we will pass beta 0.0 to the first
- // call to the `TensorContractionKernel::invoke()`.
- //
- // On a 10000x2x10000 matmul, zeroing can easily take half of the time.
- // Zero the (bn x m) row. It is safe to do here because all kernels that
- // will write to this memory depend on completion of this task. Note:
- // don't call device_.memset() here, because it blocks on a thread pool
- // worker thread, which can lead to underutilization and deadlocks.
- memset(buffer_ + n1 * bn_ * m_, 0, bn(n1) * m_ * sizeof(Scalar));
- }
- kernel_.packRhs(&packed_rhs(n, k, n1, use_thread_local),
- rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1));
- }
-
- if (parallel_pack_ || shard_by_col_) {
- signal_switch(k + 1);
- for (Index m = nm_ - 1; m >= 0; m--) {
- bool sync = parallelize_by_sharding_dim_only_ || m == 0;
- signal_kernel(m, n, k, sync, use_thread_local);
- }
- } else {
- assert(!use_thread_local);
- signal_packing(k);
- }
- }
-
- void kernel(Index m, Index n, Index k, bool use_thread_local) {
- // Note: order of iteration matters here. Iteration over m is innermost
- // because we want to reuse the same packed rhs in consecutive tasks
- // (rhs fits into L2$ while lhs only into L3$).
- const Index nend = n * gn_ + gn(n);
- const Index mend = m * gm_ + gm(m);
-
- // NOTE: output = alpha * LHS * RHS + beta * output.
- const Scalar alpha = Scalar(1);
- const Scalar beta =
- (TensorContractionKernel::HasBeta && k == 0) ? Scalar(0) : Scalar(1);
-
- if (shard_by_col_) {
- for (Index n1 = n * gn_; n1 < nend; n1++) {
- for (Index m1 = m * gm_; m1 < mend; m1++) {
- const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_);
- kernel_.invoke(
- output_mapper,
- packed_lhs(m, k, m1, !shard_by_col_ && use_thread_local),
- packed_rhs(n, k, n1, shard_by_col_ && use_thread_local), bm(m1),
- bk(k), bn(n1), alpha, beta);
-
- // We are done with the last task for the [m1, n1] block.
- if (k + 1 == nk_) {
- output_kernel_(output_mapper, tensor_contraction_params_,
- m1 * bm_, n1 * bn_, bm(m1), bn(n1));
- }
- }
- }
- } else {
- for (Index m1 = m * gm_; m1 < mend; m1++)
- for (Index n1 = n * gn_; n1 < nend; n1++) {
- const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_);
- kernel_.invoke(
- output_mapper,
- packed_lhs(m, k, m1, !shard_by_col_ && use_thread_local),
- packed_rhs(n, k, n1, shard_by_col_ && use_thread_local), bm(m1),
- bk(k), bn(n1), alpha, beta);
-
- // We are done with the last task for the [m1, n1] block.
- if (k + 1 == nk_) {
- output_kernel_(output_mapper, tensor_contraction_params_,
- m1 * bm_, n1 * bn_, bm(m1), bn(n1));
- }
- }
- }
- signal_kernel(m, n, k + 1, /*sync=*/false, /*use_thread_local=*/false);
- signal_switch(k + 2);
- }
-
- void signal_packing(Index k) {
- eigen_assert(!parallel_pack_);
- Index s = state_packing_ready_[k % P].fetch_sub(1);
- eigen_assert(s > 0);
- if (s != 1) return;
- state_packing_ready_[k % P] = shard_by_col_ ? nm_ : nn_;
- enqueue_packing(k, shard_by_col_);
- }
-
- void signal_kernel(Index m, Index n, Index k, bool sync,
- bool use_thread_local) {
- std::atomic<uint8_t>* state = &state_kernel_[k % P][m][n];
- Index s = state->load();
- eigen_assert(s > 0);
- if (s != 1 && state->fetch_sub(1) != 1) {
- eigen_assert(!use_thread_local);
- return;
- }
- state->store(parallel_pack_ ? 3 : 2, std::memory_order_relaxed);
- if (sync) {
- kernel(m, n, k, use_thread_local);
- } else {
- eigen_assert(!use_thread_local);
- device_.enqueueNoNotification(
- [=]() { kernel(m, n, k, use_thread_local); });
- }
- }
-
- void signal_switch(Index k, Index v = 1) {
- Index s = state_switch_[k % P].fetch_sub(v);
- eigen_assert(s >= v);
- if (s != v) return;
-
- // Ready to switch to the next k slice.
- // Reset counter for the next iteration.
- state_switch_[k % P] =
- (parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_)) +
- nm_ * nn_;
- if (k < nk_) {
- // Issue lhs/rhs packing. Their completion will in turn kick off
- // kernels.
- if (parallel_pack_) {
- enqueue_packing(k, !shard_by_col_);
- enqueue_packing(k, shard_by_col_);
- } else if (shard_by_col_) {
- enqueue_packing(k, false);
- } else {
- enqueue_packing(k, true);
- }
-
- // Termination handling.
- // Because kernel completion signals k + 2 switch, we need to finish nk
- // + 2 slices without issuing any tasks on nk + 1 slice. So here we
- // pretend that all nk + 1 packing tasks just finish instantly; so that
- // nk + 2 switch only waits for completion of nk kernels.
- } else if (k == nk_) {
- signal_switch(k + 1,
- parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_));
- } else {
- done_.Notify();
- }
- }
-
- // Enqueue all rhs/lhs packing for k-th slice.
- void enqueue_packing(Index k, bool rhs) {
- enqueue_packing_helper(0, rhs ? nn_ : nm_, k, rhs);
- }
-
- void enqueue_packing_helper(Index start, Index end, Index k, bool rhs) {
- if (end - start == 1) {
- if (rhs)
- pack_rhs(start, k);
- else
- pack_lhs(start, k);
- } else {
- while (end - start > 1) {
- Index mid = (start + end) / 2;
- device_.enqueueNoNotification(
- [=]() { enqueue_packing_helper(mid, end, k, rhs); });
- end = mid;
- }
-
- // Decide if we want to run first packing task (start == 0) in
- // async mode if we parallelize only by sharding dim:
- // (1) pack_lhs and pack_rhs call signal_switch before completing
- // all calls to signal_kernel, which in sync mode might lead
- // to the execution of the first kernel of the k+1 slice, before
- // completing a call to the last kernel of the k slice.
- // (2) all pack tasks for the sharded dim must be executed in a thread
- //     pool to get pre-allocated thread local buffers.
- bool pack_async =
- (start == 0) &&
- (parallelize_by_sharding_dim_only_ && shard_by_col_ == rhs) &&
- (k > 0 || std::this_thread::get_id() == created_by_thread_id_);
-
- if (pack_async) {
- device_.enqueueNoNotification(
- [=]() { enqueue_packing_helper(start, end, k, rhs); });
- } else {
- enqueue_packing_helper(start, end, k, rhs);
- }
- }
- }
-
- // Block sizes, accounting for a potentially incomplete last block.
- Index bm(Index m) const { return m + 1 < nm0_ ? bm_ : m_ + bm_ - bm_ * nm0_; }
- Index bn(Index n) const { return n + 1 < nn0_ ? bn_ : n_ + bn_ - bn_ * nn0_; }
- Index bk(Index k) const { return k + 1 < nk_ ? bk_ : k_ + bk_ - bk_ * nk_; }
- // Task grain sizes, accounting for a potentially incomplete last task.
- Index gm(Index m) const { return m + 1 < nm_ ? gm_ : nm0_ + gm_ - gm_ * nm_; }
- Index gn(Index n) const { return n + 1 < nn_ ? gn_ : nn0_ + gn_ - gn_ * nn_; }
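- // Worked example for the incomplete-block arithmetic above (hypothetical
- // sizes): with m_ = 100, bm_ = 32 and nm0_ = 4, the last block is
- // bm(3) = 100 + 32 - 32 * 4 = 4 rows, while bm(0..2) = 32.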
-
- EvalParallelContext(const EvalParallelContext&) = delete;
- void operator=(const EvalParallelContext&) = delete;
- };
-
- template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
- bool rhs_inner_dim_reordered, int Alignment>
- using SyncEvalParallelContext =
- EvalParallelContext<NoCallback, lhs_inner_dim_contiguous,
- rhs_inner_dim_contiguous, rhs_inner_dim_reordered,
- Alignment>;
-
- // ------------------------------------------------------------------------ //
-
- // EvalShardedByInnerDimContext orchestrates sync/async contraction
- // evaluation when we shard by the inner dimension. When it is executed in
- // asynchronous mode, it owns all the shared state that might be accessible
- // by block processing tasks.
-
- template <typename DoneCallback>
- struct EvalShardedByInnerDimContext {
- EvalShardedByInnerDimContext(const Self* self, int num_threads,
- Scalar* result_buffer,
- Index m_size, Index n_size, Index k_size,
- DoneCallback done_callback)
- : evaluator(self),
- m_lhs_inner_dim_contiguous(evaluator->m_lhs_inner_dim_contiguous),
- m_rhs_inner_dim_contiguous(evaluator->m_rhs_inner_dim_contiguous),
- m_rhs_inner_dim_reordered(evaluator->m_rhs_inner_dim_reordered),
- result(result_buffer),
- m(m_size),
- n(n_size),
- k(k_size),
- done(std::move(done_callback)),
- buffer_size_bytes(m * n * sizeof(Scalar)),
- block_size(blockSize(k, num_threads)),
- num_blocks(divup<Index>(k, block_size)),
- num_pending_blocks(internal::convert_index<int>(num_blocks)),
- l0_ranges(divup<Index>(num_blocks, l0_size)),
- l0_state(l0_ranges),
- block_buffers(num_blocks) {
- // Keep count of pending gemm tasks for each l0 range.
- for (int i = 0; i < l0_ranges; ++i) {
- const Index num_pending_tasks = actualRangeSize(l0_ranges, l0_size, i);
- l0_state.emplace_back(internal::convert_index<int>(num_pending_tasks));
- }
-
- // Allocate temporary buffers for each block.
- for (Index block_idx = 0; block_idx < num_blocks; ++block_idx) {
- Scalar* buf = block_idx == 0
- ? result
- : static_cast<Scalar*>(evaluator->m_device.allocate(
- buffer_size_bytes));
- block_buffers.emplace_back(buf);
- }
- }
-
- ~EvalShardedByInnerDimContext() {
- for (Index i = 1; i < num_blocks; ++i) {
- evaluator->m_device.deallocate(block_buffers[i]);
- }
- }
-
- template <int Alignment>
- void run() {
- Barrier barrier(internal::convert_index<int>(num_blocks));
- eval<Alignment>(barrier, 0, num_blocks);
- barrier.Wait();
-
- // Aggregate partial sums from l0 ranges.
- aggregateL0Blocks<Alignment>();
-
- // Apply output kernel.
- applyOutputKernel();
- }
-
- template <int Alignment>
- void runAsync() {
- evalAsync<Alignment>(0, num_blocks);
- }
-
- private:
- // The underlying GEMM kernel assumes that k is a multiple of
- // the packet size and subtle breakage occurs if this is violated.
- static const Index packet_size = internal::packet_traits<RhsScalar>::size;
-
- const Self* evaluator; // TensorContraction evaluator
-
- // These fields are required by the TENSOR_CONTRACTION_DISPATCH macro.
- bool m_lhs_inner_dim_contiguous;
- bool m_rhs_inner_dim_contiguous;
- bool m_rhs_inner_dim_reordered;
-
- Scalar* result;
-
- Index m;
- Index n;
- Index k;
-
- DoneCallback done;
-
- // ----------------------------------------------------------------------//
- // Algorithm parameters.
-
- // We will compute partial results into buffers of this size.
- Index buffer_size_bytes;
-
- Index block_size;
- Index num_blocks;
-
- // Keep track of pending tasks when evaluating in async mode.
- std::atomic<int> num_pending_blocks;
-
- // We compute partial gemm results in parallel, and to get the final result
- // we need to add them all together. For a large number of threads (>= 48)
- // this adds a very expensive sequential step at the end.
- //
- // We split the [0, num_blocks) into small ranges, and when a task for the
- // block finishes its partial gemm computation, it checks if it was the last
- // gemm in the range, and if so, it will add all blocks of the range.
- //
- // After all tasks are done, we only need to add these pre-aggregated blocks.
-
- // For now we use just a single level of ranges to compute pre-aggregated
- // partial sums, but in general we can use more layers to compute tree
- // aggregation in parallel and reduce the size of the sequential step.
- //
- // TODO(ezhulenev): Add multilevel tree aggregation? Probably will make
- // sense only if number of threads >= ~128?
- static const Index l0_size = 4;
- Index l0_ranges;
-
- // Keep count of pending gemm tasks for each l0 range.
- MaxSizeVector<std::atomic<int>> l0_state; // [0, l0_ranges)
-
- // Buffers allocated for each temporary block computation.
- MaxSizeVector<Scalar*> block_buffers; // [0, num_blocks)
-
- template <int Alignment>
- void processBlock(Index block_idx, Index begin, Index end) {
- Scalar* buf = block_buffers[block_idx];
-
- TENSOR_CONTRACTION_DISPATCH(
- evaluator->template evalGemmPartialWithoutOutputKernel, Alignment,
- (buf, begin, end,
- /*num_threads=*/internal::convert_index<int>(num_blocks)));
-
- // Check if it was the last task in l0 range.
- const Index l0_index = block_idx / l0_size;
- const int v = l0_state[l0_index].fetch_sub(1);
- eigen_assert(v >= 1);
-
- // If we processed the last block of the range, we can aggregate all
- // partial results into the first block of the range.
- if (v == 1) {
- const Index rng_size = actualRangeSize(l0_ranges, l0_size, l0_index);
- const Index dst_block_idx = l0_index * l0_size;
-
- if (rng_size == l0_size) {
- addAllToBuffer<Alignment>(
- m * n,
- /*src_buf0=*/block_buffers[dst_block_idx + 1],
- /*src_buf1=*/block_buffers[dst_block_idx + 2],
- /*src_buf2=*/block_buffers[dst_block_idx + 3],
- /*dst_buf= */ block_buffers[dst_block_idx]);
- } else {
- // Aggregate blocks of potentially incomplete last range.
- for (int i = 1; i < rng_size; ++i) {
- addToBuffer<Alignment>(m * n,
- /*src_buf=*/block_buffers[dst_block_idx + i],
- /*dst_buf=*/block_buffers[dst_block_idx]);
- }
- }
- }
- }
-
- // Aggregate partial sums from l0 ranges.
- template <int Alignment>
- void aggregateL0Blocks() const {
- Index l0_index = 1;
-
- for (; l0_index + 2 < l0_ranges; l0_index += 3) {
- addAllToBuffer<Alignment>(
- m * n,
- /*src_buf0=*/block_buffers[(l0_index + 0) * l0_size],
- /*src_buf1=*/block_buffers[(l0_index + 1) * l0_size],
- /*src_buf2=*/block_buffers[(l0_index + 2) * l0_size],
- /*dst_buf= */ block_buffers[0]);
- }
-
- for (; l0_index < l0_ranges; ++l0_index) {
- addToBuffer<Alignment>(m * n, block_buffers[l0_index * l0_size],
- block_buffers[0]);
- }
- }
-
- void applyOutputKernel() const {
- typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
- evaluator->m_output_kernel(
- OutputMapper(result, m), evaluator->m_tensor_contraction_params,
- static_cast<Eigen::Index>(0), static_cast<Eigen::Index>(0), m, n);
- }
-
- // Compute block size with accounting for potentially incomplete last block.
- Index actualBlockSize(Index block_idx) const {
- return block_idx + 1 < num_blocks
- ? block_size
- : k + block_size - block_size * num_blocks;
- };
-
- // Compute range size with accounting for potentially incomplete last range.
- Index actualRangeSize(Index num_ranges, Index range_size,
- Index range_idx) const {
- eigen_assert(range_idx < num_ranges);
- return range_idx + 1 < num_ranges
- ? range_size
- : num_blocks + range_size - range_size * num_ranges;
- };
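-
- // E.g. (hypothetical sizes): num_blocks = 10 and l0_size = 4 give
- // l0_ranges = divup(10, 4) = 3, and actualRangeSize(3, 4, 2) =
- // 10 + 4 - 4 * 3 = 2, so the last range aggregates only 2 blocks.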
-
- template <int Alignment>
- EIGEN_STRONG_INLINE static void addToBuffer(size_t n, const Scalar* src_buf,
- Scalar* tgt_buf) {
- const int output_packet_size =
- internal::unpacket_traits<PacketReturnType>::size;
- size_t i = 0;
- const size_t num_packets = n / output_packet_size;
- for (; i < output_packet_size * num_packets; i += output_packet_size) {
- const PacketReturnType src_val =
- internal::pload<PacketReturnType>(src_buf + i);
- const PacketReturnType tgt_val =
- internal::ploadt<PacketReturnType, Alignment>(tgt_buf + i);
- const PacketReturnType sum = internal::padd(src_val, tgt_val);
- internal::pstoret<Scalar, PacketReturnType, Alignment>(tgt_buf + i,
- sum);
- }
- for (; i < n; ++i) {
- tgt_buf[i] += src_buf[i];
- }
- }
-
- template <int Alignment>
- EIGEN_STRONG_INLINE static void addAllToBuffer(size_t n,
- const Scalar* src_buf0,
- const Scalar* src_buf1,
- const Scalar* src_buf2,
- Scalar* dst_buf) {
- using ::Eigen::internal::padd;
- using ::Eigen::internal::pload;
- using ::Eigen::internal::ploadt;
- using ::Eigen::internal::pstoret;
-
- const int output_packet_size =
- internal::unpacket_traits<PacketReturnType>::size;
-
- size_t i = 0;
- const size_t num_packets = n / output_packet_size;
- for (; i < output_packet_size * num_packets; i += output_packet_size) {
- const auto src_val0 = pload<PacketReturnType>(src_buf0 + i);
- const auto src_val1 = pload<PacketReturnType>(src_buf1 + i);
- const auto src_val2 = pload<PacketReturnType>(src_buf2 + i);
-
- const auto dst_val = ploadt<PacketReturnType, Alignment>(dst_buf + i);
- const auto sum =
- padd(padd(dst_val, src_val0), padd(src_val1, src_val2));
-
- pstoret<Scalar, PacketReturnType, Alignment>(dst_buf + i, sum);
- }
- for (; i < n; ++i) {
- dst_buf[i] += src_buf0[i] + src_buf1[i] + src_buf2[i];
- }
- }
-
- template <int Alignment>
- void eval(Barrier& barrier, Index start_block_idx, Index end_block_idx) {
- while (end_block_idx - start_block_idx > 1) {
- Index mid_block_idx = (start_block_idx + end_block_idx) / 2;
- evaluator->m_device.enqueueNoNotification(
- [this, &barrier, mid_block_idx, end_block_idx]() {
- eval<Alignment>(barrier, mid_block_idx, end_block_idx);
- });
- end_block_idx = mid_block_idx;
- }
-
- Index block_idx = start_block_idx;
- Index block_start = block_idx * block_size;
- Index block_end = block_start + actualBlockSize(block_idx);
-
- processBlock<Alignment>(block_idx, block_start, block_end);
- barrier.Notify();
- }
-
- template <int Alignment>
- void evalAsync(Index start_block_idx, Index end_block_idx) {
- while (end_block_idx - start_block_idx > 1) {
- Index mid_block_idx = (start_block_idx + end_block_idx) / 2;
- evaluator->m_device.enqueueNoNotification(
- [this, mid_block_idx, end_block_idx]() {
- evalAsync<Alignment>(mid_block_idx, end_block_idx);
- });
- end_block_idx = mid_block_idx;
- }
-
- Index block_idx = start_block_idx;
-
- Index block_start = block_idx * block_size;
- Index block_end = block_start + actualBlockSize(block_idx);
-
- processBlock<Alignment>(block_idx, block_start, block_end);
-
- int v = num_pending_blocks.fetch_sub(1);
- eigen_assert(v >= 1);
-
- if (v == 1) {
- // Aggregate partial sums from l0 ranges.
- aggregateL0Blocks<Alignment>();
-
- // Apply output kernel.
- applyOutputKernel();
-
- // NOTE: If we call the `done` callback before deleting this context,
- // it might deallocate the Self* pointer captured by the context, and
- // we'll fail in the destructor trying to deallocate the temporary buffers.
-
- // Move the done callback out of the context before it is destroyed.
- DoneCallback done_copy = std::move(done);
-
- // We are confident that we are the last one who touches context.
- delete this;
-
- // Now safely call the done callback.
- done_copy();
- }
- }
-
- // The cost model doesn't capture well the cost associated with
- // constructing tensor contraction mappers and computing loop bounds in
- // gemm_pack_lhs and gemm_pack_rhs, so we specify a minimum desired block
- // size.
- static Index blockSize(Index k, int num_threads) {
- const auto round_up = [=](Index index) -> Index {
- const Index kmultiple = packet_size <= 8 ? 8 : packet_size;
- return divup<Index>(index, kmultiple) * kmultiple;
- };
-
- const Index target_block_size = round_up(divup<Index>(k, num_threads));
- const Index desired_min_block_size = 12 * packet_size;
-
- return numext::mini<Index>(
- k, numext::maxi<Index>(desired_min_block_size, target_block_size));
- }
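-
- // A hypothetical example of the block size computation: for k = 1000,
- // num_threads = 8 and packet_size = 4, kmultiple = 8 and
- // target_block_size = round_up(divup(1000, 8)) = round_up(125) = 128;
- // since desired_min_block_size = 12 * 4 = 48, blockSize returns
- // min(1000, max(48, 128)) = 128, giving divup(1000, 128) = 8 blocks
- // (the last one covering only 104 of the k values).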
-
- EvalShardedByInnerDimContext(const EvalShardedByInnerDimContext&) = delete;
- void operator=(const EvalShardedByInnerDimContext&) = delete;
- };
-
- // ------------------------------------------------------------------------ //
-
- // Below are the functions used by the evalProductImpl heuristics to
- // select optimal parameters for the parallelization algorithm.
-
- // Decide whether we want to shard m x n contraction by columns or by rows.
- static bool shardByCol(Index m, Index n, Index num_threads) {
- // Note: we are comparing both n and m against Traits::nr; it is not
- // a mistake. We are trying to figure out how both n and m will fit into
- // the main sharding dimension.
-
- // Sharding by column is the default
- // ... unless there is enough data for vectorization over rows
- if (m / num_threads >= Traits::nr &&
- // and not enough data for vectorization over columns
- (n / num_threads < Traits::nr ||
- // ... or barely enough data for vectorization over columns,
- // but it is not evenly divisible across threads
- (n / num_threads < 4 * Traits::nr &&
- (n % (num_threads * Traits::nr)) != 0 &&
- // ... and it is evenly divisible across threads for rows
- ((m % (num_threads * Traits::nr)) == 0 ||
- // ... or it is not evenly divisible for either dimension, but
- // there is much more data over rows so that corner effects are
- // mitigated.
- (m / n >= 6)))))
- return false;
- // Also shard by row if the matrix is substantially elongated over the
- // row dimension (m much larger than n).
- if (n / num_threads < 16 * Traits::nr && m > n * 32) return false;
- return true;
- }
-
- Index coarsenM(Index m, Index n, Index bm, Index bn, Index bk, Index gn,
- int num_threads, bool shard_by_col) const {
- Index gm = 1;
- Index gm1 = 1;
- Index nm0 = divup(m, bm);
- Index nm1 = nm0;
- for (;;) {
- // Find the next candidate for m grain size. It needs to result in
- // different number of blocks. E.g. if we have 10 kernels, we want to try
- // 5 and 10, but not 6, 7, 8 and 9.
- while (gm1 <= nm0 && nm1 == divup(nm0, gm1)) gm1++;
- if (gm1 > nm0) break;
- // Check the candidate.
- int res = checkGrain(m, n, bm, bn, bk, gm1, gn, gm, gn, num_threads,
- shard_by_col);
- if (res < 0) break;
- nm1 = divup(nm0, gm1);
- if (res == 0) continue;
- // Commit new grain size.
- gm = gm1;
- }
- return gm;
- }
-
- Index coarsenN(Index m, Index n, Index bm, Index bn, Index bk, Index gm,
- int num_threads, bool shard_by_col) const {
- Index gn = 1;
- Index gn1 = 1;
- Index nn0 = divup(n, bn);
- Index nn1 = nn0;
- for (;;) {
- while (gn1 <= nn0 && nn1 == divup(nn0, gn1)) gn1++;
- if (gn1 > nn0) break;
- int res = checkGrain(m, n, bm, bn, bk, gm, gn1, gm, gn, num_threads,
- shard_by_col);
- if (res < 0) break;
- nn1 = divup(nn0, gn1);
- if (res == 0) continue;
- gn = gn1;
- }
- return gn;
- }
-
- // checkGrain checks whether grain (gm, gn) is suitable and is better than
- // (oldgm, oldgn).
- int checkGrain(Index m, Index n, Index bm, Index bn, Index bk, Index gm,
- Index gn, Index oldgm, Index oldgn, int num_threads,
- bool shard_by_col) const {
- const TensorOpCost cost =
- contractionCost(bm * gm, bn * gn, bm, bn, bk, shard_by_col, true);
- double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(
- static_cast<double>(bm) * gm * bn * gn, cost);
- // If the task is too small, then we agree on it regardless of anything
- // else. Otherwise synchronization overheads will dominate.
- if (taskSize < 1) return 1;
- // If it is too large, then we reject it and all larger tasks.
- if (taskSize > 2) return -1;
- // Now we are in presumably good task size range.
- // The main deciding factor here is parallelism. Consider that we have 12
- // kernels and 4 threads. Grains of 2, 3 and 4 all yield good task sizes.
- // But 2/4 yield 6/3 tasks, which gives us parallelism of 0.75 (at most 3/4
- // of cores will be busy). While grain size 3 gives us 4 tasks, which gives
- // us parallelism of 1 (we can load all cores).
- Index nm0 = divup(m, bm);
- Index nn0 = divup(n, bn);
- Index new_tasks = divup(nm0, gm) * divup(nn0, gn);
- double new_parallelism = static_cast<double>(new_tasks) /
- (divup<int>(new_tasks, num_threads) * num_threads);
- Index old_tasks = divup(nm0, oldgm) * divup(nn0, oldgn);
- double old_parallelism = static_cast<double>(old_tasks) /
- (divup<int>(old_tasks, num_threads) * num_threads);
- if (new_parallelism > old_parallelism || new_parallelism == 1) return 1;
- return 0;
- }
-
- TensorOpCost contractionCost(Index m, Index n, Index bm, Index bn, Index bk,
- bool shard_by_col, bool prepacked) const {
- const int packed_size = std::min<int>(PacketType<LhsScalar, Device>::size,
- PacketType<RhsScalar, Device>::size);
- const int output_packet_size = internal::unpacket_traits<PacketReturnType>::size;
- const double kd = static_cast<double>(bk);
- double compute_bandwidth = computeBandwidth(false, bm, bn, bk);
- // Computations.
- TensorOpCost cost = TensorOpCost(0, 0, kd * compute_bandwidth, true, packed_size);
- // Output stores.
- cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size);
- if (prepacked) {
- // Packing and kernels are executed in different tasks. When we calculate
- // the task grain size we look only at the kernel cost, assuming that the
- // kernel is more expensive than packing.
- return cost;
- }
- // Lhs/rhs loads + computations.
- TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * (kd / n);
- TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * (kd / m);
- // Lhs packing memory cost does not contribute considerably to overall
- // execution time because lhs is prefetched early and accessed sequentially.
- if (shard_by_col)
- lhsCost.dropMemoryCost();
- else
- rhsCost.dropMemoryCost();
- return cost + lhsCost + rhsCost;
- }
-
- // Decide whether we want to shard m x k x n contraction over the inner
- // (contraction) dimension (k).
- static bool shardByInnerDim(Index m, Index n, Index k, int num_threads,
- int num_threads_by_k) {
- std::ptrdiff_t bufsize = m * n * sizeof(Scalar);
- bool shard_by_k = false;
- if (n == 1 || // If mat*vec or...
- num_threads_by_k < 2 || // running single threaded or...
- num_threads_by_k <
- num_threads || // sharding by k gives less parallelism or...
- bufsize > l3CacheSize() / num_threads_by_k || // need more buffer space
- // than L3 cache or...
- k / num_threads_by_k < 2 * Traits::nr) { // k per thread is tiny.
- shard_by_k = false;
- } else if (numext::maxi(m, n) / num_threads <
- Traits::nr || // both other dimensions are tiny or...
- // k per thread is not small and...
- (k / num_threads_by_k > 8 * Traits::nr &&
- // one of the outer dimensions is tiny or sharding by k offers
- // more parallelism.
- (numext::mini(m, n) < 2 * Traits::nr ||
- num_threads_by_k > num_threads))) {
- shard_by_k = true;
- }
- return shard_by_k;
- }
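-
- // A hypothetical example, assuming Traits::nr == 4: for m = n = 32,
- // k = 1000000, num_threads = 16, num_threads_by_k = 32 and float scalars,
- // the partial-result buffer is tiny (32 * 32 * 4 bytes), k per thread is
- // large, and maxi(m, n) / num_threads = 2 < Traits::nr, so we shard by
- // the inner dimension.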
-
- TensorOpCost contractionCostPerInnerDim(Index m, Index n, Index k) const {
- // Compute cost.
- const int output_packet_size = internal::unpacket_traits<PacketReturnType>::size;
- TensorOpCost cost(0, 0, (computeBandwidth(true, m, n, k) * m) * n, true, output_packet_size);
- // Output stores.
- cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size);
- TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * m;
- TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * n;
- // Since the inner gemm kernel is always sharded by column, the lhs
- // load cost is negligible.
- lhsCost.dropMemoryCost();
- return cost + lhsCost + rhsCost;
- }
-
- int numThreadsInnerDim(Index m, Index n, Index k) const {
- const int output_packet_size = internal::unpacket_traits<PacketReturnType>::size;
- TensorOpCost cost = contractionCostPerInnerDim(m, n, k);
- double total_parallel_cost =
- TensorCostModel<ThreadPoolDevice>::totalCost(k, cost);
- // Cost of reduction step accumulating the m*n per-thread buffers into the
- // result.
- double reduction_cost = TensorCostModel<ThreadPoolDevice>::totalCost(
- m * n, TensorOpCost(2, 1, 1, true, output_packet_size));
- int num_threads = 1;
- double min_cost = total_parallel_cost;
- double kPerThreadOverHead = 3000;
- double kFixedOverHead = 100000;
- for (int nt = 2; nt <= this->m_device.numThreads(); nt += 2) {
- double sequential_cost =
- kFixedOverHead + nt * (reduction_cost + kPerThreadOverHead);
- double parallel_cost = total_parallel_cost / nt + sequential_cost;
- if (parallel_cost < min_cost) {
- num_threads = nt;
- min_cost = parallel_cost;
- }
- }
- return num_threads;
- }
-
- double computeBandwidth(bool shard_by_col, Index bm, Index bn,
- Index bk) const {
- // Peak VFMA bandwidth is 0.5. However, if we do not have enough data for
- // vectorization, the bandwidth drops. The 4.0 and 2.0 bandwidths were
- // determined experimentally.
- double computeBandwidth =
- bk == 1 ? 4.0
- : (shard_by_col ? bn : bm) < Traits::nr ||
- (shard_by_col ? bm : bn) < Traits::mr
- ? 2.0
- : 0.5;
-#ifndef EIGEN_VECTORIZE_FMA
- // Bandwidth of all of VFMA/MULPS/ADDPS is 0.5 on latest Intel processors.
- // However, for MULPS/ADDPS we have a dependent sequence of two such
- // instructions, so the overall bandwidth is 1.0.
- if (computeBandwidth == 0.5) computeBandwidth = 1.0;
-#endif
- return computeBandwidth;
- }
-
-};
-
-} // end namespace Eigen
-
-#endif // EIGEN_USE_THREADS
-#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorConversion.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorConversion.h
deleted file mode 100644
index 09d2da9..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorConversion.h
+++ /dev/null
@@ -1,456 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H
-#define EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H
-
-namespace Eigen {
-
-/** \class TensorConversionOp
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Tensor conversion class. This class makes it possible to vectorize
- * type casting operations when the number of scalars per packet in the source
- * and the destination types differ.
- */
-namespace internal {
-template<typename TargetType, typename XprType>
-struct traits<TensorConversionOp<TargetType, XprType> >
-{
- // Type promotion to handle the case where the types of the lhs and the rhs are different.
- typedef TargetType Scalar;
- typedef typename traits<XprType>::StorageKind StorageKind;
- typedef typename traits<XprType>::Index Index;
- typedef typename XprType::Nested Nested;
- typedef typename remove_reference<Nested>::type _Nested;
- static const int NumDimensions = traits<XprType>::NumDimensions;
- static const int Layout = traits<XprType>::Layout;
- enum { Flags = 0 };
- typedef typename TypeConversion<Scalar, typename traits<XprType>::PointerType>::type PointerType;
-};
-
-template<typename TargetType, typename XprType>
-struct eval<TensorConversionOp<TargetType, XprType>, Eigen::Dense>
-{
- typedef const TensorConversionOp<TargetType, XprType>& type;
-};
-
-template<typename TargetType, typename XprType>
-struct nested<TensorConversionOp<TargetType, XprType>, 1, typename eval<TensorConversionOp<TargetType, XprType> >::type>
-{
- typedef TensorConversionOp<TargetType, XprType> type;
-};
-
-} // end namespace internal
-
-
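-// PacketConverter converts SrcCoeffRatio source packets into TgtCoeffRatio
-// target packets per call. For example, on SSE a double -> float cast has
-// SrcCoeffRatio == 2 and TgtCoeffRatio == 1: two Packet2d loads (4 doubles)
-// feed one Packet4f (4 floats). (The ratios here are illustrative for SSE;
-// they are defined per target by internal::type_casting_traits.)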
-template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket, int SrcCoeffRatio, int TgtCoeffRatio>
-struct PacketConverter;
-
-template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket>
-struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 1, 1> {
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- PacketConverter(const TensorEvaluator& impl)
- : m_impl(impl) {}
-
- template<int LoadMode, typename Index>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const {
- return internal::pcast<SrcPacket, TgtPacket>(m_impl.template packet<LoadMode>(index));
- }
-
- private:
- const TensorEvaluator& m_impl;
-};
-
-
-template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket>
-struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 2, 1> {
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- PacketConverter(const TensorEvaluator& impl)
- : m_impl(impl) {}
-
- template<int LoadMode, typename Index>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const {
- const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
-
- SrcPacket src1 = m_impl.template packet<LoadMode>(index);
- SrcPacket src2 = m_impl.template packet<LoadMode>(index + SrcPacketSize);
- TgtPacket result = internal::pcast<SrcPacket, TgtPacket>(src1, src2);
- return result;
- }
-
- private:
- const TensorEvaluator& m_impl;
-};
-
-template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket>
-struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 4, 1> {
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- PacketConverter(const TensorEvaluator& impl)
- : m_impl(impl) {}
-
- template<int LoadMode, typename Index>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const {
- const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
-
- SrcPacket src1 = m_impl.template packet<LoadMode>(index);
- SrcPacket src2 = m_impl.template packet<LoadMode>(index + SrcPacketSize);
- SrcPacket src3 = m_impl.template packet<LoadMode>(index + 2 * SrcPacketSize);
- SrcPacket src4 = m_impl.template packet<LoadMode>(index + 3 * SrcPacketSize);
- TgtPacket result = internal::pcast<SrcPacket, TgtPacket>(src1, src2, src3, src4);
- return result;
- }
-
- private:
- const TensorEvaluator& m_impl;
-};
-
-template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket>
-struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 8, 1> {
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- PacketConverter(const TensorEvaluator& impl)
- : m_impl(impl) {}
-
- template<int LoadMode, typename Index>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const {
- const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
-
- SrcPacket src1 = m_impl.template packet<LoadMode>(index);
- SrcPacket src2 = m_impl.template packet<LoadMode>(index + 1 * SrcPacketSize);
- SrcPacket src3 = m_impl.template packet<LoadMode>(index + 2 * SrcPacketSize);
- SrcPacket src4 = m_impl.template packet<LoadMode>(index + 3 * SrcPacketSize);
- SrcPacket src5 = m_impl.template packet<LoadMode>(index + 4 * SrcPacketSize);
- SrcPacket src6 = m_impl.template packet<LoadMode>(index + 5 * SrcPacketSize);
- SrcPacket src7 = m_impl.template packet<LoadMode>(index + 6 * SrcPacketSize);
- SrcPacket src8 = m_impl.template packet<LoadMode>(index + 7 * SrcPacketSize);
- TgtPacket result = internal::pcast<SrcPacket, TgtPacket>(src1, src2, src3, src4, src5, src6, src7, src8);
- return result;
- }
-
- private:
- const TensorEvaluator& m_impl;
-};
-
-template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket, int TgtCoeffRatio>
-struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 1, TgtCoeffRatio> {
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- PacketConverter(const TensorEvaluator& impl)
- : m_impl(impl), m_maxIndex(impl.dimensions().TotalSize()) {}
-
- template<int LoadMode, typename Index>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const {
- const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
- // Only call m_impl.packet() when we have direct access to the underlying data. This
- // ensures that we don't compute the subexpression twice. We may however load some
- // coefficients twice, but in practice this doesn't negatively impact performance.
- if (m_impl.data() && (index + SrcPacketSize < m_maxIndex)) {
- // Force unaligned memory loads since we can't ensure alignment anymore
- return internal::pcast<SrcPacket, TgtPacket>(m_impl.template packet<Unaligned>(index));
- } else {
- const int TgtPacketSize = internal::unpacket_traits<TgtPacket>::size;
- typedef typename internal::unpacket_traits<SrcPacket>::type SrcType;
- typedef typename internal::unpacket_traits<TgtPacket>::type TgtType;
- internal::scalar_cast_op<SrcType, TgtType> converter;
- EIGEN_ALIGN_MAX typename internal::unpacket_traits<TgtPacket>::type values[TgtPacketSize];
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < TgtPacketSize; ++i) {
- values[i] = converter(m_impl.coeff(index+i));
- }
- TgtPacket rslt = internal::pload<TgtPacket>(values);
- return rslt;
- }
- }
-
- private:
- const TensorEvaluator& m_impl;
- const typename TensorEvaluator::Index m_maxIndex;
-};
-
-template<typename TargetType, typename XprType>
-class TensorConversionOp : public TensorBase<TensorConversionOp<TargetType, XprType>, ReadOnlyAccessors>
-{
- public:
- typedef typename internal::traits<TensorConversionOp>::Scalar Scalar;
- typedef typename internal::traits<TensorConversionOp>::StorageKind StorageKind;
- typedef typename internal::traits<TensorConversionOp>::Index Index;
- typedef typename internal::nested<TensorConversionOp>::type Nested;
- typedef Scalar CoeffReturnType;
- typedef typename NumTraits<Scalar>::Real RealScalar;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConversionOp(const XprType& xpr)
- : m_xpr(xpr) {}
-
- EIGEN_DEVICE_FUNC
- const typename internal::remove_all<typename XprType::Nested>::type&
- expression() const { return m_xpr; }
-
- protected:
- typename XprType::Nested m_xpr;
-};
-
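-// When the source and destination types differ, the sub-expression is
-// evaluated on the fly, so NULL is passed down and no destination buffer is
-// forwarded. When they match (the specialization below), the sub-expression
-// may evaluate directly into the destination buffer, skipping the copy.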
-template <bool SameType, typename Eval, typename EvalPointerType> struct ConversionSubExprEval {
- static EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType) {
- impl.evalSubExprsIfNeeded(NULL);
- return true;
- }
-};
-
-template <typename Eval, typename EvalPointerType> struct ConversionSubExprEval<true, Eval, EvalPointerType> {
- static EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType data) {
- return impl.evalSubExprsIfNeeded(data);
- }
-};
-
-#ifdef EIGEN_USE_THREADS
-template <bool SameType, typename Eval, typename EvalPointerType,
- typename EvalSubExprsCallback>
-struct ConversionSubExprEvalAsync {
- static EIGEN_STRONG_INLINE void run(Eval& impl, EvalPointerType, EvalSubExprsCallback done) {
- impl.evalSubExprsIfNeededAsync(nullptr, std::move(done));
- }
-};
-
-template <typename Eval, typename EvalPointerType,
- typename EvalSubExprsCallback>
-struct ConversionSubExprEvalAsync<true, Eval, EvalPointerType,
- EvalSubExprsCallback> {
- static EIGEN_STRONG_INLINE void run(Eval& impl, EvalPointerType data, EvalSubExprsCallback done) {
- impl.evalSubExprsIfNeededAsync(data, std::move(done));
- }
-};
-#endif
-
-namespace internal {
-
-template <typename SrcType, typename TargetType, bool IsSameT>
-struct CoeffConv {
- template <typename ArgType, typename Device>
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetType run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
- internal::scalar_cast_op<SrcType, TargetType> converter;
- return converter(impl.coeff(index));
- }
-};
-
-template <typename SrcType, typename TargetType>
-struct CoeffConv<SrcType, TargetType, true> {
- template <typename ArgType, typename Device>
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetType run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
- return impl.coeff(index);
- }
-};
-
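-// PacketConv dispatches on (ActuallyVectorize, IsSameT): scalar-by-scalar
-// conversion (the generic case), a vectorized pcast via PacketConverter, a
-// plain packet pass-through when the types match, or a scalar gather when
-// they match but packet access is unavailable.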
-template <typename SrcPacket, typename TargetPacket, int LoadMode, bool ActuallyVectorize, bool IsSameT>
-struct PacketConv {
- typedef typename internal::unpacket_traits<SrcPacket>::type SrcType;
- typedef typename internal::unpacket_traits<TargetPacket>::type TargetType;
-
- static const int PacketSize = internal::unpacket_traits<TargetPacket>::size;
-
- template <typename ArgType, typename Device>
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
- internal::scalar_cast_op<SrcType, TargetType> converter;
- EIGEN_ALIGN_MAX typename internal::remove_const<TargetType>::type values[PacketSize];
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < PacketSize; ++i) {
- values[i] = converter(impl.coeff(index+i));
- }
- TargetPacket rslt = internal::pload<TargetPacket>(values);
- return rslt;
- }
-};
-
-template <typename SrcPacket, typename TargetPacket, int LoadMode, bool IsSameT>
-struct PacketConv<SrcPacket, TargetPacket, LoadMode, true, IsSameT> {
- typedef typename internal::unpacket_traits<SrcPacket>::type SrcType;
- typedef typename internal::unpacket_traits<TargetPacket>::type TargetType;
-
- template <typename ArgType, typename Device>
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
- const int SrcCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio;
- const int TgtCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio;
- PacketConverter<TensorEvaluator<ArgType, Device>, SrcPacket, TargetPacket,
- SrcCoeffRatio, TgtCoeffRatio> converter(impl);
- return converter.template packet<LoadMode>(index);
- }
-};
-
-template <typename SrcPacket, typename TargetPacket, int LoadMode>
-struct PacketConv<SrcPacket, TargetPacket, LoadMode, /*ActuallyVectorize=*/false, /*IsSameT=*/true> {
- typedef typename internal::unpacket_traits<TargetPacket>::type TargetType;
- static const int PacketSize = internal::unpacket_traits<TargetPacket>::size;
-
- template <typename ArgType, typename Device>
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
- EIGEN_ALIGN_MAX typename internal::remove_const<TargetType>::type values[PacketSize];
- for (int i = 0; i < PacketSize; ++i) values[i] = impl.coeff(index+i);
- return internal::pload<TargetPacket>(values);
- }
-};
-
-template <typename SrcPacket, typename TargetPacket, int LoadMode>
-struct PacketConv<SrcPacket, TargetPacket, LoadMode, /*ActuallyVectorize=*/true, /*IsSameT=*/true> {
- template <typename ArgType, typename Device>
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
- return impl.template packet<LoadMode>(index);
- }
-};
-
-} // namespace internal
-
-// Eval as rvalue
-template<typename TargetType, typename ArgType, typename Device>
-struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
-{
- typedef TensorConversionOp<TargetType, ArgType> XprType;
- typedef typename XprType::Index Index;
- typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
- typedef TargetType Scalar;
- typedef TargetType CoeffReturnType;
- typedef typename internal::remove_all<typename internal::traits<ArgType>::Scalar>::type SrcType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- typedef typename PacketType<SrcType, Device>::type PacketSourceType;
- static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
- static const bool IsSameType = internal::is_same<TargetType, SrcType>::value;
- typedef StorageMemory<CoeffReturnType, Device> Storage;
- typedef typename Storage::Type EvaluatorPointerType;
-
- enum {
- IsAligned = false,
- PacketAccess =
- #ifndef EIGEN_USE_SYCL
- true,
- #else
- TensorEvaluator<ArgType, Device>::PacketAccess &
- internal::type_casting_traits<SrcType, TargetType>::VectorizedCast,
- #endif
- BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
- PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- RawAccess = false
- };
-
- static const int NumDims = internal::array_size<Dimensions>::value;
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
- typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
-
- typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
- ArgTensorBlock;
-
- struct TensorConversionOpBlockFactory {
- template <typename ArgXprType>
- struct XprType {
- typedef TensorConversionOp<TargetType, const ArgXprType> type;
- };
-
- template <typename ArgXprType>
- typename XprType<ArgXprType>::type expr(const ArgXprType& expr) const {
- return typename XprType<ArgXprType>::type(expr);
- }
- };
-
- typedef internal::TensorUnaryExprBlock<TensorConversionOpBlockFactory,
- ArgTensorBlock>
- TensorBlock;
- //===--------------------------------------------------------------------===//
-
- EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_impl(op.expression(), device)
- {
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); }
-
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data)
- {
- return ConversionSubExprEval<IsSameType, TensorEvaluator<ArgType, Device>, EvaluatorPointerType>::run(m_impl, data);
- }
-
-#ifdef EIGEN_USE_THREADS
- template <typename EvalSubExprsCallback>
- EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
- EvaluatorPointerType data, EvalSubExprsCallback done) {
- ConversionSubExprEvalAsync<IsSameType, TensorEvaluator<ArgType, Device>,
- EvaluatorPointerType,
- EvalSubExprsCallback>::run(m_impl, data, std::move(done));
- }
-#endif
-
- EIGEN_STRONG_INLINE void cleanup()
- {
- m_impl.cleanup();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
- {
- return internal::CoeffConv<SrcType, TargetType, IsSameType>::run(m_impl,index);
- }
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType
- packet(Index index) const {
-    // If we are not going to do the cast, we just need to check that the base
-    // TensorEvaluator has packet access. Otherwise we also need to make sure
-    // that we have an implementation of the vectorized cast.
- const bool Vectorizable =
- IsSameType
- ? TensorEvaluator<ArgType, Device>::PacketAccess
- : int(TensorEvaluator<ArgType, Device>::PacketAccess) &
- int(internal::type_casting_traits<SrcType, TargetType>::VectorizedCast);
-
- return internal::PacketConv<PacketSourceType, PacketReturnType, LoadMode,
- Vectorizable, IsSameType>::run(m_impl, index);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
- costPerCoeff(bool vectorized) const {
- const double cast_cost = TensorOpCost::CastCost<SrcType, TargetType>();
- if (vectorized) {
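-      // The source-evaluation cost is scaled by SrcCoeffRatio because several
-      // source packets may be consumed per target packet (e.g. two for a
-      // double -> float cast on SSE; the ratios are target-dependent).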
- const double SrcCoeffRatio =
- internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio;
- const double TgtCoeffRatio =
- internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio;
- return m_impl.costPerCoeff(vectorized) * (SrcCoeffRatio / PacketSize) +
- TensorOpCost(0, 0, TgtCoeffRatio * (cast_cost / PacketSize));
- } else {
- return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, cast_cost);
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- internal::TensorBlockResourceRequirements getResourceRequirements() const {
- return m_impl.getResourceRequirements();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
- block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
- bool /*root_of_expr_ast*/ = false) const {
- return TensorBlock(m_impl.block(desc, scratch),
- TensorConversionOpBlockFactory());
- }
-
- EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
-
-  /// Required by SYCL in order to extract the SYCL accessor.
- const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
-#ifdef EIGEN_USE_SYCL
-  // Binds placeholder accessors to a command group handler for SYCL.
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
- m_impl.bind(cgh);
- }
-#endif
-
- protected:
- TensorEvaluator<ArgType, Device> m_impl;
-};
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorConvolution.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorConvolution.h
deleted file mode 100644
index b20f80b..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorConvolution.h
+++ /dev/null
@@ -1,1132 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
-#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
-
-namespace Eigen {
-
-/** \class TensorConvolution
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Tensor convolution class. Convolves an input expression with a
- * kernel expression along an arbitrary subset of the input's dimensions.
- */
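-//
-// A minimal usage sketch (illustrative only): each convolved dimension
-// shrinks to input_dim - kernel_dim + 1.
-//
-//   Eigen::Tensor<float, 3> input(4, 10, 5);
-//   Eigen::Tensor<float, 1> kernel(3);
-//   input.setRandom(); kernel.setRandom();
-//   Eigen::array<ptrdiff_t, 1> dims = {1};  // convolve along dimension 1
-//   Eigen::Tensor<float, 3> output = input.convolve(kernel, dims);
-//   // output has dimensions (4, 8, 5)
-//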
-namespace internal {
-
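-// IndexMapper translates between the flattened "plane" indices used by the
-// GPU kernels below and offsets into the actual input/output tensors: the
-// convolved (kernel) dimensions are grouped together and all remaining
-// dimensions are folded into planes.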
-template <typename Index, typename InputDims, int NumKernelDims, int Layout>
-class IndexMapper {
- public:
- IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
- const array<Index, NumKernelDims>& indices) {
-
- array<Index, NumDims> dimensions = input_dims;
- for (int i = 0; i < NumKernelDims; ++i) {
- const Index index = indices[i];
- const Index input_dim = input_dims[index];
- const Index kernel_dim = kernel_dims[i];
- const Index result_dim = input_dim - kernel_dim + 1;
- dimensions[index] = result_dim;
- }
-
- array<Index, NumDims> inputStrides;
- array<Index, NumDims> outputStrides;
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- inputStrides[0] = 1;
- outputStrides[0] = 1;
- for (int i = 1; i < NumDims; ++i) {
- inputStrides[i] = inputStrides[i-1] * input_dims[i-1];
- outputStrides[i] = outputStrides[i-1] * dimensions[i-1];
- }
- } else {
- inputStrides[NumDims - 1] = 1;
- outputStrides[NumDims - 1] = 1;
- for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
- inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
- outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];
- }
- }
-
- array<Index, NumDims> gpuInputDimensions;
- array<Index, NumDims> gpuOutputDimensions;
- array<Index, NumDims> tmp = dimensions;
- array<Index, NumDims> ordering;
- const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
- ? 0
- : NumDims - NumKernelDims;
- for (int i = 0; i < NumKernelDims; ++i) {
- const Index index = i + offset;
- ordering[index] = indices[i];
- tmp[indices[i]] = -1;
- gpuInputDimensions[index] = input_dims[indices[i]];
- gpuOutputDimensions[index] = dimensions[indices[i]];
- }
-
- int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
- ? NumKernelDims
- : 0;
- for (int i = 0; i < NumDims; ++i) {
- if (tmp[i] >= 0) {
- ordering[written] = i;
- gpuInputDimensions[written] = input_dims[i];
- gpuOutputDimensions[written] = dimensions[i];
- ++written;
- }
- }
-
- for (int i = 0; i < NumDims; ++i) {
- m_inputStrides[i] = inputStrides[ordering[i]];
- m_outputStrides[i] = outputStrides[ordering[i]];
- }
-
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- for (int i = 0; i < NumDims; ++i) {
- if (i > NumKernelDims) {
- m_gpuInputStrides[i] =
- m_gpuInputStrides[i - 1] * gpuInputDimensions[i - 1];
- m_gpuOutputStrides[i] =
- m_gpuOutputStrides[i - 1] * gpuOutputDimensions[i - 1];
- } else {
- m_gpuInputStrides[i] = 1;
- m_gpuOutputStrides[i] = 1;
- }
- }
- } else {
- for (int i = NumDims - 1; i >= 0; --i) {
- if (static_cast<size_t>(i + 1) < offset) {
- m_gpuInputStrides[i] =
- m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1];
- m_gpuOutputStrides[i] =
- m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1];
- } else {
- m_gpuInputStrides[i] = 1;
- m_gpuOutputStrides[i] = 1;
- }
- }
- }
- }
-
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const {
- Index inputIndex = 0;
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- for (int d = NumDims - 1; d > NumKernelDims; --d) {
- const Index idx = p / m_gpuInputStrides[d];
- inputIndex += idx * m_inputStrides[d];
- p -= idx * m_gpuInputStrides[d];
- }
- inputIndex += p * m_inputStrides[NumKernelDims];
- } else {
- std::ptrdiff_t limit = 0;
- if (NumKernelDims < NumDims) {
- limit = NumDims - NumKernelDims - 1;
- }
- for (int d = 0; d < limit; ++d) {
- const Index idx = p / m_gpuInputStrides[d];
- inputIndex += idx * m_inputStrides[d];
- p -= idx * m_gpuInputStrides[d];
- }
- inputIndex += p * m_inputStrides[limit];
- }
- return inputIndex;
- }
-
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const {
- Index outputIndex = 0;
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- for (int d = NumDims - 1; d > NumKernelDims; --d) {
- const Index idx = p / m_gpuOutputStrides[d];
- outputIndex += idx * m_outputStrides[d];
- p -= idx * m_gpuOutputStrides[d];
- }
- outputIndex += p * m_outputStrides[NumKernelDims];
- } else {
- std::ptrdiff_t limit = 0;
- if (NumKernelDims < NumDims) {
- limit = NumDims - NumKernelDims - 1;
- }
- for (int d = 0; d < limit; ++d) {
- const Index idx = p / m_gpuOutputStrides[d];
- outputIndex += idx * m_outputStrides[d];
- p -= idx * m_gpuOutputStrides[d];
- }
- outputIndex += p * m_outputStrides[limit];
- }
- return outputIndex;
- }
-
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const {
- const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
- ? 0
- : NumDims - NumKernelDims;
- return i * m_inputStrides[offset];
- }
-
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const {
- const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
- ? 0
- : NumDims - NumKernelDims;
- return i * m_outputStrides[offset];
- }
-
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const {
- const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
- ? 0
- : NumDims - NumKernelDims;
- return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
- }
-
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const {
- const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
- ? 0
- : NumDims - NumKernelDims;
- return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
- }
-
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
- const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
- ? 0
- : NumDims - NumKernelDims;
- return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] +
- k * m_inputStrides[offset + 2];
- }
-
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
- const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
- ? 0
- : NumDims - NumKernelDims;
- return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] +
- k * m_outputStrides[offset + 2];
- }
-
- private:
- static const int NumDims = internal::array_size<InputDims>::value;
- array<Index, NumDims> m_inputStrides;
- array<Index, NumDims> m_outputStrides;
- array<Index, NumDims> m_gpuInputStrides;
- array<Index, NumDims> m_gpuOutputStrides;
-};
-
-
-
-template<typename Dimensions, typename InputXprType, typename KernelXprType>
-struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
-{
- // Type promotion to handle the case where the types of the lhs and the rhs are different.
- typedef typename promote_storage_type<typename InputXprType::Scalar,
- typename KernelXprType::Scalar>::ret Scalar;
- typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,
- typename traits<KernelXprType>::StorageKind>::ret StorageKind;
- typedef typename promote_index_type<typename traits<InputXprType>::Index,
- typename traits<KernelXprType>::Index>::type Index;
- typedef typename InputXprType::Nested LhsNested;
- typedef typename KernelXprType::Nested RhsNested;
- typedef typename remove_reference<LhsNested>::type _LhsNested;
- typedef typename remove_reference<RhsNested>::type _RhsNested;
- static const int NumDimensions = traits<InputXprType>::NumDimensions;
- static const int Layout = traits<InputXprType>::Layout;
- typedef typename conditional<Pointer_type_promotion<typename InputXprType::Scalar, Scalar>::val,
- typename traits<InputXprType>::PointerType, typename traits<KernelXprType>::PointerType>::type PointerType;
-
- enum {
- Flags = 0
- };
-};
-
-template<typename Dimensions, typename InputXprType, typename KernelXprType>
-struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense>
-{
- typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type;
-};
-
-template<typename Dimensions, typename InputXprType, typename KernelXprType>
-struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type>
-{
- typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type;
-};
-
-} // end namespace internal
-
-
-
-template<typename Indices, typename InputXprType, typename KernelXprType>
-class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors>
-{
- public:
- typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
- typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
- typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
- typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
- typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested;
- typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind;
- typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims)
- : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const Indices& indices() const { return m_indices; }
-
- /** \returns the nested expressions */
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const typename internal::remove_all<typename InputXprType::Nested>::type&
- inputExpression() const { return m_input_xpr; }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const typename internal::remove_all<typename KernelXprType::Nested>::type&
- kernelExpression() const { return m_kernel_xpr; }
-
- protected:
- typename InputXprType::Nested m_input_xpr;
- typename KernelXprType::Nested m_kernel_xpr;
- const Indices m_indices;
-};
-
-
-template<typename Indices, typename InputArgType, typename KernelArgType, typename Device>
-struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
-{
- typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
-
- static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
- static const int NumKernelDims = internal::array_size<Indices>::value;
- typedef typename XprType::Index Index;
- typedef DSizes<Index, NumDims> Dimensions;
-
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
- typedef StorageMemory<Scalar, Device> Storage;
- typedef typename Storage::Type EvaluatorPointerType;
-
- enum {
- IsAligned = int(TensorEvaluator<InputArgType, Device>::IsAligned) & int(TensorEvaluator<KernelArgType, Device>::IsAligned),
- PacketAccess = int(TensorEvaluator<InputArgType, Device>::PacketAccess) & int(TensorEvaluator<KernelArgType, Device>::PacketAccess),
- BlockAccess = false,
- PreferBlockAccess = false,
- Layout = TensorEvaluator<InputArgType, Device>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = false
- };
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlock;
- //===--------------------------------------------------------------------===//
-
- EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
- {
- EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
-
- const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
- const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
-
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- m_inputStride[0] = 1;
- for (int i = 1; i < NumDims; ++i) {
- m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
- }
- } else {
- m_inputStride[NumDims - 1] = 1;
- for (int i = NumDims - 2; i >= 0; --i) {
- m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
- }
- }
-
- m_dimensions = m_inputImpl.dimensions();
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- for (int i = 0; i < NumKernelDims; ++i) {
- const Index index = op.indices()[i];
- const Index input_dim = input_dims[index];
- const Index kernel_dim = kernel_dims[i];
- const Index result_dim = input_dim - kernel_dim + 1;
- m_dimensions[index] = result_dim;
- if (i > 0) {
- m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
- } else {
- m_kernelStride[0] = 1;
- }
- m_indexStride[i] = m_inputStride[index];
- }
-
- m_outputStride[0] = 1;
- for (int i = 1; i < NumDims; ++i) {
- m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
- }
- } else {
- for (int i = NumKernelDims - 1; i >= 0; --i) {
- const Index index = op.indices()[i];
- const Index input_dim = input_dims[index];
- const Index kernel_dim = kernel_dims[i];
- const Index result_dim = input_dim - kernel_dim + 1;
- m_dimensions[index] = result_dim;
- if (i < NumKernelDims - 1) {
- m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
- } else {
- m_kernelStride[NumKernelDims - 1] = 1;
- }
- m_indexStride[i] = m_inputStride[index];
- }
-
- m_outputStride[NumDims - 1] = 1;
- for (int i = NumDims - 2; i >= 0; --i) {
- m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
- }
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
-
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
- m_inputImpl.evalSubExprsIfNeeded(NULL);
- preloadKernel();
- return true;
- }
- EIGEN_STRONG_INLINE void cleanup() {
- m_inputImpl.cleanup();
- if (m_local_kernel) {
- m_device.deallocate((void*)m_kernel);
- m_local_kernel = false;
- }
- m_kernel = NULL;
- }
-
- void evalTo(typename XprType::Scalar* buffer) {
- evalSubExprsIfNeeded(NULL);
- for (int i = 0; i < dimensions().TotalSize(); ++i) {
- buffer[i] += coeff(i);
- }
- cleanup();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
- {
- CoeffReturnType result = CoeffReturnType(0);
- convolve(firstInput(index), 0, NumKernelDims-1, result);
- return result;
- }
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
- {
- Index indices[2] = {index, index+PacketSize-1};
- Index startInputs[2] = {0, 0};
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- for (int i = NumDims - 1; i > 0; --i) {
- const Index idx0 = indices[0] / m_outputStride[i];
- const Index idx1 = indices[1] / m_outputStride[i];
- startInputs[0] += idx0 * m_inputStride[i];
- startInputs[1] += idx1 * m_inputStride[i];
- indices[0] -= idx0 * m_outputStride[i];
- indices[1] -= idx1 * m_outputStride[i];
- }
- } else {
- for (int i = 0; i < NumDims - 1; ++i) {
- const Index idx0 = indices[0] / m_outputStride[i];
- const Index idx1 = indices[1] / m_outputStride[i];
- startInputs[0] += idx0 * m_inputStride[i];
- startInputs[1] += idx1 * m_inputStride[i];
- indices[0] -= idx0 * m_outputStride[i];
- indices[1] -= idx1 * m_outputStride[i];
- }
- }
- startInputs[0] += indices[0];
- startInputs[1] += indices[1];
-
- if (startInputs[1]-startInputs[0] == PacketSize-1) {
- PacketReturnType result = internal::pset1<PacketReturnType>(0);
- convolvePacket(startInputs[0], 0, NumKernelDims-1, result);
- return result;
- } else {
- EIGEN_ALIGN_MAX Scalar data[PacketSize];
- data[0] = Scalar(0);
- convolve(startInputs[0], 0, NumKernelDims-1, data[0]);
- for (int i = 1; i < PacketSize-1; ++i) {
- data[i] = Scalar(0);
- convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]);
- }
- data[PacketSize-1] = Scalar(0);
- convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]);
- return internal::pload<PacketReturnType>(data);
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
- costPerCoeff(bool vectorized) const {
- const double kernel_size = m_kernelImpl.dimensions().TotalSize();
- // We ignore the use of fused multiply-add.
- const double convolve_compute_cost =
- TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
- const double firstIndex_compute_cost =
- NumDims *
- (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
- TensorOpCost::DivCost<Index>());
- return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
- kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
- m_kernelImpl.costPerCoeff(vectorized) +
- TensorOpCost(0, 0, convolve_compute_cost, vectorized,
- PacketSize));
- }
-
- EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
-
- private:
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
- Index startInput = 0;
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- for (int i = NumDims - 1; i > 0; --i) {
- const Index idx = index / m_outputStride[i];
- startInput += idx * m_inputStride[i];
- index -= idx * m_outputStride[i];
- }
- } else {
- for (int i = 0; i < NumDims - 1; ++i) {
- const Index idx = index / m_outputStride[i];
- startInput += idx * m_inputStride[i];
- index -= idx * m_outputStride[i];
- }
- }
- startInput += index;
- return startInput;
- }
-
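-  // Recursively walks the kernel dimensions from the last one down to the
-  // first; at the innermost level it accumulates input * kernel into accum.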
- EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
- for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
- const Index input = firstIndex + j * m_indexStride[DimIndex];
- const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
- if (DimIndex > 0) {
- convolve(input, kernel, DimIndex-1, accum);
- } else {
- accum += m_inputImpl.coeff(input) * m_kernel[kernel];
- }
- }
- }
-
- template <typename Packet>
- EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
- for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
- const Index input = firstIndex + j * m_indexStride[DimIndex];
- const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
- if (DimIndex > 0) {
- convolvePacket(input, kernel, DimIndex-1, accum);
- } else {
- accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum);
- }
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
- // Don't make a local copy of the kernel unless we have to (i.e. it's an
- // expression that needs to be evaluated)
- const Scalar* in_place = m_kernelImpl.data();
- if (in_place) {
- m_kernel = in_place;
- m_local_kernel = false;
- } else {
- size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
- Scalar* local = (Scalar*)m_device.allocate_temp(kernel_sz);
- typedef TensorEvalToOp<const KernelArgType> EvalTo;
- EvalTo evalToTmp(local, m_kernelArg);
- const bool Vectorize = internal::IsVectorizable<Device, KernelArgType>::value;
- internal::TensorExecutor<const EvalTo, Device, Vectorize>::run(evalToTmp, m_device);
-
- m_kernel = local;
- m_local_kernel = true;
- }
- }
-
- array<Index, NumDims> m_inputStride;
- array<Index, NumDims> m_outputStride;
-
- array<Index, NumKernelDims> m_indexStride;
- array<Index, NumKernelDims> m_kernelStride;
- TensorEvaluator<InputArgType, Device> m_inputImpl;
- TensorEvaluator<KernelArgType, Device> m_kernelImpl;
- Dimensions m_dimensions;
-
- KernelArgType m_kernelArg;
- const Scalar* m_kernel;
- bool m_local_kernel;
- const Device EIGEN_DEVICE_REF m_device;
-};
-
-
-
-
-// Use an optimized implementation of the evaluation code for GPUs whenever possible.
-#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
-
-template <int StaticKernelSize>
-struct GetKernelSize {
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const {
- return StaticKernelSize;
- }
-};
-template <>
-struct GetKernelSize<Dynamic> {
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const {
- return kernelSize;
- }
-};
-
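-// Dispatching on a compile-time kernel size (4, 7, or Dynamic in the launch
-// code below) lets the compiler fully unroll the inner convolution loops for
-// the common kernel sizes.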
-template <typename InputEvaluator, typename Index, typename InputDims,
- int StaticKernelSize>
-__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D(
- InputEvaluator eval,
- const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout>
- indexMapper,
- const float* __restrict kernel, const int numPlanes, const int numX,
- const int maxX, const int kernelSize, float* buffer) {
-#if defined(EIGEN_HIPCC)
- HIP_DYNAMIC_SHARED(float, s)
-#else
- extern __shared__ float s[];
-#endif
-
- const int first_x = blockIdx.x * maxX;
- const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
- const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
- const int num_x_output = last_x - first_x + 1;
-
- const int first_plane = blockIdx.y * blockDim.y;
- const int plane_stride = blockDim.y * gridDim.y;
-
- for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
- // Load inputs to shared memory
- const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
- const int plane_kernel_offset = threadIdx.y * num_x_input;
- #pragma unroll
- for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
- const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x);
- s[i + plane_kernel_offset] = eval.coeff(tensor_index);
- }
-
- __syncthreads();
-
- // Compute the convolution
- const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
-
- #pragma unroll
- for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
- const int kernel_offset = plane_kernel_offset + i;
- float result = 0.0f;
- #pragma unroll
- for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
- result += s[k + kernel_offset] * kernel[k];
- }
- const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x);
- buffer[tensor_index] = result;
- }
- __syncthreads();
- }
-}
-
-template <typename InputEvaluator, typename Index, typename InputDims,
- int StaticKernelSizeX, int StaticKernelSizeY>
-__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D(
- InputEvaluator eval,
- const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout>
- indexMapper,
- const float* __restrict kernel, const int numPlanes, const int numX,
- const int maxX, const int numY, const int maxY, const int kernelSizeX,
- const int kernelSizeY, float* buffer) {
-#if defined(EIGEN_HIPCC)
- HIP_DYNAMIC_SHARED(float, s)
-#else
- extern __shared__ float s[];
-#endif
-
- const int first_x = blockIdx.x * maxX;
- const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
- const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
- const int num_x_output = last_x - first_x + 1;
-
- const int first_y = blockIdx.y * maxY;
- const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
- const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
- const int num_y_output = last_y - first_y + 1;
-
- const int first_plane = blockIdx.z * blockDim.z;
- const int plane_stride = blockDim.z * gridDim.z;
-
- for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
-
- const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
- const int plane_kernel_offset = threadIdx.z * num_y_input;
-
- // Load inputs to shared memory
- #pragma unroll
- for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
- const int input_offset = num_x_input * (j + plane_kernel_offset);
- #pragma unroll
- for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
- const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y);
- s[i + input_offset] = eval.coeff(tensor_index);
- }
- }
-
- __syncthreads();
-
- // Convolution
- const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
-
- #pragma unroll
- for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
- #pragma unroll
- for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
- float result = 0.0f;
- #pragma unroll
- for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
- const int kernel_offset = kernelSizeX * l;
- const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
- #pragma unroll
- for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
- result += s[k + input_offset] * kernel[k + kernel_offset];
- }
- }
- const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
- buffer[tensor_index] = result;
- }
- }
-
- __syncthreads();
- }
-}
-
-template <typename InputEvaluator, typename Index, typename InputDims>
-__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
- InputEvaluator eval,
- const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout>
- indexMapper,
- const float* __restrict kernel, const size_t numPlanes, const size_t numX,
- const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
- const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
- const size_t kernelSizeZ, float* buffer) {
-#if defined(EIGEN_HIPCC)
- HIP_DYNAMIC_SHARED(float, s)
-#else
- extern __shared__ float s[];
-#endif
-
- // Load inputs to shared memory
- const int first_x = blockIdx.x * maxX;
- const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
- const int num_x_input = last_x - first_x + kernelSizeX;
-
- const int first_y = blockIdx.y * maxY;
- const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
- const int num_y_input = last_y - first_y + kernelSizeY;
-
- const int first_z = blockIdx.z * maxZ;
- const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
- const int num_z_input = last_z - first_z + kernelSizeZ;
-
- for (int p = 0; p < numPlanes; ++p) {
-
- const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
- const int plane_kernel_offset = 0;
-
- for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
- for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
- for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
- const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
- s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
- }
- }
- }
-
- __syncthreads();
-
- // Convolution
- const int num_z_output = last_z - first_z + 1;
- const int num_y_output = last_y - first_y + 1;
- const int num_x_output = last_x - first_x + 1;
- const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
-
- for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
- for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
- for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
- float result = 0.0f;
- for (int n = 0; n < kernelSizeZ; ++n) {
- for (int m = 0; m < kernelSizeY; ++m) {
- for (int l = 0; l < kernelSizeX; ++l) {
- result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
- }
- }
- }
- const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
- buffer[tensor_index] = result;
- }
- }
- }
- __syncthreads();
- }
-}
-
-
-
-template<typename Indices, typename InputArgType, typename KernelArgType>
-struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice>
-{
- typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
-
- static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;
- static const int NumKernelDims = internal::array_size<Indices>::value;
- typedef typename XprType::Index Index;
- typedef DSizes<Index, NumDims> Dimensions;
- typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions;
-
- enum {
- IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
- PacketAccess = false,
- BlockAccess = false,
- PreferBlockAccess = false,
- Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = false
- };
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlock;
- //===--------------------------------------------------------------------===//
-
- TensorEvaluator(const XprType& op, const GpuDevice& device)
- : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
- {
- EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
-
- const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
- const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
-
- m_dimensions = m_inputImpl.dimensions();
- for (int i = 0; i < NumKernelDims; ++i) {
- const Index index = op.indices()[i];
- const Index input_dim = input_dims[index];
- const Index kernel_dim = kernel_dims[i];
- const Index result_dim = input_dim - kernel_dim + 1;
- m_dimensions[index] = result_dim;
- }
- }
-
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
- typedef typename InputArgType::Scalar Scalar;
- static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
-
- EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
-
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
- preloadKernel();
- m_inputImpl.evalSubExprsIfNeeded(NULL);
- if (data) {
- executeEval(data);
- return false;
- } else {
- m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
- executeEval(m_buf);
- return true;
- }
- }
-
- EIGEN_STRONG_INLINE void cleanup() {
- m_inputImpl.cleanup();
- if (m_buf) {
- m_device.deallocate(m_buf);
- m_buf = NULL;
- }
- if (m_local_kernel) {
- m_device.deallocate((void*)m_kernel);
- m_local_kernel = false;
- }
- m_kernel = NULL;
- }
-
- EIGEN_STRONG_INLINE void preloadKernel() {
- // Don't make a local copy of the kernel unless we have to (i.e. it's an
- // expression that needs to be evaluated)
- const Scalar* in_place = m_kernelImpl.data();
- if (in_place) {
- m_kernel = in_place;
- m_local_kernel = false;
- } else {
- size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
- Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
- typedef TensorEvalToOp<const KernelArgType> EvalTo;
- EvalTo evalToTmp(local, m_kernelArg);
- const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value;
- internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device);
-
- m_kernel = local;
- m_local_kernel = true;
- }
- }
-
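-  // Ceiling division: the smallest k such that k * denom >= num.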
- static unsigned int ceil(unsigned int num, unsigned int denom) {
- const unsigned int rounded_toward_zero = num / denom;
- if (num > rounded_toward_zero * denom) {
- return rounded_toward_zero + 1;
- }
- return rounded_toward_zero;
- }
-
- void executeEval(Scalar* data) const {
- typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;
-
- const int maxSharedMem = m_device.sharedMemPerBlock();
- const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock();
- const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock;
- const int numMultiProcessors = m_device.getNumGpuMultiProcessors();
- const int warpSize = 32;
-
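-    // Launch-configuration sketch: tile the convolved dimension(s) plus the
-    // kernel halo into shared memory, then spread the remaining planes across
-    // thread blocks.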
- switch (NumKernelDims) {
- case 1: {
- const int kernel_size = m_kernelImpl.dimensions().TotalSize();
-
- const int numX = dimensions()[m_indices[0]];
- const int numP = dimensions().TotalSize() / numX;
- int maxX;
- dim3 block_size;
-
- const int single_stride_dim =
- static_cast<int>(Layout) == static_cast<int>(ColMajor)
- ? 0
- : m_inputImpl.dimensions().rank() - 1;
- if (m_indices[0] == single_stride_dim) {
-          // Maximize the reuse
- const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
- maxX = numext::mini<int>(inner_dim, numX);
- const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
- block_size.x = numext::mini(maxThreadsPerBlock, maxX);
- block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
- }
- else {
-          // Read as much as possible along the innermost dimension, that is, the plane
- const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
- const int maxP = numext::mini<int>(inner_dim, numP);
- maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);
-
- block_size.x = numext::mini(warpSize, maxX);
- block_size.y = numext::mini<int>(maxThreadsPerBlock/block_size.x, maxP);
- }
-
- const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
- gpu_assert(shared_mem <= maxSharedMem);
-
- const int num_x_blocks = ceil(numX, maxX);
- const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
- const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);
-
- dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));
-
-
- //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
-
- const array<Index, 1> indices(m_indices[0]);
- const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]);
- internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(
- m_inputImpl.dimensions(), kernel_dims, indices);
- switch(kernel_size) {
- case 4: {
- LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
- break;
- }
- case 7: {
- LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
- break;
- }
- default: {
- LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
- }
- }
- break;
- }
-
- case 2: {
- const int idxX =
- static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
- const int idxY =
- static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
- const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
- const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
-
- const int numX = dimensions()[m_indices[idxX]];
- const int numY = dimensions()[m_indices[idxY]];
- const int numP = dimensions().TotalSize() / (numX*numY);
-
- const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));
-
- // Snap maxX to warp size
- int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
- const int maxX = numext::mini<int>(inner_dim, numX);
- const int maxY = numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
- const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);
-
- dim3 block_size;
- block_size.x = numext::mini(1024, maxX);
- block_size.y = numext::mini<int>(1024/block_size.x, maxY);
- block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP);
-
- const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
- gpu_assert(shared_mem <= maxSharedMem);
-
- const int num_x_blocks = ceil(numX, maxX);
- const int num_y_blocks = ceil(numY, maxY);
- const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
- const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);
-
- dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));
-
-
- //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
-
- const array<Index, 2> indices(m_indices[idxX], m_indices[idxY]);
- const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX],
- m_kernelImpl.dimensions()[idxY]);
- internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(
- m_inputImpl.dimensions(), kernel_dims, indices);
- switch (kernel_size_x) {
- case 4: {
- switch (kernel_size_y) {
- case 7: {
- LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
- break;
- }
- default: {
- LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
- break;
- }
- }
- break;
- }
- case 7: {
- switch (kernel_size_y) {
- case 4: {
- LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
- break;
- }
- default: {
- LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
- break;
- }
- }
- break;
- }
- default: {
- LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
- break;
- }
- }
- break;
- }
-
- case 3: {
- const int idxX =
- static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
- const int idxY =
- static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
- const int idxZ =
- static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;
-
- const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
- const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
- const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];
-
- const int numX = dimensions()[m_indices[idxX]];
- const int numY = dimensions()[m_indices[idxY]];
- const int numZ = dimensions()[m_indices[idxZ]];
- const int numP = dimensions().TotalSize() / (numX*numY*numZ);
-
- const int maxX = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
- const int maxY = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
- const int maxZ = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));
-
- dim3 block_size;
- block_size.x = numext::mini(32, maxX);
- block_size.y = numext::mini(32, maxY);
- block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxZ);
- dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));
-
- const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
- gpu_assert(shared_mem <= maxSharedMem);
-
- //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
- const array<Index, 3> indices(m_indices[idxX], m_indices[idxY],
- m_indices[idxZ]);
- const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX],
- m_kernelImpl.dimensions()[idxY],
- m_kernelImpl.dimensions()[idxZ]);
- internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
- m_inputImpl.dimensions(), kernel_dims, indices);
-
- LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
- break;
- }
-
- default: {
- EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
- }
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
- {
- eigen_assert(m_buf);
- eigen_assert(index < m_dimensions.TotalSize());
- return m_buf[index];
- }
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
- {
- eigen_assert(m_buf);
- eigen_assert(index < m_dimensions.TotalSize());
- return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
- costPerCoeff(bool vectorized) const {
- // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
- // model.
- const double kernel_size = m_kernelImpl.dimensions().TotalSize();
- // We ignore the use of fused multiply-add.
- const double convolve_compute_cost =
- TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
- const double firstIndex_compute_cost =
- NumDims *
- (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
- TensorOpCost::DivCost<Index>());
- return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
- kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
- m_kernelImpl.costPerCoeff(vectorized) +
- TensorOpCost(0, 0, convolve_compute_cost, vectorized,
- PacketSize));
- }
-
- private:
- // No assignment (copies are needed by the kernels)
- TensorEvaluator& operator = (const TensorEvaluator&);
-
- TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
- TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
- KernelArgType m_kernelArg;
- Indices m_indices;
- Dimensions m_dimensions;
- Scalar* m_buf;
- const Scalar* m_kernel;
- bool m_local_kernel;
-
- const GpuDevice& m_device;
-};
-#endif
-
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
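For reference, the launch heuristic in the deleted 2D GPU path reduces to the arithmetic below. This is a minimal standalone sketch, not Eigen API: the shared-memory budget, kernel extents, and float Scalar are illustrative assumptions.

#include <algorithm>
#include <cmath>

// Pick a tile width maxX snapped to the 32-thread warp size, then derive
// maxY so that one input tile of (maxX + kx - 1) * (maxY + ky - 1) floats
// fits in the shared-memory budget, mirroring the deleted code path.
void plan2dLaunch(int maxSharedMem, int kx, int ky, int numX, int numY,
                  int& maxX, int& maxY, int& sharedBytes) {
  const float scale =
      std::sqrt(static_cast<float>(maxSharedMem) / (sizeof(float) * kx * ky));
  const int warp_snapped =
      ((static_cast<int>(scale * kx) - kx + 1 + 32) / 32) * 32;
  maxX = std::min(warp_snapped, numX);
  maxY = std::min(
      maxSharedMem / static_cast<int>(sizeof(float) * (maxX + kx - 1)) - ky + 1,
      numY);
  sharedBytes = (maxX + kx - 1) * (maxY + ky - 1) *
                static_cast<int>(sizeof(float));  // per block, for block.z == 1
}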
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorConvolutionSycl.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorConvolutionSycl.h
deleted file mode 100644
index 033318f..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorConvolutionSycl.h
+++ /dev/null
@@ -1,544 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli Codeplay Software Ltd.
-// Ralph Potter Codeplay Software Ltd.
-// Luke Iwanski Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
-
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
-#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
-
-namespace Eigen {
-
-/** \class TensorConvolution
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Tensor convolution class.
- *
- *
- */
-
-enum class convolution_type { CONV1D, CONV2D, CONV3D };
-template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
- typename Kernel_accessor, typename Buffer_accessor, convolution_type Conv_Dim>
-struct EigenConvolutionKernel;
-template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
- typename Kernel_accessor, typename Buffer_accessor>
-struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
- Buffer_accessor, convolution_type::CONV1D> {
- typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
- Local_accessor;
- Local_accessor local_acc;
- Evaluator device_evaluator;
- Kernel_accessor kernel_filter;
- Buffer_accessor buffer_acc;
- internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper;
- const size_t kernelSize;
- const cl::sycl::range<2> input_range;
- EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
- Buffer_accessor buffer_acc_,
- internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper_,
- const size_t kernelSize_, const cl::sycl::range<2> input_range_)
- : local_acc(local_acc_),
- device_evaluator(device_evaluator_),
- kernel_filter(kernel_filter_),
- buffer_acc(buffer_acc_),
- indexMapper(indexMapper_),
- kernelSize(kernelSize_),
- input_range(input_range_) {}
-
- template <typename BooleanDim2>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) {
- return (boolean_check[0] && boolean_check[1]);
- }
- void operator()(cl::sycl::nd_item<2> itemID) {
- auto buffer_ptr = buffer_acc.get_pointer();
- auto kernel_ptr = kernel_filter.get_pointer();
-    // number of input elements each plane needs staged in shared memory
- const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1);
- const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input;
- const size_t input_offset = itemID.get_group(0) * itemID.get_local_range()[0];
- const size_t plane_tensor_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(1));
- /// fill the shared memory
- for (size_t i = itemID.get_local_id(0); i < num_input; i += itemID.get_local_range()[0]) {
- const size_t local_index = i + plane_kernel_offset;
- const size_t tensor_index =
- plane_tensor_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + input_offset);
-
- local_acc[local_index] =
- (((i + input_offset) < (input_range[0] + kernelSize - 1)) && itemID.get_global_id(1) < input_range[1])
- ? device_evaluator.coeff(tensor_index)
- : CoeffReturnType(0);
- }
-
- itemID.barrier(cl::sycl::access::fence_space::local_space);
-
-    // calculate the convolution; first_output_start is the output start along x
- const size_t first_output_start = itemID.get_group(0) * (itemID.get_local_range()[0]);
- if (boundary_check(itemID.get_global_id() < input_range)) {
- CoeffReturnType result = static_cast<CoeffReturnType>(0);
- const size_t index = plane_kernel_offset + itemID.get_local_id(0);
- for (size_t k = 0; k < kernelSize; ++k) {
- result += (local_acc[k + index] * kernel_ptr[k]);
- }
- const size_t tensor_index =
- indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(1)) +
- indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + first_output_start);
- buffer_ptr[tensor_index] = result;
- }
- }
-};
-
-template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
- typename Kernel_accessor, typename Buffer_accessor>
-struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
- Buffer_accessor, convolution_type::CONV2D> {
- typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
- Local_accessor;
- Local_accessor local_acc;
- Evaluator device_evaluator;
- Kernel_accessor kernel_filter;
- Buffer_accessor buffer_acc;
- internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper;
- const cl::sycl::range<2> kernel_size;
- const cl::sycl::range<3> input_range;
- EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
- Buffer_accessor buffer_acc_,
- internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper_,
- const cl::sycl::range<2> kernel_size_, const cl::sycl::range<3> input_range_)
- : local_acc(local_acc_),
- device_evaluator(device_evaluator_),
- kernel_filter(kernel_filter_),
- buffer_acc(buffer_acc_),
- indexMapper(indexMapper_),
- kernel_size(kernel_size_),
- input_range(input_range_) {}
- template <typename BooleanDim3>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
- return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
- }
-
- void operator()(cl::sycl::nd_item<3> itemID) {
- auto buffer_ptr = buffer_acc.get_pointer();
- auto kernel_ptr = kernel_filter.get_pointer();
-    // number of input rows/columns each plane needs staged in shared memory
- const auto num_input = cl::sycl::range<2>{
- (cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)};
-
- const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(2));
- const size_t plane_kernel_offset = itemID.get_local_id(2) * num_input[1];
-
- const auto input_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
- itemID.get_group(1) * itemID.get_local_range()[1]};
-
- // fill the local memory
- bool in_range_dim2 = itemID.get_global_id(2) < input_range[2];
- for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
- const size_t local_input_offset = num_input[0] * (j + plane_kernel_offset);
- bool in_range_dim1 = ((j + input_offset[1]) < (input_range[1] + kernel_size[1] - 1));
- for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
- const size_t local_index = i + local_input_offset;
- const size_t tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
- i + input_offset[0], j + input_offset[1]);
- local_acc[local_index] = (((i + input_offset[0]) < (input_range[0] + kernel_size[0] - 1)) &&
- in_range_dim1 && in_range_dim2)
- ? device_evaluator.coeff(tensor_index)
- : CoeffReturnType(0);
- }
- }
-
- itemID.barrier(cl::sycl::access::fence_space::local_space);
-
- // output offset start for each thread
- const auto output_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
- itemID.get_group(1) * itemID.get_local_range()[1]};
-
- if (boundary_check(itemID.get_global_id() < input_range)) {
- CoeffReturnType result = static_cast<CoeffReturnType>(0);
-
- for (size_t j = 0; j < kernel_size[1]; j++) {
- size_t kernel_offset = kernel_size[0] * j;
- const size_t index =
- (num_input[0] * (plane_kernel_offset + j + itemID.get_local_id(1))) + itemID.get_local_id(0);
- for (size_t i = 0; i < kernel_size[0]; i++) {
- result += (local_acc[i + index] * kernel_ptr[i + kernel_offset]);
- }
- }
- const size_t tensor_index =
- indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(2)) +
- indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + output_offset[0],
- itemID.get_local_id(1) + output_offset[1]);
-
- buffer_ptr[tensor_index] = result;
- }
- }
-};
-
-template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
- typename Kernel_accessor, typename Buffer_accessor>
-struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
- Buffer_accessor, convolution_type::CONV3D> {
- typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
- Local_accessor;
- Local_accessor local_acc;
- Evaluator device_evaluator;
- Kernel_accessor kernel_filter;
- Buffer_accessor buffer_acc;
- internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper;
- const cl::sycl::range<3> kernel_size;
- const cl::sycl::range<3> input_range;
- const size_t numP;
-
- EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
- Buffer_accessor buffer_acc_,
- internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper_,
- const cl::sycl::range<3> kernel_size_, const cl::sycl::range<3> input_range_,
- const size_t numP_)
- : local_acc(local_acc_),
- device_evaluator(device_evaluator_),
- kernel_filter(kernel_filter_),
- buffer_acc(buffer_acc_),
- indexMapper(indexMapper_),
- kernel_size(kernel_size_),
- input_range(input_range_),
- numP(numP_) {}
- template <typename BooleanDim3>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
- return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
- }
- void operator()(cl::sycl::nd_item<3> itemID) {
- auto buffer_ptr = buffer_acc.get_pointer();
- auto kernel_ptr = kernel_filter.get_pointer();
- const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1};
-
- const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()};
-
- const auto output_offset =
- cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range() + itemID.get_local_id()};
-
- for (size_t p = 0; p < numP; p++) {
- /// fill the shared memory
- const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
- for (size_t k = itemID.get_local_id(2); k < num_input[2]; k += itemID.get_local_range()[2]) {
- size_t local_index_dim2 = num_input[0] * num_input[1] * k;
- bool cond_k_dim = (k + input_offset[2] < (input_range[2] + kernel_size[2] - 1));
- for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
- bool cond_j_dim = cond_k_dim && (j + input_offset[1] < (input_range[1] + kernel_size[1] - 1));
- size_t local_index_dim1 = (num_input[0] * j) + local_index_dim2;
- for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
- bool conds = cond_j_dim && (i + input_offset[0] < (input_range[0] + kernel_size[0] - 1));
- const size_t local_index = local_index_dim1 + i;
- const size_t tensor_index =
- plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
- i + input_offset[0], j + input_offset[1], k + input_offset[2]);
- local_acc[local_index] = conds ? device_evaluator.coeff(tensor_index) : CoeffReturnType(0);
- }
- }
- }
- itemID.barrier(cl::sycl::access::fence_space::local_space);
-
- // calculate the convolution
-
- if (boundary_check(itemID.get_global_id() < input_range)) {
- CoeffReturnType result = static_cast<CoeffReturnType>(0);
- for (size_t k = 0; k < kernel_size[2]; k++) {
- for (size_t j = 0; j < kernel_size[1]; j++) {
- for (size_t i = 0; i < kernel_size[0]; i++) {
- const size_t kernel_index = i + kernel_size[0] * (j + kernel_size[1] * k);
- const size_t local_index =
- ((i + itemID.get_local_id(0)) +
- num_input[0] * ((j + itemID.get_local_id(1)) + num_input[1] * (k + itemID.get_local_id(2))));
-
- result += (local_acc[local_index] * kernel_ptr[kernel_index]);
- }
- }
- }
- const size_t tensor_index =
- indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p) +
- indexMapper.mapGpuOutputKernelToTensorOutputOffset(output_offset[0], output_offset[1], output_offset[2]);
- buffer_ptr[tensor_index] = result;
- }
-
- itemID.barrier(cl::sycl::access::fence_space::local_space);
- }
- }
-};
-
-template <typename Indices, typename InputArgType, typename KernelArgType>
-struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Eigen::SyclDevice> {
- typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
-
- static const int NumDims =
- internal::array_size<typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions>::value;
- static const int NumKernelDims = internal::array_size<Indices>::value;
- typedef typename XprType::Index Index;
- typedef DSizes<Index, NumDims> Dimensions;
- typedef typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions KernelDimensions;
- typedef const Eigen::SyclDevice Device;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Eigen::SyclDevice>::type PacketReturnType;
- typedef typename InputArgType::Scalar Scalar;
- static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
- typedef StorageMemory<CoeffReturnType, Eigen::SyclDevice> Storage;
- typedef typename Storage::Type EvaluatorPointerType;
- typedef StorageMemory<const CoeffReturnType, Eigen::SyclDevice> KernelStorage;
-
- enum {
- IsAligned = TensorEvaluator<InputArgType, Eigen::SyclDevice>::IsAligned &
- TensorEvaluator<KernelArgType, Eigen::SyclDevice>::IsAligned,
- PacketAccess = false,
- BlockAccess = false,
- PreferBlockAccess = false,
- Layout = TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = false
- };
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlock;
- //===--------------------------------------------------------------------===//
-
- TensorEvaluator(const XprType &op, const Eigen::SyclDevice &device)
- : m_inputImpl(op.inputExpression(), device),
- m_kernelArg(op.kernelExpression()),
- m_kernelImpl(op.kernelExpression(), device),
- m_indices(op.indices()),
- m_buf(NULL),
- m_kernel(NULL),
- m_local_kernel(false),
- m_device(device) {
- EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout) ==
- static_cast<int>(TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Layout)),
- YOU_MADE_A_PROGRAMMING_MISTAKE);
-
- const typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions &input_dims = m_inputImpl.dimensions();
- const typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions &kernel_dims =
- m_kernelImpl.dimensions();
-
- m_dimensions = m_inputImpl.dimensions();
- for (int i = 0; i < NumKernelDims; ++i) {
- const Index index = op.indices()[i];
- const Index input_dim = input_dims[index];
- const Index kernel_dim = kernel_dims[i];
- const Index result_dim = input_dim - kernel_dim + 1;
- m_dimensions[index] = result_dim;
- }
- }
-
- EIGEN_DEVICE_FUNC const Dimensions &dimensions() const { return m_dimensions; }
-
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
- preloadKernel();
- m_inputImpl.evalSubExprsIfNeeded(NULL);
- if (data) {
- executeEval(data);
- return false;
- } else {
- m_buf = (EvaluatorPointerType)m_device.get(
- (Scalar *)m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)));
- executeEval(m_buf);
- return true;
- }
- }
-
- EIGEN_STRONG_INLINE void cleanup() {
- m_inputImpl.cleanup();
- if (m_buf) {
- m_device.deallocate_temp(m_buf);
- m_buf = NULL;
- }
- if (m_local_kernel) {
- m_device.deallocate_temp(m_kernel);
- m_local_kernel = false;
- }
- m_kernel = NULL;
- }
- /// used by sycl in order to build the sycl buffer
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device &device() const { return m_device; }
- /// used by sycl in order to build the sycl buffer
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_buf; }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
- // Don't make a local copy of the kernel unless we have to (i.e. it's an
- // expression that needs to be evaluated)
- typename KernelStorage::Type in_place = m_kernelImpl.data();
- if (in_place) {
- m_kernel = in_place;
- m_local_kernel = false;
- } else {
- ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
- EvaluatorPointerType local = (EvaluatorPointerType)m_device.get((Scalar *)m_device.allocate_temp(kernel_sz));
- typedef TensorEvalToOp<const KernelArgType> EvalTo;
- EvalTo evalToTmp(m_device.get(local), m_kernelArg);
- const bool PacketAccess = internal::IsVectorizable<Eigen::SyclDevice, KernelArgType>::value;
- internal::TensorExecutor<const EvalTo, Eigen::SyclDevice, PacketAccess>::run(evalToTmp, m_device);
- m_kernel = local;
- m_local_kernel = true;
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(EvaluatorPointerType data) const {
- typedef TensorEvaluator<InputArgType, Eigen::SyclDevice> InputEvaluator;
- typedef typename InputEvaluator::Dimensions InputDims;
- switch (NumKernelDims) {
- case 1: {
- const size_t numX = dimensions()[m_indices[0]];
- const size_t numP = dimensions().TotalSize() / numX;
- const auto input_dim = std::array<size_t, 2>{numX, numP};
- auto global_range = cl::sycl::range<2>{};
- auto local_range = cl::sycl::range<2>{};
- const size_t kernel_size = m_kernelImpl.dimensions().TotalSize();
-
- m_device.parallel_for_setup(input_dim, global_range, local_range);
- const size_t local_memory_size = (local_range[0] + kernel_size - 1) * (local_range[1]);
- gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
- const array<Index, 1> indices{{m_indices[0]}};
- const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}};
- internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
-
- typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
- typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV1D>
- ConvKernel;
-
- m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
- m_inputImpl, m_kernel, data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size,
- indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1]));
- break;
- }
-
- case 2: {
- auto kernel_index = std::array<size_t, 2>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1,
- static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0};
- auto kernel_size = cl::sycl::range<2>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
- (size_t)m_kernelImpl.dimensions()[kernel_index[1]]};
- const size_t numX = dimensions()[m_indices[kernel_index[0]]];
- const size_t numY = dimensions()[m_indices[kernel_index[1]]];
- const size_t numP = dimensions().TotalSize() / (numX * numY);
- auto input_dim = std::array<size_t, 3>{numX, numY, numP};
-
- auto global_range = cl::sycl::range<3>{};
- auto local_range = cl::sycl::range<3>{};
-
- m_device.parallel_for_setup(input_dim, global_range, local_range);
-
- const size_t local_memory_size =
- (local_range[0] + kernel_size[0] - 1) * (local_range[1] + kernel_size[1] - 1) * local_range[2];
- gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
- const array<Index, 2> indices{{m_indices[kernel_index[0]], m_indices[kernel_index[1]]}};
- const array<Index, 2> kernel_dims{
- {m_kernelImpl.dimensions()[kernel_index[0]], m_kernelImpl.dimensions()[kernel_index[1]]}};
- internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
- typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
- typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV2D>
- ConvKernel;
- m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
- m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
- indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]});
- break;
- }
-
- case 3: {
- auto kernel_index = std::array<size_t, 3>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2,
- static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1,
- static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0};
-
- auto kernel_size = cl::sycl::range<3>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
- (size_t)m_kernelImpl.dimensions()[kernel_index[1]],
- (size_t)m_kernelImpl.dimensions()[kernel_index[2]]};
-
- const size_t numX = dimensions()[m_indices[kernel_index[0]]];
- const size_t numY = dimensions()[m_indices[kernel_index[1]]];
- const size_t numZ = dimensions()[m_indices[kernel_index[2]]];
- auto input_dim = std::array<size_t, 3>{numX, numY, numZ};
- const size_t numP = dimensions().TotalSize() / (numX * numY * numZ);
-
- const array<Index, 3> indices{
- {m_indices[kernel_index[0]], m_indices[kernel_index[1]], m_indices[kernel_index[2]]}};
- const array<Index, 3> kernel_dims{{m_kernelImpl.dimensions()[kernel_index[0]],
- m_kernelImpl.dimensions()[kernel_index[1]],
- m_kernelImpl.dimensions()[kernel_index[2]]}};
-
- internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
-
- auto global_range = cl::sycl::range<3>{};
- auto local_range = cl::sycl::range<3>{};
-
- m_device.parallel_for_setup(input_dim, global_range, local_range);
- auto local_memory_range = (local_range + kernel_size - 1);
- const size_t local_memory_size = local_memory_range[0] * local_memory_range[1] * local_memory_range[2];
-
- gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
- typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
- typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV3D>
- ConvKernel;
- m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
- m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
- indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP);
- break;
- }
-
- default: {
- EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3),
- THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
- }
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
- eigen_assert(m_buf != NULL);
- eigen_assert(index < m_dimensions.TotalSize());
- return m_buf[index];
- }
-
- template <int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const {
- eigen_assert(m_buf != NULL);
- eigen_assert(index < m_dimensions.TotalSize());
- return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
- // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
- // model.
- const double kernel_size = m_kernelImpl.dimensions().TotalSize();
- // We ignore the use of fused multiply-add.
- const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
- const double firstIndex_compute_cost =
- NumDims *
- (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
- return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
- kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
- TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
- }
- // binding placeholder accessors to a command group handler for SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
- m_kernelImpl.bind(cgh);
- m_inputImpl.bind(cgh);
- m_buf.bind(cgh);
- m_kernel.bind(cgh);
- }
-
- private:
- // No assignment (copies are needed by the kernels)
- TensorEvaluator &operator=(const TensorEvaluator &);
- TensorEvaluator<InputArgType, Eigen::SyclDevice> m_inputImpl;
- KernelArgType m_kernelArg;
- TensorEvaluator<KernelArgType, Eigen::SyclDevice> m_kernelImpl;
- Indices m_indices;
- Dimensions m_dimensions;
- EvaluatorPointerType m_buf;
- typename KernelStorage::Type m_kernel;
- bool m_local_kernel;
- const Eigen::SyclDevice EIGEN_DEVICE_REF m_device;
-};
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
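At the API level the SYCL evaluator deleted above is reached through the ordinary convolve expression once a SyclDevice is attached. A rough usage sketch, assuming EIGEN_USE_SYCL is defined, a SYCL runtime is present, and the default device selector is acceptable:

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::QueueInterface queue_interface{cl::sycl::default_selector{}};
  Eigen::SyclDevice sycl_device(&queue_interface);

  Eigen::Tensor<float, 2> in(64, 64), ker(3, 3), out(62, 62);
  in.setRandom();
  ker.setRandom();

  // SYCL expressions operate on device buffers wrapped in TensorMaps.
  float* d_in  = static_cast<float*>(sycl_device.allocate(in.size() * sizeof(float)));
  float* d_ker = static_cast<float*>(sycl_device.allocate(ker.size() * sizeof(float)));
  float* d_out = static_cast<float*>(sycl_device.allocate(out.size() * sizeof(float)));
  sycl_device.memcpyHostToDevice(d_in, in.data(), in.size() * sizeof(float));
  sycl_device.memcpyHostToDevice(d_ker, ker.data(), ker.size() * sizeof(float));

  Eigen::TensorMap<Eigen::Tensor<float, 2>> gin(d_in, 64, 64);
  Eigen::TensorMap<Eigen::Tensor<float, 2>> gker(d_ker, 3, 3);
  Eigen::TensorMap<Eigen::Tensor<float, 2>> gout(d_out, 62, 62);

  // Dispatches to the CONV2D EigenConvolutionKernel specialization above.
  Eigen::array<Eigen::Index, 2> dims{{0, 1}};
  gout.device(sycl_device) = gin.convolve(gker, dims);

  sycl_device.memcpyDeviceToHost(out.data(), d_out, out.size() * sizeof(float));
  sycl_device.deallocate(d_in);
  sycl_device.deallocate(d_ker);
  sycl_device.deallocate(d_out);
}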
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorCostModel.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorCostModel.h
deleted file mode 100644
index 195267c..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorCostModel.h
+++ /dev/null
@@ -1,214 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016 Rasmus Munk Larsen <rmlarsen@google.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
-#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
-
-namespace Eigen {
-
-/** \class TensorOpCost
- * \ingroup CXX11_Tensor_Module
- *
- * \brief A cost model used to limit the number of threads used for evaluating
- * tensor expressions.
- *
- */
-
-// Class storing the cost of evaluating a tensor expression in terms of the
-// estimated number of operand bytes loaded, bytes stored, and compute cycles.
-class TensorOpCost {
- public:
- // TODO(rmlarsen): Fix the scalar op costs in Eigen proper. Even a simple
- // model based on minimal reciprocal throughput numbers from Intel or
- // Agner Fog's tables would be better than what is there now.
- template <typename ArgType>
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int MulCost() {
- return internal::functor_traits<
- internal::scalar_product_op<ArgType, ArgType> >::Cost;
- }
- template <typename ArgType>
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int AddCost() {
- return internal::functor_traits<internal::scalar_sum_op<ArgType> >::Cost;
- }
- template <typename ArgType>
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int DivCost() {
- return internal::functor_traits<
- internal::scalar_quotient_op<ArgType, ArgType> >::Cost;
- }
- template <typename ArgType>
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int ModCost() {
- return internal::functor_traits<internal::scalar_mod_op<ArgType> >::Cost;
- }
- template <typename SrcType, typename TargetType>
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int CastCost() {
- return internal::functor_traits<
- internal::scalar_cast_op<SrcType, TargetType> >::Cost;
- }
-
- EIGEN_DEVICE_FUNC
- TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {}
- EIGEN_DEVICE_FUNC
- TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
- : bytes_loaded_(bytes_loaded),
- bytes_stored_(bytes_stored),
- compute_cycles_(compute_cycles) {}
-
- EIGEN_DEVICE_FUNC
- TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles,
- bool vectorized, double packet_size)
- : bytes_loaded_(bytes_loaded),
- bytes_stored_(bytes_stored),
- compute_cycles_(vectorized ? compute_cycles / packet_size
- : compute_cycles) {
- eigen_assert(bytes_loaded >= 0 && (numext::isfinite)(bytes_loaded));
- eigen_assert(bytes_stored >= 0 && (numext::isfinite)(bytes_stored));
- eigen_assert(compute_cycles >= 0 && (numext::isfinite)(compute_cycles));
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const {
- return bytes_loaded_;
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const {
- return bytes_stored_;
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const {
- return compute_cycles_;
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost(
- double load_cost, double store_cost, double compute_cost) const {
- return load_cost * bytes_loaded_ + store_cost * bytes_stored_ +
- compute_cost * compute_cycles_;
- }
-
- // Drop memory access component. Intended for cases when memory accesses are
- // sequential or are completely masked by computations.
- EIGEN_DEVICE_FUNC void dropMemoryCost() {
- bytes_loaded_ = 0;
- bytes_stored_ = 0;
- }
-
- // TODO(rmlarsen): Define min in terms of total cost, not elementwise.
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin(
- const TensorOpCost& rhs) const {
- double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded());
- double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored());
- double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles());
- return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
- }
-
- // TODO(rmlarsen): Define max in terms of total cost, not elementwise.
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax(
- const TensorOpCost& rhs) const {
- double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
- double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored());
- double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles());
- return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(
- const TensorOpCost& rhs) {
- bytes_loaded_ += rhs.bytes_loaded();
- bytes_stored_ += rhs.bytes_stored();
- compute_cycles_ += rhs.compute_cycles();
- return *this;
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(double rhs) {
- bytes_loaded_ *= rhs;
- bytes_stored_ *= rhs;
- compute_cycles_ *= rhs;
- return *this;
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+(
- TensorOpCost lhs, const TensorOpCost& rhs) {
- lhs += rhs;
- return lhs;
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
- TensorOpCost lhs, double rhs) {
- lhs *= rhs;
- return lhs;
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
- double lhs, TensorOpCost rhs) {
- rhs *= lhs;
- return rhs;
- }
-
- friend std::ostream& operator<<(std::ostream& os, const TensorOpCost& tc) {
- return os << "[bytes_loaded = " << tc.bytes_loaded()
- << ", bytes_stored = " << tc.bytes_stored()
- << ", compute_cycles = " << tc.compute_cycles() << "]";
- }
-
- private:
- double bytes_loaded_;
- double bytes_stored_;
- double compute_cycles_;
-};
-
-// TODO(rmlarsen): Implement a policy that chooses an "optimal" number of threads
-// in [1:max_threads] instead of just switching multi-threading off for small
-// work units.
-template <typename Device>
-class TensorCostModel {
- public:
- // Scaling from Eigen compute cost to device cycles.
- static const int kDeviceCyclesPerComputeCycle = 1;
-
- // Costs in device cycles.
- static const int kStartupCycles = 100000;
- static const int kPerThreadCycles = 100000;
- static const int kTaskSize = 40000;
-
- // Returns the number of threads in [1:max_threads] to use for
- // evaluating an expression with the given output size and cost per
- // coefficient.
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(
- double output_size, const TensorOpCost& cost_per_coeff, int max_threads) {
- double cost = totalCost(output_size, cost_per_coeff);
- double threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
- // Make sure we don't invoke undefined behavior when we convert to an int.
- threads = numext::mini<double>(threads, GenericNumTraits<int>::highest());
- return numext::mini(max_threads,
- numext::maxi<int>(1, static_cast<int>(threads)));
- }
-
- // taskSize assesses parallel task size.
- // Value of 1.0 means ideal parallel task size. Values < 1.0 mean that task
- // granularity needs to be increased to mitigate parallelization overheads.
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize(
- double output_size, const TensorOpCost& cost_per_coeff) {
- return totalCost(output_size, cost_per_coeff) / kTaskSize;
- }
-
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(
- double output_size, const TensorOpCost& cost_per_coeff) {
- // Cost of memory fetches from L2 cache. 64 is typical cache line size.
- // 11 is L2 cache latency on Haswell.
- // We don't know whether data is in L1, L2 or L3. But we are most interested
- // in single-threaded computational time around 100us-10ms (smaller time
- // is too small for parallelization, larger time is not interesting
- // either because we are probably using all available threads already).
- // And for the target time range, L2 seems to be what matters. Data set
- // fitting into L1 is too small to take noticeable time. Data set fitting
- // only into L3 presumably will take more than 10ms to load and process.
- const double kLoadCycles = 1.0 / 64 * 11;
- const double kStoreCycles = 1.0 / 64 * 11;
- // Scaling from Eigen compute cost to device cycles.
- return output_size *
- cost_per_coeff.total_cost(kLoadCycles, kStoreCycles,
- kDeviceCyclesPerComputeCycle);
- }
-};
-
-} // namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
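A worked example of the cost model deleted above (the per-coefficient numbers are illustrative, not measured): loading and storing 8 bytes with 2 compute cycles per coefficient gives 8 * 11/64 + 8 * 11/64 + 2 = 4.75 device cycles per coefficient, so a 2^20-element output totals about 4.98e6 cycles and numThreads saturates max_threads.

#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  // Hypothetical per-coefficient cost: 8 bytes loaded, 8 stored, 2 cycles.
  Eigen::TensorOpCost cost(/*bytes_loaded=*/8, /*bytes_stored=*/8,
                           /*compute_cycles=*/2);
  const double output_size = 1 << 20;
  // totalCost ~= 2^20 * 4.75 ~= 4.98e6 cycles;
  // (4.98e6 - kStartupCycles) / kPerThreadCycles + 0.9 ~= 49.7 threads,
  // clamped to max_threads.
  const int n = Eigen::TensorCostModel<Eigen::ThreadPoolDevice>::numThreads(
      output_size, cost, /*max_threads=*/16);
  std::cout << cost << " -> " << n << " threads\n";  // prints 16
}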
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorCustomOp.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorCustomOp.h
deleted file mode 100644
index 95a8a84..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorCustomOp.h
+++ /dev/null
@@ -1,347 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H
-#define EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H
-
-namespace Eigen {
-
-/** \class TensorCustomUnaryOp
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Tensor custom class.
- *
- *
- */
-namespace internal {
-template<typename CustomUnaryFunc, typename XprType>
-struct traits<TensorCustomUnaryOp<CustomUnaryFunc, XprType> >
-{
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::StorageKind StorageKind;
- typedef typename XprType::Index Index;
- typedef typename XprType::Nested Nested;
- typedef typename remove_reference<Nested>::type _Nested;
- static const int NumDimensions = traits<XprType>::NumDimensions;
- static const int Layout = traits<XprType>::Layout;
- typedef typename traits<XprType>::PointerType PointerType;
-};
-
-template<typename CustomUnaryFunc, typename XprType>
-struct eval<TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Eigen::Dense>
-{
-  typedef const TensorCustomUnaryOp<CustomUnaryFunc, XprType> EIGEN_DEVICE_REF type;
-};
-
-template<typename CustomUnaryFunc, typename XprType>
-struct nested<TensorCustomUnaryOp<CustomUnaryFunc, XprType> >
-{
- typedef TensorCustomUnaryOp<CustomUnaryFunc, XprType> type;
-};
-
-} // end namespace internal
-
-
-
-template<typename CustomUnaryFunc, typename XprType>
-class TensorCustomUnaryOp : public TensorBase<TensorCustomUnaryOp<CustomUnaryFunc, XprType>, ReadOnlyAccessors>
-{
- public:
- typedef typename internal::traits<TensorCustomUnaryOp>::Scalar Scalar;
- typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename internal::nested<TensorCustomUnaryOp>::type Nested;
- typedef typename internal::traits<TensorCustomUnaryOp>::StorageKind StorageKind;
- typedef typename internal::traits<TensorCustomUnaryOp>::Index Index;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCustomUnaryOp(const XprType& expr, const CustomUnaryFunc& func)
- : m_expr(expr), m_func(func) {}
-
- EIGEN_DEVICE_FUNC
- const CustomUnaryFunc& func() const { return m_func; }
-
- EIGEN_DEVICE_FUNC
- const typename internal::remove_all<typename XprType::Nested>::type&
- expression() const { return m_expr; }
-
- protected:
- typename XprType::Nested m_expr;
- const CustomUnaryFunc m_func;
-};
-
-
-// Eval as rvalue
-template<typename CustomUnaryFunc, typename XprType, typename Device>
-struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Device>
-{
- typedef TensorCustomUnaryOp<CustomUnaryFunc, XprType> ArgType;
- typedef typename internal::traits<ArgType>::Index Index;
- static const int NumDims = internal::traits<ArgType>::NumDimensions;
- typedef DSizes<Index, NumDims> Dimensions;
- typedef typename internal::remove_const<typename ArgType::Scalar>::type Scalar;
- typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
- typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
- typedef StorageMemory<CoeffReturnType, Device> Storage;
- typedef typename Storage::Type EvaluatorPointerType;
-
- enum {
- IsAligned = false,
- PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
- BlockAccess = false,
- PreferBlockAccess = false,
- Layout = TensorEvaluator<XprType, Device>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = false
- };
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlock;
- //===--------------------------------------------------------------------===//
-
- EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device)
- : m_op(op), m_device(device), m_result(NULL)
- {
- m_dimensions = op.func().dimensions(op.expression());
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
-
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
- if (data) {
- evalTo(data);
- return false;
- } else {
- m_result = static_cast<EvaluatorPointerType>(m_device.get( (CoeffReturnType*)
- m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar))));
- evalTo(m_result);
- return true;
- }
- }
-
- EIGEN_STRONG_INLINE void cleanup() {
- if (m_result) {
- m_device.deallocate_temp(m_result);
- m_result = NULL;
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
- return m_result[index];
- }
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
- return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
- // TODO(rmlarsen): Extend CustomOp API to return its cost estimate.
- return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
- }
-
- EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
-
-#ifdef EIGEN_USE_SYCL
- // binding placeholder accessors to a command group handler for SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
- m_result.bind(cgh);
- }
-#endif
-
- protected:
- void evalTo(EvaluatorPointerType data) {
- TensorMap<Tensor<CoeffReturnType, NumDims, Layout, Index> > result(m_device.get(data), m_dimensions);
- m_op.func().eval(m_op.expression(), result, m_device);
- }
-
- Dimensions m_dimensions;
- const ArgType m_op;
- const Device EIGEN_DEVICE_REF m_device;
- EvaluatorPointerType m_result;
-};
-
-
-
-/** \class TensorCustomBinaryOp
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Tensor custom class.
- *
- *
- */
-namespace internal {
-template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>
-struct traits<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> >
-{
- typedef typename internal::promote_storage_type<typename LhsXprType::Scalar,
- typename RhsXprType::Scalar>::ret Scalar;
- typedef typename internal::promote_storage_type<typename LhsXprType::CoeffReturnType,
- typename RhsXprType::CoeffReturnType>::ret CoeffReturnType;
- typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind,
- typename traits<RhsXprType>::StorageKind>::ret StorageKind;
- typedef typename promote_index_type<typename traits<LhsXprType>::Index,
- typename traits<RhsXprType>::Index>::type Index;
- typedef typename LhsXprType::Nested LhsNested;
- typedef typename RhsXprType::Nested RhsNested;
- typedef typename remove_reference<LhsNested>::type _LhsNested;
- typedef typename remove_reference<RhsNested>::type _RhsNested;
- static const int NumDimensions = traits<LhsXprType>::NumDimensions;
- static const int Layout = traits<LhsXprType>::Layout;
- typedef typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val,
- typename traits<LhsXprType>::PointerType, typename traits<RhsXprType>::PointerType>::type PointerType;
-};
-
-template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>
-struct eval<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Eigen::Dense>
-{
- typedef const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>& type;
-};
-
-template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>
-struct nested<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> >
-{
- typedef TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> type;
-};
-
-} // end namespace internal
-
-
-
-template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>
-class TensorCustomBinaryOp : public TensorBase<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, ReadOnlyAccessors>
-{
- public:
- typedef typename internal::traits<TensorCustomBinaryOp>::Scalar Scalar;
- typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
- typedef typename internal::traits<TensorCustomBinaryOp>::CoeffReturnType CoeffReturnType;
- typedef typename internal::nested<TensorCustomBinaryOp>::type Nested;
- typedef typename internal::traits<TensorCustomBinaryOp>::StorageKind StorageKind;
- typedef typename internal::traits<TensorCustomBinaryOp>::Index Index;
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCustomBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const CustomBinaryFunc& func)
-      : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_func(func) {}
-
- EIGEN_DEVICE_FUNC
- const CustomBinaryFunc& func() const { return m_func; }
-
- EIGEN_DEVICE_FUNC
- const typename internal::remove_all<typename LhsXprType::Nested>::type&
- lhsExpression() const { return m_lhs_xpr; }
-
- EIGEN_DEVICE_FUNC
- const typename internal::remove_all<typename RhsXprType::Nested>::type&
- rhsExpression() const { return m_rhs_xpr; }
-
- protected:
- typename LhsXprType::Nested m_lhs_xpr;
- typename RhsXprType::Nested m_rhs_xpr;
- const CustomBinaryFunc m_func;
-};
-
-
-// Eval as rvalue
-template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType, typename Device>
-struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Device>
-{
- typedef TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> XprType;
- typedef typename internal::traits<XprType>::Index Index;
- static const int NumDims = internal::traits<XprType>::NumDimensions;
- typedef DSizes<Index, NumDims> Dimensions;
- typedef typename XprType::Scalar Scalar;
- typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
-
- typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
- typedef StorageMemory<CoeffReturnType, Device> Storage;
- typedef typename Storage::Type EvaluatorPointerType;
-
- enum {
- IsAligned = false,
- PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
- BlockAccess = false,
- PreferBlockAccess = false,
- Layout = TensorEvaluator<LhsXprType, Device>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = false
- };
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlock;
- //===--------------------------------------------------------------------===//
-
- EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_op(op), m_device(device), m_result(NULL)
- {
- m_dimensions = op.func().dimensions(op.lhsExpression(), op.rhsExpression());
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
-
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
- if (data) {
- evalTo(data);
- return false;
- } else {
- m_result = static_cast<EvaluatorPointerType>(m_device.get( (CoeffReturnType*)
- m_device.allocate_temp(dimensions().TotalSize() * sizeof(CoeffReturnType))));
- evalTo(m_result);
- return true;
- }
- }
-
- EIGEN_STRONG_INLINE void cleanup() {
- if (m_result != NULL) {
- m_device.deallocate_temp(m_result);
- m_result = NULL;
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
- return m_result[index];
- }
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
- return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
- // TODO(rmlarsen): Extend CustomOp API to return its cost estimate.
- return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
- }
-
- EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
-
-#ifdef EIGEN_USE_SYCL
- // binding placeholder accessors to a command group handler for SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
- m_result.bind(cgh);
- }
-#endif
-
- protected:
- void evalTo(EvaluatorPointerType data) {
- TensorMap<Tensor<CoeffReturnType, NumDims, Layout> > result(m_device.get(data), m_dimensions);
- m_op.func().eval(m_op.lhsExpression(), m_op.rhsExpression(), result, m_device);
- }
-
- Dimensions m_dimensions;
- const XprType m_op;
- const Device EIGEN_DEVICE_REF m_device;
- EvaluatorPointerType m_result;
-};
-
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H
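The only contract the deleted evaluators impose on a user functor is a dimensions() method to size the temporary result buffer and an eval() method to fill it. A minimal sketch of a custom unary op follows; the functor name and body are illustrative:

#include <unsupported/Eigen/CXX11/Tensor>

// Squares every coefficient. The evaluator calls dimensions() to allocate
// its result buffer and eval() to populate it on the chosen device.
struct SquareFunc {
  template <typename Input>
  Eigen::DSizes<Eigen::Index, 2> dimensions(const Input& input) const {
    return Eigen::DSizes<Eigen::Index, 2>(input.dimension(0),
                                          input.dimension(1));
  }
  template <typename Input, typename Output, typename Device>
  void eval(const Input& input, Output& output, const Device& device) const {
    output.device(device) = input * input;
  }
};

int main() {
  Eigen::Tensor<float, 2> t(3, 4);
  t.setRandom();
  // Builds a TensorCustomUnaryOp expression evaluated via the code above.
  Eigen::Tensor<float, 2> sq = t.customOp(SquareFunc());
  return sq.dimension(1) == 4 ? 0 : 1;
}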
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorDevice.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorDevice.h
deleted file mode 100644
index 96fa46c..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorDevice.h
+++ /dev/null
@@ -1,137 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H
-#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H
-
-namespace Eigen {
-
-/** \class TensorDevice
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Pseudo expression providing an operator = that will evaluate its argument
- * on the specified computing 'device' (GPU, thread pool, ...)
- *
- * Example:
- * C.device(EIGEN_GPU) = A + B;
- *
- * Todo: operator *= and /=.
- */
-
-template <typename ExpressionType, typename DeviceType> class TensorDevice {
- public:
- TensorDevice(const DeviceType& device, ExpressionType& expression) : m_device(device), m_expression(expression) {}
-
- EIGEN_DEFAULT_COPY_CONSTRUCTOR(TensorDevice)
-
- template<typename OtherDerived>
- EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) {
- typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
- Assign assign(m_expression, other);
- internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device);
- return *this;
- }
-
- template<typename OtherDerived>
- EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) {
- typedef typename OtherDerived::Scalar Scalar;
- typedef TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ExpressionType, const OtherDerived> Sum;
- Sum sum(m_expression, other);
- typedef TensorAssignOp<ExpressionType, const Sum> Assign;
- Assign assign(m_expression, sum);
- internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device);
- return *this;
- }
-
- template<typename OtherDerived>
- EIGEN_STRONG_INLINE TensorDevice& operator-=(const OtherDerived& other) {
- typedef typename OtherDerived::Scalar Scalar;
- typedef TensorCwiseBinaryOp<internal::scalar_difference_op<Scalar>, const ExpressionType, const OtherDerived> Difference;
- Difference difference(m_expression, other);
- typedef TensorAssignOp<ExpressionType, const Difference> Assign;
- Assign assign(m_expression, difference);
- internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device);
- return *this;
- }
-
- protected:
- const DeviceType& m_device;
- ExpressionType& m_expression;
-};
-
-/** \class TensorAsyncDevice
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Pseudo expression providing an operator = that will evaluate its
- * argument asynchronously on the specified device. Currently only
- * ThreadPoolDevice implements proper asynchronous execution, while the default
- * and GPU devices just run the expression synchronously and call m_done() on
- * completion.
- *
- * Example:
- * auto done = []() { ... expression evaluation done ... };
- * C.device(thread_pool_device, std::move(done)) = A + B;
- */
-
-template <typename ExpressionType, typename DeviceType, typename DoneCallback>
-class TensorAsyncDevice {
- public:
- TensorAsyncDevice(const DeviceType& device, ExpressionType& expression,
- DoneCallback done)
- : m_device(device), m_expression(expression), m_done(std::move(done)) {}
-
- template <typename OtherDerived>
- EIGEN_STRONG_INLINE TensorAsyncDevice& operator=(const OtherDerived& other) {
- typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
- typedef internal::TensorExecutor<const Assign, DeviceType> Executor;
-
- Assign assign(m_expression, other);
- Executor::run(assign, m_device);
- m_done();
-
- return *this;
- }
-
- protected:
- const DeviceType& m_device;
- ExpressionType& m_expression;
- DoneCallback m_done;
-};
-
-
-#ifdef EIGEN_USE_THREADS
-template <typename ExpressionType, typename DoneCallback>
-class TensorAsyncDevice<ExpressionType, ThreadPoolDevice, DoneCallback> {
- public:
- TensorAsyncDevice(const ThreadPoolDevice& device, ExpressionType& expression,
- DoneCallback done)
- : m_device(device), m_expression(expression), m_done(std::move(done)) {}
-
- template <typename OtherDerived>
- EIGEN_STRONG_INLINE TensorAsyncDevice& operator=(const OtherDerived& other) {
- typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
- typedef internal::TensorAsyncExecutor<const Assign, ThreadPoolDevice, DoneCallback> Executor;
-
-    // WARNING: After the assignment, the 'm_done' callback is in an undefined state.
- Assign assign(m_expression, other);
- Executor::runAsync(assign, m_device, std::move(m_done));
-
- return *this;
- }
-
- protected:
- const ThreadPoolDevice& m_device;
- ExpressionType& m_expression;
- DoneCallback m_done;
-};
-#endif
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H
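Putting the two pseudo expressions together, a short usage sketch (assumes EIGEN_USE_THREADS; the pool size and tensor shapes are illustrative):

#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::ThreadPool pool(4);
  Eigen::ThreadPoolDevice device(&pool, /*num_cores=*/4);

  Eigen::Tensor<float, 1> A(1024), B(1024), C(1024);
  A.setRandom();
  B.setRandom();

  // Synchronous: TensorDevice::operator= runs TensorExecutor on the pool.
  C.device(device) = A + B;

  // Asynchronous: TensorAsyncDevice forwards the done callback to runAsync.
  Eigen::Barrier done(1);
  C.device(device, [&done]() { done.Notify(); }) = A * B;
  done.Wait();  // expression evaluation finished
}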
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceCuda.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceCuda.h
deleted file mode 100644
index f779239..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceCuda.h
+++ /dev/null
@@ -1,6 +0,0 @@
-
-#if defined(__clang__) || defined(__GNUC__)
-#warning "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorDeviceGpu.h file"
-#endif
-
-#include "TensorDeviceGpu.h"
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceDefault.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceDefault.h
deleted file mode 100644
index 46b9d3a..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceDefault.h
+++ /dev/null
@@ -1,104 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H
-#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H
-
-
-namespace Eigen {
-
-// Default device for the machine (typically a single cpu core)
-struct DefaultDevice {
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
- return internal::aligned_malloc(num_bytes);
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
- internal::aligned_free(buffer);
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const {
- return allocate(num_bytes);
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const {
- deallocate(buffer);
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
- ::memcpy(dst, src, n);
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
- memcpy(dst, src, n);
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
- memcpy(dst, src, n);
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
- ::memset(buffer, c, n);
- }
- template<typename Type>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const {
- return data;
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
-#if !defined(EIGEN_GPU_COMPILE_PHASE)
- // Running on the host CPU
- return 1;
-#elif defined(EIGEN_HIP_DEVICE_COMPILE)
- // Running on a HIP device
- return 64;
-#else
- // Running on a CUDA device
- return 32;
-#endif
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
-#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY)
- // Running on the host CPU
- return l1CacheSize();
-#elif defined(EIGEN_HIP_DEVICE_COMPILE)
- // Running on a HIP device
- return 48*1024; // FIXME : update this number for HIP
-#else
- // Running on a CUDA device, return the amount of shared memory available.
- return 48*1024;
-#endif
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
-#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY)
- // Running single threaded on the host CPU
- return l3CacheSize();
-#elif defined(EIGEN_HIP_DEVICE_COMPILE)
- // Running on a HIP device
- return firstLevelCacheSize(); // FIXME : update this number for HIP
-#else
- // Running on a CUDA device
- return firstLevelCacheSize();
-#endif
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
-#if !defined(EIGEN_GPU_COMPILE_PHASE)
- // Running single threaded on the host CPU
- // Should return an enum that encodes the ISA supported by the CPU
- return 1;
-#elif defined(EIGEN_HIP_DEVICE_COMPILE)
- // Running on a HIP device
- // return 1 as major for HIP
- return 1;
-#else
- // Running on a CUDA device
- return EIGEN_CUDA_ARCH / 100;
-#endif
- }
-};
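-
-// Usage sketch (editorial; names hypothetical). DefaultDevice is what tensor
-// expressions fall back to, but it can also be passed explicitly:
-//
-//   Eigen::DefaultDevice dev;
-//   Eigen::Tensor<float, 2> a(3, 4), b(3, 4), c(3, 4);
-//   a.setRandom(); b.setRandom();
-//   c.device(dev) = a + b;  // evaluated synchronously on a single CPU core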
-
-} // namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceGpu.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceGpu.h
deleted file mode 100644
index ec2e3cb..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceGpu.h
+++ /dev/null
@@ -1,389 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H)
-#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H
-
-// This header defines the gpu* macros, which resolve to their equivalent
-// hip* or cuda* versions depending on the compiler in use. A separate header
-// (included at the end of this file) undefines them all.
-#include "TensorGpuHipCudaDefines.h"
-
-namespace Eigen {
-
-static const int kGpuScratchSize = 1024;
-
-// This defines an interface that GPUDevice can take to use
-// HIP / CUDA streams underneath.
-class StreamInterface {
- public:
- virtual ~StreamInterface() {}
-
- virtual const gpuStream_t& stream() const = 0;
- virtual const gpuDeviceProp_t& deviceProperties() const = 0;
-
- // Allocate memory on the actual device where the computation will run
- virtual void* allocate(size_t num_bytes) const = 0;
- virtual void deallocate(void* buffer) const = 0;
-
- // Return a scratchpad buffer of size 1k
- virtual void* scratchpad() const = 0;
-
- // Return a semaphore. The semaphore is initialized to 0, and each kernel
- // using it is responsible for resetting it to 0 upon completion, to
- // maintain the invariant that the semaphore is always equal to 0 at each
- // kernel start.
- virtual unsigned int* semaphore() const = 0;
-};
-
-class GpuDeviceProperties {
- public:
- GpuDeviceProperties() :
- initialized_(false), first_(true), device_properties_(nullptr) {}
-
- ~GpuDeviceProperties() {
- if (device_properties_) {
- delete[] device_properties_;
- }
- }
-
- EIGEN_STRONG_INLINE const gpuDeviceProp_t& get(int device) const {
- return device_properties_[device];
- }
-
- EIGEN_STRONG_INLINE bool isInitialized() const {
- return initialized_;
- }
-
- void initialize() {
- if (!initialized_) {
- // Attempts to ensure proper behavior in the case of multiple threads
- // calling this function simultaneously. This would be trivial to
- // implement if we could use std::mutex, but unfortunately mutexes don't
- // compile with nvcc, so we resort to atomics and thread fences instead.
- // Note that if the caller uses a compiler that doesn't support C++11 we
- // can't ensure that the initialization is thread-safe.
- if (first_.exchange(false)) {
- // We're the first thread to reach this point.
- int num_devices;
- gpuError_t status = gpuGetDeviceCount(&num_devices);
- if (status != gpuSuccess) {
- std::cerr << "Failed to get the number of GPU devices: "
- << gpuGetErrorString(status)
- << std::endl;
- gpu_assert(status == gpuSuccess);
- }
- device_properties_ = new gpuDeviceProp_t[num_devices];
- for (int i = 0; i < num_devices; ++i) {
- status = gpuGetDeviceProperties(&device_properties_[i], i);
- if (status != gpuSuccess) {
- std::cerr << "Failed to initialize GPU device #"
- << i
- << ": "
- << gpuGetErrorString(status)
- << std::endl;
- gpu_assert(status == gpuSuccess);
- }
- }
-
- std::atomic_thread_fence(std::memory_order_release);
- initialized_ = true;
- } else {
- // Wait for the other thread to initialize the properties.
- while (!initialized_) {
- std::atomic_thread_fence(std::memory_order_acquire);
- std::this_thread::sleep_for(std::chrono::milliseconds(1000));
- }
- }
- }
- }
-
- private:
- volatile bool initialized_;
- std::atomic<bool> first_;
- gpuDeviceProp_t* device_properties_;
-};
-
-EIGEN_ALWAYS_INLINE const GpuDeviceProperties& GetGpuDeviceProperties() {
- static GpuDeviceProperties* deviceProperties = new GpuDeviceProperties();
- if (!deviceProperties->isInitialized()) {
- deviceProperties->initialize();
- }
- return *deviceProperties;
-}
-
-EIGEN_ALWAYS_INLINE const gpuDeviceProp_t& GetGpuDeviceProperties(int device) {
- return GetGpuDeviceProperties().get(device);
-}
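-
-// For instance (sketch), the cached properties can be queried by device id:
-//
-//   const gpuDeviceProp_t& prop = GetGpuDeviceProperties(/*device=*/0);
-//   const int sm_count = prop.multiProcessorCount;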
-
-static const gpuStream_t default_stream = gpuStreamDefault;
-
-class GpuStreamDevice : public StreamInterface {
- public:
- // Use the default stream on the current device
- GpuStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
- gpuGetDevice(&device_);
- }
- // Use the default stream on the specified device
- GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {}
- // Use the specified stream. Note that it's the
- // caller's responsibility to ensure that the stream can run on
- // the specified device. If no device is specified, the code
- // assumes that the stream is associated with the current gpu device.
- GpuStreamDevice(const gpuStream_t* stream, int device = -1)
- : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
- if (device < 0) {
- gpuGetDevice(&device_);
- } else {
- int num_devices;
- gpuError_t err = gpuGetDeviceCount(&num_devices);
- EIGEN_UNUSED_VARIABLE(err)
- gpu_assert(err == gpuSuccess);
- gpu_assert(device < num_devices);
- device_ = device;
- }
- }
-
- virtual ~GpuStreamDevice() {
- if (scratch_) {
- deallocate(scratch_);
- }
- }
-
- const gpuStream_t& stream() const { return *stream_; }
- const gpuDeviceProp_t& deviceProperties() const {
- return GetGpuDeviceProperties(device_);
- }
- virtual void* allocate(size_t num_bytes) const {
- gpuError_t err = gpuSetDevice(device_);
- EIGEN_UNUSED_VARIABLE(err)
- gpu_assert(err == gpuSuccess);
- void* result;
- err = gpuMalloc(&result, num_bytes);
- gpu_assert(err == gpuSuccess);
- gpu_assert(result != NULL);
- return result;
- }
- virtual void deallocate(void* buffer) const {
- gpuError_t err = gpuSetDevice(device_);
- EIGEN_UNUSED_VARIABLE(err)
- gpu_assert(err == gpuSuccess);
- gpu_assert(buffer != NULL);
- err = gpuFree(buffer);
- gpu_assert(err == gpuSuccess);
- }
-
- virtual void* scratchpad() const {
- if (scratch_ == NULL) {
- scratch_ = allocate(kGpuScratchSize + sizeof(unsigned int));
- }
- return scratch_;
- }
-
- virtual unsigned int* semaphore() const {
- if (semaphore_ == NULL) {
- char* scratch = static_cast<char*>(scratchpad()) + kGpuScratchSize;
- semaphore_ = reinterpret_cast<unsigned int*>(scratch);
- gpuError_t err = gpuMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
- EIGEN_UNUSED_VARIABLE(err)
- gpu_assert(err == gpuSuccess);
- }
- return semaphore_;
- }
-
- private:
- const gpuStream_t* stream_;
- int device_;
- mutable void* scratch_;
- mutable unsigned int* semaphore_;
-};
-
-struct GpuDevice {
- // The StreamInterface is not owned: the caller is
- // responsible for its initialization and eventual destruction.
- explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
- eigen_assert(stream);
- }
- explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
- eigen_assert(stream);
- }
- // TODO(bsteiner): This is an internal API; we should not expose it.
- EIGEN_STRONG_INLINE const gpuStream_t& stream() const {
- return stream_->stream();
- }
-
- EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
- return stream_->allocate(num_bytes);
- }
-
- EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
- stream_->deallocate(buffer);
- }
-
- EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const {
- return stream_->allocate(num_bytes);
- }
-
- EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const {
- stream_->deallocate(buffer);
- }
-
- template<typename Type>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const {
- return data;
- }
-
- EIGEN_STRONG_INLINE void* scratchpad() const {
- return stream_->scratchpad();
- }
-
- EIGEN_STRONG_INLINE unsigned int* semaphore() const {
- return stream_->semaphore();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
-#ifndef EIGEN_GPU_COMPILE_PHASE
- gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToDevice,
- stream_->stream());
- EIGEN_UNUSED_VARIABLE(err)
- gpu_assert(err == gpuSuccess);
-#else
- EIGEN_UNUSED_VARIABLE(dst);
- EIGEN_UNUSED_VARIABLE(src);
- EIGEN_UNUSED_VARIABLE(n);
- eigen_assert(false && "The default device should be used instead to generate kernel code");
-#endif
- }
-
- EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
- gpuError_t err =
- gpuMemcpyAsync(dst, src, n, gpuMemcpyHostToDevice, stream_->stream());
- EIGEN_UNUSED_VARIABLE(err)
- gpu_assert(err == gpuSuccess);
- }
-
- EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
- gpuError_t err =
- gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToHost, stream_->stream());
- EIGEN_UNUSED_VARIABLE(err)
- gpu_assert(err == gpuSuccess);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
-#ifndef EIGEN_GPU_COMPILE_PHASE
- gpuError_t err = gpuMemsetAsync(buffer, c, n, stream_->stream());
- EIGEN_UNUSED_VARIABLE(err)
- gpu_assert(err == gpuSuccess);
-#else
- eigen_assert(false && "The default device should be used instead to generate kernel code");
-#endif
- }
-
- EIGEN_STRONG_INLINE size_t numThreads() const {
- // FIXME
- return 32;
- }
-
- EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
- // FIXME
- return 48*1024;
- }
-
- EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
- // We won't try to take advantage of the l2 cache for the time being, and
- // there is no l3 cache on hip/cuda devices.
- return firstLevelCacheSize();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
-#ifndef EIGEN_GPU_COMPILE_PHASE
- gpuError_t err = gpuStreamSynchronize(stream_->stream());
- if (err != gpuSuccess) {
- std::cerr << "Error detected in GPU stream: "
- << gpuGetErrorString(err)
- << std::endl;
- gpu_assert(err == gpuSuccess);
- }
-#else
- gpu_assert(false && "The default device should be used instead to generate kernel code");
-#endif
- }
-
- EIGEN_STRONG_INLINE int getNumGpuMultiProcessors() const {
- return stream_->deviceProperties().multiProcessorCount;
- }
- EIGEN_STRONG_INLINE int maxGpuThreadsPerBlock() const {
- return stream_->deviceProperties().maxThreadsPerBlock;
- }
- EIGEN_STRONG_INLINE int maxGpuThreadsPerMultiProcessor() const {
- return stream_->deviceProperties().maxThreadsPerMultiProcessor;
- }
- EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
- return stream_->deviceProperties().sharedMemPerBlock;
- }
- EIGEN_STRONG_INLINE int majorDeviceVersion() const {
- return stream_->deviceProperties().major;
- }
- EIGEN_STRONG_INLINE int minorDeviceVersion() const {
- return stream_->deviceProperties().minor;
- }
-
- EIGEN_STRONG_INLINE int maxBlocks() const {
- return max_blocks_;
- }
-
- // This function checks if the GPU runtime recorded an error for the
- // underlying stream device.
- inline bool ok() const {
-#ifdef EIGEN_GPUCC
- gpuError_t error = gpuStreamQuery(stream_->stream());
- return (error == gpuSuccess) || (error == gpuErrorNotReady);
-#else
- return false;
-#endif
- }
-
- private:
- const StreamInterface* stream_;
- int max_blocks_;
-};
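-
-// Usage sketch (editorial; assumes EIGEN_USE_GPU, and the host buffer h_a
-// and size n are hypothetical):
-//
-//   Eigen::GpuStreamDevice stream;        // default stream, current device
-//   Eigen::GpuDevice gpu_device(&stream);
-//   float* d_a = static_cast<float*>(gpu_device.allocate(n * sizeof(float)));
-//   gpu_device.memcpyHostToDevice(d_a, h_a, n * sizeof(float));
-//   Eigen::TensorMap<Eigen::Tensor<float, 1>> A(d_a, n);
-//   A.device(gpu_device) = A * 2.0f;      // kernel launched on the stream
-//   gpu_device.synchronize();
-//   gpu_device.deallocate(d_a);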
-
-#if defined(EIGEN_HIPCC)
-
-#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
- hipLaunchKernelGGL(kernel, dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), __VA_ARGS__); \
- gpu_assert(hipGetLastError() == hipSuccess);
-
-#else
-
-#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
- (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
- gpu_assert(cudaGetLastError() == cudaSuccess);
-
-#endif
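-
-// The macro is invoked with a launch configuration followed by the kernel
-// arguments, e.g. (sketch; 'MyKernel', 'num_blocks', 'block_size' and the
-// pointers are hypothetical):
-//
-//   LAUNCH_GPU_KERNEL(MyKernel<float>, num_blocks, block_size,
-//                     /*sharedmem=*/0, gpu_device, in_ptr, out_ptr, n);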
-
-// FIXME: Should be device and kernel specific.
-#ifdef EIGEN_GPUCC
-static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig config) {
-#ifndef EIGEN_GPU_COMPILE_PHASE
- gpuError_t status = gpuDeviceSetSharedMemConfig(config);
- EIGEN_UNUSED_VARIABLE(status)
- gpu_assert(status == gpuSuccess);
-#else
- EIGEN_UNUSED_VARIABLE(config)
-#endif
-}
-#endif
-
-} // end namespace Eigen
-
-// undefine all the gpu* macros we defined at the beginning of the file
-#include "TensorGpuHipCudaUndefines.h"
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceSycl.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceSycl.h
deleted file mode 100644
index df591c2..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceSycl.h
+++ /dev/null
@@ -1,1048 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli Codeplay Software Ltd.
-// Ralph Potter Codeplay Software Ltd.
-// Luke Iwanski Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
-
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H)
-#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
-#include <unordered_set>
-
-namespace Eigen {
-
-namespace TensorSycl {
-namespace internal {
-
-/// Cache all the device information needed
-struct SyclDeviceInfo {
- SyclDeviceInfo(cl::sycl::queue queue)
- : local_mem_type(
- queue.get_device()
- .template get_info<cl::sycl::info::device::local_mem_type>()),
- max_work_item_sizes(
- queue.get_device()
- .template get_info<
- cl::sycl::info::device::max_work_item_sizes>()),
- max_mem_alloc_size(
- queue.get_device()
- .template get_info<
- cl::sycl::info::device::max_mem_alloc_size>()),
- max_compute_units(queue.get_device()
- .template get_info<
- cl::sycl::info::device::max_compute_units>()),
- max_work_group_size(
- queue.get_device()
- .template get_info<
- cl::sycl::info::device::max_work_group_size>()),
- local_mem_size(
- queue.get_device()
- .template get_info<cl::sycl::info::device::local_mem_size>()),
- platform_name(queue.get_device()
- .get_platform()
- .template get_info<cl::sycl::info::platform::name>()),
- device_name(queue.get_device()
- .template get_info<cl::sycl::info::device::name>()),
- device_vendor(
- queue.get_device()
- .template get_info<cl::sycl::info::device::vendor>()) {}
-
- cl::sycl::info::local_mem_type local_mem_type;
- cl::sycl::id<3> max_work_item_sizes;
- unsigned long max_mem_alloc_size;
- unsigned long max_compute_units;
- unsigned long max_work_group_size;
- size_t local_mem_size;
- std::string platform_name;
- std::string device_name;
- std::string device_vendor;
-};
-
-} // end namespace internal
-} // end namespace TensorSycl
-
-typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t;
-// All devices (even an AMD CPU with the Intel OpenCL runtime) that support
-// OpenCL and can consume SPIR or SPIR-V can use the Eigen SYCL backend, and
-// consequently TensorFlow via the Eigen SYCL backend.
-EIGEN_STRONG_INLINE auto get_sycl_supported_devices()
- -> decltype(cl::sycl::device::get_devices()) {
-#ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR
- return {cl::sycl::device(cl::sycl::default_selector())};
-#else
- std::vector<cl::sycl::device> supported_devices;
- auto platform_list = cl::sycl::platform::get_platforms();
- for (const auto &platform : platform_list) {
- auto device_list = platform.get_devices();
- auto platform_name =
- platform.template get_info<cl::sycl::info::platform::name>();
- std::transform(platform_name.begin(), platform_name.end(),
- platform_name.begin(), ::tolower);
- for (const auto &device : device_list) {
- auto vendor = device.template get_info<cl::sycl::info::device::vendor>();
- std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower);
- bool unsupported_condition =
- (device.is_cpu() && platform_name.find("amd") != std::string::npos &&
- vendor.find("apu") == std::string::npos) ||
- (platform_name.find("experimental") != std::string::npos) ||
- device.is_host();
- if (!unsupported_condition) {
- supported_devices.push_back(device);
- }
- }
- }
- return supported_devices;
-#endif
-}
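-
-// Sketch (editorial): a queue can be built from the first supported device.
-//
-//   auto devices = get_sycl_supported_devices();
-//   if (!devices.empty()) {
-//     Eigen::QueueInterface queue_interface(devices[0]);
-//     Eigen::SyclDevice sycl_device(&queue_interface);
-//   }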
-
-class QueueInterface {
- public:
- /// Create the device by using a cl::sycl::selector or a cl::sycl::device.
- template <typename DeviceOrSelector>
- explicit QueueInterface(
- const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler,
- unsigned num_threads = std::thread::hardware_concurrency())
- : m_queue(dev_or_sel, handler),
-#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
- m_prog(m_queue.get_context(), get_sycl_supported_devices()),
-#endif
- m_thread_pool(num_threads),
- m_device_info(m_queue) {
-#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
- m_prog.build_with_kernel_type<DeviceOrSelector>();
- auto f = [&](cl::sycl::handler &cgh) {
- cgh.single_task<DeviceOrSelector>(m_prog.get_kernel<DeviceOrSelector>(),
- [=]() {})
- };
- EIGEN_SYCL_TRY_CATCH(m_queue.submit(f));
-#endif
- }
-
- template <typename DeviceOrSelector>
- explicit QueueInterface(
- const DeviceOrSelector &dev_or_sel,
- unsigned num_threads = std::thread::hardware_concurrency())
- : QueueInterface(dev_or_sel,
- [this](cl::sycl::exception_list l) {
- this->exception_caught_ = this->sycl_async_handler(l);
- },
- num_threads) {}
-
-#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
- EIGEN_STRONG_INLINE cl::sycl::program &program() const { return m_prog; }
-#endif
-
- /// Attach an existing buffer to the pointer map; Eigen will not reuse it.
- EIGEN_STRONG_INLINE void *attach_buffer(
- cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
- std::lock_guard<std::mutex> lock(pmapper_mutex_);
- return static_cast<void *>(pMapper.add_pointer(buf));
- }
-
- /// Detach previously attached buffer
- EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
- std::lock_guard<std::mutex> lock(pmapper_mutex_);
- TensorSycl::internal::SYCLfree<false>(p, pMapper);
- }
-
- /// Allocate a device pointer. This pointer is actually an 8-byte host
- /// pointer used as a key to access the sycl device buffer. The reason is
- /// that we cannot use a device buffer as the m_data pointer in Eigen leaf
- /// node expressions. So we create a key pointer to be used in Eigen
- /// expression construction. When we convert the Eigen construction into the
- /// sycl construction, we use this pointer as a key in our buffer_map and
- /// make sure that we dedicate exactly one buffer to this pointer. The device
- /// pointer is deleted by calling the deallocate function.
- EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
-#if EIGEN_MAX_ALIGN_BYTES > 0
- size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
- if (align > 0) {
- num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
- }
-#endif
- std::lock_guard<std::mutex> lock(pmapper_mutex_);
- return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
- }
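-
- // For instance (sketch; 'queue_interface' and the size are hypothetical),
- // the returned key behaves like an opaque device pointer and is released
- // through deallocate():
- //
- //   void* key = queue_interface.allocate(n * sizeof(float));
- //   // ... hand 'key' to TensorMap / expression construction ...
- //   queue_interface.deallocate(key);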
-
- EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
-#if EIGEN_MAX_ALIGN_BYTES > 0
- size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
- if (align > 0) {
- num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
- }
-#endif
- std::lock_guard<std::mutex> lock(pmapper_mutex_);
-#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
- if (scratch_buffers.empty()) {
- return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
- } else {
- for (auto it = scratch_buffers.begin(); it != scratch_buffers.end();) {
- auto buff = pMapper.get_buffer(*it);
- if (buff.get_size() >= num_bytes) {
- auto ptr = *it;
- scratch_buffers.erase(it);
- return ptr;
- } else {
- ++it;
- }
- }
- return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
- }
-#else
- return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
-#endif
- }
- template <typename data_t>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
- cl::sycl::access::mode::read_write, data_t>
- get(data_t *data) const {
- return get_range_accessor<cl::sycl::access::mode::read_write, data_t>(data);
- }
- template <typename data_t>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(
- TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
- data_t>
- data) const {
- return static_cast<data_t *>(data.get_virtual_pointer());
- }
-
- EIGEN_STRONG_INLINE void deallocate_temp(void *p) const {
- std::lock_guard<std::mutex> lock(pmapper_mutex_);
-#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
- scratch_buffers.insert(p);
-#else
- TensorSycl::internal::SYCLfree(p, pMapper);
-#endif
- }
- template <cl::sycl::access::mode AcMd, typename T>
- EIGEN_STRONG_INLINE void deallocate_temp(
- const TensorSycl::internal::RangeAccess<AcMd, T> &p) const {
- deallocate_temp(p.get_virtual_pointer());
- }
-
- /// This is used to deallocate the device pointer. p is used as a key inside
- /// the map to find the device buffer and delete it.
- EIGEN_STRONG_INLINE void deallocate(void *p) const {
- std::lock_guard<std::mutex> lock(pmapper_mutex_);
- TensorSycl::internal::SYCLfree(p, pMapper);
- }
-
- EIGEN_STRONG_INLINE void deallocate_all() const {
- std::lock_guard<std::mutex> lock(pmapper_mutex_);
- TensorSycl::internal::SYCLfreeAll(pMapper);
-#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
- scratch_buffers.clear();
-#endif
- }
-
- /// memcpyHostToDevice is used to copy data from host to device.
- /// The destination pointer could be deleted before the copy happens, which
- /// is why a callback function is needed. If none is provided, the function
- /// blocks until the copy completes.
- EIGEN_STRONG_INLINE void memcpyHostToDevice(
- void *dst, const void *src, size_t n,
- std::function<void()> callback) const {
- static const auto write_mode = cl::sycl::access::mode::discard_write;
- static const auto global_access = cl::sycl::access::target::global_buffer;
- typedef cl::sycl::accessor<buffer_scalar_t, 1, write_mode, global_access>
- write_accessor;
- if (n == 0) {
- if (callback) callback();
- return;
- }
- n /= sizeof(buffer_scalar_t);
- auto f = [&](cl::sycl::handler &cgh) {
- write_accessor dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
- buffer_scalar_t const *ptr = static_cast<buffer_scalar_t const *>(src);
- auto non_deleter = [](buffer_scalar_t const *) {};
- std::shared_ptr<const buffer_scalar_t> s_ptr(ptr, non_deleter);
- cgh.copy(s_ptr, dst_acc);
- };
- cl::sycl::event e;
- EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
- synchronize_and_callback(e, callback);
- }
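-
- // For example (sketch; 'queue_interface', 'dst', 'src' and 'n_bytes' are
- // hypothetical):
- //
- //   queue_interface.memcpyHostToDevice(dst, src, n_bytes,
- //                                      []() { /* copy finished */ });
- //   // passing an empty std::function makes the call blocking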
-
- /// memcpyDeviceToHost is used to copy data from device to host.
- /// The source pointer could be deleted before the copy happens, which is
- /// why a callback function is needed. If none is provided, the function
- /// blocks until the copy completes.
- EIGEN_STRONG_INLINE void memcpyDeviceToHost(
- void *dst, const void *src, size_t n,
- std::function<void()> callback) const {
- static const auto read_mode = cl::sycl::access::mode::read;
- static const auto global_access = cl::sycl::access::target::global_buffer;
- typedef cl::sycl::accessor<buffer_scalar_t, 1, read_mode, global_access>
- read_accessor;
- if (n == 0) {
- if (callback) callback();
- return;
- }
- n /= sizeof(buffer_scalar_t);
- auto f = [&](cl::sycl::handler &cgh) {
- read_accessor src_acc = get_range_accessor<read_mode>(cgh, src, n);
- buffer_scalar_t *ptr = static_cast<buffer_scalar_t *>(dst);
- auto non_deleter = [](buffer_scalar_t *) {};
- std::shared_ptr<buffer_scalar_t> s_ptr(ptr, non_deleter);
- cgh.copy(src_acc, s_ptr);
- };
- cl::sycl::event e;
- EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
- synchronize_and_callback(e, callback);
- }
-
- /// The memcpy function.
- /// No callback is required here as both arguments are on the device
- /// and SYCL can handle the dependency.
- EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const {
- static const auto read_mode = cl::sycl::access::mode::read;
- static const auto write_mode = cl::sycl::access::mode::discard_write;
- if (n == 0) {
- return;
- }
- n /= sizeof(buffer_scalar_t);
- auto f = [&](cl::sycl::handler &cgh) {
- auto src_acc = get_range_accessor<read_mode>(cgh, src, n);
- auto dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
- cgh.copy(src_acc, dst_acc);
- };
- cl::sycl::event e;
- EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
- async_synchronize(e);
- }
-
- /// the memset function.
- /// No callback is required here as both arguments are on the device
- /// and SYCL can handle the dependency.
- EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
- static const auto write_mode = cl::sycl::access::mode::discard_write;
- if (n == 0) {
- return;
- }
- n /= sizeof(buffer_scalar_t);
- auto f = [&](cl::sycl::handler &cgh) {
- auto dst_acc = get_range_accessor<write_mode>(cgh, data, n);
- // The cast to uint8_t is here to match the behaviour of the standard
- // memset. The cast to buffer_scalar_t is needed to match the type of the
- // accessor (in case buffer_scalar_t is not uint8_t)
- cgh.fill(dst_acc, static_cast<buffer_scalar_t>(static_cast<uint8_t>(c)));
- };
- cl::sycl::event e;
- EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
- async_synchronize(e);
- }
-
- /// Get a range accessor to the virtual pointer's device memory. This range
- /// accessor will allow access to the memory from the pointer to the end of
- /// the buffer.
- ///
- /// NOTE: Inside a kernel the range accessor will always be indexed from the
- /// start of the buffer, so the offset in the accessor is only used by
- /// methods like handler::copy and will not be available inside a kernel.
- template <cl::sycl::access::mode AcMd, typename T>
- EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
- get_range_accessor(const void *ptr) const {
- static const auto global_access = cl::sycl::access::target::global_buffer;
- static const auto is_place_holder = cl::sycl::access::placeholder::true_t;
- typedef TensorSycl::internal::RangeAccess<AcMd, T> ret_type;
- typedef const TensorSycl::internal::buffer_data_type_t *internal_ptr_t;
-
- std::lock_guard<std::mutex> lock(pmapper_mutex_);
-
- auto original_buffer = pMapper.get_buffer(ptr);
- const ptrdiff_t offset = pMapper.get_offset(ptr);
- const ptrdiff_t typed_offset = offset / sizeof(T);
- eigen_assert(typed_offset >= 0);
- const auto typed_size = original_buffer.get_size() / sizeof(T);
- auto buffer = original_buffer.template reinterpret<
- typename Eigen::internal::remove_const<T>::type>(
- cl::sycl::range<1>(typed_size));
- const ptrdiff_t size = buffer.get_count() - typed_offset;
- eigen_assert(size >= 0);
- typedef cl::sycl::accessor<typename Eigen::internal::remove_const<T>::type,
- 1, AcMd, global_access, is_place_holder>
- placeholder_accessor_t;
- const auto start_ptr = static_cast<internal_ptr_t>(ptr) - offset;
- return ret_type(placeholder_accessor_t(buffer, cl::sycl::range<1>(size),
- cl::sycl::id<1>(typed_offset)),
- static_cast<size_t>(typed_offset),
- reinterpret_cast<std::intptr_t>(start_ptr));
- }
-
- /// Get a range accessor to the virtual pointer's device memory with a
- /// specified size.
- template <cl::sycl::access::mode AcMd, typename Index>
- EIGEN_STRONG_INLINE cl::sycl::accessor<
- buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
- get_range_accessor(cl::sycl::handler &cgh, const void *ptr,
- const Index n_bytes) const {
- static const auto global_access = cl::sycl::access::target::global_buffer;
- eigen_assert(n_bytes >= 0);
- std::lock_guard<std::mutex> lock(pmapper_mutex_);
- auto buffer = pMapper.get_buffer(ptr);
- const ptrdiff_t offset = pMapper.get_offset(ptr);
- eigen_assert(offset >= 0);
- eigen_assert(offset + n_bytes <= buffer.get_size());
- return buffer.template get_access<AcMd, global_access>(
- cgh, cl::sycl::range<1>(n_bytes), cl::sycl::id<1>(offset));
- }
-
- /// Creation of a sycl accessor for a buffer. This function first tries to
- /// find the buffer in the buffer_map. If found, it gets the accessor from
- /// it; if not, the function adds an entry by creating a sycl buffer for
- /// that particular pointer.
- template <cl::sycl::access::mode AcMd>
- EIGEN_STRONG_INLINE cl::sycl::accessor<
- buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
- get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
- std::lock_guard<std::mutex> lock(pmapper_mutex_);
- return pMapper.get_buffer(ptr)
- .template get_access<AcMd, cl::sycl::access::target::global_buffer>(
- cgh);
- }
-
- EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
- const void *ptr) const {
- std::lock_guard<std::mutex> lock(pmapper_mutex_);
- return pMapper.get_buffer(ptr);
- }
-
- EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
- std::lock_guard<std::mutex> lock(pmapper_mutex_);
- return pMapper.get_offset(ptr);
- }
-
- template <typename OutScalar, typename sycl_kernel, typename Lhs,
- typename Rhs, typename OutPtr, typename Range, typename Index,
- typename... T>
- EIGEN_ALWAYS_INLINE void binary_kernel_launcher(const Lhs &lhs,
- const Rhs &rhs, OutPtr outptr,
- Range thread_range,
- Index scratchSize,
- T... var) const {
- auto kernel_functor = [=](cl::sycl::handler &cgh) {
- // binding the placeholder accessors to a commandgroup handler
- lhs.bind(cgh);
- rhs.bind(cgh);
- outptr.bind(cgh);
- typedef cl::sycl::accessor<OutScalar, 1,
- cl::sycl::access::mode::read_write,
- cl::sycl::access::target::local>
- LocalAccessor;
-
- LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
- cgh.parallel_for(
-#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
- program().template get_kernel<sycl_kernel>(),
-#endif
- thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...));
- };
- cl::sycl::event e;
- EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
- async_synchronize(e);
- }
-
- template <typename OutScalar, typename sycl_kernel, typename InPtr,
- typename OutPtr, typename Range, typename Index, typename... T>
- EIGEN_ALWAYS_INLINE void unary_kernel_launcher(const InPtr &inptr,
- OutPtr &outptr,
- Range thread_range,
- Index scratchSize,
- T... var) const {
- auto kernel_functor = [=](cl::sycl::handler &cgh) {
- // binding the placeholder accessors to a commandgroup handler
- inptr.bind(cgh);
- outptr.bind(cgh);
- typedef cl::sycl::accessor<OutScalar, 1,
- cl::sycl::access::mode::read_write,
- cl::sycl::access::target::local>
- LocalAccessor;
-
- LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
- cgh.parallel_for(
-#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
- program().template get_kernel<sycl_kernel>(),
-#endif
- thread_range, sycl_kernel(scratch, inptr, outptr, var...));
- };
- cl::sycl::event e;
- EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
- async_synchronize(e);
- }
-
- template <typename OutScalar, typename sycl_kernel, typename InPtr,
- typename Range, typename Index, typename... T>
- EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(const InPtr &inptr,
- Range thread_range,
- Index scratchSize,
- T... var) const {
- auto kernel_functor = [=](cl::sycl::handler &cgh) {
- // binding the placeholder accessors to a commandgroup handler
- inptr.bind(cgh);
- typedef cl::sycl::accessor<OutScalar, 1,
- cl::sycl::access::mode::read_write,
- cl::sycl::access::target::local>
- LocalAccessor;
-
- LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
- cgh.parallel_for(
-#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
- program().template get_kernel<sycl_kernel>(),
-#endif
- thread_range, sycl_kernel(scratch, inptr, var...));
- };
- cl::sycl::event e;
- EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
- async_synchronize(e);
- }
-
-
- EIGEN_STRONG_INLINE void synchronize() const {
-#ifdef EIGEN_EXCEPTIONS
- m_queue.wait_and_throw();
-#else
- m_queue.wait();
-#endif
- }
-
-
- EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const {
- set_latest_event(e);
-#ifndef EIGEN_SYCL_ASYNC_EXECUTION
- synchronize();
-#endif
- }
-
- template <typename Index>
- EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
- Index &rng, Index &GRange) const {
- tileSize = static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
- tileSize = std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
- EIGEN_SYCL_LOCAL_THREAD_DIM1),
- static_cast<Index>(tileSize));
- rng = n;
- if (rng == 0) rng = static_cast<Index>(1);
- GRange = rng;
- if (tileSize > GRange)
- tileSize = GRange;
- else if (GRange > tileSize) {
- Index xMode = static_cast<Index>(GRange % tileSize);
- if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
- }
- }
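-
- // Worked example (editorial sketch): with n = 1000 and tileSize = 256,
- // rng = 1000 and GRange is rounded up to the next multiple of the tile:
- // 1000 % 256 = 232, so GRange = 1000 + (256 - 232) = 1024 and the nd-range
- // divides evenly into work-groups.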
-
- /// This is used to prepare the number of threads and also the number of
- /// threads per block for sycl kernels
- template <typename Index>
- EIGEN_STRONG_INLINE void parallel_for_setup(
- const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
- cl::sycl::range<2> &local_range) const {
- std::array<Index, 2> input_range = input_dim;
- Index max_workgroup_Size =
- static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
- max_workgroup_Size =
- std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
- EIGEN_SYCL_LOCAL_THREAD_DIM1),
- static_cast<Index>(max_workgroup_Size));
- Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
- local_range[1] =
- static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
- input_range[1] = input_dim[1];
- if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
- global_range[1] = input_range[1];
- if (local_range[1] > global_range[1])
- local_range[1] = global_range[1];
- else if (global_range[1] > local_range[1]) {
- Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
- if (xMode != 0)
- global_range[1] += static_cast<Index>(local_range[1] - xMode);
- }
- local_range[0] = static_cast<Index>(max_workgroup_Size / local_range[1]);
- input_range[0] = input_dim[0];
- if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
- global_range[0] = input_range[0];
- if (local_range[0] > global_range[0])
- local_range[0] = global_range[0];
- else if (global_range[0] > local_range[0]) {
- Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
- if (xMode != 0)
- global_range[0] += static_cast<Index>(local_range[0] - xMode);
- }
- }
-
- /// This is used to prepare the number of threads and also the number of
- /// threads per block for sycl kernels
- template <typename Index>
- EIGEN_STRONG_INLINE void parallel_for_setup(
- const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
- cl::sycl::range<3> &local_range) const {
- std::array<Index, 3> input_range = input_dim;
- Index max_workgroup_Size =
- static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
- max_workgroup_Size =
- std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
- EIGEN_SYCL_LOCAL_THREAD_DIM1),
- static_cast<Index>(max_workgroup_Size));
- Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
- local_range[2] =
- static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 3)));
- input_range[2] = input_dim[2];
- if (input_range[2] == 0) input_range[2] = static_cast<Index>(1);
- global_range[2] = input_range[2];
- if (local_range[2] > global_range[2])
- local_range[2] = global_range[2];
- else if (global_range[2] > local_range[2]) {
- Index xMode = static_cast<Index>(global_range[2] % local_range[2]);
- if (xMode != 0)
- global_range[2] += static_cast<Index>(local_range[2] - xMode);
- }
- pow_of_2 = static_cast<Index>(
- std::log2(static_cast<Index>(max_workgroup_Size / local_range[2])));
- local_range[1] =
- static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
- input_range[1] = input_dim[1];
- if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
- global_range[1] = input_range[1];
- if (local_range[1] > global_range[1])
- local_range[1] = global_range[1];
- else if (global_range[1] > local_range[1]) {
- Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
- if (xMode != 0)
- global_range[1] += static_cast<Index>(local_range[1] - xMode);
- }
- local_range[0] = static_cast<Index>(max_workgroup_Size /
- (local_range[1] * local_range[2]));
- input_range[0] = input_dim[0];
- if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
- global_range[0] = input_range[0];
- if (local_range[0] > global_range[0])
- local_range[0] = global_range[0];
- else if (global_range[0] > local_range[0]) {
- Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
- if (xMode != 0)
- global_range[0] += static_cast<Index>(local_range[0] - xMode);
- }
- }
-
- EIGEN_STRONG_INLINE bool has_local_memory() const {
-#if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
- return false;
-#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
- return true;
-#else
- return m_device_info.local_mem_type ==
- cl::sycl::info::local_mem_type::local;
-#endif
- }
-
- EIGEN_STRONG_INLINE unsigned long max_buffer_size() const {
- return m_device_info.max_mem_alloc_size;
- }
-
- EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
- return m_device_info.max_compute_units;
- }
-
- EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
- return m_device_info.max_work_group_size;
- }
-
- EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
- return m_device_info.max_work_item_sizes;
- }
-
- /// Not needed for sycl; it should act the same as the CPU version.
- EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; }
-
- EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
- // OpenCL does not have such a concept.
- return 2;
- }
-
- EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
- return m_device_info.local_mem_size;
- }
-
- // This function returns the nearest power-of-2 work-group size that is <=
- // the maximum device work-group size.
- EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
- return getPowerOfTwo(m_device_info.max_work_group_size, false);
- }
-
- EIGEN_STRONG_INLINE std::string getPlatformName() const {
- return m_device_info.platform_name;
- }
-
- EIGEN_STRONG_INLINE std::string getDeviceName() const {
- return m_device_info.device_name;
- }
-
- EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
- return m_device_info.device_vendor;
- }
-
- // This function returns the nearest power of 2:
- // if roundUp is true, it returns result >= wGSize;
- // otherwise it returns result <= wGSize.
- EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t wGSize, bool roundUp) const {
- if (roundUp) --wGSize;
- wGSize |= (wGSize >> 1);
- wGSize |= (wGSize >> 2);
- wGSize |= (wGSize >> 4);
- wGSize |= (wGSize >> 8);
- wGSize |= (wGSize >> 16);
-#if EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64 || EIGEN_OS_WIN64
- wGSize |= (wGSize >> 32);
-#endif
- return ((!roundUp) ? (wGSize - (wGSize >> 1)) : ++wGSize);
- }
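-
- // Worked example (editorial sketch): for wGSize = 48 and roundUp = false,
- // bit-smearing yields 63 (0b111111) and 63 - 31 = 32, the largest power of
- // two <= 48. With roundUp = true, 48 - 1 = 47 smears to 63 and the final
- // increment returns 64, the smallest power of two >= 48.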
-
- EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { return m_queue; }
-
- // This function checks if the runtime recorded an error for the
- // underlying stream device.
- EIGEN_STRONG_INLINE bool ok() const {
- if (!exception_caught_) {
- synchronize();
- }
- return !exception_caught_;
- }
-
- EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
-#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
- std::lock_guard<std::mutex> lock(event_mutex_);
- return latest_events_[std::this_thread::get_id()];
-#else
- eigen_assert(false);
- return cl::sycl::event();
-#endif
- }
-
- // destructor
- ~QueueInterface() {
- pMapper.clear();
-#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
- scratch_buffers.clear();
-#endif
- }
-
- protected:
- EIGEN_STRONG_INLINE void set_latest_event(cl::sycl::event e) const {
-#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
- std::lock_guard<std::mutex> lock(event_mutex_);
- latest_events_[std::this_thread::get_id()] = e;
-#else
- EIGEN_UNUSED_VARIABLE(e);
-#endif
- }
-
- void synchronize_and_callback(cl::sycl::event e,
- const std::function<void()> &callback) const {
- set_latest_event(e);
- if (callback) {
- auto callback_ = [=]() {
-#ifdef EIGEN_EXCEPTIONS
- cl::sycl::event(e).wait_and_throw();
-#else
- cl::sycl::event(e).wait();
-#endif
- callback();
- };
- m_thread_pool.Schedule(std::move(callback_));
- } else {
-#ifdef EIGEN_EXCEPTIONS
- m_queue.wait_and_throw();
-#else
- m_queue.wait();
-#endif
- }
- }
-
- bool sycl_async_handler(cl::sycl::exception_list exceptions) const {
- bool exception_caught = false;
- for (const auto &e : exceptions) {
- if (e) {
- exception_caught = true;
- EIGEN_THROW_X(e);
- }
- }
- return exception_caught;
- }
-
- /// class members:
- bool exception_caught_ = false;
-
- mutable std::mutex pmapper_mutex_;
-
-#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
- mutable std::mutex event_mutex_;
- mutable std::unordered_map<std::thread::id, cl::sycl::event> latest_events_;
-#endif
-
- /// std::map is the container used to make sure that we create only one
- /// buffer per pointer. The lifespan of the buffer now depends on the
- /// lifespan of SyclDevice. If a non-read-only pointer needs to be accessed
- /// on the host, we should manually deallocate it.
- mutable TensorSycl::internal::PointerMapper pMapper;
-#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
- mutable std::unordered_set<void *> scratch_buffers;
-#endif
- /// sycl queue
- mutable cl::sycl::queue m_queue;
-#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
- mutable cl::sycl::program m_prog;
-#endif
-
- /// The thread pool is used to wait on events and call callbacks
- /// asynchronously
- mutable Eigen::ThreadPool m_thread_pool;
-
- const TensorSycl::internal::SyclDeviceInfo m_device_info;
-};
-
-struct SyclDeviceBase {
- /// QueueInterface is not owned. It is the caller's responsibility to
- /// destroy it.
- const QueueInterface *m_queue_stream;
- explicit SyclDeviceBase(const QueueInterface *queue_stream)
- : m_queue_stream(queue_stream) {}
- EIGEN_STRONG_INLINE const QueueInterface *queue_stream() const {
- return m_queue_stream;
- }
-};
-
-// Here is a sycl device struct which accepts the sycl queue interface
-// as an input.
-struct SyclDevice : public SyclDeviceBase {
- explicit SyclDevice(const QueueInterface *queue_stream)
- : SyclDeviceBase(queue_stream) {}
-
- // This is the accessor used to construct the evaluator.
- template <cl::sycl::access::mode AcMd, typename T>
- EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
- get_range_accessor(const void *ptr) const {
- return queue_stream()->template get_range_accessor<AcMd, T>(ptr);
- }
-
- // get sycl accessor
- template <cl::sycl::access::mode AcMd>
- EIGEN_STRONG_INLINE cl::sycl::accessor<
- buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
- get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
- return queue_stream()->template get_sycl_accessor<AcMd>(cgh, ptr);
- }
-
- /// Accessing the created sycl device buffer for the device pointer
- EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
- const void *ptr) const {
- return queue_stream()->get_sycl_buffer(ptr);
- }
-
- /// This is used to prepare the number of threads and also the number of
- /// threads per block for sycl kernels
- template <typename Index>
- EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
- Index &rng, Index &GRange) const {
- queue_stream()->parallel_for_setup(n, tileSize, rng, GRange);
- }
-
- /// This is used to prepare the number of threads and also the number of
- /// threads per block for sycl kernels
- template <typename Index>
- EIGEN_STRONG_INLINE void parallel_for_setup(
- const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
- cl::sycl::range<2> &local_range) const {
- queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
- }
-
- /// This is used to prepare the number of threads and also the number of
- /// threads per block for sycl kernels
- template <typename Index>
- EIGEN_STRONG_INLINE void parallel_for_setup(
- const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
- cl::sycl::range<3> &local_range) const {
- queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
- }
-
- /// allocate device memory
- EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
- return queue_stream()->allocate(num_bytes);
- }
-
- EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
- return queue_stream()->allocate_temp(num_bytes);
- }
-
- /// deallocate device memory
- EIGEN_STRONG_INLINE void deallocate(void *p) const {
- queue_stream()->deallocate(p);
- }
-
- EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const {
- queue_stream()->deallocate_temp(buffer);
- }
- template <cl::sycl::access::mode AcMd, typename T>
- EIGEN_STRONG_INLINE void deallocate_temp(
- const TensorSycl::internal::RangeAccess<AcMd, T> &buffer) const {
- queue_stream()->deallocate_temp(buffer);
- }
- EIGEN_STRONG_INLINE void deallocate_all() const {
- queue_stream()->deallocate_all();
- }
-
- template <typename data_t>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
- cl::sycl::access::mode::read_write, data_t>
- get(data_t *data) const {
- return queue_stream()->get(data);
- }
- template <typename data_t>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(
- TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
- data_t>
- data) const {
- return queue_stream()->get(data);
- }
-
- /// attach existing buffer
- EIGEN_STRONG_INLINE void *attach_buffer(
- cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
- return queue_stream()->attach_buffer(buf);
- }
- /// detach buffer
- EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
- queue_stream()->detach_buffer(p);
- }
- EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
- return queue_stream()->get_offset(ptr);
- }
-
- // Some runtime suitability conditions could be checked here.
- EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; }
-
- /// memcpyHostToDevice
- template <typename Index>
- EIGEN_STRONG_INLINE void memcpyHostToDevice(
- Index *dst, const Index *src, size_t n,
- std::function<void()> callback = {}) const {
- queue_stream()->memcpyHostToDevice(dst, src, n, callback);
- }
- /// memcpyDeviceToHost
- template <typename Index>
- EIGEN_STRONG_INLINE void memcpyDeviceToHost(
- void *dst, const Index *src, size_t n,
- std::function<void()> callback = {}) const {
- queue_stream()->memcpyDeviceToHost(dst, src, n, callback);
- }
- /// the memcpy function
- template <typename Index>
- EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
- queue_stream()->memcpy(dst, src, n);
- }
- /// the memset function
- EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
- queue_stream()->memset(data, c, n);
- }
- /// returning the sycl queue
- EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const {
- return queue_stream()->sycl_queue();
- }
-#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
- EIGEN_STRONG_INLINE cl::sycl::program &program() const {
- return queue_stream()->program();
- }
-#endif
-
- EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return 48 * 1024; }
-
- EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
- // We won't try to take advantage of the l2 cache for the time being, and
- // there is no l3 cache on sycl devices.
- return firstLevelCacheSize();
- }
- EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
- return queue_stream()->getNumSyclMultiProcessors();
- }
- EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
- return queue_stream()->maxSyclThreadsPerBlock();
- }
- EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
- return queue_stream()->maxWorkItemSizes();
- }
- EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
- // OpenCL does not have such a concept.
- return queue_stream()->maxSyclThreadsPerMultiProcessor();
- }
- EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
- return queue_stream()->sharedMemPerBlock();
- }
- EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
- return queue_stream()->getNearestPowerOfTwoWorkGroupSize();
- }
-
- EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t val, bool roundUp) const {
- return queue_stream()->getPowerOfTwo(val, roundUp);
- }
- /// Not needed for sycl; it should act the same as the CPU version.
- EIGEN_STRONG_INLINE int majorDeviceVersion() const {
- return queue_stream()->majorDeviceVersion();
- }
-
- EIGEN_STRONG_INLINE void synchronize() const {
- queue_stream()->synchronize();
- }
- EIGEN_STRONG_INLINE void async_synchronize(
- cl::sycl::event e = cl::sycl::event()) const {
- queue_stream()->async_synchronize(e);
- }
- EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
- return queue_stream()->get_latest_event();
- }
-
- // This function checks if the runtime recorded an error for the
- // underlying stream device.
- EIGEN_STRONG_INLINE bool ok() const { return queue_stream()->ok(); }
-
- EIGEN_STRONG_INLINE bool has_local_memory() const {
- return queue_stream()->has_local_memory();
- }
- EIGEN_STRONG_INLINE long max_buffer_size() const {
- return queue_stream()->max_buffer_size();
- }
- EIGEN_STRONG_INLINE std::string getPlatformName() const {
- return queue_stream()->getPlatformName();
- }
- EIGEN_STRONG_INLINE std::string getDeviceName() const {
- return queue_stream()->getDeviceName();
- }
- EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
- return queue_stream()->getDeviceVendor();
- }
- template <typename OutScalar, typename KernelType, typename... T>
- EIGEN_ALWAYS_INLINE void binary_kernel_launcher(T... var) const {
- queue_stream()->template binary_kernel_launcher<OutScalar, KernelType>(
- var...);
- }
- template <typename OutScalar, typename KernelType, typename... T>
- EIGEN_ALWAYS_INLINE void unary_kernel_launcher(T... var) const {
- queue_stream()->template unary_kernel_launcher<OutScalar, KernelType>(
- var...);
- }
-
- template <typename OutScalar, typename KernelType, typename... T>
- EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(T... var) const {
- queue_stream()->template nullary_kernel_launcher<OutScalar, KernelType>(
- var...);
- }
-};
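-
-// End-to-end sketch (editorial; assumes EIGEN_USE_SYCL, and the host buffer
-// h_a and size n are hypothetical):
-//
-//   Eigen::QueueInterface qi(cl::sycl::default_selector());
-//   Eigen::SyclDevice dev(&qi);
-//   float* d_a = static_cast<float*>(dev.allocate(n * sizeof(float)));
-//   dev.memcpyHostToDevice(d_a, h_a, n * sizeof(float));
-//   Eigen::TensorMap<Eigen::Tensor<float, 1>> A(d_a, n);
-//   A.device(dev) = A + A;   // kernel submitted to the underlying queue
-//   dev.synchronize();
-//   dev.deallocate(d_a);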
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceThreadPool.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceThreadPool.h
deleted file mode 100644
index e524b53..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ /dev/null
@@ -1,409 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#if defined(EIGEN_USE_THREADS) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H)
-#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H
-
-namespace Eigen {
-
-// Runs an arbitrary function and then calls Notify() on the passed-in
-// Notification.
-template <typename Function, typename... Args> struct FunctionWrapperWithNotification
-{
- static void run(Notification* n, Function f, Args... args) {
- f(args...);
- if (n) {
- n->Notify();
- }
- }
-};
-
-template <typename Function, typename... Args> struct FunctionWrapperWithBarrier
-{
- static void run(Barrier* b, Function f, Args... args) {
- f(args...);
- if (b) {
- b->Notify();
- }
- }
-};
-
-template <typename SyncType>
-static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) {
- if (n) {
- n->Wait();
- }
-}
-
-// An abstract interface to a device-specific memory allocator.
-class Allocator {
- public:
- virtual ~Allocator() {}
- virtual void* allocate(size_t num_bytes) const = 0;
- virtual void deallocate(void* buffer) const = 0;
-};
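-
-// Sketch of a custom allocator plus device construction (editorial; a real
-// allocator might pool or track allocations, and the pool size here is
-// hypothetical):
-//
-//   class AlignedAllocator : public Allocator {
-//    public:
-//     void* allocate(size_t num_bytes) const override {
-//       return Eigen::internal::aligned_malloc(num_bytes);
-//     }
-//     void deallocate(void* buffer) const override {
-//       Eigen::internal::aligned_free(buffer);
-//     }
-//   };
-//
-//   Eigen::ThreadPool pool(8);
-//   AlignedAllocator alloc;
-//   Eigen::ThreadPoolDevice device(&pool, /*num_cores=*/8, &alloc);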
-
-// Build a thread pool device on top of an existing pool of threads.
-struct ThreadPoolDevice {
- // The ownership of the thread pool remains with the caller.
- ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores, Allocator* allocator = nullptr)
- : pool_(pool), num_threads_(num_cores), allocator_(allocator) { }
-
- EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
- return allocator_ ? allocator_->allocate(num_bytes)
- : internal::aligned_malloc(num_bytes);
- }
-
- EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
- if (allocator_) {
- allocator_->deallocate(buffer);
- } else {
- internal::aligned_free(buffer);
- }
- }
-
- EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const {
- return allocate(num_bytes);
- }
-
- EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const {
- deallocate(buffer);
- }
-
- template<typename Type>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const {
- return data;
- }
-
- EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
-#ifdef __ANDROID__
- ::memcpy(dst, src, n);
-#else
- // TODO(rmlarsen): Align blocks on cache lines.
- // We have observed that going beyond 4 threads usually just wastes
- // CPU cycles due to the threads competing for memory bandwidth, so we
- // statically schedule at most 4 block copies here.
- const size_t kMinBlockSize = 32768;
- const size_t num_threads = CostModel::numThreads(n, TensorOpCost(1.0, 1.0, 0), 4);
- if (n <= kMinBlockSize || num_threads < 2) {
- ::memcpy(dst, src, n);
- } else {
- const char* src_ptr = static_cast<const char*>(src);
- char* dst_ptr = static_cast<char*>(dst);
- const size_t blocksize = (n + (num_threads - 1)) / num_threads;
- Barrier barrier(static_cast<int>(num_threads - 1));
-      // Launch the remaining num_threads - 1 blocks on worker threads.
- for (size_t i = 1; i < num_threads; ++i) {
- enqueue_with_barrier(&barrier, [n, i, src_ptr, dst_ptr, blocksize] {
- ::memcpy(dst_ptr + i * blocksize, src_ptr + i * blocksize,
- numext::mini(blocksize, n - (i * blocksize)));
- });
- }
- // Launch the first block on the main thread.
- ::memcpy(dst_ptr, src_ptr, blocksize);
- barrier.Wait();
- }
-#endif
- }
- EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
- memcpy(dst, src, n);
- }
- EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
- memcpy(dst, src, n);
- }
-
- EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
- ::memset(buffer, c, n);
- }
-
- EIGEN_STRONG_INLINE int numThreads() const {
- return num_threads_;
- }
-
-  // Number of threads available in the underlying thread pool. This number can
- // be different from the value returned by numThreads().
- EIGEN_STRONG_INLINE int numThreadsInPool() const {
- return pool_->NumThreads();
- }
-
- EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
- return l1CacheSize();
- }
-
- EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
- // The l3 cache size is shared between all the cores.
- return l3CacheSize() / num_threads_;
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
- // Should return an enum that encodes the ISA supported by the CPU
- return 1;
- }
-
- template <class Function, class... Args>
- EIGEN_STRONG_INLINE Notification* enqueue(Function&& f,
- Args&&... args) const {
- Notification* n = new Notification();
- pool_->Schedule(
- std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n,
- std::move(f), args...));
- return n;
- }
-
- template <class Function, class... Args>
- EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b, Function&& f,
- Args&&... args) const {
- pool_->Schedule(
- std::bind(&FunctionWrapperWithBarrier<Function, Args...>::run, b,
- std::move(f), args...));
- }
-
- template <class Function, class... Args>
- EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f,
- Args&&... args) const {
- if (sizeof...(args) > 0) {
- pool_->Schedule(std::bind(std::move(f), args...));
- } else {
- pool_->Schedule(std::move(f));
- }
- }
-
- // Returns a logical thread index between 0 and pool_->NumThreads() - 1 if
- // called from one of the threads in pool_. Returns -1 otherwise.
- EIGEN_STRONG_INLINE int currentThreadId() const {
- return pool_->CurrentThreadId();
- }
-
- // WARNING: This function is synchronous and will block the calling thread.
- //
-  // Synchronous parallelFor executes f over the index range [0, n) in
-  // parallel and waits for completion. f accepts a half-open interval
-  // [firstIdx, lastIdx). The block size is chosen based on the iteration
-  // cost and the resulting parallel efficiency. If block_align is not
-  // nullptr, it is called to round up the block size.
- void parallelFor(Index n, const TensorOpCost& cost,
- std::function<Index(Index)> block_align,
- std::function<void(Index, Index)> f) const {
-    if (EIGEN_PREDICT_FALSE(n <= 0)) {
-      return;
-    }
-    // Compute small problems directly in the caller thread.
-    if (n == 1 || numThreads() == 1 ||
-        CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) {
-      f(0, n);
-      return;
-    }
-
- // Compute block size and total count of blocks.
- ParallelForBlock block = CalculateParallelForBlock(n, cost, block_align);
-
- // Recursively divide size into halves until we reach block_size.
- // Division code rounds mid to block_size, so we are guaranteed to get
- // block_count leaves that do actual computations.
- Barrier barrier(static_cast<unsigned int>(block.count));
- std::function<void(Index, Index)> handleRange;
- handleRange = [=, &handleRange, &barrier, &f](Index firstIdx,
- Index lastIdx) {
- while (lastIdx - firstIdx > block.size) {
- // Split into halves and schedule the second half on a different thread.
- const Index midIdx = firstIdx + divup((lastIdx - firstIdx) / 2, block.size) * block.size;
- pool_->Schedule([=, &handleRange]() { handleRange(midIdx, lastIdx); });
- lastIdx = midIdx;
- }
- // Single block or less, execute directly.
- f(firstIdx, lastIdx);
- barrier.Notify();
- };
-
- if (block.count <= numThreads()) {
- // Avoid a thread hop by running the root of the tree and one block on the
- // main thread.
- handleRange(0, n);
- } else {
- // Execute the root in the thread pool to avoid running work on more than
- // numThreads() threads.
- pool_->Schedule([=, &handleRange]() { handleRange(0, n); });
- }
-
- barrier.Wait();
- }
-
- // Convenience wrapper for parallelFor that does not align blocks.
- void parallelFor(Index n, const TensorOpCost& cost,
- std::function<void(Index, Index)> f) const {
- parallelFor(n, cost, nullptr, std::move(f));
- }
-
- // WARNING: This function is asynchronous and will not block the calling thread.
- //
-  // Asynchronous parallelFor executes f over the index range [0, n) in
-  // parallel without waiting for completion. When the last block finishes,
-  // the 'done' callback is invoked. f accepts a half-open interval
-  // [firstIdx, lastIdx). The block size is chosen based on the iteration
-  // cost and the resulting parallel efficiency. If block_align is not
-  // nullptr, it is called to round up the block size.
- void parallelForAsync(Index n, const TensorOpCost& cost,
- std::function<Index(Index)> block_align,
- std::function<void(Index, Index)> f,
- std::function<void()> done) const {
- // Compute small problems directly in the caller thread.
- if (n <= 1 || numThreads() == 1 ||
- CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) {
- f(0, n);
- done();
- return;
- }
-
- // Compute block size and total count of blocks.
- ParallelForBlock block = CalculateParallelForBlock(n, cost, block_align);
-
- ParallelForAsyncContext* const ctx =
- new ParallelForAsyncContext(block.count, std::move(f), std::move(done));
-
- // Recursively divide size into halves until we reach block_size.
- // Division code rounds mid to block_size, so we are guaranteed to get
- // block_count leaves that do actual computations.
- ctx->handle_range = [this, ctx, block](Index firstIdx, Index lastIdx) {
- while (lastIdx - firstIdx > block.size) {
- // Split into halves and schedule the second half on a different thread.
- const Index midIdx = firstIdx + divup((lastIdx - firstIdx) / 2, block.size) * block.size;
- pool_->Schedule(
- [ctx, midIdx, lastIdx]() { ctx->handle_range(midIdx, lastIdx); });
- lastIdx = midIdx;
- }
-
- // Single block or less, execute directly.
- ctx->f(firstIdx, lastIdx);
-
- // Delete async context if it was the last block.
- if (ctx->count.fetch_sub(1) == 1) delete ctx;
- };
-
- if (block.count <= numThreads()) {
- // Avoid a thread hop by running the root of the tree and one block on the
- // main thread.
- ctx->handle_range(0, n);
- } else {
- // Execute the root in the thread pool to avoid running work on more than
- // numThreads() threads.
- pool_->Schedule([ctx, n]() { ctx->handle_range(0, n); });
- }
- }
-
- // Convenience wrapper for parallelForAsync that does not align blocks.
- void parallelForAsync(Index n, const TensorOpCost& cost,
- std::function<void(Index, Index)> f,
- std::function<void()> done) const {
- parallelForAsync(n, cost, nullptr, std::move(f), std::move(done));
- }
-
- // Thread pool accessor.
- ThreadPoolInterface* getPool() const { return pool_; }
-
- // Allocator accessor.
- Allocator* allocator() const { return allocator_; }
-
- private:
- typedef TensorCostModel<ThreadPoolDevice> CostModel;
-
- // For parallelForAsync we must keep passed in closures on the heap, and
- // delete them only after `done` callback finished.
- struct ParallelForAsyncContext {
- ParallelForAsyncContext(Index block_count,
- std::function<void(Index, Index)> block_f,
- std::function<void()> done_callback)
- : count(block_count),
- f(std::move(block_f)),
- done(std::move(done_callback)) {}
- ~ParallelForAsyncContext() { done(); }
-
- std::atomic<Index> count;
- std::function<void(Index, Index)> f;
- std::function<void()> done;
-
- std::function<void(Index, Index)> handle_range;
- };
-
- struct ParallelForBlock {
- Index size; // block size
- Index count; // number of blocks
- };
-
-  // Calculates the block size based on (1) the iteration cost and (2) the
-  // parallel efficiency. We want blocks that are not so small that
-  // parallelization overhead dominates, and not so large that we suffer tail
-  // effects and load imbalance; we also want the block count to divide
-  // evenly across threads.
- ParallelForBlock CalculateParallelForBlock(
- const Index n, const TensorOpCost& cost,
- std::function<Index(Index)> block_align) const {
- const double block_size_f = 1.0 / CostModel::taskSize(1, cost);
- const Index max_oversharding_factor = 4;
- Index block_size = numext::mini(
- n, numext::maxi<Index>(
- divup<Index>(n, max_oversharding_factor * numThreads()),
- block_size_f));
- const Index max_block_size = numext::mini(n, 2 * block_size);
-
- if (block_align) {
- Index new_block_size = block_align(block_size);
- eigen_assert(new_block_size >= block_size);
- block_size = numext::mini(n, new_block_size);
- }
-
- Index block_count = divup(n, block_size);
-
- // Calculate parallel efficiency as fraction of total CPU time used for
- // computations:
- double max_efficiency =
- static_cast<double>(block_count) /
- (divup<int>(block_count, numThreads()) * numThreads());
-
- // Now try to increase block size up to max_block_size as long as it
- // doesn't decrease parallel efficiency.
- for (Index prev_block_count = block_count;
- max_efficiency < 1.0 && prev_block_count > 1;) {
- // This is the next block size that divides size into a smaller number
- // of blocks than the current block_size.
- Index coarser_block_size = divup(n, prev_block_count - 1);
- if (block_align) {
- Index new_block_size = block_align(coarser_block_size);
- eigen_assert(new_block_size >= coarser_block_size);
- coarser_block_size = numext::mini(n, new_block_size);
- }
- if (coarser_block_size > max_block_size) {
- break; // Reached max block size. Stop.
- }
- // Recalculate parallel efficiency.
- const Index coarser_block_count = divup(n, coarser_block_size);
- eigen_assert(coarser_block_count < prev_block_count);
- prev_block_count = coarser_block_count;
- const double coarser_efficiency =
- static_cast<double>(coarser_block_count) /
- (divup<int>(coarser_block_count, numThreads()) * numThreads());
- if (coarser_efficiency + 0.01 >= max_efficiency) {
- // Taking it.
- block_size = coarser_block_size;
- block_count = coarser_block_count;
- if (max_efficiency < coarser_efficiency) {
- max_efficiency = coarser_efficiency;
- }
- }
- }
-
- return {block_size, block_count};
- }
-
- ThreadPoolInterface* pool_;
- int num_threads_;
- Allocator* allocator_;
-};
-
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H
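
Annotation: to make the parallelFor contract above concrete, here is a hedged usage sketch. The cost model turns the per-iteration TensorOpCost into a block size; CalculateParallelForBlock then coarsens blocks as long as the efficiency ratio block_count / (ceil(block_count / numThreads) * numThreads) does not drop; and f is invoked on disjoint half-open ranges covering [0, n). The element-wise work inside the lambda is invented for illustration; the Eigen types are the ones declared above.

    #define EIGEN_USE_THREADS
    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::ThreadPool pool(4);                 // worker threads
      Eigen::ThreadPoolDevice device(&pool, 4);  // logical core count

      const Eigen::Index n = 1 << 20;
      float* data = static_cast<float*>(device.allocate(n * sizeof(float)));
      for (Eigen::Index i = 0; i < n; ++i) data[i] = 1.0f;

      // Rough per-iteration cost: one 4-byte load, one 4-byte store, one mul.
      Eigen::TensorOpCost cost(/*bytes_loaded=*/4, /*bytes_stored=*/4,
                               /*compute_cycles=*/1);

      // f receives disjoint [first, last) ranges covering [0, n).
      device.parallelFor(n, cost,
                         [data](Eigen::Index first, Eigen::Index last) {
        for (Eigen::Index i = first; i < last; ++i) data[i] *= 2.0f;
      });

      device.deallocate(data);
      return 0;
    }
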
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorDimensionList.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorDimensionList.h
deleted file mode 100644
index 1a30e45..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorDimensionList.h
+++ /dev/null
@@ -1,236 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H
-#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H
-
-namespace Eigen {
-
-/** \internal
- *
- * \class TensorDimensionList
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Special case of tensor index list used to list all the dimensions of a tensor of rank n.
- *
- * \sa Tensor
- */
-
-template <typename Index, std::size_t Rank> struct DimensionList {
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
- const Index operator[] (const Index i) const { return i; }
-};
-
-namespace internal {
-
-template<typename Index, std::size_t Rank> struct array_size<DimensionList<Index, Rank> > {
- static const size_t value = Rank;
-};
-template<typename Index, std::size_t Rank> struct array_size<const DimensionList<Index, Rank> > {
- static const size_t value = Rank;
-};
-
-template<DenseIndex n, typename Index, std::size_t Rank> const Index array_get(DimensionList<Index, Rank>&) {
- return n;
-}
-template<DenseIndex n, typename Index, std::size_t Rank> const Index array_get(const DimensionList<Index, Rank>&) {
- return n;
-}
-
-
-#if EIGEN_HAS_CONSTEXPR
-template <typename Index, std::size_t Rank>
-struct index_known_statically_impl<DimensionList<Index, Rank> > {
- EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) {
- return true;
- }
-};
-template <typename Index, std::size_t Rank>
-struct index_known_statically_impl<const DimensionList<Index, Rank> > {
- EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) {
- return true;
- }
-};
-
-template <typename Index, std::size_t Rank>
-struct all_indices_known_statically_impl<DimensionList<Index, Rank> > {
- EIGEN_DEVICE_FUNC static constexpr bool run() {
- return true;
- }
-};
-template <typename Index, std::size_t Rank>
-struct all_indices_known_statically_impl<const DimensionList<Index, Rank> > {
- EIGEN_DEVICE_FUNC static constexpr bool run() {
- return true;
- }
-};
-
-template <typename Index, std::size_t Rank>
-struct indices_statically_known_to_increase_impl<DimensionList<Index, Rank> > {
- EIGEN_DEVICE_FUNC static constexpr bool run() {
- return true;
- }
-};
-template <typename Index, std::size_t Rank>
-struct indices_statically_known_to_increase_impl<const DimensionList<Index, Rank> > {
- EIGEN_DEVICE_FUNC static constexpr bool run() {
- return true;
- }
-};
-
-template <typename Index, std::size_t Rank>
-struct index_statically_eq_impl<DimensionList<Index, Rank> > {
- static constexpr bool run(const DenseIndex i, const DenseIndex value) {
- return i == value;
- }
-};
-template <typename Index, std::size_t Rank>
-struct index_statically_eq_impl<const DimensionList<Index, Rank> > {
- EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
- return i == value;
- }
-};
-
-template <typename Index, std::size_t Rank>
-struct index_statically_ne_impl<DimensionList<Index, Rank> > {
- EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
- return i != value;
- }
-};
-template <typename Index, std::size_t Rank>
-struct index_statically_ne_impl<const DimensionList<Index, Rank> > {
- static constexpr bool run(const DenseIndex i, const DenseIndex value) {
- return i != value;
- }
-};
-
-template <typename Index, std::size_t Rank>
-struct index_statically_gt_impl<DimensionList<Index, Rank> > {
- EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
- return i > value;
- }
-};
-template <typename Index, std::size_t Rank>
-struct index_statically_gt_impl<const DimensionList<Index, Rank> > {
- EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
- return i > value;
- }
-};
-
-template <typename Index, std::size_t Rank>
-struct index_statically_lt_impl<DimensionList<Index, Rank> > {
- EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
- return i < value;
- }
-};
-template <typename Index, std::size_t Rank>
-struct index_statically_lt_impl<const DimensionList<Index, Rank> > {
- EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
- return i < value;
- }
-};
-
-#else
-template <typename Index, std::size_t Rank>
-struct index_known_statically_impl<DimensionList<Index, Rank> > {
- EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run(const DenseIndex) {
- return true;
- }
-};
-template <typename Index, std::size_t Rank>
-struct index_known_statically_impl<const DimensionList<Index, Rank> > {
- EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run(const DenseIndex) {
- return true;
- }
-};
-
-template <typename Index, std::size_t Rank>
-struct all_indices_known_statically_impl<DimensionList<Index, Rank> > {
- EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run() {
- return true;
- }
-};
-template <typename Index, std::size_t Rank>
-struct all_indices_known_statically_impl<const DimensionList<Index, Rank> > {
- EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run() {
- return true;
- }
-};
-
-template <typename Index, std::size_t Rank>
-struct indices_statically_known_to_increase_impl<DimensionList<Index, Rank> > {
- static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() {
- return true;
- }
-};
-template <typename Index, std::size_t Rank>
-struct indices_statically_known_to_increase_impl<const DimensionList<Index, Rank> > {
- static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() {
- return true;
- }
-};
-
-template <typename Index, std::size_t Rank>
-struct index_statically_eq_impl<DimensionList<Index, Rank> > {
- static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
- return false;
- }
-};
-template <typename Index, std::size_t Rank>
-struct index_statically_eq_impl<const DimensionList<Index, Rank> > {
- static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
- return false;
- }
-};
-
-template <typename Index, std::size_t Rank>
-struct index_statically_ne_impl<DimensionList<Index, Rank> > {
- static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex){
- return false;
- }
-};
-template <typename Index, std::size_t Rank>
-struct index_statically_ne_impl<const DimensionList<Index, Rank> > {
- static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
- return false;
- }
-};
-
-template <typename Index, std::size_t Rank>
-struct index_statically_gt_impl<DimensionList<Index, Rank> > {
- static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
- return false;
- }
-};
-template <typename Index, std::size_t Rank>
-struct index_statically_gt_impl<const DimensionList<Index, Rank> > {
- static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
- return false;
- }
-};
-
-template <typename Index, std::size_t Rank>
-struct index_statically_lt_impl<DimensionList<Index, Rank> > {
- static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
- return false;
- }
-};
-template <typename Index, std::size_t Rank>
-struct index_statically_lt_impl<const DimensionList<Index, Rank> > {
- static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
- return false;
- }
-};
-#endif
-
-} // end namespace internal
-} // end namespace Eigen
-
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H
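
Annotation: DimensionList is the identity map i -> i with every entry known at compile time; it is what lets a no-argument full reduction present "all dimensions of a rank-n tensor" as a statically known index list. A hedged sketch of the hand-written equivalent it replaces:

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 3> t(2, 3, 4);
      t.setRandom();

      // What DimensionList<Index, 3> encodes implicitly: the list {0, 1, 2}.
      Eigen::array<Eigen::Index, 3> all_dims{{0, 1, 2}};

      Eigen::Tensor<float, 0> a = t.sum(all_dims);  // explicit list
      Eigen::Tensor<float, 0> b = t.sum();          // DimensionList internally

      // Both reduce over every dimension; results agree (up to float
      // summation order).
      return (a() == b()) ? 0 : 1;
    }
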
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorDimensions.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorDimensions.h
deleted file mode 100644
index f0f1e83..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorDimensions.h
+++ /dev/null
@@ -1,490 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H
-#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H
-
-
-namespace Eigen {
-
-/** \internal
- *
- * \class TensorDimensions
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Set of classes used to encode and store the dimensions of a Tensor.
- *
- * The Sizes class encodes as part of the type the number of dimensions and the
- * sizes corresponding to each dimension. It uses no storage space since it is
- * entirely known at compile time.
- * The DSizes class is its dynamic sibling: the number of dimensions is known
- * at compile time but the sizes are set during execution.
- *
- * \sa Tensor
- */
-
-// Boilerplate code
-namespace internal {
-
-template<std::ptrdiff_t n, typename Dimension> struct dget {
- static const std::ptrdiff_t value = get<n, Dimension>::value;
-};
-
-
-template<typename Index, std::ptrdiff_t NumIndices, std::ptrdiff_t n, bool RowMajor>
-struct fixed_size_tensor_index_linearization_helper
-{
- template <typename Dimensions> EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE Index run(array<Index, NumIndices> const& indices,
- const Dimensions& dimensions)
- {
- return array_get<RowMajor ? n - 1 : (NumIndices - n)>(indices) +
- dget<RowMajor ? n - 1 : (NumIndices - n), Dimensions>::value *
- fixed_size_tensor_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(indices, dimensions);
- }
-};
-
-template<typename Index, std::ptrdiff_t NumIndices, bool RowMajor>
-struct fixed_size_tensor_index_linearization_helper<Index, NumIndices, 0, RowMajor>
-{
- template <typename Dimensions> EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE Index run(array<Index, NumIndices> const&, const Dimensions&)
- {
- return 0;
- }
-};
-
-template<typename Index, std::ptrdiff_t n>
-struct fixed_size_tensor_index_extraction_helper
-{
- template <typename Dimensions> EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE Index run(const Index index,
- const Dimensions& dimensions)
- {
- const Index mult = (index == n-1) ? 1 : 0;
- return array_get<n-1>(dimensions) * mult +
- fixed_size_tensor_index_extraction_helper<Index, n - 1>::run(index, dimensions);
- }
-};
-
-template<typename Index>
-struct fixed_size_tensor_index_extraction_helper<Index, 0>
-{
- template <typename Dimensions> EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE Index run(const Index,
- const Dimensions&)
- {
- return 0;
- }
-};
-
-} // end namespace internal
-
-
-// Fixed size
-#ifndef EIGEN_EMULATE_CXX11_META_H
-template <typename std::ptrdiff_t... Indices>
-struct Sizes {
- typedef internal::numeric_list<std::ptrdiff_t, Indices...> Base;
- const Base t = Base();
- static const std::ptrdiff_t total_size = internal::arg_prod(Indices...);
- static const ptrdiff_t count = Base::count;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t rank() const {
- return Base::count;
- }
-
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t TotalSize() {
- return internal::arg_prod(Indices...);
- }
-
- EIGEN_DEVICE_FUNC Sizes() { }
- template <typename DenseIndex>
- explicit EIGEN_DEVICE_FUNC Sizes(const array<DenseIndex, Base::count>& /*indices*/) {
- // todo: add assertion
- }
-#if EIGEN_HAS_VARIADIC_TEMPLATES
- template <typename... DenseIndex> EIGEN_DEVICE_FUNC Sizes(DenseIndex...) { }
- explicit EIGEN_DEVICE_FUNC Sizes(std::initializer_list<std::ptrdiff_t> /*l*/) {
- // todo: add assertion
- }
-#endif
-
- template <typename T> Sizes& operator = (const T& /*other*/) {
- // add assertion failure if the size of other is different
- return *this;
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[] (const std::ptrdiff_t index) const {
- return internal::fixed_size_tensor_index_extraction_helper<std::ptrdiff_t, Base::count>::run(index, t);
- }
-
- template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- ptrdiff_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
- return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, false>::run(indices, t);
- }
- template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- ptrdiff_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
- return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, true>::run(indices, t);
- }
-};
-
-namespace internal {
-template <typename std::ptrdiff_t... Indices>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes<Indices...>&) {
- return Sizes<Indices...>::total_size;
-}
-}
-
-#else
-
-template <std::ptrdiff_t n>
-struct non_zero_size {
- typedef internal::type2val<std::ptrdiff_t, n> type;
-};
-template <>
-struct non_zero_size<0> {
- typedef internal::null_type type;
-};
-
-template <std::ptrdiff_t V1=0, std::ptrdiff_t V2=0, std::ptrdiff_t V3=0, std::ptrdiff_t V4=0, std::ptrdiff_t V5=0> struct Sizes {
- typedef typename internal::make_type_list<typename non_zero_size<V1>::type, typename non_zero_size<V2>::type, typename non_zero_size<V3>::type, typename non_zero_size<V4>::type, typename non_zero_size<V5>::type >::type Base;
- static const std::ptrdiff_t count = Base::count;
- static const std::ptrdiff_t total_size = internal::arg_prod<Base>::value;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptrdiff_t rank() const {
- return count;
- }
-
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptrdiff_t TotalSize() {
- return internal::arg_prod<Base>::value;
- }
-
- Sizes() { }
- template <typename DenseIndex>
- explicit Sizes(const array<DenseIndex, Base::count>& /*indices*/) {
- // todo: add assertion
- }
- template <typename T> Sizes& operator = (const T& /*other*/) {
- // add assertion failure if the size of other is different
- return *this;
- }
-
-#if EIGEN_HAS_VARIADIC_TEMPLATES
- template <typename... DenseIndex> Sizes(DenseIndex... /*indices*/) { }
- explicit Sizes(std::initializer_list<std::ptrdiff_t>) {
- // todo: add assertion
- }
-#else
- EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex) {
- }
- EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex) {
- }
- EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex) {
- }
- EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) {
- }
- EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) {
- }
-#endif
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index operator[] (const Index index) const {
- switch (index) {
- case 0:
- return internal::get<0, Base>::value;
- case 1:
- return internal::get<1, Base>::value;
- case 2:
- return internal::get<2, Base>::value;
- case 3:
- return internal::get<3, Base>::value;
- case 4:
- return internal::get<4, Base>::value;
- default:
- eigen_assert(false && "index overflow");
- return static_cast<Index>(-1);
- }
- }
-
- template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- ptrdiff_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
- return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, false>::run(indices, *reinterpret_cast<const Base*>(this));
- }
- template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- ptrdiff_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
- return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, true>::run(indices, *reinterpret_cast<const Base*>(this));
- }
-};
-
-namespace internal {
-template <std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes<V1, V2, V3, V4, V5>&) {
- return Sizes<V1, V2, V3, V4, V5>::total_size;
-}
-}
-
-#endif
-
-// Boilerplate
-namespace internal {
-template<typename Index, std::ptrdiff_t NumIndices, std::ptrdiff_t n, bool RowMajor>
-struct tensor_index_linearization_helper
-{
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- Index run(array<Index, NumIndices> const& indices, array<Index, NumIndices> const& dimensions)
- {
- return array_get<RowMajor ? n : (NumIndices - n - 1)>(indices) +
- array_get<RowMajor ? n : (NumIndices - n - 1)>(dimensions) *
- tensor_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(indices, dimensions);
- }
-};
-
-template<typename Index, std::ptrdiff_t NumIndices, bool RowMajor>
-struct tensor_index_linearization_helper<Index, NumIndices, 0, RowMajor>
-{
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- Index run(array<Index, NumIndices> const& indices, array<Index, NumIndices> const&)
- {
- return array_get<RowMajor ? 0 : NumIndices - 1>(indices);
- }
-};
-} // end namespace internal
-
-
-
-// Dynamic size
-template <typename DenseIndex, int NumDims>
-struct DSizes : array<DenseIndex, NumDims> {
- typedef array<DenseIndex, NumDims> Base;
- static const int count = NumDims;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const {
- return NumDims;
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex TotalSize() const {
- return (NumDims == 0) ? 1 : internal::array_prod(*static_cast<const Base*>(this));
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DSizes() {
- for (int i = 0 ; i < NumDims; ++i) {
- (*this)[i] = 0;
- }
- }
- EIGEN_DEVICE_FUNC explicit DSizes(const array<DenseIndex, NumDims>& a) : Base(a) { }
-
- EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0) {
- eigen_assert(NumDims == 1);
- (*this)[0] = i0;
- }
-
- EIGEN_DEVICE_FUNC DSizes(const DimensionList<DenseIndex, NumDims>& a) {
- for (int i = 0 ; i < NumDims; ++i) {
- (*this)[i] = a[i];
- }
- }
-
- // Enable DSizes index type promotion only if we are promoting to the
-  // larger type, e.g. allowing dimensions of type int to be promoted to long.
- template<typename OtherIndex>
- EIGEN_DEVICE_FUNC
- explicit DSizes(const array<OtherIndex, NumDims>& other,
- // Default template parameters require c++11.
- typename internal::enable_if<
- internal::is_same<
- DenseIndex,
- typename internal::promote_index_type<
- DenseIndex,
- OtherIndex
- >::type
- >::value, void*>::type = 0) {
- for (int i = 0; i < NumDims; ++i) {
- (*this)[i] = static_cast<DenseIndex>(other[i]);
- }
- }
-
-#ifdef EIGEN_HAS_INDEX_LIST
- template <typename FirstType, typename... OtherTypes>
- EIGEN_DEVICE_FUNC
- explicit DSizes(const Eigen::IndexList<FirstType, OtherTypes...>& dimensions) {
- for (int i = 0; i < dimensions.count; ++i) {
- (*this)[i] = dimensions[i];
- }
- }
-#endif
-
-#ifndef EIGEN_EMULATE_CXX11_META_H
- template <typename std::ptrdiff_t... Indices>
- EIGEN_DEVICE_FUNC DSizes(const Sizes<Indices...>& a) {
- for (int i = 0 ; i < NumDims; ++i) {
- (*this)[i] = a[i];
- }
- }
-#else
- template <std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5>
- EIGEN_DEVICE_FUNC DSizes(const Sizes<V1, V2, V3, V4, V5>& a) {
- for (int i = 0 ; i < NumDims; ++i) {
- (*this)[i] = a[i];
- }
- }
-#endif
-
-#if EIGEN_HAS_VARIADIC_TEMPLATES
- template<typename... IndexTypes> EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, DenseIndex secondDimension, IndexTypes... otherDimensions) : Base({{firstDimension, secondDimension, otherDimensions...}}) {
- EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 2 == NumDims, YOU_MADE_A_PROGRAMMING_MISTAKE)
- }
-#else
- EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1) {
- eigen_assert(NumDims == 2);
- (*this)[0] = i0;
- (*this)[1] = i1;
- }
- EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) {
- eigen_assert(NumDims == 3);
- (*this)[0] = i0;
- (*this)[1] = i1;
- (*this)[2] = i2;
- }
- EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) {
- eigen_assert(NumDims == 4);
- (*this)[0] = i0;
- (*this)[1] = i1;
- (*this)[2] = i2;
- (*this)[3] = i3;
- }
- EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) {
- eigen_assert(NumDims == 5);
- (*this)[0] = i0;
- (*this)[1] = i1;
- (*this)[2] = i2;
- (*this)[3] = i3;
- (*this)[4] = i4;
- }
-#endif
-
- EIGEN_DEVICE_FUNC DSizes& operator = (const array<DenseIndex, NumDims>& other) {
- *static_cast<Base*>(this) = other;
- return *this;
- }
-
- // A constexpr would be so much better here
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex IndexOfColMajor(const array<DenseIndex, NumDims>& indices) const {
- return internal::tensor_index_linearization_helper<DenseIndex, NumDims, NumDims - 1, false>::run(indices, *static_cast<const Base*>(this));
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex IndexOfRowMajor(const array<DenseIndex, NumDims>& indices) const {
- return internal::tensor_index_linearization_helper<DenseIndex, NumDims, NumDims - 1, true>::run(indices, *static_cast<const Base*>(this));
- }
-};
-
-template <typename IndexType, int NumDims>
-std::ostream& operator<<(std::ostream& os,
- const DSizes<IndexType, NumDims>& dims) {
- os << "[";
- for (int i = 0; i < NumDims; ++i) {
- if (i > 0) os << ", ";
- os << dims[i];
- }
- os << "]";
- return os;
-}
-
-// Boilerplate
-namespace internal {
-template<typename Index, std::ptrdiff_t NumIndices, std::ptrdiff_t n, bool RowMajor>
-struct tensor_vsize_index_linearization_helper
-{
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- Index run(array<Index, NumIndices> const& indices, std::vector<DenseIndex> const& dimensions)
- {
- return array_get<RowMajor ? n : (NumIndices - n - 1)>(indices) +
- array_get<RowMajor ? n : (NumIndices - n - 1)>(dimensions) *
- tensor_vsize_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(indices, dimensions);
- }
-};
-
-template<typename Index, std::ptrdiff_t NumIndices, bool RowMajor>
-struct tensor_vsize_index_linearization_helper<Index, NumIndices, 0, RowMajor>
-{
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- Index run(array<Index, NumIndices> const& indices, std::vector<DenseIndex> const&)
- {
- return array_get<RowMajor ? 0 : NumIndices - 1>(indices);
- }
-};
-} // end namespace internal
-
-
-namespace internal {
-
-template <typename DenseIndex, int NumDims> struct array_size<const DSizes<DenseIndex, NumDims> > {
- static const ptrdiff_t value = NumDims;
-};
-template <typename DenseIndex, int NumDims> struct array_size<DSizes<DenseIndex, NumDims> > {
- static const ptrdiff_t value = NumDims;
-};
-#ifndef EIGEN_EMULATE_CXX11_META_H
-template <typename std::ptrdiff_t... Indices> struct array_size<const Sizes<Indices...> > {
-static const std::ptrdiff_t value = Sizes<Indices...>::count;
-};
-template <typename std::ptrdiff_t... Indices> struct array_size<Sizes<Indices...> > {
-static const std::ptrdiff_t value = Sizes<Indices...>::count;
-};
-template <std::ptrdiff_t n, typename std::ptrdiff_t... Indices> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<Indices...>&) {
- return get<n, internal::numeric_list<std::ptrdiff_t, Indices...> >::value;
-}
-template <std::ptrdiff_t n> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<>&) {
- eigen_assert(false && "should never be called");
- return -1;
-}
-#else
-template <std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5> struct array_size<const Sizes<V1,V2,V3,V4,V5> > {
- static const ptrdiff_t value = Sizes<V1,V2,V3,V4,V5>::count;
-};
-template <std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5> struct array_size<Sizes<V1,V2,V3,V4,V5> > {
- static const ptrdiff_t value = Sizes<V1,V2,V3,V4,V5>::count;
-};
-template <std::ptrdiff_t n, std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<V1,V2,V3,V4,V5>&) {
- return get<n, typename Sizes<V1,V2,V3,V4,V5>::Base>::value;
-}
-
-#endif
-
-
-template <typename Dims1, typename Dims2, ptrdiff_t n, ptrdiff_t m>
-struct sizes_match_below_dim {
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1&, Dims2&) {
- return false;
- }
-};
-template <typename Dims1, typename Dims2, ptrdiff_t n>
-struct sizes_match_below_dim<Dims1, Dims2, n, n> {
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1& dims1, Dims2& dims2) {
- return (array_get<n-1>(dims1) == array_get<n-1>(dims2)) &&
- sizes_match_below_dim<Dims1, Dims2, n-1, n-1>::run(dims1, dims2);
- }
-};
-template <typename Dims1, typename Dims2>
-struct sizes_match_below_dim<Dims1, Dims2, 0, 0> {
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1&, Dims2&) {
- return true;
- }
-};
-
-} // end namespace internal
-
-
-template <typename Dims1, typename Dims2>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool dimensions_match(Dims1 dims1, Dims2 dims2) {
- return internal::sizes_match_below_dim<Dims1, Dims2, internal::array_size<Dims1>::value, internal::array_size<Dims2>::value>::run(dims1, dims2);
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H
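
Annotation: the linearization helpers above compute the usual strided dot product. For column-major layout the first index varies fastest, giving index = i0 + d0 * (i1 + d1 * i2) at rank 3; row-major nests in the opposite order. A small worked sketch, with values chosen arbitrarily:

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <cassert>

    int main() {
      Eigen::DSizes<Eigen::Index, 3> dims(2, 3, 4);
      Eigen::array<Eigen::Index, 3> idx{{0, 1, 2}};

      // Column-major: i0 + d0 * (i1 + d1 * i2) = 0 + 2 * (1 + 3 * 2) = 14
      assert(dims.IndexOfColMajor(idx) == 14);

      // Row-major: i2 + d2 * (i1 + d1 * i0) = 2 + 4 * (1 + 3 * 0) = 6
      assert(dims.IndexOfRowMajor(idx) == 6);

      assert(dims.TotalSize() == 24);  // 2 * 3 * 4
      return 0;
    }
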
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorEvalTo.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorEvalTo.h
deleted file mode 100644
index a48d035..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorEvalTo.h
+++ /dev/null
@@ -1,236 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H
-#define EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H
-
-namespace Eigen {
-
-/** \class TensorEvalToOp
-  * \ingroup CXX11_Tensor_Module
-  *
-  * \brief Tensor expression that evaluates its argument into a
-  * caller-provided buffer.
-  *
-  */
-namespace internal {
-template<typename XprType, template <class> class MakePointer_>
-struct traits<TensorEvalToOp<XprType, MakePointer_> >
-{
- // Type promotion to handle the case where the types of the lhs and the rhs are different.
- typedef typename XprType::Scalar Scalar;
- typedef traits<XprType> XprTraits;
- typedef typename XprTraits::StorageKind StorageKind;
- typedef typename XprTraits::Index Index;
- typedef typename XprType::Nested Nested;
- typedef typename remove_reference<Nested>::type _Nested;
- static const int NumDimensions = XprTraits::NumDimensions;
- static const int Layout = XprTraits::Layout;
- typedef typename MakePointer_<Scalar>::Type PointerType;
-
- enum {
- Flags = 0
- };
- template <class T>
- struct MakePointer {
-    // Intermediate typedef to work around an MSVC issue.
-    typedef MakePointer_<T> MakePointerT;
-    typedef typename MakePointerT::Type Type;
-  };
-};
-
-template<typename XprType, template <class> class MakePointer_>
-struct eval<TensorEvalToOp<XprType, MakePointer_>, Eigen::Dense>
-{
- typedef const TensorEvalToOp<XprType, MakePointer_>& type;
-};
-
-template<typename XprType, template <class> class MakePointer_>
-struct nested<TensorEvalToOp<XprType, MakePointer_>, 1, typename eval<TensorEvalToOp<XprType, MakePointer_> >::type>
-{
- typedef TensorEvalToOp<XprType, MakePointer_> type;
-};
-
-} // end namespace internal
-
-
-
-
-template<typename XprType, template <class> class MakePointer_>
-class TensorEvalToOp : public TensorBase<TensorEvalToOp<XprType, MakePointer_>, ReadOnlyAccessors>
-{
- public:
- typedef typename Eigen::internal::traits<TensorEvalToOp>::Scalar Scalar;
- typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
- typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
- typedef typename MakePointer_<CoeffReturnType>::Type PointerType;
- typedef typename Eigen::internal::nested<TensorEvalToOp>::type Nested;
- typedef typename Eigen::internal::traits<TensorEvalToOp>::StorageKind StorageKind;
- typedef typename Eigen::internal::traits<TensorEvalToOp>::Index Index;
-
- static const int NumDims = Eigen::internal::traits<TensorEvalToOp>::NumDimensions;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(PointerType buffer, const XprType& expr)
- : m_xpr(expr), m_buffer(buffer) {}
-
- EIGEN_DEVICE_FUNC
- const typename internal::remove_all<typename XprType::Nested>::type&
- expression() const { return m_xpr; }
-
- EIGEN_DEVICE_FUNC PointerType buffer() const { return m_buffer; }
-
- protected:
- typename XprType::Nested m_xpr;
- PointerType m_buffer;
-};
-
-
-
-template<typename ArgType, typename Device, template <class> class MakePointer_>
-struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
-{
- typedef TensorEvalToOp<ArgType, MakePointer_> XprType;
- typedef typename ArgType::Scalar Scalar;
- typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
- typedef typename XprType::Index Index;
- typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
- typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
- typedef StorageMemory<CoeffReturnType, Device> Storage;
- typedef typename Storage::Type EvaluatorPointerType;
- enum {
- IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
- PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = true,
- PreferBlockAccess = false,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = true
- };
-
- static const int NumDims = internal::traits<ArgType>::NumDimensions;
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
- typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
-
- typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
- ArgTensorBlock;
-
- typedef internal::TensorBlockAssignment<
- CoeffReturnType, NumDims, typename ArgTensorBlock::XprType, Index>
- TensorBlockAssignment;
- //===--------------------------------------------------------------------===//
-
- EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_impl(op.expression(), device), m_buffer(device.get(op.buffer())), m_expression(op.expression()){}
-
-
- EIGEN_STRONG_INLINE ~TensorEvaluator() {
- }
-
-
- EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
-
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType scalar) {
- EIGEN_UNUSED_VARIABLE(scalar);
- eigen_assert(scalar == NULL);
- return m_impl.evalSubExprsIfNeeded(m_buffer);
- }
-
-#ifdef EIGEN_USE_THREADS
- template <typename EvalSubExprsCallback>
- EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
- EvaluatorPointerType scalar, EvalSubExprsCallback done) {
- EIGEN_UNUSED_VARIABLE(scalar);
- eigen_assert(scalar == NULL);
- m_impl.evalSubExprsIfNeededAsync(m_buffer, std::move(done));
- }
-#endif
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) {
- m_buffer[i] = m_impl.coeff(i);
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) {
- internal::pstoret<CoeffReturnType, PacketReturnType, Aligned>(m_buffer + i, m_impl.template packet<TensorEvaluator<ArgType, Device>::IsAligned ? Aligned : Unaligned>(i));
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- internal::TensorBlockResourceRequirements getResourceRequirements() const {
- return m_impl.getResourceRequirements();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(
- TensorBlockDesc& desc, TensorBlockScratch& scratch) {
- // Add `m_buffer` as destination buffer to the block descriptor.
- desc.template AddDestinationBuffer<Layout>(
- /*dst_base=*/m_buffer + desc.offset(),
- /*dst_strides=*/internal::strides<Layout>(m_impl.dimensions()));
-
- ArgTensorBlock block =
- m_impl.block(desc, scratch, /*root_of_expr_ast=*/true);
-
- // If block was evaluated into a destination buffer, there is no need to do
- // an assignment.
- if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) {
- TensorBlockAssignment::Run(
- TensorBlockAssignment::target(
- desc.dimensions(), internal::strides<Layout>(m_impl.dimensions()),
- m_buffer, desc.offset()),
- block.expr());
- }
- block.cleanup();
- }
-
- EIGEN_STRONG_INLINE void cleanup() {
- m_impl.cleanup();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
- {
- return m_buffer[index];
- }
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
- {
- return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
- // We assume that evalPacket or evalScalar is called to perform the
- // assignment and account for the cost of the write here.
- return m_impl.costPerCoeff(vectorized) +
- TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize);
- }
-
- EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_buffer; }
- ArgType expression() const { return m_expression; }
- #ifdef EIGEN_USE_SYCL
- // binding placeholder accessors to a command group handler for SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
- m_impl.bind(cgh);
- m_buffer.bind(cgh);
- }
- #endif
-
-
- private:
- TensorEvaluator<ArgType, Device> m_impl;
- EvaluatorPointerType m_buffer;
- const ArgType m_expression;
-};
-
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H
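
Annotation: TensorEvalToOp is an internal node. The executor wraps an expression in it when the result must be materialized into a given buffer; evalScalar/evalPacket perform the writes, and coeff(i) afterwards is a plain buffer read. At the public API level the closest equivalent is assigning through a TensorMap on a device, a hedged sketch of which follows:

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <cstdlib>

    int main() {
      Eigen::Tensor<float, 2> a(64, 64), b(64, 64);
      a.setRandom();
      b.setRandom();

      // Caller-owned destination, analogous to TensorEvalToOp's m_buffer.
      float* buffer =
          static_cast<float*>(std::malloc(64 * 64 * sizeof(float)));
      Eigen::TensorMap<Eigen::Tensor<float, 2>> out(buffer, 64, 64);

      Eigen::DefaultDevice device;
      out.device(device) = a + b;  // evaluated straight into `buffer`

      std::free(buffer);
      return 0;
    }
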
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorEvaluator.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorEvaluator.h
deleted file mode 100644
index 3aff7fa..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorEvaluator.h
+++ /dev/null
@@ -1,983 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H
-#define EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H
-
-namespace Eigen {
-
-/** \class TensorEvaluator
- * \ingroup CXX11_Tensor_Module
- *
- * \brief The tensor evaluator classes.
- *
- * These classes are responsible for the evaluation of the tensor expression.
- *
- * TODO: add support for more types of expressions, in particular expressions
- * leading to lvalues (slicing, reshaping, etc...)
- */
-
-// Generic evaluator
-template<typename Derived, typename Device>
-struct TensorEvaluator
-{
- typedef typename Derived::Index Index;
- typedef typename Derived::Scalar Scalar;
- typedef typename Derived::Scalar CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- typedef typename Derived::Dimensions Dimensions;
- typedef Derived XprType;
- static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
- typedef typename internal::traits<Derived>::template MakePointer<Scalar>::Type TensorPointerType;
- typedef StorageMemory<Scalar, Device> Storage;
- typedef typename Storage::Type EvaluatorPointerType;
-
- // NumDimensions is -1 for variable dim tensors
- static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
- internal::traits<Derived>::NumDimensions : 0;
-
- enum {
- IsAligned = Derived::IsAligned,
- PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
- BlockAccess = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
- PreferBlockAccess = false,
- Layout = Derived::Layout,
- CoordAccess = NumCoords > 0,
- RawAccess = true
- };
-
- typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc;
- typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
-
- typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumCoords,
- Layout, Index>
- TensorBlock;
- //===--------------------------------------------------------------------===//
-
- EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
- : m_data(device.get((const_cast<TensorPointerType>(m.data())))),
- m_dims(m.dimensions()),
- m_device(device)
- { }
-
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; }
-
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType dest) {
- if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization && dest) {
- m_device.memcpy((void*)(m_device.get(dest)), m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar));
- return false;
- }
- return true;
- }
-
-#ifdef EIGEN_USE_THREADS
- template <typename EvalSubExprsCallback>
- EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
- EvaluatorPointerType dest, EvalSubExprsCallback done) {
-    // TODO(ezhulenev): ThreadPoolDevice memcpy is a blocking operation.
- done(evalSubExprsIfNeeded(dest));
- }
-#endif // EIGEN_USE_THREADS
-
- EIGEN_STRONG_INLINE void cleanup() {}
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
- eigen_assert(m_data != NULL);
- return m_data[index];
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) {
- eigen_assert(m_data != NULL);
- return m_data[index];
- }
-
- template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- PacketReturnType packet(Index index) const
- {
- return internal::ploadt<PacketReturnType, LoadMode>(m_data + index);
- }
-
-  // Return a packet starting at `index`, where `umask` specifies which
-  // elements have to be loaded. The type/size of the mask depends on
-  // PacketReturnType; e.g. for Packet16f, `umask` is of type uint16_t and if
-  // a bit is 1 the corresponding float element is loaded, otherwise 0 is
-  // loaded. The function is templatized to enable SFINAE.
- template <typename PacketReturnTypeT> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- typename internal::enable_if<internal::unpacket_traits<PacketReturnTypeT>::masked_load_available, PacketReturnTypeT>::type
- partialPacket(Index index, typename internal::unpacket_traits<PacketReturnTypeT>::mask_t umask) const
- {
- return internal::ploadu<PacketReturnTypeT>(m_data + index, umask);
- }
-
- template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- void writePacket(Index index, const PacketReturnType& x)
- {
- return internal::pstoret<Scalar, PacketReturnType, StoreMode>(m_data + index, x);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<DenseIndex, NumCoords>& coords) const {
- eigen_assert(m_data != NULL);
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- return m_data[m_dims.IndexOfColMajor(coords)];
- } else {
- return m_data[m_dims.IndexOfRowMajor(coords)];
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType&
- coeffRef(const array<DenseIndex, NumCoords>& coords) {
- eigen_assert(m_data != NULL);
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- return m_data[m_dims.IndexOfColMajor(coords)];
- } else {
- return m_data[m_dims.IndexOfRowMajor(coords)];
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
- return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
- PacketType<CoeffReturnType, Device>::size);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- internal::TensorBlockResourceRequirements getResourceRequirements() const {
- return internal::TensorBlockResourceRequirements::any();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
- block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
- bool /*root_of_expr_ast*/ = false) const {
- assert(m_data != NULL);
- return TensorBlock::materialize(m_data, m_dims, desc, scratch);
- }
-
- template<typename TensorBlock>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
- const TensorBlockDesc& desc, const TensorBlock& block) {
- assert(m_data != NULL);
-
- typedef typename TensorBlock::XprType TensorBlockExpr;
- typedef internal::TensorBlockAssignment<Scalar, NumCoords, TensorBlockExpr,
- Index>
- TensorBlockAssign;
-
- TensorBlockAssign::Run(
- TensorBlockAssign::target(desc.dimensions(),
- internal::strides<Layout>(m_dims), m_data,
- desc.offset()),
- block.expr());
- }
-
- EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
-
-#ifdef EIGEN_USE_SYCL
- // binding placeholder accessors to a command group handler for SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
- m_data.bind(cgh);
- }
-#endif
- protected:
- EvaluatorPointerType m_data;
- Dimensions m_dims;
- const Device EIGEN_DEVICE_REF m_device;
-};
-
-namespace {
-template <typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-T loadConstant(const T* address) {
- return *address;
-}
-// Use the texture cache on CUDA devices whenever possible
-#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
-template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-float loadConstant(const float* address) {
- return __ldg(address);
-}
-template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-double loadConstant(const double* address) {
- return __ldg(address);
-}
-template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-Eigen::half loadConstant(const Eigen::half* address) {
- return Eigen::half(half_impl::raw_uint16_to_half(__ldg(&address->x)));
-}
-#endif
-#ifdef EIGEN_USE_SYCL
-// Overload of loadConstant based on SYCL range accessors.
-template <cl::sycl::access::mode AcMd, typename T>
-T &loadConstant(const Eigen::TensorSycl::internal::RangeAccess<AcMd, T> &address) {
- return *address;
-}
-#endif
-}
-
-
-// Default evaluator for rvalues
-template<typename Derived, typename Device>
-struct TensorEvaluator<const Derived, Device>
-{
- typedef typename Derived::Index Index;
- typedef typename Derived::Scalar Scalar;
- typedef typename Derived::Scalar CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- typedef typename Derived::Dimensions Dimensions;
- typedef const Derived XprType;
- typedef typename internal::traits<Derived>::template MakePointer<const Scalar>::Type TensorPointerType;
- typedef StorageMemory<const Scalar, Device> Storage;
- typedef typename Storage::Type EvaluatorPointerType;
-
- typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
-
- // NumDimensions is -1 for variable dim tensors
- static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
- internal::traits<Derived>::NumDimensions : 0;
- static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
-
- enum {
- IsAligned = Derived::IsAligned,
- PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
- BlockAccess = internal::is_arithmetic<ScalarNoConst>::value,
- PreferBlockAccess = false,
- Layout = Derived::Layout,
- CoordAccess = NumCoords > 0,
- RawAccess = true
- };
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc;
- typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
-
- typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumCoords,
- Layout, Index>
- TensorBlock;
- //===--------------------------------------------------------------------===//
-
- EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
- : m_data(device.get(m.data())), m_dims(m.dimensions()), m_device(device)
- { }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; }
-
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
- if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization && data) {
- m_device.memcpy((void*)(m_device.get(data)),m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar));
- return false;
- }
- return true;
- }
-
-#ifdef EIGEN_USE_THREADS
- template <typename EvalSubExprsCallback>
- EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
- EvaluatorPointerType dest, EvalSubExprsCallback done) {
-    // TODO(ezhulenev): ThreadPoolDevice memcpy is a blocking operation.
- done(evalSubExprsIfNeeded(dest));
- }
-#endif // EIGEN_USE_THREADS
-
- EIGEN_STRONG_INLINE void cleanup() { }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
- eigen_assert(m_data != NULL);
- return loadConstant(m_data+index);
- }
-
- template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- PacketReturnType packet(Index index) const
- {
- return internal::ploadt_ro<PacketReturnType, LoadMode>(m_data + index);
- }
-
- // Return a packet starting at `index`, where `umask` specifies which elements
- // have to be loaded. The type/size of the mask depends on PacketReturnType;
- // e.g. for Packet16f, `umask` is of type uint16_t, and if a bit is 1 the
- // corresponding float element will be loaded, otherwise 0 will be loaded.
- // The function is templatized to enable SFINAE.
- template <typename PacketReturnTypeT> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- typename internal::enable_if<internal::unpacket_traits<PacketReturnTypeT>::masked_load_available, PacketReturnTypeT>::type
- partialPacket(Index index, typename internal::unpacket_traits<PacketReturnTypeT>::mask_t umask) const
- {
- return internal::ploadu<PacketReturnTypeT>(m_data + index, umask);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<DenseIndex, NumCoords>& coords) const {
- eigen_assert(m_data != NULL);
- const Index index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_dims.IndexOfColMajor(coords)
- : m_dims.IndexOfRowMajor(coords);
- return loadConstant(m_data+index);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
- return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
- PacketType<CoeffReturnType, Device>::size);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- internal::TensorBlockResourceRequirements getResourceRequirements() const {
- return internal::TensorBlockResourceRequirements::any();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
- block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
- bool /*root_of_expr_ast*/ = false) const {
- assert(m_data != NULL);
- return TensorBlock::materialize(m_data, m_dims, desc, scratch);
- }
-
- EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
-#ifdef EIGEN_USE_SYCL
- // binding placeholder accessors to a command group handler for SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
- m_data.bind(cgh);
- }
-#endif
- protected:
- EvaluatorPointerType m_data;
- Dimensions m_dims;
- const Device EIGEN_DEVICE_REF m_device;
-};
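As an aside on the `partialPacket` masking semantics documented above, here is a minimal caller-side sketch (not part of the original source), assuming an AVX-512 build where `PacketReturnType` is `Packet16f` and the mask type is `uint16_t`:

```cpp
#ifdef EIGEN_VECTORIZE_AVX512
#include <unsupported/Eigen/CXX11/Tensor>

// Hypothetical caller: load lanes 0..2 of a 16-float packet, zero-fill 3..15.
// Bit i of `umask` == 1 means lane i is read from memory, 0 means it is zeroed.
void masked_load_sketch() {
  Eigen::Tensor<float, 1> t(32);
  t.setConstant(1.0f);
  Eigen::DefaultDevice device;
  Eigen::TensorEvaluator<const Eigen::Tensor<float, 1>, Eigen::DefaultDevice>
      eval(t, device);
  auto p = eval.partialPacket<Eigen::internal::Packet16f>(
      /*index=*/0, /*umask=*/static_cast<uint16_t>(0x0007));
  (void)p;
}
#endif
```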
-
-
-
-
-// -------------------- CwiseNullaryOp --------------------
-
-template<typename NullaryOp, typename ArgType, typename Device>
-struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
-{
- typedef TensorCwiseNullaryOp<NullaryOp, ArgType> XprType;
-
- TensorEvaluator(const XprType& op, const Device& device)
- : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper()
- { }
-
- typedef typename XprType::Index Index;
- typedef typename XprType::Scalar Scalar;
- typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
- typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
- typedef StorageMemory<CoeffReturnType, Device> Storage;
- typedef typename Storage::Type EvaluatorPointerType;
-
- enum {
- IsAligned = true,
- PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess
- #ifdef EIGEN_USE_SYCL
- && (PacketType<CoeffReturnType, Device>::size >1)
- #endif
- ,
- BlockAccess = false,
- PreferBlockAccess = false,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = false
- };
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlock;
- //===--------------------------------------------------------------------===//
-
- EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
-
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { return true; }
-
-#ifdef EIGEN_USE_THREADS
- template <typename EvalSubExprsCallback>
- EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
- EvaluatorPointerType, EvalSubExprsCallback done) {
- done(true);
- }
-#endif // EIGEN_USE_THREADS
-
- EIGEN_STRONG_INLINE void cleanup() { }
-
- EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
- {
- return m_wrapper(m_functor, index);
- }
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
- {
- return m_wrapper.template packetOp<PacketReturnType, Index>(m_functor, index);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
- costPerCoeff(bool vectorized) const {
- return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
- PacketType<CoeffReturnType, Device>::size);
- }
-
- EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
-
-#ifdef EIGEN_USE_SYCL
- // binding placeholder accessors to a command group handler for SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
- m_argImpl.bind(cgh);
- }
-#endif
-
- private:
- const NullaryOp m_functor;
- TensorEvaluator<ArgType, Device> m_argImpl;
- const internal::nullary_wrapper<CoeffReturnType,NullaryOp> m_wrapper;
-};
-
-
-
-// -------------------- CwiseUnaryOp --------------------
-
-template<typename UnaryOp, typename ArgType, typename Device>
-struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
-{
- typedef TensorCwiseUnaryOp<UnaryOp, ArgType> XprType;
-
- enum {
- IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
- PacketAccess = int(TensorEvaluator<ArgType, Device>::PacketAccess) &
- int(internal::functor_traits<UnaryOp>::PacketAccess),
- BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
- PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = false
- };
-
- TensorEvaluator(const XprType& op, const Device& device)
- : m_device(device),
- m_functor(op.functor()),
- m_argImpl(op.nestedExpression(), device)
- { }
-
- typedef typename XprType::Index Index;
- typedef typename XprType::Scalar Scalar;
- typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
- typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
- typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
- typedef StorageMemory<CoeffReturnType, Device> Storage;
- typedef typename Storage::Type EvaluatorPointerType;
- static const int NumDims = internal::array_size<Dimensions>::value;
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
- typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
-
- typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
- ArgTensorBlock;
-
- typedef internal::TensorCwiseUnaryBlock<UnaryOp, ArgTensorBlock>
- TensorBlock;
- //===--------------------------------------------------------------------===//
-
- EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
-
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
- m_argImpl.evalSubExprsIfNeeded(NULL);
- return true;
- }
-
-#ifdef EIGEN_USE_THREADS
- template <typename EvalSubExprsCallback>
- EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
- EvaluatorPointerType, EvalSubExprsCallback done) {
- m_argImpl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
- }
-#endif // EIGEN_USE_THREADS
-
- EIGEN_STRONG_INLINE void cleanup() {
- m_argImpl.cleanup();
- }
-
- EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
- {
- return m_functor(m_argImpl.coeff(index));
- }
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
- {
- return m_functor.packetOp(m_argImpl.template packet<LoadMode>(index));
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
- const double functor_cost = internal::functor_traits<UnaryOp>::Cost;
- return m_argImpl.costPerCoeff(vectorized) +
- TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- internal::TensorBlockResourceRequirements getResourceRequirements() const {
- static const double functor_cost = internal::functor_traits<UnaryOp>::Cost;
- return m_argImpl.getResourceRequirements().addCostPerCoeff(
- {0, 0, functor_cost / PacketSize});
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
- block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
- bool /*root_of_expr_ast*/ = false) const {
- return TensorBlock(m_argImpl.block(desc, scratch), m_functor);
- }
-
- EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
-
-#ifdef EIGEN_USE_SYCL
- // binding placeholder accessors to a command group handler for SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const{
- m_argImpl.bind(cgh);
- }
-#endif
-
-
- private:
- const Device EIGEN_DEVICE_REF m_device;
- const UnaryOp m_functor;
- TensorEvaluator<ArgType, Device> m_argImpl;
-};
-
-
-// -------------------- CwiseBinaryOp --------------------
-
-template<typename BinaryOp, typename LeftArgType, typename RightArgType, typename Device>
-struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType>, Device>
-{
- typedef TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType> XprType;
-
- enum {
- IsAligned = int(TensorEvaluator<LeftArgType, Device>::IsAligned) &
- int(TensorEvaluator<RightArgType, Device>::IsAligned),
- PacketAccess = int(TensorEvaluator<LeftArgType, Device>::PacketAccess) &
- int(TensorEvaluator<RightArgType, Device>::PacketAccess) &
- int(internal::functor_traits<BinaryOp>::PacketAccess),
- BlockAccess = int(TensorEvaluator<LeftArgType, Device>::BlockAccess) &
- int(TensorEvaluator<RightArgType, Device>::BlockAccess),
- PreferBlockAccess = int(TensorEvaluator<LeftArgType, Device>::PreferBlockAccess) |
- int(TensorEvaluator<RightArgType, Device>::PreferBlockAccess),
- Layout = TensorEvaluator<LeftArgType, Device>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = false
- };
-
- TensorEvaluator(const XprType& op, const Device& device)
- : m_device(device),
- m_functor(op.functor()),
- m_leftImpl(op.lhsExpression(), device),
- m_rightImpl(op.rhsExpression(), device)
- {
- EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout) || internal::traits<XprType>::NumDimensions <= 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
- eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions()));
- }
-
- typedef typename XprType::Index Index;
- typedef typename XprType::Scalar Scalar;
- typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
- typedef typename TensorEvaluator<LeftArgType, Device>::Dimensions Dimensions;
- typedef StorageMemory<CoeffReturnType, Device> Storage;
- typedef typename Storage::Type EvaluatorPointerType;
-
- static const int NumDims = internal::array_size<
- typename TensorEvaluator<LeftArgType, Device>::Dimensions>::value;
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
- typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
-
- typedef typename TensorEvaluator<const LeftArgType, Device>::TensorBlock
- LeftTensorBlock;
- typedef typename TensorEvaluator<const RightArgType, Device>::TensorBlock
- RightTensorBlock;
-
- typedef internal::TensorCwiseBinaryBlock<BinaryOp, LeftTensorBlock,
- RightTensorBlock>
- TensorBlock;
- //===--------------------------------------------------------------------===//
-
- EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
- {
- // TODO: use right impl instead if right impl dimensions are known at compile time.
- return m_leftImpl.dimensions();
- }
-
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
- m_leftImpl.evalSubExprsIfNeeded(NULL);
- m_rightImpl.evalSubExprsIfNeeded(NULL);
- return true;
- }
-
-#ifdef EIGEN_USE_THREADS
- template <typename EvalSubExprsCallback>
- EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
- EvaluatorPointerType, EvalSubExprsCallback done) {
- // TODO(ezhulenev): Evaluate the two expressions in parallel?
- m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) {
- m_rightImpl.evalSubExprsIfNeededAsync(nullptr,
- [done](bool) { done(true); });
- });
- }
-#endif // EIGEN_USE_THREADS
-
- EIGEN_STRONG_INLINE void cleanup() {
- m_leftImpl.cleanup();
- m_rightImpl.cleanup();
- }
-
- EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
- {
- return m_functor(m_leftImpl.coeff(index), m_rightImpl.coeff(index));
- }
- template<int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
- {
- return m_functor.packetOp(m_leftImpl.template packet<LoadMode>(index), m_rightImpl.template packet<LoadMode>(index));
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
- costPerCoeff(bool vectorized) const {
- const double functor_cost = internal::functor_traits<BinaryOp>::Cost;
- return m_leftImpl.costPerCoeff(vectorized) +
- m_rightImpl.costPerCoeff(vectorized) +
- TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- internal::TensorBlockResourceRequirements getResourceRequirements() const {
- static const double functor_cost = internal::functor_traits<BinaryOp>::Cost;
- return internal::TensorBlockResourceRequirements::merge(
- m_leftImpl.getResourceRequirements(),
- m_rightImpl.getResourceRequirements())
- .addCostPerCoeff({0, 0, functor_cost / PacketSize});
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
- block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
- bool /*root_of_expr_ast*/ = false) const {
- desc.DropDestinationBuffer();
- return TensorBlock(m_leftImpl.block(desc, scratch),
- m_rightImpl.block(desc, scratch), m_functor);
- }
-
- EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
-
- #ifdef EIGEN_USE_SYCL
- // binding placeholder accessors to a command group handler for SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
- m_leftImpl.bind(cgh);
- m_rightImpl.bind(cgh);
- }
- #endif
- private:
- const Device EIGEN_DEVICE_REF m_device;
- const BinaryOp m_functor;
- TensorEvaluator<LeftArgType, Device> m_leftImpl;
- TensorEvaluator<RightArgType, Device> m_rightImpl;
-};
-
-// -------------------- CwiseTernaryOp --------------------
-
-template<typename TernaryOp, typename Arg1Type, typename Arg2Type, typename Arg3Type, typename Device>
-struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, Arg3Type>, Device>
-{
- typedef TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, Arg3Type> XprType;
-
- enum {
- IsAligned = TensorEvaluator<Arg1Type, Device>::IsAligned & TensorEvaluator<Arg2Type, Device>::IsAligned & TensorEvaluator<Arg3Type, Device>::IsAligned,
- PacketAccess = TensorEvaluator<Arg1Type, Device>::PacketAccess &&
- TensorEvaluator<Arg2Type, Device>::PacketAccess &&
- TensorEvaluator<Arg3Type, Device>::PacketAccess &&
- internal::functor_traits<TernaryOp>::PacketAccess,
- BlockAccess = false,
- PreferBlockAccess = TensorEvaluator<Arg1Type, Device>::PreferBlockAccess ||
- TensorEvaluator<Arg2Type, Device>::PreferBlockAccess ||
- TensorEvaluator<Arg3Type, Device>::PreferBlockAccess,
- Layout = TensorEvaluator<Arg1Type, Device>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = false
- };
-
- TensorEvaluator(const XprType& op, const Device& device)
- : m_functor(op.functor()),
- m_arg1Impl(op.arg1Expression(), device),
- m_arg2Impl(op.arg2Expression(), device),
- m_arg3Impl(op.arg3Expression(), device)
- {
- EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<Arg1Type, Device>::Layout) == static_cast<int>(TensorEvaluator<Arg3Type, Device>::Layout) || internal::traits<XprType>::NumDimensions <= 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
-
- EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::StorageKind,
- typename internal::traits<Arg2Type>::StorageKind>::value),
- STORAGE_KIND_MUST_MATCH)
- EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::StorageKind,
- typename internal::traits<Arg3Type>::StorageKind>::value),
- STORAGE_KIND_MUST_MATCH)
- EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::Index,
- typename internal::traits<Arg2Type>::Index>::value),
- STORAGE_INDEX_MUST_MATCH)
- EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::Index,
- typename internal::traits<Arg3Type>::Index>::value),
- STORAGE_INDEX_MUST_MATCH)
-
- eigen_assert(dimensions_match(m_arg1Impl.dimensions(), m_arg2Impl.dimensions()) && dimensions_match(m_arg1Impl.dimensions(), m_arg3Impl.dimensions()));
- }
-
- typedef typename XprType::Index Index;
- typedef typename XprType::Scalar Scalar;
- typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
- typedef typename TensorEvaluator<Arg1Type, Device>::Dimensions Dimensions;
- typedef StorageMemory<CoeffReturnType, Device> Storage;
- typedef typename Storage::Type EvaluatorPointerType;
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlock;
- //===--------------------------------------------------------------------===//
-
- EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
- {
- // TODO: use arg2 or arg3 dimensions if they are known at compile time.
- return m_arg1Impl.dimensions();
- }
-
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
- m_arg1Impl.evalSubExprsIfNeeded(NULL);
- m_arg2Impl.evalSubExprsIfNeeded(NULL);
- m_arg3Impl.evalSubExprsIfNeeded(NULL);
- return true;
- }
- EIGEN_STRONG_INLINE void cleanup() {
- m_arg1Impl.cleanup();
- m_arg2Impl.cleanup();
- m_arg3Impl.cleanup();
- }
-
- EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
- {
- return m_functor(m_arg1Impl.coeff(index), m_arg2Impl.coeff(index), m_arg3Impl.coeff(index));
- }
- template<int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
- {
- return m_functor.packetOp(m_arg1Impl.template packet<LoadMode>(index),
- m_arg2Impl.template packet<LoadMode>(index),
- m_arg3Impl.template packet<LoadMode>(index));
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
- costPerCoeff(bool vectorized) const {
- const double functor_cost = internal::functor_traits<TernaryOp>::Cost;
- return m_arg1Impl.costPerCoeff(vectorized) +
- m_arg2Impl.costPerCoeff(vectorized) +
- m_arg3Impl.costPerCoeff(vectorized) +
- TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
- }
-
- EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
-
-#ifdef EIGEN_USE_SYCL
- // binding placeholder accessors to a command group handler for SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
- m_arg1Impl.bind(cgh);
- m_arg2Impl.bind(cgh);
- m_arg3Impl.bind(cgh);
- }
-#endif
-
- private:
- const TernaryOp m_functor;
- TensorEvaluator<Arg1Type, Device> m_arg1Impl;
- TensorEvaluator<Arg2Type, Device> m_arg2Impl;
- TensorEvaluator<Arg3Type, Device> m_arg3Impl;
-};
-
-
-// -------------------- SelectOp --------------------
-
-template<typename IfArgType, typename ThenArgType, typename ElseArgType, typename Device>
-struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>, Device>
-{
- typedef TensorSelectOp<IfArgType, ThenArgType, ElseArgType> XprType;
- typedef typename XprType::Scalar Scalar;
-
- enum {
- IsAligned = TensorEvaluator<ThenArgType, Device>::IsAligned &
- TensorEvaluator<ElseArgType, Device>::IsAligned,
- PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess &
- TensorEvaluator<ElseArgType, Device>::PacketAccess &
- PacketType<Scalar, Device>::HasBlend,
- BlockAccess = TensorEvaluator<IfArgType, Device>::BlockAccess &&
- TensorEvaluator<ThenArgType, Device>::BlockAccess &&
- TensorEvaluator<ElseArgType, Device>::BlockAccess,
- PreferBlockAccess = TensorEvaluator<IfArgType, Device>::PreferBlockAccess ||
- TensorEvaluator<ThenArgType, Device>::PreferBlockAccess ||
- TensorEvaluator<ElseArgType, Device>::PreferBlockAccess,
- Layout = TensorEvaluator<IfArgType, Device>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = false
- };
-
- TensorEvaluator(const XprType& op, const Device& device)
- : m_condImpl(op.ifExpression(), device),
- m_thenImpl(op.thenExpression(), device),
- m_elseImpl(op.elseExpression(), device)
- {
- EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<IfArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<ThenArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
- EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<IfArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<ElseArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
- eigen_assert(dimensions_match(m_condImpl.dimensions(), m_thenImpl.dimensions()));
- eigen_assert(dimensions_match(m_thenImpl.dimensions(), m_elseImpl.dimensions()));
- }
-
- typedef typename XprType::Index Index;
- typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
- typedef typename TensorEvaluator<IfArgType, Device>::Dimensions Dimensions;
- typedef StorageMemory<CoeffReturnType, Device> Storage;
- typedef typename Storage::Type EvaluatorPointerType;
-
- static const int NumDims = internal::array_size<Dimensions>::value;
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
- typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
-
- typedef typename TensorEvaluator<const IfArgType, Device>::TensorBlock
- IfArgTensorBlock;
- typedef typename TensorEvaluator<const ThenArgType, Device>::TensorBlock
- ThenArgTensorBlock;
- typedef typename TensorEvaluator<const ElseArgType, Device>::TensorBlock
- ElseArgTensorBlock;
-
- struct TensorSelectOpBlockFactory {
- template <typename IfArgXprType, typename ThenArgXprType, typename ElseArgXprType>
- struct XprType {
- typedef TensorSelectOp<const IfArgXprType, const ThenArgXprType, const ElseArgXprType> type;
- };
-
- template <typename IfArgXprType, typename ThenArgXprType, typename ElseArgXprType>
- typename XprType<IfArgXprType, ThenArgXprType, ElseArgXprType>::type expr(
- const IfArgXprType& if_expr, const ThenArgXprType& then_expr, const ElseArgXprType& else_expr) const {
- return typename XprType<IfArgXprType, ThenArgXprType, ElseArgXprType>::type(if_expr, then_expr, else_expr);
- }
- };
-
- typedef internal::TensorTernaryExprBlock<TensorSelectOpBlockFactory,
- IfArgTensorBlock, ThenArgTensorBlock,
- ElseArgTensorBlock>
- TensorBlock;
- //===--------------------------------------------------------------------===//
-
- EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
- {
- // TODO: use then or else impl instead if they happen to be known at compile time.
- return m_condImpl.dimensions();
- }
-
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
- m_condImpl.evalSubExprsIfNeeded(NULL);
- m_thenImpl.evalSubExprsIfNeeded(NULL);
- m_elseImpl.evalSubExprsIfNeeded(NULL);
- return true;
- }
-
-#ifdef EIGEN_USE_THREADS
- template <typename EvalSubExprsCallback>
- EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
- EvaluatorPointerType, EvalSubExprsCallback done) {
- m_condImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) {
- m_thenImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) {
- m_elseImpl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
- });
- });
- }
-#endif // EIGEN_USE_THREADS
-
- EIGEN_STRONG_INLINE void cleanup() {
- m_condImpl.cleanup();
- m_thenImpl.cleanup();
- m_elseImpl.cleanup();
- }
-
- EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
- {
- return m_condImpl.coeff(index) ? m_thenImpl.coeff(index) : m_elseImpl.coeff(index);
- }
- template<int LoadMode>
- EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
- {
- internal::Selector<PacketSize> select;
- EIGEN_UNROLL_LOOP
- for (Index i = 0; i < PacketSize; ++i) {
- select.select[i] = m_condImpl.coeff(index+i);
- }
- return internal::pblend(select,
- m_thenImpl.template packet<LoadMode>(index),
- m_elseImpl.template packet<LoadMode>(index));
-
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
- costPerCoeff(bool vectorized) const {
- return m_condImpl.costPerCoeff(vectorized) +
- m_thenImpl.costPerCoeff(vectorized)
- .cwiseMax(m_elseImpl.costPerCoeff(vectorized));
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- internal::TensorBlockResourceRequirements getResourceRequirements() const {
- auto then_req = m_thenImpl.getResourceRequirements();
- auto else_req = m_elseImpl.getResourceRequirements();
-
- auto merged_req =
- internal::TensorBlockResourceRequirements::merge(then_req, else_req);
- merged_req.cost_per_coeff =
- then_req.cost_per_coeff.cwiseMax(else_req.cost_per_coeff);
-
- return internal::TensorBlockResourceRequirements::merge(
- m_condImpl.getResourceRequirements(), merged_req);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
- block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
- bool /*root_of_expr_ast*/ = false) const {
- // It's unsafe to pass destination buffer to underlying expressions, because
- // output might be aliased with one of the inputs.
- desc.DropDestinationBuffer();
-
- return TensorBlock(
- m_condImpl.block(desc, scratch), m_thenImpl.block(desc, scratch),
- m_elseImpl.block(desc, scratch), TensorSelectOpBlockFactory());
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; }
-
-#ifdef EIGEN_USE_SYCL
- // binding placeholder accessors to a command group handler for SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
- m_condImpl.bind(cgh);
- m_thenImpl.bind(cgh);
- m_elseImpl.bind(cgh);
- }
-#endif
- private:
- TensorEvaluator<IfArgType, Device> m_condImpl;
- TensorEvaluator<ThenArgType, Device> m_thenImpl;
- TensorEvaluator<ElseArgType, Device> m_elseImpl;
-};
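The packet path above selects per lane via `internal::pblend`; the sketch below (added for illustration, assuming an SSE build where `Packet4f` is available) shows the same per-lane choice in isolation:

```cpp
#ifdef EIGEN_VECTORIZE_SSE
#include <Eigen/Core>

// Per-lane select: result[i] = sel.select[i] ? t[i] : e[i], branchlessly.
Eigen::internal::Packet4f blend_sketch() {
  Eigen::internal::Selector<4> sel;
  sel.select[0] = true;  sel.select[1] = false;
  sel.select[2] = true;  sel.select[3] = false;
  const auto t = Eigen::internal::pset1<Eigen::internal::Packet4f>(1.0f);
  const auto e = Eigen::internal::pset1<Eigen::internal::Packet4f>(0.0f);
  return Eigen::internal::pblend(sel, t, e);  // lanes {1, 0, 1, 0}
}
#endif
```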
-
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorExecutor.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorExecutor.h
deleted file mode 100644
index c52fb77..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorExecutor.h
+++ /dev/null
@@ -1,703 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
-#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
-
-namespace Eigen {
-
-/**
- * \class TensorExecutor
- * \ingroup CXX11_Tensor_Module
- *
- * \brief The tensor executor class.
- *
- * This class is responsible for launching the evaluation of the expression on
- * the specified computing device.
- *
- * @tparam Vectorizable can use packet math (SSE/AVX/etc... registers and
- * instructions)
- * @tparam Tiling can use block based tensor evaluation
- * (see TensorBlock.h)
- */
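A brief usage sketch (added; not in the original header): the executor is rarely called directly. It is driven by tensor assignment, and the `Device` argument selects one of the specializations defined below.

```cpp
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> a(256, 256), b(256, 256);
  a.setRandom();

  // Evaluated by the single-threaded DefaultDevice executor.
  b = (a + a.constant(1.0f)).sqrt();

  // With EIGEN_USE_THREADS defined, .device(...) routes the same expression
  // through the ThreadPoolDevice executor instead:
  //   Eigen::ThreadPool pool(4);
  //   Eigen::ThreadPoolDevice dev(&pool, 4);
  //   b.device(dev) = (a + a.constant(1.0f)).sqrt();
  return 0;
}
```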
-namespace internal {
-
-/**
- * Evaluating TensorBroadcastingOp via the coefficient or packet path is
- * extremely expensive. If an expression has at least one broadcast op in it,
- * and it supports block based evaluation, we always prefer block evaluation,
- * even for small tensors. For all other tileable ops, the block evaluation
- * overhead for small tensors (those that fit into L1) is too large, and we
- * fall back to vectorized evaluation.
- */
-
-// TODO(ezhulenev): Add specializations for all other types of Tensor ops.
-
-template<typename Expression>
-struct ExpressionHasTensorBroadcastingOp {
- enum { value = false };
-};
-
-template<typename LhsXprType, typename RhsXprType>
-struct ExpressionHasTensorBroadcastingOp<
- const TensorAssignOp<LhsXprType, RhsXprType> > {
- enum { value = ExpressionHasTensorBroadcastingOp<RhsXprType>::value };
-};
-
-template<typename UnaryOp, typename XprType>
-struct ExpressionHasTensorBroadcastingOp<
- const TensorCwiseUnaryOp<UnaryOp, XprType> > {
- enum { value = ExpressionHasTensorBroadcastingOp<XprType>::value };
-};
-
-template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
-struct ExpressionHasTensorBroadcastingOp<
- const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> > {
- enum {
- value = ExpressionHasTensorBroadcastingOp<LhsXprType>::value ||
- ExpressionHasTensorBroadcastingOp<RhsXprType>::value
- };
-};
-
-template<typename Broadcast, typename XprType>
-struct ExpressionHasTensorBroadcastingOp<
- const TensorBroadcastingOp<Broadcast, XprType> > {
- enum { value = true };
-};
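A small sketch (added) of the trait's structural recursion. The broadcast expression type spelled out here is illustrative; in real code these types are produced by `.broadcast(...)` and the expression operators:

```cpp
using T2 = Eigen::Tensor<float, 2>;
using Bcast =
    Eigen::TensorBroadcastingOp<const Eigen::array<Eigen::Index, 2>, const T2>;

// A plain tensor contains no broadcast op; a broadcast node (or any
// expression with one anywhere in a subtree) reports true.
static_assert(
    !Eigen::internal::ExpressionHasTensorBroadcastingOp<const T2>::value, "");
static_assert(
    Eigen::internal::ExpressionHasTensorBroadcastingOp<const Bcast>::value, "");
```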
-
-// -------------------------------------------------------------------------- //
-
-/**
- * Default strategy: the expression is evaluated sequentially with a single cpu
- * thread, without vectorization or block evaluation.
- */
-template <typename Expression, typename Device, bool Vectorizable,
- TiledEvaluation Tiling>
-class TensorExecutor {
- public:
- typedef typename Expression::Index StorageIndex;
-
- // Including `unsupported/Eigen/CXX11/Tensor` in different translation units
- // with/without `EIGEN_USE_THREADS` or `EIGEN_USE_GPU` is a potential ODR
- // violation. If this template is instantiated with a non-default device, it
- // means that this header file was included without defining
- // `EIGEN_USE_THREADS`, `EIGEN_USE_GPU` or `EIGEN_USE_SYCL`.
- static_assert(std::is_same<Device, DefaultDevice>::value,
- "Default executor instantiated with non-default device. "
- "You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or "
- "EIGEN_USE_SYCL before including Eigen headers.");
-
- EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE void run(const Expression& expr,
- const Device& device = Device()) {
- TensorEvaluator<Expression, Device> evaluator(expr, device);
- const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
- if (needs_assign) {
- const StorageIndex size = array_prod(evaluator.dimensions());
- for (StorageIndex i = 0; i < size; ++i) {
- evaluator.evalScalar(i);
- }
- }
- evaluator.cleanup();
- }
-};
-
-/**
- * Default async execution strategy is not implemented. Currently it's only
- * available for ThreadPoolDevice (see definition below).
- */
-template <typename Expression, typename Device, typename DoneCallback,
- bool Vectorizable, TiledEvaluation Tiling>
-class TensorAsyncExecutor {};
-
-/**
- * Process all the data with a single cpu thread, using vectorized instructions.
- */
-template <typename Expression>
-class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
- /*Tiling=*/TiledEvaluation::Off> {
- public:
- typedef typename Expression::Index StorageIndex;
-
- EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE void run(
- const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
- TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
- const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
- if (needs_assign) {
- const StorageIndex size = array_prod(evaluator.dimensions());
- const int PacketSize = unpacket_traits<typename TensorEvaluator<
- Expression, DefaultDevice>::PacketReturnType>::size;
-
- // Give the compiler a strong opportunity to unroll the loop, but don't
- // insist on unrolling: if the function is expensive, the compiler should
- // not unroll the loop at the expense of inlining.
- const StorageIndex UnrolledSize =
- (size / (4 * PacketSize)) * 4 * PacketSize;
- for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) {
- for (StorageIndex j = 0; j < 4; j++) {
- evaluator.evalPacket(i + j * PacketSize);
- }
- }
- const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize;
- for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
- evaluator.evalPacket(i);
- }
- for (StorageIndex i = VectorizedSize; i < size; ++i) {
- evaluator.evalScalar(i);
- }
- }
- evaluator.cleanup();
- }
-};
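A worked example of the loop bounds above (added; `PacketSize == 8` and `size == 1003` are assumed values):

```cpp
constexpr long PacketSize = 8, size = 1003;  // assumed for illustration
constexpr long UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
constexpr long VectorizedSize = (size / PacketSize) * PacketSize;
// 4x-unrolled packet loop covers 0..991, single packets cover 992..999,
// and the scalar tail handles indices 1000..1002.
static_assert(UnrolledSize == 992 && VectorizedSize == 1000, "");
```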
-
-/**
- * Process all the data with a single cpu thread, using blocks of data. By
- * sizing a block to fit L1 cache we get better cache performance.
- */
-template <typename Expression, bool Vectorizable>
-class TensorExecutor<Expression, DefaultDevice, Vectorizable,
- /*Tiling=*/TiledEvaluation::On> {
- public:
- typedef typename traits<Expression>::Scalar Scalar;
- typedef typename remove_const<Scalar>::type ScalarNoConst;
-
- typedef TensorEvaluator<Expression, DefaultDevice> Evaluator;
- typedef typename traits<Expression>::Index StorageIndex;
-
- static const int NumDims = traits<Expression>::NumDimensions;
-
- EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE void run(const Expression& expr,
- const DefaultDevice& device = DefaultDevice()) {
- typedef TensorBlockMapper<NumDims, Evaluator::Layout, StorageIndex>
- TensorBlockMapper;
-
- typedef internal::TensorBlockDescriptor<NumDims, StorageIndex>
- TensorBlockDesc;
- typedef internal::TensorBlockScratchAllocator<DefaultDevice>
- TensorBlockScratch;
-
- Evaluator evaluator(expr, device);
-
- // TODO(ezhulenev): Do not use tiling for small tensors?
- const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
-
- if (needs_assign) {
- // Query expression tree for desired block size/shape.
- const TensorBlockResourceRequirements requirements =
- evaluator.getResourceRequirements();
-
- const TensorBlockMapper block_mapper(
- typename TensorBlockDesc::Dimensions(evaluator.dimensions()),
- requirements);
-
- // Share scratch memory allocator between all blocks.
- TensorBlockScratch scratch(device);
-
- const StorageIndex total_block_count = block_mapper.blockCount();
- for (StorageIndex i = 0; i < total_block_count; ++i) {
- TensorBlockDesc desc = block_mapper.blockDescriptor(i);
- evaluator.evalBlock(desc, scratch);
- scratch.reset();
- }
- }
- evaluator.cleanup();
- }
-};
-
-/**
- * Multicore strategy: the index space is partitioned and each partition is
- * executed on a single core.
- *
- * (1) TensorExecutor will submit work to the ThreadPoolDevice-managed thread
- * pool, and will block the caller thread until all tasks are finished.
- *
- * (2) TensorAsyncExecutor is a non-blocking version that will submit work to
- * the ThreadPoolDevice-managed thread pool, and will return immediately. It
- * will call the 'done' callback after all tasks are finished.
- */
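A hedged sketch (added) of the non-blocking entry point; `Expr` and `expr` are placeholders for a concrete assignment expression, not names from this file:

```cpp
#include <utility>

// Assumes EIGEN_USE_THREADS is defined and `expr` is a tensor assignment
// expression of type Expr.
template <typename Expr>
void run_async_sketch(const Expr& expr, const Eigen::ThreadPoolDevice& device) {
  auto done = [] { /* runs after all tasks finish */ };
  Eigen::internal::TensorAsyncExecutor<
      const Expr, Eigen::ThreadPoolDevice, decltype(done),
      /*Vectorizable=*/true, Eigen::internal::TiledEvaluation::Off>::
      runAsync(expr, device, std::move(done));
}
```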
-#ifdef EIGEN_USE_THREADS
-
-template <typename TensorBlockMapper>
-struct TensorExecutorTilingContext {
- TensorExecutorTilingContext() = default;
- TensorExecutorTilingContext(const TensorBlockMapper& b_mapper,
- const TensorOpCost& b_cost, size_t b_aligned_size)
- : block_mapper(b_mapper),
- cost(b_cost),
- aligned_blocksize(b_aligned_size) {}
-
- TensorBlockMapper block_mapper; // navigate through blocks
- TensorOpCost cost; // cost of computing a single block
- size_t aligned_blocksize; // block size after memory alignment
-};
-
-// Computes block evaluation parameters and allocates a temporary memory
-// buffer for blocks. See TensorExecutor/TensorAsyncExecutor (Tiling=On) below.
-template <typename Evaluator, typename TensorBlockMapper, bool Vectorizable>
-TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
- const Evaluator& evaluator) {
- // Query expression tree for desired block size/shape.
- TensorBlockResourceRequirements requirements =
- evaluator.getResourceRequirements();
-
- // Update target block size based on cost model.
- double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(
- 1, requirements.cost_per_coeff);
- requirements.size = static_cast<size_t>(1.0 / taskSize);
-
- TensorBlockMapper block_mapper(
- typename TensorBlockMapper::Dimensions(evaluator.dimensions()),
- requirements);
-
- size_t block_size = block_mapper.blockTotalSize();
- const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
- const size_t aligned_blocksize =
- align *
- divup<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);
-
- return {block_mapper, requirements.cost_per_coeff * block_size,
- aligned_blocksize};
-}
-
-template <typename Evaluator, typename StorageIndex, bool Vectorizable>
-struct EvalRange {
- static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
- const StorageIndex lastIdx) {
- Evaluator evaluator = *evaluator_in;
- eigen_assert(lastIdx >= firstIdx);
- for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
- evaluator.evalScalar(i);
- }
- }
-
- static StorageIndex alignBlockSize(StorageIndex size) { return size; }
-};
-
-template <typename Evaluator, typename StorageIndex>
-struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> {
- static const int PacketSize =
- unpacket_traits<typename Evaluator::PacketReturnType>::size;
-
- static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
- const StorageIndex lastIdx) {
- Evaluator evaluator = *evaluator_in;
- eigen_assert(lastIdx >= firstIdx);
- StorageIndex i = firstIdx;
- if (lastIdx - firstIdx >= PacketSize) {
- eigen_assert(firstIdx % PacketSize == 0);
- StorageIndex last_chunk_offset = lastIdx - 4 * PacketSize;
- // Give the compiler a strong opportunity to unroll the loop, but don't
- // insist on unrolling: if the function is expensive, the compiler should
- // not unroll the loop at the expense of inlining.
- for (; i <= last_chunk_offset; i += 4 * PacketSize) {
- for (StorageIndex j = 0; j < 4; j++) {
- evaluator.evalPacket(i + j * PacketSize);
- }
- }
- last_chunk_offset = lastIdx - PacketSize;
- for (; i <= last_chunk_offset; i += PacketSize) {
- evaluator.evalPacket(i);
- }
- }
- for (; i < lastIdx; ++i) {
- evaluator.evalScalar(i);
- }
- }
-
- static StorageIndex alignBlockSize(StorageIndex size) {
- // Align block size to packet size and account for unrolling in run above.
- if (size >= 16 * PacketSize) {
- return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
- }
- // Aligning to 4 * PacketSize would increase block size by more than 25%.
- return (size + PacketSize - 1) & ~(PacketSize - 1);
- }
-};
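A worked example (added; `PacketSize == 8` assumed) of the round-up arithmetic in `alignBlockSize` above. Both branches use the `(x + m - 1) & ~(m - 1)` idiom, which rounds up to a multiple of `m` when `m` is a power of two:

```cpp
constexpr long PacketSize = 8;  // assumed
// size >= 16 * PacketSize: round up to a multiple of 4 * PacketSize (32).
static_assert(((200 + 4 * PacketSize - 1) & ~(4 * PacketSize - 1)) == 224, "");
// size <  16 * PacketSize: round up to a multiple of PacketSize only (8).
static_assert(((100 + PacketSize - 1) & ~(PacketSize - 1)) == 104, "");
```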
-
-template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
-class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tiling> {
- public:
- typedef typename Expression::Index StorageIndex;
-
- static EIGEN_STRONG_INLINE void run(const Expression& expr,
- const ThreadPoolDevice& device) {
- typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
- typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
-
- Evaluator evaluator(expr, device);
- const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
- if (needs_assign) {
- const StorageIndex size = array_prod(evaluator.dimensions());
- device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
- EvalRange::alignBlockSize,
- [&evaluator](StorageIndex firstIdx, StorageIndex lastIdx) {
- EvalRange::run(&evaluator, firstIdx, lastIdx);
- });
- }
- evaluator.cleanup();
- }
-};
-
-template <typename Expression, bool Vectorizable>
-class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
- /*Tiling=*/TiledEvaluation::On> {
- public:
- typedef typename traits<Expression>::Index IndexType;
- typedef typename traits<Expression>::Scalar Scalar;
- typedef typename remove_const<Scalar>::type ScalarNoConst;
-
- static const int NumDims = traits<Expression>::NumDimensions;
-
- typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
- typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
- typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
-
- typedef internal::TensorBlockDescriptor<NumDims, IndexType>
- TensorBlockDesc;
- typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
- TensorBlockScratch;
-
- static EIGEN_STRONG_INLINE void run(const Expression& expr,
- const ThreadPoolDevice& device) {
- Evaluator evaluator(expr, device);
-
- const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
- if (needs_assign) {
- const TilingContext tiling =
- internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper,
- Vectorizable>(evaluator);
-
- auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx,
- IndexType lastBlockIdx) {
- TensorBlockScratch scratch(device);
-
- for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
- ++block_idx) {
- TensorBlockDesc desc = tiling.block_mapper.blockDescriptor(block_idx);
- evaluator.evalBlock(desc, scratch);
- scratch.reset();
- }
- };
-
- // Evaluate small expressions directly as a single block.
- if (tiling.block_mapper.blockCount() == 1) {
- TensorBlockScratch scratch(device);
- TensorBlockDesc desc(0, tiling.block_mapper.blockDimensions());
- evaluator.evalBlock(desc, scratch);
- } else {
- device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost,
- eval_block);
- }
- }
- evaluator.cleanup();
- }
-};
-
-template <typename Expression, typename DoneCallback, bool Vectorizable,
- TiledEvaluation Tiling>
-class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
- Vectorizable, Tiling> {
- public:
- typedef typename Expression::Index StorageIndex;
- typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
-
- static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
- const ThreadPoolDevice& device,
- DoneCallback done) {
- TensorAsyncExecutorContext* const ctx =
- new TensorAsyncExecutorContext(expr, device, std::move(done));
-
- const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void {
- if (!need_assign) {
- delete ctx;
- return;
- }
-
- typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
- const StorageIndex size = array_prod(ctx->evaluator.dimensions());
- device.parallelForAsync(
- size, ctx->evaluator.costPerCoeff(Vectorizable),
- EvalRange::alignBlockSize,
- [ctx](StorageIndex firstIdx, StorageIndex lastIdx) {
- EvalRange::run(&ctx->evaluator, firstIdx, lastIdx);
- },
- [ctx]() { delete ctx; });
- };
-
- ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
- }
-
- private:
- struct TensorAsyncExecutorContext {
- TensorAsyncExecutorContext(const Expression& expr,
- const ThreadPoolDevice& thread_pool,
- DoneCallback done)
- : evaluator(expr, thread_pool), on_done(std::move(done)) {}
-
- ~TensorAsyncExecutorContext() {
- evaluator.cleanup();
- on_done();
- }
-
- Evaluator evaluator;
-
- private:
- DoneCallback on_done;
- };
-};
-
-template <typename Expression, typename DoneCallback, bool Vectorizable>
-class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
- Vectorizable, /*Tileable*/ TiledEvaluation::On> {
- public:
- typedef typename traits<Expression>::Index IndexType;
- typedef typename traits<Expression>::Scalar Scalar;
- typedef typename remove_const<Scalar>::type ScalarNoConst;
-
- static const int NumDims = traits<Expression>::NumDimensions;
-
- typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
- typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
- typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
-
- typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
- typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
- TensorBlockScratch;
-
- static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
- const ThreadPoolDevice& device,
- DoneCallback done) {
-
- TensorAsyncExecutorContext* const ctx =
- new TensorAsyncExecutorContext(expr, device, std::move(done));
-
- const auto on_eval_subexprs = [ctx](bool need_assign) -> void {
- if (!need_assign) {
- delete ctx;
- return;
- }
-
- ctx->tiling = internal::GetTensorExecutorTilingContext<
- Evaluator, BlockMapper, Vectorizable>(ctx->evaluator);
-
- auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) {
- TensorBlockScratch scratch(ctx->device);
-
- for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
- ++block_idx) {
- TensorBlockDesc desc =
- ctx->tiling.block_mapper.blockDescriptor(block_idx);
- ctx->evaluator.evalBlock(desc, scratch);
- scratch.reset();
- }
- };
-
- // Evaluate small expressions directly as a single block.
- if (ctx->tiling.block_mapper.blockCount() == 1) {
- TensorBlockScratch scratch(ctx->device);
- TensorBlockDesc desc(0, ctx->tiling.block_mapper.blockDimensions());
- ctx->evaluator.evalBlock(desc, scratch);
- delete ctx;
- } else {
- ctx->device.parallelForAsync(ctx->tiling.block_mapper.blockCount(),
- ctx->tiling.cost, eval_block,
- [ctx]() { delete ctx; });
- }
- };
-
- ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
- }
-
- private:
- struct TensorAsyncExecutorContext {
- TensorAsyncExecutorContext(const Expression& expr,
- const ThreadPoolDevice& thread_pool,
- DoneCallback done)
- : device(thread_pool),
- evaluator(expr, thread_pool),
- on_done(std::move(done)) {}
-
- ~TensorAsyncExecutorContext() {
- evaluator.cleanup();
- on_done();
- }
-
- const ThreadPoolDevice& device;
- Evaluator evaluator;
- TilingContext tiling;
-
- private:
- DoneCallback on_done;
- };
-};
-
-#endif // EIGEN_USE_THREADS
-
-// GPU: the evaluation of the expression is offloaded to a GPU.
-#if defined(EIGEN_USE_GPU)
-
-template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
-class TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling> {
- public:
- typedef typename Expression::Index StorageIndex;
- static void run(const Expression& expr, const GpuDevice& device);
-};
-
-#if defined(EIGEN_GPUCC)
-template <typename Evaluator, typename StorageIndex, bool Vectorizable>
-struct EigenMetaKernelEval {
- static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
- void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
- for (StorageIndex i = firstIdx; i < lastIdx; i += step_size) {
- eval.evalScalar(i);
- }
- }
-};
-
-template <typename Evaluator, typename StorageIndex>
-struct EigenMetaKernelEval<Evaluator, StorageIndex, true> {
- static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
- void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
- const StorageIndex PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
- const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize;
- const StorageIndex vectorized_step_size = step_size * PacketSize;
-
- // Use the vector path
- for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size;
- i += vectorized_step_size) {
- eval.evalPacket(i);
- }
- for (StorageIndex i = vectorized_size + firstIdx; i < lastIdx; i += step_size) {
- eval.evalScalar(i);
- }
- }
-};
-
-template <typename Evaluator, typename StorageIndex>
-__global__ void
-__launch_bounds__(1024)
-EigenMetaKernel(Evaluator eval, StorageIndex size) {
-
- const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x;
- const StorageIndex step_size = blockDim.x * gridDim.x;
-
- const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
- EigenMetaKernelEval<Evaluator, StorageIndex, vectorizable>::run(eval, first_index, size, step_size);
-}
-
-/*static*/
-template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
-EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling>::run(
- const Expression& expr, const GpuDevice& device) {
- TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
- const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
- if (needs_assign) {
-
- const int block_size = device.maxGpuThreadsPerBlock();
- const int max_blocks = device.getNumGpuMultiProcessors() *
- device.maxGpuThreadsPerMultiProcessor() / block_size;
- const StorageIndex size = array_prod(evaluator.dimensions());
- // Create at least one block to ensure we won't crash when TensorFlow calls with tensors of size 0.
- const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
-
- LAUNCH_GPU_KERNEL(
- (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),
- num_blocks, block_size, 0, device, evaluator, size);
- }
- evaluator.cleanup();
-}
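A worked example (added) of the launch configuration computed above. The device numbers are hypothetical (1024 threads per block, 16 multiprocessors, 2048 threads per multiprocessor) and `size` is 1,000,000 coefficients:

```cpp
constexpr int block_size = 1024;                    // maxGpuThreadsPerBlock()
constexpr int max_blocks = 16 * 2048 / block_size;  // = 32
constexpr int size = 1000000;
constexpr int wanted = (size + block_size - 1) / block_size;  // divup -> 977
constexpr int num_blocks = wanted < max_blocks ? wanted : max_blocks;  // = 32
// Each GPU thread then strides by blockDim.x * gridDim.x = 32768 elements.
static_assert(num_blocks == 32, "");
```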
-
-#endif // EIGEN_GPUCC
-#endif // EIGEN_USE_GPU
-
-// SYCL Executor policy
-#ifdef EIGEN_USE_SYCL
-
-template <typename Evaluator>
-struct ExecExprFunctorKernel {
- typedef typename Evaluator::Index Index;
- Evaluator evaluator;
- const Index range;
- template <typename Scratch>
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel(
- const Scratch, Evaluator evaluator_, const Index range_)
- : evaluator(evaluator_), range(range_) {}
-
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()(
- cl::sycl::nd_item<1> itemID) {
- compute(itemID);
- }
- template <bool is_vec = Evaluator::PacketAccess>
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<!is_vec>::type
- compute(const cl::sycl::nd_item<1>& itemID) {
- Index gId = static_cast<Index>(itemID.get_global_linear_id());
- Index total_threads = itemID.get_global_range(0);
-
- for (Index i = gId; i < range; i += total_threads) {
- evaluator.evalScalar(i);
- }
- }
- template <bool is_vec = Evaluator::PacketAccess>
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<is_vec>::type
- compute(const cl::sycl::nd_item<1>& itemID) {
- const Index vectorizedRange =
- (range / Evaluator::PacketSize) * Evaluator::PacketSize;
- Index gId = static_cast<Index>(itemID.get_global_linear_id());
- const Index step = Evaluator::PacketSize * itemID.get_global_range(0);
- const Index start = Evaluator::PacketSize * gId;
- for (Index i = start; i < vectorizedRange; i += step) {
- evaluator.evalPacket(i);
- }
- gId += vectorizedRange;
- for (Index i = gId; i < range; i += itemID.get_global_range(0)) {
- evaluator.evalScalar(i);
- }
- }
-};
-
-template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
-class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
- public:
- typedef typename Expression::Index Index;
- static EIGEN_STRONG_INLINE void run(const Expression& expr,
- const Eigen::SyclDevice& dev) {
- typedef Eigen::TensorEvaluator<Expression, Eigen::SyclDevice> Evaluator;
- Evaluator evaluator(expr, dev);
- const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
- if (needs_assign) {
- Index range, GRange, tileSize;
- Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions());
- total_size = (total_size == 0) ? 1 : total_size;
- const int PacketSize =
- Eigen::PacketType<typename Evaluator::CoeffReturnType,
- Eigen::SyclDevice>::size;
- Index vectorizable_threads = static_cast<Index>(total_size / PacketSize);
- dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange);
- range = total_size;
-
- dev.template nullary_kernel_launcher<
- typename Evaluator::CoeffReturnType,
- ExecExprFunctorKernel<Evaluator> >(
- evaluator,
- cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
- cl::sycl::range<1>(tileSize)),
- Index(1), range);
- }
- evaluator.cleanup();
- }
-};
-
-#endif
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorExpr.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorExpr.h
deleted file mode 100644
index c9bccfc..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorExpr.h
+++ /dev/null
@@ -1,388 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXPR_H
-#define EIGEN_CXX11_TENSOR_TENSOR_EXPR_H
-
-namespace Eigen {
-
-/** \class TensorExpr
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Tensor expression classes.
- *
- * The TensorCwiseNullaryOp class applies a nullary operator to an expression.
- * This is typically used to generate constants.
- *
- * The TensorCwiseUnaryOp class represents an expression where a unary operator
- * (e.g. cwiseSqrt) is applied to an expression.
- *
- * The TensorCwiseBinaryOp class represents an expression where a binary
- * operator (e.g. addition) is applied to a lhs and a rhs expression.
- *
- */
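A composition sketch (added, using the public tensor API rather than these classes directly): each operator builds one of the expression nodes described above, and nothing is computed until assignment.

```cpp
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> a(3, 4), b(3, 4);
  a.setRandom();
  b.setRandom();

  auto c = a.constant(2.0f);  // TensorCwiseNullaryOp (a constant generator)
  auto u = a.sqrt();          // TensorCwiseUnaryOp
  auto s = u + b * c;         // nested TensorCwiseBinaryOp nodes (mul, add)

  Eigen::Tensor<float, 2> r = s;  // evaluation happens on assignment
  return 0;
}
```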
-namespace internal {
-template<typename NullaryOp, typename XprType>
-struct traits<TensorCwiseNullaryOp<NullaryOp, XprType> >
- : traits<XprType>
-{
- typedef traits<XprType> XprTraits;
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::Nested XprTypeNested;
- typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
- static const int NumDimensions = XprTraits::NumDimensions;
- static const int Layout = XprTraits::Layout;
- typedef typename XprTraits::PointerType PointerType;
- enum {
- Flags = 0
- };
-};
-
-} // end namespace internal
-
-
-
-template<typename NullaryOp, typename XprType>
-class TensorCwiseNullaryOp : public TensorBase<TensorCwiseNullaryOp<NullaryOp, XprType>, ReadOnlyAccessors>
-{
- public:
- typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::Scalar Scalar;
- typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef TensorCwiseNullaryOp<NullaryOp, XprType> Nested;
- typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::StorageKind StorageKind;
- typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::Index Index;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseNullaryOp(const XprType& xpr, const NullaryOp& func = NullaryOp())
- : m_xpr(xpr), m_functor(func) {}
-
- EIGEN_DEVICE_FUNC
- const typename internal::remove_all<typename XprType::Nested>::type&
- nestedExpression() const { return m_xpr; }
-
- EIGEN_DEVICE_FUNC
- const NullaryOp& functor() const { return m_functor; }
-
- protected:
- typename XprType::Nested m_xpr;
- const NullaryOp m_functor;
-};
-
-
-
-namespace internal {
-template<typename UnaryOp, typename XprType>
-struct traits<TensorCwiseUnaryOp<UnaryOp, XprType> >
- : traits<XprType>
-{
- // TODO(phli): Add InputScalar, InputPacket. Check references to
- // current Scalar/Packet to see if the intent is Input or Output.
- typedef typename result_of<UnaryOp(typename XprType::Scalar)>::type Scalar;
- typedef traits<XprType> XprTraits;
- typedef typename XprType::Nested XprTypeNested;
- typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
- static const int NumDimensions = XprTraits::NumDimensions;
- static const int Layout = XprTraits::Layout;
- typedef typename TypeConversion<Scalar,
- typename XprTraits::PointerType
- >::type
- PointerType;
-};
-
-template<typename UnaryOp, typename XprType>
-struct eval<TensorCwiseUnaryOp<UnaryOp, XprType>, Eigen::Dense>
-{
- typedef const TensorCwiseUnaryOp<UnaryOp, XprType>& type;
-};
-
-template<typename UnaryOp, typename XprType>
-struct nested<TensorCwiseUnaryOp<UnaryOp, XprType>, 1, typename eval<TensorCwiseUnaryOp<UnaryOp, XprType> >::type>
-{
- typedef TensorCwiseUnaryOp<UnaryOp, XprType> type;
-};
-
-} // end namespace internal
-
-
-
-template<typename UnaryOp, typename XprType>
-class TensorCwiseUnaryOp : public TensorBase<TensorCwiseUnaryOp<UnaryOp, XprType>, ReadOnlyAccessors>
-{
- public:
- // TODO(phli): Add InputScalar, InputPacket. Check references to
- // current Scalar/Packet to see if the intent is Input or Output.
- typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::Scalar Scalar;
- typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
- typedef Scalar CoeffReturnType;
- typedef typename Eigen::internal::nested<TensorCwiseUnaryOp>::type Nested;
- typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::StorageKind StorageKind;
- typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::Index Index;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp())
- : m_xpr(xpr), m_functor(func) {}
-
- EIGEN_DEVICE_FUNC
- const UnaryOp& functor() const { return m_functor; }
-
- /** \returns the nested expression */
- EIGEN_DEVICE_FUNC
- const typename internal::remove_all<typename XprType::Nested>::type&
- nestedExpression() const { return m_xpr; }
-
- protected:
- typename XprType::Nested m_xpr;
- const UnaryOp m_functor;
-};
-
-
-namespace internal {
-template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
-struct traits<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> >
-{
- // Type promotion to handle the case where the types of the lhs and the rhs
- // are different.
- // TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. Check references to
- // current Scalar/Packet to see if the intent is Inputs or Output.
- typedef typename result_of<
- BinaryOp(typename LhsXprType::Scalar,
- typename RhsXprType::Scalar)>::type Scalar;
- typedef traits<LhsXprType> XprTraits;
- typedef typename promote_storage_type<
- typename traits<LhsXprType>::StorageKind,
- typename traits<RhsXprType>::StorageKind>::ret StorageKind;
- typedef typename promote_index_type<
- typename traits<LhsXprType>::Index,
- typename traits<RhsXprType>::Index>::type Index;
- typedef typename LhsXprType::Nested LhsNested;
- typedef typename RhsXprType::Nested RhsNested;
- typedef typename remove_reference<LhsNested>::type _LhsNested;
- typedef typename remove_reference<RhsNested>::type _RhsNested;
- static const int NumDimensions = XprTraits::NumDimensions;
- static const int Layout = XprTraits::Layout;
- typedef typename TypeConversion<Scalar,
- typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val,
- typename traits<LhsXprType>::PointerType,
- typename traits<RhsXprType>::PointerType>::type
- >::type
- PointerType;
- enum {
- Flags = 0
- };
-};
-
-template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
-struct eval<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>, Eigen::Dense>
-{
- typedef const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>& type;
-};
-
-template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
-struct nested<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>, 1, typename eval<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> >::type>
-{
- typedef TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> type;
-};
-
-} // end namespace internal
-
-
-
-template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
-class TensorCwiseBinaryOp : public TensorBase<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>, ReadOnlyAccessors>
-{
- public:
- // TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. Check references to
- // current Scalar/Packet to see if the intent is Inputs or Output.
- typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::Scalar Scalar;
- typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
- typedef Scalar CoeffReturnType;
- typedef typename Eigen::internal::nested<TensorCwiseBinaryOp>::type Nested;
- typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::StorageKind StorageKind;
- typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::Index Index;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const BinaryOp& func = BinaryOp())
- : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_functor(func) {}
-
- EIGEN_DEVICE_FUNC
- const BinaryOp& functor() const { return m_functor; }
-
- /** \returns the nested expressions */
- EIGEN_DEVICE_FUNC
- const typename internal::remove_all<typename LhsXprType::Nested>::type&
- lhsExpression() const { return m_lhs_xpr; }
-
- EIGEN_DEVICE_FUNC
- const typename internal::remove_all<typename RhsXprType::Nested>::type&
- rhsExpression() const { return m_rhs_xpr; }
-
- protected:
- typename LhsXprType::Nested m_lhs_xpr;
- typename RhsXprType::Nested m_rhs_xpr;
- const BinaryOp m_functor;
-};
-
-
-namespace internal {
-template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
-struct traits<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> >
-{
- // Type promotion to handle the case where the types of the args are different.
- typedef typename result_of<
- TernaryOp(typename Arg1XprType::Scalar,
- typename Arg2XprType::Scalar,
- typename Arg3XprType::Scalar)>::type Scalar;
- typedef traits<Arg1XprType> XprTraits;
- typedef typename traits<Arg1XprType>::StorageKind StorageKind;
- typedef typename traits<Arg1XprType>::Index Index;
- typedef typename Arg1XprType::Nested Arg1Nested;
- typedef typename Arg2XprType::Nested Arg2Nested;
- typedef typename Arg3XprType::Nested Arg3Nested;
- typedef typename remove_reference<Arg1Nested>::type _Arg1Nested;
- typedef typename remove_reference<Arg2Nested>::type _Arg2Nested;
- typedef typename remove_reference<Arg3Nested>::type _Arg3Nested;
- static const int NumDimensions = XprTraits::NumDimensions;
- static const int Layout = XprTraits::Layout;
- typedef typename TypeConversion<Scalar,
- typename conditional<Pointer_type_promotion<typename Arg2XprType::Scalar, Scalar>::val,
- typename traits<Arg2XprType>::PointerType,
- typename traits<Arg3XprType>::PointerType>::type
- >::type
- PointerType;
- enum {
- Flags = 0
- };
-};
-
-template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
-struct eval<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, Eigen::Dense>
-{
- typedef const TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>& type;
-};
-
-template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
-struct nested<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, 1, typename eval<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> >::type>
-{
- typedef TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> type;
-};
-
-} // end namespace internal
-
-
-
-template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
-class TensorCwiseTernaryOp : public TensorBase<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, ReadOnlyAccessors>
-{
- public:
- typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::Scalar Scalar;
- typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
- typedef Scalar CoeffReturnType;
- typedef typename Eigen::internal::nested<TensorCwiseTernaryOp>::type Nested;
- typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::StorageKind StorageKind;
- typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::Index Index;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseTernaryOp(const Arg1XprType& arg1, const Arg2XprType& arg2, const Arg3XprType& arg3, const TernaryOp& func = TernaryOp())
- : m_arg1_xpr(arg1), m_arg2_xpr(arg2), m_arg3_xpr(arg3), m_functor(func) {}
-
- EIGEN_DEVICE_FUNC
- const TernaryOp& functor() const { return m_functor; }
-
- /** \returns the nested expressions */
- EIGEN_DEVICE_FUNC
- const typename internal::remove_all<typename Arg1XprType::Nested>::type&
- arg1Expression() const { return m_arg1_xpr; }
-
- EIGEN_DEVICE_FUNC
- const typename internal::remove_all<typename Arg2XprType::Nested>::type&
- arg2Expression() const { return m_arg2_xpr; }
-
- EIGEN_DEVICE_FUNC
- const typename internal::remove_all<typename Arg3XprType::Nested>::type&
- arg3Expression() const { return m_arg3_xpr; }
-
- protected:
- typename Arg1XprType::Nested m_arg1_xpr;
- typename Arg2XprType::Nested m_arg2_xpr;
- typename Arg3XprType::Nested m_arg3_xpr;
- const TernaryOp m_functor;
-};
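These four expression nodes are almost never spelled out by hand; they are produced by the sugar on TensorBase. A minimal sketch of how each node arises, assuming the usual constant(), unaryExpr(), and overloaded-operator entry points (illustrative only, not part of the deleted file):

    Eigen::Tensor<float, 1> x(5), y(5);
    x.setRandom(); y.setRandom();

    auto n = x.constant(3.0f);                             // TensorCwiseNullaryOp
    auto u = x.unaryExpr([](float v) { return v * v; });   // TensorCwiseUnaryOp
    auto b = x + y;                                        // TensorCwiseBinaryOp
    Eigen::Tensor<float, 1> r = n + u + b;                 // lazy until assigned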
-
-
-namespace internal {
-template<typename IfXprType, typename ThenXprType, typename ElseXprType>
-struct traits<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> >
- : traits<ThenXprType>
-{
- typedef typename traits<ThenXprType>::Scalar Scalar;
- typedef traits<ThenXprType> XprTraits;
- typedef typename promote_storage_type<typename traits<ThenXprType>::StorageKind,
- typename traits<ElseXprType>::StorageKind>::ret StorageKind;
- typedef typename promote_index_type<typename traits<ElseXprType>::Index,
- typename traits<ThenXprType>::Index>::type Index;
- typedef typename IfXprType::Nested IfNested;
- typedef typename ThenXprType::Nested ThenNested;
- typedef typename ElseXprType::Nested ElseNested;
- static const int NumDimensions = XprTraits::NumDimensions;
- static const int Layout = XprTraits::Layout;
- typedef typename conditional<Pointer_type_promotion<typename ThenXprType::Scalar, Scalar>::val,
- typename traits<ThenXprType>::PointerType,
- typename traits<ElseXprType>::PointerType>::type PointerType;
-};
-
-template<typename IfXprType, typename ThenXprType, typename ElseXprType>
-struct eval<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, Eigen::Dense>
-{
- typedef const TensorSelectOp<IfXprType, ThenXprType, ElseXprType>& type;
-};
-
-template<typename IfXprType, typename ThenXprType, typename ElseXprType>
-struct nested<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, 1, typename eval<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> >::type>
-{
- typedef TensorSelectOp<IfXprType, ThenXprType, ElseXprType> type;
-};
-
-} // end namespace internal
-
-
-template<typename IfXprType, typename ThenXprType, typename ElseXprType>
-class TensorSelectOp : public TensorBase<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, ReadOnlyAccessors>
-{
- public:
- typedef typename Eigen::internal::traits<TensorSelectOp>::Scalar Scalar;
- typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
- typedef typename internal::promote_storage_type<typename ThenXprType::CoeffReturnType,
- typename ElseXprType::CoeffReturnType>::ret CoeffReturnType;
- typedef typename Eigen::internal::nested<TensorSelectOp>::type Nested;
- typedef typename Eigen::internal::traits<TensorSelectOp>::StorageKind StorageKind;
- typedef typename Eigen::internal::traits<TensorSelectOp>::Index Index;
-
- EIGEN_DEVICE_FUNC
- TensorSelectOp(const IfXprType& a_condition,
- const ThenXprType& a_then,
- const ElseXprType& a_else)
- : m_condition(a_condition), m_then(a_then), m_else(a_else)
- { }
-
- EIGEN_DEVICE_FUNC
- const IfXprType& ifExpression() const { return m_condition; }
-
- EIGEN_DEVICE_FUNC
- const ThenXprType& thenExpression() const { return m_then; }
-
- EIGEN_DEVICE_FUNC
- const ElseXprType& elseExpression() const { return m_else; }
-
- protected:
- typename IfXprType::Nested m_condition;
- typename ThenXprType::Nested m_then;
- typename ElseXprType::Nested m_else;
-};
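TensorSelectOp is normally reached through TensorBase::select on a boolean condition expression; a minimal sketch, assuming the usual comparison operators and select() entry point:

    Eigen::Tensor<float, 2> a(8, 8), b(8, 8);
    a.setRandom(); b.setRandom();

    // Element-wise max via select: where a > b take a, otherwise b.
    Eigen::Tensor<float, 2> m = (a > b).select(a, b);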
-
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_EXPR_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorFFT.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorFFT.h
deleted file mode 100644
index 4a1a068..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorFFT.h
+++ /dev/null
@@ -1,669 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2015 Jianwei Cui <thucjw@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_FFT_H
-#define EIGEN_CXX11_TENSOR_TENSOR_FFT_H
-
-namespace Eigen {
-
-/** \class TensorFFT
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Tensor FFT class.
- *
- * TODO:
- * Vectorize the Cooley-Tukey and Bluestein algorithms
- * Add support for multithreaded evaluation
- * Improve the performance on GPU
- */
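Before the implementation, a minimal sketch of how this operator is typically reached from user code, assuming the TensorBase::fft entry point and the RealPart/ImagPart/BothParts and FFT_FORWARD/FFT_REVERSE enums declared elsewhere in the module:

    Eigen::Tensor<float, 2> input(4, 8);
    input.setRandom();

    // Transform along dimension 1 only; keep both real and imaginary parts.
    Eigen::array<Eigen::Index, 1> dims = {{1}};
    Eigen::Tensor<std::complex<float>, 2> freq =
        input.fft<Eigen::BothParts, Eigen::FFT_FORWARD>(dims);

    // Round trip: the reverse transform rescales by 1/line_len internally.
    Eigen::Tensor<std::complex<float>, 2> back =
        freq.fft<Eigen::BothParts, Eigen::FFT_REVERSE>(dims);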
-
-template <bool NeedUpgrade> struct MakeComplex {
- template <typename T>
- EIGEN_DEVICE_FUNC
- T operator() (const T& val) const { return val; }
-};
-
-template <> struct MakeComplex<true> {
- template <typename T>
- EIGEN_DEVICE_FUNC
- std::complex<T> operator() (const T& val) const { return std::complex<T>(val, 0); }
-};
-
-template <> struct MakeComplex<false> {
- template <typename T>
- EIGEN_DEVICE_FUNC
- std::complex<T> operator() (const std::complex<T>& val) const { return val; }
-};
-
-template <int ResultType> struct PartOf {
- template <typename T> T operator() (const T& val) const { return val; }
-};
-
-template <> struct PartOf<RealPart> {
- template <typename T> T operator() (const std::complex<T>& val) const { return val.real(); }
-};
-
-template <> struct PartOf<ImagPart> {
- template <typename T> T operator() (const std::complex<T>& val) const { return val.imag(); }
-};
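Together these helpers let the evaluator below run entirely in complex arithmetic whatever the input and requested output types: MakeComplex<true> lifts a real sample into the complex plane, and PartOf projects the requested component back out. An illustrative standalone use (RealPart/ImagPart are the result-type enums assumed from the module's forward declarations):

    std::complex<float> c = Eigen::MakeComplex<true>()(2.0f);   // (2, 0)
    float re = Eigen::PartOf<Eigen::RealPart>()(c);             // 2.0f
    float im = Eigen::PartOf<Eigen::ImagPart>()(c);             // 0.0f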
-
-namespace internal {
-template <typename FFT, typename XprType, int FFTResultType, int FFTDir>
-struct traits<TensorFFTOp<FFT, XprType, FFTResultType, FFTDir> > : public traits<XprType> {
- typedef traits<XprType> XprTraits;
- typedef typename NumTraits<typename XprTraits::Scalar>::Real RealScalar;
- typedef typename std::complex<RealScalar> ComplexScalar;
- typedef typename XprTraits::Scalar InputScalar;
- typedef typename conditional<FFTResultType == RealPart || FFTResultType == ImagPart, RealScalar, ComplexScalar>::type OutputScalar;
- typedef typename XprTraits::StorageKind StorageKind;
- typedef typename XprTraits::Index Index;
- typedef typename XprType::Nested Nested;
- typedef typename remove_reference<Nested>::type _Nested;
- static const int NumDimensions = XprTraits::NumDimensions;
- static const int Layout = XprTraits::Layout;
- typedef typename traits<XprType>::PointerType PointerType;
-};
-
-template <typename FFT, typename XprType, int FFTResultType, int FFTDirection>
-struct eval<TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection>, Eigen::Dense> {
- typedef const TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection>& type;
-};
-
-template <typename FFT, typename XprType, int FFTResultType, int FFTDirection>
-struct nested<TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection>, 1, typename eval<TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection> >::type> {
- typedef TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection> type;
-};
-
-} // end namespace internal
-
-template <typename FFT, typename XprType, int FFTResultType, int FFTDir>
-class TensorFFTOp : public TensorBase<TensorFFTOp<FFT, XprType, FFTResultType, FFTDir>, ReadOnlyAccessors> {
- public:
- typedef typename Eigen::internal::traits<TensorFFTOp>::Scalar Scalar;
- typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
- typedef typename std::complex<RealScalar> ComplexScalar;
- typedef typename internal::conditional<FFTResultType == RealPart || FFTResultType == ImagPart, RealScalar, ComplexScalar>::type OutputScalar;
- typedef OutputScalar CoeffReturnType;
- typedef typename Eigen::internal::nested<TensorFFTOp>::type Nested;
- typedef typename Eigen::internal::traits<TensorFFTOp>::StorageKind StorageKind;
- typedef typename Eigen::internal::traits<TensorFFTOp>::Index Index;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFFTOp(const XprType& expr, const FFT& fft)
- : m_xpr(expr), m_fft(fft) {}
-
- EIGEN_DEVICE_FUNC
- const FFT& fft() const { return m_fft; }
-
- EIGEN_DEVICE_FUNC
- const typename internal::remove_all<typename XprType::Nested>::type& expression() const {
- return m_xpr;
- }
-
- protected:
- typename XprType::Nested m_xpr;
- const FFT m_fft;
-};
-
-// Eval as rvalue
-template <typename FFT, typename ArgType, typename Device, int FFTResultType, int FFTDir>
-struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, Device> {
- typedef TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir> XprType;
- typedef typename XprType::Index Index;
- static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
- typedef DSizes<Index, NumDims> Dimensions;
- typedef typename XprType::Scalar Scalar;
- typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
- typedef typename std::complex<RealScalar> ComplexScalar;
- typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
- typedef internal::traits<XprType> XprTraits;
- typedef typename XprTraits::Scalar InputScalar;
- typedef typename internal::conditional<FFTResultType == RealPart || FFTResultType == ImagPart, RealScalar, ComplexScalar>::type OutputScalar;
- typedef OutputScalar CoeffReturnType;
- typedef typename PacketType<OutputScalar, Device>::type PacketReturnType;
- static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
- typedef StorageMemory<CoeffReturnType, Device> Storage;
- typedef typename Storage::Type EvaluatorPointerType;
-
- enum {
- IsAligned = false,
- PacketAccess = true,
- BlockAccess = false,
- PreferBlockAccess = false,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- CoordAccess = false,
- RawAccess = false
- };
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlock;
- //===--------------------------------------------------------------------===//
-
- EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) {
- const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
- for (int i = 0; i < NumDims; ++i) {
- eigen_assert(input_dims[i] > 0);
- m_dimensions[i] = input_dims[i];
- }
-
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- m_strides[0] = 1;
- for (int i = 1; i < NumDims; ++i) {
- m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1];
- }
- } else {
- m_strides[NumDims - 1] = 1;
- for (int i = NumDims - 2; i >= 0; --i) {
- m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1];
- }
- }
- m_size = m_dimensions.TotalSize();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
- return m_dimensions;
- }
-
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
- m_impl.evalSubExprsIfNeeded(NULL);
- if (data) {
- evalToBuf(data);
- return false;
- } else {
- m_data = (EvaluatorPointerType)m_device.get((CoeffReturnType*)(m_device.allocate_temp(sizeof(CoeffReturnType) * m_size)));
- evalToBuf(m_data);
- return true;
- }
- }
-
- EIGEN_STRONG_INLINE void cleanup() {
- if (m_data) {
- m_device.deallocate(m_data);
- m_data = NULL;
- }
- m_impl.cleanup();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const {
- return m_data[index];
- }
-
- template <int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType
- packet(Index index) const {
- return internal::ploadt<PacketReturnType, LoadMode>(m_data + index);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
- costPerCoeff(bool vectorized) const {
- return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
- }
-
- EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
-#ifdef EIGEN_USE_SYCL
- // binding placeholder accessors to a command group handler for SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
- m_data.bind(cgh);
- }
-#endif
-
- private:
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(EvaluatorPointerType data) {
- const bool write_to_out = internal::is_same<OutputScalar, ComplexScalar>::value;
- ComplexScalar* buf = write_to_out ? (ComplexScalar*)data : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * m_size);
-
- for (Index i = 0; i < m_size; ++i) {
- buf[i] = MakeComplex<internal::is_same<InputScalar, RealScalar>::value>()(m_impl.coeff(i));
- }
-
- for (size_t i = 0; i < m_fft.size(); ++i) {
- Index dim = m_fft[i];
- eigen_assert(dim >= 0 && dim < NumDims);
- Index line_len = m_dimensions[dim];
- eigen_assert(line_len >= 1);
- ComplexScalar* line_buf = (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * line_len);
- const bool is_power_of_two = isPowerOfTwo(line_len);
- const Index good_composite = is_power_of_two ? 0 : findGoodComposite(line_len);
- const Index log_len = is_power_of_two ? getLog2(line_len) : getLog2(good_composite);
-
- ComplexScalar* a = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * good_composite);
- ComplexScalar* b = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * good_composite);
- ComplexScalar* pos_j_base_powered = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * (line_len + 1));
- if (!is_power_of_two) {
- // Compute twiddle factors
- // t_n = exp(sqrt(-1) * pi * n^2 / line_len)
- // for n = 0, 1,..., line_len-1.
- // For n > 2 we use the recurrence t_n = t_{n-1}^2 / t_{n-2} * t_1^2
-
- // The recurrence is correct in exact arithmetic, but causes
- // numerical issues for large transforms, especially in
- // single-precision floating point.
- //
- // pos_j_base_powered[0] = ComplexScalar(1, 0);
- // if (line_len > 1) {
- // const ComplexScalar pos_j_base = ComplexScalar(
- // numext::cos(M_PI / line_len), numext::sin(M_PI / line_len));
- // pos_j_base_powered[1] = pos_j_base;
- // if (line_len > 2) {
- // const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base;
- // for (int i = 2; i < line_len + 1; ++i) {
- // pos_j_base_powered[i] = pos_j_base_powered[i - 1] *
- // pos_j_base_powered[i - 1] /
- // pos_j_base_powered[i - 2] *
- // pos_j_base_sq;
- // }
- // }
- // }
- // TODO(rmlarsen): Find a way to use Eigen's vectorized sin
- // and cosine functions here.
- for (int j = 0; j < line_len + 1; ++j) {
- double arg = ((EIGEN_PI * j) * j) / line_len;
- std::complex<double> tmp(numext::cos(arg), numext::sin(arg));
- pos_j_base_powered[j] = static_cast<ComplexScalar>(tmp);
- }
- }
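The disabled recurrence in the comment above is exact in infinite precision: writing $t_n = e^{i\pi n^2/L}$ for a line of length $L$, its validity reduces to an identity on the exponents,

    \[
      t_n = \frac{t_{n-1}^2}{t_{n-2}}\, t_1^2
      \iff 2(n-1)^2 - (n-2)^2 + 2 = n^2,
    \]

which holds because $2(n-1)^2 - (n-2)^2 + 2 = (2n^2 - 4n + 2) - (n^2 - 4n + 4) + 2 = n^2$. The direct sin/cos evaluation used instead trades speed for the numerical robustness noted above.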
-
- for (Index partial_index = 0; partial_index < m_size / line_len; ++partial_index) {
- const Index base_offset = getBaseOffsetFromIndex(partial_index, dim);
-
- // get data into line_buf
- const Index stride = m_strides[dim];
- if (stride == 1) {
- m_device.memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar));
- } else {
- Index offset = base_offset;
- for (int j = 0; j < line_len; ++j, offset += stride) {
- line_buf[j] = buf[offset];
- }
- }
-
- // process the line
- if (is_power_of_two) {
- processDataLineCooleyTukey(line_buf, line_len, log_len);
- }
- else {
- processDataLineBluestein(line_buf, line_len, good_composite, log_len, a, b, pos_j_base_powered);
- }
-
- // write back
- if (FFTDir == FFT_FORWARD && stride == 1) {
- m_device.memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar));
- } else {
- Index offset = base_offset;
- const ComplexScalar div_factor = ComplexScalar(1.0 / line_len, 0);
- for (int j = 0; j < line_len; ++j, offset += stride) {
- buf[offset] = (FFTDir == FFT_FORWARD) ? line_buf[j] : line_buf[j] * div_factor;
- }
- }
- }
- m_device.deallocate(line_buf);
- if (!is_power_of_two) {
- m_device.deallocate(a);
- m_device.deallocate(b);
- m_device.deallocate(pos_j_base_powered);
- }
- }
-
- if(!write_to_out) {
- for (Index i = 0; i < m_size; ++i) {
- data[i] = PartOf<FFTResultType>()(buf[i]);
- }
- m_device.deallocate(buf);
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static bool isPowerOfTwo(Index x) {
- eigen_assert(x > 0);
- return !(x & (x - 1));
- }
-
- // The padding length used in Bluestein's FFT algorithm: the smallest
- // power of two that is at least (2 * n - 1)
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Index findGoodComposite(Index n) {
- Index i = 2;
- while (i < 2 * n - 1) i *= 2;
- return i;
- }
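Bluestein's method realizes a length-n DFT as a linear convolution of two length-n sequences, which in turn needs a circular convolution of length at least 2n - 1; rounding up to a power of two lets the Cooley-Tukey kernels handle the padded transform. A hypothetical standalone mirror of the loop above, for concreteness:

    inline long goodComposite(long n) {
      long i = 2;
      while (i < 2 * n - 1) i *= 2;
      return i;  // e.g. n = 5 -> 16, n = 8 -> 16, n = 9 -> 32
    }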
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Index getLog2(Index m) {
- Index log2m = 0;
- while (m >>= 1) log2m++;
- return log2m;
- }
-
- // Call the Cooley-Tukey algorithm directly; the data length must be a power of 2
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void processDataLineCooleyTukey(ComplexScalar* line_buf, Index line_len, Index log_len) {
- eigen_assert(isPowerOfTwo(line_len));
- scramble_FFT(line_buf, line_len);
- compute_1D_Butterfly<FFTDir>(line_buf, line_len, log_len);
- }
-
- // Call Bluestein's FFT algorithm; m is the padding length, a power of two no smaller than (2 * n - 1)
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void processDataLineBluestein(ComplexScalar* line_buf, Index line_len, Index good_composite, Index log_len, ComplexScalar* a, ComplexScalar* b, const ComplexScalar* pos_j_base_powered) {
- Index n = line_len;
- Index m = good_composite;
- ComplexScalar* data = line_buf;
-
- for (Index i = 0; i < n; ++i) {
- if(FFTDir == FFT_FORWARD) {
- a[i] = data[i] * numext::conj(pos_j_base_powered[i]);
- }
- else {
- a[i] = data[i] * pos_j_base_powered[i];
- }
- }
- for (Index i = n; i < m; ++i) {
- a[i] = ComplexScalar(0, 0);
- }
-
- for (Index i = 0; i < n; ++i) {
- if(FFTDir == FFT_FORWARD) {
- b[i] = pos_j_base_powered[i];
- }
- else {
- b[i] = numext::conj(pos_j_base_powered[i]);
- }
- }
- for (Index i = n; i < m - n; ++i) {
- b[i] = ComplexScalar(0, 0);
- }
- for (Index i = m - n; i < m; ++i) {
- if(FFTDir == FFT_FORWARD) {
- b[i] = pos_j_base_powered[m-i];
- }
- else {
- b[i] = numext::conj(pos_j_base_powered[m-i]);
- }
- }
-
- scramble_FFT(a, m);
- compute_1D_Butterfly<FFT_FORWARD>(a, m, log_len);
-
- scramble_FFT(b, m);
- compute_1D_Butterfly<FFT_FORWARD>(b, m, log_len);
-
- for (Index i = 0; i < m; ++i) {
- a[i] *= b[i];
- }
-
- scramble_FFT(a, m);
- compute_1D_Butterfly<FFT_REVERSE>(a, m, log_len);
-
- // Do the scaling after the inverse FFT
- for (Index i = 0; i < m; ++i) {
- a[i] /= m;
- }
-
- for (Index i = 0; i < n; ++i) {
- if(FFTDir == FFT_FORWARD) {
- data[i] = a[i] * numext::conj(pos_j_base_powered[i]);
- }
- else {
- data[i] = a[i] * pos_j_base_powered[i];
- }
- }
- }
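The routine above is the chirp-z factorization: with the quadratic phase $t_k = e^{\pm i\pi k^2/n}$ held in pos_j_base_powered, the DFT becomes a convolution, since

    \[
      jk = \frac{j^2 + k^2 - (k-j)^2}{2}
      \quad\Longrightarrow\quad
      X_k = \sum_{j=0}^{n-1} x_j\, e^{\mp 2\pi i jk/n}
          = \bar t_k \sum_{j=0}^{n-1} (x_j \bar t_j)\, t_{k-j},
    \]

and the length-m circular convolution (m a power of two, m >= 2n - 1) is evaluated with two forward FFTs, a pointwise product, and one inverse FFT: exactly the scramble_FFT / compute_1D_Butterfly sequence above.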
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void scramble_FFT(ComplexScalar* data, Index n) {
- eigen_assert(isPowerOfTwo(n));
- Index j = 1;
- for (Index i = 1; i < n; ++i){
- if (j > i) {
- std::swap(data[j-1], data[i-1]);
- }
- Index m = n >> 1;
- while (m >= 2 && j > m) {
- j -= m;
- m >>= 1;
- }
- j += m;
- }
- }
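scramble_FFT performs the in-place bit-reversal permutation that the iterative decimation-in-time passes require. For example, with n = 8 a payload of 0..7 (as real parts) ends up in the order

    0, 4, 2, 6, 1, 5, 3, 7

i.e. data[k] moves to the position whose log2(n)-bit index is k with its bits reversed, which the radix-2 merges then recombine from the bottom up.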
-
- template <int Dir>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_2(ComplexScalar* data) {
- ComplexScalar tmp = data[1];
- data[1] = data[0] - data[1];
- data[0] += tmp;
- }
-
- template <int Dir>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_4(ComplexScalar* data) {
- ComplexScalar tmp[4];
- tmp[0] = data[0] + data[1];
- tmp[1] = data[0] - data[1];
- tmp[2] = data[2] + data[3];
- if (Dir == FFT_FORWARD) {
- tmp[3] = ComplexScalar(0.0, -1.0) * (data[2] - data[3]);
- } else {
- tmp[3] = ComplexScalar(0.0, 1.0) * (data[2] - data[3]);
- }
- data[0] = tmp[0] + tmp[2];
- data[1] = tmp[1] + tmp[3];
- data[2] = tmp[0] - tmp[2];
- data[3] = tmp[1] - tmp[3];
- }
-
- template <int Dir>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_8(ComplexScalar* data) {
- ComplexScalar tmp_1[8];
- ComplexScalar tmp_2[8];
-
- tmp_1[0] = data[0] + data[1];
- tmp_1[1] = data[0] - data[1];
- tmp_1[2] = data[2] + data[3];
- if (Dir == FFT_FORWARD) {
- tmp_1[3] = (data[2] - data[3]) * ComplexScalar(0, -1);
- } else {
- tmp_1[3] = (data[2] - data[3]) * ComplexScalar(0, 1);
- }
- tmp_1[4] = data[4] + data[5];
- tmp_1[5] = data[4] - data[5];
- tmp_1[6] = data[6] + data[7];
- if (Dir == FFT_FORWARD) {
- tmp_1[7] = (data[6] - data[7]) * ComplexScalar(0, -1);
- } else {
- tmp_1[7] = (data[6] - data[7]) * ComplexScalar(0, 1);
- }
- tmp_2[0] = tmp_1[0] + tmp_1[2];
- tmp_2[1] = tmp_1[1] + tmp_1[3];
- tmp_2[2] = tmp_1[0] - tmp_1[2];
- tmp_2[3] = tmp_1[1] - tmp_1[3];
- tmp_2[4] = tmp_1[4] + tmp_1[6];
-// SQRT2DIV2 = sqrt(2)/2
-#define SQRT2DIV2 0.7071067811865476
- if (Dir == FFT_FORWARD) {
- tmp_2[5] = (tmp_1[5] + tmp_1[7]) * ComplexScalar(SQRT2DIV2, -SQRT2DIV2);
- tmp_2[6] = (tmp_1[4] - tmp_1[6]) * ComplexScalar(0, -1);
- tmp_2[7] = (tmp_1[5] - tmp_1[7]) * ComplexScalar(-SQRT2DIV2, -SQRT2DIV2);
- } else {
- tmp_2[5] = (tmp_1[5] + tmp_1[7]) * ComplexScalar(SQRT2DIV2, SQRT2DIV2);
- tmp_2[6] = (tmp_1[4] - tmp_1[6]) * ComplexScalar(0, 1);
- tmp_2[7] = (tmp_1[5] - tmp_1[7]) * ComplexScalar(-SQRT2DIV2, SQRT2DIV2);
- }
- data[0] = tmp_2[0] + tmp_2[4];
- data[1] = tmp_2[1] + tmp_2[5];
- data[2] = tmp_2[2] + tmp_2[6];
- data[3] = tmp_2[3] + tmp_2[7];
- data[4] = tmp_2[0] - tmp_2[4];
- data[5] = tmp_2[1] - tmp_2[5];
- data[6] = tmp_2[2] - tmp_2[6];
- data[7] = tmp_2[3] - tmp_2[7];
- }
-
- template <int Dir>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_1D_merge(
- ComplexScalar* data, Index n, Index n_power_of_2) {
- // Original code:
- // RealScalar wtemp = std::sin(M_PI/n);
- // RealScalar wpi = -std::sin(2 * M_PI/n);
- const RealScalar wtemp = m_sin_PI_div_n_LUT[n_power_of_2];
- const RealScalar wpi = (Dir == FFT_FORWARD)
- ? m_minus_sin_2_PI_div_n_LUT[n_power_of_2]
- : -m_minus_sin_2_PI_div_n_LUT[n_power_of_2];
-
- const ComplexScalar wp(wtemp, wpi);
- const ComplexScalar wp_one = wp + ComplexScalar(1, 0);
- const ComplexScalar wp_one_2 = wp_one * wp_one;
- const ComplexScalar wp_one_3 = wp_one_2 * wp_one;
- const ComplexScalar wp_one_4 = wp_one_3 * wp_one;
- const Index n2 = n / 2;
- ComplexScalar w(1.0, 0.0);
- for (Index i = 0; i < n2; i += 4) {
- ComplexScalar temp0(data[i + n2] * w);
- ComplexScalar temp1(data[i + 1 + n2] * w * wp_one);
- ComplexScalar temp2(data[i + 2 + n2] * w * wp_one_2);
- ComplexScalar temp3(data[i + 3 + n2] * w * wp_one_3);
- w = w * wp_one_4;
-
- data[i + n2] = data[i] - temp0;
- data[i] += temp0;
-
- data[i + 1 + n2] = data[i + 1] - temp1;
- data[i + 1] += temp1;
-
- data[i + 2 + n2] = data[i + 2] - temp2;
- data[i + 2] += temp2;
-
- data[i + 3 + n2] = data[i + 3] - temp3;
- data[i + 3] += temp3;
- }
- }
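The two lookup tables referenced above (defined at the end of the class) encode the twiddle base without trigonometric calls in the merge loop: with wtemp = -2 sin^2(pi/n) and wpi = (-/+) sin(2 pi/n), the factor used above satisfies

    \[
      1 + w_p = \bigl(1 - 2\sin^2(\pi/n)\bigr) \mp i\,\sin(2\pi/n)
              = \cos(2\pi/n) \mp i\,\sin(2\pi/n)
              = e^{\mp 2\pi i/n},
    \]

so repeated multiplication by wp_one steps the twiddle $w = e^{\mp 2\pi i k/n}$ exactly, four entries at a time via wp_one_2 through wp_one_4.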
-
- template <int Dir>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_1D_Butterfly(
- ComplexScalar* data, Index n, Index n_power_of_2) {
- eigen_assert(isPowerOfTwo(n));
- if (n > 8) {
- compute_1D_Butterfly<Dir>(data, n / 2, n_power_of_2 - 1);
- compute_1D_Butterfly<Dir>(data + n / 2, n / 2, n_power_of_2 - 1);
- butterfly_1D_merge<Dir>(data, n, n_power_of_2);
- } else if (n == 8) {
- butterfly_8<Dir>(data);
- } else if (n == 4) {
- butterfly_4<Dir>(data);
- } else if (n == 2) {
- butterfly_2<Dir>(data);
- }
- }
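compute_1D_Butterfly is the classic radix-2 decimation-in-time recursion: two half-size transforms followed by one merge pass, bottoming out in the unrolled size-2/4/8 kernels, so the work obeys

    \[
      T(n) = 2\,T(n/2) + \Theta(n) \implies T(n) = \Theta(n \log n).
    \]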
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getBaseOffsetFromIndex(Index index, Index omitted_dim) const {
- Index result = 0;
-
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- for (int i = NumDims - 1; i > omitted_dim; --i) {
- const Index partial_m_stride = m_strides[i] / m_dimensions[omitted_dim];
- const Index idx = index / partial_m_stride;
- index -= idx * partial_m_stride;
- result += idx * m_strides[i];
- }
- result += index;
- }
- else {
- for (Index i = 0; i < omitted_dim; ++i) {
- const Index partial_m_stride = m_strides[i] / m_dimensions[omitted_dim];
- const Index idx = index / partial_m_stride;
- index -= idx * partial_m_stride;
- result += idx * m_strides[i];
- }
- result += index;
- }
- // The coordinate along omitted_dim is not determined at this step
- return result;
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getIndexFromOffset(Index base, Index omitted_dim, Index offset) const {
- Index result = base + offset * m_strides[omitted_dim] ;
- return result;
- }
-
- protected:
- Index m_size;
- const FFT EIGEN_DEVICE_REF m_fft;
- Dimensions m_dimensions;
- array<Index, NumDims> m_strides;
- TensorEvaluator<ArgType, Device> m_impl;
- EvaluatorPointerType m_data;
- const Device EIGEN_DEVICE_REF m_device;
-
- // This will support a maximum FFT size of 2^32 for each dimension
- // m_sin_PI_div_n_LUT[i] = -2 * std::pow(std::sin(M_PI / std::pow(2,i)), 2);
- const RealScalar m_sin_PI_div_n_LUT[32] = {
- RealScalar(0.0),
- RealScalar(-2),
- RealScalar(-0.999999999999999),
- RealScalar(-0.292893218813453),
- RealScalar(-0.0761204674887130),
- RealScalar(-0.0192147195967696),
- RealScalar(-0.00481527332780311),
- RealScalar(-0.00120454379482761),
- RealScalar(-3.01181303795779e-04),
- RealScalar(-7.52981608554592e-05),
- RealScalar(-1.88247173988574e-05),
- RealScalar(-4.70619042382852e-06),
- RealScalar(-1.17654829809007e-06),
- RealScalar(-2.94137117780840e-07),
- RealScalar(-7.35342821488550e-08),
- RealScalar(-1.83835707061916e-08),
- RealScalar(-4.59589268710903e-09),
- RealScalar(-1.14897317243732e-09),
- RealScalar(-2.87243293150586e-10),
- RealScalar( -7.18108232902250e-11),
- RealScalar(-1.79527058227174e-11),
- RealScalar(-4.48817645568941e-12),
- RealScalar(-1.12204411392298e-12),
- RealScalar(-2.80511028480785e-13),
- RealScalar(-7.01277571201985e-14),
- RealScalar(-1.75319392800498e-14),
- RealScalar(-4.38298482001247e-15),
- RealScalar(-1.09574620500312e-15),
- RealScalar(-2.73936551250781e-16),
- RealScalar(-6.84841378126949e-17),
- RealScalar(-1.71210344531737e-17),
- RealScalar(-4.28025861329343e-18)
- };
-
- // m_minus_sin_2_PI_div_n_LUT[i] = -std::sin(2 * M_PI / std::pow(2,i));
- const RealScalar m_minus_sin_2_PI_div_n_LUT[32] = {
- RealScalar(0.0),
- RealScalar(0.0),
- RealScalar(-1.00000000000000e+00),
- RealScalar(-7.07106781186547e-01),
- RealScalar(-3.82683432365090e-01),
- RealScalar(-1.95090322016128e-01),
- RealScalar(-9.80171403295606e-02),
- RealScalar(-4.90676743274180e-02),
- RealScalar(-2.45412285229123e-02),
- RealScalar(-1.22715382857199e-02),
- RealScalar(-6.13588464915448e-03),
- RealScalar(-3.06795676296598e-03),
- RealScalar(-1.53398018628477e-03),
- RealScalar(-7.66990318742704e-04),
- RealScalar(-3.83495187571396e-04),
- RealScalar(-1.91747597310703e-04),
- RealScalar(-9.58737990959773e-05),
- RealScalar(-4.79368996030669e-05),
- RealScalar(-2.39684498084182e-05),
- RealScalar(-1.19842249050697e-05),
- RealScalar(-5.99211245264243e-06),
- RealScalar(-2.99605622633466e-06),
- RealScalar(-1.49802811316901e-06),
- RealScalar(-7.49014056584716e-07),
- RealScalar(-3.74507028292384e-07),
- RealScalar(-1.87253514146195e-07),
- RealScalar(-9.36267570730981e-08),
- RealScalar(-4.68133785365491e-08),
- RealScalar(-2.34066892682746e-08),
- RealScalar(-1.17033446341373e-08),
- RealScalar(-5.85167231706864e-09),
- RealScalar(-2.92583615853432e-09)
- };
-};
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_FFT_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorFixedSize.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorFixedSize.h
deleted file mode 100644
index ca39bb8..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorFixedSize.h
+++ /dev/null
@@ -1,379 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H
-#define EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H
-
-namespace Eigen {
-
-/** \class TensorFixedSize
- * \ingroup CXX11_Tensor_Module
- *
- * \brief The fixed sized version of the tensor class.
- *
- * The fixed sized equivalent of
- * Eigen::Tensor<float, 3> t(3, 5, 7);
- * is
- * Eigen::TensorFixedSize<float, Sizes<3,5,7>> t;
- */
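A minimal usage sketch, assuming the module is included as unsupported/Eigen/CXX11/Tensor and the usual reduction API:

    Eigen::TensorFixedSize<float, Eigen::Sizes<3, 5, 7>> t;  // no heap allocation
    t.setConstant(1.0f);
    t(2, 4, 6) = 42.0f;

    // Reductions evaluate into a rank-0 tensor; here 104 * 1 + 42 = 146.
    Eigen::Tensor<float, 0> s = t.sum();
    float total = s();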
-
-template<typename Scalar_, typename Dimensions_, int Options_, typename IndexType>
-class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_, Options_, IndexType> >
-{
- public:
- typedef TensorFixedSize<Scalar_, Dimensions_, Options_, IndexType> Self;
- typedef TensorBase<TensorFixedSize<Scalar_, Dimensions_, Options_, IndexType> > Base;
- typedef typename Eigen::internal::nested<Self>::type Nested;
- typedef typename internal::traits<Self>::StorageKind StorageKind;
- typedef typename internal::traits<Self>::Index Index;
- typedef Scalar_ Scalar;
- typedef typename NumTraits<Scalar>::Real RealScalar;
- typedef typename Base::CoeffReturnType CoeffReturnType;
-
- static const int Options = Options_;
-
- enum {
- IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0),
- PacketAccess = (internal::packet_traits<Scalar>::size > 1),
- BlockAccess = false,
- PreferBlockAccess = false,
- Layout = Options_ & RowMajor ? RowMajor : ColMajor,
- CoordAccess = true,
- RawAccess = true
- };
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlock;
- //===--------------------------------------------------------------------===//
-
- typedef Dimensions_ Dimensions;
- static const std::size_t NumIndices = Dimensions::count;
-
- protected:
- TensorStorage<Scalar, Dimensions, Options> m_storage;
-
- public:
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); }
-
- // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
- // work, because that uses base().coeffRef() - and we don't yet
- // implement a similar class hierarchy
- inline Self& base() { return *this; }
- inline const Self& base() const { return *this; }
-
-#if EIGEN_HAS_VARIADIC_TEMPLATES
- template<typename... IndexTypes>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index firstIndex, IndexTypes... otherIndices) const
- {
- // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
- EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
- return coeff(array<Index, NumIndices>{{firstIndex, otherIndices...}});
- }
-#endif
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const Scalar& coeff(const array<Index, NumIndices>& indices) const
- {
- eigen_internal_assert(checkIndexRange(indices));
- return m_storage.data()[linearizedIndex(indices)];
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const
- {
- eigen_internal_assert(index >= 0 && index < size());
- return m_storage.data()[index];
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const Scalar& coeff() const
- {
- EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
- return m_storage.data()[0];
- }
-
-
-#if EIGEN_HAS_VARIADIC_TEMPLATES
- template<typename... IndexTypes>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices)
- {
- // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
- EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
- return coeffRef(array<Index, NumIndices>{{firstIndex, otherIndices...}});
- }
-#endif
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices)
- {
- eigen_internal_assert(checkIndexRange(indices));
- return m_storage.data()[linearizedIndex(indices)];
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
- {
- eigen_internal_assert(index >= 0 && index < size());
- return m_storage.data()[index];
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Scalar& coeffRef()
- {
- EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
- return m_storage.data()[0];
- }
-
-#if EIGEN_HAS_VARIADIC_TEMPLATES
- template<typename... IndexTypes>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) const
- {
- // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
- EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
- return this->operator()(array<Index, NumIndices>{{firstIndex, otherIndices...}});
- }
-#else
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const
- {
- if (Options&RowMajor) {
- const Index index = i1 + i0 * m_storage.dimensions()[1];
- return m_storage.data()[index];
- } else {
- const Index index = i0 + i1 * m_storage.dimensions()[0];
- return m_storage.data()[index];
- }
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const
- {
- if (Options&RowMajor) {
- const Index index = i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0);
- return m_storage.data()[index];
- } else {
- const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * i2);
- return m_storage.data()[index];
- }
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const
- {
- if (Options&RowMajor) {
- const Index index = i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0));
- return m_storage.data()[index];
- } else {
- const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * i3));
- return m_storage.data()[index];
- }
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const
- {
- if (Options&RowMajor) {
- const Index index = i4 + m_storage.dimensions()[4] * (i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0)));
- return m_storage.data()[index];
- } else {
- const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * (i3 + m_storage.dimensions()[3] * i4)));
- return m_storage.data()[index];
- }
- }
-#endif
-
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const
- {
- eigen_assert(checkIndexRange(indices));
- return coeff(indices);
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const
- {
- eigen_internal_assert(index >= 0 && index < size());
- return coeff(index);
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const Scalar& operator()() const
- {
- EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
- return coeff();
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const
- {
- // The bracket operator is only for vectors, use the parenthesis operator instead.
- EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE);
- return coeff(index);
- }
-
-#if EIGEN_HAS_VARIADIC_TEMPLATES
- template<typename... IndexTypes>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes... otherIndices)
- {
- // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
- EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
- return operator()(array<Index, NumIndices>{{firstIndex, otherIndices...}});
- }
-#else
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1)
- {
- if (Options&RowMajor) {
- const Index index = i1 + i0 * m_storage.dimensions()[1];
- return m_storage.data()[index];
- } else {
- const Index index = i0 + i1 * m_storage.dimensions()[0];
- return m_storage.data()[index];
- }
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2)
- {
- if (Options&RowMajor) {
- const Index index = i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0);
- return m_storage.data()[index];
- } else {
- const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * i2);
- return m_storage.data()[index];
- }
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3)
- {
- if (Options&RowMajor) {
- const Index index = i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0));
- return m_storage.data()[index];
- } else {
- const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * i3));
- return m_storage.data()[index];
- }
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4)
- {
- if (Options&RowMajor) {
- const Index index = i4 + m_storage.dimensions()[4] * (i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0)));
- return m_storage.data()[index];
- } else {
- const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * (i3 + m_storage.dimensions()[3] * i4)));
- return m_storage.data()[index];
- }
- }
-#endif
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices)
- {
- eigen_assert(checkIndexRange(indices));
- return coeffRef(indices);
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Scalar& operator()(Index index)
- {
- eigen_assert(index >= 0 && index < size());
- return coeffRef(index);
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Scalar& operator()()
- {
- EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
- return coeffRef();
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Scalar& operator[](Index index)
- {
- // The bracket operator is only for vectors, use the parenthesis operator instead
- EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
- return coeffRef(index);
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE TensorFixedSize()
- : m_storage()
- {
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE TensorFixedSize(const Self& other)
- : m_storage(other.m_storage)
- {
- }
-
-#if EIGEN_HAS_RVALUE_REFERENCES
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize(Self&& other)
- : m_storage(other.m_storage)
- {
- }
-#endif
-
- template<typename OtherDerived>
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE TensorFixedSize(const TensorBase<OtherDerived, ReadOnlyAccessors>& other)
- {
- typedef TensorAssignOp<TensorFixedSize, const OtherDerived> Assign;
- Assign assign(*this, other.derived());
- internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
- }
- template<typename OtherDerived>
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE TensorFixedSize(const TensorBase<OtherDerived, WriteAccessors>& other)
- {
- typedef TensorAssignOp<TensorFixedSize, const OtherDerived> Assign;
- Assign assign(*this, other.derived());
- internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
- }
-
- // FIXME: check that the dimensions of other match the dimensions of *this.
- // Unfortunately this isn't possible yet when the rhs is an expression.
- EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(TensorFixedSize)
-
-
- protected:
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE bool checkIndexRange(const array<Index, NumIndices>& /*indices*/) const
- {
- using internal::array_apply_and_reduce;
- using internal::array_zip_and_reduce;
- using internal::greater_equal_zero_op;
- using internal::logical_and_op;
- using internal::lesser_op;
-
- return true;
- // check whether the indices are all >= 0
- /* array_apply_and_reduce<logical_and_op, greater_equal_zero_op>(indices) &&
- // check whether the indices fit in the dimensions
- array_zip_and_reduce<logical_and_op, lesser_op>(indices, m_storage.dimensions());*/
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Index linearizedIndex(const array<Index, NumIndices>& indices) const
- {
- if (Options&RowMajor) {
- return m_storage.dimensions().IndexOfRowMajor(indices);
- } else {
- return m_storage.dimensions().IndexOfColMajor(indices);
- }
- }
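linearizedIndex simply defers to the Dimensions helpers; for a rank-3 tensor with dimensions $(d_0, d_1, d_2)$ these compute

    \[
      \text{RowMajor: } i_2 + d_2\,(i_1 + d_1\, i_0),
      \qquad
      \text{ColMajor: } i_0 + d_0\,(i_1 + d_1\, i_2),
    \]

matching the hand-unrolled operator() overloads earlier in the class.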
-};
-
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorForcedEval.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorForcedEval.h
deleted file mode 100644
index e800ded..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorForcedEval.h
+++ /dev/null
@@ -1,237 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H
-#define EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H
-
-namespace Eigen {
-
-/** \class TensorForcedEval
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Tensor forced evaluation class.
- *
- * Forces its input expression to be evaluated into a temporary buffer,
- * from which downstream expressions then read.
- */
-namespace internal {
-template<typename XprType>
-struct traits<TensorForcedEvalOp<XprType> >
-{
- typedef typename XprType::Scalar Scalar;
- typedef traits<XprType> XprTraits;
- typedef typename traits<XprType>::StorageKind StorageKind;
- typedef typename traits<XprType>::Index Index;
- typedef typename XprType::Nested Nested;
- typedef typename remove_reference<Nested>::type _Nested;
- static const int NumDimensions = XprTraits::NumDimensions;
- static const int Layout = XprTraits::Layout;
- typedef typename XprTraits::PointerType PointerType;
-
- enum {
- Flags = 0
- };
-};
-
-template<typename XprType>
-struct eval<TensorForcedEvalOp<XprType>, Eigen::Dense>
-{
- typedef const TensorForcedEvalOp<XprType>& type;
-};
-
-template<typename XprType>
-struct nested<TensorForcedEvalOp<XprType>, 1, typename eval<TensorForcedEvalOp<XprType> >::type>
-{
- typedef TensorForcedEvalOp<XprType> type;
-};
-
-} // end namespace internal
-
-
-
-template<typename XprType>
-class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType>, ReadOnlyAccessors>
-{
- public:
- typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Scalar Scalar;
- typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
- typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
- typedef typename Eigen::internal::nested<TensorForcedEvalOp>::type Nested;
- typedef typename Eigen::internal::traits<TensorForcedEvalOp>::StorageKind StorageKind;
- typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Index Index;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorForcedEvalOp(const XprType& expr)
- : m_xpr(expr) {}
-
- EIGEN_DEVICE_FUNC
- const typename internal::remove_all<typename XprType::Nested>::type&
- expression() const { return m_xpr; }
-
- protected:
- typename XprType::Nested m_xpr;
-};
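User code typically creates this node through TensorBase::eval(), which pins an intermediate result in memory (a sketch, assuming that entry point):

    Eigen::Tensor<float, 2> a(64, 64), b(64, 64);
    a.setRandom(); b.setRandom();

    // eval() materializes (a + b) into a temporary buffer once, so the
    // surrounding expression reads concrete values instead of re-evaluating
    // the sum per coefficient.
    Eigen::Tensor<float, 2> c = (a + b).eval() * 3.0f;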
-
-namespace internal {
-template <typename Device, typename CoeffReturnType>
-struct non_integral_type_placement_new{
- template <typename StorageType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index numValues, StorageType m_buffer) {
- // Initialize non-trivially constructible types.
- if (!internal::is_arithmetic<CoeffReturnType>::value) {
- for (Index i = 0; i < numValues; ++i) new (m_buffer + i) CoeffReturnType();
- }
-}
-};
-
-// SYCL does not support placement new for non-integral types:
-// having new (m_buffer + i) CoeffReturnType() causes the following compiler
-// error on SYCL devices: no matching function for call to 'operator new'.
-template <typename CoeffReturnType>
-struct non_integral_type_placement_new<Eigen::SyclDevice, CoeffReturnType> {
- template <typename StorageType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index, StorageType) {
-}
-};
-} // end namespace internal
-
-template<typename ArgType_, typename Device>
-struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
-{
- typedef const typename internal::remove_all<ArgType_>::type ArgType;
- typedef TensorForcedEvalOp<ArgType> XprType;
- typedef typename ArgType::Scalar Scalar;
- typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
- typedef typename XprType::Index Index;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
- typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
- typedef StorageMemory<CoeffReturnType, Device> Storage;
- typedef typename Storage::Type EvaluatorPointerType;
-
- enum {
- IsAligned = true,
- PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
- BlockAccess = internal::is_arithmetic<CoeffReturnType>::value,
- PreferBlockAccess = false,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- RawAccess = true
- };
-
- static const int NumDims = internal::traits<ArgType>::NumDimensions;
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
- typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
-
- typedef typename internal::TensorMaterializedBlock<CoeffReturnType, NumDims,
- Layout, Index>
- TensorBlock;
- //===--------------------------------------------------------------------===//
-
- TensorEvaluator(const XprType& op, const Device& device)
- : m_impl(op.expression(), device), m_op(op.expression()),
- m_device(device), m_buffer(NULL)
- { }
-
- EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
-
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
- const Index numValues = internal::array_prod(m_impl.dimensions());
- m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp(numValues * sizeof(CoeffReturnType)));
-
- internal::non_integral_type_placement_new<Device, CoeffReturnType>()(numValues, m_buffer);
-
- typedef TensorEvalToOp< const typename internal::remove_const<ArgType>::type > EvalTo;
- EvalTo evalToTmp(m_device.get(m_buffer), m_op);
-
- internal::TensorExecutor<
- const EvalTo, typename internal::remove_const<Device>::type,
- /*Vectorizable=*/internal::IsVectorizable<Device, const ArgType>::value,
- /*Tiling=*/internal::IsTileable<Device, const ArgType>::value>::
- run(evalToTmp, m_device);
-
- return true;
- }
-
-#ifdef EIGEN_USE_THREADS
- template <typename EvalSubExprsCallback>
- EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
- EvaluatorPointerType, EvalSubExprsCallback done) {
- const Index numValues = internal::array_prod(m_impl.dimensions());
- m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp(
- numValues * sizeof(CoeffReturnType)));
- typedef TensorEvalToOp<const typename internal::remove_const<ArgType>::type>
- EvalTo;
- EvalTo evalToTmp(m_device.get(m_buffer), m_op);
-
- auto on_done = std::bind([](EvalSubExprsCallback done_) { done_(true); },
- std::move(done));
- internal::TensorAsyncExecutor<
- const EvalTo, typename internal::remove_const<Device>::type,
- decltype(on_done),
- /*Vectorizable=*/internal::IsVectorizable<Device, const ArgType>::value,
- /*Tiling=*/internal::IsTileable<Device, const ArgType>::value>::
- runAsync(evalToTmp, m_device, std::move(on_done));
- }
-#endif
-
- EIGEN_STRONG_INLINE void cleanup() {
- m_device.deallocate_temp(m_buffer);
- m_buffer = NULL;
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
- {
- return m_buffer[index];
- }
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
- {
- return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- internal::TensorBlockResourceRequirements getResourceRequirements() const {
- return internal::TensorBlockResourceRequirements::any();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
- block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
- bool /*root_of_expr_ast*/ = false) const {
- assert(m_buffer != NULL);
- return TensorBlock::materialize(m_buffer, m_impl.dimensions(), desc, scratch);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
- return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- EvaluatorPointerType data() const { return m_buffer; }
-
-#ifdef EIGEN_USE_SYCL
- // binding placeholder accessors to a command group handler for SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
- m_buffer.bind(cgh);
- m_impl.bind(cgh);
- }
-#endif
- private:
- TensorEvaluator<ArgType, Device> m_impl;
- const ArgType m_op;
- const Device EIGEN_DEVICE_REF m_device;
- EvaluatorPointerType m_buffer;
-};
-
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorForwardDeclarations.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorForwardDeclarations.h
deleted file mode 100644
index 246ebe4..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorForwardDeclarations.h
+++ /dev/null
@@ -1,191 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H
-#define EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H
-
-namespace Eigen {
-
-// The MakePointer class is a container for the address space of a pointer on
-// the host and on the device. On the host side it yields the plain T* pointer;
-// when EIGEN_USE_SYCL is defined it constructs a buffer with a map_allocator
-// backing T* m_data on the host. On the device it is always a raw pointer.
-// MakePointer is specialised to create the sycl buffer with map_allocator.
-template<typename T> struct MakePointer {
- typedef T* Type;
- typedef const T* ConstType;
-};
-
-template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T* constCast(const T* data) {
- return const_cast<T*>(data);
-}
-
-// The StorageMemory class is a container for the device-specific pointer
-// used to refer to a pointer in the TensorEvaluator class. While a tensor
-// expression is a device-agnostic type that needs the MakePointer class for
-// type conversion, the TensorEvaluator class can be specialized for a device,
-// so it is possible to construct different types of temporary storage memory
-// in TensorEvaluator for different devices by specializing the following
-// StorageMemory class.
-template<typename T, typename device> struct StorageMemory: MakePointer <T> {};
-
-namespace internal{
-template<typename A, typename B> struct Pointer_type_promotion {
- static const bool val=false;
-};
-template<typename A> struct Pointer_type_promotion<A, A> {
- static const bool val = true;
-};
-template<typename A, typename B> struct TypeConversion {
- typedef A* type;
-};
-}
-
-
-template<typename PlainObjectType, int Options_ = Unaligned, template <class> class MakePointer_ = MakePointer> class TensorMap;
-template<typename Scalar_, int NumIndices_, int Options_ = 0, typename IndexType = DenseIndex> class Tensor;
-template<typename Scalar_, typename Dimensions, int Options_ = 0, typename IndexType = DenseIndex> class TensorFixedSize;
-template<typename PlainObjectType> class TensorRef;
-template<typename Derived, int AccessLevel> class TensorBase;
-
-template<typename NullaryOp, typename PlainObjectType> class TensorCwiseNullaryOp;
-template<typename UnaryOp, typename XprType> class TensorCwiseUnaryOp;
-template<typename BinaryOp, typename LeftXprType, typename RightXprType> class TensorCwiseBinaryOp;
-template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType> class TensorCwiseTernaryOp;
-template<typename IfXprType, typename ThenXprType, typename ElseXprType> class TensorSelectOp;
-template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_ = MakePointer > class TensorReductionOp;
-template<typename XprType> class TensorIndexTupleOp;
-template<typename ReduceOp, typename Dims, typename XprType> class TensorTupleReducerOp;
-template<typename Axis, typename LeftXprType, typename RightXprType> class TensorConcatenationOp;
-template<typename Dimensions, typename LeftXprType, typename RightXprType, typename OutputKernelType> class TensorContractionOp;
-template<typename TargetType, typename XprType> class TensorConversionOp;
-template<typename Dimensions, typename InputXprType, typename KernelXprType> class TensorConvolutionOp;
-template<typename FFT, typename XprType, int FFTDataType, int FFTDirection> class TensorFFTOp;
-template<typename PatchDim, typename XprType> class TensorPatchOp;
-template<DenseIndex Rows, DenseIndex Cols, typename XprType> class TensorImagePatchOp;
-template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType> class TensorVolumePatchOp;
-template<typename Broadcast, typename XprType> class TensorBroadcastingOp;
-template<DenseIndex DimId, typename XprType> class TensorChippingOp;
-template<typename NewDimensions, typename XprType> class TensorReshapingOp;
-template<typename XprType> class TensorLayoutSwapOp;
-template<typename StartIndices, typename Sizes, typename XprType> class TensorSlicingOp;
-template<typename ReverseDimensions, typename XprType> class TensorReverseOp;
-template<typename PaddingDimensions, typename XprType> class TensorPaddingOp;
-template<typename Shuffle, typename XprType> class TensorShufflingOp;
-template<typename Strides, typename XprType> class TensorStridingOp;
-template<typename StartIndices, typename StopIndices, typename Strides, typename XprType> class TensorStridingSlicingOp;
-template<typename Strides, typename XprType> class TensorInflationOp;
-template<typename Generator, typename XprType> class TensorGeneratorOp;
-template<typename LeftXprType, typename RightXprType> class TensorAssignOp;
-template<typename Op, typename XprType> class TensorScanOp;
-template<typename Dims, typename XprType> class TensorTraceOp;
-
-template<typename CustomUnaryFunc, typename XprType> class TensorCustomUnaryOp;
-template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> class TensorCustomBinaryOp;
-
-template<typename XprType, template <class> class MakePointer_ = MakePointer> class TensorEvalToOp;
-template<typename XprType> class TensorForcedEvalOp;
-
-template<typename ExpressionType, typename DeviceType> class TensorDevice;
-template<typename ExpressionType, typename DeviceType, typename DoneCallback> class TensorAsyncDevice;
-template<typename Derived, typename Device> struct TensorEvaluator;
-
-struct NoOpOutputKernel;
-
-struct DefaultDevice;
-struct ThreadPoolDevice;
-struct GpuDevice;
-struct SyclDevice;
-
-#ifdef EIGEN_USE_SYCL
-
-template <typename T> struct MakeSYCLPointer {
- typedef Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T> Type;
-};
-
-template <typename T>
-EIGEN_STRONG_INLINE const Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T>&
-constCast(const Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T>& data) {
- return data;
-}
-
-template <typename T>
-struct StorageMemory<T, SyclDevice> : MakeSYCLPointer<T> {};
-template <typename T>
-struct StorageMemory<T, const SyclDevice> : StorageMemory<T, SyclDevice> {};
-
-namespace TensorSycl {
-namespace internal{
-template <typename Evaluator, typename Op> class GenericNondeterministicReducer;
-}
-}
-#endif
-
-
-enum FFTResultType {
- RealPart = 0,
- ImagPart = 1,
- BothParts = 2
-};
-
-enum FFTDirection {
- FFT_FORWARD = 0,
- FFT_REVERSE = 1
-};
-
-
-namespace internal {
-
-template <typename Device, typename Expression>
-struct IsVectorizable {
- static const bool value = TensorEvaluator<Expression, Device>::PacketAccess;
-};
-
-template <typename Expression>
-struct IsVectorizable<GpuDevice, Expression> {
- static const bool value = TensorEvaluator<Expression, GpuDevice>::PacketAccess &&
- TensorEvaluator<Expression, GpuDevice>::IsAligned;
-};
-
-// Tiled evaluation strategy.
-enum TiledEvaluation {
- Off = 0, // tiled evaluation is not supported
-  On = 1,   // still a work in progress (see TensorBlock.h)
-};
-
-template <typename Device, typename Expression>
-struct IsTileable {
-  // Check that block evaluation is supported, and that it is the preferred
-  // option (at least one sub-expression has a much faster block evaluation,
-  // e.g. broadcasting).
- static const bool BlockAccess =
- TensorEvaluator<Expression, Device>::BlockAccess &&
- TensorEvaluator<Expression, Device>::PreferBlockAccess;
-
- static const TiledEvaluation value =
- BlockAccess ? TiledEvaluation::On : TiledEvaluation::Off;
-};
-
-template <typename Expression, typename Device,
- bool Vectorizable = IsVectorizable<Device, Expression>::value,
- TiledEvaluation Tiling = IsTileable<Device, Expression>::value>
-class TensorExecutor;
-
-template <typename Expression, typename Device, typename DoneCallback,
- bool Vectorizable = IsVectorizable<Device, Expression>::value,
- TiledEvaluation Tiling = IsTileable<Device, Expression>::value>
-class TensorAsyncExecutor;
-
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H
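
The IsVectorizable and IsTileable traits above supply the default template
arguments of TensorExecutor, so an executor specialization is picked at
compile time from the evaluator's capabilities. A simplified, self-contained
sketch of the same dispatch pattern (names are illustrative, not Eigen's):

    #include <iostream>

    struct Cpu {};
    struct Gpu {};

    // Capability trait, specialized per device (mirrors IsVectorizable).
    template <typename Device>
    struct SupportsPackets { static const bool value = true; };
    template <>
    struct SupportsPackets<Gpu> { static const bool value = false; };

    // Executor chosen through a defaulted template parameter
    // (mirrors how TensorExecutor consumes the traits).
    template <typename Device, bool Vectorized = SupportsPackets<Device>::value>
    struct Executor {
      static void run() { std::cout << "scalar path\n"; }
    };
    template <typename Device>
    struct Executor<Device, true> {
      static void run() { std::cout << "vectorized path\n"; }
    };

    int main() {
      Executor<Cpu>::run();  // vectorized path
      Executor<Gpu>::run();  // scalar path
    }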
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorFunctors.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorFunctors.h
deleted file mode 100644
index d963032..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorFunctors.h
+++ /dev/null
@@ -1,488 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H
-#define EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H
-
-namespace Eigen {
-namespace internal {
-
-
-/** \internal
- * \brief Template functor to compute the modulo between an array and a scalar.
- */
-template <typename Scalar>
-struct scalar_mod_op {
- EIGEN_DEVICE_FUNC scalar_mod_op(const Scalar& divisor) : m_divisor(divisor) {}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a % m_divisor; }
- const Scalar m_divisor;
-};
-template <typename Scalar>
-struct functor_traits<scalar_mod_op<Scalar> >
-{ enum { Cost = scalar_div_cost<Scalar,false>::value, PacketAccess = false }; };
-
-
-/** \internal
- * \brief Template functor to compute the modulo between 2 arrays.
- */
-template <typename Scalar>
-struct scalar_mod2_op {
- EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op)
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; }
-};
-template <typename Scalar>
-struct functor_traits<scalar_mod2_op<Scalar> >
-{ enum { Cost = scalar_div_cost<Scalar,false>::value, PacketAccess = false }; };
-
-template <typename Scalar>
-struct scalar_fmod_op {
- EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op)
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar
- operator()(const Scalar& a, const Scalar& b) const {
- return numext::fmod(a, b);
- }
-};
-template <typename Scalar>
-struct functor_traits<scalar_fmod_op<Scalar> > {
- enum { Cost = 13, // Reciprocal throughput of FPREM on Haswell.
- PacketAccess = false };
-};
-
-template<typename Reducer, typename Device>
-struct reducer_traits {
- enum {
- Cost = 1,
- PacketAccess = false,
- IsStateful = false,
- IsExactlyAssociative = true
- };
-};
-
-// Standard reduction functors
-template <typename T> struct SumReducer
-{
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
- internal::scalar_sum_op<T> sum_op;
- *accum = sum_op(*accum, t);
- }
- template <typename Packet>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
- (*accum) = padd<Packet>(*accum, p);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
- internal::scalar_cast_op<int, T> conv;
- return conv(0);
- }
- template <typename Packet>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
- return pset1<Packet>(initialize());
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
- return accum;
- }
- template <typename Packet>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
- return vaccum;
- }
- template <typename Packet>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
- internal::scalar_sum_op<T> sum_op;
- return sum_op(saccum, predux(vaccum));
- }
-};
-
-template <typename T, typename Device>
-struct reducer_traits<SumReducer<T>, Device> {
- enum {
- Cost = NumTraits<T>::AddCost,
- PacketAccess = PacketType<T, Device>::HasAdd,
- IsStateful = false,
- IsExactlyAssociative = NumTraits<T>::IsInteger
- };
-};
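
Every reducer follows the same protocol: initialize() yields the identity,
reduce()/reducePacket() fold coefficients into scalar and packet accumulators,
and finalize()/finalizeBoth() merge them into the final result. The public
reduction API dispatches to these reducers; a minimal sketch using the sum
reduction:

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <iostream>

    int main() {
      Eigen::Tensor<float, 2> t(3, 4);
      t.setConstant(1.0f);
      // Reduce over dimension 0; sum() uses SumReducer under the hood.
      Eigen::array<int, 1> dims = {0};
      Eigen::Tensor<float, 1> col_sums = t.sum(dims);
      std::cout << col_sums << "\n";  // four entries, each equal to 3
    }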
-
-template <typename T> struct MeanReducer
-{
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- MeanReducer() : scalarCount_(0), packetCount_(0) { }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) {
- internal::scalar_sum_op<T> sum_op;
- *accum = sum_op(*accum, t);
- scalarCount_++;
- }
- template <typename Packet>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) {
- (*accum) = padd<Packet>(*accum, p);
- packetCount_++;
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
- internal::scalar_cast_op<int, T> conv;
- return conv(0);
- }
- template <typename Packet>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
- return pset1<Packet>(initialize());
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
- internal::scalar_quotient_op<T> quotient_op;
- return quotient_op(accum, T(scalarCount_));
- }
- template <typename Packet>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
- return pdiv(vaccum, pset1<Packet>(T(packetCount_)));
- }
- template <typename Packet>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
- internal::scalar_sum_op<T> sum_op;
- internal::scalar_quotient_op<T> quotient_op;
- return quotient_op(
- sum_op(saccum, predux(vaccum)),
- T(scalarCount_ + packetCount_ * unpacket_traits<Packet>::size));
- }
-
- protected:
- DenseIndex scalarCount_;
- DenseIndex packetCount_;
-};
-
-template <typename T, typename Device>
-struct reducer_traits<MeanReducer<T>, Device> {
- enum {
- Cost = NumTraits<T>::AddCost,
- PacketAccess = PacketType<T, Device>::HasAdd &&
- PacketType<T, Device>::HasDiv && !NumTraits<T>::IsInteger,
- IsStateful = true,
- IsExactlyAssociative = NumTraits<T>::IsInteger
- };
-};
-
-
-template <typename T, bool IsMax = true, bool IsInteger = true>
-struct MinMaxBottomValue {
- EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
- return Eigen::NumTraits<T>::lowest();
- }
-};
-template <typename T>
-struct MinMaxBottomValue<T, true, false> {
- EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
- return -Eigen::NumTraits<T>::infinity();
- }
-};
-template <typename T>
-struct MinMaxBottomValue<T, false, true> {
- EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
- return Eigen::NumTraits<T>::highest();
- }
-};
-template <typename T>
-struct MinMaxBottomValue<T, false, false> {
- EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
- return Eigen::NumTraits<T>::infinity();
- }
-};
-
-
-template <typename T, int NaNPropagation=PropagateFast> struct MaxReducer
-{
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
- scalar_max_op<T, T, NaNPropagation> op;
- *accum = op(t, *accum);
- }
- template <typename Packet>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
- scalar_max_op<T, T, NaNPropagation> op;
- (*accum) = op.packetOp(*accum, p);
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
- return MinMaxBottomValue<T, /*IsMax=*/true, Eigen::NumTraits<T>::IsInteger>::bottom_value();
- }
- template <typename Packet>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
- return pset1<Packet>(initialize());
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
- return accum;
- }
- template <typename Packet>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
- return vaccum;
- }
- template <typename Packet>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
- scalar_max_op<T, T, NaNPropagation> op;
- return op(saccum, op.predux(vaccum));
- }
-};
-
-template <typename T, typename Device, int NaNPropagation>
- struct reducer_traits<MaxReducer<T, NaNPropagation>, Device> {
- enum {
- Cost = NumTraits<T>::AddCost,
- PacketAccess = PacketType<T, Device>::HasMax,
- IsStateful = false,
- IsExactlyAssociative = (NaNPropagation!=PropagateFast)
- };
-};
-
-template <typename T, int NaNPropagation=PropagateFast> struct MinReducer
-{
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
- scalar_min_op<T, T, NaNPropagation> op;
- *accum = op(t, *accum);
- }
- template <typename Packet>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
- scalar_min_op<T, T, NaNPropagation> op;
- (*accum) = op.packetOp(*accum, p);
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
- return MinMaxBottomValue<T, /*IsMax=*/false, Eigen::NumTraits<T>::IsInteger>::bottom_value();
- }
- template <typename Packet>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
- return pset1<Packet>(initialize());
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
- return accum;
- }
- template <typename Packet>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
- return vaccum;
- }
- template <typename Packet>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
- scalar_min_op<T, T, NaNPropagation> op;
- return op(saccum, op.predux(vaccum));
- }
-};
-
-template <typename T, typename Device, int NaNPropagation>
- struct reducer_traits<MinReducer<T, NaNPropagation>, Device> {
- enum {
- Cost = NumTraits<T>::AddCost,
- PacketAccess = PacketType<T, Device>::HasMin,
- IsStateful = false,
- IsExactlyAssociative = (NaNPropagation!=PropagateFast)
- };
-};
-
-template <typename T> struct ProdReducer
-{
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
- internal::scalar_product_op<T> prod_op;
- (*accum) = prod_op(*accum, t);
- }
- template <typename Packet>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
- (*accum) = pmul<Packet>(*accum, p);
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
- internal::scalar_cast_op<int, T> conv;
- return conv(1);
- }
- template <typename Packet>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
- return pset1<Packet>(initialize());
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
- return accum;
- }
- template <typename Packet>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
- return vaccum;
- }
- template <typename Packet>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
- internal::scalar_product_op<T> prod_op;
- return prod_op(saccum, predux_mul(vaccum));
- }
-};
-
-template <typename T, typename Device>
-struct reducer_traits<ProdReducer<T>, Device> {
- enum {
- Cost = NumTraits<T>::MulCost,
- PacketAccess = PacketType<T, Device>::HasMul,
- IsStateful = false,
- IsExactlyAssociative = true
- };
-};
-
-
-struct AndReducer
-{
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const {
- *accum = *accum && t;
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool initialize() const {
- return true;
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool finalize(bool accum) const {
- return accum;
- }
-};
-
-template <typename Device>
-struct reducer_traits<AndReducer, Device> {
- enum {
- Cost = 1,
- PacketAccess = false,
- IsStateful = false,
- IsExactlyAssociative = true
- };
-};
-
-
-struct OrReducer {
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const {
- *accum = *accum || t;
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool initialize() const {
- return false;
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool finalize(bool accum) const {
- return accum;
- }
-};
-
-template <typename Device>
-struct reducer_traits<OrReducer, Device> {
- enum {
- Cost = 1,
- PacketAccess = false,
- IsStateful = false,
- IsExactlyAssociative = true
- };
-};
-
-// Argmin/Argmax reducers. Returns the first occurrence if multiple locations
-// contain the same min/max value.
-template <typename T> struct ArgMaxTupleReducer
-{
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
- if (t.second < accum->second) {
- return;
- } else if (t.second > accum->second || accum->first > t.first ) {
- *accum = t;
- }
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
- return T(0, NumTraits<typename T::second_type>::lowest());
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T& accum) const {
- return accum;
- }
-};
-
-template <typename T, typename Device>
-struct reducer_traits<ArgMaxTupleReducer<T>, Device> {
- enum {
- Cost = NumTraits<T>::AddCost,
- PacketAccess = false,
- IsStateful = false,
- IsExactlyAssociative = true
- };
-};
-
-
-template <typename T> struct ArgMinTupleReducer
-{
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T& t, T* accum) const {
- if (t.second > accum->second) {
- return;
- } else if (t.second < accum->second || accum->first > t.first) {
- *accum = t;
- }
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
- return T(0, NumTraits<typename T::second_type>::highest());
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T& accum) const {
- return accum;
- }
-};
-
-template <typename T, typename Device>
-struct reducer_traits<ArgMinTupleReducer<T>, Device> {
- enum {
- Cost = NumTraits<T>::AddCost,
- PacketAccess = false,
- IsStateful = false,
- IsExactlyAssociative = true
- };
-};
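
These tuple reducers carry (index, value) pairs so that a reduction can report
where the extremum occurred; they back the public argmax/argmin API. A minimal
sketch:

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <iostream>

    int main() {
      Eigen::Tensor<float, 2> t(2, 3);
      t.setValues({{1.f, 9.f, 2.f},
                   {4.f, 0.f, 5.f}});
      // Index of the maximum along dimension 1, one entry per row.
      Eigen::Tensor<Eigen::DenseIndex, 1> am = t.argmax(1);
      std::cout << am << "\n";  // indices 1 and 2
    }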
-
-
-template <typename T, typename Index, size_t NumDims>
-class GaussianGenerator {
- public:
- static const bool PacketAccess = false;
-
- EIGEN_DEVICE_FUNC GaussianGenerator(const array<T, NumDims>& means,
- const array<T, NumDims>& std_devs)
- : m_means(means)
- {
- EIGEN_UNROLL_LOOP
- for (size_t i = 0; i < NumDims; ++i) {
- m_two_sigmas[i] = std_devs[i] * std_devs[i] * 2;
- }
- }
-
- EIGEN_DEVICE_FUNC T operator()(const array<Index, NumDims>& coordinates) const {
- T tmp = T(0);
- EIGEN_UNROLL_LOOP
- for (size_t i = 0; i < NumDims; ++i) {
- T offset = coordinates[i] - m_means[i];
- tmp += offset * offset / m_two_sigmas[i];
- }
- return numext::exp(-tmp);
- }
-
- private:
- array<T, NumDims> m_means;
- array<T, NumDims> m_two_sigmas;
-};
-
-template <typename T, typename Index, size_t NumDims>
-struct functor_traits<GaussianGenerator<T, Index, NumDims> > {
- enum {
- Cost = NumDims * (2 * NumTraits<T>::AddCost + NumTraits<T>::MulCost +
- functor_traits<scalar_quotient_op<T, T> >::Cost) +
- functor_traits<scalar_exp_op<T> >::Cost,
- PacketAccess = GaussianGenerator<T, Index, NumDims>::PacketAccess
- };
-};
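
GaussianGenerator is consumed through the generate() API, which invokes the
functor with the coordinates of every output coefficient. Any callable of the
same shape works; a minimal sketch with a hypothetical custom generator:

    #include <unsupported/Eigen/CXX11/Tensor>

    // Hypothetical generator: value = row * 10 + column.
    struct CoordGenerator {
      float operator()(const Eigen::array<Eigen::DenseIndex, 2>& coords) const {
        return static_cast<float>(coords[0] * 10 + coords[1]);
      }
    };

    void generator_example() {
      Eigen::Tensor<float, 2> t(3, 4);
      // t only supplies the shape; every coefficient of g comes from the functor.
      Eigen::Tensor<float, 2> g = t.generate(CoordGenerator());
    }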
-
-template <typename Scalar>
-struct scalar_clamp_op {
- EIGEN_DEVICE_FUNC inline scalar_clamp_op(const Scalar& _min, const Scalar& _max) : m_min(_min), m_max(_max) {}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar
- operator()(const Scalar& x) const {
- return numext::mini(numext::maxi(x, m_min), m_max);
- }
- template <typename Packet>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet
- packetOp(const Packet& x) const {
- return internal::pmin(internal::pmax(x, pset1<Packet>(m_min)), pset1<Packet>(m_max));
- }
- const Scalar m_min;
- const Scalar m_max;
-};
-template<typename Scalar>
-struct functor_traits<scalar_clamp_op<Scalar> >
-{ enum { Cost = 2 * NumTraits<Scalar>::AddCost, PacketAccess = (packet_traits<Scalar>::HasMin && packet_traits<Scalar>::HasMax)}; };
-
-} // end namespace internal
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H
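
scalar_clamp_op above simply composes a max with a min; the same effect is
available from the public tensor API by chaining cwiseMax and cwiseMin. A
minimal sketch:

    #include <unsupported/Eigen/CXX11/Tensor>

    void clamp_example() {
      Eigen::Tensor<float, 1> t(5);
      t.setValues({-2.f, -0.5f, 0.f, 0.5f, 2.f});
      // Clamp every coefficient into [-1, 1].
      Eigen::Tensor<float, 1> clamped = t.cwiseMax(-1.f).cwiseMin(1.f);
    }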
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorGenerator.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorGenerator.h
deleted file mode 100644
index 174bf06..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorGenerator.h
+++ /dev/null
@@ -1,302 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H
-#define EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H
-
-namespace Eigen {
-
-/** \class TensorGeneratorOp
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Tensor generator class.
- *
- * Evaluates a user-supplied generator functor at the coordinates of every
- * coefficient of the result tensor.
- */
-namespace internal {
-template<typename Generator, typename XprType>
-struct traits<TensorGeneratorOp<Generator, XprType> > : public traits<XprType>
-{
- typedef typename XprType::Scalar Scalar;
- typedef traits<XprType> XprTraits;
- typedef typename XprTraits::StorageKind StorageKind;
- typedef typename XprTraits::Index Index;
- typedef typename XprType::Nested Nested;
- typedef typename remove_reference<Nested>::type _Nested;
- static const int NumDimensions = XprTraits::NumDimensions;
- static const int Layout = XprTraits::Layout;
- typedef typename XprTraits::PointerType PointerType;
-};
-
-template<typename Generator, typename XprType>
-struct eval<TensorGeneratorOp<Generator, XprType>, Eigen::Dense>
-{
- typedef const TensorGeneratorOp<Generator, XprType>& type;
-};
-
-template<typename Generator, typename XprType>
-struct nested<TensorGeneratorOp<Generator, XprType>, 1, typename eval<TensorGeneratorOp<Generator, XprType> >::type>
-{
- typedef TensorGeneratorOp<Generator, XprType> type;
-};
-
-} // end namespace internal
-
-
-
-template<typename Generator, typename XprType>
-class TensorGeneratorOp : public TensorBase<TensorGeneratorOp<Generator, XprType>, ReadOnlyAccessors>
-{
- public:
- typedef typename Eigen::internal::traits<TensorGeneratorOp>::Scalar Scalar;
- typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename Eigen::internal::nested<TensorGeneratorOp>::type Nested;
- typedef typename Eigen::internal::traits<TensorGeneratorOp>::StorageKind StorageKind;
- typedef typename Eigen::internal::traits<TensorGeneratorOp>::Index Index;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorGeneratorOp(const XprType& expr, const Generator& generator)
- : m_xpr(expr), m_generator(generator) {}
-
- EIGEN_DEVICE_FUNC
- const Generator& generator() const { return m_generator; }
-
- EIGEN_DEVICE_FUNC
- const typename internal::remove_all<typename XprType::Nested>::type&
- expression() const { return m_xpr; }
-
- protected:
- typename XprType::Nested m_xpr;
- const Generator m_generator;
-};
-
-
-// Eval as rvalue
-template<typename Generator, typename ArgType, typename Device>
-struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
-{
- typedef TensorGeneratorOp<Generator, ArgType> XprType;
- typedef typename XprType::Index Index;
- typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
- static const int NumDims = internal::array_size<Dimensions>::value;
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- typedef StorageMemory<CoeffReturnType, Device> Storage;
- typedef typename Storage::Type EvaluatorPointerType;
- enum {
- IsAligned = false,
- PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
- BlockAccess = true,
- PreferBlockAccess = true,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = false
- };
-
- typedef internal::TensorIntDivisor<Index> IndexDivisor;
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
- typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
-
- typedef typename internal::TensorMaterializedBlock<CoeffReturnType, NumDims,
- Layout, Index>
- TensorBlock;
- //===--------------------------------------------------------------------===//
-
- EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_device(device), m_generator(op.generator())
- {
- TensorEvaluator<ArgType, Device> argImpl(op.expression(), device);
- m_dimensions = argImpl.dimensions();
-
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- m_strides[0] = 1;
- EIGEN_UNROLL_LOOP
- for (int i = 1; i < NumDims; ++i) {
- m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1];
- if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]);
- }
- } else {
- m_strides[NumDims - 1] = 1;
- EIGEN_UNROLL_LOOP
- for (int i = NumDims - 2; i >= 0; --i) {
- m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1];
- if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]);
- }
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
-
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
- return true;
- }
- EIGEN_STRONG_INLINE void cleanup() {
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
- {
- array<Index, NumDims> coords;
- extract_coordinates(index, coords);
- return m_generator(coords);
- }
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
- {
- const int packetSize = PacketType<CoeffReturnType, Device>::size;
- EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+packetSize-1 < dimensions().TotalSize());
-
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
- for (int i = 0; i < packetSize; ++i) {
- values[i] = coeff(index+i);
- }
- PacketReturnType rslt = internal::pload<PacketReturnType>(values);
- return rslt;
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- internal::TensorBlockResourceRequirements getResourceRequirements() const {
- const size_t target_size = m_device.firstLevelCacheSize();
- // TODO(ezhulenev): Generator should have a cost.
- return internal::TensorBlockResourceRequirements::skewed<Scalar>(
- target_size);
- }
-
- struct BlockIteratorState {
- Index stride;
- Index span;
- Index size;
- Index count;
- };
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
- block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
- bool /*root_of_expr_ast*/ = false) const {
- static const bool is_col_major =
- static_cast<int>(Layout) == static_cast<int>(ColMajor);
-
- // Compute spatial coordinates for the first block element.
- array<Index, NumDims> coords;
- extract_coordinates(desc.offset(), coords);
- array<Index, NumDims> initial_coords = coords;
-
- // Offset in the output block buffer.
- Index offset = 0;
-
-    // Initialize output block iterator state. Dimensions in this array are
-    // always in innermost -> outermost order (col-major layout).
- array<BlockIteratorState, NumDims> it;
- for (int i = 0; i < NumDims; ++i) {
- const int dim = is_col_major ? i : NumDims - 1 - i;
- it[i].size = desc.dimension(dim);
- it[i].stride = i == 0 ? 1 : (it[i - 1].size * it[i - 1].stride);
- it[i].span = it[i].stride * (it[i].size - 1);
- it[i].count = 0;
- }
- eigen_assert(it[0].stride == 1);
-
- // Prepare storage for the materialized generator result.
- const typename TensorBlock::Storage block_storage =
- TensorBlock::prepareStorage(desc, scratch);
-
- CoeffReturnType* block_buffer = block_storage.data();
-
- static const int packet_size = PacketType<CoeffReturnType, Device>::size;
-
- static const int inner_dim = is_col_major ? 0 : NumDims - 1;
- const Index inner_dim_size = it[0].size;
- const Index inner_dim_vectorized = inner_dim_size - packet_size;
-
- while (it[NumDims - 1].count < it[NumDims - 1].size) {
- Index i = 0;
- // Generate data for the vectorized part of the inner-most dimension.
- for (; i <= inner_dim_vectorized; i += packet_size) {
- for (Index j = 0; j < packet_size; ++j) {
- array<Index, NumDims> j_coords = coords; // Break loop dependence.
- j_coords[inner_dim] += j;
- *(block_buffer + offset + i + j) = m_generator(j_coords);
- }
- coords[inner_dim] += packet_size;
- }
- // Finalize non-vectorized part of the inner-most dimension.
- for (; i < inner_dim_size; ++i) {
- *(block_buffer + offset + i) = m_generator(coords);
- coords[inner_dim]++;
- }
- coords[inner_dim] = initial_coords[inner_dim];
-
-      // For a 1d tensor we only need to generate the single inner-most dimension.
- if (NumDims == 1) break;
-
- // Update offset.
- for (i = 1; i < NumDims; ++i) {
- if (++it[i].count < it[i].size) {
- offset += it[i].stride;
- coords[is_col_major ? i : NumDims - 1 - i]++;
- break;
- }
- if (i != NumDims - 1) it[i].count = 0;
- coords[is_col_major ? i : NumDims - 1 - i] =
- initial_coords[is_col_major ? i : NumDims - 1 - i];
- offset -= it[i].span;
- }
- }
-
- return block_storage.AsTensorMaterializedBlock();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
- costPerCoeff(bool) const {
- // TODO(rmlarsen): This is just a placeholder. Define interface to make
- // generators return their cost.
- return TensorOpCost(0, 0, TensorOpCost::AddCost<Scalar>() +
- TensorOpCost::MulCost<Scalar>());
- }
-
- EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
-
-#ifdef EIGEN_USE_SYCL
- // binding placeholder accessors to a command group handler for SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler&) const {}
-#endif
-
- protected:
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- void extract_coordinates(Index index, array<Index, NumDims>& coords) const {
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- for (int i = NumDims - 1; i > 0; --i) {
- const Index idx = index / m_fast_strides[i];
- index -= idx * m_strides[i];
- coords[i] = idx;
- }
- coords[0] = index;
- } else {
- for (int i = 0; i < NumDims - 1; ++i) {
- const Index idx = index / m_fast_strides[i];
- index -= idx * m_strides[i];
- coords[i] = idx;
- }
- coords[NumDims-1] = index;
- }
- }
-
- const Device EIGEN_DEVICE_REF m_device;
- Dimensions m_dimensions;
- array<Index, NumDims> m_strides;
- array<IndexDivisor, NumDims> m_fast_strides;
- Generator m_generator;
-};
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H
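
extract_coordinates() above recovers multi-dimensional coordinates from a flat
index by repeated division with precomputed strides (sped up by
TensorIntDivisor). The same arithmetic in a standalone sketch for a
column-major 3-d shape:

    #include <array>
    #include <cassert>

    // Column-major flat index -> coordinates for a 3-d shape.
    std::array<long, 3> extract_coords(long index, const std::array<long, 3>& dims) {
      std::array<long, 3> strides, coords;
      strides[0] = 1;
      for (int i = 1; i < 3; ++i) strides[i] = strides[i - 1] * dims[i - 1];
      for (int i = 2; i > 0; --i) {  // peel off the outermost dimensions first
        coords[i] = index / strides[i];
        index -= coords[i] * strides[i];
      }
      coords[0] = index;
      return coords;
    }

    int main() {
      // Shape (2, 3, 4), strides (1, 2, 6): index 17 = 1*1 + 2*2 + 2*6.
      std::array<long, 3> c = extract_coords(17, {2, 3, 4});
      assert(c[0] == 1 && c[1] == 2 && c[2] == 2);
    }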
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorGlobalFunctions.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorGlobalFunctions.h
deleted file mode 100644
index 665b861..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorGlobalFunctions.h
+++ /dev/null
@@ -1,33 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H
-#define EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H
-
-namespace Eigen {
-
-/** \cpp11 \returns an expression of the coefficient-wise betainc(\a x, \a a, \a b) applied to the given tensors.
- *
- * This function computes the regularized incomplete beta function (integral).
- *
- */
-template <typename ADerived, typename BDerived, typename XDerived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const
- TensorCwiseTernaryOp<internal::scalar_betainc_op<typename XDerived::Scalar>,
- const ADerived, const BDerived, const XDerived>
- betainc(const ADerived& a, const BDerived& b, const XDerived& x) {
- return TensorCwiseTernaryOp<
- internal::scalar_betainc_op<typename XDerived::Scalar>, const ADerived,
- const BDerived, const XDerived>(
- a, b, x, internal::scalar_betainc_op<typename XDerived::Scalar>());
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H
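
A minimal usage sketch of the global betainc() (the Tensor header pulls in the
SpecialFunctions module that provides the underlying scalar op):

    #include <unsupported/Eigen/CXX11/Tensor>

    void betainc_example() {
      Eigen::Tensor<float, 1> a(3), b(3), x(3);
      a.setConstant(2.f);
      b.setConstant(5.f);
      x.setValues({0.1f, 0.5f, 0.9f});
      // Regularized incomplete beta function, evaluated coefficient-wise.
      Eigen::Tensor<float, 1> r = Eigen::betainc(a, b, x);
    }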
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorGpuHipCudaDefines.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
deleted file mode 100644
index cb53ce2..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
+++ /dev/null
@@ -1,99 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-// Copyright (C) 2018 Deven Desai <deven.desai.amd@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H)
-#define EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
-
-// Note that we use EIGEN_USE_HIP here instead of EIGEN_HIPCC; this is by design.
-// There is code in the TensorFlow codebase that defines EIGEN_USE_GPU but, for
-// some reason, gets sent to the gcc/host compiler instead of the gpu/nvcc/hipcc
-// compiler. When compiling such files, gcc ends up trying to pick up the CUDA
-// headers by default (see the code within "unsupported/Eigen/CXX11/Tensor" that
-// is guarded by EIGEN_USE_GPU). This obviously does not work when compiling
-// TensorFlow on a system with no CUDA. To work around this issue for HIP systems
-// (and leave the default behaviour intact), the HIP TensorFlow build defines
-// EIGEN_USE_HIP when compiling all source files, and "unsupported/Eigen/CXX11/Tensor"
-// has been updated to use the HIP header when EIGEN_USE_HIP is defined. Hence
-// the guard here needs to be EIGEN_USE_HIP as well.
-
-#if defined(EIGEN_USE_HIP)
-
-#define gpuStream_t hipStream_t
-#define gpuDeviceProp_t hipDeviceProp_t
-#define gpuError_t hipError_t
-#define gpuSuccess hipSuccess
-#define gpuErrorNotReady hipErrorNotReady
-#define gpuGetDeviceCount hipGetDeviceCount
-#define gpuGetLastError hipGetLastError
-#define gpuPeekAtLastError hipPeekAtLastError
-#define gpuGetErrorName hipGetErrorName
-#define gpuGetErrorString hipGetErrorString
-#define gpuGetDeviceProperties hipGetDeviceProperties
-#define gpuStreamDefault hipStreamDefault
-#define gpuGetDevice hipGetDevice
-#define gpuSetDevice hipSetDevice
-#define gpuMalloc hipMalloc
-#define gpuFree hipFree
-#define gpuMemsetAsync hipMemsetAsync
-#define gpuMemcpyAsync hipMemcpyAsync
-#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
-#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
-#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
-#define gpuStreamQuery hipStreamQuery
-#define gpuSharedMemConfig hipSharedMemConfig
-#define gpuDeviceSetSharedMemConfig hipDeviceSetSharedMemConfig
-#define gpuStreamSynchronize hipStreamSynchronize
-#define gpuDeviceSynchronize hipDeviceSynchronize
-#define gpuMemcpy hipMemcpy
-
-#else
-
-#define gpuStream_t cudaStream_t
-#define gpuDeviceProp_t cudaDeviceProp
-#define gpuError_t cudaError_t
-#define gpuSuccess cudaSuccess
-#define gpuErrorNotReady cudaErrorNotReady
-#define gpuGetDeviceCount cudaGetDeviceCount
-#define gpuGetLastError cudaGetLastError
-#define gpuPeekAtLastError cudaPeekAtLastError
-#define gpuGetErrorName cudaGetErrorName
-#define gpuGetErrorString cudaGetErrorString
-#define gpuGetDeviceProperties cudaGetDeviceProperties
-#define gpuStreamDefault cudaStreamDefault
-#define gpuGetDevice cudaGetDevice
-#define gpuSetDevice cudaSetDevice
-#define gpuMalloc cudaMalloc
-#define gpuFree cudaFree
-#define gpuMemsetAsync cudaMemsetAsync
-#define gpuMemcpyAsync cudaMemcpyAsync
-#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
-#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
-#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
-#define gpuStreamQuery cudaStreamQuery
-#define gpuSharedMemConfig cudaSharedMemConfig
-#define gpuDeviceSetSharedMemConfig cudaDeviceSetSharedMemConfig
-#define gpuStreamSynchronize cudaStreamSynchronize
-#define gpuDeviceSynchronize cudaDeviceSynchronize
-#define gpuMemcpy cudaMemcpy
-
-#endif
-
-// gpu_assert can be overridden
-#ifndef gpu_assert
-
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-// HIPCC does not support the use of assert on the GPU side.
-#define gpu_assert(COND)
-#else
-#define gpu_assert(COND) assert(COND)
-#endif
-
-#endif // gpu_assert
-
-#endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
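
With these aliases, device-management code can be written once and compiled
against either runtime; which set is active depends only on EIGEN_USE_HIP. A
hedged sketch, assuming this header is in effect and the corresponding
CUDA/HIP runtime headers are available:

    #include <cstddef>

    // Allocate, zero, and free a device buffer through the gpu* aliases; each
    // call expands to its cuda*/hip* counterpart.
    void gpu_scratch_example(std::size_t bytes) {
      void* ptr = nullptr;
      gpuError_t err = gpuMalloc(&ptr, bytes);
      if (err != gpuSuccess) return;
      gpuMemsetAsync(ptr, 0, bytes, 0);  // async memset on the default stream
      gpuStreamSynchronize(0);
      gpuFree(ptr);
    }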
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h
deleted file mode 100644
index 1d142f2..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h
+++ /dev/null
@@ -1,44 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-// Copyright (C) 2018 Deven Desai <deven.desai.amd@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#if defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H)
-
-#ifndef EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES
-
-#undef gpuStream_t
-#undef gpuDeviceProp_t
-#undef gpuError_t
-#undef gpuSuccess
-#undef gpuErrorNotReady
-#undef gpuGetDeviceCount
-#undef gpuGetErrorString
-#undef gpuGetDeviceProperties
-#undef gpuStreamDefault
-#undef gpuGetDevice
-#undef gpuSetDevice
-#undef gpuMalloc
-#undef gpuFree
-#undef gpuMemsetAsync
-#undef gpuMemcpyAsync
-#undef gpuMemcpyDeviceToDevice
-#undef gpuMemcpyDeviceToHost
-#undef gpuMemcpyHostToDevice
-#undef gpuStreamQuery
-#undef gpuSharedMemConfig
-#undef gpuDeviceSetSharedMemConfig
-#undef gpuStreamSynchronize
-#undef gpuDeviceSynchronize
-#undef gpuMemcpy
-
-#endif // EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES
-
-#undef EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
-
-#endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorIO.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorIO.h
deleted file mode 100644
index a901c5d..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorIO.h
+++ /dev/null
@@ -1,79 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_IO_H
-#define EIGEN_CXX11_TENSOR_TENSOR_IO_H
-
-namespace Eigen {
-
-namespace internal {
-
-// Print the tensor as a 2d matrix
-template <typename Tensor, int Rank>
-struct TensorPrinter {
- static void run (std::ostream& os, const Tensor& tensor) {
- typedef typename internal::remove_const<typename Tensor::Scalar>::type Scalar;
- typedef typename Tensor::Index Index;
- const Index total_size = internal::array_prod(tensor.dimensions());
- if (total_size > 0) {
- const Index first_dim = Eigen::internal::array_get<0>(tensor.dimensions());
- static const int layout = Tensor::Layout;
- Map<const Array<Scalar, Dynamic, Dynamic, layout> > matrix(const_cast<Scalar*>(tensor.data()), first_dim, total_size/first_dim);
- os << matrix;
- }
- }
-};
-
-
-// Print the tensor as a vector
-template <typename Tensor>
-struct TensorPrinter<Tensor, 1> {
- static void run (std::ostream& os, const Tensor& tensor) {
- typedef typename internal::remove_const<typename Tensor::Scalar>::type Scalar;
- typedef typename Tensor::Index Index;
- const Index total_size = internal::array_prod(tensor.dimensions());
- if (total_size > 0) {
- Map<const Array<Scalar, Dynamic, 1> > array(const_cast<Scalar*>(tensor.data()), total_size);
- os << array;
- }
- }
-};
-
-
-// Print the tensor as a scalar
-template <typename Tensor>
-struct TensorPrinter<Tensor, 0> {
- static void run (std::ostream& os, const Tensor& tensor) {
- os << tensor.coeff(0);
- }
-};
-}
-
-template <typename T>
-std::ostream& operator << (std::ostream& os, const TensorBase<T, ReadOnlyAccessors>& expr) {
- typedef TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice> Evaluator;
- typedef typename Evaluator::Dimensions Dimensions;
-
- // Evaluate the expression if needed
- TensorForcedEvalOp<const T> eval = expr.eval();
- Evaluator tensor(eval, DefaultDevice());
- tensor.evalSubExprsIfNeeded(NULL);
-
- // Print the result
- static const int rank = internal::array_size<Dimensions>::value;
- internal::TensorPrinter<Evaluator, rank>::run(os, tensor);
-
- // Cleanup.
- tensor.cleanup();
- return os;
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_IO_H
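
operator<< above forces evaluation of the expression, then prints rank 0 as a
scalar, rank 1 as a column, and rank >= 2 flattened to a first_dim x
(total_size / first_dim) matrix. A minimal sketch:

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <iostream>

    int main() {
      Eigen::Tensor<int, 2> t(2, 3);
      t.setValues({{1, 2, 3},
                   {4, 5, 6}});
      std::cout << t << "\n";      // printed as a 2x3 matrix
      std::cout << t + t << "\n";  // expressions are evaluated before printing
    }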
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorImagePatch.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorImagePatch.h
deleted file mode 100644
index dd51850..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorImagePatch.h
+++ /dev/null
@@ -1,603 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H
-#define EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H
-
-namespace Eigen {
-
-/** \class TensorImagePatch
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Patch extraction specialized for image processing.
- * This assumes that the input has at least 3 dimensions ordered as follows:
- * 1st dimension: channels (of size d)
- * 2nd dimension: rows (of size r)
- * 3rd dimension: columns (of size c)
- * There can be additional dimensions, such as time (for video) or batch (for
- * bulk processing), after the first 3.
- * Calling the image patch code with patch_rows and patch_cols is equivalent
- * to calling the regular patch extraction code with parameters d, patch_rows,
- * patch_cols, and 1 for all the additional dimensions.
- */
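
From user code this operation is reached through extract_image_patches(); a
minimal sketch:

    #include <unsupported/Eigen/CXX11/Tensor>

    void patch_example() {
      // (channels, rows, cols) input, column-major layout.
      Eigen::Tensor<float, 3> img(3, 32, 32);
      img.setRandom();
      // 5x5 patches at stride 1; the result gains a "number of patches"
      // dimension: (channels, patch_rows, patch_cols, num_patches).
      Eigen::Tensor<float, 4> patches = img.extract_image_patches(5, 5);
    }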
-namespace internal {
-
-template<DenseIndex Rows, DenseIndex Cols, typename XprType>
-struct traits<TensorImagePatchOp<Rows, Cols, XprType> > : public traits<XprType>
-{
- typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
- typedef traits<XprType> XprTraits;
- typedef typename XprTraits::StorageKind StorageKind;
- typedef typename XprTraits::Index Index;
- typedef typename XprType::Nested Nested;
- typedef typename remove_reference<Nested>::type _Nested;
- static const int NumDimensions = XprTraits::NumDimensions + 1;
- static const int Layout = XprTraits::Layout;
- typedef typename XprTraits::PointerType PointerType;
-};
-
-template<DenseIndex Rows, DenseIndex Cols, typename XprType>
-struct eval<TensorImagePatchOp<Rows, Cols, XprType>, Eigen::Dense>
-{
- typedef const TensorImagePatchOp<Rows, Cols, XprType>& type;
-};
-
-template<DenseIndex Rows, DenseIndex Cols, typename XprType>
-struct nested<TensorImagePatchOp<Rows, Cols, XprType>, 1, typename eval<TensorImagePatchOp<Rows, Cols, XprType> >::type>
-{
- typedef TensorImagePatchOp<Rows, Cols, XprType> type;
-};
-
-template <typename Self, bool Vectorizable>
-struct ImagePatchCopyOp {
- typedef typename Self::Index Index;
- typedef typename Self::Scalar Scalar;
- typedef typename Self::Impl Impl;
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
- const Self& self, const Index num_coeff_to_copy, const Index dst_index,
- Scalar* dst_data, const Index src_index) {
- const Impl& impl = self.impl();
- for (Index i = 0; i < num_coeff_to_copy; ++i) {
- dst_data[dst_index + i] = impl.coeff(src_index + i);
- }
- }
-};
-
-template <typename Self>
-struct ImagePatchCopyOp<Self, true> {
- typedef typename Self::Index Index;
- typedef typename Self::Scalar Scalar;
- typedef typename Self::Impl Impl;
- typedef typename packet_traits<Scalar>::type Packet;
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
- const Self& self, const Index num_coeff_to_copy, const Index dst_index,
- Scalar* dst_data, const Index src_index) {
- const Impl& impl = self.impl();
- const Index packet_size = internal::unpacket_traits<Packet>::size;
- const Index vectorized_size =
- (num_coeff_to_copy / packet_size) * packet_size;
- for (Index i = 0; i < vectorized_size; i += packet_size) {
- Packet p = impl.template packet<Unaligned>(src_index + i);
- internal::pstoret<Scalar, Packet, Unaligned>(dst_data + dst_index + i, p);
- }
- for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
- dst_data[dst_index + i] = impl.coeff(src_index + i);
- }
- }
-};
-
-template <typename Self>
-struct ImagePatchPaddingOp {
- typedef typename Self::Index Index;
- typedef typename Self::Scalar Scalar;
- typedef typename packet_traits<Scalar>::type Packet;
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
- const Index num_coeff_to_pad, const Scalar padding_value,
- const Index dst_index, Scalar* dst_data) {
- const Index packet_size = internal::unpacket_traits<Packet>::size;
- const Packet padded_packet = internal::pset1<Packet>(padding_value);
- const Index vectorized_size =
- (num_coeff_to_pad / packet_size) * packet_size;
- for (Index i = 0; i < vectorized_size; i += packet_size) {
- internal::pstoret<Scalar, Packet, Unaligned>(dst_data + dst_index + i,
- padded_packet);
- }
- for (Index i = vectorized_size; i < num_coeff_to_pad; ++i) {
- dst_data[dst_index + i] = padding_value;
- }
- }
-};
-
-} // end namespace internal
-
-template<DenseIndex Rows, DenseIndex Cols, typename XprType>
-class TensorImagePatchOp : public TensorBase<TensorImagePatchOp<Rows, Cols, XprType>, ReadOnlyAccessors>
-{
- public:
- typedef typename Eigen::internal::traits<TensorImagePatchOp>::Scalar Scalar;
- typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename Eigen::internal::nested<TensorImagePatchOp>::type Nested;
- typedef typename Eigen::internal::traits<TensorImagePatchOp>::StorageKind StorageKind;
- typedef typename Eigen::internal::traits<TensorImagePatchOp>::Index Index;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols,
- DenseIndex row_strides, DenseIndex col_strides,
- DenseIndex in_row_strides, DenseIndex in_col_strides,
- DenseIndex row_inflate_strides, DenseIndex col_inflate_strides,
- PaddingType padding_type, Scalar padding_value)
- : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
- m_row_strides(row_strides), m_col_strides(col_strides),
- m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
- m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
- m_padding_explicit(false), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0),
- m_padding_type(padding_type), m_padding_value(padding_value) {}
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols,
- DenseIndex row_strides, DenseIndex col_strides,
- DenseIndex in_row_strides, DenseIndex in_col_strides,
- DenseIndex row_inflate_strides, DenseIndex col_inflate_strides,
- DenseIndex padding_top, DenseIndex padding_bottom,
- DenseIndex padding_left, DenseIndex padding_right,
- Scalar padding_value)
- : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
- m_row_strides(row_strides), m_col_strides(col_strides),
- m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
- m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
- m_padding_explicit(true), m_padding_top(padding_top), m_padding_bottom(padding_bottom),
- m_padding_left(padding_left), m_padding_right(padding_right),
- m_padding_type(PADDING_VALID), m_padding_value(padding_value) {}
-
-
- EIGEN_DEVICE_FUNC
- DenseIndex patch_rows() const { return m_patch_rows; }
- EIGEN_DEVICE_FUNC
- DenseIndex patch_cols() const { return m_patch_cols; }
- EIGEN_DEVICE_FUNC
- DenseIndex row_strides() const { return m_row_strides; }
- EIGEN_DEVICE_FUNC
- DenseIndex col_strides() const { return m_col_strides; }
- EIGEN_DEVICE_FUNC
- DenseIndex in_row_strides() const { return m_in_row_strides; }
- EIGEN_DEVICE_FUNC
- DenseIndex in_col_strides() const { return m_in_col_strides; }
- EIGEN_DEVICE_FUNC
- DenseIndex row_inflate_strides() const { return m_row_inflate_strides; }
- EIGEN_DEVICE_FUNC
- DenseIndex col_inflate_strides() const { return m_col_inflate_strides; }
- EIGEN_DEVICE_FUNC
- bool padding_explicit() const { return m_padding_explicit; }
- EIGEN_DEVICE_FUNC
- DenseIndex padding_top() const { return m_padding_top; }
- EIGEN_DEVICE_FUNC
- DenseIndex padding_bottom() const { return m_padding_bottom; }
- EIGEN_DEVICE_FUNC
- DenseIndex padding_left() const { return m_padding_left; }
- EIGEN_DEVICE_FUNC
- DenseIndex padding_right() const { return m_padding_right; }
- EIGEN_DEVICE_FUNC
- PaddingType padding_type() const { return m_padding_type; }
- EIGEN_DEVICE_FUNC
- Scalar padding_value() const { return m_padding_value; }
-
- EIGEN_DEVICE_FUNC
- const typename internal::remove_all<typename XprType::Nested>::type&
- expression() const { return m_xpr; }
-
- protected:
- typename XprType::Nested m_xpr;
- const DenseIndex m_patch_rows;
- const DenseIndex m_patch_cols;
- const DenseIndex m_row_strides;
- const DenseIndex m_col_strides;
- const DenseIndex m_in_row_strides;
- const DenseIndex m_in_col_strides;
- const DenseIndex m_row_inflate_strides;
- const DenseIndex m_col_inflate_strides;
- const bool m_padding_explicit;
- const DenseIndex m_padding_top;
- const DenseIndex m_padding_bottom;
- const DenseIndex m_padding_left;
- const DenseIndex m_padding_right;
- const PaddingType m_padding_type;
- const Scalar m_padding_value;
-};
-
-// Eval as rvalue
-template<DenseIndex Rows, DenseIndex Cols, typename ArgType, typename Device>
-struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
-{
- typedef TensorImagePatchOp<Rows, Cols, ArgType> XprType;
- typedef typename XprType::Index Index;
- static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
- static const int NumDims = NumInputDims + 1;
- typedef DSizes<Index, NumDims> Dimensions;
- typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
- typedef TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>,
- Device> Self;
- typedef TensorEvaluator<ArgType, Device> Impl;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
- typedef StorageMemory<CoeffReturnType, Device> Storage;
- typedef typename Storage::Type EvaluatorPointerType;
-
- enum {
- IsAligned = false,
- PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = false,
- PreferBlockAccess = true,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- CoordAccess = false,
- RawAccess = false
- };
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlock;
- //===--------------------------------------------------------------------===//
-
- EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device)
- : m_device(device), m_impl(op.expression(), device)
- {
- EIGEN_STATIC_ASSERT((NumDims >= 4), YOU_MADE_A_PROGRAMMING_MISTAKE);
-
- m_paddingValue = op.padding_value();
-
- const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
-
- // Caches a few variables.
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- m_inputDepth = input_dims[0];
- m_inputRows = input_dims[1];
- m_inputCols = input_dims[2];
- } else {
- m_inputDepth = input_dims[NumInputDims-1];
- m_inputRows = input_dims[NumInputDims-2];
- m_inputCols = input_dims[NumInputDims-3];
- }
-
- m_row_strides = op.row_strides();
- m_col_strides = op.col_strides();
-
- // Input strides and effective input/patch size
- m_in_row_strides = op.in_row_strides();
- m_in_col_strides = op.in_col_strides();
- m_row_inflate_strides = op.row_inflate_strides();
- m_col_inflate_strides = op.col_inflate_strides();
- // The "effective" input rows and input cols are the input rows and cols
- // after inflating them with zeros.
-    // For example, a 2x3 matrix with row_inflate_strides and
-    // col_inflate_strides of 2 is inflated from:
-    //  A B C
-    //  D E F
-    //
-    // into the 3x5 matrix:
- //
- // A . B . C
- // . . . . .
- // D . E . F
-
- m_input_rows_eff = (m_inputRows - 1) * m_row_inflate_strides + 1;
- m_input_cols_eff = (m_inputCols - 1) * m_col_inflate_strides + 1;
- m_patch_rows_eff = op.patch_rows() + (op.patch_rows() - 1) * (m_in_row_strides - 1);
- m_patch_cols_eff = op.patch_cols() + (op.patch_cols() - 1) * (m_in_col_strides - 1);
-
- if (op.padding_explicit()) {
- m_outputRows = numext::ceil((m_input_rows_eff + op.padding_top() + op.padding_bottom() - m_patch_rows_eff + 1.f) / static_cast<float>(m_row_strides));
- m_outputCols = numext::ceil((m_input_cols_eff + op.padding_left() + op.padding_right() - m_patch_cols_eff + 1.f) / static_cast<float>(m_col_strides));
- m_rowPaddingTop = op.padding_top();
- m_colPaddingLeft = op.padding_left();
- } else {
- // Computing padding from the type
- switch (op.padding_type()) {
- case PADDING_VALID:
- m_outputRows = numext::ceil((m_input_rows_eff - m_patch_rows_eff + 1.f) / static_cast<float>(m_row_strides));
- m_outputCols = numext::ceil((m_input_cols_eff - m_patch_cols_eff + 1.f) / static_cast<float>(m_col_strides));
- // Calculate the padding
- m_rowPaddingTop = numext::maxi<Index>(0, ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2);
- m_colPaddingLeft = numext::maxi<Index>(0, ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2);
- break;
- case PADDING_SAME:
- m_outputRows = numext::ceil(m_input_rows_eff / static_cast<float>(m_row_strides));
- m_outputCols = numext::ceil(m_input_cols_eff / static_cast<float>(m_col_strides));
- // Calculate the padding
- m_rowPaddingTop = ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2;
- m_colPaddingLeft = ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2;
- // The padding size calculation for PADDING_SAME has been updated to
- // be consistent with how TensorFlow extracts its paddings.
- m_rowPaddingTop = numext::maxi<Index>(0, m_rowPaddingTop);
- m_colPaddingLeft = numext::maxi<Index>(0, m_colPaddingLeft);
- break;
- default:
- eigen_assert(false && "unexpected padding");
- m_outputCols = 0;  // silence the uninitialised warning
- m_outputRows = 0;  // silence the uninitialised warning
- }
- }
- eigen_assert(m_outputRows > 0);
- eigen_assert(m_outputCols > 0);
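
To see what the two branches compute, here is a worked 1-D example with made-up sizes (a sketch, not code from this file): a length-5 input, a length-3 patch, stride 2.

    #include <cassert>
    #include <cmath>

    int main() {
      const float in = 5, patch = 3, stride = 2;
      // PADDING_VALID: only patches that fit entirely inside the input.
      assert(std::ceil((in - patch + 1) / stride) == 2);       // m_outputRows
      // PADDING_SAME: one patch per strided input position.
      const int outSame = static_cast<int>(std::ceil(in / stride));  // 3
      // Padding required so the last patch fits: (out-1)*stride + patch - in.
      assert(((outSame - 1) * stride + patch - in) / 2 == 1);  // m_rowPaddingTop
    }
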
-
- // Dimensions for result of extraction.
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- // ColMajor
- // 0: depth
- // 1: patch_rows
- // 2: patch_cols
- // 3: number of patches
- // 4 and beyond: anything else (such as batch).
- m_dimensions[0] = input_dims[0];
- m_dimensions[1] = op.patch_rows();
- m_dimensions[2] = op.patch_cols();
- m_dimensions[3] = m_outputRows * m_outputCols;
- for (int i = 4; i < NumDims; ++i) {
- m_dimensions[i] = input_dims[i-1];
- }
- } else {
- // RowMajor
- // NumDims-1: depth
- // NumDims-2: patch_rows
- // NumDims-3: patch_cols
- // NumDims-4: number of patches
- // NumDims-5 and beyond: anything else (such as batch).
- m_dimensions[NumDims-1] = input_dims[NumInputDims-1];
- m_dimensions[NumDims-2] = op.patch_rows();
- m_dimensions[NumDims-3] = op.patch_cols();
- m_dimensions[NumDims-4] = m_outputRows * m_outputCols;
- for (int i = NumDims-5; i >= 0; --i) {
- m_dimensions[i] = input_dims[i];
- }
- }
-
- // Strides for moving the patch in various dimensions.
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- m_colStride = m_dimensions[1];
- m_patchStride = m_colStride * m_dimensions[2] * m_dimensions[0];
- m_otherStride = m_patchStride * m_dimensions[3];
- } else {
- m_colStride = m_dimensions[NumDims-2];
- m_patchStride = m_colStride * m_dimensions[NumDims-3] * m_dimensions[NumDims-1];
- m_otherStride = m_patchStride * m_dimensions[NumDims-4];
- }
-
- // Strides for navigating through the input tensor.
- m_rowInputStride = m_inputDepth;
- m_colInputStride = m_inputDepth * m_inputRows;
- m_patchInputStride = m_inputDepth * m_inputRows * m_inputCols;
-
- // Fast representations of different variables.
- m_fastOtherStride = internal::TensorIntDivisor<Index>(m_otherStride);
- m_fastPatchStride = internal::TensorIntDivisor<Index>(m_patchStride);
- m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride);
- m_fastInflateRowStride = internal::TensorIntDivisor<Index>(m_row_inflate_strides);
- m_fastInflateColStride = internal::TensorIntDivisor<Index>(m_col_inflate_strides);
- m_fastInputColsEff = internal::TensorIntDivisor<Index>(m_input_cols_eff);
-
- // Number of patches in the width dimension.
- m_fastOutputRows = internal::TensorIntDivisor<Index>(m_outputRows);
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[0]);
- } else {
- m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[NumDims-1]);
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
-
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
- m_impl.evalSubExprsIfNeeded(NULL);
- return true;
- }
-
-#ifdef EIGEN_USE_THREADS
- template <typename EvalSubExprsCallback>
- EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
- EvaluatorPointerType, EvalSubExprsCallback done) {
- m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
- }
-#endif // EIGEN_USE_THREADS
-
- EIGEN_STRONG_INLINE void cleanup() {
- m_impl.cleanup();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
- {
- // Patch index corresponding to the passed in index.
- const Index patchIndex = index / m_fastPatchStride;
- // Find the offset of the element wrt the location of the first element.
- const Index patchOffset = (index - patchIndex * m_patchStride) / m_fastOutputDepth;
-
- // Other ways to index this element.
- const Index otherIndex = (NumDims == 4) ? 0 : index / m_fastOtherStride;
- const Index patch2DIndex = (NumDims == 4) ? patchIndex : (index - otherIndex * m_otherStride) / m_fastPatchStride;
-
- // Calculate col index in the input original tensor.
- const Index colIndex = patch2DIndex / m_fastOutputRows;
- const Index colOffset = patchOffset / m_fastColStride;
- const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft;
- const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInflateColStride) : 0);
- if (inputCol < 0 || inputCol >= m_input_cols_eff ||
- ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) {
- return Scalar(m_paddingValue);
- }
-
- // Calculate row index in the original input tensor.
- const Index rowIndex = patch2DIndex - colIndex * m_outputRows;
- const Index rowOffset = patchOffset - colOffset * m_colStride;
- const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop;
- const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInflateRowStride) : 0);
- if (inputRow < 0 || inputRow >= m_input_rows_eff ||
- ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) {
- return Scalar(m_paddingValue);
- }
-
- const int depth_index = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - 1;
- const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index];
-
- const Index inputIndex = depth + origInputRow * m_rowInputStride + origInputCol * m_colInputStride + otherIndex * m_patchInputStride;
- return m_impl.coeff(inputIndex);
- }
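
The decomposition above is repeated div/mod against the strides cached in the constructor (done through TensorIntDivisor there); a plain-integer sketch of the ColMajor case with hypothetical sizes:

    #include <cassert>

    int main() {
      // Hypothetical layout: depth 4, 3x3 patches.
      const long depthDim    = 4, patchRows = 3, patchCols = 3;
      const long colStride   = patchRows;                        // m_colStride
      const long patchStride = depthDim * patchRows * patchCols; // m_patchStride

      const long index       = 150;                 // some output coefficient
      const long patchIndex  = index / patchStride; // which patch it falls in
      const long patchOffset = (index - patchIndex * patchStride) / depthDim;
      const long colOffset   = patchOffset / colStride;   // column in the patch
      const long rowOffset   = patchOffset - colOffset * colStride;
      assert(patchIndex == 4 && colOffset == 0 && rowOffset == 1);
    }
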
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
- {
- EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
-
- if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1) {
- return packetWithPossibleZero(index);
- }
-
- const Index indices[2] = {index, index + PacketSize - 1};
- const Index patchIndex = indices[0] / m_fastPatchStride;
- if (patchIndex != indices[1] / m_fastPatchStride) {
- return packetWithPossibleZero(index);
- }
- const Index otherIndex = (NumDims == 4) ? 0 : indices[0] / m_fastOtherStride;
- eigen_assert(otherIndex == indices[1] / m_fastOtherStride);
-
- // Find the offset of the element wrt the location of the first element.
- const Index patchOffsets[2] = {(indices[0] - patchIndex * m_patchStride) / m_fastOutputDepth,
- (indices[1] - patchIndex * m_patchStride) / m_fastOutputDepth};
-
- const Index patch2DIndex = (NumDims == 4) ? patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride;
- eigen_assert(patch2DIndex == (indices[1] - otherIndex * m_otherStride) / m_fastPatchStride);
-
- const Index colIndex = patch2DIndex / m_fastOutputRows;
- const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride, patchOffsets[1] / m_fastColStride};
-
- // Calculate col indices in the original input tensor.
- const Index inputCols[2] = {colIndex * m_col_strides + colOffsets[0] - m_colPaddingLeft,
- colIndex * m_col_strides + colOffsets[1] - m_colPaddingLeft};
- if (inputCols[1] < 0 || inputCols[0] >= m_inputCols) {
- return internal::pset1<PacketReturnType>(Scalar(m_paddingValue));
- }
-
- if (inputCols[0] == inputCols[1]) {
- const Index rowIndex = patch2DIndex - colIndex * m_outputRows;
- const Index rowOffsets[2] = {patchOffsets[0] - colOffsets[0]*m_colStride, patchOffsets[1] - colOffsets[1]*m_colStride};
- eigen_assert(rowOffsets[0] <= rowOffsets[1]);
- // Calculate row indices in the original input tensor.
- const Index inputRows[2] = {rowIndex * m_row_strides + rowOffsets[0] - m_rowPaddingTop,
- rowIndex * m_row_strides + rowOffsets[1] - m_rowPaddingTop};
-
- if (inputRows[1] < 0 || inputRows[0] >= m_inputRows) {
- return internal::pset1<PacketReturnType>(Scalar(m_paddingValue));
- }
-
- if (inputRows[0] >= 0 && inputRows[1] < m_inputRows) {
- // no padding
- const int depth_index = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - 1;
- const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index];
- const Index inputIndex = depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex * m_patchInputStride;
- return m_impl.template packet<Unaligned>(inputIndex);
- }
- }
-
- return packetWithPossibleZero(index);
- }
-
- EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
-
-#ifdef EIGEN_USE_SYCL
- // binding placeholder accessors to a command group handler for SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
- m_impl.bind(cgh);
- }
-#endif
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowPaddingTop() const { return m_rowPaddingTop; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colPaddingLeft() const { return m_colPaddingLeft; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputRows() const { return m_outputRows; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputCols() const { return m_outputCols; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userRowStride() const { return m_row_strides; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userColStride() const { return m_col_strides; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInRowStride() const { return m_in_row_strides; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInColStride() const { return m_in_col_strides; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowInflateStride() const { return m_row_inflate_strides; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colInflateStride() const { return m_col_inflate_strides; }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
- costPerCoeff(bool vectorized) const {
- // We conservatively estimate the cost for the code path where the computed
- // index is inside the original image and
- // TensorEvaluator<ArgType, Device>::CoordAccess is false.
- const double compute_cost = 3 * TensorOpCost::DivCost<Index>() +
- 6 * TensorOpCost::MulCost<Index>() +
- 8 * TensorOpCost::AddCost<Index>();
- return m_impl.costPerCoeff(vectorized) +
- TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
- }
-
- protected:
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
- {
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < PacketSize; ++i) {
- values[i] = coeff(index+i);
- }
- PacketReturnType rslt = internal::pload<PacketReturnType>(values);
- return rslt;
- }
-
- Dimensions m_dimensions;
-
- Index m_otherStride;
- Index m_patchStride;
- Index m_colStride;
- Index m_row_strides;
- Index m_col_strides;
-
- Index m_in_row_strides;
- Index m_in_col_strides;
- Index m_row_inflate_strides;
- Index m_col_inflate_strides;
-
- Index m_input_rows_eff;
- Index m_input_cols_eff;
- Index m_patch_rows_eff;
- Index m_patch_cols_eff;
-
- internal::TensorIntDivisor<Index> m_fastOtherStride;
- internal::TensorIntDivisor<Index> m_fastPatchStride;
- internal::TensorIntDivisor<Index> m_fastColStride;
- internal::TensorIntDivisor<Index> m_fastInflateRowStride;
- internal::TensorIntDivisor<Index> m_fastInflateColStride;
- internal::TensorIntDivisor<Index> m_fastInputColsEff;
-
- Index m_rowInputStride;
- Index m_colInputStride;
- Index m_patchInputStride;
-
- Index m_inputDepth;
- Index m_inputRows;
- Index m_inputCols;
-
- Index m_outputRows;
- Index m_outputCols;
-
- Index m_rowPaddingTop;
- Index m_colPaddingLeft;
-
- internal::TensorIntDivisor<Index> m_fastOutputRows;
- internal::TensorIntDivisor<Index> m_fastOutputDepth;
-
- Scalar m_paddingValue;
-
- const Device EIGEN_DEVICE_REF m_device;
- TensorEvaluator<ArgType, Device> m_impl;
-};
-
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorIndexList.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorIndexList.h
deleted file mode 100644
index 2d8c7b9..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorIndexList.h
+++ /dev/null
@@ -1,738 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H
-#define EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H
-
-
-#if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES
-
-#define EIGEN_HAS_INDEX_LIST
-
-namespace Eigen {
-
-/** \internal
- *
- * \class TensorIndexList
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Set of classes used to encode a set of Tensor dimensions/indices.
- *
- * The indices in the list can be known at compile time or at runtime. A mix
- * of static and dynamic indices can also be provided if needed. The tensor
- * code will attempt to take advantage of the indices that are known at
- * compile time to optimize the code it generates.
- *
- * This functionality requires a C++11-compliant compiler. If your compiler
- * is older you need to use arrays of indices instead.
- *
- * Several examples are provided in the cxx11_tensor_index_list.cpp file.
- *
- * \sa Tensor
- */
-
-template <Index n>
-struct type2index {
- static const Index value = n;
- EIGEN_DEVICE_FUNC constexpr operator Index() const { return n; }
- EIGEN_DEVICE_FUNC void set(Index val) {
- eigen_assert(val == n);
- }
-};
-
-// This can be used with IndexPairList to get compile-time constant pairs,
-// such as IndexPairList<type2indexpair<1,2>, type2indexpair<3,4>>().
-template <Index f, Index s>
-struct type2indexpair {
- static const Index first = f;
- static const Index second = s;
-
- constexpr EIGEN_DEVICE_FUNC operator IndexPair<Index>() const {
- return IndexPair<Index>(f, s);
- }
-
- EIGEN_DEVICE_FUNC void set(const IndexPair<Index>& val) {
- eigen_assert(val.first == f);
- eigen_assert(val.second == s);
- }
-};
-
-
-template<Index n> struct NumTraits<type2index<n> >
-{
- typedef Index Real;
- enum {
- IsComplex = 0,
- RequireInitialization = false,
- ReadCost = 1,
- AddCost = 1,
- MulCost = 1
- };
-
- EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real epsilon() { return 0; }
- EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real dummy_precision() { return 0; }
- EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real highest() { return n; }
- EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real lowest() { return n; }
-};
-
-namespace internal {
-template <typename T>
-EIGEN_DEVICE_FUNC void update_value(T& val, Index new_val) {
- val = internal::convert_index<T>(new_val);
-}
-template <Index n>
-EIGEN_DEVICE_FUNC void update_value(type2index<n>& val, Index new_val) {
- val.set(new_val);
-}
-
-template <typename T>
-EIGEN_DEVICE_FUNC void update_value(T& val, IndexPair<Index> new_val) {
- val = new_val;
-}
-template <Index f, Index s>
-EIGEN_DEVICE_FUNC void update_value(type2indexpair<f, s>& val, IndexPair<Index> new_val) {
- val.set(new_val);
-}
-
-
-template <typename T>
-struct is_compile_time_constant {
- static constexpr bool value = false;
-};
-
-template <Index idx>
-struct is_compile_time_constant<type2index<idx> > {
- static constexpr bool value = true;
-};
-template <Index idx>
-struct is_compile_time_constant<const type2index<idx> > {
- static constexpr bool value = true;
-};
-template <Index idx>
-struct is_compile_time_constant<type2index<idx>& > {
- static constexpr bool value = true;
-};
-template <Index idx>
-struct is_compile_time_constant<const type2index<idx>& > {
- static constexpr bool value = true;
-};
-
-template <Index f, Index s>
-struct is_compile_time_constant<type2indexpair<f, s> > {
- static constexpr bool value = true;
-};
-template <Index f, Index s>
-struct is_compile_time_constant<const type2indexpair<f, s> > {
- static constexpr bool value = true;
-};
-template <Index f, Index s>
-struct is_compile_time_constant<type2indexpair<f, s>& > {
- static constexpr bool value = true;
-};
-template <Index f, Index s>
-struct is_compile_time_constant<const type2indexpair<f, s>& > {
- static constexpr bool value = true;
-};
-
-
-template<typename... T>
-struct IndexTuple;
-
-template<typename T, typename... O>
-struct IndexTuple<T, O...> {
- EIGEN_DEVICE_FUNC constexpr IndexTuple() : head(), others() { }
- EIGEN_DEVICE_FUNC constexpr IndexTuple(const T& v, const O... o) : head(v), others(o...) { }
-
- constexpr static int count = 1 + sizeof...(O);
- T head;
- IndexTuple<O...> others;
- typedef T Head;
- typedef IndexTuple<O...> Other;
-};
-
-template<typename T>
-struct IndexTuple<T> {
- EIGEN_DEVICE_FUNC constexpr IndexTuple() : head() { }
- EIGEN_DEVICE_FUNC constexpr IndexTuple(const T& v) : head(v) { }
-
- constexpr static int count = 1;
- T head;
- typedef T Head;
-};
-
-
-template<int N, typename... T>
-struct IndexTupleExtractor;
-
-template<int N, typename T, typename... O>
-struct IndexTupleExtractor<N, T, O...> {
-
- typedef typename IndexTupleExtractor<N-1, O...>::ValType ValType;
-
- EIGEN_DEVICE_FUNC static constexpr ValType& get_val(IndexTuple<T, O...>& val) {
- return IndexTupleExtractor<N-1, O...>::get_val(val.others);
- }
-
- EIGEN_DEVICE_FUNC static constexpr const ValType& get_val(const IndexTuple<T, O...>& val) {
- return IndexTupleExtractor<N-1, O...>::get_val(val.others);
- }
- template <typename V>
- EIGEN_DEVICE_FUNC static void set_val(IndexTuple<T, O...>& val, V& new_val) {
- IndexTupleExtractor<N-1, O...>::set_val(val.others, new_val);
- }
-
-};
-
-template<typename T, typename... O>
-struct IndexTupleExtractor<0, T, O...> {
-
- typedef T ValType;
-
- EIGEN_DEVICE_FUNC static constexpr ValType& get_val(IndexTuple<T, O...>& val) {
- return val.head;
- }
- EIGEN_DEVICE_FUNC static constexpr const ValType& get_val(const IndexTuple<T, O...>& val) {
- return val.head;
- }
- template <typename V>
- EIGEN_DEVICE_FUNC static void set_val(IndexTuple<T, O...>& val, V& new_val) {
- val.head = new_val;
- }
-};
-
-
-
-template <int N, typename T, typename... O>
-EIGEN_DEVICE_FUNC constexpr typename IndexTupleExtractor<N, T, O...>::ValType& array_get(IndexTuple<T, O...>& tuple) {
- return IndexTupleExtractor<N, T, O...>::get_val(tuple);
-}
-template <int N, typename T, typename... O>
-EIGEN_DEVICE_FUNC constexpr const typename IndexTupleExtractor<N, T, O...>::ValType& array_get(const IndexTuple<T, O...>& tuple) {
- return IndexTupleExtractor<N, T, O...>::get_val(tuple);
-}
-template <typename T, typename... O>
-struct array_size<IndexTuple<T, O...> > {
- static const size_t value = IndexTuple<T, O...>::count;
-};
-template <typename T, typename... O>
-struct array_size<const IndexTuple<T, O...> > {
- static const size_t value = IndexTuple<T, O...>::count;
-};
-
-
-
-
-template <Index Idx, typename ValueT>
-struct tuple_coeff {
- template <typename... T>
- EIGEN_DEVICE_FUNC static constexpr ValueT get(const Index i, const IndexTuple<T...>& t) {
- // return array_get<Idx>(t) * (i == Idx) + tuple_coeff<Idx-1>::get(i, t) * (i != Idx);
- return (i == Idx ? array_get<Idx>(t) : tuple_coeff<Idx-1, ValueT>::get(i, t));
- }
- template <typename... T>
- EIGEN_DEVICE_FUNC static void set(const Index i, IndexTuple<T...>& t, const ValueT& value) {
- if (i == Idx) {
- update_value(array_get<Idx>(t), value);
- } else {
- tuple_coeff<Idx-1, ValueT>::set(i, t, value);
- }
- }
-
- template <typename... T>
- EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const Index i, const IndexTuple<T...>& t) {
- return ((i == Idx) & is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value) ||
- tuple_coeff<Idx-1, ValueT>::value_known_statically(i, t);
- }
-
- template <typename... T>
- EIGEN_DEVICE_FUNC static constexpr bool values_up_to_known_statically(const IndexTuple<T...>& t) {
- return is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value &&
- tuple_coeff<Idx-1, ValueT>::values_up_to_known_statically(t);
- }
-
- template <typename... T>
- EIGEN_DEVICE_FUNC static constexpr bool values_up_to_statically_known_to_increase(const IndexTuple<T...>& t) {
- return is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value &&
- is_compile_time_constant<typename IndexTupleExtractor<Idx-1, T...>::ValType>::value &&
- array_get<Idx>(t) > array_get<Idx-1>(t) &&
- tuple_coeff<Idx-1, ValueT>::values_up_to_statically_known_to_increase(t);
- }
-};
-
-template <typename ValueT>
-struct tuple_coeff<0, ValueT> {
- template <typename... T>
- EIGEN_DEVICE_FUNC static constexpr ValueT get(const Index /*i*/, const IndexTuple<T...>& t) {
- // eigen_assert (i == 0); // gcc fails to compile assertions in constexpr
- return array_get<0>(t)/* * (i == 0)*/;
- }
- template <typename... T>
- EIGEN_DEVICE_FUNC static void set(const Index i, IndexTuple<T...>& t, const ValueT value) {
- eigen_assert (i == 0);
- update_value(array_get<0>(t), value);
- }
- template <typename... T>
- EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const Index i, const IndexTuple<T...>&) {
- return is_compile_time_constant<typename IndexTupleExtractor<0, T...>::ValType>::value && (i == 0);
- }
-
- template <typename... T>
- EIGEN_DEVICE_FUNC static constexpr bool values_up_to_known_statically(const IndexTuple<T...>&) {
- return is_compile_time_constant<typename IndexTupleExtractor<0, T...>::ValType>::value;
- }
-
- template <typename... T>
- EIGEN_DEVICE_FUNC static constexpr bool values_up_to_statically_known_to_increase(const IndexTuple<T...>&) {
- return true;
- }
-};
-} // namespace internal
-
-
-
-template<typename FirstType, typename... OtherTypes>
-struct IndexList : internal::IndexTuple<FirstType, OtherTypes...> {
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Index operator[] (const Index i) const {
- return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::get(i, *this);
- }
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Index get(const Index i) const {
- return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::get(i, *this);
- }
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const Index i, const Index value) {
- return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::set(i, *this, value);
- }
-
- EIGEN_DEVICE_FUNC constexpr IndexList(const internal::IndexTuple<FirstType, OtherTypes...>& other) : internal::IndexTuple<FirstType, OtherTypes...>(other) { }
- EIGEN_DEVICE_FUNC constexpr IndexList(FirstType& first, OtherTypes... other) : internal::IndexTuple<FirstType, OtherTypes...>(first, other...) { }
- EIGEN_DEVICE_FUNC constexpr IndexList() : internal::IndexTuple<FirstType, OtherTypes...>() { }
-
- EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const Index i) const {
- return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::value_known_statically(i, *this);
- }
- EIGEN_DEVICE_FUNC constexpr bool all_values_known_statically() const {
- return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::values_up_to_known_statically(*this);
- }
-
- EIGEN_DEVICE_FUNC constexpr bool values_statically_known_to_increase() const {
- return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::values_up_to_statically_known_to_increase(*this);
- }
-};
-
-template <typename FirstType, typename... OtherTypes>
-std::ostream& operator<<(std::ostream& os,
- const IndexList<FirstType, OtherTypes...>& dims) {
- os << "[";
- for (size_t i = 0; i < 1 + sizeof...(OtherTypes); ++i) {
- if (i > 0) os << ", ";
- os << dims[i];
- }
- os << "]";
- return os;
-}
-
-template<typename FirstType, typename... OtherTypes>
-constexpr IndexList<FirstType, OtherTypes...> make_index_list(FirstType val1, OtherTypes... other_vals) {
- return IndexList<FirstType, OtherTypes...>(val1, other_vals...);
-}
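
Typical usage mixes static and dynamic entries, for instance to drive a reduction; a sketch, assuming the unsupported Tensor header is on the include path:

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 3> t(2, 3, 4);
      t.setRandom();

      // Entry 0 is fixed at compile time; entry 1 is filled in at runtime.
      Eigen::IndexList<Eigen::type2index<0>, Eigen::Index> dims;
      dims.set(1, 2);  // also reduce over dimension 2

      Eigen::Tensor<float, 1> summed = t.sum(dims);  // shape: (3)
      return summed.dimension(0) == 3 ? 0 : 1;
    }

The reduction code can then query the static entry at compile time and skip the corresponding runtime work.
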
-
-
-template<typename FirstType, typename... OtherTypes>
-struct IndexPairList : internal::IndexTuple<FirstType, OtherTypes...> {
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr IndexPair<Index> operator[] (const Index i) const {
- return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, IndexPair<Index>>::get(i, *this);
- }
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const Index i, const IndexPair<Index> value) {
- return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...>>::value-1, IndexPair<Index> >::set(i, *this, value);
- }
-
- EIGEN_DEVICE_FUNC constexpr IndexPairList(const internal::IndexTuple<FirstType, OtherTypes...>& other) : internal::IndexTuple<FirstType, OtherTypes...>(other) { }
- EIGEN_DEVICE_FUNC constexpr IndexPairList() : internal::IndexTuple<FirstType, OtherTypes...>() { }
-
- EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const Index i) const {
- return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::value_known_statically(i, *this);
- }
-};
-
-namespace internal {
-
-template<typename FirstType, typename... OtherTypes>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index array_prod(const IndexList<FirstType, OtherTypes...>& sizes) {
- Index result = 1;
- EIGEN_UNROLL_LOOP
- for (size_t i = 0; i < array_size<IndexList<FirstType, OtherTypes...> >::value; ++i) {
- result *= sizes[i];
- }
- return result;
-}
-
-template<typename FirstType, typename... OtherTypes> struct array_size<IndexList<FirstType, OtherTypes...> > {
- static const size_t value = array_size<IndexTuple<FirstType, OtherTypes...> >::value;
-};
-template<typename FirstType, typename... OtherTypes> struct array_size<const IndexList<FirstType, OtherTypes...> > {
- static const size_t value = array_size<IndexTuple<FirstType, OtherTypes...> >::value;
-};
-
-template<typename FirstType, typename... OtherTypes> struct array_size<IndexPairList<FirstType, OtherTypes...> > {
- static const size_t value = std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value;
-};
-template<typename FirstType, typename... OtherTypes> struct array_size<const IndexPairList<FirstType, OtherTypes...> > {
- static const size_t value = std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value;
-};
-
-template<Index N, typename FirstType, typename... OtherTypes> EIGEN_DEVICE_FUNC constexpr Index array_get(IndexList<FirstType, OtherTypes...>& a) {
- return IndexTupleExtractor<N, FirstType, OtherTypes...>::get_val(a);
-}
-template<Index N, typename FirstType, typename... OtherTypes> EIGEN_DEVICE_FUNC constexpr Index array_get(const IndexList<FirstType, OtherTypes...>& a) {
- return IndexTupleExtractor<N, FirstType, OtherTypes...>::get_val(a);
-}
-
-template <typename T>
-struct index_known_statically_impl {
- EIGEN_DEVICE_FUNC static constexpr bool run(const Index) {
- return false;
- }
-};
-
-template <typename FirstType, typename... OtherTypes>
-struct index_known_statically_impl<IndexList<FirstType, OtherTypes...> > {
- EIGEN_DEVICE_FUNC static constexpr bool run(const Index i) {
- return IndexList<FirstType, OtherTypes...>().value_known_statically(i);
- }
-};
-
-template <typename FirstType, typename... OtherTypes>
-struct index_known_statically_impl<const IndexList<FirstType, OtherTypes...> > {
- EIGEN_DEVICE_FUNC static constexpr bool run(const Index i) {
- return IndexList<FirstType, OtherTypes...>().value_known_statically(i);
- }
-};
-
-
-template <typename T>
-struct all_indices_known_statically_impl {
- static constexpr bool run() {
- return false;
- }
-};
-
-template <typename FirstType, typename... OtherTypes>
-struct all_indices_known_statically_impl<IndexList<FirstType, OtherTypes...> > {
- EIGEN_DEVICE_FUNC static constexpr bool run() {
- return IndexList<FirstType, OtherTypes...>().all_values_known_statically();
- }
-};
-
-template <typename FirstType, typename... OtherTypes>
-struct all_indices_known_statically_impl<const IndexList<FirstType, OtherTypes...> > {
- EIGEN_DEVICE_FUNC static constexpr bool run() {
- return IndexList<FirstType, OtherTypes...>().all_values_known_statically();
- }
-};
-
-
-template <typename T>
-struct indices_statically_known_to_increase_impl {
- EIGEN_DEVICE_FUNC static constexpr bool run() {
- return false;
- }
-};
-
-template <typename FirstType, typename... OtherTypes>
-struct indices_statically_known_to_increase_impl<IndexList<FirstType, OtherTypes...> > {
- EIGEN_DEVICE_FUNC static constexpr bool run() {
- return Eigen::IndexList<FirstType, OtherTypes...>().values_statically_known_to_increase();
- }
-};
-
-template <typename FirstType, typename... OtherTypes>
-struct indices_statically_known_to_increase_impl<const IndexList<FirstType, OtherTypes...> > {
- EIGEN_DEVICE_FUNC static constexpr bool run() {
- return Eigen::IndexList<FirstType, OtherTypes...>().values_statically_known_to_increase();
- }
-};
-
-
-template <typename Tx>
-struct index_statically_eq_impl {
- EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
- return false;
- }
-};
-
-template <typename FirstType, typename... OtherTypes>
-struct index_statically_eq_impl<IndexList<FirstType, OtherTypes...> > {
- EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
- return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
- (IndexList<FirstType, OtherTypes...>().get(i) == value);
- }
-};
-
-template <typename FirstType, typename... OtherTypes>
-struct index_statically_eq_impl<const IndexList<FirstType, OtherTypes...> > {
- EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
- return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
- (IndexList<FirstType, OtherTypes...>().get(i) == value);
- }
-};
-
-
-template <typename T>
-struct index_statically_ne_impl {
- EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
- return false;
- }
-};
-
-template <typename FirstType, typename... OtherTypes>
-struct index_statically_ne_impl<IndexList<FirstType, OtherTypes...> > {
- EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
- return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
- (IndexList<FirstType, OtherTypes...>().get(i) != value);
- }
-};
-
-template <typename FirstType, typename... OtherTypes>
-struct index_statically_ne_impl<const IndexList<FirstType, OtherTypes...> > {
- EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
- return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
- (IndexList<FirstType, OtherTypes...>().get(i) != value);
- }
-};
-
-
-template <typename T>
-struct index_statically_gt_impl {
- EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
- return false;
- }
-};
-
-template <typename FirstType, typename... OtherTypes>
-struct index_statically_gt_impl<IndexList<FirstType, OtherTypes...> > {
- EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
- return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
- (IndexList<FirstType, OtherTypes...>().get(i) > value);
- }
-};
-
-template <typename FirstType, typename... OtherTypes>
-struct index_statically_gt_impl<const IndexList<FirstType, OtherTypes...> > {
- EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
- return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
- (IndexList<FirstType, OtherTypes...>().get(i) > value);
- }
-};
-
-
-
-template <typename T>
-struct index_statically_lt_impl {
- EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
- return false;
- }
-};
-
-template <typename FirstType, typename... OtherTypes>
-struct index_statically_lt_impl<IndexList<FirstType, OtherTypes...> > {
- EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
- return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
- (IndexList<FirstType, OtherTypes...>().get(i) < value);
- }
-};
-
-template <typename FirstType, typename... OtherTypes>
-struct index_statically_lt_impl<const IndexList<FirstType, OtherTypes...> > {
- EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
- return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
- (IndexList<FirstType, OtherTypes...>().get(i) < value);
- }
-};
-
-
-
-template <typename Tx>
-struct index_pair_first_statically_eq_impl {
- EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
- return false;
- }
-};
-
-template <typename FirstType, typename... OtherTypes>
-struct index_pair_first_statically_eq_impl<IndexPairList<FirstType, OtherTypes...> > {
- EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
- return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
- (IndexPairList<FirstType, OtherTypes...>().operator[](i).first == value);
- }
-};
-
-template <typename FirstType, typename... OtherTypes>
-struct index_pair_first_statically_eq_impl<const IndexPairList<FirstType, OtherTypes...> > {
- EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
- return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
- (IndexPairList<FirstType, OtherTypes...>().operator[](i).first == value);
- }
-};
-
-
-
-template <typename Tx>
-struct index_pair_second_statically_eq_impl {
- EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
- return false;
- }
-};
-
-template <typename FirstType, typename... OtherTypes>
-struct index_pair_second_statically_eq_impl<IndexPairList<FirstType, OtherTypes...> > {
- EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
- return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
- (IndexPairList<FirstType, OtherTypes...>().operator[](i).second == value);
- }
-};
-
-template <typename FirstType, typename... OtherTypes>
-struct index_pair_second_statically_eq_impl<const IndexPairList<FirstType, OtherTypes...> > {
- EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
- return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
- (IndexPairList<FirstType, OtherTypes...>().operator[](i).second == value);
- }
-};
-
-
-} // end namespace internal
-} // end namespace Eigen
-
-#else
-
-namespace Eigen {
-namespace internal {
-
-template <typename T>
-struct index_known_statically_impl {
- static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const Index) {
- return false;
- }
-};
-
-template <typename T>
-struct all_indices_known_statically_impl {
- static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() {
- return false;
- }
-};
-
-template <typename T>
-struct indices_statically_known_to_increase_impl {
- static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() {
- return false;
- }
-};
-
-template <typename T>
-struct index_statically_eq_impl {
- static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
- return false;
- }
-};
-
-template <typename T>
-struct index_statically_ne_impl {
- static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
- return false;
- }
-};
-
-template <typename T>
-struct index_statically_gt_impl {
- static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
- return false;
- }
-};
-
-template <typename T>
-struct index_statically_lt_impl {
- static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
- return false;
- }
-};
-
-template <typename Tx>
-struct index_pair_first_statically_eq_impl {
- static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
- return false;
- }
-};
-
-template <typename Tx>
-struct index_pair_second_statically_eq_impl {
- static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
- return false;
- }
-};
-
-
-
-} // end namespace internal
-} // end namespace Eigen
-
-#endif
-
-
-namespace Eigen {
-namespace internal {
-template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_known_statically(Index i) {
- return index_known_statically_impl<T>::run(i);
-}
-
-template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool all_indices_known_statically() {
- return all_indices_known_statically_impl<T>::run();
-}
-
-template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool indices_statically_known_to_increase() {
- return indices_statically_known_to_increase_impl<T>::run();
-}
-
-template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_eq(Index i, Index value) {
- return index_statically_eq_impl<T>::run(i, value);
-}
-
-template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_ne(Index i, Index value) {
- return index_statically_ne_impl<T>::run(i, value);
-}
-
-template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_gt(Index i, Index value) {
- return index_statically_gt_impl<T>::run(i, value);
-}
-
-template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_lt(Index i, Index value) {
- return index_statically_lt_impl<T>::run(i, value);
-}
-
-template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_first_statically_eq(Index i, Index value) {
- return index_pair_first_statically_eq_impl<T>::run(i, value);
-}
-
-template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_second_statically_eq(Index i, Index value) {
- return index_pair_second_statically_eq_impl<T>::run(i, value);
-}
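
Since a default-constructed IndexList is constexpr, these predicates can be evaluated inside static_assert; a minimal sketch, assuming a compiler for which EIGEN_HAS_CONSTEXPR holds:

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      using List = Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<2> >;
      // Both entries are compile-time constants, and 0 < 2.
      static_assert(Eigen::internal::all_indices_known_statically<List>(), "");
      static_assert(Eigen::internal::indices_statically_known_to_increase<List>(), "");
      static_assert(Eigen::internal::index_statically_eq<List>(1, 2), "");
      return 0;
    }
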
-
-} // end namespace internal
-} // end namespace Eigen
-
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorInflation.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorInflation.h
deleted file mode 100644
index c5cb61a..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorInflation.h
+++ /dev/null
@@ -1,247 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2015 Ke Yang <yangke@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H
-#define EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H
-
-namespace Eigen {
-
-/** \class TensorInflation
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Tensor inflation class.
- *
- * Inflates a tensor along each dimension by inserting (stride - 1) zero
- * coefficients between every pair of adjacent coefficients.
- */
-namespace internal {
-template<typename Strides, typename XprType>
-struct traits<TensorInflationOp<Strides, XprType> > : public traits<XprType>
-{
- typedef typename XprType::Scalar Scalar;
- typedef traits<XprType> XprTraits;
- typedef typename XprTraits::StorageKind StorageKind;
- typedef typename XprTraits::Index Index;
- typedef typename XprType::Nested Nested;
- typedef typename remove_reference<Nested>::type _Nested;
- static const int NumDimensions = XprTraits::NumDimensions;
- static const int Layout = XprTraits::Layout;
- typedef typename XprTraits::PointerType PointerType;
-};
-
-template<typename Strides, typename XprType>
-struct eval<TensorInflationOp<Strides, XprType>, Eigen::Dense>
-{
- typedef const TensorInflationOp<Strides, XprType>& type;
-};
-
-template<typename Strides, typename XprType>
-struct nested<TensorInflationOp<Strides, XprType>, 1, typename eval<TensorInflationOp<Strides, XprType> >::type>
-{
- typedef TensorInflationOp<Strides, XprType> type;
-};
-
-} // end namespace internal
-
-template<typename Strides, typename XprType>
-class TensorInflationOp : public TensorBase<TensorInflationOp<Strides, XprType>, ReadOnlyAccessors>
-{
- public:
- typedef typename Eigen::internal::traits<TensorInflationOp>::Scalar Scalar;
- typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename Eigen::internal::nested<TensorInflationOp>::type Nested;
- typedef typename Eigen::internal::traits<TensorInflationOp>::StorageKind StorageKind;
- typedef typename Eigen::internal::traits<TensorInflationOp>::Index Index;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorInflationOp(const XprType& expr, const Strides& strides)
- : m_xpr(expr), m_strides(strides) {}
-
- EIGEN_DEVICE_FUNC
- const Strides& strides() const { return m_strides; }
-
- EIGEN_DEVICE_FUNC
- const typename internal::remove_all<typename XprType::Nested>::type&
- expression() const { return m_xpr; }
-
- protected:
- typename XprType::Nested m_xpr;
- const Strides m_strides;
-};
-
-// Eval as rvalue
-template<typename Strides, typename ArgType, typename Device>
-struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
-{
- typedef TensorInflationOp<Strides, ArgType> XprType;
- typedef typename XprType::Index Index;
- static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
- typedef DSizes<Index, NumDims> Dimensions;
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
- typedef StorageMemory<CoeffReturnType, Device> Storage;
- typedef typename Storage::Type EvaluatorPointerType;
-
- enum {
- IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
- PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = false,
- PreferBlockAccess = false,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = false
- };
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlock;
- //===--------------------------------------------------------------------===//
-
- EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_impl(op.expression(), device), m_strides(op.strides())
- {
- m_dimensions = m_impl.dimensions();
- // Expand each dimension to the inflated dimension.
- for (int i = 0; i < NumDims; ++i) {
- m_dimensions[i] = (m_dimensions[i] - 1) * op.strides()[i] + 1;
- }
-
- // Remember the strides for fast division.
- for (int i = 0; i < NumDims; ++i) {
- m_fastStrides[i] = internal::TensorIntDivisor<Index>(m_strides[i]);
- }
-
- const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- m_outputStrides[0] = 1;
- m_inputStrides[0] = 1;
- for (int i = 1; i < NumDims; ++i) {
- m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
- m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
- }
- } else { // RowMajor
- m_outputStrides[NumDims-1] = 1;
- m_inputStrides[NumDims-1] = 1;
- for (int i = NumDims - 2; i >= 0; --i) {
- m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
- m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
- }
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
-
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
- m_impl.evalSubExprsIfNeeded(NULL);
- return true;
- }
- EIGEN_STRONG_INLINE void cleanup() {
- m_impl.cleanup();
- }
-
- // Computes the input index given the output index. Returns true if the output
- // index doesn't fall into a hole.
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool getInputIndex(Index index, Index* inputIndex) const
- {
- eigen_assert(index < dimensions().TotalSize());
- *inputIndex = 0;
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- EIGEN_UNROLL_LOOP
- for (int i = NumDims - 1; i > 0; --i) {
- const Index idx = index / m_outputStrides[i];
- if (idx != idx / m_fastStrides[i] * m_strides[i]) {
- return false;
- }
- *inputIndex += idx / m_strides[i] * m_inputStrides[i];
- index -= idx * m_outputStrides[i];
- }
- if (index != index / m_fastStrides[0] * m_strides[0]) {
- return false;
- }
- *inputIndex += index / m_strides[0];
- return true;
- } else {
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < NumDims - 1; ++i) {
- const Index idx = index / m_outputStrides[i];
- if (idx != idx / m_fastStrides[i] * m_strides[i]) {
- return false;
- }
- *inputIndex += idx / m_strides[i] * m_inputStrides[i];
- index -= idx * m_outputStrides[i];
- }
- if (index != index / m_fastStrides[NumDims-1] * m_strides[NumDims-1]) {
- return false;
- }
- *inputIndex += index / m_strides[NumDims - 1];
- }
- return true;
- }
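
In one dimension the hole test reduces to a divisibility check: output position p maps back to input p / stride only when p is a multiple of the stride. A standalone sketch:

    #include <cassert>

    // 1-D version of the check above: stride 3 inflates {a, b} to {a, 0, 0, b}.
    bool getInputIndex1D(long index, long stride, long* inputIndex) {
      if (index != (index / stride) * stride) return false;  // hit a hole
      *inputIndex = index / stride;
      return true;
    }

    int main() {
      long in = -1;
      assert(getInputIndex1D(3, 3, &in) && in == 1);  // lands on 'b'
      assert(!getInputIndex1D(2, 3, &in));            // one of the zeros
    }
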
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
- {
- Index inputIndex = 0;
- if (getInputIndex(index, &inputIndex)) {
- return m_impl.coeff(inputIndex);
- } else {
- return Scalar(0);
- }
- }
-
- // TODO(yangke): optimize this function so that we can detect and produce
- // all-zero packets
- template<int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
- {
- EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
-
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < PacketSize; ++i) {
- values[i] = coeff(index+i);
- }
- PacketReturnType rslt = internal::pload<PacketReturnType>(values);
- return rslt;
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
- const double compute_cost = NumDims * (3 * TensorOpCost::DivCost<Index>() +
- 3 * TensorOpCost::MulCost<Index>() +
- 2 * TensorOpCost::AddCost<Index>());
- const double input_size = m_impl.dimensions().TotalSize();
- const double output_size = m_dimensions.TotalSize();
- if (output_size == 0)
- return TensorOpCost();
- return m_impl.costPerCoeff(vectorized) +
- TensorOpCost(sizeof(CoeffReturnType) * input_size / output_size, 0,
- compute_cost, vectorized, PacketSize);
- }
-
- EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
-
-#ifdef EIGEN_USE_SYCL
- // binding placeholder accessors to a command group handler for SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
- m_impl.bind(cgh);
- }
-#endif
-
- protected:
- Dimensions m_dimensions;
- array<Index, NumDims> m_outputStrides;
- array<Index, NumDims> m_inputStrides;
- TensorEvaluator<ArgType, Device> m_impl;
- const Strides m_strides;
- array<internal::TensorIntDivisor<Index>, NumDims> m_fastStrides;
-};
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorInitializer.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorInitializer.h
deleted file mode 100644
index 26a3818..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorInitializer.h
+++ /dev/null
@@ -1,82 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H
-#define EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H
-
-#if EIGEN_HAS_VARIADIC_TEMPLATES
-
-#include <initializer_list>
-
-namespace Eigen {
-
-/** \class TensorInitializer
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Helper template to initialize Tensors from std::initializer_lists.
- */
-namespace internal {
-
-template <typename Derived, int N>
-struct Initializer {
- typedef std::initializer_list<
- typename Initializer<Derived, N - 1>::InitList> InitList;
-
- static void run(TensorEvaluator<Derived, DefaultDevice>& tensor,
- Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions>* indices,
- const InitList& vals) {
- int i = 0;
- for (const auto& v : vals) {
- (*indices)[traits<Derived>::NumDimensions - N] = i++;
- Initializer<Derived, N - 1>::run(tensor, indices, v);
- }
- }
-};
-
-template <typename Derived>
-struct Initializer<Derived, 1> {
- typedef std::initializer_list<typename traits<Derived>::Scalar> InitList;
-
- static void run(TensorEvaluator<Derived, DefaultDevice>& tensor,
- Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions>* indices,
- const InitList& vals) {
- int i = 0;
- // There is likely a faster way to do that than iterating.
- for (const auto& v : vals) {
- (*indices)[traits<Derived>::NumDimensions - 1] = i++;
- tensor.coeffRef(*indices) = v;
- }
- }
-};
-
-template <typename Derived>
-struct Initializer<Derived, 0> {
- typedef typename traits<Derived>::Scalar InitList;
-
- static void run(TensorEvaluator<Derived, DefaultDevice>& tensor,
- Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions>*,
- const InitList& v) {
- tensor.coeffRef(0) = v;
- }
-};
-
-
-template <typename Derived, int N>
-void initialize_tensor(TensorEvaluator<Derived, DefaultDevice>& tensor,
- const typename Initializer<Derived, traits<Derived>::NumDimensions>::InitList& vals) {
- Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions> indices;
- Initializer<Derived, traits<Derived>::NumDimensions>::run(tensor, &indices, vals);
-}
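
This recursion is what backs Tensor::setValues (and the initializer-list constructors): one brace level per dimension, outermost dimension first.

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<int, 2> t(2, 3);
      t.setValues({{1, 2, 3},
                   {4, 5, 6}});
      return t(1, 2) == 6 ? 0 : 1;  // row 1, column 2
    }
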
-
-} // namespace internal
-} // namespace Eigen
-
-#endif // EIGEN_HAS_VARIADIC_TEMPLATES
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorIntDiv.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorIntDiv.h
deleted file mode 100644
index 6d5cce4..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorIntDiv.h
+++ /dev/null
@@ -1,263 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H
-#define EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H
-
-
-namespace Eigen {
-
-/** \internal
- *
- * \class TensorIntDiv
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Fast integer division by a constant.
- *
- * See the paper from Granlund and Montgomery for explanation.
- * (at https://doi.org/10.1145/773473.178249)
- *
- * \sa Tensor
- */
-
-namespace internal {
-
-namespace {
-
- // Note: result is undefined if val == 0
- template <typename T>
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
- typename internal::enable_if<sizeof(T)==4,int>::type count_leading_zeros(const T val)
- {
-#ifdef EIGEN_GPU_COMPILE_PHASE
- return __clz(val);
-#elif defined(SYCL_DEVICE_ONLY)
- return cl::sycl::clz(val);
-#elif EIGEN_COMP_MSVC
- unsigned long index;
- _BitScanReverse(&index, val);
- return 31 - index;
-#else
- EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE);
- return __builtin_clz(static_cast<uint32_t>(val));
-#endif
- }
-
- template <typename T>
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
- typename internal::enable_if<sizeof(T)==8,int>::type count_leading_zeros(const T val)
- {
-#ifdef EIGEN_GPU_COMPILE_PHASE
- return __clzll(val);
-#elif defined(SYCL_DEVICE_ONLY)
- return static_cast<int>(cl::sycl::clz(val));
-#elif EIGEN_COMP_MSVC && EIGEN_ARCH_x86_64
- unsigned long index;
- _BitScanReverse64(&index, val);
- return 63 - index;
-#elif EIGEN_COMP_MSVC
- // MSVC's _BitScanReverse64 is not available for 32-bit builds.
- unsigned int lo = (unsigned int)(val&0xffffffff);
- unsigned int hi = (unsigned int)((val>>32)&0xffffffff);
- int n;
- if(hi==0)
- n = 32 + count_leading_zeros<unsigned int>(lo);
- else
- n = count_leading_zeros<unsigned int>(hi);
- return n;
-#else
- EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE);
- return __builtin_clzll(static_cast<uint64_t>(val));
-#endif
- }
-
- template <typename T>
- struct UnsignedTraits {
- typedef typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type type;
- };
-
- template <typename T>
- struct DividerTraits {
- typedef typename UnsignedTraits<T>::type type;
- static const int N = sizeof(T) * 8;
- };
-
- template <typename T>
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) {
-#if defined(EIGEN_GPU_COMPILE_PHASE)
- return __umulhi(a, b);
-#elif defined(SYCL_DEVICE_ONLY)
- return cl::sycl::mul_hi(a, static_cast<uint32_t>(b));
-#else
- return (static_cast<uint64_t>(a) * b) >> 32;
-#endif
- }
-
- template <typename T>
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
-#if defined(EIGEN_GPU_COMPILE_PHASE)
- return __umul64hi(a, b);
-#elif defined(SYCL_DEVICE_ONLY)
- return cl::sycl::mul_hi(a, static_cast<uint64_t>(b));
-#elif EIGEN_HAS_BUILTIN_INT128
- __uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b);
- return static_cast<uint64_t>(v >> 64);
-#else
- return (TensorUInt128<static_val<0>, uint64_t>(a) * TensorUInt128<static_val<0>, uint64_t>(b)).upper();
-#endif
- }
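
muluh is simply the high half of the widened product; a tiny standalone check of what the generic (non-GPU, non-SYCL) 32-bit branch computes:

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t a = 0x80000000u;                       // 2^31
      const uint64_t full = static_cast<uint64_t>(a) * 4u;  // 2^33
      // High 32 bits of the 64-bit product, as in the fallback branch above.
      assert(static_cast<uint32_t>(full >> 32) == 2u);
    }
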
-
- template <int N, typename T>
- struct DividerHelper {
- static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t computeMultiplier(const int log_div, const T divider) {
- EIGEN_STATIC_ASSERT(N == 32, YOU_MADE_A_PROGRAMMING_MISTAKE);
- return static_cast<uint32_t>((static_cast<uint64_t>(1) << (N+log_div)) / divider - (static_cast<uint64_t>(1) << N) + 1);
- }
- };
-
- template <typename T>
- struct DividerHelper<64, T> {
- static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) {
-#if EIGEN_HAS_BUILTIN_INT128 && !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY)
- return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1);
-#else
- const uint64_t shift = 1ULL << log_div;
- TensorUInt128<uint64_t, uint64_t> result = TensorUInt128<uint64_t, static_val<0> >(shift, 0) / TensorUInt128<static_val<0>, uint64_t>(divider)
- - TensorUInt128<static_val<1>, static_val<0> >(1, 0)
- + TensorUInt128<static_val<0>, static_val<1> >(1);
- return static_cast<uint64_t>(result);
-#endif
- }
- };
-}
-
-
-template <typename T, bool div_gt_one = false>
-struct TensorIntDivisor {
- public:
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() {
- multiplier = 0;
- shift1 = 0;
- shift2 = 0;
- }
-
- // Must have 0 < divider < 2^31. This is relaxed to
- // 0 < divider < 2^63 when using 64-bit indices on platforms that support
- // the __uint128_t type.
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor(const T divider) {
- const int N = DividerTraits<T>::N;
- eigen_assert(static_cast<typename UnsignedTraits<T>::type>(divider) < NumTraits<UnsignedType>::highest()/2);
- eigen_assert(divider > 0);
-
-    // Fast log2: compute log_div = ceil(log2(divider)) from the leading-zero count.
- const int leading_zeros = count_leading_zeros(static_cast<UnsignedType>(divider));
- int log_div = N - leading_zeros;
- // if divider is a power of two then log_div is 1 more than it should be.
- if ((static_cast<typename UnsignedTraits<T>::type>(1) << (log_div-1)) == static_cast<typename UnsignedTraits<T>::type>(divider))
- log_div--;
-
- multiplier = DividerHelper<N, T>::computeMultiplier(log_div, divider);
- shift1 = log_div > 1 ? 1 : log_div;
- shift2 = log_div > 1 ? log_div-1 : 0;
- }
-
- // Must have 0 <= numerator. On platforms that don't support the __uint128_t
- // type numerator should also be less than 2^32-1.
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T divide(const T numerator) const {
- eigen_assert(static_cast<typename UnsignedTraits<T>::type>(numerator) < NumTraits<UnsignedType>::highest()/2);
- //eigen_assert(numerator >= 0); // this is implicitly asserted by the line above
-
- UnsignedType t1 = muluh(multiplier, numerator);
- UnsignedType t = (static_cast<UnsignedType>(numerator) - t1) >> shift1;
- return (t1 + t) >> shift2;
- }
-
- private:
- typedef typename DividerTraits<T>::type UnsignedType;
- UnsignedType multiplier;
- int32_t shift1;
- int32_t shift2;
-};
-
-
-// Optimized version for signed 32-bit integers.
-// Derived from Hacker's Delight.
-// Only works for divisors strictly greater than one.
-template <>
-class TensorIntDivisor<int32_t, true> {
- public:
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() {
- magic = 0;
- shift = 0;
- }
- // Must have 2 <= divider
- EIGEN_DEVICE_FUNC TensorIntDivisor(int32_t divider) {
- eigen_assert(divider >= 2);
- calcMagic(divider);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const {
-#ifdef EIGEN_GPU_COMPILE_PHASE
- return (__umulhi(magic, n) >> shift);
-#elif defined(SYCL_DEVICE_ONLY)
- return (cl::sycl::mul_hi(magic, static_cast<uint32_t>(n)) >> shift);
-#else
- uint64_t v = static_cast<uint64_t>(magic) * static_cast<uint64_t>(n);
- return (static_cast<uint32_t>(v >> 32) >> shift);
-#endif
- }
-
-private:
-  // Compute the magic numbers. See Hacker's Delight section 10 for an
-  // in-depth explanation.
- EIGEN_DEVICE_FUNC void calcMagic(int32_t d) {
- const unsigned two31 = 0x80000000; // 2**31.
- unsigned ad = d;
- unsigned t = two31 + (ad >> 31);
- unsigned anc = t - 1 - t%ad; // Absolute value of nc.
- int p = 31; // Init. p.
- unsigned q1 = two31/anc; // Init. q1 = 2**p/|nc|.
- unsigned r1 = two31 - q1*anc; // Init. r1 = rem(2**p, |nc|).
- unsigned q2 = two31/ad; // Init. q2 = 2**p/|d|.
- unsigned r2 = two31 - q2*ad; // Init. r2 = rem(2**p, |d|).
- unsigned delta = 0;
- do {
- p = p + 1;
- q1 = 2*q1; // Update q1 = 2**p/|nc|.
- r1 = 2*r1; // Update r1 = rem(2**p, |nc|).
- if (r1 >= anc) { // (Must be an unsigned
- q1 = q1 + 1; // comparison here).
- r1 = r1 - anc;}
- q2 = 2*q2; // Update q2 = 2**p/|d|.
- r2 = 2*r2; // Update r2 = rem(2**p, |d|).
- if (r2 >= ad) { // (Must be an unsigned
- q2 = q2 + 1; // comparison here).
- r2 = r2 - ad;}
- delta = ad - r2;
- } while (q1 < delta || (q1 == delta && r1 == 0));
-
- magic = (unsigned)(q2 + 1);
- shift = p - 32;
- }
-
- uint32_t magic;
- int32_t shift;
-};
-
-
-template <typename T, bool div_gt_one>
-static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator / (const T& numerator, const TensorIntDivisor<T, div_gt_one>& divisor) {
- return divisor.divide(numerator);
-}
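-
-// Usage sketch (illustrative): construct the divisor once, then each division
-// by it becomes a multiply plus shifts. For the int32_t specialization above,
-// calcMagic(7) yields magic = 0x92492493 and shift = 2, so
-// TensorIntDivisor<int32_t, true> fast_div(7) evaluates 14 / fast_div as
-// (__umulhi(0x92492493, 14) >> 2) == (8 >> 2) == 2.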
-
-
-} // end namespace internal
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorLayoutSwap.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorLayoutSwap.h
deleted file mode 100644
index 80106c1..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorLayoutSwap.h
+++ /dev/null
@@ -1,216 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H
-#define EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H
-
-namespace Eigen {
-
-/** \class TensorLayoutSwap
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Swap the layout from col-major to row-major, or row-major
- * to col-major, and invert the order of the dimensions.
- *
- * Beware: the dimensions are reversed by this operation. If you want to
- * preserve the ordering of the dimensions, you need to combine this
- * operation with a shuffle.
- *
- * \example:
- * Tensor<float, 2, ColMajor> input(2, 4);
- * Tensor<float, 2, RowMajor> output = input.swap_layout();
- * eigen_assert(output.dimension(0) == 4);
- * eigen_assert(output.dimension(1) == 2);
- *
- * array<int, 2> shuffle{1, 0};
- * output = input.swap_layout().shuffle(shuffle);
- * eigen_assert(output.dimension(0) == 2);
- * eigen_assert(output.dimension(1) == 4);
- *
- */
-namespace internal {
-template<typename XprType>
-struct traits<TensorLayoutSwapOp<XprType> > : public traits<XprType>
-{
- typedef typename XprType::Scalar Scalar;
- typedef traits<XprType> XprTraits;
- typedef typename XprTraits::StorageKind StorageKind;
- typedef typename XprTraits::Index Index;
- typedef typename XprType::Nested Nested;
- typedef typename remove_reference<Nested>::type _Nested;
- static const int NumDimensions = traits<XprType>::NumDimensions;
- static const int Layout = (traits<XprType>::Layout == ColMajor) ? RowMajor : ColMajor;
- typedef typename XprTraits::PointerType PointerType;
-};
-
-template<typename XprType>
-struct eval<TensorLayoutSwapOp<XprType>, Eigen::Dense>
-{
- typedef const TensorLayoutSwapOp<XprType>& type;
-};
-
-template<typename XprType>
-struct nested<TensorLayoutSwapOp<XprType>, 1, typename eval<TensorLayoutSwapOp<XprType> >::type>
-{
- typedef TensorLayoutSwapOp<XprType> type;
-};
-
-} // end namespace internal
-
-
-
-template<typename XprType>
-class TensorLayoutSwapOp : public TensorBase<TensorLayoutSwapOp<XprType>, WriteAccessors>
-{
- public:
- typedef TensorBase<TensorLayoutSwapOp<XprType>, WriteAccessors> Base;
- typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Scalar Scalar;
- typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
- typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
- typedef typename Eigen::internal::nested<TensorLayoutSwapOp>::type Nested;
- typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::StorageKind StorageKind;
- typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Index Index;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorLayoutSwapOp(const XprType& expr)
- : m_xpr(expr) {}
-
- EIGEN_DEVICE_FUNC
- const typename internal::remove_all<typename XprType::Nested>::type&
- expression() const { return m_xpr; }
-
- EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorLayoutSwapOp)
- protected:
- typename XprType::Nested m_xpr;
-};
-
-
-// Eval as rvalue
-template<typename ArgType, typename Device>
-struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
-{
- typedef TensorLayoutSwapOp<ArgType> XprType;
- typedef typename XprType::Index Index;
- static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
- typedef DSizes<Index, NumDims> Dimensions;
-
- enum {
- IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
- PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = false,
- PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
- Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
- CoordAccess = false, // to be implemented
- RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
- };
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlock;
- //===--------------------------------------------------------------------===//
-
- EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_impl(op.expression(), device)
- {
- for(int i = 0; i < NumDims; ++i) {
- m_dimensions[i] = m_impl.dimensions()[NumDims-1-i];
- }
- }
-
-#ifdef EIGEN_USE_SYCL
- // binding placeholder accessors to a command group handler for SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
- m_impl.bind(cgh);
- }
-#endif
-
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- typedef StorageMemory<CoeffReturnType, Device> Storage;
- typedef typename Storage::Type EvaluatorPointerType;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
-
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
- return m_impl.evalSubExprsIfNeeded(data);
- }
- EIGEN_STRONG_INLINE void cleanup() {
- m_impl.cleanup();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
- {
- return m_impl.coeff(index);
- }
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
- {
- return m_impl.template packet<LoadMode>(index);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
- return m_impl.costPerCoeff(vectorized);
- }
-
- EIGEN_DEVICE_FUNC typename Storage::Type data() const {
- return constCast(m_impl.data());
- }
-
- const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
-
- protected:
- TensorEvaluator<ArgType, Device> m_impl;
- Dimensions m_dimensions;
-};
-
-
-// Eval as lvalue
-template<typename ArgType, typename Device>
- struct TensorEvaluator<TensorLayoutSwapOp<ArgType>, Device>
- : public TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
-{
- typedef TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device> Base;
- typedef TensorLayoutSwapOp<ArgType> XprType;
-
- enum {
- IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
- PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = false,
- PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
- Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
- CoordAccess = false // to be implemented
- };
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlock;
- //===--------------------------------------------------------------------===//
-
- EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : Base(op, device)
- { }
-
- typedef typename XprType::Index Index;
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
- {
- return this->m_impl.coeffRef(index);
- }
- template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- void writePacket(Index index, const PacketReturnType& x)
- {
- this->m_impl.template writePacket<StoreMode>(index, x);
- }
-};
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorMacros.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorMacros.h
deleted file mode 100644
index 73ff3d2..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorMacros.h
+++ /dev/null
@@ -1,98 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_META_MACROS_H
-#define EIGEN_CXX11_TENSOR_TENSOR_META_MACROS_H
-
-
-/** Use this macro for SFINAE selection in templated functions:
- *
- * template<typename T,
- *          typename std::enable_if< isBanana<T>::value , int >::type = 0
- * >
- * void foo(){}
- *
- * becomes =>
- *
- * template<typename T,
- *          EIGEN_SFINAE_ENABLE_IF( isBanana<T>::value )
- * >
- * void foo(){}
- */
-
-// SFINAE requires variadic templates
-#if !defined(EIGEN_GPUCC)
-#if EIGEN_HAS_VARIADIC_TEMPLATES
- // SFINAE doesn't work for gcc <= 4.7
- #ifdef EIGEN_COMP_GNUC
- #if EIGEN_GNUC_AT_LEAST(4,8)
- #define EIGEN_HAS_SFINAE
- #endif
- #else
- #define EIGEN_HAS_SFINAE
- #endif
-#endif
-#endif
-
-#define EIGEN_SFINAE_ENABLE_IF( __condition__ ) \
- typename internal::enable_if< ( __condition__ ) , int >::type = 0
-
-// Define a macro to use a reference on the host but a value on the device
-#if defined(SYCL_DEVICE_ONLY)
- #define EIGEN_DEVICE_REF
-#else
- #define EIGEN_DEVICE_REF &
-#endif
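-
-// For example, a parameter declared as `Index EIGEN_DEVICE_REF i` is taken as
-// `Index& i` on the host but as a by-value `Index i` in SYCL device code,
-// where host references are not usable.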
-
-// Define a macro for catching SYCL exceptions if exceptions are enabled
-#define EIGEN_SYCL_TRY_CATCH(X) \
- do { \
- EIGEN_TRY {X;} \
- EIGEN_CATCH(const cl::sycl::exception& e) { \
- EIGEN_THROW_X(std::runtime_error("SYCL exception at " + \
- std::string(__FILE__) + ":" + \
- std::to_string(__LINE__) + "\n" + \
- e.what())); \
- } \
- } while (false)
-
-// Define a macro if local memory flags are unset or one of them is set
-// Setting both flags is the same as unsetting them
-#if (!defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)) || \
- (defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM))
- #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON 1
- #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF 1
-#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
- #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON 1
-#elif !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
- #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF 1
-#endif
-
-#if EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653)
- #define EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
- using Base::operator =; \
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) { Base::operator=(other); return *this; } \
- template <typename OtherDerived> \
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const OtherDerived& other) { Base::operator=(other); return *this; }
-#else
- #define EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
- EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived)
-#endif
-
-/** \internal
- * \brief Macro to manually inherit assignment operators.
- * This is necessary because the implicitly defined assignment operator gets deleted when a custom operator= is defined.
- * This also inherits the template<OtherDerived> operator=(const OtherDerived&) assignments.
- * With C++11 or later this also default-implements the copy constructor.
- */
-#define EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(Derived) \
- EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
- EIGEN_DEFAULT_COPY_CONSTRUCTOR(Derived)
-
-#endif
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorMap.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorMap.h
deleted file mode 100644
index 6834c97..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorMap.h
+++ /dev/null
@@ -1,327 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_MAP_H
-#define EIGEN_CXX11_TENSOR_TENSOR_MAP_H
-
-namespace Eigen {
-
-// FIXME use proper doxygen documentation (e.g. \tparam MakePointer_)
-
-/** \class TensorMap
- * \ingroup CXX11_Tensor_Module
- *
- * \brief A tensor expression mapping an existing array of data.
- *
- */
-/// `template <class> class MakePointer_` is added to convert a host pointer to a device pointer,
-/// because our device compilers do not accept raw `T*` pointers.
-/// To reuse the same evaluator functions, the pointer type is routed through the `MakePointer_` class.
-/// By default `MakePointer_<T>::Type` is `T*`, so the extra template parameter does not
-/// break any existing code.
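-///
-/// A minimal usage sketch (illustrative): a TensorMap never owns memory, it is
-/// a view over a buffer supplied by the caller:
-///
-///     float data[12];
-///     TensorMap<Tensor<float, 2> > mapped(data, 3, 4);  // 3x4 view over data
-///     mapped(1, 2) = 0.0f;                              // writes into data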
-template<typename PlainObjectType, int Options_, template <class> class MakePointer_> class TensorMap : public TensorBase<TensorMap<PlainObjectType, Options_, MakePointer_> >
-{
- public:
- typedef TensorMap<PlainObjectType, Options_, MakePointer_> Self;
- typedef TensorBase<TensorMap<PlainObjectType, Options_, MakePointer_> > Base;
- #ifdef EIGEN_USE_SYCL
- typedef typename Eigen::internal::remove_reference<typename Eigen::internal::nested<Self>::type>::type Nested;
- #else
- typedef typename Eigen::internal::nested<Self>::type Nested;
- #endif
- typedef typename internal::traits<PlainObjectType>::StorageKind StorageKind;
- typedef typename internal::traits<PlainObjectType>::Index Index;
- typedef typename internal::traits<PlainObjectType>::Scalar Scalar;
- typedef typename NumTraits<Scalar>::Real RealScalar;
- typedef typename PlainObjectType::Base::CoeffReturnType CoeffReturnType;
-
- typedef typename MakePointer_<Scalar>::Type PointerType;
- typedef typename MakePointer_<Scalar>::ConstType PointerConstType;
-
-  // WARN: PointerType can still be a pointer to const (const Scalar*), for
- // example in TensorMap<Tensor<const Scalar, ...>> expression. This type of
- // expression should be illegal, but adding this restriction is not possible
- // in practice (see https://bitbucket.org/eigen/eigen/pull-requests/488).
- typedef typename internal::conditional<
- bool(internal::is_lvalue<PlainObjectType>::value),
- PointerType, // use simple pointer in lvalue expressions
- PointerConstType // use const pointer in rvalue expressions
- >::type StoragePointerType;
-
- // If TensorMap was constructed over rvalue expression (e.g. const Tensor),
- // we should return a reference to const from operator() (and others), even
- // if TensorMap itself is not const.
- typedef typename internal::conditional<
- bool(internal::is_lvalue<PlainObjectType>::value),
- Scalar&,
- const Scalar&
- >::type StorageRefType;
-
- static const int Options = Options_;
-
- static const Index NumIndices = PlainObjectType::NumIndices;
- typedef typename PlainObjectType::Dimensions Dimensions;
-
- enum {
- IsAligned = ((int(Options_)&Aligned)==Aligned),
- Layout = PlainObjectType::Layout,
- CoordAccess = true,
- RawAccess = true
- };
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr) : m_data(dataPtr), m_dimensions() {
- // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
- EIGEN_STATIC_ASSERT((0 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE)
- }
-
-#if EIGEN_HAS_VARIADIC_TEMPLATES
- template<typename... IndexTypes> EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) {
- // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
- EIGEN_STATIC_ASSERT((sizeof...(otherDimensions) + 1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE)
- }
-#else
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) {
- // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
- EIGEN_STATIC_ASSERT((1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE)
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2) : m_data(dataPtr), m_dimensions(dim1, dim2) {
- EIGEN_STATIC_ASSERT(2 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3) {
- EIGEN_STATIC_ASSERT(3 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4) {
- EIGEN_STATIC_ASSERT(4 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4, dim5) {
- EIGEN_STATIC_ASSERT(5 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
- }
-#endif
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, const array<Index, NumIndices>& dimensions)
- : m_data(dataPtr), m_dimensions(dimensions)
- { }
-
- template <typename Dimensions>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, const Dimensions& dimensions)
- : m_data(dataPtr), m_dimensions(dimensions)
- { }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PlainObjectType& tensor)
- : m_data(tensor.data()), m_dimensions(tensor.dimensions())
- { }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Index rank() const { return m_dimensions.rank(); }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_dimensions[n]; }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE StoragePointerType data() { return m_data; }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE StoragePointerType data() const { return m_data; }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE StorageRefType operator()(const array<Index, NumIndices>& indices) const
- {
- // eigen_assert(checkIndexRange(indices));
- if (PlainObjectType::Options&RowMajor) {
- const Index index = m_dimensions.IndexOfRowMajor(indices);
- return m_data[index];
- } else {
- const Index index = m_dimensions.IndexOfColMajor(indices);
- return m_data[index];
- }
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE StorageRefType operator()() const
- {
- EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE)
- return m_data[0];
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE StorageRefType operator()(Index index) const
- {
- eigen_internal_assert(index >= 0 && index < size());
- return m_data[index];
- }
-
-#if EIGEN_HAS_VARIADIC_TEMPLATES
- template<typename... IndexTypes> EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE StorageRefType operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
- {
- EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(internal::all((Eigen::NumTraits<Index>::highest() >= otherIndices)...));
- if (PlainObjectType::Options&RowMajor) {
- const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
- return m_data[index];
- } else {
- const Index index = m_dimensions.IndexOfColMajor(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
- return m_data[index];
- }
- }
-#else
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1) const
- {
- if (PlainObjectType::Options&RowMajor) {
- const Index index = i1 + i0 * m_dimensions[1];
- return m_data[index];
- } else {
- const Index index = i0 + i1 * m_dimensions[0];
- return m_data[index];
- }
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2) const
- {
- if (PlainObjectType::Options&RowMajor) {
- const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0);
- return m_data[index];
- } else {
- const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2);
- return m_data[index];
- }
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3) const
- {
- if (PlainObjectType::Options&RowMajor) {
- const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0));
- return m_data[index];
- } else {
- const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * i3));
- return m_data[index];
- }
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const
- {
- if (PlainObjectType::Options&RowMajor) {
- const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)));
- return m_data[index];
- } else {
- const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * (i3 + m_dimensions[3] * i4)));
- return m_data[index];
- }
- }
-#endif
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE StorageRefType operator()(const array<Index, NumIndices>& indices)
- {
- // eigen_assert(checkIndexRange(indices));
- if (PlainObjectType::Options&RowMajor) {
- const Index index = m_dimensions.IndexOfRowMajor(indices);
- return m_data[index];
- } else {
- const Index index = m_dimensions.IndexOfColMajor(indices);
- return m_data[index];
- }
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE StorageRefType operator()()
- {
- EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE)
- return m_data[0];
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE StorageRefType operator()(Index index)
- {
- eigen_internal_assert(index >= 0 && index < size());
- return m_data[index];
- }
-
-#if EIGEN_HAS_VARIADIC_TEMPLATES
- template<typename... IndexTypes> EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE StorageRefType operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices)
- {
- static_assert(sizeof...(otherIndices) + 2 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
- eigen_assert(internal::all((Eigen::NumTraits<Index>::highest() >= otherIndices)...));
- const std::size_t NumDims = sizeof...(otherIndices) + 2;
- if (PlainObjectType::Options&RowMajor) {
- const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumDims>{{firstIndex, secondIndex, otherIndices...}});
- return m_data[index];
- } else {
- const Index index = m_dimensions.IndexOfColMajor(array<Index, NumDims>{{firstIndex, secondIndex, otherIndices...}});
- return m_data[index];
- }
- }
-#else
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1)
- {
- if (PlainObjectType::Options&RowMajor) {
- const Index index = i1 + i0 * m_dimensions[1];
- return m_data[index];
- } else {
- const Index index = i0 + i1 * m_dimensions[0];
- return m_data[index];
- }
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2)
- {
- if (PlainObjectType::Options&RowMajor) {
- const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0);
- return m_data[index];
- } else {
- const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2);
- return m_data[index];
- }
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3)
- {
- if (PlainObjectType::Options&RowMajor) {
- const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0));
- return m_data[index];
- } else {
- const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * i3));
- return m_data[index];
- }
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3, Index i4)
- {
- if (PlainObjectType::Options&RowMajor) {
- const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)));
- return m_data[index];
- } else {
- const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * (i3 + m_dimensions[3] * i4)));
- return m_data[index];
- }
- }
-#endif
-
- EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorMap)
-
- private:
- StoragePointerType m_data;
- Dimensions m_dimensions;
-};
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_MAP_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorMeta.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorMeta.h
deleted file mode 100644
index a6181d3..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorMeta.h
+++ /dev/null
@@ -1,311 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_META_H
-#define EIGEN_CXX11_TENSOR_TENSOR_META_H
-
-namespace Eigen {
-
-template<bool cond> struct Cond {};
-
-template<typename T1, typename T2> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-const T1& choose(Cond<true>, const T1& first, const T2&) {
- return first;
-}
-
-template<typename T1, typename T2> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-const T2& choose(Cond<false>, const T1&, const T2& second) {
- return second;
-}
-
-
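-// divup(x, y) computes the ceiling of x / y for positive integers,
-// e.g. divup(10, 4) == 3.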
-template <typename T, typename X, typename Y>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-T divup(const X x, const Y y) {
- return static_cast<T>((x + y - 1) / y);
-}
-
-template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-T divup(const T x, const T y) {
- return static_cast<T>((x + y - 1) / y);
-}
-
-template <size_t n> struct max_n_1 {
- static const size_t size = n;
-};
-template <> struct max_n_1<0> {
- static const size_t size = 1;
-};
-
-
-// Default packet types
-template <typename Scalar, typename Device>
-struct PacketType : internal::packet_traits<Scalar> {
- typedef typename internal::packet_traits<Scalar>::type type;
-};
-
-// For CUDA packet types when using a GpuDevice
-#if defined(EIGEN_USE_GPU) && defined(EIGEN_HAS_GPU_FP16)
-
-typedef ulonglong2 Packet4h2;
-template<>
-struct PacketType<half, GpuDevice> {
- typedef Packet4h2 type;
- static const int size = 8;
- enum {
- HasAdd = 1,
- HasSub = 1,
- HasMul = 1,
- HasNegate = 1,
- HasAbs = 1,
- HasArg = 0,
- HasAbs2 = 0,
- HasMin = 1,
- HasMax = 1,
- HasConj = 0,
- HasSetLinear = 0,
- HasBlend = 0,
-
- HasDiv = 1,
- HasSqrt = 1,
- HasRsqrt = 1,
- HasExp = 1,
- HasExpm1 = 0,
- HasLog = 1,
- HasLog1p = 0,
- HasLog10 = 0,
- HasPow = 1,
- };
-};
-#endif
-
-#if defined(EIGEN_USE_SYCL)
-
-namespace TensorSycl {
-namespace internal {
-
-template <typename Index, Index A, Index B> struct PlusOp {
- static constexpr Index Value = A + B;
-};
-
-template <typename Index, Index A, Index B> struct DivOp {
- static constexpr Index Value = A / B;
-};
-
-template <typename Index, Index start, Index end, Index step,
- template <class Indx, Indx...> class StepOp>
-struct static_for {
- template <typename UnaryOperator>
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void loop(UnaryOperator op) {
- op(start);
- static_for<Index, StepOp<Index, start, step>::Value, end, step,
- StepOp>::loop(op);
- }
-};
-template <typename Index, Index end, Index step,
- template <class Indx, Indx...> class StepOp>
-struct static_for<Index, end, end, step, StepOp> {
- template <typename UnaryOperator>
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void loop(UnaryOperator) {}
-};
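-
-// Compile-time loop sketch (illustrative): static_for<int, 0, 8, 2, PlusOp>::loop(f)
-// unrolls to f(0); f(2); f(4); f(6); and stops when the running index reaches
-// the end index (8).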
-
-template <typename OutScalar, typename Device, bool Vectorizable>
-struct Vectorise {
- static const int PacketSize = 1;
- typedef OutScalar PacketReturnType;
-};
-
-template <typename OutScalar, typename Device>
-struct Vectorise<OutScalar, Device, true> {
- static const int PacketSize = Eigen::PacketType<OutScalar, Device>::size;
- typedef typename Eigen::PacketType<OutScalar, Device>::type PacketReturnType;
-};
-
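-// Rounds x up to the nearest multiple of y, e.g. roundUp(10, 4) == 12.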
-static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index roundUp(Index x, Index y) {
- return ((((x) + (y)-1) / (y)) * (y));
-}
-
-} // namespace internal
-} // namespace TensorSycl
-
-template <>
- struct PacketType<half, SyclDevice> {
- typedef half type;
- static const int size = 1;
- enum {
- HasAdd = 0,
- HasSub = 0,
- HasMul = 0,
- HasNegate = 0,
- HasAbs = 0,
- HasArg = 0,
- HasAbs2 = 0,
- HasMin = 0,
- HasMax = 0,
- HasConj = 0,
- HasSetLinear = 0,
- HasBlend = 0
- };
-};
-template <typename Scalar>
-struct PacketType<Scalar, SyclDevice> : internal::default_packet_traits {
- typedef Scalar type;
- typedef Scalar half;
- enum {
- Vectorizable = 0,
- size = 1,
- AlignedOnScalar = 0,
- HasHalfPacket = 0
- };
- enum {
- HasAdd = 0,
- HasSub = 0,
- HasMul = 0,
- HasNegate = 0,
- HasAbs = 0,
- HasAbs2 = 0,
- HasMin = 0,
- HasMax = 0,
- HasConj = 0,
- HasSetLinear = 0
- };
-
-};
-
-template <typename Scalar>
-struct PacketType<Scalar, const SyclDevice> : PacketType<Scalar, SyclDevice>{};
-
-#ifndef EIGEN_DONT_VECTORIZE_SYCL
-#define PACKET_TYPE(CVQual, Type, val, lengths, DEV)\
-template<> struct PacketType<CVQual Type, DEV> : internal::sycl_packet_traits<val, lengths> \
-{\
- typedef typename internal::packet_traits<Type>::type type;\
- typedef typename internal::packet_traits<Type>::half half;\
-};
-
-
-PACKET_TYPE(const, float, 1, 4, SyclDevice)
-PACKET_TYPE(, float, 1, 4, SyclDevice)
-PACKET_TYPE(const, float, 1, 4, const SyclDevice)
-PACKET_TYPE(, float, 1, 4, const SyclDevice)
-
-PACKET_TYPE(const, double, 0, 2, SyclDevice)
-PACKET_TYPE(, double, 0, 2, SyclDevice)
-PACKET_TYPE(const, double, 0, 2, const SyclDevice)
-PACKET_TYPE(, double, 0, 2, const SyclDevice)
-#undef PACKET_TYPE
-
-template<> struct PacketType<half, const SyclDevice>: PacketType<half, SyclDevice>{};
-template<> struct PacketType<const half, const SyclDevice>: PacketType<half, SyclDevice>{};
-#endif
-#endif
-
-// Tuple mimics std::pair but works on e.g. nvcc.
-template <typename U, typename V> struct Tuple {
- public:
- U first;
- V second;
-
- typedef U first_type;
- typedef V second_type;
-
- EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- Tuple() : first(), second() {}
-
- EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- Tuple(const U& f, const V& s) : first(f), second(s) {}
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- void swap(Tuple& rhs) {
- using numext::swap;
- swap(first, rhs.first);
- swap(second, rhs.second);
- }
-};
-
-template <typename U, typename V>
-EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-bool operator==(const Tuple<U, V>& x, const Tuple<U, V>& y) {
- return (x.first == y.first && x.second == y.second);
-}
-
-template <typename U, typename V>
-EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-bool operator!=(const Tuple<U, V>& x, const Tuple<U, V>& y) {
- return !(x == y);
-}
-
-
-// Can't use std::pair on CUDA devices
-template <typename Idx> struct IndexPair {
- EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) {}
- EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Idx f, Idx s) : first(f), second(s) {}
-
- EIGEN_DEVICE_FUNC void set(IndexPair<Idx> val) {
- first = val.first;
- second = val.second;
- }
-
- Idx first;
- Idx second;
-};
-
-
-#ifdef EIGEN_HAS_SFINAE
-namespace internal {
-
- template<typename IndexType, typename Index, Index... Is>
- EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- array<Index, sizeof...(Is)> customIndices2Array(IndexType& idx, numeric_list<Index, Is...>) {
- return { idx[Is]... };
- }
- template<typename IndexType, typename Index>
- EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- array<Index, 0> customIndices2Array(IndexType&, numeric_list<Index>) {
- return array<Index, 0>();
- }
-
- /** Make an array (for index/dimensions) out of a custom index */
- template<typename Index, std::size_t NumIndices, typename IndexType>
- EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- array<Index, NumIndices> customIndices2Array(IndexType& idx) {
- return customIndices2Array(idx, typename gen_numeric_list<Index, NumIndices>::type{});
- }
-
-
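-  // Pre-C++11 implementation of std::is_base_of, based on the classic
-  // overload-resolution trick; usable where <type_traits> is unavailable.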
- template <typename B, typename D>
- struct is_base_of
- {
-
- typedef char (&yes)[1];
- typedef char (&no)[2];
-
- template <typename BB, typename DD>
- struct Host
- {
- operator BB*() const;
- operator DD*();
- };
-
- template<typename T>
- static yes check(D*, T);
- static no check(B*, int);
-
- static const bool value = sizeof(check(Host<B,D>(), int())) == sizeof(yes);
- };
-
-}
-#endif
-
-
-
-} // namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_META_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorMorphing.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorMorphing.h
deleted file mode 100644
index b3f00f7..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorMorphing.h
+++ /dev/null
@@ -1,1102 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H
-#define EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H
-
-namespace Eigen {
-
-/** \class TensorReshaping
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Tensor reshaping class.
- *
- */
-namespace internal {
-template<typename NewDimensions, typename XprType>
-struct traits<TensorReshapingOp<NewDimensions, XprType> > : public traits<XprType>
-{
- typedef typename XprType::Scalar Scalar;
- typedef traits<XprType> XprTraits;
- typedef typename XprTraits::StorageKind StorageKind;
- typedef typename XprTraits::Index Index;
- typedef typename XprType::Nested Nested;
- typedef typename remove_reference<Nested>::type _Nested;
- static const int NumDimensions = array_size<NewDimensions>::value;
- static const int Layout = XprTraits::Layout;
- typedef typename XprTraits::PointerType PointerType;
-};
-
-template<typename NewDimensions, typename XprType>
-struct eval<TensorReshapingOp<NewDimensions, XprType>, Eigen::Dense>
-{
- typedef const TensorReshapingOp<NewDimensions, XprType>EIGEN_DEVICE_REF type;
-};
-
-template<typename NewDimensions, typename XprType>
-struct nested<TensorReshapingOp<NewDimensions, XprType>, 1, typename eval<TensorReshapingOp<NewDimensions, XprType> >::type>
-{
- typedef TensorReshapingOp<NewDimensions, XprType> type;
-};
-
-} // end namespace internal
-
-
-
-template<typename NewDimensions, typename XprType>
-class TensorReshapingOp : public TensorBase<TensorReshapingOp<NewDimensions, XprType>, WriteAccessors>
-{
- public:
- typedef TensorBase<TensorReshapingOp<NewDimensions, XprType>, WriteAccessors> Base;
- typedef typename Eigen::internal::traits<TensorReshapingOp>::Scalar Scalar;
- typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
- typedef typename Eigen::internal::nested<TensorReshapingOp>::type Nested;
- typedef typename Eigen::internal::traits<TensorReshapingOp>::StorageKind StorageKind;
- typedef typename Eigen::internal::traits<TensorReshapingOp>::Index Index;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReshapingOp(const XprType& expr, const NewDimensions& dims)
- : m_xpr(expr), m_dims(dims) {}
-
- EIGEN_DEVICE_FUNC
- const NewDimensions& dimensions() const { return m_dims; }
-
- EIGEN_DEVICE_FUNC
- const typename internal::remove_all<typename XprType::Nested>::type&
- expression() const { return m_xpr; }
-
- EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorReshapingOp)
-
- protected:
- typename XprType::Nested m_xpr;
- const NewDimensions m_dims;
-};
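-
-// Usage sketch (illustrative): a reshape reinterprets the coefficients of an
-// expression under new dimensions with the same total size, e.g.
-//   Tensor<float, 2> input(4, 6);
-//   array<Index, 3> dims{{2, 3, 4}};
-//   Tensor<float, 3> reshaped = input.reshape(dims);  // still 24 coefficients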
-
-
-// Eval as rvalue
-template<typename NewDimensions, typename ArgType, typename Device>
-struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
-{
- typedef TensorReshapingOp<NewDimensions, ArgType> XprType;
- typedef NewDimensions Dimensions;
-
- typedef typename XprType::Index Index;
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- typedef StorageMemory<CoeffReturnType, Device> Storage;
- typedef typename Storage::Type EvaluatorPointerType;
- typedef StorageMemory<typename internal::remove_const<CoeffReturnType>::type, Device> ConstCastStorage;
-
- static const int NumOutputDims = internal::array_size<Dimensions>::value;
- static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
-
- enum ReshapingKind {
- // We do not use layout information to determine reshaping kind.
-    // Depending on the layout, `N` can be the inner or the outer dimension.
- OneByN = 0, // expr.reshape(1, N)
- NByOne = 1, // expr.reshape(N, 1)
- Runtime = 2 // Reshape dimensions are dynamic (specified at runtime).
- };
-
- // clang-format off
- static const ReshapingKind kind =
-#if defined(EIGEN_HAS_INDEX_LIST)
- (NumOutputDims == 2 && internal::index_statically_eq<NewDimensions>(/*index=*/0, /*value=*/1)) ? OneByN
- : (NumOutputDims == 2 && internal::index_statically_eq<NewDimensions>(/*index=*/1, /*value=*/1)) ? NByOne
- : Runtime;
-#else
- Runtime;
-#endif
- // clang-format on
-
- enum {
- IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
- PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- // For trivial reshapes with raw access to underlying data we will provide
- // zero overhead block access.
- // TODO(ezhulenev): Consider adding block access without raw access?
- BlockAccess = TensorEvaluator<ArgType, Device>::RawAccess &&
- NumInputDims > 0 && NumOutputDims > 0,
- PreferBlockAccess = false,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
- };
-
- typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockDescriptor<NumOutputDims, Index> TensorBlockDesc;
- typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
-
- typedef
- typename internal::TensorMaterializedBlock<ScalarNoConst, NumOutputDims,
- Layout, Index>
- TensorBlock;
- //===--------------------------------------------------------------------===//
-
- EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_impl(op.expression(), device), m_dimensions(op.dimensions())
- {
- // The total size of the reshaped tensor must be equal to the total size
- // of the input tensor.
- eigen_assert(internal::array_prod(m_impl.dimensions()) == internal::array_prod(op.dimensions()));
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
-
-#ifdef EIGEN_USE_THREADS
- template <typename EvalSubExprsCallback>
- EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
- EvaluatorPointerType data, EvalSubExprsCallback done) {
- m_impl.evalSubExprsIfNeededAsync(data, std::move(done));
- }
-#endif
-
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
- return m_impl.evalSubExprsIfNeeded(data);
- }
- EIGEN_STRONG_INLINE void cleanup() {
- m_impl.cleanup();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
- {
- return m_impl.coeff(index);
- }
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
- {
- return m_impl.template packet<LoadMode>(index);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
- return m_impl.costPerCoeff(vectorized);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- internal::TensorBlockResourceRequirements getResourceRequirements() const {
- return internal::TensorBlockResourceRequirements::any();
- }
-
- // required in block(OutputTensorBlock* output_block) const
- // For C++03 compatibility this must be defined outside the method
- struct BlockIteratorState {
- Index stride;
- Index span;
- Index size;
- Index count;
- };
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
- block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
- bool /*root_of_expr_ast*/ = false) const {
- eigen_assert(m_impl.data() != NULL);
- eigen_assert((kind == Runtime) ||
- (kind == OneByN && desc.dimensions()[0] == 1) ||
- (kind == NByOne && desc.dimensions()[1] == 1));
-
- if (kind == OneByN || kind == NByOne) {
- // We can guarantee at compile time that block is just a contiguous slice
- // of the underlying expression memory buffer.
- return TensorBlock(internal::TensorBlockKind::kView,
- m_impl.data() + desc.offset(), desc.dimensions());
- } else {
-      // This will do additional runtime checks; in the end it might also be
-      // a view, or a block materialized in the temporary buffer.
- return TensorBlock::materialize(m_impl.data(), m_dimensions, desc,
- scratch);
- }
- }
-
- EIGEN_DEVICE_FUNC typename Storage::Type data() const {
- return constCast(m_impl.data());
- }
-
- EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
-
- #ifdef EIGEN_USE_SYCL
- // binding placeholder accessors to a command group handler for SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
- m_impl.bind(cgh);
- }
- #endif
- protected:
- TensorEvaluator<ArgType, Device> m_impl;
- NewDimensions m_dimensions;
-};
-
-
-// Eval as lvalue
-template<typename NewDimensions, typename ArgType, typename Device>
- struct TensorEvaluator<TensorReshapingOp<NewDimensions, ArgType>, Device>
- : public TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
-
-{
- typedef TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> Base;
- typedef TensorReshapingOp<NewDimensions, ArgType> XprType;
- typedef NewDimensions Dimensions;
-
- enum {
- IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
- PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = TensorEvaluator<ArgType, Device>::RawAccess,
- PreferBlockAccess = false,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
- };
-
- EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : Base(op, device)
- { }
-
- typedef typename XprType::Index Index;
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockDescriptor<TensorEvaluator::NumOutputDims, Index>
- TensorBlockDesc;
- //===--------------------------------------------------------------------===//
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
- {
- return this->m_impl.coeffRef(index);
- }
-
- template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- void writePacket(Index index, const PacketReturnType& x)
- {
- this->m_impl.template writePacket<StoreMode>(index, x);
- }
-
- template <typename TensorBlock>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
- const TensorBlockDesc& desc, const TensorBlock& block) {
- assert(this->m_impl.data() != NULL);
-
- typedef typename TensorBlock::XprType TensorBlockExpr;
- typedef internal::TensorBlockAssignment<
- Scalar, TensorEvaluator::NumOutputDims, TensorBlockExpr, Index>
- TensorBlockAssign;
-
- TensorBlockAssign::Run(
- TensorBlockAssign::target(desc.dimensions(),
- internal::strides<Layout>(this->dimensions()),
- this->m_impl.data(), desc.offset()),
- block.expr());
- }
-};
-
-
-/** \class TensorSlicing
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Tensor slicing class.
- *
- */
-namespace internal {
-template<typename StartIndices, typename Sizes, typename XprType>
-struct traits<TensorSlicingOp<StartIndices, Sizes, XprType> > : public traits<XprType>
-{
- typedef typename XprType::Scalar Scalar;
- typedef traits<XprType> XprTraits;
- typedef typename XprTraits::StorageKind StorageKind;
- typedef typename XprTraits::Index Index;
- typedef typename XprType::Nested Nested;
- typedef typename remove_reference<Nested>::type _Nested;
- static const int NumDimensions = array_size<StartIndices>::value;
- static const int Layout = XprTraits::Layout;
- typedef typename XprTraits::PointerType PointerType;
-};
-
-template<typename StartIndices, typename Sizes, typename XprType>
-struct eval<TensorSlicingOp<StartIndices, Sizes, XprType>, Eigen::Dense>
-{
- typedef const TensorSlicingOp<StartIndices, Sizes, XprType>EIGEN_DEVICE_REF type;
-};
-
-template<typename StartIndices, typename Sizes, typename XprType>
-struct nested<TensorSlicingOp<StartIndices, Sizes, XprType>, 1, typename eval<TensorSlicingOp<StartIndices, Sizes, XprType> >::type>
-{
- typedef TensorSlicingOp<StartIndices, Sizes, XprType> type;
-};
-
-} // end namespace internal
-
-
-
-template<typename StartIndices, typename Sizes, typename XprType>
-class TensorSlicingOp : public TensorBase<TensorSlicingOp<StartIndices, Sizes, XprType> >
-{
- public:
- typedef TensorBase<TensorSlicingOp<StartIndices, Sizes, XprType> > Base;
- typedef typename Eigen::internal::traits<TensorSlicingOp>::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename Eigen::internal::nested<TensorSlicingOp>::type Nested;
- typedef typename Eigen::internal::traits<TensorSlicingOp>::StorageKind StorageKind;
- typedef typename Eigen::internal::traits<TensorSlicingOp>::Index Index;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSlicingOp(const XprType& expr, const StartIndices& indices, const Sizes& sizes)
- : m_xpr(expr), m_indices(indices), m_sizes(sizes) {}
-
- EIGEN_DEVICE_FUNC
- const StartIndices& startIndices() const { return m_indices; }
- EIGEN_DEVICE_FUNC
- const Sizes& sizes() const { return m_sizes; }
-
- EIGEN_DEVICE_FUNC
- const typename internal::remove_all<typename XprType::Nested>::type&
- expression() const { return m_xpr; }
-
- EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorSlicingOp)
-
- protected:
- typename XprType::Nested m_xpr;
- const StartIndices m_indices;
- const Sizes m_sizes;
-};
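-
-// Usage sketch (illustrative): a slice selects a rectangular sub-block from an
-// expression, given per-dimension start offsets and extents, e.g.
-//   Tensor<float, 2> t(4, 5);
-//   array<Index, 2> offsets{{1, 0}};
-//   array<Index, 2> extents{{2, 3}};
-//   Tensor<float, 2> s = t.slice(offsets, extents);  // 2x3 block at (1, 0)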
-
-
-// FIXME: figure out the exact threshold.
-namespace {
-template <typename Index, typename Device, bool BlockAccess> struct MemcpyTriggerForSlicing {
- EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const Device& device) : threshold_(2 * device.numThreads()) { }
- EIGEN_DEVICE_FUNC bool operator ()(Index total, Index contiguous) const {
- const bool prefer_block_evaluation = BlockAccess && total > 32*1024;
- return !prefer_block_evaluation && contiguous > threshold_;
- }
-
- private:
- Index threshold_;
-};
-
-// It is very expensive to start the memcpy kernel on GPU: we therefore only
-// use it for large copies.
-#ifdef EIGEN_USE_GPU
-template <typename Index, bool BlockAccess> struct MemcpyTriggerForSlicing<Index, GpuDevice, BlockAccess> {
- EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const GpuDevice&) { }
- EIGEN_DEVICE_FUNC bool operator ()(Index, Index contiguous) const { return contiguous > 4*1024*1024; }
-};
-#endif
-
-// Starting the memcpy kernel is also very expensive on SYCL devices: we
-// therefore only use it for large copies.
-#ifdef EIGEN_USE_SYCL
-template <typename Index, bool BlockAccess> struct MemcpyTriggerForSlicing<Index, Eigen::SyclDevice, BlockAccess> {
- EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const SyclDevice&) { }
- EIGEN_DEVICE_FUNC bool operator ()(Index, Index contiguous) const { return contiguous > 4*1024*1024; }
-};
-#endif
-
-}
-
-// Eval as rvalue
-template<typename StartIndices, typename Sizes, typename ArgType, typename Device>
-struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
-{
- typedef TensorSlicingOp<StartIndices, Sizes, ArgType> XprType;
- static const int NumDims = internal::array_size<Sizes>::value;
-
- typedef typename XprType::Index Index;
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- typedef Sizes Dimensions;
- typedef StorageMemory<CoeffReturnType, Device> Storage;
- typedef StorageMemory<typename internal::remove_const<CoeffReturnType>::type, Device> ConstCastStorage;
- typedef typename Storage::Type EvaluatorPointerType;
-
- enum {
- // Alignment can't be guaranteed at compile time since it depends on the
- // slice offsets and sizes.
- IsAligned = false,
- PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess &&
- // FIXME: Temporary workaround for bug in slicing of bool tensors.
- !internal::is_same<typename internal::remove_const<Scalar>::type, bool>::value,
- PreferBlockAccess = true,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- CoordAccess = false,
- RawAccess = false
- };
-
- typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
- typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
-
- // Tensor slicing does not change the block type.
- typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
- TensorBlock;
- //===--------------------------------------------------------------------===//
-
- EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices())
- {
- m_is_identity = true;
- for (int i = 0; i < internal::array_size<Dimensions>::value; ++i) {
- eigen_assert(m_impl.dimensions()[i] >=
- op.sizes()[i] + op.startIndices()[i]);
- if (m_impl.dimensions()[i] != op.sizes()[i] ||
- op.startIndices()[i] != 0) {
- m_is_identity = false;
- }
- }
-
- // No strides for scalars.
- if (NumDims == 0) return;
-
- const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
- const Sizes& output_dims = op.sizes();
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- m_inputStrides[0] = 1;
- for (int i = 1; i < NumDims; ++i) {
- m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
- }
-
- // Don't initialize m_fastOutputStrides[0] since it won't ever be accessed.
- m_outputStrides[0] = 1;
- for (int i = 1; i < NumDims; ++i) {
- m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1];
- m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1);
- }
- } else {
- m_inputStrides[NumDims-1] = 1;
- for (int i = NumDims - 2; i >= 0; --i) {
- m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
- }
-
- // Don't initialize m_fastOutputStrides[NumDims-1] since it won't ever be accessed.
- m_outputStrides[NumDims-1] = 1;
- for (int i = NumDims - 2; i >= 0; --i) {
- m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1];
- m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1);
- }
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
-
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
- m_impl.evalSubExprsIfNeeded(NULL);
- if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization
- && data && m_impl.data()) {
- Index contiguous_values = 1;
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- for (int i = 0; i < NumDims; ++i) {
- contiguous_values *= dimensions()[i];
- if (dimensions()[i] != m_impl.dimensions()[i]) {
- break;
- }
- }
- } else {
- for (int i = NumDims-1; i >= 0; --i) {
- contiguous_values *= dimensions()[i];
- if (dimensions()[i] != m_impl.dimensions()[i]) {
- break;
- }
- }
- }
- // Use memcpy if it's going to be faster than using the regular evaluation.
- const MemcpyTriggerForSlicing<Index, Device, BlockAccess> trigger(m_device);
- if (trigger(internal::array_prod(dimensions()), contiguous_values)) {
- EvaluatorPointerType src = (EvaluatorPointerType)m_impl.data();
- for (Index i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) {
- Index offset = srcCoeff(i);
- m_device.memcpy((void*)(m_device.get(data + i)), m_device.get(src+offset), contiguous_values * sizeof(Scalar));
- }
- return false;
- }
- }
- return true;
- }
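-
-  // A worked example of the fast path above (illustrative numbers only):
-  // slicing a 10x10 column-major tensor with offsets {0, 0} and sizes {10, 3}
-  // keeps the innermost dimension whole, so the loop computes
-  // contiguous_values = 10 * 3 = 30 and the whole slice is one memcpy; with
-  // sizes {5, 3} the contiguous run shrinks to 5, one slice column per copy.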
-
-#ifdef EIGEN_USE_THREADS
- template <typename EvalSubExprsCallback>
- EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
- EvaluatorPointerType /*data*/, EvalSubExprsCallback done) {
- m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
- }
-#endif // EIGEN_USE_THREADS
-
- EIGEN_STRONG_INLINE void cleanup() {
- m_impl.cleanup();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
- {
- if (m_is_identity) {
- return m_impl.coeff(index);
- } else {
- return m_impl.coeff(srcCoeff(index));
- }
- }
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
- {
- const int packetSize = PacketType<CoeffReturnType, Device>::size;
- EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+packetSize-1 < internal::array_prod(dimensions()));
-
- if (m_is_identity) {
- return m_impl.template packet<LoadMode>(index);
- }
-
- Index inputIndices[] = {0, 0};
- Index indices[] = {index, index + packetSize - 1};
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- EIGEN_UNROLL_LOOP
- for (int i = NumDims - 1; i > 0; --i) {
- const Index idx0 = indices[0] / m_fastOutputStrides[i];
- const Index idx1 = indices[1] / m_fastOutputStrides[i];
- inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i];
- inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i];
- indices[0] -= idx0 * m_outputStrides[i];
- indices[1] -= idx1 * m_outputStrides[i];
- }
- inputIndices[0] += (indices[0] + m_offsets[0]);
- inputIndices[1] += (indices[1] + m_offsets[0]);
- } else {
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < NumDims - 1; ++i) {
- const Index idx0 = indices[0] / m_fastOutputStrides[i];
- const Index idx1 = indices[1] / m_fastOutputStrides[i];
- inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i];
- inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i];
- indices[0] -= idx0 * m_outputStrides[i];
- indices[1] -= idx1 * m_outputStrides[i];
- }
- inputIndices[0] += (indices[0] + m_offsets[NumDims-1]);
- inputIndices[1] += (indices[1] + m_offsets[NumDims-1]);
- }
- if (inputIndices[1] - inputIndices[0] == packetSize - 1) {
- PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]);
- return rslt;
- }
- else {
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
- values[0] = m_impl.coeff(inputIndices[0]);
- values[packetSize-1] = m_impl.coeff(inputIndices[1]);
- EIGEN_UNROLL_LOOP
- for (int i = 1; i < packetSize-1; ++i) {
- values[i] = coeff(index+i);
- }
- PacketReturnType rslt = internal::pload<PacketReturnType>(values);
- return rslt;
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
- return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, m_is_identity ? 1 : NumDims);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- internal::TensorBlockResourceRequirements getResourceRequirements() const {
- const size_t target_size = m_device.lastLevelCacheSize();
- return internal::TensorBlockResourceRequirements::merge(
- internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size),
- m_impl.getResourceRequirements());
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
- block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
- bool /*root_of_expr_ast*/ = false) const {
- TensorBlockDesc arg_desc = desc.WithOffset(srcCoeff(desc.offset()));
- TensorBlock block = m_impl.block(arg_desc, scratch);
- if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer();
- return block;
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const {
- typename Storage::Type result = constCast(m_impl.data());
- if (result) {
- Index offset = 0;
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- for (int i = 0; i < NumDims; ++i) {
- if (m_dimensions[i] != m_impl.dimensions()[i]) {
- offset += m_offsets[i] * m_inputStrides[i];
- for (int j = i+1; j < NumDims; ++j) {
- if (m_dimensions[j] > 1) {
- return NULL;
- }
- offset += m_offsets[j] * m_inputStrides[j];
- }
- break;
- }
- }
- } else {
- for (int i = NumDims - 1; i >= 0; --i) {
- if (m_dimensions[i] != m_impl.dimensions()[i]) {
- offset += m_offsets[i] * m_inputStrides[i];
- for (int j = i-1; j >= 0; --j) {
- if (m_dimensions[j] > 1) {
- return NULL;
- }
- offset += m_offsets[j] * m_inputStrides[j];
- }
- break;
- }
- }
- }
- return result + offset;
- }
- return NULL;
- }
-#ifdef EIGEN_USE_SYCL
- // binding placeholder accessors to a command group handler for SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
- m_impl.bind(cgh);
- }
-#endif
-
- protected:
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
- {
- Index inputIndex = 0;
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- EIGEN_UNROLL_LOOP
- for (int i = NumDims - 1; i > 0; --i) {
- const Index idx = index / m_fastOutputStrides[i];
- inputIndex += (idx + m_offsets[i]) * m_inputStrides[i];
- index -= idx * m_outputStrides[i];
- }
- inputIndex += (index + m_offsets[0]);
- } else {
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < NumDims - 1; ++i) {
- const Index idx = index / m_fastOutputStrides[i];
- inputIndex += (idx + m_offsets[i]) * m_inputStrides[i];
- index -= idx * m_outputStrides[i];
- }
- inputIndex += (index + m_offsets[NumDims-1]);
- }
- return inputIndex;
- }
-
- array<Index, NumDims> m_outputStrides;
- array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
- array<Index, NumDims> m_inputStrides;
- TensorEvaluator<ArgType, Device> m_impl;
- const Device EIGEN_DEVICE_REF m_device;
- Dimensions m_dimensions;
- bool m_is_identity;
- const StartIndices m_offsets;
-};
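-
-// srcCoeff() sketch with concrete numbers (illustrative): for a column-major
-// 10x10 input, offsets {2, 3} and sizes {4, 4}, the output strides are {1, 4}.
-// Output index 5 decomposes into coordinates (1, 1), so
-// srcCoeff(5) = (1 + 3) * 10 + (1 + 2) = 43, the linear input index of the
-// same element.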
-
-
-// Eval as lvalue
-template<typename StartIndices, typename Sizes, typename ArgType, typename Device>
-struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
- : public TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
-{
- typedef TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device> Base;
- typedef TensorSlicingOp<StartIndices, Sizes, ArgType> XprType;
- static const int NumDims = internal::array_size<Sizes>::value;
-
- typedef typename XprType::Index Index;
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- typedef Sizes Dimensions;
-
- enum {
- IsAligned = false,
- PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
- PreferBlockAccess = true,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- CoordAccess = false,
- RawAccess = (NumDims == 1) & TensorEvaluator<ArgType, Device>::RawAccess
- };
-
- typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
- typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
- //===--------------------------------------------------------------------===//
-
- EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : Base(op, device)
- { }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
- {
- if (this->m_is_identity) {
- return this->m_impl.coeffRef(index);
- } else {
- return this->m_impl.coeffRef(this->srcCoeff(index));
- }
- }
-
- template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- void writePacket(Index index, const PacketReturnType& x)
- {
- if (this->m_is_identity) {
- this->m_impl.template writePacket<StoreMode>(index, x);
- return;
- }
-
- const int packetSize = PacketType<CoeffReturnType, Device>::size;
- Index inputIndices[] = {0, 0};
- Index indices[] = {index, index + packetSize - 1};
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- EIGEN_UNROLL_LOOP
- for (int i = NumDims - 1; i > 0; --i) {
- const Index idx0 = indices[0] / this->m_fastOutputStrides[i];
- const Index idx1 = indices[1] / this->m_fastOutputStrides[i];
- inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i];
- inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i];
- indices[0] -= idx0 * this->m_outputStrides[i];
- indices[1] -= idx1 * this->m_outputStrides[i];
- }
- inputIndices[0] += (indices[0] + this->m_offsets[0]);
- inputIndices[1] += (indices[1] + this->m_offsets[0]);
- } else {
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < NumDims - 1; ++i) {
- const Index idx0 = indices[0] / this->m_fastOutputStrides[i];
- const Index idx1 = indices[1] / this->m_fastOutputStrides[i];
- inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i];
- inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i];
- indices[0] -= idx0 * this->m_outputStrides[i];
- indices[1] -= idx1 * this->m_outputStrides[i];
- }
- inputIndices[0] += (indices[0] + this->m_offsets[NumDims-1]);
- inputIndices[1] += (indices[1] + this->m_offsets[NumDims-1]);
- }
- if (inputIndices[1] - inputIndices[0] == packetSize - 1) {
- this->m_impl.template writePacket<StoreMode>(inputIndices[0], x);
- }
- else {
- EIGEN_ALIGN_MAX CoeffReturnType values[packetSize];
- internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
- this->m_impl.coeffRef(inputIndices[0]) = values[0];
- this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1];
- EIGEN_UNROLL_LOOP
- for (int i = 1; i < packetSize-1; ++i) {
- this->coeffRef(index+i) = values[i];
- }
- }
- }
-
- template<typename TensorBlock>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
- const TensorBlockDesc& desc, const TensorBlock& block) {
- TensorBlockDesc arg_desc = desc.WithOffset(this->srcCoeff(desc.offset()));
- this->m_impl.writeBlock(arg_desc, block);
- }
-};
-
-namespace internal {
-template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
-struct traits<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> > : public traits<XprType>
-{
- typedef typename XprType::Scalar Scalar;
- typedef traits<XprType> XprTraits;
- typedef typename XprTraits::StorageKind StorageKind;
- typedef typename XprTraits::Index Index;
- typedef typename XprType::Nested Nested;
- typedef typename remove_reference<Nested>::type _Nested;
- static const int NumDimensions = array_size<StartIndices>::value;
- static const int Layout = XprTraits::Layout;
- typedef typename XprTraits::PointerType PointerType;
-};
-
-template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
-struct eval<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Eigen::Dense>
-{
-  typedef const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> EIGEN_DEVICE_REF type;
-};
-
-template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
-struct nested<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, 1, typename eval<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >::type>
-{
- typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> type;
-};
-
-} // end namespace internal
-
-
-template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
-class TensorStridingSlicingOp : public TensorBase<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >
-{
- public:
- typedef TensorBase<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> > Base;
- typedef typename internal::traits<TensorStridingSlicingOp>::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename internal::nested<TensorStridingSlicingOp>::type Nested;
- typedef typename internal::traits<TensorStridingSlicingOp>::StorageKind StorageKind;
- typedef typename internal::traits<TensorStridingSlicingOp>::Index Index;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingSlicingOp(
- const XprType& expr, const StartIndices& startIndices,
- const StopIndices& stopIndices, const Strides& strides)
- : m_xpr(expr), m_startIndices(startIndices), m_stopIndices(stopIndices),
- m_strides(strides) {}
-
- EIGEN_DEVICE_FUNC
- const StartIndices& startIndices() const { return m_startIndices; }
- EIGEN_DEVICE_FUNC
-    const StopIndices& stopIndices() const { return m_stopIndices; }
- EIGEN_DEVICE_FUNC
-    const Strides& strides() const { return m_strides; }
-
- EIGEN_DEVICE_FUNC
- const typename internal::remove_all<typename XprType::Nested>::type&
- expression() const { return m_xpr; }
-
- EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorStridingSlicingOp)
-
- protected:
- typename XprType::Nested m_xpr;
- const StartIndices m_startIndices;
- const StopIndices m_stopIndices;
- const Strides m_strides;
-};
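-
-// A minimal usage sketch (assuming a rank-2 float tensor; `stridedSlice` is
-// the TensorBase entry point that builds this op):
-//   Eigen::Tensor<float, 2> t(8, 8);
-//   Eigen::array<Eigen::Index, 2> start = {1, 0}, stop = {8, 8}, stride = {3, 2};
-//   auto s = t.stridedSlice(start, stop, stride);  // 3x4 expression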
-
-// Eval as rvalue
-template<typename StartIndices, typename StopIndices, typename Strides, typename ArgType, typename Device>
-struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device>
-{
- typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType> XprType;
- static const int NumDims = internal::array_size<Strides>::value;
- typedef typename XprType::Index Index;
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- typedef StorageMemory<CoeffReturnType, Device> Storage;
- typedef typename Storage::Type EvaluatorPointerType;
- typedef Strides Dimensions;
-
- enum {
- // Alignment can't be guaranteed at compile time since it depends on the
- // slice offsets and sizes.
- IsAligned = false,
- PacketAccess = false,
- BlockAccess = false,
- PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- RawAccess = false
- };
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlock;
- //===--------------------------------------------------------------------===//
-
- EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_impl(op.expression(), device),
- m_device(device),
- m_strides(op.strides())
- {
- // Handle degenerate intervals by gracefully clamping and allowing m_dimensions to be zero
- DSizes<Index, NumDims> startIndicesClamped, stopIndicesClamped;
- for (ptrdiff_t i = 0; i < internal::array_size<Dimensions>::value; ++i) {
- eigen_assert(m_strides[i] != 0 && "0 stride is invalid");
- if (m_strides[i] > 0) {
- startIndicesClamped[i] =
- clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]);
- stopIndicesClamped[i] =
- clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]);
- } else {
- /* implies m_strides[i] < 0 by assert */
- startIndicesClamped[i] =
- clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1);
- stopIndicesClamped[i] =
- clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1);
- }
- m_startIndices[i] = startIndicesClamped[i];
- }
-
- typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
- const InputDimensions& input_dims = m_impl.dimensions();
-
- // compute output tensor shape
- m_is_identity = true;
- for (int i = 0; i < NumDims; i++) {
- Index interval = stopIndicesClamped[i] - startIndicesClamped[i];
- if (interval == 0 || ((interval < 0) != (m_strides[i] < 0))) {
- m_dimensions[i] = 0;
- } else {
- m_dimensions[i] =
- (interval / m_strides[i]) + (interval % m_strides[i] != 0 ? 1 : 0);
- eigen_assert(m_dimensions[i] >= 0);
- }
- if (m_strides[i] != 1 || interval != m_impl.dimensions()[i]) {
- m_is_identity = false;
- }
- }
-
- Strides output_dims = m_dimensions;
-
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- m_inputStrides[0] = m_strides[0];
- m_offsets[0] = startIndicesClamped[0];
- Index previousDimProduct = 1;
- for (int i = 1; i < NumDims; ++i) {
- previousDimProduct *= input_dims[i-1];
- m_inputStrides[i] = previousDimProduct * m_strides[i];
- m_offsets[i] = startIndicesClamped[i] * previousDimProduct;
- }
-
-      // m_fastOutputStrides[0] is left default-constructed; srcCoeff() below
-      // does divide by it, but a default-constructed TensorIntDivisor behaves
-      // as division by 1, which matches m_outputStrides[0] == 1.
- m_outputStrides[0] = 1;
- for (int i = 1; i < NumDims; ++i) {
- m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1];
- m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1);
- }
- } else {
- m_inputStrides[NumDims-1] = m_strides[NumDims-1];
- m_offsets[NumDims-1] = startIndicesClamped[NumDims-1];
- Index previousDimProduct = 1;
- for (int i = NumDims - 2; i >= 0; --i) {
- previousDimProduct *= input_dims[i+1];
- m_inputStrides[i] = previousDimProduct * m_strides[i];
- m_offsets[i] = startIndicesClamped[i] * previousDimProduct;
- }
-
- m_outputStrides[NumDims-1] = 1;
- for (int i = NumDims - 2; i >= 0; --i) {
- m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1];
- m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1);
- }
- }
- }
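-
-  // Shape sketch for the computation above (illustrative): start = 1,
-  // stop = 8 and stride = 3 give interval = 7 and a dimension of
-  // 7 / 3 + (7 % 3 != 0) = 3, covering input indices 1, 4 and 7; a stop
-  // below the start with a positive stride yields a zero-sized dimension.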
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
-
-
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
- m_impl.evalSubExprsIfNeeded(NULL);
- return true;
- }
-
- EIGEN_STRONG_INLINE void cleanup() {
- m_impl.cleanup();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
- {
- if (m_is_identity) {
- return m_impl.coeff(index);
- } else {
- return m_impl.coeff(srcCoeff(index));
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
- return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, m_is_identity ? 1 : NumDims);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const {
- return NULL;
- }
-#ifdef EIGEN_USE_SYCL
- // binding placeholder accessors to a command group handler for SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
- m_impl.bind(cgh);
- }
-#endif
- protected:
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
- {
- Index inputIndex = 0;
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- EIGEN_UNROLL_LOOP
- for (int i = NumDims - 1; i >= 0; --i) {
- const Index idx = index / m_fastOutputStrides[i];
- inputIndex += idx * m_inputStrides[i] + m_offsets[i];
- index -= idx * m_outputStrides[i];
- }
- } else {
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < NumDims; ++i) {
- const Index idx = index / m_fastOutputStrides[i];
- inputIndex += idx * m_inputStrides[i] + m_offsets[i];
- index -= idx * m_outputStrides[i];
- }
- }
- return inputIndex;
- }
-
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) {
-#ifndef SYCL_DEVICE_ONLY
- return numext::maxi(min, numext::mini(max,value));
-#else
- return cl::sycl::clamp(value, min, max);
-#endif
- }
-
- array<Index, NumDims> m_outputStrides;
- array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
- array<Index, NumDims> m_inputStrides;
- bool m_is_identity;
- TensorEvaluator<ArgType, Device> m_impl;
- const Device EIGEN_DEVICE_REF m_device;
- DSizes<Index, NumDims> m_startIndices; // clamped startIndices
- DSizes<Index, NumDims> m_dimensions;
- DSizes<Index, NumDims> m_offsets; // offset in a flattened shape
- const Strides m_strides;
-};
-
-// Eval as lvalue
-template<typename StartIndices, typename StopIndices, typename Strides, typename ArgType, typename Device>
-struct TensorEvaluator<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device>
- : public TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device>
-{
- typedef TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device> Base;
- typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType> XprType;
- static const int NumDims = internal::array_size<Strides>::value;
-
- enum {
- IsAligned = false,
- PacketAccess = false,
- BlockAccess = false,
- PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess,
- RawAccess = false
- };
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlock;
- //===--------------------------------------------------------------------===//
-
- EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : Base(op, device)
- { }
-
- typedef typename XprType::Index Index;
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- typedef Strides Dimensions;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
- {
- if (this->m_is_identity) {
- return this->m_impl.coeffRef(index);
- } else {
- return this->m_impl.coeffRef(this->srcCoeff(index));
- }
- }
-};
-
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorPadding.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorPadding.h
deleted file mode 100644
index ee44382..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorPadding.h
+++ /dev/null
@@ -1,708 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_PADDING_H
-#define EIGEN_CXX11_TENSOR_TENSOR_PADDING_H
-
-namespace Eigen {
-
-/** \class TensorPadding
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Tensor padding class.
- * At the moment only padding with a constant value is supported.
- *
- */
-namespace internal {
-template<typename PaddingDimensions, typename XprType>
-struct traits<TensorPaddingOp<PaddingDimensions, XprType> > : public traits<XprType>
-{
- typedef typename XprType::Scalar Scalar;
- typedef traits<XprType> XprTraits;
- typedef typename XprTraits::StorageKind StorageKind;
- typedef typename XprTraits::Index Index;
- typedef typename XprType::Nested Nested;
- typedef typename remove_reference<Nested>::type _Nested;
- static const int NumDimensions = XprTraits::NumDimensions;
- static const int Layout = XprTraits::Layout;
- typedef typename XprTraits::PointerType PointerType;
-};
-
-template<typename PaddingDimensions, typename XprType>
-struct eval<TensorPaddingOp<PaddingDimensions, XprType>, Eigen::Dense>
-{
- typedef const TensorPaddingOp<PaddingDimensions, XprType>& type;
-};
-
-template<typename PaddingDimensions, typename XprType>
-struct nested<TensorPaddingOp<PaddingDimensions, XprType>, 1, typename eval<TensorPaddingOp<PaddingDimensions, XprType> >::type>
-{
- typedef TensorPaddingOp<PaddingDimensions, XprType> type;
-};
-
-} // end namespace internal
-
-
-
-template<typename PaddingDimensions, typename XprType>
-class TensorPaddingOp : public TensorBase<TensorPaddingOp<PaddingDimensions, XprType>, ReadOnlyAccessors>
-{
- public:
- typedef typename Eigen::internal::traits<TensorPaddingOp>::Scalar Scalar;
- typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename Eigen::internal::nested<TensorPaddingOp>::type Nested;
- typedef typename Eigen::internal::traits<TensorPaddingOp>::StorageKind StorageKind;
- typedef typename Eigen::internal::traits<TensorPaddingOp>::Index Index;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPaddingOp(const XprType& expr, const PaddingDimensions& padding_dims, const Scalar padding_value)
- : m_xpr(expr), m_padding_dims(padding_dims), m_padding_value(padding_value) {}
-
- EIGEN_DEVICE_FUNC
- const PaddingDimensions& padding() const { return m_padding_dims; }
- EIGEN_DEVICE_FUNC
- Scalar padding_value() const { return m_padding_value; }
-
- EIGEN_DEVICE_FUNC
- const typename internal::remove_all<typename XprType::Nested>::type&
- expression() const { return m_xpr; }
-
- protected:
- typename XprType::Nested m_xpr;
- const PaddingDimensions m_padding_dims;
- const Scalar m_padding_value;
-};
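-
-// A minimal usage sketch (assuming a rank-2 float tensor; `pad` is the
-// TensorBase entry point that builds this op):
-//   Eigen::Tensor<float, 2> t(3, 4);
-//   Eigen::array<std::pair<Eigen::Index, Eigen::Index>, 2> pads;
-//   pads[0] = std::make_pair(1, 2);  // one row before, two rows after
-//   pads[1] = std::make_pair(0, 1);  // one column after
-//   Eigen::Tensor<float, 2> p = t.pad(pads, 0.0f);  // 6x5 result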
-
-
-// Eval as rvalue
-template<typename PaddingDimensions, typename ArgType, typename Device>
-struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device>
-{
- typedef TensorPaddingOp<PaddingDimensions, ArgType> XprType;
- typedef typename XprType::Index Index;
- static const int NumDims = internal::array_size<PaddingDimensions>::value;
- typedef DSizes<Index, NumDims> Dimensions;
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
- typedef StorageMemory<CoeffReturnType, Device> Storage;
- typedef typename Storage::Type EvaluatorPointerType;
-
- enum {
- IsAligned = true,
- PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = TensorEvaluator<ArgType, Device>::RawAccess,
- PreferBlockAccess = true,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- CoordAccess = true,
- RawAccess = false
- };
-
- typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
- typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
-
- typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
- Layout, Index>
- TensorBlock;
- //===--------------------------------------------------------------------===//
-
- EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value()), m_device(device)
- {
- // The padding op doesn't change the rank of the tensor. Directly padding a scalar would lead
- // to a vector, which doesn't make sense. Instead one should reshape the scalar into a vector
- // of 1 element first and then pad.
- EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
-
- // Compute dimensions
- m_dimensions = m_impl.dimensions();
- for (int i = 0; i < NumDims; ++i) {
- m_dimensions[i] += m_padding[i].first + m_padding[i].second;
- }
- const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- m_inputStrides[0] = 1;
- m_outputStrides[0] = 1;
- for (int i = 1; i < NumDims; ++i) {
- m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
- m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
- }
- m_outputStrides[NumDims] = m_outputStrides[NumDims-1] * m_dimensions[NumDims-1];
- } else {
- m_inputStrides[NumDims - 1] = 1;
- m_outputStrides[NumDims] = 1;
- for (int i = NumDims - 2; i >= 0; --i) {
- m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
- m_outputStrides[i+1] = m_outputStrides[i+2] * m_dimensions[i+1];
- }
- m_outputStrides[0] = m_outputStrides[1] * m_dimensions[0];
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
-
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
- m_impl.evalSubExprsIfNeeded(NULL);
- return true;
- }
-
-#ifdef EIGEN_USE_THREADS
- template <typename EvalSubExprsCallback>
- EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
- EvaluatorPointerType, EvalSubExprsCallback done) {
- m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
- }
-#endif // EIGEN_USE_THREADS
-
- EIGEN_STRONG_INLINE void cleanup() {
- m_impl.cleanup();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
- {
- eigen_assert(index < dimensions().TotalSize());
- Index inputIndex = 0;
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- EIGEN_UNROLL_LOOP
- for (int i = NumDims - 1; i > 0; --i) {
- const Index idx = index / m_outputStrides[i];
- if (isPaddingAtIndexForDim(idx, i)) {
- return m_paddingValue;
- }
- inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
- index -= idx * m_outputStrides[i];
- }
- if (isPaddingAtIndexForDim(index, 0)) {
- return m_paddingValue;
- }
- inputIndex += (index - m_padding[0].first);
- } else {
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < NumDims - 1; ++i) {
- const Index idx = index / m_outputStrides[i+1];
- if (isPaddingAtIndexForDim(idx, i)) {
- return m_paddingValue;
- }
- inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
- index -= idx * m_outputStrides[i+1];
- }
- if (isPaddingAtIndexForDim(index, NumDims-1)) {
- return m_paddingValue;
- }
- inputIndex += (index - m_padding[NumDims-1].first);
- }
- return m_impl.coeff(inputIndex);
- }
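-
-  // Index-mapping sketch for coeff() (illustrative): a 1-D input of size 4
-  // padded with {2 before, 1 after} produces an output of size 7; output
-  // indices 0, 1 and 6 fall in a padding zone and return m_paddingValue,
-  // while output index 3 maps to input index 3 - 2 = 1.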
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
- {
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- return packetColMajor(index);
- }
- return packetRowMajor(index);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
- TensorOpCost cost = m_impl.costPerCoeff(vectorized);
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < NumDims; ++i)
- updateCostPerDimension(cost, i, i == 0);
- } else {
- EIGEN_UNROLL_LOOP
- for (int i = NumDims - 1; i >= 0; --i)
- updateCostPerDimension(cost, i, i == NumDims - 1);
- }
- return cost;
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- internal::TensorBlockResourceRequirements getResourceRequirements() const {
- const size_t target_size = m_device.lastLevelCacheSize();
- return internal::TensorBlockResourceRequirements::merge(
- internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size),
- m_impl.getResourceRequirements());
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
- block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
- bool /*root_of_expr_ast*/ = false) const {
-    // If one of the dimensions is zero, return an empty block view.
- if (desc.size() == 0) {
- return TensorBlock(internal::TensorBlockKind::kView, NULL,
- desc.dimensions());
- }
-
- static const bool IsColMajor = Layout == static_cast<int>(ColMajor);
- const int inner_dim_idx = IsColMajor ? 0 : NumDims - 1;
-
- Index offset = desc.offset();
-
- // Compute offsets in the output tensor corresponding to the desc.offset().
- DSizes<Index, NumDims> output_offsets;
- for (int i = NumDims - 1; i > 0; --i) {
- const int dim = IsColMajor ? i : NumDims - i - 1;
- const int stride_dim = IsColMajor ? dim : dim + 1;
- output_offsets[dim] = offset / m_outputStrides[stride_dim];
- offset -= output_offsets[dim] * m_outputStrides[stride_dim];
- }
- output_offsets[inner_dim_idx] = offset;
-
- // Offsets in the input corresponding to output offsets.
- DSizes<Index, NumDims> input_offsets = output_offsets;
- for (int i = 0; i < NumDims; ++i) {
- const int dim = IsColMajor ? i : NumDims - i - 1;
- input_offsets[dim] = input_offsets[dim] - m_padding[dim].first;
- }
-
-    // Compute the offset in the input buffer (at this point it might be
-    // invalid and point outside of the input buffer, because we don't check
-    // for negative offsets; it is corrected in the block iteration loop below).
- Index input_offset = 0;
- for (int i = 0; i < NumDims; ++i) {
- const int dim = IsColMajor ? i : NumDims - i - 1;
- input_offset += input_offsets[dim] * m_inputStrides[dim];
- }
-
-    // Destination buffer and scratch buffer are both indexed from 0 and have
-    // the same dimensions as the requested block (for the destination buffer
-    // this property is guaranteed by `desc.destination()`).
- Index output_offset = 0;
- const DSizes<Index, NumDims> output_strides =
- internal::strides<Layout>(desc.dimensions());
-
-    // NOTE(ezhulenev): We initialize block iteration state for `NumDims - 1`
-    // dimensions, skipping the innermost dimension. In theory it should be
-    // possible to squeeze matching innermost dimensions, but in practice that
-    // did not show any improvement in benchmarks. Also, in practice the first
-    // outer dimension usually has padding, which prevents squeezing.
-
-    // Initialize output block iterator state. Dimensions in this array are
-    // always in innermost -> outermost order (column-major layout).
- array<BlockIteratorState, NumDims - 1> it;
- for (int i = 0; i < NumDims - 1; ++i) {
- const int dim = IsColMajor ? i + 1 : NumDims - i - 2;
- it[i].count = 0;
- it[i].size = desc.dimension(dim);
-
- it[i].input_stride = m_inputStrides[dim];
- it[i].input_span = it[i].input_stride * (it[i].size - 1);
-
- it[i].output_stride = output_strides[dim];
- it[i].output_span = it[i].output_stride * (it[i].size - 1);
- }
-
- const Index input_inner_dim_size =
- static_cast<Index>(m_impl.dimensions()[inner_dim_idx]);
-
- // Total output size.
- const Index output_size = desc.size();
-
-    // We will fill the inner dimension of this size in the output. It might be
-    // larger than the inner dimension of the input, so we might have to pad
-    // before/after we copy values from the input inner dimension.
- const Index output_inner_dim_size = desc.dimension(inner_dim_idx);
-
- // How many values to fill with padding BEFORE reading from the input inner
- // dimension.
- const Index output_inner_pad_before_size =
- input_offsets[inner_dim_idx] < 0
- ? numext::mini(numext::abs(input_offsets[inner_dim_idx]),
- output_inner_dim_size)
- : 0;
-
- // How many values we can actually copy from the input inner dimension.
- const Index output_inner_copy_size = numext::mini(
- // Want to copy from input.
- (output_inner_dim_size - output_inner_pad_before_size),
- // Can copy from input.
- numext::maxi(input_inner_dim_size - (input_offsets[inner_dim_idx] +
- output_inner_pad_before_size),
- Index(0)));
-
- eigen_assert(output_inner_copy_size >= 0);
-
- // How many values to fill with padding AFTER reading from the input inner
- // dimension.
- const Index output_inner_pad_after_size =
- (output_inner_dim_size - output_inner_copy_size -
- output_inner_pad_before_size);
-
- // Sanity check, sum of all sizes must be equal to the output size.
- eigen_assert(output_inner_dim_size ==
- (output_inner_pad_before_size + output_inner_copy_size +
- output_inner_pad_after_size));
-
- // Keep track of current coordinates and padding in the output.
- DSizes<Index, NumDims> output_coord = output_offsets;
- DSizes<Index, NumDims> output_padded;
- for (int i = 0; i < NumDims; ++i) {
- const int dim = IsColMajor ? i : NumDims - i - 1;
- output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
- }
-
- typedef internal::StridedLinearBufferCopy<ScalarNoConst, Index> LinCopy;
-
- // Prepare storage for the materialized padding result.
- const typename TensorBlock::Storage block_storage =
- TensorBlock::prepareStorage(desc, scratch);
-
- // TODO(ezhulenev): Squeeze multiple non-padded inner dimensions into a
- // single logical inner dimension.
-
-    // When possible we squeeze writes for the innermost dimension (only if it
-    // is non-padded) with the first padded dimension. This reduces the number
-    // of calls to LinCopy and makes better use of vector instructions.
- const bool squeeze_writes =
- NumDims > 1 &&
- // inner dimension is not padded
- (input_inner_dim_size == m_dimensions[inner_dim_idx]) &&
- // and equal to the block inner dimension
- (input_inner_dim_size == output_inner_dim_size);
-
- const int squeeze_dim = IsColMajor ? inner_dim_idx + 1 : inner_dim_idx - 1;
-
- // Maximum coordinate on a squeeze dimension that we can write to.
- const Index squeeze_max_coord =
- squeeze_writes ? numext::mini(
- // max non-padded element in the input
- static_cast<Index>(m_dimensions[squeeze_dim] -
- m_padding[squeeze_dim].second),
- // max element in the output buffer
- static_cast<Index>(output_offsets[squeeze_dim] +
- desc.dimension(squeeze_dim)))
- : static_cast<Index>(0);
-
- // Iterate copying data from `m_impl.data()` to the output buffer.
- for (Index size = 0; size < output_size;) {
- // Detect if we are in the padded region (exclude innermost dimension).
- bool is_padded = false;
- for (int j = 1; j < NumDims; ++j) {
- const int dim = IsColMajor ? j : NumDims - j - 1;
- is_padded = output_padded[dim];
- if (is_padded) break;
- }
-
- if (is_padded) {
- // Fill single innermost dimension with padding value.
- size += output_inner_dim_size;
-
- LinCopy::template Run<LinCopy::Kind::FillLinear>(
- typename LinCopy::Dst(output_offset, 1, block_storage.data()),
- typename LinCopy::Src(0, 0, &m_paddingValue),
- output_inner_dim_size);
-
-
- } else if (squeeze_writes) {
- // Squeeze multiple reads from innermost dimensions.
- const Index squeeze_num = squeeze_max_coord - output_coord[squeeze_dim];
- size += output_inner_dim_size * squeeze_num;
-
- // Copy `squeeze_num` inner dimensions from input to output.
- LinCopy::template Run<LinCopy::Kind::Linear>(
- typename LinCopy::Dst(output_offset, 1, block_storage.data()),
- typename LinCopy::Src(input_offset, 1, m_impl.data()),
- output_inner_dim_size * squeeze_num);
-
-        // Update the iteration state for only `squeeze_num - 1` processed
-        // inner dimensions, because the iteration-state update at the end of
-        // the loop handles the last processed inner dimension.
- it[0].count += (squeeze_num - 1);
- input_offset += it[0].input_stride * (squeeze_num - 1);
- output_offset += it[0].output_stride * (squeeze_num - 1);
- output_coord[squeeze_dim] += (squeeze_num - 1);
-
- } else {
- // Single read from innermost dimension.
- size += output_inner_dim_size;
-
- { // Fill with padding before copying from input inner dimension.
- const Index out = output_offset;
-
- LinCopy::template Run<LinCopy::Kind::FillLinear>(
- typename LinCopy::Dst(out, 1, block_storage.data()),
- typename LinCopy::Src(0, 0, &m_paddingValue),
- output_inner_pad_before_size);
- }
-
- { // Copy data from input inner dimension.
- const Index out = output_offset + output_inner_pad_before_size;
- const Index in = input_offset + output_inner_pad_before_size;
-
- eigen_assert(output_inner_copy_size == 0 || m_impl.data() != NULL);
-
- LinCopy::template Run<LinCopy::Kind::Linear>(
- typename LinCopy::Dst(out, 1, block_storage.data()),
- typename LinCopy::Src(in, 1, m_impl.data()),
- output_inner_copy_size);
- }
-
- { // Fill with padding after copying from input inner dimension.
- const Index out = output_offset + output_inner_pad_before_size +
- output_inner_copy_size;
-
- LinCopy::template Run<LinCopy::Kind::FillLinear>(
- typename LinCopy::Dst(out, 1, block_storage.data()),
- typename LinCopy::Src(0, 0, &m_paddingValue),
- output_inner_pad_after_size);
- }
- }
-
- for (int j = 0; j < NumDims - 1; ++j) {
- const int dim = IsColMajor ? j + 1 : NumDims - j - 2;
-
- if (++it[j].count < it[j].size) {
- input_offset += it[j].input_stride;
- output_offset += it[j].output_stride;
- output_coord[dim] += 1;
- output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
- break;
- }
- it[j].count = 0;
- input_offset -= it[j].input_span;
- output_offset -= it[j].output_span;
- output_coord[dim] -= it[j].size - 1;
- output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
- }
- }
-
- return block_storage.AsTensorMaterializedBlock();
- }
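-
-  // Inner-dimension decomposition sketch for block() (illustrative): with a
-  // block inner dimension of size 10, an input inner dimension of size 6 and
-  // a left padding of 2 (block offset 0), the split is pad_before = 2,
-  // copy = 6 and pad_after = 2, matching the FillLinear / Linear / FillLinear
-  // sequence above.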
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; }
-
-#ifdef EIGEN_USE_SYCL
- // binding placeholder accessors to a command group handler for SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
- m_impl.bind(cgh);
- }
-#endif
-
- private:
- struct BlockIteratorState {
- BlockIteratorState()
- : count(0),
- size(0),
- input_stride(0),
- input_span(0),
- output_stride(0),
- output_span(0) {}
-
- Index count;
- Index size;
- Index input_stride;
- Index input_span;
- Index output_stride;
- Index output_span;
- };
-
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim(
- Index index, int dim_index) const {
-#if defined(EIGEN_HAS_INDEX_LIST)
- return (!internal::index_pair_first_statically_eq<PaddingDimensions>(dim_index, 0) &&
- index < m_padding[dim_index].first) ||
- (!internal::index_pair_second_statically_eq<PaddingDimensions>(dim_index, 0) &&
- index >= m_dimensions[dim_index] - m_padding[dim_index].second);
-#else
- return (index < m_padding[dim_index].first) ||
- (index >= m_dimensions[dim_index] - m_padding[dim_index].second);
-#endif
- }
-
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isLeftPaddingCompileTimeZero(
- int dim_index) const {
-#if defined(EIGEN_HAS_INDEX_LIST)
- return internal::index_pair_first_statically_eq<PaddingDimensions>(dim_index, 0);
-#else
- EIGEN_UNUSED_VARIABLE(dim_index);
- return false;
-#endif
- }
-
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isRightPaddingCompileTimeZero(
- int dim_index) const {
-#if defined(EIGEN_HAS_INDEX_LIST)
- return internal::index_pair_second_statically_eq<PaddingDimensions>(dim_index, 0);
-#else
- EIGEN_UNUSED_VARIABLE(dim_index);
- return false;
-#endif
- }
-
-
- void updateCostPerDimension(TensorOpCost& cost, int i, bool first) const {
- const double in = static_cast<double>(m_impl.dimensions()[i]);
- const double out = in + m_padding[i].first + m_padding[i].second;
- if (out == 0)
- return;
- const double reduction = in / out;
- cost *= reduction;
- if (first) {
- cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost<Index>() +
- reduction * (1 * TensorOpCost::AddCost<Index>()));
- } else {
- cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost<Index>() +
- 2 * TensorOpCost::MulCost<Index>() +
- reduction * (2 * TensorOpCost::MulCost<Index>() +
- 1 * TensorOpCost::DivCost<Index>()));
- }
- }
-
- protected:
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
- {
- EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
-
- const Index initialIndex = index;
- Index inputIndex = 0;
- EIGEN_UNROLL_LOOP
- for (int i = NumDims - 1; i > 0; --i) {
- const Index firstIdx = index;
- const Index lastIdx = index + PacketSize - 1;
- const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i];
- const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i];
- const Index lastPaddedRight = m_outputStrides[i+1];
-
- if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) {
-        // all the coefficients are in the padding zone.
- return internal::pset1<PacketReturnType>(m_paddingValue);
- }
- else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
-        // all the coefficients are in the padding zone.
- return internal::pset1<PacketReturnType>(m_paddingValue);
- }
- else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
-        // all the coefficients are between the two padding zones.
- const Index idx = index / m_outputStrides[i];
- inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
- index -= idx * m_outputStrides[i];
- }
- else {
- // Every other case
- return packetWithPossibleZero(initialIndex);
- }
- }
-
- const Index lastIdx = index + PacketSize - 1;
- const Index firstIdx = index;
- const Index lastPaddedLeft = m_padding[0].first;
- const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second);
- const Index lastPaddedRight = m_outputStrides[1];
-
- if (!isLeftPaddingCompileTimeZero(0) && lastIdx < lastPaddedLeft) {
-      // all the coefficients are in the padding zone.
- return internal::pset1<PacketReturnType>(m_paddingValue);
- }
- else if (!isRightPaddingCompileTimeZero(0) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
-      // all the coefficients are in the padding zone.
- return internal::pset1<PacketReturnType>(m_paddingValue);
- }
- else if ((isLeftPaddingCompileTimeZero(0) && isRightPaddingCompileTimeZero(0)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
-      // all the coefficients are between the two padding zones.
- inputIndex += (index - m_padding[0].first);
- return m_impl.template packet<Unaligned>(inputIndex);
- }
- // Every other case
- return packetWithPossibleZero(initialIndex);
- }
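-
-  // Zone sketch for the innermost column-major dimension (illustrative): with
-  // m_dimensions[0] = 10 and padding {2, 3}, indices [0, 2) are left padding,
-  // [2, 7) map to the input and [7, 10) are right padding; a packet entirely
-  // inside one zone takes a fast path above, while one that straddles a
-  // boundary falls back to packetWithPossibleZero.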
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
- {
- EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
-
- const Index initialIndex = index;
- Index inputIndex = 0;
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < NumDims - 1; ++i) {
- const Index firstIdx = index;
- const Index lastIdx = index + PacketSize - 1;
- const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i+1];
- const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1];
- const Index lastPaddedRight = m_outputStrides[i];
-
- if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) {
-        // all the coefficients are in the padding zone.
- return internal::pset1<PacketReturnType>(m_paddingValue);
- }
- else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
-        // all the coefficients are in the padding zone.
- return internal::pset1<PacketReturnType>(m_paddingValue);
- }
- else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
-        // all the coefficients are between the two padding zones.
- const Index idx = index / m_outputStrides[i+1];
- inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
- index -= idx * m_outputStrides[i+1];
- }
- else {
- // Every other case
- return packetWithPossibleZero(initialIndex);
- }
- }
-
- const Index lastIdx = index + PacketSize - 1;
- const Index firstIdx = index;
- const Index lastPaddedLeft = m_padding[NumDims-1].first;
- const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second);
- const Index lastPaddedRight = m_outputStrides[NumDims-1];
-
- if (!isLeftPaddingCompileTimeZero(NumDims-1) && lastIdx < lastPaddedLeft) {
-      // all the coefficients are in the padding zone.
- return internal::pset1<PacketReturnType>(m_paddingValue);
- }
- else if (!isRightPaddingCompileTimeZero(NumDims-1) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
-      // all the coefficients are in the padding zone.
- return internal::pset1<PacketReturnType>(m_paddingValue);
- }
- else if ((isLeftPaddingCompileTimeZero(NumDims-1) && isRightPaddingCompileTimeZero(NumDims-1)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
-      // all the coefficients are between the two padding zones.
- inputIndex += (index - m_padding[NumDims-1].first);
- return m_impl.template packet<Unaligned>(inputIndex);
- }
- // Every other case
- return packetWithPossibleZero(initialIndex);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
- {
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < PacketSize; ++i) {
- values[i] = coeff(index+i);
- }
- PacketReturnType rslt = internal::pload<PacketReturnType>(values);
- return rslt;
- }
-
- Dimensions m_dimensions;
- array<Index, NumDims+1> m_outputStrides;
- array<Index, NumDims> m_inputStrides;
- TensorEvaluator<ArgType, Device> m_impl;
- PaddingDimensions m_padding;
-
- Scalar m_paddingValue;
-
- const Device EIGEN_DEVICE_REF m_device;
-};
-
-
-
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_PADDING_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorPatch.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorPatch.h
deleted file mode 100644
index 413d25d..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorPatch.h
+++ /dev/null
@@ -1,291 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_PATCH_H
-#define EIGEN_CXX11_TENSOR_TENSOR_PATCH_H
-
-namespace Eigen {
-
-/** \class TensorPatch
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Tensor patch class.
- *
- *
- */
-namespace internal {
-template<typename PatchDim, typename XprType>
-struct traits<TensorPatchOp<PatchDim, XprType> > : public traits<XprType>
-{
- typedef typename XprType::Scalar Scalar;
- typedef traits<XprType> XprTraits;
- typedef typename XprTraits::StorageKind StorageKind;
- typedef typename XprTraits::Index Index;
- typedef typename XprType::Nested Nested;
- typedef typename remove_reference<Nested>::type _Nested;
- static const int NumDimensions = XprTraits::NumDimensions + 1;
- static const int Layout = XprTraits::Layout;
- typedef typename XprTraits::PointerType PointerType;
-};
-
-template<typename PatchDim, typename XprType>
-struct eval<TensorPatchOp<PatchDim, XprType>, Eigen::Dense>
-{
- typedef const TensorPatchOp<PatchDim, XprType>& type;
-};
-
-template<typename PatchDim, typename XprType>
-struct nested<TensorPatchOp<PatchDim, XprType>, 1, typename eval<TensorPatchOp<PatchDim, XprType> >::type>
-{
- typedef TensorPatchOp<PatchDim, XprType> type;
-};
-
-} // end namespace internal
-
-
-
-template<typename PatchDim, typename XprType>
-class TensorPatchOp : public TensorBase<TensorPatchOp<PatchDim, XprType>, ReadOnlyAccessors>
-{
- public:
- typedef typename Eigen::internal::traits<TensorPatchOp>::Scalar Scalar;
- typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename Eigen::internal::nested<TensorPatchOp>::type Nested;
- typedef typename Eigen::internal::traits<TensorPatchOp>::StorageKind StorageKind;
- typedef typename Eigen::internal::traits<TensorPatchOp>::Index Index;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPatchOp(const XprType& expr, const PatchDim& patch_dims)
- : m_xpr(expr), m_patch_dims(patch_dims) {}
-
- EIGEN_DEVICE_FUNC
- const PatchDim& patch_dims() const { return m_patch_dims; }
-
- EIGEN_DEVICE_FUNC
- const typename internal::remove_all<typename XprType::Nested>::type&
- expression() const { return m_xpr; }
-
- protected:
- typename XprType::Nested m_xpr;
- const PatchDim m_patch_dims;
-};
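-
-// A minimal usage sketch (assuming a rank-2 float tensor; `extract_patches`
-// is the TensorBase entry point that builds this op):
-//   Eigen::Tensor<float, 2> t(4, 4);
-//   Eigen::array<Eigen::Index, 2> patch_dims = {2, 2};
-//   Eigen::Tensor<float, 3> p = t.extract_patches(patch_dims);
-//   // column-major result: 2x2x9, one 2x2 patch per valid offset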
-
-
-// Eval as rvalue
-template<typename PatchDim, typename ArgType, typename Device>
-struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
-{
- typedef TensorPatchOp<PatchDim, ArgType> XprType;
- typedef typename XprType::Index Index;
- static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value + 1;
- typedef DSizes<Index, NumDims> Dimensions;
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
- typedef StorageMemory<CoeffReturnType, Device> Storage;
- typedef typename Storage::Type EvaluatorPointerType;
-
-
- enum {
- IsAligned = false,
- PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = false,
- PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- CoordAccess = false,
- RawAccess = false
- };
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlock;
- //===--------------------------------------------------------------------===//
-
- EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_impl(op.expression(), device)
- {
- Index num_patches = 1;
- const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
- const PatchDim& patch_dims = op.patch_dims();
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- for (int i = 0; i < NumDims-1; ++i) {
- m_dimensions[i] = patch_dims[i];
- num_patches *= (input_dims[i] - patch_dims[i] + 1);
- }
- m_dimensions[NumDims-1] = num_patches;
-
- m_inputStrides[0] = 1;
- m_patchStrides[0] = 1;
- for (int i = 1; i < NumDims-1; ++i) {
- m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
- m_patchStrides[i] = m_patchStrides[i-1] * (input_dims[i-1] - patch_dims[i-1] + 1);
- }
- m_outputStrides[0] = 1;
- for (int i = 1; i < NumDims; ++i) {
- m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
- }
- } else {
- for (int i = 0; i < NumDims-1; ++i) {
- m_dimensions[i+1] = patch_dims[i];
- num_patches *= (input_dims[i] - patch_dims[i] + 1);
- }
- m_dimensions[0] = num_patches;
-
- m_inputStrides[NumDims-2] = 1;
- m_patchStrides[NumDims-2] = 1;
- for (int i = NumDims-3; i >= 0; --i) {
- m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
- m_patchStrides[i] = m_patchStrides[i+1] * (input_dims[i+1] - patch_dims[i+1] + 1);
- }
- m_outputStrides[NumDims-1] = 1;
- for (int i = NumDims-2; i >= 0; --i) {
- m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
- }
- }
- }
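-
-  // Stride sketch (column-major, illustrative): a 4x4 input with 2x2 patches
-  // admits 4 - 2 + 1 = 3 patch offsets per dimension, so
-  // m_patchStrides = {1, 3}, num_patches = 9 and m_dimensions = {2, 2, 9}.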
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
-
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
- m_impl.evalSubExprsIfNeeded(NULL);
- return true;
- }
-
- EIGEN_STRONG_INLINE void cleanup() {
- m_impl.cleanup();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
- {
- Index output_stride_index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? NumDims - 1 : 0;
- // Find the location of the first element of the patch.
- Index patchIndex = index / m_outputStrides[output_stride_index];
-    // Find the offset of the element with respect to the location of the first element.
- Index patchOffset = index - patchIndex * m_outputStrides[output_stride_index];
- Index inputIndex = 0;
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- EIGEN_UNROLL_LOOP
- for (int i = NumDims - 2; i > 0; --i) {
- const Index patchIdx = patchIndex / m_patchStrides[i];
- patchIndex -= patchIdx * m_patchStrides[i];
- const Index offsetIdx = patchOffset / m_outputStrides[i];
- patchOffset -= offsetIdx * m_outputStrides[i];
- inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i];
- }
- } else {
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < NumDims - 2; ++i) {
- const Index patchIdx = patchIndex / m_patchStrides[i];
- patchIndex -= patchIdx * m_patchStrides[i];
- const Index offsetIdx = patchOffset / m_outputStrides[i+1];
- patchOffset -= offsetIdx * m_outputStrides[i+1];
- inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i];
- }
- }
- inputIndex += (patchIndex + patchOffset);
- return m_impl.coeff(inputIndex);
- }
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
- {
- EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
-
- Index output_stride_index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? NumDims - 1 : 0;
- Index indices[2] = {index, index + PacketSize - 1};
- Index patchIndices[2] = {indices[0] / m_outputStrides[output_stride_index],
- indices[1] / m_outputStrides[output_stride_index]};
- Index patchOffsets[2] = {indices[0] - patchIndices[0] * m_outputStrides[output_stride_index],
- indices[1] - patchIndices[1] * m_outputStrides[output_stride_index]};
-
- Index inputIndices[2] = {0, 0};
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- EIGEN_UNROLL_LOOP
- for (int i = NumDims - 2; i > 0; --i) {
- const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i],
- patchIndices[1] / m_patchStrides[i]};
- patchIndices[0] -= patchIdx[0] * m_patchStrides[i];
- patchIndices[1] -= patchIdx[1] * m_patchStrides[i];
-
- const Index offsetIdx[2] = {patchOffsets[0] / m_outputStrides[i],
- patchOffsets[1] / m_outputStrides[i]};
- patchOffsets[0] -= offsetIdx[0] * m_outputStrides[i];
- patchOffsets[1] -= offsetIdx[1] * m_outputStrides[i];
-
- inputIndices[0] += (patchIdx[0] + offsetIdx[0]) * m_inputStrides[i];
- inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i];
- }
- } else {
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < NumDims - 2; ++i) {
- const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i],
- patchIndices[1] / m_patchStrides[i]};
- patchIndices[0] -= patchIdx[0] * m_patchStrides[i];
- patchIndices[1] -= patchIdx[1] * m_patchStrides[i];
-
- const Index offsetIdx[2] = {patchOffsets[0] / m_outputStrides[i+1],
- patchOffsets[1] / m_outputStrides[i+1]};
- patchOffsets[0] -= offsetIdx[0] * m_outputStrides[i+1];
- patchOffsets[1] -= offsetIdx[1] * m_outputStrides[i+1];
-
- inputIndices[0] += (patchIdx[0] + offsetIdx[0]) * m_inputStrides[i];
- inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i];
- }
- }
- inputIndices[0] += (patchIndices[0] + patchOffsets[0]);
- inputIndices[1] += (patchIndices[1] + patchOffsets[1]);
-
- if (inputIndices[1] - inputIndices[0] == PacketSize - 1) {
- PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]);
- return rslt;
- }
- else {
- EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize];
- values[0] = m_impl.coeff(inputIndices[0]);
- values[PacketSize-1] = m_impl.coeff(inputIndices[1]);
- EIGEN_UNROLL_LOOP
- for (int i = 1; i < PacketSize-1; ++i) {
- values[i] = coeff(index+i);
- }
- PacketReturnType rslt = internal::pload<PacketReturnType>(values);
- return rslt;
- }
- }
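-
-  // Note (added for clarity): the fast path above fires when the packet's
-  // first and last coefficients are exactly PacketSize - 1 apart in the
-  // input, i.e. the whole packet is contiguous in the source; otherwise the
-  // packet is assembled coefficient by coefficient through coeff().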
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
- const double compute_cost = NumDims * (TensorOpCost::DivCost<Index>() +
- TensorOpCost::MulCost<Index>() +
- 2 * TensorOpCost::AddCost<Index>());
- return m_impl.costPerCoeff(vectorized) +
- TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
- }
-
- EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
-
-#ifdef EIGEN_USE_SYCL
- // binding placeholder accessors to a command group handler for SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
- m_impl.bind(cgh);
- }
-#endif
-
- protected:
- Dimensions m_dimensions;
- array<Index, NumDims> m_outputStrides;
- array<Index, NumDims-1> m_inputStrides;
- array<Index, NumDims-1> m_patchStrides;
-
- TensorEvaluator<ArgType, Device> m_impl;
-
-};
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_PATCH_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorRandom.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorRandom.h
deleted file mode 100644
index 37c1d1c..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorRandom.h
+++ /dev/null
@@ -1,322 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
-// Copyright (C) 2018 Mehdi Goli <eigen@codeplay.com> Codeplay Software Ltd.
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
-#define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
-
-namespace Eigen {
-namespace internal {
-
-namespace {
-
-EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
-#if defined(EIGEN_GPU_COMPILE_PHASE)
- // We don't support 3D kernels since we currently only use 1D and
- // 2D kernels.
- gpu_assert(threadIdx.z == 0);
- return blockIdx.x * blockDim.x + threadIdx.x
- + gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);
-#else
- // Rely on Eigen's random implementation.
- return random<uint64_t>();
-#endif
-}
-
-static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state, uint64_t stream) {
-  // TODO: Unify with the implementation in the non-blocking thread pool.
- uint64_t current = *state;
- // Update the internal state
- *state = current * 6364136223846793005ULL + (stream << 1 | 1);
- // Generate the random output (using the PCG-XSH-RS scheme)
- return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
-}
-
-static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) {
- seed = seed ? seed : get_random_seed();
- return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
-}
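-
-// Illustrative usage (not part of the original header): drawing two values
-// from one PCG stream. Every call advances the 64-bit LCG state, and the
-// stream id is folded into the LCG increment, so distinct streams yield
-// distinct, decorrelated sequences even from the same seed:
-//
-//   uint64_t state = PCG_XSH_RS_state(/*seed=*/42);
-//   const unsigned r0 = PCG_XSH_RS_generator(&state, /*stream=*/0);
-//   const unsigned r1 = PCG_XSH_RS_generator(&state, /*stream=*/0);
-//   // r0 != r1 in general: successive outputs of the same stream.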
-
-} // namespace
-
-
-template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-T RandomToTypeUniform(uint64_t* state, uint64_t stream) {
- unsigned rnd = PCG_XSH_RS_generator(state, stream);
- return static_cast<T>(rnd);
-}
-
-
-template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state, uint64_t stream) {
- // Generate 10 random bits for the mantissa, merge with exponent.
- unsigned rnd = PCG_XSH_RS_generator(state, stream);
- const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x3ffu) | (static_cast<uint16_t>(15) << 10);
- Eigen::half result = Eigen::numext::bit_cast<Eigen::half>(half_bits);
- // Return the final result
- return result - Eigen::half(1.0f);
-}
-
-template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-Eigen::bfloat16 RandomToTypeUniform<Eigen::bfloat16>(uint64_t* state, uint64_t stream) {
-
- // Generate 7 random bits for the mantissa, merge with exponent.
- unsigned rnd = PCG_XSH_RS_generator(state, stream);
- const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x7fu) | (static_cast<uint16_t>(127) << 7);
- Eigen::bfloat16 result = Eigen::numext::bit_cast<Eigen::bfloat16>(half_bits);
- // Return the final result
- return result - Eigen::bfloat16(1.0f);
-}
-
-template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float RandomToTypeUniform<float>(uint64_t* state, uint64_t stream) {
- typedef union {
- uint32_t raw;
- float fp;
- } internal;
- internal result;
-  // Generate 23 random bits for the mantissa
- const unsigned rnd = PCG_XSH_RS_generator(state, stream);
- result.raw = rnd & 0x7fffffu;
- // Set the exponent
- result.raw |= (static_cast<uint32_t>(127) << 23);
- // Return the final result
- return result.fp - 1.0f;
-}
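-
-// Worked example of the bit trick above (comment added for clarity, not part
-// of the original file): with the exponent field set to the bias (127) the
-// float encodes 2^0 * 1.mantissa, i.e. a value in [1, 2), and the 23 random
-// mantissa bits select uniformly within that interval:
-//   rnd & 0x7fffff == 0x000000  ->  1.0f         ->  returns 0.0f
-//   rnd & 0x7fffff == 0x7fffff  ->  ~1.99999988f ->  returns ~0.99999988f
-// Subtracting 1.0f therefore yields a uniform draw in [0, 1).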
-
-template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double RandomToTypeUniform<double>(uint64_t* state, uint64_t stream) {
- typedef union {
- uint64_t raw;
- double dp;
- } internal;
- internal result;
- result.raw = 0;
- // Generate 52 random bits for the mantissa
- // First generate the upper 20 bits
- unsigned rnd1 = PCG_XSH_RS_generator(state, stream) & 0xfffffu;
-  // Then generate the lower 32 bits
- unsigned rnd2 = PCG_XSH_RS_generator(state, stream);
- result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2;
- // Set the exponent
- result.raw |= (static_cast<uint64_t>(1023) << 52);
- // Return the final result
- return result.dp - 1.0;
-}
-
-template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state, uint64_t stream) {
- return std::complex<float>(RandomToTypeUniform<float>(state, stream),
- RandomToTypeUniform<float>(state, stream));
-}
-template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state, uint64_t stream) {
- return std::complex<double>(RandomToTypeUniform<double>(state, stream),
- RandomToTypeUniform<double>(state, stream));
-}
-
-template <typename T> class UniformRandomGenerator {
- public:
- static const bool PacketAccess = true;
-
- // Uses the given "seed" if non-zero, otherwise uses a random seed.
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
- uint64_t seed = 0) {
- m_state = PCG_XSH_RS_state(seed);
- #ifdef EIGEN_USE_SYCL
-    // In SYCL it is not possible to build PCG_XSH_RS_state in one step, so
-    // m_state has to be initialized in two steps.
-    // In SYCL the constructor of the functor is called on the CPU, so the
-    // clock seed obtained here is the same for every thread: unlike in CUDA,
-    // the thread id, block id, etc. are not global functions and are only
-    // available inside operator() (which is called on the device).
-    // For CUDA, ((CLOCK + global_thread_id) * 6364136223846793005ULL)
-    // + 0xda3e39cb94b95bdbULL is passed to each thread, whereas for SYCL
-    // (CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL is passed and
-    // each thread adds (global_thread_id * 6364136223846793005ULL) for itself,
-    // once, inside operator() where the thread id is available, completing the
-    // construction of a unique m_state per thread just as in CUDA.
-    m_exec_once = false;
- #endif
- }
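-
-  // Worked equation (added for clarity, not in the original): both paths end
-  // up with the same per-thread state. With M = 6364136223846793005ULL and
-  // C = 0xda3e39cb94b95bdbULL:
-  //   CUDA:  state = (clock + tid) * M + C      (tid known at construction)
-  //   SYCL:  state = clock * M + C              (constructor, on the host)
-  //          state += tid * M                   (first device-side call)
-  //   => SYCL state = (clock + tid) * M + C, identical to the CUDA path.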
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
- const UniformRandomGenerator& other) {
- m_state = other.m_state;
- #ifdef EIGEN_USE_SYCL
-    m_exec_once = other.m_exec_once;
- #endif
- }
-
- template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- T operator()(Index i) const {
- #ifdef EIGEN_USE_SYCL
-    if (!m_exec_once) {
-      // Second stage: add the thread id to the CPU clock seed to build a
-      // unique seed per thread. The (i * 6364136223846793005ULL) term is the
-      // part of PCG_XSH_RS_state that is completed on the device side.
-      m_state += (i * 6364136223846793005ULL);
-      m_exec_once = true;
-    }
- #endif
- T result = RandomToTypeUniform<T>(&m_state, i);
- return result;
- }
-
- template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- Packet packetOp(Index i) const {
- const int packetSize = internal::unpacket_traits<Packet>::size;
- EIGEN_ALIGN_MAX T values[packetSize];
- #ifdef EIGEN_USE_SYCL
-    if (!m_exec_once) {
-      // Second stage: add the thread id to the CPU clock seed to build a
-      // unique seed per thread (see operator() above).
-      m_state += (i * 6364136223846793005ULL);
-      m_exec_once = true;
-    }
- #endif
- EIGEN_UNROLL_LOOP
- for (int j = 0; j < packetSize; ++j) {
- values[j] = RandomToTypeUniform<T>(&m_state, i);
- }
- return internal::pload<Packet>(values);
- }
-
- private:
- mutable uint64_t m_state;
- #ifdef EIGEN_USE_SYCL
- mutable bool m_exec_once;
- #endif
-};
-
-template <typename Scalar>
-struct functor_traits<UniformRandomGenerator<Scalar> > {
- enum {
- // Rough estimate for floating point, multiplied by ceil(sizeof(T) / sizeof(float)).
- Cost = 12 * NumTraits<Scalar>::AddCost *
- ((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)),
- PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess
- };
-};
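-
-// Usage sketch (assumes the setRandom<Generator>() entry point documented in
-// the Tensor README; illustrative only, not part of this file):
-//   Eigen::Tensor<float, 2> t(64, 64);
-//   t.setRandom<Eigen::internal::UniformRandomGenerator<float>>();
-// The coefficient index i is passed to operator() as the PCG stream id, so
-// coefficients are decorrelated even though they share a single state.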
-
-
-
-template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-T RandomToTypeNormal(uint64_t* state, uint64_t stream) {
-  // Use the ratio-of-uniforms method to generate numbers following a normal
-  // distribution. See for example Numerical Recipes chapter 7.3.9 for the
-  // details.
- T u, v, q;
- do {
- u = RandomToTypeUniform<T>(state, stream);
- v = T(1.7156) * (RandomToTypeUniform<T>(state, stream) - T(0.5));
- const T x = u - T(0.449871);
- const T y = numext::abs(v) + T(0.386595);
- q = x*x + y * (T(0.196)*y - T(0.25472)*x);
- } while (q > T(0.27597) &&
- (q > T(0.27846) || v*v > T(-4) * numext::log(u) * u*u));
-
- return v/u;
-}
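-
-// Note on the constants above (added for clarity): (u, v) is drawn uniformly
-// from a rectangle enclosing the acceptance region of the ratio-of-uniforms
-// method, and the quadratic in q is the fast bounding test (Leva's
-// constants), so the expensive log() only runs for the small fraction of
-// points near the region boundary. On average roughly 1.37 candidate pairs
-// are consumed per normal deviate v/u returned.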
-
-template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state, uint64_t stream) {
- return std::complex<float>(RandomToTypeNormal<float>(state, stream),
- RandomToTypeNormal<float>(state, stream));
-}
-template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state, uint64_t stream) {
- return std::complex<double>(RandomToTypeNormal<double>(state, stream),
- RandomToTypeNormal<double>(state, stream));
-}
-
-
-template <typename T> class NormalRandomGenerator {
- public:
- static const bool PacketAccess = true;
-
- // Uses the given "seed" if non-zero, otherwise uses a random seed.
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) {
- m_state = PCG_XSH_RS_state(seed);
- #ifdef EIGEN_USE_SYCL
-    // In SYCL it is not possible to build PCG_XSH_RS_state in one step, so
-    // m_state has to be initialized in two steps.
-    // In SYCL the constructor of the functor is called on the CPU, so the
-    // clock seed obtained here is the same for every thread: unlike in CUDA,
-    // the thread id, block id, etc. are not global functions and are only
-    // available inside operator() (which is called on the device).
-    // Therefore the thread id cannot be injected at this stage. When
-    // operator() is called the thread id is available, so inside it each
-    // thread adds the thread id, block id, ... (which is equivalent to i) to
-    // the seed, constructing a unique m_state per thread just as in CUDA.
-    m_exec_once = false;
- #endif
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(
- const NormalRandomGenerator& other) {
- m_state = other.m_state;
-#ifdef EIGEN_USE_SYCL
-  m_exec_once = other.m_exec_once;
-#endif
- }
-
- template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- T operator()(Index i) const {
- #ifdef EIGEN_USE_SYCL
-    if (!m_exec_once) {
-      // Second stage: add the thread id to the CPU clock seed to build a
-      // unique seed per thread.
-      m_state += (i * 6364136223846793005ULL);
-      m_exec_once = true;
-    }
- #endif
- T result = RandomToTypeNormal<T>(&m_state, i);
- return result;
- }
-
- template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- Packet packetOp(Index i) const {
- const int packetSize = internal::unpacket_traits<Packet>::size;
- EIGEN_ALIGN_MAX T values[packetSize];
- #ifdef EIGEN_USE_SYCL
-    if (!m_exec_once) {
-      // Second stage: add the thread id to the CPU clock seed to build a
-      // unique seed per thread (see operator() above).
-      m_state += (i * 6364136223846793005ULL);
-      m_exec_once = true;
-    }
- #endif
- EIGEN_UNROLL_LOOP
- for (int j = 0; j < packetSize; ++j) {
- values[j] = RandomToTypeNormal<T>(&m_state, i);
- }
- return internal::pload<Packet>(values);
- }
-
- private:
- mutable uint64_t m_state;
- #ifdef EIGEN_USE_SYCL
- mutable bool m_exec_once;
- #endif
-};
-
-
-template <typename Scalar>
-struct functor_traits<NormalRandomGenerator<Scalar> > {
- enum {
-    // On average, we need to generate about 3 random numbers per deviate,
-    // costing 15 mul, 8 add and 1.5 logs.
-    Cost = 3 * functor_traits<UniformRandomGenerator<Scalar> >::Cost +
-           15 * NumTraits<Scalar>::MulCost + 8 * NumTraits<Scalar>::AddCost +
-           3 * functor_traits<scalar_log_op<Scalar> >::Cost / 2,
- PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess
- };
-};
-
-
-} // end namespace internal
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorReduction.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorReduction.h
deleted file mode 100644
index 583f462..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorReduction.h
+++ /dev/null
@@ -1,998 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-// Copyright (C) 2016 Mehdi Goli, Codeplay Software Ltd <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
-#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
-
-// clang is incompatible with the CUDA syntax wrt making a kernel a class friend,
-// so we'll use a macro to make clang happy.
-#ifndef KERNEL_FRIEND
-#if defined(__clang__) && (defined(__CUDA__) || defined(__HIP__))
-#define KERNEL_FRIEND friend __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024
-#else
-#define KERNEL_FRIEND friend
-#endif
-#endif
-
-
-namespace Eigen {
-
-
-/** \class TensorReduction
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Tensor reduction class.
- *
- */
-
-namespace internal {
- template<typename Op, typename Dims, typename XprType,template <class> class MakePointer_ >
- struct traits<TensorReductionOp<Op, Dims, XprType, MakePointer_> >
- : traits<XprType>
-{
- typedef traits<XprType> XprTraits;
- typedef typename XprTraits::Scalar Scalar;
- typedef typename XprTraits::StorageKind StorageKind;
- typedef typename XprTraits::Index Index;
- typedef typename XprType::Nested Nested;
- static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
- static const int Layout = XprTraits::Layout;
- typedef typename XprTraits::PointerType PointerType;
-
- template <class T> struct MakePointer {
- // Intermediate typedef to workaround MSVC issue.
- typedef MakePointer_<T> MakePointerT;
- typedef typename MakePointerT::Type Type;
- };
-};
-
-template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
-struct eval<TensorReductionOp<Op, Dims, XprType, MakePointer_>, Eigen::Dense>
-{
- typedef const TensorReductionOp<Op, Dims, XprType, MakePointer_>& type;
-};
-
-template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
-struct nested<TensorReductionOp<Op, Dims, XprType, MakePointer_>, 1, typename eval<TensorReductionOp<Op, Dims, XprType, MakePointer_> >::type>
-{
- typedef TensorReductionOp<Op, Dims, XprType, MakePointer_> type;
-};
-
-
-template <typename OutputDims> struct DimInitializer {
- template <typename InputDims, typename ReducedDims> EIGEN_DEVICE_FUNC
- static void run(const InputDims& input_dims,
- const array<bool, internal::array_size<InputDims>::value>& reduced,
- OutputDims* output_dims, ReducedDims* reduced_dims) {
- const int NumInputDims = internal::array_size<InputDims>::value;
- int outputIndex = 0;
- int reduceIndex = 0;
- for (int i = 0; i < NumInputDims; ++i) {
- if (reduced[i]) {
- (*reduced_dims)[reduceIndex] = input_dims[i];
- ++reduceIndex;
- } else {
- (*output_dims)[outputIndex] = input_dims[i];
- ++outputIndex;
- }
- }
- }
-};
-
-template <> struct DimInitializer<Sizes<> > {
- template <typename InputDims, typename Index, size_t Rank> EIGEN_DEVICE_FUNC
- static void run(const InputDims& input_dims, const array<bool, Rank>&,
- Sizes<>*, array<Index, Rank>* reduced_dims) {
- const int NumInputDims = internal::array_size<InputDims>::value;
- for (int i = 0; i < NumInputDims; ++i) {
- (*reduced_dims)[i] = input_dims[i];
- }
- }
-};
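-
-// Worked example (added for clarity): for input_dims = (2, 3, 5, 7) with
-// dimensions 1 and 3 reduced, run() scatters the sizes into
-// output_dims = (2, 5) and reduced_dims = (3, 7). The Sizes<> specialization
-// covers full reductions, where every input dimension is reduced and the
-// output is rank 0.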
-
-
-template <typename ReducedDims, int NumTensorDims, int Layout>
-struct are_inner_most_dims {
- static const bool value = false;
-};
-template <typename ReducedDims, int NumTensorDims, int Layout>
-struct preserve_inner_most_dims {
- static const bool value = false;
-};
-
-#if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES
-template <typename ReducedDims, int NumTensorDims>
-struct are_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
- static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
- static const bool tmp2 = index_statically_eq<ReducedDims>(0, 0);
- static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value-1, array_size<ReducedDims>::value-1);
- static const bool value = tmp1 & tmp2 & tmp3;
-};
-template <typename ReducedDims, int NumTensorDims>
-struct are_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
- static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
- static const bool tmp2 = index_statically_eq<ReducedDims>(0, NumTensorDims - array_size<ReducedDims>::value);
- static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
- static const bool value = tmp1 & tmp2 & tmp3;
-
-};
-template <typename ReducedDims, int NumTensorDims>
-struct preserve_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
- static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
- static const bool tmp2 = index_statically_gt<ReducedDims>(0, 0);
- static const bool value = tmp1 & tmp2;
-
-};
-template <typename ReducedDims, int NumTensorDims>
-struct preserve_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
- static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
- static const bool tmp2 = index_statically_lt<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
- static const bool value = tmp1 & tmp2;
-};
-#endif
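-
-// Example (added for clarity): when a rank-4 ColMajor tensor is reduced over
-// the compile-time list IndexList<type2index<0>, type2index<1>>, the three
-// tests above confirm that the reduced indices increase, start at 0 and end
-// at array_size - 1, i.e. they cover exactly the innermost (fastest-varying)
-// dimensions, enabling the contiguous InnerMostDimReducer path. With runtime
-// dimension indices the static tests fail and the generic path is used.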
-
-
-template <int DimIndex, typename Self, typename Op>
-struct GenericDimReducer {
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
- EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
- for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
- const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
- GenericDimReducer<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
- }
- }
-};
-template <typename Self, typename Op>
-struct GenericDimReducer<0, Self, Op> {
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
- for (int j = 0; j < self.m_reducedDims[0]; ++j) {
- const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
- reducer.reduce(self.m_impl.coeff(input), accum);
- }
- }
-};
-template <typename Self, typename Op>
-struct GenericDimReducer<-1, Self, Op> {
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index index, Op& reducer, typename Self::CoeffReturnType* accum) {
- reducer.reduce(self.m_impl.coeff(index), accum);
- }
-};
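-
-// Illustrative expansion (not original code): for two reduced dimensions the
-// recursion above unrolls at compile time into the nested loop
-//   for (j1 = 0; j1 < m_reducedDims[1]; ++j1)
-//     for (j0 = 0; j0 < m_reducedDims[0]; ++j0)
-//       reducer.reduce(m_impl.coeff(firstIndex + j1 * m_reducedStrides[1]
-//                                              + j0 * m_reducedStrides[0]),
-//                      accum);
-// The DimIndex == -1 specialization handles the degenerate case where there
-// is nothing to iterate over and the single coefficient is reduced directly.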
-
-template <typename Self, typename Op, bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess),
- bool UseTreeReduction = (!Self::ReducerTraits::IsStateful &&
- !Self::ReducerTraits::IsExactlyAssociative)>
-struct InnerMostDimReducer {
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
- typename Self::CoeffReturnType accum = reducer.initialize();
- for (typename Self::Index j = 0; j < numValuesToReduce; ++j) {
- reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
- }
- return reducer.finalize(accum);
- }
-};
-
-template <typename Self, typename Op>
-struct InnerMostDimReducer<Self, Op, true, false> {
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
- const typename Self::Index packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
- const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize;
- typename Self::PacketReturnType paccum = reducer.template initializePacket<typename Self::PacketReturnType>();
- for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) {
- reducer.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum);
- }
- typename Self::CoeffReturnType accum = reducer.initialize();
- for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) {
- reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
- }
- return reducer.finalizeBoth(accum, paccum);
- }
-};
-
-#if !defined(EIGEN_HIPCC)
-static const int kLeafSize = 1024;
-
-template <typename Self, typename Op>
-struct InnerMostDimReducer<Self, Op, false, true> {
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType
- reduce(const Self& self, typename Self::Index firstIndex,
- typename Self::Index numValuesToReduce, Op& reducer) {
- typename Self::CoeffReturnType accum = reducer.initialize();
- if (numValuesToReduce > kLeafSize) {
- const typename Self::Index half = numValuesToReduce / 2;
- reducer.reduce(reduce(self, firstIndex, half, reducer), &accum);
- reducer.reduce(
- reduce(self, firstIndex + half, numValuesToReduce - half, reducer),
- &accum);
- } else {
- for (typename Self::Index j = 0; j < numValuesToReduce; ++j) {
- reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
- }
- }
- return reducer.finalize(accum);
- }
-};
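-
-// Why a tree (note added for clarity): for reducers that are not exactly
-// associative, e.g. floating-point sums, pairwise splitting keeps the
-// rounding-error growth at roughly O(log n) instead of the O(n) of a single
-// running accumulator. Below kLeafSize elements the plain sequential loop is
-// used, where the recursion overhead would outweigh the accuracy benefit.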
-
-template <typename Self, typename Op>
-struct InnerMostDimReducer<Self, Op, true, true> {
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType
- reduce(const Self& self, typename Self::Index firstIndex,
- typename Self::Index numValuesToReduce, Op& reducer) {
- const typename Self::Index packetSize =
- internal::unpacket_traits<typename Self::PacketReturnType>::size;
- typename Self::CoeffReturnType accum = reducer.initialize();
- if (numValuesToReduce > packetSize * kLeafSize) {
- // Make sure the split point is aligned on a packet boundary.
- const typename Self::Index split =
- packetSize *
- divup(firstIndex + divup(numValuesToReduce, typename Self::Index(2)),
- packetSize);
- const typename Self::Index num_left =
- numext::mini(split - firstIndex, numValuesToReduce);
- reducer.reduce(reduce(self, firstIndex, num_left, reducer), &accum);
- if (num_left < numValuesToReduce) {
- reducer.reduce(
- reduce(self, split, numValuesToReduce - num_left, reducer), &accum);
- }
- return reducer.finalize(accum);
- } else {
- const typename Self::Index UnrollSize =
- (numValuesToReduce / (2*packetSize)) * 2*packetSize;
- const typename Self::Index VectorizedSize =
- (numValuesToReduce / packetSize) * packetSize;
- typename Self::PacketReturnType paccum =
- reducer.template initializePacket<typename Self::PacketReturnType>();
- typename Self::PacketReturnType paccum2 =
- reducer.template initializePacket<typename Self::PacketReturnType>();
- for (typename Self::Index j = 0; j < UnrollSize; j += packetSize * 2) {
- reducer.reducePacket(
- self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum);
- reducer.reducePacket(
- self.m_impl.template packet<Unaligned>(firstIndex + j + packetSize),
- &paccum2);
- }
- for (typename Self::Index j = UnrollSize; j < VectorizedSize; j+= packetSize) {
- reducer.reducePacket(self.m_impl.template packet<Unaligned>(
- firstIndex + j), &paccum);
- }
- reducer.reducePacket(paccum2, &paccum);
- for (typename Self::Index j = VectorizedSize; j < numValuesToReduce;
- ++j) {
- reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
- }
- return reducer.finalizeBoth(accum, paccum);
- }
- }
-};
-#endif
-
-template <int DimIndex, typename Self, typename Op, bool vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
-struct InnerMostDimPreserver {
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
- eigen_assert(false && "should never be called");
- }
-};
-
-template <int DimIndex, typename Self, typename Op>
-struct InnerMostDimPreserver<DimIndex, Self, Op, true> {
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
- EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
- for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
- const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
- InnerMostDimPreserver<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
- }
- }
-};
-
-template <typename Self, typename Op>
-struct InnerMostDimPreserver<0, Self, Op, true> {
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
- for (typename Self::Index j = 0; j < self.m_reducedDims[0]; ++j) {
- const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
- reducer.reducePacket(self.m_impl.template packet<Unaligned>(input), accum);
- }
- }
-};
-template <typename Self, typename Op>
-struct InnerMostDimPreserver<-1, Self, Op, true> {
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
- eigen_assert(false && "should never be called");
- }
-};
-
-// Default full reducer
-template <typename Self, typename Op, typename Device, bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
-struct FullReducer {
- static const bool HasOptimizedImplementation = false;
-
- static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::EvaluatorPointerType output) {
- const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions());
- *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
- }
-};
-
-
-#ifdef EIGEN_USE_THREADS
-// Multithreaded full reducers
-template <typename Self, typename Op,
- bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
-struct FullReducerShard {
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex,
- typename Self::Index numValuesToReduce, Op& reducer,
- typename Self::CoeffReturnType* output) {
- *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(
- self, firstIndex, numValuesToReduce, reducer);
- }
-};
-
-// Multithreaded full reducer
-template <typename Self, typename Op, bool Vectorizable>
-struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
- static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful;
- static const Index PacketSize =
- unpacket_traits<typename Self::PacketReturnType>::size;
-
- // launch one reducer per thread and accumulate the result.
- static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device,
- typename Self::CoeffReturnType* output) {
- typedef typename Self::Index Index;
- const Index num_coeffs = array_prod(self.m_impl.dimensions());
- if (num_coeffs == 0) {
- *output = reducer.finalize(reducer.initialize());
- return;
- }
- const TensorOpCost cost =
- self.m_impl.costPerCoeff(Vectorizable) +
- TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable,
- PacketSize);
- const int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
- num_coeffs, cost, device.numThreads());
- if (num_threads == 1) {
- *output =
- InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
- return;
- }
- const Index blocksize =
- std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
- const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
- eigen_assert(num_coeffs >= numblocks * blocksize);
-
- Barrier barrier(internal::convert_index<unsigned int>(numblocks));
- MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
- for (Index i = 0; i < numblocks; ++i) {
- device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, Vectorizable>::run,
- self, i * blocksize, blocksize, reducer,
- &shards[i]);
- }
- typename Self::CoeffReturnType finalShard;
- if (numblocks * blocksize < num_coeffs) {
- finalShard = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(
- self, numblocks * blocksize, num_coeffs - numblocks * blocksize,
- reducer);
- } else {
- finalShard = reducer.initialize();
- }
- barrier.Wait();
-
- for (Index i = 0; i < numblocks; ++i) {
- reducer.reduce(shards[i], &finalShard);
- }
- *output = reducer.finalize(finalShard);
- }
-};
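-
-// Worked example (added for clarity): with num_coeffs = 1003 and
-// num_threads = 8, blocksize = floor(1003 / 8) = 125 and numblocks = 8, so
-// eight shards of 125 coefficients are enqueued on the pool while the calling
-// thread reduces the 3 leftover coefficients into finalShard; after
-// barrier.Wait() the eight shard results are folded into finalShard and
-// finalized.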
-
-#endif
-
-
-// Default inner reducer
-template <typename Self, typename Op, typename Device>
-struct InnerReducer {
- static const bool HasOptimizedImplementation = false;
-
- EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
- eigen_assert(false && "Not implemented");
- return true;
- }
-};
-
-// Default outer reducer
-template <typename Self, typename Op, typename Device>
-struct OuterReducer {
- static const bool HasOptimizedImplementation = false;
-
- EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
- eigen_assert(false && "Not implemented");
- return true;
- }
-};
-
-#ifdef EIGEN_USE_SYCL
-// Default Generic reducer
-template <typename Self, typename Op, typename Device>
-struct GenericReducer {
- static const bool HasOptimizedImplementation = false;
-
- EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
- eigen_assert(false && "Not implemented");
- return true;
- }
-};
-#endif
-
-#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
-template <int B, int N, typename S, typename R, typename I_>
-__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
-
-
-#if defined(EIGEN_HAS_GPU_FP16)
-template <typename S, typename R, typename I_>
-__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits<half>::type*);
-template <int B, int N, typename S, typename R, typename I_>
-__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits<half>::type*);
-template <int NPT, typename S, typename R, typename I_>
-__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
-
-#endif
-
-template <int NPT, typename S, typename R, typename I_>
-__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
-
-template <int NPT, typename S, typename R, typename I_>
-__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
-#endif
-
-/**
- * For SYCL, the return type of the reduction is deduced from the initialize method of the given Op.
- * This allows the reduction to have a different type for the accumulator than the input data type.
- * If this is the case, the functor needs to have two reduce methods: one for reducing an element of the input
- * with the accumulator, and the other for reducing two accumulators.
- * Such a reducer can be useful for instance when the accumulator is a boolean or a bitset that checks for
- * some properties of the input.
- */
-template <typename Op, typename CoeffReturnType>
-struct ReductionReturnType {
-#if defined(EIGEN_USE_SYCL)
- typedef typename remove_const<decltype(std::declval<Op>().initialize())>::type type;
-#else
- typedef typename remove_const<CoeffReturnType>::type type;
-#endif
-};
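-
-// Hypothetical reducer of the shape described above (illustrative only, not
-// part of Eigen): the accumulator type (bool) differs from the input type T,
-// so both reduce() overloads are provided, and under SYCL the reduction's
-// return type is deduced as bool from initialize().
-// template <typename T>  // assumes T != bool
-// struct AnyNonZeroReducer {
-//   EIGEN_DEVICE_FUNC bool initialize() const { return false; }
-//   // Reduce one input element into the accumulator.
-//   EIGEN_DEVICE_FUNC void reduce(const T& t, bool* accum) const {
-//     *accum = *accum || (t != T(0));
-//   }
-//   // Reduce two accumulators.
-//   EIGEN_DEVICE_FUNC void reduce(const bool& b, bool* accum) const {
-//     *accum = *accum || b;
-//   }
-//   EIGEN_DEVICE_FUNC bool finalize(bool accum) const { return accum; }
-// };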
-
-} // end namespace internal
-
-
-template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
-class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType, MakePointer_>, ReadOnlyAccessors> {
- public:
- typedef typename Eigen::internal::traits<TensorReductionOp>::Scalar Scalar;
- typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
- typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
- typedef typename Eigen::internal::nested<TensorReductionOp>::type Nested;
- typedef typename Eigen::internal::traits<TensorReductionOp>::StorageKind StorageKind;
- typedef typename Eigen::internal::traits<TensorReductionOp>::Index Index;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- TensorReductionOp(const XprType& expr, const Dims& dims) : m_expr(expr), m_dims(dims)
- { }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer) : m_expr(expr), m_dims(dims), m_reducer(reducer)
- { }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const XprType& expression() const { return m_expr; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const Dims& dims() const { return m_dims; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const Op& reducer() const { return m_reducer; }
-
- protected:
- typename XprType::Nested m_expr;
- const Dims m_dims;
- const Op m_reducer;
-};
-
-template<typename ArgType, typename Device>
-struct TensorReductionEvaluatorBase;
-
-// Eval as rvalue
-template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
-struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
-{
- typedef internal::reducer_traits<Op, Device> ReducerTraits;
- typedef Dims ReducedDims;
- typedef TensorReductionOp<Op, Dims, ArgType, MakePointer_> XprType;
- typedef typename XprType::Index Index;
- typedef ArgType ChildType;
- typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
- static const int NumInputDims = internal::array_size<InputDimensions>::value;
- static const int NumReducedDims = internal::array_size<Dims>::value;
- static const int NumOutputDims = NumInputDims - NumReducedDims;
- typedef typename internal::conditional<NumOutputDims==0, Sizes<>, DSizes<Index, NumOutputDims> >::type Dimensions;
- typedef typename XprType::Scalar Scalar;
- typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Self;
- static const bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess;
- typedef typename internal::ReductionReturnType<Op, typename XprType::CoeffReturnType>::type CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- static const Index PacketSize = PacketType<CoeffReturnType, Device>::size;
-
- typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
- typedef StorageMemory<CoeffReturnType, Device> Storage;
- typedef typename Storage::Type EvaluatorPointerType;
-
- // Subset of strides of the input tensor for the non-reduced dimensions.
- // Indexed by output dimensions.
- static const int NumPreservedStrides = max_n_1<NumOutputDims>::size;
-
- enum {
- IsAligned = false,
- PacketAccess = Self::InputPacketAccess && ReducerTraits::PacketAccess,
- BlockAccess = false,
- PreferBlockAccess = true,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = false
- };
-
- typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlock;
- //===--------------------------------------------------------------------===//
-
- static const bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value;
- static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value;
- static const bool RunningFullReduction = (NumOutputDims==0);
-
- EIGEN_STRONG_INLINE TensorReductionEvaluatorBase(const XprType& op, const Device& device)
- : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device)
- {
- EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
- EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
- YOU_MADE_A_PROGRAMMING_MISTAKE);
-
- // Build the bitmap indicating if an input dimension is reduced or not.
- for (int i = 0; i < NumInputDims; ++i) {
- m_reduced[i] = false;
- }
- for (int i = 0; i < NumReducedDims; ++i) {
- eigen_assert(op.dims()[i] >= 0);
- eigen_assert(op.dims()[i] < NumInputDims);
- m_reduced[op.dims()[i]] = true;
- }
-
- const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
- internal::DimInitializer<Dimensions>::run(input_dims, m_reduced, &m_dimensions, &m_reducedDims);
-
- // Precompute output strides.
- if (NumOutputDims > 0) {
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- m_outputStrides[0] = 1;
- for (int i = 1; i < NumOutputDims; ++i) {
- m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
- m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
- }
- } else {
- m_outputStrides[NumOutputDims - 1] = 1;
- for (int i = NumOutputDims - 2; i >= 0; --i) {
- m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
- m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
- }
- }
- }
-
- // Precompute input strides.
- if (NumInputDims > 0) {
- array<Index, NumInputDims> input_strides;
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- input_strides[0] = 1;
- for (int i = 1; i < NumInputDims; ++i) {
- input_strides[i] = input_strides[i-1] * input_dims[i-1];
- }
- } else {
- input_strides.back() = 1;
- for (int i = NumInputDims - 2; i >= 0; --i) {
- input_strides[i] = input_strides[i + 1] * input_dims[i + 1];
- }
- }
-
- int outputIndex = 0;
- int reduceIndex = 0;
- for (int i = 0; i < NumInputDims; ++i) {
- if (m_reduced[i]) {
- m_reducedStrides[reduceIndex] = input_strides[i];
- ++reduceIndex;
- } else {
- m_preservedStrides[outputIndex] = input_strides[i];
- m_output_to_input_dim_map[outputIndex] = i;
- ++outputIndex;
- }
- }
- }
-
- // Special case for full reductions
- if (NumOutputDims == 0) {
- m_preservedStrides[0] = internal::array_prod(input_dims);
- }
-
- m_numValuesToReduce =
- NumOutputDims == 0
- ? internal::array_prod(input_dims)
- : (static_cast<int>(Layout) == static_cast<int>(ColMajor))
- ? m_preservedStrides[0]
- : m_preservedStrides[NumOutputDims - 1];
- }
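-
-  // Worked example (added for clarity): a ColMajor input of dims (4, 5, 6)
-  // reduced over dimension 0 has input strides (1, 4, 20), so
-  // m_dimensions = (5, 6), m_reducedStrides = (1),
-  // m_preservedStrides = (4, 20) and
-  // m_numValuesToReduce = m_preservedStrides[0] = 4: each output coefficient
-  // reduces 4 consecutive input values.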
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
-
- EIGEN_STRONG_INLINE
- bool evalSubExprsIfNeededCommon(EvaluatorPointerType data) {
- // Use the FullReducer if possible.
-    if ((RunningFullReduction && RunningOnSycl) || (RunningFullReduction &&
- internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
- ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) ||
- !RunningOnGPU))) {
- bool need_assign = false;
- if (!data) {
- m_result = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType))));
- data = m_result;
- need_assign = true;
- }
- Op reducer(m_reducer);
- internal::FullReducer<Self, Op, Device>::run(*this, reducer, m_device, data);
- return need_assign;
- }
-
- // Attempt to use an optimized reduction.
- else if ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || (RunningOnSycl)) {
- bool reducing_inner_dims = true;
- for (int i = 0; i < NumReducedDims; ++i) {
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- reducing_inner_dims &= m_reduced[i];
- } else {
- reducing_inner_dims &= m_reduced[NumInputDims - 1 - i];
- }
- }
- if (internal::InnerReducer<Self, Op, Device>::HasOptimizedImplementation &&
- (reducing_inner_dims || ReducingInnerMostDims)) {
- const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
- const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
- if (!data) {
- if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) || (RunningOnSycl)) {
- data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
- m_result = data;
- }
- else {
- return true;
- }
- }
- Op reducer(m_reducer);
-      // For SYCL this call always returns false.
- if (internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
- if (m_result) {
- m_device.deallocate_temp(m_result);
- m_result = NULL;
- }
- return true;
- } else {
- return (m_result != NULL);
- }
- }
-
- bool preserving_inner_dims = true;
- for (int i = 0; i < NumReducedDims; ++i) {
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- preserving_inner_dims &= m_reduced[NumInputDims - 1 - i];
- } else {
- preserving_inner_dims &= m_reduced[i];
- }
- }
- if (internal::OuterReducer<Self, Op, Device>::HasOptimizedImplementation &&
- preserving_inner_dims) {
- const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
- const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
- if (!data) {
- if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) || (RunningOnSycl)) {
- data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
- m_result = data;
- }
- else {
- return true;
- }
- }
- Op reducer(m_reducer);
-      // For SYCL this call always returns false.
- if (internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
- if (m_result) {
- m_device.deallocate_temp(m_result);
- m_result = NULL;
- }
- return true;
- } else {
- return (m_result != NULL);
- }
- }
- #if defined(EIGEN_USE_SYCL)
-    // If there is no optimised version for SYCL, the reduction expression must
-    // be broken into two subexpressions and the generic SYCL reducer used on
-    // the device.
- if(RunningOnSycl) {
- const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
- const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
- if (!data) {
- data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
- m_result = data;
- }
- Op reducer(m_reducer);
- internal::GenericReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
- return (m_result != NULL);
- }
- #endif
- }
- return true;
- }
-
-#ifdef EIGEN_USE_THREADS
- template <typename EvalSubExprsCallback>
- EIGEN_STRONG_INLINE
- void
- evalSubExprsIfNeededAsync(EvaluatorPointerType data,
- EvalSubExprsCallback done) {
- m_impl.evalSubExprsIfNeededAsync(NULL, [this, data, done](bool) {
- done(evalSubExprsIfNeededCommon(data));
- });
- }
-#endif
-
- EIGEN_STRONG_INLINE
- bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
- m_impl.evalSubExprsIfNeeded(NULL);
- return evalSubExprsIfNeededCommon(data);
- }
-
- EIGEN_STRONG_INLINE void cleanup() {
- m_impl.cleanup();
- if (m_result) {
- m_device.deallocate_temp(m_result);
- m_result = NULL;
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
- {
- if (( RunningFullReduction || RunningOnGPU) && m_result ) {
- return *(m_result + index);
- }
- Op reducer(m_reducer);
- if (ReducingInnerMostDims || RunningFullReduction) {
- const Index num_values_to_reduce =
- (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1];
- return internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstInput(index),
- num_values_to_reduce, reducer);
- } else {
- typename Self::CoeffReturnType accum = reducer.initialize();
- internal::GenericDimReducer<NumReducedDims-1, Self, Op>::reduce(*this, firstInput(index), reducer, &accum);
- return reducer.finalize(accum);
- }
- }
-
- // TODO(bsteiner): provide a more efficient implementation.
- template<int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
- {
- EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index + PacketSize - 1 < Index(internal::array_prod(dimensions())));
-
- if (RunningOnGPU && m_result) {
- return internal::pload<PacketReturnType>(m_result + index);
- }
-
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
- if (ReducingInnerMostDims) {
- const Index num_values_to_reduce =
- (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1];
- const Index firstIndex = firstInput(index);
- for (Index i = 0; i < PacketSize; ++i) {
- Op reducer(m_reducer);
- values[i] = internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstIndex + i * num_values_to_reduce,
- num_values_to_reduce, reducer);
- }
- } else if (PreservingInnerMostDims) {
- const Index firstIndex = firstInput(index);
- const int innermost_dim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : NumOutputDims - 1;
-      // TBD: extend this to the n innermost dimensions that we preserve.
- if (((firstIndex % m_dimensions[innermost_dim]) + PacketSize - 1) < m_dimensions[innermost_dim]) {
- Op reducer(m_reducer);
- typename Self::PacketReturnType accum = reducer.template initializePacket<typename Self::PacketReturnType>();
- internal::InnerMostDimPreserver<NumReducedDims-1, Self, Op>::reduce(*this, firstIndex, reducer, &accum);
- return reducer.finalizePacket(accum);
- } else {
- for (int i = 0; i < PacketSize; ++i) {
- values[i] = coeff(index + i);
- }
- }
- } else {
- for (int i = 0; i < PacketSize; ++i) {
- values[i] = coeff(index + i);
- }
- }
- PacketReturnType rslt = internal::pload<PacketReturnType>(values);
- return rslt;
- }
-
- // Must be called after evalSubExprsIfNeeded().
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
- if (RunningFullReduction && m_result) {
- return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
- } else {
- const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
- const double compute_cost = num_values_to_reduce * internal::functor_traits<Op>::Cost;
- return m_impl.costPerCoeff(vectorized) * num_values_to_reduce +
- TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
- }
- }
-
- EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
- EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
- EIGEN_DEVICE_FUNC const Device& device() const { return m_device; }
-#ifdef EIGEN_USE_SYCL
- // binding placeholder accessors to a command group handler for SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
- m_impl.bind(cgh);
- m_result.bind(cgh);
- }
-#endif
-
- private:
- template <int, typename, typename> friend struct internal::GenericDimReducer;
- template <typename, typename, bool, bool> friend struct internal::InnerMostDimReducer;
- template <int, typename, typename, bool> friend struct internal::InnerMostDimPreserver;
- template <typename S, typename O, typename D, bool V> friend struct internal::FullReducer;
-#ifdef EIGEN_USE_THREADS
- template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
-#endif
-#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
- template <int B, int N, typename S, typename R, typename I_> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
-#if defined(EIGEN_HAS_GPU_FP16)
- template <typename S, typename R, typename I_> KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits<Eigen::half>::type*);
- template <int B, int N, typename S, typename R, typename I_> KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits<Eigen::half>::type*);
- template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
-#endif
- template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
-
- template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
-#endif
-
-#if defined(EIGEN_USE_SYCL)
- template < typename Evaluator_, typename Op__> friend class TensorSycl::internal::GenericNondeterministicReducer;
-  // SYCL needs the generic reducer for the case where the reduction algorithm
-  // is neither an inner, an outer, nor a full reduction.
- template <typename, typename, typename> friend struct internal::GenericReducer;
-#endif
-
-
- template <typename S, typename O, typename D> friend struct internal::InnerReducer;
-
- struct BlockIteratorState {
- Index input_dim;
- Index output_size;
- Index output_count;
- };
-
- // Returns the Index in the input tensor of the first value that needs to be
- // used to compute the reduction at output index "index".
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
- if (ReducingInnerMostDims) {
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- return index * m_preservedStrides[0];
- } else {
- return index * m_preservedStrides[NumPreservedStrides - 1];
- }
- }
- // TBD: optimize the case where we preserve the innermost dimensions.
- Index startInput = 0;
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- for (int i = NumOutputDims - 1; i > 0; --i) {
- // This is index_i in the output tensor.
- const Index idx = index / m_outputStrides[i];
- startInput += idx * m_preservedStrides[i];
- index -= idx * m_outputStrides[i];
- }
- if (PreservingInnerMostDims) {
- eigen_assert(m_preservedStrides[0] == 1);
- startInput += index;
- } else {
- startInput += index * m_preservedStrides[0];
- }
- } else {
- for (int i = 0; i < NumOutputDims - 1; ++i) {
- // This is index_i in the output tensor.
- const Index idx = index / m_outputStrides[i];
- startInput += idx * m_preservedStrides[i];
- index -= idx * m_outputStrides[i];
- }
- if (PreservingInnerMostDims) {
- eigen_assert(m_preservedStrides[NumPreservedStrides - 1] == 1);
- startInput += index;
- } else {
- startInput += index * m_preservedStrides[NumPreservedStrides - 1];
- }
- }
- return startInput;
- }
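-
-  // Worked example (added for clarity), continuing the ColMajor (4, 5, 6)
-  // tensor reduced over dimension 0: output index k has output coordinates
-  // (k % 5, k / 5), so the generic path computes
-  //   firstInput(k) = (k / 5) * 20 + (k % 5) * 4 = 4 * k,
-  // the input offset of the first of the 4 values reduced into output k.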
-
- // Bitmap indicating if an input dimension is reduced or not.
- array<bool, NumInputDims> m_reduced;
- // Dimensions of the output of the operation.
- Dimensions m_dimensions;
- // Precomputed strides for the output tensor.
- array<Index, NumOutputDims> m_outputStrides;
- array<internal::TensorIntDivisor<Index>, NumOutputDims> m_fastOutputStrides;
- array<Index, NumPreservedStrides> m_preservedStrides;
- // Map from output to input dimension index.
- array<Index, NumOutputDims> m_output_to_input_dim_map;
- // How many values go into each reduction
- Index m_numValuesToReduce;
-
- // Subset of strides of the input tensor for the reduced dimensions.
- // Indexed by reduced dimensions.
- array<Index, NumReducedDims> m_reducedStrides;
- // Size of the input dimensions that are reduced.
- // Indexed by reduced dimensions.
- array<Index, NumReducedDims> m_reducedDims;
-
- // Evaluator for the input expression.
- TensorEvaluator<ArgType, Device> m_impl;
-
- // Operation to apply for computing the reduction.
- Op m_reducer;
-
- // For full reductions
-#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
- static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
- static const bool RunningOnSycl = false;
-#elif defined(EIGEN_USE_SYCL)
-  static const bool RunningOnSycl = internal::is_same<typename internal::remove_all<Device>::type, Eigen::SyclDevice>::value;
-  static const bool RunningOnGPU = false;
-#else
- static const bool RunningOnGPU = false;
- static const bool RunningOnSycl = false;
-#endif
- EvaluatorPointerType m_result;
-
- const Device EIGEN_DEVICE_REF m_device;
-};
-
-template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
-struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
-: public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> {
- typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Base;
- EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Device& device) : Base(op, device){}
-};
-
-
-template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_>
-struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice>
-: public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice> {
-
- typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice> Base;
- EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Eigen::SyclDevice& device) : Base(op, device){}
-  // The coeff function in the base class is recursive; recursion is not
-  // allowed inside a SYCL kernel, so coeff is overridden for the SYCL backend.
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::CoeffReturnType coeff(typename Base::Index index) const {
- return *(this->data() + index);
- }
- // The packet function in the base class is a recursive method, which is not standard layout and cannot be used in a SYCL kernel.
- // Therefore the packet function is overridden for the SYCL kernel.
- template<int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::PacketReturnType packet(typename Base::Index index) const {
- return internal::pload<typename Base::PacketReturnType>(this->data() + index);
- }
-};
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorReductionCuda.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorReductionCuda.h
deleted file mode 100644
index 68780cd..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorReductionCuda.h
+++ /dev/null
@@ -1,6 +0,0 @@
-
-#if defined(__clang__) || defined(__GNUC__)
-#warning "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorReductionGpu.h file"
-#endif
-
-#include "TensorReductionGpu.h"
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorReductionGpu.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorReductionGpu.h
deleted file mode 100644
index db4e8d8..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorReductionGpu.h
+++ /dev/null
@@ -1,966 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
-#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
-
-namespace Eigen {
-namespace internal {
-
-
-#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
-// Full reducers for GPU, don't vectorize for now
-
-// Reducer function that enables multiple GPU threads to safely accumulate at the same
-// output address. It reads the current value of the output variable and attempts to
-// update it with the new value. If another GPU thread has updated the content of the
-// output address in the meantime, it tries again.
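-// In outline, the protocol below is (a sketch, not part of the original file):
-//   oldval = *output; newval = reduce(accum, oldval);
-//   while (atomicCAS(output, oldval, newval) != oldval) { oldval = value read back; recompute newval; }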
-template <typename T, typename R>
-__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
-#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
- if (sizeof(T) == 4)
- {
- unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
- unsigned int newval = oldval;
- reducer.reduce(accum, reinterpret_cast<T*>(&newval));
- if (newval == oldval) {
- return;
- }
- unsigned int readback;
- while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
- oldval = readback;
- newval = oldval;
- reducer.reduce(accum, reinterpret_cast<T*>(&newval));
- if (newval == oldval) {
- return;
- }
- }
- }
- else if (sizeof(T) == 8) {
- unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
- unsigned long long newval = oldval;
- reducer.reduce(accum, reinterpret_cast<T*>(&newval));
- if (newval == oldval) {
- return;
- }
- unsigned long long readback;
- while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
- oldval = readback;
- newval = oldval;
- reducer.reduce(accum, reinterpret_cast<T*>(&newval));
- if (newval == oldval) {
- return;
- }
- }
- }
- else {
- gpu_assert(0 && "Wordsize not supported");
- }
-#else // EIGEN_CUDA_ARCH >= 300
- gpu_assert(0 && "Shouldn't be called on unsupported device");
-#endif // EIGEN_CUDA_ARCH >= 300
-}
-
-// We extend atomicExch to support extra data types
-template <typename Type>
-__device__ inline Type atomicExchCustom(Type* address, Type val) {
- return atomicExch(address, val);
-}
-
-template <>
-__device__ inline double atomicExchCustom(double* address, double val) {
- unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
- return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
-}
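-// CUDA/HIP provide no atomicExch overload for double, so the specialization above
-// reinterprets the 64-bit value as an unsigned long long for the exchange and
-// converts the result back with __longlong_as_double.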
-
-#ifdef EIGEN_HAS_GPU_FP16
-template <typename R>
-__device__ inline void atomicReduce(half2* output, half2 accum, R& reducer) {
- unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
- unsigned int newval = oldval;
- reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
- if (newval == oldval) {
- return;
- }
- unsigned int readback;
- while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
- oldval = readback;
- newval = oldval;
- reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
- if (newval == oldval) {
- return;
- }
- }
-}
-// The reduction must be associative: the reduction over the wide vector is not atomic
-// as a whole; only the individual half2 operations are atomic.
-template <typename R>
-__device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reducer) {
- half2* houtput=reinterpret_cast<half2*>(output);
- half2* haccum=reinterpret_cast<half2*>(&accum);
- for(int i=0;i<4;++i){
- atomicReduce(houtput+i,*(haccum+i),reducer);
- }
-}
-#endif // EIGEN_HAS_GPU_FP16
-
-template <>
-__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
-#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
- atomicAdd(output, accum);
-#else // EIGEN_CUDA_ARCH >= 300
- gpu_assert(0 && "Shouldn't be called on unsupported device");
-#endif // EIGEN_CUDA_ARCH >= 300
-}
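-// For plain summation the generic compare-and-swap loop is unnecessary: the
-// hardware atomicAdd performs the read-modify-write in a single step.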
-
-
-template <typename CoeffType, typename Index>
-__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
- const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
- const Index num_threads = blockDim.x * gridDim.x;
- for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
- output[i] = val;
- }
-}
-
-
-template <int BlockSize, int NumPerThread, typename Self,
- typename Reducer, typename Index>
-__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
- typename Self::CoeffReturnType* output, unsigned int* semaphore) {
-#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
- // Initialize the output value
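-  // The semaphore acts as a small state machine (description added for clarity):
-  //   0 = output not yet initialized, 1 = the first block is initializing,
-  //   2 = output initialized; blocks other than the first spin on
-  //   atomicCAS(semaphore, 2u, 2u) until they observe state 2.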
- const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
- if (gridDim.x == 1) {
- if (first_index == 0) {
- *output = reducer.initialize();
- }
- }
- else {
- if (threadIdx.x == 0) {
- unsigned int block = atomicCAS(semaphore, 0u, 1u);
- if (block == 0) {
- // We're the first block to run, initialize the output value
- atomicExchCustom(output, reducer.initialize());
- __threadfence();
- atomicExch(semaphore, 2u);
- }
- else {
- // Wait for the first block to initialize the output value.
- // Use atomicCAS here to ensure that the reads aren't cached
- unsigned int val;
- do {
- val = atomicCAS(semaphore, 2u, 2u);
- }
- while (val < 2u);
- }
- }
- }
-
- __syncthreads();
-
- eigen_assert(gridDim.x == 1 || *semaphore >= 2u);
-
- typename Self::CoeffReturnType accum = reducer.initialize();
- Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
- for (Index i = 0; i < max_iter; i+=BlockSize) {
- const Index index = first_index + i;
- eigen_assert(index < num_coeffs);
- typename Self::CoeffReturnType val = input.m_impl.coeff(index);
- reducer.reduce(val, &accum);
- }
-
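-  // Warp-level tree reduction (comment added for clarity): each step combines lane i
-  // with lane i+offset, halving offset until lane 0 holds the warp's partial result;
-  // that is why only lanes with (threadIdx.x & (warpSize - 1)) == 0 write out below.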
-#pragma unroll
- for (int offset = warpSize/2; offset > 0; offset /= 2) {
- #if defined(EIGEN_HIPCC)
- // use std::is_floating_point to determine the type of reduced_val
-    // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambiguous" error
- // and list the float and int versions of __shfl_down as the candidate functions.
- if (std::is_floating_point<typename Self::CoeffReturnType>::value) {
- reducer.reduce(__shfl_down(static_cast<float>(accum), offset, warpSize), &accum);
- } else {
- reducer.reduce(__shfl_down(static_cast<int>(accum), offset, warpSize), &accum);
- }
- #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
- reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
- #else
- reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
- #endif
- }
-
- if ((threadIdx.x & (warpSize - 1)) == 0) {
- atomicReduce(output, accum, reducer);
- }
-
- if (gridDim.x > 1 && threadIdx.x == 0) {
- // Let the last block reset the semaphore
- atomicInc(semaphore, gridDim.x + 1);
-#if defined(EIGEN_HIPCC)
- __threadfence_system();
-#endif
- }
-#else // EIGEN_CUDA_ARCH >= 300
- gpu_assert(0 && "Shouldn't be called on unsupported device");
-#endif // EIGEN_CUDA_ARCH >= 300
-}
-
-
-#ifdef EIGEN_HAS_GPU_FP16
-template <typename Self,
- typename Reducer, typename Index>
-__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
- packet_traits<Eigen::half>::type* scratch) {
- eigen_assert(blockDim.x == 1);
- eigen_assert(gridDim.x == 1);
- typedef packet_traits<Eigen::half>::type packet_type;
- Index packet_remainder =
- num_coeffs % Index(unpacket_traits<packet_type>::size);
- if (packet_remainder != 0) {
- half2* h2scratch = reinterpret_cast<half2*>(scratch);
- for (Index i = num_coeffs - packet_remainder; i + 2 <= num_coeffs; i += 2) {
- *h2scratch =
- __halves2half2(input.m_impl.coeff(i), input.m_impl.coeff(i + 1));
- h2scratch++;
- }
- if ((num_coeffs & 1) != 0) {
- half lastCoeff = input.m_impl.coeff(num_coeffs - 1);
- *h2scratch = __halves2half2(lastCoeff, reducer.initialize());
- }
- } else {
- *scratch = reducer.template initializePacket<packet_type>();
- }
-}
-
-template <typename Self,
- typename Reducer, typename Index>
-__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
- const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
- const Index num_threads = blockDim.x * gridDim.x;
- typedef typename packet_traits<Eigen::half>::type PacketType;
-
- const Index num_packets =
- num_coeffs / Index(unpacket_traits<PacketType>::size);
- PacketType* p_output = reinterpret_cast<PacketType*>(output);
- for (Index i = thread_id; i < num_packets; i += num_threads) {
- p_output[i] = reducer.template initializePacket<PacketType>();
- }
- Index packet_remainder =
- num_coeffs % Index(unpacket_traits<PacketType>::size);
- if (thread_id < packet_remainder) {
- output[num_coeffs - packet_remainder + thread_id] = reducer.initialize();
- }
-}
-
-template <int BlockSize, int NumPerThread, typename Self,
- typename Reducer, typename Index>
-__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
- half* output, packet_traits<Eigen::half>::type* scratch) {
- typedef typename packet_traits<Eigen::half>::type PacketType;
- const int packet_width = unpacket_traits<PacketType>::size;
- eigen_assert(NumPerThread % packet_width == 0);
- const Index first_index =
- blockIdx.x * BlockSize * NumPerThread + packet_width * threadIdx.x;
-
- // Initialize the output value if it wasn't initialized by the ReductionInitKernel
-
- if (gridDim.x == 1) {
- if (first_index == 0) {
- int rem = num_coeffs % packet_width;
- if (rem != 0) {
- half2* p_scratch = reinterpret_cast<half2*>(scratch);
- *scratch = reducer.template initializePacket<PacketType>();
- for (int i = 0; i < rem / 2; i++) {
- *p_scratch = __halves2half2(
- input.m_impl.coeff(num_coeffs - packet_width + 2 * i),
- input.m_impl.coeff(num_coeffs - packet_width + 2 * i + 1));
- p_scratch++;
- }
- if ((num_coeffs & 1) != 0) {
- half last = input.m_impl.coeff(num_coeffs - 1);
- *p_scratch = __halves2half2(last, reducer.initialize());
- }
- } else {
- *scratch = reducer.template initializePacket<PacketType>();
- }
- }
- __syncthreads();
- }
-
- PacketType accum = reducer.template initializePacket<PacketType>();
- const Index max_iter =
- numext::mini<Index>((num_coeffs - first_index) / packet_width,
- NumPerThread * BlockSize / packet_width);
- for (Index i = 0; i < max_iter; i += BlockSize) {
- const Index index = first_index + packet_width * i;
- eigen_assert(index + packet_width < num_coeffs);
- PacketType val = input.m_impl.template packet<Unaligned>(index);
- reducer.reducePacket(val, &accum);
- }
-
-#pragma unroll
- for (int offset = warpSize/2; offset > 0; offset /= 2) {
- #if defined(EIGEN_HIPCC)
- PacketType r1;
- half2* hr = reinterpret_cast<half2*>(&r1);
- half2* hacc = reinterpret_cast<half2*>(&accum);
- for (int i = 0; i < packet_width / 2; i++) {
- // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
- union { int i; half2 h; } wka_in, wka_out;
- wka_in.h = hacc[i];
- wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
- hr[i] = wka_out.h;
- }
- reducer.reducePacket(r1, &accum);
- #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
- PacketType r1;
- half2* hr = reinterpret_cast<half2*>(&r1);
- half2* hacc = reinterpret_cast<half2*>(&accum);
- for (int i = 0; i < packet_width / 2; i++) {
- hr[i] = __shfl_down(hacc[i], offset, warpSize);
- }
- reducer.reducePacket(r1, &accum);
- #else
- PacketType r1;
- half2* hr = reinterpret_cast<half2*>(&r1);
- half2* hacc = reinterpret_cast<half2*>(&accum);
- for (int i = 0; i < packet_width / 2; i++) {
- hr[i] = __shfl_down_sync(0xFFFFFFFF, hacc[i], (unsigned)offset, warpSize);
- }
- reducer.reducePacket(r1, &accum);
-
- #endif
- }
-
- if ((threadIdx.x & (warpSize - 1)) == 0) {
- atomicReduce(scratch, accum, reducer);
- }
-
- __syncthreads();
- half2* rv1 = reinterpret_cast<half2*>(scratch);
- if (packet_width > 2) {
- reducer.reducePacket(rv1[2], rv1);
- reducer.reducePacket(rv1[3], rv1 + 1);
- reducer.reducePacket(rv1[1], rv1);
- }
- if (gridDim.x == 1) {
- if (first_index == 0) {
- half tmp = __low2half(*rv1);
- reducer.reduce(__high2half(*rv1), &tmp);
- *output = tmp;
- }
- }
-}
-
-template <typename Op>
-__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, packet_traits<Eigen::half>::type* scratch) {
-  eigen_assert(threadIdx.x == 0);
- half2* pscratch = reinterpret_cast<half2*>(scratch);
- half tmp = __float2half(0.f);
- typedef packet_traits<Eigen::half>::type packet_type;
- for (int i = 0; i < unpacket_traits<packet_type>::size; i += 2) {
- reducer.reduce(__low2half(*pscratch), &tmp);
- reducer.reduce(__high2half(*pscratch), &tmp);
- pscratch++;
- }
- *output = tmp;
-}
-
-#endif // EIGEN_HAS_GPU_FP16
-
-template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
-struct FullReductionLauncher {
- static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
- gpu_assert(false && "Should only be called on doubles, floats and half floats");
- }
-};
-
-// Specialization for float and double
-template <typename Self, typename Op, typename OutputType, bool PacketAccess>
-struct FullReductionLauncher<
- Self, Op, OutputType, PacketAccess,
- typename internal::enable_if<
- internal::is_same<float, OutputType>::value ||
- internal::is_same<double, OutputType>::value,
- void>::type> {
- static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
-
- typedef typename Self::Index Index;
- const int block_size = 256;
- const int num_per_thread = 128;
- const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
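-    // Illustrative numbers (not from the original file): each block covers
-    // 256 * 128 = 32768 coefficients, so reducing 1048576 floats launches
-    // divup(1048576, 32768) = 32 blocks.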
-
- unsigned int* semaphore = NULL;
- if (num_blocks > 1) {
- semaphore = device.semaphore();
- }
-
- LAUNCH_GPU_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
- num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
- }
-};
-
-#ifdef EIGEN_HAS_GPU_FP16
-template <typename Self, typename Op>
-struct FullReductionLauncher<Self, Op, Eigen::half, false> {
- static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
- gpu_assert(false && "Should not be called since there is no packet accessor");
- }
-};
-
-template <typename Self, typename Op>
-struct FullReductionLauncher<Self, Op, Eigen::half, true> {
- static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
- typedef typename Self::Index Index;
- typedef typename packet_traits<Eigen::half>::type PacketType;
-
- const int block_size = 256;
- const int num_per_thread = 128;
- const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
- PacketType* scratch = static_cast<PacketType*>(device.scratchpad());
- // half2* scratch = static_cast<half2*>(device.scratchpad());
-
- if (num_blocks > 1) {
-      // We initialize the output and the scratchpad outside the reduction kernel when we can't be sure that there
-      // won't be race conditions between multiple thread blocks.
- LAUNCH_GPU_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
- 1, 1, 0, device, reducer, self, num_coeffs, scratch);
- }
-
- LAUNCH_GPU_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
- num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);
-
- if (num_blocks > 1) {
- LAUNCH_GPU_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
- 1, 1, 0, device, reducer, output, scratch);
- }
- }
-};
-#endif // EIGEN_HAS_GPU_FP16
-
-
-template <typename Self, typename Op, bool Vectorizable>
-struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
-  // Unfortunately NVIDIA doesn't handle exotic types such as complex well,
-  // so we reduce the scope of the optimized version of the code to the simple cases
-  // of doubles, floats and half floats.
-#ifdef EIGEN_HAS_GPU_FP16
- static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
- (internal::is_same<typename Self::CoeffReturnType, float>::value ||
- internal::is_same<typename Self::CoeffReturnType, double>::value ||
- (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
-#else // EIGEN_HAS_GPU_FP16
- static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
- (internal::is_same<typename Self::CoeffReturnType, float>::value ||
- internal::is_same<typename Self::CoeffReturnType, double>::value);
-#endif // EIGEN_HAS_GPU_FP16
-
- template <typename OutputType>
- static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
- gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
- const Index num_coeffs = array_prod(self.m_impl.dimensions());
- // Don't crash when we're called with an input tensor of size 0.
- if (num_coeffs == 0) {
- return;
- }
-
- FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
- }
-};
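-// Usage sketch (illustration only, not part of the original file): a full
-// reduction such as
-//   result.device(gpu_device) = t.sum();   // t: float tensor, result: rank-0 tensor
-// dispatches to FullReducer::run above when HasOptimizedImplementation holds.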
-
-
-template <int NumPerThread, typename Self,
- typename Reducer, typename Index>
-__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
- typename Self::CoeffReturnType* output) {
-#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
- typedef typename Self::CoeffReturnType Type;
- eigen_assert(blockDim.y == 1);
- eigen_assert(blockDim.z == 1);
- eigen_assert(gridDim.y == 1);
- eigen_assert(gridDim.z == 1);
-
- const int unroll_times = 16;
- eigen_assert(NumPerThread % unroll_times == 0);
-
- const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
- const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
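-  // Layout note (added for clarity): the input is addressed as a row-major
-  // [num_preserved_coeffs x num_coeffs_to_reduce] matrix; each row yields one output
-  // coefficient and is tiled into input_col_blocks column blocks of width
-  // blockDim.x * NumPerThread.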
-
- const Index num_threads = blockDim.x * gridDim.x;
- const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-
- // Initialize the output values if they weren't initialized by the ReductionInitKernel
- if (gridDim.x == 1) {
- for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
- output[i] = reducer.initialize();
- }
- __syncthreads();
- }
-
- for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
- const Index row = i / input_col_blocks;
-
- if (row < num_preserved_coeffs) {
- const Index col_block = i % input_col_blocks;
- const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
-
- Type reduced_val = reducer.initialize();
-
- for (Index j = 0; j < NumPerThread; j += unroll_times) {
- const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
- if (last_col >= num_coeffs_to_reduce) {
- for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
- const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
- reducer.reduce(val, &reduced_val);
- }
- break;
- } else {
- // Faster version of the loop with no branches after unrolling.
-#pragma unroll
- for (int k = 0; k < unroll_times; ++k) {
- const Index col = col_begin + blockDim.x * (j + k);
- reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
- }
- }
- }
-
-#pragma unroll
- for (int offset = warpSize/2; offset > 0; offset /= 2) {
- #if defined(EIGEN_HIPCC)
- // use std::is_floating_point to determine the type of reduced_val
-      // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambiguous" error
- // and list the float and int versions of __shfl_down as the candidate functions.
- if (std::is_floating_point<Type>::value) {
- reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
- } else {
- reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
- }
- #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
- reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
- #else
- reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
- #endif
- }
-
- if ((threadIdx.x & (warpSize - 1)) == 0) {
- atomicReduce(&(output[row]), reduced_val, reducer);
- }
- }
- }
-#else // EIGEN_CUDA_ARCH >= 300
- gpu_assert(0 && "Shouldn't be called on unsupported device");
-#endif // EIGEN_CUDA_ARCH >= 300
-}
-
-#ifdef EIGEN_HAS_GPU_FP16
-
-template <int NumPerThread, typename Self,
- typename Reducer, typename Index>
-__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
- half* output) {
- eigen_assert(blockDim.y == 1);
- eigen_assert(blockDim.z == 1);
- eigen_assert(gridDim.y == 1);
- eigen_assert(gridDim.z == 1);
-
- typedef typename packet_traits<Eigen::half>::type PacketType;
- const int packet_width = unpacket_traits<PacketType>::size;
- const int unroll_times = 16 / packet_width;
- eigen_assert(NumPerThread % unroll_times == 0);
- eigen_assert(unroll_times % 2 == 0);
-
- const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
- const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);
-
- const Index num_threads = blockDim.x * gridDim.x;
- const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-
- // Initialize the output values if they weren't initialized by the ReductionInitKernel
- if (gridDim.x == 1) {
- Index i = packet_width * thread_id;
- for (; i + packet_width <= num_preserved_coeffs;
- i += packet_width * num_threads) {
- PacketType* poutput = reinterpret_cast<PacketType*>(output + i);
- *poutput = reducer.template initializePacket<PacketType>();
- }
- if (i < num_preserved_coeffs) {
- output[i] = reducer.initialize();
- }
- __syncthreads();
- }
-
- for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
- const Index row = 2 * (i / input_col_blocks); // everybody takes 2 rows
-
- if (row + 1 < num_preserved_coeffs) {
- const Index col_block = i % input_col_blocks;
- const Index col_begin =
- packet_width * (col_block * blockDim.x * NumPerThread + threadIdx.x);
-
- PacketType reduced_val1 = reducer.template initializePacket<PacketType>();
- PacketType reduced_val2 = reducer.template initializePacket<PacketType>();
-
- for (Index j = 0; j < NumPerThread; j += unroll_times) {
- const Index last_col =
- col_begin + blockDim.x * (j + unroll_times - 1) * packet_width;
- if (last_col >= num_coeffs_to_reduce) {
- Index col = col_begin + blockDim.x * j;
- for (; col + packet_width <= num_coeffs_to_reduce;
- col += blockDim.x) {
- const PacketType val1 = input.m_impl.template packet<Unaligned>(
- row * num_coeffs_to_reduce + col);
- reducer.reducePacket(val1, &reduced_val1);
- const PacketType val2 = input.m_impl.template packet<Unaligned>(
- (row + 1) * num_coeffs_to_reduce + col);
- reducer.reducePacket(val2, &reduced_val2);
- }
- if (col < num_coeffs_to_reduce) {
- PacketType r1 = reducer.template initializePacket<PacketType>();
- PacketType r2 = reducer.template initializePacket<PacketType>();
- half2* hr1 = reinterpret_cast<half2*>(&r1);
- half2* hr2 = reinterpret_cast<half2*>(&r2);
- while (col + 1 < num_coeffs_to_reduce) {
- *hr1 = __halves2half2(
- input.m_impl.coeff(row * num_coeffs_to_reduce + col),
- input.m_impl.coeff(row * num_coeffs_to_reduce + col + 1));
- *hr2 = __halves2half2(
- input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col),
- input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col +
- 1));
- hr1++;
- hr2++;
- col += 2;
- }
- if (col < num_coeffs_to_reduce) {
-              // Peel off the last (odd) coefficient.
- const half last1 =
- input.m_impl.coeff(row * num_coeffs_to_reduce + col);
- *hr1 = __halves2half2(last1, reducer.initialize());
- const half last2 =
- input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col);
- *hr2 = __halves2half2(last2, reducer.initialize());
- }
- reducer.reducePacket(r1, &reduced_val1);
- reducer.reducePacket(r2, &reduced_val2);
- }
- break;
- } else {
- // Faster version of the loop with no branches after unrolling.
-#pragma unroll
- for (int k = 0; k < unroll_times; ++k) {
- const Index col = col_begin + blockDim.x * (j + k) * packet_width;
- reducer.reducePacket(input.m_impl.template packet<Unaligned>(
- row * num_coeffs_to_reduce + col),
- &reduced_val1);
- reducer.reducePacket(input.m_impl.template packet<Unaligned>(
- (row + 1) * num_coeffs_to_reduce + col),
- &reduced_val2);
- }
- }
- }
-
-#pragma unroll
- for (int offset = warpSize/2; offset > 0; offset /= 2) {
- #if defined(EIGEN_HIPCC)
- PacketType r1;
- PacketType r2;
- half2* hr1 = reinterpret_cast<half2*>(&r1);
- half2* hr2 = reinterpret_cast<half2*>(&r2);
- half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
- half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
- for (int i = 0; i < packet_width / 2; i++) {
- // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
- union { int i; half2 h; } wka_in1, wka_out1;
- wka_in1.h = rv1[i];
- wka_out1.i = __shfl_down(wka_in1.i, offset, warpSize);
- hr1[i] = wka_out1.h;
-
- union { int i; half2 h; } wka_in2, wka_out2;
- wka_in2.h = rv2[i];
- wka_out2.i = __shfl_down(wka_in2.i, offset, warpSize);
- hr2[i] = wka_out2.h;
- }
- reducer.reducePacket(r1, &reduced_val1);
- reducer.reducePacket(r2, &reduced_val2);
- #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
- PacketType r1;
- PacketType r2;
- half2* hr1 = reinterpret_cast<half2*>(&r1);
- half2* hr2 = reinterpret_cast<half2*>(&r2);
- half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
- half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
- for (int i = 0; i < packet_width / 2; i++) {
- hr1[i] = __shfl_down(rv1[i], offset, warpSize);
- hr2[i] = __shfl_down(rv2[i], offset, warpSize);
- }
- reducer.reducePacket(r1, &reduced_val1);
- reducer.reducePacket(r2, &reduced_val2);
- #else
- PacketType r1;
- PacketType r2;
- half2* hr1 = reinterpret_cast<half2*>(&r1);
- half2* hr2 = reinterpret_cast<half2*>(&r2);
- half2* rr1 = reinterpret_cast<half2*>(&reduced_val1);
- half2* rr2 = reinterpret_cast<half2*>(&reduced_val2);
- for (int i = 0; i < packet_width / 2; i++) {
- hr1[i] =
- __shfl_down_sync(0xFFFFFFFF, rr1[i], (unsigned)offset, warpSize);
- hr2[i] =
- __shfl_down_sync(0xFFFFFFFF, rr2[i], (unsigned)offset, warpSize);
- }
- reducer.reducePacket(r1, &reduced_val1);
- reducer.reducePacket(r2, &reduced_val2);
-
- #endif
- }
- half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
- half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
- half2 val;
- if (packet_width > 2) {
- reducer.reducePacket(rv1[2], rv1);
- reducer.reducePacket(rv1[3], rv1 + 1);
- reducer.reducePacket(rv1[1], rv1);
- reducer.reducePacket(rv2[2], rv2);
- reducer.reducePacket(rv2[3], rv2 + 1);
- reducer.reducePacket(rv2[1], rv2);
- }
- half val1 = __low2half(*rv1);
- reducer.reduce(__high2half(*rv1), &val1);
- half val2 = __low2half(*rv2);
- reducer.reduce(__high2half(*rv2), &val2);
- val = __halves2half2(val1, val2);
- if ((threadIdx.x & (warpSize - 1)) == 0) {
- half* loc = output + row;
- atomicReduce((half2*)loc, val, reducer);
- }
- }
- }
-}
-
-#endif // EIGEN_HAS_GPU_FP16
-
-template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
-struct InnerReductionLauncher {
- static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
- gpu_assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
- return true;
- }
-};
-
-// Specialization for float and double
-template <typename Self, typename Op, typename OutputType, bool PacketAccess>
-struct InnerReductionLauncher<
- Self, Op, OutputType, PacketAccess,
- typename internal::enable_if<
- internal::is_same<float, OutputType>::value ||
- internal::is_same<double, OutputType>::value,
- void>::type> {
- static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
- typedef typename Self::Index Index;
-
- const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
- const int block_size = 256;
- const int num_per_thread = 128;
- const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
- const int max_blocks = device.getNumGpuMultiProcessors() *
- device.maxGpuThreadsPerMultiProcessor() / block_size;
- const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
-
- if (num_blocks > 1) {
-      // We initialize the outputs outside the reduction kernel when we can't be sure that there
-      // won't be race conditions between multiple thread blocks.
- const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
- const int max_blocks = device.getNumGpuMultiProcessors() *
- device.maxGpuThreadsPerMultiProcessor() / 1024;
- const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
- LAUNCH_GPU_KERNEL((ReductionInitKernel<OutputType, Index>),
- num_blocks, 1024, 0, device, reducer.initialize(),
- num_preserved_vals, output);
- }
-
- LAUNCH_GPU_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
- num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
-
- return false;
- }
-};
-
-#ifdef EIGEN_HAS_GPU_FP16
-template <typename Self, typename Op>
-struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
- static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
- gpu_assert(false && "Should not be called since there is no packet accessor");
- return true;
- }
-};
-
-template <typename Self, typename Op>
-struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
- static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
- typedef typename Self::Index Index;
-
- if (num_preserved_vals % 2 != 0) {
- // Not supported yet, revert to the slower code path
- return true;
- }
-
- const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
- const int block_size = /*256*/128;
- const int num_per_thread = /*128*/64;
- const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
- const int max_blocks = device.getNumGpuMultiProcessors() *
- device.maxGpuThreadsPerMultiProcessor() / block_size;
- const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
-
- if (num_blocks > 1) {
-      // We initialize the outputs outside the reduction kernel when we can't be sure that there
-      // won't be race conditions between multiple thread blocks.
- LAUNCH_GPU_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
- 1, 1, 0, device, reducer, self, num_preserved_vals, output);
- }
-
- LAUNCH_GPU_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
- num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
-
- return false;
- }
-};
-#endif // EIGEN_HAS_GPU_FP16
-
-
-template <typename Self, typename Op>
-struct InnerReducer<Self, Op, GpuDevice> {
-  // Unfortunately NVIDIA doesn't handle exotic types such as complex well,
-  // so we reduce the scope of the optimized version of the code to the simple cases
-  // of floats, doubles and half floats.
-#ifdef EIGEN_HAS_GPU_FP16
- static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
- (internal::is_same<typename Self::CoeffReturnType, float>::value ||
- internal::is_same<typename Self::CoeffReturnType, double>::value ||
- (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
-#else // EIGEN_HAS_GPU_FP16
- static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
- (internal::is_same<typename Self::CoeffReturnType, float>::value ||
- internal::is_same<typename Self::CoeffReturnType, double>::value);
-#endif // EIGEN_HAS_GPU_FP16
-
- template <typename OutputType>
- static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
- gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
- const Index num_coeffs = array_prod(self.m_impl.dimensions());
- // Don't crash when we're called with an input tensor of size 0.
- if (num_coeffs == 0) {
- return true;
- }
- // It's faster to use the usual code.
- if (num_coeffs_to_reduce <= 128) {
- return true;
- }
-
- return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
- }
-};
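-// Usage sketch (illustration only, not part of the original file): reducing the
-// contiguous dimension, e.g. out.device(gpu_device) = t.sum(reduce_dims), lands in
-// InnerReducer::run above; returning true tells the caller to fall back to the
-// generic evaluator path (used for size-0 inputs or a reduced extent <= 128).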
-
-template <int NumPerThread, typename Self,
- typename Reducer, typename Index>
-__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
- typename Self::CoeffReturnType* output) {
- const Index num_threads = blockDim.x * gridDim.x;
- const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
- // Initialize the output values if they weren't initialized by the ReductionInitKernel
- if (gridDim.x == 1) {
- for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
- output[i] = reducer.initialize();
- }
- __syncthreads();
- }
-
- // Do the reduction.
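-  // Access-pattern note (added for clarity): each iteration assigns a thread one
-  // (output column, block of NumPerThread rows) pair; consecutive threads take
-  // consecutive columns, so the strided loads below coalesce across the preserved
-  // dimension, and one atomicReduce per pair merges the partial into output[input_col].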
- const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
- for (Index i = thread_id; i < max_iter; i += num_threads) {
- const Index input_col = i % num_preserved_coeffs;
- const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
- typename Self::CoeffReturnType reduced_val = reducer.initialize();
- const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
- for (Index j = input_row; j < max_row; j++) {
- typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
- reducer.reduce(val, &reduced_val);
- }
- atomicReduce(&(output[input_col]), reduced_val, reducer);
- }
-}
-
-
-template <typename Self, typename Op>
-struct OuterReducer<Self, Op, GpuDevice> {
-  // Unfortunately NVIDIA doesn't handle exotic types such as complex well,
-  // so we reduce the scope of the optimized version of the code to the simple cases
-  // of floats and doubles.
- static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
- (internal::is_same<typename Self::CoeffReturnType, float>::value ||
- internal::is_same<typename Self::CoeffReturnType, double>::value);
- template <typename Device, typename OutputType>
- static
- #if !defined(EIGEN_HIPCC)
-  // FIXME : leaving this EIGEN_DEVICE_FUNC in results in the following runtime error
- // (in the cxx11_tensor_reduction_gpu test)
- //
- // terminate called after throwing an instance of 'std::runtime_error'
- // what(): No device code available for function: _ZN5Eigen8internal20OuterReductionKernelIL...
- //
-  // don't know why this happens (or why it is a runtime error instead of a compile-time error)
- //
- // this will be fixed by HIP PR#457
- EIGEN_DEVICE_FUNC
- #endif
- bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
- gpu_assert(false && "Should only be called to reduce doubles or floats on a gpu device");
- return true;
- }
-
- static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
- typedef typename Self::Index Index;
-
- // It's faster to use the usual code.
- if (num_coeffs_to_reduce <= 32) {
- return true;
- }
-
- const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
- const int block_size = 256;
- const int num_per_thread = 16;
- const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
- const int max_blocks = device.getNumGpuMultiProcessors() *
- device.maxGpuThreadsPerMultiProcessor() / block_size;
- const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
-
- if (num_blocks > 1) {
-      // We initialize the outputs outside the reduction kernel when we can't be sure that there
-      // won't be race conditions between multiple thread blocks.
- const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
- const int max_blocks = device.getNumGpuMultiProcessors() *
- device.maxGpuThreadsPerMultiProcessor() / 1024;
- const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
- LAUNCH_GPU_KERNEL((ReductionInitKernel<float, Index>),
- num_blocks, 1024, 0, device, reducer.initialize(),
- num_preserved_vals, output);
- }
-
- LAUNCH_GPU_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
- num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
-
- return false;
- }
-};
-
-#endif // defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
-
-
-} // end namespace internal
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorReductionSycl.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorReductionSycl.h
deleted file mode 100644
index 474eba0..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorReductionSycl.h
+++ /dev/null
@@ -1,582 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli Codeplay Software Ltd.
-// Ralph Potter Codeplay Software Ltd.
-// Luke Iwanski Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/*****************************************************************
- * TensorReductionSycl.h
- *
- * \brief:
- * This is the specialization of the reduction operation. A two-phase reduction
- * approach is used because the GPU provides no global synchronization of global
- * memory across different work-groups/thread blocks. To solve this, two kernels
- * are used to reduce the data: in the first phase each work-group/thread block
- * reduces its portion of the data locally and saves the partial result to global
- * memory. In the second phase (global reduction) a single work-group/thread block
- * reduces the intermediate results into one element.
- * Here is an NVIDIA presentation explaining the optimized two-phase reduction
- * algorithm on GPU:
- * https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf
- *
- *****************************************************************/
-
-#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
-#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
-namespace Eigen {
-namespace TensorSycl {
-namespace internal {
-
-template <typename Op, typename CoeffReturnType, typename Index, bool Vectorizable>
-struct OpDefiner {
- typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, Vectorizable>::PacketReturnType PacketReturnType;
- typedef Op type;
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Op &op) { return op; }
-
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
- const Index &) {
- return accumulator;
- }
-};
-
-template <typename CoeffReturnType, typename Index>
-struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, false> {
- typedef Eigen::internal::SumReducer<CoeffReturnType> type;
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
- return type();
- }
-
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType finalise_op(const CoeffReturnType &accumulator,
- const Index &scale) {
- ::Eigen::internal::scalar_quotient_op<CoeffReturnType> quotient_op;
- return quotient_op(accumulator, CoeffReturnType(scale));
- }
-};
-
-template <typename CoeffReturnType, typename Index>
-struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, true> {
- typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, true>::PacketReturnType PacketReturnType;
- typedef Eigen::internal::SumReducer<CoeffReturnType> type;
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
- return type();
- }
-
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
- const Index &scale) {
- return ::Eigen::internal::pdiv(accumulator, ::Eigen::internal::pset1<PacketReturnType>(CoeffReturnType(scale)));
- }
-};
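-// Note (added for clarity): OpDefiner rewrites MeanReducer as SumReducer during
-// accumulation and applies the division only in finalise_op. For example, a mean
-// over 256 coefficients accumulates plain sums and divides the final accumulator
-// (or packet) by 256 exactly once.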
-
-template <typename CoeffReturnType, typename OpType, typename InputAccessor, typename OutputAccessor, typename Index,
- Index local_range>
-struct SecondStepFullReducer {
- typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
- LocalAccessor;
- typedef OpDefiner<OpType, CoeffReturnType, Index, true> OpDef;
- typedef typename OpDef::type Op;
- LocalAccessor scratch;
- InputAccessor aI;
- OutputAccessor outAcc;
- Op op;
- SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
- : scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {}
-
- void operator()(cl::sycl::nd_item<1> itemID) {
-    // Our empirical research shows that the best performance is achieved when each
-    // thread reduces exactly one element in the second step; the second-step
-    // reduction time is then almost negligible. Hence, the input size of the second
-    // step is fixed to the local size, so there is only one element read per thread.
-    // The algorithm must be changed if the number of elements reduced per thread in
-    // the second step is greater than 1; otherwise the result will be wrong.
- const Index localid = itemID.get_local_id(0);
- auto aInPtr = aI.get_pointer() + localid;
- auto aOutPtr = outAcc.get_pointer();
- CoeffReturnType *scratchptr = scratch.get_pointer();
- CoeffReturnType accumulator = *aInPtr;
-
- scratchptr[localid] = op.finalize(accumulator);
- for (Index offset = itemID.get_local_range(0) / 2; offset > 0; offset /= 2) {
- itemID.barrier(cl::sycl::access::fence_space::local_space);
- if (localid < offset) {
- op.reduce(scratchptr[localid + offset], &accumulator);
- scratchptr[localid] = op.finalize(accumulator);
- }
- }
- if (localid == 0) *aOutPtr = op.finalize(accumulator);
- }
-};
-
-// Full reduction, first phase. In this version vectorization is enabled and the
-// reduction accepts any generic reducer op (e.g. max, min, sum, mean, iamax, iamin).
-template <typename Evaluator, typename OpType, typename Evaluator::Index local_range>
-class FullReductionKernelFunctor {
- public:
- typedef typename Evaluator::CoeffReturnType CoeffReturnType;
- typedef typename Evaluator::Index Index;
- typedef OpDefiner<OpType, typename Evaluator::CoeffReturnType, Index,
- (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
- OpDef;
-
- typedef typename OpDef::type Op;
- typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
- typedef typename Evaluator::PacketReturnType PacketReturnType;
- typedef
- typename ::Eigen::internal::conditional<(Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess),
- PacketReturnType, CoeffReturnType>::type OutType;
- typedef cl::sycl::accessor<OutType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
- LocalAccessor;
- LocalAccessor scratch;
- Evaluator evaluator;
- EvaluatorPointerType final_output;
- Index rng;
- Op op;
-
- FullReductionKernelFunctor(LocalAccessor scratch_, Evaluator evaluator_, EvaluatorPointerType final_output_,
- Index rng_, OpType op_)
- : scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {}
-
- void operator()(cl::sycl::nd_item<1> itemID) { compute_reduction(itemID); }
-
- template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<Vect>::type compute_reduction(
- const cl::sycl::nd_item<1> &itemID) {
- auto output_ptr = final_output.get_pointer();
- Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize;
- Index globalid = itemID.get_global_id(0);
- Index localid = itemID.get_local_id(0);
- Index step = Evaluator::PacketSize * itemID.get_global_range(0);
- Index start = Evaluator::PacketSize * globalid;
- // vectorizable parts
- PacketReturnType packetAccumulator = op.template initializePacket<PacketReturnType>();
- for (Index i = start; i < VectorizedRange; i += step) {
- op.template reducePacket<PacketReturnType>(evaluator.impl().template packet<Unaligned>(i), &packetAccumulator);
- }
- globalid += VectorizedRange;
- // non vectorizable parts
- for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
- op.template reducePacket<PacketReturnType>(
- ::Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, Evaluator::PacketSize>::convert_to_packet_type(
- evaluator.impl().coeff(i), op.initialize()),
- &packetAccumulator);
- }
- scratch[localid] = packetAccumulator =
- OpDef::finalise_op(op.template finalizePacket<PacketReturnType>(packetAccumulator), rng);
-    // reduction part; the local size is always a power of 2
- EIGEN_UNROLL_LOOP
- for (Index offset = local_range / 2; offset > 0; offset /= 2) {
- itemID.barrier(cl::sycl::access::fence_space::local_space);
- if (localid < offset) {
- op.template reducePacket<PacketReturnType>(scratch[localid + offset], &packetAccumulator);
- scratch[localid] = op.template finalizePacket<PacketReturnType>(packetAccumulator);
- }
- }
- if (localid == 0) {
- output_ptr[itemID.get_group(0)] =
- op.finalizeBoth(op.initialize(), op.template finalizePacket<PacketReturnType>(packetAccumulator));
- }
- }
-
- template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<!Vect>::type compute_reduction(
- const cl::sycl::nd_item<1> &itemID) {
- auto output_ptr = final_output.get_pointer();
- Index globalid = itemID.get_global_id(0);
- Index localid = itemID.get_local_id(0);
-    // scalar (non-vectorized) path
-    CoeffReturnType accumulator = op.initialize();
- for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
- op.reduce(evaluator.impl().coeff(i), &accumulator);
- }
- scratch[localid] = accumulator = OpDef::finalise_op(op.finalize(accumulator), rng);
-
-    // reduction part; the local size is always a power of 2
- EIGEN_UNROLL_LOOP
- for (Index offset = local_range / 2; offset > 0; offset /= 2) {
- itemID.barrier(cl::sycl::access::fence_space::local_space);
- if (localid < offset) {
- op.reduce(scratch[localid + offset], &accumulator);
- scratch[localid] = op.finalize(accumulator);
- }
- }
- if (localid == 0) {
- output_ptr[itemID.get_group(0)] = op.finalize(accumulator);
- }
- }
-};
-
-template <typename Evaluator, typename OpType>
-class GenericNondeterministicReducer {
- public:
- typedef typename Evaluator::CoeffReturnType CoeffReturnType;
- typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
- typedef typename Evaluator::Index Index;
- typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
- typedef typename OpDef::type Op;
- template <typename Scratch>
- GenericNondeterministicReducer(Scratch, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType functor_,
- Index range_, Index num_values_to_reduce_)
- : evaluator(evaluator_),
- output_accessor(output_accessor_),
- functor(OpDef::get_op(functor_)),
- range(range_),
- num_values_to_reduce(num_values_to_reduce_) {}
-
- void operator()(cl::sycl::nd_item<1> itemID) {
- auto output_accessor_ptr = output_accessor.get_pointer();
-    /// static_cast added as a naive workaround for the qualifier-drop error
- Index globalid = static_cast<Index>(itemID.get_global_linear_id());
- if (globalid < range) {
- CoeffReturnType accum = functor.initialize();
- Eigen::internal::GenericDimReducer<Evaluator::NumReducedDims - 1, Evaluator, Op>::reduce(
- evaluator, evaluator.firstInput(globalid), functor, &accum);
- output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce);
- }
- }
-
- private:
- Evaluator evaluator;
- EvaluatorPointerType output_accessor;
- Op functor;
- Index range;
- Index num_values_to_reduce;
-};
-
-enum class reduction_dim { inner_most, outer_most };
-// Partial (dimension-preserving) reduction kernel.
-template <typename Evaluator, typename OpType, typename PannelParameters, reduction_dim rt>
-struct PartialReductionKernel {
- typedef typename Evaluator::CoeffReturnType CoeffReturnType;
- typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
- typedef typename Evaluator::Index Index;
- typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
- typedef typename OpDef::type Op;
- typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
- ScratchAcc;
- ScratchAcc scratch;
- Evaluator evaluator;
- EvaluatorPointerType output_accessor;
- Op op;
- const Index preserve_elements_num_groups;
- const Index reduce_elements_num_groups;
- const Index num_coeffs_to_preserve;
- const Index num_coeffs_to_reduce;
-
- PartialReductionKernel(ScratchAcc scratch_, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType op_,
- const Index preserve_elements_num_groups_, const Index reduce_elements_num_groups_,
- const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_)
- : scratch(scratch_),
- evaluator(evaluator_),
- output_accessor(output_accessor_),
- op(OpDef::get_op(op_)),
- preserve_elements_num_groups(preserve_elements_num_groups_),
- reduce_elements_num_groups(reduce_elements_num_groups_),
- num_coeffs_to_preserve(num_coeffs_to_preserve_),
- num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId,
- CoeffReturnType &accumulator) {
- if (globalPId >= num_coeffs_to_preserve) {
- return;
- }
- Index global_offset = rt == reduction_dim::outer_most ? globalPId + (globalRId * num_coeffs_to_preserve)
- : globalRId + (globalPId * num_coeffs_to_reduce);
- Index localOffset = globalRId;
-
- const Index per_thread_local_stride = PannelParameters::LocalThreadSizeR * reduce_elements_num_groups;
- const Index per_thread_global_stride =
- rt == reduction_dim::outer_most ? num_coeffs_to_preserve * per_thread_local_stride : per_thread_local_stride;
- for (Index i = globalRId; i < num_coeffs_to_reduce; i += per_thread_local_stride) {
- op.reduce(evaluator.impl().coeff(global_offset), &accumulator);
- localOffset += per_thread_local_stride;
- global_offset += per_thread_global_stride;
- }
- }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
- const Index linearLocalThreadId = itemID.get_local_id(0);
- Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP
- : linearLocalThreadId / PannelParameters::LocalThreadSizeR;
- Index rLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId / PannelParameters::LocalThreadSizeP
- : linearLocalThreadId % PannelParameters::LocalThreadSizeR;
- const Index pGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) % preserve_elements_num_groups
- : itemID.get_group(0) / reduce_elements_num_groups;
- const Index rGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) / preserve_elements_num_groups
- : itemID.get_group(0) % reduce_elements_num_groups;
-
- Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
- const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId;
- auto scratchPtr = scratch.get_pointer().get();
- auto outPtr =
- output_accessor.get_pointer() + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0);
- CoeffReturnType accumulator = op.initialize();
-
- element_wise_reduce(globalRId, globalPId, accumulator);
-
- accumulator = OpDef::finalise_op(op.finalize(accumulator), num_coeffs_to_reduce);
- scratchPtr[pLocalThreadId + rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)] =
- accumulator;
- if (rt == reduction_dim::inner_most) {
- pLocalThreadId = linearLocalThreadId % PannelParameters::LocalThreadSizeP;
- rLocalThreadId = linearLocalThreadId / PannelParameters::LocalThreadSizeP;
- globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
- }
-
- /* Apply the reduction operation between the current local
- * id and the one on the other half of the vector. */
- auto out_scratch_ptr =
- scratchPtr + (pLocalThreadId + (rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)));
- itemID.barrier(cl::sycl::access::fence_space::local_space);
- if (rt == reduction_dim::inner_most) {
- accumulator = *out_scratch_ptr;
- }
- // The Local LocalThreadSizeR is always power of 2
- EIGEN_UNROLL_LOOP
- for (Index offset = PannelParameters::LocalThreadSizeR >> 1; offset > 0; offset >>= 1) {
- if (rLocalThreadId < offset) {
- op.reduce(out_scratch_ptr[(PannelParameters::LocalThreadSizeP + PannelParameters::BC) * offset], &accumulator);
- // The result has already been divided for mean reducer in the
- // previous reduction so no need to divide furthermore
- *out_scratch_ptr = op.finalize(accumulator);
- }
- /* All threads collectively read from global memory into local.
- * The barrier ensures all threads' IO is resolved before
- * execution continues (strictly speaking, all threads within
- * a single work-group - there is no co-ordination between
- * work-groups, only work-items). */
- itemID.barrier(cl::sycl::access::fence_space::local_space);
- }
-
- if (rLocalThreadId == 0 && (globalPId < num_coeffs_to_preserve)) {
- outPtr[globalPId] = op.finalize(accumulator);
- }
- }
-};
-
-template <typename OutScalar, typename Index, typename InputAccessor, typename OutputAccessor, typename OpType>
-struct SecondStepPartialReduction {
- typedef OpDefiner<OpType, OutScalar, Index, false> OpDef;
- typedef typename OpDef::type Op;
- typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
- ScratchAccessor;
- InputAccessor input_accessor;
- OutputAccessor output_accessor;
- Op op;
- const Index num_coeffs_to_preserve;
- const Index num_coeffs_to_reduce;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SecondStepPartialReduction(ScratchAccessor, InputAccessor input_accessor_,
- OutputAccessor output_accessor_, OpType op_,
- const Index num_coeffs_to_preserve_,
- const Index num_coeffs_to_reduce_)
- : input_accessor(input_accessor_),
- output_accessor(output_accessor_),
- op(OpDef::get_op(op_)),
- num_coeffs_to_preserve(num_coeffs_to_preserve_),
- num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
- const Index globalId = itemID.get_global_id(0);
-
- if (globalId >= num_coeffs_to_preserve) return;
-
- auto in_ptr = input_accessor.get_pointer() + globalId;
-
- OutScalar accumulator = op.initialize();
- // num_coeffs_to_reduce is not bigger than 256
- for (Index i = 0; i < num_coeffs_to_reduce; i++) {
- op.reduce(*in_ptr, &accumulator);
- in_ptr += num_coeffs_to_preserve;
- }
- output_accessor.get_pointer()[globalId] = op.finalize(accumulator);
- }
-};
-
-template <typename Index, Index LTP, Index LTR, bool BC_>
-struct ReductionPannel {
- static EIGEN_CONSTEXPR Index LocalThreadSizeP = LTP;
- static EIGEN_CONSTEXPR Index LocalThreadSizeR = LTR;
- static EIGEN_CONSTEXPR bool BC = BC_;
-};
-
-template <typename Self, typename Op, TensorSycl::internal::reduction_dim rt>
-struct PartialReducerLauncher {
- typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
- typedef typename Self::CoeffReturnType CoeffReturnType;
- typedef typename Self::Storage Storage;
- typedef typename Self::Index Index;
- typedef ReductionPannel<typename Self::Index, EIGEN_SYCL_LOCAL_THREAD_DIM0, EIGEN_SYCL_LOCAL_THREAD_DIM1, true>
- PannelParameters;
-
- typedef PartialReductionKernel<Self, Op, PannelParameters, rt> SyclReducerKernelType;
-
- static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output,
- Index num_coeffs_to_reduce, Index num_coeffs_to_preserve) {
- Index roundUpP = roundUp(num_coeffs_to_preserve, PannelParameters::LocalThreadSizeP);
-
- // getPowerOfTwo makes sure the local range is a power of 2 and
- // <= maxSyclThreadPerBlock; this lets us avoid an extra bounds check in
- // the kernel.
- static_assert(!((PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR) &
- (PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR - 1)),
- "The Local thread size must be a power of 2 for the reduction "
- "operation");
-
- EIGEN_CONSTEXPR Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR;
- // In this step we force the code to be at most a 2-step reduction. Our
- // empirical research shows that we get better performance when each thread
- // reduces at least 64 elements individually, although this can vary across
- // platforms. It also shows that for the inner_most dim reducer, using 8
- // groups in the reduce dimension for sizes > 1024 achieves the best
- // performance.
- const Index reductionPerThread = 64;
- Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(), true);
- const Index pNumGroups = roundUpP / PannelParameters::LocalThreadSizeP;
- Index rGroups = (cu + pNumGroups - 1) / pNumGroups;
- const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1;
- const Index globalRange = pNumGroups * rNumGroups * localRange;
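- // For example, assuming a 16x16 local configuration (localRange == 256) and
- // num_coeffs_to_preserve == 100: pNumGroups == roundUp(100, 16) / 16 == 7,
- // and a reduce dimension larger than 64 * 256 elements gets
- // min(ceil(cu / 7), 256) reduce groups; otherwise one group reduces each panel.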
-
- EIGEN_CONSTEXPR Index scratchSize =
- PannelParameters::LocalThreadSizeR * (PannelParameters::LocalThreadSizeP + PannelParameters::BC);
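- // The "+ BC" element per scratch row is padding; presumably it staggers rows
- // in local memory to reduce bank conflicts during the tree reduction.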
- auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
- if (rNumGroups > 1) {
- CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
- dev.allocate_temp(num_coeffs_to_preserve * rNumGroups * sizeof(CoeffReturnType)));
- EvaluatorPointerType temp_accessor = dev.get(temp_pointer);
- dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKernelType>(
- self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
- num_coeffs_to_reduce);
-
- typedef SecondStepPartialReduction<CoeffReturnType, Index, EvaluatorPointerType, EvaluatorPointerType, Op>
- SecondStepPartialReductionKernel;
-
- dev.template unary_kernel_launcher<CoeffReturnType, SecondStepPartialReductionKernel>(
- temp_accessor, output,
- cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)), Index(1),
- reducer, num_coeffs_to_preserve, rNumGroups);
-
- self.device().deallocate_temp(temp_pointer);
- } else {
- dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKernelType>(
- self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
- num_coeffs_to_reduce);
- }
- return false;
- }
-};
-} // namespace internal
-} // namespace TensorSycl
-
-namespace internal {
-
-template <typename Self, typename Op, bool Vectorizable>
-struct FullReducer<Self, Op, Eigen::SyclDevice, Vectorizable> {
- typedef typename Self::CoeffReturnType CoeffReturnType;
- typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
- static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
- static EIGEN_CONSTEXPR int PacketSize = Self::PacketAccess ? Self::PacketSize : 1;
- static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data) {
- typedef typename conditional<Self::PacketAccess, typename Self::PacketReturnType, CoeffReturnType>::type OutType;
- static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
- (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
- "The Local thread size must be a power of 2 for the reduction "
- "operation");
- EIGEN_CONSTEXPR Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;
-
- typename Self::Index inputSize = self.impl().dimensions().TotalSize();
- // In this step we force the code to be at most a 2-step reduction:
- // Our empirical research shows that if each thread reduces at least 512
- // elements individually, we get better performance.
- const Index reductionPerThread = 2048;
- Index reductionGroup = dev.getPowerOfTwo(
- (inputSize + (reductionPerThread * local_range - 1)) / (reductionPerThread * local_range), true);
- const Index num_work_group = std::min(reductionGroup, local_range);
- const Index global_range = num_work_group * local_range;
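- // For example, with inputSize == 1,000,000 and local_range == 256 this gives
- // getPowerOfTwo(ceil(1e6 / (2048 * 256))) == 2 work-groups, a global_range of
- // 512, and roughly 2000 elements reduced per thread before the second step.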
-
- auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
- typedef TensorSycl::internal::FullReductionKernelFunctor<Self, Op, local_range> reduction_kernel_t;
- if (num_work_group > 1) {
- CoeffReturnType *temp_pointer =
- static_cast<CoeffReturnType *>(dev.allocate_temp(num_work_group * sizeof(CoeffReturnType)));
- typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
- dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, tmp_global_accessor, thread_range,
- local_range, inputSize, reducer);
-
- typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
- EvaluatorPointerType, Index, local_range>
- GenericRKernel;
- dev.template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
- tmp_global_accessor, data,
- cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)), num_work_group,
- reducer);
-
- dev.deallocate_temp(temp_pointer);
- } else {
- dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, data, thread_range, local_range, inputSize,
- reducer);
- }
- }
-};
-// Reduces the outer-most dimension while preserving the vectorizable
-// inner-most dimension (column reduction).
-template <typename Self, typename Op>
-struct OuterReducer<Self, Op, Eigen::SyclDevice> {
- static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
-
- static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
- typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
- typename Self::Index num_coeffs_to_preserve) {
- return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
- Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::outer_most>::run(self, reducer, dev, output,
- num_coeffs_to_reduce,
- num_coeffs_to_preserve);
- }
-};
-// row reduction
-template <typename Self, typename Op>
-struct InnerReducer<Self, Op, Eigen::SyclDevice> {
- static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
-
- static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
- typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
- typename Self::Index num_coeffs_to_preserve) {
- return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
- Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::inner_most>::run(self, reducer, dev, output,
- num_coeffs_to_reduce,
- num_coeffs_to_preserve);
- }
-};
-
-// ArgMax uses this kernel for partial reduction.
-// TODO(@mehdi.goli) come up with a better kernel.
-// Generic partial reduction.
-template <typename Self, typename Op>
-struct GenericReducer<Self, Op, Eigen::SyclDevice> {
- static EIGEN_CONSTEXPR bool HasOptimizedImplementation = false;
- static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
- typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce,
- typename Self::Index num_coeffs_to_preserve) {
- typename Self::Index range, GRange, tileSize;
- dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);
-
- dev.template unary_kernel_launcher<typename Self::CoeffReturnType,
- TensorSycl::internal::GenericNondeterministicReducer<Self, Op>>(
- self, output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1),
- reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast<Index>(1));
- return false;
- }
-};
-
-} // namespace internal
-} // namespace Eigen
-
-#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorRef.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorRef.h
deleted file mode 100644
index a27d364..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorRef.h
+++ /dev/null
@@ -1,454 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_REF_H
-#define EIGEN_CXX11_TENSOR_TENSOR_REF_H
-
-namespace Eigen {
-
-namespace internal {
-
-template <typename Dimensions, typename Scalar>
-class TensorLazyBaseEvaluator {
- public:
- TensorLazyBaseEvaluator() : m_refcount(0) { }
- virtual ~TensorLazyBaseEvaluator() { }
-
- EIGEN_DEVICE_FUNC virtual const Dimensions& dimensions() const = 0;
- EIGEN_DEVICE_FUNC virtual const Scalar* data() const = 0;
-
- EIGEN_DEVICE_FUNC virtual const Scalar coeff(DenseIndex index) const = 0;
- EIGEN_DEVICE_FUNC virtual Scalar& coeffRef(DenseIndex index) = 0;
-
- void incrRefCount() { ++m_refcount; }
- void decrRefCount() { --m_refcount; }
- int refCount() const { return m_refcount; }
-
- private:
- // No copy, no assignment;
- TensorLazyBaseEvaluator(const TensorLazyBaseEvaluator& other);
- TensorLazyBaseEvaluator& operator = (const TensorLazyBaseEvaluator& other);
-
- int m_refcount;
-};
-
-
-template <typename Dimensions, typename Expr, typename Device>
-class TensorLazyEvaluatorReadOnly : public TensorLazyBaseEvaluator<Dimensions, typename TensorEvaluator<Expr, Device>::Scalar> {
- public:
- // typedef typename TensorEvaluator<Expr, Device>::Dimensions Dimensions;
- typedef typename TensorEvaluator<Expr, Device>::Scalar Scalar;
- typedef StorageMemory<Scalar, Device> Storage;
- typedef typename Storage::Type EvaluatorPointerType;
- typedef TensorEvaluator<Expr, Device> EvalType;
-
- TensorLazyEvaluatorReadOnly(const Expr& expr, const Device& device) : m_impl(expr, device), m_dummy(Scalar(0)) {
- m_dims = m_impl.dimensions();
- m_impl.evalSubExprsIfNeeded(NULL);
- }
- virtual ~TensorLazyEvaluatorReadOnly() {
- m_impl.cleanup();
- }
-
- EIGEN_DEVICE_FUNC virtual const Dimensions& dimensions() const {
- return m_dims;
- }
- EIGEN_DEVICE_FUNC virtual const Scalar* data() const {
- return m_impl.data();
- }
-
- EIGEN_DEVICE_FUNC virtual const Scalar coeff(DenseIndex index) const {
- return m_impl.coeff(index);
- }
- EIGEN_DEVICE_FUNC virtual Scalar& coeffRef(DenseIndex /*index*/) {
- eigen_assert(false && "can't reference the coefficient of an rvalue");
- return m_dummy;
- }
-
- protected:
- TensorEvaluator<Expr, Device> m_impl;
- Dimensions m_dims;
- Scalar m_dummy;
-};
-
-template <typename Dimensions, typename Expr, typename Device>
-class TensorLazyEvaluatorWritable : public TensorLazyEvaluatorReadOnly<Dimensions, Expr, Device> {
- public:
- typedef TensorLazyEvaluatorReadOnly<Dimensions, Expr, Device> Base;
- typedef typename Base::Scalar Scalar;
- typedef StorageMemory<Scalar, Device> Storage;
- typedef typename Storage::Type EvaluatorPointerType;
-
- TensorLazyEvaluatorWritable(const Expr& expr, const Device& device) : Base(expr, device) {
- }
- virtual ~TensorLazyEvaluatorWritable() {
- }
-
- EIGEN_DEVICE_FUNC virtual Scalar& coeffRef(DenseIndex index) {
- return this->m_impl.coeffRef(index);
- }
-};
-
-template <typename Dimensions, typename Expr, typename Device>
-class TensorLazyEvaluator : public internal::conditional<bool(internal::is_lvalue<Expr>::value),
- TensorLazyEvaluatorWritable<Dimensions, Expr, Device>,
- TensorLazyEvaluatorReadOnly<Dimensions, const Expr, Device> >::type {
- public:
- typedef typename internal::conditional<bool(internal::is_lvalue<Expr>::value),
- TensorLazyEvaluatorWritable<Dimensions, Expr, Device>,
- TensorLazyEvaluatorReadOnly<Dimensions, const Expr, Device> >::type Base;
- typedef typename Base::Scalar Scalar;
-
- TensorLazyEvaluator(const Expr& expr, const Device& device) : Base(expr, device) {
- }
- virtual ~TensorLazyEvaluator() {
- }
-};
-
-} // namespace internal
-
-
-/** \class TensorRef
- * \ingroup CXX11_Tensor_Module
- *
- * \brief A reference to a tensor expression
- * The expression will be evaluated lazily (as much as possible).
- *
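- * A usage sketch (the tensor name and sizes are illustrative):
- * \code
- * Eigen::Tensor<float, 3> t(2, 3, 4);
- * Eigen::TensorRef<Eigen::Tensor<float, 3> > ref = t * 2.0f;  // nothing evaluated yet
- * float v = ref(1, 2, 3);  // computes just this coefficient on demand
- * \endcode
- *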
- */
-template<typename PlainObjectType> class TensorRef : public TensorBase<TensorRef<PlainObjectType> >
-{
- public:
- typedef TensorRef<PlainObjectType> Self;
- typedef typename PlainObjectType::Base Base;
- typedef typename Eigen::internal::nested<Self>::type Nested;
- typedef typename internal::traits<PlainObjectType>::StorageKind StorageKind;
- typedef typename internal::traits<PlainObjectType>::Index Index;
- typedef typename internal::traits<PlainObjectType>::Scalar Scalar;
- typedef typename NumTraits<Scalar>::Real RealScalar;
- typedef typename Base::CoeffReturnType CoeffReturnType;
- typedef Scalar* PointerType;
- typedef PointerType PointerArgType;
-
- static const Index NumIndices = PlainObjectType::NumIndices;
- typedef typename PlainObjectType::Dimensions Dimensions;
-
- enum {
- IsAligned = false,
- PacketAccess = false,
- BlockAccess = false,
- PreferBlockAccess = false,
- Layout = PlainObjectType::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = false
- };
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -----------===//
- typedef internal::TensorBlockNotImplemented TensorBlock;
- //===------------------------------------------------------------------===//
-
- EIGEN_STRONG_INLINE TensorRef() : m_evaluator(NULL) {
- }
-
- template <typename Expression>
- EIGEN_STRONG_INLINE TensorRef(const Expression& expr) : m_evaluator(new internal::TensorLazyEvaluator<Dimensions, Expression, DefaultDevice>(expr, DefaultDevice())) {
- m_evaluator->incrRefCount();
- }
-
- template <typename Expression>
- EIGEN_STRONG_INLINE TensorRef& operator = (const Expression& expr) {
- unrefEvaluator();
- m_evaluator = new internal::TensorLazyEvaluator<Dimensions, Expression, DefaultDevice>(expr, DefaultDevice());
- m_evaluator->incrRefCount();
- return *this;
- }
-
- ~TensorRef() {
- unrefEvaluator();
- }
-
- TensorRef(const TensorRef& other) : m_evaluator(other.m_evaluator) {
- eigen_assert(m_evaluator->refCount() > 0);
- m_evaluator->incrRefCount();
- }
-
- TensorRef& operator = (const TensorRef& other) {
- if (this != &other) {
- unrefEvaluator();
- m_evaluator = other.m_evaluator;
- eigen_assert(m_evaluator->refCount() > 0);
- m_evaluator->incrRefCount();
- }
- return *this;
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Index rank() const { return m_evaluator->dimensions().size(); }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_evaluator->dimensions()[n]; }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_evaluator->dimensions(); }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Index size() const { return m_evaluator->dimensions().TotalSize(); }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const Scalar* data() const { return m_evaluator->data(); }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const Scalar operator()(Index index) const
- {
- return m_evaluator->coeff(index);
- }
-
-#if EIGEN_HAS_VARIADIC_TEMPLATES
- template<typename... IndexTypes> EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const Scalar operator()(Index firstIndex, IndexTypes... otherIndices) const
- {
- const std::size_t num_indices = (sizeof...(otherIndices) + 1);
- const array<Index, num_indices> indices{{firstIndex, otherIndices...}};
- return coeff(indices);
- }
- template<typename... IndexTypes> EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices)
- {
- const std::size_t num_indices = (sizeof...(otherIndices) + 1);
- const array<Index, num_indices> indices{{firstIndex, otherIndices...}};
- return coeffRef(indices);
- }
-#else
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1) const
- {
- array<Index, 2> indices;
- indices[0] = i0;
- indices[1] = i1;
- return coeff(indices);
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2) const
- {
- array<Index, 3> indices;
- indices[0] = i0;
- indices[1] = i1;
- indices[2] = i2;
- return coeff(indices);
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2, Index i3) const
- {
- array<Index, 4> indices;
- indices[0] = i0;
- indices[1] = i1;
- indices[2] = i2;
- indices[3] = i3;
- return coeff(indices);
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const
- {
- array<Index, 5> indices;
- indices[0] = i0;
- indices[1] = i1;
- indices[2] = i2;
- indices[3] = i3;
- indices[4] = i4;
- return coeff(indices);
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1)
- {
- array<Index, 2> indices;
- indices[0] = i0;
- indices[1] = i1;
- return coeffRef(indices);
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1, Index i2)
- {
- array<Index, 3> indices;
- indices[0] = i0;
- indices[1] = i1;
- indices[2] = i2;
- return coeffRef(indices);
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1, Index i2, Index i3)
- {
- array<Index, 4> indices;
- indices[0] = i0;
- indices[1] = i1;
- indices[2] = i2;
- indices[3] = i3;
- return coeffRef(indices);
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1, Index i2, Index i3, Index i4)
- {
- array<Index, 5> indices;
- indices[0] = i0;
- indices[1] = i1;
- indices[2] = i2;
- indices[3] = i3;
- indices[4] = i4;
- return coeffRef(indices);
- }
-#endif
-
- template <std::size_t NumIndices> EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const Scalar coeff(const array<Index, NumIndices>& indices) const
- {
- const Dimensions& dims = this->dimensions();
- Index index = 0;
- if (PlainObjectType::Options & RowMajor) {
- index += indices[0];
- for (size_t i = 1; i < NumIndices; ++i) {
- index = index * dims[i] + indices[i];
- }
- } else {
- index += indices[NumIndices-1];
- for (int i = NumIndices-2; i >= 0; --i) {
- index = index * dims[i] + indices[i];
- }
- }
- return m_evaluator->coeff(index);
- }
- template <std::size_t NumIndices> EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices)
- {
- const Dimensions& dims = this->dimensions();
- Index index = 0;
- if (PlainObjectType::Options & RowMajor) {
- index += indices[0];
- for (size_t i = 1; i < NumIndices; ++i) {
- index = index * dims[i] + indices[i];
- }
- } else {
- index += indices[NumIndices-1];
- for (int i = NumIndices-2; i >= 0; --i) {
- index = index * dims[i] + indices[i];
- }
- }
- return m_evaluator->coeffRef(index);
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
- {
- return m_evaluator->coeff(index);
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
- {
- return m_evaluator->coeffRef(index);
- }
-
- private:
- EIGEN_STRONG_INLINE void unrefEvaluator() {
- if (m_evaluator) {
- m_evaluator->decrRefCount();
- if (m_evaluator->refCount() == 0) {
- delete m_evaluator;
- }
- }
- }
-
- internal::TensorLazyBaseEvaluator<Dimensions, Scalar>* m_evaluator;
-};
-
-
-// evaluator for rvalues
-template<typename Derived, typename Device>
-struct TensorEvaluator<const TensorRef<Derived>, Device>
-{
- typedef typename Derived::Index Index;
- typedef typename Derived::Scalar Scalar;
- typedef typename Derived::Scalar CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- typedef typename Derived::Dimensions Dimensions;
- typedef StorageMemory<CoeffReturnType, Device> Storage;
- typedef typename Storage::Type EvaluatorPointerType;
-
- enum {
- IsAligned = false,
- PacketAccess = false,
- BlockAccess = false,
- PreferBlockAccess = false,
- Layout = TensorRef<Derived>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = false
- };
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlock;
- //===--------------------------------------------------------------------===//
-
- EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef<Derived>& m, const Device&)
- : m_ref(m)
- { }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_ref.dimensions(); }
-
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
- return true;
- }
-
- EIGEN_STRONG_INLINE void cleanup() { }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
- return m_ref.coeff(index);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
- return m_ref.coeffRef(index);
- }
-
- EIGEN_DEVICE_FUNC const Scalar* data() const { return m_ref.data(); }
-
- protected:
- TensorRef<Derived> m_ref;
-};
-
-
-// evaluator for lvalues
-template<typename Derived, typename Device>
-struct TensorEvaluator<TensorRef<Derived>, Device> : public TensorEvaluator<const TensorRef<Derived>, Device>
-{
- typedef typename Derived::Index Index;
- typedef typename Derived::Scalar Scalar;
- typedef typename Derived::Scalar CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- typedef typename Derived::Dimensions Dimensions;
-
- typedef TensorEvaluator<const TensorRef<Derived>, Device> Base;
-
- enum {
- IsAligned = false,
- PacketAccess = false,
- BlockAccess = false,
- PreferBlockAccess = false,
- RawAccess = false
- };
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlock;
- //===--------------------------------------------------------------------===//
-
- EIGEN_STRONG_INLINE TensorEvaluator(TensorRef<Derived>& m, const Device& d) : Base(m, d)
- { }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
- return this->m_ref.coeffRef(index);
- }
-};
-
-
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_REF_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorReverse.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorReverse.h
deleted file mode 100644
index 586ce68..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorReverse.h
+++ /dev/null
@@ -1,465 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Navdeep Jaitly <ndjaitly@google.com>
-// Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H
-#define EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H
-namespace Eigen {
-
-/** \class TensorReverse
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Reverses the elements of a tensor along a specified set of dimensions.
- *
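- * A usage sketch (tensor name and sizes are illustrative):
- * \code
- * Eigen::Tensor<float, 2> t(3, 4);
- * Eigen::array<bool, 2> rev = {{true, false}};  // flip the first dimension only
- * Eigen::Tensor<float, 2> r = t.reverse(rev);
- * \endcode
- *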
- */
-namespace internal {
-template<typename ReverseDimensions, typename XprType>
-struct traits<TensorReverseOp<ReverseDimensions,
- XprType> > : public traits<XprType>
-{
- typedef typename XprType::Scalar Scalar;
- typedef traits<XprType> XprTraits;
- typedef typename XprTraits::StorageKind StorageKind;
- typedef typename XprTraits::Index Index;
- typedef typename XprType::Nested Nested;
- typedef typename remove_reference<Nested>::type _Nested;
- static const int NumDimensions = XprTraits::NumDimensions;
- static const int Layout = XprTraits::Layout;
- typedef typename XprTraits::PointerType PointerType;
-};
-
-template<typename ReverseDimensions, typename XprType>
-struct eval<TensorReverseOp<ReverseDimensions, XprType>, Eigen::Dense>
-{
- typedef const TensorReverseOp<ReverseDimensions, XprType>& type;
-};
-
-template<typename ReverseDimensions, typename XprType>
-struct nested<TensorReverseOp<ReverseDimensions, XprType>, 1,
- typename eval<TensorReverseOp<ReverseDimensions, XprType> >::type>
-{
- typedef TensorReverseOp<ReverseDimensions, XprType> type;
-};
-
-} // end namespace internal
-
-template<typename ReverseDimensions, typename XprType>
-class TensorReverseOp : public TensorBase<TensorReverseOp<ReverseDimensions,
- XprType>, WriteAccessors>
-{
- public:
- typedef TensorBase<TensorReverseOp<ReverseDimensions, XprType>, WriteAccessors> Base;
- typedef typename Eigen::internal::traits<TensorReverseOp>::Scalar Scalar;
- typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename Eigen::internal::nested<TensorReverseOp>::type Nested;
- typedef typename Eigen::internal::traits<TensorReverseOp>::StorageKind
- StorageKind;
- typedef typename Eigen::internal::traits<TensorReverseOp>::Index Index;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReverseOp(
- const XprType& expr, const ReverseDimensions& reverse_dims)
- : m_xpr(expr), m_reverse_dims(reverse_dims) { }
-
- EIGEN_DEVICE_FUNC
- const ReverseDimensions& reverse() const { return m_reverse_dims; }
-
- EIGEN_DEVICE_FUNC
- const typename internal::remove_all<typename XprType::Nested>::type&
- expression() const { return m_xpr; }
-
- EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorReverseOp)
-
-
- protected:
- typename XprType::Nested m_xpr;
- const ReverseDimensions m_reverse_dims;
-};
-
-// Eval as rvalue
-template<typename ReverseDimensions, typename ArgType, typename Device>
-struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device>
-{
- typedef TensorReverseOp<ReverseDimensions, ArgType> XprType;
- typedef typename XprType::Index Index;
- static const int NumDims = internal::array_size<ReverseDimensions>::value;
- typedef DSizes<Index, NumDims> Dimensions;
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
- typedef StorageMemory<CoeffReturnType, Device> Storage;
- typedef typename Storage::Type EvaluatorPointerType;
-
- enum {
- IsAligned = false,
- PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = NumDims > 0,
- PreferBlockAccess = true,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = false
- };
-
- typedef internal::TensorIntDivisor<Index> IndexDivisor;
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
- typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
-
- typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
- ArgTensorBlock;
-
- typedef typename internal::TensorMaterializedBlock<CoeffReturnType, NumDims,
- Layout, Index>
- TensorBlock;
- //===--------------------------------------------------------------------===//
-
- EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_impl(op.expression(), device),
- m_reverse(op.reverse()),
- m_device(device)
- {
- // Reversing a scalar isn't supported yet. It would be a no-op anyway.
- EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
-
- // Compute strides
- m_dimensions = m_impl.dimensions();
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- m_strides[0] = 1;
- for (int i = 1; i < NumDims; ++i) {
- m_strides[i] = m_strides[i-1] * m_dimensions[i-1];
- if (m_strides[i] > 0) m_fastStrides[i] = IndexDivisor(m_strides[i]);
- }
- } else {
- m_strides[NumDims-1] = 1;
- for (int i = NumDims - 2; i >= 0; --i) {
- m_strides[i] = m_strides[i+1] * m_dimensions[i+1];
- if (m_strides[i] > 0) m_fastStrides[i] = IndexDivisor(m_strides[i]);
- }
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const Dimensions& dimensions() const { return m_dimensions; }
-
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
- m_impl.evalSubExprsIfNeeded(NULL);
- return true;
- }
-
-#ifdef EIGEN_USE_THREADS
- template <typename EvalSubExprsCallback>
- EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
- EvaluatorPointerType, EvalSubExprsCallback done) {
- m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
- }
-#endif // EIGEN_USE_THREADS
-
- EIGEN_STRONG_INLINE void cleanup() {
- m_impl.cleanup();
- }
-
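- // Maps an output coefficient index to the corresponding input index by
- // decomposing it along the strides and flipping every coordinate whose
- // dimension is marked reversed (e.g. a 1-D tensor of size 4 with
- // reverse == {true} maps index 0 -> 3, 1 -> 2, and so on).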
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index reverseIndex(
- Index index) const {
- eigen_assert(index < dimensions().TotalSize());
- Index inputIndex = 0;
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- EIGEN_UNROLL_LOOP
- for (int i = NumDims - 1; i > 0; --i) {
- Index idx = index / m_fastStrides[i];
- index -= idx * m_strides[i];
- if (m_reverse[i]) {
- idx = m_dimensions[i] - idx - 1;
- }
- inputIndex += idx * m_strides[i];
- }
- if (m_reverse[0]) {
- inputIndex += (m_dimensions[0] - index - 1);
- } else {
- inputIndex += index;
- }
- } else {
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < NumDims - 1; ++i) {
- Index idx = index / m_fastStrides[i];
- index -= idx * m_strides[i];
- if (m_reverse[i]) {
- idx = m_dimensions[i] - idx - 1;
- }
- inputIndex += idx * m_strides[i];
- }
- if (m_reverse[NumDims-1]) {
- inputIndex += (m_dimensions[NumDims-1] - index - 1);
- } else {
- inputIndex += index;
- }
- }
- return inputIndex;
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(
- Index index) const {
- return m_impl.coeff(reverseIndex(index));
- }
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- PacketReturnType packet(Index index) const
- {
- EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
-
- // TODO(ndjaitly): write a better packing routine that uses
- // local structure.
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type
- values[PacketSize];
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < PacketSize; ++i) {
- values[i] = coeff(index+i);
- }
- PacketReturnType rslt = internal::pload<PacketReturnType>(values);
- return rslt;
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- internal::TensorBlockResourceRequirements getResourceRequirements() const {
- const size_t target_size = m_device.lastLevelCacheSize();
- // Block evaluation reads underlying memory in reverse order, and default
- // cost model does not properly catch this in bytes stored/loaded.
- return internal::TensorBlockResourceRequirements::skewed<Scalar>(
- target_size)
- .addCostPerCoeff({0, 0, 24});
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
- block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
- bool /*root_of_expr_ast*/ = false) const {
- // TODO(ezhulenev): If underlying tensor expression supports and prefers
- // block evaluation we must use it. Currently we use coeff and packet
- // access into the underlying tensor expression.
- // static const bool useBlockAccessForArgType =
- // TensorEvaluator<ArgType, Device>::BlockAccess &&
- // TensorEvaluator<ArgType, Device>::PreferBlockAccess;
-
- static const bool isColMajor =
- static_cast<int>(Layout) == static_cast<int>(ColMajor);
-
- static const Index inner_dim_idx = isColMajor ? 0 : NumDims - 1;
- const bool inner_dim_reversed = m_reverse[inner_dim_idx];
-
- // Offset in the output block.
- Index block_offset = 0;
-
- // Offset in the input Tensor.
- Index input_offset = reverseIndex(desc.offset());
-
- // Initialize output block iterator state. Dimensions in this array are
- // always in inner_most -> outer_most order (col major layout).
- array<BlockIteratorState, NumDims> it;
- for (int i = 0; i < NumDims; ++i) {
- const int dim = isColMajor ? i : NumDims - 1 - i;
- it[i].size = desc.dimension(dim);
- it[i].count = 0;
- it[i].reverse = m_reverse[dim];
-
- it[i].block_stride =
- i == 0 ? 1 : (it[i - 1].size * it[i - 1].block_stride);
- it[i].block_span = it[i].block_stride * (it[i].size - 1);
-
- it[i].input_stride = m_strides[dim];
- it[i].input_span = it[i].input_stride * (it[i].size - 1);
-
- if (it[i].reverse) {
- it[i].input_stride = -1 * it[i].input_stride;
- it[i].input_span = -1 * it[i].input_span;
- }
- }
-
- // If multiple inner dimensions have the same reverse flag, check if we can
- // merge them into a single virtual inner dimension.
- int effective_inner_dim = 0;
- for (int i = 1; i < NumDims; ++i) {
- if (it[i].reverse != it[effective_inner_dim].reverse) break;
- if (it[i].block_stride != it[effective_inner_dim].size) break;
- if (it[i].block_stride != numext::abs(it[i].input_stride)) break;
-
- it[i].size = it[effective_inner_dim].size * it[i].size;
-
- it[i].block_stride = 1;
- it[i].input_stride = (inner_dim_reversed ? -1 : 1);
-
- it[i].block_span = it[i].block_stride * (it[i].size - 1);
- it[i].input_span = it[i].input_stride * (it[i].size - 1);
-
- effective_inner_dim = i;
- }
-
- eigen_assert(it[effective_inner_dim].block_stride == 1);
- eigen_assert(it[effective_inner_dim].input_stride ==
- (inner_dim_reversed ? -1 : 1));
-
- const Index inner_dim_size = it[effective_inner_dim].size;
-
- // Prepare storage for the materialized reverse result.
- const typename TensorBlock::Storage block_storage =
- TensorBlock::prepareStorage(desc, scratch);
- CoeffReturnType* block_buffer = block_storage.data();
-
- while (it[NumDims - 1].count < it[NumDims - 1].size) {
- // Copy inner-most dimension data from reversed location in input.
- Index dst = block_offset;
- Index src = input_offset;
-
- // NOTE(ezhulenev): Adding vectorized path with internal::preverse showed
- // worse results in benchmarks than a simple coefficient loop.
- if (inner_dim_reversed) {
- for (Index i = 0; i < inner_dim_size; ++i) {
- block_buffer[dst] = m_impl.coeff(src);
- ++dst;
- --src;
- }
- } else {
- for (Index i = 0; i < inner_dim_size; ++i) {
- block_buffer[dst] = m_impl.coeff(src);
- ++dst;
- ++src;
- }
- }
-
- // For a 1-D tensor we need to generate only one inner-most dimension.
- if ((NumDims - effective_inner_dim) == 1) break;
-
- // Update offset.
- for (Index i = effective_inner_dim + 1; i < NumDims; ++i) {
- if (++it[i].count < it[i].size) {
- block_offset += it[i].block_stride;
- input_offset += it[i].input_stride;
- break;
- }
- if (i != NumDims - 1) it[i].count = 0;
- block_offset -= it[i].block_span;
- input_offset -= it[i].input_span;
- }
- }
-
- return block_storage.AsTensorMaterializedBlock();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
- double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
- 2 * TensorOpCost::MulCost<Index>() +
- TensorOpCost::DivCost<Index>());
- for (int i = 0; i < NumDims; ++i) {
- if (m_reverse[i]) {
- compute_cost += 2 * TensorOpCost::AddCost<Index>();
- }
- }
- return m_impl.costPerCoeff(vectorized) +
- TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize);
- }
-
- EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
-
-#ifdef EIGEN_USE_SYCL
- // binding placeholder accessors to a command group handler for SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
- m_impl.bind(cgh);
- }
-#endif
-
- protected:
- Dimensions m_dimensions;
- array<Index, NumDims> m_strides;
- array<IndexDivisor, NumDims> m_fastStrides;
- TensorEvaluator<ArgType, Device> m_impl;
- ReverseDimensions m_reverse;
- const Device EIGEN_DEVICE_REF m_device;
-
- private:
- struct BlockIteratorState {
- BlockIteratorState()
- : size(0),
- count(0),
- reverse(false),
- block_stride(0),
- block_span(0),
- input_stride(0),
- input_span(0) {}
-
- Index size;
- Index count;
- bool reverse;
- Index block_stride;
- Index block_span;
- Index input_stride;
- Index input_span;
- };
-};
-
-// Eval as lvalue
-
-template <typename ReverseDimensions, typename ArgType, typename Device>
-struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
- : public TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>,
- Device> {
- typedef TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>,
- Device> Base;
- typedef TensorReverseOp<ReverseDimensions, ArgType> XprType;
- typedef typename XprType::Index Index;
- static const int NumDims = internal::array_size<ReverseDimensions>::value;
- typedef DSizes<Index, NumDims> Dimensions;
-
- enum {
- IsAligned = false,
- PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = false,
- PreferBlockAccess = false,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = false
- };
- EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : Base(op, device) {}
-
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlock;
- //===--------------------------------------------------------------------===//
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const Dimensions& dimensions() const { return this->m_dimensions; }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
- return this->m_impl.coeffRef(this->reverseIndex(index));
- }
-
- template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- void writePacket(Index index, const PacketReturnType& x) {
- EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
-
- // This code is pilfered from TensorMorphing.h
- EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize];
- internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < PacketSize; ++i) {
- this->coeffRef(index+i) = values[i];
- }
- }
-};
-
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorScan.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorScan.h
deleted file mode 100644
index beae854..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorScan.h
+++ /dev/null
@@ -1,528 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016 Igor Babuschkin <igor@babuschk.in>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
-#define EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
-
-namespace Eigen {
-
-namespace internal {
-
-template <typename Op, typename XprType>
-struct traits<TensorScanOp<Op, XprType> >
- : public traits<XprType> {
- typedef typename XprType::Scalar Scalar;
- typedef traits<XprType> XprTraits;
- typedef typename XprTraits::StorageKind StorageKind;
- typedef typename XprType::Nested Nested;
- typedef typename remove_reference<Nested>::type _Nested;
- static const int NumDimensions = XprTraits::NumDimensions;
- static const int Layout = XprTraits::Layout;
- typedef typename XprTraits::PointerType PointerType;
-};
-
-template<typename Op, typename XprType>
-struct eval<TensorScanOp<Op, XprType>, Eigen::Dense>
-{
- typedef const TensorScanOp<Op, XprType>& type;
-};
-
-template<typename Op, typename XprType>
-struct nested<TensorScanOp<Op, XprType>, 1,
- typename eval<TensorScanOp<Op, XprType> >::type>
-{
- typedef TensorScanOp<Op, XprType> type;
-};
-} // end namespace internal
-
-/** \class TensorScan
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Tensor scan class.
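- *
- * A usage sketch (tensor name and sizes are illustrative):
- * \code
- * Eigen::Tensor<float, 2> t(3, 4);
- * Eigen::Tensor<float, 2> s = t.cumsum(1);        // inclusive prefix sums along axis 1
- * Eigen::Tensor<float, 2> p = t.cumprod(0, true); // exclusive prefix products along axis 0
- * \endcode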
- */
-template <typename Op, typename XprType>
-class TensorScanOp
- : public TensorBase<TensorScanOp<Op, XprType>, ReadOnlyAccessors> {
-public:
- typedef typename Eigen::internal::traits<TensorScanOp>::Scalar Scalar;
- typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename Eigen::internal::nested<TensorScanOp>::type Nested;
- typedef typename Eigen::internal::traits<TensorScanOp>::StorageKind StorageKind;
- typedef typename Eigen::internal::traits<TensorScanOp>::Index Index;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorScanOp(
- const XprType& expr, const Index& axis, bool exclusive = false, const Op& op = Op())
- : m_expr(expr), m_axis(axis), m_accumulator(op), m_exclusive(exclusive) {}
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const Index axis() const { return m_axis; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const XprType& expression() const { return m_expr; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const Op accumulator() const { return m_accumulator; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- bool exclusive() const { return m_exclusive; }
-
-protected:
- typename XprType::Nested m_expr;
- const Index m_axis;
- const Op m_accumulator;
- const bool m_exclusive;
-};
-
-
-namespace internal {
-
-template <typename Self>
-EIGEN_STRONG_INLINE void ReduceScalar(Self& self, Index offset,
- typename Self::CoeffReturnType* data) {
- // Compute the scan along the axis, starting at the given offset
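- // An exclusive scan stores the running prefix *before* folding in the
- // current element (so data[curr] excludes coeff(curr)); an inclusive scan
- // stores it afterwards.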
- typename Self::CoeffReturnType accum = self.accumulator().initialize();
- if (self.stride() == 1) {
- if (self.exclusive()) {
- for (Index curr = offset; curr < offset + self.size(); ++curr) {
- data[curr] = self.accumulator().finalize(accum);
- self.accumulator().reduce(self.inner().coeff(curr), &accum);
- }
- } else {
- for (Index curr = offset; curr < offset + self.size(); ++curr) {
- self.accumulator().reduce(self.inner().coeff(curr), &accum);
- data[curr] = self.accumulator().finalize(accum);
- }
- }
- } else {
- if (self.exclusive()) {
- for (Index idx3 = 0; idx3 < self.size(); idx3++) {
- Index curr = offset + idx3 * self.stride();
- data[curr] = self.accumulator().finalize(accum);
- self.accumulator().reduce(self.inner().coeff(curr), &accum);
- }
- } else {
- for (Index idx3 = 0; idx3 < self.size(); idx3++) {
- Index curr = offset + idx3 * self.stride();
- self.accumulator().reduce(self.inner().coeff(curr), &accum);
- data[curr] = self.accumulator().finalize(accum);
- }
- }
- }
-}
-
-template <typename Self>
-EIGEN_STRONG_INLINE void ReducePacket(Self& self, Index offset,
- typename Self::CoeffReturnType* data) {
- using Scalar = typename Self::CoeffReturnType;
- using Packet = typename Self::PacketReturnType;
- // Compute the scan along the axis, starting at the calculated offset
- Packet accum = self.accumulator().template initializePacket<Packet>();
- if (self.stride() == 1) {
- if (self.exclusive()) {
- for (Index curr = offset; curr < offset + self.size(); ++curr) {
- internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
- self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
- }
- } else {
- for (Index curr = offset; curr < offset + self.size(); ++curr) {
- self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
- internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
- }
- }
- } else {
- if (self.exclusive()) {
- for (Index idx3 = 0; idx3 < self.size(); idx3++) {
- const Index curr = offset + idx3 * self.stride();
- internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
- self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
- }
- } else {
- for (Index idx3 = 0; idx3 < self.size(); idx3++) {
- const Index curr = offset + idx3 * self.stride();
- self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
- internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
- }
- }
- }
-}
-
-template <typename Self, bool Vectorize, bool Parallel>
-struct ReduceBlock {
- EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
- typename Self::CoeffReturnType* data) {
- for (Index idx2 = 0; idx2 < self.stride(); idx2++) {
- // Calculate the starting offset for the scan
- Index offset = idx1 + idx2;
- ReduceScalar(self, offset, data);
- }
- }
-};
-
-// Specialization for vectorized reduction.
-template <typename Self>
-struct ReduceBlock<Self, /*Vectorize=*/true, /*Parallel=*/false> {
- EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
- typename Self::CoeffReturnType* data) {
- using Packet = typename Self::PacketReturnType;
- const int PacketSize = internal::unpacket_traits<Packet>::size;
- Index idx2 = 0;
- for (; idx2 + PacketSize <= self.stride(); idx2 += PacketSize) {
- // Calculate the starting offset for the packet scan
- Index offset = idx1 + idx2;
- ReducePacket(self, offset, data);
- }
- for (; idx2 < self.stride(); idx2++) {
- // Calculate the starting offset for the scan
- Index offset = idx1 + idx2;
- ReduceScalar(self, offset, data);
- }
- }
-};
-
-// Single-threaded CPU implementation of scan
-template <typename Self, typename Reducer, typename Device,
- bool Vectorize =
- (TensorEvaluator<typename Self::ChildTypeNoConst, Device>::PacketAccess &&
- internal::reducer_traits<Reducer, Device>::PacketAccess)>
-struct ScanLauncher {
- void operator()(Self& self, typename Self::CoeffReturnType* data) {
- Index total_size = internal::array_prod(self.dimensions());
-
- // We fix the index along the scan axis to 0 and perform a
- // scan per remaining entry. The iteration is split into two nested
- // loops to avoid an integer division by keeping track of each idx1 and
- // idx2.
- for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) {
- ReduceBlock<Self, Vectorize, /*Parallel=*/false> block_reducer;
- block_reducer(self, idx1, data);
- }
- }
-};
-
-#ifdef EIGEN_USE_THREADS
-
-// Adjust block_size to avoid false sharing of cachelines among
-// threads. Currently set to twice the cache line size on Intel and ARM
-// processors.
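-// (e.g. item_size == 4 and block_size == 100: 32 items fit in one 128-byte
-// block, so block_size is rounded up to 128).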
-EIGEN_STRONG_INLINE Index AdjustBlockSize(Index item_size, Index block_size) {
- EIGEN_CONSTEXPR Index kBlockAlignment = 128;
- const Index items_per_cacheline =
- numext::maxi<Index>(1, kBlockAlignment / item_size);
- return items_per_cacheline * divup(block_size, items_per_cacheline);
-}
-
-template <typename Self>
-struct ReduceBlock<Self, /*Vectorize=*/true, /*Parallel=*/true> {
- EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
- typename Self::CoeffReturnType* data) {
- using Scalar = typename Self::CoeffReturnType;
- using Packet = typename Self::PacketReturnType;
- const int PacketSize = internal::unpacket_traits<Packet>::size;
- Index num_scalars = self.stride();
- Index num_packets = 0;
- if (self.stride() >= PacketSize) {
- num_packets = self.stride() / PacketSize;
- self.device().parallelFor(
- num_packets,
- TensorOpCost(PacketSize * self.size(), PacketSize * self.size(),
- 16 * PacketSize * self.size(), true, PacketSize),
- // Make the shard size large enough that two neighboring threads
- // won't write to the same cacheline of `data`.
- [=](Index blk_size) {
- return AdjustBlockSize(PacketSize * sizeof(Scalar), blk_size);
- },
- [&](Index first, Index last) {
- for (Index packet = first; packet < last; ++packet) {
- const Index idx2 = packet * PacketSize;
- ReducePacket(self, idx1 + idx2, data);
- }
- });
- num_scalars -= num_packets * PacketSize;
- }
- self.device().parallelFor(
- num_scalars, TensorOpCost(self.size(), self.size(), 16 * self.size()),
- // Make the shard size large enough that two neighboring threads
- // won't write to the same cacheline of `data`.
- [=](Index blk_size) {
- return AdjustBlockSize(sizeof(Scalar), blk_size);
- },
- [&](Index first, Index last) {
- for (Index scalar = first; scalar < last; ++scalar) {
- const Index idx2 = num_packets * PacketSize + scalar;
- ReduceScalar(self, idx1 + idx2, data);
- }
- });
- }
-};
-
-template <typename Self>
-struct ReduceBlock<Self, /*Vectorize=*/false, /*Parallel=*/true> {
- EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
- typename Self::CoeffReturnType* data) {
- using Scalar = typename Self::CoeffReturnType;
- self.device().parallelFor(
- self.stride(), TensorOpCost(self.size(), self.size(), 16 * self.size()),
- // Make the shard size large enough that two neighboring threads
- // won't write to the same cacheline of `data`.
- [=](Index blk_size) {
- return AdjustBlockSize(sizeof(Scalar), blk_size);
- },
- [&](Index first, Index last) {
- for (Index idx2 = first; idx2 < last; ++idx2) {
- ReduceScalar(self, idx1 + idx2, data);
- }
- });
- }
-};
-
-// Specialization for multi-threaded execution.
-template <typename Self, typename Reducer, bool Vectorize>
-struct ScanLauncher<Self, Reducer, ThreadPoolDevice, Vectorize> {
- void operator()(Self& self, typename Self::CoeffReturnType* data) {
- using Scalar = typename Self::CoeffReturnType;
- using Packet = typename Self::PacketReturnType;
- const int PacketSize = internal::unpacket_traits<Packet>::size;
- const Index total_size = internal::array_prod(self.dimensions());
- const Index inner_block_size = self.stride() * self.size();
- bool parallelize_by_outer_blocks = (total_size >= (self.stride() * inner_block_size));
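- // Outer parallelism is preferred when the number of outer blocks
- // (total_size / inner_block_size) is at least self.stride().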
-
- if ((parallelize_by_outer_blocks && total_size <= 4096) ||
- (!parallelize_by_outer_blocks && self.stride() < PacketSize)) {
- ScanLauncher<Self, Reducer, DefaultDevice, Vectorize> launcher;
- launcher(self, data);
- return;
- }
-
- if (parallelize_by_outer_blocks) {
- // Parallelize over outer blocks.
- const Index num_outer_blocks = total_size / inner_block_size;
- self.device().parallelFor(
- num_outer_blocks,
- TensorOpCost(inner_block_size, inner_block_size,
- 16 * PacketSize * inner_block_size, Vectorize,
- PacketSize),
- [=](Index blk_size) {
- return AdjustBlockSize(inner_block_size * sizeof(Scalar), blk_size);
- },
- [&](Index first, Index last) {
- for (Index idx1 = first; idx1 < last; ++idx1) {
- ReduceBlock<Self, Vectorize, /*Parallel=*/false> block_reducer;
- block_reducer(self, idx1 * inner_block_size, data);
- }
- });
- } else {
- // Parallelize over inner packets/scalars dimensions when the reduction
- // axis is not an inner dimension.
- ReduceBlock<Self, Vectorize, /*Parallel=*/true> block_reducer;
- for (Index idx1 = 0; idx1 < total_size;
- idx1 += self.stride() * self.size()) {
- block_reducer(self, idx1, data);
- }
- }
- }
-};
-#endif // EIGEN_USE_THREADS
-
-#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
-
-// GPU implementation of scan
-// TODO(ibab) This placeholder implementation performs multiple scans in
-// parallel, but it would be better to use a parallel scan algorithm and
-// optimize memory access.
-template <typename Self, typename Reducer>
-__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) {
- // Compute offset as in the CPU version
- Index val = threadIdx.x + blockIdx.x * blockDim.x;
- Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride();
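- // (val enumerates scan lines: e.g. with stride == 2 and size == 3, val == 5
- // gives offset == (5 / 2) * 6 + 1 == 13, the first element of the sixth line).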
-
- if (offset + (self.size() - 1) * self.stride() < total_size) {
- // Compute the scan along the axis, starting at the calculated offset
- typename Self::CoeffReturnType accum = self.accumulator().initialize();
- for (Index idx = 0; idx < self.size(); idx++) {
- Index curr = offset + idx * self.stride();
- if (self.exclusive()) {
- data[curr] = self.accumulator().finalize(accum);
- self.accumulator().reduce(self.inner().coeff(curr), &accum);
- } else {
- self.accumulator().reduce(self.inner().coeff(curr), &accum);
- data[curr] = self.accumulator().finalize(accum);
- }
- }
- }
- __syncthreads();
-
-}
-
-template <typename Self, typename Reducer, bool Vectorize>
-struct ScanLauncher<Self, Reducer, GpuDevice, Vectorize> {
- void operator()(const Self& self, typename Self::CoeffReturnType* data) {
- Index total_size = internal::array_prod(self.dimensions());
- Index num_blocks = (total_size / self.size() + 63) / 64;
- Index block_size = 64;
-
- LAUNCH_GPU_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
- }
-};
-#endif // EIGEN_USE_GPU && (EIGEN_GPUCC)
-
-} // namespace internal
-
-// Eval as rvalue
-template <typename Op, typename ArgType, typename Device>
-struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
-
- typedef TensorScanOp<Op, ArgType> XprType;
- typedef typename XprType::Index Index;
- typedef const ArgType ChildTypeNoConst;
- typedef const ArgType ChildType;
- static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
- typedef DSizes<Index, NumDims> Dimensions;
- typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- typedef TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> Self;
- typedef StorageMemory<Scalar, Device> Storage;
- typedef typename Storage::Type EvaluatorPointerType;
-
- enum {
- IsAligned = false,
- PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
- BlockAccess = false,
- PreferBlockAccess = false,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- CoordAccess = false,
- RawAccess = true
- };
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlock;
- //===--------------------------------------------------------------------===//
-
- EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_impl(op.expression(), device),
- m_device(device),
- m_exclusive(op.exclusive()),
- m_accumulator(op.accumulator()),
- m_size(m_impl.dimensions()[op.axis()]),
- m_stride(1), m_consume_dim(op.axis()),
- m_output(NULL) {
-
- // Accumulating a scalar isn't supported.
- EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
- eigen_assert(op.axis() >= 0 && op.axis() < NumDims);
-
- // Compute stride of scan axis
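- // (e.g. for ColMajor dims {2, 3, 4} and axis == 1: m_stride == 2 and
- // m_size == 3, so consecutive scan elements are 2 coefficients apart).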
- const Dimensions& dims = m_impl.dimensions();
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- for (int i = 0; i < op.axis(); ++i) {
- m_stride = m_stride * dims[i];
- }
- } else {
- // dims can only be indexed through unsigned integers, so let's use an
- // unsigned type to let the compiler know. This prevents silly warnings
- // such as: "'*((void*)(& evaluator)+64)[18446744073709551615]' may be
- // used uninitialized in this function".
- unsigned int axis = internal::convert_index<unsigned int>(op.axis());
- for (unsigned int i = NumDims - 1; i > axis; --i) {
- m_stride = m_stride * dims[i];
- }
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
- return m_impl.dimensions();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride() const {
- return m_stride;
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& consume_dim() const {
- return m_consume_dim;
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const {
- return m_size;
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& accumulator() const {
- return m_accumulator;
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const {
- return m_exclusive;
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& inner() const {
- return m_impl;
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const {
- return m_device;
- }
-
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
- m_impl.evalSubExprsIfNeeded(NULL);
- internal::ScanLauncher<Self, Op, Device> launcher;
- if (data) {
- launcher(*this, data);
- return false;
- }
-
- const Index total_size = internal::array_prod(dimensions());
- m_output = static_cast<EvaluatorPointerType>(m_device.get((Scalar*) m_device.allocate_temp(total_size * sizeof(Scalar))));
- launcher(*this, m_output);
- return true;
- }
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
- return internal::ploadt<PacketReturnType, LoadMode>(m_output + index);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const
- {
- return m_output;
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
- {
- return m_output[index];
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
- return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
- }
-
- EIGEN_STRONG_INLINE void cleanup() {
- if (m_output) {
- m_device.deallocate_temp(m_output);
- m_output = NULL;
- }
- m_impl.cleanup();
- }
-
-#ifdef EIGEN_USE_SYCL
- // binding placeholder accessors to a command group handler for SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
- m_impl.bind(cgh);
- m_output.bind(cgh);
- }
-#endif
-protected:
- TensorEvaluator<ArgType, Device> m_impl;
- const Device EIGEN_DEVICE_REF m_device;
- const bool m_exclusive;
- Op m_accumulator;
- const Index m_size;
- Index m_stride;
- Index m_consume_dim;
- EvaluatorPointerType m_output;
-};
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorScanSycl.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorScanSycl.h
deleted file mode 100644
index 7f68ecb..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorScanSycl.h
+++ /dev/null
@@ -1,513 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli Codeplay Software Ltd.
-// Ralph Potter Codeplay Software Ltd.
-// Luke Iwanski Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/*****************************************************************
- * TensorScanSycl.h
- *
- * \brief
- * TensorScanSycl implements an extended version of "Efficient parallel scan
- * algorithms for GPUs" for Tensor operations. The algorithm requires up to
- * three stages (and consequently three kernels), depending on the size of
- * the tensor. In the first kernel (ScanKernelFunctor), each thread within a
- * work-group individually reduces the elements allocated to it, in order to
- * reduce the total number of blocks. Then all threads within the work-group
- * reduce the associated blocks into the temporary buffers. In the next
- * kernel (ScanBlockKernelFunctor), the temporary buffer is given as an
- * input, and all threads within a work-group scan and reduce the boundaries
- * between the blocks (generated by the previous kernel) and write the data
- * to the temporary buffer. If the second kernel is required, the third and
- * final kernel (ScanAdjustmentKernelFunctor) adjusts the final result in the
- * output buffer.
- * The original algorithm for the parallel prefix sum can be found here:
- *
- * Sengupta, Shubhabrata, Mark Harris, and Michael Garland. "Efficient parallel
- * scan algorithms for GPUs." NVIDIA, Santa Clara, CA, Tech. Rep. NVR-2008-003
- * 1, no. 1 (2008): 1-17.
- *****************************************************************/
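-// The per-work-group scan below follows the classic up-sweep/down-sweep
-// ("Blelloch") pattern. A minimal serial sketch for a power-of-two array and
-// a sum accumulator (blelloch_exclusive_scan is a hypothetical helper, not
-// part of this file):
-//
-//   void blelloch_exclusive_scan(float* a, int n) {  // n must be a power of 2
-//     for (int d = 1; d < n; d *= 2)                 // up-sweep: build partial sums
-//       for (int i = 2 * d - 1; i < n; i += 2 * d)
-//         a[i] += a[i - d];
-//     a[n - 1] = 0;                                  // clear the last element
-//     for (int d = n / 2; d >= 1; d /= 2)            // down-sweep: distribute prefixes
-//       for (int i = 2 * d - 1; i < n; i += 2 * d) {
-//         float t = a[i - d];
-//         a[i - d] = a[i];
-//         a[i] += t;
-//       }
-//   }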
-
-#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
-#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
-
-namespace Eigen {
-namespace TensorSycl {
-namespace internal {
-
-#ifndef EIGEN_SYCL_MAX_GLOBAL_RANGE
-#define EIGEN_SYCL_MAX_GLOBAL_RANGE (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 * 4)
-#endif
-
-template <typename index_t>
-struct ScanParameters {
- // must be a power of 2
- static EIGEN_CONSTEXPR index_t ScanPerThread = 8;
- const index_t total_size;
- const index_t non_scan_size;
- const index_t scan_size;
- const index_t non_scan_stride;
- const index_t scan_stride;
- const index_t panel_threads;
- const index_t group_threads;
- const index_t block_threads;
- const index_t elements_per_group;
- const index_t elements_per_block;
- const index_t loop_range;
-
- ScanParameters(index_t total_size_, index_t non_scan_size_, index_t scan_size_, index_t non_scan_stride_,
- index_t scan_stride_, index_t panel_threads_, index_t group_threads_, index_t block_threads_,
- index_t elements_per_group_, index_t elements_per_block_, index_t loop_range_)
- : total_size(total_size_),
- non_scan_size(non_scan_size_),
- scan_size(scan_size_),
- non_scan_stride(non_scan_stride_),
- scan_stride(scan_stride_),
- panel_threads(panel_threads_),
- group_threads(group_threads_),
- block_threads(block_threads_),
- elements_per_group(elements_per_group_),
- elements_per_block(elements_per_block_),
- loop_range(loop_range_) {}
-};
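-// Worked example: for a ColMajor tensor of shape (8, 1024, 4) scanned along
-// axis 1, the launcher at the bottom of this file produces
-// total_size = 32768, scan_size = 1024, scan_stride = 8, non_scan_size = 8,
-// panel_size = 4 and non_scan_stride = 1.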
-
-enum class scan_step { first, second };
-template <typename Evaluator, typename CoeffReturnType, typename OutAccessor, typename Op, typename Index,
- scan_step stp>
-struct ScanKernelFunctor {
- typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
- LocalAccessor;
- static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2;
-
- LocalAccessor scratch;
- Evaluator dev_eval;
- OutAccessor out_accessor;
- OutAccessor temp_accessor;
- const ScanParameters<Index> scanParameters;
- Op accumulator;
- const bool inclusive;
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanKernelFunctor(LocalAccessor scratch_, const Evaluator dev_eval_,
- OutAccessor out_accessor_, OutAccessor temp_accessor_,
- const ScanParameters<Index> scanParameters_, Op accumulator_,
- const bool inclusive_)
- : scratch(scratch_),
- dev_eval(dev_eval_),
- out_accessor(out_accessor_),
- temp_accessor(temp_accessor_),
- scanParameters(scanParameters_),
- accumulator(accumulator_),
- inclusive(inclusive_) {}
-
- template <scan_step sst = stp, typename Input>
- typename ::Eigen::internal::enable_if<sst == scan_step::first, CoeffReturnType>::type EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE
- read(const Input &inpt, Index global_id) {
- return inpt.coeff(global_id);
- }
-
- template <scan_step sst = stp, typename Input>
- typename ::Eigen::internal::enable_if<sst != scan_step::first, CoeffReturnType>::type EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE
- read(const Input &inpt, Index global_id) {
- return inpt[global_id];
- }
-
- template <scan_step sst = stp, typename InclusiveOp>
- typename ::Eigen::internal::enable_if<sst == scan_step::first>::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- first_step_inclusive_Operation(InclusiveOp inclusive_op) {
- inclusive_op();
- }
-
- template <scan_step sst = stp, typename InclusiveOp>
- typename ::Eigen::internal::enable_if<sst != scan_step::first>::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- first_step_inclusive_Operation(InclusiveOp) {}
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
- auto out_ptr = out_accessor.get_pointer();
- auto tmp_ptr = temp_accessor.get_pointer();
- auto scratch_ptr = scratch.get_pointer().get();
-
- for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
- Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
- Index tmp = data_offset % scanParameters.panel_threads;
- const Index panel_id = data_offset / scanParameters.panel_threads;
- const Index group_id = tmp / scanParameters.group_threads;
- tmp = tmp % scanParameters.group_threads;
- const Index block_id = tmp / scanParameters.block_threads;
- const Index local_id = tmp % scanParameters.block_threads;
- // we put one element per packet in scratch_mem
- const Index scratch_stride = scanParameters.elements_per_block / PacketSize;
- const Index scratch_offset = (itemID.get_local_id(0) / scanParameters.block_threads) * scratch_stride;
- CoeffReturnType private_scan[ScanParameters<Index>::ScanPerThread];
- CoeffReturnType inclusive_scan;
- // The actual panel size is scan_size * non_scan_size;
- // elements_per_panel is rounded up to a power of 2 for the binary tree.
- const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size;
- const Index group_offset = group_id * scanParameters.non_scan_stride;
- // This will be effective when the size is bigger than elements_per_block
- const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride;
- const Index thread_offset = (ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride);
- const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
- Index next_elements = 0;
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
- Index global_id = global_offset + next_elements;
- private_scan[i] = ((((block_id * scanParameters.elements_per_block) +
- (ScanParameters<Index>::ScanPerThread * local_id) + i) < scanParameters.scan_size) &&
- (global_id < scanParameters.total_size))
- ? read(dev_eval, global_id)
- : accumulator.initialize();
- next_elements += scanParameters.scan_stride;
- }
- first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC {
- if (inclusive) {
- inclusive_scan = private_scan[ScanParameters<Index>::ScanPerThread - 1];
- }
- });
- // This loop executes exactly twice (ScanPerThread / PacketSize == 2)
- EIGEN_UNROLL_LOOP
- for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
- Index private_offset = 1;
- // build sum in place up the tree
- EIGEN_UNROLL_LOOP
- for (Index d = PacketSize >> 1; d > 0; d >>= 1) {
- EIGEN_UNROLL_LOOP
- for (Index l = 0; l < d; l++) {
- Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
- Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
- CoeffReturnType accum = accumulator.initialize();
- accumulator.reduce(private_scan[ai], &accum);
- accumulator.reduce(private_scan[bi], &accum);
- private_scan[bi] = accumulator.finalize(accum);
- }
- private_offset *= 2;
- }
- scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset] =
- private_scan[PacketSize - 1 + packetIndex];
- private_scan[PacketSize - 1 + packetIndex] = accumulator.initialize();
- // traverse down tree & build scan
- EIGEN_UNROLL_LOOP
- for (Index d = 1; d < PacketSize; d *= 2) {
- private_offset >>= 1;
- EIGEN_UNROLL_LOOP
- for (Index l = 0; l < d; l++) {
- Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
- Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
- CoeffReturnType accum = accumulator.initialize();
- accumulator.reduce(private_scan[ai], &accum);
- accumulator.reduce(private_scan[bi], &accum);
- private_scan[ai] = private_scan[bi];
- private_scan[bi] = accumulator.finalize(accum);
- }
- }
- }
-
- Index offset = 1;
- // build sum in place up the tree
- for (Index d = scratch_stride >> 1; d > 0; d >>= 1) {
- // Synchronise
- itemID.barrier(cl::sycl::access::fence_space::local_space);
- if (local_id < d) {
- Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
- Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
- CoeffReturnType accum = accumulator.initialize();
- accumulator.reduce(scratch_ptr[ai], &accum);
- accumulator.reduce(scratch_ptr[bi], &accum);
- scratch_ptr[bi] = accumulator.finalize(accum);
- }
- offset *= 2;
- }
- // Synchronise
- itemID.barrier(cl::sycl::access::fence_space::local_space);
- // hand off the per-block total to the next kernel, if one is needed
- if (local_id == 0) {
- if (((scanParameters.elements_per_group / scanParameters.elements_per_block) > 1)) {
- const Index temp_id = panel_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) *
- scanParameters.non_scan_size +
- group_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) +
- block_id;
- tmp_ptr[temp_id] = scratch_ptr[scratch_stride - 1 + scratch_offset];
- }
- // clear the last element
- scratch_ptr[scratch_stride - 1 + scratch_offset] = accumulator.initialize();
- }
- // traverse down tree & build scan
- for (Index d = 1; d < scratch_stride; d *= 2) {
- offset >>= 1;
- // Synchronise
- itemID.barrier(cl::sycl::access::fence_space::local_space);
- if (local_id < d) {
- Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
- Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
- CoeffReturnType accum = accumulator.initialize();
- accumulator.reduce(scratch_ptr[ai], &accum);
- accumulator.reduce(scratch_ptr[bi], &accum);
- scratch_ptr[ai] = scratch_ptr[bi];
- scratch_ptr[bi] = accumulator.finalize(accum);
- }
- }
- // Synchronise
- itemID.barrier(cl::sycl::access::fence_space::local_space);
- // This loop executes exactly twice (ScanPerThread / PacketSize == 2)
- EIGEN_UNROLL_LOOP
- for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
- EIGEN_UNROLL_LOOP
- for (Index i = 0; i < PacketSize; i++) {
- CoeffReturnType accum = private_scan[packetIndex + i];
- accumulator.reduce(scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset], &accum);
- private_scan[packetIndex + i] = accumulator.finalize(accum);
- }
- }
- first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC {
- if (inclusive) {
- accumulator.reduce(private_scan[ScanParameters<Index>::ScanPerThread - 1], &inclusive_scan);
- private_scan[0] = accumulator.finalize(inclusive_scan);
- }
- });
- next_elements = 0;
- // write out the first set of private results
- EIGEN_UNROLL_LOOP
- for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
- Index global_id = global_offset + next_elements;
- if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) <
- scanParameters.scan_size) &&
- (global_id < scanParameters.total_size)) {
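- // For inclusive scans the result of element i lives in slot
- // (i + 1) % ScanPerThread (slot 0 was overwritten with the inclusive
- // total above); for exclusive scans it lives in slot i.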
- Index private_id = (i * !inclusive) + (((i + 1) % ScanParameters<Index>::ScanPerThread) * (inclusive));
- out_ptr[global_id] = private_scan[private_id];
- }
- next_elements += scanParameters.scan_stride;
- }
- } // end for loop
- }
-};
-
-template <typename CoeffReturnType, typename InAccessor, typename OutAccessor, typename Op, typename Index>
-struct ScanAdjustmentKernelFunctor {
- typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
- LocalAccessor;
- static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2;
- InAccessor in_accessor;
- OutAccessor out_accessor;
- const ScanParameters<Index> scanParameters;
- Op accumulator;
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanAdjustmentKernelFunctor(LocalAccessor, InAccessor in_accessor_,
- OutAccessor out_accessor_,
- const ScanParameters<Index> scanParameters_,
- Op accumulator_)
- : in_accessor(in_accessor_),
- out_accessor(out_accessor_),
- scanParameters(scanParameters_),
- accumulator(accumulator_) {}
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
- auto in_ptr = in_accessor.get_pointer();
- auto out_ptr = out_accessor.get_pointer();
-
- for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
- Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
- Index tmp = data_offset % scanParameters.panel_threads;
- const Index panel_id = data_offset / scanParameters.panel_threads;
- const Index group_id = tmp / scanParameters.group_threads;
- tmp = tmp % scanParameters.group_threads;
- const Index block_id = tmp / scanParameters.block_threads;
- const Index local_id = tmp % scanParameters.block_threads;
-
- // The actual panel size is scan_size * non_scan_size;
- // elements_per_panel is rounded up to a power of 2 for the binary tree.
- const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size;
- const Index group_offset = group_id * scanParameters.non_scan_stride;
- // This will be effective when the size is bigger than elements_per_block
- const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride;
- const Index thread_offset = ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride;
-
- const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
- const Index block_size = scanParameters.elements_per_group / scanParameters.elements_per_block;
- const Index in_id = (panel_id * block_size * scanParameters.non_scan_size) + (group_id * block_size) + block_id;
- CoeffReturnType adjust_val = in_ptr[in_id];
-
- Index next_elements = 0;
- EIGEN_UNROLL_LOOP
- for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
- Index global_id = global_offset + next_elements;
- if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) <
- scanParameters.scan_size) &&
- (global_id < scanParameters.total_size)) {
- CoeffReturnType accum = adjust_val;
- accumulator.reduce(out_ptr[global_id], &accum);
- out_ptr[global_id] = accumulator.finalize(accum);
- }
- next_elements += scanParameters.scan_stride;
- }
- }
- }
-};
-
-template <typename Index>
-struct ScanInfo {
- const Index &total_size;
- const Index &scan_size;
- const Index &panel_size;
- const Index &non_scan_size;
- const Index &scan_stride;
- const Index &non_scan_stride;
-
- Index max_elements_per_block;
- Index block_size;
- Index panel_threads;
- Index group_threads;
- Index block_threads;
- Index elements_per_group;
- Index elements_per_block;
- Index loop_range;
- Index global_range;
- Index local_range;
- const Eigen::SyclDevice &dev;
- EIGEN_STRONG_INLINE ScanInfo(const Index &total_size_, const Index &scan_size_, const Index &panel_size_,
- const Index &non_scan_size_, const Index &scan_stride_, const Index &non_scan_stride_,
- const Eigen::SyclDevice &dev_)
- : total_size(total_size_),
- scan_size(scan_size_),
- panel_size(panel_size_),
- non_scan_size(non_scan_size_),
- scan_stride(scan_stride_),
- non_scan_stride(non_scan_stride_),
- dev(dev_) {
- // must be a power of 2
- local_range = std::min(Index(dev.getNearestPowerOfTwoWorkGroupSize()),
- Index(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1));
-
- max_elements_per_block = local_range * ScanParameters<Index>::ScanPerThread;
-
- elements_per_group =
- dev.getPowerOfTwo(Index(roundUp(Index(scan_size), ScanParameters<Index>::ScanPerThread)), true);
- const Index elements_per_panel = elements_per_group * non_scan_size;
- elements_per_block = std::min(Index(elements_per_group), Index(max_elements_per_block));
- panel_threads = elements_per_panel / ScanParameters<Index>::ScanPerThread;
- group_threads = elements_per_group / ScanParameters<Index>::ScanPerThread;
- block_threads = elements_per_block / ScanParameters<Index>::ScanPerThread;
- block_size = elements_per_group / elements_per_block;
-#ifdef EIGEN_SYCL_MAX_GLOBAL_RANGE
- const Index max_threads = std::min(Index(panel_threads * panel_size), Index(EIGEN_SYCL_MAX_GLOBAL_RANGE));
-#else
- const Index max_threads = panel_threads * panel_size;
-#endif
- global_range = roundUp(max_threads, local_range);
- loop_range = Index(
- std::ceil(double(elements_per_panel * panel_size) / (global_range * ScanParameters<Index>::ScanPerThread)));
- }
- inline ScanParameters<Index> get_scan_parameter() {
- return ScanParameters<Index>(total_size, non_scan_size, scan_size, non_scan_stride, scan_stride, panel_threads,
- group_threads, block_threads, elements_per_group, elements_per_block, loop_range);
- }
- inline cl::sycl::nd_range<1> get_thread_range() {
- return cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
- }
-};
-
-template <typename EvaluatorPointerType, typename CoeffReturnType, typename Reducer, typename Index>
-struct SYCLAdjustBlockOffset {
- EIGEN_STRONG_INLINE static void adjust_scan_block_offset(EvaluatorPointerType in_ptr, EvaluatorPointerType out_ptr,
- Reducer &accumulator, const Index total_size,
- const Index scan_size, const Index panel_size,
- const Index non_scan_size, const Index scan_stride,
- const Index non_scan_stride, const Eigen::SyclDevice &dev) {
- auto scan_info =
- ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);
-
- typedef ScanAdjustmentKernelFunctor<CoeffReturnType, EvaluatorPointerType, EvaluatorPointerType, Reducer, Index>
- AdjustFunctor;
- dev.template unary_kernel_launcher<CoeffReturnType, AdjustFunctor>(in_ptr, out_ptr, scan_info.get_thread_range(),
- scan_info.max_elements_per_block,
- scan_info.get_scan_parameter(), accumulator);
- }
-};
-
-template <typename CoeffReturnType, scan_step stp>
-struct ScanLauncher_impl {
- template <typename Input, typename EvaluatorPointerType, typename Reducer, typename Index>
- EIGEN_STRONG_INLINE static void scan_block(Input in_ptr, EvaluatorPointerType out_ptr, Reducer &accumulator,
- const Index total_size, const Index scan_size, const Index panel_size,
- const Index non_scan_size, const Index scan_stride,
- const Index non_scan_stride, const bool inclusive,
- const Eigen::SyclDevice &dev) {
- auto scan_info =
- ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);
- const Index temp_pointer_size = scan_info.block_size * non_scan_size * panel_size;
- const Index scratch_size = scan_info.max_elements_per_block / (ScanParameters<Index>::ScanPerThread / 2);
- CoeffReturnType *temp_pointer =
- static_cast<CoeffReturnType *>(dev.allocate_temp(temp_pointer_size * sizeof(CoeffReturnType)));
- EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
-
- typedef ScanKernelFunctor<Input, CoeffReturnType, EvaluatorPointerType, Reducer, Index, stp> ScanFunctor;
- dev.template binary_kernel_launcher<CoeffReturnType, ScanFunctor>(
- in_ptr, out_ptr, tmp_global_accessor, scan_info.get_thread_range(), scratch_size,
- scan_info.get_scan_parameter(), accumulator, inclusive);
-
- if (scan_info.block_size > 1) {
- ScanLauncher_impl<CoeffReturnType, scan_step::second>::scan_block(
- tmp_global_accessor, tmp_global_accessor, accumulator, temp_pointer_size, scan_info.block_size, panel_size,
- non_scan_size, Index(1), scan_info.block_size, false, dev);
-
- SYCLAdjustBlockOffset<EvaluatorPointerType, CoeffReturnType, Reducer, Index>::adjust_scan_block_offset(
- tmp_global_accessor, out_ptr, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride,
- non_scan_stride, dev);
- }
- dev.deallocate_temp(temp_pointer);
- }
-};
-
-} // namespace internal
-} // namespace TensorSycl
-namespace internal {
-template <typename Self, typename Reducer, bool vectorize>
-struct ScanLauncher<Self, Reducer, Eigen::SyclDevice, vectorize> {
- typedef typename Self::Index Index;
- typedef typename Self::CoeffReturnType CoeffReturnType;
- typedef typename Self::Storage Storage;
- typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
- void operator()(Self &self, EvaluatorPointerType data) {
- const Index total_size = internal::array_prod(self.dimensions());
- const Index scan_size = self.size();
- const Index scan_stride = self.stride();
- // this is the scan op (e.g. sum, product, ...)
- auto accumulator = self.accumulator();
- auto inclusive = !self.exclusive();
- auto consume_dim = self.consume_dim();
- auto dev = self.device();
-
- auto dims = self.inner().dimensions();
-
- Index non_scan_size = 1;
- Index panel_size = 1;
- if (static_cast<int>(Self::Layout) == static_cast<int>(ColMajor)) {
- for (int i = 0; i < consume_dim; i++) {
- non_scan_size *= dims[i];
- }
- for (int i = consume_dim + 1; i < Self::NumDims; i++) {
- panel_size *= dims[i];
- }
- } else {
- for (int i = Self::NumDims - 1; i > consume_dim; i--) {
- non_scan_size *= dims[i];
- }
- for (int i = consume_dim - 1; i >= 0; i--) {
- panel_size *= dims[i];
- }
- }
- const Index non_scan_stride = (scan_stride > 1) ? 1 : scan_size;
- auto eval_impl = self.inner();
- TensorSycl::internal::ScanLauncher_impl<CoeffReturnType, TensorSycl::internal::scan_step::first>::scan_block(
- eval_impl, data, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride,
- inclusive, dev);
- }
-};
-} // namespace internal
-} // namespace Eigen
-
-#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorShuffling.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorShuffling.h
deleted file mode 100644
index e5e5efd..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorShuffling.h
+++ /dev/null
@@ -1,471 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H
-#define EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H
-
-namespace Eigen {
-
-/** \class TensorShuffling
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Tensor shuffling class.
- *
- * Applies a permutation to the dimensions of a tensor expression.
- */
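-// TensorShufflingOp is normally created through TensorBase::shuffle().
-// A minimal usage sketch:
-//
-//   Eigen::Tensor<float, 3> input(20, 30, 50);
-//   input.setRandom();
-//   Eigen::array<int, 3> shuffle{{1, 2, 0}};
-//   Eigen::Tensor<float, 3> output = input.shuffle(shuffle);
-//   // output dimensions are (30, 50, 20): output dim i is input dim shuffle[i]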
-namespace internal {
-template<typename Shuffle, typename XprType>
-struct traits<TensorShufflingOp<Shuffle, XprType> > : public traits<XprType>
-{
- typedef typename XprType::Scalar Scalar;
- typedef traits<XprType> XprTraits;
- typedef typename XprTraits::StorageKind StorageKind;
- typedef typename XprTraits::Index Index;
- typedef typename XprType::Nested Nested;
- typedef typename remove_reference<Nested>::type _Nested;
- static const int NumDimensions = XprTraits::NumDimensions;
- static const int Layout = XprTraits::Layout;
- typedef typename XprTraits::PointerType PointerType;
-};
-
-template<typename Shuffle, typename XprType>
-struct eval<TensorShufflingOp<Shuffle, XprType>, Eigen::Dense>
-{
- typedef const TensorShufflingOp<Shuffle, XprType>& type;
-};
-
-template<typename Shuffle, typename XprType>
-struct nested<TensorShufflingOp<Shuffle, XprType>, 1, typename eval<TensorShufflingOp<Shuffle, XprType> >::type>
-{
- typedef TensorShufflingOp<Shuffle, XprType> type;
-};
-
-} // end namespace internal
-
-
-
-template<typename Shuffle, typename XprType>
-class TensorShufflingOp : public TensorBase<TensorShufflingOp<Shuffle, XprType> >
-{
- public:
- typedef TensorBase<TensorShufflingOp<Shuffle, XprType> > Base;
- typedef typename Eigen::internal::traits<TensorShufflingOp>::Scalar Scalar;
- typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename Eigen::internal::nested<TensorShufflingOp>::type Nested;
- typedef typename Eigen::internal::traits<TensorShufflingOp>::StorageKind StorageKind;
- typedef typename Eigen::internal::traits<TensorShufflingOp>::Index Index;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp(const XprType& expr, const Shuffle& shfl)
- : m_xpr(expr), m_shuffle(shfl) {}
-
- EIGEN_DEVICE_FUNC
- const Shuffle& shufflePermutation() const { return m_shuffle; }
-
- EIGEN_DEVICE_FUNC
- const typename internal::remove_all<typename XprType::Nested>::type&
- expression() const { return m_xpr; }
-
- EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorShufflingOp)
-
-
- protected:
- typename XprType::Nested m_xpr;
- const Shuffle m_shuffle;
-};
-
-
-// Eval as rvalue
-template<typename Shuffle, typename ArgType, typename Device>
-struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
-{
- typedef TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> Self;
- typedef TensorShufflingOp<Shuffle, ArgType> XprType;
- typedef typename XprType::Index Index;
- static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
- typedef DSizes<Index, NumDims> Dimensions;
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
- typedef StorageMemory<CoeffReturnType, Device> Storage;
- typedef typename Storage::Type EvaluatorPointerType;
-
- enum {
- IsAligned = false,
- PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
- BlockAccess = TensorEvaluator<ArgType, Device>::RawAccess,
- PreferBlockAccess = true,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = false
- };
-
- typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
- typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
-
- typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
- Layout, Index>
- TensorBlock;
- //===--------------------------------------------------------------------===//
-
- EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_device(device),
- m_impl(op.expression(), device)
- {
- const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
- const Shuffle& shuffle = op.shufflePermutation();
- m_is_identity = true;
- for (int i = 0; i < NumDims; ++i) {
- m_shuffle[i] = static_cast<int>(shuffle[i]);
- m_dimensions[i] = input_dims[shuffle[i]];
- m_inverseShuffle[shuffle[i]] = i;
- if (m_is_identity && shuffle[i] != i) {
- m_is_identity = false;
- }
- }
-
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- m_unshuffledInputStrides[0] = 1;
- m_outputStrides[0] = 1;
-
- for (int i = 1; i < NumDims; ++i) {
- m_unshuffledInputStrides[i] =
- m_unshuffledInputStrides[i - 1] * input_dims[i - 1];
- m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
- m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(
- m_outputStrides[i] > 0 ? m_outputStrides[i] : Index(1));
- }
- } else {
- m_unshuffledInputStrides[NumDims - 1] = 1;
- m_outputStrides[NumDims - 1] = 1;
- for (int i = NumDims - 2; i >= 0; --i) {
- m_unshuffledInputStrides[i] =
- m_unshuffledInputStrides[i + 1] * input_dims[i + 1];
- m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
- m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(
- m_outputStrides[i] > 0 ? m_outputStrides[i] : Index(1));
- }
- }
-
- for (int i = 0; i < NumDims; ++i) {
- m_inputStrides[i] = m_unshuffledInputStrides[shuffle[i]];
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
-
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
- m_impl.evalSubExprsIfNeeded(NULL);
- return true;
- }
-
-#ifdef EIGEN_USE_THREADS
- template <typename EvalSubExprsCallback>
- EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
- EvaluatorPointerType, EvalSubExprsCallback done) {
- m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
- }
-#endif // EIGEN_USE_THREADS
-
- EIGEN_STRONG_INLINE void cleanup() {
- m_impl.cleanup();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
- {
- if (m_is_identity) {
- return m_impl.coeff(index);
- } else {
- return m_impl.coeff(srcCoeff(index));
- }
- }
-
- template <int LoadMode, typename Self, bool ImplPacketAccess>
- struct PacketLoader {
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- static PacketReturnType Run(const Self& self, Index index) {
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < PacketSize; ++i) {
- values[i] = self.coeff(index + i);
- }
- PacketReturnType rslt = internal::pload<PacketReturnType>(values);
- return rslt;
- }
- };
-
- template<int LoadMode, typename Self>
- struct PacketLoader<LoadMode, Self, true> {
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- static PacketReturnType Run(const Self& self, Index index) {
- if (self.m_is_identity) {
- return self.m_impl.template packet<LoadMode>(index);
- } else {
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < PacketSize; ++i) {
- values[i] = self.coeff(index + i);
- }
- PacketReturnType rslt = internal::pload<PacketReturnType>(values);
- return rslt;
- }
- }
- };
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
- {
- EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
- return PacketLoader<LoadMode, Self, TensorEvaluator<ArgType, Device>::PacketAccess>::Run(*this, index);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- internal::TensorBlockResourceRequirements getResourceRequirements() const {
- static const int inner_dim =
- Layout == static_cast<int>(ColMajor) ? 0 : NumDims - 1;
-
- const size_t target_size = m_device.firstLevelCacheSize();
- const bool inner_dim_shuffled = m_shuffle[inner_dim] != inner_dim;
-
- // A shuffled inner dimension leads to random memory accesses, which are not
- // captured by the default cost model's bytes loaded/stored. We add this cost
- // explicitly. The number of cycles was picked based on benchmarks.
- // TODO(ezhulenev): This number was picked based on very questionable
- // benchmarks; add benchmarks that are representative of real workloads.
- using BlockRequirements = internal::TensorBlockResourceRequirements;
- if (inner_dim_shuffled) {
- return BlockRequirements::uniform<Scalar>(target_size)
- .addCostPerCoeff({0, 0, NumDims * 28});
- } else {
- return BlockRequirements::skewed<Scalar>(target_size);
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
- block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
- bool root_of_expr_ast = false) const {
- assert(m_impl.data() != NULL);
-
- typedef internal::TensorBlockIO<ScalarNoConst, Index, NumDims, Layout>
- TensorBlockIO;
- typedef typename TensorBlockIO::Dst TensorBlockIODst;
- typedef typename TensorBlockIO::Src TensorBlockIOSrc;
-
- const typename TensorBlock::Storage block_storage =
- TensorBlock::prepareStorage(
- desc, scratch, /*allow_strided_storage=*/root_of_expr_ast);
-
- typename TensorBlockIO::Dimensions input_strides(m_unshuffledInputStrides);
- TensorBlockIOSrc src(input_strides, m_impl.data(), srcCoeff(desc.offset()));
-
- TensorBlockIODst dst(block_storage.dimensions(), block_storage.strides(),
- block_storage.data());
-
- typename TensorBlockIO::DimensionsMap dst_to_src_dim_map(m_shuffle);
- TensorBlockIO::Copy(dst, src, dst_to_src_dim_map);
-
- return block_storage.AsTensorMaterializedBlock();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
- const double compute_cost = m_is_identity ? TensorOpCost::AddCost<Index>() :
- NumDims * (2 * TensorOpCost::AddCost<Index>() +
- 2 * TensorOpCost::MulCost<Index>() +
- TensorOpCost::DivCost<Index>());
- return m_impl.costPerCoeff(vectorized) +
- TensorOpCost(0, 0, compute_cost, m_is_identity /* vectorized */, PacketSize);
- }
-
- EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
-
-#ifdef EIGEN_USE_SYCL
- // binding placeholder accessors to a command group handler for SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
- m_impl.bind(cgh);
- }
-#endif
- protected:
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index GetBlockOutputIndex(
- Index input_index,
- const DSizes<Index, NumDims>& input_block_strides,
- const DSizes<Index, NumDims>& output_block_strides,
- const DSizes<internal::TensorIntDivisor<Index>, NumDims>& fast_input_block_strides) const {
- Index output_index = 0;
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- for (int i = NumDims - 1; i > 0; --i) {
- const Index idx = input_index / fast_input_block_strides[i];
- output_index += idx * output_block_strides[m_inverseShuffle[i]];
- input_index -= idx * input_block_strides[i];
- }
- return output_index + input_index *
- output_block_strides[m_inverseShuffle[0]];
- } else {
- for (int i = 0; i < NumDims - 1; ++i) {
- const Index idx = input_index / fast_input_block_strides[i];
- output_index += idx * output_block_strides[m_inverseShuffle[i]];
- input_index -= idx * input_block_strides[i];
- }
- return output_index + input_index *
- output_block_strides[m_inverseShuffle[NumDims - 1]];
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const {
- Index inputIndex = 0;
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- for (int i = NumDims - 1; i > 0; --i) {
- const Index idx = index / m_fastOutputStrides[i];
- inputIndex += idx * m_inputStrides[i];
- index -= idx * m_outputStrides[i];
- }
- return inputIndex + index * m_inputStrides[0];
- } else {
- for (int i = 0; i < NumDims - 1; ++i) {
- const Index idx = index / m_fastOutputStrides[i];
- inputIndex += idx * m_inputStrides[i];
- index -= idx * m_outputStrides[i];
- }
- return inputIndex + index * m_inputStrides[NumDims - 1];
- }
- }
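- // Worked example: for a ColMajor 2D transpose (shuffle {1, 0}) of a (3, 4)
- // input, the output is (4, 3) and m_inputStrides = {3, 1}; srcCoeff(6)
- // (output coords (2, 1)) yields 1 * 1 + 2 * 3 = 7, i.e. input coords (1, 2).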
-
- Dimensions m_dimensions;
- bool m_is_identity;
- array<int, NumDims> m_shuffle;
- array<Index, NumDims> m_inverseShuffle; // TODO(ezhulenev): Make it int type.
- array<Index, NumDims> m_outputStrides;
- array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
- array<Index, NumDims> m_inputStrides;
- array<Index, NumDims> m_unshuffledInputStrides;
-
- const Device EIGEN_DEVICE_REF m_device;
- TensorEvaluator<ArgType, Device> m_impl;
-};
-
-
-// Eval as lvalue
-template<typename Shuffle, typename ArgType, typename Device>
-struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
- : public TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
-{
- typedef TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> Base;
-
- typedef TensorShufflingOp<Shuffle, ArgType> XprType;
- typedef typename XprType::Index Index;
- static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
- typedef DSizes<Index, NumDims> Dimensions;
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
-
- enum {
- IsAligned = false,
- PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
- BlockAccess = TensorEvaluator<ArgType, Device>::RawAccess,
- PreferBlockAccess = true,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- RawAccess = false
- };
-
- typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
- //===--------------------------------------------------------------------===//
-
- EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : Base(op, device)
- { }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
- {
- return this->m_impl.coeffRef(this->srcCoeff(index));
- }
-
- template <int StoreMode> EIGEN_STRONG_INLINE
- void writePacket(Index index, const PacketReturnType& x)
- {
- EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
-
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
- internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < PacketSize; ++i) {
- this->coeffRef(index+i) = values[i];
- }
- }
-
- template <typename TensorBlock>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
- const TensorBlockDesc& desc, const TensorBlock& block) {
- eigen_assert(this->m_impl.data() != NULL);
-
- typedef internal::TensorBlockIO<ScalarNoConst, Index, NumDims, Layout>
- TensorBlockIO;
- typedef typename TensorBlockIO::Dst TensorBlockIODst;
- typedef typename TensorBlockIO::Src TensorBlockIOSrc;
-
- const Scalar* block_buffer = block.data();
-
- // TODO(ezhulenev): TensorBlockIO should be able to read from any Eigen
- // expression with coefficient and packet access as `src`.
- void* mem = NULL;
- if (block_buffer == NULL) {
- mem = this->m_device.allocate(desc.size() * sizeof(Scalar));
- ScalarNoConst* buf = static_cast<ScalarNoConst*>(mem);
-
- typedef internal::TensorBlockAssignment<
- ScalarNoConst, NumDims, typename TensorBlock::XprType, Index>
- TensorBlockAssignment;
-
- TensorBlockAssignment::Run(
- TensorBlockAssignment::target(
- desc.dimensions(), internal::strides<Layout>(desc.dimensions()),
- buf),
- block.expr());
-
- block_buffer = buf;
- }
-
- // Read from block.
- TensorBlockIOSrc src(internal::strides<Layout>(desc.dimensions()),
- block_buffer);
-
- // Write to the output buffer.
- typename TensorBlockIO::Dimensions output_strides(
- this->m_unshuffledInputStrides);
- typename TensorBlockIO::Dimensions output_dimensions;
- for (int i = 0; i < NumDims; ++i) {
- output_dimensions[this->m_shuffle[i]] = desc.dimension(i);
- }
- TensorBlockIODst dst(output_dimensions, output_strides, this->m_impl.data(),
- this->srcCoeff(desc.offset()));
-
- // Reorder dimensions according to the shuffle.
- typename TensorBlockIO::DimensionsMap dst_to_src_dim_map;
- for (int i = 0; i < NumDims; ++i) {
- dst_to_src_dim_map[i] = static_cast<int>(this->m_inverseShuffle[i]);
- }
- TensorBlockIO::Copy(dst, src, dst_to_src_dim_map);
-
- // Deallocate temporary buffer used for the block materialization.
- if (mem != NULL) this->m_device.deallocate(mem);
- }
-};
-
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorStorage.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorStorage.h
deleted file mode 100644
index 5ff0880..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorStorage.h
+++ /dev/null
@@ -1,161 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
-// Copyright (C) 2014-2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSORSTORAGE_H
-#define EIGEN_CXX11_TENSOR_TENSORSTORAGE_H
-
-#ifdef EIGEN_TENSOR_STORAGE_CTOR_PLUGIN
- #define EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN EIGEN_TENSOR_STORAGE_CTOR_PLUGIN;
-#else
- #define EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN
-#endif
-
-namespace Eigen {
-
-/** \internal
- *
- * \class TensorStorage
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Stores the data of a tensor
- *
- * This class stores the data of fixed-size, dynamic-size or mixed tensors
- * as compactly as possible.
- *
- * \sa Tensor
- */
-template<typename T, typename Dimensions, int Options> class TensorStorage;
-
-
-// Pure fixed-size storage
-template<typename T, typename FixedDimensions, int Options_>
-class TensorStorage
-{
- private:
- static const std::size_t Size = FixedDimensions::total_size;
-
- // Allocate an array of size at least one to prevent compiler warnings.
- static const std::size_t MinSize = max_n_1<Size>::size;
- EIGEN_ALIGN_MAX T m_data[MinSize];
-
- public:
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE TensorStorage() {
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE T *data() { return m_data; }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const T *data() const { return m_data; }
-
- static EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const FixedDimensions& dimensions()
- {
- static const FixedDimensions* singleton_dimensions = new FixedDimensions();
- return *singleton_dimensions;
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE DenseIndex size() const { return Size; }
-};
-
-// pure dynamic
-template<typename T, typename IndexType, int NumIndices_, int Options_>
-class TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_>
-{
- public:
- typedef IndexType Index;
- typedef DSizes<IndexType, NumIndices_> Dimensions;
- typedef TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_> Self;
-
- EIGEN_DEVICE_FUNC TensorStorage() : m_data(0), m_dimensions() {
- if (NumIndices_ == 0) {
- m_data = internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(1);
- }
- }
- EIGEN_DEVICE_FUNC TensorStorage(internal::constructor_without_unaligned_array_assert)
- : m_data(0), m_dimensions(internal::template repeat<NumIndices_, Index>(0)) {}
- EIGEN_DEVICE_FUNC TensorStorage(Index size, const array<Index, NumIndices_>& dimensions)
- : m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(size)), m_dimensions(dimensions)
- { EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN }
-
-#if EIGEN_HAS_VARIADIC_TEMPLATES
- template <typename... DenseIndex>
- EIGEN_DEVICE_FUNC TensorStorage(DenseIndex... indices) : m_dimensions(indices...) {
- m_data = internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(internal::array_prod(m_dimensions));
- }
-#endif
-
- EIGEN_DEVICE_FUNC TensorStorage(const Self& other)
- : m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(internal::array_prod(other.m_dimensions)))
- , m_dimensions(other.m_dimensions)
- {
- internal::smart_copy(other.m_data, other.m_data+internal::array_prod(other.m_dimensions), m_data);
- }
- EIGEN_DEVICE_FUNC Self& operator=(const Self& other)
- {
- if (this != &other) {
- Self tmp(other);
- this->swap(tmp);
- }
- return *this;
- }
-
-#if EIGEN_HAS_RVALUE_REFERENCES
- EIGEN_DEVICE_FUNC TensorStorage(Self&& other) : TensorStorage()
- {
- *this = std::move(other);
- }
-
- EIGEN_DEVICE_FUNC Self& operator=(Self&& other)
- {
- numext::swap(m_data, other.m_data);
- numext::swap(m_dimensions, other.m_dimensions);
- return *this;
- }
-#endif
-
- EIGEN_DEVICE_FUNC ~TensorStorage() { internal::conditional_aligned_delete_auto<T,(Options_&DontAlign)==0>(m_data, internal::array_prod(m_dimensions)); }
- EIGEN_DEVICE_FUNC void swap(Self& other)
- { numext::swap(m_data,other.m_data); numext::swap(m_dimensions,other.m_dimensions); }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {return m_dimensions;}
-
- EIGEN_DEVICE_FUNC void resize(Index size, const array<Index, NumIndices_>& nbDimensions)
- {
- const Index currentSz = internal::array_prod(m_dimensions);
- if(size != currentSz)
- {
- internal::conditional_aligned_delete_auto<T,(Options_&DontAlign)==0>(m_data, currentSz);
- if (size)
- m_data = internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(size);
- else if (NumIndices_ == 0) {
- m_data = internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(1);
- }
- else
- m_data = 0;
- EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
- }
- m_dimensions = nbDimensions;
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T *data() { return m_data; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T *data() const { return m_data; }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); }
-
- private:
- T *m_data;
- Dimensions m_dimensions;
-};
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSORSTORAGE_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorStriding.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorStriding.h
deleted file mode 100644
index 2f62a66..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorStriding.h
+++ /dev/null
@@ -1,346 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H
-#define EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H
-
-namespace Eigen {
-
-/** \class TensorStriding
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Tensor striding class.
- *
- * Samples a tensor expression at a fixed interval along each dimension.
- */
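-// TensorStridingOp is normally created through TensorBase::stride().
-// A minimal usage sketch:
-//
-//   Eigen::Tensor<float, 2> input(6, 9);
-//   input.setRandom();
-//   Eigen::array<Eigen::DenseIndex, 2> strides{{2, 3}};
-//   Eigen::Tensor<float, 2> output = input.stride(strides);
-//   // output has dimensions (3, 3); output(i, j) == input(2 * i, 3 * j)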
-namespace internal {
-template<typename Strides, typename XprType>
-struct traits<TensorStridingOp<Strides, XprType> > : public traits<XprType>
-{
- typedef typename XprType::Scalar Scalar;
- typedef traits<XprType> XprTraits;
- typedef typename XprTraits::StorageKind StorageKind;
- typedef typename XprTraits::Index Index;
- typedef typename XprType::Nested Nested;
- typedef typename remove_reference<Nested>::type _Nested;
- static const int NumDimensions = XprTraits::NumDimensions;
- static const int Layout = XprTraits::Layout;
- typedef typename XprTraits::PointerType PointerType;
-};
-
-template<typename Strides, typename XprType>
-struct eval<TensorStridingOp<Strides, XprType>, Eigen::Dense>
-{
- typedef const TensorStridingOp<Strides, XprType> EIGEN_DEVICE_REF type;
-};
-
-template<typename Strides, typename XprType>
-struct nested<TensorStridingOp<Strides, XprType>, 1, typename eval<TensorStridingOp<Strides, XprType> >::type>
-{
- typedef TensorStridingOp<Strides, XprType> type;
-};
-
-} // end namespace internal
-
-
-
-template<typename Strides, typename XprType>
-class TensorStridingOp : public TensorBase<TensorStridingOp<Strides, XprType> >
-{
- public:
- typedef TensorBase<TensorStridingOp<Strides, XprType> > Base;
- typedef typename Eigen::internal::traits<TensorStridingOp>::Scalar Scalar;
- typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename Eigen::internal::nested<TensorStridingOp>::type Nested;
- typedef typename Eigen::internal::traits<TensorStridingOp>::StorageKind StorageKind;
- typedef typename Eigen::internal::traits<TensorStridingOp>::Index Index;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingOp(const XprType& expr, const Strides& dims)
- : m_xpr(expr), m_dims(dims) {}
-
- EIGEN_DEVICE_FUNC
- const Strides& strides() const { return m_dims; }
-
- EIGEN_DEVICE_FUNC
- const typename internal::remove_all<typename XprType::Nested>::type&
- expression() const { return m_xpr; }
-
- EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorStridingOp)
-
- protected:
- typename XprType::Nested m_xpr;
- const Strides m_dims;
-};
-
-
-// Eval as rvalue
-template<typename Strides, typename ArgType, typename Device>
-struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
-{
- typedef TensorStridingOp<Strides, ArgType> XprType;
- typedef typename XprType::Index Index;
- static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
- typedef DSizes<Index, NumDims> Dimensions;
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
- typedef StorageMemory<CoeffReturnType, Device> Storage;
- typedef typename Storage::Type EvaluatorPointerType;
-
- enum {
- IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
- PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = false,
- PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = false
- };
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlock;
- //===--------------------------------------------------------------------===//
-
- EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_impl(op.expression(), device)
- {
- m_dimensions = m_impl.dimensions();
- for (int i = 0; i < NumDims; ++i) {
- m_dimensions[i] = Eigen::numext::ceil(static_cast<float>(m_dimensions[i]) / op.strides()[i]);
- }
-
- const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- m_outputStrides[0] = 1;
- m_inputStrides[0] = 1;
- for (int i = 1; i < NumDims; ++i) {
- m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
- m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
- m_inputStrides[i-1] *= op.strides()[i-1];
- }
- m_inputStrides[NumDims-1] *= op.strides()[NumDims-1];
- } else { // RowMajor
- m_outputStrides[NumDims-1] = 1;
- m_inputStrides[NumDims-1] = 1;
- for (int i = NumDims - 2; i >= 0; --i) {
- m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
- m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
- m_inputStrides[i+1] *= op.strides()[i+1];
- }
- m_inputStrides[0] *= op.strides()[0];
- }
- }
-
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
-
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType/*data*/) {
- m_impl.evalSubExprsIfNeeded(NULL);
- return true;
- }
- EIGEN_STRONG_INLINE void cleanup() {
- m_impl.cleanup();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
- {
- return m_impl.coeff(srcCoeff(index));
- }
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
- {
- EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
-
- Index inputIndices[] = {0, 0};
- Index indices[] = {index, index + PacketSize - 1};
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- EIGEN_UNROLL_LOOP
- for (int i = NumDims - 1; i > 0; --i) {
- const Index idx0 = indices[0] / m_outputStrides[i];
- const Index idx1 = indices[1] / m_outputStrides[i];
- inputIndices[0] += idx0 * m_inputStrides[i];
- inputIndices[1] += idx1 * m_inputStrides[i];
- indices[0] -= idx0 * m_outputStrides[i];
- indices[1] -= idx1 * m_outputStrides[i];
- }
- inputIndices[0] += indices[0] * m_inputStrides[0];
- inputIndices[1] += indices[1] * m_inputStrides[0];
- } else { // RowMajor
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < NumDims - 1; ++i) {
- const Index idx0 = indices[0] / m_outputStrides[i];
- const Index idx1 = indices[1] / m_outputStrides[i];
- inputIndices[0] += idx0 * m_inputStrides[i];
- inputIndices[1] += idx1 * m_inputStrides[i];
- indices[0] -= idx0 * m_outputStrides[i];
- indices[1] -= idx1 * m_outputStrides[i];
- }
- inputIndices[0] += indices[0] * m_inputStrides[NumDims-1];
- inputIndices[1] += indices[1] * m_inputStrides[NumDims-1];
- }
- if (inputIndices[1] - inputIndices[0] == PacketSize - 1) {
- PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]);
- return rslt;
- }
- else {
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
- values[0] = m_impl.coeff(inputIndices[0]);
- values[PacketSize-1] = m_impl.coeff(inputIndices[1]);
- EIGEN_UNROLL_LOOP
- for (int i = 1; i < PacketSize-1; ++i) {
- values[i] = coeff(index+i);
- }
- PacketReturnType rslt = internal::pload<PacketReturnType>(values);
- return rslt;
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
- double compute_cost = (NumDims - 1) * (TensorOpCost::AddCost<Index>() +
- TensorOpCost::MulCost<Index>() +
- TensorOpCost::DivCost<Index>()) +
- TensorOpCost::MulCost<Index>();
- if (vectorized) {
- compute_cost *= 2; // packet() computes two indices
- }
- const int innerDim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : (NumDims - 1);
- return m_impl.costPerCoeff(vectorized && m_inputStrides[innerDim] == 1) +
- // Computation is not vectorized per se, but it is done once per packet.
- TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
- }
-
- EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
-
-#ifdef EIGEN_USE_SYCL
- // binding placeholder accessors to a command group handler for SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
- m_impl.bind(cgh);
- }
-#endif
- protected:
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
- {
- Index inputIndex = 0;
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- EIGEN_UNROLL_LOOP
- for (int i = NumDims - 1; i > 0; --i) {
- const Index idx = index / m_outputStrides[i];
- inputIndex += idx * m_inputStrides[i];
- index -= idx * m_outputStrides[i];
- }
- inputIndex += index * m_inputStrides[0];
- } else { // RowMajor
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < NumDims - 1; ++i) {
- const Index idx = index / m_outputStrides[i];
- inputIndex += idx * m_inputStrides[i];
- index -= idx * m_outputStrides[i];
- }
- inputIndex += index * m_inputStrides[NumDims-1];
- }
- return inputIndex;
- }
-
- Dimensions m_dimensions;
- array<Index, NumDims> m_outputStrides;
- array<Index, NumDims> m_inputStrides;
- TensorEvaluator<ArgType, Device> m_impl;
-};
-
-// Eval as lvalue
-template<typename Strides, typename ArgType, typename Device>
-struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
- : public TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
-{
- typedef TensorStridingOp<Strides, ArgType> XprType;
- typedef TensorEvaluator<const XprType, Device> Base;
- // typedef typename XprType::Index Index;
- static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
- // typedef DSizes<Index, NumDims> Dimensions;
-
- enum {
- IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
- PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- PreferBlockAccess = false,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- CoordAccess = false, // to be implemented
- RawAccess = false
- };
-
- EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : Base(op, device) { }
-
- typedef typename XprType::Index Index;
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
- {
- return this->m_impl.coeffRef(this->srcCoeff(index));
- }
-
- template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- void writePacket(Index index, const PacketReturnType& x)
- {
- EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+PacketSize-1 < this->dimensions().TotalSize());
-
- Index inputIndices[] = {0, 0};
- Index indices[] = {index, index + PacketSize - 1};
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- EIGEN_UNROLL_LOOP
- for (int i = NumDims - 1; i > 0; --i) {
- const Index idx0 = indices[0] / this->m_outputStrides[i];
- const Index idx1 = indices[1] / this->m_outputStrides[i];
- inputIndices[0] += idx0 * this->m_inputStrides[i];
- inputIndices[1] += idx1 * this->m_inputStrides[i];
- indices[0] -= idx0 * this->m_outputStrides[i];
- indices[1] -= idx1 * this->m_outputStrides[i];
- }
- inputIndices[0] += indices[0] * this->m_inputStrides[0];
- inputIndices[1] += indices[1] * this->m_inputStrides[0];
- } else { // RowMajor
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < NumDims - 1; ++i) {
- const Index idx0 = indices[0] / this->m_outputStrides[i];
- const Index idx1 = indices[1] / this->m_outputStrides[i];
- inputIndices[0] += idx0 * this->m_inputStrides[i];
- inputIndices[1] += idx1 * this->m_inputStrides[i];
- indices[0] -= idx0 * this->m_outputStrides[i];
- indices[1] -= idx1 * this->m_outputStrides[i];
- }
- inputIndices[0] += indices[0] * this->m_inputStrides[NumDims-1];
- inputIndices[1] += indices[1] * this->m_inputStrides[NumDims-1];
- }
- if (inputIndices[1] - inputIndices[0] == PacketSize - 1) {
- this->m_impl.template writePacket<Unaligned>(inputIndices[0], x);
- }
- else {
- EIGEN_ALIGN_MAX Scalar values[PacketSize];
- internal::pstore<Scalar, PacketReturnType>(values, x);
- this->m_impl.coeffRef(inputIndices[0]) = values[0];
- this->m_impl.coeffRef(inputIndices[1]) = values[PacketSize-1];
- EIGEN_UNROLL_LOOP
- for (int i = 1; i < PacketSize-1; ++i) {
- this->coeffRef(index+i) = values[i];
- }
- }
- }
-};
-
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H
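The striding evaluator above never materializes the strided view: srcCoeff() re-derives the input position from each linear output index by peeling off one dimension at a time against the output strides and re-encoding the digits with the pre-scaled input strides. Below is a minimal, self-contained sketch of that arithmetic for a 2-D column-major tensor (not Eigen code; the sizes, stride factors, and the src_coeff name are made up for illustration):

    #include <array>
    #include <cassert>
    #include <iostream>

    int main() {
      // Hypothetical example: a 6x8 column-major tensor strided by {2, 2},
      // which yields a 3x4 view.
      const std::array<long, 2> input_dims = {6, 8};
      const std::array<long, 2> strides = {2, 2};

      std::array<long, 2> out_dims, out_strides, in_strides;
      for (int i = 0; i < 2; ++i)
        out_dims[i] = (input_dims[i] + strides[i] - 1) / strides[i];

      // Column-major strides; the input strides are pre-multiplied by the
      // striding factors, exactly as the evaluator's constructor does.
      out_strides[0] = 1;
      in_strides[0] = 1;
      for (int i = 1; i < 2; ++i) {
        out_strides[i] = out_strides[i - 1] * out_dims[i - 1];
        in_strides[i] = in_strides[i - 1] * input_dims[i - 1];
        in_strides[i - 1] *= strides[i - 1];
      }
      in_strides[1] *= strides[1];

      // The srcCoeff() mapping: peel off the highest dimension first.
      auto src_coeff = [&](long index) {
        long input_index = 0;
        for (int i = 1; i > 0; --i) {  // NumDims - 1 down to 1
          const long idx = index / out_strides[i];
          input_index += idx * in_strides[i];
          index -= idx * out_strides[i];
        }
        return input_index + index * in_strides[0];
      };

      // Output element (1, 2) has linear index 1 + 2*3 = 7; it must map to
      // input element (2, 4), i.e. linear index 2 + 4*6 = 26.
      assert(src_coeff(7) == 26);
      std::cout << "ok\n";
    }

The packet() path runs the same computation for the first and last element of a packet and falls back to a scalar gather whenever the two input indices are not exactly PacketSize - 1 apart.
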
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorTrace.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorTrace.h
deleted file mode 100644
index 926ecdd..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorTrace.h
+++ /dev/null
@@ -1,303 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2017 Gagan Goel <gagan.nith@gmail.com>
-// Copyright (C) 2017 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_TRACE_H
-#define EIGEN_CXX11_TENSOR_TENSOR_TRACE_H
-
-namespace Eigen {
-
-/** \class TensorTrace
-  * \ingroup CXX11_Tensor_Module
-  *
-  * \brief Tensor Trace class.
-  *
-  * Sums the elements of a tensor along the generalized diagonal of the
-  * user-supplied dimensions, which must all have the same size.
-  */
-
-namespace internal {
-template<typename Dims, typename XprType>
-struct traits<TensorTraceOp<Dims, XprType> > : public traits<XprType>
-{
- typedef typename XprType::Scalar Scalar;
- typedef traits<XprType> XprTraits;
- typedef typename XprTraits::StorageKind StorageKind;
- typedef typename XprTraits::Index Index;
- typedef typename XprType::Nested Nested;
- typedef typename remove_reference<Nested>::type _Nested;
- static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
- static const int Layout = XprTraits::Layout;
-};
-
-template<typename Dims, typename XprType>
-struct eval<TensorTraceOp<Dims, XprType>, Eigen::Dense>
-{
- typedef const TensorTraceOp<Dims, XprType>& type;
-};
-
-template<typename Dims, typename XprType>
-struct nested<TensorTraceOp<Dims, XprType>, 1, typename eval<TensorTraceOp<Dims, XprType> >::type>
-{
- typedef TensorTraceOp<Dims, XprType> type;
-};
-
-} // end namespace internal
-
-
-template<typename Dims, typename XprType>
-class TensorTraceOp : public TensorBase<TensorTraceOp<Dims, XprType> >
-{
- public:
- typedef typename Eigen::internal::traits<TensorTraceOp>::Scalar Scalar;
- typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename Eigen::internal::nested<TensorTraceOp>::type Nested;
- typedef typename Eigen::internal::traits<TensorTraceOp>::StorageKind StorageKind;
- typedef typename Eigen::internal::traits<TensorTraceOp>::Index Index;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTraceOp(const XprType& expr, const Dims& dims)
- : m_xpr(expr), m_dims(dims) {
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const Dims& dims() const { return m_dims; }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const typename internal::remove_all<typename XprType::Nested>::type& expression() const { return m_xpr; }
-
- protected:
- typename XprType::Nested m_xpr;
- const Dims m_dims;
-};
-
-
-// Eval as rvalue
-template<typename Dims, typename ArgType, typename Device>
-struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
-{
- typedef TensorTraceOp<Dims, ArgType> XprType;
- static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
- static const int NumReducedDims = internal::array_size<Dims>::value;
- static const int NumOutputDims = NumInputDims - NumReducedDims;
- typedef typename XprType::Index Index;
- typedef DSizes<Index, NumOutputDims> Dimensions;
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
- typedef StorageMemory<CoeffReturnType, Device> Storage;
- typedef typename Storage::Type EvaluatorPointerType;
-
- enum {
- IsAligned = false,
- PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = false,
- PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- CoordAccess = false,
- RawAccess = false
- };
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlock;
- //===--------------------------------------------------------------------===//
-
- EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
- : m_impl(op.expression(), device), m_traceDim(1), m_device(device)
- {
-
- EIGEN_STATIC_ASSERT((NumOutputDims >= 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
- EIGEN_STATIC_ASSERT((NumReducedDims >= 2) || ((NumReducedDims == 0) && (NumInputDims == 0)), YOU_MADE_A_PROGRAMMING_MISTAKE);
-
- for (int i = 0; i < NumInputDims; ++i) {
- m_reduced[i] = false;
- }
-
- const Dims& op_dims = op.dims();
- for (int i = 0; i < NumReducedDims; ++i) {
- eigen_assert(op_dims[i] >= 0);
- eigen_assert(op_dims[i] < NumInputDims);
- m_reduced[op_dims[i]] = true;
- }
-
-    // All the reduced dimensions must be distinct to compute the trace
- int num_distinct_reduce_dims = 0;
- for (int i = 0; i < NumInputDims; ++i) {
- if (m_reduced[i]) {
- ++num_distinct_reduce_dims;
- }
- }
-
- eigen_assert(num_distinct_reduce_dims == NumReducedDims);
-
- // Compute the dimensions of the result.
- const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
-
- int output_index = 0;
- int reduced_index = 0;
- for (int i = 0; i < NumInputDims; ++i) {
- if (m_reduced[i]) {
- m_reducedDims[reduced_index] = input_dims[i];
- if (reduced_index > 0) {
- // All the trace dimensions must have the same size
- eigen_assert(m_reducedDims[0] == m_reducedDims[reduced_index]);
- }
- ++reduced_index;
- }
- else {
- m_dimensions[output_index] = input_dims[i];
- ++output_index;
- }
- }
-
- if (NumReducedDims != 0) {
- m_traceDim = m_reducedDims[0];
- }
-
- // Compute the output strides
- if (NumOutputDims > 0) {
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- m_outputStrides[0] = 1;
- for (int i = 1; i < NumOutputDims; ++i) {
- m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
- }
- }
- else {
- m_outputStrides.back() = 1;
- for (int i = NumOutputDims - 2; i >= 0; --i) {
- m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
- }
- }
- }
-
- // Compute the input strides
- if (NumInputDims > 0) {
- array<Index, NumInputDims> input_strides;
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- input_strides[0] = 1;
- for (int i = 1; i < NumInputDims; ++i) {
- input_strides[i] = input_strides[i - 1] * input_dims[i - 1];
- }
- }
- else {
- input_strides.back() = 1;
- for (int i = NumInputDims - 2; i >= 0; --i) {
- input_strides[i] = input_strides[i + 1] * input_dims[i + 1];
- }
- }
-
- output_index = 0;
- reduced_index = 0;
- for (int i = 0; i < NumInputDims; ++i) {
-        if (m_reduced[i]) {
- m_reducedStrides[reduced_index] = input_strides[i];
- ++reduced_index;
- }
- else {
- m_preservedStrides[output_index] = input_strides[i];
- ++output_index;
- }
- }
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
- return m_dimensions;
- }
-
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
- m_impl.evalSubExprsIfNeeded(NULL);
- return true;
- }
-
- EIGEN_STRONG_INLINE void cleanup() {
- m_impl.cleanup();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
- {
- // Initialize the result
- CoeffReturnType result = internal::cast<int, CoeffReturnType>(0);
- Index index_stride = 0;
- for (int i = 0; i < NumReducedDims; ++i) {
- index_stride += m_reducedStrides[i];
- }
-
-    // If the trace is requested along all dimensions, the starting index is 0
- Index cur_index = 0;
- if (NumOutputDims != 0)
- cur_index = firstInput(index);
- for (Index i = 0; i < m_traceDim; ++i) {
- result += m_impl.coeff(cur_index);
- cur_index += index_stride;
- }
-
- return result;
- }
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
-
- EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
- eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
-
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
- for (int i = 0; i < PacketSize; ++i) {
- values[i] = coeff(index + i);
- }
- PacketReturnType result = internal::ploadt<PacketReturnType, LoadMode>(values);
- return result;
- }
-
-#ifdef EIGEN_USE_SYCL
- // binding placeholder accessors to a command group handler for SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
- m_impl.bind(cgh);
- }
-#endif
-
- protected:
- // Given the output index, finds the first index in the input tensor used to compute the trace
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
- Index startInput = 0;
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- for (int i = NumOutputDims - 1; i > 0; --i) {
- const Index idx = index / m_outputStrides[i];
- startInput += idx * m_preservedStrides[i];
- index -= idx * m_outputStrides[i];
- }
- startInput += index * m_preservedStrides[0];
- }
- else {
- for (int i = 0; i < NumOutputDims - 1; ++i) {
- const Index idx = index / m_outputStrides[i];
- startInput += idx * m_preservedStrides[i];
- index -= idx * m_outputStrides[i];
- }
- startInput += index * m_preservedStrides[NumOutputDims - 1];
- }
- return startInput;
- }
-
- Dimensions m_dimensions;
- TensorEvaluator<ArgType, Device> m_impl;
- // Initialize the size of the trace dimension
- Index m_traceDim;
- const Device EIGEN_DEVICE_REF m_device;
- array<bool, NumInputDims> m_reduced;
- array<Index, NumReducedDims> m_reducedDims;
- array<Index, NumOutputDims> m_outputStrides;
- array<Index, NumReducedDims> m_reducedStrides;
- array<Index, NumOutputDims> m_preservedStrides;
-};
-
-
-} // End namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_TRACE_H
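The evaluator's coeff() above reduces the whole diagonal walk to one addition per step: the starting offset comes from firstInput(), and the step is the sum of the strides of the reduced dimensions. A minimal sketch of that walk for the simplest case, a d x d column-major matrix (plain C++ with made-up values, not the Eigen API):

    #include <iostream>
    #include <vector>

    int main() {
      const int d = 3;
      // 3x3 column-major matrix; entry (r, c) lives at index r + c*d.
      const std::vector<double> m = {1, 4, 7,   // column 0
                                     2, 5, 8,   // column 1
                                     3, 6, 9};  // column 2
      // The reduced strides are {1, d}, so the diagonal step is 1 + d.
      const long index_stride = 1 + d;
      long cur_index = 0;  // firstInput() of the (scalar) output
      double trace = 0;
      for (int i = 0; i < d; ++i) {
        trace += m[cur_index];
        cur_index += index_stride;
      }
      std::cout << trace << "\n";  // 1 + 5 + 9 = 15
    }
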
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorTraits.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorTraits.h
deleted file mode 100644
index 4f7fd34..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorTraits.h
+++ /dev/null
@@ -1,264 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H
-#define EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H
-
-namespace Eigen {
-namespace internal {
-
-
-template<typename Scalar, int Options>
-class compute_tensor_flags
-{
- enum {
- is_dynamic_size_storage = 1,
-
- is_aligned =
- (
- ((Options&DontAlign)==0) && (
-#if EIGEN_MAX_STATIC_ALIGN_BYTES>0
- (!is_dynamic_size_storage)
-#else
- 0
-#endif
- |
-#if EIGEN_MAX_ALIGN_BYTES>0
- is_dynamic_size_storage
-#else
- 0
-#endif
- )
- ),
- packet_access_bit = packet_traits<Scalar>::Vectorizable && is_aligned ? PacketAccessBit : 0
- };
-
- public:
- enum { ret = packet_access_bit };
-};
-
-
-template<typename Scalar_, int NumIndices_, int Options_, typename IndexType_>
-struct traits<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
-{
- typedef Scalar_ Scalar;
- typedef Dense StorageKind;
- typedef IndexType_ Index;
- static const int NumDimensions = NumIndices_;
- static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor;
- enum {
- Options = Options_,
- Flags = compute_tensor_flags<Scalar_, Options_>::ret | (is_const<Scalar_>::value ? 0 : LvalueBit)
- };
- template <typename T> struct MakePointer {
- typedef T* Type;
- };
- typedef typename MakePointer<Scalar>::Type PointerType;
-};
-
-
-template<typename Scalar_, typename Dimensions, int Options_, typename IndexType_>
-struct traits<TensorFixedSize<Scalar_, Dimensions, Options_, IndexType_> >
-{
- typedef Scalar_ Scalar;
- typedef Dense StorageKind;
- typedef IndexType_ Index;
- static const int NumDimensions = array_size<Dimensions>::value;
- static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor;
- enum {
- Options = Options_,
- Flags = compute_tensor_flags<Scalar_, Options_>::ret | (is_const<Scalar_>::value ? 0: LvalueBit)
- };
- template <typename T> struct MakePointer {
- typedef T* Type;
- };
- typedef typename MakePointer<Scalar>::Type PointerType;
-};
-
-
-template<typename PlainObjectType, int Options_, template <class> class MakePointer_>
-struct traits<TensorMap<PlainObjectType, Options_, MakePointer_> >
- : public traits<PlainObjectType>
-{
- typedef traits<PlainObjectType> BaseTraits;
- typedef typename BaseTraits::Scalar Scalar;
- typedef typename BaseTraits::StorageKind StorageKind;
- typedef typename BaseTraits::Index Index;
- static const int NumDimensions = BaseTraits::NumDimensions;
- static const int Layout = BaseTraits::Layout;
- enum {
- Options = Options_,
- Flags = BaseTraits::Flags
- };
- template <class T> struct MakePointer {
- // Intermediate typedef to workaround MSVC issue.
- typedef MakePointer_<T> MakePointerT;
- typedef typename MakePointerT::Type Type;
- };
- typedef typename MakePointer<Scalar>::Type PointerType;
-};
-
-template<typename PlainObjectType>
-struct traits<TensorRef<PlainObjectType> >
- : public traits<PlainObjectType>
-{
- typedef traits<PlainObjectType> BaseTraits;
- typedef typename BaseTraits::Scalar Scalar;
- typedef typename BaseTraits::StorageKind StorageKind;
- typedef typename BaseTraits::Index Index;
- static const int NumDimensions = BaseTraits::NumDimensions;
- static const int Layout = BaseTraits::Layout;
- enum {
- Options = BaseTraits::Options,
- Flags = BaseTraits::Flags
- };
- typedef typename BaseTraits::PointerType PointerType;
-};
-
-
-template<typename _Scalar, int NumIndices_, int Options, typename IndexType_>
-struct eval<Tensor<_Scalar, NumIndices_, Options, IndexType_>, Eigen::Dense>
-{
- typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>EIGEN_DEVICE_REF type;
-};
-
-template<typename _Scalar, int NumIndices_, int Options, typename IndexType_>
-struct eval<const Tensor<_Scalar, NumIndices_, Options, IndexType_>, Eigen::Dense>
-{
- typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>EIGEN_DEVICE_REF type;
-};
-
-template<typename Scalar_, typename Dimensions, int Options, typename IndexType_>
-struct eval<TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>, Eigen::Dense>
-{
- typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type;
-};
-
-template<typename Scalar_, typename Dimensions, int Options, typename IndexType_>
-struct eval<const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>, Eigen::Dense>
-{
- typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type;
-};
-
-template<typename PlainObjectType, int Options, template <class> class MakePointer>
-struct eval<TensorMap<PlainObjectType, Options, MakePointer>, Eigen::Dense>
-{
- typedef const TensorMap<PlainObjectType, Options, MakePointer>EIGEN_DEVICE_REF type;
-};
-
-template<typename PlainObjectType, int Options, template <class> class MakePointer>
-struct eval<const TensorMap<PlainObjectType, Options, MakePointer>, Eigen::Dense>
-{
- typedef const TensorMap<PlainObjectType, Options, MakePointer>EIGEN_DEVICE_REF type;
-};
-
-template<typename PlainObjectType>
-struct eval<TensorRef<PlainObjectType>, Eigen::Dense>
-{
- typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type;
-};
-
-template<typename PlainObjectType>
-struct eval<const TensorRef<PlainObjectType>, Eigen::Dense>
-{
- typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type;
-};
-
-// TODO nested<> does not exist anymore in Eigen/Core, and it thus has to be removed in favor of ref_selector.
-template<typename T, int n=1, typename PlainObject = void> struct nested
-{
- typedef typename ref_selector<T>::type type;
-};
-
-template <typename Scalar_, int NumIndices_, int Options_, typename IndexType_>
-struct nested<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
-{
- typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>EIGEN_DEVICE_REF type;
-};
-
-template <typename Scalar_, int NumIndices_, int Options_, typename IndexType_>
-struct nested<const Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
-{
- typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>EIGEN_DEVICE_REF type;
-};
-
-template <typename Scalar_, typename Dimensions, int Options, typename IndexType_>
-struct nested<TensorFixedSize<Scalar_, Dimensions, Options, IndexType_> >
-{
- typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type;
-};
-
-template <typename Scalar_, typename Dimensions, int Options, typename IndexType_>
-struct nested<const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_> >
-{
- typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type;
-};
-
-
-template <typename PlainObjectType>
-struct nested<TensorRef<PlainObjectType> >
-{
- typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type;
-};
-
-template <typename PlainObjectType>
-struct nested<const TensorRef<PlainObjectType> >
-{
- typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type;
-};
-
-} // end namespace internal
-
-// Convolutional layers take in an input tensor of shape (D, R, C, B), or (D, C,
-// R, B), and convolve it with a set of filters, which can also be presented as
-// a tensor (D, K, K, M), where M is the number of filters, K is the filter
-// size, and each 3-dimensional tensor of size (D, K, K) is a filter. For
-// simplicity we assume that we always use square filters (which is usually the
-// case in images), hence the two Ks in the tensor dimension. It also takes in
-// a few additional parameters:
-// Stride (S): The convolution stride is the offset between locations where we
-// apply the filters. A larger stride means that the output will be
-// spatially smaller.
-// Padding (P): The padding we apply to the input tensor along the R and C
-// dimensions. This is usually used to make sure that the spatial
-//               dimensions of the output match our intention.
-//
-// Two types of padding are often used:
-//   SAME: the padding amount is computed so that the output will have size
-// R/S and C/S.
-// VALID: no padding is carried out.
-// When we do padding, the padded values at the padded locations are usually
-// zero.
-//
-// The output dimensions for convolution, when given all the parameters above,
-// are as follows:
-// When Padding = SAME: the output size is (B, R', C', M), where
-// R' = ceil(float(R) / float(S))
-// C' = ceil(float(C) / float(S))
-// where ceil is the ceiling function. The input tensor is padded with 0 as
-//  needed. The number of padded rows and columns is computed as:
-// Pr = ((R' - 1) * S + K - R) / 2
-// Pc = ((C' - 1) * S + K - C) / 2
-// When the stride is 1, we have the simplified case R'=R, C'=C, Pr=Pc=(K-1)/2.
-// This is where SAME comes from: the output has the same size as the input.
-// When Padding = VALID: the output size is computed as
-// R' = ceil(float(R - K + 1) / float(S))
-// C' = ceil(float(C - K + 1) / float(S))
-// and the number of padded rows and columns is computed in the same way as in
-// the SAME case.
-// When the stride is 1, we have the simplified case R'=R-K+1, C'=C-K+1, Pr=0,
-// Pc=0.
-typedef enum {
- PADDING_VALID = 1,
- PADDING_SAME = 2
-} PaddingType;
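A quick numeric check of the formulas in the comment above, for one spatial dimension (a hypothetical standalone helper, not part of Eigen):

    #include <cmath>
    #include <iostream>

    struct OutAndPad { long out, pad; };

    // SAME: R' = ceil(R / S), Pr = ((R' - 1) * S + K - R) / 2
    OutAndPad same_padding(long R, long K, long S) {
      const long Rp = static_cast<long>(std::ceil(double(R) / double(S)));
      return {Rp, ((Rp - 1) * S + K - R) / 2};
    }

    // VALID: R' = ceil((R - K + 1) / S), no padding
    OutAndPad valid_padding(long R, long K, long S) {
      const long Rp = static_cast<long>(std::ceil(double(R - K + 1) / double(S)));
      return {Rp, 0};
    }

    int main() {
      // R = 10, K = 3, S = 1: SAME keeps the size (R' = 10) and pads by
      // (K - 1) / 2 = 1 on each side; VALID shrinks it to R - K + 1 = 8.
      std::cout << same_padding(10, 3, 1).out << " "     // 10
                << same_padding(10, 3, 1).pad << " "     // 1
                << valid_padding(10, 3, 1).out << "\n";  // 8
    }
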
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorUInt128.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorUInt128.h
deleted file mode 100644
index d23f2e4..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorUInt128.h
+++ /dev/null
@@ -1,249 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_UINT128_H
-#define EIGEN_CXX11_TENSOR_TENSOR_UINT128_H
-
-namespace Eigen {
-namespace internal {
-
-
-template <uint64_t n>
-struct static_val {
- static const uint64_t value = n;
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE operator uint64_t() const { return n; }
-
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val() { }
-
- template <typename T>
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val(const T& v) {
- EIGEN_UNUSED_VARIABLE(v);
- eigen_assert(v == n);
- }
-};
-
-
-template <typename HIGH = uint64_t, typename LOW = uint64_t>
-struct TensorUInt128
-{
- HIGH high;
- LOW low;
-
- template<typename OTHER_HIGH, typename OTHER_LOW>
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
- TensorUInt128(const TensorUInt128<OTHER_HIGH, OTHER_LOW>& other) : high(other.high), low(other.low) {
- EIGEN_STATIC_ASSERT(sizeof(OTHER_HIGH) <= sizeof(HIGH), YOU_MADE_A_PROGRAMMING_MISTAKE);
- EIGEN_STATIC_ASSERT(sizeof(OTHER_LOW) <= sizeof(LOW), YOU_MADE_A_PROGRAMMING_MISTAKE);
- }
-
- template<typename OTHER_HIGH, typename OTHER_LOW>
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
- TensorUInt128& operator = (const TensorUInt128<OTHER_HIGH, OTHER_LOW>& other) {
- EIGEN_STATIC_ASSERT(sizeof(OTHER_HIGH) <= sizeof(HIGH), YOU_MADE_A_PROGRAMMING_MISTAKE);
- EIGEN_STATIC_ASSERT(sizeof(OTHER_LOW) <= sizeof(LOW), YOU_MADE_A_PROGRAMMING_MISTAKE);
- high = other.high;
- low = other.low;
- return *this;
- }
-
- template<typename T>
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
- explicit TensorUInt128(const T& x) : high(0), low(x) {
- eigen_assert((static_cast<typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type>(x) <= NumTraits<uint64_t>::highest()));
- eigen_assert(x >= 0);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
- TensorUInt128(HIGH y, LOW x) : high(y), low(x) { }
-
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE operator LOW() const {
- return low;
- }
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LOW lower() const {
- return low;
- }
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HIGH upper() const {
- return high;
- }
-};
-
-
-template <typename HL, typename LL, typename HR, typename LR>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-bool operator == (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
-{
- return (lhs.high == rhs.high) & (lhs.low == rhs.low);
-}
-
-template <typename HL, typename LL, typename HR, typename LR>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-bool operator != (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
-{
- return (lhs.high != rhs.high) | (lhs.low != rhs.low);
-}
-
-template <typename HL, typename LL, typename HR, typename LR>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-bool operator >= (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
-{
- if (lhs.high != rhs.high) {
- return lhs.high > rhs.high;
- }
- return lhs.low >= rhs.low;
-}
-
-template <typename HL, typename LL, typename HR, typename LR>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-bool operator < (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
-{
- if (lhs.high != rhs.high) {
- return lhs.high < rhs.high;
- }
- return lhs.low < rhs.low;
-}
-
-template <typename HL, typename LL, typename HR, typename LR>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-TensorUInt128<uint64_t, uint64_t> operator + (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
-{
- TensorUInt128<uint64_t, uint64_t> result(lhs.high + rhs.high, lhs.low + rhs.low);
- if (result.low < rhs.low) {
- result.high += 1;
- }
- return result;
-}
-
-template <typename HL, typename LL, typename HR, typename LR>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-TensorUInt128<uint64_t, uint64_t> operator - (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
-{
- TensorUInt128<uint64_t, uint64_t> result(lhs.high - rhs.high, lhs.low - rhs.low);
- if (result.low > lhs.low) {
- result.high -= 1;
- }
- return result;
-}
-
-
-template <typename HL, typename LL, typename HR, typename LR>
-static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-TensorUInt128<uint64_t, uint64_t> operator * (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
-{
- // Split each 128-bit integer into 4 32-bit integers, and then do the
-  // multiplications by hand as follows:
- // lhs a b c d
- // rhs e f g h
- // -----------
- // ah bh ch dh
- // bg cg dg
- // cf df
- // de
-  // The result is stored in two 64-bit integers, high and low.
-
- const uint64_t LOW = 0x00000000FFFFFFFFLL;
- const uint64_t HIGH = 0xFFFFFFFF00000000LL;
-
- uint64_t d = lhs.low & LOW;
- uint64_t c = (lhs.low & HIGH) >> 32LL;
- uint64_t b = lhs.high & LOW;
- uint64_t a = (lhs.high & HIGH) >> 32LL;
-
- uint64_t h = rhs.low & LOW;
- uint64_t g = (rhs.low & HIGH) >> 32LL;
- uint64_t f = rhs.high & LOW;
- uint64_t e = (rhs.high & HIGH) >> 32LL;
-
- // Compute the low 32 bits of low
- uint64_t acc = d * h;
- uint64_t low = acc & LOW;
- // Compute the high 32 bits of low. Add a carry every time we wrap around
- acc >>= 32LL;
- uint64_t carry = 0;
- uint64_t acc2 = acc + c * h;
- if (acc2 < acc) {
- carry++;
- }
- acc = acc2 + d * g;
- if (acc < acc2) {
- carry++;
- }
- low |= (acc << 32LL);
-
- // Carry forward the high bits of acc to initiate the computation of the
- // low 32 bits of high
- acc2 = (acc >> 32LL) | (carry << 32LL);
- carry = 0;
-
- acc = acc2 + b * h;
- if (acc < acc2) {
- carry++;
- }
- acc2 = acc + c * g;
- if (acc2 < acc) {
- carry++;
- }
- acc = acc2 + d * f;
- if (acc < acc2) {
- carry++;
- }
- uint64_t high = acc & LOW;
-
- // Start to compute the high 32 bits of high.
- acc2 = (acc >> 32LL) | (carry << 32LL);
-
- acc = acc2 + a * h;
- acc2 = acc + b * g;
- acc = acc2 + c * f;
- acc2 = acc + d * e;
- high |= (acc2 << 32LL);
-
- return TensorUInt128<uint64_t, uint64_t>(high, low);
-}
-
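The limb-splitting scheme of operator* above, reduced to a single 64x64 -> 128 multiply (the limbs c,d of one operand times g,h of the other) so the carry handling is easy to follow. This standalone sketch checks itself against __uint128_t, which is a GCC/Clang extension:

    #include <cassert>
    #include <cstdint>
    #include <iostream>

    int main() {
      const uint64_t x = 0xDEADBEEFCAFEBABEULL, y = 0x0123456789ABCDEFULL;
      const uint64_t LOW = 0x00000000FFFFFFFFULL;

      const uint64_t d = x & LOW, c = x >> 32;  // 32-bit limbs of x
      const uint64_t h = y & LOW, g = y >> 32;  // 32-bit limbs of y

      // x*y = cg*2^64 + (ch + dg)*2^32 + dh. Each limb product fits in 64
      // bits, so the column sums are accumulated 32 bits at a time.
      const uint64_t dh = d * h, ch = c * h, dg = d * g, cg = c * g;
      uint64_t low = dh & LOW;
      const uint64_t acc = (dh >> 32) + (ch & LOW) + (dg & LOW);
      low |= (acc & LOW) << 32;
      const uint64_t high = (acc >> 32) + (ch >> 32) + (dg >> 32) + cg;

      const __uint128_t ref = static_cast<__uint128_t>(x) * y;
      assert(low == static_cast<uint64_t>(ref));
      assert(high == static_cast<uint64_t>(ref >> 64));
      std::cout << std::hex << high << " " << low << "\n";
    }
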
-template <typename HL, typename LL, typename HR, typename LR>
-static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-TensorUInt128<uint64_t, uint64_t> operator / (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
-{
- if (rhs == TensorUInt128<static_val<0>, static_val<1> >(1)) {
- return TensorUInt128<uint64_t, uint64_t>(lhs.high, lhs.low);
- } else if (lhs < rhs) {
- return TensorUInt128<uint64_t, uint64_t>(0);
- } else {
- // calculate the biggest power of 2 times rhs that's less than or equal to lhs
- TensorUInt128<uint64_t, uint64_t> power2(1);
- TensorUInt128<uint64_t, uint64_t> d(rhs);
- TensorUInt128<uint64_t, uint64_t> tmp(lhs - d);
- while (lhs >= d) {
- tmp = tmp - d;
- d = d + d;
- power2 = power2 + power2;
- }
-
- tmp = TensorUInt128<uint64_t, uint64_t>(lhs.high, lhs.low);
- TensorUInt128<uint64_t, uint64_t> result(0);
- while (power2 != TensorUInt128<static_val<0>, static_val<0> >(0)) {
- if (tmp >= d) {
- tmp = tmp - d;
- result = result + power2;
- }
- // Shift right
- power2 = TensorUInt128<uint64_t, uint64_t>(power2.high >> 1, (power2.low >> 1) | (power2.high << 63));
- d = TensorUInt128<uint64_t, uint64_t>(d.high >> 1, (d.low >> 1) | (d.high << 63));
- }
-
- return result;
- }
-}
-
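operator/ above is classic shift-and-subtract long division: double the divisor until it overshoots the dividend (the first walk-down step simply undoes the extra doubling), then halve it back down, emitting one quotient bit per step. The same control flow for plain 64-bit operands (a sketch with a hypothetical long_div name; it assumes lhs is small enough that the doubling cannot wrap):

    #include <cassert>
    #include <cstdint>
    #include <iostream>

    uint64_t long_div(uint64_t lhs, uint64_t rhs) {
      if (lhs < rhs) return 0;
      // Grow the divisor; exits with d = 2^m * rhs > lhs, power2 = 2^m.
      uint64_t d = rhs, power2 = 1;
      while (lhs >= d) {
        d += d;
        power2 += power2;
      }
      // Walk back down, accumulating quotient bits.
      uint64_t tmp = lhs, result = 0;
      while (power2 != 0) {
        if (tmp >= d) { tmp -= d; result += power2; }
        power2 >>= 1;
        d >>= 1;
      }
      return result;
    }

    int main() {
      assert(long_div(1000, 7) == 142);  // 7 * 142 = 994, remainder 6
      assert(long_div(5, 9) == 0);
      std::cout << long_div(1000, 7) << "\n";
    }
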
-
-} // namespace internal
-} // namespace Eigen
-
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_UINT128_H
diff --git a/src/EigenUnsupported/CXX11/src/Tensor/TensorVolumePatch.h b/src/EigenUnsupported/CXX11/src/Tensor/TensorVolumePatch.h
deleted file mode 100644
index 0beb9ff..0000000
--- a/src/EigenUnsupported/CXX11/src/Tensor/TensorVolumePatch.h
+++ /dev/null
@@ -1,629 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H
-#define EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H
-
-namespace Eigen {
-
-/** \class TensorVolumePatch
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Patch extraction specialized for processing of volumetric data.
- * This assumes that the input has at least 4 dimensions ordered as follows:
- * - channels
- * - planes
- * - rows
- * - columns
- * - (optional) additional dimensions such as time or batch size.
- * Calling the volume patch code with patch_planes, patch_rows, and patch_cols
- * is equivalent to calling the regular patch extraction code with parameters
- * d, patch_planes, patch_rows, patch_cols, and 1 for all the additional
- * dimensions.
- */
-namespace internal {
-
-template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>
-struct traits<TensorVolumePatchOp<Planes, Rows, Cols, XprType> > : public traits<XprType>
-{
- typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
- typedef traits<XprType> XprTraits;
- typedef typename XprTraits::StorageKind StorageKind;
- typedef typename XprTraits::Index Index;
- typedef typename XprType::Nested Nested;
- typedef typename remove_reference<Nested>::type _Nested;
- static const int NumDimensions = XprTraits::NumDimensions + 1;
- static const int Layout = XprTraits::Layout;
- typedef typename XprTraits::PointerType PointerType;
-
-};
-
-template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>
-struct eval<TensorVolumePatchOp<Planes, Rows, Cols, XprType>, Eigen::Dense>
-{
- typedef const TensorVolumePatchOp<Planes, Rows, Cols, XprType>& type;
-};
-
-template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>
-struct nested<TensorVolumePatchOp<Planes, Rows, Cols, XprType>, 1, typename eval<TensorVolumePatchOp<Planes, Rows, Cols, XprType> >::type>
-{
- typedef TensorVolumePatchOp<Planes, Rows, Cols, XprType> type;
-};
-
-} // end namespace internal
-
-template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>
-class TensorVolumePatchOp : public TensorBase<TensorVolumePatchOp<Planes, Rows, Cols, XprType>, ReadOnlyAccessors>
-{
- public:
- typedef typename Eigen::internal::traits<TensorVolumePatchOp>::Scalar Scalar;
- typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename Eigen::internal::nested<TensorVolumePatchOp>::type Nested;
- typedef typename Eigen::internal::traits<TensorVolumePatchOp>::StorageKind StorageKind;
- typedef typename Eigen::internal::traits<TensorVolumePatchOp>::Index Index;
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorVolumePatchOp(const XprType& expr, DenseIndex patch_planes, DenseIndex patch_rows, DenseIndex patch_cols,
- DenseIndex plane_strides, DenseIndex row_strides, DenseIndex col_strides,
- DenseIndex in_plane_strides, DenseIndex in_row_strides, DenseIndex in_col_strides,
- DenseIndex plane_inflate_strides, DenseIndex row_inflate_strides, DenseIndex col_inflate_strides,
- PaddingType padding_type, Scalar padding_value)
- : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
- m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides),
- m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
- m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
- m_padding_explicit(false), m_padding_top_z(0), m_padding_bottom_z(0), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0),
- m_padding_type(padding_type), m_padding_value(padding_value) {}
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorVolumePatchOp(const XprType& expr, DenseIndex patch_planes, DenseIndex patch_rows, DenseIndex patch_cols,
- DenseIndex plane_strides, DenseIndex row_strides, DenseIndex col_strides,
- DenseIndex in_plane_strides, DenseIndex in_row_strides, DenseIndex in_col_strides,
- DenseIndex plane_inflate_strides, DenseIndex row_inflate_strides, DenseIndex col_inflate_strides,
- DenseIndex padding_top_z, DenseIndex padding_bottom_z,
- DenseIndex padding_top, DenseIndex padding_bottom,
- DenseIndex padding_left, DenseIndex padding_right,
- Scalar padding_value)
- : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
- m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides),
- m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
- m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
- m_padding_explicit(true), m_padding_top_z(padding_top_z), m_padding_bottom_z(padding_bottom_z), m_padding_top(padding_top), m_padding_bottom(padding_bottom),
- m_padding_left(padding_left), m_padding_right(padding_right),
- m_padding_type(PADDING_VALID), m_padding_value(padding_value) {}
-
- EIGEN_DEVICE_FUNC
- DenseIndex patch_planes() const { return m_patch_planes; }
- EIGEN_DEVICE_FUNC
- DenseIndex patch_rows() const { return m_patch_rows; }
- EIGEN_DEVICE_FUNC
- DenseIndex patch_cols() const { return m_patch_cols; }
- EIGEN_DEVICE_FUNC
- DenseIndex plane_strides() const { return m_plane_strides; }
- EIGEN_DEVICE_FUNC
- DenseIndex row_strides() const { return m_row_strides; }
- EIGEN_DEVICE_FUNC
- DenseIndex col_strides() const { return m_col_strides; }
- EIGEN_DEVICE_FUNC
- DenseIndex in_plane_strides() const { return m_in_plane_strides; }
- EIGEN_DEVICE_FUNC
- DenseIndex in_row_strides() const { return m_in_row_strides; }
- EIGEN_DEVICE_FUNC
- DenseIndex in_col_strides() const { return m_in_col_strides; }
- EIGEN_DEVICE_FUNC
- DenseIndex plane_inflate_strides() const { return m_plane_inflate_strides; }
- EIGEN_DEVICE_FUNC
- DenseIndex row_inflate_strides() const { return m_row_inflate_strides; }
- EIGEN_DEVICE_FUNC
- DenseIndex col_inflate_strides() const { return m_col_inflate_strides; }
- EIGEN_DEVICE_FUNC
- bool padding_explicit() const { return m_padding_explicit; }
- EIGEN_DEVICE_FUNC
- DenseIndex padding_top_z() const { return m_padding_top_z; }
- EIGEN_DEVICE_FUNC
- DenseIndex padding_bottom_z() const { return m_padding_bottom_z; }
- EIGEN_DEVICE_FUNC
- DenseIndex padding_top() const { return m_padding_top; }
- EIGEN_DEVICE_FUNC
- DenseIndex padding_bottom() const { return m_padding_bottom; }
- EIGEN_DEVICE_FUNC
- DenseIndex padding_left() const { return m_padding_left; }
- EIGEN_DEVICE_FUNC
- DenseIndex padding_right() const { return m_padding_right; }
- EIGEN_DEVICE_FUNC
- PaddingType padding_type() const { return m_padding_type; }
- EIGEN_DEVICE_FUNC
- Scalar padding_value() const { return m_padding_value; }
-
- EIGEN_DEVICE_FUNC
- const typename internal::remove_all<typename XprType::Nested>::type&
- expression() const { return m_xpr; }
-
- protected:
- typename XprType::Nested m_xpr;
- const DenseIndex m_patch_planes;
- const DenseIndex m_patch_rows;
- const DenseIndex m_patch_cols;
- const DenseIndex m_plane_strides;
- const DenseIndex m_row_strides;
- const DenseIndex m_col_strides;
- const DenseIndex m_in_plane_strides;
- const DenseIndex m_in_row_strides;
- const DenseIndex m_in_col_strides;
- const DenseIndex m_plane_inflate_strides;
- const DenseIndex m_row_inflate_strides;
- const DenseIndex m_col_inflate_strides;
- const bool m_padding_explicit;
- const DenseIndex m_padding_top_z;
- const DenseIndex m_padding_bottom_z;
- const DenseIndex m_padding_top;
- const DenseIndex m_padding_bottom;
- const DenseIndex m_padding_left;
- const DenseIndex m_padding_right;
- const PaddingType m_padding_type;
- const Scalar m_padding_value;
-};
-
-
-// Eval as rvalue
-template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename ArgType, typename Device>
-struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, Device>
-{
- typedef TensorVolumePatchOp<Planes, Rows, Cols, ArgType> XprType;
- typedef typename XprType::Index Index;
- static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
- static const int NumDims = NumInputDims + 1;
- typedef DSizes<Index, NumDims> Dimensions;
- typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
- typedef StorageMemory<CoeffReturnType, Device> Storage;
- typedef typename Storage::Type EvaluatorPointerType;
-
- enum {
- IsAligned = false,
- PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = false,
- PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
- Layout = TensorEvaluator<ArgType, Device>::Layout,
- CoordAccess = false,
- RawAccess = false
- };
-
- //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlock;
- //===--------------------------------------------------------------------===//
-
- EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) :
- m_impl(op.expression(), device)
- {
- EIGEN_STATIC_ASSERT((NumDims >= 5), YOU_MADE_A_PROGRAMMING_MISTAKE);
-
- m_paddingValue = op.padding_value();
-
- const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
-
- // Cache a few variables.
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- m_inputDepth = input_dims[0];
- m_inputPlanes = input_dims[1];
- m_inputRows = input_dims[2];
- m_inputCols = input_dims[3];
- } else {
- m_inputDepth = input_dims[NumInputDims-1];
- m_inputPlanes = input_dims[NumInputDims-2];
- m_inputRows = input_dims[NumInputDims-3];
- m_inputCols = input_dims[NumInputDims-4];
- }
-
- m_plane_strides = op.plane_strides();
- m_row_strides = op.row_strides();
- m_col_strides = op.col_strides();
-
- // Input strides and effective input/patch size
- m_in_plane_strides = op.in_plane_strides();
- m_in_row_strides = op.in_row_strides();
- m_in_col_strides = op.in_col_strides();
- m_plane_inflate_strides = op.plane_inflate_strides();
- m_row_inflate_strides = op.row_inflate_strides();
- m_col_inflate_strides = op.col_inflate_strides();
-
- // The "effective" spatial size after inflating data with zeros.
- m_input_planes_eff = (m_inputPlanes - 1) * m_plane_inflate_strides + 1;
- m_input_rows_eff = (m_inputRows - 1) * m_row_inflate_strides + 1;
- m_input_cols_eff = (m_inputCols - 1) * m_col_inflate_strides + 1;
- m_patch_planes_eff = op.patch_planes() + (op.patch_planes() - 1) * (m_in_plane_strides - 1);
- m_patch_rows_eff = op.patch_rows() + (op.patch_rows() - 1) * (m_in_row_strides - 1);
- m_patch_cols_eff = op.patch_cols() + (op.patch_cols() - 1) * (m_in_col_strides - 1);
-
- if (op.padding_explicit()) {
- m_outputPlanes = numext::ceil((m_input_planes_eff + op.padding_top_z() + op.padding_bottom_z() - m_patch_planes_eff + 1.f) / static_cast<float>(m_plane_strides));
- m_outputRows = numext::ceil((m_input_rows_eff + op.padding_top() + op.padding_bottom() - m_patch_rows_eff + 1.f) / static_cast<float>(m_row_strides));
- m_outputCols = numext::ceil((m_input_cols_eff + op.padding_left() + op.padding_right() - m_patch_cols_eff + 1.f) / static_cast<float>(m_col_strides));
- m_planePaddingTop = op.padding_top_z();
- m_rowPaddingTop = op.padding_top();
- m_colPaddingLeft = op.padding_left();
- } else {
- // Computing padding from the type
- switch (op.padding_type()) {
- case PADDING_VALID:
- m_outputPlanes = numext::ceil((m_input_planes_eff - m_patch_planes_eff + 1.f) / static_cast<float>(m_plane_strides));
- m_outputRows = numext::ceil((m_input_rows_eff - m_patch_rows_eff + 1.f) / static_cast<float>(m_row_strides));
- m_outputCols = numext::ceil((m_input_cols_eff - m_patch_cols_eff + 1.f) / static_cast<float>(m_col_strides));
- m_planePaddingTop = 0;
- m_rowPaddingTop = 0;
- m_colPaddingLeft = 0;
- break;
- case PADDING_SAME: {
- m_outputPlanes = numext::ceil(m_input_planes_eff / static_cast<float>(m_plane_strides));
- m_outputRows = numext::ceil(m_input_rows_eff / static_cast<float>(m_row_strides));
- m_outputCols = numext::ceil(m_input_cols_eff / static_cast<float>(m_col_strides));
- const Index dz = (m_outputPlanes - 1) * m_plane_strides + m_patch_planes_eff - m_input_planes_eff;
- const Index dy = (m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff;
- const Index dx = (m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff;
- m_planePaddingTop = dz / 2;
- m_rowPaddingTop = dy / 2;
- m_colPaddingLeft = dx / 2;
- break;
- }
- default:
- eigen_assert(false && "unexpected padding");
- }
- }
- eigen_assert(m_outputRows > 0);
- eigen_assert(m_outputCols > 0);
- eigen_assert(m_outputPlanes > 0);
-
- // Dimensions for result of extraction.
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- // ColMajor
- // 0: depth
- // 1: patch_planes
- // 2: patch_rows
- // 3: patch_cols
- // 4: number of patches
- // 5 and beyond: anything else (such as batch).
- m_dimensions[0] = input_dims[0];
- m_dimensions[1] = op.patch_planes();
- m_dimensions[2] = op.patch_rows();
- m_dimensions[3] = op.patch_cols();
- m_dimensions[4] = m_outputPlanes * m_outputRows * m_outputCols;
- for (int i = 5; i < NumDims; ++i) {
- m_dimensions[i] = input_dims[i-1];
- }
- } else {
- // RowMajor
- // NumDims-1: depth
- // NumDims-2: patch_planes
- // NumDims-3: patch_rows
- // NumDims-4: patch_cols
- // NumDims-5: number of patches
- // NumDims-6 and beyond: anything else (such as batch).
- m_dimensions[NumDims-1] = input_dims[NumInputDims-1];
- m_dimensions[NumDims-2] = op.patch_planes();
- m_dimensions[NumDims-3] = op.patch_rows();
- m_dimensions[NumDims-4] = op.patch_cols();
- m_dimensions[NumDims-5] = m_outputPlanes * m_outputRows * m_outputCols;
- for (int i = NumDims-6; i >= 0; --i) {
- m_dimensions[i] = input_dims[i];
- }
- }
-
- // Strides for the output tensor.
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- m_rowStride = m_dimensions[1];
- m_colStride = m_dimensions[2] * m_rowStride;
- m_patchStride = m_colStride * m_dimensions[3] * m_dimensions[0];
- m_otherStride = m_patchStride * m_dimensions[4];
- } else {
- m_rowStride = m_dimensions[NumDims-2];
- m_colStride = m_dimensions[NumDims-3] * m_rowStride;
- m_patchStride = m_colStride * m_dimensions[NumDims-4] * m_dimensions[NumDims-1];
- m_otherStride = m_patchStride * m_dimensions[NumDims-5];
- }
-
- // Strides for navigating through the input tensor.
- m_planeInputStride = m_inputDepth;
- m_rowInputStride = m_inputDepth * m_inputPlanes;
- m_colInputStride = m_inputDepth * m_inputRows * m_inputPlanes;
- m_otherInputStride = m_inputDepth * m_inputRows * m_inputCols * m_inputPlanes;
-
- m_outputPlanesRows = m_outputPlanes * m_outputRows;
-
- // Fast representations of different variables.
- m_fastOtherStride = internal::TensorIntDivisor<Index>(m_otherStride);
-
- m_fastPatchStride = internal::TensorIntDivisor<Index>(m_patchStride);
- m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride);
- m_fastRowStride = internal::TensorIntDivisor<Index>(m_rowStride);
- m_fastInputRowStride = internal::TensorIntDivisor<Index>(m_row_inflate_strides);
- m_fastInputColStride = internal::TensorIntDivisor<Index>(m_col_inflate_strides);
- m_fastInputPlaneStride = internal::TensorIntDivisor<Index>(m_plane_inflate_strides);
- m_fastInputColsEff = internal::TensorIntDivisor<Index>(m_input_cols_eff);
- m_fastOutputPlanes = internal::TensorIntDivisor<Index>(m_outputPlanes);
- m_fastOutputPlanesRows = internal::TensorIntDivisor<Index>(m_outputPlanesRows);
-
- if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
- m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[0]);
- } else {
- m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[NumDims-1]);
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
-
- EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
- m_impl.evalSubExprsIfNeeded(NULL);
- return true;
- }
-
- EIGEN_STRONG_INLINE void cleanup() {
- m_impl.cleanup();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
- {
- // Patch index corresponding to the passed in index.
- const Index patchIndex = index / m_fastPatchStride;
-
- // Spatial offset within the patch. This has to be translated into 3D
- // coordinates within the patch.
- const Index patchOffset = (index - patchIndex * m_patchStride) / m_fastOutputDepth;
-
- // Batch, etc.
- const Index otherIndex = (NumDims == 5) ? 0 : index / m_fastOtherStride;
- const Index patch3DIndex = (NumDims == 5) ? patchIndex : (index - otherIndex * m_otherStride) / m_fastPatchStride;
-
- // Calculate column index in the input original tensor.
- const Index colIndex = patch3DIndex / m_fastOutputPlanesRows;
- const Index colOffset = patchOffset / m_fastColStride;
- const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft;
- const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0);
- if (inputCol < 0 || inputCol >= m_input_cols_eff ||
- ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) {
- return Scalar(m_paddingValue);
- }
-
- // Calculate row index in the original input tensor.
- const Index rowIndex = (patch3DIndex - colIndex * m_outputPlanesRows) / m_fastOutputPlanes;
- const Index rowOffset = (patchOffset - colOffset * m_colStride) / m_fastRowStride;
- const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop;
- const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0);
- if (inputRow < 0 || inputRow >= m_input_rows_eff ||
- ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) {
- return Scalar(m_paddingValue);
- }
-
- // Calculate plane index in the original input tensor.
- const Index planeIndex = (patch3DIndex - m_outputPlanes * (colIndex * m_outputRows + rowIndex));
- const Index planeOffset = patchOffset - colOffset * m_colStride - rowOffset * m_rowStride;
- const Index inputPlane = planeIndex * m_plane_strides + planeOffset * m_in_plane_strides - m_planePaddingTop;
- const Index origInputPlane = (m_plane_inflate_strides == 1) ? inputPlane : ((inputPlane >= 0) ? (inputPlane / m_fastInputPlaneStride) : 0);
- if (inputPlane < 0 || inputPlane >= m_input_planes_eff ||
- ((m_plane_inflate_strides != 1) && (inputPlane != origInputPlane * m_plane_inflate_strides))) {
- return Scalar(m_paddingValue);
- }
-
- const int depth_index = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - 1;
- const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index];
-
- const Index inputIndex = depth +
- origInputRow * m_rowInputStride +
- origInputCol * m_colInputStride +
- origInputPlane * m_planeInputStride +
- otherIndex * m_otherInputStride;
-
- return m_impl.coeff(inputIndex);
- }
-
- template<int LoadMode>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
- {
- EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
- eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
-
- if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1 ||
- m_in_plane_strides != 1 || m_plane_inflate_strides != 1) {
- return packetWithPossibleZero(index);
- }
-
- const Index indices[2] = {index, index + PacketSize - 1};
- const Index patchIndex = indices[0] / m_fastPatchStride;
- if (patchIndex != indices[1] / m_fastPatchStride) {
- return packetWithPossibleZero(index);
- }
- const Index otherIndex = (NumDims == 5) ? 0 : indices[0] / m_fastOtherStride;
- eigen_assert(otherIndex == indices[1] / m_fastOtherStride);
-
- // Find the offset of the element wrt the location of the first element.
- const Index patchOffsets[2] = {(indices[0] - patchIndex * m_patchStride) / m_fastOutputDepth,
- (indices[1] - patchIndex * m_patchStride) / m_fastOutputDepth};
-
- const Index patch3DIndex = (NumDims == 5) ? patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride;
- eigen_assert(patch3DIndex == (indices[1] - otherIndex * m_otherStride) / m_fastPatchStride);
-
- const Index colIndex = patch3DIndex / m_fastOutputPlanesRows;
- const Index colOffsets[2] = {
- patchOffsets[0] / m_fastColStride,
- patchOffsets[1] / m_fastColStride};
-
- // Calculate col indices in the original input tensor.
- const Index inputCols[2] = {
- colIndex * m_col_strides + colOffsets[0] - m_colPaddingLeft,
- colIndex * m_col_strides + colOffsets[1] - m_colPaddingLeft};
- if (inputCols[1] < 0 || inputCols[0] >= m_inputCols) {
- return internal::pset1<PacketReturnType>(Scalar(m_paddingValue));
- }
-
- if (inputCols[0] != inputCols[1]) {
- return packetWithPossibleZero(index);
- }
-
- const Index rowIndex = (patch3DIndex - colIndex * m_outputPlanesRows) / m_fastOutputPlanes;
- const Index rowOffsets[2] = {
- (patchOffsets[0] - colOffsets[0] * m_colStride) / m_fastRowStride,
- (patchOffsets[1] - colOffsets[1] * m_colStride) / m_fastRowStride};
- eigen_assert(rowOffsets[0] <= rowOffsets[1]);
-    // Calculate row indices in the original input tensor.
- const Index inputRows[2] = {
- rowIndex * m_row_strides + rowOffsets[0] - m_rowPaddingTop,
- rowIndex * m_row_strides + rowOffsets[1] - m_rowPaddingTop};
-
- if (inputRows[1] < 0 || inputRows[0] >= m_inputRows) {
- return internal::pset1<PacketReturnType>(Scalar(m_paddingValue));
- }
-
- if (inputRows[0] != inputRows[1]) {
- return packetWithPossibleZero(index);
- }
-
- const Index planeIndex = (patch3DIndex - m_outputPlanes * (colIndex * m_outputRows + rowIndex));
- const Index planeOffsets[2] = {
- patchOffsets[0] - colOffsets[0] * m_colStride - rowOffsets[0] * m_rowStride,
- patchOffsets[1] - colOffsets[1] * m_colStride - rowOffsets[1] * m_rowStride};
- eigen_assert(planeOffsets[0] <= planeOffsets[1]);
- const Index inputPlanes[2] = {
- planeIndex * m_plane_strides + planeOffsets[0] - m_planePaddingTop,
- planeIndex * m_plane_strides + planeOffsets[1] - m_planePaddingTop};
-
- if (inputPlanes[1] < 0 || inputPlanes[0] >= m_inputPlanes) {
- return internal::pset1<PacketReturnType>(Scalar(m_paddingValue));
- }
-
- if (inputPlanes[0] >= 0 && inputPlanes[1] < m_inputPlanes) {
- // no padding
- const int depth_index = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - 1;
- const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index];
- const Index inputIndex = depth +
- inputRows[0] * m_rowInputStride +
- inputCols[0] * m_colInputStride +
- m_planeInputStride * inputPlanes[0] +
- otherIndex * m_otherInputStride;
- return m_impl.template packet<Unaligned>(inputIndex);
- }
-
- return packetWithPossibleZero(index);
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
- costPerCoeff(bool vectorized) const {
- const double compute_cost =
- 10 * TensorOpCost::DivCost<Index>() + 21 * TensorOpCost::MulCost<Index>() +
- 8 * TensorOpCost::AddCost<Index>();
- return TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
- }
-
- EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
-
- const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
-
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index planePaddingTop() const { return m_planePaddingTop; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowPaddingTop() const { return m_rowPaddingTop; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colPaddingLeft() const { return m_colPaddingLeft; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputPlanes() const { return m_outputPlanes; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputRows() const { return m_outputRows; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputCols() const { return m_outputCols; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userPlaneStride() const { return m_plane_strides; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userRowStride() const { return m_row_strides; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userColStride() const { return m_col_strides; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInPlaneStride() const { return m_in_plane_strides; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInRowStride() const { return m_in_row_strides; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInColStride() const { return m_in_col_strides; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index planeInflateStride() const { return m_plane_inflate_strides; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowInflateStride() const { return m_row_inflate_strides; }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colInflateStride() const { return m_col_inflate_strides; }
-
-#ifdef EIGEN_USE_SYCL
- // binding placeholder accessors to a command group handler for SYCL
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
- m_impl.bind(cgh);
- }
-#endif
- protected:
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
- {
- EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
- EIGEN_UNROLL_LOOP
- for (int i = 0; i < PacketSize; ++i) {
- values[i] = coeff(index+i);
- }
- PacketReturnType rslt = internal::pload<PacketReturnType>(values);
- return rslt;
- }
-
- Dimensions m_dimensions;
-
- // Parameters passed to the constructor.
- Index m_plane_strides;
- Index m_row_strides;
- Index m_col_strides;
-
- Index m_outputPlanes;
- Index m_outputRows;
- Index m_outputCols;
-
- Index m_planePaddingTop;
- Index m_rowPaddingTop;
- Index m_colPaddingLeft;
-
- Index m_in_plane_strides;
- Index m_in_row_strides;
- Index m_in_col_strides;
-
- Index m_plane_inflate_strides;
- Index m_row_inflate_strides;
- Index m_col_inflate_strides;
-
- // Cached input size.
- Index m_inputDepth;
- Index m_inputPlanes;
- Index m_inputRows;
- Index m_inputCols;
-
- // Other cached variables.
- Index m_outputPlanesRows;
-
- // Effective input/patch post-inflation size.
- Index m_input_planes_eff;
- Index m_input_rows_eff;
- Index m_input_cols_eff;
- Index m_patch_planes_eff;
- Index m_patch_rows_eff;
- Index m_patch_cols_eff;
-
- // Strides for the output tensor.
- Index m_otherStride;
- Index m_patchStride;
- Index m_rowStride;
- Index m_colStride;
-
- // Strides for the input tensor.
- Index m_planeInputStride;
- Index m_rowInputStride;
- Index m_colInputStride;
- Index m_otherInputStride;
-
- internal::TensorIntDivisor<Index> m_fastOtherStride;
- internal::TensorIntDivisor<Index> m_fastPatchStride;
- internal::TensorIntDivisor<Index> m_fastColStride;
- internal::TensorIntDivisor<Index> m_fastRowStride;
- internal::TensorIntDivisor<Index> m_fastInputPlaneStride;
- internal::TensorIntDivisor<Index> m_fastInputRowStride;
- internal::TensorIntDivisor<Index> m_fastInputColStride;
- internal::TensorIntDivisor<Index> m_fastInputColsEff;
- internal::TensorIntDivisor<Index> m_fastOutputPlanesRows;
- internal::TensorIntDivisor<Index> m_fastOutputPlanes;
- internal::TensorIntDivisor<Index> m_fastOutputDepth;
-
- Scalar m_paddingValue;
-
- TensorEvaluator<ArgType, Device> m_impl;
-
-
-};
-
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H
diff --git a/src/EigenUnsupported/CXX11/src/TensorSymmetry/DynamicSymmetry.h b/src/EigenUnsupported/CXX11/src/TensorSymmetry/DynamicSymmetry.h
deleted file mode 100644
index bc4f202..0000000
--- a/src/EigenUnsupported/CXX11/src/TensorSymmetry/DynamicSymmetry.h
+++ /dev/null
@@ -1,293 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSORSYMMETRY_DYNAMICSYMMETRY_H
-#define EIGEN_CXX11_TENSORSYMMETRY_DYNAMICSYMMETRY_H
-
-namespace Eigen {
-
-class DynamicSGroup
-{
- public:
- inline explicit DynamicSGroup() : m_numIndices(1), m_elements(), m_generators(), m_globalFlags(0) { m_elements.push_back(ge(Generator(0, 0, 0))); }
- inline DynamicSGroup(const DynamicSGroup& o) : m_numIndices(o.m_numIndices), m_elements(o.m_elements), m_generators(o.m_generators), m_globalFlags(o.m_globalFlags) { }
- inline DynamicSGroup(DynamicSGroup&& o) : m_numIndices(o.m_numIndices), m_elements(), m_generators(o.m_generators), m_globalFlags(o.m_globalFlags) { std::swap(m_elements, o.m_elements); }
- inline DynamicSGroup& operator=(const DynamicSGroup& o) { m_numIndices = o.m_numIndices; m_elements = o.m_elements; m_generators = o.m_generators; m_globalFlags = o.m_globalFlags; return *this; }
- inline DynamicSGroup& operator=(DynamicSGroup&& o) { m_numIndices = o.m_numIndices; std::swap(m_elements, o.m_elements); m_generators = o.m_generators; m_globalFlags = o.m_globalFlags; return *this; }
-
- void add(int one, int two, int flags = 0);
-
- template<typename Gen_>
- inline void add(Gen_) { add(Gen_::One, Gen_::Two, Gen_::Flags); }
- inline void addSymmetry(int one, int two) { add(one, two, 0); }
- inline void addAntiSymmetry(int one, int two) { add(one, two, NegationFlag); }
- inline void addHermiticity(int one, int two) { add(one, two, ConjugationFlag); }
- inline void addAntiHermiticity(int one, int two) { add(one, two, NegationFlag | ConjugationFlag); }
-
- template<typename Op, typename RV, typename Index, std::size_t N, typename... Args>
- inline RV apply(const std::array<Index, N>& idx, RV initial, Args&&... args) const
- {
-      eigen_assert(N >= m_numIndices && "Can only apply symmetry group to objects that have at least the required number of indices.");
- for (std::size_t i = 0; i < size(); i++)
- initial = Op::run(h_permute(i, idx, typename internal::gen_numeric_list<int, N>::type()), m_elements[i].flags, initial, std::forward<Args>(args)...);
- return initial;
- }
-
- template<typename Op, typename RV, typename Index, typename... Args>
- inline RV apply(const std::vector<Index>& idx, RV initial, Args&&... args) const
- {
-      eigen_assert(idx.size() >= m_numIndices && "Can only apply symmetry group to objects that have at least the required number of indices.");
- for (std::size_t i = 0; i < size(); i++)
- initial = Op::run(h_permute(i, idx), m_elements[i].flags, initial, std::forward<Args>(args)...);
- return initial;
- }
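-
-    // Illustrative sketch of an Op functor usable with apply() above: the group
-    // invokes Op::run once per group element, passing the permuted indices and
-    // the element's XORed flags, so counting invocations yields size().
-    //
-    //   struct CountOp {
-    //     template<typename Idx>
-    //     static int run(const Idx&, int /*flags*/, int count) { return count + 1; }
-    //   };
-    //   // int n = group.apply<CountOp, int>(indices, 0);  // n == group.size()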
-
- inline int globalFlags() const { return m_globalFlags; }
- inline std::size_t size() const { return m_elements.size(); }
-
- template<typename Tensor_, typename... IndexTypes>
- inline internal::tensor_symmetry_value_setter<Tensor_, DynamicSGroup> operator()(Tensor_& tensor, typename Tensor_::Index firstIndex, IndexTypes... otherIndices) const
- {
- static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
- return operator()(tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices>{{firstIndex, otherIndices...}});
- }
-
- template<typename Tensor_>
- inline internal::tensor_symmetry_value_setter<Tensor_, DynamicSGroup> operator()(Tensor_& tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices> const& indices) const
- {
- return internal::tensor_symmetry_value_setter<Tensor_, DynamicSGroup>(tensor, *this, indices);
- }
- private:
- struct GroupElement {
- std::vector<int> representation;
- int flags;
- bool isId() const
- {
- for (std::size_t i = 0; i < representation.size(); i++)
- if (i != (size_t)representation[i])
- return false;
- return true;
- }
- };
- struct Generator {
- int one;
- int two;
- int flags;
- constexpr inline Generator(int one_, int two_, int flags_) : one(one_), two(two_), flags(flags_) {}
- };
-
- std::size_t m_numIndices;
- std::vector<GroupElement> m_elements;
- std::vector<Generator> m_generators;
- int m_globalFlags;
-
- template<typename Index, std::size_t N, int... n>
- inline std::array<Index, N> h_permute(std::size_t which, const std::array<Index, N>& idx, internal::numeric_list<int, n...>) const
- {
- return std::array<Index, N>{{ idx[n >= m_numIndices ? n : m_elements[which].representation[n]]... }};
- }
-
- template<typename Index>
- inline std::vector<Index> h_permute(std::size_t which, std::vector<Index> idx) const
- {
- std::vector<Index> result;
- result.reserve(idx.size());
- for (auto k : m_elements[which].representation)
- result.push_back(idx[k]);
- for (std::size_t i = m_numIndices; i < idx.size(); i++)
- result.push_back(idx[i]);
- return result;
- }
-
- inline GroupElement ge(Generator const& g) const
- {
- GroupElement result;
- result.representation.reserve(m_numIndices);
- result.flags = g.flags;
- for (std::size_t k = 0; k < m_numIndices; k++) {
- if (k == (std::size_t)g.one)
- result.representation.push_back(g.two);
- else if (k == (std::size_t)g.two)
- result.representation.push_back(g.one);
- else
- result.representation.push_back(int(k));
- }
- return result;
- }
-
- GroupElement mul(GroupElement, GroupElement) const;
- inline GroupElement mul(Generator g1, GroupElement g2) const
- {
- return mul(ge(g1), g2);
- }
-
- inline GroupElement mul(GroupElement g1, Generator g2) const
- {
- return mul(g1, ge(g2));
- }
-
- inline GroupElement mul(Generator g1, Generator g2) const
- {
- return mul(ge(g1), ge(g2));
- }
-
- inline int findElement(GroupElement e) const
- {
- for (auto ee : m_elements) {
- if (ee.representation == e.representation)
- return ee.flags ^ e.flags;
- }
- return -1;
- }
-
- void updateGlobalFlags(int flagDiffOfSameGenerator);
-};
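-
-// Illustrative usage sketch (tensor shape assumed): a group defined at run time.
-//
-//   Eigen::Tensor<double, 4> t(4, 4, 4, 4);
-//   DynamicSGroup sym;
-//   sym.addSymmetry(0, 1);      // t(i,j,k,l) ==  t(j,i,k,l)
-//   sym.addAntiSymmetry(2, 3);  // t(i,j,k,l) == -t(i,j,l,k)
-//   sym(t, 0, 1, 2, 3) = 1.0;   // writes the whole symmetry orbit of (0,1,2,3)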
-
-// dynamic symmetry group that auto-adds the template parameters in the constructor
-template<typename... Gen>
-class DynamicSGroupFromTemplateArgs : public DynamicSGroup
-{
- public:
- inline DynamicSGroupFromTemplateArgs() : DynamicSGroup()
- {
- add_all(internal::type_list<Gen...>());
- }
- inline DynamicSGroupFromTemplateArgs(DynamicSGroupFromTemplateArgs const& other) : DynamicSGroup(other) { }
- inline DynamicSGroupFromTemplateArgs(DynamicSGroupFromTemplateArgs&& other) : DynamicSGroup(other) { }
- inline DynamicSGroupFromTemplateArgs<Gen...>& operator=(const DynamicSGroupFromTemplateArgs<Gen...>& o) { DynamicSGroup::operator=(o); return *this; }
- inline DynamicSGroupFromTemplateArgs<Gen...>& operator=(DynamicSGroupFromTemplateArgs<Gen...>&& o) { DynamicSGroup::operator=(o); return *this; }
-
- private:
- template<typename Gen1, typename... GenNext>
- inline void add_all(internal::type_list<Gen1, GenNext...>)
- {
- add(Gen1());
- add_all(internal::type_list<GenNext...>());
- }
-
- inline void add_all(internal::type_list<>)
- {
- }
-};
-
-inline DynamicSGroup::GroupElement DynamicSGroup::mul(GroupElement g1, GroupElement g2) const
-{
- eigen_internal_assert(g1.representation.size() == m_numIndices);
- eigen_internal_assert(g2.representation.size() == m_numIndices);
-
- GroupElement result;
- result.representation.reserve(m_numIndices);
- for (std::size_t i = 0; i < m_numIndices; i++) {
- int v = g2.representation[g1.representation[i]];
- eigen_assert(v >= 0);
- result.representation.push_back(v);
- }
- result.flags = g1.flags ^ g2.flags;
- return result;
-}
-
-inline void DynamicSGroup::add(int one, int two, int flags)
-{
- eigen_assert(one >= 0);
- eigen_assert(two >= 0);
- eigen_assert(one != two);
-
- if ((std::size_t)one >= m_numIndices || (std::size_t)two >= m_numIndices) {
-    std::size_t newNumIndices = ((one > two) ? one : two) + 1; // need room for index max(one, two)
- for (auto& gelem : m_elements) {
- gelem.representation.reserve(newNumIndices);
- for (std::size_t i = m_numIndices; i < newNumIndices; i++)
- gelem.representation.push_back(i);
- }
- m_numIndices = newNumIndices;
- }
-
- Generator g{one, two, flags};
- GroupElement e = ge(g);
-
- /* special case for first generator */
- if (m_elements.size() == 1) {
- while (!e.isId()) {
- m_elements.push_back(e);
- e = mul(e, g);
- }
-
- if (e.flags > 0)
- updateGlobalFlags(e.flags);
-
- // only add in case we didn't have identity
- if (m_elements.size() > 1)
- m_generators.push_back(g);
- return;
- }
-
- int p = findElement(e);
- if (p >= 0) {
- updateGlobalFlags(p);
- return;
- }
-
- std::size_t coset_order = m_elements.size();
- m_elements.push_back(e);
- for (std::size_t i = 1; i < coset_order; i++)
- m_elements.push_back(mul(m_elements[i], e));
- m_generators.push_back(g);
-
- std::size_t coset_rep = coset_order;
- do {
- for (auto g : m_generators) {
- e = mul(m_elements[coset_rep], g);
- p = findElement(e);
- if (p < 0) {
- // element not yet in group
- m_elements.push_back(e);
- for (std::size_t i = 1; i < coset_order; i++)
- m_elements.push_back(mul(m_elements[i], e));
- } else if (p > 0) {
- updateGlobalFlags(p);
- }
- }
- coset_rep += coset_order;
- } while (coset_rep < m_elements.size());
-}
-
-inline void DynamicSGroup::updateGlobalFlags(int flagDiffOfSameGenerator)
-{
- switch (flagDiffOfSameGenerator) {
- case 0:
- default:
- // nothing happened
- break;
- case NegationFlag:
-      // every element is its own negative => whole tensor is zero
- m_globalFlags |= GlobalZeroFlag;
- break;
- case ConjugationFlag:
-      // every element is its own conjugate => whole tensor is real
- m_globalFlags |= GlobalRealFlag;
- break;
- case (NegationFlag | ConjugationFlag):
-      // every element is its own negative conjugate => whole tensor is imaginary
- m_globalFlags |= GlobalImagFlag;
- break;
- /* NOTE:
- * since GlobalZeroFlag == GlobalRealFlag | GlobalImagFlag, if one generator
- * causes the tensor to be real and the next one to be imaginary, this will
- * trivially give the correct result
- */
- }
-}
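-
-// Illustrative sketch: re-deriving an existing element with a differing flag is
-// what triggers updateGlobalFlags(). Declaring the same index pair both
-// symmetric and antisymmetric forces every coefficient to be zero:
-//
-//   DynamicSGroup g;
-//   g.addSymmetry(0, 1);
-//   g.addAntiSymmetry(0, 1);  // flag difference == NegationFlag
-//   // g.globalFlags() now has GlobalZeroFlag set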
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSORSYMMETRY_DYNAMICSYMMETRY_H
-
-/*
- * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
- */
diff --git a/src/EigenUnsupported/CXX11/src/TensorSymmetry/StaticSymmetry.h b/src/EigenUnsupported/CXX11/src/TensorSymmetry/StaticSymmetry.h
deleted file mode 100644
index 942293b..0000000
--- a/src/EigenUnsupported/CXX11/src/TensorSymmetry/StaticSymmetry.h
+++ /dev/null
@@ -1,236 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSORSYMMETRY_STATICSYMMETRY_H
-#define EIGEN_CXX11_TENSORSYMMETRY_STATICSYMMETRY_H
-
-namespace Eigen {
-
-namespace internal {
-
-template<typename list> struct tensor_static_symgroup_permutate;
-
-template<int... nn>
-struct tensor_static_symgroup_permutate<numeric_list<int, nn...>>
-{
- constexpr static std::size_t N = sizeof...(nn);
-
- template<typename T>
- constexpr static inline std::array<T, N> run(const std::array<T, N>& indices)
- {
- return {{indices[nn]...}};
- }
-};
-
-template<typename indices_, int flags_>
-struct tensor_static_symgroup_element
-{
- typedef indices_ indices;
- constexpr static int flags = flags_;
-};
-
-template<typename Gen, int N>
-struct tensor_static_symgroup_element_ctor
-{
- typedef tensor_static_symgroup_element<
- typename gen_numeric_list_swapped_pair<int, N, Gen::One, Gen::Two>::type,
- Gen::Flags
- > type;
-};
-
-template<int N>
-struct tensor_static_symgroup_identity_ctor
-{
- typedef tensor_static_symgroup_element<
- typename gen_numeric_list<int, N>::type,
- 0
- > type;
-};
-
-template<typename iib>
-struct tensor_static_symgroup_multiply_helper
-{
- template<int... iia>
- constexpr static inline numeric_list<int, get<iia, iib>::value...> helper(numeric_list<int, iia...>) {
- return numeric_list<int, get<iia, iib>::value...>();
- }
-};
-
-template<typename A, typename B>
-struct tensor_static_symgroup_multiply
-{
- private:
- typedef typename A::indices iia;
- typedef typename B::indices iib;
- constexpr static int ffa = A::flags;
- constexpr static int ffb = B::flags;
-
- public:
- static_assert(iia::count == iib::count, "Cannot multiply symmetry elements with different number of indices.");
-
- typedef tensor_static_symgroup_element<
- decltype(tensor_static_symgroup_multiply_helper<iib>::helper(iia())),
- ffa ^ ffb
- > type;
-};
-
-template<typename A, typename B>
-struct tensor_static_symgroup_equality
-{
- typedef typename A::indices iia;
- typedef typename B::indices iib;
- constexpr static int ffa = A::flags;
- constexpr static int ffb = B::flags;
- static_assert(iia::count == iib::count, "Cannot compare symmetry elements with different number of indices.");
-
- constexpr static bool value = is_same<iia, iib>::value;
-
- private:
- /* this should be zero if they are identical, or else the tensor
- * will be forced to be pure real, pure imaginary or even pure zero
- */
- constexpr static int flags_cmp_ = ffa ^ ffb;
-
- /* either they are not equal, then we don't care whether the flags
- * match, or they are equal, and then we have to check
- */
- constexpr static bool is_zero = value && flags_cmp_ == NegationFlag;
- constexpr static bool is_real = value && flags_cmp_ == ConjugationFlag;
- constexpr static bool is_imag = value && flags_cmp_ == (NegationFlag | ConjugationFlag);
-
- public:
- constexpr static int global_flags =
- (is_real ? GlobalRealFlag : 0) |
- (is_imag ? GlobalImagFlag : 0) |
- (is_zero ? GlobalZeroFlag : 0);
-};
-
-template<std::size_t NumIndices, typename... Gen>
-struct tensor_static_symgroup
-{
- typedef StaticSGroup<Gen...> type;
- constexpr static std::size_t size = type::static_size;
-};
-
-template<typename Index, std::size_t N, int... ii, int... jj>
-constexpr static inline std::array<Index, N> tensor_static_symgroup_index_permute(std::array<Index, N> idx, internal::numeric_list<int, ii...>, internal::numeric_list<int, jj...>)
-{
- return {{ idx[ii]..., idx[jj]... }};
-}
-
-template<typename Index, int... ii>
-static inline std::vector<Index> tensor_static_symgroup_index_permute(std::vector<Index> idx, internal::numeric_list<int, ii...>)
-{
- std::vector<Index> result{{ idx[ii]... }};
- std::size_t target_size = idx.size();
- for (std::size_t i = result.size(); i < target_size; i++)
- result.push_back(idx[i]);
- return result;
-}
-
-template<typename T> struct tensor_static_symgroup_do_apply;
-
-template<typename first, typename... next>
-struct tensor_static_symgroup_do_apply<internal::type_list<first, next...>>
-{
- template<typename Op, typename RV, std::size_t SGNumIndices, typename Index, std::size_t NumIndices, typename... Args>
- static inline RV run(const std::array<Index, NumIndices>& idx, RV initial, Args&&... args)
- {
- static_assert(NumIndices >= SGNumIndices, "Can only apply symmetry group to objects that have at least the required amount of indices.");
- typedef typename internal::gen_numeric_list<int, NumIndices - SGNumIndices, SGNumIndices>::type remaining_indices;
- initial = Op::run(tensor_static_symgroup_index_permute(idx, typename first::indices(), remaining_indices()), first::flags, initial, std::forward<Args>(args)...);
- return tensor_static_symgroup_do_apply<internal::type_list<next...>>::template run<Op, RV, SGNumIndices>(idx, initial, args...);
- }
-
- template<typename Op, typename RV, std::size_t SGNumIndices, typename Index, typename... Args>
- static inline RV run(const std::vector<Index>& idx, RV initial, Args&&... args)
- {
- eigen_assert(idx.size() >= SGNumIndices && "Can only apply symmetry group to objects that have at least the required amount of indices.");
- initial = Op::run(tensor_static_symgroup_index_permute(idx, typename first::indices()), first::flags, initial, std::forward<Args>(args)...);
- return tensor_static_symgroup_do_apply<internal::type_list<next...>>::template run<Op, RV, SGNumIndices>(idx, initial, args...);
- }
-};
-
-template<EIGEN_TPL_PP_SPEC_HACK_DEF(typename, empty)>
-struct tensor_static_symgroup_do_apply<internal::type_list<EIGEN_TPL_PP_SPEC_HACK_USE(empty)>>
-{
- template<typename Op, typename RV, std::size_t SGNumIndices, typename Index, std::size_t NumIndices, typename... Args>
- static inline RV run(const std::array<Index, NumIndices>&, RV initial, Args&&...)
- {
- // do nothing
- return initial;
- }
-
- template<typename Op, typename RV, std::size_t SGNumIndices, typename Index, typename... Args>
- static inline RV run(const std::vector<Index>&, RV initial, Args&&...)
- {
- // do nothing
- return initial;
- }
-};
-
-} // end namespace internal
-
-template<typename... Gen>
-class StaticSGroup
-{
- constexpr static std::size_t NumIndices = internal::tensor_symmetry_num_indices<Gen...>::value;
- typedef internal::group_theory::enumerate_group_elements<
- internal::tensor_static_symgroup_multiply,
- internal::tensor_static_symgroup_equality,
- typename internal::tensor_static_symgroup_identity_ctor<NumIndices>::type,
- internal::type_list<typename internal::tensor_static_symgroup_element_ctor<Gen, NumIndices>::type...>
- > group_elements;
- typedef typename group_elements::type ge;
- public:
- constexpr inline StaticSGroup() {}
- constexpr inline StaticSGroup(const StaticSGroup<Gen...>&) {}
- constexpr inline StaticSGroup(StaticSGroup<Gen...>&&) {}
-
- template<typename Op, typename RV, typename Index, std::size_t N, typename... Args>
- static inline RV apply(const std::array<Index, N>& idx, RV initial, Args&&... args)
- {
- return internal::tensor_static_symgroup_do_apply<ge>::template run<Op, RV, NumIndices>(idx, initial, args...);
- }
-
- template<typename Op, typename RV, typename Index, typename... Args>
- static inline RV apply(const std::vector<Index>& idx, RV initial, Args&&... args)
- {
- eigen_assert(idx.size() == NumIndices);
- return internal::tensor_static_symgroup_do_apply<ge>::template run<Op, RV, NumIndices>(idx, initial, args...);
- }
-
- constexpr static std::size_t static_size = ge::count;
-
- constexpr static inline std::size_t size() {
- return ge::count;
- }
- constexpr static inline int globalFlags() { return group_elements::global_flags; }
-
- template<typename Tensor_, typename... IndexTypes>
- inline internal::tensor_symmetry_value_setter<Tensor_, StaticSGroup<Gen...>> operator()(Tensor_& tensor, typename Tensor_::Index firstIndex, IndexTypes... otherIndices) const
- {
- static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
- return operator()(tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices>{{firstIndex, otherIndices...}});
- }
-
- template<typename Tensor_>
- inline internal::tensor_symmetry_value_setter<Tensor_, StaticSGroup<Gen...>> operator()(Tensor_& tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices> const& indices) const
- {
- return internal::tensor_symmetry_value_setter<Tensor_, StaticSGroup<Gen...>>(tensor, *this, indices);
- }
-};
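-
-// Illustrative sketch: a small, fully compile-time group. For one symmetric
-// index pair the enumerated elements are the identity and the transposition:
-//
-//   static_assert(StaticSGroup<Symmetry<0, 1>>::static_size == 2, "id and (0 1)");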
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSORSYMMETRY_STATICSYMMETRY_H
-
-/*
- * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
- */
diff --git a/src/EigenUnsupported/CXX11/src/TensorSymmetry/Symmetry.h b/src/EigenUnsupported/CXX11/src/TensorSymmetry/Symmetry.h
deleted file mode 100644
index 879d6cd..0000000
--- a/src/EigenUnsupported/CXX11/src/TensorSymmetry/Symmetry.h
+++ /dev/null
@@ -1,338 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSORSYMMETRY_SYMMETRY_H
-#define EIGEN_CXX11_TENSORSYMMETRY_SYMMETRY_H
-
-namespace Eigen {
-
-enum {
- NegationFlag = 0x01,
- ConjugationFlag = 0x02
-};
-
-enum {
- GlobalRealFlag = 0x01,
- GlobalImagFlag = 0x02,
- GlobalZeroFlag = 0x03
-};
-
-namespace internal {
-
-template<std::size_t NumIndices, typename... Sym> struct tensor_symmetry_pre_analysis;
-template<std::size_t NumIndices, typename... Sym> struct tensor_static_symgroup;
-template<bool instantiate, std::size_t NumIndices, typename... Sym> struct tensor_static_symgroup_if;
-template<typename Tensor_> struct tensor_symmetry_calculate_flags;
-template<typename Tensor_> struct tensor_symmetry_assign_value;
-template<typename... Sym> struct tensor_symmetry_num_indices;
-
-} // end namespace internal
-
-template<int One_, int Two_>
-struct Symmetry
-{
- static_assert(One_ != Two_, "Symmetries must cover distinct indices.");
- constexpr static int One = One_;
- constexpr static int Two = Two_;
- constexpr static int Flags = 0;
-};
-
-template<int One_, int Two_>
-struct AntiSymmetry
-{
- static_assert(One_ != Two_, "Symmetries must cover distinct indices.");
- constexpr static int One = One_;
- constexpr static int Two = Two_;
- constexpr static int Flags = NegationFlag;
-};
-
-template<int One_, int Two_>
-struct Hermiticity
-{
- static_assert(One_ != Two_, "Symmetries must cover distinct indices.");
- constexpr static int One = One_;
- constexpr static int Two = Two_;
- constexpr static int Flags = ConjugationFlag;
-};
-
-template<int One_, int Two_>
-struct AntiHermiticity
-{
- static_assert(One_ != Two_, "Symmetries must cover distinct indices.");
- constexpr static int One = One_;
- constexpr static int Two = Two_;
- constexpr static int Flags = ConjugationFlag | NegationFlag;
-};
-
-/** \class DynamicSGroup
- * \ingroup TensorSymmetry_Module
- *
- * \brief Dynamic symmetry group
- *
- * The %DynamicSGroup class represents a symmetry group that need not be known at
- * compile time. It is useful if one wants to support arbitrary run-time definable
- * symmetries for tensors, but it is also instantiated for a symmetry group that is
- * defined at compile time yet would be either too large for the compiler to
- * reasonably generate (using templates to calculate this at compile time is very
- * inefficient) or generatable in principle but too large for unrolling the
- * coefficient-setting loop to make sense.
- */
-class DynamicSGroup;
-
-/** \internal
- *
- * \class DynamicSGroupFromTemplateArgs
- * \ingroup TensorSymmetry_Module
- *
- * \brief Dynamic symmetry group, initialized from template arguments
- *
- * This class is a child class of DynamicSGroup. It uses the template arguments
- * specified to initialize itself.
- */
-template<typename... Gen>
-class DynamicSGroupFromTemplateArgs;
-
-/** \class StaticSGroup
- * \ingroup TensorSymmetry_Module
- *
- * \brief Static symmetry group
- *
- * This class represents a symmetry group that is known and resolved completely
- * at compile time. Ideally, no run-time penalty is incurred compared to the
- * manual unrolling of the symmetry.
- *
- * <b><i>CAUTION:</i></b>
- *
- * Do not use this class directly for large symmetry groups. The compiler
- * may run into a limit, or segfault, or at the very least take a very,
- * very, very long time to compile the code. Use the SGroup class instead
- * if you want a static group. That class contains logic that will
- * automatically select the DynamicSGroup class instead if the symmetry
- * group becomes too large. (In that case, unrolling may not even be
- * beneficial.)
- */
-template<typename... Gen>
-class StaticSGroup;
-
-/** \class SGroup
- * \ingroup TensorSymmetry_Module
- *
- * \brief Symmetry group, initialized from template arguments
- *
- * This class represents a symmetry group whose generators are already
- * known at compile time. It may or may not be resolved at compile time,
- * depending on the estimated size of the group.
- *
- * \sa StaticSGroup
- * \sa DynamicSGroup
- */
-template<typename... Gen>
-class SGroup : public internal::tensor_symmetry_pre_analysis<internal::tensor_symmetry_num_indices<Gen...>::value, Gen...>::root_type
-{
- public:
- constexpr static std::size_t NumIndices = internal::tensor_symmetry_num_indices<Gen...>::value;
- typedef typename internal::tensor_symmetry_pre_analysis<NumIndices, Gen...>::root_type Base;
-
- // make standard constructors + assignment operators public
- inline SGroup() : Base() { }
- inline SGroup(const SGroup<Gen...>& other) : Base(other) { }
- inline SGroup(SGroup<Gen...>&& other) : Base(other) { }
- inline SGroup<Gen...>& operator=(const SGroup<Gen...>& other) { Base::operator=(other); return *this; }
- inline SGroup<Gen...>& operator=(SGroup<Gen...>&& other) { Base::operator=(other); return *this; }
-
- // all else is defined in the base class
-};
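-
-// Illustrative usage sketch (tensor shape assumed): assigning through the
-// group writes the whole symmetry orbit of the coefficient.
-//
-//   SGroup<AntiSymmetry<0, 1>> asym;
-//   Eigen::Tensor<double, 2> t(3, 3);
-//   t.setZero();
-//   asym(t, 0, 1) = 5.0;  // also stores t(1, 0) == -5.0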
-
-namespace internal {
-
-template<typename... Sym> struct tensor_symmetry_num_indices
-{
- constexpr static std::size_t value = 1;
-};
-
-template<int One_, int Two_, typename... Sym> struct tensor_symmetry_num_indices<Symmetry<One_, Two_>, Sym...>
-{
-private:
- constexpr static std::size_t One = static_cast<std::size_t>(One_);
- constexpr static std::size_t Two = static_cast<std::size_t>(Two_);
- constexpr static std::size_t Three = tensor_symmetry_num_indices<Sym...>::value;
-
- // don't use std::max, since it's not constexpr until C++14...
- constexpr static std::size_t maxOneTwoPlusOne = ((One > Two) ? One : Two) + 1;
-public:
- constexpr static std::size_t value = (maxOneTwoPlusOne > Three) ? maxOneTwoPlusOne : Three;
-};
-
-template<int One_, int Two_, typename... Sym> struct tensor_symmetry_num_indices<AntiSymmetry<One_, Two_>, Sym...>
- : public tensor_symmetry_num_indices<Symmetry<One_, Two_>, Sym...> {};
-template<int One_, int Two_, typename... Sym> struct tensor_symmetry_num_indices<Hermiticity<One_, Two_>, Sym...>
- : public tensor_symmetry_num_indices<Symmetry<One_, Two_>, Sym...> {};
-template<int One_, int Two_, typename... Sym> struct tensor_symmetry_num_indices<AntiHermiticity<One_, Two_>, Sym...>
- : public tensor_symmetry_num_indices<Symmetry<One_, Two_>, Sym...> {};
-
-/** \internal
- *
- * \class tensor_symmetry_pre_analysis
- * \ingroup TensorSymmetry_Module
- *
- * \brief Pre-select whether to use a static or dynamic symmetry group
- *
- * When a symmetry group could in principle be determined at compile time,
- * this template implements the logic whether to actually do that or whether
- * to rather defer that to runtime.
- *
- * The logic is as follows:
- * <dl>
- * <dt><b>No generators (trivial symmetry):</b></dt>
- * <dd>Use a trivial static group. Ideally, this has no performance impact
- * compared to not using symmetry at all. In practice, this might not
- * be the case.</dd>
- * <dt><b>More than 4 generators:</b></dt>
- * <dd>Calculate the group at run time, it is likely far too large for the
- * compiler to be able to properly generate it in a realistic time.</dd>
- * <dt><b>Up to and including 4 generators:</b></dt>
- * <dd>Actually enumerate all group elements, but then check how many there
- * are. If there are more than 16, it is unlikely that unrolling the
- * loop (as is done in the static compile-time case) is sensible, so
- * use a dynamic group instead. If there are at most 16 elements, actually
- * use that static group. Note that the largest group with 4 generators
- * still compiles with reasonable resources.</dd>
- * </dl>
- *
- * Note: Example compile-time performance with g++-4.6 on an Intel Core i5-3470
- * with 16 GiB RAM (all generators non-redundant and the subgroups don't
- * factorize):
- *
- * # Generators -O0 -ggdb -O2
- * -------------------------------------------------------------------
- * 1 0.5 s / 250 MiB 0.45s / 230 MiB
- * 2 0.5 s / 260 MiB 0.5 s / 250 MiB
- * 3 0.65s / 310 MiB 0.62s / 310 MiB
- * 4 2.2 s / 860 MiB 1.7 s / 770 MiB
- * 5 130 s / 13000 MiB 120 s / 11000 MiB
- *
- * It is clear that everything is still very efficient up to 4 generators;
- * beyond that, the memory and CPU requirements become unreasonable. Thus we
- * only instantiate the template group theory logic if the number of supplied
- * generators is 4 or lower; otherwise the enumeration is deferred to runtime,
- * where the algorithm is reasonably fast.
- */
-template<std::size_t NumIndices>
-struct tensor_symmetry_pre_analysis<NumIndices>
-{
- typedef StaticSGroup<> root_type;
-};
-
-template<std::size_t NumIndices, typename Gen_, typename... Gens_>
-struct tensor_symmetry_pre_analysis<NumIndices, Gen_, Gens_...>
-{
- constexpr static std::size_t max_static_generators = 4;
- constexpr static std::size_t max_static_elements = 16;
- typedef tensor_static_symgroup_if<(sizeof...(Gens_) + 1 <= max_static_generators), NumIndices, Gen_, Gens_...> helper;
- constexpr static std::size_t possible_size = helper::size;
-
- typedef typename conditional<
- possible_size == 0 || possible_size >= max_static_elements,
- DynamicSGroupFromTemplateArgs<Gen_, Gens_...>,
- typename helper::type
- >::type root_type;
-};
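-
-// Illustrative consequence of the thresholds above: a single pair symmetry
-// yields a 2-element group, so
-//   tensor_symmetry_pre_analysis<2, Symmetry<0, 1>>::root_type
-// is StaticSGroup<Symmetry<0, 1>> (unrolled at compile time), whereas more
-// than 4 generators select DynamicSGroupFromTemplateArgs<...> and defer the
-// enumeration to runtime.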
-
-template<bool instantiate, std::size_t NumIndices, typename... Gens>
-struct tensor_static_symgroup_if
-{
- constexpr static std::size_t size = 0;
- typedef void type;
-};
-
-template<std::size_t NumIndices, typename... Gens>
-struct tensor_static_symgroup_if<true, NumIndices, Gens...> : tensor_static_symgroup<NumIndices, Gens...> {};
-
-template<typename Tensor_>
-struct tensor_symmetry_assign_value
-{
- typedef typename Tensor_::Index Index;
- typedef typename Tensor_::Scalar Scalar;
- constexpr static std::size_t NumIndices = Tensor_::NumIndices;
-
- static inline int run(const std::array<Index, NumIndices>& transformed_indices, int transformation_flags, int dummy, Tensor_& tensor, const Scalar& value_)
- {
- Scalar value(value_);
- if (transformation_flags & ConjugationFlag)
- value = numext::conj(value);
- if (transformation_flags & NegationFlag)
- value = -value;
- tensor.coeffRef(transformed_indices) = value;
- return dummy;
- }
-};
-
-template<typename Tensor_>
-struct tensor_symmetry_calculate_flags
-{
- typedef typename Tensor_::Index Index;
- constexpr static std::size_t NumIndices = Tensor_::NumIndices;
-
- static inline int run(const std::array<Index, NumIndices>& transformed_indices, int transform_flags, int current_flags, const std::array<Index, NumIndices>& orig_indices)
- {
- if (transformed_indices == orig_indices) {
-      if ((transform_flags & (ConjugationFlag | NegationFlag)) == (ConjugationFlag | NegationFlag))
-        return current_flags | GlobalImagFlag; // anti-hermitian diagonal
- else if (transform_flags & ConjugationFlag)
- return current_flags | GlobalRealFlag; // hermitian diagonal
- else if (transform_flags & NegationFlag)
- return current_flags | GlobalZeroFlag; // anti-symmetric diagonal
- }
- return current_flags;
- }
-};
-
-template<typename Tensor_, typename Symmetry_, int Flags = 0>
-class tensor_symmetry_value_setter
-{
- public:
- typedef typename Tensor_::Index Index;
- typedef typename Tensor_::Scalar Scalar;
- constexpr static std::size_t NumIndices = Tensor_::NumIndices;
-
- inline tensor_symmetry_value_setter(Tensor_& tensor, Symmetry_ const& symmetry, std::array<Index, NumIndices> const& indices)
- : m_tensor(tensor), m_symmetry(symmetry), m_indices(indices) { }
-
- inline tensor_symmetry_value_setter<Tensor_, Symmetry_, Flags>& operator=(Scalar const& value)
- {
- doAssign(value);
- return *this;
- }
- private:
- Tensor_& m_tensor;
- Symmetry_ m_symmetry;
- std::array<Index, NumIndices> m_indices;
-
- inline void doAssign(Scalar const& value)
- {
- #ifdef EIGEN_TENSOR_SYMMETRY_CHECK_VALUES
- int value_flags = m_symmetry.template apply<internal::tensor_symmetry_calculate_flags<Tensor_>, int>(m_indices, m_symmetry.globalFlags(), m_indices);
- if (value_flags & GlobalRealFlag)
- eigen_assert(numext::imag(value) == 0);
- if (value_flags & GlobalImagFlag)
- eigen_assert(numext::real(value) == 0);
- #endif
- m_symmetry.template apply<internal::tensor_symmetry_assign_value<Tensor_>, int>(m_indices, 0, m_tensor, value);
- }
-};
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSORSYMMETRY_SYMMETRY_H
-
-/*
- * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
- */
diff --git a/src/EigenUnsupported/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h b/src/EigenUnsupported/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h
deleted file mode 100644
index 54bf9db..0000000
--- a/src/EigenUnsupported/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h
+++ /dev/null
@@ -1,669 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSORSYMMETRY_TEMPLATEGROUPTHEORY_H
-#define EIGEN_CXX11_TENSORSYMMETRY_TEMPLATEGROUPTHEORY_H
-
-namespace Eigen {
-
-namespace internal {
-
-namespace group_theory {
-
-/** \internal
- * \file CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h
- * This file contains C++ templates that implement group theory algorithms.
- *
- * The algorithms allow for a compile-time analysis of finite groups.
- *
- * Currently only Dimino's algorithm is implemented, which returns a list
- * of all elements in a group given a set of (possibly redundant) generators.
- * (One could also do that with the so-called orbital algorithm, but that
- * is much more expensive and usually has no advantages.)
- */
-
-/**********************************************************************
- * "Ok kid, here is where it gets complicated."
- * - Amelia Pond in the "Doctor Who" episode
- * "The Big Bang"
- *
- * Dimino's algorithm
- * ==================
- *
- * The following is Dimino's algorithm in sequential form:
- *
- * Input: identity element, list of generators, equality check,
- * multiplication operation
- * Output: list of group elements
- *
- * 1. add identity element
- * 2. remove identities from list of generators
- * 3. add all powers of first generator that aren't the
- * identity element
- * 4. go through all remaining generators:
- * a. if generator is already in the list of elements
- * -> do nothing
- * b. otherwise
- * i. remember current # of elements
- * (i.e. the size of the current subgroup)
- * ii. add all current elements (which includes
- * the identity) each multiplied from right
- * with the current generator to the group
- * iii. add all remaining cosets that are generated
- * by products of the new generator with itself
- * and all other generators seen so far
- *
- * In functional form, this is implemented as a long set of recursive
- * templates that have a complicated relationship.
- *
- * The main interface for Dimino's algorithm is the template
- * enumerate_group_elements. All lists are implemented as variadic
- * type_list<typename...> and numeric_list<typename = int, int...>
- * templates.
- *
- * 'Calling' templates is usually done via typedefs.
- *
- * This algorithm is an extended version of the basic version. The
- * extension consists in the fact that each group element has a set
- * of flags associated with it. Multiplication of two group elements
- * with each other results in a group element whose flags are the
- * XOR of the flags of the previous elements. Each time the algorithm
- * notices that a group element it just calculated is already in the
- * list of current elements, the flags of both will be compared and
- * their difference added to the so-called 'global flags' of the group.
- *
- * The rationale behind this extension is that this allows not only
- * for the description of symmetries between tensor indices, but
- * also allows for the description of hermiticity, antisymmetry and
- * antihermiticity. Negation and conjugation each are specific bit
- * in the flags value and if two different ways to reach a group
- * element lead to two different flags, this poses a constraint on
- * the allowed values of the resulting tensor. For example, if a
- * group element is reached both with and without the conjugation
- * flag, it is clear that the resulting tensor has to be real.
- *
- * Note that this flag mechanism is quite generic and may have other
- * uses beyond tensor properties.
- *
- * IMPORTANT:
- * This algorithm assumes the group to be finite. If you try to
- * run it with a group that's infinite, the algorithm will only
- * terminate once you hit a compiler limit (max template depth).
- * Also note that trying to use this implementation to create a
- * very large group will probably either make you hit the same
- * limit, cause the compiler to segfault or at the very least
- * take a *really* long time (hours, days, weeks - sic!) to
- * compile. It is not recommended to plug in more than 4
- * generators, unless they are independent of each other.
- */
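-
-/* Worked example (illustrative): with a single generator g = (0 1), step 3
- * adds g and stops at g*g = id, giving {id, g}. Adding a second generator
- * h = (2 3) in step 4 appends the coset {h, g*h}, so the final group is
- * {id, g, h, g*h} -- the Klein four-group acting on four indices.
- */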
-
-/** \internal
- *
- * \class strip_identities
- * \ingroup CXX11_TensorSymmetry_Module
- *
- * \brief Cleanse a list of group elements of the identity element
- *
- * This template is used to make a first pass through all initial
- * generators of Dimino's algorithm and remove the identity
- * elements.
- *
- * \sa enumerate_group_elements
- */
-template<template<typename, typename> class Equality, typename id, typename L> struct strip_identities;
-
-template<
- template<typename, typename> class Equality,
- typename id,
- typename t,
- typename... ts
->
-struct strip_identities<Equality, id, type_list<t, ts...>>
-{
- typedef typename conditional<
- Equality<id, t>::value,
- typename strip_identities<Equality, id, type_list<ts...>>::type,
- typename concat<type_list<t>, typename strip_identities<Equality, id, type_list<ts...>>::type>::type
- >::type type;
- constexpr static int global_flags = Equality<id, t>::global_flags | strip_identities<Equality, id, type_list<ts...>>::global_flags;
-};
-
-template<
- template<typename, typename> class Equality,
- typename id
- EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, ts)
->
-struct strip_identities<Equality, id, type_list<EIGEN_TPL_PP_SPEC_HACK_USE(ts)>>
-{
- typedef type_list<> type;
- constexpr static int global_flags = 0;
-};
-
-/** \internal
- *
- * \class dimino_first_step_elements_helper
- * \ingroup CXX11_TensorSymmetry_Module
- *
- * \brief Recursive template that adds powers of the first generator to the list of group elements
- *
- * This template calls itself recursively to add powers of the first
- * generator to the list of group elements. It stops if it reaches
- * the identity element again.
- *
- * \sa enumerate_group_elements, dimino_first_step_elements
- */
-template<
- template<typename, typename> class Multiply,
- template<typename, typename> class Equality,
- typename id,
- typename g,
- typename current_element,
- typename elements,
- bool dont_add_current_element // = false
->
-struct dimino_first_step_elements_helper
-#ifndef EIGEN_PARSED_BY_DOXYGEN
- : // recursive inheritance is too difficult for Doxygen
- public dimino_first_step_elements_helper<
- Multiply,
- Equality,
- id,
- g,
- typename Multiply<current_element, g>::type,
- typename concat<elements, type_list<current_element>>::type,
- Equality<typename Multiply<current_element, g>::type, id>::value
- > {};
-
-template<
- template<typename, typename> class Multiply,
- template<typename, typename> class Equality,
- typename id,
- typename g,
- typename current_element,
- typename elements
->
-struct dimino_first_step_elements_helper<Multiply, Equality, id, g, current_element, elements, true>
-#endif // EIGEN_PARSED_BY_DOXYGEN
-{
- typedef elements type;
- constexpr static int global_flags = Equality<current_element, id>::global_flags;
-};
-
-/** \internal
- *
- * \class dimino_first_step_elements
- * \ingroup CXX11_TensorSymmetry_Module
- *
- * \brief Add all powers of the first generator to the list of group elements
- *
- * This template takes the first non-identity generator and generates the initial
- * list of elements, which consists of all powers of that generator. For a group
- * with just one generator, the enumeration is complete after this step.
- *
- * \sa enumerate_group_elements
- */
-template<
- template<typename, typename> class Multiply,
- template<typename, typename> class Equality,
- typename id,
- typename generators
->
-struct dimino_first_step_elements
-{
- typedef typename get<0, generators>::type first_generator;
- typedef typename skip<1, generators>::type next_generators;
- typedef type_list<first_generator> generators_done;
-
- typedef dimino_first_step_elements_helper<
- Multiply,
- Equality,
- id,
- first_generator,
- first_generator,
- type_list<id>,
- false
- > helper;
- typedef typename helper::type type;
- constexpr static int global_flags = helper::global_flags;
-};
-
-/** \internal
- *
- * \class dimino_get_coset_elements
- * \ingroup CXX11_TensorSymmetry_Module
- *
- * \brief Generate all elements of a specific coset
- *
- * This template generates all the elements of a specific coset by
- * multiplying all elements in the given subgroup with the new
- * coset representative. Note that the first element of the
- * subgroup is always the identity element, so the first element of
- * the result of this template is going to be the coset
- * representative itself.
- *
- * Note that this template accepts an additional boolean parameter
- * that specifies whether to actually generate the coset (true) or
- * just return an empty list (false).
- *
- * \sa enumerate_group_elements, dimino_add_cosets_for_rep
- */
-template<
- template<typename, typename> class Multiply,
- typename sub_group_elements,
- typename new_coset_rep,
- bool generate_coset // = true
->
-struct dimino_get_coset_elements
-{
- typedef typename apply_op_from_right<Multiply, new_coset_rep, sub_group_elements>::type type;
-};
-
-template<
- template<typename, typename> class Multiply,
- typename sub_group_elements,
- typename new_coset_rep
->
-struct dimino_get_coset_elements<Multiply, sub_group_elements, new_coset_rep, false>
-{
- typedef type_list<> type;
-};
-
-/** \internal
- *
- * \class dimino_add_cosets_for_rep
- * \ingroup CXX11_TensorSymmetry_Module
- *
- * \brief Recursive template for adding coset spaces
- *
- * This template multiplies the coset representative with a generator
- * from the list of previous generators. If the new element is not in
- * the group already, it adds the corresponding coset. Finally it
- * proceeds to call itself with the next generator from the list.
- *
- * \sa enumerate_group_elements, dimino_add_all_coset_spaces
- */
-template<
- template<typename, typename> class Multiply,
- template<typename, typename> class Equality,
- typename id,
- typename sub_group_elements,
- typename elements,
- typename generators,
- typename rep_element,
- int sub_group_size
->
-struct dimino_add_cosets_for_rep;
-
-template<
- template<typename, typename> class Multiply,
- template<typename, typename> class Equality,
- typename id,
- typename sub_group_elements,
- typename elements,
- typename g,
- typename... gs,
- typename rep_element,
- int sub_group_size
->
-struct dimino_add_cosets_for_rep<Multiply, Equality, id, sub_group_elements, elements, type_list<g, gs...>, rep_element, sub_group_size>
-{
- typedef typename Multiply<rep_element, g>::type new_coset_rep;
- typedef contained_in_list_gf<Equality, new_coset_rep, elements> _cil;
- constexpr static bool add_coset = !_cil::value;
-
- typedef typename dimino_get_coset_elements<
- Multiply,
- sub_group_elements,
- new_coset_rep,
- add_coset
- >::type coset_elements;
-
- typedef dimino_add_cosets_for_rep<
- Multiply,
- Equality,
- id,
- sub_group_elements,
- typename concat<elements, coset_elements>::type,
- type_list<gs...>,
- rep_element,
- sub_group_size
- > _helper;
-
- typedef typename _helper::type type;
- constexpr static int global_flags = _cil::global_flags | _helper::global_flags;
-
- /* Note that we don't have to update global flags here, since
- * we will only add these elements if they are not part of
- * the group already. But that only happens if the coset rep
- * is not already in the group, so the check for the coset rep
- * will catch this.
- */
-};
-
-template<
- template<typename, typename> class Multiply,
- template<typename, typename> class Equality,
- typename id,
- typename sub_group_elements,
- typename elements
- EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, empty),
- typename rep_element,
- int sub_group_size
->
-struct dimino_add_cosets_for_rep<Multiply, Equality, id, sub_group_elements, elements, type_list<EIGEN_TPL_PP_SPEC_HACK_USE(empty)>, rep_element, sub_group_size>
-{
- typedef elements type;
- constexpr static int global_flags = 0;
-};
-
-/** \internal
- *
- * \class dimino_add_all_coset_spaces
- * \ingroup CXX11_TensorSymmetry_Module
- *
- * \brief Recursive template for adding all coset spaces for a new generator
- *
- * This template tries to go through the list of generators (with
- * the help of the dimino_add_cosets_for_rep template) as long as
- * it still finds elements that are not part of the group and add
- * the corresponding cosets.
- *
- * \sa enumerate_group_elements, dimino_add_cosets_for_rep
- */
-template<
- template<typename, typename> class Multiply,
- template<typename, typename> class Equality,
- typename id,
- typename sub_group_elements,
- typename elements,
- typename generators,
- int sub_group_size,
- int rep_pos,
- bool stop_condition // = false
->
-struct dimino_add_all_coset_spaces
-{
- typedef typename get<rep_pos, elements>::type rep_element;
- typedef dimino_add_cosets_for_rep<
- Multiply,
- Equality,
- id,
- sub_group_elements,
- elements,
- generators,
- rep_element,
- sub_group_elements::count
- > _ac4r;
- typedef typename _ac4r::type new_elements;
-
- constexpr static int new_rep_pos = rep_pos + sub_group_elements::count;
- constexpr static bool new_stop_condition = new_rep_pos >= new_elements::count;
-
- typedef dimino_add_all_coset_spaces<
- Multiply,
- Equality,
- id,
- sub_group_elements,
- new_elements,
- generators,
- sub_group_size,
- new_rep_pos,
- new_stop_condition
- > _helper;
-
- typedef typename _helper::type type;
- constexpr static int global_flags = _helper::global_flags | _ac4r::global_flags;
-};
-
-template<
- template<typename, typename> class Multiply,
- template<typename, typename> class Equality,
- typename id,
- typename sub_group_elements,
- typename elements,
- typename generators,
- int sub_group_size,
- int rep_pos
->
-struct dimino_add_all_coset_spaces<Multiply, Equality, id, sub_group_elements, elements, generators, sub_group_size, rep_pos, true>
-{
- typedef elements type;
- constexpr static int global_flags = 0;
-};
-
-/** \internal
- *
- * \class dimino_add_generator
- * \ingroup CXX11_TensorSymmetry_Module
- *
- * \brief Enlarge the group by adding a new generator.
- *
- * It accepts a boolean parameter that determines if the generator is redundant,
- * i.e. was already seen in the group. In that case, it reduces to a no-op.
- *
- * \sa enumerate_group_elements, dimino_add_all_coset_spaces
- */
-template<
- template<typename, typename> class Multiply,
- template<typename, typename> class Equality,
- typename id,
- typename elements,
- typename generators_done,
- typename current_generator,
- bool redundant // = false
->
-struct dimino_add_generator
-{
- /* this template is only called if the generator is not redundant
- * => all elements of the group multiplied with the new generator
- * are going to be new elements of the most trivial coset space
- */
- typedef typename apply_op_from_right<Multiply, current_generator, elements>::type multiplied_elements;
- typedef typename concat<elements, multiplied_elements>::type new_elements;
-
- constexpr static int rep_pos = elements::count;
-
- typedef dimino_add_all_coset_spaces<
- Multiply,
- Equality,
- id,
- elements, // elements of previous subgroup
- new_elements,
- typename concat<generators_done, type_list<current_generator>>::type,
- elements::count, // size of previous subgroup
- rep_pos,
- false // don't stop (because rep_pos >= new_elements::count is always false at this point)
- > _helper;
- typedef typename _helper::type type;
- constexpr static int global_flags = _helper::global_flags;
-};
-
-template<
- template<typename, typename> class Multiply,
- template<typename, typename> class Equality,
- typename id,
- typename elements,
- typename generators_done,
- typename current_generator
->
-struct dimino_add_generator<Multiply, Equality, id, elements, generators_done, current_generator, true>
-{
- // redundant case
- typedef elements type;
- constexpr static int global_flags = 0;
-};
-
-/** \internal
- *
- * \class dimino_add_remaining_generators
- * \ingroup CXX11_TensorSymmetry_Module
- *
- * \brief Recursive template that adds all remaining generators to a group
- *
- * Loop through the list of generators that remain and successively
- * add them to the group.
- *
- * \sa enumerate_group_elements, dimino_add_generator
- */
-template<
- template<typename, typename> class Multiply,
- template<typename, typename> class Equality,
- typename id,
- typename generators_done,
- typename remaining_generators,
- typename elements
->
-struct dimino_add_remaining_generators
-{
- typedef typename get<0, remaining_generators>::type first_generator;
- typedef typename skip<1, remaining_generators>::type next_generators;
-
- typedef contained_in_list_gf<Equality, first_generator, elements> _cil;
-
- typedef dimino_add_generator<
- Multiply,
- Equality,
- id,
- elements,
- generators_done,
- first_generator,
- _cil::value
- > _helper;
-
- typedef typename _helper::type new_elements;
-
- typedef dimino_add_remaining_generators<
- Multiply,
- Equality,
- id,
- typename concat<generators_done, type_list<first_generator>>::type,
- next_generators,
- new_elements
- > _next_iter;
-
- typedef typename _next_iter::type type;
- constexpr static int global_flags =
- _cil::global_flags |
- _helper::global_flags |
- _next_iter::global_flags;
-};
-
-template<
- template<typename, typename> class Multiply,
- template<typename, typename> class Equality,
- typename id,
- typename generators_done,
- typename elements
->
-struct dimino_add_remaining_generators<Multiply, Equality, id, generators_done, type_list<>, elements>
-{
- typedef elements type;
- constexpr static int global_flags = 0;
-};
-
-/** \internal
- *
- * \class enumerate_group_elements_noid
- * \ingroup CXX11_TensorSymmetry_Module
- *
- * \brief Helper template that implements group element enumeration
- *
- * This is a helper template that implements the actual enumeration
- * of group elements. This has been split so that the list of
- * generators can be cleansed of the identity element before
- * performing the actual operation.
- *
- * \sa enumerate_group_elements
- */
-template<
- template<typename, typename> class Multiply,
- template<typename, typename> class Equality,
- typename id,
- typename generators,
- int initial_global_flags = 0
->
-struct enumerate_group_elements_noid
-{
- typedef dimino_first_step_elements<Multiply, Equality, id, generators> first_step;
- typedef typename first_step::type first_step_elements;
-
- typedef dimino_add_remaining_generators<
- Multiply,
- Equality,
- id,
- typename first_step::generators_done,
- typename first_step::next_generators, // remaining_generators
- typename first_step::type // first_step elements
- > _helper;
-
- typedef typename _helper::type type;
- constexpr static int global_flags =
- initial_global_flags |
- first_step::global_flags |
- _helper::global_flags;
-};
-
-// in case when no generators are specified
-template<
- template<typename, typename> class Multiply,
- template<typename, typename> class Equality,
- typename id,
- int initial_global_flags
->
-struct enumerate_group_elements_noid<Multiply, Equality, id, type_list<>, initial_global_flags>
-{
- typedef type_list<id> type;
- constexpr static int global_flags = initial_global_flags;
-};
-
-/** \internal
- *
- * \class enumerate_group_elements
- * \ingroup CXX11_TensorSymmetry_Module
- *
- * \brief Enumerate all elements in a finite group
- *
- * This template enumerates all elements in a finite group. It accepts
- * the following template parameters:
- *
- * \tparam Multiply The multiplication operation that multiplies two group elements
- * with each other.
- * \tparam Equality The equality check operation that checks if two group elements
- *                  are equal to one another.
- * \tparam id The identity element
- * \tparam _generators A list of (possibly redundant) generators of the group
- */
-template<
- template<typename, typename> class Multiply,
- template<typename, typename> class Equality,
- typename id,
- typename _generators
->
-struct enumerate_group_elements
- : public enumerate_group_elements_noid<
- Multiply,
- Equality,
- id,
- typename strip_identities<Equality, id, _generators>::type,
- strip_identities<Equality, id, _generators>::global_flags
- >
-{
-};
-
-} // end namespace group_theory
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSORSYMMETRY_TEMPLATEGROUPTHEORY_H
-
-/*
- * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
- */
diff --git a/src/EigenUnsupported/CXX11/src/ThreadPool/Barrier.h b/src/EigenUnsupported/CXX11/src/ThreadPool/Barrier.h
deleted file mode 100644
index e4c59dc..0000000
--- a/src/EigenUnsupported/CXX11/src/ThreadPool/Barrier.h
+++ /dev/null
@@ -1,67 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2018 Rasmus Munk Larsen <rmlarsen@google.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// Barrier is an object that allows one or more threads to wait until
-// Notify has been called a specified number of times.
-
-#ifndef EIGEN_CXX11_THREADPOOL_BARRIER_H
-#define EIGEN_CXX11_THREADPOOL_BARRIER_H
-
-namespace Eigen {
-
-class Barrier {
- public:
- Barrier(unsigned int count) : state_(count << 1), notified_(false) {
- eigen_plain_assert(((count << 1) >> 1) == count);
- }
- ~Barrier() { eigen_plain_assert((state_ >> 1) == 0); }
-
- void Notify() {
- unsigned int v = state_.fetch_sub(2, std::memory_order_acq_rel) - 2;
- if (v != 1) {
- // Clear the lowest bit (waiter flag) and check that the original state
- // value was not zero. If it was zero, it means that notify was called
- // more times than the original count.
- eigen_plain_assert(((v + 2) & ~1) != 0);
- return; // either count has not dropped to 0, or waiter is not waiting
- }
- std::unique_lock<std::mutex> l(mu_);
- eigen_plain_assert(!notified_);
- notified_ = true;
- cv_.notify_all();
- }
-
- void Wait() {
- unsigned int v = state_.fetch_or(1, std::memory_order_acq_rel);
- if ((v >> 1) == 0) return;
- std::unique_lock<std::mutex> l(mu_);
- while (!notified_) {
- cv_.wait(l);
- }
- }
-
- private:
- std::mutex mu_;
- std::condition_variable cv_;
- std::atomic<unsigned int> state_; // low bit is waiter flag
- bool notified_;
-};
-
-// Notification is an object that allows a user to wait for another
-// thread to signal a notification that an event has occurred.
-//
-// Multiple threads can wait on the same Notification object,
-// but only one caller must call Notify() on the object.
-struct Notification : Barrier {
-  Notification() : Barrier(1) {}
-};
-
-} // namespace Eigen
-
-#endif // EIGEN_CXX11_THREADPOOL_BARRIER_H
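
For reference, a minimal usage sketch of the Barrier removed above (assuming
the usual unsupported/Eigen/CXX11/ThreadPool umbrella include still provides
Eigen::Barrier):

    #include <thread>
    #include <vector>
    #include <unsupported/Eigen/CXX11/ThreadPool>

    void run_tasks() {
      const unsigned kTasks = 4;
      Eigen::Barrier barrier(kTasks);    // expects kTasks Notify() calls
      std::vector<std::thread> workers;
      for (unsigned i = 0; i < kTasks; ++i) {
        workers.emplace_back([&barrier] {
          // ... do some work ...
          barrier.Notify();              // one task finished
        });
      }
      barrier.Wait();                    // blocks until the count reaches zero
      for (auto& t : workers) t.join();
    }
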
diff --git a/src/EigenUnsupported/CXX11/src/ThreadPool/EventCount.h b/src/EigenUnsupported/CXX11/src/ThreadPool/EventCount.h
deleted file mode 100644
index 4549aa0..0000000
--- a/src/EigenUnsupported/CXX11/src/ThreadPool/EventCount.h
+++ /dev/null
@@ -1,249 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H_
-#define EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H_
-
-namespace Eigen {
-
-// EventCount allows threads to wait for arbitrary predicates in non-blocking
-// algorithms. Think of it as a condition variable, but the wait predicate does
-// not need to be protected by a mutex. Usage:
-// Waiting thread does:
-//
-// if (predicate)
-// return act();
-//   EventCount::Waiter& w = waiters[my_index];
-//   ec.Prewait();
-//   if (predicate) {
-//     ec.CancelWait();
-//     return act();
-//   }
-//   ec.CommitWait(&w);
-//
-// Notifying thread does:
-//
-// predicate = true;
-// ec.Notify(true);
-//
-// Notify is cheap if there are no waiting threads. Prewait/CommitWait are not
-// cheap, but they are executed only if the preceding predicate check has
-// failed.
-//
-// Algorithm outline:
-// There are two main variables: predicate (managed by the user) and state_.
-// The operation closely resembles Dekker's mutual exclusion algorithm:
-// https://en.wikipedia.org/wiki/Dekker%27s_algorithm
-// The waiting thread sets state_ then checks the predicate; the notifying
-// thread sets the predicate then checks state_. Due to the seq_cst fences in
-// between these operations it is guaranteed that either the waiter will see
-// the predicate change and won't block, or the notifying thread will see the
-// state_ change and will unblock the waiter, or both. But it can't happen
-// that both threads miss each other's changes, which would lead to deadlock.
-class EventCount {
- public:
- class Waiter;
-
- EventCount(MaxSizeVector<Waiter>& waiters)
- : state_(kStackMask), waiters_(waiters) {
- eigen_plain_assert(waiters.size() < (1 << kWaiterBits) - 1);
- }
-
- ~EventCount() {
- // Ensure there are no waiters.
- eigen_plain_assert(state_.load() == kStackMask);
- }
-
- // Prewait prepares for waiting.
- // After calling Prewait, the thread must re-check the wait predicate
- // and then call either CancelWait or CommitWait.
- void Prewait() {
- uint64_t state = state_.load(std::memory_order_relaxed);
- for (;;) {
- CheckState(state);
- uint64_t newstate = state + kWaiterInc;
- CheckState(newstate);
- if (state_.compare_exchange_weak(state, newstate,
- std::memory_order_seq_cst))
- return;
- }
- }
-
- // CommitWait commits waiting after Prewait.
- void CommitWait(Waiter* w) {
- eigen_plain_assert((w->epoch & ~kEpochMask) == 0);
- w->state = Waiter::kNotSignaled;
- const uint64_t me = (w - &waiters_[0]) | w->epoch;
- uint64_t state = state_.load(std::memory_order_seq_cst);
- for (;;) {
- CheckState(state, true);
- uint64_t newstate;
- if ((state & kSignalMask) != 0) {
-        // Consume the signal and return immediately.
- newstate = state - kWaiterInc - kSignalInc;
- } else {
-        // Remove this thread from the pre-wait counter and add it to the
-        // waiter stack.
- newstate = ((state & kWaiterMask) - kWaiterInc) | me;
- w->next.store(state & (kStackMask | kEpochMask),
- std::memory_order_relaxed);
- }
- CheckState(newstate);
- if (state_.compare_exchange_weak(state, newstate,
- std::memory_order_acq_rel)) {
- if ((state & kSignalMask) == 0) {
- w->epoch += kEpochInc;
- Park(w);
- }
- return;
- }
- }
- }
-
- // CancelWait cancels effects of the previous Prewait call.
- void CancelWait() {
- uint64_t state = state_.load(std::memory_order_relaxed);
- for (;;) {
- CheckState(state, true);
- uint64_t newstate = state - kWaiterInc;
-      // We don't know if the thread was also notified or not,
-      // so we should not consume a signal unconditionally.
-      // Only if the number of waiters equals the number of signals do we
-      // know that the thread was notified and we must take away the signal.
- if (((state & kWaiterMask) >> kWaiterShift) ==
- ((state & kSignalMask) >> kSignalShift))
- newstate -= kSignalInc;
- CheckState(newstate);
- if (state_.compare_exchange_weak(state, newstate,
- std::memory_order_acq_rel))
- return;
- }
- }
-
- // Notify wakes one or all waiting threads.
- // Must be called after changing the associated wait predicate.
- void Notify(bool notifyAll) {
- std::atomic_thread_fence(std::memory_order_seq_cst);
- uint64_t state = state_.load(std::memory_order_acquire);
- for (;;) {
- CheckState(state);
- const uint64_t waiters = (state & kWaiterMask) >> kWaiterShift;
- const uint64_t signals = (state & kSignalMask) >> kSignalShift;
- // Easy case: no waiters.
- if ((state & kStackMask) == kStackMask && waiters == signals) return;
- uint64_t newstate;
- if (notifyAll) {
- // Empty wait stack and set signal to number of pre-wait threads.
- newstate =
- (state & kWaiterMask) | (waiters << kSignalShift) | kStackMask;
- } else if (signals < waiters) {
- // There is a thread in pre-wait state, unblock it.
- newstate = state + kSignalInc;
- } else {
- // Pop a waiter from list and unpark it.
- Waiter* w = &waiters_[state & kStackMask];
- uint64_t next = w->next.load(std::memory_order_relaxed);
- newstate = (state & (kWaiterMask | kSignalMask)) | next;
- }
- CheckState(newstate);
- if (state_.compare_exchange_weak(state, newstate,
- std::memory_order_acq_rel)) {
- if (!notifyAll && (signals < waiters))
- return; // unblocked pre-wait thread
- if ((state & kStackMask) == kStackMask) return;
- Waiter* w = &waiters_[state & kStackMask];
- if (!notifyAll) w->next.store(kStackMask, std::memory_order_relaxed);
- Unpark(w);
- return;
- }
- }
- }
-
- class Waiter {
- friend class EventCount;
- // Align to 128 byte boundary to prevent false sharing with other Waiter
- // objects in the same vector.
- EIGEN_ALIGN_TO_BOUNDARY(128) std::atomic<uint64_t> next;
- std::mutex mu;
- std::condition_variable cv;
- uint64_t epoch = 0;
- unsigned state = kNotSignaled;
- enum {
- kNotSignaled,
- kWaiting,
- kSignaled,
- };
- };
-
- private:
-  // State_ layout:
-  // - the low kWaiterBits bits form a stack of waiters that committed to
-  //   waiting (indexes into the waiters_ array are used as stack elements,
-  //   kStackMask means an empty stack).
-  // - the next kWaiterBits bits count waiters in the pre-wait state.
-  // - the next kWaiterBits bits count pending signals.
-  // - the remaining bits form an ABA counter for the stack
-  //   (stored in the Waiter node and incremented on push).
- static const uint64_t kWaiterBits = 14;
- static const uint64_t kStackMask = (1ull << kWaiterBits) - 1;
- static const uint64_t kWaiterShift = kWaiterBits;
- static const uint64_t kWaiterMask = ((1ull << kWaiterBits) - 1)
- << kWaiterShift;
- static const uint64_t kWaiterInc = 1ull << kWaiterShift;
- static const uint64_t kSignalShift = 2 * kWaiterBits;
- static const uint64_t kSignalMask = ((1ull << kWaiterBits) - 1)
- << kSignalShift;
- static const uint64_t kSignalInc = 1ull << kSignalShift;
- static const uint64_t kEpochShift = 3 * kWaiterBits;
- static const uint64_t kEpochBits = 64 - kEpochShift;
- static const uint64_t kEpochMask = ((1ull << kEpochBits) - 1) << kEpochShift;
- static const uint64_t kEpochInc = 1ull << kEpochShift;
- std::atomic<uint64_t> state_;
- MaxSizeVector<Waiter>& waiters_;
-
- static void CheckState(uint64_t state, bool waiter = false) {
- static_assert(kEpochBits >= 20, "not enough bits to prevent ABA problem");
- const uint64_t waiters = (state & kWaiterMask) >> kWaiterShift;
- const uint64_t signals = (state & kSignalMask) >> kSignalShift;
- eigen_plain_assert(waiters >= signals);
- eigen_plain_assert(waiters < (1 << kWaiterBits) - 1);
- eigen_plain_assert(!waiter || waiters > 0);
- (void)waiters;
- (void)signals;
- }
-
- void Park(Waiter* w) {
- std::unique_lock<std::mutex> lock(w->mu);
- while (w->state != Waiter::kSignaled) {
- w->state = Waiter::kWaiting;
- w->cv.wait(lock);
- }
- }
-
- void Unpark(Waiter* w) {
- for (Waiter* next; w; w = next) {
- uint64_t wnext = w->next.load(std::memory_order_relaxed) & kStackMask;
- next = wnext == kStackMask ? nullptr : &waiters_[wnext];
- unsigned state;
- {
- std::unique_lock<std::mutex> lock(w->mu);
- state = w->state;
- w->state = Waiter::kSignaled;
- }
- // Avoid notifying if it wasn't waiting.
- if (state == Waiter::kWaiting) w->cv.notify_one();
- }
- }
-
- EventCount(const EventCount&) = delete;
- void operator=(const EventCount&) = delete;
-};
-
-} // namespace Eigen
-
-#endif // EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H_
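
A self-contained sketch of the waiter/notifier protocol documented above; the
calls mirror the Prewait/CommitWait/CancelWait/Notify signatures in this file,
and the waiter-slot setup follows the same pattern ThreadPoolTempl uses:

    #include <atomic>
    #include <unsupported/Eigen/CXX11/ThreadPool>

    struct WaitDemo {
      std::atomic<bool> predicate;
      Eigen::MaxSizeVector<Eigen::EventCount::Waiter> waiters;
      Eigen::EventCount ec;

      WaitDemo() : predicate(false), waiters(1), ec(waiters) {
        waiters.resize(1);               // one pre-allocated waiter slot
      }

      // Waiting side (the thread that owns waiters[0]).
      void WaitForPredicate() {
        if (predicate.load()) return;    // fast path, no blocking
        ec.Prewait();                    // announce intent to block
        if (predicate.load()) {          // re-check after Prewait
          ec.CancelWait();
          return;
        }
        ec.CommitWait(&waiters[0]);      // park until Notify()
      }

      // Notifying side (any other thread).
      void SetPredicate() {
        predicate.store(true);
        ec.Notify(false);                // wake one waiter, if any
      }
    };
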
diff --git a/src/EigenUnsupported/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/src/EigenUnsupported/CXX11/src/ThreadPool/NonBlockingThreadPool.h
deleted file mode 100644
index 23a2b54..0000000
--- a/src/EigenUnsupported/CXX11/src/ThreadPool/NonBlockingThreadPool.h
+++ /dev/null
@@ -1,486 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
-#define EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
-
-namespace Eigen {
-
-template <typename Environment>
-class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
- public:
- typedef typename Environment::Task Task;
- typedef RunQueue<Task, 1024> Queue;
-
- ThreadPoolTempl(int num_threads, Environment env = Environment())
- : ThreadPoolTempl(num_threads, true, env) {}
-
- ThreadPoolTempl(int num_threads, bool allow_spinning,
- Environment env = Environment())
- : env_(env),
- num_threads_(num_threads),
- allow_spinning_(allow_spinning),
- thread_data_(num_threads),
- all_coprimes_(num_threads),
- waiters_(num_threads),
- global_steal_partition_(EncodePartition(0, num_threads_)),
- blocked_(0),
- spinning_(0),
- done_(false),
- cancelled_(false),
- ec_(waiters_) {
- waiters_.resize(num_threads_);
- // Calculate coprimes of all numbers [1, num_threads].
- // Coprimes are used for random walks over all threads in Steal
- // and NonEmptyQueueIndex. Iteration is based on the fact that if we take
- // a random starting thread index t and calculate num_threads - 1 subsequent
- // indices as (t + coprime) % num_threads, we will cover all threads without
-    // repetitions (effectively getting a pseudo-random permutation of thread
- // indices).
- eigen_plain_assert(num_threads_ < kMaxThreads);
- for (int i = 1; i <= num_threads_; ++i) {
- all_coprimes_.emplace_back(i);
- ComputeCoprimes(i, &all_coprimes_.back());
- }
-#ifndef EIGEN_THREAD_LOCAL
- init_barrier_.reset(new Barrier(num_threads_));
-#endif
- thread_data_.resize(num_threads_);
- for (int i = 0; i < num_threads_; i++) {
- SetStealPartition(i, EncodePartition(0, num_threads_));
- thread_data_[i].thread.reset(
- env_.CreateThread([this, i]() { WorkerLoop(i); }));
- }
-#ifndef EIGEN_THREAD_LOCAL
- // Wait for workers to initialize per_thread_map_. Otherwise we might race
- // with them in Schedule or CurrentThreadId.
- init_barrier_->Wait();
-#endif
- }
-
- ~ThreadPoolTempl() {
- done_ = true;
-
-    // Now if all threads block without work, they will start exiting.
-    // But note that threads can continue to work arbitrarily long,
-    // block, submit new work, unblock and otherwise live a full life.
- if (!cancelled_) {
- ec_.Notify(true);
- } else {
- // Since we were cancelled, there might be entries in the queues.
- // Empty them to prevent their destructor from asserting.
- for (size_t i = 0; i < thread_data_.size(); i++) {
- thread_data_[i].queue.Flush();
- }
- }
-    // Join threads explicitly (by destroying them) to avoid destruction
-    // order issues within this class.
- for (size_t i = 0; i < thread_data_.size(); ++i)
- thread_data_[i].thread.reset();
- }
-
- void SetStealPartitions(const std::vector<std::pair<unsigned, unsigned>>& partitions) {
- eigen_plain_assert(partitions.size() == static_cast<std::size_t>(num_threads_));
-
- // Pass this information to each thread queue.
- for (int i = 0; i < num_threads_; i++) {
- const auto& pair = partitions[i];
- unsigned start = pair.first, end = pair.second;
- AssertBounds(start, end);
- unsigned val = EncodePartition(start, end);
- SetStealPartition(i, val);
- }
- }
-
- void Schedule(std::function<void()> fn) EIGEN_OVERRIDE {
- ScheduleWithHint(std::move(fn), 0, num_threads_);
- }
-
- void ScheduleWithHint(std::function<void()> fn, int start,
- int limit) override {
- Task t = env_.CreateTask(std::move(fn));
- PerThread* pt = GetPerThread();
- if (pt->pool == this) {
- // Worker thread of this pool, push onto the thread's queue.
- Queue& q = thread_data_[pt->thread_id].queue;
- t = q.PushFront(std::move(t));
- } else {
- // A free-standing thread (or worker of another pool), push onto a random
- // queue.
- eigen_plain_assert(start < limit);
- eigen_plain_assert(limit <= num_threads_);
- int num_queues = limit - start;
- int rnd = Rand(&pt->rand) % num_queues;
- eigen_plain_assert(start + rnd < limit);
- Queue& q = thread_data_[start + rnd].queue;
- t = q.PushBack(std::move(t));
- }
-    // Note: below we touch `t` after making it available to worker threads.
-    // Strictly speaking, this can lead to a racy use-after-free. Consider
-    // that Schedule is called from a thread that is neither the main thread
-    // nor a worker thread of this pool. Then, execution of `t` directly or
-    // indirectly completes the overall computation, which in turn leads to
-    // the destruction of `this`. We expect such a scenario to be prevented
-    // by the program, that is, `this` is kept alive while any thread can
-    // potentially be in Schedule.
- if (!t.f) {
- ec_.Notify(false);
- } else {
- env_.ExecuteTask(t); // Push failed, execute directly.
- }
- }
-
- void Cancel() EIGEN_OVERRIDE {
- cancelled_ = true;
- done_ = true;
-
- // Let each thread know it's been cancelled.
-#ifdef EIGEN_THREAD_ENV_SUPPORTS_CANCELLATION
- for (size_t i = 0; i < thread_data_.size(); i++) {
- thread_data_[i].thread->OnCancel();
- }
-#endif
-
- // Wake up the threads without work to let them exit on their own.
- ec_.Notify(true);
- }
-
- int NumThreads() const EIGEN_FINAL { return num_threads_; }
-
- int CurrentThreadId() const EIGEN_FINAL {
- const PerThread* pt = const_cast<ThreadPoolTempl*>(this)->GetPerThread();
- if (pt->pool == this) {
- return pt->thread_id;
- } else {
- return -1;
- }
- }
-
- private:
-  // Create a single atomic<unsigned> that encodes start and limit
-  // information for each thread.
-  // We expect num_threads_ < 65536, so we can store them in a single
-  // std::atomic<unsigned>.
-  // The encode/decode logic can be reused by external callers to maintain
-  // their own thread-safe copies of scheduling and steal domain(s).
- static const int kMaxPartitionBits = 16;
- static const int kMaxThreads = 1 << kMaxPartitionBits;
-
- inline unsigned EncodePartition(unsigned start, unsigned limit) {
- return (start << kMaxPartitionBits) | limit;
- }
-
- inline void DecodePartition(unsigned val, unsigned* start, unsigned* limit) {
- *limit = val & (kMaxThreads - 1);
- val >>= kMaxPartitionBits;
- *start = val;
- }
-
- void AssertBounds(int start, int end) {
- eigen_plain_assert(start >= 0);
- eigen_plain_assert(start < end); // non-zero sized partition
- eigen_plain_assert(end <= num_threads_);
- }
-
- inline void SetStealPartition(size_t i, unsigned val) {
- thread_data_[i].steal_partition.store(val, std::memory_order_relaxed);
- }
-
- inline unsigned GetStealPartition(int i) {
- return thread_data_[i].steal_partition.load(std::memory_order_relaxed);
- }
-
- void ComputeCoprimes(int N, MaxSizeVector<unsigned>* coprimes) {
- for (int i = 1; i <= N; i++) {
- unsigned a = i;
- unsigned b = N;
- // If GCD(a, b) == 1, then a and b are coprimes.
- while (b != 0) {
- unsigned tmp = a;
- a = b;
- b = tmp % b;
- }
- if (a == 1) {
- coprimes->push_back(i);
- }
- }
- }
-
- typedef typename Environment::EnvThread Thread;
-
- struct PerThread {
- constexpr PerThread() : pool(NULL), rand(0), thread_id(-1) {}
- ThreadPoolTempl* pool; // Parent pool, or null for normal threads.
- uint64_t rand; // Random generator state.
- int thread_id; // Worker thread index in pool.
-#ifndef EIGEN_THREAD_LOCAL
- // Prevent false sharing.
- char pad_[128];
-#endif
- };
-
- struct ThreadData {
- constexpr ThreadData() : thread(), steal_partition(0), queue() {}
- std::unique_ptr<Thread> thread;
- std::atomic<unsigned> steal_partition;
- Queue queue;
- };
-
- Environment env_;
- const int num_threads_;
- const bool allow_spinning_;
- MaxSizeVector<ThreadData> thread_data_;
- MaxSizeVector<MaxSizeVector<unsigned>> all_coprimes_;
- MaxSizeVector<EventCount::Waiter> waiters_;
- unsigned global_steal_partition_;
- std::atomic<unsigned> blocked_;
- std::atomic<bool> spinning_;
- std::atomic<bool> done_;
- std::atomic<bool> cancelled_;
- EventCount ec_;
-#ifndef EIGEN_THREAD_LOCAL
- std::unique_ptr<Barrier> init_barrier_;
- std::mutex per_thread_map_mutex_; // Protects per_thread_map_.
- std::unordered_map<uint64_t, std::unique_ptr<PerThread>> per_thread_map_;
-#endif
-
- // Main worker thread loop.
- void WorkerLoop(int thread_id) {
-#ifndef EIGEN_THREAD_LOCAL
- std::unique_ptr<PerThread> new_pt(new PerThread());
- per_thread_map_mutex_.lock();
- bool insertOK = per_thread_map_.emplace(GlobalThreadIdHash(), std::move(new_pt)).second;
- eigen_plain_assert(insertOK);
- EIGEN_UNUSED_VARIABLE(insertOK);
- per_thread_map_mutex_.unlock();
- init_barrier_->Notify();
- init_barrier_->Wait();
-#endif
- PerThread* pt = GetPerThread();
- pt->pool = this;
- pt->rand = GlobalThreadIdHash();
- pt->thread_id = thread_id;
- Queue& q = thread_data_[thread_id].queue;
- EventCount::Waiter* waiter = &waiters_[thread_id];
- // TODO(dvyukov,rmlarsen): The time spent in NonEmptyQueueIndex() is
- // proportional to num_threads_ and we assume that new work is scheduled at
- // a constant rate, so we set spin_count to 5000 / num_threads_. The
- // constant was picked based on a fair dice roll, tune it.
- const int spin_count =
- allow_spinning_ && num_threads_ > 0 ? 5000 / num_threads_ : 0;
- if (num_threads_ == 1) {
- // For num_threads_ == 1 there is no point in going through the expensive
- // steal loop. Moreover, since NonEmptyQueueIndex() calls PopBack() on the
- // victim queues it might reverse the order in which ops are executed
- // compared to the order in which they are scheduled, which tends to be
- // counter-productive for the types of I/O workloads the single thread
- // pools tend to be used for.
- while (!cancelled_) {
- Task t = q.PopFront();
- for (int i = 0; i < spin_count && !t.f; i++) {
- if (!cancelled_.load(std::memory_order_relaxed)) {
- t = q.PopFront();
- }
- }
- if (!t.f) {
- if (!WaitForWork(waiter, &t)) {
- return;
- }
- }
- if (t.f) {
- env_.ExecuteTask(t);
- }
- }
- } else {
- while (!cancelled_) {
- Task t = q.PopFront();
- if (!t.f) {
- t = LocalSteal();
- if (!t.f) {
- t = GlobalSteal();
- if (!t.f) {
- // Leave one thread spinning. This reduces latency.
- if (allow_spinning_ && !spinning_ && !spinning_.exchange(true)) {
- for (int i = 0; i < spin_count && !t.f; i++) {
- if (!cancelled_.load(std::memory_order_relaxed)) {
- t = GlobalSteal();
- } else {
- return;
- }
- }
- spinning_ = false;
- }
- if (!t.f) {
- if (!WaitForWork(waiter, &t)) {
- return;
- }
- }
- }
- }
- }
- if (t.f) {
- env_.ExecuteTask(t);
- }
- }
- }
- }
-
-  // Steal tries to steal work from other worker threads in the range
-  // [start, limit) in a best-effort manner.
- Task Steal(unsigned start, unsigned limit) {
- PerThread* pt = GetPerThread();
- const size_t size = limit - start;
- unsigned r = Rand(&pt->rand);
-    // Reduce r into the [0, size) range; this uses the trick from
-    // https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
- eigen_plain_assert(all_coprimes_[size - 1].size() < (1<<30));
- unsigned victim = ((uint64_t)r * (uint64_t)size) >> 32;
- unsigned index = ((uint64_t) all_coprimes_[size - 1].size() * (uint64_t)r) >> 32;
- unsigned inc = all_coprimes_[size - 1][index];
-
- for (unsigned i = 0; i < size; i++) {
- eigen_plain_assert(start + victim < limit);
- Task t = thread_data_[start + victim].queue.PopBack();
- if (t.f) {
- return t;
- }
- victim += inc;
- if (victim >= size) {
- victim -= size;
- }
- }
- return Task();
- }
-
- // Steals work within threads belonging to the partition.
- Task LocalSteal() {
- PerThread* pt = GetPerThread();
- unsigned partition = GetStealPartition(pt->thread_id);
- // If thread steal partition is the same as global partition, there is no
- // need to go through the steal loop twice.
- if (global_steal_partition_ == partition) return Task();
- unsigned start, limit;
- DecodePartition(partition, &start, &limit);
- AssertBounds(start, limit);
-
- return Steal(start, limit);
- }
-
- // Steals work from any other thread in the pool.
- Task GlobalSteal() {
- return Steal(0, num_threads_);
- }
-
-
- // WaitForWork blocks until new work is available (returns true), or if it is
- // time to exit (returns false). Can optionally return a task to execute in t
- // (in such case t.f != nullptr on return).
- bool WaitForWork(EventCount::Waiter* waiter, Task* t) {
- eigen_plain_assert(!t->f);
- // We already did best-effort emptiness check in Steal, so prepare for
- // blocking.
- ec_.Prewait();
- // Now do a reliable emptiness check.
- int victim = NonEmptyQueueIndex();
- if (victim != -1) {
- ec_.CancelWait();
- if (cancelled_) {
- return false;
- } else {
- *t = thread_data_[victim].queue.PopBack();
- return true;
- }
- }
-    // The number of blocked threads is used as the termination condition.
-    // If we are shutting down and all worker threads are blocked without
-    // work, we are done.
- blocked_++;
- // TODO is blocked_ required to be unsigned?
- if (done_ && blocked_ == static_cast<unsigned>(num_threads_)) {
- ec_.CancelWait();
- // Almost done, but need to re-check queues.
- // Consider that all queues are empty and all worker threads are preempted
- // right after incrementing blocked_ above. Now a free-standing thread
- // submits work and calls destructor (which sets done_). If we don't
- // re-check queues, we will exit leaving the work unexecuted.
- if (NonEmptyQueueIndex() != -1) {
- // Note: we must not pop from queues before we decrement blocked_,
- // otherwise the following scenario is possible. Consider that instead
- // of checking for emptiness we popped the only element from queues.
- // Now other worker threads can start exiting, which is bad if the
- // work item submits other work. So we just check emptiness here,
- // which ensures that all worker threads exit at the same time.
- blocked_--;
- return true;
- }
- // Reached stable termination state.
- ec_.Notify(true);
- return false;
- }
- ec_.CommitWait(waiter);
- blocked_--;
- return true;
- }
-
- int NonEmptyQueueIndex() {
- PerThread* pt = GetPerThread();
-    // We intentionally design NonEmptyQueueIndex to steal work from
-    // anywhere in the pool so threads don't block in WaitForWork() forever
-    // when all threads in their partition go to sleep. Steal is still local.
- const size_t size = thread_data_.size();
- unsigned r = Rand(&pt->rand);
- unsigned inc = all_coprimes_[size - 1][r % all_coprimes_[size - 1].size()];
- unsigned victim = r % size;
- for (unsigned i = 0; i < size; i++) {
- if (!thread_data_[victim].queue.Empty()) {
- return victim;
- }
- victim += inc;
- if (victim >= size) {
- victim -= size;
- }
- }
- return -1;
- }
-
- static EIGEN_STRONG_INLINE uint64_t GlobalThreadIdHash() {
- return std::hash<std::thread::id>()(std::this_thread::get_id());
- }
-
- EIGEN_STRONG_INLINE PerThread* GetPerThread() {
-#ifndef EIGEN_THREAD_LOCAL
- static PerThread dummy;
- auto it = per_thread_map_.find(GlobalThreadIdHash());
- if (it == per_thread_map_.end()) {
- return &dummy;
- } else {
- return it->second.get();
- }
-#else
- EIGEN_THREAD_LOCAL PerThread per_thread_;
- PerThread* pt = &per_thread_;
- return pt;
-#endif
- }
-
- static EIGEN_STRONG_INLINE unsigned Rand(uint64_t* state) {
- uint64_t current = *state;
- // Update the internal state
- *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
- // Generate the random output (using the PCG-XSH-RS scheme)
- return static_cast<unsigned>((current ^ (current >> 22)) >>
- (22 + (current >> 61)));
- }
-};
-
-typedef ThreadPoolTempl<StlThreadEnvironment> ThreadPool;
-
-} // namespace Eigen
-
-#endif // EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
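
A minimal sketch of driving the pool through its public API (Eigen::ThreadPool
is the StlThreadEnvironment instantiation typedef'd just above; Barrier comes
from Barrier.h earlier in this diff):

    #include <unsupported/Eigen/CXX11/ThreadPool>

    int main() {
      Eigen::ThreadPool pool(/*num_threads=*/4);
      Eigen::Barrier done(/*count=*/8);
      for (int i = 0; i < 8; ++i) {
        pool.Schedule([&done] {
          // ... task body ...
          done.Notify();
        });
      }
      done.Wait();   // all eight closures have finished
      return 0;      // ~ThreadPoolTempl joins the worker threads
    }
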
diff --git a/src/EigenUnsupported/CXX11/src/ThreadPool/RunQueue.h b/src/EigenUnsupported/CXX11/src/ThreadPool/RunQueue.h
deleted file mode 100644
index b572ebc..0000000
--- a/src/EigenUnsupported/CXX11/src/ThreadPool/RunQueue.h
+++ /dev/null
@@ -1,236 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_
-#define EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_
-
-namespace Eigen {
-
-// RunQueue is a fixed-size, partially non-blocking deque of Work items.
-// Operations on the front of the queue must be done by a single thread (the
-// owner); operations on the back can be done by multiple threads concurrently.
-//
-// Algorithm outline:
-// All remote threads operating on the back of the queue are serialized by a
-// mutex. This ensures that at most two threads access state: the owner and
-// one remote thread (Size aside). The algorithm ensures that the occupied
-// region of the underlying array is logically contiguous (it can wrap around,
-// but has no stray occupied elements). The owner operates on one end of this
-// region, the remote thread on the other end. Synchronization between these
-// threads (potential consumption of the last element and take-up of the last
-// empty element) happens by means of a state variable in each element. States
-// are: empty, busy (in the process of insertion or removal) and ready.
-// Threads claim elements (empty->busy and ready->busy transitions) by means
-// of a CAS operation. The finishing transitions (busy->empty and busy->ready)
-// are done with a plain store, as the element is exclusively owned by the
-// current thread.
-//
-// Note: we could permit only pointers as elements; then we would not need a
-// separate state variable, as a null/non-null pointer value would serve as
-// the state, but that would require a malloc/free per operation for large,
-// complex values (and this is designed to store std::function<void()>).
-template <typename Work, unsigned kSize>
-class RunQueue {
- public:
- RunQueue() : front_(0), back_(0) {
- // require power-of-two for fast masking
- eigen_plain_assert((kSize & (kSize - 1)) == 0);
- eigen_plain_assert(kSize > 2); // why would you do this?
- eigen_plain_assert(kSize <= (64 << 10)); // leave enough space for counter
- for (unsigned i = 0; i < kSize; i++)
- array_[i].state.store(kEmpty, std::memory_order_relaxed);
- }
-
- ~RunQueue() { eigen_plain_assert(Size() == 0); }
-
- // PushFront inserts w at the beginning of the queue.
- // If queue is full returns w, otherwise returns default-constructed Work.
- Work PushFront(Work w) {
- unsigned front = front_.load(std::memory_order_relaxed);
- Elem* e = &array_[front & kMask];
- uint8_t s = e->state.load(std::memory_order_relaxed);
- if (s != kEmpty ||
- !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
- return w;
- front_.store(front + 1 + (kSize << 1), std::memory_order_relaxed);
- e->w = std::move(w);
- e->state.store(kReady, std::memory_order_release);
- return Work();
- }
-
- // PopFront removes and returns the first element in the queue.
- // If the queue was empty returns default-constructed Work.
- Work PopFront() {
- unsigned front = front_.load(std::memory_order_relaxed);
- Elem* e = &array_[(front - 1) & kMask];
- uint8_t s = e->state.load(std::memory_order_relaxed);
- if (s != kReady ||
- !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
- return Work();
- Work w = std::move(e->w);
- e->state.store(kEmpty, std::memory_order_release);
- front = ((front - 1) & kMask2) | (front & ~kMask2);
- front_.store(front, std::memory_order_relaxed);
- return w;
- }
-
- // PushBack adds w at the end of the queue.
- // If queue is full returns w, otherwise returns default-constructed Work.
- Work PushBack(Work w) {
- std::unique_lock<std::mutex> lock(mutex_);
- unsigned back = back_.load(std::memory_order_relaxed);
- Elem* e = &array_[(back - 1) & kMask];
- uint8_t s = e->state.load(std::memory_order_relaxed);
- if (s != kEmpty ||
- !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
- return w;
- back = ((back - 1) & kMask2) | (back & ~kMask2);
- back_.store(back, std::memory_order_relaxed);
- e->w = std::move(w);
- e->state.store(kReady, std::memory_order_release);
- return Work();
- }
-
-  // PopBack removes and returns the last element in the queue.
-  // If the queue was empty returns default-constructed Work.
- Work PopBack() {
- if (Empty()) return Work();
- std::unique_lock<std::mutex> lock(mutex_);
- unsigned back = back_.load(std::memory_order_relaxed);
- Elem* e = &array_[back & kMask];
- uint8_t s = e->state.load(std::memory_order_relaxed);
- if (s != kReady ||
- !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
- return Work();
- Work w = std::move(e->w);
- e->state.store(kEmpty, std::memory_order_release);
- back_.store(back + 1 + (kSize << 1), std::memory_order_relaxed);
- return w;
- }
-
-  // PopBackHalf removes and returns the last half of the elements in the
-  // queue. Returns the number of elements removed.
- unsigned PopBackHalf(std::vector<Work>* result) {
- if (Empty()) return 0;
- std::unique_lock<std::mutex> lock(mutex_);
- unsigned back = back_.load(std::memory_order_relaxed);
- unsigned size = Size();
- unsigned mid = back;
- if (size > 1) mid = back + (size - 1) / 2;
- unsigned n = 0;
- unsigned start = 0;
- for (; static_cast<int>(mid - back) >= 0; mid--) {
- Elem* e = &array_[mid & kMask];
- uint8_t s = e->state.load(std::memory_order_relaxed);
- if (n == 0) {
- if (s != kReady || !e->state.compare_exchange_strong(
- s, kBusy, std::memory_order_acquire))
- continue;
- start = mid;
- } else {
-        // Note: no need to store a temporary kBusy; we exclusively own these
-        // elements.
- eigen_plain_assert(s == kReady);
- }
- result->push_back(std::move(e->w));
- e->state.store(kEmpty, std::memory_order_release);
- n++;
- }
- if (n != 0)
- back_.store(start + 1 + (kSize << 1), std::memory_order_relaxed);
- return n;
- }
-
- // Size returns current queue size.
- // Can be called by any thread at any time.
- unsigned Size() const { return SizeOrNotEmpty<true>(); }
-
- // Empty tests whether container is empty.
- // Can be called by any thread at any time.
- bool Empty() const { return SizeOrNotEmpty<false>() == 0; }
-
- // Delete all the elements from the queue.
- void Flush() {
- while (!Empty()) {
- PopFront();
- }
- }
-
- private:
- static const unsigned kMask = kSize - 1;
- static const unsigned kMask2 = (kSize << 1) - 1;
- struct Elem {
- std::atomic<uint8_t> state;
- Work w;
- };
- enum {
- kEmpty,
- kBusy,
- kReady,
- };
- std::mutex mutex_;
-  // The low log(kSize) + 1 bits in front_ and back_ contain the rolling
-  // index of the front/back, respectively. The remaining bits contain
-  // modification counters that are incremented on Push operations. This
-  // allows us to (1) distinguish between empty and full conditions (if we
-  // used log(kSize) bits for the position, these conditions would be
-  // indistinguishable); (2) obtain a consistent snapshot of front_/back_ for
-  // the Size operation using the modification counters.
- std::atomic<unsigned> front_;
- std::atomic<unsigned> back_;
- Elem array_[kSize];
-
- // SizeOrNotEmpty returns current queue size; if NeedSizeEstimate is false,
- // only whether the size is 0 is guaranteed to be correct.
- // Can be called by any thread at any time.
- template<bool NeedSizeEstimate>
- unsigned SizeOrNotEmpty() const {
-    // Emptiness plays a critical role in thread pool blocking. So we go to
-    // great effort not to produce false positives (claiming a non-empty
-    // queue to be empty).
- unsigned front = front_.load(std::memory_order_acquire);
- for (;;) {
-      // Capture a consistent snapshot of front/back.
- unsigned back = back_.load(std::memory_order_acquire);
- unsigned front1 = front_.load(std::memory_order_relaxed);
- if (front != front1) {
- front = front1;
- std::atomic_thread_fence(std::memory_order_acquire);
- continue;
- }
- if (NeedSizeEstimate) {
- return CalculateSize(front, back);
- } else {
- // This value will be 0 if the queue is empty, and undefined otherwise.
- unsigned maybe_zero = ((front ^ back) & kMask2);
- // Queue size estimate must agree with maybe zero check on the queue
- // empty/non-empty state.
- eigen_assert((CalculateSize(front, back) == 0) == (maybe_zero == 0));
- return maybe_zero;
- }
- }
- }
-
- EIGEN_ALWAYS_INLINE
- unsigned CalculateSize(unsigned front, unsigned back) const {
- int size = (front & kMask2) - (back & kMask2);
- // Fix overflow.
- if (size < 0) size += 2 * kSize;
- // Order of modification in push/pop is crafted to make the queue look
- // larger than it is during concurrent modifications. E.g. push can
- // increment size before the corresponding pop has decremented it.
- // So the computed size can be up to kSize + 1, fix it.
- if (size > static_cast<int>(kSize)) size = kSize;
- return static_cast<unsigned>(size);
- }
-
- RunQueue(const RunQueue&) = delete;
- void operator=(const RunQueue&) = delete;
-};
-
-} // namespace Eigen
-
-#endif // EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_
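
A short sketch of the ownership contract described above: only the owner
thread may touch the front of the queue, while any thread may take from the
back (std::function<void()> stands in here for the pool's Task type):

    #include <functional>
    #include <unsupported/Eigen/CXX11/ThreadPool>

    Eigen::RunQueue<std::function<void()>, 1024> q;

    void OwnerThread() {
      // PushFront returns the item back if the queue is full.
      auto rejected = q.PushFront([] { /* work */ });
      if (rejected) rejected();   // queue full: run inline instead
      auto w = q.PopFront();      // owner-only operation
      if (w) w();
    }

    void StealingThread() {
      auto w = q.PopBack();       // safe from any thread (mutex-serialized)
      if (w) w();
    }
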
diff --git a/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadCancel.h b/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadCancel.h
deleted file mode 100644
index a05685f..0000000
--- a/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadCancel.h
+++ /dev/null
@@ -1,23 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H
-#define EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H
-
-// Try to come up with a portable way to cancel a thread
-#if EIGEN_OS_GNULINUX
- #define EIGEN_THREAD_CANCEL(t) \
- pthread_cancel(t.native_handle());
- #define EIGEN_SUPPORTS_THREAD_CANCELLATION 1
-#else
-#define EIGEN_THREAD_CANCEL(t)
-#endif
-
-
-#endif // EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H
diff --git a/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadEnvironment.h b/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadEnvironment.h
deleted file mode 100644
index d94a064..0000000
--- a/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadEnvironment.h
+++ /dev/null
@@ -1,40 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H
-#define EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H
-
-namespace Eigen {
-
-struct StlThreadEnvironment {
- struct Task {
- std::function<void()> f;
- };
-
- // EnvThread constructor must start the thread,
- // destructor must join the thread.
- class EnvThread {
- public:
- EnvThread(std::function<void()> f) : thr_(std::move(f)) {}
- ~EnvThread() { thr_.join(); }
- // This function is called when the threadpool is cancelled.
- void OnCancel() { }
-
- private:
- std::thread thr_;
- };
-
- EnvThread* CreateThread(std::function<void()> f) { return new EnvThread(std::move(f)); }
- Task CreateTask(std::function<void()> f) { return Task{std::move(f)}; }
- void ExecuteTask(const Task& t) { t.f(); }
-};
-
-} // namespace Eigen
-
-#endif // EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H
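
Because ThreadPoolTempl is parameterized on the environment, a custom
environment only needs to provide this same surface. A hypothetical variant
that counts executed tasks by wrapping StlThreadEnvironment:

    #include <atomic>
    #include <unsupported/Eigen/CXX11/ThreadPool>

    struct CountingEnvironment : Eigen::StlThreadEnvironment {
      static std::atomic<long>& executed() {
        static std::atomic<long> n{0};
        return n;
      }
      // Shadows ExecuteTask; CreateThread/CreateTask are inherited.
      void ExecuteTask(const Task& t) {
        executed().fetch_add(1, std::memory_order_relaxed);
        t.f();
      }
    };

    // Usage: Eigen::ThreadPoolTempl<CountingEnvironment> pool(4);
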
diff --git a/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadLocal.h b/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadLocal.h
deleted file mode 100644
index 4e68474..0000000
--- a/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadLocal.h
+++ /dev/null
@@ -1,301 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
-#define EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
-
-#ifdef EIGEN_AVOID_THREAD_LOCAL
-
-#ifdef EIGEN_THREAD_LOCAL
-#undef EIGEN_THREAD_LOCAL
-#endif
-
-#else
-
-#if EIGEN_MAX_CPP_VER >= 11 && \
- ((EIGEN_COMP_GNUC && EIGEN_GNUC_AT_LEAST(4, 8)) || \
- __has_feature(cxx_thread_local) || \
- (EIGEN_COMP_MSVC >= 1900) )
-#define EIGEN_THREAD_LOCAL static thread_local
-#endif
-
-// Disable TLS for Apple and Android builds with older toolchains.
-#if defined(__APPLE__)
-// Included for TARGET_OS_IPHONE, __IPHONE_OS_VERSION_MIN_REQUIRED,
-// __IPHONE_8_0.
-#include <Availability.h>
-#include <TargetConditionals.h>
-#endif
-// Checks whether C++11's `thread_local` storage duration specifier is
-// supported.
-#if defined(__apple_build_version__) && \
- ((__apple_build_version__ < 8000042) || \
- (TARGET_OS_IPHONE && __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_9_0))
-// Notes: Xcode's clang did not support `thread_local` until version
-// 8, and even then not for all iOS < 9.0.
-#undef EIGEN_THREAD_LOCAL
-
-#elif defined(__ANDROID__) && EIGEN_COMP_CLANG
-// There are platforms for which TLS should not be used even though the compiler
-// makes it seem like it's supported (Android NDK < r12b for example).
-// This is primarily because of linker problems and toolchain misconfiguration:
-// TLS isn't supported until NDK r12b per
-// https://developer.android.com/ndk/downloads/revision_history.html
-// Since NDK r16, `__NDK_MAJOR__` and `__NDK_MINOR__` are defined in
-// <android/ndk-version.h>. For NDK < r16, users should define these macros,
-// e.g. `-D__NDK_MAJOR__=11 -D__NDK_MINOR__=0` for NDK r11.
-#if __has_include(<android/ndk-version.h>)
-#include <android/ndk-version.h>
-#endif // __has_include(<android/ndk-version.h>)
-#if defined(__ANDROID__) && defined(__clang__) && defined(__NDK_MAJOR__) && \
- defined(__NDK_MINOR__) && \
- ((__NDK_MAJOR__ < 12) || ((__NDK_MAJOR__ == 12) && (__NDK_MINOR__ < 1)))
-#undef EIGEN_THREAD_LOCAL
-#endif
-#endif // defined(__ANDROID__) && defined(__clang__)
-
-#endif // EIGEN_AVOID_THREAD_LOCAL
-
-namespace Eigen {
-
-namespace internal {
-template <typename T>
-struct ThreadLocalNoOpInitialize {
- void operator()(T&) const {}
-};
-
-template <typename T>
-struct ThreadLocalNoOpRelease {
- void operator()(T&) const {}
-};
-
-} // namespace internal
-
-// Thread-local container for elements of type T that does not use thread-local
-// storage. As long as the number of unique threads accessing this storage
-// is smaller than `capacity_`, it is lock-free and wait-free. Otherwise it
-// falls back on a mutex for synchronization.
-//
-// Type `T` has to be default constructible, and by default each thread will
-// get a default-constructed value. It is possible to specify a custom
-// `initialize` callable, which will be called lazily from each thread
-// accessing this object and will be passed a default-initialized object of
-// type `T`. It is also possible to pass a custom `release` callable, which
-// will be invoked before calling ~T().
-//
-// Example:
-//
-// struct Counter {
-//   int value = 0;
-// };
-//
-// Eigen::ThreadLocal<Counter> counter(10);
-//
-// // Each thread will have access to its own counter object.
-// Counter& cnt = counter.local();
-// cnt.value++;
-//
-// WARNING: Eigen::ThreadLocal uses the OS-specific value returned by
-// std::this_thread::get_id() to identify threads. This value is not guaranteed
-// to be unique except for the life of the thread. A newly created thread may
-// get an OS-specific ID equal to that of an already destroyed thread.
-//
-// Somewhat similar to TBB thread local storage, with similar restrictions:
-// https://www.threadingbuildingblocks.org/docs/help/reference/thread_local_storage/enumerable_thread_specific_cls.html
-//
-template <typename T,
- typename Initialize = internal::ThreadLocalNoOpInitialize<T>,
- typename Release = internal::ThreadLocalNoOpRelease<T>>
-class ThreadLocal {
-  // We preallocate default constructed elements in MaxSizeVector.
- static_assert(std::is_default_constructible<T>::value,
- "ThreadLocal data type must be default constructible");
-
- public:
- explicit ThreadLocal(int capacity)
- : ThreadLocal(capacity, internal::ThreadLocalNoOpInitialize<T>(),
- internal::ThreadLocalNoOpRelease<T>()) {}
-
- ThreadLocal(int capacity, Initialize initialize)
- : ThreadLocal(capacity, std::move(initialize),
- internal::ThreadLocalNoOpRelease<T>()) {}
-
- ThreadLocal(int capacity, Initialize initialize, Release release)
- : initialize_(std::move(initialize)),
- release_(std::move(release)),
- capacity_(capacity),
- data_(capacity_),
- ptr_(capacity_),
- filled_records_(0) {
- eigen_assert(capacity_ >= 0);
- data_.resize(capacity_);
- for (int i = 0; i < capacity_; ++i) {
- ptr_.emplace_back(nullptr);
- }
- }
-
- T& local() {
- std::thread::id this_thread = std::this_thread::get_id();
- if (capacity_ == 0) return SpilledLocal(this_thread);
-
- std::size_t h = std::hash<std::thread::id>()(this_thread);
- const int start_idx = h % capacity_;
-
-    // NOTE: From the definition of `std::this_thread::get_id()` it is
-    // guaranteed that we can never have concurrent insertions with the same
-    // key into our hash-map-like data structure. If we didn't find an element
-    // during the initial traversal, it's guaranteed that no one else could
-    // have inserted it while we are in this function. This allows us to
-    // massively simplify our lock-free insert-only hash map.
-
- // Check if we already have an element for `this_thread`.
- int idx = start_idx;
- while (ptr_[idx].load() != nullptr) {
- ThreadIdAndValue& record = *(ptr_[idx].load());
- if (record.thread_id == this_thread) return record.value;
-
- idx += 1;
- if (idx >= capacity_) idx -= capacity_;
- if (idx == start_idx) break;
- }
-
-    // If we are here, it means that we found an insertion point in the
-    // lookup table at `idx`, or we did a full traversal and the table is
-    // full.
-
- // If lock-free storage is full, fallback on mutex.
- if (filled_records_.load() >= capacity_) return SpilledLocal(this_thread);
-
-    // We double-check that we still have space to insert an element into
-    // the lock-free storage. If the old value in `filled_records_` is not
-    // smaller than the capacity, some other thread added an element while we
-    // were traversing the lookup table.
- int insertion_index =
- filled_records_.fetch_add(1, std::memory_order_relaxed);
- if (insertion_index >= capacity_) return SpilledLocal(this_thread);
-
-    // At this point it's guaranteed that we can access
-    // data_[insertion_index] without a data race.
- data_[insertion_index].thread_id = this_thread;
- initialize_(data_[insertion_index].value);
-
- // That's the pointer we'll put into the lookup table.
- ThreadIdAndValue* inserted = &data_[insertion_index];
-
- // We'll use nullptr pointer to ThreadIdAndValue in a compare-and-swap loop.
- ThreadIdAndValue* empty = nullptr;
-
-    // Now we have to find an insertion point in the lookup table. We start
-    // from the `idx` that was identified as an insertion point above; it's
-    // guaranteed that there is an empty record somewhere in the lookup table
-    // (because we created a record in `data_`).
- const int insertion_idx = idx;
-
- do {
- // Always start search from the original insertion candidate.
- idx = insertion_idx;
- while (ptr_[idx].load() != nullptr) {
- idx += 1;
- if (idx >= capacity_) idx -= capacity_;
- // If we did a full loop, it means that we don't have any free entries
- // in the lookup table, and this means that something is terribly wrong.
- eigen_assert(idx != insertion_idx);
- }
- // Atomic CAS of the pointer guarantees that any other thread, that will
- // follow this pointer will see all the mutations in the `data_`.
- } while (!ptr_[idx].compare_exchange_weak(empty, inserted));
-
- return inserted->value;
- }
-
-  // WARNING: It is not thread-safe to call this concurrently with `local()`.
- void ForEach(std::function<void(std::thread::id, T&)> f) {
- // Reading directly from `data_` is unsafe, because only CAS to the
- // record in `ptr_` makes all changes visible to other threads.
- for (auto& ptr : ptr_) {
- ThreadIdAndValue* record = ptr.load();
- if (record == nullptr) continue;
- f(record->thread_id, record->value);
- }
-
-    // We did not spill into the map-based storage.
-    if (filled_records_.load(std::memory_order_relaxed) < capacity_) return;
-
-    // Adds a happens-before edge from the last call to SpilledLocal().
- std::unique_lock<std::mutex> lock(mu_);
- for (auto& kv : per_thread_map_) {
- f(kv.first, kv.second);
- }
- }
-
-  // WARNING: It is not thread-safe to call this concurrently with `local()`.
- ~ThreadLocal() {
- // Reading directly from `data_` is unsafe, because only CAS to the record
- // in `ptr_` makes all changes visible to other threads.
- for (auto& ptr : ptr_) {
- ThreadIdAndValue* record = ptr.load();
- if (record == nullptr) continue;
- release_(record->value);
- }
-
-    // We did not spill into the map-based storage.
-    if (filled_records_.load(std::memory_order_relaxed) < capacity_) return;
-
-    // Adds a happens-before edge from the last call to SpilledLocal().
- std::unique_lock<std::mutex> lock(mu_);
- for (auto& kv : per_thread_map_) {
- release_(kv.second);
- }
- }
-
- private:
- struct ThreadIdAndValue {
- std::thread::id thread_id;
- T value;
- };
-
-  // Use an unordered map guarded by a mutex when the lock-free storage is
-  // full.
- T& SpilledLocal(std::thread::id this_thread) {
- std::unique_lock<std::mutex> lock(mu_);
-
- auto it = per_thread_map_.find(this_thread);
- if (it == per_thread_map_.end()) {
- auto result = per_thread_map_.emplace(this_thread, T());
- eigen_assert(result.second);
- initialize_((*result.first).second);
- return (*result.first).second;
- } else {
- return it->second;
- }
- }
-
- Initialize initialize_;
- Release release_;
- const int capacity_;
-
-  // Storage that backs the lock-free lookup table `ptr_`. Records are stored
-  // in this storage contiguously, starting from index 0.
- MaxSizeVector<ThreadIdAndValue> data_;
-
- // Atomic pointers to the data stored in `data_`. Used as a lookup table for
- // linear probing hash map (https://en.wikipedia.org/wiki/Linear_probing).
- MaxSizeVector<std::atomic<ThreadIdAndValue*>> ptr_;
-
- // Number of records stored in the `data_`.
- std::atomic<int> filled_records_;
-
-  // We fall back on a per-thread map if the lock-free storage is full. In
-  // practice this should never happen if `capacity_` is a reasonable
-  // estimate of the number of threads running in the system.
- std::mutex mu_; // Protects per_thread_map_.
- std::unordered_map<std::thread::id, T> per_thread_map_;
-};
-
-} // namespace Eigen
-
-#endif // EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
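
A sketch of the custom initialize/release hooks described in the header
comment above, assuming at most 16 distinct threads touch the container:

    #include <unsupported/Eigen/CXX11/ThreadPool>

    struct Accum { long sum = 0; };

    auto init = [](Accum& a) { a.sum = 0; };            // lazy, per thread
    auto release = [](Accum& a) { /* flush a.sum */ };  // runs before ~Accum()

    Eigen::ThreadLocal<Accum, decltype(init), decltype(release)>
        accum(/*capacity=*/16, init, release);

    // In worker threads:
    //   accum.local().sum += 1;
    // After all workers are quiescent (never concurrently with local()):
    //   accum.ForEach([](std::thread::id, Accum& a) { /* read a.sum */ });
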
diff --git a/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadPoolInterface.h b/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadPoolInterface.h
deleted file mode 100644
index 25030dc..0000000
--- a/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadPoolInterface.h
+++ /dev/null
@@ -1,48 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H
-#define EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H
-
-namespace Eigen {
-
-// This defines an interface that ThreadPoolDevice can take to use
-// custom thread pools underneath.
-class ThreadPoolInterface {
- public:
- // Submits a closure to be run by a thread in the pool.
- virtual void Schedule(std::function<void()> fn) = 0;
-
- // Submits a closure to be run by threads in the range [start, end) in the
- // pool.
- virtual void ScheduleWithHint(std::function<void()> fn, int /*start*/,
- int /*end*/) {
- // Just defer to Schedule in case sub-classes aren't interested in
- // overriding this functionality.
- Schedule(fn);
- }
-
- // If implemented, stop processing the closures that have been enqueued.
- // Currently running closures may still be processed.
- // If not implemented, does nothing.
- virtual void Cancel() {}
-
- // Returns the number of threads in the pool.
- virtual int NumThreads() const = 0;
-
- // Returns a logical thread index between 0 and NumThreads() - 1 if called
- // from one of the threads in the pool. Returns -1 otherwise.
- virtual int CurrentThreadId() const = 0;
-
- virtual ~ThreadPoolInterface() {}
-};
-
-} // namespace Eigen
-
-#endif // EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H
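
A hypothetical adapter showing the minimal surface a custom pool must
implement to be usable wherever a ThreadPoolInterface is expected:

    #include <functional>
    #include <unsupported/Eigen/CXX11/ThreadPool>

    class InlineExecutor : public Eigen::ThreadPoolInterface {
     public:
      void Schedule(std::function<void()> fn) override { fn(); }  // run inline
      int NumThreads() const override { return 1; }
      int CurrentThreadId() const override { return -1; }  // not a pool thread
    };
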
diff --git a/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadYield.h b/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadYield.h
deleted file mode 100644
index a859c7b..0000000
--- a/src/EigenUnsupported/CXX11/src/ThreadPool/ThreadYield.h
+++ /dev/null
@@ -1,20 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H
-#define EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H
-
-// Try to come up with a portable way to yield
-#if EIGEN_COMP_GNUC && EIGEN_GNUC_AT_MOST(4, 7)
-#define EIGEN_THREAD_YIELD() sched_yield()
-#else
-#define EIGEN_THREAD_YIELD() std::this_thread::yield()
-#endif
-
-#endif // EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H
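
A typical spin-then-yield loop built on the macro above (SpinUntil is a
hypothetical helper, not part of Eigen):

    template <typename Pred>
    void SpinUntil(Pred&& done) {
      for (int i = 0; i < 64 && !done(); ++i) {
        // short busy-wait before giving up the time slice
      }
      while (!done()) {
        EIGEN_THREAD_YIELD();  // let other threads run between checks
      }
    }
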
diff --git a/src/EigenUnsupported/CXX11/src/util/CXX11Meta.h b/src/EigenUnsupported/CXX11/src/util/CXX11Meta.h
deleted file mode 100644
index 149ceaf..0000000
--- a/src/EigenUnsupported/CXX11/src/util/CXX11Meta.h
+++ /dev/null
@@ -1,537 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11META_H
-#define EIGEN_CXX11META_H
-
-#include <vector>
-#include "EmulateArray.h"
-
-#include "CXX11Workarounds.h"
-
-namespace Eigen {
-
-namespace internal {
-
-/** \internal
- * \file CXX11/util/CXX11Meta.h
- * This file contains generic metaprogramming classes which are not specifically related to Eigen.
- * This file expands upon Core/util/Meta.h and adds support for C++11 specific features.
- */
-
-template<typename... tt>
-struct type_list { constexpr static int count = sizeof...(tt); };
-
-template<typename t, typename... tt>
-struct type_list<t, tt...> { constexpr static int count = sizeof...(tt) + 1; typedef t first_type; };
-
-template<typename T, T... nn>
-struct numeric_list { constexpr static std::size_t count = sizeof...(nn); };
-
-template<typename T, T n, T... nn>
-struct numeric_list<T, n, nn...> { static const std::size_t count = sizeof...(nn) + 1; const static T first_value = n; };
-
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-/* numeric list constructors
- *
- * equivalencies:
- * constructor result
- * typename gen_numeric_list<int, 5>::type numeric_list<int, 0,1,2,3,4>
- * typename gen_numeric_list_reversed<int, 5>::type numeric_list<int, 4,3,2,1,0>
- * typename gen_numeric_list_swapped_pair<int, 5,1,2>::type numeric_list<int, 0,2,1,3,4>
- *     typename gen_numeric_list_repeated<int, 5, 0>::type         numeric_list<int, 0,0,0,0,0>
- */
-
-template<typename T, std::size_t n, T start = 0, T... ii> struct gen_numeric_list : gen_numeric_list<T, n-1, start, start + n-1, ii...> {};
-template<typename T, T start, T... ii> struct gen_numeric_list<T, 0, start, ii...> { typedef numeric_list<T, ii...> type; };
-
-template<typename T, std::size_t n, T start = 0, T... ii> struct gen_numeric_list_reversed : gen_numeric_list_reversed<T, n-1, start, ii..., start + n-1> {};
-template<typename T, T start, T... ii> struct gen_numeric_list_reversed<T, 0, start, ii...> { typedef numeric_list<T, ii...> type; };
-
-template<typename T, std::size_t n, T a, T b, T start = 0, T... ii> struct gen_numeric_list_swapped_pair : gen_numeric_list_swapped_pair<T, n-1, a, b, start, (start + n-1) == a ? b : ((start + n-1) == b ? a : (start + n-1)), ii...> {};
-template<typename T, T a, T b, T start, T... ii> struct gen_numeric_list_swapped_pair<T, 0, a, b, start, ii...> { typedef numeric_list<T, ii...> type; };
-
-template<typename T, std::size_t n, T V, T... nn> struct gen_numeric_list_repeated : gen_numeric_list_repeated<T, n-1, V, V, nn...> {};
-template<typename T, T V, T... nn> struct gen_numeric_list_repeated<T, 0, V, nn...> { typedef numeric_list<T, nn...> type; };
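
(The equivalences in the table above can be checked directly; each
static_assert below restates one row, assuming the Eigen::internal namespace
and <type_traits>:)

    static_assert(std::is_same<gen_numeric_list<int, 5>::type,
                               numeric_list<int, 0, 1, 2, 3, 4>>::value, "");
    static_assert(std::is_same<gen_numeric_list_reversed<int, 5>::type,
                               numeric_list<int, 4, 3, 2, 1, 0>>::value, "");
    static_assert(std::is_same<gen_numeric_list_swapped_pair<int, 5, 1, 2>::type,
                               numeric_list<int, 0, 2, 1, 3, 4>>::value, "");
    static_assert(std::is_same<gen_numeric_list_repeated<int, 5, 0>::type,
                               numeric_list<int, 0, 0, 0, 0, 0>>::value, "");
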
-
-/* list manipulation: concatenate */
-
-template<class a, class b> struct concat;
-
-template<typename... as, typename... bs> struct concat<type_list<as...>, type_list<bs...>> { typedef type_list<as..., bs...> type; };
-template<typename T, T... as, T... bs> struct concat<numeric_list<T, as...>, numeric_list<T, bs...> > { typedef numeric_list<T, as..., bs...> type; };
-
-template<typename... p> struct mconcat;
-template<typename a> struct mconcat<a> { typedef a type; };
-template<typename a, typename b> struct mconcat<a, b> : concat<a, b> {};
-template<typename a, typename b, typename... cs> struct mconcat<a, b, cs...> : concat<a, typename mconcat<b, cs...>::type> {};
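-
-// Illustrative sketch: mconcat folds concat over any number of lists.
-static_assert(is_same<mconcat<numeric_list<int, 0>,
-                              numeric_list<int, 1>,
-                              numeric_list<int, 2, 3> >::type,
-                      numeric_list<int, 0, 1, 2, 3> >::value, "mconcat sketch");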
-
-/* list manipulation: extract slices */
-
-template<int n, typename x> struct take;
-template<int n, typename a, typename... as> struct take<n, type_list<a, as...>> : concat<type_list<a>, typename take<n-1, type_list<as...>>::type> {};
-template<int n> struct take<n, type_list<>> { typedef type_list<> type; };
-template<typename a, typename... as> struct take<0, type_list<a, as...>> { typedef type_list<> type; };
-template<> struct take<0, type_list<>> { typedef type_list<> type; };
-
-template<typename T, int n, T a, T... as> struct take<n, numeric_list<T, a, as...>> : concat<numeric_list<T, a>, typename take<n-1, numeric_list<T, as...>>::type> {};
-template<typename T, int n> struct take<n, numeric_list<T>> { typedef numeric_list<T> type; };
-template<typename T, T a, T... as> struct take<0, numeric_list<T, a, as...>> { typedef numeric_list<T> type; };
-template<typename T> struct take<0, numeric_list<T>> { typedef numeric_list<T> type; };
-
-template<typename T, int n, T... ii> struct h_skip_helper_numeric;
-template<typename T, int n, T i, T... ii> struct h_skip_helper_numeric<T, n, i, ii...> : h_skip_helper_numeric<T, n-1, ii...> {};
-template<typename T, T i, T... ii> struct h_skip_helper_numeric<T, 0, i, ii...> { typedef numeric_list<T, i, ii...> type; };
-template<typename T, int n> struct h_skip_helper_numeric<T, n> { typedef numeric_list<T> type; };
-template<typename T> struct h_skip_helper_numeric<T, 0> { typedef numeric_list<T> type; };
-
-template<int n, typename... tt> struct h_skip_helper_type;
-template<int n, typename t, typename... tt> struct h_skip_helper_type<n, t, tt...> : h_skip_helper_type<n-1, tt...> {};
-template<typename t, typename... tt> struct h_skip_helper_type<0, t, tt...> { typedef type_list<t, tt...> type; };
-template<int n> struct h_skip_helper_type<n> { typedef type_list<> type; };
-template<> struct h_skip_helper_type<0> { typedef type_list<> type; };
-#endif //not EIGEN_PARSED_BY_DOXYGEN
-
-template<int n>
-struct h_skip {
- template<typename T, T... ii>
- constexpr static EIGEN_STRONG_INLINE typename h_skip_helper_numeric<T, n, ii...>::type helper(numeric_list<T, ii...>) { return typename h_skip_helper_numeric<T, n, ii...>::type(); }
- template<typename... tt>
- constexpr static EIGEN_STRONG_INLINE typename h_skip_helper_type<n, tt...>::type helper(type_list<tt...>) { return typename h_skip_helper_type<n, tt...>::type(); }
-};
-
-template<int n, typename a> struct skip { typedef decltype(h_skip<n>::helper(a())) type; };
-
-template<int start, int count, typename a> struct slice : take<count, typename skip<start, a>::type> {};
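-
-// Illustrative sketch: slice<start, count, list> skips `start` elements and
-// keeps the next `count`.
-static_assert(is_same<slice<1, 2, numeric_list<int, 0, 1, 2, 3> >::type,
-                      numeric_list<int, 1, 2> >::value, "slice sketch");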
-
-/* list manipulation: retrieve single element from list */
-
-template<int n, typename x> struct get;
-
-template<int n, typename a, typename... as> struct get<n, type_list<a, as...>> : get<n-1, type_list<as...>> {};
-template<typename a, typename... as> struct get<0, type_list<a, as...>> { typedef a type; };
-
-template<typename T, int n, T a, T... as> struct get<n, numeric_list<T, a, as...>> : get<n-1, numeric_list<T, as...>> {};
-template<typename T, T a, T... as> struct get<0, numeric_list<T, a, as...>> { constexpr static T value = a; };
-
-template<std::size_t n, typename T, T a, T... as> constexpr T array_get(const numeric_list<T, a, as...>&) {
- return get<(int)n, numeric_list<T, a, as...>>::value;
-}
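-
-// Illustrative sketch:
-static_assert(get<2, numeric_list<int, 4, 5, 6> >::value == 6, "get sketch");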
-
-/* always get type, regardless of dummy; good for parameter pack expansion */
-
-template<typename T, T dummy, typename t> struct id_numeric { typedef t type; };
-template<typename dummy, typename t> struct id_type { typedef t type; };
-
-/* equality checking, flagged version */
-
-template<typename a, typename b> struct is_same_gf : is_same<a, b> { constexpr static int global_flags = 0; };
-
-/* apply_op to list */
-
-template<
- bool from_left, // false
- template<typename, typename> class op,
- typename additional_param,
- typename... values
->
-struct h_apply_op_helper { typedef type_list<typename op<values, additional_param>::type...> type; };
-template<
- template<typename, typename> class op,
- typename additional_param,
- typename... values
->
-struct h_apply_op_helper<true, op, additional_param, values...> { typedef type_list<typename op<additional_param, values>::type...> type; };
-
-template<
- bool from_left,
- template<typename, typename> class op,
- typename additional_param
->
-struct h_apply_op
-{
- template<typename... values>
- constexpr static typename h_apply_op_helper<from_left, op, additional_param, values...>::type helper(type_list<values...>)
- { return typename h_apply_op_helper<from_left, op, additional_param, values...>::type(); }
-};
-
-template<
- template<typename, typename> class op,
- typename additional_param,
- typename a
->
-struct apply_op_from_left { typedef decltype(h_apply_op<true, op, additional_param>::helper(a())) type; };
-
-template<
- template<typename, typename> class op,
- typename additional_param,
- typename a
->
-struct apply_op_from_right { typedef decltype(h_apply_op<false, op, additional_param>::helper(a())) type; };
-
-/* see if an element is in a list */
-
-template<
- template<typename, typename> class test,
- typename check_against,
- typename h_list,
- bool last_check_positive = false
->
-struct contained_in_list;
-
-template<
- template<typename, typename> class test,
- typename check_against,
- typename h_list
->
-struct contained_in_list<test, check_against, h_list, true>
-{
- constexpr static bool value = true;
-};
-
-template<
- template<typename, typename> class test,
- typename check_against,
- typename a,
- typename... as
->
-struct contained_in_list<test, check_against, type_list<a, as...>, false> : contained_in_list<test, check_against, type_list<as...>, test<check_against, a>::value> {};
-
-template<
- template<typename, typename> class test,
- typename check_against
- EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, empty)
->
-struct contained_in_list<test, check_against, type_list<EIGEN_TPL_PP_SPEC_HACK_USE(empty)>, false> { constexpr static bool value = false; };
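-
-// Illustrative sketch: the recursion stops as soon as one test is positive.
-static_assert(contained_in_list<is_same, int, type_list<float, int> >::value,
-              "contained_in_list sketch");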
-
-/* see if an element is in a list and check for global flags */
-
-template<
- template<typename, typename> class test,
- typename check_against,
- typename h_list,
- int default_flags = 0,
- bool last_check_positive = false,
- int last_check_flags = default_flags
->
-struct contained_in_list_gf;
-
-template<
- template<typename, typename> class test,
- typename check_against,
- typename h_list,
- int default_flags,
- int last_check_flags
->
-struct contained_in_list_gf<test, check_against, h_list, default_flags, true, last_check_flags>
-{
- constexpr static bool value = true;
- constexpr static int global_flags = last_check_flags;
-};
-
-template<
- template<typename, typename> class test,
- typename check_against,
- typename a,
- typename... as,
- int default_flags,
- int last_check_flags
->
-struct contained_in_list_gf<test, check_against, type_list<a, as...>, default_flags, false, last_check_flags> : contained_in_list_gf<test, check_against, type_list<as...>, default_flags, test<check_against, a>::value, test<check_against, a>::global_flags> {};
-
-template<
- template<typename, typename> class test,
- typename check_against
- EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, empty),
- int default_flags,
- int last_check_flags
->
-struct contained_in_list_gf<test, check_against, type_list<EIGEN_TPL_PP_SPEC_HACK_USE(empty)>, default_flags, false, last_check_flags> { constexpr static bool value = false; constexpr static int global_flags = default_flags; };
-
-/* generic reductions */
-
-template<
- typename Reducer,
- typename... Ts
-> struct reduce;
-
-template<
- typename Reducer
-> struct reduce<Reducer>
-{
- EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE int run() { return Reducer::Identity; }
-};
-
-template<
- typename Reducer,
- typename A
-> struct reduce<Reducer, A>
-{
- EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE A run(A a) { return a; }
-};
-
-template<
- typename Reducer,
- typename A,
- typename... Ts
-> struct reduce<Reducer, A, Ts...>
-{
- EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, Ts... ts) -> decltype(Reducer::run(a, reduce<Reducer, Ts...>::run(ts...))) {
- return Reducer::run(a, reduce<Reducer, Ts...>::run(ts...));
- }
-};
-
-/* generic binary operations */
-
-struct sum_op {
- template<typename A, typename B> EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a + b) { return a + b; }
- static constexpr int Identity = 0;
-};
-struct product_op {
- template<typename A, typename B> EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a * b) { return a * b; }
- static constexpr int Identity = 1;
-};
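-
-// Illustrative sketch: reduce folds its arguments through the reducer, and the
-// reducer's Identity member covers the empty case.
-static_assert(reduce<sum_op, int, int, int>::run(1, 2, 3) == 6, "reduce/sum_op sketch");
-static_assert(reduce<product_op, int, int, int>::run(2, 3, 4) == 24, "reduce/product_op sketch");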
-
-struct logical_and_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a && b) { return a && b; } };
-struct logical_or_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a || b) { return a || b; } };
-
-struct equal_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a == b) { return a == b; } };
-struct not_equal_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a != b) { return a != b; } };
-struct lesser_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a < b) { return a < b; } };
-struct lesser_equal_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a <= b) { return a <= b; } };
-struct greater_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a > b) { return a > b; } };
-struct greater_equal_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a >= b) { return a >= b; } };
-
-/* generic unary operations */
-
-struct not_op { template<typename A> constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(!a) { return !a; } };
-struct negation_op { template<typename A> constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(-a) { return -a; } };
-struct greater_equal_zero_op { template<typename A> constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(a >= 0) { return a >= 0; } };
-
-
-/* reductions for lists */
-
-// Using an auto -> return value specification makes ICC 13.0 and 13.1 crash here,
-// so we have to spell the return type out in front. (13.0 doesn't work with
-// array_prod/array_reduce/... anyway, but 13.1 does.)
-template<typename... Ts>
-EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE decltype(reduce<product_op, Ts...>::run((*((Ts*)0))...)) arg_prod(Ts... ts)
-{
- return reduce<product_op, Ts...>::run(ts...);
-}
-
-template<typename... Ts>
-constexpr EIGEN_STRONG_INLINE decltype(reduce<sum_op, Ts...>::run((*((Ts*)0))...)) arg_sum(Ts... ts)
-{
- return reduce<sum_op, Ts...>::run(ts...);
-}
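-
-// Illustrative sketch:
-static_assert(arg_sum(1, 2, 3) == 6, "arg_sum sketch");
-static_assert(arg_prod(2, 3, 4) == 24, "arg_prod sketch");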
-
-/* reverse arrays */
-
-template<typename Array, int... n>
-constexpr EIGEN_STRONG_INLINE Array h_array_reverse(Array arr, numeric_list<int, n...>)
-{
- return {{array_get<sizeof...(n) - n - 1>(arr)...}};
-}
-
-template<typename T, std::size_t N>
-constexpr EIGEN_STRONG_INLINE array<T, N> array_reverse(array<T, N> arr)
-{
- return h_array_reverse(arr, typename gen_numeric_list<int, N>::type());
-}
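-
-// e.g. (illustrative): reversing an array holding {1, 2, 3} yields {3, 2, 1}.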
-
-
-/* generic array reductions */
-
-// We can't reuse the generic reduce() interface above because Intel's compiler
-// *really* doesn't like it, so we reimplement the reduction here.
-// (We start from N - 1 and work down to 0 because a specialization for
-// n == N - 1 also doesn't work in Intel's compiler, sending it into an
-// infinite loop.)
-template<typename Reducer, typename T, std::size_t N, std::size_t n = N - 1>
-struct h_array_reduce {
- EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(array<T, N> arr, T identity) -> decltype(Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr, identity), array_get<n>(arr)))
- {
- return Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr, identity), array_get<n>(arr));
- }
-};
-
-template<typename Reducer, typename T, std::size_t N>
-struct h_array_reduce<Reducer, T, N, 0>
-{
- EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE T run(const array<T, N>& arr, T)
- {
- return array_get<0>(arr);
- }
-};
-
-template<typename Reducer, typename T>
-struct h_array_reduce<Reducer, T, 0>
-{
- EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE T run(const array<T, 0>&, T identity)
- {
- return identity;
- }
-};
-
-template<typename Reducer, typename T, std::size_t N>
-EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_reduce(const array<T, N>& arr, T identity) -> decltype(h_array_reduce<Reducer, T, N>::run(arr, identity))
-{
- return h_array_reduce<Reducer, T, N>::run(arr, identity);
-}
-
-/* standard array reductions */
-
-template<typename T, std::size_t N>
-EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_sum(const array<T, N>& arr) -> decltype(array_reduce<sum_op, T, N>(arr, static_cast<T>(0)))
-{
- return array_reduce<sum_op, T, N>(arr, static_cast<T>(0));
-}
-
-template<typename T, std::size_t N>
-EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_prod(const array<T, N>& arr) -> decltype(array_reduce<product_op, T, N>(arr, static_cast<T>(1)))
-{
- return array_reduce<product_op, T, N>(arr, static_cast<T>(1));
-}
-
-template<typename t>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector<t>& a) {
- eigen_assert(a.size() > 0);
- t prod = 1;
- for (size_t i = 0; i < a.size(); ++i) { prod *= a[i]; }
- return prod;
-}
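-
-// e.g. (illustrative): for an array holding {2, 3, 4}, array_sum yields 9 and
-// array_prod yields 24.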
-
-/* zip an array */
-
-template<typename Op, typename A, typename B, std::size_t N, int... n>
-constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A(), B())),N> h_array_zip(array<A, N> a, array<B, N> b, numeric_list<int, n...>)
-{
- return array<decltype(Op::run(A(), B())),N>{{ Op::run(array_get<n>(a), array_get<n>(b))... }};
-}
-
-template<typename Op, typename A, typename B, std::size_t N>
-constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A(), B())),N> array_zip(array<A, N> a, array<B, N> b)
-{
- return h_array_zip<Op>(a, b, typename gen_numeric_list<int, N>::type());
-}
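-
-// e.g. (illustrative): array_zip<sum_op>(a, b) yields the elementwise sums
-// {a[0] + b[0], ..., a[N-1] + b[N-1]}.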
-
-/* zip an array and reduce the result */
-
-template<typename Reducer, typename Op, typename A, typename B, std::size_t N, int... n>
-constexpr EIGEN_STRONG_INLINE auto h_array_zip_and_reduce(array<A, N> a, array<B, N> b, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(array_get<n>(a), array_get<n>(b))...))
-{
- return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(array_get<n>(a), array_get<n>(b))...);
-}
-
-template<typename Reducer, typename Op, typename A, typename B, std::size_t N>
-constexpr EIGEN_STRONG_INLINE auto array_zip_and_reduce(array<A, N> a, array<B, N> b) -> decltype(h_array_zip_and_reduce<Reducer, Op, A, B, N>(a, b, typename gen_numeric_list<int, N>::type()))
-{
- return h_array_zip_and_reduce<Reducer, Op, A, B, N>(a, b, typename gen_numeric_list<int, N>::type());
-}
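-
-// e.g. (illustrative): array_zip_and_reduce<sum_op, product_op>(a, b) computes
-// the dot product of a and b.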
-
-/* apply stuff to an array */
-
-template<typename Op, typename A, std::size_t N, int... n>
-constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A())),N> h_array_apply(array<A, N> a, numeric_list<int, n...>)
-{
- return array<decltype(Op::run(A())),N>{{ Op::run(array_get<n>(a))... }};
-}
-
-template<typename Op, typename A, std::size_t N>
-constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A())),N> array_apply(array<A, N> a)
-{
- return h_array_apply<Op>(a, typename gen_numeric_list<int, N>::type());
-}
-
-/* apply stuff to an array and reduce */
-
-template<typename Reducer, typename Op, typename A, std::size_t N, int... n>
-constexpr EIGEN_STRONG_INLINE auto h_array_apply_and_reduce(array<A, N> arr, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(array_get<n>(arr))...))
-{
- return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(array_get<n>(arr))...);
-}
-
-template<typename Reducer, typename Op, typename A, std::size_t N>
-constexpr EIGEN_STRONG_INLINE auto array_apply_and_reduce(array<A, N> a) -> decltype(h_array_apply_and_reduce<Reducer, Op, A, N>(a, typename gen_numeric_list<int, N>::type()))
-{
- return h_array_apply_and_reduce<Reducer, Op, A, N>(a, typename gen_numeric_list<int, N>::type());
-}
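-
-// e.g. (illustrative): array_apply_and_reduce<sum_op, negation_op>(a) computes
-// -(a[0] + ... + a[N-1]).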
-
-/* repeat a value n times (and make an array out of it)
- * usage:
- *   array<int, 16> arr = repeat<16>(42);
- */
-
-template<int n>
-struct h_repeat
-{
- template<typename t, int... ii>
- constexpr static EIGEN_STRONG_INLINE array<t, n> run(t v, numeric_list<int, ii...>)
- {
- return {{ typename id_numeric<int, ii, t>::type(v)... }};
- }
-};
-
-template<int n, typename t>
-constexpr array<t, n> repeat(t v) { return h_repeat<n>::run(v, typename gen_numeric_list<int, n>::type()); }
-
-/* instantiate a class by a C-style array */
-template<class InstType, typename ArrType, std::size_t N, bool Reverse, typename... Ps>
-struct h_instantiate_by_c_array;
-
-template<class InstType, typename ArrType, std::size_t N, typename... Ps>
-struct h_instantiate_by_c_array<InstType, ArrType, N, false, Ps...>
-{
- static InstType run(ArrType* arr, Ps... args)
- {
- return h_instantiate_by_c_array<InstType, ArrType, N - 1, false, Ps..., ArrType>::run(arr + 1, args..., arr[0]);
- }
-};
-
-template<class InstType, typename ArrType, std::size_t N, typename... Ps>
-struct h_instantiate_by_c_array<InstType, ArrType, N, true, Ps...>
-{
- static InstType run(ArrType* arr, Ps... args)
- {
- return h_instantiate_by_c_array<InstType, ArrType, N - 1, true, ArrType, Ps...>::run(arr + 1, arr[0], args...);
- }
-};
-
-template<class InstType, typename ArrType, typename... Ps>
-struct h_instantiate_by_c_array<InstType, ArrType, 0, false, Ps...>
-{
- static InstType run(ArrType* arr, Ps... args)
- {
- (void)arr;
- return InstType(args...);
- }
-};
-
-template<class InstType, typename ArrType, typename... Ps>
-struct h_instantiate_by_c_array<InstType, ArrType, 0, true, Ps...>
-{
- static InstType run(ArrType* arr, Ps... args)
- {
- (void)arr;
- return InstType(args...);
- }
-};
-
-template<class InstType, typename ArrType, std::size_t N, bool Reverse = false>
-InstType instantiate_by_c_array(ArrType* arr)
-{
- return h_instantiate_by_c_array<InstType, ArrType, N, Reverse>::run(arr);
-}
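-
-// e.g. (illustrative, with a hypothetical class Foo taking three int
-// constructor arguments): given int dims[3] = {4, 5, 6},
-// instantiate_by_c_array<Foo, int, 3>(dims) returns Foo(4, 5, 6), and
-// instantiate_by_c_array<Foo, int, 3, true>(dims) returns Foo(6, 5, 4).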
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11META_H
diff --git a/src/EigenUnsupported/CXX11/src/util/CXX11Workarounds.h b/src/EigenUnsupported/CXX11/src/util/CXX11Workarounds.h
deleted file mode 100644
index 056736c..0000000
--- a/src/EigenUnsupported/CXX11/src/util/CXX11Workarounds.h
+++ /dev/null
@@ -1,88 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11WORKAROUNDS_H
-#define EIGEN_CXX11WORKAROUNDS_H
-
-/* COMPATIBILITY CHECKS
- * (so users of compilers that are too old get some realistic error messages)
- */
-#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1310)
-#error Intel Compiler only supports required C++ features since version 13.1.
-// note that most features in principle work with 13.0, but combining some of
-// them will eventually make 13.0 fail with an internal assertion
-#elif defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6))
-// G++ < 4.6 by default will continue processing the source files - even if we use #error to make
-// it error out. For this reason, we use the pragma to make sure G++ aborts at the first error
-// it sees. Unfortunately, that is still not our #error directive, but at least the output is
-// short enough that the user has a chance to see that the compiler version is not sufficient for
-// the funky template mojo we use.
-#pragma GCC diagnostic error "-Wfatal-errors"
-#error GNU C++ Compiler (g++) only supports required C++ features since version 4.6.
-#endif
-
-/* Check that the compiler at least claims to support C++11. It might not be sufficient
- * because the compiler may not implement it correctly, but at least we'll know.
- * On the other hand, Visual Studio still doesn't claim to support C++11 although it's
- * compliant enough for our purpose.
- */
-#if (EIGEN_COMP_CXXVER < 11)
-#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)
-#pragma GCC diagnostic error "-Wfatal-errors"
-#endif
-#error This library needs at least a C++11 compliant compiler. If you use g++/clang, please enable the -std=c++11 compiler flag. (-std=c++0x on older versions.)
-#endif
-
-namespace Eigen {
-
-namespace internal {
-
-/* std::get is only constexpr in C++14, not yet in C++11
- */
-
-
-template<std::size_t I_, class T> constexpr inline T& array_get(std::vector<T>& a) { return a[I_]; }
-template<std::size_t I_, class T> constexpr inline T&& array_get(std::vector<T>&& a) { return a[I_]; }
-template<std::size_t I_, class T> constexpr inline T const& array_get(std::vector<T> const& a) { return a[I_]; }
-
-/* Suppose you have a template of the form
- * template<typename T> struct X;
- * And you want to specialize it in such a way:
- * template<typename S1, typename... SN> struct X<Foo<S1, SN...>> { ::: };
- * template<> struct X<Foo<>> { ::: };
- * This will work in Intel's compiler 13.0, but only to some extent in g++ 4.6, since
- * g++ can only match templates called with parameter packs if the number of template
- * arguments is not a fixed size (so inside the first specialization, referencing
- * X<Foo<SN...>> will fail in g++). On the other hand, g++ will accept the following:
- * template<typename... S> struct X<Foo<S...>> { ::: };
- * as an additional (!) specialization, which will then only match the empty case.
- * But Intel's compiler 13.0 won't accept that, it will only accept the empty syntax,
- * so we have to create a workaround for this.
- */
-#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
-#define EIGEN_TPL_PP_SPEC_HACK_DEF(mt, n) mt... n
-#define EIGEN_TPL_PP_SPEC_HACK_DEFC(mt, n) , EIGEN_TPL_PP_SPEC_HACK_DEF(mt, n)
-#define EIGEN_TPL_PP_SPEC_HACK_USE(n) n...
-#define EIGEN_TPL_PP_SPEC_HACK_USEC(n) , n...
-#else
-#define EIGEN_TPL_PP_SPEC_HACK_DEF(mt, n)
-#define EIGEN_TPL_PP_SPEC_HACK_DEFC(mt, n)
-#define EIGEN_TPL_PP_SPEC_HACK_USE(n)
-#define EIGEN_TPL_PP_SPEC_HACK_USEC(n)
-#endif
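-
-// e.g. (illustrative): CXX11Meta.h spells its empty-pack specialization as
-//   template<template<typename, typename> class test, typename check_against
-//            EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, empty)>
-//   struct contained_in_list<test, check_against,
-//                            type_list<EIGEN_TPL_PP_SPEC_HACK_USE(empty)>, false>;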
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11WORKAROUNDS_H
-
-/*
- * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
- */
diff --git a/src/EigenUnsupported/CXX11/src/util/EmulateArray.h b/src/EigenUnsupported/CXX11/src/util/EmulateArray.h
deleted file mode 100644
index 834b20b..0000000
--- a/src/EigenUnsupported/CXX11/src/util/EmulateArray.h
+++ /dev/null
@@ -1,261 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_EMULATE_ARRAY_H
-#define EIGEN_EMULATE_ARRAY_H
-
-
-
-// The array class is only available starting with C++11. Emulate our own here
-// if needed. Beware, MSVC still doesn't advertise itself as a C++11 compiler!
-// Moreover, CUDA doesn't support the STL containers, so we use our own instead.
-#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(EIGEN_GPUCC) || defined(EIGEN_AVOID_STL_ARRAY)
-
-namespace Eigen {
-template <typename T, size_t n> class array {
- public:
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE T& operator[] (size_t index) { eigen_internal_assert(index < size()); return values[index]; }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { eigen_internal_assert(index < size()); return values[index]; }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE T& at(size_t index) { eigen_assert(index < size()); return values[index]; }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const T& at(size_t index) const { eigen_assert(index < size()); return values[index]; }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE T& front() { return values[0]; }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const T& front() const { return values[0]; }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE T& back() { return values[n-1]; }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const T& back() const { return values[n-1]; }
-
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
- static std::size_t size() { return n; }
-
- T values[n];
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE array() { }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE array(const T& v) {
- EIGEN_STATIC_ASSERT(n==1, YOU_MADE_A_PROGRAMMING_MISTAKE)
- values[0] = v;
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE array(const T& v1, const T& v2) {
- EIGEN_STATIC_ASSERT(n==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
- values[0] = v1;
- values[1] = v2;
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3) {
- EIGEN_STATIC_ASSERT(n==3, YOU_MADE_A_PROGRAMMING_MISTAKE)
- values[0] = v1;
- values[1] = v2;
- values[2] = v3;
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3,
- const T& v4) {
- EIGEN_STATIC_ASSERT(n==4, YOU_MADE_A_PROGRAMMING_MISTAKE)
- values[0] = v1;
- values[1] = v2;
- values[2] = v3;
- values[3] = v4;
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4,
- const T& v5) {
- EIGEN_STATIC_ASSERT(n==5, YOU_MADE_A_PROGRAMMING_MISTAKE)
- values[0] = v1;
- values[1] = v2;
- values[2] = v3;
- values[3] = v4;
- values[4] = v5;
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4,
- const T& v5, const T& v6) {
- EIGEN_STATIC_ASSERT(n==6, YOU_MADE_A_PROGRAMMING_MISTAKE)
- values[0] = v1;
- values[1] = v2;
- values[2] = v3;
- values[3] = v4;
- values[4] = v5;
- values[5] = v6;
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4,
- const T& v5, const T& v6, const T& v7) {
- EIGEN_STATIC_ASSERT(n==7, YOU_MADE_A_PROGRAMMING_MISTAKE)
- values[0] = v1;
- values[1] = v2;
- values[2] = v3;
- values[3] = v4;
- values[4] = v5;
- values[5] = v6;
- values[6] = v7;
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE array(
- const T& v1, const T& v2, const T& v3, const T& v4,
- const T& v5, const T& v6, const T& v7, const T& v8) {
- EIGEN_STATIC_ASSERT(n==8, YOU_MADE_A_PROGRAMMING_MISTAKE)
- values[0] = v1;
- values[1] = v2;
- values[2] = v3;
- values[3] = v4;
- values[4] = v5;
- values[5] = v6;
- values[6] = v7;
- values[7] = v8;
- }
-
-#if EIGEN_HAS_VARIADIC_TEMPLATES
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE array(std::initializer_list<T> l) {
- eigen_assert(l.size() == n);
- internal::smart_copy(l.begin(), l.end(), values);
- }
-#endif
-};
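-
-// Illustrative usage sketch:
-//   array<int, 3> a(1, 2, 3);  // fixed-size storage, usable in device code
-//   a[0] = a.back();           // a now holds {3, 2, 3}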
-
-
-// Specialize array for zero size
-template <typename T> class array<T, 0> {
- public:
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE T& operator[] (size_t) {
- eigen_assert(false && "Can't index a zero size array");
- return dummy;
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const T& operator[] (size_t) const {
- eigen_assert(false && "Can't index a zero size array");
- return dummy;
- }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE T& front() {
- eigen_assert(false && "Can't index a zero size array");
- return dummy;
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const T& front() const {
- eigen_assert(false && "Can't index a zero size array");
- return dummy;
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE T& back() {
- eigen_assert(false && "Can't index a zero size array");
- return dummy;
- }
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE const T& back() const {
- eigen_assert(false && "Can't index a zero size array");
- return dummy;
- }
-
- static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::size_t size() { return 0; }
-
- EIGEN_DEVICE_FUNC
- EIGEN_STRONG_INLINE array() : dummy() { }
-
-#if EIGEN_HAS_VARIADIC_TEMPLATES
- EIGEN_DEVICE_FUNC array(std::initializer_list<T> l) : dummy() {
- EIGEN_UNUSED_VARIABLE(l);
- eigen_assert(l.size() == 0);
- }
-#endif
-
- private:
- T dummy;
-};
-
-// Comparison operator
-// TODO: implement !=, <, <=, >, and >=
-template<class T, std::size_t N>
-EIGEN_DEVICE_FUNC bool operator==(const array<T,N>& lhs, const array<T,N>& rhs) {
- for (std::size_t i = 0; i < N; ++i) {
- if (lhs[i] != rhs[i]) {
- return false;
- }
- }
- return true;
-}
-
-
-namespace internal {
-template<std::size_t I_, class T, std::size_t N>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(array<T,N>& a) {
- return a[I_];
-}
-template<std::size_t I_, class T, std::size_t N>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const array<T,N>& a) {
- return a[I_];
-}
-
-template<class T, std::size_t N> struct array_size<array<T,N> > {
- enum { value = N };
-};
-template<class T, std::size_t N> struct array_size<array<T,N>& > {
- enum { value = N };
-};
-template<class T, std::size_t N> struct array_size<const array<T,N> > {
- enum { value = N };
-};
-template<class T, std::size_t N> struct array_size<const array<T,N>& > {
- enum { value = N };
-};
-
-} // end namespace internal
-} // end namespace Eigen
-
-#else
-
-// The compiler supports C++11, and we're not targeting CUDA: use std::array as Eigen::array
-#include <array>
-namespace Eigen {
-
-template <typename T, std::size_t N> using array = std::array<T, N>;
-
-namespace internal {
-/* std::get is only constexpr in C++14, not yet in C++11
- * - libstdc++ from version 4.7 onwards has it nevertheless,
- * so use that
- * - libstdc++ older versions: use _M_instance directly
- * - libc++ all versions so far: use __elems_ directly
- * - all other libs: use std::get to be portable, but
- * this may not be constexpr
- */
-#if defined(__GLIBCXX__) && __GLIBCXX__ < 20120322
-#define STD_GET_ARR_HACK a._M_instance[I_]
-#elif defined(_LIBCPP_VERSION)
-#define STD_GET_ARR_HACK a.__elems_[I_]
-#else
-#define STD_GET_ARR_HACK std::template get<I_, T, N>(a)
-#endif
-
-template<std::size_t I_, class T, std::size_t N> constexpr inline T& array_get(std::array<T,N>& a) { return (T&) STD_GET_ARR_HACK; }
-template<std::size_t I_, class T, std::size_t N> constexpr inline T&& array_get(std::array<T,N>&& a) { return (T&&) STD_GET_ARR_HACK; }
-template<std::size_t I_, class T, std::size_t N> constexpr inline T const& array_get(std::array<T,N> const& a) { return (T const&) STD_GET_ARR_HACK; }
-
-#undef STD_GET_ARR_HACK
-
-} // end namespace internal
-} // end namespace Eigen
-
-#endif
-
-#endif // EIGEN_EMULATE_ARRAY_H
diff --git a/src/EigenUnsupported/CXX11/src/util/MaxSizeVector.h b/src/EigenUnsupported/CXX11/src/util/MaxSizeVector.h
deleted file mode 100644
index 277ab14..0000000
--- a/src/EigenUnsupported/CXX11/src/util/MaxSizeVector.h
+++ /dev/null
@@ -1,158 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_FIXEDSIZEVECTOR_H
-#define EIGEN_FIXEDSIZEVECTOR_H
-
-namespace Eigen {
-
-/** \class MaxSizeVector
- * \ingroup Core
- *
- * \brief The MaxSizeVector class.
- *
- * The %MaxSizeVector provides a subset of std::vector functionality.
- *
- * The goal is to provide basic std::vector operations when using
- * std::vector is not an option (e.g. on GPU or when compiling using
- * FMA/AVX, as this can cause either compilation failures or illegal
- * instruction failures).
- *
- * Beware: The constructors are not API compatible with those of
- * std::vector.
- */
-template <typename T>
-class MaxSizeVector {
- static const size_t alignment = EIGEN_PLAIN_ENUM_MAX(EIGEN_ALIGNOF(T), sizeof(void*));
- public:
- // Construct a new MaxSizeVector, reserve n elements.
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- explicit MaxSizeVector(size_t n)
- : reserve_(n), size_(0),
- data_(static_cast<T*>(internal::handmade_aligned_malloc(n * sizeof(T), alignment))) {
- }
-
- // Construct a new MaxSizeVector, reserve and resize to n.
- // Copy the init value to all elements.
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- MaxSizeVector(size_t n, const T& init)
- : reserve_(n), size_(n),
- data_(static_cast<T*>(internal::handmade_aligned_malloc(n * sizeof(T), alignment))) {
- size_t i = 0;
- EIGEN_TRY
- {
- for(; i < size_; ++i) { new (&data_[i]) T(init); }
- }
- EIGEN_CATCH(...)
- {
- // Construction failed, destruct in reverse order:
- for(; (i+1) > 0; --i) { data_[i-1].~T(); }
- internal::handmade_aligned_free(data_);
- EIGEN_THROW;
- }
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- ~MaxSizeVector() {
- for (size_t i = size_; i > 0; --i) {
- data_[i-1].~T();
- }
- internal::handmade_aligned_free(data_);
- }
-
- void resize(size_t n) {
- eigen_assert(n <= reserve_);
- for (; size_ < n; ++size_) {
- new (&data_[size_]) T;
- }
- for (; size_ > n; --size_) {
- data_[size_-1].~T();
- }
- eigen_assert(size_ == n);
- }
-
- // Append new elements (up to reserved size).
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- void push_back(const T& t) {
- eigen_assert(size_ < reserve_);
- new (&data_[size_++]) T(t);
- }
-
- // For C++03 compatibility this only takes one argument
- template<class X>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- void emplace_back(const X& x) {
- eigen_assert(size_ < reserve_);
- new (&data_[size_++]) T(x);
- }
-
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const T& operator[] (size_t i) const {
- eigen_assert(i < size_);
- return data_[i];
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- T& operator[] (size_t i) {
- eigen_assert(i < size_);
- return data_[i];
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- T& back() {
- eigen_assert(size_ > 0);
- return data_[size_ - 1];
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const T& back() const {
- eigen_assert(size_ > 0);
- return data_[size_ - 1];
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- void pop_back() {
- eigen_assert(size_ > 0);
- data_[--size_].~T();
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- size_t size() const { return size_; }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- bool empty() const { return size_ == 0; }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- T* data() { return data_; }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const T* data() const { return data_; }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- T* begin() { return data_; }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- T* end() { return data_ + size_; }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const T* begin() const { return data_; }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- const T* end() const { return data_ + size_; }
-
- private:
- size_t reserve_;
- size_t size_;
- T* data_;
-};
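-
-// Illustrative usage sketch:
-//   MaxSizeVector<float> v(8);  // room for at most 8 elements
-//   v.push_back(1.f);
-//   v.push_back(2.f);           // v.size() == 2; capacity stays fixed at 8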
-
-} // namespace Eigen
-
-#endif // EIGEN_FIXEDSIZEVECTOR_H