Commit 832d856b authored by Maxence Naud

Merge branch 'dev' into 'master'

Temporary master branch

See merge request !29
parents 031a6350 62519657
3 merge requests: !29 Temporary master branch, !28 branch to match Tiling from aidge_core, !26 Draft: Add Convert operator (a.k.a. Transmitter)
Pipeline #36720 passed
Showing 756 additions and 173 deletions
@@ -3,8 +3,11 @@
#include "aidge/backend/TensorImpl.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/data/half.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/utils/ErrorHandling.hpp"
#include "aidge/utils/future_std/span.hpp"
namespace Aidge {
template <class T>
@@ -12,7 +15,10 @@ class TensorImpl_cpu : public TensorImpl {
private:
const Tensor &mTensor; // Impl needs to access Tensor information, but is not
// supposed to change it!
std::vector<T> mData;
/// Pointer to the data and its capacity
future_std::span<T> mData;
/// If this instance own the data, std::unique_ptr manages it
std::unique_ptr<T[]> mDataOwner;
public:
static constexpr const char *Backend = "cpu";
@@ -20,9 +26,12 @@ class TensorImpl_cpu : public TensorImpl {
TensorImpl_cpu(const Tensor &tensor) : TensorImpl(Backend), mTensor(tensor) {}
bool operator==(const TensorImpl &otherImpl) const override final {
const auto& typedOtherImpl = reinterpret_cast<const TensorImpl_cpu<T> &>(otherImpl);
AIDGE_INTERNAL_ASSERT(typedOtherImpl.data().size() >= mTensor.size());
std::size_t i = 0;
for (; i < mTensor.size() &&
mData[i] == reinterpret_cast<const TensorImpl_cpu<T> &>(otherImpl).data()[i];
mData[i] == typedOtherImpl.data()[i];
++i) {
}
return i == mTensor.size();
@@ -33,36 +42,129 @@ class TensorImpl_cpu : public TensorImpl {
}
// native interface
const std::vector<T> &data() const { return mData; }
const future_std::span<T>& data() const { return mData; }
std::size_t size() const override { return mData.size(); }
std::size_t scalarSize() const override { return sizeof(T); }
void copy(const void *src, NbElts_t length) override {
void setDevice(DeviceIdx_t device) override {
AIDGE_ASSERT(device == 0, "device cannot be != 0 for CPU backend");
}
void copy(const void *src, NbElts_t length, NbElts_t offset = 0) override {
AIDGE_ASSERT(length <= mData.size() || length <= mTensor.size(), "copy length is above capacity");
std::copy(static_cast<const T *>(src), static_cast<const T *>(src) + length,
static_cast<T *>(rawPtr()) + offset);
}
void copyCast(const void *src, NbElts_t length, const DataType srcDt) override {
if (length == 0) {
return;
}
AIDGE_ASSERT(length <= mData.size() || length <= mTensor.size(), "copy length is above capacity");
if (srcDt == DataType::Float64) {
std::copy(static_cast<const double*>(src), static_cast<const double*>(src) + length,
static_cast<T *>(rawPtr()));
}
else if (srcDt == DataType::Float32) {
std::copy(static_cast<const float*>(src), static_cast<const float*>(src) + length,
static_cast<T *>(rawPtr()));
}
else if (srcDt == DataType::Float16) {
std::copy(static_cast<const half_float::half*>(src), static_cast<const half_float::half*>(src) + length,
static_cast<T *>(rawPtr()));
}
else if (srcDt == DataType::Int64) {
std::copy(static_cast<const int64_t*>(src), static_cast<const int64_t*>(src) + length,
static_cast<T *>(rawPtr()));
}
else if (srcDt == DataType::UInt64) {
std::copy(static_cast<const uint64_t*>(src), static_cast<const uint64_t*>(src) + length,
static_cast<T *>(rawPtr()));
}
else if (srcDt == DataType::Int32) {
std::copy(static_cast<const int32_t*>(src), static_cast<const int32_t*>(src) + length,
static_cast<T *>(rawPtr()));
}
else if (srcDt == DataType::UInt32) {
std::copy(static_cast<const uint32_t*>(src), static_cast<const uint32_t*>(src) + length,
static_cast<T *>(rawPtr()));
}
else if (srcDt == DataType::Int16) {
std::copy(static_cast<const int16_t*>(src), static_cast<const int16_t*>(src) + length,
static_cast<T *>(rawPtr()));
}
else if (srcDt == DataType::UInt16) {
std::copy(static_cast<const uint16_t*>(src), static_cast<const uint16_t*>(src) + length,
static_cast<T *>(rawPtr()));
}
else if (srcDt == DataType::Int8) {
std::copy(static_cast<const int8_t*>(src), static_cast<const int8_t*>(src) + length,
static_cast<T *>(rawPtr()));
}
else if (srcDt == DataType::UInt8) {
std::copy(static_cast<const uint8_t*>(src), static_cast<const uint8_t*>(src) + length,
static_cast<T *>(rawPtr()));
}
else {
AIDGE_THROW_OR_ABORT(std::runtime_error, "Unsupported data type.");
}
}
void copyFromDevice(const void *src, NbElts_t length, const std::pair<std::string, DeviceIdx_t>& device) override {
AIDGE_ASSERT(device.first == Backend, "backend must match");
AIDGE_ASSERT(device.second == 0, "device cannot be != 0 for CPU backend");
copy(src, length);
}
void copyFromHost(const void *src, NbElts_t length) override {
copy(src, length);
}
void copyToHost(void *dst, NbElts_t length) const override {
AIDGE_ASSERT(length <= mData.size() || length <= mTensor.size(), "copy length is above capacity");
const T* src = static_cast<const T*>(rawPtr());
std::copy(static_cast<const T *>(src), static_cast<const T *>(src) + length,
static_cast<T *>(rawPtr()));
static_cast<T *>(dst));
}
void *rawPtr() override {
lazyInit(mData);
return mData.data();
void *rawPtr(NbElts_t offset = 0) override {
lazyInit();
return (mData.data() + offset);
};
void* getRaw(std::size_t idx){
return static_cast<void*>(static_cast<T *>(rawPtr()) + idx);
};
const void *rawPtr(NbElts_t offset = 0) const override {
AIDGE_ASSERT(mData.size() >= mTensor.size(), "accessing uninitialized const rawPtr");
return (mData.data() + offset);
};
virtual ~TensorImpl_cpu() = default;
void *hostPtr(NbElts_t offset = 0) override {
lazyInit();
return (mData.data() + offset);
};
void setRawPtr(void *ptr) override final {
T *newPtr = static_cast<T *>(ptr);
mData = std::vector<T>(newPtr, newPtr + mTensor.size());
const void *hostPtr(NbElts_t offset = 0) const override {
AIDGE_ASSERT(mData.size() >= mTensor.size(), "accessing uninitialized const hostPtr");
return (mData.data() + offset);
};
private:
void lazyInit(std::vector<T> &data) {
assert(mTensor.dataType() == NativeType<T>::type);
void setRawPtr(void *ptr, NbElts_t length) override final {
AIDGE_ASSERT(length >= mTensor.size(), "trying to set raw pointer of insufficient capacity");
mData = future_std::span<T>(static_cast<T *>(ptr), length);
mDataOwner.reset();
};
virtual ~TensorImpl_cpu() = default;
if (data.size() != mTensor.size()) data.resize(mTensor.size());
private:
void lazyInit() {
if (mData.size() < mTensor.size()) {
// Need more data, a re-allocation will occur
AIDGE_ASSERT(mData.empty() || mDataOwner != nullptr, "trying to enlarge non-owned data");
mDataOwner.reset(new T[mTensor.size()]);
mData = future_std::span<T>(mDataOwner.get(), mTensor.size());
}
}
};
@@ -71,6 +173,8 @@ static Registrar<Tensor> registrarTensorImpl_cpu_Float64(
{"cpu", DataType::Float64}, Aidge::TensorImpl_cpu<double>::create);
static Registrar<Tensor> registrarTensorImpl_cpu_Float32(
{"cpu", DataType::Float32}, Aidge::TensorImpl_cpu<float>::create);
static Registrar<Tensor> registrarTensorImpl_cpu_Float16(
{"cpu", DataType::Float16}, Aidge::TensorImpl_cpu<half_float::half>::create);
static Registrar<Tensor> registrarTensorImpl_cpu_Int32(
{"cpu", DataType::Int32}, Aidge::TensorImpl_cpu<int>::create);
} // namespace
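
The storage rework above replaces the owned std::vector<T> by a future_std::span<T> view plus an optional std::unique_ptr<T[]> owner, so the implementation can either wrap an external buffer (setRawPtr) or allocate its own storage on demand (lazyInit). Below is a minimal standalone sketch of that ownership pattern, using std::span from C++20 in place of future_std::span and hypothetical names; it is an illustration of the idea, not the actual Aidge class:

#include <cstddef>
#include <memory>
#include <span>   // stand-in for future_std::span used in the diff

// Hypothetical, simplified version of the owned/non-owned buffer pattern.
template <class T>
class BufferView {
public:
    // Wrap external memory: no ownership is taken, only pointer + capacity.
    void setExternal(T *ptr, std::size_t length) {
        mData = std::span<T>(ptr, length);
        mOwner.reset();                      // any previously owned block is released
    }

    // Make sure at least `size` elements are available; reallocate only when
    // the current view is too small (previous contents are then discarded).
    T *lazyInit(std::size_t size) {
        if (mData.size() < size) {
            mOwner.reset(new T[size]);
            mData = std::span<T>(mOwner.get(), size);
        }
        return mData.data();
    }

private:
    std::span<T> mData;                      // pointer to the data and its capacity
    std::unique_ptr<T[]> mOwner;             // set only when this instance owns the data
};

The real lazyInit() additionally asserts that a too-small buffer is either empty or owned (AIDGE_ASSERT(mData.empty() || mDataOwner != nullptr, ...)), so a non-owned buffer set through setRawPtr() is never silently reallocated; the sketch omits that check for brevity.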
......
@@ -39,19 +39,7 @@ public:
return std::make_unique<AddImpl_cpu>(op);
}
public:
NbElts_t getNbRequiredData(const IOIndex_t inputIdx) const override final;
NbElts_t getNbRequiredProtected(const IOIndex_t /*inputIdx*/) const override final;
NbElts_t getRequiredMemory(const IOIndex_t outputIdx, const std::vector<DimSize_t>& /*inputsSize*/) const override final;
NbElts_t getNbConsumedData(const IOIndex_t inputIdx) const override final;
NbElts_t getNbProducedData(const IOIndex_t outputIdx) const override final;
void updateConsummerProducer() override final;
void forward() override;
};
......
@@ -27,11 +27,12 @@ void AddImpl_cpu_forward_kernel(const std::size_t inputLength, const std::vector
}
O* output = static_cast<O*>(output_);
for (std::size_t iIndex = 0; iIndex < inputs.size(); ++iIndex) {
for (std::size_t oIndex = 0; oIndex < inputLength; ++oIndex) {
output[oIndex] += inputs[iIndex][oIndex];
}
}
for (std::size_t oIndex = 0; oIndex < inputLength; ++oIndex) {
output[oIndex] = 0;
for (std::size_t iIndex = 0; iIndex < inputs.size(); ++iIndex) {
output[oIndex] += inputs[iIndex][oIndex];
}
}
}
namespace {
......
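
The rewritten kernel above zeroes each output element before accumulating over the inputs, so the result no longer depends on whatever the output buffer previously contained (the old loop order only added on top of the existing values). A minimal standalone sketch of the new accumulation pattern, as a hypothetical typed free function rather than the registered Aidge kernel:

#include <cstddef>
#include <vector>

// Hypothetical simplified version of the element-wise N-ary addition.
template <class I, class O>
void addN(std::size_t length, const std::vector<const I*>& inputs, O* output) {
    for (std::size_t oIndex = 0; oIndex < length; ++oIndex) {
        output[oIndex] = O(0);                        // reset before summing
        for (std::size_t iIndex = 0; iIndex < inputs.size(); ++iIndex) {
            output[oIndex] += inputs[iIndex][oIndex]; // accumulate every input
        }
    }
}

The std::fill() added to the AvgPooling kernel just below serves the same purpose of clearing the output tile before accumulation.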
@@ -61,6 +61,7 @@ void AvgPoolingImpl2D_cpu_forward_kernel(const AvgPooling_Op<2>::Attrs &attrs,
for (std::size_t ch = 0; ch < dims[1]; ++ch) {
const std::size_t oIndex = (ch + batch*dims[1]) * oxSize * oySize;
const std::size_t iIndex = (ch + batch*dims[1]) * dims[2] * dims[3];
std::fill(output + oIndex, output+(oIndex+oxSize*oySize), 0);
for (std::size_t ox = 0; ox < oxSize; ++ox) {
const signedsize difx = static_cast<signedsize>(- ox * std::get<0>(attrs)[0]);
const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
......
@@ -14,6 +14,7 @@
#include "aidge/utils/Registrar.hpp"
#include "aidge/data/half.hpp"
#include "aidge/backend/cpu/operator/ConvImpl.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
@@ -151,6 +152,9 @@ namespace {
static Registrar<ConvImpl2DForward_cpu> registrarConvImpl2DForward_cpu_Float32(
{DataType::Float32, DataType::Float32, DataType::Float32, DataType::Float32},
Aidge::ConvImpl2D_cpu_forward_kernel<float, float, float, float>);
static Registrar<ConvImpl2DForward_cpu> registrarConvImpl2DForward_cpu_Float16(
{DataType::Float16, DataType::Float16, DataType::Float16, DataType::Float16},
Aidge::ConvImpl2D_cpu_forward_kernel<half_float::half, half_float::half, half_float::half, half_float::half>);
static Registrar<ConvImpl2DForward_cpu> registrarConvImpl2DForward_cpu_Int32(
{DataType::Int32, DataType::Int32, DataType::Int32, DataType::Int32},
Aidge::ConvImpl2D_cpu_forward_kernel<int, int, int, int>);
......
@@ -13,7 +13,6 @@
#define AIDGE_CPU_OPERATOR_SLICEIMPL_H_
#include <memory>
#include <tuple>
#include <vector>
#include "aidge/backend/OperatorImpl.hpp"
@@ -39,7 +38,6 @@ class SliceImplBackward_cpu
const void*,
void*)> {};
class SliceImpl_cpu : public OperatorImpl {
public:
SliceImpl_cpu(const Slice_Op& op) : OperatorImpl(op) {}
@@ -48,7 +46,6 @@ public:
return std::make_unique<SliceImpl_cpu>(op);
}
public:
NbElts_t getNbRequiredData(const IOIndex_t /*inputIdx*/) const override final;
NbElts_t getNbRequiredProtected(const IOIndex_t /*inputIdx*/) const override final;
NbElts_t getRequiredMemory(const IOIndex_t outputIdx,
@@ -58,14 +55,12 @@ public:
void updateConsummerProducer() override final;
void forward() override;
void backward() override;
};
namespace {
static Registrar<Slice_Op> registrarSliceImpl_cpu("cpu", Aidge::SliceImpl_cpu::create);
} // namespace
}
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_LEAKYRELUIMPL_H_ */
\ No newline at end of file
#endif /* AIDGE_CPU_OPERATOR_SLICEIMPL_H_ */
@@ -12,57 +12,73 @@
#ifndef AIDGE_CPU_OPERATOR_SLICEIMPL_FORWARD_KERNEL_H_
#define AIDGE_CPU_OPERATOR_SLICEIMPL_FORWARD_KERNEL_H_
#include "aidge/utils/Registrar.hpp"
#include "aidge/operator/Slice.hpp"
#include "aidge/backend/cpu/operator/SliceImpl.hpp"
#include <vector>
#include <cstddef>
#include <vector>
#include "aidge/backend/cpu/operator/SliceImpl.hpp"
#include "aidge/data/Data.hpp"
#include "aidge/operator/Slice.hpp"
#include "aidge/utils/Registrar.hpp"
namespace Aidge {
template <class I>
void SliceImpl_cpu_forward_kernel(const typename Slice_Op::Attrs& attrs,
const std::vector<std::size_t> inputDims,
const void* input_,
void* output_) {
const std::vector<std::size_t> inputDims,
const void* input_,
void* output_) {
std::vector<std::size_t> slicedDims = inputDims;
std::size_t beginning = 0;
DimSize_t nbAxes = std::get<2>(attrs).size();
for (std::size_t i = 0; i < nbAxes; ++i) {
// For each slice operation get the params and cast them to size_t
const std::int64_t axis_ = std::get<2>(attrs)[i];
const std::int64_t start_ = std::get<0>(attrs)[i];
const std::int64_t end_ = std::get<1>(attrs)[i];
const std::size_t axis = axis_ >= 0 ? axis_ : static_cast<std::size_t>(axis_ + static_cast<std::int32_t>(inputDims.size()));
const std::size_t start = start_ >= 0 ? start_ : start_ + inputDims[axis];
const std::size_t end = end_ >= 0 ? end_ : end_ + inputDims[axis];
std::size_t stride = 1;
for (std::size_t j = inputDims.size() - 1; j > axis; --j) stride *= inputDims[j];
beginning += start * stride;
const std::size_t sliceLength = end - start + 1;
slicedDims[axis] = sliceLength;
}
const I* input = static_cast<const I*>(input_) + std::get<0>(attrs);
const I* input = static_cast<const I*>(input_) + beginning;
I* output = static_cast<I*>(output_);
const std::vector<std::size_t> slicedDims = std::get<1>(attrs);
const std::size_t nbDims = slicedDims.size();
// for inputDims = {4,5,5,3} & slicedDims = {3,2,2,1}, substractedDims = {1,3,3,2}
std::vector<std::size_t> substractedDims = std::vector<std::size_t>(nbDims);
for (std::size_t i = 0; i < nbDims; ++i) {
substractedDims[i] = inputDims[i] - slicedDims[i];
}
// for slicedDims = {3,2,2,1}, prodSlicedDims = {12,4,2,1}
// for slicedDims = {3,2,2,1}, prodSlicedDims = {12,4,2,1}
std::vector<std::size_t> prodSlicedDims = std::vector<std::size_t>(nbDims);
std::vector<std::size_t> prodInputDims = std::vector<std::size_t>(nbDims+1);
prodSlicedDims[nbDims - 1] = slicedDims[nbDims - 1];
prodInputDims[nbDims - 1] = inputDims[nbDims - 1];
prodInputDims[nbDims] = 1;
for (std::size_t i = 2; i <= nbDims; ++i) {
prodSlicedDims[nbDims - i] = prodSlicedDims[nbDims - i + 1]*slicedDims[nbDims - i];
prodInputDims[nbDims - i] = prodInputDims[nbDims - i + 1]*inputDims[nbDims - i];
}
std::vector<std::size_t> prodInputDims = std::vector<std::size_t>(nbDims + 1);
prodSlicedDims[nbDims - 1] = slicedDims[nbDims - 1];
prodInputDims[nbDims - 1] = inputDims[nbDims - 1];
prodInputDims[nbDims] = 1;
for (std::size_t i = 2; i <= nbDims; ++i) {
prodSlicedDims[nbDims - i] = prodSlicedDims[nbDims - i + 1] * slicedDims[nbDims - i];
prodInputDims[nbDims - i] = prodInputDims[nbDims - i + 1] * inputDims[nbDims - i];
}
std::size_t j = 0;
std::size_t i = 0;
for (; j < prodSlicedDims[0];) {
output[j] = input[i++];
std::size_t j = 0;
std::size_t i = 0;
for (; j < prodSlicedDims[0];) {
output[j] = input[i++];
++j;
for (std::size_t idx = nbDims - 1; idx > 0; --idx) {
i += j % prodSlicedDims[idx] == 0 ? substractedDims[idx]*prodInputDims[idx+1] : 0;
}
}
for (std::size_t idx = nbDims - 1; idx > 0; --idx) {
i += j % prodSlicedDims[idx] == 0 ? substractedDims[idx] * prodInputDims[idx + 1] : 0;
}
}
}
namespace {
// DIM = 1
static Registrar<SliceImplForward_cpu> registrarSliceImplForward_cpu_Float32(
{DataType::Float32}, Aidge::SliceImpl_cpu_forward_kernel<float>);
static Registrar<SliceImplForward_cpu> registrarSliceImplForward_cpu_Int32(
......
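
To make the index arithmetic in the kernel above concrete, assume (illustrative values, consistent with the comments) inputDims = {4,5,5,3}, starts = {0,0,0,0}, ends = {2,1,1,0} and axes = {0,1,2,3}. Each sliceLength = end - start + 1 then gives slicedDims = {3,2,2,1}, beginning = 0, substractedDims = {1,3,3,2}, prodSlicedDims = {12,4,2,1} and prodInputDims = {300,75,15,3,1}. In the copy loop, every time the output index j reaches a multiple of prodSlicedDims[idx], the input index i additionally skips substractedDims[idx] * prodInputDims[idx+1] elements: 2 after every copied element (the rest of the last axis), 9 more after every 2 elements, and 45 more after every 4 elements, which keeps i aligned with the start of the next sliced row, plane and block.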
@@ -21,46 +21,11 @@
#include "aidge/backend/cpu/operator/AddImpl.hpp"
#include "aidge/backend/cpu/operator/AddImpl_forward_kernels.hpp"
Aidge::NbElts_t Aidge::AddImpl_cpu::getNbRequiredData(const Aidge::IOIndex_t inputIdx) const {
assert(mOp.getRawInput(inputIdx) && "requires valid input");
// Requires the whole tensors
const auto& inputDims = std::static_pointer_cast<Tensor>(mOp.getRawInput(inputIdx))->dims();
return std::accumulate(inputDims.begin(), inputDims.end(), NbElts_t(1), std::multiplies<NbElts_t>());
}
Aidge::NbElts_t Aidge::AddImpl_cpu::getNbRequiredProtected(const Aidge::IOIndex_t /*inputIdx*/) const {
// for the direct convolution algorithm, convolutions can be in-place, if there is no padding!
// this implementation can be in-place
return 0;
}
Aidge::NbElts_t Aidge::AddImpl_cpu::getRequiredMemory(const Aidge::IOIndex_t outputIdx, const std::vector<Aidge::DimSize_t>& /*inputsSize*/) const {
// Requires the whole tensors, regardless of available data on inputs
assert(outputIdx == 0 && "operator has only one output");
(void) outputIdx;
const auto& outputDims = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims();
return std::accumulate(outputDims.begin(), outputDims.end(), NbElts_t(1), std::multiplies<NbElts_t>());
}
Aidge::NbElts_t Aidge::AddImpl_cpu::getNbConsumedData(const Aidge::IOIndex_t inputIdx) const {
assert(inputIdx < mNbConsumedData.size());
return mNbConsumedData[inputIdx];
}
Aidge::NbElts_t Aidge::AddImpl_cpu::getNbProducedData(const Aidge::IOIndex_t outputIdx) const {
assert(outputIdx < mNbProducedData.size());
return mNbProducedData[outputIdx];
}
void Aidge::AddImpl_cpu::updateConsummerProducer() {
for (IOIndex_t inputIdx = 0; static_cast<NbElts_t>(inputIdx) < mNbConsumedData.size(); ++inputIdx)
mNbConsumedData[inputIdx]+= getNbRequiredData(inputIdx); // each input is consumed by the minimum amount for a forward pass
mNbProducedData[0]+= getRequiredMemory(0, {});
}
void Aidge::AddImpl_cpu::forward() {
assert(mOp.getRawInput(0) && "missing input in Add operator");
DataType datatypeFirstInput = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dataType();
@@ -69,16 +34,36 @@ void Aidge::AddImpl_cpu::forward() {
assert(std::static_pointer_cast<Tensor>(mOp.getRawInput(i))->dataType() == datatypeFirstInput);
}
auto kernelFunc = Registrar<AddImplForward_cpu>::create({
// Find the correct kernel type
const auto outputDataType = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType();
const Registrar<AddImplForward_cpu>::registrar_key registrarKey = {
datatypeFirstInput,
std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()});
outputDataType};
Registrar<AddImplForward_cpu>::registrar_type kernelFunc;
if (Registrar<AddImplForward_cpu>::exists(registrarKey)) {
// One exists with the right inputs/output types
kernelFunc = Registrar<AddImplForward_cpu>::create(registrarKey);
}
else {
// Otherwise, fallback to the kernel with all types matching output type
kernelFunc = Registrar<AddImplForward_cpu>::create({
outputDataType, outputDataType});
}
// Convert input data (no overhead if not needed!)
// TODO: right now, if needed, memory will be allocated/deallocated at each
// call to forward(). We might put the following shared_ptr as members of
// this class to avoid that.
std::vector<const void*> opInputs;
std::vector<std::shared_ptr<Tensor>> inputsFallback(mOp.nbInputs());
for (IOIndex_t i = 0; i < mOp.nbInputs(); ++i) {
opInputs.push_back(getCPUPtr(mOp.getRawInput(i)));
const auto& input = std::static_pointer_cast<Tensor>(mOp.getRawInput(i))->refCastFrom(inputsFallback[i], *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));
opInputs.push_back(input.getImpl()->rawPtr());
}
// Call kernel
kernelFunc(std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->size(),
opInputs,
getCPUPtr(mOp.getRawOutput(0)));
}
\ No newline at end of file
}
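
The selection logic above first looks for a kernel registered for the exact combination of input and output data types (Registrar::exists()) and, when none is found, falls back to the kernel whose types all match the output type. A standalone sketch of that lookup pattern, with a hypothetical map-based registry rather than Aidge's Registrar:

#include <functional>
#include <map>
#include <tuple>

// Hypothetical registry: maps (input type, output type) to a kernel.
enum class DType { Float32, Float64, Int32 };
using Key    = std::tuple<DType, DType>;
using Kernel = std::function<void()>;

Kernel selectKernel(const std::map<Key, Kernel>& registry, DType in, DType out) {
    const auto it = registry.find(Key{in, out});
    if (it != registry.end()) {
        return it->second;              // exact match on the input/output types
    }
    return registry.at(Key{out, out});  // fallback: all types follow the output type
}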
@@ -34,14 +34,35 @@ void Aidge::ConvImpl2D_cpu::forward() {
assert(mOp.getRawInput(2) && "missing input #2");
// Find the correct kernel type
auto kernelFunc =
Registrar<ConvImpl2DForward_cpu>::create({std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dataType(),
std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dataType(),
std::static_pointer_cast<Tensor>(mOp.getRawInput(2))->dataType(),
std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()});
const auto outputDataType = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType();
const Registrar<ConvImpl2DForward_cpu>::registrar_key registrarKey = {
std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dataType(),
std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dataType(),
std::static_pointer_cast<Tensor>(mOp.getRawInput(2))->dataType(),
outputDataType};
Registrar<ConvImpl2DForward_cpu>::registrar_type kernelFunc;
if (Registrar<ConvImpl2DForward_cpu>::exists(registrarKey)) {
// One exists with the right inputs/output types
kernelFunc = Registrar<ConvImpl2DForward_cpu>::create(registrarKey);
}
else {
// Otherwise, fallback to the kernel with all types matching output type
kernelFunc = Registrar<ConvImpl2DForward_cpu>::create({
outputDataType, outputDataType, outputDataType, outputDataType});
}
// Convert input data (no overhead if not needed!)
// TODO: right now, if needed, memory will be allocated/deallocated at each
// call to forward(). We might put the following shared_ptr as members of
// this class to avoid that.
std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback;
const auto& input0 = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->refCastFrom(input0Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));
const auto& input1 = std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->refCastFrom(input1Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));
const auto& input2 = std::static_pointer_cast<Tensor>(mOp.getRawInput(2))->refCastFrom(input2Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));
// Call kernel
kernelFunc(dynamic_cast<const Conv_Op<2>&>(mOp).getStaticAttributes(), std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->template dims<4>(),
getCPUPtr(mOp.getRawInput(0)), getCPUPtr(mOp.getRawInput(1)),
getCPUPtr(mOp.getRawInput(2)), getCPUPtr(mOp.getRawOutput(0)));
input0.getImpl()->rawPtr(), input1.getImpl()->rawPtr(), input2.getImpl()->rawPtr(),
getCPUPtr(mOp.getRawOutput(0)));
}
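
refCastFrom(fallback, reference) as used above appears to return the input tensor itself when no conversion is needed, and otherwise to materialize a copy converted to the reference tensor's data type inside the caller-provided fallback, which keeps that copy alive for the duration of the call (hence the TODO about promoting the fallback shared_ptrs to class members to avoid reallocating on every forward()). A much simplified standalone illustration of that idiom, on std::vector rather than the Aidge Tensor API:

#include <memory>
#include <type_traits>
#include <vector>

// Hypothetical helper: return `src` unchanged when no conversion is needed,
// otherwise build a converted copy inside `fallback` and return a reference to it.
template <class Dst, class Src>
const std::vector<Dst>& refCastFrom(const std::vector<Src>& src,
                                    std::shared_ptr<std::vector<Dst>>& fallback) {
    if constexpr (std::is_same_v<Dst, Src>) {
        return src;                                            // zero-overhead path
    } else {
        fallback = std::make_shared<std::vector<Dst>>(src.begin(), src.end());
        return *fallback;                                      // copy owned by `fallback`
    }
}

The same pattern is reused by the FC forward just below.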
@@ -29,29 +29,37 @@ void Aidge::FCImpl_cpu::forward()
assert(std::static_pointer_cast<Tensor>(mOp.getRawInput(2)) && "missing input #2");
// Find the correct kernel type
auto kernelFunc = Registrar<FCImplForward_cpu>::create(
{std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dataType(),
std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dataType(),
std::static_pointer_cast<Tensor>(mOp.getRawInput(2))->dataType(),
std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType()});
const auto outputDataType = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dataType();
const Registrar<FCImplForward_cpu>::registrar_key registrarKey = {
std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dataType(),
std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dataType(),
std::static_pointer_cast<Tensor>(mOp.getRawInput(2))->dataType(),
outputDataType};
Registrar<FCImplForward_cpu>::registrar_type kernelFunc;
if (Registrar<FCImplForward_cpu>::exists(registrarKey)) {
// One exists with the right inputs/output types
kernelFunc = Registrar<FCImplForward_cpu>::create(registrarKey);
}
else {
// Otherwise, fallback to the kernel with all types matching output type
kernelFunc = Registrar<FCImplForward_cpu>::create({
outputDataType, outputDataType, outputDataType, outputDataType});
}
// Convert input data (no overhead if not needed!)
// TODO: right now, if needed, memory will be allocated/deallocated at each
// call to forward(). We might put the following shared_ptr as members of
// this class to avoid that.
std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback;
const auto& input0 = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->refCastFrom(input0Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));
const auto& input1 = std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->refCastFrom(input1Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));
const auto& input2 = std::static_pointer_cast<Tensor>(mOp.getRawInput(2))->refCastFrom(input2Fallback, *std::static_pointer_cast<Tensor>(mOp.getRawOutput(0)));
// Call kernel
// if (std::static_pointer_cast<Tensor>(mOp.getRawInput(0)->nbDims() == 4) {
// kernelFunc(
// mOp.getStaticAttributes(),
// std::static_pointer_cast<Tensor>(std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->template dims<4>(),
// getCPUPtr(mOp.getRawInput(0),
// mOp.mInputs[1]->getImpl()->rawPtr(),
// mOp.mInputs[2]->getImpl()->rawPtr(),
// mOp.getOutput(0)->getImpl()->rawPtr());
// }
// else
kernelFunc(
dynamic_cast<const FC_Op&>(mOp).getStaticAttributes(),
std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims()[0],
std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->sizeM1(),
getCPUPtr(mOp.getRawInput(0)),
getCPUPtr(mOp.getRawInput(1)),
getCPUPtr(mOp.getRawInput(2)),
kernelFunc(dynamic_cast<const FC_Op&>(mOp).getStaticAttributes(),
input0.dims()[0],
input0.size() / input0.dims()[0],
input0.getImpl()->rawPtr(), input1.getImpl()->rawPtr(), input2.getImpl()->rawPtr(),
getCPUPtr(mOp.getRawOutput(0)));
}
@@ -47,7 +47,7 @@ void Aidge::MatMulImpl_cpu::forward()
kernelFunc(
dynamic_cast<const MatMul_Op&>(mOp).getStaticAttributes(),
std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims()[0],
std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->sizeM1(),
std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->size() / std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims()[0],
getCPUPtr(mOp.getRawInput(0)),
getCPUPtr(mOp.getRawInput(1)),
getCPUPtr(mOp.getRawOutput(0)));
......
@@ -38,7 +38,7 @@ void Aidge::SoftmaxImpl_cpu::forward() {
DimSize_t batchSize = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims()[0];
DimSize_t channelSize = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims()[1];
DimSize_t featureSize = std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->sizeM1()/channelSize;
DimSize_t featureSize = (std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->size()/batchSize)/channelSize;
// Call kernel
kernelFunc(batchSize,
channelSize,
......
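
The sizeM1() calls removed above are replaced by explicit arithmetic on size() and dims(). For a 4-D NCHW input with dims {N, C, H, W}, size() / dims()[0] equals C*H*W, the per-sample element count used by the FC and MatMul kernels, while (size() / batchSize) / channelSize equals H*W, the per-channel feature size used by Softmax. As a worked example, assuming dims = {16, 3, 224, 224}: size() = 2408448, 2408448 / 16 = 150528, and 150528 / 3 = 50176 = 224 * 224.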
@@ -45,7 +45,7 @@ TEST_CASE("Tensor creation") {
REQUIRE(x.get<int>({0, 0, 1}) == 2);
REQUIRE(x.get<int>({0, 1, 1}) == 4);
REQUIRE(x.get<int>({1, 1, 0}) == 7);
x.get<int>({1, 1, 1}) = 36;
x.set<int>({1, 1, 1}, 36);
REQUIRE(x.get<int>({1, 1, 1}) == 36);
}
......
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <catch2/catch_test_macros.hpp>
#include <cmath>
#include <cstdlib>
#include <memory>
#include "aidge/backend/cpu/operator/ConvImpl.hpp"
#include "aidge/backend/cpu/operator/PadImpl.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/Conv.hpp"
#include "aidge/operator/MetaOperator.hpp"
#include "aidge/operator/MetaOperatorDefs.hpp"
#include "aidge/operator/Pad.hpp"
using namespace Aidge;
TEST_CASE("[cpu/operator] MetaOperator/PaddedConv(forward)", "[MetaOperator][PaddedConv][CPU]") {
std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(
Array4D<double, 4, 3, 3, 3>{{{{{6.20986394e-01, 1.19775136e-03, 7.22876095e-02},
{1.16492919e-01, 8.21634093e-02, 1.17413265e-01},
{2.23743494e-01, 3.99495413e-01, 5.55552411e-01}},
{{6.64970077e-01, 9.62199940e-01, 4.87531967e-01},
{6.12586558e-01, 8.09918671e-02, 8.40649383e-01},
{4.15264406e-01, 8.28247138e-01, 1.52301135e-01}},
{{1.76992844e-02, 7.78697112e-01, 8.14531592e-01},
{1.36960611e-01, 4.64806728e-01, 4.85150000e-01},
{4.34776520e-01, 9.51740977e-01, 9.05793799e-01}}},
{{{1.71925246e-02, 1.91082720e-01, 3.67982644e-01},
{1.56806559e-01, 6.22280998e-01, 3.15827594e-01},
{6.04359038e-01, 2.83095947e-01, 6.11168892e-01}},
{{2.76942832e-01, 1.89768419e-01, 8.07988176e-01},
{1.67925807e-01, 2.68356150e-01, 6.28875602e-01},
{1.69093357e-04, 9.64788636e-01, 7.29254981e-01}},
{{6.34030122e-01, 1.32087038e-01, 3.33857107e-01},
{7.63047502e-01, 5.12539506e-02, 9.77400493e-01},
{8.06151288e-01, 2.60237147e-01, 3.93729313e-01}}},
{{{5.84605240e-01, 4.74648725e-01, 8.54111741e-01},
{7.10897067e-02, 5.02579011e-01, 3.35236224e-01},
{9.08637408e-01, 8.02903830e-01, 2.83929907e-01}},
{{3.68206999e-01, 9.18579021e-02, 7.33168098e-01},
{1.59875539e-01, 9.13163381e-01, 3.59806060e-01},
{1.41295882e-01, 7.00312185e-01, 5.63728289e-01}},
{{9.39513546e-01, 1.91704891e-01, 1.11454944e-01},
{5.46298282e-01, 2.89698587e-01, 2.62612651e-01},
{1.18554992e-01, 4.32147376e-02, 7.53016994e-01}}},
{{{9.53179175e-01, 2.05041054e-02, 1.11318451e-01},
{8.67878485e-01, 2.93263422e-01, 8.03912714e-01},
{8.93620255e-01, 1.37831128e-01, 3.83640583e-01}},
{{3.96020188e-01, 6.24959320e-01, 1.90709175e-01},
{5.80538620e-01, 6.63031275e-01, 2.07247191e-01},
{5.65672171e-01, 5.57014317e-01, 9.26909496e-01}},
{{3.43901418e-01, 4.47741636e-01, 6.59249367e-01},
{7.34639028e-01, 2.84957200e-02, 9.70225217e-01},
{1.33578790e-02, 6.12054702e-01, 9.36685235e-02}}}}});
std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(
Array1D<double, 4>{{0.16884905, 0.27994487, 0.57227465, 0.06435205}});
std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<double, 2, 3, 5, 5>{
// NCHW
{{{{0.43224481, 0.9047832, 0.18402257, 0.06162838, 0.52490127},
{0.27773404, 0.55402353, 0.9485062, 0.31197083, 0.80328607},
{0.85065842, 0.88226201, 0.54971951, 0.23360494, 0.53907884},
{0.33423098, 0.79564312, 0.80419414, 0.76839638, 0.87248221},
{0.77328729, 0.65749407, 0.47277589, 0.32889198, 0.93970518}},
{{0.66669145, 0.64193351, 0.45315988, 0.32794057, 0.38461822},
{0.72295814, 0.18395073, 0.85909664, 0.30010301, 0.56065865},
{0.34777938, 0.77869746, 0.33159421, 0.19540932, 0.77767906},
{0.5778391, 0.08218411, 0.27758371, 0.99017749, 0.61827997},
{0.10440745, 0.3197831, 0.89157608, 0.12216887, 0.950232}},
{{0.68073443, 0.2681118, 0.51848834, 0.62864493, 0.36717478},
{0.64106244, 0.43779425, 0.02771029, 0.78275231, 0.45693104},
{0.6487417, 0.01603838, 0.73869997, 0.96494221, 0.39588782},
{0.5975827, 0.90913292, 0.55036969, 0.4747373, 0.62460509},
{0.79675124, 0.02807549, 0.53227602, 0.88805927, 0.96646591}}},
{{{0.81851935, 0.21267665, 0.01580692, 0.54907998, 0.89010049},
{0.80165784, 0.55195592, 0.20740314, 0.22782844, 0.89205031},
{0.94217108, 0.58434542, 0.20738313, 0.79065873, 0.9371597},
{0.02254708, 0.95539178, 0.95165758, 0.53736666, 0.49100362},
{0.08018625, 0.69108027, 0.00329741, 0.74565761, 0.30899213}},
{{0.34868638, 0.12792604, 0.37382248, 0.0374756, 0.50653087},
{0.59614405, 0.64820746, 0.31470307, 0.62460364, 0.29253268},
{0.92864889, 0.51014224, 0.08921206, 0.11094072, 0.64691121},
{0.50586371, 0.6686477, 0.72511169, 0.41681783, 0.6325049},
{0.71594137, 0.73382767, 0.36589439, 0.03255165, 0.75006865}},
{{0.6294127, 0.85548534, 0.0902963, 0.28915773, 0.36564289},
{0.95873236, 0.6742374, 0.55679676, 0.6323497, 0.34072958},
{0.49694061, 0.79173045, 0.19738225, 0.14755281, 0.80818177},
{0.02332061, 0.74270703, 0.59415632, 0.08195934, 0.46295434},
{0.71426058, 0.85032931, 0.90750818, 0.28768431, 0.4401146}}}}});
std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(
Array4D<double, 2, 4, 5, 5>{{{{{3.40294218, 3.74021220, 4.02050114, 4.07054710, 2.46286273},
{4.61770582, 6.70517588, 6.50356627, 6.29688787, 3.53332567},
{5.47480106, 5.92094421, 6.64605665, 7.95090199, 4.28721523},
{4.01485729, 6.06748962, 7.52447891, 7.37980652, 5.28401136},
{2.83065438, 3.62033439, 3.56222963, 5.56103945, 3.23335814}},
{{3.30230498, 4.92814112, 4.34710836, 3.96262765, 2.97987890},
{4.49693012, 6.68929291, 5.53603029, 5.68874264, 4.28756475},
{4.20528078, 6.82776880, 6.70569849, 7.12809610, 4.40845442},
{4.31169367, 6.73352146, 6.30962515, 7.45826864, 4.99164438},
{2.18136287, 4.28968000, 4.20080042, 4.89814138, 2.87394023}},
{{3.54787683, 4.35851812, 4.63881302, 4.23359537, 3.16992092},
{5.25099468, 7.54282856, 6.69849157, 5.64309788, 4.56919575},
{4.71914101, 7.52830601, 6.71450949, 7.81113863, 5.84658146},
{4.97893143, 7.39293909, 6.89905310, 8.14430809, 5.62998581},
{2.79735112, 4.80967140, 5.57630205, 5.38828325, 4.57078695}},
{{3.03048635, 5.04540300, 4.21824932, 4.87323284, 2.35113740},
{4.45167351, 6.47721338, 7.40922976, 6.70445728, 3.60700107},
{3.77927423, 6.82826376, 7.41777134, 7.57402420, 5.13131523},
{4.08747244, 7.07994175, 7.57206821, 8.51897335, 5.26987123},
{2.34426999, 4.60127831, 4.86486769, 6.01579571, 3.97803569}}},
{{{3.84700942, 4.25972605, 3.05269003, 3.78043652, 2.08771229},
{6.00459957, 6.05633259, 4.45951605, 4.54089880, 4.03066444},
{5.41579390, 7.29543972, 6.18680000, 5.58812714, 3.45964241},
{6.04531050, 7.70924091, 5.52207708, 5.02131319, 4.09403706},
{3.18092418, 4.45422697, 4.04294252, 3.86577177, 2.18776536}},
{{4.02600670, 4.27603531, 3.81011319, 4.03631020, 2.57254648},
{5.33471155, 5.72588634, 5.12079763, 5.11733150, 3.76836705},
{5.62947607, 5.92492962, 6.24170446, 6.44130468, 3.44276404},
{5.38414621, 6.02679539, 5.88985586, 5.90263271, 3.15044069},
{3.31261086, 4.44371319, 3.47660780, 4.15411520, 1.48961508}},
{{3.95879412, 4.17324543, 3.70114422, 3.27447152, 3.09713888},
{5.78258181, 6.57920837, 4.99913597, 6.20961237, 4.98552179},
{5.84685421, 7.19971228, 6.66386652, 6.68013430, 4.90963316},
{5.24417877, 7.06430531, 6.58512402, 6.02492285, 4.48986387},
{3.64294529, 5.00678444, 5.04760027, 4.72895622, 2.67990756}},
{{3.48610687, 4.12853813, 4.07563591, 3.51327014, 2.44217038},
{4.80529881, 7.33211374, 5.14774036, 4.77281189, 4.44612408},
{5.11703110, 7.55168772, 7.14374542, 6.43696356, 4.10621357},
{5.41270018, 6.85949135, 6.73503923, 5.74601364, 4.46150303},
{3.16612267, 4.38248920, 5.23248482, 4.21292210, 2.86031270}}}}});
std::shared_ptr<Node> myConv = Conv<2>(3, 4, {3, 3}, "myconv");
auto convOp = std::static_pointer_cast<OperatorTensor>(myConv->getOperator());
std::shared_ptr<Node> myPad =
Pad<2>({1, 1, 1, 1}, "myPad", PadBorderType::Constant, 0.0);
auto padOp = std::static_pointer_cast<OperatorTensor>(myPad->getOperator());
convOp->setInput(1, myWeights);
convOp->setInput(2, myBias);
myPad->addChild(myConv, 0, 0);
padOp->setInput(0, myInput);
padOp->setDataType(DataType::Float64);
padOp->setBackend("cpu");
padOp->computeOutputDims();
convOp->setDataType(DataType::Float64);
convOp->setBackend("cpu");
convOp->computeOutputDims();
myPad->forward();
myConv->forward();
convOp -> getOutput(0) -> print();
double* computedOutput = static_cast<double*>(convOp->getOutput(0)->getImpl()->rawPtr());
double* expectedOutput = static_cast<double*>(myOutput->getImpl()->rawPtr());
for (std::size_t i = 0; i < myOutput->size(); ++i) {
REQUIRE(std::abs(computedOutput[i] - expectedOutput[i]) < 1e-5);
}
std::shared_ptr<Node> myPaddedConv =
PaddedConv(3, 4, {3, 3}, "myPaddedConv", {1, 1}, {1, 1, 1, 1});
}
\ No newline at end of file
@@ -27,14 +27,14 @@ TEST_CASE("[cpu/operator] Slice(forward)", "[Slice][CPU]") {
{0, 1, 2,-3}
});
std::shared_ptr<Node> mySlice = Slice(0, {4});
std::shared_ptr<Node> mySlice = Slice({0}, {3}, {0});
auto op = std::static_pointer_cast<OperatorTensor>(mySlice -> getOperator());
mySlice->getOperator()->associateInput(0,input0);
mySlice->getOperator()->setDataType(DataType::Int32);
mySlice->getOperator()->setBackend("cpu");
op->computeOutputDims();
mySlice->forward();
// mySlice->getOperator()->output(0).print();
REQUIRE(*(op->getOutput(0)) == *expectedOutput);
REQUIRE(op->getOutput(0)->dims() == expectedOutput->dims());
REQUIRE(op->getOutput(0)->dataType() == expectedOutput->dataType());
@@ -54,7 +54,7 @@ TEST_CASE("[cpu/operator] Slice(forward)", "[Slice][CPU]") {
}
});
std::shared_ptr<Node> mySlice = Slice(5, {2,3});
std::shared_ptr<Node> mySlice = Slice({0,5}, {1,7}, {0,1});
auto op = std::static_pointer_cast<OperatorTensor>(mySlice -> getOperator());
mySlice->getOperator()->associateInput(0,input0);
mySlice->getOperator()->setDataType(DataType::Int32);
@@ -88,7 +88,7 @@ TEST_CASE("[cpu/operator] Slice(forward)", "[Slice][CPU]") {
}
});
std::shared_ptr<Node> mySlice = Slice(14, {1,1,3});
std::shared_ptr<Node> mySlice = Slice({0,1,4}, {0,1,6}, {0,1,2});
auto op = std::static_pointer_cast<OperatorTensor>(mySlice -> getOperator());
mySlice->getOperator()->associateInput(0,input0);
mySlice->getOperator()->setDataType(DataType::Int32);
@@ -151,7 +151,7 @@ TEST_CASE("[cpu/operator] Slice(forward)", "[Slice][CPU]") {
}
});
std::shared_ptr<Node> mySlice = Slice(0, {2,2,2,10});
std::shared_ptr<Node> mySlice = Slice({0,0,0,0}, {1,1,1,9}, {0,1,2,3});
auto op = std::static_pointer_cast<OperatorTensor>(mySlice -> getOperator());
mySlice->getOperator()->associateInput(0,input0);
mySlice->getOperator()->setDataType(DataType::Int32);
......
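
In the updated Slice tests above, the old Slice(offset, outputDims) construction is replaced by explicit (starts, ends, axes) attributes. Since the forward kernel computes sliceLength = end - start + 1, the ends are inclusive: for instance Slice({0,5}, {1,7}, {0,1}) keeps rows 0..1 and columns 5..7 of the 2-D input, i.e. a 2x3 block, matching the former Slice(5, {2,3}).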
@@ -39,7 +39,7 @@ TEST_CASE("[cpu/operator] Softmax(forward)", "[Softmax][CPU]") {
}
});
std::shared_ptr<Node> mySoftmax = Softmax();
std::shared_ptr<Node> mySoftmax = Softmax(1);
auto op = std::static_pointer_cast<OperatorTensor>(mySoftmax -> getOperator());
mySoftmax->getOperator()->associateInput(0,input);
mySoftmax->getOperator()->setDataType(DataType::Float32);
@@ -108,7 +108,7 @@ TEST_CASE("[cpu/operator] Softmax(forward)", "[Softmax][CPU]") {
}
});
std::shared_ptr<Node> mySoftmax = Softmax();
std::shared_ptr<Node> mySoftmax = Softmax(1);
auto op = std::static_pointer_cast<OperatorTensor>(mySoftmax -> getOperator());
mySoftmax->getOperator()->associateInput(0,input);
mySoftmax->getOperator()->setDataType(DataType::Float32);
......
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <catch2/catch_test_macros.hpp>
#include "aidge/recipies/Recipies.hpp"
#include "aidge/operator/Conv.hpp"
#include "aidge/operator/Producer.hpp"
#include "aidge/graph/OpArgs.hpp"
#include <cstddef>
using namespace Aidge;
TEST_CASE("[ExplicitCastMove] conv") {
auto conv1 = Conv(3, 32, {3, 3}, "conv1");
auto conv2 = Conv(32, 64, {3, 3}, "conv2");
auto conv3 = Conv(64, 10, {1, 1}, "conv3", {2, 2});
auto g1 = Sequential({
Producer({16, 3, 224, 224}, "dataProvider"),
conv1,
conv2,
conv3
});
g1->setBackend("cpu");
conv1->getOperator()->setDataType(DataType::Int32);
conv3->getOperator()->setDataType(DataType::Float64);
g1->save("explicitCastMove_before");
REQUIRE(g1->getNodes().size() == 10);
g1->forwardDims();
explicitCastMove(g1);
g1->save("explicitCastMove_after");
REQUIRE(g1->getNodes().size() == 13);
}
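
The node count presumably rises from 10 (the data Producer, the three Conv nodes and their six weight and bias Producers) to 13 because explicitCastMove inserts an explicit conversion node at each of the three boundaries where the configured data types differ: dataProvider to conv1 (Int32), conv1 to conv2, and conv2 to conv3 (Float64).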
@@ -183,26 +183,4 @@ TEST_CASE("[core/recipies] Tiling(transformation)", "[Tiling][Recipies]") {
}
}
}
}
// std::shared_ptr<GraphView> g = Sequential({
// Conv(3, 16, {3,3}, "conv1"),
// ReLU("relu1"),
// Conv(16, 32, {1,1}, "conv2"),
// Conv(32, 16, {1,1}, "conv3"),
// Conv(16, 10, {3,3}, "conv4"),
// ReLU("relu2")
// });
// for (auto& individualConv : g->match("Conv")) {
// auto tiledConv = horizontalTiling(individualConv);
// g->replace(individualConv, tiledConv);
// }
// }
// SECTION("Create the GraphView with tiled layers") {
// std::shared_ptr<GraphView> g;
// g->addChild(horizontalTiling(Conv()))
// }
// }
// } // namespace Aidge
\ No newline at end of file
} // namespace Aidge
\ No newline at end of file
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <catch2/catch_test_macros.hpp>
#include <memory>
#include <string>
#include "aidge/data/Tensor.hpp"
#include "aidge/utils/TensorUtils.hpp"
#include "aidge/graph/Node.hpp"
#include "aidge/graph/GraphView.hpp"
#include "aidge/graph/OpArgs.hpp"
#include "aidge/scheduler/Scheduler.hpp"
#include "aidge/recipies/Recipies.hpp"
#include "aidge/backend/cpu.hpp"
using namespace Aidge;
TEST_CASE("[cpu/castmove] CastMove(forward)") {
std::shared_ptr<Tensor> inputTensor =
std::make_shared<Tensor>(Array4D<int, 2, 1, 5, 5>{{{{{0, 1, 2, 3, 4},
{5, 6, 7, 8, 9},
{10, 11, 12, 13, 14},
{15, 16, 17, 18, 19},
{20, 21, 22, 23, 24}}},
{{{25, 26, 27, 28, 29},
{30, 31, 32, 33, 34},
{35, 36, 37, 38, 39},
{40, 41, 42, 43, 44},
{45, 46, 47, 48, 49}}}}});
std::shared_ptr<Tensor> weight1 = std::make_shared<Tensor>(
Array4D<int, 3, 1, 3, 3>{{{{{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}},
{{{10, 11, 12}, {13, 14, 15}, {16, 17, 18}}},
{{{19, 20, 21}, {22, 23, 24}, {25, 26, 27}}}}});
std::shared_ptr<Tensor> bias1 = std::make_shared<Tensor>(Array1D<int, 3>{{1, 2, 3}});
SECTION("Test implicit") {
std::shared_ptr<GraphView> g =
Sequential({
Conv(1, 3, {3, 3}, "conv1"),
Conv(3, 4, {1, 1}, "conv2"),
Conv(4, 3, {1, 1}, "conv3"),
FC(27, 5, false, "fc")});
g->getNode("conv1")->getOperator()->setInput(0, inputTensor);
g->getNode("conv1")->getOperator()->setInput(1, weight1);
g->getNode("conv1")->getOperator()->setInput(2, bias1);
std::shared_ptr<Tensor> weight2 =
std::make_shared<Tensor>(Array4D<int, 4, 3, 1, 1>{{{{{1}}, {{2}}, {{3}}},
{{{4}}, {{5}}, {{6}}},
{{{7}}, {{8}}, {{9}}},
{{{10}}, {{11}}, {{12}}}}});
std::shared_ptr<Tensor> bias2 = std::make_shared<Tensor>(Array1D<int, 4>{{1, 2, 3, 4}});
g->getNode("conv2")->getOperator()->setInput(1, weight2);
g->getNode("conv2")->getOperator()->setInput(2, bias2);
// *(g->getNode("conv2")->getOperator()->input(1, weight2);
std::shared_ptr<Tensor> weight3 = std::make_shared<Tensor>(
Array4D<int, 3, 4, 1, 1>{{{{{1}}, {{2}}, {{3}}, {{4}}},
{{{5}}, {{6}}, {{7}}, {{8}}},
{{{9}}, {{10}}, {{11}}, {{12}}}}});
std::shared_ptr<Tensor> bias3 = std::make_shared<Tensor>(Array1D<int, 3>{{1, 2, 3}});
g->getNode("conv3")->getOperator()->setInput(1, weight3);
g->getNode("conv3")->getOperator()->setInput(2, bias3);
std::shared_ptr<Tensor> weightfc = std::make_shared<Tensor>(
Array2D<int, 5, 27>{{{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12},
{13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9},
{10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6},
{7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5,
6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3},
{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2,
3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}}});
std::shared_ptr<Tensor> biasfc = std::make_shared<Tensor>(Array1D<int, 5>{{1, 2, 3, 4, 5}});
g->getNode("fc")->getOperator()->setInput(1, weightfc);
g->getNode("fc")->getOperator()->setInput(2, biasfc);
// input->addChild(g);
g->setDataType(Aidge::DataType::Int32);
g->getNode("conv1")->getOperator()->setDataType(DataType::Float32);
g->getNode("conv3")->getOperator()->setDataType(DataType::Float64);
g->setBackend("cpu");
g->forwardDims();
SequentialScheduler scheduler(g);
REQUIRE_NOTHROW(scheduler.forward());
scheduler.saveSchedulingDiagram("schedulingSequential");
std::shared_ptr<Tensor> expectedOutput1 = std::make_shared<Tensor>(Array4D<int, 2, 3, 3, 3>{
{{{{367, 412, 457}, {592, 637, 682}, {817, 862, 907}},
{{854, 980, 1106}, {1484, 1610, 1736}, {2114, 2240, 2366}},
{{1341, 1548, 1755}, {2376, 2583, 2790}, {3411, 3618, 3825}}},
{{{1492, 1537, 1582}, {1717, 1762, 1807}, {1942, 1987, 2032}},
{{4004, 4130, 4256}, {4634, 4760, 4886}, {5264, 5390, 5516}},
{{6516, 6723, 6930}, {7551, 7758, 7965}, {8586, 8793, 9000}}}}});
std::shared_ptr<Tensor> expectedOutput2 = std::make_shared<Tensor>(Array4D<int, 2, 4, 3, 3>{
{{{{6099, 7017, 7935}, {10689, 11607, 12525}, {15279, 16197, 17115}},
{{13786, 15838, 17890}, {24046, 26098, 28150}, {34306, 36358, 38410}},
{{21473, 24659, 27845}, {37403, 40589, 43775}, {53333, 56519, 59705}},
{{29160, 33480, 37800}, {50760, 55080, 59400}, {72360, 76680, 81000}}},
{{{29049, 29967, 30885}, {33639, 34557, 35475}, {38229, 39147, 40065}},
{{65086, 67138, 69190}, {75346, 77398, 79450}, {85606, 87658, 89710}},
{{101123, 104309, 107495}, {117053, 120239, 123425}, {132983, 136169, 139355}},
{{137160, 141480, 145800}, {158760, 163080, 167400}, {180360, 184680, 189000}}}}});
std::shared_ptr<Tensor> expectedOutput3 = std::make_shared<Tensor>(Array4D<int, 2, 3, 3, 3>{
{{{{214731, 246591, 278451}, {374031, 405891, 437751}, {533331, 565191, 597051}},
{{496804, 570568, 644332}, {865624, 939388, 1013152}, {1234444, 1308208, 1381972}},
{{778877, 894545, 1010213}, {1357217, 1472885, 1588553}, {1935557, 2051225, 2166893}}},
{{{1011231, 1043091, 1074951}, {1170531, 1202391, 1234251}, {1329831, 1361691, 1393551}},
{{2340904, 2414668, 2488432}, {2709724, 2783488, 2857252}, {3078544, 3152308, 3226072}},
{{3670577, 3786245, 3901913}, {4248917, 4364585, 4480253}, {4827257, 4942925, 5058593}}}}});
Tensor expectedOutput4 = Array2D<int, 2, 5>{
{{205050376, 198925904, 181355097, 196978090, 238868348},
{598467376, 561797804, 560823897, 593043790, 698672948}}};
std::shared_ptr<Tensor> other1 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv1")->getOperator())->getOutput(0);
REQUIRE(approxEq<float, int>(*other1, *expectedOutput1, 0.0, 1.0e-12));
std::shared_ptr<Tensor> other2 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv2")->getOperator())->getOutput(0);
REQUIRE(approxEq<int>(*other2, *expectedOutput2, 0.0, 1.0e-12));
std::shared_ptr<Tensor> other3 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv3")->getOperator())->getOutput(0);
REQUIRE(approxEq<double, int>(*other3, *expectedOutput3, 0.0, 1.0e-12));
std::shared_ptr<Tensor> other4 = std::static_pointer_cast<OperatorTensor>(g->getNode("fc")->getOperator())->getOutput(0);
REQUIRE(approxEq<int>(*other4, expectedOutput4, 0.0, 1.0e-12));
}
SECTION("Half") {
Tensor refTensor = Array2D<float, 3, 2>{{{0.0, 1.0},{2.1, 3.4},{5000.0, 1.0e5}}};
Tensor tensor(DataType::Float16);
tensor.copyCastFrom(refTensor);
REQUIRE(approxEq<float, half_float::half>(refTensor, tensor, 1.0e-3, 0.0));
}
SECTION("Test explicit") {
std::shared_ptr<GraphView> g =
Sequential({
Conv(1, 3, {3, 3}, "conv1"),
Conv(3, 4, {1, 1}, "conv2"),
Conv(4, 3, {1, 1}, "conv3"),
FC(27, 5, false, "fc")});
g->getNode("conv1")->getOperator()->setInput(0, inputTensor);
g->getNode("conv1")->getOperator()->setInput(1, weight1);
g->getNode("conv1")->getOperator()->setInput(2, bias1);
std::shared_ptr<Tensor> weight2 =
std::make_shared<Tensor>(Array4D<int, 4, 3, 1, 1>{{{{{1}}, {{2}}, {{3}}},
{{{4}}, {{5}}, {{6}}},
{{{7}}, {{8}}, {{9}}},
{{{10}}, {{11}}, {{12}}}}});
std::shared_ptr<Tensor> bias2 = std::make_shared<Tensor>(Array1D<int, 4>{{1, 2, 3, 4}});
g->getNode("conv2")->getOperator()->setInput(1, weight2);
g->getNode("conv2")->getOperator()->setInput(2, bias2);
// *(g->getNode("conv2")->getOperator()->input(1, weight2);
std::shared_ptr<Tensor> weight3 = std::make_shared<Tensor>(
Array4D<int, 3, 4, 1, 1>{{{{{1}}, {{2}}, {{3}}, {{4}}},
{{{5}}, {{6}}, {{7}}, {{8}}},
{{{9}}, {{10}}, {{11}}, {{12}}}}});
std::shared_ptr<Tensor> bias3 = std::make_shared<Tensor>(Array1D<int, 3>{{1, 2, 3}});
g->getNode("conv3")->getOperator()->setInput(1, weight3);
g->getNode("conv3")->getOperator()->setInput(2, bias3);
std::shared_ptr<Tensor> weightfc = std::make_shared<Tensor>(
Array2D<int, 5, 27>{{{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12},
{13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9},
{10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6},
{7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5,
6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3},
{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2,
3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}}});
std::shared_ptr<Tensor> biasfc = std::make_shared<Tensor>(Array1D<int, 5>{{1, 2, 3, 4, 5}});
g->getNode("fc")->getOperator()->setInput(1, weightfc);
g->getNode("fc")->getOperator()->setInput(2, biasfc);
// input->addChild(g);
g->setDataType(Aidge::DataType::Int32);
g->getNode("conv1")->getOperator()->setDataType(DataType::Float32);
g->getNode("conv3")->getOperator()->setDataType(DataType::Float64);
explicitCastMove(g);
g->setBackend("cpu");
g->forwardDims();
SequentialScheduler scheduler(g);
REQUIRE_NOTHROW(scheduler.forward());
scheduler.saveSchedulingDiagram("schedulingSequential");
std::shared_ptr<Tensor> expectedOutput1 = std::make_shared<Tensor>(Array4D<int, 2, 3, 3, 3>{
{{{{367, 412, 457}, {592, 637, 682}, {817, 862, 907}},
{{854, 980, 1106}, {1484, 1610, 1736}, {2114, 2240, 2366}},
{{1341, 1548, 1755}, {2376, 2583, 2790}, {3411, 3618, 3825}}},
{{{1492, 1537, 1582}, {1717, 1762, 1807}, {1942, 1987, 2032}},
{{4004, 4130, 4256}, {4634, 4760, 4886}, {5264, 5390, 5516}},
{{6516, 6723, 6930}, {7551, 7758, 7965}, {8586, 8793, 9000}}}}});
std::shared_ptr<Tensor> expectedOutput2 = std::make_shared<Tensor>(Array4D<int, 2, 4, 3, 3>{
{{{{6099, 7017, 7935}, {10689, 11607, 12525}, {15279, 16197, 17115}},
{{13786, 15838, 17890}, {24046, 26098, 28150}, {34306, 36358, 38410}},
{{21473, 24659, 27845}, {37403, 40589, 43775}, {53333, 56519, 59705}},
{{29160, 33480, 37800}, {50760, 55080, 59400}, {72360, 76680, 81000}}},
{{{29049, 29967, 30885}, {33639, 34557, 35475}, {38229, 39147, 40065}},
{{65086, 67138, 69190}, {75346, 77398, 79450}, {85606, 87658, 89710}},
{{101123, 104309, 107495}, {117053, 120239, 123425}, {132983, 136169, 139355}},
{{137160, 141480, 145800}, {158760, 163080, 167400}, {180360, 184680, 189000}}}}});
std::shared_ptr<Tensor> expectedOutput3 = std::make_shared<Tensor>(Array4D<int, 2, 3, 3, 3>{
{{{{214731, 246591, 278451}, {374031, 405891, 437751}, {533331, 565191, 597051}},
{{496804, 570568, 644332}, {865624, 939388, 1013152}, {1234444, 1308208, 1381972}},
{{778877, 894545, 1010213}, {1357217, 1472885, 1588553}, {1935557, 2051225, 2166893}}},
{{{1011231, 1043091, 1074951}, {1170531, 1202391, 1234251}, {1329831, 1361691, 1393551}},
{{2340904, 2414668, 2488432}, {2709724, 2783488, 2857252}, {3078544, 3152308, 3226072}},
{{3670577, 3786245, 3901913}, {4248917, 4364585, 4480253}, {4827257, 4942925, 5058593}}}}});
Tensor expectedOutput4 = Array2D<int, 2, 5>{
{{205050376, 198925904, 181355097, 196978090, 238868348},
{598467376, 561797804, 560823897, 593043790, 698672948}}};
std::shared_ptr<Tensor> other1 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv1")->getOperator())->getOutput(0);
REQUIRE(approxEq<float, int>(*other1, *expectedOutput1, 0.0, 1.0e-12));
std::shared_ptr<Tensor> other2 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv2")->getOperator())->getOutput(0);
REQUIRE(approxEq<int>(*other2, *expectedOutput2, 0.0, 1.0e-12));
std::shared_ptr<Tensor> other3 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv3")->getOperator())->getOutput(0);
REQUIRE(approxEq<double, int>(*other3, *expectedOutput3, 0.0, 1.0e-12));
std::shared_ptr<Tensor> other4 = std::static_pointer_cast<OperatorTensor>(g->getNode("fc")->getOperator())->getOutput(0);
REQUIRE(approxEq<int>(*other4, expectedOutput4, 0.0, 1.0e-12));
}
}