Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • mszczep/aidge_backend_cpu
  • eclipse/aidge/aidge_backend_cpu
  • hrouis/aidge_backend_cpu
  • oantoni/aidge_backend_cpu
  • raphaelmillet/aidge_backend_cpu
  • cguillon/aidge_backend_cpu
  • jeromeh/aidge_backend_cpu
  • axelfarr/aidge_backend_cpu
  • noamzerah/aidge_backend_cpu
  • silvanosky/aidge_backend_cpu
  • maab05/aidge_backend_cpu
  • lucaslopez/aidge_backend_cpu_ll
  • farnez/aidge_backend_cpu
  • mick94/aidge_backend_cpu
14 results
Show changes
Showing
with 2508 additions and 79 deletions
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <chrono>     // std::micro, std::chrono::time_point,
                      // std::chrono::system_clock, std::chrono::duration
#include <cmath>      // std::nearbyint
#include <cstddef>    // std::size_t
#include <cstdint>    // std::uint16_t
#include <functional> // std::multiplies
#include <memory>
#include <numeric>    // std::accumulate
#include <random>     // std::random_device, std::mt19937
                      // std::uniform_int_distribution, std::uniform_real_distribution
#include <vector>

#include <catch2/catch_test_macros.hpp>
#include <fmt/core.h>

#include "aidge/backend/cpu/data/TensorImpl.hpp"
#include "aidge/backend/cpu/operator/RoundImpl.hpp"
#include "aidge/data/Data.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/Round.hpp"
#include "aidge/utils/TensorUtils.hpp"
namespace Aidge {
TEST_CASE("[cpu/operator] Round_Test", "[Round][CPU]") {
    // Checks the CPU Round kernel against std::nearbyint on randomly
    // shaped/valued Float32 tensors, over NBTRIALS independent trials.
    constexpr std::uint16_t NBTRIALS = 15;

    // Random generators: element values in [-15, 15), each dimension size
    // in [2, 5], and tensor rank in [1, 3].
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_real_distribution<float> valueDist(-15, 15);
    std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(5));
    std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(3));

    // Create Round Operator
    std::shared_ptr<Node> myRound = Round();
    auto op = std::static_pointer_cast<OperatorTensor>(myRound->getOperator());
    op->setDataType(DataType::Float32);
    op->setBackend("cpu");

    // Input tensor, reused (resized + refilled) across trials
    std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>();
    op->associateInput(0, T0);
    T0->setDataType(DataType::Float32);
    T0->setBackend("cpu");

    // Reference-result tensor, compared against the operator's output
    std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>();
    Tres->setDataType(DataType::Float32);
    Tres->setBackend("cpu");

    // To measure execution time of 'Round_Op::forward()' member function call
    std::chrono::time_point<std::chrono::system_clock> start;
    std::chrono::time_point<std::chrono::system_clock> end;
    std::chrono::duration<double, std::micro> duration{};

    SECTION("Round [Forward]") {
        SECTION("Test Forward Kernel") {
            std::size_t number_of_operation = 0;
            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
                // generate a random shape for this trial
                const std::size_t nbDims = nbDimsDist(gen);
                std::vector<std::size_t> dims;
                for (std::size_t i = 0; i < nbDims; ++i) {
                    dims.push_back(dimSizeDist(gen));
                }
                const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
                number_of_operation += nb_elements;

                // Fill the input and the reference result. std::nearbyint
                // rounds using the current FP rounding mode (round-to-nearest-
                // even by default), which is the behaviour expected from the
                // Round kernel.
                float* array0 = new float[nb_elements];
                float* result = new float[nb_elements];
                for (std::size_t i = 0; i < nb_elements; ++i) {
                    array0[i] = valueDist(gen);
                    result[i] = std::nearbyint(array0[i]);
                }

                // input0 (tensor takes ownership of array0 via its impl)
                T0->resize(dims);
                T0->getImpl()->setRawPtr(array0, nb_elements);
                // expected results
                Tres->resize(dims);
                Tres->getImpl()->setRawPtr(result, nb_elements);

                op->forwardDims();
                start = std::chrono::system_clock::now();
                myRound->forward();
                end = std::chrono::system_clock::now();
                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);

                REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));

                delete[] array0;
                delete[] result;
            }
            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
            Log::info("total time: {} μs\n", duration.count());
        }
    }
}
}
\ No newline at end of file
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <catch2/catch_test_macros.hpp>
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/Slice.hpp"
using namespace Aidge;
TEST_CASE("[cpu/operator] Slice(forward)", "[Slice][CPU]") {
    // Each section checks the CPU Slice kernel on Int32 tensors of increasing
    // rank, validating output values, dimensions and data type. The slice
    // bounds come either from extra inputs (starts/ends/axes tensors) or from
    // operator attributes.
    SECTION("1D Tensor") {
        // Take elements [0, 3) on axis 0.
        std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array1D<int,10> {
            {0, 1, -2,-3, 4,-5,-6, 7, 8, 9}
        });
        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array1D<int,3> {
            {0, 1, -2}
        });
        std::shared_ptr<Tensor> starts = std::make_shared<Tensor>(Array1D<int,1>{{0}});
        std::shared_ptr<Tensor> ends = std::make_shared<Tensor>(Array1D<int,1>{{3}});
        std::shared_ptr<Tensor> axes = std::make_shared<Tensor>(Array1D<int,1>{{0}});

        std::shared_ptr<Node> mySlice = Slice();
        auto op = std::static_pointer_cast<OperatorTensor>(mySlice->getOperator());
        op->associateInput(0,input0);
        op->associateInput(1,starts);
        op->associateInput(2,ends);
        op->associateInput(3,axes);
        op->setDataType(DataType::Int32);
        op->setBackend("cpu");
        mySlice->forward();

        REQUIRE(*(op->getOutput(0)) == *expectedOutput);
        REQUIRE(op->getOutput(0)->dims() == expectedOutput->dims());
        REQUIRE(op->getOutput(0)->dataType() == expectedOutput->dataType());
    }
    SECTION("2D Tensor") {
        // Rows [0, 2) and columns [5, 8).
        std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array2D<int,2,10> {
            {
                { 0, 1, 2,-3, 4,-5,-6, 7, 8, 9},
                {-5, 4, 2,-3, 4,-5,-6, 7,-1,10}
            }
        });
        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array2D<int,2,3> {
            {
                {-5,-6, 7},
                {-5,-6, 7}
            }
        });
        std::shared_ptr<Tensor> starts = std::make_shared<Tensor>(Array1D<int,2>{{0,5}});
        std::shared_ptr<Tensor> ends = std::make_shared<Tensor>(Array1D<int,2>{{2,8}});
        std::shared_ptr<Tensor> axes = std::make_shared<Tensor>(Array1D<int,2>{{0,1}});

        std::shared_ptr<Node> mySlice = Slice();
        auto op = std::static_pointer_cast<OperatorTensor>(mySlice->getOperator());
        op->associateInput(0,input0);
        op->associateInput(1,starts);
        op->associateInput(2,ends);
        op->associateInput(3,axes);
        op->setDataType(DataType::Int32);
        op->setBackend("cpu");
        mySlice->forward();
        // op->getOutput(0)->print();

        REQUIRE(*(op->getOutput(0)) == *expectedOutput);
        REQUIRE(op->getOutput(0)->dims() == expectedOutput->dims());
        REQUIRE(op->getOutput(0)->dataType() == expectedOutput->dataType());
    }
    SECTION("3D Tensor") {
        // Slice [0:1, 1:2, 4:7] out of a (2,2,10) tensor.
        std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array3D<int,2,2,10> {
            {
                {
                    { 0, 1, 2,-3, 4,-5,-6, 7, 8, 9},
                    {-5, 4, 2,-3, 4,-5,-6, 7,-1,10}
                },
                {
                    { 0, 1, 2,-3, 4,-5,-6, 7, 8, 9},
                    {-5, 4, 2,-3, 4,-5,-6, 7,-1,10}
                }
            }
        });
        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array3D<int,1,1,3> {
            {
                {
                    { 4,-5,-6}
                }
            }
        });
        std::shared_ptr<Tensor> starts = std::make_shared<Tensor>(Array1D<int,3>{{0,1,4}});
        std::shared_ptr<Tensor> ends = std::make_shared<Tensor>(Array1D<int,3>{{1,2,7}});
        std::shared_ptr<Tensor> axes = std::make_shared<Tensor>(Array1D<int,3>{{0,1,2}});

        std::shared_ptr<Node> mySlice = Slice();
        auto op = std::static_pointer_cast<OperatorTensor>(mySlice->getOperator());
        op->associateInput(0,input0);
        op->associateInput(1,starts);
        op->associateInput(2,ends);
        op->associateInput(3,axes);
        op->setDataType(DataType::Int32);
        op->setBackend("cpu");
        mySlice->forward();
        // op->getOutput(0)->print();

        REQUIRE(*(op->getOutput(0)) == *expectedOutput);
        REQUIRE(op->getOutput(0)->dims() == expectedOutput->dims());
        REQUIRE(op->getOutput(0)->dataType() == expectedOutput->dataType());
    }
    SECTION("4D Tensor") {
        // Full-range slice on every axis: output must equal the input.
        std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array4D<int,2,2,2,10> {
            {
                {
                    {
                        { 0, 1, 2,-3, 4,-5,-6, 7, 8, 9},
                        {-5, 4, 2,-3, 4,-5,-6, 7,-1,10}
                    },
                    {
                        { 0, 1, 2,-3, 4,-5,-6, 7, 8, 9},
                        {-5, 4, 2,-3, 4,-5,-6, 7,-1,10}
                    }
                },
                {
                    {
                        { 0, 1, 2,-3, 6,-5,-6, 7, 8, 9},
                        {-5, 4, 2,-3, 4,-5,-6, 7,-1,10}
                    },
                    {
                        { 0, 1, 2,-3, 4,-5,-6, 7, 8, 9},
                        {-5, 4, 2,-3,11,-5,-6, 7,-1,10}
                    }
                }
            }
        });
        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,2,2,2,10> {
            {
                {
                    {
                        { 0, 1, 2,-3, 4,-5,-6, 7, 8, 9},
                        {-5, 4, 2,-3, 4,-5,-6, 7,-1,10}
                    },
                    {
                        { 0, 1, 2,-3, 4,-5,-6, 7, 8, 9},
                        {-5, 4, 2,-3, 4,-5,-6, 7,-1,10}
                    }
                },
                {
                    {
                        { 0, 1, 2,-3, 6,-5,-6, 7, 8, 9},
                        {-5, 4, 2,-3, 4,-5,-6, 7,-1,10}
                    },
                    {
                        { 0, 1, 2,-3, 4,-5,-6, 7, 8, 9},
                        {-5, 4, 2,-3,11,-5,-6, 7,-1,10}
                    }
                }
            }
        });
        std::shared_ptr<Tensor> starts = std::make_shared<Tensor>(Array1D<int,4>{{0,0,0,0}});
        std::shared_ptr<Tensor> ends = std::make_shared<Tensor>(Array1D<int,4>{{2,2,2,10}});
        std::shared_ptr<Tensor> axes = std::make_shared<Tensor>(Array1D<int,4>{{0,1,2,3}});

        std::shared_ptr<Node> mySlice = Slice();
        auto op = std::static_pointer_cast<OperatorTensor>(mySlice->getOperator());
        op->associateInput(0,input0);
        op->associateInput(1,starts);
        op->associateInput(2,ends);
        op->associateInput(3,axes);
        op->setDataType(DataType::Int32);
        op->setBackend("cpu");
        mySlice->forward();
        // op->getOutput(0)->print();

        REQUIRE(*(op->getOutput(0)) == *expectedOutput);
        REQUIRE(op->getOutput(0)->dims() == expectedOutput->dims());
        REQUIRE(op->getOutput(0)->dataType() == expectedOutput->dataType());
    }
    SECTION("Attributes instead of inputs") {
        // Same semantics as the tensor-input form, but starts/ends/axes/steps
        // are passed as constructor attributes.
        std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array4D<int,2,2,2,10> {
            {
                {
                    {
                        { 0, 1, 2,-3, 4,-5,-6, 7, 8, 9},
                        {-5, 4, 2,-3, 4,-5,-6, 7,-1,10}
                    },
                    {
                        { 0, 1, 2,-3, 4,-5,-6, 7, 8, 9},
                        {-5, 4, 2,-3, 4,-5,-6, 7,-1,10}
                    }
                },
                {
                    {
                        { 0, 1, 2,-3, 6,-5,-6, 7, 8, 9},
                        {-5, 4, 2,-3, 4,-5,-6, 7,-1,10}
                    },
                    {
                        { 0, 1, 2,-3, 4,-5,-6, 7, 8, 9},
                        {-5, 4, 2,-3,11,-5,-6, 7,-1,10}
                    }
                }
            }
        });
        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,1,1,1,5> {
            {
                {
                    {
                        { 0, 1, 2,-3, 4}
                    }
                }
            }
        });

        std::shared_ptr<Node> mySlice = Slice({0,0,0,0}, {1,1,1,5}, {0,1,2,3}, {1,1,1,1});
        auto op = std::static_pointer_cast<OperatorTensor>(mySlice->getOperator());
        op->associateInput(0,input0);
        op->setDataType(DataType::Int32);
        op->setBackend("cpu");
        mySlice->forward();
        // op->getOutput(0)->print();

        REQUIRE(*(op->getOutput(0)) == *expectedOutput);
        REQUIRE(op->getOutput(0)->dims() == expectedOutput->dims());
        REQUIRE(op->getOutput(0)->dataType() == expectedOutput->dataType());
    }
    SECTION("Different Steps") {
        std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array3D<int,4,2,8> {
            {
                {
                    { 0, 1, 2,-3, 4,-5,-6,7},
                    {-5, 4, 2,-3, 4,-5,-6,-7}
                },
                {
                    { 10, 11, 12,-13, 14,-15,-16,17},
                    {-15, 14, 12,-13, 14,-15,-16,-17}
                },
                {
                    { 20, 21, 22,-23, 24,-25,-26,27},
                    {-25, 24, 22,-23, 24,-25,-26,-27}
                },
                {
                    { 30, 31, 32,-33, 34,-35,-36,37},
                    {-35, 34, 32,-33, 34,-35,-36,-37}
                }
            }
        });
        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array3D<int,2,1,3> {
            {
                {
                    { 7, 4, 1}
                },
                {
                    { 27, 24, 21}
                }
            }
        });

        std::shared_ptr<Node> mySlice = Slice({0,0,7}, {4,1,0}, {0,1,2}, {2,1,-3});
        // Steps are 2,1,-3 so the slice will be:
        // on Axis 0: from 0 to 4 by step of 2
        // on Axis 1: from 0 to 1 by step of 1
        // on Axis 2: from 7 to 0 by step of -3 (reverse the order of elements)
        auto op = std::static_pointer_cast<OperatorTensor>(mySlice->getOperator());
        op->associateInput(0,input0);
        op->setDataType(DataType::Int32);
        op->setBackend("cpu");
        mySlice->forward();
        // op->getOutput(0)->print();

        REQUIRE(*(op->getOutput(0)) == *expectedOutput);
        REQUIRE(op->getOutput(0)->dims() == expectedOutput->dims());
        REQUIRE(op->getOutput(0)->dataType() == expectedOutput->dataType());
    }
}
......@@ -9,18 +9,20 @@
*
********************************************************************************/
#include <memory>
#include <catch2/catch_test_macros.hpp>
#include "aidge/backend/cpu/operator/SoftmaxImpl.hpp"
#include "aidge/data/DataType.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/Softmax.hpp"
#include "aidge/backend/cpu.hpp"
#include <memory>
#include "aidge/utils/ArrayHelpers.hpp"
#include "aidge/utils/TensorUtils.hpp"
using namespace Aidge;
TEST_CASE("[cpu/operator] Softmax(forward)") {
TEST_CASE("[cpu/operator] Softmax(forward)", "[Softmax][CPU]") {
SECTION("2D Tensor") {
std::shared_ptr<Tensor> input = std::make_shared<Tensor>(Array2D<float,2,10> {
{
......@@ -30,28 +32,22 @@ TEST_CASE("[cpu/operator] Softmax(forward)") {
0.35077620, -0.78156322, -0.98952234, 0.04166317, 1.34357309}
}
});
std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array2D<float,2,10> {
Tensor expectedOutput = Array2D<float,2,10> {
{
{0.04883239, 0.11326669, 0.05974559, 0.09930880, 0.09267281, 0.03006749,
0.15842478, 0.24514021, 0.07825989, 0.07428131},
{0.05429055, 0.27136859, 0.28389078, 0.02240700, 0.06262558, 0.06087753,
0.01961952, 0.01593576, 0.04469007, 0.16429459}
}
});
};
std::shared_ptr<Node> mySoftmax = Softmax();
mySoftmax->getOperator()->setDatatype(DataType::Float32);
mySoftmax->getOperator()->setBackend("cpu");
mySoftmax->getOperator()->associateInput(0,input);
mySoftmax->getOperator()->computeOutputDims();
mySoftmax->forward();
float* resPtr = static_cast<float*>(mySoftmax->getOperator()->getOutput(0)->getImpl()->rawPtr());
float* expectedPtr = static_cast<float*>(expectedOutput->getImpl()->rawPtr());
for (std::size_t i = 0; i< 20; ++i) {
REQUIRE(std::abs(resPtr[i]-expectedPtr[i]) < 0.00001);
}
std::shared_ptr<Softmax_Op> op = std::make_shared<Softmax_Op>(1);
op->associateInput(0,input);
op->setDataType(DataType::Float32);
op->setBackend("cpu");
op->forward();
REQUIRE(approxEq<float>(*(op->getOutput(0)), expectedOutput, 1e-5f, 1e-8f));
}
SECTION("4D Tensor") {
std::shared_ptr<Tensor> input = std::make_shared<Tensor>(Array4D<float,2,3,3,3> {
......@@ -80,7 +76,7 @@ TEST_CASE("[cpu/operator] Softmax(forward)") {
}
}
});
std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<float,2,3,3,3> {
Tensor expectedOutput = Array4D<float,2,3,3,3> {
{
{
{{0.45109013, 0.42849392, 0.43775153},
......@@ -105,20 +101,14 @@ TEST_CASE("[cpu/operator] Softmax(forward)") {
{0.34566763, 0.32462072, 0.48979440}}
}
}
});
};
std::shared_ptr<Node> mySoftmax = Softmax();
mySoftmax->getOperator()->setDatatype(DataType::Float32);
mySoftmax->getOperator()->setBackend("cpu");
mySoftmax->getOperator()->associateInput(0,input);
mySoftmax->getOperator()->computeOutputDims();
mySoftmax->forward();
std::shared_ptr<Softmax_Op> op = std::make_shared<Softmax_Op>(1);
op->associateInput(0,input);
op->setDataType(DataType::Float32);
op->setBackend("cpu");
op->forward();
float* resPtr = static_cast<float*>(mySoftmax->getOperator()->getOutput(0)->getImpl()->rawPtr());
float* expectedPtr = static_cast<float*>(expectedOutput->getImpl()->rawPtr());
for (std::size_t i = 0; i< 54; ++i) {
REQUIRE(std::abs(resPtr[i]-expectedPtr[i]) < 0.00001);
}
// REQUIRE(*mySoftmax->getOperator()->getOutput(0) == *expectedOutput);
REQUIRE(approxEq<float>(*(op->getOutput(0)), expectedOutput, 1e-5f, 1e-8f));
}
}
\ No newline at end of file
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <memory>
#include <catch2/catch_test_macros.hpp>
#include "aidge/backend/cpu/operator/SqrtImpl.hpp"
#include "aidge/data/DataType.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/Sqrt.hpp"
#include "aidge/utils/ArrayHelpers.hpp"
#include "aidge/utils/TensorUtils.hpp"
using namespace Aidge;
TEST_CASE("[cpu/operator] Sqrt(forward)", "[Sqrt][CPU]") {
    // Validates the CPU Sqrt kernel on fixed Float32 tensors (2-D and 4-D)
    // against precomputed element-wise square roots, within approxEq
    // tolerances of 1e-5 (relative) and 1e-8 (absolute).
    SECTION("2D Tensor") {
        auto inputTensor = std::make_shared<Tensor>(Array2D<float,2,2> {
            {
                {16.00000000, 0.62226844},
                { 0.00000000, 1.84539008}
            }
        });
        // Element-wise square roots of the input above.
        Tensor expected = Array2D<float,2,2> {
            {
                {4.00000000, 0.78883994},
                {0.00000000, 1.35845140}
            }
        };

        auto sqrtOp = std::make_shared<Sqrt_Op>();
        sqrtOp->associateInput(0, inputTensor);
        sqrtOp->setDataType(DataType::Float32);
        sqrtOp->setBackend("cpu");
        sqrtOp->forward();

        REQUIRE(approxEq<float>(*(sqrtOp->getOutput(0)), expected, 1e-5f, 1e-8f));
    }
    SECTION("4D Tensor") {
        auto inputTensor = std::make_shared<Tensor>(Array4D<float,2,3,3,3> {
            {
                {
                    {{0.06218481, 0.46850157, 0.60914326},
                     {0.57470602, 0.09943211, 0.59992820},
                     {0.99623793, 0.54931718, 0.89343822}},
                    {{0.75176072, 0.38237786, 0.84824580},
                     {0.10619396, 0.11959118, 0.93499404},
                     {0.65563291, 0.02913034, 0.17093092}},
                    {{0.36303985, 0.92073035, 0.79146117},
                     {0.88962847, 0.94561219, 0.92033130},
                     {0.52903181, 0.13397896, 0.76086712}}
                },
                {
                    {{0.31242222, 0.80526417, 0.48411584},
                     {0.84375203, 0.65408552, 0.55028963},
                     {0.77546734, 0.06203610, 0.83163154}},
                    {{0.46342927, 0.53631741, 0.39145601},
                     {0.14204198, 0.84214240, 0.94185621},
                     {0.05068624, 0.99889028, 0.38464361}},
                    {{0.37591159, 0.51769549, 0.30288595},
                     {0.96883464, 0.35154045, 0.55648762},
                     {0.13022375, 0.73467660, 0.02705121}}
                }
            }
        });
        // Element-wise square roots of the input above.
        Tensor expected = Array4D<float,2,3,3,3> {
            {
                {
                    {{0.24936883, 0.6844717, 0.7804763},
                     {0.75809366, 0.31532857, 0.7745503},
                     {0.9981172, 0.7411593, 0.9452186}},
                    {{0.86704135, 0.6183671, 0.9210026},
                     {0.32587415, 0.34581956, 0.9669509},
                     {0.80971164, 0.17067613, 0.41343793}},
                    {{0.60252786, 0.9595469, 0.88964105},
                     {0.9432012, 0.97242594, 0.95933896},
                     {0.7273457, 0.36603138, 0.87227696}}
                },
                {
                    {{0.55894744, 0.89736515, 0.69578433},
                     {0.91855973, 0.8087555, 0.7418151},
                     {0.88060623, 0.24907047, 0.91193837}},
                    {{0.6807564, 0.73233694, 0.6256645},
                     {0.37688458, 0.9176832, 0.9704928},
                     {0.22513604, 0.99944496, 0.62019646}},
                    {{0.6131163, 0.7195106, 0.5503507},
                     {0.984294, 0.59290844, 0.745981},
                     {0.3608653, 0.8571328, 0.16447252}}
                }
            }
        };

        auto sqrtOp = std::make_shared<Sqrt_Op>();
        sqrtOp->associateInput(0, inputTensor);
        sqrtOp->setDataType(DataType::Float32);
        sqrtOp->setBackend("cpu");
        sqrtOp->forward();

        REQUIRE(approxEq<float>(*(sqrtOp->getOutput(0)), expected, 1e-5f, 1e-8f));
    }
}
\ No newline at end of file
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <chrono> // std::micro, std::chrono::time_point,
// std::chrono::system_clock
#include <cstddef> // std::size_t
#include <cstdint> // std::uint16_t
#include <functional> // std::multiplies
#include <memory>
#include <numeric> // std::accumulate
#include <random> // std::random_device, std::mt19937
// std::uniform_int_distribution, std::uniform_real_distribution
#include <vector>
#include <catch2/catch_test_macros.hpp>
#include <fmt/core.h>
#include "aidge/backend/cpu/data/TensorImpl.hpp"
#include "aidge/backend/cpu/operator/SubImpl.hpp"
#include "aidge/data/Data.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/Sub.hpp"
#include "aidge/operator/OperatorTensor.hpp"
#include "aidge/utils/TensorUtils.hpp"
namespace Aidge {
TEST_CASE("[cpu/operator] Sub", "[Sub][CPU]") {
    // Validates the CPU Sub kernel (element-wise subtraction) against a
    // reference computed by hand, for same-shape tensors and for two
    // broadcasting configurations, over NBTRIALS random trials each.
    constexpr std::uint16_t NBTRIALS = 10;
    // Create a random number generator
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // random values in [0.1, 1.1)
    std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
    std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
    std::uniform_int_distribution<int> boolDist(0,1); // coin flip used to shrink dims to 1 for broadcasting
    // Create Sub Operator
    std::shared_ptr<Node> mySub = Sub();
    auto op = std::static_pointer_cast<OperatorTensor>(mySub-> getOperator());
    op->setDataType(DataType::Float32);
    op->setBackend("cpu");
    // Create 2 input Tensors (reused/resized across trials)
    std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>();
    op->associateInput(0,T0);
    T0->setDataType(DataType::Float32);
    T0->setBackend("cpu");
    std::shared_ptr<Tensor> T1 = std::make_shared<Tensor>();
    op -> associateInput(1,T1);
    T1->setDataType(DataType::Float32);
    T1->setBackend("cpu");
    // Create results Tensor (holds the hand-computed reference)
    std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>();
    Tres->setDataType(DataType::Float32);
    Tres->setBackend("cpu");
    // To measure execution time of 'Sub_Op::forward()' member function call
    std::chrono::time_point<std::chrono::system_clock> start;
    std::chrono::time_point<std::chrono::system_clock> end;
    std::chrono::duration<double, std::micro> duration{};
    SECTION("SubImpl_cpu::forward()") {
        SECTION("Scalar / Scalar") {
            // TODO(review): empty placeholder section — not yet implemented
        }
        SECTION("Scalar / +1-D Tensor") {
            // TODO(review): empty placeholder section — not yet implemented
        }
        SECTION("+1-D Tensor / +1-D Tensor - same dimensions") {
            std::size_t number_of_operation = 0;
            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
                // generate 2 random Tensors
                const std::size_t nbDims = nbDimsDist(gen);
                std::vector<std::size_t> dims;
                for (std::size_t i = 0; i < nbDims; ++i) {
                    dims.push_back(dimSizeDist(gen));
                }
                const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
                number_of_operation += nb_elements;
                // without broadcasting: same shape on both inputs,
                // reference is a plain element-wise subtraction
                float* array0 = new float[nb_elements];
                float* array1 = new float[nb_elements];
                float* result = new float[nb_elements];
                for (std::size_t i = 0; i < nb_elements; ++i) {
                    array0[i] = valueDist(gen);
                    array1[i] = valueDist(gen);
                    result[i] = array0[i] - array1[i];
                }
                // input0
                T0->resize(dims);
                T0 -> getImpl() -> setRawPtr(array0, nb_elements);
                // input1
                T1->resize(dims);
                T1 -> getImpl() -> setRawPtr(array1, nb_elements);
                // results
                Tres->resize(dims);
                Tres -> getImpl() -> setRawPtr(result, nb_elements);
                op->forwardDims();
                start = std::chrono::system_clock::now();
                mySub->forward();
                end = std::chrono::system_clock::now();
                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
                REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
                delete[] array0;
                delete[] array1;
                delete[] result;
                // with broadcasting
            }
            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
            Log::info("total time: {}μs\n", duration.count());
        }
        SECTION("+1-D Tensor / +1-D Tensor - broadcasting") {
            std::size_t number_of_operation = 0;
            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
                // generate 2 random Tensors
                // handle dimensions, replace some dimensions with '1' to get broadcasting
                constexpr std::size_t nbDims = 4;
                std::vector<std::size_t> dims;
                for (std::size_t i = 0; i < nbDims; ++i) {
                    dims.push_back(dimSizeDist(gen));
                }
                // randomly collapse axes of each input to size 1; the output
                // dimension is whichever input kept the full size
                std::vector<std::size_t> dims0 = dims;
                std::vector<std::size_t> dims1 = dims;
                std::vector<std::size_t> dimsOut = dims;
                for (std::size_t i = 0; i < nbDims; ++i) {
                    if (boolDist(gen)) {
                        dims0[i] = 1;
                    }
                    if (boolDist(gen)) {
                        dims1[i] = 1;
                    }
                    dimsOut[i] = (dims0[i] == 1) ? dims1[i] : dims0[i];
                }
                // create arrays and fill them with random values
                float* array0 = new float[dims0[0]*dims0[1]*dims0[2]*dims0[3]];
                float* array1 = new float[dims1[0]*dims1[1]*dims1[2]*dims1[3]];
                float* result = new float[dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]];
                for (std::size_t i = 0; i < dims0[0]*dims0[1]*dims0[2]*dims0[3]; ++i) {
                    array0[i] = valueDist(gen);
                }
                for (std::size_t i = 0; i < dims1[0]*dims1[1]*dims1[2]*dims1[3]; ++i) {
                    array1[i] = valueDist(gen);
                }
                // compute true result: walk the output shape and, per axis,
                // clamp the source index to 0 when that input's dim is 1
                // (i.e. the broadcast axis reuses its single element)
                const std::size_t strides0[nbDims] = {dims0[1]*dims0[2]*dims0[3], dims0[2]*dims0[3], dims0[3], 1};
                const std::size_t strides1[nbDims] = {dims1[1]*dims1[2]*dims1[3], dims1[2]*dims1[3], dims1[3], 1};
                for (std::size_t a = 0; a < dimsOut[0]; ++a) {
                    for (std::size_t b = 0; b < dimsOut[1]; ++b) {
                        const std::size_t idx0_0 = strides0[0] * ((dims0[0] > 1) ? a : 0)
                                                    + strides0[1] * ((dims0[1] > 1) ? b : 0);
                        const std::size_t idx1_0 = strides1[0] * ((dims1[0] > 1) ? a : 0)
                                                    + strides1[1] * ((dims1[1] > 1) ? b : 0);
                        for (std::size_t c = 0; c < dimsOut[2]; ++c) {
                            const std::size_t idx_out = dimsOut[3] * (c + dimsOut[2] * (b + dimsOut[1] * a));
                            for (std::size_t d = 0; d < dimsOut[3]; ++d) {
                                std::size_t idx0 = idx0_0
                                                    + strides0[2] * ((dims0[2] > 1) ? c : 0)
                                                    + ((dims0[3] > 1) ? d : 0);
                                std::size_t idx1 = idx1_0
                                                    + strides1[2] * ((dims1[2] > 1) ? c : 0)
                                                    + ((dims1[3] > 1) ? d : 0);
                                result[idx_out + d] = array0[idx0] - array1[idx1];
                                // std::cout << "(" << idx0 << ", " << idx1 << ") -> " << array0[idx0] << " - " << array1[idx1] << " -> " << idx_out + d << std::endl;
                            }
                        }
                    }
                }
                // conversion to Aidge::Tensors
                // input0
                T0->resize(dims0);
                T0 -> getImpl() -> setRawPtr(array0, dims0[0]*dims0[1]*dims0[2]*dims0[3]);
                // input1
                T1->resize(dims1);
                T1 -> getImpl() -> setRawPtr(array1, dims1[0]*dims1[1]*dims1[2]*dims1[3]);
                // results
                Tres->resize(dimsOut);
                Tres -> getImpl() -> setRawPtr(result, dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]);
                // compute result
                op->forwardDims();
                start = std::chrono::system_clock::now();
                mySub->forward();
                end = std::chrono::system_clock::now();
                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
                // comparison between truth and computed result
                REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
                delete[] array0;
                delete[] array1;
                delete[] result;
                const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
                number_of_operation += nb_elements;
            }
            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
            Log::info("total time: {}μs\n", duration.count());
        }
        SECTION("+1-D Tensor / 1-D Tensor") {
            std::size_t number_of_operation = 0;
            std::uniform_int_distribution<std::size_t> nbRemovedDimsDist(std::size_t(1), std::size_t(3));
            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
                // generate 2 random Tensors
                // handle dimensions
                constexpr std::size_t nbDims = 4;
                std::vector<std::size_t> dims0(4);
                for (std::size_t i = 0; i < nbDims; ++i) {
                    dims0[i] = dimSizeDist(gen);
                }
                std::vector<std::size_t> dimsOut = dims0;
                std::vector<std::size_t> dims1 = dims0;
                for (std::size_t i = 0; i < nbDims; ++i) {
                    if (boolDist(gen)) {
                        dims1[i] = 1;
                    }
                }
                // drop 1-3 leading axes from input1 so it has a lower rank
                dims1.erase(dims1.cbegin(), dims1.cbegin() + nbRemovedDimsDist(gen));
                // create arrays and fill them with random values
                float* array0 = new float[dims0[0]*dims0[1]*dims0[2]*dims0[3]];
                std::size_t array1_size = std::accumulate(dims1.cbegin(), dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
                float* array1 = new float[array1_size];
                float* result = new float[dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]];
                for (std::size_t i = 0; i < (dims0[0]*dims0[1]*dims0[2]*dims0[3]); ++i) {
                    array0[i] = valueDist(gen);
                }
                for (std::size_t i = 0; i < array1_size; ++i) {
                    array1[i] = valueDist(gen);
                }
                // compute true result: left-pad input1's shape back to rank 4
                // with 1s (mirrors numpy-style broadcasting alignment)
                auto dims1_tmp = dims1;
                dims1_tmp.insert(dims1_tmp.cbegin(), 4 - dims1_tmp.size(), std::size_t(1));
                const std::size_t strides0[nbDims] = {dims0[1]*dims0[2]*dims0[3], dims0[2]*dims0[3], dims0[3], 1};
                const std::size_t strides1[nbDims] = {dims1_tmp[1]*dims1_tmp[2]*dims1_tmp[3], dims1_tmp[2]*dims1_tmp[3], dims1_tmp[3], 1};
                for (std::size_t a = 0; a < dimsOut[0]; ++a) {
                    for (std::size_t b = 0; b < dimsOut[1]; ++b) {
                        const std::size_t idx0_0 = strides0[0] * ((dims0[0] > 1) ? a : 0)
                                                    + strides0[1] * ((dims0[1] > 1) ? b : 0);
                        const std::size_t idx1_0 = strides1[0] * ((dims1_tmp[0] > 1) ? a : 0)
                                                    + strides1[1] * ((dims1_tmp[1] > 1) ? b : 0);
                        for (std::size_t c = 0; c < dimsOut[2]; ++c) {
                            const std::size_t idx_out = dimsOut[3] * (c + dimsOut[2] * (b + dimsOut[1] * a));
                            for (std::size_t d = 0; d < dimsOut[3]; ++d) {
                                std::size_t idx0 = idx0_0
                                                    + strides0[2] * ((dims0[2] > 1) ? c : 0)
                                                    + ((dims0[3] > 1) ? d : 0);
                                std::size_t idx1 = idx1_0
                                                    + strides1[2] * ((dims1_tmp[2] > 1) ? c : 0)
                                                    + ((dims1_tmp[3] > 1) ? d : 0);
                                result[idx_out + d] = array0[idx0] - array1[idx1];
                                // std::cout << "(" << idx0 << ", " << idx1 << ") -> " << array0[idx0] << " - " << array1[idx1] << " -> " << idx_out + d << std::endl;
                            }
                        }
                    }
                }
                // conversion to Aidge::Tensors
                // input0
                T0->resize(dims0);
                T0 -> getImpl() -> setRawPtr(array0, dims0[0]*dims0[1]*dims0[2]*dims0[3]);
                // input1
                T1->resize(dims1);
                T1 -> getImpl() -> setRawPtr(array1, array1_size);
                // results
                Tres->resize(dimsOut);
                Tres -> getImpl() -> setRawPtr(result, dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]);
                // compute result
                op->forwardDims();
                start = std::chrono::system_clock::now();
                mySub->forward();
                end = std::chrono::system_clock::now();
                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
                // comparison between truth and computed result
                REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
                delete[] array0;
                delete[] array1;
                delete[] result;
                const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
                number_of_operation += nb_elements;
            }
            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
            Log::info("total time: {}μs\n", duration.count());
        }
    }
}
} // namespace Aidge
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <catch2/catch_test_macros.hpp>
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/WeightInterleaving.hpp"
#include "aidge/recipes/Recipes.hpp"
#include "aidge/utils/TensorUtils.hpp"
#include "aidge/backend/cpu.hpp"
#include <memory>
using namespace Aidge;
TEST_CASE("[cpu/operator] WeightInterleaving", "[WeightInterleaving][CPU]") {
// Validates WeightInterleaving_Op on CPU: the compactDataSize() byte-count
// arithmetic, the forward kernel that packs sub-byte (Int2/Int3/Int4) weights
// into interleaved bytes, and the applyWeightInterleaving recipe on a Conv.
std::shared_ptr<Node> myWeightInterleaving = WeightInterleaving();
auto opWeightInterleaving = std::static_pointer_cast<WeightInterleaving_Op>(myWeightInterleaving -> getOperator());
SECTION("CompactDataSize - Single element cases") {
REQUIRE(opWeightInterleaving->compactDataSize(1, 1) == 1); // 1 bit, needs 1 byte
REQUIRE(opWeightInterleaving->compactDataSize(1, 7) == 1); // 7 bits, needs 1 byte
}
SECTION("CompactDataSize - Boundary cases for different nb_bits values") {
REQUIRE(opWeightInterleaving->compactDataSize(8, 1) == 1); // 8 elements at 1 bit each, fits in 1 byte
REQUIRE(opWeightInterleaving->compactDataSize(8, 2) == 2); // 8 elements at 2 bits each, needs 2 bytes
REQUIRE(opWeightInterleaving->compactDataSize(8, 3) == 4); // 8 elements at 3 bits each, needs 4 bytes (3-bit values occupy a 4-bit slot)
REQUIRE(opWeightInterleaving->compactDataSize(8, 4) == 4); // 8 elements at 4 bits each, needs 4 bytes
}
SECTION("CompactDataSize - Larger dataSize values") {
REQUIRE(opWeightInterleaving->compactDataSize(16, 1) == 2); // 16 elements at 1 bit each, fits in 2 bytes
REQUIRE(opWeightInterleaving->compactDataSize(16, 2) == 4); // 16 elements at 2 bits each, needs 4 bytes
REQUIRE(opWeightInterleaving->compactDataSize(16, 3) == 8); // 16 elements at 3 bits each, needs 8 bytes (2 values per byte)
REQUIRE(opWeightInterleaving->compactDataSize(16, 4) == 8); // 16 elements at 4 bits each, needs 8 bytes
}
SECTION("CompactDataSize - Odd dataSize values with varying nb_bits") {
REQUIRE(opWeightInterleaving->compactDataSize(7, 1) == 1); // 7 elements at 1 bit each, fits in 1 byte
REQUIRE(opWeightInterleaving->compactDataSize(7, 2) == 2); // 7 elements at 2 bits each, needs 2 bytes
REQUIRE(opWeightInterleaving->compactDataSize(7, 3) == 4); // 7 elements at 3 bits each, needs 4 bytes
REQUIRE(opWeightInterleaving->compactDataSize(7, 4) == 4); // 7 elements at 4 bits each, needs 4 bytes
}
SECTION("CompactDataSize - Minimum and maximum values for nb_bits") {
REQUIRE(opWeightInterleaving->compactDataSize(5, 1) == 1); // 5 elements at 1 bit each, fits in 1 byte
}
SECTION("CompactDataSize - Edge Case - dataSize of 0 should result in 0 required size") {
REQUIRE(opWeightInterleaving->compactDataSize(0, 1) == 0); // No data elements
}
SECTION("CompactData - 4-bit compaction") {
// Two 4-bit values are packed per byte: the first value goes to the high
// nibble (0x0F, 0xF5 -> 0xF5; 0xB3, 0x9C -> 0x3C, low nibbles of each input).
std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array1D<std::int8_t, 4>{
{static_cast<std::int8_t>(0x0F),
static_cast<std::int8_t>(0xF5),
static_cast<std::int8_t>(0xB3),
static_cast<std::int8_t>(0x9C)}
});
weight->setDataFormat(Aidge::DataFormat::NHWC);
weight->setDataType(Aidge::DataType::Int4);
std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array1D<std::int8_t, 2>{
{static_cast<int8_t>(0xF5),
static_cast<int8_t>(0x3C)}
});
expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
expectedWeightInterleaving->setDataType(WeightInterleavedType_v<Aidge::DataType::Int4>);
std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
op->associateInput(0,weight);
op->setDataType(WeightInterleavedType_v<Aidge::DataType::Int4>);
op->setDataFormat(DataFormat::NHWC);
op->setBackend("cpu");
myWeightInterleavingNode->forward();
REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving);
}
SECTION("CompactData - 3-bit compaction") {
// 3-bit values still occupy a 4-bit slot, so two values per byte.
std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array1D<std::int8_t, 4>{
{static_cast<int8_t>(0x0F),
static_cast<int8_t>(0x05),
static_cast<int8_t>(0x04),
static_cast<int8_t>(0xD3)}
});
weight->setDataFormat(Aidge::DataFormat::NHWC);
weight->setDataType(Aidge::DataType::Int3);
std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array1D<std::int8_t, 2>{
{static_cast<int8_t>(0x75),
static_cast<int8_t>(0x43)}
});
expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
expectedWeightInterleaving->setDataType(WeightInterleavedType_v<Aidge::DataType::Int3>);
std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
op->associateInput(0,weight);
op->setDataType(WeightInterleavedType_v<Aidge::DataType::Int3>);
op->setDataFormat(DataFormat::NHWC);
op->setBackend("cpu");
myWeightInterleavingNode->forward();
REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving);
}
SECTION("CompactData - 2-bit compaction") {
// Four 2-bit values per byte: 3,2,1,0 -> 0b11100100 == 0xE4.
std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array1D<std::int8_t, 4>{
{static_cast<std::int8_t>(0x03),
static_cast<std::int8_t>(0x02),
static_cast<std::int8_t>(0x01),
static_cast<std::int8_t>(0x00)}
});
weight->setDataFormat(Aidge::DataFormat::NHWC);
weight->setDataType(Aidge::DataType::Int2);
std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array1D<std::int8_t, 1>{
{static_cast<int8_t>(0xE4)}
});
expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
expectedWeightInterleaving->setDataType(WeightInterleavedType_v<Aidge::DataType::Int2>);
std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
op->associateInput(0,weight);
op->setDataType(WeightInterleavedType_v<Aidge::DataType::Int2>);
op->setDataFormat(DataFormat::NHWC);
op->setBackend("cpu");
myWeightInterleavingNode->forward();
REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving);
}
SECTION("CompactData - Edge Cases - Single element data") {
// A lone 4-bit value packs into the high nibble: 0x0F -> 0xF0.
std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array1D<std::int8_t, 1>{
{static_cast<int8_t>(0x0F)}
});
weight->setDataFormat(Aidge::DataFormat::NHWC);
weight->setDataType(Aidge::DataType::Int4);
std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array1D<std::int8_t, 1>{
{static_cast<int8_t>(0xF0)}
});
expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
expectedWeightInterleaving->setDataType(WeightInterleavedType_v<Aidge::DataType::Int4>);
std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
op->associateInput(0,weight);
op->setDataType(WeightInterleavedType_v<Aidge::DataType::Int4>);
op->setDataFormat(DataFormat::NHWC);
op->setBackend("cpu");
myWeightInterleavingNode->forward();
REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving);
}
SECTION("CompactData - Edge Cases - Non-divisible dataSize for nbSlot with nbbits=4") {
// Odd element count: the last byte is padded with zeros in the low nibble.
std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array1D<std::int8_t, 3>{
{static_cast<int8_t>(0x0F),
static_cast<int8_t>(0xA5),
static_cast<int8_t>(0x34)}
});
weight->setDataFormat(Aidge::DataFormat::NHWC);
weight->setDataType(Aidge::DataType::Int4);
std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array1D<std::int8_t, 2>{
{static_cast<int8_t>(0xF5),
static_cast<int8_t>(0x40)}
});
expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
expectedWeightInterleaving->setDataType(WeightInterleavedType_v<Aidge::DataType::Int4>);
std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
op->associateInput(0,weight);
op->setDataType(WeightInterleavedType_v<Aidge::DataType::Int4>);
op->setDataFormat(DataFormat::NHWC);
op->setBackend("cpu");
myWeightInterleavingNode->forward();
REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving);
}
SECTION("CompactData - Edge Cases - Non-divisible dataSize for nbSlot with nbbits=3") {
std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array1D<std::int8_t, 3>{
{static_cast<int8_t>(0x0F),
static_cast<int8_t>(0x05),
static_cast<int8_t>(0x04)}
});
weight->setDataFormat(Aidge::DataFormat::NHWC);
weight->setDataType(Aidge::DataType::Int3);
std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array1D<std::int8_t, 2>{
{static_cast<int8_t>(0x75),
static_cast<int8_t>(0x40)}
});
expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
expectedWeightInterleaving->setDataType(WeightInterleavedType_v<Aidge::DataType::Int3>);
std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
op->associateInput(0,weight);
op->setDataType(WeightInterleavedType_v<Aidge::DataType::Int3>);
op->setDataFormat(DataFormat::NHWC);
op->setBackend("cpu");
myWeightInterleavingNode->forward();
REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving);
}
SECTION("Forward Op - Convolution weight interleaving") {
// Weight [Cout = 2, H = 3, W = 3, Cin = 4]:
std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array4D<std::int8_t,2,3,3,4> {
{
{
{
{-6, 0, 5, -8}, // 'A' '0' '5' '8' in hexadecimal format
{ 5, 5, 4, -5}, // '5' '5' '4' 'B' in hexadecimal format
{-7, -1, 4, -7} // '9' 'F' '4' '9' in hexadecimal format
},
{
{ 3, -3, -3, -3}, // '3' 'D' 'D' 'D' in hexadecimal format
{ 1, 3, 1, -1}, // '1' '3' '1' 'F' in hexadecimal format
{ 7, -3, -1, 4} // '7' 'D' 'F' '4' in hexadecimal format
},
{
{-1, 3, 5, 6}, // 'F' '3' '5' '6' in hexadecimal format
{-8, 4, 7, 1}, // '8' '4' '7' '1' in hexadecimal format
{-5, 0, -1, -2} // 'B' '0' 'F' 'E' in hexadecimal format
}
},
{
{
{ 2, -7, 7, -4}, // '2' '9' '7' 'C' in hexadecimal format
{-7, 3, 0, 2}, // '9' '3' '0' '2' in hexadecimal format
{ 1, -1, 2, 3} // '1' 'F' '2' '3' in hexadecimal format
},
{
{-1, -5, -3, -7}, // 'F' 'B' 'D' '9' in hexadecimal format
{-8, 3, 5, -1}, // '8' '3' '5' 'F' in hexadecimal format
{-7, -4, -6, -1} // '9' 'C' 'A' 'F' in hexadecimal format
},
{
{ 1, 7, 5, -1}, // '1' '7' '5' 'F' in hexadecimal format
{ 1, -8, 1, 2}, // '1' '8' '1' '2' in hexadecimal format
{-1, -6, -3, 0} // 'F' 'A' 'D' '0' in hexadecimal format
}
}
}
});
// Expected output: the Cin dimension (4 nibbles) collapses into 2 bytes per pixel.
std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array4D<std::int8_t,2,3,3,2> {
{
{
{
{static_cast<int8_t>(0xA0), static_cast<int8_t>(0x58)}, // 'A' '0' '5' '8' in hexadecimal format
{static_cast<int8_t>(0x55), static_cast<int8_t>(0x4B)}, // '5' '5' '4' 'B' in hexadecimal format
{static_cast<int8_t>(0x9F), static_cast<int8_t>(0x49)} // '9' 'F' '4' '9' in hexadecimal format
},
{
{static_cast<int8_t>(0x3D), static_cast<int8_t>(0xDD)}, // '3' 'D' 'D' 'D' in hexadecimal format
{static_cast<int8_t>(0x13), static_cast<int8_t>(0x1F)}, // '1' '3' '1' 'F' in hexadecimal format
{static_cast<int8_t>(0x7D), static_cast<int8_t>(0xF4)} // '7' 'D' 'F' '4' in hexadecimal format
},
{
{static_cast<int8_t>(0xF3), static_cast<int8_t>(0x56)}, // 'F' '3' '5' '6' in hexadecimal format
{static_cast<int8_t>(0x84), static_cast<int8_t>(0x71)}, // '8' '4' '7' '1' in hexadecimal format
{static_cast<int8_t>(0xB0), static_cast<int8_t>(0xFE)} // 'B' '0' 'F' 'E' in hexadecimal format
}
},
{
{
{static_cast<int8_t>(0x29), static_cast<int8_t>(0x7C)}, // '2' '9' '7' 'C' in hexadecimal format
{static_cast<int8_t>(0x93), static_cast<int8_t>(0x02)}, // '9' '3' '0' '2' in hexadecimal format
{static_cast<int8_t>(0x1F), static_cast<int8_t>(0x23)} // '1' 'F' '2' '3' in hexadecimal format
},
{
{static_cast<int8_t>(0xFB), static_cast<int8_t>(0xD9)}, // 'F' 'B' 'D' '9' in hexadecimal format
{static_cast<int8_t>(0x83), static_cast<int8_t>(0x5F)}, // '8' '3' '5' 'F' in hexadecimal format
{static_cast<int8_t>(0x9C), static_cast<int8_t>(0xAF)} // '9' 'C' 'A' 'F' in hexadecimal format
},
{
{static_cast<int8_t>(0x17), static_cast<int8_t>(0x5F)}, // '1' '7' '5' 'F' in hexadecimal format
{static_cast<int8_t>(0x18), static_cast<int8_t>(0x12)}, // '1' '8' '1' '2' in hexadecimal format
{static_cast<int8_t>(0xFA), static_cast<int8_t>(0xD0)} // 'F' 'A' 'D' '0' in hexadecimal format
}
}
}
});
weight->setDataFormat(Aidge::DataFormat::NHWC);
weight->setDataType(Aidge::DataType::Int4);
expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
expectedWeightInterleaving->setDataType(WeightInterleavedType_v<Aidge::DataType::Int4>);
std::shared_ptr<Node> myWeightInterleavingNode = WeightInterleaving();
auto op = std::static_pointer_cast<OperatorTensor>(myWeightInterleavingNode -> getOperator());
op->associateInput(0,weight);
op->setDataType(WeightInterleavedType_v<Aidge::DataType::Int4>);
op->setDataFormat(DataFormat::NHWC);
op->setBackend("cpu");
myWeightInterleavingNode->forward();
REQUIRE(*(op->getOutput(0)) == *expectedWeightInterleaving);
}
SECTION("Recipie ApplyWeightInterleaving") {
// Same data as the previous section, but exercised through the
// applyWeightInterleaving recipe on a Conv node's weight Producer.
// Weight [Cout = 2, H = 3, W = 3, Cin = 4]:
std::shared_ptr<Tensor> weight = std::make_shared<Tensor>(Array4D<std::int8_t,2,3,3,4> {
{
{
{
{-6, 0, 5, -8}, // 'A' '0' '5' '8' in hexadecimal format
{ 5, 5, 4, -5}, // '5' '5' '4' 'B' in hexadecimal format
{-7, -1, 4, -7} // '9' 'F' '4' '9' in hexadecimal format
},
{
{ 3, -3, -3, -3}, // '3' 'D' 'D' 'D' in hexadecimal format
{ 1, 3, 1, -1}, // '1' '3' '1' 'F' in hexadecimal format
{ 7, -3, -1, 4} // '7' 'D' 'F' '4' in hexadecimal format
},
{
{-1, 3, 5, 6}, // 'F' '3' '5' '6' in hexadecimal format
{-8, 4, 7, 1}, // '8' '4' '7' '1' in hexadecimal format
{-5, 0, -1, -2} // 'B' '0' 'F' 'E' in hexadecimal format
}
},
{
{
{ 2, -7, 7, -4}, // '2' '9' '7' 'C' in hexadecimal format
{-7, 3, 0, 2}, // '9' '3' '0' '2' in hexadecimal format
{ 1, -1, 2, 3} // '1' 'F' '2' '3' in hexadecimal format
},
{
{-1, -5, -3, -7}, // 'F' 'B' 'D' '9' in hexadecimal format
{-8, 3, 5, -1}, // '8' '3' '5' 'F' in hexadecimal format
{-7, -4, -6, -1} // '9' 'C' 'A' 'F' in hexadecimal format
},
{
{ 1, 7, 5, -1}, // '1' '7' '5' 'F' in hexadecimal format
{ 1, -8, 1, 2}, // '1' '8' '1' '2' in hexadecimal format
{-1, -6, -3, 0} // 'F' 'A' 'D' '0' in hexadecimal format
}
}
}
});
std::shared_ptr<Tensor> expectedWeightInterleaving = std::make_shared<Tensor>(Array4D<std::int8_t,2,3,3,2> {
{
{
{
{static_cast<int8_t>(0xA0), static_cast<int8_t>(0x58)}, // 'A' '0' '5' '8' in hexadecimal format
{static_cast<int8_t>(0x55), static_cast<int8_t>(0x4B)}, // '5' '5' '4' 'B' in hexadecimal format
{static_cast<int8_t>(0x9F), static_cast<int8_t>(0x49)} // '9' 'F' '4' '9' in hexadecimal format
},
{
{static_cast<int8_t>(0x3D), static_cast<int8_t>(0xDD)}, // '3' 'D' 'D' 'D' in hexadecimal format
{static_cast<int8_t>(0x13), static_cast<int8_t>(0x1F)}, // '1' '3' '1' 'F' in hexadecimal format
{static_cast<int8_t>(0x7D), static_cast<int8_t>(0xF4)} // '7' 'D' 'F' '4' in hexadecimal format
},
{
{static_cast<int8_t>(0xF3), static_cast<int8_t>(0x56)}, // 'F' '3' '5' '6' in hexadecimal format
{static_cast<int8_t>(0x84), static_cast<int8_t>(0x71)}, // '8' '4' '7' '1' in hexadecimal format
{static_cast<int8_t>(0xB0), static_cast<int8_t>(0xFE)} // 'B' '0' 'F' 'E' in hexadecimal format
}
},
{
{
{static_cast<int8_t>(0x29), static_cast<int8_t>(0x7C)}, // '2' '9' '7' 'C' in hexadecimal format
{static_cast<int8_t>(0x93), static_cast<int8_t>(0x02)}, // '9' '3' '0' '2' in hexadecimal format
{static_cast<int8_t>(0x1F), static_cast<int8_t>(0x23)} // '1' 'F' '2' '3' in hexadecimal format
},
{
{static_cast<int8_t>(0xFB), static_cast<int8_t>(0xD9)}, // 'F' 'B' 'D' '9' in hexadecimal format
{static_cast<int8_t>(0x83), static_cast<int8_t>(0x5F)}, // '8' '3' '5' 'F' in hexadecimal format
{static_cast<int8_t>(0x9C), static_cast<int8_t>(0xAF)} // '9' 'C' 'A' 'F' in hexadecimal format
},
{
{static_cast<int8_t>(0x17), static_cast<int8_t>(0x5F)}, // '1' '7' '5' 'F' in hexadecimal format
{static_cast<int8_t>(0x18), static_cast<int8_t>(0x12)}, // '1' '8' '1' '2' in hexadecimal format
{static_cast<int8_t>(0xFA), static_cast<int8_t>(0xD0)} // 'F' 'A' 'D' '0' in hexadecimal format
}
}
}
});
expectedWeightInterleaving->setDataFormat(Aidge::DataFormat::NHWC);
expectedWeightInterleaving->setDataType(Aidge::DataType::Dual_Int4);
// Create convolution node
std::shared_ptr<Node> conv = Conv(4, 2, {3, 3}, "conv1");
// Place the weight tensor in the weight producer of the conv
auto weightProducer = conv->getParent(1);
weightProducer->getOperator()->setOutput(0, weight);
// Set dataType, dataformat and backend of convolution
conv->getOperator()->setDataFormat(Aidge::DataFormat::NHWC);
conv->getOperator()->setDataType(Aidge::DataType::Int4);
conv->getOperator()->setBackend("cpu");
// Apply recipie
applyWeightInterleaving(conv);
// Compare the weight producer output tensor with the expected weights with interleaving
auto newProdOp = std::static_pointer_cast<OperatorTensor>(conv->getParent(1)->getOperator());
REQUIRE(*(newProdOp->getOutput(0)) == *expectedWeightInterleaving);
}
}
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <catch2/catch_test_macros.hpp>
#include "aidge/recipes/Recipes.hpp"
#include "aidge/operator/Add.hpp"
#include "aidge/operator/MatMul.hpp"
#include "aidge/operator/Producer.hpp"
#include "aidge/graph/OpArgs.hpp"
#include "aidge/scheduler/SequentialScheduler.hpp"
#include "aidge/utils/TensorUtils.hpp"
#include <cstddef>
using namespace Aidge;
TEST_CASE("[ConstantFolding] forward", "[ConstantFolding][forward][CPU]") {
// Builds a 2-layer MatMul+Add network whose inputs are all constant Producers,
// checks the forward result, then verifies constantFolding() collapses the
// whole graph into a single Producer holding that same result.
// generate the original GraphView
auto matmul0 = MatMul("matmul0");
auto add0 = Add("add0");
auto matmul1 = MatMul("matmul1");
auto add1 = Add("add1");
// All Producers are created with constant=true so they are foldable.
auto b0 = Producer(std::make_shared<Tensor>(Array1D<float,5>{{1, 2, 3, 4, 5}}), "B0", true);
auto w0 = Producer(std::make_shared<Tensor>(Array2D<float,5,5>{{{1, 2, 3, 4, 5}, {6, 7, 8, 9, 0}, {1, 2, 3, 4, 5}, {6, 7, 8, 9, 0}, {1, 2, 3, 4, 5}}}), "W0", true);
auto b1 = Producer(std::make_shared<Tensor>(Array1D<float,5>{{1, 2, 3, 4, 5}}), "B1", true);
auto w1 = Producer(std::make_shared<Tensor>(Array2D<float,5,5>{{{6, 7, 8, 9, 0}, {1, 2, 3, 4, 5}, {6, 7, 8, 9, 0}, {1, 2, 3, 4, 5}, {6, 7, 8, 9, 0}}}),"W1", true);
auto input = Producer(std::make_shared<Tensor>(Array2D<float,2,5>{{{1, 2, 3, 4, 5}, {6, 7, 8, 9, 0}}}), "input", true);
// Wiring: input x W0 -> +B0 -> x W1 -> +B1
input->addChild(matmul0, 0, 0);
w0->addChild(matmul0, 0, 1);
matmul0->addChild(add0, 0, 0);
b0->addChild(add0, 0, 1);
add0->addChild(matmul1, 0, 0);
w1->addChild(matmul1, 0, 1);
matmul1->addChild(add1, 0, 0);
b1->addChild(add1, 0, 1);
auto g = std::make_shared<GraphView>();
g->add({input, w0, matmul0, b0, add0, w1, matmul1, b1, add1});
g->setBackend("cpu");
g->forwardDims();
// Check original graph
REQUIRE(g->getNodes() ==
std::set<std::shared_ptr<Node>>({input, w0, matmul0, b0, add0, w1, matmul1, b1, add1}));
REQUIRE(((matmul0->getParent(0) == input) && (matmul0->getParent(1) == w0)));
REQUIRE(((add0->getParent(0) == matmul0) && (add0->getParent(1) == b0)));
REQUIRE(((matmul1->getParent(0) == add0) && (matmul1->getParent(1) == w1)));
REQUIRE(((add1->getParent(0) == matmul1) && (add1->getParent(1) == b1)));
auto scheduler = SequentialScheduler(g);
scheduler.forward();
// Pre-computed reference output of the full pipeline.
const std::shared_ptr<Tensor> result = std::make_shared<Tensor>(Array2D<float,2,5>{{
{ 1201.000000, 1532.000000, 1863.000000, 2194.000000, 785.000000},
{ 2501.000000, 3207.000000, 3913.000000, 4619.000000, 1735.000000}
}});
auto add1Op = std::static_pointer_cast<Add_Op>(add1->getOperator());
REQUIRE(approxEq<float>(*(add1Op->getOutput(0)), *result));
// Transform GraphView inplace
constantFolding(g);
// Check new GraphView: everything folded into one constant Producer.
std::set<std::shared_ptr<Node>> newNodes = g->getNodes();
REQUIRE(newNodes != std::set<std::shared_ptr<Node>>({input, w0, matmul0, b0, add0, w1, matmul1, b1, add1}));
REQUIRE(newNodes.size() == 1);
REQUIRE((*newNodes.cbegin())->type() == "Producer");
auto prodOp = std::static_pointer_cast<Producer_Op>((*newNodes.cbegin())->getOperator());
REQUIRE(approxEq<float>(*(prodOp->getOutput(0)), *result));
}
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <catch2/catch_test_macros.hpp>
#include "aidge/recipes/Recipes.hpp"
#include "aidge/operator/Conv.hpp"
#include "aidge/operator/Producer.hpp"
#include "aidge/scheduler/SequentialScheduler.hpp"
#include "aidge/filler/Filler.hpp"
#include "aidge/graph/OpArgs.hpp"
#include <cstddef>
using namespace Aidge;
TEST_CASE("[ConvToMatMul] conv") {
// Builds a 3-conv pipeline, converts a clone to MatMul form with convToMatMul(),
// and checks the transformed graph produces the same output as the original.
auto conv1 = Conv(3, 4, {3, 3}, "conv1");
auto conv2 = Conv(4, 7, {3, 3}, "conv2", {1, 1}, {1, 1}, true); // no-bias variant
auto conv3 = Conv(7, 10, {1, 1}, "conv3", {2, 2});
auto g1 = Sequential({
Producer({2, 3, 13, 24}, "dataProvider"),
conv1,
conv2,
conv3
});
g1->setBackend("cpu");
g1->forwardDims();
// Random initialization of input and weights (conv2 has no bias input to fill)
uniformFiller<float>(std::static_pointer_cast<OperatorTensor>(conv1->getOperator())->getInput(0), -10.0, 10.0);
uniformFiller<float>(std::static_pointer_cast<OperatorTensor>(conv1->getOperator())->getInput(1), -10.0, 10.0);
uniformFiller<float>(std::static_pointer_cast<OperatorTensor>(conv1->getOperator())->getInput(2), -10.0, 10.0);
uniformFiller<float>(std::static_pointer_cast<OperatorTensor>(conv2->getOperator())->getInput(1), -10.0, 10.0);
uniformFiller<float>(std::static_pointer_cast<OperatorTensor>(conv3->getOperator())->getInput(1), -10.0, 10.0);
uniformFiller<float>(std::static_pointer_cast<OperatorTensor>(conv3->getOperator())->getInput(2), -10.0, 10.0);
auto s1 = SequentialScheduler(g1);
s1.forward();
g1->save("convToMatMul_before");
auto g2 = g1->clone();
g2->forwardDims();
REQUIRE(convToMatMul(g2) == 3); // all three convs must be converted
g2->setBackend("cpu");
auto s2 = SequentialScheduler(g2);
s2.forward();
g2->save("convToMatMul_after");
auto g1OutOp = std::static_pointer_cast<OperatorTensor>((*g1->outputNodes().cbegin())->getOperator());
// Bug fix: read the output of the transformed graph g2 (previously this also
// read from g1, so the reference output was compared against itself and the
// transformation was never actually checked).
auto g2OutOp = std::static_pointer_cast<OperatorTensor>((*g2->outputNodes().cbegin())->getOperator());
REQUIRE(*(g1OutOp->getOutput(0)) == *(g2OutOp->getOutput(0)));
// Simplify the graph: freeze parameters to allow reshaping of the Producers
for (auto node : g2->getNodes()) {
if (node->type() == Producer_Op::Type && node->name() != "dataProvider") {
std::static_pointer_cast<Producer_Op>(node->getOperator())->constant() = true;
}
}
constantFolding(g2);
g2->save("convToMatMul_after_folding");
}
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <catch2/catch_test_macros.hpp>
#include "aidge/recipes/Recipes.hpp"
#include "aidge/operator/Conv.hpp"
#include "aidge/operator/Producer.hpp"
#include "aidge/graph/OpArgs.hpp"
#include <cstddef>
using namespace Aidge;
TEST_CASE("[ExplicitCastMove] conv") {
// Build a conv pipeline with heterogeneous data types, then check that
// explicitCastMove() inserts the explicit Cast/Move nodes (10 -> 13 nodes).
auto headConv = Conv(3, 32, {3, 3}, "conv1");
auto midConv = Conv(32, 64, {3, 3}, "conv2");
auto tailConv = Conv(64, 10, {1, 1}, "conv3", {2, 2});
auto graph = Sequential({
Producer({16, 3, 224, 224}, "dataProvider"),
headConv,
midConv,
tailConv
});
graph->setBackend("cpu");
// Force type mismatches at both ends of the pipeline so casts are required.
headConv->getOperator()->setDataType(DataType::Int32);
tailConv->getOperator()->setDataType(DataType::Float64);
graph->save("explicitCastMove_before");
// 1 Producer + 3 convs + their weight/bias Producers.
REQUIRE(graph->getNodes().size() == 10);
graph->forwardDims();
explicitCastMove(graph);
graph->save("explicitCastMove_after");
// Three extra nodes were inserted by the transformation.
REQUIRE(graph->getNodes().size() == 13);
}
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <catch2/catch_test_macros.hpp>
#include <memory>
#include <cmath>
#include "aidge/graph/GraphView.hpp"
#include "aidge/graph/OpArgs.hpp"
#include "aidge/operator/Conv.hpp"
#include "aidge/operator/BatchNorm.hpp"
#include "aidge/operator/Producer.hpp"
#include "aidge/recipes/Recipes.hpp"
#include "aidge/scheduler/SequentialScheduler.hpp"
#include "aidge/data/Tensor.hpp"
namespace Aidge {
TEST_CASE("[core/recipes] FuseBatchNorm", "[recipes][FuseBatchNorm]") {
// Runs Producer -> Conv -> BatchNorm, records the output, applies the
// fuseBatchNorm recipe (folding BN into the Conv weights/bias), re-runs,
// and checks the fused Conv reproduces the original output element-wise.
auto myProd = Producer({2, 3, 3, 3}, "dataProvider");
auto myConv = Conv(3, 3, {1, 1}, "conv1");
// NOTE(review): BatchNorm is constructed with 32 features while conv1 outputs
// 3 channels; the size-3 tensors set on myBNOp below override its inputs —
// confirm the 32 is intentional and not a copy-paste slip.
auto myBN = BatchNorm<2>(32, 1.0e-5F, 0.1F, "batchnorm1");
auto myProdOp = std::static_pointer_cast<Producer_Op>(myProd->getOperator());
auto myConvOp = std::static_pointer_cast<Conv_Op<2>>(myConv->getOperator());
auto myBNOp = std::static_pointer_cast<BatchNorm_Op<2>>(myBN->getOperator());
myProdOp->setOutput(0, std::make_shared<Tensor>(Array4D<float,2,3,3,3> { //NCHW
{
{
{{8.28257084e-01, 7.99335480e-01, 7.36702740e-01},
{2.36729562e-01, 8.61912668e-01, 9.93067741e-01},
{1.63514376e-01, 8.95773172e-02, 2.96533108e-01}},
{{2.20776618e-01, 5.89067876e-01, 2.03930080e-01},
{1.31294072e-01, 7.10182846e-01, 1.08420849e-04},
{7.21750259e-01, 4.38212037e-01, 5.08823872e-01}},
{{4.30953979e-01, 1.51903450e-01, 3.76343548e-01},
{8.07861805e-01, 7.79679358e-01, 5.01209974e-01},
{9.31280375e-01, 9.94207084e-01, 1.74868107e-03}}
},
{
{{6.22058094e-01, 2.32256651e-02, 6.18222237e-01},
{9.58304763e-01, 2.11395025e-02, 4.95614648e-01},
{2.50825584e-01, 4.50860739e-01, 3.80362332e-01}},
{{9.91703272e-02, 5.06073236e-01, 4.88969564e-01},
{1.12059772e-01, 7.64178872e-01, 7.60362148e-01},
{2.84135342e-02, 4.29610193e-01, 1.27862811e-01}},
{{9.57209170e-01, 8.22797656e-01, 1.91352129e-01},
{9.52722490e-01, 6.35501027e-01, 5.67592978e-02},
{2.00799644e-01, 4.00822222e-01, 9.14380193e-01}}
}
}
}));
// 1x1 conv weights [Cout=3, Cin=3, 1, 1]
myConvOp -> setInput(1, std::make_shared<Tensor>(Array4D<float,3,3,1,1> { //NCHW
{
{
{{8.28257084e-01}},
{{7.99335480e-01}},
{{7.36702740e-01}}
},
{
{{2.36729562e-01}},
{{8.61912668e-01}},
{{9.93067741e-01}}
},
{
{{1.63514376e-01}},
{{8.95773172e-02}},
{{2.96533108e-01}}
}
}
}));
myConvOp -> setInput(2, std::make_shared<Tensor>(Array1D<float,3> {{0.4470, 0.3064, 0.7061}})); // conv bias
myBNOp -> setInput(1, std::make_shared<Tensor>(Array1D<float,3> {{0.9044, 0.3028, 0.0218}})); // scale (gamma)
myBNOp -> setInput(2, std::make_shared<Tensor>(Array1D<float,3> {{0.1332, 0.7503, 0.0878}})); // shift (beta)
myBNOp -> setInput(3, std::make_shared<Tensor>(Array1D<float,3> {{0.9931, 0.8421, 0.9936}})); // running mean
myBNOp -> setInput(4, std::make_shared<Tensor>(Array1D<float,3> {{0.4470, 0.3064, 0.7061}})); // running variance
auto g1 = Sequential({
myProd,
myConv,
myBN
});
g1 -> setName("fuseBNGraph");
g1 -> compile("cpu", DataType::Float32);
auto s = SequentialScheduler(g1);
s.forward();
// Reference output taken from the BatchNorm before fusion.
std::shared_ptr<Tensor> res1 = std::make_shared<Tensor>(*(myBNOp -> getOutput(0)));
fuseBatchNorm(g1);
s.resetScheduling();
s.forward();
// After fusion the Conv is the last node; its output must match the reference.
std::shared_ptr<Tensor> res2 = std::make_shared<Tensor>(*(myConvOp -> getOutput(0)));
REQUIRE(g1 -> outputNodes().size() == 1);
REQUIRE(g1 -> inputNodes().size() == 0);
// Element-wise comparison with absolute tolerance 1e-6.
bool eq = true;
for (std::size_t i = 0; i < res1->size(); ++i) {
eq &= std::abs(res1->get<float>(i) - res2->get<float>(i)) < 1.0e-06;
}
REQUIRE(eq);
}
} // namespace Aidge
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <catch2/catch_test_macros.hpp>
#include <set>
#include "aidge/graph/GraphView.hpp"
#include "aidge/graph/OpArgs.hpp"
#include "aidge/operator/Conv.hpp"
#include "aidge/operator/ReLU.hpp"
#include "aidge/recipes/Recipes.hpp"
#include "aidge/scheduler/SequentialScheduler.hpp"
#include "aidge/operator/Concat.hpp"
namespace Aidge {
TEST_CASE("[core/recipes] Tiling(transformation)", "[Tiling][Recipes]") {
// Checks horizontal tiling of a Conv node: the tiled sub-graph produced by
// getConvHorizontalTiling() must compute the same output as the original Conv.
SECTION("Transform a pre-generated GraphView") {
SECTION("Simple Node: Conv") {
std::shared_ptr<Node> myReLU = ReLU("myReLU");
std::shared_ptr<Node> myConv = Conv(3,4,{3,3}, "myconv");
// Weights [Cout=4, Cin=3, 3, 3] filled with 0..107.
std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array4D<int,4,3,3,3> {
{
{
{{ 0, 1, 2},
{ 3, 4, 5},
{ 6, 7, 8}},
{{ 9, 10, 11},
{ 12, 13, 14},
{ 15, 16, 17}},
{{ 18, 19, 20},
{ 21, 22, 23},
{ 24, 25, 26}}
},
{
{{ 27, 28, 29},
{ 30, 31, 32},
{ 33, 34, 35}},
{{ 36, 37, 38},
{ 39, 40, 41},
{ 42, 43, 44}},
{{ 45, 46, 47},
{ 48, 49, 50},
{ 51, 52, 53}}
},
{
{{ 54, 55, 56},
{ 57, 58, 59},
{ 60, 61, 62}},
{{ 63, 64, 65},
{ 66, 67, 68},
{ 69, 70, 71}},
{{ 72, 73, 74},
{ 75, 76, 77},
{ 78, 79, 80}}
},
{
{{ 81, 82, 83},
{ 84, 85, 86},
{ 87, 88, 89}},
{{ 90, 91, 92},
{ 93, 94, 95},
{ 96, 97, 98}},
{{ 99, 100, 101},
{102, 103, 104},
{105, 106, 107}}
}
}
});
std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<int,4> {{7,0,9,0}});
std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<int,2,3,5,5> { //NCHW
{
{
{{ 0, 1, 2, 3, 4},
{ 5, 6, 7, 8, 9},
{ 10, 11, 12, 13, 14},
{ 15, 16, 17, 18, 19},
{ 20, 21, 22, 23, 24}},
{{ 25, 26, 27, 28, 29},
{ 30, 31, 32, 33, 34},
{ 35, 36, 37, 38, 39},
{ 40, 41, 42, 43, 44},
{ 45, 46, 47, 48, 49}},
{{ 50, 51, 52, 53, 54},
{ 55, 56, 57, 58, 59},
{ 60, 61, 62, 63, 64},
{ 65, 66, 67, 68, 69},
{ 70, 71, 72, 73, 74}}
},
{
{{ 75, 76, 77, 78, 79},
{ 80, 81, 82, 83, 84},
{ 85, 86, 87, 88, 89},
{ 90, 91, 92, 93, 94},
{ 95, 96, 97, 98, 99}},
{{100, 101, 102, 103, 104},
{105, 106, 107, 108, 109},
{110, 111, 112, 113, 114},
{115, 116, 117, 118, 119},
{120, 121, 122, 123, 124}},
{{125, 126, 127, 128, 129},
{130, 131, 132, 133, 134},
{135, 136, 137, 138, 139},
{140, 141, 142, 143, 144},
{145, 146, 147, 148, 149}}
}
}
});
// Expected Conv output [2, 4, 3, 3] for the data above.
std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<int,2,4,3,3> {
{
{
{{ 15226, 15577, 15928},
{ 16981, 17332, 17683},
{ 18736, 19087, 19438}},
{{ 37818, 38898, 39978},
{ 43218, 44298, 45378},
{ 48618, 49698, 50778}},
{{ 60426, 62235, 64044},
{ 69471, 71280, 73089},
{ 78516, 80325, 82134}},
{{ 83016, 85554, 88092},
{ 95706, 98244, 100782},
{108396, 110934, 113472}}
},
{
{{ 41551, 41902, 42253},
{ 43306, 43657, 44008},
{ 45061, 45412, 45763}},
{{118818, 119898, 120978},
{124218, 125298, 126378},
{129618, 130698, 131778}},
{{196101, 197910, 199719},
{205146, 206955, 208764},
{214191, 216000, 217809}},
{{273366, 275904, 278442},
{286056, 288594, 291132},
{298746, 301284, 303822}}
}
}
});
myReLU->getOperator()->associateInput(0, myInput);
myReLU->addChild(myConv, 0, 0);
myConv->getOperator()->setInput(1, myWeights);
myConv->getOperator()->setInput(2, myBias);
std::shared_ptr<GraphView> g = std::make_shared<GraphView>();
g->add({myReLU, myConv});
g->compile("cpu", DataType::Int32);
// Tile the Conv along axis 2 (height) into 3 slices.
std::set<std::shared_ptr<Node>> tiledConv = getConvHorizontalTiling(myConv, 2, 3);
SequentialScheduler s(g);
s.forward();
// Reference run before tiling.
REQUIRE(*(std::dynamic_pointer_cast<Conv_Op<2>>(myConv->getOperator())->getOutput(0)) == *myOutput);
// Swap the Conv (and its weight/bias Producers) for the tiled sub-graph.
GraphView::replace({myConv, myConv->getParent(1), myConv->getParent(2)}, tiledConv);
// NOTE(review): this compile() call changes myInput's DataType from Int32 to
// Float32 — suspected bug in compile() dims/type handling, to investigate.
g->compile("cpu", DataType::Int32, 0, {{2,3,5,5}});
s.resetScheduling();
s.forward();
// Tiled graph must reproduce the original output.
REQUIRE(*(std::dynamic_pointer_cast<OperatorTensor>((*g->outputNodes().begin())->getOperator())->getOutput(0)) == *myOutput);
}
}
}
} // namespace Aidge
\ No newline at end of file
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <cstddef>
#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution
#include <catch2/catch_test_macros.hpp>
#include "aidge/recipes/Recipes.hpp"
#include "aidge/operator/MatMul.hpp"
#include "aidge/operator/AvgPooling.hpp"
#include "aidge/operator/MaxPooling.hpp"
#include "aidge/operator/GenericOperator.hpp"
#include "aidge/operator/Producer.hpp"
#include "aidge/graph/OpArgs.hpp"
#include "aidge/scheduler/SequentialScheduler.hpp"
#include "aidge/graph/Matching.hpp"
#include "aidge/utils/TensorUtils.hpp"
using namespace Aidge;
TEST_CASE("[MatMulTiling]") {
// Builds a chain of three MatMul nodes fed with random inputs, records the
// reference forward output, applies 16x16 tiling on the first MatMul and
// checks that the tiled graph still matches (25 tiles expected for 80x80).
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_real_distribution<float> valueDist(-1.0f, 1.0f);
auto dataProvider = Producer({2, 3, 80, 80}, "dataProvider");
auto w1 = Producer({2, 3, 80, 80}, "w1");
auto matmul1 = MatMul("matmul1");
// Fixed: w2/w3 were mislabeled "w1", giving three producers the same debug
// name in saved graphs and logs.
auto w2 = Producer({2, 3, 80, 80}, "w2");
auto matmul2 = MatMul("matmul2");
auto w3 = Producer({2, 3, 80, 80}, "w3");
auto matmul3 = MatMul("matmul3");
dataProvider->addChild(matmul1, 0, 0);
w1->addChild(matmul1, 0, 1);
matmul1->addChild(matmul2, 0, 0);
w2->addChild(matmul2, 0, 1);
matmul2->addChild(matmul3, 0, 0);
w3->addChild(matmul3, 0, 1);
auto g1 = getConnectedGraphView(matmul1);
g1->setBackend("cpu");
g1->forwardDims();
g1->save("MatMulSplitting_graph");
// Fill random values into each producer's output tensor.
fmt::println("Fill random values");
auto fillRandom = [&gen, &valueDist](const std::shared_ptr<Node>& node) {
    auto t = std::static_pointer_cast<OperatorTensor>(node->getOperator())->getOutput(0);
    for (std::size_t i = 0; i < t->size(); ++i) {
        t->set<float>(i, valueDist(gen));
    }
};
fillRandom(dataProvider);
fillRandom(w1);
fillRandom(w2);
fillRandom(w3);
// Reference forward pass before tiling.
fmt::println("Schedule forward graph");
auto s1 = SequentialScheduler(g1);
s1.forward();
const auto tOut = std::static_pointer_cast<OperatorTensor>(g1->getOrderedOutputs()[0].first->getOperator())->getOutput(0)->clone();
// Tiling: split matmul1 into 16x16 output tiles, then drop the Identity
// nodes the recipe inserts.
fmt::println("Tiling");
matMulTiling(matmul1, {16, 16});
removeIdentity(g1);
g1->setBackend("cpu");
g1->save("MatMulSplitting_graph_split");
// Match every MatMul whose output is exactly 16x16 in its last two dims.
auto gm = SinglePassGraphMatching(g1);
gm.addNodeLambda("16x16", [](const NodePtr& node) {
    const auto op =
        std::static_pointer_cast<OperatorTensor>(node->getOperator());
    const auto dims = op->getOutput(0)->dims();
    return (dims.end()[-2] == 16 && dims.end()[-1] == 16);
});
const auto results = gm.match("MatMul[16x16]");
REQUIRE(results.size() == 25);
// Check the tiled graph reproduces the reference output.
fmt::println("Schedule forward tiled graph");
s1 = SequentialScheduler(g1);
s1.resetScheduling();
s1.forward();
const auto tOutTiled = std::static_pointer_cast<OperatorTensor>(g1->getOrderedOutputs()[0].first->getOperator())->getOutput(0)->clone();
REQUIRE(approxEq<float>(tOut, tOutTiled));
}
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <catch2/catch_test_macros.hpp>
#include <memory>
#include <string>
#include "aidge/backend/cpu/data/TensorImpl.hpp"
#include "aidge/backend/cpu/operator/ConvImpl.hpp"
#include "aidge/backend/cpu/operator/FCImpl.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/graph/Node.hpp"
#include "aidge/graph/GraphView.hpp"
#include "aidge/graph/OpArgs.hpp"
#include "aidge/operator/Conv.hpp"
#include "aidge/operator/FC.hpp"
#include "aidge/scheduler/SequentialScheduler.hpp"
#include "aidge/recipes/Recipes.hpp"
#include "aidge/utils/ArrayHelpers.hpp"
#include "aidge/utils/TensorUtils.hpp"
using namespace Aidge;
// Verifies that cast/move conversions are handled correctly when a graph mixes
// data types (Int32 graph with conv1 forced to Float32 and conv3 to Float64),
// both when conversions are resolved implicitly at scheduling time and when
// they are made explicit with explicitCastMove().
TEST_CASE("[cpu/castmove] CastMove(forward)") {
// Shared fixture: Int32 input, first-conv weights and bias reused by all sections.
std::shared_ptr<Tensor> inputTensor =
std::make_shared<Tensor>(Array4D<int, 2, 1, 5, 5>{{{{{0, 1, 2, 3, 4},
{5, 6, 7, 8, 9},
{10, 11, 12, 13, 14},
{15, 16, 17, 18, 19},
{20, 21, 22, 23, 24}}},
{{{25, 26, 27, 28, 29},
{30, 31, 32, 33, 34},
{35, 36, 37, 38, 39},
{40, 41, 42, 43, 44},
{45, 46, 47, 48, 49}}}}});
std::shared_ptr<Tensor> weight1 = std::make_shared<Tensor>(
Array4D<int, 3, 1, 3, 3>{{{{{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}},
{{{10, 11, 12}, {13, 14, 15}, {16, 17, 18}}},
{{{19, 20, 21}, {22, 23, 24}, {25, 26, 27}}}}});
std::shared_ptr<Tensor> bias1 = std::make_shared<Tensor>(Array1D<int, 3>{{1, 2, 3}});
// No explicitCastMove() here: conversions between the mixed-dtype operators
// are expected to be handled implicitly (section name) by the scheduler.
SECTION("Test implicit") {
std::shared_ptr<GraphView> g =
Sequential({
Conv(1, 3, {3, 3}, "conv1"),
Conv(3, 4, {1, 1}, "conv2"),
Conv(4, 3, {1, 1}, "conv3"),
FC(27, 5, false, "fc")});
g->getNode("conv1")->getOperator()->setInput(0, inputTensor);
g->getNode("conv1")->getOperator()->setInput(1, weight1);
g->getNode("conv1")->getOperator()->setInput(2, bias1);
std::shared_ptr<Tensor> weight2 =
std::make_shared<Tensor>(Array4D<int, 4, 3, 1, 1>{{{{{1}}, {{2}}, {{3}}},
{{{4}}, {{5}}, {{6}}},
{{{7}}, {{8}}, {{9}}},
{{{10}}, {{11}}, {{12}}}}});
std::shared_ptr<Tensor> bias2 = std::make_shared<Tensor>(Array1D<int, 4>{{1, 2, 3, 4}});
g->getNode("conv2")->getOperator()->setInput(1, weight2);
g->getNode("conv2")->getOperator()->setInput(2, bias2);
// *(g->getNode("conv2")->getOperator()->input(1, weight2);
std::shared_ptr<Tensor> weight3 = std::make_shared<Tensor>(
Array4D<int, 3, 4, 1, 1>{{{{{1}}, {{2}}, {{3}}, {{4}}},
{{{5}}, {{6}}, {{7}}, {{8}}},
{{{9}}, {{10}}, {{11}}, {{12}}}}});
std::shared_ptr<Tensor> bias3 = std::make_shared<Tensor>(Array1D<int, 3>{{1, 2, 3}});
g->getNode("conv3")->getOperator()->setInput(1, weight3);
g->getNode("conv3")->getOperator()->setInput(2, bias3);
std::shared_ptr<Tensor> weightfc = std::make_shared<Tensor>(
Array2D<int, 5, 27>{{{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12},
{13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9},
{10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6},
{7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5,
6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3},
{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2,
3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}}});
std::shared_ptr<Tensor> biasfc = std::make_shared<Tensor>(Array1D<int, 5>{{1, 2, 3, 4, 5}});
g->getNode("fc")->getOperator()->setInput(1, weightfc);
g->getNode("fc")->getOperator()->setInput(2, biasfc);
// input->addChild(g);
// Whole graph defaults to Int32, then two nodes are overridden to floating
// types, creating dtype boundaries between consecutive operators.
g->setDataType(Aidge::DataType::Int32);
g->getNode("conv1")->getOperator()->setDataType(DataType::Float32);
g->getNode("conv3")->getOperator()->setDataType(DataType::Float64);
g->setBackend("cpu");
g->forwardDims();
SequentialScheduler scheduler(g);
REQUIRE_NOTHROW(scheduler.forward());
scheduler.saveSchedulingDiagram("schedulingSequential");
std::shared_ptr<Tensor> expectedOutput1 = std::make_shared<Tensor>(Array4D<int, 2, 3, 3, 3>{
{{{{367, 412, 457}, {592, 637, 682}, {817, 862, 907}},
{{854, 980, 1106}, {1484, 1610, 1736}, {2114, 2240, 2366}},
{{1341, 1548, 1755}, {2376, 2583, 2790}, {3411, 3618, 3825}}},
{{{1492, 1537, 1582}, {1717, 1762, 1807}, {1942, 1987, 2032}},
{{4004, 4130, 4256}, {4634, 4760, 4886}, {5264, 5390, 5516}},
{{6516, 6723, 6930}, {7551, 7758, 7965}, {8586, 8793, 9000}}}}});
std::shared_ptr<Tensor> expectedOutput2 = std::make_shared<Tensor>(Array4D<int, 2, 4, 3, 3>{
{{{{6099, 7017, 7935}, {10689, 11607, 12525}, {15279, 16197, 17115}},
{{13786, 15838, 17890}, {24046, 26098, 28150}, {34306, 36358, 38410}},
{{21473, 24659, 27845}, {37403, 40589, 43775}, {53333, 56519, 59705}},
{{29160, 33480, 37800}, {50760, 55080, 59400}, {72360, 76680, 81000}}},
{{{29049, 29967, 30885}, {33639, 34557, 35475}, {38229, 39147, 40065}},
{{65086, 67138, 69190}, {75346, 77398, 79450}, {85606, 87658, 89710}},
{{101123, 104309, 107495}, {117053, 120239, 123425}, {132983, 136169, 139355}},
{{137160, 141480, 145800}, {158760, 163080, 167400}, {180360, 184680, 189000}}}}});
std::shared_ptr<Tensor> expectedOutput3 = std::make_shared<Tensor>(Array4D<int, 2, 3, 3, 3>{
{{{{214731, 246591, 278451}, {374031, 405891, 437751}, {533331, 565191, 597051}},
{{496804, 570568, 644332}, {865624, 939388, 1013152}, {1234444, 1308208, 1381972}},
{{778877, 894545, 1010213}, {1357217, 1472885, 1588553}, {1935557, 2051225, 2166893}}},
{{{1011231, 1043091, 1074951}, {1170531, 1202391, 1234251}, {1329831, 1361691, 1393551}},
{{2340904, 2414668, 2488432}, {2709724, 2783488, 2857252}, {3078544, 3152308, 3226072}},
{{3670577, 3786245, 3901913}, {4248917, 4364585, 4480253}, {4827257, 4942925, 5058593}}}}});
Tensor expectedOutput4 = Array2D<int, 2, 5>{
{{205050376, 198925904, 181355097, 196978090, 238868348},
{598467376, 561797804, 560823897, 593043790, 698672948}}};
// Each comparison template matches the node's actual output dtype against
// the Int32 reference (conv1 is Float32, conv3 is Float64).
std::shared_ptr<Tensor> other1 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv1")->getOperator())->getOutput(0);
REQUIRE(approxEq<float, int>(*other1, *expectedOutput1, 0.0, 1.0e-12));
std::shared_ptr<Tensor> other2 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv2")->getOperator())->getOutput(0);
REQUIRE(approxEq<int>(*other2, *expectedOutput2, 0.0, 1.0e-12));
std::shared_ptr<Tensor> other3 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv3")->getOperator())->getOutput(0);
REQUIRE(approxEq<double, int>(*other3, *expectedOutput3, 0.0, 1.0e-12));
std::shared_ptr<Tensor> other4 = std::static_pointer_cast<OperatorTensor>(g->getNode("fc")->getOperator())->getOutput(0);
REQUIRE(approxEq<int>(*other4, expectedOutput4, 0.0, 1.0e-12));
}
// Round-trip a Float32 tensor through Float16 and check the loss stays
// within a relative tolerance of 1e-3.
SECTION("Half") {
Tensor refTensor = Array2D<float, 3, 2>{{{0.0, 1.0},{2.1, 3.4},{5000.0, 1.0e5}}};
Tensor tensor(DataType::Float16);
tensor.copyCastFrom(refTensor);
REQUIRE(approxEq<float, half_float::half>(refTensor, tensor, 1.0e-3, 0.0));
}
// Same graph and expected values as "Test implicit", but explicitCastMove()
// inserts the Cast/Move nodes into the graph before scheduling.
SECTION("Test explicit") {
std::shared_ptr<GraphView> g =
Sequential({
Conv(1, 3, {3, 3}, "conv1"),
Conv(3, 4, {1, 1}, "conv2"),
Conv(4, 3, {1, 1}, "conv3"),
FC(27, 5, false, "fc")});
g->getNode("conv1")->getOperator()->setInput(0, inputTensor);
g->getNode("conv1")->getOperator()->setInput(1, weight1);
g->getNode("conv1")->getOperator()->setInput(2, bias1);
std::shared_ptr<Tensor> weight2 =
std::make_shared<Tensor>(Array4D<int, 4, 3, 1, 1>{{{{{1}}, {{2}}, {{3}}},
{{{4}}, {{5}}, {{6}}},
{{{7}}, {{8}}, {{9}}},
{{{10}}, {{11}}, {{12}}}}});
std::shared_ptr<Tensor> bias2 = std::make_shared<Tensor>(Array1D<int, 4>{{1, 2, 3, 4}});
g->getNode("conv2")->getOperator()->setInput(1, weight2);
g->getNode("conv2")->getOperator()->setInput(2, bias2);
// *(g->getNode("conv2")->getOperator()->input(1, weight2);
std::shared_ptr<Tensor> weight3 = std::make_shared<Tensor>(
Array4D<int, 3, 4, 1, 1>{{{{{1}}, {{2}}, {{3}}, {{4}}},
{{{5}}, {{6}}, {{7}}, {{8}}},
{{{9}}, {{10}}, {{11}}, {{12}}}}});
std::shared_ptr<Tensor> bias3 = std::make_shared<Tensor>(Array1D<int, 3>{{1, 2, 3}});
g->getNode("conv3")->getOperator()->setInput(1, weight3);
g->getNode("conv3")->getOperator()->setInput(2, bias3);
std::shared_ptr<Tensor> weightfc = std::make_shared<Tensor>(
Array2D<int, 5, 27>{{{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12},
{13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9},
{10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6},
{7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5,
6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3},
{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2,
3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}}});
std::shared_ptr<Tensor> biasfc = std::make_shared<Tensor>(Array1D<int, 5>{{1, 2, 3, 4, 5}});
g->getNode("fc")->getOperator()->setInput(1, weightfc);
g->getNode("fc")->getOperator()->setInput(2, biasfc);
// input->addChild(g);
g->setDataType(Aidge::DataType::Int32);
g->getNode("conv1")->getOperator()->setDataType(DataType::Float32);
g->getNode("conv3")->getOperator()->setDataType(DataType::Float64);
// Materialize the Cast/Move nodes in the graph itself before compiling.
explicitCastMove(g);
g->setBackend("cpu");
g->forwardDims();
SequentialScheduler scheduler(g);
REQUIRE_NOTHROW(scheduler.forward());
scheduler.saveSchedulingDiagram("schedulingSequential");
Tensor expectedOutput1 = Array4D<int, 2, 3, 3, 3>{
{{{{367, 412, 457}, {592, 637, 682}, {817, 862, 907}},
{{854, 980, 1106}, {1484, 1610, 1736}, {2114, 2240, 2366}},
{{1341, 1548, 1755}, {2376, 2583, 2790}, {3411, 3618, 3825}}},
{{{1492, 1537, 1582}, {1717, 1762, 1807}, {1942, 1987, 2032}},
{{4004, 4130, 4256}, {4634, 4760, 4886}, {5264, 5390, 5516}},
{{6516, 6723, 6930}, {7551, 7758, 7965}, {8586, 8793, 9000}}}}};
Tensor expectedOutput2 = Array4D<int, 2, 4, 3, 3>{
{{{{6099, 7017, 7935}, {10689, 11607, 12525}, {15279, 16197, 17115}},
{{13786, 15838, 17890}, {24046, 26098, 28150}, {34306, 36358, 38410}},
{{21473, 24659, 27845}, {37403, 40589, 43775}, {53333, 56519, 59705}},
{{29160, 33480, 37800}, {50760, 55080, 59400}, {72360, 76680, 81000}}},
{{{29049, 29967, 30885}, {33639, 34557, 35475}, {38229, 39147, 40065}},
{{65086, 67138, 69190}, {75346, 77398, 79450}, {85606, 87658, 89710}},
{{101123, 104309, 107495}, {117053, 120239, 123425}, {132983, 136169, 139355}},
{{137160, 141480, 145800}, {158760, 163080, 167400}, {180360, 184680, 189000}}}}};
Tensor expectedOutput3 = Array4D<int, 2, 3, 3, 3>{
{{{{214731, 246591, 278451}, {374031, 405891, 437751}, {533331, 565191, 597051}},
{{496804, 570568, 644332}, {865624, 939388, 1013152}, {1234444, 1308208, 1381972}},
{{778877, 894545, 1010213}, {1357217, 1472885, 1588553}, {1935557, 2051225, 2166893}}},
{{{1011231, 1043091, 1074951}, {1170531, 1202391, 1234251}, {1329831, 1361691, 1393551}},
{{2340904, 2414668, 2488432}, {2709724, 2783488, 2857252}, {3078544, 3152308, 3226072}},
{{3670577, 3786245, 3901913}, {4248917, 4364585, 4480253}, {4827257, 4942925, 5058593}}}}};
Tensor expectedOutput4 = Array2D<int, 2, 5>{
{{205050376, 198925904, 181355097, 196978090, 238868348},
{598467376, 561797804, 560823897, 593043790, 698672948}}};
std::shared_ptr<Tensor> other1 = std::static_pointer_cast<Conv_Op<2>>(g->getNode("conv1")->getOperator())->getOutput(0);
REQUIRE(approxEq<float, int>(*other1, expectedOutput1, 0.0, 1.0e-12));
std::shared_ptr<Tensor> other2 = std::static_pointer_cast<Conv_Op<2>>(g->getNode("conv2")->getOperator())->getOutput(0);
REQUIRE(*other2 == expectedOutput2);
std::shared_ptr<Tensor> other3 = std::static_pointer_cast<Conv_Op<2>>(g->getNode("conv3")->getOperator())->getOutput(0);
REQUIRE(approxEq<double, int>(*other3, expectedOutput3, 0.0, 1.0e-12));
std::shared_ptr<Tensor> other4 = std::static_pointer_cast<FC_Op>(g->getNode("fc")->getOperator())->getOutput(0);
REQUIRE(*other4 == expectedOutput4);
}
}
......@@ -17,13 +17,25 @@
#include "aidge/graph/Node.hpp"
#include "aidge/graph/GraphView.hpp"
#include "aidge/graph/OpArgs.hpp"
#include "aidge/scheduler/Scheduler.hpp"
#include "aidge/operator/Memorize.hpp"
#include "aidge/operator/Pop.hpp"
#include "aidge/operator/Stack.hpp"
#include "aidge/operator/Identity.hpp"
#include "aidge/operator/MetaOperator.hpp"
#include "aidge/scheduler/SequentialScheduler.hpp"
#include "aidge/scheduler/ParallelScheduler.hpp"
#include "aidge/backend/cpu.hpp"
#include "aidge/backend/cpu/operator/FCImpl.hpp"
#include "aidge/backend/cpu/operator/ConvImpl.hpp"
#include "aidge/backend/cpu/operator/ReLUImpl.hpp"
#include "aidge/backend/cpu/operator/SqrtImpl.hpp"
#include "aidge/backend/cpu/operator/AddImpl.hpp"
using namespace Aidge;
#include "aidge/recipes/GraphViewHelper.hpp"
namespace Aidge {
TEST_CASE("[cpu/scheduler] SequentialScheduler(forward)") {
std::shared_ptr<Tensor> inputTensor =
std::make_shared<Tensor>(Array4D<int, 2, 1, 5, 5>{{{{{0, 1, 2, 3, 4},
......@@ -50,13 +62,11 @@ TEST_CASE("[cpu/scheduler] SequentialScheduler(forward)") {
Conv(1, 3, {3, 3}, "conv1"),
Conv(3, 4, {1, 1}, "conv2"),
Conv(4, 3, {1, 1}, "conv3"),
FC(5, false, "fc")});
g->setDatatype(Aidge::DataType::Int32);
g->setBackend("cpu");
FC(27, 5, false, "fc")});
g->getNode("conv1")->getOperator()->input(0) = *inputTensor;
g->getNode("conv1")->getOperator()->input(1) = *weight1;
g->getNode("conv1")->getOperator()->input(2) = *bias1;
g->getNode("conv1")->getOperator()->setInput(0, inputTensor);
g->getNode("conv1")->getOperator()->setInput(1, weight1);
g->getNode("conv1")->getOperator()->setInput(2, bias1);
std::shared_ptr<Tensor> weight2 =
std::make_shared<Tensor>(Array4D<int, 4, 3, 1, 1>{{{{{1}}, {{2}}, {{3}}},
......@@ -64,8 +74,8 @@ TEST_CASE("[cpu/scheduler] SequentialScheduler(forward)") {
{{{7}}, {{8}}, {{9}}},
{{{10}}, {{11}}, {{12}}}}});
std::shared_ptr<Tensor> bias2 = std::make_shared<Tensor>(Array1D<int, 4>{{1, 2, 3, 4}});
g->getNode("conv2")->getOperator()->input(1) = *weight2;
g->getNode("conv2")->getOperator()->input(2) = *bias2;
g->getNode("conv2")->getOperator()->setInput(1, weight2);
g->getNode("conv2")->getOperator()->setInput(2, bias2);
// *(g->getNode("conv2")->getOperator()->input(1, weight2);
std::shared_ptr<Tensor> weight3 = std::make_shared<Tensor>(
......@@ -73,8 +83,8 @@ TEST_CASE("[cpu/scheduler] SequentialScheduler(forward)") {
{{{5}}, {{6}}, {{7}}, {{8}}},
{{{9}}, {{10}}, {{11}}, {{12}}}}});
std::shared_ptr<Tensor> bias3 = std::make_shared<Tensor>(Array1D<int, 3>{{1, 2, 3}});
g->getNode("conv3")->getOperator()->input(1) = *weight3;
g->getNode("conv3")->getOperator()->input(2) = *bias3;
g->getNode("conv3")->getOperator()->setInput(1, weight3);
g->getNode("conv3")->getOperator()->setInput(2, bias3);
std::shared_ptr<Tensor> weightfc = std::make_shared<Tensor>(
Array2D<int, 5, 27>{{{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
......@@ -88,10 +98,12 @@ TEST_CASE("[cpu/scheduler] SequentialScheduler(forward)") {
{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2,
3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}}});
std::shared_ptr<Tensor> biasfc = std::make_shared<Tensor>(Array1D<int, 5>{{1, 2, 3, 4, 5}});
g->getNode("fc")->getOperator()->input(1) = *weightfc;
g->getNode("fc")->getOperator()->input(2) = *biasfc;
g->getNode("fc")->getOperator()->setInput(1, weightfc);
g->getNode("fc")->getOperator()->setInput(2, biasfc);
// input->addChild(g);
g->setDataType(Aidge::DataType::Int32);
g->setBackend("cpu");
g->forwardDims();
SequentialScheduler scheduler(g);
REQUIRE_NOTHROW(scheduler.forward());
......@@ -126,17 +138,17 @@ TEST_CASE("[cpu/scheduler] SequentialScheduler(forward)") {
Tensor expectedOutput4 = Array2D<int, 2, 5>{
{{205050376, 198925904, 181355097, 196978090, 238868348},
{598467376, 561797804, 560823897, 593043790, 698672948}}};
Tensor other1 = g->getNode("conv1")->getOperator()->output(0);
bool equal1 = (other1 == *expectedOutput1);
std::shared_ptr<Tensor> other1 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv1")->getOperator())->getOutput(0);
bool equal1 = (*other1 == *expectedOutput1);
REQUIRE(equal1);
Tensor other2 = g->getNode("conv2")->getOperator()->output(0);
bool equal2 = (other2 == *expectedOutput2);
std::shared_ptr<Tensor> other2 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv2")->getOperator())->getOutput(0);
bool equal2 = (*other2 == *expectedOutput2);
REQUIRE(equal2);
Tensor other3 = g->getNode("conv3")->getOperator()->output(0);
bool equal3 = (other3 == *expectedOutput3);
std::shared_ptr<Tensor> other3 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv3")->getOperator())->getOutput(0);
bool equal3 = (*other3 == *expectedOutput3);
REQUIRE(equal3);
Tensor other4 = g->getNode("fc")->getOperator()->output(0);
bool equal4 = (other4 == expectedOutput4);
std::shared_ptr<Tensor> other4 = std::static_pointer_cast<OperatorTensor>(g->getNode("fc")->getOperator())->getOutput(0);
bool equal4 = (*other4 == expectedOutput4);
REQUIRE(equal4);
}
......@@ -144,39 +156,40 @@ TEST_CASE("[cpu/scheduler] SequentialScheduler(forward)") {
std::shared_ptr<GraphView> g =
Sequential({Conv(1, 3, {3, 3}, "inputConv"),
Parallel({
Conv(3, 3, {1, 1}, "conv1.1"),
Conv(3, 3, {1, 1}, "conv1.2"),
Sequential({
Parallel({
Conv(3, 3, {1, 1}, "conv1.1"),
Conv(3, 3, {1, 1}, "conv1.2")}),
Add("add1")}),
Conv(3, 3, {1, 1}, "conv1.3")}),
Add<3>("add1"),
Add("add2"),
Conv(3, 2, {1, 1}, "conv2"),
FC(5, false, "out")});
g->setBackend("cpu");
g->setDatatype(Aidge::DataType::Int32);
FC(18, 5, false, "out")});
g->getNode("inputConv")->getOperator()->input(0) = *inputTensor;
g->getNode("inputConv")->getOperator()->input(1) = *weight1;
g->getNode("inputConv")->getOperator()->input(2) = *bias1;
g->getNode("inputConv")->getOperator()->setInput(0, inputTensor);
g->getNode("inputConv")->getOperator()->setInput(1, weight1);
g->getNode("inputConv")->getOperator()->setInput(2, bias1);
std::shared_ptr<Tensor> conv11Weight = std::make_shared<Tensor>(Array4D<int, 3, 3, 1, 1>{
{{{{1}}, {{2}}, {{3}}}, {{{4}}, {{5}}, {{6}}}, {{{7}}, {{8}}, {{9}}}}});
g->getNode("conv1.1")->getOperator()->input(1) = *conv11Weight;
g->getNode("conv1.1")->getOperator()->input(2) = *bias1;
g->getNode("conv1.1")->getOperator()->setInput(1, conv11Weight);
g->getNode("conv1.1")->getOperator()->setInput(2, bias1);
std::shared_ptr<Tensor> conv12Weight = std::make_shared<Tensor>(Array4D<int, 3, 3, 1, 1>{
{{{{11}}, {{12}}, {{13}}}, {{{14}}, {{15}}, {{16}}}, {{{17}}, {{18}}, {{19}}}}});
g->getNode("conv1.2")->getOperator()->input(1) = *conv12Weight;
g->getNode("conv1.2")->getOperator()->input(2) = *bias1;
g->getNode("conv1.2")->getOperator()->setInput(1, conv12Weight);
g->getNode("conv1.2")->getOperator()->setInput(2, bias1);
std::shared_ptr<Tensor> conv13Weight = std::make_shared<Tensor>(Array4D<int, 3, 3, 1, 1>{
{{{{21}}, {{22}}, {{23}}}, {{{24}}, {{25}}, {{26}}}, {{{27}}, {{28}}, {{29}}}}});
g->getNode("conv1.3")->getOperator()->input(1) = *conv13Weight;
g->getNode("conv1.3")->getOperator()->input(2) = *bias1;
g->getNode("conv1.3")->getOperator()->setInput(1, conv13Weight);
g->getNode("conv1.3")->getOperator()->setInput(2, bias1);
std::shared_ptr<Tensor> conv2Weight = std::make_shared<Tensor>(
Array4D<int, 2, 3, 1, 1>{{{{{1}}, {{2}}, {{3}}}, {{{4}}, {{5}}, {{6}}}}});
std::shared_ptr<Tensor> bias2 = std::make_shared<Tensor>(Array1D<int, 2>{{1, 2}});
g->getNode("conv2")->getOperator()->input(1) = *conv2Weight;
g->getNode("conv2")->getOperator()->input(2) = *bias2;
g->getNode("conv2")->getOperator()->setInput(1, conv2Weight);
g->getNode("conv2")->getOperator()->setInput(2, bias2);
std::shared_ptr<Tensor> fcWeight = std::make_shared<Tensor>(
Array2D<int, 5, 18>{{{1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3},
......@@ -185,19 +198,21 @@ TEST_CASE("[cpu/scheduler] SequentialScheduler(forward)") {
{5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2},
{3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5}}});
std::shared_ptr<Tensor> fcBias = std::make_shared<Tensor>(Array1D<int, 5>{{1, 2, 3, 4, 5}});
g->getNode("out")->getOperator()->input(1) = *fcWeight;
g->getNode("out")->getOperator()->input(2) = *fcBias;
g->getNode("out")->getOperator()->setInput(1, fcWeight);
g->getNode("out")->getOperator()->setInput(2, fcBias);
std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(
Array2D<int, 2, 5>{{{124324368, 130692907, 133325056, 125044620, 142843879},
{369195468, 394615207, 382643056, 379441320, 416291779}}});
g->setBackend("cpu");
g->setDataType(Aidge::DataType::Int32);
g->forwardDims();
SequentialScheduler scheduler(g);
REQUIRE_NOTHROW(scheduler.forward());
scheduler.saveSchedulingDiagram("schedulingSequential");
std::shared_ptr<Tensor> result =
std::static_pointer_cast<Tensor>(g->getNode("out")->getOperator()->getOutput(0));
std::static_pointer_cast<Tensor>(g->getNode("out")->getOperator()->getRawOutput(0));
bool equal = (*result == *expectedOutput);
REQUIRE(equal);
}
......@@ -205,5 +220,296 @@ TEST_CASE("[cpu/scheduler] SequentialScheduler(forward)") {
SECTION("Test Residual graph") {
}
SECTION("Test Recurrent graph") {}
}
\ No newline at end of file
SECTION("Test Recurrent graph (sequential)") {
std::shared_ptr<Tensor> in = std::make_shared<Tensor>(
Array2D<int, 2, 3>{{{1, 2, 3}, {4, 5, 6}}});
std::shared_ptr<Tensor> initTensor = std::make_shared<Tensor>(
Array2D<int, 2, 3>{{{0, 0, 0}, {1, 1, 1}}});
std::shared_ptr<Tensor> biasTensor = std::make_shared<Tensor>(
Array2D<int, 2, 3>{{{2, 0, 0}, {1, 0, 0}}});
auto add1 = Add("add1");
auto mem = Memorize(3, "mem1");
auto add2 = Add("add2");
auto bias = Producer(biasTensor, "bias");
auto init = Producer(initTensor, "init");
auto input = Producer(in, "input");
std::shared_ptr<GraphView> g = Sequential({add1, mem, add2});
init->addChild(mem, 0, 1);
mem->addChild(add1, 1, 1);
bias->addChild(add2, 0, 1);
input->addChild(add1, 0, 0);
// Update GraphView inputs/outputs following previous connections:
g->add({mem, add1, add2, init, bias, input});
g->setBackend("cpu");
g->setDataType(Aidge::DataType::Int32);
g->save("graphRecurrent");
g->forwardDims();
SequentialScheduler scheduler(g);
REQUIRE_NOTHROW(scheduler.forward(true));
scheduler.saveStaticSchedulingDiagram("static_schedule");
scheduler.saveSchedulingDiagram("schedulingRecurrent_seq");
std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(
Array2D<int, 2, 3>{{{5, 6, 9}, {14, 16, 19}}});
std::shared_ptr<Tensor> result =
std::static_pointer_cast<Tensor>(g->getNode("add2")->getOperator()->getRawOutput(0));
result->print();
expectedOutput->print();
bool equal = (*result == *expectedOutput);
REQUIRE(equal);
}
SECTION("Test Recurrent graph (parallel)") {
std::shared_ptr<Tensor> in = std::make_shared<Tensor>(
Array2D<int, 2, 3>{{{1, 2, 3}, {4, 5, 6}}});
std::shared_ptr<Tensor> initTensor = std::make_shared<Tensor>(
Array2D<int, 2, 3>{{{0, 0, 0}, {1, 1, 1}}});
std::shared_ptr<Tensor> biasTensor = std::make_shared<Tensor>(
Array2D<int, 2, 3>{{{2, 0, 0}, {1, 0, 0}}});
auto add1 = Add("add1");
auto mem = Memorize(3, "mem1");
auto add2 = Add("add2");
auto bias = Producer(biasTensor, "bias");
auto init = Producer(initTensor, "init");
auto input = Producer(in, "input");
std::shared_ptr<GraphView> g = Sequential({add1, mem, add2});
init->addChild(mem, 0, 1);
mem->addChild(add1, 1, 1);
bias->addChild(add2, 0, 1);
input->addChild(add1, 0, 0);
// Update GraphView inputs/outputs following previous connections:
g->add({mem, add1, add2, init, bias, input});
g->setBackend("cpu");
g->setDataType(Aidge::DataType::Int32);
g->save("graphRecurrent");
g->forwardDims();
ParallelScheduler scheduler(g);
REQUIRE_NOTHROW(scheduler.forward(true));
scheduler.saveSchedulingDiagram("schedulingRecurrent_par");
std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(
Array2D<int, 2, 3>{{{5, 6, 9}, {14, 16, 19}}});
std::shared_ptr<Tensor> result =
std::static_pointer_cast<Tensor>(g->getNode("add2")->getOperator()->getRawOutput(0));
result->print();
expectedOutput->print();
bool equal = (*result == *expectedOutput);
REQUIRE(equal);
}
SECTION("Test ConnectInput graph") {
std::shared_ptr<GraphView> g =
Sequential({
Conv(1, 3, {3, 3}, "conv1"),
Conv(3, 4, {1, 1}, "conv2"),
Conv(4, 3, {1, 1}, "conv3"),
FC(27, 5, false, "fc")});
// g->getNode("conv1")->getOperator()->setInput(0, inputTensor);
g->getNode("conv1")->getOperator()->setInput(1, weight1);
g->getNode("conv1")->getOperator()->setInput(2, bias1);
std::shared_ptr<Tensor> weight2 =
std::make_shared<Tensor>(Array4D<int, 4, 3, 1, 1>{{{{{1}}, {{2}}, {{3}}},
{{{4}}, {{5}}, {{6}}},
{{{7}}, {{8}}, {{9}}},
{{{10}}, {{11}}, {{12}}}}});
std::shared_ptr<Tensor> bias2 = std::make_shared<Tensor>(Array1D<int, 4>{{1, 2, 3, 4}});
g->getNode("conv2")->getOperator()->setInput(1, weight2);
g->getNode("conv2")->getOperator()->setInput(2, bias2);
// *(g->getNode("conv2")->getOperator()->input(1, weight2);
std::shared_ptr<Tensor> weight3 = std::make_shared<Tensor>(
Array4D<int, 3, 4, 1, 1>{{{{{1}}, {{2}}, {{3}}, {{4}}},
{{{5}}, {{6}}, {{7}}, {{8}}},
{{{9}}, {{10}}, {{11}}, {{12}}}}});
std::shared_ptr<Tensor> bias3 = std::make_shared<Tensor>(Array1D<int, 3>{{1, 2, 3}});
g->getNode("conv3")->getOperator()->setInput(1, weight3);
g->getNode("conv3")->getOperator()->setInput(2, bias3);
std::shared_ptr<Tensor> weightfc = std::make_shared<Tensor>(
Array2D<int, 5, 27>{{{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12},
{13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9},
{10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6},
{7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5,
6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3},
{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2,
3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}}});
std::shared_ptr<Tensor> biasfc = std::make_shared<Tensor>(Array1D<int, 5>{{1, 2, 3, 4, 5}});
g->getNode("fc")->getOperator()->setInput(1, weightfc);
g->getNode("fc")->getOperator()->setInput(2, biasfc);
// input->addChild(g);
g->setDataType(Aidge::DataType::Int32);
g->setBackend("cpu");
std::vector<std::vector<Aidge::DimSize_t>> dims = {inputTensor->dims()};
g->forwardDims(dims);
SequentialScheduler scheduler(g);
std::vector<std::shared_ptr<Aidge::Tensor>> dataIn = {inputTensor};
REQUIRE_NOTHROW(scheduler.forward(true, dataIn));
scheduler.saveSchedulingDiagram("schedulingSequential");
std::shared_ptr<Tensor> expectedOutput1 = std::make_shared<Tensor>(Array4D<int, 2, 3, 3, 3>{
{{{{367, 412, 457}, {592, 637, 682}, {817, 862, 907}},
{{854, 980, 1106}, {1484, 1610, 1736}, {2114, 2240, 2366}},
{{1341, 1548, 1755}, {2376, 2583, 2790}, {3411, 3618, 3825}}},
{{{1492, 1537, 1582}, {1717, 1762, 1807}, {1942, 1987, 2032}},
{{4004, 4130, 4256}, {4634, 4760, 4886}, {5264, 5390, 5516}},
{{6516, 6723, 6930}, {7551, 7758, 7965}, {8586, 8793, 9000}}}}});
std::shared_ptr<Tensor> expectedOutput2 = std::make_shared<Tensor>(Array4D<int, 2, 4, 3, 3>{
{{{{6099, 7017, 7935}, {10689, 11607, 12525}, {15279, 16197, 17115}},
{{13786, 15838, 17890}, {24046, 26098, 28150}, {34306, 36358, 38410}},
{{21473, 24659, 27845}, {37403, 40589, 43775}, {53333, 56519, 59705}},
{{29160, 33480, 37800}, {50760, 55080, 59400}, {72360, 76680, 81000}}},
{{{29049, 29967, 30885}, {33639, 34557, 35475}, {38229, 39147, 40065}},
{{65086, 67138, 69190}, {75346, 77398, 79450}, {85606, 87658, 89710}},
{{101123, 104309, 107495}, {117053, 120239, 123425}, {132983, 136169, 139355}},
{{137160, 141480, 145800}, {158760, 163080, 167400}, {180360, 184680, 189000}}}}});
std::shared_ptr<Tensor> expectedOutput3 = std::make_shared<Tensor>(Array4D<int, 2, 3, 3, 3>{
{{{{214731, 246591, 278451}, {374031, 405891, 437751}, {533331, 565191, 597051}},
{{496804, 570568, 644332}, {865624, 939388, 1013152}, {1234444, 1308208, 1381972}},
{{778877, 894545, 1010213}, {1357217, 1472885, 1588553}, {1935557, 2051225, 2166893}}},
{{{1011231, 1043091, 1074951}, {1170531, 1202391, 1234251}, {1329831, 1361691, 1393551}},
{{2340904, 2414668, 2488432}, {2709724, 2783488, 2857252}, {3078544, 3152308, 3226072}},
{{3670577, 3786245, 3901913}, {4248917, 4364585, 4480253}, {4827257, 4942925, 5058593}}}}});
Tensor expectedOutput4 = Array2D<int, 2, 5>{
{{205050376, 198925904, 181355097, 196978090, 238868348},
{598467376, 561797804, 560823897, 593043790, 698672948}}};
std::shared_ptr<Tensor> other1 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv1")->getOperator())->getOutput(0);
bool equal1 = (*other1 == *expectedOutput1);
REQUIRE(equal1);
std::shared_ptr<Tensor> other2 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv2")->getOperator())->getOutput(0);
bool equal2 = (*other2 == *expectedOutput2);
REQUIRE(equal2);
std::shared_ptr<Tensor> other3 = std::static_pointer_cast<OperatorTensor>(g->getNode("conv3")->getOperator())->getOutput(0);
bool equal3 = (*other3 == *expectedOutput3);
REQUIRE(equal3);
std::shared_ptr<Tensor> other4 = std::static_pointer_cast<OperatorTensor>(g->getNode("fc")->getOperator())->getOutput(0);
bool equal4 = (*other4 == expectedOutput4);
REQUIRE(equal4);
}
}
// Checks that a backward pass over a simple ReLU -> Sqrt -> ReLU chain runs
// without throwing once a gradient has been set on the graph output.
TEST_CASE("[cpu/scheduler] SequentialScheduler(backward)", "[scheduler][backward]") {
// create GraphView
// Fixed node-name typo: "srqt0" -> "sqrt0" (the name is only a label here,
// it is never looked up elsewhere in this test).
std::shared_ptr<GraphView> gv = Sequential({ReLU("relu0"), Sqrt("sqrt0"), ReLU("relu1")});
std::shared_ptr<Tensor> inputTensor =
std::make_shared<Tensor>(Array4D<float, 2, 1, 5, 5>{{{{{0.0f, 1.0f, 2.0f, 3.0f, 4.0f},
{5.0f, 6.0f, 7.0f, 8.0f, 9.0f},
{10.0f, 11.0f, 12.0f, 13.0f, 14.0f},
{15.0f, 16.0f, 17.0f, 18.0f, 19.0f},
{20.0f, 21.0f, 22.0f, 23.0f, 24.0f}}},
{{{25.0f, 26.0f, 27.0f, 28.0f, 29.0f},
{30.0f, 31.0f, 32.0f, 33.0f, 34.0f},
{35.0f, 36.0f, 37.0f, 38.0f, 39.0f},
{40.0f, 41.0f, 42.0f, 43.0f, 44.0f},
{45.0f, 46.0f, 47.0f, 48.0f, 49.0f}}}}});
// implem already set to default
auto myProd = Producer(inputTensor, "prod");
myProd -> addChild(gv);
gv -> compile("cpu", DataType::Float32);
// Run the forward pass to populate the output before requesting gradients.
SequentialScheduler scheduler(gv);
scheduler.forward();
auto outNode = gv->getOrderedOutputs()[0].first;
std::shared_ptr<Tensor> predictedOutput = std::dynamic_pointer_cast<OperatorTensor>(outNode->getOperator())->getOutput(0);
// Gradient seed for the output node; values are arbitrary but well-formed.
std::shared_ptr<Tensor> targetOutput =
std::make_shared<Tensor>(Array4D<float, 2, 1, 5, 5>{{{{{0.0f, 1.0f, 1.0f, 2.0f, 2.0f},
{2.0f, 2.0f, 3.0f, 3.0f, 3.0f},
{3.0f, 3.0f, 3.0f, 4.0f, 4.0f},
{4.0f, 4.0f, 4.0f, 4.0f, 4.0f},
{4.0f, 5.0f, 5.0f, 5.0f, 5.0f}}},
{{{5.0f, 5.0f, 5.0f, 5.0f, 5.0f},
{5.0f, 6.0f, 6.0f, 6.0f, 6.0f},
{6.0f, 6.0f, 6.0f, 6.0f, 6.0f},
{6.0f, 6.0f, 6.0f, 7.0f, 7.0f},
{7.0f, 7.0f, 7.0f, 7.0f, 7.0f}}}}});
predictedOutput->setGrad(targetOutput);
REQUIRE_NOTHROW(scheduler.backward());
}
std::shared_ptr<Node> Accumulate(int seqLength, const std::string& name) {
auto input = Identity((!name.empty()) ? name + "_input" : "");
auto hiddenState = Memorize(seqLength, (!name.empty()) ? name + "_hidden_state" : "");
auto add = Add((!name.empty()) ? name + "_add" : "");
input->addChild(add, 0, 0);
add->addChild(hiddenState, 0,0);
hiddenState->addChild(/*otherNode=*/add, /*outId=*/1, /*otherInId=*/1);
std::shared_ptr<GraphView> microGraph = std::make_shared<GraphView>();
microGraph->add(input);
microGraph->add({hiddenState, add});
microGraph->setOrderedInputs({{input, 0}, {hiddenState, 1}});
microGraph->setOrderedOutputs({{hiddenState, 0}});
auto metaOp = MetaOperator("Accumulate", microGraph, {}, name);
return metaOp;
}
TEST_CASE("[cpu/scheduler] Accumulate", "[scheduler]") {
    // Two time steps of a 3x2 sequence to be summed element-wise.
    std::shared_ptr<Tensor> sequence = std::make_shared<Tensor>(
        Array3D<float, 2, 3, 2>{{{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}},
                                 {{2.0, 3.0}, {4.0, 5.0}, {6.0, 7.0}}}});
    // Zero-initialised accumulator state.
    std::shared_ptr<Tensor> initialState =
        std::make_shared<Tensor>(Array2D<float, 3, 2>{
            {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}});

    auto meta = Accumulate(2, "accumulate");
    auto metaOp = std::static_pointer_cast<MetaOperator_Op>(meta->getOperator());

    auto popInput = Pop("pop_input");
    // NOTE: any pass-through node (Identity/Stack/...) would do here.
    auto passOutput = Identity("pop_output");

    popInput->getOperator()->associateInput(0, sequence);

    // Wire directly into/out of the meta-operator's micro-graph rather than
    // through the `meta` node itself (the alternative being exercised here).
    popInput->addChild(metaOp->getMicroGraph()->getOrderedInputs()[0].first, 0, 0);
    metaOp->getMicroGraph()->getOrderedOutputs()[0].first->addChild(passOutput, 0, 0);
    // Seed the Memorize cell's state input with the zero tensor.
    metaOp->getMicroGraph()->getNode("accumulate_hidden_state")->getOperator()->associateInput(1, initialState);

    // Assemble the enclosing graph (micro-graph included explicitly).
    auto graph = std::make_shared<GraphView>();
    graph->add(popInput);
    graph->add(metaOp->getMicroGraph());
    graph->add(passOutput);
    graph->compile("cpu", DataType::Float32);
    graph->save("accumulate_graph", true);

    // Schedule and execute.
    auto scheduler = SequentialScheduler(graph);
    scheduler.generateScheduling();
    scheduler.saveStaticSchedulingDiagram("accumulate_scheduling");
    REQUIRE_NOTHROW(scheduler.forward(true));

    // Expected running sum over the two steps: step0 + step1.
    std::shared_ptr<Tensor> expected = std::make_shared<Tensor>(
        Array2D<float, 3, 2>{{{3.0, 5.0}, {7.0, 9.0}, {11.0, 13.0}}});
    std::shared_ptr<Tensor> result =
        std::static_pointer_cast<OperatorTensor>(passOutput->getOperator())->getOutput(0);
    REQUIRE(*result == *expected);
}
} // namespace Aidge
0.0.1
\ No newline at end of file
0.5.0