Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • mszczep/aidge_backend_cpu
  • eclipse/aidge/aidge_backend_cpu
  • hrouis/aidge_backend_cpu
  • oantoni/aidge_backend_cpu
  • raphaelmillet/aidge_backend_cpu
  • cguillon/aidge_backend_cpu
  • jeromeh/aidge_backend_cpu
  • axelfarr/aidge_backend_cpu
  • noamzerah/aidge_backend_cpu
  • silvanosky/aidge_backend_cpu
  • maab05/aidge_backend_cpu
  • lucaslopez/aidge_backend_cpu_ll
  • farnez/aidge_backend_cpu
  • mick94/aidge_backend_cpu
14 results
Show changes
Showing
with 5816 additions and 86 deletions
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <chrono> // std::micro, std::chrono::time_point,
// std::chrono::system_clock
#include <cstddef> // std::size_t
#include <cstdint> // std::uint16_t
#include <functional> // std::multiplies
#include <memory>
#include <numeric> // std::accumulate
#include <random> // std::random_device, std::mt19937
// std::uniform_int_distribution, std::uniform_real_distribution
#include <vector>
#include <catch2/catch_test_macros.hpp>
#include <fmt/core.h>
#include "aidge/backend/cpu/data/TensorImpl.hpp"
#include "aidge/backend/cpu/operator/DivImpl.hpp"
#include "aidge/data/Data.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/Div.hpp"
#include "aidge/operator/OperatorTensor.hpp"
#include "aidge/utils/TensorUtils.hpp"
namespace Aidge {
// Unit tests for the CPU implementation of the element-wise Div operator.
// Three scenarios are exercised with randomly generated tensors:
//   1. both operands share the same shape,
//   2. 4-D / 4-D broadcasting (random dims replaced by 1 on either side),
//   3. 4-D / lower-rank right operand (1 to 3 leading dims removed).
// Reference results are computed by hand with explicit stride arithmetic
// and compared against the operator output with approxEq.
TEST_CASE("[cpu/operator] Div", "[Div][CPU]") {
constexpr std::uint16_t NBTRIALS = 10;
// Create a random number generator
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // values kept away from 0 so divisions stay well-conditioned
std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
std::uniform_int_distribution<int> boolDist(0,1);
// Create Div Operator (an earlier revision of this comment said "MatMul")
std::shared_ptr<Node> myDiv = Div();
auto op = std::static_pointer_cast<OperatorTensor>(myDiv-> getOperator());
op->setDataType(DataType::Float32);
op->setBackend("cpu");
// Create 2 input Tensors
std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>();
op->associateInput(0,T0);
T0->setDataType(DataType::Float32);
T0->setBackend("cpu");
std::shared_ptr<Tensor> T1 = std::make_shared<Tensor>();
op -> associateInput(1,T1);
T1->setDataType(DataType::Float32);
T1->setBackend("cpu");
// Create results Tensor
std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>();
Tres->setDataType(DataType::Float32);
Tres->setBackend("cpu");
// To measure execution time of 'Div_Op::forward()' member function call
std::chrono::time_point<std::chrono::system_clock> start;
std::chrono::time_point<std::chrono::system_clock> end;
std::chrono::duration<double, std::micro> duration{};
SECTION("DivImpl_cpu::forward()") {
// Scalar cases are placeholders: not implemented yet.
SECTION("Scalar / Scalar") {
}
SECTION("Scalar / +1-D Tensor") {
}
SECTION("+1-D Tensor / +1-D Tensor - same dimensions") {
std::size_t number_of_operation = 0;
for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
// generate 2 random Tensors
const std::size_t nbDims = nbDimsDist(gen);
std::vector<std::size_t> dims;
for (std::size_t i = 0; i < nbDims; ++i) {
dims.push_back(dimSizeDist(gen));
}
const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
number_of_operation += nb_elements;
// without broadcasting: inputs and expected output all share `dims`
float* array0 = new float[nb_elements];
float* array1 = new float[nb_elements];
float* result = new float[nb_elements];
for (std::size_t i = 0; i < nb_elements; ++i) {
array0[i] = valueDist(gen);
array1[i] = valueDist(gen);
result[i] = array0[i] / array1[i];
}
// input0
// NOTE(review): the raw buffers are handed to the tensor impls and freed
// right after the comparison below -- assumes setRawPtr copies the data or
// that the tensors are not read afterwards; TODO confirm ownership semantics.
T0->resize(dims);
T0 -> getImpl() -> setRawPtr(array0, nb_elements);
// input1
T1->resize(dims);
T1 -> getImpl() -> setRawPtr(array1, nb_elements);
// results
Tres->resize(dims);
Tres -> getImpl() -> setRawPtr(result, nb_elements);
op->forwardDims();
start = std::chrono::system_clock::now();
myDiv->forward();
end = std::chrono::system_clock::now();
duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
delete[] array0;
delete[] array1;
delete[] result;
// with broadcasting
}
Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
Log::info("total time: {} μs\n", duration.count());
}
SECTION("+1-D Tensor / +1-D Tensor - broadcasting") {
std::size_t number_of_operation = 0;
for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
// generate 2 random Tensors
// handle dimensions, replace some dimensions with '1' to get broadcasting
constexpr std::size_t nbDims = 4;
std::vector<std::size_t> dims;
for (std::size_t i = 0; i < nbDims; ++i) {
dims.push_back(dimSizeDist(gen));
}
std::vector<std::size_t> dims0 = dims;
std::vector<std::size_t> dims1 = dims;
std::vector<std::size_t> dimsOut = dims;
for (std::size_t i = 0; i < nbDims; ++i) {
if (boolDist(gen)) {
dims0[i] = 1;
}
if (boolDist(gen)) {
dims1[i] = 1;
}
// the output keeps the non-1 extent (both inputs agree when neither is 1)
dimsOut[i] = (dims0[i] == 1) ? dims1[i] : dims0[i];
}
// create arrays and fill them with random values
float* array0 = new float[dims0[0]*dims0[1]*dims0[2]*dims0[3]];
float* array1 = new float[dims1[0]*dims1[1]*dims1[2]*dims1[3]];
float* result = new float[dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]];
for (std::size_t i = 0; i < dims0[0]*dims0[1]*dims0[2]*dims0[3]; ++i) {
array0[i] = valueDist(gen);
}
for (std::size_t i = 0; i < dims1[0]*dims1[1]*dims1[2]*dims1[3]; ++i) {
array1[i] = valueDist(gen);
}
// compute true result
// row-major strides; a broadcast (size-1) axis contributes index 0 below
const std::size_t strides0[nbDims] = {dims0[1]*dims0[2]*dims0[3], dims0[2]*dims0[3], dims0[3], 1};
const std::size_t strides1[nbDims] = {dims1[1]*dims1[2]*dims1[3], dims1[2]*dims1[3], dims1[3], 1};
for (std::size_t a = 0; a < dimsOut[0]; ++a) {
for (std::size_t b = 0; b < dimsOut[1]; ++b) {
// partial flat offsets contributed by the two outer axes
const std::size_t idx0_0 = strides0[0] * ((dims0[0] > 1) ? a : 0)
+ strides0[1] * ((dims0[1] > 1) ? b : 0);
const std::size_t idx1_0 = strides1[0] * ((dims1[0] > 1) ? a : 0)
+ strides1[1] * ((dims1[1] > 1) ? b : 0);
for (std::size_t c = 0; c < dimsOut[2]; ++c) {
const std::size_t idx_out = dimsOut[3] * (c + dimsOut[2] * (b + dimsOut[1] * a));
for (std::size_t d = 0; d < dimsOut[3]; ++d) {
std::size_t idx0 = idx0_0
+ strides0[2] * ((dims0[2] > 1) ? c : 0)
+ ((dims0[3] > 1) ? d : 0);
std::size_t idx1 = idx1_0
+ strides1[2] * ((dims1[2] > 1) ? c : 0)
+ ((dims1[3] > 1) ? d : 0);
result[idx_out + d] = array0[idx0] / array1[idx1];
// std::cout << "(" << idx0 << ", " << idx1 << ") -> " << array0[idx0] << " / " << array1[idx1] << " -> " << idx_out + d << std::endl;
}
}
}
}
// conversion to Aidge::Tensors
// input0
T0->resize(dims0);
T0 -> getImpl() -> setRawPtr(array0, dims0[0]*dims0[1]*dims0[2]*dims0[3]);
// input1
T1->resize(dims1);
T1 -> getImpl() -> setRawPtr(array1, dims1[0]*dims1[1]*dims1[2]*dims1[3]);
// results
Tres->resize(dimsOut);
Tres -> getImpl() -> setRawPtr(result, dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]);
// compute result
op->forwardDims();
start = std::chrono::system_clock::now();
myDiv->forward();
end = std::chrono::system_clock::now();
duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
// comparison between truth and computed result
REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
delete[] array0;
delete[] array1;
delete[] result;
const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
number_of_operation += nb_elements;
}
Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
Log::info("total time: {} μs\n", duration.count());
}
SECTION("+1-D Tensor / 1-D Tensor") {
std::size_t number_of_operation = 0;
std::uniform_int_distribution<std::size_t> nbRemovedDimsDist(std::size_t(1), std::size_t(3));
for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
// generate 2 random Tensors
// handle dimensions
constexpr std::size_t nbDims = 4;
std::vector<std::size_t> dims0(4);
for (std::size_t i = 0; i < nbDims; ++i) {
dims0[i] = dimSizeDist(gen);
}
std::vector<std::size_t> dimsOut = dims0;
std::vector<std::size_t> dims1 = dims0;
for (std::size_t i = 0; i < nbDims; ++i) {
if (boolDist(gen)) {
dims1[i] = 1;
}
}
// drop 1 to 3 leading dims from the right operand so it has a lower rank
dims1.erase(dims1.cbegin(), dims1.cbegin() + nbRemovedDimsDist(gen));
// create arrays and fill them with random values
float* array0 = new float[dims0[0]*dims0[1]*dims0[2]*dims0[3]];
std::size_t array1_size = std::accumulate(dims1.cbegin(), dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
float* array1 = new float[array1_size];
float* result = new float[dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]];
for (std::size_t i = 0; i < (dims0[0]*dims0[1]*dims0[2]*dims0[3]); ++i) {
array0[i] = valueDist(gen);
}
for (std::size_t i = 0; i < array1_size; ++i) {
array1[i] = valueDist(gen);
}
// compute true result
// left-pad dims1 with 1s back to rank 4 (numpy-style broadcast alignment)
auto dims1_tmp = dims1;
dims1_tmp.insert(dims1_tmp.cbegin(), 4 - dims1_tmp.size(), std::size_t(1));
const std::size_t strides0[nbDims] = {dims0[1]*dims0[2]*dims0[3], dims0[2]*dims0[3], dims0[3], 1};
const std::size_t strides1[nbDims] = {dims1_tmp[1]*dims1_tmp[2]*dims1_tmp[3], dims1_tmp[2]*dims1_tmp[3], dims1_tmp[3], 1};
for (std::size_t a = 0; a < dimsOut[0]; ++a) {
for (std::size_t b = 0; b < dimsOut[1]; ++b) {
const std::size_t idx0_0 = strides0[0] * ((dims0[0] > 1) ? a : 0)
+ strides0[1] * ((dims0[1] > 1) ? b : 0);
const std::size_t idx1_0 = strides1[0] * ((dims1_tmp[0] > 1) ? a : 0)
+ strides1[1] * ((dims1_tmp[1] > 1) ? b : 0);
for (std::size_t c = 0; c < dimsOut[2]; ++c) {
const std::size_t idx_out = dimsOut[3] * (c + dimsOut[2] * (b + dimsOut[1] * a));
for (std::size_t d = 0; d < dimsOut[3]; ++d) {
std::size_t idx0 = idx0_0
+ strides0[2] * ((dims0[2] > 1) ? c : 0)
+ ((dims0[3] > 1) ? d : 0);
std::size_t idx1 = idx1_0
+ strides1[2] * ((dims1_tmp[2] > 1) ? c : 0)
+ ((dims1_tmp[3] > 1) ? d : 0);
result[idx_out + d] = array0[idx0] / array1[idx1];
// std::cout << "(" << idx0 << ", " << idx1 << ") -> " << array0[idx0] << " / " << array1[idx1] << " -> " << idx_out + d << std::endl;
}
}
}
}
// conversion to Aidge::Tensors
// input0
T0->resize(dims0);
T0 -> getImpl() -> setRawPtr(array0, dims0[0]*dims0[1]*dims0[2]*dims0[3]);
// input1
T1->resize(dims1);
T1 -> getImpl() -> setRawPtr(array1, array1_size);
// results
Tres->resize(dimsOut);
Tres -> getImpl() -> setRawPtr(result, dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]);
// compute result
op->forwardDims();
start = std::chrono::system_clock::now();
myDiv->forward();
end = std::chrono::system_clock::now();
duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
// comparison between truth and computed result
REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
delete[] array0;
delete[] array1;
delete[] result;
const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
number_of_operation += nb_elements;
}
Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
Log::info("total time: {} μs\n", duration.count());
}
}
}
} // namespace Aidge
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <memory>
#include <catch2/catch_test_macros.hpp>
#include "aidge/backend/cpu/operator/ErfImpl.hpp"
#include "aidge/data/DataType.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/Erf.hpp"
#include "aidge/utils/ArrayHelpers.hpp"
#include "aidge/utils/TensorUtils.hpp"
using namespace Aidge;
// Forward-pass checks for the CPU Erf operator: the output is compared
// element-wise against precomputed erf() reference values with approxEq.
TEST_CASE("[cpu/operator] Erf(forward)") {
    SECTION("1D Tensor") {
        auto inTensor = std::make_shared<Tensor>(Array1D<float, 10>{
            {0.41384590, 0.43120754, 0.93762982, 0.31049860, 0.77547199, 0.09514862,
             0.16145366, 0.42776686, 0.43487436, 0.41170865}});
        // erf() of each input value, precomputed offline.
        Tensor reference = Array1D<float, 10>{
            {0.44163144, 0.45801866, 0.81516320, 0.33941913, 0.72722000, 0.10704061,
             0.18061027, 0.45479023, 0.46144873, 0.43959764}};

        auto erfOp = std::make_shared<Erf_Op>();
        erfOp->associateInput(0, inTensor);
        erfOp->setDataType(DataType::Float32);
        erfOp->setBackend("cpu");
        erfOp->forward();

        REQUIRE(approxEq<float>(*(erfOp->getOutput(0)), reference, 1e-5f, 1e-8f));
    }
    SECTION("3D Tensor") {
        auto inTensor = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>{
            {
                {
                    {0.97037154, 0.86208081, 0.77767169},
                    {0.38160080, 0.11422747, 0.77284443},
                },
                {
                    {0.51592529, 0.72543722, 0.54641193},
                    {0.93866944, 0.97767913, 0.34172094}
                }
            }});
        // erf() of each input value, precomputed offline.
        Tensor reference = Array3D<float, 2, 2, 3>{
            {
                {
                    {0.83003384, 0.77721894, 0.72857803},
                    {0.41057193, 0.12833349, 0.72559172},
                },
                {
                    {0.53438270, 0.69507217, 0.56032562},
                    {0.81564975, 0.83322692, 0.37109339}
                }
            }};

        auto erfOp = std::make_shared<Erf_Op>();
        erfOp->associateInput(0, inTensor);
        erfOp->setDataType(DataType::Float32);
        erfOp->setBackend("cpu");
        erfOp->forward();

        REQUIRE(approxEq<float>(*(erfOp->getOutput(0)), reference, 1e-5f, 1e-8f));
    }
}
\ No newline at end of file
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <memory>
#include <catch2/catch_test_macros.hpp>
#include "aidge/backend/cpu/data/TensorImpl.hpp"
#include "aidge/backend/cpu/operator/ExpandImpl.hpp"
#include "aidge/data/DataType.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/Expand.hpp"
#include "aidge/utils/ArrayHelpers.hpp"
using std::shared_ptr;
using namespace Aidge;
// Wire the two inputs of an Expand operator for a test: the data tensor goes
// to slot 0 and the target-shape tensor to slot 1. Both tensors are moved to
// the CPU backend and the operator output adopts the data tensor's dtype.
void setupTestExpand(shared_ptr<Tensor> inputData,
                     shared_ptr<Tensor> inputShape,
                     shared_ptr<Expand_Op> &op) {
    op->getOutput(0)->setDataType(inputData->dataType());

    inputData->setBackend("cpu");
    inputShape->setBackend("cpu");

    op->associateInput(0, inputData);
    op->associateInput(1, inputShape);
}
// Forward-pass checks for the CPU Expand operator: the output is inputData
// (slot 0) broadcast to the shape held in the int64 shape tensor (slot 1).
// forwardDims() itself is covered by core tests; here we only verify that
// forward() fills the output with the expected broadcast values.
// Fix: removed a dead, default-constructed `inputData` local that trailed
// the last SECTION and was never used.
TEST_CASE("[cpu/operator] Expand(forward)", "[Expand][CPU]") {
    std::shared_ptr<Expand_Op> op = std::make_shared<Expand_Op>();
    op->setBackend("cpu");

    SECTION("Expand shape is bigger than inputData") {
        auto inputData = std::make_shared<Tensor>(Array1D<int, 2>({1, 3}));
        auto inputShape =
            std::make_shared<Tensor>(Array1D<std::int64_t, 4>({1, 3, 4, 2}));
        // the 2-element row {1, 3} is replicated across the broadcast dims
        Tensor expectedOutput =
            Array4D<cpptype_t<DataType::Int32>, 1, 3, 4, 2>({{{{{1, 3}, {1, 3}, {1, 3}, {1, 3}},
                                                               {{1, 3}, {1, 3}, {1, 3}, {1, 3}},
                                                               {{1, 3}, {1, 3}, {1, 3}, {1, 3}}}}});
        setupTestExpand(inputData, inputShape, op);
        // forwardDims has already been tested in core
        CHECK(op->forwardDims(true));
        REQUIRE_NOTHROW(op->forward());
        REQUIRE(expectedOutput == *op->getOutput(0));
    }
    SECTION("Expand shape has less dimensions than inputData") {
        auto inputData = std::make_shared<Tensor>(
            Array3D<int, 2, 1, 3>({{{2, 1, 3}, {2, 1, 3}}}));
        auto inputShape =
            std::make_shared<Tensor>(Array1D<std::int64_t, 2>({2, 3}));
        Tensor expectedOutput = Array3D<cpptype_t<DataType::Int32>, 2, 2, 3>(
            {{{{2, 1, 3}, {2, 1, 3}}, {{2, 1, 3}, {2, 1, 3}}}});
        setupTestExpand(inputData, inputShape, op);
        // forwardDims has already been tested in core
        CHECK(op->forwardDims(true));
        REQUIRE_NOTHROW(op->forward());
        REQUIRE(expectedOutput == *op->getOutput(0));
    }
    SECTION("Expand shape = {1} leads to input equal to output.") {
        auto inputData = std::make_shared<Tensor>(
            Array4D<int, 2, 1, 3, 1>({{{2, 1, 3}, {2, 1, 3}}}));
        auto inputShape =
            std::make_shared<Tensor>(Array1D<std::int64_t, 1>({1}));
        Tensor expectedOutput =
            Array4D<cpptype_t<DataType::Int32>, 2, 1, 3, 1>({{{2, 1, 3}, {2, 1, 3}}});
        setupTestExpand(inputData, inputShape, op);
        // forwardDims has already been tested in core
        CHECK(op->forwardDims(true));
        REQUIRE_NOTHROW(op->forward());
        REQUIRE(expectedOutput == *op->getOutput(0));
    }
    SECTION("The only common dimension is the last one & its equal to 1") {
        auto inputData = std::make_shared<Tensor>(
            Array4D<int, 1, 1, 3, 1>({{{{2, 1, 3}}}}));
        auto inputShape =
            std::make_shared<Tensor>(Array1D<std::int64_t, 3>({2, 1, 1}));
        Tensor expectedOutput =
            Array4D<cpptype_t<DataType::Int32>, 1, 2, 3, 1>({{{{2, 1, 3}, {2, 1, 3}}}});
        setupTestExpand(inputData, inputShape, op);
        // forwardDims has already been tested in core
        CHECK(op->forwardDims(true));
        REQUIRE_NOTHROW(op->forward());
        REQUIRE(expectedOutput == *op->getOutput(0));
    }
    // Placeholder: not implemented yet.
    SECTION("N-Dim to N-Dim") {}
}
......@@ -9,17 +9,20 @@
*
********************************************************************************/
#include <catch2/catch_test_macros.hpp>
#include <memory>
#include <catch2/catch_test_macros.hpp>
#include "aidge/backend/cpu/data/TensorImpl.hpp"
#include "aidge/backend/cpu/operator/FCImpl.hpp"
#include "aidge/data/DataType.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/FC.hpp"
#include "aidge/backend/cpu.hpp"
#include "aidge/utils/ArrayHelpers.hpp"
using namespace Aidge;
TEST_CASE("[cpu/oeprator] FC(forward)") {
TEST_CASE("[cpu/oeprator] FC(forward)", "[FC][CPU]") {
std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array2D<int, 5, 75>{
{{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4,
5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8,
......@@ -42,14 +45,15 @@ TEST_CASE("[cpu/oeprator] FC(forward)") {
9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}}});
std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<int, 5>{{1, 2, 3, 4, 5}});
std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<int, 2, 5>{
{{23601, 23602, 23603, 23604, 23605}, {68601, 68602, 68603, 68604, 68605}}});
Tensor myOutput = Array2D<int, 2, 5>{
{{23601, 23602, 23603, 23604, 23605}, {68601, 68602, 68603, 68604, 68605}}};
std::shared_ptr<Node> myFC = FC(5, false, "myfc");
myFC->getOperator()->setDatatype(DataType::Int32);
myFC->getOperator()->setBackend("cpu");
myFC->getOperator()->associateInput(1, myWeights);
myFC->getOperator()->associateInput(2, myBias);
std::shared_ptr<Node> myFC = FC(75, 5, false, "myfc");
auto op = std::static_pointer_cast<FC_Op>(myFC -> getOperator());
op -> setDataType(DataType::Int32);
op -> setBackend("cpu");
op -> associateInput(1, myWeights);
op -> associateInput(2, myBias);
SECTION("2D input") {
std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<int, 2, 75>{
......@@ -62,10 +66,9 @@ TEST_CASE("[cpu/oeprator] FC(forward)") {
105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149}}});
myFC->getOperator()->associateInput(0, myInput);
myFC->getOperator()->computeOutputDims();
op->associateInput(0, myInput);
myFC->forward();
REQUIRE(*std::static_pointer_cast<Tensor>(myFC->getOperator()->getOutput(0)) == *myOutput);
REQUIRE(*(op->getOutput(0)) == myOutput);
}
SECTION("4D input") {
std::shared_ptr<Tensor> myInput =
......@@ -99,10 +102,9 @@ TEST_CASE("[cpu/oeprator] FC(forward)") {
{135, 136, 137, 138, 139},
{140, 141, 142, 143, 144},
{145, 146, 147, 148, 149}}}}});
myFC->getOperator()->associateInput(0, myInput);
myFC->getOperator()->computeOutputDims();
op->associateInput(0, myInput);
myFC->forward();
REQUIRE(*std::static_pointer_cast<Tensor>(myFC->getOperator()->getOutput(0)) == *myOutput);
REQUIRE(*(op->getOutput(0)) == myOutput);
}
// std::cout << static_cast<Tensor>((*myFC->getOperator())["weight"])[0][0][0][0] << std::endl;
......
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <catch2/catch_test_macros.hpp>
#include <cstdlib>
#include <memory>
#include "aidge/backend/cpu/data/TensorImpl.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/graph/GraphView.hpp"
#include "aidge/scheduler/SequentialScheduler.hpp"
#include "aidge/operator/Fold.hpp"
#include "aidge/operator/Unfold.hpp"
#include "aidge/operator/MatMul.hpp"
#include "aidge/operator/Reshape.hpp"
using namespace Aidge;
// End-to-end forward test for the CPU Fold operator inside the graph
// Unfold -> (Reshape'd weights) MatMul -> Fold, run by the sequential
// scheduler. NOTE(review): this im2col/matmul/col2im pipeline looks like it
// reproduces a 3x3 convolution (weights 4x3x3x3 on a 2x3x5x5 input giving
// 2x4x3x3) -- the expected values below are consistent with that, but
// confirm against the Conv reference before relying on it.
TEST_CASE("[cpu/operator] Fold(forward)", "[Fold][CPU]") {
// Graph wiring: Unfold output feeds MatMul input 1 (patches), the reshaped
// weights feed MatMul input 0, and the MatMul result is folded back.
std::shared_ptr<Node> myUnfold = Unfold({3,3}, "myunfold");
std::shared_ptr<Node> myReshape = Reshape({4, 27}, "myreshape"); // 4x3x3x3 weights flattened to 4x27
std::shared_ptr<Node> myMatMul = MatMul("mymatmul");
std::shared_ptr<Node> myFold = Fold({3,3}, {1,1}, "myfold");
myUnfold->addChild(myMatMul, 0, 1);
myReshape->addChild(myMatMul, 0, 0);
myMatMul->addChild(myFold, 0, 0);
// Weights: 4 output channels x 3 input channels x 3x3 kernel, values 0..107.
std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array4D<int,4,3,3,3> {
{
{
{{ 0, 1, 2},
{ 3, 4, 5},
{ 6, 7, 8}},
{{ 9, 10, 11},
{ 12, 13, 14},
{ 15, 16, 17}},
{{ 18, 19, 20},
{ 21, 22, 23},
{ 24, 25, 26}}
},
{
{{ 27, 28, 29},
{ 30, 31, 32},
{ 33, 34, 35}},
{{ 36, 37, 38},
{ 39, 40, 41},
{ 42, 43, 44}},
{{ 45, 46, 47},
{ 48, 49, 50},
{ 51, 52, 53}}
},
{
{{ 54, 55, 56},
{ 57, 58, 59},
{ 60, 61, 62}},
{{ 63, 64, 65},
{ 66, 67, 68},
{ 69, 70, 71}},
{{ 72, 73, 74},
{ 75, 76, 77},
{ 78, 79, 80}}
},
{
{{ 81, 82, 83},
{ 84, 85, 86},
{ 87, 88, 89}},
{{ 90, 91, 92},
{ 93, 94, 95},
{ 96, 97, 98}},
{{ 99, 100, 101},
{102, 103, 104},
{105, 106, 107}}
}
}
});
// Input: batch of 2, 3 channels, 5x5 spatial, values 0..149.
std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<int,2,3,5,5> { //NCHW
{
{
{{ 0, 1, 2, 3, 4},
{ 5, 6, 7, 8, 9},
{ 10, 11, 12, 13, 14},
{ 15, 16, 17, 18, 19},
{ 20, 21, 22, 23, 24}},
{{ 25, 26, 27, 28, 29},
{ 30, 31, 32, 33, 34},
{ 35, 36, 37, 38, 39},
{ 40, 41, 42, 43, 44},
{ 45, 46, 47, 48, 49}},
{{ 50, 51, 52, 53, 54},
{ 55, 56, 57, 58, 59},
{ 60, 61, 62, 63, 64},
{ 65, 66, 67, 68, 69},
{ 70, 71, 72, 73, 74}}
},
{
{{ 75, 76, 77, 78, 79},
{ 80, 81, 82, 83, 84},
{ 85, 86, 87, 88, 89},
{ 90, 91, 92, 93, 94},
{ 95, 96, 97, 98, 99}},
{{100, 101, 102, 103, 104},
{105, 106, 107, 108, 109},
{110, 111, 112, 113, 114},
{115, 116, 117, 118, 119},
{120, 121, 122, 123, 124}},
{{125, 126, 127, 128, 129},
{130, 131, 132, 133, 134},
{135, 136, 137, 138, 139},
{140, 141, 142, 143, 144},
{145, 146, 147, 148, 149}}
}
}
});
// Expected folded output: 2 batches x 4 channels x 3x3 spatial.
std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<int,2,4,3,3> {
{
{
{{ 15219, 15570, 15921},
{ 16974, 17325, 17676},
{ 18729, 19080, 19431}},
{{ 37818, 38898, 39978},
{ 43218, 44298, 45378},
{ 48618, 49698, 50778}},
{{ 60417, 62226, 64035},
{ 69462, 71271, 73080},
{ 78507, 80316, 82125}},
{{ 83016, 85554, 88092},
{ 95706, 98244, 100782},
{ 108396, 110934, 113472}}
},
{
{{ 41544, 41895, 42246},
{ 43299, 43650, 44001},
{ 45054, 45405, 45756}},
{{ 118818, 119898, 120978},
{ 124218, 125298, 126378},
{ 129618, 130698, 131778}},
{{ 196092, 197901, 199710},
{ 205137, 206946, 208755},
{ 214182, 215991, 217800}},
{{ 273366, 275904, 278442},
{ 286056, 288594, 291132},
{ 298746, 301284, 303822}}
}
}
});
auto opUnfold = std::static_pointer_cast<OperatorTensor>(myUnfold -> getOperator());
auto opReshape = std::static_pointer_cast<OperatorTensor>(myReshape -> getOperator());
auto opMatMul = std::static_pointer_cast<OperatorTensor>(myMatMul -> getOperator());
auto opFold = std::static_pointer_cast<OperatorTensor>(myFold -> getOperator());
opUnfold->associateInput(0,myInput);
opReshape->associateInput(0,myWeights);
// Build the graph from the MatMul node, configure it, and run the scheduler.
auto g = getConnectedGraphView(myMatMul);
g->setDataType(DataType::Int32);
g->setBackend("cpu");
g->forwardDims();
g->save("unfold_matmul_fold"); // side effect: writes the graph dump to disk
SequentialScheduler scheduler(g);
scheduler.forward();
//opFold->getOutput(0)->print();
REQUIRE(*(opFold->getOutput(0)) == *myOutput);
}
\ No newline at end of file
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <chrono>
#include <cstddef> // std::size_t
#include <cstdint> // std::uint16_t
#include <functional> // std::multiplies
#include <memory>
#include <numeric> // std::accumulate
#include <random> // std::random_device, std::mt19937
// std::uniform_int_distribution, std::uniform_real_distribution
#include <vector>
#include <catch2/catch_test_macros.hpp>
#include <fmt/core.h>
#include "aidge/backend/cpu/data/TensorImpl.hpp"
#include "aidge/backend/cpu/operator/GlobalAveragePoolingImpl.hpp"
#include "aidge/data/Data.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/GlobalAveragePooling.hpp"
#include "aidge/utils/TensorUtils.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
TEST_CASE("[cpu/operator] GlobalAveragePooling",
"[GlobalAveragePooling][CPU]") {
constexpr std::uint16_t NBTRIALS = 10;
// Create a random number generator
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_real_distribution<float> valueDist(
0.1f, 1.1f); // Random float distribution between 0 and 1
std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2),
std::size_t(10));
std::uniform_int_distribution<std::size_t> nbLowDimsDist(std::size_t(1),
std::size_t(2));
std::uniform_int_distribution<std::size_t> nbHighDimsDist(std::size_t(3),
std::size_t(7));
// Create MatGlobalAveragePooling Operator
std::shared_ptr<GlobalAveragePooling_Op> op = std::make_shared<GlobalAveragePooling_Op>();
op->setDataType(DataType::Float32);
op->setBackend("cpu");
// Create the input Tensor
std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>();
op->associateInput(0, T0);
T0->setDataType(DataType::Float32);
T0->setBackend("cpu");
// Create results Tensor
std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>();
Tres->setDataType(DataType::Float32);
Tres->setBackend("cpu");
// To measure execution time of 'MatGlobalAveragePooling_Op::forward()' member
// function call
std::chrono::time_point<std::chrono::system_clock> start;
std::chrono::time_point<std::chrono::system_clock> end;
std::chrono::duration<double, std::micro> duration{};
int number_of_operation{0};
SECTION("GlobalAveragePoolingImpl_cpu::forward()") {
SECTION(
"1-2Dim > not enough dimensions leads to function throwing an error") {
// generate a random tensors
const std::size_t nbDims = nbLowDimsDist(gen);
std::vector<std::size_t> dims;
for (std::size_t i = 0; i < nbDims; ++i) {
dims.push_back(dimSizeDist(gen));
}
const std::size_t nb_elements =
std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1),
std::multiplies<std::size_t>());
float *array0 = new float[nb_elements];
for (std::size_t i = 0; i < nb_elements; ++i) {
array0[i] = valueDist(gen);
}
// input0
T0->resize(dims);
T0->getImpl()->setRawPtr(array0, nb_elements);
REQUIRE_THROWS(op->forward());
delete[] array0;
}
SECTION("3+Dim") {
SECTION("Fill a tensor with all values set as N will result with every "
"output being N") {
// generate the tensor
const std::size_t nbDims = nbHighDimsDist(gen);
std::vector<std::size_t> dims_in;
for (std::size_t i = 0; i < nbDims; ++i) {
dims_in.push_back(dimSizeDist(gen));
}
// create in nb_elems
const std::size_t in_nb_elems =
std::accumulate(dims_in.cbegin(), dims_in.cend(), std::size_t(1),
std::multiplies<std::size_t>());
const DimSize_t in_batch_nb_elems = in_nb_elems / dims_in[0];
const DimSize_t in_channel_nb_elems = in_batch_nb_elems / dims_in[1];
number_of_operation +=
in_nb_elems +
dims_in[1]; // averaging per channel : 1 addition per element in
// the channel + 1 division this for every batch
// create out nb_elems
std::vector<std::size_t> dims_out(dims_in.size(), 1);
dims_out[0] = dims_in[0];
dims_out[1] = dims_in[1];
const std::size_t out_nb_elems =
std::accumulate(dims_out.cbegin(), dims_out.cend(), std::size_t(1),
std::multiplies<std::size_t>());
const DimSize_t out_batch_nb_elems = out_nb_elems / dims_out[0];
// iterate over each batch/channel
float *array0 = new float[in_nb_elems];
float *result = new float[out_nb_elems];
float val = valueDist(gen);
for (std::size_t batch = 0; batch < dims_in[0]; ++batch) {
for (std::size_t channel = 0; channel < dims_in[1]; ++channel) {
for (std::size_t i = 0; i < in_channel_nb_elems; ++i)
{
array0[batch * in_batch_nb_elems + channel * in_channel_nb_elems +
i] = val;
}
result[batch * out_batch_nb_elems + channel] = val;
}
}
// input0
T0->resize(dims_in);
T0->getImpl()->setRawPtr(array0, in_nb_elems);
// results
Tres->resize(dims_out);
Tres->getImpl()->setRawPtr(result, out_nb_elems);
op->forwardDims();
start = std::chrono::system_clock::now();
REQUIRE_NOTHROW(op->forward());
end = std::chrono::system_clock::now();
duration +=
std::chrono::duration_cast<std::chrono::microseconds>(end - start);
REQUIRE(Tres->nbDims() == op->getOutput(0)->nbDims());
for (DimSize_t i = 0; i < op->getOutput(0)->nbDims(); ++i) {
REQUIRE(Tres->dims().at(i) == op->getOutput(0)->dims().at(i));
}
REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
delete[] array0;
delete[] result;
}
SECTION("random testing") {
for (int trial = 0; trial < NBTRIALS; ++trial) {
// generate the tensor
const std::size_t nbDims = nbHighDimsDist(gen);
std::vector<std::size_t> dims_in;
for (std::size_t i = 0; i < nbDims; ++i) {
dims_in.push_back(dimSizeDist(gen));
}
// create in nb_elems
const std::size_t in_nb_elems =
std::accumulate(dims_in.cbegin(), dims_in.cend(), std::size_t(1),
std::multiplies<std::size_t>());
const DimSize_t in_batch_nb_elems = in_nb_elems / dims_in[0];
const DimSize_t in_channel_nb_elems = in_batch_nb_elems / dims_in[1];
number_of_operation +=
in_nb_elems +
dims_in[1]; // averaging per channel : 1 addition per element in
// the channel + 1 division this for every batch
// create out nb_elems
std::vector<std::size_t> dims_out(dims_in.size(), 1);
dims_out[0] = dims_in[0];
dims_out[1] = dims_in[1];
const std::size_t out_nb_elems =
std::accumulate(dims_out.cbegin(), dims_out.cend(),
std::size_t(1), std::multiplies<std::size_t>());
const DimSize_t out_batch_nb_elems = out_nb_elems / dims_out[0];
// iterate over each batch/channel
float *array0 = new float[in_nb_elems];
float *result = new float[out_nb_elems];
for (std::size_t batch = 0; batch < dims_in[0]; ++batch) {
for (std::size_t channel = 0; channel < dims_in[1]; ++channel) {
float channel_sum = 0;
for (std::size_t i = 0; i < in_channel_nb_elems; ++i)
{
float val = valueDist(gen);
array0[batch * in_batch_nb_elems +
channel * in_channel_nb_elems + i] = val;
channel_sum += val;
}
result[batch * out_batch_nb_elems + channel] =
channel_sum / in_channel_nb_elems;
}
}
// input0
T0->resize(dims_in);
T0->getImpl()->setRawPtr(array0, in_nb_elems);
// results
Tres->resize(dims_out);
Tres->getImpl()->setRawPtr(result, out_nb_elems);
op->forwardDims();
start = std::chrono::system_clock::now();
REQUIRE_NOTHROW(op->forward());
end = std::chrono::system_clock::now();
duration += std::chrono::duration_cast<std::chrono::microseconds>(
end - start);
REQUIRE(Tres->nbDims() == op->getOutput(0)->nbDims());
for (DimSize_t i = 0; i < op->getOutput(0)->nbDims(); ++i) {
REQUIRE(Tres->dims().at(i) == op->getOutput(0)->dims().at(i));
}
REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres, 1e-4f));
delete[] array0;
delete[] result;
}
}
SECTION("Using result from a pytorch function as groundtruth") {
DimSize_t batch_size = 2;
DimSize_t channels = 3;
DimSize_t height = 4;
DimSize_t width = 3;
DimSize_t depth = 2;
SECTION("2D_img") {
const std::vector<DimSize_t> in_dims{batch_size, channels, height,
width};
std::vector<std::size_t> out_dims(in_dims.size(), 1);
out_dims[0] = in_dims[0];
out_dims[1] = in_dims[1];
DimSize_t in_nb_elems = batch_size * channels * height * width;
DimSize_t out_nb_elems = batch_size * channels;
number_of_operation +=
in_nb_elems +
channels; // averaging per channel : 1 addition per element in
// the channel + 1 division this for every batch
auto input = new float[in_nb_elems];
auto result = new float[out_nb_elems];
input[0] = 0.1807716;
input[1] = -0.0699881;
input[2] = -0.3596235;
input[3] = -0.9152045;
input[4] = 0.6257653;
input[5] = 0.0255099;
input[6] = 0.9545137;
input[7] = 0.0643485;
input[8] = 0.3611506;
input[9] = 1.1678782;
input[10] = -1.3498932;
input[11] = -0.5101767;
input[12] = 0.2359577;
input[13] = -0.2397784;
input[14] = -0.9211147;
input[15] = 1.5432971;
input[16] = 1.3488258;
input[17] = -0.1396417;
input[18] = 0.2857972;
input[19] = 0.9651205;
input[20] = -2.0371499;
input[21] = 0.4931363;
input[22] = 1.4869986;
input[23] = 0.5910330;
input[24] = 0.1260297;
input[25] = -1.5626874;
input[26] = -1.1601028;
input[27] = -0.3348408;
input[28] = 0.4477722;
input[29] = -0.8016447;
input[30] = 1.5236114;
input[31] = 2.5085869;
input[32] = -0.6630959;
input[33] = -0.2512752;
input[34] = 1.0101448;
input[35] = 0.1215468;
input[36] = 0.1583993;
input[37] = 1.1340188;
input[38] = -1.1538976;
input[39] = -0.2983968;
input[40] = -0.5075365;
input[41] = -0.9239212;
input[42] = 0.5467061;
input[43] = -1.4947776;
input[44] = -1.2057148;
input[45] = 0.5718198;
input[46] = -0.5973545;
input[47] = -0.6936757;
input[48] = 1.6455388;
input[49] = -0.8029931;
input[50] = 1.3514109;
input[51] = -0.2759193;
input[52] = -1.5108346;
input[53] = 2.1047730;
input[54] = 2.7629590;
input[55] = -1.7465292;
input[56] = 0.8353187;
input[57] = -1.9560477;
input[58] = -0.8002653;
input[59] = -0.5044988;
input[60] = -0.0711742;
input[61] = -0.5130699;
input[62] = -1.0307810;
input[63] = 0.9154347;
input[64] = -0.2282317;
input[65] = -0.6884708;
input[66] = 0.1832259;
input[67] = 0.6003584;
input[68] = -1.5429375;
input[69] = -0.3465560;
input[70] = -0.1476223;
input[71] = 0.6469797;
result[0] = 0.0145876;
result[1] = 0.3010401;
result[2] = 0.0803371;
result[3] = -0.3720275;
result[4] = 0.0919094;
result[5] = -0.1852371;
// input0
T0->resize(in_dims);
T0->getImpl()->setRawPtr(input, in_nb_elems);
// results
Tres->resize(out_dims);
Tres->getImpl()->setRawPtr(result, out_nb_elems);
op->forwardDims();
start = std::chrono::system_clock::now();
REQUIRE_NOTHROW(op->forward());
end = std::chrono::system_clock::now();
duration += std::chrono::duration_cast<std::chrono::microseconds>(
end - start);
REQUIRE(Tres->nbDims() == op->getOutput(0)->nbDims());
for (DimSize_t i = 0; i < op->getOutput(0)->nbDims(); ++i) {
REQUIRE(Tres->dims().at(i) == op->getOutput(0)->dims().at(i));
}
REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
delete[] input;
delete[] result;
}
SECTION("3D_img") {
const std::vector<DimSize_t> in_dims{batch_size, channels, height,
width, depth};
std::vector<std::size_t> out_dims(in_dims.size(), 1);
out_dims[0] = in_dims[0];
out_dims[1] = in_dims[1];
DimSize_t in_nb_elems =
batch_size * channels * height * width * depth;
number_of_operation +=
in_nb_elems +
channels; // averaging per channel : 1 addition per element in
// the channel + 1 division this for every batch
DimSize_t out_nb_elems = batch_size * channels;
auto input = new float[in_nb_elems];
auto result = new float[out_nb_elems];
input[0] = 0.0061403;
input[1] = -0.9665052;
input[2] = 0.3582928;
input[3] = 0.1072854;
input[4] = 1.2463317;
input[5] = 1.2460036;
input[6] = 0.3534451;
input[7] = 0.9425349;
input[8] = -0.2103887;
input[9] = -0.7959853;
input[10] = 0.1297970;
input[11] = -1.9445597;
input[12] = 0.0609514;
input[13] = -0.2379328;
input[14] = 1.9020044;
input[15] = -1.1762751;
input[16] = 0.3404147;
input[17] = 1.1685153;
input[18] = -0.6526139;
input[19] = 0.3767620;
input[20] = 0.1887376;
input[21] = 0.5154487;
input[22] = 0.6371427;
input[23] = -0.3948864;
input[24] = -1.1571540;
input[25] = 0.2896117;
input[26] = 0.6163548;
input[27] = -0.4370409;
input[28] = 0.6589766;
input[29] = 0.6587803;
input[30] = -1.3702172;
input[31] = -1.6210355;
input[32] = 0.5872851;
input[33] = 0.2860694;
input[34] = 0.0082870;
input[35] = -0.2523253;
input[36] = -1.3247224;
input[37] = 0.1891782;
input[38] = 0.0211001;
input[39] = 0.9404197;
input[40] = -0.5576900;
input[41] = -0.6939272;
input[42] = -0.3252473;
input[43] = 1.2439330;
input[44] = -1.1671864;
input[45] = -0.4091243;
input[46] = 1.2600617;
input[47] = -1.5630058;
input[48] = 1.1346143;
input[49] = -0.0823837;
input[50] = 0.2893163;
input[51] = 0.8357732;
input[52] = -0.2449911;
input[53] = 0.2712233;
input[54] = 0.0936364;
input[55] = -0.8834321;
input[56] = -0.3274170;
input[57] = 0.0783938;
input[58] = -0.3807656;
input[59] = 0.3775077;
input[60] = 0.1119123;
input[61] = 2.3142793;
input[62] = -0.7989057;
input[63] = -0.5643027;
input[64] = -1.1346605;
input[65] = 0.1705271;
input[66] = 0.9946650;
input[67] = 1.2625724;
input[68] = 1.6218156;
input[69] = 1.0774711;
input[70] = 0.5947813;
input[71] = -1.5290873;
input[72] = 2.0437069;
input[73] = -0.1656267;
input[74] = 0.0870704;
input[75] = -0.5276564;
input[76] = -0.1002882;
input[77] = 1.0539219;
input[78] = -0.6230739;
input[79] = -1.5905718;
input[80] = -0.9741858;
input[81] = -0.1869211;
input[82] = 0.5816050;
input[83] = -2.6339815;
input[84] = -1.0764544;
input[85] = 2.5903966;
input[86] = 0.4940658;
input[87] = 0.4671729;
input[88] = 0.6588292;
input[89] = -0.7257792;
input[90] = 1.4280071;
input[91] = -1.2187740;
input[92] = 0.7380729;
input[93] = -1.1599953;
input[94] = -1.4355115;
input[95] = -1.5304037;
input[96] = 0.8474578;
input[97] = 0.0774260;
input[98] = 0.5433396;
input[99] = -0.8438400;
input[100] = -0.1089903;
input[101] = -0.6354192;
input[102] = 0.8772392;
input[103] = 0.2844733;
input[104] = 0.0975270;
input[105] = -0.9785872;
input[106] = -0.4320499;
input[107] = -1.4937501;
input[108] = -2.0644901;
input[109] = 0.0851217;
input[110] = 0.6644159;
input[111] = 0.4168026;
input[112] = 0.0958830;
input[113] = -1.5699565;
input[114] = 0.3739572;
input[115] = -0.1420672;
input[116] = -0.7864021;
input[117] = 0.2443752;
input[118] = -0.9811850;
input[119] = -0.0698569;
input[120] = 0.1463890;
input[121] = 0.2536245;
input[122] = 0.2136150;
input[123] = 0.3113698;
input[124] = 1.8353856;
input[125] = 1.4473228;
input[126] = -0.7373698;
input[127] = 0.2485314;
input[128] = -0.4789796;
input[129] = -0.3396149;
input[130] = 0.6438198;
input[131] = 0.7287521;
input[132] = -1.5119252;
input[133] = -0.1006494;
input[134] = 1.8955028;
input[135] = 1.0871323;
input[136] = 0.3620502;
input[137] = -0.8826663;
input[138] = 1.2220223;
input[139] = -1.2817260;
input[140] = 1.4153577;
input[141] = 0.4148015;
input[142] = 1.3458617;
input[143] = 1.9718349;
result[0] = 0.1333608;
result[1] = -0.1716091;
result[2] = 0.2201060;
result[3] = -0.1585989;
result[4] = -0.2291074;
result[5] = 0.4254351;
// input0
T0->resize(in_dims);
T0->getImpl()->setRawPtr(input, in_nb_elems);
// results
Tres->resize(out_dims);
Tres->getImpl()->setRawPtr(result, out_nb_elems);
op->forwardDims();
start = std::chrono::system_clock::now();
REQUIRE_NOTHROW(op->forward());
end = std::chrono::system_clock::now();
duration += std::chrono::duration_cast<std::chrono::microseconds>(
end - start);
REQUIRE(Tres->nbDims() == op->getOutput(0)->nbDims());
for (DimSize_t i = 0; i < op->getOutput(0)->nbDims(); ++i) {
REQUIRE(Tres->dims().at(i) == op->getOutput(0)->dims().at(i));
}
REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
delete[] input;
delete[] result;
}
}
Log::info("GlobalAveragePooling total execution time: {}µs\n", duration.count());
Log::info("Number of operations : {}\n", number_of_operation);
Log::info("Operation / µs = {}\n", number_of_operation / duration.count());
}
}
}
} // namespace Aidge
/********************************************************************************
* Copyright (c) 2025 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include "aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp"

#include <cstddef>     // std::size_t
#include <cstdlib>
#include <functional>  // std::multiplies
#include <memory>
#include <numeric>     // std::accumulate
#include <random>
#include <vector>

#include <catch2/catch_test_macros.hpp>

#include "aidge/backend/cpu/operator/HeavisideImpl.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/graph/Node.hpp"
#include "aidge/utils/TensorUtils.hpp"
namespace Aidge
{
TEST_CASE("[cpu/operator] Heaviside(forward)", "[Heaviside][CPU]") {
    // Random generators used by the "+1-D Tensor" section: values in [-1, 1],
    // rank in [1, 5], each dimension in [2, 10].
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_real_distribution<float> valueDist(-1.0f, 1.0f);
    std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
    std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));

    SECTION("1D Tensor") {
        // Fixed input covering negative, zero and positive entries.
        // Heaviside(0.5): x < 0 -> 0, x > 0 -> 1, x == 0 -> 0.5.
        std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array1D<float,10> {
            {0, 1, 2,-3, 4,-5,-6, 7, 8, 9}
        });
        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array1D<float,10> {
            {0.5, 1, 1, 0, 1, 0, 0, 1, 1, 1}
        });

        std::shared_ptr<Node> heaviside = Heaviside(0.5);
        auto op = std::static_pointer_cast<OperatorTensor>(heaviside->getOperator());
        op->associateInput(0, input0);
        op->setBackend("cpu");
        op->setDataType(DataType::Float32);
        op->forward();

        REQUIRE(approxEq<float>(*op->getOutput(0), *expectedOutput));
    }

    SECTION("+1-D Tensor")
    {
        // Random shape: rank and per-dimension sizes drawn from the
        // distributions declared above.
        auto dims = std::vector<std::size_t>();
        auto nbDims = nbDimsDist(gen);
        for (std::size_t i = 0; i < nbDims; ++i) {
            dims.push_back(dimSizeDist(gen));
        }

        const auto numberOfElements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
        float* inputArray = new float[numberOfElements];
        float* resultArray = new float[numberOfElements];

        for (std::size_t i = 0; i < numberOfElements; ++i)
        {
            inputArray[i] = valueDist(gen);
            // Reference Heaviside with value = 0.5 computed on the host.
            resultArray[i] = inputArray[i] > 0 ? 1.0f : (inputArray[i] == 0 ? 0.5f : 0.0f);
        }

        auto T0 = std::make_shared<Tensor>();
        T0->setDataType(DataType::Float32);
        T0->setBackend("cpu");

        auto T1 = std::make_shared<Tensor>();
        T1->setDataType(DataType::Float32);
        T1->setBackend("cpu");

        T0->resize(dims);
        T0->getImpl()->setRawPtr(inputArray, numberOfElements);
        T1->resize(dims);
        T1->getImpl()->setRawPtr(resultArray, numberOfElements);

        std::shared_ptr<Node> heaviside = Heaviside(0.5);
        auto op = std::static_pointer_cast<OperatorTensor>(heaviside->getOperator());
        op->associateInput(0, T0);
        op->setBackend("cpu");
        op->setDataType(DataType::Float32);
        op->forward();

        REQUIRE(approxEq<float>(*(op->getOutput(0)), *T1));

        // FIX: release the buffers handed to setRawPtr() once the check is
        // done. The other CPU operator tests in this repository follow the
        // same new[]/setRawPtr/delete[] convention; these two arrays were
        // previously leaked.
        delete[] inputArray;
        delete[] resultArray;
    }
}
}
......@@ -9,16 +9,19 @@
*
********************************************************************************/
#include <memory>
#include <catch2/catch_test_macros.hpp>
#include "aidge/backend/cpu/data/TensorImpl.hpp"
#include "aidge/backend/cpu/operator/LeakyReLUImpl.hpp"
#include "aidge/data/DataType.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/LeakyReLU.hpp"
#include "aidge/backend/cpu.hpp"
using namespace Aidge;
TEST_CASE("[cpu/operator] LeakyReLU(forward)") {
TEST_CASE("[cpu/operator] LeakyReLU(forward)", "[LeakyReLU][CPU]") {
SECTION("1D Tensor") {
std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array1D<int,10> {
{0, 1, 2,-3, 4,-5,-6, 7, 8, 9}
......@@ -28,12 +31,12 @@ TEST_CASE("[cpu/operator] LeakyReLU(forward)") {
});
std::shared_ptr<Node> myLeakyReLU = LeakyReLU();
myLeakyReLU->getOperator()->setDatatype(DataType::Int32);
myLeakyReLU->getOperator()->setBackend("cpu");
myLeakyReLU->getOperator()->associateInput(0,input0);
myLeakyReLU->getOperator()->computeOutputDims();
auto op = std::static_pointer_cast<OperatorTensor>(myLeakyReLU -> getOperator());
op->associateInput(0,input0);
op->setDataType(DataType::Int32);
op->setBackend("cpu");
myLeakyReLU->forward();
REQUIRE(*std::static_pointer_cast<Tensor>(myLeakyReLU->getOperator()->getOutput(0)) == *expectedOutput);
REQUIRE(*(op->getOutput(0)) == *expectedOutput);
}
SECTION("2D Tensor") {
......@@ -51,12 +54,12 @@ TEST_CASE("[cpu/operator] LeakyReLU(forward)") {
});
std::shared_ptr<Node> myLeakyReLU = LeakyReLU();
myLeakyReLU->getOperator()->setDatatype(DataType::Int32);
myLeakyReLU->getOperator()->setBackend("cpu");
myLeakyReLU->getOperator()->associateInput(0,input0);
myLeakyReLU->getOperator()->computeOutputDims();
auto op = std::static_pointer_cast<OperatorTensor>(myLeakyReLU -> getOperator());
op->associateInput(0,input0);
op->setDataType(DataType::Int32);
op->setBackend("cpu");
myLeakyReLU->forward();
REQUIRE(*myLeakyReLU->getOperator()->getOutput(0) == *expectedOutput);
REQUIRE(*(op->getOutput(0)) == *expectedOutput);
}
SECTION("3D Tensor") {
......@@ -86,12 +89,12 @@ TEST_CASE("[cpu/operator] LeakyReLU(forward)") {
});
std::shared_ptr<Node> myLeakyReLU = LeakyReLU();
myLeakyReLU->getOperator()->setDatatype(DataType::Int32);
myLeakyReLU->getOperator()->setBackend("cpu");
myLeakyReLU->getOperator()->associateInput(0,input0);
myLeakyReLU->getOperator()->computeOutputDims();
auto op = std::static_pointer_cast<OperatorTensor>(myLeakyReLU -> getOperator());
op->associateInput(0,input0);
op->setDataType(DataType::Int32);
op->setBackend("cpu");
myLeakyReLU->forward();
REQUIRE(*myLeakyReLU->getOperator()->getOutput(0) == *expectedOutput);
REQUIRE(*(op->getOutput(0)) == *expectedOutput);
}
SECTION("4D Tensor") {
......@@ -145,12 +148,12 @@ TEST_CASE("[cpu/operator] LeakyReLU(forward)") {
});
std::shared_ptr<Node> myLeakyReLU = LeakyReLU();
myLeakyReLU->getOperator()->setDatatype(DataType::Int32);
myLeakyReLU->getOperator()->setBackend("cpu");
myLeakyReLU->getOperator()->associateInput(0,input0);
myLeakyReLU->getOperator()->computeOutputDims();
auto op = std::static_pointer_cast<OperatorTensor>(myLeakyReLU -> getOperator());
op->associateInput(0,input0);
op->setDataType(DataType::Int32);
op->setBackend("cpu");
myLeakyReLU->forward();
REQUIRE(*myLeakyReLU->getOperator()->getOutput(0) == *expectedOutput);
REQUIRE(*(op->getOutput(0)) == *expectedOutput);
}
SECTION("Test construction attribute: negative_slop") {
......@@ -162,11 +165,11 @@ TEST_CASE("[cpu/operator] LeakyReLU(forward)") {
});
std::shared_ptr<Node> myLeakyReLU = LeakyReLU(0.5f);
myLeakyReLU->getOperator()->setDatatype(DataType::Float32);
myLeakyReLU->getOperator()->setBackend("cpu");
myLeakyReLU->getOperator()->associateInput(0,input0);
myLeakyReLU->getOperator()->computeOutputDims();
auto op = std::static_pointer_cast<OperatorTensor>(myLeakyReLU -> getOperator());
op->associateInput(0,input0);
op->setDataType(DataType::Float32);
op->setBackend("cpu");
myLeakyReLU->forward();
REQUIRE(*myLeakyReLU->getOperator()->getOutput(0) == *expectedOutput);
REQUIRE(*(op->getOutput(0)) == *expectedOutput);
}
}
\ No newline at end of file
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <chrono> // std::micro, std::chrono::time_point,
// std::chrono::system_clock, std::chrono::duration
#include <cstddef> // std::size_t
#include <cstdint> // std::uint16_t
#include <memory>
#include <random> // std::random_device, std::mt19937
// std::uniform_int_distribution, std::uniform_real_distribution
#include <vector>
#include <catch2/catch_test_macros.hpp>
#include <fmt/core.h>
#include "aidge/backend/cpu/data/TensorImpl.hpp"
#include "aidge/backend/cpu/operator/MatMulImpl.hpp"
#include "aidge/data/Data.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/MatMul.hpp"
#include "aidge/operator/OperatorTensor.hpp"
#include "aidge/utils/TensorUtils.hpp"
namespace Aidge {
TEST_CASE("[cpu/operator] MatMul(forward)", "[MatMul][CPU]") {
    const std::uint16_t NBTRIALS = 10;
    // Create a random number generator: values in [0, 1), matrix dimensions
    // in [10, 100], batch dimensions in [1, 5].
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_real_distribution<float> dis(0.0, 1.0); // Random float distribution between 0 and 1
    std::uniform_int_distribution<std::size_t> distDims(10, 100);
    std::uniform_int_distribution<std::size_t> distNbMatrix(1, 5);

    // Create MatMul Operator
    std::shared_ptr<Node> myMatMul = MatMul();
    auto op = std::static_pointer_cast<OperatorTensor>(myMatMul -> getOperator());

    // To measure execution time of 'MatMul_Op::forward()' member function call
    std::chrono::time_point<std::chrono::system_clock> start;
    std::chrono::time_point<std::chrono::system_clock> end;
    // FIX: zero-initialize the accumulator. A default-constructed
    // std::chrono::duration over an arithmetic rep holds an indeterminate
    // value, so the first 'duration +=' in the 2-D section read garbage
    // (the 3-D and 4-D sections reset it, the 2-D one did not).
    std::chrono::duration<double, std::micro> duration = std::chrono::duration<double, std::micro>::zero();

    SECTION("2-D Tensors") {
        std::size_t totalComputation = 0;
        for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
            // generate Tensors dimensions
            const std::size_t dim0 = distDims(gen);
            const std::size_t dim1 = distDims(gen);
            const std::size_t dim2 = distDims(gen);
            totalComputation += dim0*dim1*dim2;

            // Create and populate the array with random float values
            // (FIX: loop indices are std::size_t, not int, to match the
            // unsigned bounds).
            float* bigArray1 = new float[dim0*dim1];
            for (std::size_t i = 0; i < dim0*dim1; ++i) {
                bigArray1[i] = dis(gen); // Generate random float value
            }
            float* bigArray2 = new float[dim1*dim2];
            for (std::size_t i = 0; i < dim1*dim2; ++i) {
                bigArray2[i] = dis(gen); // Generate random float value
            }
            // Reference result computed with a naive O(dim0*dim1*dim2) product.
            float* res = new float[dim0*dim2];
            for (std::size_t i = 0; i < dim0; ++i) {
                for (std::size_t j = 0; j < dim2; ++j) {
                    float sum = 0.0;
                    for (std::size_t k = 0; k < dim1; ++k) {
                        sum += bigArray1[i*dim1+k] * bigArray2[k*dim2+j];
                    }
                    res[i*dim2+j] = sum;
                }
            }

            // Convert bigArray1 to Tensor
            std::shared_ptr<Tensor> T1 = std::make_shared<Tensor>(DataType::Float32);
            T1 -> resize({dim0,dim1});
            T1 -> setBackend("cpu");
            T1 -> getImpl() -> setRawPtr(bigArray1, dim0*dim1);
            // Convert bigArray2 to Tensor
            std::shared_ptr<Tensor> T2 = std::make_shared<Tensor>(DataType::Float32);
            T2 -> resize({dim1,dim2});
            T2 -> setBackend("cpu");
            T2 -> getImpl() -> setRawPtr(bigArray2, dim1*dim2);
            // convert res to Tensor
            std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>(DataType::Float32);
            Tres -> resize({dim0,dim2});
            Tres -> setBackend("cpu");
            Tres -> getImpl() -> setRawPtr(res, dim0*dim2);

            op->associateInput(0, T1);
            op->associateInput(1, T2);
            op->setDataType(DataType::Float32);
            op->setBackend("cpu");
            op->forwardDims();
            start = std::chrono::system_clock::now();
            myMatMul->forward();
            end = std::chrono::system_clock::now();
            duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);

            REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));

            delete[] bigArray1;
            delete[] bigArray2;
            delete[] res;
        }
        Log::info("number of multiplications over time spent: {}\n", (totalComputation / duration.count()));
        Log::info("total time: {} μs\n", duration.count());
    }

    SECTION("3-D Tensors") {
        std::size_t totalComputation = 0;
        duration = std::chrono::duration<double, std::micro>::zero();
        for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
            // generate Tensors dimensions
            const std::size_t dimNb = distNbMatrix(gen);
            const std::size_t dim0 = distDims(gen);
            const std::size_t dim1 = distDims(gen);
            const std::size_t dim2 = distDims(gen);
            totalComputation += dim0*dim1*dim2*dimNb;

            // Create and populate the array with random float values
            float* bigArray1 = new float[dimNb*dim0*dim1];
            for (std::size_t i = 0; i < dimNb*dim0*dim1; ++i) {
                bigArray1[i] = dis(gen); // Generate random float value
            }
            float* bigArray2 = new float[dimNb*dim1*dim2];
            for (std::size_t i = 0; i < dimNb*dim1*dim2; ++i) {
                bigArray2[i] = dis(gen); // Generate random float value
            }
            // Reference result: one independent matrix product per batch n.
            float* res = new float[dimNb*dim0*dim2];
            for (std::size_t n = 0; n < dimNb; ++n) {
                for (std::size_t i = 0; i < dim0; ++i) {
                    for (std::size_t j = 0; j < dim2; ++j) {
                        float sum = 0.0;
                        for (std::size_t k = 0; k < dim1; ++k) {
                            sum += bigArray1[n*dim0*dim1 + i*dim1 + k] * bigArray2[n*dim2*dim1+k*dim2+j];
                        }
                        res[n*dim0*dim2+i*dim2+j] = sum;
                    }
                }
            }

            // Convert bigArray1 to Tensor
            std::shared_ptr<Tensor> T1 = std::make_shared<Tensor>(DataType::Float32);
            T1 -> resize({dimNb,dim0,dim1});
            T1 -> setBackend("cpu");
            T1 -> getImpl() -> setRawPtr(bigArray1, dimNb*dim0*dim1);
            // Convert bigArray2 to Tensor
            std::shared_ptr<Tensor> T2 = std::make_shared<Tensor>(DataType::Float32);
            T2 -> resize({dimNb,dim1,dim2});
            T2 -> setBackend("cpu");
            T2 -> getImpl() -> setRawPtr(bigArray2, dimNb*dim1*dim2);
            // convert res to Tensor
            std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>(DataType::Float32);
            Tres -> resize({dimNb,dim0,dim2});
            Tres -> setBackend("cpu");
            Tres -> getImpl() -> setRawPtr(res, dimNb*dim0*dim2);

            op->associateInput(0, T1);
            op->associateInput(1, T2);
            op->setDataType(DataType::Float32);
            op->setBackend("cpu");
            op->forwardDims();
            start = std::chrono::system_clock::now();
            myMatMul->forward();
            end = std::chrono::system_clock::now();
            duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);

            REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));

            delete[] bigArray1;
            delete[] bigArray2;
            delete[] res;
        }
        Log::info("number of multiplications over time spent: {}\n", (totalComputation / duration.count()));
        Log::info("total time: {} μs\n", duration.count());
    }

    SECTION("4-D Tensors") {
        std::size_t totalComputation = 0;
        duration = std::chrono::duration<double, std::micro>::zero();
        for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
            // generate Tensors dimensions
            const std::size_t dimNb1 = distNbMatrix(gen);
            const std::size_t dimNb2 = distNbMatrix(gen);
            const std::size_t dim0 = distDims(gen);
            const std::size_t dim1 = distDims(gen);
            const std::size_t dim2 = distDims(gen);
            totalComputation += dim0*dim1*dim2*dimNb1*dimNb2;

            // Create and populate the array with random float values
            float* bigArray1 = new float[dimNb1*dimNb2*dim0*dim1];
            for (std::size_t i = 0; i < dimNb1*dimNb2*dim0*dim1; ++i) {
                bigArray1[i] = dis(gen); // Generate random float value
            }
            float* bigArray2 = new float[dimNb1*dimNb2*dim1*dim2];
            for (std::size_t i = 0; i < dimNb1*dimNb2*dim1*dim2; ++i) {
                bigArray2[i] = dis(gen); // Generate random float value
            }
            // Reference result: independent matrix product per (n1, n2) batch.
            float* res = new float[dimNb1*dimNb2*dim0*dim2];
            for (std::size_t n1 = 0; n1 < dimNb1; ++n1) {
                for (std::size_t n2 = 0; n2 < dimNb2; ++n2) {
                    for (std::size_t i = 0; i < dim0; ++i) {
                        for (std::size_t j = 0; j < dim2; ++j) {
                            float sum = 0.0;
                            for (std::size_t k = 0; k < dim1; ++k) {
                                sum += bigArray1[n1*dimNb2*dim0*dim1+n2*dim0*dim1+i*dim1+k] * bigArray2[n1*dimNb2*dim1*dim2+n2*dim1*dim2+k*dim2+j];
                            }
                            res[n1*dimNb2*dim0*dim2+n2*dim0*dim2+i*dim2+j] = sum;
                        }
                    }
                }
            }

            // Convert bigArray1 to Tensor
            std::shared_ptr<Tensor> T1 = std::make_shared<Tensor>(DataType::Float32);
            T1 -> resize({dimNb1,dimNb2,dim0,dim1});
            T1 -> setBackend("cpu");
            T1 -> getImpl() -> setRawPtr(bigArray1, dimNb1*dimNb2*dim0*dim1);
            // Convert bigArray2 to Tensor
            std::shared_ptr<Tensor> T2 = std::make_shared<Tensor>(DataType::Float32);
            T2 -> resize({dimNb1,dimNb2,dim1,dim2});
            T2 -> setBackend("cpu");
            T2 -> getImpl() -> setRawPtr(bigArray2, dimNb1*dimNb2*dim1*dim2);
            // convert res to Tensor
            std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>(DataType::Float32);
            Tres -> resize({dimNb1,dimNb2,dim0,dim2});
            Tres -> setBackend("cpu");
            Tres -> getImpl() -> setRawPtr(res, dimNb1*dimNb2*dim0*dim2);

            op->associateInput(0, T1);
            op->associateInput(1, T2);
            op->setDataType(DataType::Float32);
            op->setBackend("cpu");
            op->forwardDims();
            start = std::chrono::system_clock::now();
            myMatMul->forward();
            end = std::chrono::system_clock::now();
            duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);

            REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));

            delete[] bigArray1;
            delete[] bigArray2;
            delete[] res;
        }
        Log::info("number of multiplications over time spent: {}\n", (totalComputation / duration.count()));
        Log::info("total time: {} μs\n", duration.count());
    }

    SECTION("+2-D / 1-D") {
        // allows to test both computation with a 1-D Tensor and broadcasting
        // NOTE: this section only checks that forwardDims()/forward() run;
        // no numerical reference is compared.
        // input_0
        std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>();
        op->associateInput(0,T0);
        const std::size_t dim0 = distNbMatrix(gen);
        const std::size_t dim1 = distNbMatrix(gen) + 1;
        const std::size_t dim2 = distNbMatrix(gen);
        const std::size_t dim3 = distNbMatrix(gen);
        T0->resize({dim0,dim1,dim2,dim3});
        T0->setDataType(DataType::Float32);
        T0->setBackend("cpu");

        // input_1
        std::shared_ptr<Tensor> T1 = std::make_shared<Tensor>();
        op -> associateInput(1,T1);
        T1->resize({dim3});
        T1->setDataType(DataType::Float32);
        T1->setBackend("cpu");

        op->setDataType(DataType::Float32);
        op->setBackend("cpu");
        op->forwardDims();
        myMatMul->forward();
    }
}
} // namespace Aidge
\ No newline at end of file
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <array>
#include <memory>
#include <catch2/catch_test_macros.hpp>
#include "aidge/backend/cpu/data/TensorImpl.hpp"
#include "aidge/backend/cpu/operator/MaxPoolingImpl.hpp"
#include "aidge/data/DataType.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/MaxPooling.hpp"
using namespace Aidge;
TEST_CASE("[cpu/operator] MaxPooling(forward)", "[MaxPooling][CPU]") {
    // Shared NCHW (2,2,5,5) input used by the sections below.
    std::shared_ptr<Tensor> inputTensor = std::make_shared<Tensor>(Array4D<float,2,2,5,5> {
        {
            {
                {{-0.3848, 0.2166, -0.4373, 0.6142, 0.5277},
                 {0.7995, 0.3638, -1.4589, -1.0843, 1.0918},
                 {0.7147, 0.0936, -1.2902, 1.2037, 0.4874},
                 {-0.5981, 2.1184, -0.9175, 1.3859, 0.3305},
                 {-1.7700, 0.0563, -0.3914, 0.0538, -0.3955}},

                {{-3.1409, -0.4554, 0.0524, 2.2291, 0.4859},
                 {-0.7465, -0.6567, -2.3703, -0.6386, -1.4152},
                 { 2.2329, -0.5850, 0.0700, 1.2838, -1.7363},
                 { 0.2139, 0.0624, -1.0689, -0.8221, -0.8038},
                 { 0.1886, -0.7840, -0.2313, 0.2651, -1.6244}}
            },
            {
                {{ 0.4371, 1.6417, 0.9129, 0.6325, 0.5438},
                 {-2.3552, -0.8850, -0.0232, -0.5462, -1.2011},
                 {1.7653, -1.6668, -1.0814, 0.6182, 1.2071},
                 {0.9541, -0.5133, 0.8664, -0.8892, 1.4585},
                 {1.0220, -0.5107, 0.1829, -0.2301, -0.4268}},

                {{ 1.0429, 0.6279, -0.2875, 0.7187, -0.1500},
                 {1.6041, 2.9635, 1.4172, -0.7517, 0.5441},
                 {-0.2276, 0.0857, 0.6776, -0.1389, -0.0614},
                 {-0.1547, -0.3435, 0.0650, -0.5095, -1.8073},
                 {1.7217, 0.3999, -0.5953, 1.0604, -0.4126}}
            }
        }
    });

    SECTION("Stride") {
        // 2x2 kernel swept with a 2x2 stride: each output element is the
        // maximum over one non-overlapping 2x2 window of the input (the last
        // row/column of each 5x5 map is dropped).
        auto poolingOp = std::make_shared<MaxPooling_Op<2>>(
            std::array<std::size_t, 2>({2,2}),
            std::array<std::size_t, 2>({2,2}));

        Tensor expectedOutput = Array4D<float,2,2,2,2> {
            {
                {
                    {{ 0.7995, 0.6142},
                     { 2.1184, 1.3859}},
                    {{ -0.4554, 2.2291},
                     { 2.2329, 1.2838}}
                },
                {
                    {{1.6417, 0.9129},
                     {1.7653, 0.8664}},
                    {{2.9635, 1.4172},
                     {0.0857, 0.6776}}
                }
            }
        };

        poolingOp->associateInput(0, inputTensor);
        poolingOp->setDataType(DataType::Float32);
        poolingOp->setBackend("cpu");
        poolingOp->forward();
        poolingOp->getOutput(0)->print();
        REQUIRE(*(poolingOp->getOutput(0)) == expectedOutput);
    }
}
\ No newline at end of file
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <memory>
#include <string>
#include <catch2/catch_test_macros.hpp>
#include "aidge/backend/cpu/data/TensorImpl.hpp"
#include "aidge/backend/cpu/operator/AddImpl.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/graph/Node.hpp"
#include "aidge/graph/GraphView.hpp"
#include "aidge/graph/OpArgs.hpp"
#include "aidge/operator/Add.hpp"
#include "aidge/operator/Memorize.hpp"
#include "aidge/operator/Producer.hpp"
#include "aidge/recipes/GraphViewHelper.hpp"
#include "aidge/scheduler/SequentialScheduler.hpp"
namespace Aidge {
TEST_CASE("[cpu/operator] Memorize(forward)", "[Memorize][CPU]") {
    SECTION("Test simple") {
        // Scalar tensor {1} reused as both the graph input and the initial
        // state of the Memorize node.
        std::shared_ptr<Tensor> inputTensor =
            std::make_shared<Tensor>(Array1D<int, 1>{{1}});

        // Build a small feedback graph: Add's result is stored by Memorize
        // (scheduled for 3 steps) and fed back into Add's second input.
        auto input = Producer({1}, "input");
        auto init = Producer({1}, "init");
        auto add = Add("add");
        auto mem = Memorize(3, "mem");

        // Wire the loop.
        input->addChild(add, 0, 0);
        init->addChild(mem, 0, 1);
        add->addChild(mem, 0, 0);
        mem->addChild(/*otherNode=*/add, /*outId=*/1, /*otherInId=*/1);
        input->getOperator()->setOutput(0, inputTensor);
        init->getOperator()->setOutput(0, inputTensor);

        // Configure and run the whole connected graph sequentially.
        auto graph = getConnectedGraphView(input);
        graph->setDataType(Aidge::DataType::Int32);
        graph->setBackend("cpu");
        graph->forwardDims();
        graph->save("simple_graph");

        SequentialScheduler scheduler(graph);
        REQUIRE_NOTHROW(scheduler.forward());
        scheduler.saveSchedulingDiagram("simple");

        // Reference value for the memorized output after the run.
        const Tensor expectedOutput = Array1D<int, 1>{{4}};
        std::shared_ptr<Tensor> memOutput =
            std::static_pointer_cast<OperatorTensor>(mem->getOperator())->getOutput(0);
        memOutput->print();
        REQUIRE((*memOutput == expectedOutput));
    }
}
} // namespace Aidge
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <cmath>
#include <cstdlib>
#include <memory>
#include <random>
#include <catch2/catch_test_macros.hpp>
#include "aidge/backend/cpu/operator/ConvImpl.hpp"
#include "aidge/backend/cpu/operator/PadImpl.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/filler/Filler.hpp"
#include "aidge/operator/Conv.hpp"
#include "aidge/operator/FC.hpp"
#include "aidge/operator/Identity.hpp"
#include "aidge/operator/MetaOperator.hpp"
#include "aidge/operator/MetaOperatorDefs.hpp"
#include "aidge/operator/Pad.hpp"
#include "aidge/operator/Pop.hpp"
#include "aidge/operator/Stack.hpp"
#include "aidge/scheduler/ParallelScheduler.hpp"
#include "aidge/scheduler/SequentialScheduler.hpp"
#include "aidge/utils/TensorUtils.hpp"
using namespace Aidge;
TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
    SECTION("PaddedConv(forward)") {
        // Verifies that an explicit Pad -> Conv pipeline reproduces a
        // precomputed reference output (double precision, NCHW layout).
        // The fused PaddedConv meta-operator is only constructed at the
        // end as a smoke test; its forward pass is not exercised here.
        std::shared_ptr<Tensor> myWeights =
            std::make_shared<Tensor>(Array4D<double, 4, 3, 3, 3>{
                {{{{6.20986394e-01, 1.19775136e-03, 7.22876095e-02},
                   {1.16492919e-01, 8.21634093e-02, 1.17413265e-01},
                   {2.23743494e-01, 3.99495413e-01, 5.55552411e-01}},
                  {{6.64970077e-01, 9.62199940e-01, 4.87531967e-01},
                   {6.12586558e-01, 8.09918671e-02, 8.40649383e-01},
                   {4.15264406e-01, 8.28247138e-01, 1.52301135e-01}},
                  {{1.76992844e-02, 7.78697112e-01, 8.14531592e-01},
                   {1.36960611e-01, 4.64806728e-01, 4.85150000e-01},
                   {4.34776520e-01, 9.51740977e-01, 9.05793799e-01}}},
                 {{{1.71925246e-02, 1.91082720e-01, 3.67982644e-01},
                   {1.56806559e-01, 6.22280998e-01, 3.15827594e-01},
                   {6.04359038e-01, 2.83095947e-01, 6.11168892e-01}},
                  {{2.76942832e-01, 1.89768419e-01, 8.07988176e-01},
                   {1.67925807e-01, 2.68356150e-01, 6.28875602e-01},
                   {1.69093357e-04, 9.64788636e-01, 7.29254981e-01}},
                  {{6.34030122e-01, 1.32087038e-01, 3.33857107e-01},
                   {7.63047502e-01, 5.12539506e-02, 9.77400493e-01},
                   {8.06151288e-01, 2.60237147e-01, 3.93729313e-01}}},
                 {{{5.84605240e-01, 4.74648725e-01, 8.54111741e-01},
                   {7.10897067e-02, 5.02579011e-01, 3.35236224e-01},
                   {9.08637408e-01, 8.02903830e-01, 2.83929907e-01}},
                  {{3.68206999e-01, 9.18579021e-02, 7.33168098e-01},
                   {1.59875539e-01, 9.13163381e-01, 3.59806060e-01},
                   {1.41295882e-01, 7.00312185e-01, 5.63728289e-01}},
                  {{9.39513546e-01, 1.91704891e-01, 1.11454944e-01},
                   {5.46298282e-01, 2.89698587e-01, 2.62612651e-01},
                   {1.18554992e-01, 4.32147376e-02, 7.53016994e-01}}},
                 {{{9.53179175e-01, 2.05041054e-02, 1.11318451e-01},
                   {8.67878485e-01, 2.93263422e-01, 8.03912714e-01},
                   {8.93620255e-01, 1.37831128e-01, 3.83640583e-01}},
                  {{3.96020188e-01, 6.24959320e-01, 1.90709175e-01},
                   {5.80538620e-01, 6.63031275e-01, 2.07247191e-01},
                   {5.65672171e-01, 5.57014317e-01, 9.26909496e-01}},
                  {{3.43901418e-01, 4.47741636e-01, 6.59249367e-01},
                   {7.34639028e-01, 2.84957200e-02, 9.70225217e-01},
                   {1.33578790e-02, 6.12054702e-01, 9.36685235e-02}}}}});
        std::shared_ptr<Tensor> myBias =
            std::make_shared<Tensor>(Array1D<double, 4>{
                {0.16884905, 0.27994487, 0.57227465, 0.06435205}});
        std::shared_ptr<Tensor> myInput = std::make_shared<
            Tensor>(Array4D<double, 2, 3, 5, 5>{
            // NCHW
            {{{{0.43224481, 0.9047832, 0.18402257, 0.06162838, 0.52490127},
               {0.27773404, 0.55402353, 0.9485062, 0.31197083, 0.80328607},
               {0.85065842, 0.88226201, 0.54971951, 0.23360494, 0.53907884},
               {0.33423098, 0.79564312, 0.80419414, 0.76839638, 0.87248221},
               {0.77328729, 0.65749407, 0.47277589, 0.32889198, 0.93970518}},
              {{0.66669145, 0.64193351, 0.45315988, 0.32794057, 0.38461822},
               {0.72295814, 0.18395073, 0.85909664, 0.30010301, 0.56065865},
               {0.34777938, 0.77869746, 0.33159421, 0.19540932, 0.77767906},
               {0.5778391, 0.08218411, 0.27758371, 0.99017749, 0.61827997},
               {0.10440745, 0.3197831, 0.89157608, 0.12216887, 0.950232}},
              {{0.68073443, 0.2681118, 0.51848834, 0.62864493, 0.36717478},
               {0.64106244, 0.43779425, 0.02771029, 0.78275231, 0.45693104},
               {0.6487417, 0.01603838, 0.73869997, 0.96494221, 0.39588782},
               {0.5975827, 0.90913292, 0.55036969, 0.4747373, 0.62460509},
               {0.79675124, 0.02807549, 0.53227602, 0.88805927, 0.96646591}}},
             {{{0.81851935, 0.21267665, 0.01580692, 0.54907998, 0.89010049},
               {0.80165784, 0.55195592, 0.20740314, 0.22782844, 0.89205031},
               {0.94217108, 0.58434542, 0.20738313, 0.79065873, 0.9371597},
               {0.02254708, 0.95539178, 0.95165758, 0.53736666, 0.49100362},
               {0.08018625, 0.69108027, 0.00329741, 0.74565761, 0.30899213}},
              {{0.34868638, 0.12792604, 0.37382248, 0.0374756, 0.50653087},
               {0.59614405, 0.64820746, 0.31470307, 0.62460364, 0.29253268},
               {0.92864889, 0.51014224, 0.08921206, 0.11094072, 0.64691121},
               {0.50586371, 0.6686477, 0.72511169, 0.41681783, 0.6325049},
               {0.71594137, 0.73382767, 0.36589439, 0.03255165, 0.75006865}},
              {{0.6294127, 0.85548534, 0.0902963, 0.28915773, 0.36564289},
               {0.95873236, 0.6742374, 0.55679676, 0.6323497, 0.34072958},
               {0.49694061, 0.79173045, 0.19738225, 0.14755281, 0.80818177},
               {0.02332061, 0.74270703, 0.59415632, 0.08195934, 0.46295434},
               {0.71426058,
                0.85032931,
                0.90750818,
                0.28768431,
                0.4401146}}}}});
        // Reference result of Pad(1,1,1,1, zeros) followed by the 3x3 conv.
        std::shared_ptr<Tensor> myOutput = std::make_shared<
            Tensor>(Array4D<double, 2, 4, 5, 5>{
            {{{{3.40294218, 3.74021220, 4.02050114, 4.07054710, 2.46286273},
               {4.61770582, 6.70517588, 6.50356627, 6.29688787, 3.53332567},
               {5.47480106, 5.92094421, 6.64605665, 7.95090199, 4.28721523},
               {4.01485729, 6.06748962, 7.52447891, 7.37980652, 5.28401136},
               {2.83065438, 3.62033439, 3.56222963, 5.56103945, 3.23335814}},
              {{3.30230498, 4.92814112, 4.34710836, 3.96262765, 2.97987890},
               {4.49693012, 6.68929291, 5.53603029, 5.68874264, 4.28756475},
               {4.20528078, 6.82776880, 6.70569849, 7.12809610, 4.40845442},
               {4.31169367, 6.73352146, 6.30962515, 7.45826864, 4.99164438},
               {2.18136287, 4.28968000, 4.20080042, 4.89814138, 2.87394023}},
              {{3.54787683, 4.35851812, 4.63881302, 4.23359537, 3.16992092},
               {5.25099468, 7.54282856, 6.69849157, 5.64309788, 4.56919575},
               {4.71914101, 7.52830601, 6.71450949, 7.81113863, 5.84658146},
               {4.97893143, 7.39293909, 6.89905310, 8.14430809, 5.62998581},
               {2.79735112, 4.80967140, 5.57630205, 5.38828325, 4.57078695}},
              {{3.03048635, 5.04540300, 4.21824932, 4.87323284, 2.35113740},
               {4.45167351, 6.47721338, 7.40922976, 6.70445728, 3.60700107},
               {3.77927423, 6.82826376, 7.41777134, 7.57402420, 5.13131523},
               {4.08747244, 7.07994175, 7.57206821, 8.51897335, 5.26987123},
               {2.34426999, 4.60127831, 4.86486769, 6.01579571, 3.97803569}}},
             {{{3.84700942, 4.25972605, 3.05269003, 3.78043652, 2.08771229},
               {6.00459957, 6.05633259, 4.45951605, 4.54089880, 4.03066444},
               {5.41579390, 7.29543972, 6.18680000, 5.58812714, 3.45964241},
               {6.04531050, 7.70924091, 5.52207708, 5.02131319, 4.09403706},
               {3.18092418, 4.45422697, 4.04294252, 3.86577177, 2.18776536}},
              {{4.02600670, 4.27603531, 3.81011319, 4.03631020, 2.57254648},
               {5.33471155, 5.72588634, 5.12079763, 5.11733150, 3.76836705},
               {5.62947607, 5.92492962, 6.24170446, 6.44130468, 3.44276404},
               {5.38414621, 6.02679539, 5.88985586, 5.90263271, 3.15044069},
               {3.31261086, 4.44371319, 3.47660780, 4.15411520, 1.48961508}},
              {{3.95879412, 4.17324543, 3.70114422, 3.27447152, 3.09713888},
               {5.78258181, 6.57920837, 4.99913597, 6.20961237, 4.98552179},
               {5.84685421, 7.19971228, 6.66386652, 6.68013430, 4.90963316},
               {5.24417877, 7.06430531, 6.58512402, 6.02492285, 4.48986387},
               {3.64294529, 5.00678444, 5.04760027, 4.72895622, 2.67990756}},
              {{3.48610687, 4.12853813, 4.07563591, 3.51327014, 2.44217038},
               {4.80529881, 7.33211374, 5.14774036, 4.77281189, 4.44612408},
               {5.11703110, 7.55168772, 7.14374542, 6.43696356, 4.10621357},
               {5.41270018, 6.85949135, 6.73503923, 5.74601364, 4.46150303},
               {3.16612267,
                4.38248920,
                5.23248482,
                4.21292210,
                2.86031270}}}}});
        // Build the Pad -> Conv chain by hand and run both nodes.
        std::shared_ptr<Node> myConv = Conv<2>(3, 4, {3, 3}, "myconv");
        auto convOp =
            std::static_pointer_cast<OperatorTensor>(myConv->getOperator());
        std::shared_ptr<Node> myPad =
            Pad<2>({1, 1, 1, 1}, "myPad", PadBorderType::Constant, 0.0);
        auto padOp =
            std::static_pointer_cast<OperatorTensor>(myPad->getOperator());
        convOp->setInput(1, myWeights);
        convOp->setInput(2, myBias);
        myPad->addChild(myConv, 0, 0);
        padOp->setInput(0, myInput);
        padOp->setDataType(DataType::Float64);
        padOp->setBackend("cpu");
        convOp->setDataType(DataType::Float64);
        convOp->setBackend("cpu");
        myPad->forward();
        myConv->forward();
        convOp->getOutput(0)->print();
        // Element-wise comparison against the reference, with a 1e-5
        // absolute tolerance.
        double *computedOutput =
            static_cast<double *>(convOp->getOutput(0)->getImpl()->rawPtr());
        double *expectedOutput =
            static_cast<double *>(myOutput->getImpl()->rawPtr());
        for (std::size_t i = 0; i < myOutput->size(); ++i) {
            REQUIRE(std::abs(computedOutput[i] - expectedOutput[i]) < 1e-5);
        }
        // Construction-only smoke test of the fused meta-operator; the
        // node is intentionally unused beyond this point.
        std::shared_ptr<Node> myPaddedConv =
            PaddedConv(3, 4, {3, 3}, "myPaddedConv", {1, 1}, {1, 1, 1, 1});
    }
    SECTION("LSTM(forward)") {
        // Structural test of the LSTM meta-operator: input categories,
        // micro-graph scheduling, and consumed/produced data accounting.
        auto pop = Pop();
        auto myLSTM = LSTM(32, 64, 0, true, "ltsm");
        auto op =
            std::dynamic_pointer_cast<MetaOperator_Op>(myLSTM->getOperator());
        auto microGraph = op->getMicroGraph();
        microGraph->save("lstm", false, true);
        // LSTM exposes 3 data inputs, 8 weight params, and 8 optional params.
        REQUIRE(myLSTM->nbInputs() == 3 + 8 + 8);
        REQUIRE(myLSTM->inputCategory(0) == InputCategory::Data);
        for (size_t i = 1; i < 9; ++i) {
            REQUIRE(myLSTM->inputCategory(i) == InputCategory::Param);
        }
        for (size_t i = 9; i < 17; ++i) {
            REQUIRE(myLSTM->inputCategory(i) == InputCategory::OptionalParam);
        }
        REQUIRE(myLSTM->nbOutputs() == 2);
        // Zero-initialized tensors; only shapes matter for this test.
        std::shared_ptr<Tensor> myInput =
            std::make_shared<Tensor>(Array2D<float, 16, 32>{});
        std::shared_ptr<Tensor> myInit =
            std::make_shared<Tensor>(Array2D<float, 32, 64>{});
        std::shared_ptr<Tensor> myInitW =
            std::make_shared<Tensor>(Array2D<float, 64, 32>{});
        std::shared_ptr<Tensor> myInitR =
            std::make_shared<Tensor>(Array2D<float, 64, 64>{});
        pop->addChild(myLSTM, 0, 0);
        pop->getOperator()->associateInput(0, myInput);
        // Inputs 17/18: initial hidden and cell states.
        op->associateInput(17, myInit);
        op->associateInput(18, myInit);
        // Weights X
        myLSTM->input(1).first->getOperator()->setOutput(0, myInitW);
        myLSTM->input(2).first->getOperator()->setOutput(0, myInitW);
        myLSTM->input(3).first->getOperator()->setOutput(0, myInitW);
        myLSTM->input(4).first->getOperator()->setOutput(0, myInitW);
        // Weights H
        myLSTM->input(5).first->getOperator()->setOutput(0, myInitR);
        myLSTM->input(6).first->getOperator()->setOutput(0, myInitR);
        myLSTM->input(7).first->getOperator()->setOutput(0, myInitR);
        myLSTM->input(8).first->getOperator()->setOutput(0, myInitR);
        auto g = getConnectedGraphView(myLSTM);
        g->setDataType(DataType::Float32);
        g->setBackend("cpu");
        auto scheduler = SequentialScheduler(g);
        scheduler.forward(true);
        g->save("lstm_outside_dims", true, true);
        microGraph->save("lstm_dims", true, true);
        REQUIRE(op->dimsForwarded());
        auto microGraphScheduler =
            std::dynamic_pointer_cast<MetaOperator_Op>(op)
                ->getMicroGraphScheduler();
        microGraphScheduler->saveSchedulingDiagram("lstm_scheduling");
        // Exact data-accounting checks (sizes in number of elements).
        REQUIRE(op->getNbConsumedData(0).data == 512);
        REQUIRE(op->getNbConsumedData(1).data == 32768);
        REQUIRE(op->getNbProducedData(0).data == 34816);
        REQUIRE(op->getNbProducedData(1).data == 34816);
        // Static scheduling: first step runs the full micro-graph (26 nodes),
        // subsequent steps skip the producers (24 nodes).
        REQUIRE(microGraphScheduler->getStaticScheduling(0).size() == 26);
        REQUIRE(microGraphScheduler->getStaticScheduling(1).size() == 24);
        REQUIRE(microGraphScheduler->getStaticScheduling(15).size() == 24);
    }
    SECTION("LSTM(forward_values)") {
        // Numerical test: single LSTM step on small constant weights,
        // checked against a precomputed hidden state.
        auto myLSTM = LSTM(2, 3, 0, true, "ltsm");
        auto op =
            std::static_pointer_cast<OperatorTensor>(myLSTM->getOperator());
        auto microGraph =
            std::dynamic_pointer_cast<MetaOperator_Op>(op)->getMicroGraph();
        microGraph->save("lstm", false, false);
        REQUIRE(myLSTM->nbInputs() == 3 + 8 + 8);
        REQUIRE(myLSTM->inputCategory(0) == InputCategory::Data);
        for (size_t i = 1; i < 9; ++i) {
            REQUIRE(myLSTM->inputCategory(i) == InputCategory::Param);
        }
        for (size_t i = 9; i < 17; ++i) {
            REQUIRE(myLSTM->inputCategory(i) == InputCategory::OptionalParam);
        }
        REQUIRE(myLSTM->nbOutputs() == 2);
        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(
            Array2D<float, 3, 2>{{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}}});
        // Zero initial hidden/cell state; all weights set to 0.1.
        std::shared_ptr<Tensor> myInit =
            std::make_shared<Tensor>(Array2D<float, 3, 3>{
                {{0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}}});
        std::shared_ptr<Tensor> myInitW = std::make_shared<Tensor>(
            Array2D<float, 3, 2>{{{0.1, 0.1}, {0.1, 0.1}, {0.1, 0.1}}});
        std::shared_ptr<Tensor> myInitR =
            std::make_shared<Tensor>(Array2D<float, 3, 3>{
                {{0.1, 0.1, 0.1}, {0.1, 0.1, 0.1}, {0.1, 0.1, 0.1}}});
        op->associateInput(0, myInput);
        op->associateInput(17, myInit);
        op->associateInput(18, myInit);
        // Weights X
        myLSTM->input(1).first->getOperator()->setOutput(0, myInitW);
        myLSTM->input(2).first->getOperator()->setOutput(0, myInitW);
        myLSTM->input(3).first->getOperator()->setOutput(0, myInitW);
        myLSTM->input(4).first->getOperator()->setOutput(0, myInitW);
        // Weights H
        myLSTM->input(5).first->getOperator()->setOutput(0, myInitR);
        myLSTM->input(6).first->getOperator()->setOutput(0, myInitR);
        myLSTM->input(7).first->getOperator()->setOutput(0, myInitR);
        myLSTM->input(8).first->getOperator()->setOutput(0, myInitR);
        auto g = getConnectedGraphView(myLSTM);
        g->setDataType(DataType::Float32);
        g->setBackend("cpu");
        auto scheduler = SequentialScheduler(g);
        scheduler.forward();
        microGraph->save("lstm_values_dims", false, true);
        // Reference hidden state (computed offline with PyTorch).
        std::shared_ptr<Tensor> myHiddenState = std::make_shared<Tensor>(
            Array2D<float, 3, 3>{{{0.0952412, 0.0952412, 0.0952412},
                                  {0.25606447, 0.25606447, 0.25606447},
                                  {0.40323776, 0.40323776, 0.40323776}}});
        auto microGraphScheduler =
            std::dynamic_pointer_cast<MetaOperator_Op>(op)
                ->getMicroGraphScheduler();
        microGraphScheduler->saveSchedulingDiagram("lstm_values_scheduling");
        op->getOutput(0)->print();
        myHiddenState->print();
        REQUIRE(approxEq<float>(*(op->getOutput(0)), *myHiddenState));
    }
    SECTION("LSTM(forward_values_seq)") {
        // Two-timestep sequence: a Pop node feeds the LSTM one timestep at a
        // time; the final hidden state is checked against a reference value.
        auto pop = Pop();
        auto myLSTM = LSTM(2, 3, 2, true, "ltsm");
        auto myGraph = Sequential({pop, myLSTM});
        auto op =
            std::static_pointer_cast<OperatorTensor>(myLSTM->getOperator());
        REQUIRE(myLSTM->nbInputs() == 3 + 8 + 8);
        REQUIRE(myLSTM->inputCategory(0) == InputCategory::Data);
        for (size_t i = 1; i < 9; ++i) {
            REQUIRE(myLSTM->inputCategory(i) == InputCategory::Param);
        }
        for (size_t i = 9; i < 17; ++i) {
            REQUIRE(myLSTM->inputCategory(i) == InputCategory::OptionalParam);
        }
        REQUIRE(myLSTM->nbOutputs() == 2);
        // Input shape: (timesteps=2, batch=3, features=2).
        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(
            Array3D<float, 2, 3, 2>{{{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}},
                                     {{2.0, 3.0}, {4.0, 5.0}, {6.0, 7.0}}}});
        std::shared_ptr<Tensor> myInit =
            std::make_shared<Tensor>(Array2D<float, 3, 3>{
                {{0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}}});
        std::shared_ptr<Tensor> myInitW = std::make_shared<Tensor>(
            Array2D<float, 3, 2>{{{0.1, 0.1}, {0.1, 0.1}, {0.1, 0.1}}});
        std::shared_ptr<Tensor> myInitR =
            std::make_shared<Tensor>(Array2D<float, 3, 3>{
                {{0.1, 0.1, 0.1}, {0.1, 0.1, 0.1}, {0.1, 0.1, 0.1}}});
        pop->getOperator()->associateInput(0, myInput);
        op->associateInput(17, myInit);
        op->associateInput(18, myInit);
        // Weights X
        myLSTM->input(1).first->getOperator()->setOutput(0, myInitW);
        myLSTM->input(2).first->getOperator()->setOutput(0, myInitW);
        myLSTM->input(3).first->getOperator()->setOutput(0, myInitW);
        myLSTM->input(4).first->getOperator()->setOutput(0, myInitW);
        // Weights H
        myLSTM->input(5).first->getOperator()->setOutput(0, myInitR);
        myLSTM->input(6).first->getOperator()->setOutput(0, myInitR);
        myLSTM->input(7).first->getOperator()->setOutput(0, myInitR);
        myLSTM->input(8).first->getOperator()->setOutput(0, myInitR);
        auto g = getConnectedGraphView(myLSTM);
        g->compile("cpu", DataType::Float32);
        g->save("lstm_seq", true, true);
        auto scheduler = SequentialScheduler(g);
        scheduler.forward();
        scheduler.saveSchedulingDiagram("lstm_seq_schedule");
        // Reference hidden state after the two timesteps.
        std::shared_ptr<Tensor> myHiddenState = std::make_shared<Tensor>(
            Array2D<float, 3, 3>{{{0.24439372, 0.24439372, 0.24439372},
                                  {0.49801484, 0.49801484, 0.49801484},
                                  {0.67162132, 0.67162132, 0.67162132}}});
        myGraph->save("lstm_seq_mygraph", true, true);
        op->getOutput(0)->print();
        myHiddenState->print();
        REQUIRE(approxEq<float>(*(op->getOutput(0)), *myHiddenState));
    }
    SECTION("LSTM(forward_values_seq_flatten)(sequential)") {
        auto pop = Pop();
        auto myLSTM = LSTM(2, 3, 2, true, "ltsm");
        auto op =
            std::static_pointer_cast<MetaOperator_Op>(myLSTM->getOperator());
        // Here we test the LSTM as if it had been flattened in the graph:
        // we borrow its micro-graph into our larger myGraph graph and wire
        // producers directly onto the micro-graph's ordered inputs.
        auto myGraph = std::make_shared<GraphView>();
        pop->addChild(op->getMicroGraph()->getOrderedInputs()[0].first, 0, 0);
        myGraph->add(op->getMicroGraph());
        myGraph->add(pop);
        REQUIRE(myLSTM->nbInputs() == 3 + 8 + 8);
        REQUIRE(myLSTM->inputCategory(0) == InputCategory::Data);
        for (size_t i = 1; i < 9; ++i) {
            REQUIRE(myLSTM->inputCategory(i) == InputCategory::Param);
        }
        for (size_t i = 9; i < 17; ++i) {
            REQUIRE(myLSTM->inputCategory(i) == InputCategory::OptionalParam);
        }
        REQUIRE(myLSTM->nbOutputs() == 2);
        // Same data and expected result as the non-flattened seq test.
        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(
            Array3D<float, 2, 3, 2>{{{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}},
                                     {{2.0, 3.0}, {4.0, 5.0}, {6.0, 7.0}}}});
        std::shared_ptr<Tensor> myInit =
            std::make_shared<Tensor>(Array2D<float, 3, 3>{
                {{0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}}});
        std::shared_ptr<Tensor> myInitW = std::make_shared<Tensor>(
            Array2D<float, 3, 2>{{{0.1, 0.1}, {0.1, 0.1}, {0.1, 0.1}}});
        std::shared_ptr<Tensor> myInitR =
            std::make_shared<Tensor>(Array2D<float, 3, 3>{
                {{0.1, 0.1, 0.1}, {0.1, 0.1, 0.1}, {0.1, 0.1, 0.1}}});
        pop->getOperator()->associateInput(0, myInput);
        op->associateInput(17, myInit);
        op->associateInput(18, myInit);
        // Weights X: a single producer feeds micro-graph inputs 1..4.
        auto prodX = Producer(myInitW);
        prodX->addChild(op->getMicroGraph()->getOrderedInputs()[1].first,
                        0,
                        1);
        prodX->addChild(op->getMicroGraph()->getOrderedInputs()[2].first,
                        0,
                        1);
        prodX->addChild(op->getMicroGraph()->getOrderedInputs()[3].first,
                        0,
                        1);
        prodX->addChild(op->getMicroGraph()->getOrderedInputs()[4].first,
                        0,
                        1);
        // Weights H: a single producer feeds micro-graph inputs 5..8.
        auto prodH = Producer(myInitR);
        prodH->addChild(op->getMicroGraph()->getOrderedInputs()[5].first,
                        0,
                        1);
        prodH->addChild(op->getMicroGraph()->getOrderedInputs()[6].first,
                        0,
                        1);
        prodH->addChild(op->getMicroGraph()->getOrderedInputs()[7].first,
                        0,
                        1);
        prodH->addChild(op->getMicroGraph()->getOrderedInputs()[8].first,
                        0,
                        1);
        myGraph->add({prodX, prodH});
        myGraph->setDataType(DataType::Float32);
        myGraph->setBackend("cpu");
        myGraph->save("lstm_seq_flatten", true, true);
        std::shared_ptr<Tensor> myHiddenState = std::make_shared<Tensor>(
            Array2D<float, 3, 3>{{{0.24439372, 0.24439372, 0.24439372},
                                  {0.49801484, 0.49801484, 0.49801484},
                                  {0.67162132, 0.67162132, 0.67162132}}});
        auto scheduler = SequentialScheduler(myGraph);
        scheduler.generateScheduling();
        scheduler.saveStaticSchedulingDiagram("lstm_static_schedule");
        scheduler.forward(true);
        scheduler.saveSchedulingDiagram("lstm_seq_flatten_schedule_seq");
        op->getOutput(0)->print();
        myHiddenState->print();
        REQUIRE(approxEq<float>(*(op->getOutput(0)), *myHiddenState));
    }
    SECTION("LSTM(forward_values_seq_flatten)(parallel)") {
        auto pop = Pop();
        auto myLSTM = LSTM(2, 3, 2, true, "ltsm");
        auto op =
            std::static_pointer_cast<MetaOperator_Op>(myLSTM->getOperator());
        // Same flattened-LSTM setup as the sequential variant above, but
        // executed with the ParallelScheduler to check result parity.
        auto myGraph = std::make_shared<GraphView>();
        pop->addChild(op->getMicroGraph()->getOrderedInputs()[0].first, 0, 0);
        myGraph->add(op->getMicroGraph());
        myGraph->add(pop);
        REQUIRE(myLSTM->nbInputs() == 3 + 8 + 8);
        REQUIRE(myLSTM->inputCategory(0) == InputCategory::Data);
        for (size_t i = 1; i < 9; ++i) {
            REQUIRE(myLSTM->inputCategory(i) == InputCategory::Param);
        }
        for (size_t i = 9; i < 17; ++i) {
            REQUIRE(myLSTM->inputCategory(i) == InputCategory::OptionalParam);
        }
        REQUIRE(myLSTM->nbOutputs() == 2);
        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(
            Array3D<float, 2, 3, 2>{{{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}},
                                     {{2.0, 3.0}, {4.0, 5.0}, {6.0, 7.0}}}});
        std::shared_ptr<Tensor> myInit =
            std::make_shared<Tensor>(Array2D<float, 3, 3>{
                {{0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}}});
        std::shared_ptr<Tensor> myInitW = std::make_shared<Tensor>(
            Array2D<float, 3, 2>{{{0.1, 0.1}, {0.1, 0.1}, {0.1, 0.1}}});
        std::shared_ptr<Tensor> myInitR =
            std::make_shared<Tensor>(Array2D<float, 3, 3>{
                {{0.1, 0.1, 0.1}, {0.1, 0.1, 0.1}, {0.1, 0.1, 0.1}}});
        pop->getOperator()->associateInput(0, myInput);
        op->associateInput(17, myInit);
        op->associateInput(18, myInit);
        // Weights X: one producer feeds micro-graph inputs 1..4.
        auto prodX = Producer(myInitW);
        prodX->addChild(op->getMicroGraph()->getOrderedInputs()[1].first,
                        0,
                        1);
        prodX->addChild(op->getMicroGraph()->getOrderedInputs()[2].first,
                        0,
                        1);
        prodX->addChild(op->getMicroGraph()->getOrderedInputs()[3].first,
                        0,
                        1);
        prodX->addChild(op->getMicroGraph()->getOrderedInputs()[4].first,
                        0,
                        1);
        // Weights H: one producer feeds micro-graph inputs 5..8.
        auto prodH = Producer(myInitR)
;
        prodH->addChild(op->getMicroGraph()->getOrderedInputs()[5].first,
                        0,
                        1);
        prodH->addChild(op->getMicroGraph()->getOrderedInputs()[6].first,
                        0,
                        1);
        prodH->addChild(op->getMicroGraph()->getOrderedInputs()[7].first,
                        0,
                        1);
        prodH->addChild(op->getMicroGraph()->getOrderedInputs()[8].first,
                        0,
                        1);
        myGraph->add({prodX, prodH});
        myGraph->setDataType(DataType::Float32);
        myGraph->setBackend("cpu");
        myGraph->save("lstm_seq_flatten", true, true);
        std::shared_ptr<Tensor> myHiddenState = std::make_shared<Tensor>(
            Array2D<float, 3, 3>{{{0.24439372, 0.24439372, 0.24439372},
                                  {0.49801484, 0.49801484, 0.49801484},
                                  {0.67162132, 0.67162132, 0.67162132}}});
        auto scheduler = ParallelScheduler(myGraph);
        scheduler.generateScheduling();
        scheduler.forward(true);
        scheduler.saveSchedulingDiagram("lstm_seq_flatten_schedule_par");
        op->getOutput(0)->print();
        myHiddenState->print();
        REQUIRE(approxEq<float>(*(op->getOutput(0)), *myHiddenState));
    }
    SECTION("Leaky(forward)(fixed)") {
        // Fixed-value test of a small SNN pipeline: FC -> Leaky(LIF) -> FC,
        // run for two timesteps and compared against values computed with
        // py/snnTorch (see comments near the expected tensors).
        constexpr auto inChannels = 10;
        constexpr auto outChannels = 5;
        constexpr auto beta = 0.95;
        constexpr auto threshold = 1.0;
        constexpr auto nbTimeSteps = 2;
        auto myWeights =
            std::make_shared<Tensor>(Array2D<float, outChannels, inChannels>{{
                {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0},
                {1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1},
                {0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 0.1, 0.2, 0.3, 0.4},
                {0.4, 0.3, 0.2, 0.1, 0.0, 0.1, 0.2, 0.3, 0.4, 0.5},
                {0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0.0},
            }});
        auto myWeights2 =
            std::make_shared<Tensor>(Array2D<float, inChannels, outChannels>{{
                {0.1, 0.2, 0.3, 0.4, 0.5},
                {0.6, 0.7, 0.8, 0.9, 1.0},
                {1.0, 0.9, 0.8, 0.7, 0.6},
                {0.5, 0.4, 0.3, 0.2, 0.1},
                {0.5, 0.6, 0.7, 0.8, 0.9},
                {1.0, 0.1, 0.2, 0.3, 0.4},
                {0.4, 0.3, 0.2, 0.1, 0.0},
                {0.1, 0.2, 0.3, 0.4, 0.5},
                {0.9, 0.8, 0.7, 0.6, 0.5},
                {0.4, 0.3, 0.2, 0.1, 0.0},
            }});
        auto myInput = std::make_shared<Tensor>(Array2D<float, 2, 10>{{
            {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0},
            {1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1},
        }});
        // py/snn Torch computed result, output of fc1 at time step 1
        auto expectedOutputlif1ts1 =
            std::make_shared<Tensor>(Array2D<float, 2, 5>{{
                {3.850, 2.2000, 2.6500, 1.5000, 1.6500},
                {2.200, 3.8500, 3.4000, 1.2500, 3.3000},
            }});
        auto expectedOutputfc2ts1 =
            std::make_shared<Tensor>(Array2D<float, 2, 10>{{
                {1.5000,
                 4.0000,
                 4.0000,
                 1.5000,
                 3.5000,
                 2.0000,
                 1.0000,
                 1.5000,
                 3.5000,
                 1.0000},
                {1.5000,
                 4.0000,
                 4.0000,
                 1.5000,
                 3.5000,
                 2.0000,
                 1.0000,
                 1.5000,
                 3.5000,
                 1.0000},
            }});
        auto expectedOutputlif1ts2 =
            std::make_shared<Tensor>(Array2D<float, 2, 5>{{
                {6.5075, 3.2900, 4.1675, 1.9250, 2.2175},
                {3.2900, 6.5075, 5.6300, 1.4375, 5.4350},
            }});
        // NOTE: Same output as before, because for all channels, we have a
        // potential higher than threshold. Thus the lif neuron fires at every
        // timestep for every channel.
        auto expectedOutputfc2ts2 =
            std::make_shared<Tensor>(Array2D<float, 2, 10>{{
                {1.5000,
                 4.0000,
                 4.0000,
                 1.5000,
                 3.5000,
                 2.0000,
                 1.0000,
                 1.5000,
                 3.5000,
                 1.0000},
                {1.5000,
                 4.0000,
                 4.0000,
                 1.5000,
                 3.5000,
                 2.0000,
                 1.0000,
                 1.5000,
                 3.5000,
                 1.0000},
            }});
        // Zero-filled initial membrane/spike state.
        auto init = std::make_shared<Tensor>(Array2D<float, 2, 5>{});
        uniformFiller<float>(init, 0.0, 0.0);
        auto fc1 = FC(inChannels, outChannels, true, "myfc");
        auto fc2 = FC(outChannels, inChannels, true, "fc2");
        // NOTE: Account for init step by adding 1 to the max timestep
        // parameter.
        auto lif1 = Leaky(nbTimeSteps + 1, beta, threshold, "leaky");
        // associateInput() does not work
        fc1->input(1).first->getOperator()->setOutput(0, myWeights);
        fc2->input(1).first->getOperator()->setOutput(0, myWeights2);
        auto fc1Op =
            std::static_pointer_cast<OperatorTensor>(fc1->getOperator());
        auto lif1Op =
            std::static_pointer_cast<MetaOperator_Op>(lif1->getOperator());
        auto fc2Op =
            std::static_pointer_cast<OperatorTensor>(fc2->getOperator());
        fc1Op->associateInput(0, myInput);
        lif1Op->associateInput(1, init);
        lif1Op->associateInput(2, init);
        fc1->addChild(lif1, 0, 0);
        // Leaky output 1 (spikes) feeds fc2.
        lif1->addChild(fc2, 1, 0);
        auto g = std::make_shared<GraphView>();
        g->add({fc1, lif1, fc2});
        g->compile("cpu", DataType::Float32);
        auto scheduler = SequentialScheduler(g);
        // Forward 1 (simulate timestep 0)
        scheduler.forward(true);
        REQUIRE(approxEq<float>(*(lif1Op->getOutput(0)),
                                *(expectedOutputlif1ts1)));
        REQUIRE(
            approxEq<float>(*(fc2Op->getOutput(0)), *(expectedOutputfc2ts1)));
        // Forward 1 (simulate timestep 1)
        scheduler.forward(true);
        REQUIRE(approxEq<float>(*(lif1Op->getOutput(0)),
                                *(expectedOutputlif1ts2)));
        REQUIRE(
            approxEq<float>(*(fc2Op->getOutput(0)), *(expectedOutputfc2ts2)));
    }
SECTION("Leaky(forward)") {
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_real_distribution<float> valueDist(
0.1f,
1.1f); // Random float distribution between 0 and 1
std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2),
std::size_t(4));
std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(3),
std::size_t(3));
std::uniform_int_distribution<int> boolDist(0, 1);
std::uniform_real_distribution<float> betaDist(0,1);
const std::size_t nbDims = nbDimsDist(gen);
Log::info("Nbdims : {}", nbDims);
std::vector<std::size_t> dims;
for (std::size_t i = 0; i < nbDims; ++i) {
dims.push_back(dimSizeDist(gen));
}
Log::info("timesteps : {}", dims[0]);
Log::info("dimensions : ");
for (auto dim : dims) {
Log::info("{}", dim);
}
const auto nbTimeSteps = dims[0];
const auto beta = betaDist(gen);
auto myLeaky = Leaky(nbTimeSteps, beta, 1.0, "leaky");
auto op =
std::static_pointer_cast<MetaOperator_Op>(myLeaky->getOperator());
// auto stack = Stack(2);
auto mem_rec = Stack(nbTimeSteps, "mem_rec");
auto spk_rec = Stack(nbTimeSteps, "spk_rec");
auto pop = Pop("popinput");
// Here we test LSTM as it is was flatten in the graph.
// We just borrow its micro-graph into our larger myGraph graph.
auto myGraph = std::make_shared<GraphView>();
pop->addChild(op->getMicroGraph()->getOrderedInputs()[0].first, 0, 0);
// 0 for mem 1 for stack
op->getMicroGraph()->getOrderedOutputs()[1].first->addChild(mem_rec,
0,
0);
op->getMicroGraph()->getOrderedOutputs()[0].first->addChild(spk_rec,
0,
0);
for (auto node : op->getMicroGraph()->getOrderedOutputs()) {
Log::info("name of output {}", node.first->name());
}
myGraph->add(pop);
myGraph->add(op->getMicroGraph());
myGraph->add(mem_rec);
myGraph->add(spk_rec);
myGraph->save("mg", true, true);
// 3 outputs
REQUIRE(myLeaky->nbInputs() == 3);
REQUIRE(myLeaky->inputCategory(0) == InputCategory::Data);
// Two spikes connected to nothing, + the Add node real output
REQUIRE(myLeaky->nbOutputs() == 4);
std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(
Array3D<float, 2, 3, 2>{{{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}},
{{2.0, 3.0}, {4.0, 5.0}, {6.0, 7.0}}}});
// std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(
// Array3D<float, 2, 3, 2>{{{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}},
// {{2.0, 3.0}, {4.0, 5.0},
// {6.0, 7.0}}}});
// Generate input
std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>();
T0->setDataType(DataType::Float32);
T0->setBackend("cpu");
std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>();
expectedOutput->setDataType(DataType::Float32);
expectedOutput->setBackend("cpu");
const auto nb_elements =
std::accumulate(dims.cbegin(),
dims.cend(),
std::size_t(1),
std::multiplies<std::size_t>());
float *input = new float[nb_elements];
float *result = new float[nb_elements];
for (std::size_t i = 0; i < nb_elements; ++i) {
input[i] = valueDist(gen);
}
T0->resize(dims);
T0->getImpl()->setRawPtr(input, nb_elements);
T0->print();
// Elements popped at each time step
auto nbElementsPerTimeStep = nb_elements / dims[0];
// Init
for (int i = 0; i < nbElementsPerTimeStep; ++i) {
result[i] = input[i];
}
// Reccurence
for (int i = 1; i < dims[0]; ++i) {
auto offset = nbElementsPerTimeStep * i;
auto prev = nbElementsPerTimeStep * (i - 1);
for (int j = 0; j < nbElementsPerTimeStep; ++j) {
auto reset = (result[prev + j] > 1.0 ? 1 : 0);
result[offset + j] =
result[prev + j] * beta + input[offset + j] - reset;
}
}
expectedOutput->resize(dims);
expectedOutput->getImpl()->setRawPtr(result, nb_elements);
Log::info("Expected ouptut : ");
expectedOutput->print();
std::shared_ptr<Tensor> myInit =
std::make_shared<Tensor>(Array2D<float, 3, 3>{
{{0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}}});
auto initMemdims =
std::vector<std::size_t>(dims.begin() + 1, dims.end());
Log::info("dimensions : ");
for (auto dim : initMemdims) {
Log::info("{}", dim);
}
std::shared_ptr<Tensor> myInitW = std::make_shared<Tensor>(
Array2D<float, 3, 2>{{{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}});
std::shared_ptr<Tensor> myInitR =
std::make_shared<Tensor>(initMemdims);
myInitR->setDataType(DataType::Float32);
myInitR->setBackend("cpu");
uniformFiller<float>(myInitR, 0, 0);
pop->getOperator()->associateInput(0, T0);
op->associateInput(1, myInitR);
op->associateInput(2, myInitR);
myGraph->compile("cpu", DataType::Float32);
auto scheduler = SequentialScheduler(myGraph);
REQUIRE_NOTHROW(scheduler.generateScheduling());
REQUIRE_NOTHROW(scheduler.forward(true));
auto memOp =
std::static_pointer_cast<OperatorTensor>(spk_rec->getOperator());
REQUIRE(approxEq<float>(*(memOp->getOutput(0)), *(expectedOutput)));
}
}
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <chrono>
#include <cstddef> // std::size_t
#include <cstdint> // std::uint16_t
#include <memory>
#include <numeric> // std::accumulate
#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution,
// std::uniform_int_distribution
#include <catch2/catch_test_macros.hpp>
#include "aidge/backend/cpu/data/TensorImpl.hpp"
#include "aidge/backend/cpu/operator/MulImpl.hpp"
#include "aidge/data/DataType.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/Mul.hpp"
#include "aidge/utils/ArrayHelpers.hpp"
#include "aidge/utils/Log.hpp"
#include "aidge/utils/TensorUtils.hpp"
namespace Aidge {
TEST_CASE("[CPU/Operator] Mul(Backward)", "[Mul][CPU][Backward]") {
std::shared_ptr<Mul_Op> op = std::make_shared<Mul_Op>();
op->setDataType(DataType::Float32);
op->setBackend("cpu");
// NOTE: The first four tests use fixed values, the last one uses random values but static dimensions.
    SECTION("Case 1: 1D and 2D Tensors") {
        // With an all-ones output gradient:
        //   grad(T0) = T1 broadcast over rows; grad(T1) = column sums of T0.
        const auto T0 = std::make_shared<Tensor>(
            Array2D<cpptype_t<DataType::Float32>, 2, 3>({{{1, 2, 3}, {4, 5, 6}}}));
        const auto T1 =
            std::make_shared<Tensor>(Array1D<cpptype_t<DataType::Float32>, 3>({0.1, 0.2, 0.3}));
        op->associateInput(0, T0);
        op->associateInput(1, T1);
        op->getOutput(0)->setGrad(std::make_shared<Tensor>(
            Array2D<float, 2, 3>({{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}})));
        op->forwardDims();
        op->backward();
        const Tensor expectedGrad0 =
            Array2D<cpptype_t<DataType::Float32>, 2, 3>({{{0.1, 0.2, 0.3}, {0.1, 0.2, 0.3}}});
        const Tensor expectedGrad1 = Array1D<cpptype_t<DataType::Float32>, 3>({5, 7, 9});
        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1));
    }
    SECTION("Case 2: 3D and 1D tensors") {
        // Broadcasting over the last axis: grad(T0) repeats T1, and grad(T1)
        // reduces T0 over the two leading axes.
        const auto T0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
            {{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}},
              {{7.0, 8.0, 9.0}, {10.0, 11.0, 12.0}}}}));
        const auto T1 =
            std::make_shared<Tensor>(Array1D<float, 3>({0.3, 0.2, 0.1}));
        const auto newGrad = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
            {{{{1, 1, 1}, {1, 1, 1}}, {{1, 1, 1}, {1, 1, 1}}}}));
        const Tensor expectedGrad0 =
            Array3D<float, 2, 2, 3>({{{{0.3, 0.2, 0.1}, {0.3, 0.2, 0.1}},
                                      {{0.3, 0.2, 0.1}, {0.3, 0.2, 0.1}}}});
        const Tensor expectedGrad1 = Array1D<cpptype_t<DataType::Float32>, 3>({22.0, 26.0, 30.0});
        op->associateInput(0, T0);
        op->associateInput(1, T1);
        op->getOutput(0)->setGrad(newGrad);
        op->forwardDims();
        op->backward();
        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1));
    }
    SECTION("Case 3: 4D and 2D tensors") {
        // Broadcasting over the two trailing axes: grad(T0) repeats T1 per
        // (N, C) slice, and grad(T1) sums T0 across all four slices.
        const auto T0 = std::make_shared<Tensor>(Array4D<cpptype_t<DataType::Float32>, 2, 2, 3, 3>(
            {{{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}, {7.0, 8.0, 9.0}},
               {{10.0, 11.0, 12.0}, {13.0, 14.0, 15.0}, {16.0, 17.0, 18.0}}},
              {{{19.0, 20.0, 21.0}, {22.0, 23.0, 24.0}, {25.0, 26.0, 27.0}},
               {{28.0, 29.0, 30.0},
                {31.0, 32.0, 33.0},
                {34.0, 35.0, 36.0}}}}}));
        const auto T1 = std::make_shared<Tensor>(Array2D<cpptype_t<DataType::Float32>, 3, 3>(
            {{{0.5, 0.3, 0.1}, {0.4, 0.2, 0.6}, {0.7, 0.8, 0.9}}}));
        const auto newGrad =
            std::make_shared<Tensor>(Array4D<cpptype_t<DataType::Float32>, 2, 2, 3, 3>(
                {{{{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}},
                   {{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}},
                  {{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}},
                   {{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}}}}));
        const Tensor expectedGrad0 =
            Array4D<cpptype_t<DataType::Float32>, 2, 2, 3, 3>(
                {{{{{0.5, 0.3, 0.1}, {0.4, 0.2, 0.6}, {0.7, 0.8, 0.9}},
                   {{0.5, 0.3, 0.1}, {0.4, 0.2, 0.6}, {0.7, 0.8, 0.9}}},
                  {{{0.5, 0.3, 0.1}, {0.4, 0.2, 0.6}, {0.7, 0.8, 0.9}},
                   {{0.5, 0.3, 0.1}, {0.4, 0.2, 0.6}, {0.7, 0.8, 0.9}}}}});
        const Tensor expectedGrad1 =
            Array2D<cpptype_t<DataType::Float32>, 3, 3>({{{58.0, 62.0, 66.0},
                                                          {70.0, 74.0, 78.0},
                                                          {82.0, 86.0, 90.0}}});
        op->associateInput(0, T0);
        op->associateInput(1, T1);
        op->getOutput(0)->setGrad(newGrad);
        op->forwardDims();
        op->backward();
        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1));
    }
    SECTION("Case 4: 3D and 2D tensors") {
        // A 2x3x4 input multiplied by a 3x4 tensor broadcast over the leading
        // axis. With an all-ones upstream gradient:
        //   grad(input0) = input1 broadcast over axis 0
        //   grad(input1) = input0 summed over axis 0 (e.g. 1+13 = 14).
        const auto T0 = std::make_shared<Tensor>(
            Array3D<float, 2, 3, 4>({{{
                                          {1.0, 2.0, 3.0, 4.0},
                                          {5.0, 6.0, 7.0, 8.0},
                                          {9.0, 10.0, 11.0, 12.0},
                                      },
                                      {
                                          {13.0, 14.0, 15.0, 16.0},
                                          {17.0, 18.0, 19.0, 20.0},
                                          {21.0, 22.0, 23.0, 24.0},
                                      }}}));
        const auto T1 = std::make_shared<Tensor>(
            Array2D<cpptype_t<DataType::Float32>, 3, 4>({{{0.1, 0.2, 0.3, 0.4},
                                                          {0.5, 0.6, 0.7, 0.8},
                                                          {0.9, 1.0, 1.1, 1.2}}}));
        const auto newGrad = std::make_shared<Tensor>(
            Array3D<cpptype_t<DataType::Float32>, 2, 3, 4>({{{
                                                                 {1.0, 1.0, 1.0, 1.0},
                                                                 {1.0, 1.0, 1.0, 1.0},
                                                                 {1.0, 1.0, 1.0, 1.0},
                                                             },
                                                             {
                                                                 {1.0, 1.0, 1.0, 1.0},
                                                                 {1.0, 1.0, 1.0, 1.0},
                                                                 {1.0, 1.0, 1.0, 1.0},
                                                             }}}));
        const Tensor expectedGrad0 =
            Array3D<cpptype_t<DataType::Float32>, 2, 3, 4>({{{{0.1, 0.2, 0.3, 0.4},
                                                              {0.5, 0.6, 0.7, 0.8},
                                                              {0.9, 1.0, 1.1, 1.2}},
                                                             {{0.1, 0.2, 0.3, 0.4},
                                                              {0.5, 0.6, 0.7, 0.8},
                                                              {0.9, 1.0, 1.1, 1.2}}}});
        const Tensor expectedGrad1 =
            Array2D<cpptype_t<DataType::Float32>, 3, 4>({{{14.0, 16.0, 18.0, 20.0},
                                                          {22.0, 24.0, 26.0, 28.0},
                                                          {30.0, 32.0, 34.0, 36.0}}});
        op->associateInput(0, T0);
        op->associateInput(1, T1);
        op->getOutput(0)->setGrad(newGrad);
        op->forwardDims();
        op->backward();
        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1));
    }
    SECTION("Case 5: Tensors with random values") {
        // Use random values
        // Shapes exercise double broadcasting: input0 broadcasts along axis 2
        // (size 1 -> 6) and input1 broadcasts along the missing leading axis.
        const std::vector<std::size_t> dims0 = {5, 2, 1, 7}; // First tensor
        const std::vector<std::size_t> dims1 = {2, 6, 7};    // Second tensor
        const std::vector<std::size_t> outputDims = {5, 2, 6, 7};
        std::random_device rd;
        std::mt19937 gen(rd());
        std::uniform_real_distribution<float> dist(0.1f, 1.0f);
        auto T0 = std::make_shared<Tensor>(dims0);
        T0->setDataType(DataType::Float32);
        T0->setBackend("cpu");
        float* input0Data = static_cast<float*>(T0->getImpl()->rawPtr());
        // Fill with random values
        for (std::size_t i = 0; i < T0->size(); ++i) {
            input0Data[i] = dist(gen);
        }
        auto T1 = std::make_shared<Tensor>(dims1);
        T1->setDataType(DataType::Float32);
        T1->setBackend("cpu");
        float* input1Data = static_cast<float*>(T1->getImpl()->rawPtr());
        // Fill with random values
        for (std::size_t i = 0; i < T1->size(); ++i) {
            input1Data[i] = dist(gen);
        }
        op->associateInput(0, T0);
        op->associateInput(1, T1);
        op->forwardDims();
        op->forward();
        // Reference forward result, computed with explicit index arithmetic
        // mirroring the broadcast rules above.
        Tensor expectedOutput{outputDims};
        expectedOutput.setBackend("cpu");
        float* expectedOutputData = static_cast<float*>(expectedOutput.getImpl()->rawPtr());
        for (std::size_t n = 0; n < 5; ++n) {
            for (std::size_t c = 0; c < 2; ++c) {
                for (std::size_t h = 0; h < 6; ++h) {
                    for (std::size_t w = 0; w < 7; ++w) {
                        std::size_t outIdx = w + 7 * (h + 6 * (c + 2 * n));
                        std::size_t in0Idx =
                            w + 7 * (0 + 1 * (c + 2 * n)); // middle dim is 1
                        std::size_t in1Idx =
                            w + 7 * (h + 6 * c); // no n dimension
                        expectedOutputData[outIdx] = input0Data[in0Idx] * input1Data[in1Idx];
                    }
                }
            }
        }
        auto outputTensor = op->getOutput(0);
        REQUIRE(approxEq<float>(*outputTensor, expectedOutput));
        // Backward pass
        std::vector<float> gradOutputData(expectedOutput.size());
        for (auto &val : gradOutputData) {
            val = dist(gen);
        }
        // NOTE(review): the grad tensor is default-constructed and only
        // resized; this assumes getImpl() is usable without an explicit
        // setBackend()/setDataType() on it — TODO confirm.
        op->getOutput(0)->setGrad(std::make_shared<Tensor>());
        op->getOutput(0)->grad()->resize(outputDims);
        op->getOutput(0)->grad()->getImpl()->setRawPtr(gradOutputData.data(),
                                                       expectedOutput.size());
        // Compute reference gradients
        std::vector<float> expectedGrad0(T0->size(), 0.0f);
        std::vector<float> expectedGrad1(T1->size(), 0.0f);
        for (std::size_t n = 0; n < 5; ++n) {
            for (std::size_t c = 0; c < 2; ++c) {
                for (std::size_t h = 0; h < 6; ++h) {
                    for (std::size_t w = 0; w < 7; ++w) {
                        std::size_t outIdx = w + 7 * (h + 6 * (c + 2 * n));
                        std::size_t in0Idx = w + 7 * (0 + 1 * (c + 2 * n));
                        std::size_t in1Idx = w + 7 * (h + 6 * c);
                        // Gradient for input0: grad_output * input1
                        expectedGrad0[in0Idx] +=
                            gradOutputData[outIdx] * input1Data[in1Idx];
                        // Gradient for input1: grad_output * input0
                        expectedGrad1[in1Idx] +=
                            gradOutputData[outIdx] * input0Data[in0Idx];
                    }
                }
            }
        }
        // Perform backward pass
        op->backward();
        auto expectedGrad0Tensor = std::make_shared<Tensor>();
        expectedGrad0Tensor->resize(T0->dims());
        expectedGrad0Tensor->setBackend("cpu");
        expectedGrad0Tensor->setDataType(DataType::Float32);
        expectedGrad0Tensor->getImpl()->setRawPtr(expectedGrad0.data(),
                                                  expectedGrad0.size());
        auto expectedGrad1Tensor = std::make_shared<Tensor>(T1->dims());
        expectedGrad1Tensor->setBackend("cpu");
        expectedGrad1Tensor->setDataType(DataType::Float32);
        expectedGrad1Tensor->getImpl()->setRawPtr(expectedGrad1.data(),
                                                  expectedGrad1.size());
        // Verify backward pass
        REQUIRE(approxEq<float>(*T0->grad(), *expectedGrad0Tensor));
        REQUIRE(approxEq<float>(*T1->grad(), *expectedGrad1Tensor));
        // Optional: Print some values for verification
        // std::cout << "Input shapes: (" << dims0[0] << "," << dims0[1] <<
        // "," << dims0[2] << "," << dims0[3]
        //           << ") * (" << dims1[0] << "," << dims1[1] << "," <<
        //           dims1[2]
        //           << ") -> (" << outputDims[0] << "," << outputDims[1]
        //           << "," << outputDims[2] << "," << outputDims[3] <<
        //           ")\n";
        // std::cout << "Input sizes: " << input0_size << " * " <<
        // input1_size << " -> " << output_size << "\n";
    }
}
TEST_CASE("[cpu/operator] Mul(forward)", "[Mul][CPU]") {
    constexpr std::uint16_t NBTRIALS = 10;
    // Create a random number generator
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_real_distribution<float> valueDist(
        0.1f,
        1.1f); // Random float distribution between 0.1 and 1.1
    std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2),
                                                           std::size_t(10));
    std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1),
                                                          std::size_t(3));
    std::uniform_int_distribution<int> boolDist(0, 1);
    // Operator under test and its tensors, reused across all trials below.
    std::shared_ptr<Mul_Op> op = std::make_shared<Mul_Op>();
    op->setDataType(DataType::Float32);
    op->setBackend("cpu");
    std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>();
    op->associateInput(0, T0);
    T0->setDataType(DataType::Float32);
    T0->setBackend("cpu");
    std::shared_ptr<Tensor> T1 = std::make_shared<Tensor>();
    op->associateInput(1, T1);
    T1->setDataType(DataType::Float32);
    T1->setBackend("cpu");
    // Tres holds the independently computed reference result.
    std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>();
    Tres->setDataType(DataType::Float32);
    Tres->setBackend("cpu");
    // To measure execution time of 'Mul_Op::forward()' member function call
    std::chrono::time_point<std::chrono::system_clock> start;
    std::chrono::time_point<std::chrono::system_clock> end;
    std::chrono::duration<double, std::micro> duration{};
SECTION("MulImpl_cpu::forward()") {
        SECTION("Scalar / Scalar") {}        // placeholder: scalar support not covered yet
        SECTION("Scalar / +1-D Tensor") {}   // placeholder: scalar support not covered yet
SECTION("+1-D Tensor / +1-D Tensor - same dimensions") {
std::size_t number_of_operation = 0;
for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
// generate 2 random Tensors
const auto nbDims = nbDimsDist(gen);
auto dims = std::vector<std::size_t>{};
for (std::size_t i = 0; i < nbDims; ++i) {
dims.push_back(dimSizeDist(gen));
}
const auto nb_elements =
std::accumulate(dims.cbegin(),
dims.cend(),
std::size_t(1),
std::multiplies<std::size_t>());
number_of_operation += nb_elements;
// without broadcasting
float *array0 = new float[nb_elements];
float *array1 = new float[nb_elements];
float *result = new float[nb_elements];
for (std::size_t i = 0; i < nb_elements; ++i) {
array0[i] = valueDist(gen);
array1[i] = valueDist(gen);
result[i] = array0[i] * array1[i];
}
// input0
T0->resize(dims);
T0->getImpl()->setRawPtr(array0, nb_elements);
// input1
T1->resize(dims);
T1->getImpl()->setRawPtr(array1, nb_elements);
// results
Tres->resize(dims);
Tres->getImpl()->setRawPtr(result, nb_elements);
op->forwardDims();
start = std::chrono::system_clock::now();
op->forward();
end = std::chrono::system_clock::now();
duration +=
std::chrono::duration_cast<std::chrono::microseconds>(
end - start);
REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
delete[] array0;
delete[] array1;
delete[] result;
}
Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
Log::info("total time: {}μs\n", duration.count());
}
SECTION("+1-D Tensor / +1-D Tensor - broadcasting") {
std::size_t number_of_operation = 0;
for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
// generate 2 random Tensors
// handle dimensions, replace some dimensions with '1' to get
// broadcasting
constexpr std::size_t nbDims = 4;
std::vector<std::size_t> dimensions;
for (std::size_t i = 0; i < nbDims; ++i) {
dimensions.push_back(dimSizeDist(gen));
}
auto dims0 = dimensions;
auto dims1 = dimensions;
auto dimsOut = dimensions;
for (std::size_t i = 0; i < nbDims; ++i) {
if (boolDist(gen)) {
dims0[i] = 1;
}
if (boolDist(gen)) {
dims1[i] = 1;
}
dimsOut[i] = (dims0[i] == 1) ? dims1[i] : dims0[i];
}
for (auto dim : dims0) {
Log::info("Dimension of input 0 : {}", dim);
}
for (auto dim : dims1) {
Log::info("Dimension of input 1 : {}", dim);
}
// create arrays and fill them with random values
float *array0 =
new float[dims0[0] * dims0[1] * dims0[2] * dims0[3]];
float *array1 =
new float[dims1[0] * dims1[1] * dims1[2] * dims1[3]];
float *result = new float[dimsOut[0] * dimsOut[1] *
dimsOut[2] * dimsOut[3]];
for (std::size_t i = 0;
i < dims0[0] * dims0[1] * dims0[2] * dims0[3];
++i) {
array0[i] = valueDist(gen);
}
for (std::size_t i = 0;
i < dims1[0] * dims1[1] * dims1[2] * dims1[3];
++i) {
array1[i] = valueDist(gen);
}
// compute true result
const std::size_t strides0[nbDims] = {
dims0[1] * dims0[2] * dims0[3],
dims0[2] * dims0[3],
dims0[3],
1};
const std::size_t strides1[nbDims] = {
dims1[1] * dims1[2] * dims1[3],
dims1[2] * dims1[3],
dims1[3],
1};
for (std::size_t a = 0; a < dimsOut[0]; ++a) {
for (std::size_t b = 0; b < dimsOut[1]; ++b) {
const std::size_t idx0_0 =
strides0[0] * ((dims0[0] > 1) ? a : 0) +
strides0[1] * ((dims0[1] > 1) ? b : 0);
const std::size_t idx1_0 =
strides1[0] * ((dims1[0] > 1) ? a : 0) +
strides1[1] * ((dims1[1] > 1) ? b : 0);
for (std::size_t c = 0; c < dimsOut[2]; ++c) {
const std::size_t idx_out =
dimsOut[3] *
(c + dimsOut[2] * (b + dimsOut[1] * a));
for (std::size_t d = 0; d < dimsOut[3]; ++d) {
std::size_t idx0 =
idx0_0 +
strides0[2] * ((dims0[2] > 1) ? c : 0) +
((dims0[3] > 1) ? d : 0);
std::size_t idx1 =
idx1_0 +
strides1[2] * ((dims1[2] > 1) ? c : 0) +
((dims1[3] > 1) ? d : 0);
result[idx_out + d] =
array0[idx0] * array1[idx1];
// std::cout << "(" << idx0 << ", " << idx1 <<
// ") -> " << array0[idx0] << " * " <<
// array1[idx1] << " -> " << idx_out + d <<
// std::endl;
}
}
}
}
// conversion to Aidge::Tensors
// input0
T0->resize(dims0);
T0->getImpl()->setRawPtr(
array0,
dims0[0] * dims0[1] * dims0[2] * dims0[3]);
// input1
T1->resize(dims1);
T1->getImpl()->setRawPtr(
array1,
dims1[0] * dims1[1] * dims1[2] * dims1[3]);
// results
Tres->resize(dimsOut);
Tres->getImpl()->setRawPtr(
result,
dimsOut[0] * dimsOut[1] * dimsOut[2] * dimsOut[3]);
// compute result
op->forwardDims();
start = std::chrono::system_clock::now();
op->forward();
end = std::chrono::system_clock::now();
duration +=
std::chrono::duration_cast<std::chrono::microseconds>(
end - start);
// comparison between truth and computed result
REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
delete[] array0;
delete[] array1;
delete[] result;
const std::size_t nb_elements =
std::accumulate(dimsOut.cbegin(),
dimsOut.cend(),
std::size_t(1),
std::multiplies<std::size_t>());
number_of_operation += nb_elements;
}
Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
Log::info("total time: {}μs\n", duration.count());
}
SECTION("+1-D Tensor / 1-D Tensor") {
std::size_t number_of_operation = 0;
std::uniform_int_distribution<std::size_t> nbRemovedDimsDist(
std::size_t(1),
std::size_t(3));
for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
// generate 2 random Tensors
// handle dimensions
constexpr std::size_t nbDims = 4;
std::vector<std::size_t> dims0(4);
for (std::size_t i = 0; i < nbDims; ++i) {
dims0[i] = dimSizeDist(gen);
}
std::vector<std::size_t> dimsOut = dims0;
std::vector<std::size_t> dims1 = dims0;
for (std::size_t i = 0; i < nbDims; ++i) {
if (boolDist(gen)) {
dims1[i] = 1;
}
}
dims1.erase(dims1.cbegin(),
dims1.cbegin() + nbRemovedDimsDist(gen));
// create arrays and fill them with random values
float *array0 =
new float[dims0[0] * dims0[1] * dims0[2] * dims0[3]];
std::size_t array1_size =
std::accumulate(dims1.cbegin(),
dims1.cend(),
std::size_t(1),
std::multiplies<std::size_t>());
float *array1 = new float[array1_size];
float *result = new float[dimsOut[0] * dimsOut[1] *
dimsOut[2] * dimsOut[3]];
for (std::size_t i = 0;
i < (dims0[0] * dims0[1] * dims0[2] * dims0[3]);
++i) {
array0[i] = valueDist(gen);
}
for (std::size_t i = 0; i < array1_size; ++i) {
array1[i] = valueDist(gen);
}
// compute true result
auto dims1_tmp = dims1;
dims1_tmp.insert(dims1_tmp.cbegin(),
4 - dims1_tmp.size(),
std::size_t(1));
const std::size_t strides0[nbDims] = {
dims0[1] * dims0[2] * dims0[3],
dims0[2] * dims0[3],
dims0[3],
1};
const std::size_t strides1[nbDims] = {
dims1_tmp[1] * dims1_tmp[2] * dims1_tmp[3],
dims1_tmp[2] * dims1_tmp[3],
dims1_tmp[3],
1};
for (std::size_t a = 0; a < dimsOut[0]; ++a) {
for (std::size_t b = 0; b < dimsOut[1]; ++b) {
const std::size_t idx0_0 =
strides0[0] * ((dims0[0] > 1) ? a : 0) +
strides0[1] * ((dims0[1] > 1) ? b : 0);
const std::size_t idx1_0 =
strides1[0] * ((dims1_tmp[0] > 1) ? a : 0) +
strides1[1] * ((dims1_tmp[1] > 1) ? b : 0);
for (std::size_t c = 0; c < dimsOut[2]; ++c) {
const std::size_t idx_out =
dimsOut[3] *
(c + dimsOut[2] * (b + dimsOut[1] * a));
for (std::size_t d = 0; d < dimsOut[3]; ++d) {
std::size_t idx0 =
idx0_0 +
strides0[2] * ((dims0[2] > 1) ? c : 0) +
((dims0[3] > 1) ? d : 0);
std::size_t idx1 =
idx1_0 +
strides1[2] *
((dims1_tmp[2] > 1) ? c : 0) +
((dims1_tmp[3] > 1) ? d : 0);
result[idx_out + d] =
array0[idx0] * array1[idx1];
// std::cout << "(" << idx0 << ", " << idx1 <<
// ") -> " << array0[idx0] << " * " <<
// array1[idx1] << " -> " << idx_out + d <<
// std::endl;
}
}
}
}
// conversion to Aidge::Tensors
// input0
T0->resize(dims0);
T0->getImpl()->setRawPtr(
array0,
dims0[0] * dims0[1] * dims0[2] * dims0[3]);
// input1
T1->resize(dims1);
T1->getImpl()->setRawPtr(array1, array1_size);
// results
Tres->resize(dimsOut);
Tres->getImpl()->setRawPtr(
result,
dimsOut[0] * dimsOut[1] * dimsOut[2] * dimsOut[3]);
// compute result
op->forwardDims();
start = std::chrono::system_clock::now();
op->forward();
end = std::chrono::system_clock::now();
duration +=
std::chrono::duration_cast<std::chrono::microseconds>(
end - start);
// comparison between truth and computed result
REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
delete[] array0;
delete[] array1;
delete[] result;
const std::size_t nb_elements =
std::accumulate(dimsOut.cbegin(),
dimsOut.cend(),
std::size_t(1),
std::multiplies<std::size_t>());
number_of_operation += nb_elements;
}
Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
Log::info("total time: {}μs\n", duration.count());
}
}
}
} // namespace Aidge
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <memory>
#include <catch2/catch_test_macros.hpp>
#include "aidge/backend/cpu/data/TensorImpl.hpp"
#include "aidge/backend/cpu/operator/PadImpl.hpp"
#include "aidge/data/DataType.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/graph/Node.hpp"
#include "aidge/operator/Pad.hpp"
using namespace Aidge;
TEST_CASE("[cpu/operator] Pad(forward)", "[Pad][CPU]") {
    SECTION("Symmetric Pad") {
        // Constant padding of 1 on all four borders: each 5x5 feature map
        // becomes 7x7 and every new border cell holds the pad value `pv`.
        const int pv = 0; // pad value
        std::shared_ptr<Node> myPad = Pad<2>({1, 1, 1, 1}, "mypad", PadBorderType::Constant, static_cast<double>(pv));
        auto op = std::static_pointer_cast<OperatorTensor>(myPad -> getOperator());
        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<int,2,3,5,5> { //NCHW
            {
                {
                    {{  0,   1,   2,   3,   4},
                     {  5,   6,   7,   8,   9},
                     { 10,  11,  12,  13,  14},
                     { 15,  16,  17,  18,  19},
                     { 20,  21,  22,  23,  24}},
                    {{ 25,  26,  27,  28,  29},
                     { 30,  31,  32,  33,  34},
                     { 35,  36,  37,  38,  39},
                     { 40,  41,  42,  43,  44},
                     { 45,  46,  47,  48,  49}},
                    {{ 50,  51,  52,  53,  54},
                     { 55,  56,  57,  58,  59},
                     { 60,  61,  62,  63,  64},
                     { 65,  66,  67,  68,  69},
                     { 70,  71,  72,  73,  74}}
                },
                {
                    {{ 75,  76,  77,  78,  79},
                     { 80,  81,  82,  83,  84},
                     { 85,  86,  87,  88,  89},
                     { 90,  91,  92,  93,  94},
                     { 95,  96,  97,  98,  99}},
                    {{100, 101, 102, 103, 104},
                     {105, 106, 107, 108, 109},
                     {110, 111, 112, 113, 114},
                     {115, 116, 117, 118, 119},
                     {120, 121, 122, 123, 124}},
                    {{125, 126, 127, 128, 129},
                     {130, 131, 132, 133, 134},
                     {135, 136, 137, 138, 139},
                     {140, 141, 142, 143, 144},
                     {145, 146, 147, 148, 149}}
                }
            }
        });
        // Expected: original map surrounded by a one-cell ring of `pv`.
        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<int,2,3,7,7> { //NCHW
            {
                {
                    {{ pv, pv, pv, pv, pv, pv, pv},
                     { pv,  0,  1,  2,  3,  4, pv},
                     { pv,  5,  6,  7,  8,  9, pv},
                     { pv, 10, 11, 12, 13, 14, pv},
                     { pv, 15, 16, 17, 18, 19, pv},
                     { pv, 20, 21, 22, 23, 24, pv},
                     { pv, pv, pv, pv, pv, pv, pv}},
                    {{ pv, pv, pv, pv, pv, pv, pv},
                     { pv, 25, 26, 27, 28, 29, pv},
                     { pv, 30, 31, 32, 33, 34, pv},
                     { pv, 35, 36, 37, 38, 39, pv},
                     { pv, 40, 41, 42, 43, 44, pv},
                     { pv, 45, 46, 47, 48, 49, pv},
                     { pv, pv, pv, pv, pv, pv, pv}},
                    {{ pv, pv, pv, pv, pv, pv, pv},
                     { pv, 50, 51, 52, 53, 54, pv},
                     { pv, 55, 56, 57, 58, 59, pv},
                     { pv, 60, 61, 62, 63, 64, pv},
                     { pv, 65, 66, 67, 68, 69, pv},
                     { pv, 70, 71, 72, 73, 74, pv},
                     { pv, pv, pv, pv, pv, pv, pv}}
                },
                {
                    {{ pv, pv, pv, pv, pv, pv, pv},
                     { pv, 75, 76, 77, 78, 79, pv},
                     { pv, 80, 81, 82, 83, 84, pv},
                     { pv, 85, 86, 87, 88, 89, pv},
                     { pv, 90, 91, 92, 93, 94, pv},
                     { pv, 95, 96, 97, 98, 99, pv},
                     { pv, pv, pv, pv, pv, pv, pv}},
                    {{ pv, pv, pv, pv, pv, pv, pv},
                     {pv, 100, 101, 102, 103, 104, pv},
                     {pv, 105, 106, 107, 108, 109, pv},
                     {pv, 110, 111, 112, 113, 114, pv},
                     {pv, 115, 116, 117, 118, 119, pv},
                     {pv, 120, 121, 122, 123, 124, pv},
                     { pv, pv, pv, pv, pv, pv, pv}},
                    {{ pv, pv, pv, pv, pv, pv, pv},
                     {pv, 125, 126, 127, 128, 129, pv},
                     {pv, 130, 131, 132, 133, 134, pv},
                     {pv, 135, 136, 137, 138, 139, pv},
                     {pv, 140, 141, 142, 143, 144, pv},
                     {pv, 145, 146, 147, 148, 149, pv},
                     { pv, pv, pv, pv, pv, pv, pv}}
                }
            }
        });
        myPad->getOperator()->associateInput(0,myInput);
        myPad->getOperator()->setDataType(DataType::Int32);
        myPad->getOperator()->setBackend("cpu");
        myPad->forward();
        // myPad->getOperator()->getOutput(0)->print();
        REQUIRE(*(op->getOutput(0)) == *myOutput);
    }
    SECTION("Asymmetric Pad") {
        // Constant padding on only two borders ({1, 0, 0, 1}): the expected
        // output shows one `pv` row added on top and one `pv` column on the
        // right, so each 5x5 map becomes 6x6.
        const int pv = 0; // pad value
        std::shared_ptr<Node> myPad = Pad<2>({1, 0, 0, 1}, "mypad", PadBorderType::Constant, static_cast<double>(pv));
        auto op = std::static_pointer_cast<OperatorTensor>(myPad -> getOperator());
        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<int,2,3,5,5> { //NCHW
            {
                {
                    {{  0,   1,   2,   3,   4},
                     {  5,   6,   7,   8,   9},
                     { 10,  11,  12,  13,  14},
                     { 15,  16,  17,  18,  19},
                     { 20,  21,  22,  23,  24}},
                    {{ 25,  26,  27,  28,  29},
                     { 30,  31,  32,  33,  34},
                     { 35,  36,  37,  38,  39},
                     { 40,  41,  42,  43,  44},
                     { 45,  46,  47,  48,  49}},
                    {{ 50,  51,  52,  53,  54},
                     { 55,  56,  57,  58,  59},
                     { 60,  61,  62,  63,  64},
                     { 65,  66,  67,  68,  69},
                     { 70,  71,  72,  73,  74}}
                },
                {
                    {{ 75,  76,  77,  78,  79},
                     { 80,  81,  82,  83,  84},
                     { 85,  86,  87,  88,  89},
                     { 90,  91,  92,  93,  94},
                     { 95,  96,  97,  98,  99}},
                    {{100, 101, 102, 103, 104},
                     {105, 106, 107, 108, 109},
                     {110, 111, 112, 113, 114},
                     {115, 116, 117, 118, 119},
                     {120, 121, 122, 123, 124}},
                    {{125, 126, 127, 128, 129},
                     {130, 131, 132, 133, 134},
                     {135, 136, 137, 138, 139},
                     {140, 141, 142, 143, 144},
                     {145, 146, 147, 148, 149}}
                }
            }
        });
        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<int,2,3,6,6> { //NCHW
            {
                {
                    {{ pv, pv, pv, pv, pv, pv},
                     {  0,  1,  2,  3,  4, pv},
                     {  5,  6,  7,  8,  9, pv},
                     { 10, 11, 12, 13, 14, pv},
                     { 15, 16, 17, 18, 19, pv},
                     { 20, 21, 22, 23, 24, pv}},
                    {{ pv, pv, pv, pv, pv, pv},
                     { 25, 26, 27, 28, 29, pv},
                     { 30, 31, 32, 33, 34, pv},
                     { 35, 36, 37, 38, 39, pv},
                     { 40, 41, 42, 43, 44, pv},
                     { 45, 46, 47, 48, 49, pv}},
                    {{ pv, pv, pv, pv, pv, pv},
                     { 50, 51, 52, 53, 54, pv},
                     { 55, 56, 57, 58, 59, pv},
                     { 60, 61, 62, 63, 64, pv},
                     { 65, 66, 67, 68, 69, pv},
                     { 70, 71, 72, 73, 74, pv}}
                },
                {
                    {{ pv, pv, pv, pv, pv, pv},
                     { 75, 76, 77, 78, 79, pv},
                     { 80, 81, 82, 83, 84, pv},
                     { 85, 86, 87, 88, 89, pv},
                     { 90, 91, 92, 93, 94, pv},
                     { 95, 96, 97, 98, 99, pv}},
                    {{ pv, pv, pv, pv, pv, pv},
                     { 100, 101, 102, 103, 104, pv},
                     { 105, 106, 107, 108, 109, pv},
                     { 110, 111, 112, 113, 114, pv},
                     { 115, 116, 117, 118, 119, pv},
                     { 120, 121, 122, 123, 124, pv}},
                    {{ pv, pv, pv, pv, pv, pv},
                     { 125, 126, 127, 128, 129, pv},
                     { 130, 131, 132, 133, 134, pv},
                     { 135, 136, 137, 138, 139, pv},
                     { 140, 141, 142, 143, 144, pv},
                     { 145, 146, 147, 148, 149, pv}}
                }
            }
        });
        myPad->getOperator()->associateInput(0,myInput);
        myPad->getOperator()->setDataType(DataType::Int32);
        myPad->getOperator()->setBackend("cpu");
        myPad->forward();
        // myPad->getOperator()->getOutput(0)->print();
        REQUIRE(*(op->getOutput(0)) == *myOutput);
    }
    SECTION("Pad Edge") {
        // Edge (replicate) padding of 1 on all borders: the nearest input
        // row/column is repeated into the padded ring (5x5 -> 7x7).
        std::shared_ptr<Node> myPad = Pad<2>({1, 1, 1, 1}, "mypad", PadBorderType::Edge);
        auto op = std::static_pointer_cast<OperatorTensor>(myPad -> getOperator());
        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<int,2,3,5,5> { //NCHW
            {
                {
                    {{  0,   1,   2,   3,   4},
                     {  5,   6,   7,   8,   9},
                     { 10,  11,  12,  13,  14},
                     { 15,  16,  17,  18,  19},
                     { 20,  21,  22,  23,  24}},
                    {{ 25,  26,  27,  28,  29},
                     { 30,  31,  32,  33,  34},
                     { 35,  36,  37,  38,  39},
                     { 40,  41,  42,  43,  44},
                     { 45,  46,  47,  48,  49}},
                    {{ 50,  51,  52,  53,  54},
                     { 55,  56,  57,  58,  59},
                     { 60,  61,  62,  63,  64},
                     { 65,  66,  67,  68,  69},
                     { 70,  71,  72,  73,  74}}
                },
                {
                    {{ 75,  76,  77,  78,  79},
                     { 80,  81,  82,  83,  84},
                     { 85,  86,  87,  88,  89},
                     { 90,  91,  92,  93,  94},
                     { 95,  96,  97,  98,  99}},
                    {{100, 101, 102, 103, 104},
                     {105, 106, 107, 108, 109},
                     {110, 111, 112, 113, 114},
                     {115, 116, 117, 118, 119},
                     {120, 121, 122, 123, 124}},
                    {{125, 126, 127, 128, 129},
                     {130, 131, 132, 133, 134},
                     {135, 136, 137, 138, 139},
                     {140, 141, 142, 143, 144},
                     {145, 146, 147, 148, 149}}
                }
            }
        });
        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<int,2,3,7,7> { //NCHW
            {
                {
                    {{ 0,  0,  1,  2,  3,  4,  4},
                     { 0,  0,  1,  2,  3,  4,  4},
                     { 5,  5,  6,  7,  8,  9,  9},
                     { 10, 10, 11, 12, 13, 14, 14},
                     { 15, 15, 16, 17, 18, 19, 19},
                     { 20, 20, 21, 22, 23, 24, 24},
                     { 20, 20, 21, 22, 23, 24, 24}},
                    {{ 25, 25, 26, 27, 28, 29, 29},
                     { 25, 25, 26, 27, 28, 29, 29},
                     { 30, 30, 31, 32, 33, 34, 34},
                     { 35, 35, 36, 37, 38, 39, 39},
                     { 40, 40, 41, 42, 43, 44, 44},
                     { 45, 45, 46, 47, 48, 49, 49},
                     { 45, 45, 46, 47, 48, 49, 49}},
                    {{ 50, 50, 51, 52, 53, 54, 54},
                     { 50, 50, 51, 52, 53, 54, 54},
                     { 55, 55, 56, 57, 58, 59, 59},
                     { 60, 60, 61, 62, 63, 64, 64},
                     { 65, 65, 66, 67, 68, 69, 69},
                     { 70, 70, 71, 72, 73, 74, 74},
                     { 70, 70, 71, 72, 73, 74, 74}}
                },
                {
                    {{ 75, 75, 76, 77, 78, 79, 79},
                     { 75, 75, 76, 77, 78, 79, 79},
                     { 80, 80, 81, 82, 83, 84, 84},
                     { 85, 85, 86, 87, 88, 89, 89},
                     { 90, 90, 91, 92, 93, 94, 94},
                     { 95, 95, 96, 97, 98, 99, 99},
                     { 95, 95, 96, 97, 98, 99, 99}},
                    {{100, 100, 101, 102, 103, 104, 104},
                     {100, 100, 101, 102, 103, 104, 104},
                     {105, 105, 106, 107, 108, 109, 109},
                     {110, 110, 111, 112, 113, 114, 114},
                     {115, 115, 116, 117, 118, 119, 119},
                     {120, 120, 121, 122, 123, 124, 124},
                     {120, 120, 121, 122, 123, 124, 124}},
                    {{125, 125, 126, 127, 128, 129, 129},
                     {125, 125, 126, 127, 128, 129, 129},
                     {130, 130, 131, 132, 133, 134, 134},
                     {135, 135, 136, 137, 138, 139, 139},
                     {140, 140, 141, 142, 143, 144, 144},
                     {145, 145, 146, 147, 148, 149, 149},
                     {145, 145, 146, 147, 148, 149, 149}}
                }
            }
        });
        myPad->getOperator()->associateInput(0,myInput);
        myPad->getOperator()->setDataType(DataType::Int32);
        myPad->getOperator()->setBackend("cpu");
        myPad->forward();
        // myPad->getOperator()->getOutput(0)->print();
        REQUIRE(*(op->getOutput(0)) == *myOutput);
    }
SECTION("Pad Reflect") {
std::shared_ptr<Node> myPad = Pad<2>({1, 1, 1, 1}, "mypad", PadBorderType::Reflect);
auto op = std::static_pointer_cast<OperatorTensor>(myPad -> getOperator());
std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<int,2,3,5,5> { //NCHW
{
{
{{ 0, 1, 2, 3, 4},
{ 5, 6, 7, 8, 9},
{ 10, 11, 12, 13, 14},
{ 15, 16, 17, 18, 19},
{ 20, 21, 22, 23, 24}},
{{ 25, 26, 27, 28, 29},
{ 30, 31, 32, 33, 34},
{ 35, 36, 37, 38, 39},
{ 40, 41, 42, 43, 44},
{ 45, 46, 47, 48, 49}},
{{ 50, 51, 52, 53, 54},
{ 55, 56, 57, 58, 59},
{ 60, 61, 62, 63, 64},
{ 65, 66, 67, 68, 69},
{ 70, 71, 72, 73, 74}}
},
{
{{ 75, 76, 77, 78, 79},
{ 80, 81, 82, 83, 84},
{ 85, 86, 87, 88, 89},
{ 90, 91, 92, 93, 94},
{ 95, 96, 97, 98, 99}},
{{100, 101, 102, 103, 104},
{105, 106, 107, 108, 109},
{110, 111, 112, 113, 114},
{115, 116, 117, 118, 119},
{120, 121, 122, 123, 124}},
{{125, 126, 127, 128, 129},
{130, 131, 132, 133, 134},
{135, 136, 137, 138, 139},
{140, 141, 142, 143, 144},
{145, 146, 147, 148, 149}}
}
}
});
std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<int,2,3,7,7> { //NCHW
{
{
{
{ 6, 5, 6, 7, 8, 9, 5},
{ 1, 0, 1, 2, 3, 4, 0},
{ 6, 5, 6, 7, 8, 9, 5},
{ 11, 10, 11, 12, 13, 14, 10},
{ 16, 15, 16, 17, 18, 19, 15},
{ 21, 20, 21, 22, 23, 24, 20},
{ 1, 0, 1, 2, 3, 4, 0}
},
{
{ 31, 30, 31, 32, 33, 34, 30},
{ 26, 25, 26, 27, 28, 29, 25},
{ 31, 30, 31, 32, 33, 34, 30},
{ 36, 35, 36, 37, 38, 39, 35},
{ 41, 40, 41, 42, 43, 44, 40},
{ 46, 45, 46, 47, 48, 49, 45},
{ 26, 25, 26, 27, 28, 29, 25}
},
{
{ 56, 55, 56, 57, 58, 59, 55},
{ 51, 50, 51, 52, 53, 54, 50},
{ 56, 55, 56, 57, 58, 59, 55},
{ 61, 60, 61, 62, 63, 64, 60},
{ 66, 65, 66, 67, 68, 69, 65},
{ 71, 70, 71, 72, 73, 74, 70},
{ 51, 50, 51, 52, 53, 54, 50}
}
},
{
{
{ 81, 80, 81, 82, 83, 84, 80},
{ 76, 75, 76, 77, 78, 79, 75},
{ 81, 80, 81, 82, 83, 84, 80},
{ 86, 85, 86, 87, 88, 89, 85},
{ 91, 90, 91, 92, 93, 94, 90},
{ 96, 95, 96, 97, 98, 99, 95},
{ 76, 75, 76, 77, 78, 79, 75}
},
{
{ 106, 105, 106, 107, 108, 109, 105},
{ 101, 100, 101, 102, 103, 104, 100},
{ 106, 105, 106, 107, 108, 109, 105},
{ 111, 110, 111, 112, 113, 114, 110},
{ 116, 115, 116, 117, 118, 119, 115},
{ 121, 120, 121, 122, 123, 124, 120},
{ 101, 100, 101, 102, 103, 104, 100}
},
{
{ 131, 130, 131, 132, 133, 134, 130},
{ 126, 125, 126, 127, 128, 129, 125},
{ 131, 130, 131, 132, 133, 134, 130},
{ 136, 135, 136, 137, 138, 139, 135},
{ 141, 140, 141, 142, 143, 144, 140},
{ 146, 145, 146, 147, 148, 149, 145},
{ 126, 125, 126, 127, 128, 129, 125}
}
}
}
});
myPad->getOperator()->associateInput(0,myInput);
myPad->getOperator()->setDataType(DataType::Int32);
myPad->getOperator()->setBackend("cpu");
myPad->forward();
op->getOutput(0)->print();
REQUIRE(*(op->getOutput(0)) == *myOutput);
}
    SECTION("Pad Wrap") {
        // Wrap (circular) padding of 1 on all borders: padded rows/columns
        // are taken from the opposite edge of the input (5x5 -> 7x7).
        std::shared_ptr<Node> myPad = Pad<2>({1, 1, 1, 1}, "mypad", PadBorderType::Wrap);
        auto op = std::static_pointer_cast<OperatorTensor>(myPad -> getOperator());
        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<int,2,3,5,5> { //NCHW
            {
                {
                    {{  0,   1,   2,   3,   4},
                     {  5,   6,   7,   8,   9},
                     { 10,  11,  12,  13,  14},
                     { 15,  16,  17,  18,  19},
                     { 20,  21,  22,  23,  24}},
                    {{ 25,  26,  27,  28,  29},
                     { 30,  31,  32,  33,  34},
                     { 35,  36,  37,  38,  39},
                     { 40,  41,  42,  43,  44},
                     { 45,  46,  47,  48,  49}},
                    {{ 50,  51,  52,  53,  54},
                     { 55,  56,  57,  58,  59},
                     { 60,  61,  62,  63,  64},
                     { 65,  66,  67,  68,  69},
                     { 70,  71,  72,  73,  74}}
                },
                {
                    {{ 75,  76,  77,  78,  79},
                     { 80,  81,  82,  83,  84},
                     { 85,  86,  87,  88,  89},
                     { 90,  91,  92,  93,  94},
                     { 95,  96,  97,  98,  99}},
                    {{100, 101, 102, 103, 104},
                     {105, 106, 107, 108, 109},
                     {110, 111, 112, 113, 114},
                     {115, 116, 117, 118, 119},
                     {120, 121, 122, 123, 124}},
                    {{125, 126, 127, 128, 129},
                     {130, 131, 132, 133, 134},
                     {135, 136, 137, 138, 139},
                     {140, 141, 142, 143, 144},
                     {145, 146, 147, 148, 149}}
                }
            }
        });
        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<int,2,3,7,7> { //NCHW
            {
                {
                    {{ 24, 20, 21, 22, 23, 24, 20},
                     {  4,  0,  1,  2,  3,  4,  0},
                     {  9,  5,  6,  7,  8,  9,  5},
                     { 14, 10, 11, 12, 13, 14, 10},
                     { 19, 15, 16, 17, 18, 19, 15},
                     { 24, 20, 21, 22, 23, 24, 20},
                     {  4,  0,  1,  2,  3,  4,  0}},
                    {{ 49, 45, 46, 47, 48, 49, 45},
                     { 29, 25, 26, 27, 28, 29, 25},
                     { 34, 30, 31, 32, 33, 34, 30},
                     { 39, 35, 36, 37, 38, 39, 35},
                     { 44, 40, 41, 42, 43, 44, 40},
                     { 49, 45, 46, 47, 48, 49, 45},
                     { 29, 25, 26, 27, 28, 29, 25}},
                    {{ 74, 70, 71, 72, 73, 74, 70},
                     { 54, 50, 51, 52, 53, 54, 50},
                     { 59, 55, 56, 57, 58, 59, 55},
                     { 64, 60, 61, 62, 63, 64, 60},
                     { 69, 65, 66, 67, 68, 69, 65},
                     { 74, 70, 71, 72, 73, 74, 70},
                     { 54, 50, 51, 52, 53, 54, 50}}
                },
                {
                    {{ 99, 95, 96, 97, 98, 99, 95},
                     { 79, 75, 76, 77, 78, 79, 75},
                     { 84, 80, 81, 82, 83, 84, 80},
                     { 89, 85, 86, 87, 88, 89, 85},
                     { 94, 90, 91, 92, 93, 94, 90},
                     { 99, 95, 96, 97, 98, 99, 95},
                     { 79, 75, 76, 77, 78, 79, 75}},
                    {{124, 120, 121, 122, 123, 124, 120},
                     {104, 100, 101, 102, 103, 104, 100},
                     {109, 105, 106, 107, 108, 109, 105},
                     {114, 110, 111, 112, 113, 114, 110},
                     {119, 115, 116, 117, 118, 119, 115},
                     {124, 120, 121, 122, 123, 124, 120},
                     {104, 100, 101, 102, 103, 104, 100}},
                    {{149, 145, 146, 147, 148, 149, 145},
                     {129, 125, 126, 127, 128, 129, 125},
                     {134, 130, 131, 132, 133, 134, 130},
                     {139, 135, 136, 137, 138, 139, 135},
                     {144, 140, 141, 142, 143, 144, 140},
                     {149, 145, 146, 147, 148, 149, 145},
                     {129, 125, 126, 127, 128, 129, 125}}
                }
            }
        });
        myPad->getOperator()->associateInput(0,myInput);
        myPad->getOperator()->setDataType(DataType::Int32);
        myPad->getOperator()->setBackend("cpu");
        myPad->forward();
        // myPad->getOperator()->getOutput(0)->print();
        REQUIRE(*(op->getOutput(0)) == *myOutput);
    }
}
\ No newline at end of file
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <memory>
#include <catch2/catch_test_macros.hpp>
#include "aidge/backend/cpu/data/TensorImpl.hpp"
#include "aidge/backend/cpu/operator/PaddedConvImpl.hpp"
#include "aidge/data/DataType.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/graph/Node.hpp"
#include "aidge/operator/MetaOperatorDefs.hpp"
using namespace Aidge;
// Functional tests for the PaddedConv meta-operator (Pad fused with Conv) on
// the CPU backend. Both sections use Int32 tensors and hand-computed expected
// outputs; the meta-operator is executed through a SequentialScheduler on its
// connected graph (weights and bias come from Producer nodes).
TEST_CASE("[cpu/operator] PaddedConv(forward)", "[PaddedConv][CPU]") {
    // Default padding (none): a 3x3 kernel over a 5x5 input yields a 3x3 output.
    SECTION("Classic Conv") {
        std::shared_ptr<Node> myConv = PaddedConv(3,4,{3,3}, "myconv");
        auto op = std::static_pointer_cast<OperatorTensor>(myConv -> getOperator());
        // Weights: 4 output channels x 3 input channels x 3x3 kernel,
        // filled with the sequence 0..107.
        std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array4D<int,4,3,3,3> {
            {
                {
                    {{  0,   1,   2},
                     {  3,   4,   5},
                     {  6,   7,   8}},
                    {{  9,  10,  11},
                     { 12,  13,  14},
                     { 15,  16,  17}},
                    {{ 18,  19,  20},
                     { 21,  22,  23},
                     { 24,  25,  26}}
                },
                {
                    {{ 27,  28,  29},
                     { 30,  31,  32},
                     { 33,  34,  35}},
                    {{ 36,  37,  38},
                     { 39,  40,  41},
                     { 42,  43,  44}},
                    {{ 45,  46,  47},
                     { 48,  49,  50},
                     { 51,  52,  53}}
                },
                {
                    {{ 54,  55,  56},
                     { 57,  58,  59},
                     { 60,  61,  62}},
                    {{ 63,  64,  65},
                     { 66,  67,  68},
                     { 69,  70,  71}},
                    {{ 72,  73,  74},
                     { 75,  76,  77},
                     { 78,  79,  80}}
                },
                {
                    {{ 81,  82,  83},
                     { 84,  85,  86},
                     { 87,  88,  89}},
                    {{ 90,  91,  92},
                     { 93,  94,  95},
                     { 96,  97,  98}},
                    {{ 99, 100, 101},
                     {102, 103, 104},
                     {105, 106, 107}}
                }
            }
        });
        // One bias value per output channel.
        std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<int,4> {{7,0,9,0}});
        // Input: batch of 2, 3 channels, 5x5 spatial, filled 0..149.
        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<int,2,3,5,5> { //NCHW
            {
                {
                    {{  0,   1,   2,   3,   4},
                     {  5,   6,   7,   8,   9},
                     { 10,  11,  12,  13,  14},
                     { 15,  16,  17,  18,  19},
                     { 20,  21,  22,  23,  24}},
                    {{ 25,  26,  27,  28,  29},
                     { 30,  31,  32,  33,  34},
                     { 35,  36,  37,  38,  39},
                     { 40,  41,  42,  43,  44},
                     { 45,  46,  47,  48,  49}},
                    {{ 50,  51,  52,  53,  54},
                     { 55,  56,  57,  58,  59},
                     { 60,  61,  62,  63,  64},
                     { 65,  66,  67,  68,  69},
                     { 70,  71,  72,  73,  74}}
                },
                {
                    {{ 75,  76,  77,  78,  79},
                     { 80,  81,  82,  83,  84},
                     { 85,  86,  87,  88,  89},
                     { 90,  91,  92,  93,  94},
                     { 95,  96,  97,  98,  99}},
                    {{100, 101, 102, 103, 104},
                     {105, 106, 107, 108, 109},
                     {110, 111, 112, 113, 114},
                     {115, 116, 117, 118, 119},
                     {120, 121, 122, 123, 124}},
                    {{125, 126, 127, 128, 129},
                     {130, 131, 132, 133, 134},
                     {135, 136, 137, 138, 139},
                     {140, 141, 142, 143, 144},
                     {145, 146, 147, 148, 149}}
                }
            }
        });
        // Expected convolution result (bias included), pre-computed by hand.
        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<int,2,4,3,3> {
            {
                {
                    {{ 15226,  15577,  15928},
                     { 16981,  17332,  17683},
                     { 18736,  19087,  19438}},
                    {{ 37818,  38898,  39978},
                     { 43218,  44298,  45378},
                     { 48618,  49698,  50778}},
                    {{ 60426,  62235,  64044},
                     { 69471,  71280,  73089},
                     { 78516,  80325,  82134}},
                    {{ 83016,  85554,  88092},
                     { 95706,  98244, 100782},
                     {108396, 110934, 113472}}
                },
                {
                    {{ 41551,  41902,  42253},
                     { 43306,  43657,  44008},
                     { 45061,  45412,  45763}},
                    {{118818, 119898, 120978},
                     {124218, 125298, 126378},
                     {129618, 130698, 131778}},
                    {{196101, 197910, 199719},
                     {205146, 206955, 208764},
                     {214191, 216000, 217809}},
                    {{273366, 275904, 278442},
                     {286056, 288594, 291132},
                     {298746, 301284, 303822}}
                }
            }
        });
        myConv->getOperator()->associateInput(0,myInput);
        // Weights and bias are fed through the producer nodes attached to
        // inputs 1 and 2 of the meta-operator.
        myConv->input(1).first->getOperator()->setOutput(0, myWeights);
        myConv->input(2).first->getOperator()->setOutput(0, myBias);
        // Meta-operators are executed by scheduling their inner graph.
        auto g = getConnectedGraphView(myConv);
        g->setDataType(DataType::Int32);
        g->setBackend("cpu");
        auto scheduler = SequentialScheduler(g);
        scheduler.forward();
        REQUIRE(*(op->getOutput(0)) == *myOutput);
    }
    // Stride {1,1} with padding {1,1,1,1}: the 5x5 spatial size is preserved
    // in the output. Same weights/bias/input as the previous section.
    SECTION("test Padding") {
        std::shared_ptr<Node> myConv = PaddedConv(3,4,{3,3}, "myconv", {1,1}, {1,1,1,1});
        auto op = std::static_pointer_cast<OperatorTensor>(myConv -> getOperator());
        // Weights: 4 output channels x 3 input channels x 3x3 kernel, filled 0..107.
        std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array4D<int,4,3,3,3> {
            {
                {
                    {{  0,   1,   2},
                     {  3,   4,   5},
                     {  6,   7,   8}},
                    {{  9,  10,  11},
                     { 12,  13,  14},
                     { 15,  16,  17}},
                    {{ 18,  19,  20},
                     { 21,  22,  23},
                     { 24,  25,  26}}
                },
                {
                    {{ 27,  28,  29},
                     { 30,  31,  32},
                     { 33,  34,  35}},
                    {{ 36,  37,  38},
                     { 39,  40,  41},
                     { 42,  43,  44}},
                    {{ 45,  46,  47},
                     { 48,  49,  50},
                     { 51,  52,  53}}
                },
                {
                    {{ 54,  55,  56},
                     { 57,  58,  59},
                     { 60,  61,  62}},
                    {{ 63,  64,  65},
                     { 66,  67,  68},
                     { 69,  70,  71}},
                    {{ 72,  73,  74},
                     { 75,  76,  77},
                     { 78,  79,  80}}
                },
                {
                    {{ 81,  82,  83},
                     { 84,  85,  86},
                     { 87,  88,  89}},
                    {{ 90,  91,  92},
                     { 93,  94,  95},
                     { 96,  97,  98}},
                    {{ 99, 100, 101},
                     {102, 103, 104},
                     {105, 106, 107}}
                }
            }
        });
        std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<int,4> {{7,0,9,0}});
        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<int,2,3,5,5> { //NCHW
            {
                {
                    {{  0,   1,   2,   3,   4},
                     {  5,   6,   7,   8,   9},
                     { 10,  11,  12,  13,  14},
                     { 15,  16,  17,  18,  19},
                     { 20,  21,  22,  23,  24}},
                    {{ 25,  26,  27,  28,  29},
                     { 30,  31,  32,  33,  34},
                     { 35,  36,  37,  38,  39},
                     { 40,  41,  42,  43,  44},
                     { 45,  46,  47,  48,  49}},
                    {{ 50,  51,  52,  53,  54},
                     { 55,  56,  57,  58,  59},
                     { 60,  61,  62,  63,  64},
                     { 65,  66,  67,  68,  69},
                     { 70,  71,  72,  73,  74}}
                },
                {
                    {{ 75,  76,  77,  78,  79},
                     { 80,  81,  82,  83,  84},
                     { 85,  86,  87,  88,  89},
                     { 90,  91,  92,  93,  94},
                     { 95,  96,  97,  98,  99}},
                    {{100, 101, 102, 103, 104},
                     {105, 106, 107, 108, 109},
                     {110, 111, 112, 113, 114},
                     {115, 116, 117, 118, 119},
                     {120, 121, 122, 123, 124}},
                    {{125, 126, 127, 128, 129},
                     {130, 131, 132, 133, 134},
                     {135, 136, 137, 138, 139},
                     {140, 141, 142, 143, 144},
                     {145, 146, 147, 148, 149}}
                }
            }
        });
        // Expected result with zero-padding: interior values match the
        // "Classic Conv" section; border values are new.
        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<int,2,4,5,5> {
            {
                {
                    {{  6895,  10225,  10486,  10747,   7063},
                     { 10303,  15226,  15577,  15928,  10429},
                     { 11518,  16981,  17332,  17683,  11554},
                     { 12733,  18736,  19087,  19438,  12679},
                     {  8047,  11791,  11998,  12205,   7927}},
                    {{ 15960,  24069,  24816,  25563,  17100},
                     { 25119,  37818,  38898,  39978,  26703},
                     { 28764,  43218,  44298,  45378,  30258},
                     { 32409,  48618,  49698,  50778,  33813},
                     { 21972,  32925,  33618,  34311,  22824}},
                    {{ 25041,  37929,  39162,  40395,  27153},
                     { 39951,  60426,  62235,  64044,  42993},
                     { 46026,  69471,  71280,  73089,  48978},
                     { 52101,  78516,  80325,  82134,  54963},
                     { 35913,  54075,  55254,  56433,  37737}},
                    {{ 34104,  51771,  53490,  55209,  37188},
                     { 54765,  83016,  85554,  88092,  59265},
                     { 63270,  95706,  98244, 100782,  67680},
                     { 71775, 108396, 110934, 113472,  76095},
                     { 49836,  75207,  76872,  78537,  52632}}
                },
                {
                    {{ 20395,  29800,  30061,  30322,  19663},
                     { 28528,  41551,  41902,  42253,  27304},
                     { 29743,  43306,  43657,  44008,  28429},
                     { 30958,  45061,  45412,  45763,  29554},
                     { 18847,  27316,  27523,  27730,  17827}},
                    {{ 53760,  80094,  80841,  81588,  54000},
                     { 79794, 118818, 119898, 120978,  80028},
                     { 83439, 124218, 125298, 126378,  83583},
                     { 87084, 129618, 130698, 131778,  87138},
                     { 57072,  84900,  85593,  86286,  57024}},
                    {{ 87141, 130404, 131637, 132870,  88353},
                     {131076, 196101, 197910, 199719, 132768},
                     {137151, 205146, 206955, 208764, 138753},
                     {143226, 214191, 216000, 217809, 144738},
                     { 95313, 142500, 143679, 144858,  96237}},
                    {{120504, 180696, 182415, 184134, 122688},
                     {182340, 273366, 275904, 278442, 185490},
                     {190845, 286056, 288594, 291132, 193905},
                     {199350, 298746, 301284, 303822, 202320},
                     {133536, 200082, 201747, 203412, 135432}}
                }
            }
        });
        myConv->getOperator()->associateInput(0,myInput);
        // Feed weights/bias through the meta-operator's producer inputs.
        myConv->input(1).first->getOperator()->setOutput(0, myWeights);
        myConv->input(2).first->getOperator()->setOutput(0, myBias);
        auto g = getConnectedGraphView(myConv);
        g->setDataType(DataType::Int32);
        g->setBackend("cpu");
        auto scheduler = SequentialScheduler(g);
        scheduler.forward();
        REQUIRE(*(op->getOutput(0)) == *myOutput);
    }
}
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <chrono> // std::micro, std::chrono::time_point,
// std::chrono::system_clock, std::chrono::duration
#include <cstddef> // std::size_t
#include <cstdint> // std::uint16_t
#include <functional> // std::multiplies
#include <memory>
#include <numeric> // std::accumulate
#include <random> // std::random_device, std::mt19937
// std::uniform_int_distribution, std::uniform_real_distribution
#include <vector>
#include <catch2/catch_test_macros.hpp>
#include <fmt/core.h>
#include "aidge/backend/cpu/data/TensorImpl.hpp"
#include "aidge/backend/cpu/operator/PowImpl.hpp"
#include "aidge/data/Data.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/Pow.hpp"
#include "aidge/utils/ArrayHelpers.hpp"
#include "aidge/utils/TensorUtils.hpp"
namespace Aidge {
// Randomized tests for the Pow operator on the CPU backend.
// forward(): elementwise x^y on same-shape tensors, 4-D broadcasting, and
// broadcasting against a lower-rank second operand; results are checked
// against a reference computed with std::pow on raw arrays.
// backward(): gradients are checked against hand-computed values.
TEST_CASE("[cpu/operator] Pow", "[Pow][CPU]") {
    constexpr std::uint16_t NBTRIALS = 10;
    // Create a random number generator
    std::random_device rd;
    std::mt19937 gen(rd());
    // Base/exponent values kept in [0.1, 1.1] so pow stays well-defined.
    std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0 and 1
    std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
    std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
    std::uniform_int_distribution<int> boolDist(0,1);

    // Create MatPow Operator
    std::shared_ptr<Node> myPow = Pow();
    auto op = std::static_pointer_cast<OperatorTensor>(myPow-> getOperator());
    op->setDataType(DataType::Float32);
    op->setBackend("cpu");

    // Create 2 input Tensors
    std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>();
    op->associateInput(0,T0);
    T0->setDataType(DataType::Float32);
    T0->setBackend("cpu");
    std::shared_ptr<Tensor> T1 = std::make_shared<Tensor>();
    op -> associateInput(1,T1);
    T1->setDataType(DataType::Float32);
    T1->setBackend("cpu");

    // Create results Tensor
    std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>();
    Tres->setDataType(DataType::Float32);
    Tres->setBackend("cpu");

    // To measure execution time of 'MatPow_Op::forward()' member function call
    std::chrono::time_point<std::chrono::system_clock> start;
    std::chrono::time_point<std::chrono::system_clock> end;
    std::chrono::duration<double, std::micro> duration{};

    SECTION("PowImpl_cpu::forward()") {
        // Placeholder sections, intentionally left empty (not yet implemented).
        SECTION("Scalar / Scalar") {
        }
        SECTION("Scalar / +1-D Tensor") {
        }
        SECTION("+1-D Tensor / +1-D Tensor - same dimensions") {
            std::size_t number_of_operation = 0;
            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
                // generate 2 random Tensors of identical random shape (1 to 5 dims)
                const std::size_t nbDims = nbDimsDist(gen);
                std::vector<std::size_t> dims;
                for (std::size_t i = 0; i < nbDims; ++i) {
                    dims.push_back(dimSizeDist(gen));
                }
                const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
                number_of_operation += nb_elements;

                // without broadcasting: reference result computed elementwise
                float* array0 = new float[nb_elements];
                float* array1 = new float[nb_elements];
                float* result = new float[nb_elements];
                for (std::size_t i = 0; i < nb_elements; ++i) {
                    array0[i] = valueDist(gen);
                    array1[i] = valueDist(gen);
                    result[i] = std::pow(array0[i], array1[i]);
                }

                // input0
                T0->resize(dims);
                T0 -> getImpl() -> setRawPtr(array0, nb_elements);
                // input1
                T1->resize(dims);
                T1 -> getImpl() -> setRawPtr(array1, nb_elements);
                // results
                Tres->resize(dims);
                Tres -> getImpl() -> setRawPtr(result, nb_elements);

                op->forwardDims();
                start = std::chrono::system_clock::now();
                myPow->forward();
                end = std::chrono::system_clock::now();
                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);

                REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));

                delete[] array0;
                delete[] array1;
                delete[] result;

                // with broadcasting
            }
            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
            Log::info("total time: {} μs\n", duration.count());
        }

        SECTION("+1-D Tensor / +1-D Tensor - broadcasting") {
            std::size_t number_of_operation = 0;
            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
                // generate 2 random Tensors
                // handle dimensions, replace some dimensions with '1' to get broadcasting
                constexpr std::size_t nbDims = 4;
                std::vector<std::size_t> dims;
                for (std::size_t i = 0; i < nbDims; ++i) {
                    dims.push_back(dimSizeDist(gen));
                }
                std::vector<std::size_t> dims0 = dims;
                std::vector<std::size_t> dims1 = dims;
                std::vector<std::size_t> dimsOut = dims;
                // Randomly collapse axes of either operand to 1; the output
                // keeps the non-collapsed extent on each axis.
                for (std::size_t i = 0; i < nbDims; ++i) {
                    if (boolDist(gen)) {
                        dims0[i] = 1;
                    }
                    if (boolDist(gen)) {
                        dims1[i] = 1;
                    }
                    dimsOut[i] = (dims0[i] == 1) ? dims1[i] : dims0[i];
                }

                // create arrays and fill them with random values
                float* array0 = new float[dims0[0]*dims0[1]*dims0[2]*dims0[3]];
                float* array1 = new float[dims1[0]*dims1[1]*dims1[2]*dims1[3]];
                float* result = new float[dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]];

                for (std::size_t i = 0; i < dims0[0]*dims0[1]*dims0[2]*dims0[3]; ++i) {
                    array0[i] = valueDist(gen);
                }
                for (std::size_t i = 0; i < dims1[0]*dims1[1]*dims1[2]*dims1[3]; ++i) {
                    array1[i] = valueDist(gen);
                }

                // compute true result: broadcast axes (extent 1) keep index 0.
                const std::size_t strides0[nbDims] = {dims0[1]*dims0[2]*dims0[3], dims0[2]*dims0[3], dims0[3], 1};
                const std::size_t strides1[nbDims] = {dims1[1]*dims1[2]*dims1[3], dims1[2]*dims1[3], dims1[3], 1};
                for (std::size_t a = 0; a < dimsOut[0]; ++a) {
                    for (std::size_t b = 0; b < dimsOut[1]; ++b) {
                        const std::size_t idx0_0 = strides0[0] * ((dims0[0] > 1) ? a : 0)
                                                    + strides0[1] * ((dims0[1] > 1) ? b : 0);
                        const std::size_t idx1_0 = strides1[0] * ((dims1[0] > 1) ? a : 0)
                                                    + strides1[1] * ((dims1[1] > 1) ? b : 0);
                        for (std::size_t c = 0; c < dimsOut[2]; ++c) {
                            const std::size_t idx_out = dimsOut[3] * (c + dimsOut[2] * (b + dimsOut[1] * a));
                            for (std::size_t d = 0; d < dimsOut[3]; ++d) {
                                std::size_t idx0 = idx0_0
                                                    + strides0[2] * ((dims0[2] > 1) ? c : 0)
                                                    + ((dims0[3] > 1) ? d : 0);
                                std::size_t idx1 = idx1_0
                                                    + strides1[2] * ((dims1[2] > 1) ? c : 0)
                                                    + ((dims1[3] > 1) ? d : 0);
                                result[idx_out + d] = std::pow(array0[idx0], array1[idx1]);
                                // std::cout << "(" << idx0 << ", " << idx1 << ") -> " << array0[idx0] << " ** " << array1[idx1] << " -> " << idx_out + d << std::endl;
                            }
                        }
                    }
                }

                // conversion to Aidge::Tensors
                // input0
                T0->resize(dims0);
                T0 -> getImpl() -> setRawPtr(array0, dims0[0]*dims0[1]*dims0[2]*dims0[3]);
                // input1
                T1->resize(dims1);
                T1 -> getImpl() -> setRawPtr(array1, dims1[0]*dims1[1]*dims1[2]*dims1[3]);
                // results
                Tres->resize(dimsOut);
                Tres -> getImpl() -> setRawPtr(result, dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]);

                // compute result
                op->forwardDims();
                start = std::chrono::system_clock::now();
                myPow->forward();
                end = std::chrono::system_clock::now();
                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);

                // comparison between truth and computed result
                REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));

                delete[] array0;
                delete[] array1;
                delete[] result;

                const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
                number_of_operation += nb_elements;
            }
            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
            Log::info("total time: {} μs\n", duration.count());
        }

        SECTION("+1-D Tensor / 1-D Tensor") {
            std::size_t number_of_operation = 0;
            std::uniform_int_distribution<std::size_t> nbRemovedDimsDist(std::size_t(1), std::size_t(3));

            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
                // generate 2 random Tensors
                // handle dimensions
                constexpr std::size_t nbDims = 4;
                std::vector<std::size_t> dims0(4);
                for (std::size_t i = 0; i < nbDims; ++i) {
                    dims0[i] = dimSizeDist(gen);
                }
                std::vector<std::size_t> dimsOut = dims0;
                // Second operand: randomly collapse axes to 1, then drop the
                // first 1-3 axes to obtain a lower-rank tensor.
                std::vector<std::size_t> dims1 = dims0;
                for (std::size_t i = 0; i < nbDims; ++i) {
                    if (boolDist(gen)) {
                        dims1[i] = 1;
                    }
                }
                dims1.erase(dims1.cbegin(), dims1.cbegin() + nbRemovedDimsDist(gen));

                // create arrays and fill them with random values
                float* array0 = new float[dims0[0]*dims0[1]*dims0[2]*dims0[3]];
                std::size_t array1_size = std::accumulate(dims1.cbegin(), dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
                float* array1 = new float[array1_size];
                float* result = new float[dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]];

                for (std::size_t i = 0; i < (dims0[0]*dims0[1]*dims0[2]*dims0[3]); ++i) {
                    array0[i] = valueDist(gen);
                }
                for (std::size_t i = 0; i < array1_size; ++i) {
                    array1[i] = valueDist(gen);
                }

                // compute true result: left-pad dims1 with 1s back to rank 4,
                // then apply the same broadcast indexing as above.
                auto dims1_tmp = dims1;
                dims1_tmp.insert(dims1_tmp.cbegin(), 4 - dims1_tmp.size(), std::size_t(1));

                const std::size_t strides0[nbDims] = {dims0[1]*dims0[2]*dims0[3], dims0[2]*dims0[3], dims0[3], 1};
                const std::size_t strides1[nbDims] = {dims1_tmp[1]*dims1_tmp[2]*dims1_tmp[3], dims1_tmp[2]*dims1_tmp[3], dims1_tmp[3], 1};
                for (std::size_t a = 0; a < dimsOut[0]; ++a) {
                    for (std::size_t b = 0; b < dimsOut[1]; ++b) {
                        const std::size_t idx0_0 = strides0[0] * ((dims0[0] > 1) ? a : 0)
                                                    + strides0[1] * ((dims0[1] > 1) ? b : 0);
                        const std::size_t idx1_0 = strides1[0] * ((dims1_tmp[0] > 1) ? a : 0)
                                                    + strides1[1] * ((dims1_tmp[1] > 1) ? b : 0);
                        for (std::size_t c = 0; c < dimsOut[2]; ++c) {
                            const std::size_t idx_out = dimsOut[3] * (c + dimsOut[2] * (b + dimsOut[1] * a));
                            for (std::size_t d = 0; d < dimsOut[3]; ++d) {
                                std::size_t idx0 = idx0_0
                                                    + strides0[2] * ((dims0[2] > 1) ? c : 0)
                                                    + ((dims0[3] > 1) ? d : 0);
                                std::size_t idx1 = idx1_0
                                                    + strides1[2] * ((dims1_tmp[2] > 1) ? c : 0)
                                                    + ((dims1_tmp[3] > 1) ? d : 0);
                                result[idx_out + d] = std::pow(array0[idx0], array1[idx1]);
                                // std::cout << "(" << idx0 << ", " << idx1 << ") -> " << array0[idx0] << " ** " << array1[idx1] << " -> " << idx_out + d << std::endl;
                            }
                        }
                    }
                }

                // conversion to Aidge::Tensors
                // input0
                T0->resize(dims0);
                T0 -> getImpl() -> setRawPtr(array0, dims0[0]*dims0[1]*dims0[2]*dims0[3]);
                // input1
                T1->resize(dims1);
                T1 -> getImpl() -> setRawPtr(array1, array1_size);
                // results
                Tres->resize(dimsOut);
                Tres -> getImpl() -> setRawPtr(result, dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]);

                // compute result
                op->forwardDims();
                start = std::chrono::system_clock::now();
                myPow->forward();
                end = std::chrono::system_clock::now();
                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);

                // comparison between truth and computed result
                REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));

                delete[] array0;
                delete[] array1;
                delete[] result;

                const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
                number_of_operation += nb_elements;
            }
            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
            Log::info("total time: {} μs\n", duration.count());
        }
    }

    SECTION("PowImpl_cpu::backward()") {
        // Expected gradients follow the pow derivatives:
        //   d(a^b)/da = b * a^(b-1)   and   d(a^b)/db = a^b * ln(a),
        // each scaled by the incoming output gradient.
        SECTION("3D Tensors") {
            const auto input0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>(
                {
                    {
                        {
                            {2.0, 3.0},
                            {4.0, 5.0}
                        },
                        {
                            {6.0, 7.0},
                            {8.0, 9.0}
                        }
                    }
                }
            ));
            const auto input1 = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>(
                {
                    {
                        {
                            {1.0, 2.0},
                            {3.0, 2.0}
                        },
                        {
                            {2.0, 3.0},
                            {1.0, 0.5}
                        }
                    }
                }
            ));
            const auto gradOut = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>(
                {
                    {
                        {
                            {0.5, 1.0},
                            {1.5, 2.0}
                        },
                        {
                            {2.5, 3.0},
                            {3.5, 4.0}
                        }
                    }
                }
            ));
            const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>(
                {
                    {
                        {
                            {0.50000000, 6.00000000},
                            {72.00000000, 20.00000000}
                        },
                        {
                            {30.00000000, 441.00000000},
                            {3.50000000, 0.66666669}
                        }
                    }
                }
            ));
            const auto expectedGrad1 = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>(
                {
                    {
                        {
                            { 0.693147182, 9.88751030},
                            {1.33084259e+02, 8.04718933e+01}
                        },
                        {
                            {1.61258362e+02, 2.00234143e+03},
                            {5.82243652e+01, 2.63666954e+01}
                        }
                    }
                }
            ));
            for(const auto T: {input0, input1, gradOut, expectedGrad0, expectedGrad1})
            {
                T->setBackend("cpu") ;
                T->setDataType(DataType::Float32);
            }
            std::shared_ptr<Node> powOp = Pow();
            auto opr = std::static_pointer_cast<OperatorTensor>(powOp-> getOperator());
            opr->setDataType(DataType::Float32);
            opr->setBackend("cpu");
            opr->associateInput(0, input0);
            opr->associateInput(1, input1);
            opr->getOutput(0)->setGrad(gradOut);
            opr->forward();

            powOp->backward();
            REQUIRE(approxEq<float>(*(opr->getInput(0)->grad()), *expectedGrad0));
            REQUIRE(approxEq<float>(*(opr->getInput(1)->grad()), *expectedGrad1));
        }
        // Gradient of the 1-D exponent is accumulated (reduced) over the
        // broadcast axes of the 3-D base.
        SECTION("Broadcasting") {
            const auto input0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
                {
                    {
                        {
                            {1.0, 2.0, 3.0},
                            {4.0, 5.0, 6.0}
                        },
                        {
                            {1.5, 2.5, 3.5},
                            {4.5, 5.5, 6.5}
                        }
                    }
                }
            ));
            const auto input1 = std::make_shared<Tensor>(Array1D<float, 3>(
                {
                    {0.1, 0.2, 0.3}
                }
            ));

            const auto gradOut = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
                {
                    {
                        {
                            {1.0, 2.0, 3.0},
                            {4.0, 5.0, 6.0}
                        },
                        {
                            {6.0, 5.0, 4.0},
                            {3.0, 2.0, 1.0}
                        }
                    }
                }
            ));

            const Tensor expectedGrad0 = Array3D<float, 2, 2, 3>(
                {
                    {
                        {
                            {0.10000000, 0.22973967, 0.41711676},
                            {0.11486985, 0.27594593, 0.51353097}
                        },
                        {
                            {0.41655189, 0.48044977, 0.49926791},
                            {0.07748720, 0.10227509, 0.08092485}
                        }
                    }
                }
            );

            const Tensor expectedGrad1 = Array1D<float, 3>(
                {
                    {14.14779854, 22.99299049, 33.56402588}
                }
            );

            std::shared_ptr<Node> powOp = Pow();
            auto opr = std::static_pointer_cast<OperatorTensor>(powOp-> getOperator());
            opr->setDataType(DataType::Float32);
            opr->setBackend("cpu");
            opr->associateInput(0, input0);
            opr->associateInput(1, input1);
            opr->getOutput(0)->setGrad(gradOut);
            powOp->forward();

            powOp->backward();
            REQUIRE(approxEq<float>(*(opr->getInput(0)->grad()), expectedGrad0));
            REQUIRE(approxEq<float>(*(opr->getInput(1)->grad()), expectedGrad1));
        }
    }
}
} // namespace Aidge
......@@ -9,34 +9,34 @@
*
********************************************************************************/
#include <memory>
#include <catch2/catch_test_macros.hpp>
#include "aidge/backend/cpu/data/TensorImpl.hpp"
#include "aidge/backend/cpu/operator/ReLUImpl.hpp"
#include "aidge/data/DataType.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/ReLU.hpp"
#include "aidge/backend/cpu.hpp"
#include <memory>
using namespace Aidge;
TEST_CASE("[cpu/operator] ReLU(forward)") {
TEST_CASE("[cpu/operator] ReLU(forward)", "[ReLU][CPU]") {
SECTION("1D Tensor") {
std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(Array1D<int,10> {
{0, 1, 2,-3, 4,-5,-6, 7, 8, 9}
});
std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array1D<int,10> {
Tensor expectedOutput = Array1D<int,10> {
{0, 1, 2, 0, 4, 0, 0, 7, 8, 9}
});
};
std::shared_ptr<Node> myReLU = ReLU();
myReLU->getOperator()->setDatatype(DataType::Int32);
myReLU->getOperator()->setBackend("cpu");
myReLU->getOperator()->associateInput(0,input0);
myReLU->getOperator()->computeOutputDims();
myReLU->forward();
REQUIRE(*(myReLU->getOperator()->getOutput(0)) == *expectedOutput);
std::shared_ptr<ReLU_Op> op = std::make_shared<ReLU_Op>();
op->associateInput(0,input0);
op->setDataType(DataType::Int32);
op->setBackend("cpu");
op->forward();
REQUIRE(*(op->getOutput(0)) == expectedOutput);
}
SECTION("2D Tensor") {
......@@ -46,20 +46,19 @@ TEST_CASE("[cpu/operator] ReLU(forward)") {
{-5, 4, 2,-3, 4,-5,-6, 7,-1,10}
}
});
std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array2D<int,2,10> {
Tensor expectedOutput = Array2D<int,2,10> {
{
{ 0, 1, 2, 0, 4, 0, 0, 7, 8, 9},
{ 0, 4, 2, 0, 4, 0, 0, 7, 0,10}
}
});
};
std::shared_ptr<Node> myReLU = ReLU();
myReLU->getOperator()->setDatatype(DataType::Int32);
myReLU->getOperator()->setBackend("cpu");
myReLU->getOperator()->associateInput(0,input0);
myReLU->getOperator()->computeOutputDims();
myReLU->forward();
REQUIRE(*myReLU->getOperator()->getOutput(0) == *expectedOutput);
std::shared_ptr<ReLU_Op> op = std::make_shared<ReLU_Op>();
op->associateInput(0,input0);
op->setDataType(DataType::Int32);
op->setBackend("cpu");
op->forward();
REQUIRE(*op->getOutput(0) == expectedOutput);
}
SECTION("3D Tensor") {
......@@ -75,7 +74,7 @@ TEST_CASE("[cpu/operator] ReLU(forward)") {
}
}
});
std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array3D<int,2,2,10> {
Tensor expectedOutput = Array3D<int,2,2,10> {
{
{
{ 0, 1, 2, 0, 4, 0, 0, 7, 8, 9},
......@@ -86,15 +85,14 @@ TEST_CASE("[cpu/operator] ReLU(forward)") {
{ 0, 4, 2, 0, 4, 0, 0, 7, 0,10}
}
}
});
};
std::shared_ptr<Node> myReLU = ReLU();
myReLU->getOperator()->setDatatype(DataType::Int32);
myReLU->getOperator()->setBackend("cpu");
myReLU->getOperator()->associateInput(0,input0);
myReLU->getOperator()->computeOutputDims();
myReLU->forward();
REQUIRE(*(myReLU->getOperator()->getOutput(0)) == *expectedOutput);
std::shared_ptr<ReLU_Op> op = std::make_shared<ReLU_Op>();
op->associateInput(0,input0);
op->setDataType(DataType::Int32);
op->setBackend("cpu");
op->forward();
REQUIRE(*(op->getOutput(0)) == expectedOutput);
}
SECTION("4D Tensor") {
......@@ -122,7 +120,7 @@ TEST_CASE("[cpu/operator] ReLU(forward)") {
}
}
});
std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,2,2,2,10> {
Tensor expectedOutput = Array4D<int,2,2,2,10> {
{
{
{
......@@ -145,14 +143,13 @@ TEST_CASE("[cpu/operator] ReLU(forward)") {
}
}
}
});
};
std::shared_ptr<Node> myReLU = ReLU();
myReLU->getOperator()->setDatatype(DataType::Int32);
myReLU->getOperator()->setBackend("cpu");
myReLU->getOperator()->associateInput(0,input0);
myReLU->getOperator()->computeOutputDims();
myReLU->forward();
REQUIRE(*myReLU->getOperator()->getOutput(0) == *expectedOutput);
std::shared_ptr<ReLU_Op> op = std::make_shared<ReLU_Op>();
op->associateInput(0,input0);
op->setDataType(DataType::Int32);
op->setBackend("cpu");
op->forward();
REQUIRE(*op->getOutput(0) == expectedOutput);
}
}
\ No newline at end of file
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <algorithm> // std::fill
#include <cstddef> // std::size_t
#include <cstdint> // std::int32_t, std::uint16_t
#include <memory>
#include <random> // std::random_device, std::mt19937
// std::uniform_int_distribution, std::uniform_real_distribution
#include <vector>
#include <catch2/catch_test_macros.hpp>
#include <fmt/core.h>
#include "aidge/backend/cpu/data/TensorImpl.hpp"
#include "aidge/backend/cpu/operator/ReduceMeanImpl.hpp"
#include "aidge/data/DataType.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/ReduceMean.hpp"
#include "aidge/operator/OperatorTensor.hpp"
#include "aidge/utils/TensorUtils.hpp"
using namespace Aidge;
// Tests for the ReduceMean operator on the CPU backend:
//  - "ForwardDims": randomized shape-inference checks for every combination of
//    keepDims / noopWithEmptyAxes;
//  - remaining sections: value checks on small float tensors.
TEST_CASE("[cpu/operator] ReduceMean(forward)", "[ReduceMean][CPU]") {
    SECTION("ForwardDims")
    {
        constexpr std::uint16_t NBTRIALS = 10;
        // Create a random number generator
        std::random_device rd;
        std::mt19937 gen(rd());
        std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0 and 1
        std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
        std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
        std::uniform_int_distribution<int> boolDist(0,1);

        // keepDims = true: reduced axes stay in the output with extent 1.
        SECTION("KeepDims") {
            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
                DimSize_t nbDims = nbDimsDist(gen);
                std::vector<DimSize_t> dims(nbDims);
                std::vector<DimSize_t> expectedOutDims(nbDims);
                std::vector<std::int32_t> axes;
                // Randomly mark each axis as reduced.
                for (std::size_t i = 0; i < nbDims; i++) {
                    dims[i] = dimSizeDist(gen);
                    expectedOutDims[i] = dims[i];
                    if(boolDist(gen)) {
                        axes.push_back(i);
                        expectedOutDims[i] = 1;
                    }
                }
                if (axes.empty()) { // Default behaviour if no axes are provided is to reduce all dimensions
                    std::fill(expectedOutDims.begin(), expectedOutDims.end(), 1);
                }

                std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(dims);
                myInput->setBackend("cpu");
                myInput->setDataType(DataType::Float32);
                myInput->zeros();
                std::shared_ptr<Node> myReduceMean = ReduceMean(axes, true);
                auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator());
                op->associateInput(0,myInput);
                op->setDataType(DataType::Float32);
                op->setBackend("cpu");
                op->forwardDims();

                const auto outputDims = op->getOutput(0)->dims();
                REQUIRE(outputDims == expectedOutDims);
            }
        }
        // keepDims = false: reduced axes are removed from the output shape.
        SECTION("Not KeepDims") {
            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
                DimSize_t nbDims = nbDimsDist(gen);
                std::vector<DimSize_t> dims(nbDims);
                std::vector<DimSize_t> expectedOutDims;
                std::vector<std::int32_t> axes;
                for (std::size_t i = 0; i < nbDims; i++) {
                    dims[i] = dimSizeDist(gen);
                    if(boolDist(gen)) {
                        axes.push_back(i);
                    }
                    else {
                        expectedOutDims.push_back(dims[i]);
                    }
                }
                if (axes.empty() || expectedOutDims.empty()) { // Default behaviour if no axes are provided is to reduce all dimensions
                    expectedOutDims = std::vector<DimSize_t>{1};
                }

                std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(dims);
                myInput->setBackend("cpu");
                myInput->setDataType(DataType::Float32);
                std::shared_ptr<Node> myReduceMean = ReduceMean(axes, false);
                auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator());
                op->associateInput(0,myInput);
                op->setDataType(DataType::Float32);
                op->setBackend("cpu");
                op->forwardDims();

                const auto outputDims = op->getOutput(0)->dims();
                REQUIRE(outputDims == expectedOutDims);
            }
        }
        // noopWithEmptyAxes = true and no axes: output shape equals input shape.
        SECTION("NoopWithEmptyAxes") {
            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
                DimSize_t nbDims = nbDimsDist(gen);
                std::vector<DimSize_t> dims(nbDims);
                for (std::size_t i = 0; i < nbDims; i++) {
                    dims[i] = dimSizeDist(gen);
                }
                std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(dims);
                myInput->setBackend("cpu");
                myInput->setDataType(DataType::Float32);
                std::shared_ptr<Node> myReduceMean = ReduceMean(std::vector<int32_t>{}, false, true);
                auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator());
                op->associateInput(0,myInput);
                op->setDataType(DataType::Float32);
                op->setBackend("cpu");
                op->forwardDims();

                const auto outputDims = op->getOutput(0)->dims();
                REQUIRE(outputDims == dims);
            }
        }
        // noopWithEmptyAxes = false and no axes: all dimensions are reduced,
        // yielding a rank-1 output of size 1.
        SECTION("Not NoopWithEmptyAxes") {
            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
                DimSize_t nbDims = nbDimsDist(gen);
                std::vector<DimSize_t> dims(nbDims);
                for (std::size_t i = 0; i < nbDims; i++) {
                    dims[i] = dimSizeDist(gen);
                }
                std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(dims);
                myInput->setBackend("cpu");
                myInput->setDataType(DataType::Float32);
                std::shared_ptr<Node> myReduceMean = ReduceMean({}, false, false);
                auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator());
                op->associateInput(0,myInput);
                op->setDataType(DataType::Float32);
                op->setBackend("cpu");
                op->forwardDims();

                REQUIRE(op->getOutput(0)->nbDims() == 1);
                REQUIRE(op->getOutput(0)->size() == 1);
            }
        }
    }
    SECTION("KeepDims") {
        // Mean over axis 1 of a 3x2x2 tensor; output keeps the axis as size 1.
        SECTION("test 1") {
            std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> {
                {
                    {
                        { 5.0, 1.0 },
                        { 20.0, 2.0 }
                    },
                    {
                        { 30.0, 1.0 },
                        { 40.0, 2.0 }
                    },
                    {
                        { 55.0, 1.0 },
                        { 60.0, 2.0 }
                    }
                }
            });
            Tensor myOutput = Tensor(Array3D<float,3,1,2> {
                {
                    {{ 12.5, 1.5 }},
                    {{ 35.0, 1.5 }},
                    {{ 57.5, 1.5 }}
                }
            });

            std::shared_ptr<Node> myReduceMean = ReduceMean({1}, 1);
            auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator());
            op->associateInput(0,myInput);
            op->setDataType(DataType::Float32);
            op->setBackend("cpu");
            myReduceMean->forward();
            op->getOutput(0)->print();

            REQUIRE(*(op->getOutput(0)) == myOutput);
        }
        // Mean over axes {1,2} of a 3x3x2 tensor.
        SECTION("test 2") {
            std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,3,2> {
                {
                    {
                        { 0.0, 0.0 },
                        { 1.0, 1.0 },
                        { 2.0, 2.0 }
                    },
                    {
                        { 3.0, 3.0 },
                        { 4.0, 4.0 },
                        { 5.0, 5.0 }
                    },
                    {
                        { 6.0, 6.0 },
                        { 7.0, 7.0 },
                        { 8.0, 8.0 }
                    }
                }
            });
            Tensor myOutput = Tensor(Array3D<float,3,1,1> {
                {
                    {{ 1.0 }},
                    {{ 4.0 }},
                    {{ 7.0 }}
                }
            });

            std::shared_ptr<Node> myReduceMean = ReduceMean({1, 2}, 1);
            auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator());
            op->associateInput(0,myInput);
            op->setDataType(DataType::Float32);
            op->setBackend("cpu");
            myReduceMean->forward();
            myOutput.print();
            op->getOutput(0)->print();
            REQUIRE(*(op->getOutput(0)) == myOutput);

        }
    }
    // keepDims = 0: the reduced axis is dropped (3x2x2 -> 3x2).
    SECTION("not_KeepDims") {
        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> {
            {
                {
                    { 5.0, 1.0 },
                    { 20.0, 2.0 }
                },
                {
                    { 30.0, 1.0 },
                    { 40.0, 2.0 }
                },
                {
                    { 55.0, 1.0 },
                    { 60.0, 2.0 }
                }
            }
        });
        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float,3,2> {
            {
                { 12.5, 1.5 },
                { 35.0, 1.5 },
                { 57.5, 1.5 }
            }
        });

        std::shared_ptr<Node> myReduceMean = ReduceMean({1}, 0);
        auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator());
        op->associateInput(0,myInput);
        op->setDataType(DataType::Float32);
        op->setBackend("cpu");
        myReduceMean->forward();
        op->getOutput(0)->print();

        REQUIRE(*(op->getOutput(0)) == *myOutput);

    }
    // Empty axes list with noopWithEmptyAxes left false: reduce over ALL axes.
    SECTION("all_axes") {
        SECTION("1") {
            std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> {
                {
                    {
                        { 5.0, 1.0 },
                        { 20.0, 2.0 }
                    },
                    {
                        { 30.0, 1.0 },
                        { 40.0, 2.0 }
                    },
                    {
                        { 55.0, 1.0 },
                        { 60.0, 2.0 }
                    }
                }
            });
            std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array1D<float,1> {
                {18.25}
            });

            std::shared_ptr<Node> myReduceMean = ReduceMean({}, 0);
            auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator());
            op->associateInput(0,myInput);
            op->setDataType(DataType::Float32);
            op->setBackend("cpu");
            myReduceMean->forward();
            op->getOutput(0)->print();

            REQUIRE(*(op->getOutput(0)) == *myOutput);
        }
        SECTION("2") {
            std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float,5,4> {
                {{ 0.004232f, 0.105120f, 0.045124f, 0.009205f},
                 { 0.000766f, 0.272162f, 0.503560f, 0.044163f},
                 { 0.049755f, 0.000305f, 0.143634f, 0.013253f},
                 { 0.096258f, 0.311231f, 0.358143f, 0.000452f},
                 { 0.468617f, 0.015693f, 0.145316f, 0.000105f}}
            });
            std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array1D<float,1> {
                {0.1293547f}
            });

            std::shared_ptr<Node> myReduceMean = ReduceMean({}, 0);
            auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator());
            op->associateInput(0,myInput);
            op->setDataType(DataType::Float32);
            op->setBackend("cpu");
            myReduceMean->forward();

            // Float mean: compare with tolerance rather than exact equality.
            REQUIRE(approxEq<float>(*(op->getOutput(0)), *myOutput));
        }
        // Empty axes with noopWithEmptyAxes = 1: operator passes input through.
        SECTION("noop_with_empty_axes") {
            std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> {
                {
                    {
                        { 5.0, 1.0 },
                        { 20.0, 2.0 }
                    },
                    {
                        { 30.0, 1.0 },
                        { 40.0, 2.0 }
                    },
                    {
                        { 55.0, 1.0 },
                        { 60.0, 2.0 }
                    }
                }
            });

            std::shared_ptr<Node> myReduceMean = ReduceMean({}, 0, 1);
            auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator());
            op->associateInput(0,myInput);
            op->setDataType(DataType::Float32);
            op->setBackend("cpu");
            myReduceMean->forward();
            op->getOutput(0)->print();

            REQUIRE(*(op->getOutput(0)) == *myInput);
        }
    }
}
\ No newline at end of file
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <cstddef> // std::size_t
#include <cstdint> // std::uint16_t, std::int32_t
#include <memory>
#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution
#include <vector>
#include <catch2/catch_test_macros.hpp>
#include "aidge/backend/cpu/data/TensorImpl.hpp"
#include "aidge/data/Data.hpp" // DataType
#include "aidge/data/Tensor.hpp"
#include "aidge/graph/Node.hpp"
#include "aidge/operator/OperatorTensor.hpp"
#include "aidge/operator/ReduceSum.hpp"
#include "aidge/utils/TensorUtils.hpp"
#include "aidge/utils/Types.h"
using namespace Aidge;
TEST_CASE("[cpu/operator] ReduceSum(forward)", "[ReduceSum][CPU]") {
SECTION("ForwardDims")
{
constexpr std::uint16_t NBTRIALS = 10;
// Create a random number generator
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0 and 1
std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
std::uniform_int_distribution<int> boolDist(0,1);
SECTION("KeepDims") {
for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
DimSize_t nbDims = nbDimsDist(gen);
std::vector<DimSize_t> dims(nbDims);
std::vector<DimSize_t> expectedOutDims(nbDims);
std::vector<std::int32_t> axes;
for (std::size_t i = 0; i < nbDims; i++) {
dims[i] = dimSizeDist(gen);
expectedOutDims[i] = dims[i];
if(boolDist(gen)) {
axes.push_back(i);
expectedOutDims[i] = 1;
}
}
if (axes.empty()) { // Default behaviour if no axes are provided is to reduce all dimensions
std::fill(expectedOutDims.begin(), expectedOutDims.end(), 1);
}
std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(dims);
myInput->setBackend("cpu");
myInput->setDataType(DataType::Float32);
myInput->zeros();
std::shared_ptr<Node> myReduceSum = ReduceSum(axes, true);
auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
op->associateInput(0,myInput);
op->setDataType(DataType::Float32);
op->setBackend("cpu");
op->forwardDims();
const auto outputDims = op->getOutput(0)->dims();
REQUIRE(outputDims == expectedOutDims);
}
}
SECTION("Not KeepDims") {
for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
DimSize_t nbDims = nbDimsDist(gen);
std::vector<DimSize_t> dims(nbDims);
std::vector<DimSize_t> expectedOutDims;
std::vector<std::int32_t> axes;
for (std::size_t i = 0; i < nbDims; i++) {
dims[i] = dimSizeDist(gen);
if(boolDist(gen)) {
axes.push_back(i);
}
else {
expectedOutDims.push_back(dims[i]);
}
}
if (axes.empty() || expectedOutDims.empty()) { // Default behaviour if no axes are provided is to reduce all dimensions
expectedOutDims = std::vector<DimSize_t>{1};
}
std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(dims);
myInput->setBackend("cpu");
myInput->setDataType(DataType::Float32);
std::shared_ptr<Node> myReduceSum = ReduceSum(axes, false);
auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
op->associateInput(0,myInput);
op->setDataType(DataType::Float32);
op->setBackend("cpu");
op->forwardDims();
const auto outputDims = op->getOutput(0)->dims();
REQUIRE(outputDims == expectedOutDims);
}
}
SECTION("NoopWithEmptyAxes") {
for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
DimSize_t nbDims = nbDimsDist(gen);
std::vector<DimSize_t> dims(nbDims);
for (std::size_t i = 0; i < nbDims; i++) {
dims[i] = dimSizeDist(gen);
}
std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(dims);
myInput->setBackend("cpu");
myInput->setDataType(DataType::Float32);
std::shared_ptr<Node> myReduceSum = ReduceSum(std::vector<std::int32_t>{}, false, true);
auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
op->associateInput(0,myInput);
op->setDataType(DataType::Float32);
op->setBackend("cpu");
op->forwardDims();
const auto outputDims = op->getOutput(0)->dims();
REQUIRE(outputDims == dims);
}
}
SECTION("Not NoopWithEmptyAxes") {
for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
DimSize_t nbDims = nbDimsDist(gen);
std::vector<DimSize_t> dims(nbDims);
for (std::size_t i = 0; i < nbDims; i++) {
dims[i] = dimSizeDist(gen);
}
std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(dims);
myInput->setBackend("cpu");
myInput->setDataType(DataType::Float32);
std::shared_ptr<Node> myReduceSum = ReduceSum({}, false, false);
auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
op->associateInput(0,myInput);
op->setDataType(DataType::Float32);
op->setBackend("cpu");
op->forwardDims();
REQUIRE(op->getOutput(0)->nbDims() == 1);
REQUIRE(op->getOutput(0)->size() == 1);
}
}
}
    SECTION("KeepDims") {
        SECTION("test 1") {
            // Sum over axis 1 with keepDims: (3,2,2) -> (3,1,2).
            std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> {
                {
                    {
                        { 5.0, 1.0 },
                        { 20.0, 2.0 }
                    },
                    {
                        { 30.0, 1.0 },
                        { 40.0, 2.0 }
                    },
                    {
                        { 55.0, 1.0 },
                        { 60.0, 2.0 }
                    }
                }
            });
            // Column sums of each 2x2 slice, e.g. 5+20 = 25 and 1+2 = 3.
            Tensor myOutput = Tensor(Array3D<float,3,1,2> {
                {
                    {{ 25.0, 3.0 }},
                    {{ 70.0, 3.0 }},
                    {{ 115.0, 3.0 }}
                }
            });
            std::shared_ptr<Node> myReduceSum = ReduceSum({1}, 1);
            auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
            op->associateInput(0,myInput);
            op->setDataType(DataType::Float32);
            op->setBackend("cpu");
            myReduceSum->forward();
            op->getOutput(0)->print();
            REQUIRE(*(op->getOutput(0)) == myOutput);
        }
        SECTION("test 2") {
            // Sum over axes {1, 2} with keepDims: (3,3,2) -> (3,1,1).
            std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,3,2> {
                {
                    {
                        { 0.0, 0.0 },
                        { 1.0, 1.0 },
                        { 2.0, 2.0 }
                    },
                    {
                        { 3.0, 3.0 },
                        { 4.0, 4.0 },
                        { 5.0, 5.0 }
                    },
                    {
                        { 6.0, 6.0 },
                        { 7.0, 7.0 },
                        { 8.0, 8.0 }
                    }
                }
            });
            // Per-slice totals: 0+0+1+1+2+2 = 6, etc.
            Tensor myOutput = Tensor(Array3D<float,3,1,1> {
                {
                    {{ 6.0 }},
                    {{ 24.0 }},
                    {{ 42.0 }}
                }
            });
            std::shared_ptr<Node> myReduceSum = ReduceSum({1, 2}, 1);
            auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
            op->associateInput(0,myInput);
            op->setDataType(DataType::Float32);
            op->setBackend("cpu");
            myReduceSum->forward();
            myOutput.print();
            op->getOutput(0)->print();
            REQUIRE(*(op->getOutput(0)) == myOutput);
        }
    }
SECTION("not_KeepDims") {
std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> {
{
{
{ 5.0, 1.0 },
{ 20.0, 2.0 }
},
{
{ 30.0, 1.0 },
{ 40.0, 2.0 }
},
{
{ 55.0, 1.0 },
{ 60.0, 2.0 }
}
}
});
std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float,3,2> {
{
{ 25.0, 3.0 },
{ 70.0, 3.0 },
{ 115.0, 3.0 }
}
});
std::shared_ptr<Node> myReduceSum = ReduceSum({1}, 0);
auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
op->associateInput(0,myInput);
op->setDataType(DataType::Float32);
op->setBackend("cpu");
myReduceSum->forward();
op->getOutput(0)->print();
REQUIRE(*(op->getOutput(0)) == *myOutput);
}
    SECTION("all_axes") {
        // Reducing over every axis (explicitly or via an empty axes list with
        // noop_with_empty_axes == 0) yields a single summed value.
        SECTION("1") {
            std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> {
                {
                    {
                        { 5.0, 1.0 },
                        { 20.0, 2.0 }
                    },
                    {
                        { 30.0, 1.0 },
                        { 40.0, 2.0 }
                    },
                    {
                        { 55.0, 1.0 },
                        { 60.0, 2.0 }
                    }
                }
            });
            // Total of all 12 values.
            std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array1D<float,1> {
                {219.0}
            });
            std::shared_ptr<Node> myReduceSum = ReduceSum({}, 0);
            auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
            op->associateInput(0,myInput);
            op->setDataType(DataType::Float32);
            op->setBackend("cpu");
            myReduceSum->forward();
            op->getOutput(0)->print();
            REQUIRE(*(op->getOutput(0)) == *myOutput);
        }
        SECTION("2") {
            // Same reduction, but with every axis listed explicitly ({0, 1}).
            std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float,5,4> {
                {{ 0.004232f, 0.105120f, 0.045124f, 0.009205f},
                { 0.000766f, 0.272162f, 0.503560f, 0.044163f},
                { 0.049755f, 0.000305f, 0.143634f, 0.013253f},
                { 0.096258f, 0.311231f, 0.358143f, 0.000452f},
                { 0.468617f, 0.015693f, 0.145316f, 0.000105f}}
            });
            // Compared with approxEq because of float rounding.
            std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array1D<float,1> {
                {2.587094f}
            });
            std::shared_ptr<Node> myReduceSum = ReduceSum({0, 1}, 0);
            auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
            op->associateInput(0,myInput);
            op->setDataType(DataType::Float32);
            op->setBackend("cpu");
            myReduceSum->forward();
            op->getOutput(0)->print();
            REQUIRE(approxEq<float>(*(op->getOutput(0)), *myOutput));
        }
        SECTION("noop_with_empty_axes") {
            // With noop_with_empty_axes == 1 and an empty axes list, the
            // operator must forward its input unchanged.
            std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> {
                {
                    {
                        { 5.0, 1.0 },
                        { 20.0, 2.0 }
                    },
                    {
                        { 30.0, 1.0 },
                        { 40.0, 2.0 }
                    },
                    {
                        { 55.0, 1.0 },
                        { 60.0, 2.0 }
                    }
                }
            });
            std::shared_ptr<Node> myReduceSum = ReduceSum({}, 0, 1);
            auto op = std::static_pointer_cast<OperatorTensor>(myReduceSum -> getOperator());
            op->associateInput(0,myInput);
            op->setDataType(DataType::Float32);
            op->setBackend("cpu");
            myReduceSum->forward();
            op->getOutput(0)->print();
            REQUIRE(*(op->getOutput(0)) == *myInput);
        }
    }
}
\ No newline at end of file
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#include <cstdint>
#include <memory>
#include <aidge/data/Data.hpp>
#include <aidge/data/Interpolation.hpp>
#include <aidge/data/half.hpp>
#include <aidge/operator/Pad.hpp>
#include <aidge/utils/ArrayHelpers.hpp>
#include <catch2/catch_test_macros.hpp>
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/OperatorTensor.hpp"
#include "aidge/operator/Resize.hpp"
#include "aidge/utils/TensorUtils.hpp"
namespace Aidge {
TEST_CASE("[cpu/operator] Resize(forward)", "[Resize][CPU]") {
Log::setConsoleLevel(Log::Level::Debug);
SECTION("Nearest") {
SECTION("Ceil") {
std::shared_ptr<Tensor> input_tensor = std::make_shared<Tensor>(Array4D<std::int32_t, 1, 1, 2, 2>{{
{
{
{ 1, 2},
{ 3, 4}
}
}
}});
Tensor expected_out_tensor = Tensor(Array4D<std::int32_t, 1, 1, 4, 4>{{
{
{
{ 1, 1, 1, 2},
{ 1, 1, 1, 2},
{ 1, 1, 1, 2},
{ 3, 3, 3, 4}
}
}
}});
std::vector<float> scales = {1.0f, 1.0f, 2.0f, 2.0f};
auto resize_node = Resize(scales, {}, Interpolation::CoordinateTransformation::HalfPixel, Interpolation::Mode::Floor);
auto op = std::static_pointer_cast<Resize_Op>(resize_node->getOperator());
op->associateInput(0, input_tensor);
op->setDataType(DataType::Int32);
op->setBackend("cpu");
op->forwardDims(true);
op->forward();
op->getOutput(0)->print();
expected_out_tensor.print();
CHECK(*(op->getOutput(0)) == expected_out_tensor);
}
}
SECTION("1-sized input tensor (upscaling)") {
std::shared_ptr<Tensor> input_tensor = std::make_shared<Tensor>(Array4D<float, 1, 1, 1, 1>{{{{{0.417022}}}}});
std::vector<std::size_t> sizes = {1, 1, 2, 2};
auto resize_node = Resize({}, sizes, Interpolation::CoordinateTransformation::HalfPixel, Interpolation::Mode::Linear);
auto op = std::static_pointer_cast<Resize_Op>(resize_node->getOperator());
op->associateInput(0, input_tensor);
op->setDataType(DataType::Float32);
op->setBackend("cpu");
op->forwardDims(true);
op->forward();
std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<float, 1, 1, 2, 2>{
{{{{0.417022, 0.417022}, {0.417022, 0.417022}}}}});
op->getOutput(0)->print();
CHECK(approxEq<float>(*op->getOutput(0), *expectedOutput) == true);
}
    SECTION("Upscaling from 5x5 to 10x10 (linear)") {
        // Bilinear upscale of a 5x5 map to 10x10 using the asymmetric
        // coordinate transformation (src = dst / scale, no half-pixel shift).
        // Expected values are compared with absolute/relative tolerance 1e-5.
        std::shared_ptr<Tensor> input_tensor = std::make_shared<Tensor>(
            Array4D<float, 1, 1, 5, 5>{{{{{7.20324516e-01,
                                           1.14374816e-04,
                                           3.02332580e-01,
                                           1.46755889e-01,
                                           9.23385918e-02},
                                          {1.86260208e-01,
                                           3.45560730e-01,
                                           3.96767467e-01,
                                           5.38816750e-01,
                                           4.19194520e-01},
                                          {6.85219526e-01,
                                           2.04452246e-01,
                                           8.78117442e-01,
                                           2.73875929e-02,
                                           6.70467496e-01},
                                          {4.17304814e-01,
                                           5.58689833e-01,
                                           1.40386939e-01,
                                           1.98101491e-01,
                                           8.00744593e-01},
                                          {9.68261600e-01,
                                           3.13424170e-01,
                                           6.92322612e-01,
                                           8.76389146e-01,
                                           8.94606650e-01}}}}}
        );
        // Target output shape (N, C, H, W).
        std::vector<std::size_t> sizes = {1, 1, 10, 10};
        auto resize_node = Resize({}, sizes, Interpolation::CoordinateTransformation::Asymmetric, Interpolation::Mode::Linear);
        auto op = std::static_pointer_cast<Resize_Op>(resize_node->getOperator());
        op->associateInput(0, input_tensor);
        op->setDataType(DataType::Float32);
        op->setBackend("cpu");
        op->forwardDims(true);
        op->forward();
        // Reference 10x10 result; the last row/column repeat because the
        // asymmetric transform clamps source coordinates at the input border.
        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(
            Array4D<float, 1, 1, 10, 10>{{{{{7.20324516e-01,
                                             3.60219449e-01,
                                             1.14374816e-04,
                                             1.51223481e-01,
                                             3.02332580e-01,
                                             2.24544227e-01,
                                             1.46755889e-01,
                                             1.19547240e-01,
                                             9.23385918e-02,
                                             9.23385918e-02},
                                            {4.53292370e-01,
                                             3.13064963e-01,
                                             1.72837555e-01,
                                             2.61193782e-01,
                                             3.49550009e-01,
                                             3.46168160e-01,
                                             3.42786312e-01,
                                             2.99276441e-01,
                                             2.55766571e-01,
                                             2.55766571e-01},
                                            {1.86260208e-01,
                                             2.65910476e-01,
                                             3.45560730e-01,
                                             3.71164083e-01,
                                             3.96767467e-01,
                                             4.67792094e-01,
                                             5.38816750e-01,
                                             4.79005635e-01,
                                             4.19194520e-01,
                                             4.19194520e-01},
                                            {4.35739875e-01,
                                             3.55373204e-01,
                                             2.75006473e-01,
                                             4.56224471e-01,
                                             6.37442470e-01,
                                             4.60272312e-01,
                                             2.83102185e-01,
                                             4.13966596e-01,
                                             5.44831038e-01,
                                             5.44831038e-01},
                                            {6.85219526e-01,
                                             4.44835901e-01,
                                             2.04452246e-01,
                                             5.41284859e-01,
                                             8.78117442e-01,
                                             4.52752531e-01,
                                             2.73875929e-02,
                                             3.48927557e-01,
                                             6.70467496e-01,
                                             6.70467496e-01},
                                            {5.51262140e-01,
                                             4.66416597e-01,
                                             3.81571054e-01,
                                             4.45411623e-01,
                                             5.09252191e-01,
                                             3.10998380e-01,
                                             1.12744540e-01,
                                             4.24175322e-01,
                                             7.35606015e-01,
                                             7.35606015e-01},
                                            {4.17304814e-01,
                                             4.87997323e-01,
                                             5.58689833e-01,
                                             3.49538386e-01,
                                             1.40386939e-01,
                                             1.69244215e-01,
                                             1.98101491e-01,
                                             4.99423027e-01,
                                             8.00744593e-01,
                                             8.00744593e-01},
                                            {6.92783237e-01,
                                             5.64420104e-01,
                                             4.36057001e-01,
                                             4.26205903e-01,
                                             4.16354775e-01,
                                             4.76800054e-01,
                                             5.37245333e-01,
                                             6.92460477e-01,
                                             8.47675622e-01,
                                             8.47675622e-01},
                                            {9.68261600e-01,
                                             6.40842915e-01,
                                             3.13424170e-01,
                                             5.02873421e-01,
                                             6.92322612e-01,
                                             7.84355879e-01,
                                             8.76389146e-01,
                                             8.85497928e-01,
                                             8.94606650e-01,
                                             8.94606650e-01},
                                            {9.68261600e-01,
                                             6.40842915e-01,
                                             3.13424170e-01,
                                             5.02873421e-01,
                                             6.92322612e-01,
                                             7.84355879e-01,
                                             8.76389146e-01,
                                             8.85497928e-01,
                                             8.94606650e-01,
                                             8.94606650e-01}}}}});
        Log::notice("Expected result : dims = {}", expectedOutput->dims());
        expectedOutput->print();
        Log::notice("\nActual result: dims = {}", op->getOutput(0)->dims());
        op->getOutput(0)->print();
        CHECK(approxEq<float>(*op->getOutput(0),
                              *expectedOutput,
                              1e-5f,
                              1e-5f) == true);
    }
}
} // namespace Aidge