diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 97fcaa704b72922d35ad70feb923633fa194c850..56dc0ef17faa0be88f81cc4b7ed95e4d654a4c38 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -12,19 +12,14 @@ stages:
   - deploy
 
 include:
-  - project: 'eclipse/aidge/gitlab_shared_files' 
+  - project: 'eclipse/aidge/gitlab_shared_files'
     ref: 'main'
-    file: 
-      # choose which jobs to run by including the corresponding files.
+    file: # choose which jobs to run by including the corresponding files.
       - '.gitlab/ci/ubuntu_cpp.gitlab-ci.yml'
 
       - '.gitlab/ci/ubuntu_python.gitlab-ci.yml'
-      - '.gitlab/ci/release/cibuildwheel_ubuntu.gitlab-ci.yml'   
+      - '.gitlab/ci/release/cibuildwheel_ubuntu.gitlab-ci.yml'
 
       - '.gitlab/ci/windows_cpp.gitlab-ci.yml'
-
-      - '.gitlab/ci/windows_python.gitlab-ci.yml'   
-      - '.gitlab/ci/release/cibuildwheel_windows.gitlab-ci.yml'   
-
-    
-
+      - '.gitlab/ci/windows_python.gitlab-ci.yml'
+      - '.gitlab/ci/release/cibuildwheel_windows.gitlab-ci.yml'
diff --git a/CHANGELOG b/CHANGELOG
index 9a76d7b11556b434cf9749d625cedea85dc6c5ac..a461371a17b586e8ebc65172282153a6ae8e09e2 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,5 @@
+# Version 0.4.0 (December 6, 2024)
+
 # Version 0.2.2 (May 14, 2024)
 
 * Remove implementation for Operators solely handling memory and format
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3574e25cec5977bc2249c7d756041c09650f9b11..e9e191c36d5ad57a9a9dbed378154db6676ec796 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -22,6 +22,9 @@ execute_process(
 message(STATUS "Latest git commit: ${GIT_COMMIT_HASH}")
 add_definitions(-DGIT_COMMIT_HASH="${GIT_COMMIT_HASH}")
 
+# Generate compile_commands.json so LSP tooling (e.g. clangd) can index the build
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
 # Note : project name is ${CMAKE_PROJECT_NAME} and python module name is also ${CMAKE_PROJECT_NAME}
 set(module_name _${CMAKE_PROJECT_NAME}) # target name
 set(pybind_module_name ${CMAKE_PROJECT_NAME}) # name of submodule for python bindings
diff --git a/aidge_backend_cpu/unit_tests/test_scheduler.py b/aidge_backend_cpu/unit_tests/test_scheduler.py
index c37fc54437c02b0bb1c6f09a1c73d5cc538fa4c0..494f34565ffd644971c97e9adfa06709dee9e36d 100644
--- a/aidge_backend_cpu/unit_tests/test_scheduler.py
+++ b/aidge_backend_cpu/unit_tests/test_scheduler.py
@@ -13,9 +13,10 @@ class test_scheduler(unittest.TestCase):
         pass
 
     def test_relu_forward(self):
-        values = np.arange(6) - 3
 
-        input_node = aidge_core.Producer(aidge_core.Tensor(values), "Input")
+        t = aidge_core.Tensor(np.arange(6, dtype=np.int32) - 3)
+
+        input_node = aidge_core.Producer(t)
         relu = aidge_core.ReLU()
         input_node.add_child(relu)
 
@@ -34,7 +35,7 @@ class test_scheduler(unittest.TestCase):
         out_tensor = relu.get_operator().get_output(0)
         expected_out = [0,0,0,0,1,2]
         for i in range(len(expected_out)):
-            self.assertEqual(expected_out[i], out_tensor[i])
+            self.assertEqual(expected_out[i], out_tensor[i], f"Mismatch at index {i}")
 
     def test_sequential_scheduling(self):
         input_data =  np.array([0]).astype(np.float32)
@@ -69,7 +70,7 @@ class test_scheduler(unittest.TestCase):
             aidge_core.Producer(input_tensor, "X"),
             aidge_core.FC(1, 50, name='0'),
             aidge_core.parallel([aidge_core.FC(50, 50, name='1'), aidge_core.FC(50, 50, name='3')]),
-            aidge_core.Add(2, name='2'),
+            aidge_core.Add(name='2'),
         ])
 
         EXPECTED_SCHEDULE = [['0', '1', '3', '2'],  ['0', '3', '1', '2']] # Both scheduling are valid !
diff --git a/include/aidge/backend/cpu.hpp b/include/aidge/backend/cpu.hpp
index b45aa1cb4151d8d6c5268d4a94da97bb25a89a40..caa75328e58f6c9581f81368a3981bb79a069d49 100644
--- a/include/aidge/backend/cpu.hpp
+++ b/include/aidge/backend/cpu.hpp
@@ -15,11 +15,14 @@
 #include "aidge/backend/cpu/operator/AbsImpl.hpp"
 #include "aidge/backend/cpu/operator/AddImpl.hpp"
 #include "aidge/backend/cpu/operator/AndImpl.hpp"
+#include "aidge/backend/cpu/operator/AtanImpl.hpp"
+
 #include "aidge/backend/cpu/operator/ArgMaxImpl.hpp"
 #include "aidge/backend/cpu/operator/AvgPoolingImpl.hpp"
 #include "aidge/backend/cpu/operator/MaxPoolingImpl.hpp"
 #include "aidge/backend/cpu/operator/BatchNormImpl.hpp"
 #include "aidge/backend/cpu/operator/BitShiftImpl.hpp"
+#include "aidge/backend/cpu/operator/ClipImpl.hpp"
 #include "aidge/backend/cpu/operator/ConvDepthWiseImpl.hpp"
 #include "aidge/backend/cpu/operator/ConvImpl.hpp"
 #include "aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp"
@@ -28,15 +31,19 @@
 #include "aidge/backend/cpu/operator/FCImpl.hpp"
 #include "aidge/backend/cpu/operator/FoldImpl.hpp"
 #include "aidge/backend/cpu/operator/GlobalAveragePoolingImpl.hpp"
+#include "aidge/backend/cpu/operator/LRNImpl.hpp"
 #include "aidge/backend/cpu/operator/LeakyReLUImpl.hpp"
 #include "aidge/backend/cpu/operator/LnImpl.hpp"
 #include "aidge/backend/cpu/operator/MatMulImpl.hpp"
 #include "aidge/backend/cpu/operator/MulImpl.hpp"
 #include "aidge/backend/cpu/operator/PadImpl.hpp"
+#include "aidge/backend/cpu/operator/PaddedConvImpl.hpp"
 #include "aidge/backend/cpu/operator/PowImpl.hpp"
 #include "aidge/backend/cpu/operator/ReduceMeanImpl.hpp"
 #include "aidge/backend/cpu/operator/ReduceSumImpl.hpp"
+#include "aidge/backend/cpu/operator/ResizeImpl.hpp"
 #include "aidge/backend/cpu/operator/ReLUImpl.hpp"
+#include "aidge/backend/cpu/operator/RoundImpl.hpp"
 #include "aidge/backend/cpu/operator/ScalingImpl.hpp"
 #include "aidge/backend/cpu/operator/SigmoidImpl.hpp"
 #include "aidge/backend/cpu/operator/SqrtImpl.hpp"
diff --git a/include/aidge/backend/cpu/data/Interpolation.hpp b/include/aidge/backend/cpu/data/Interpolation.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..5909f02a190f4e10cdeb878505fdfea1a17e2d75
--- /dev/null
+++ b/include/aidge/backend/cpu/data/Interpolation.hpp
@@ -0,0 +1,117 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_DATA_INTERPOLATION_H_
+#define AIDGE_CPU_DATA_INTERPOLATION_H_
+
+#include <vector>
+
+#include <aidge/data/Interpolation.hpp>
+#include <aidge/utils/Types.h>
+
+namespace Aidge {
+class InterpolationCPU : public Interpolation {
+  public:
+    /**
+     * @brief Interpolates values given via input in the given mode.
+     *
+     * Values are contiguously arranged in a "square" shape around the point
+     * to interpolate, depending on the interpolation mode.
+     * The point to interpolate is located right in the middle of all points.
+     * Immediate neighbours:
+     * 1D interp :     2D interp :
+     *                 . . . . . .
+     * . . 1 2 . .     . . . . . .
+     *                 . . 1 2 . .
+     *                 . . 3 4 . .
+     *                 . . . . . .
+     *                 . . . . . .
+     *
+     * 2 neighbours :
+     * 1D interp :         2D interp :
+     *                   .  .  .  .  .  .  . .
+     *                   .  .  .  .  .  .  . .
+     * . . 1 2 3 4 . .   .  .  1  2  3  4  . .
+     *                   .  .  5  6  7  8  . .
+     *                   .  .  9 10 11 12  . .
+     *                   .  . 13 14 15 16  . .
+     *                   .  .  .  .  .  .  . .
+     *                   .  .  .  .  .  .  . .
+     *
+     * @param[in] coordsToInterpolate: coordinates of the point to interpolate
+     * in the original picture. These coordinates are generated with
+     * Interpolation::untransformCoords(coordsInInterpolatedTensor).
+     * @param[in] points: points to interpolate, arranged as pairs
+     * ((point_coord), value):
+     * [[[X1, X2, ..., XN], Xval], ...., [[A1, A2, ..., AN], Aval]].
+     * With:
+     * - N: the number of dimensions.
+     * - A: the number of points of the grid to interpolate.
+     * - All coordinates expressed in the originalTensor frame.
+     * @param[in] interpMode: interpolation mode.
+     * @return interpolated value
+     */
+    template <typename T>
+    static T interpolate(const std::vector<float> &coordsToInterpolate,
+                         const std::set<Point<T>> &points,
+                         const Mode interpMode = Interpolation::Mode::Linear);
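+    // Usage sketch (assuming Point<T> pairs integer grid coordinates with a value):
+    //   std::set<Interpolation::Point<float>> pts = {{{0, 0}, 10.f}, {{0, 1}, 20.f},
+    //                                                {{1, 0}, 30.f}, {{1, 1}, 40.f}};
+    //   const float v = InterpolationCPU::interpolate<float>({0.5f, 0.5f}, pts);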
+
+    /**
+     * @brief Performs linear interpolation on given points.
+     * @note This is a wrapper around the linearRecurse() private method.
+     * @param[in] originalCoords: coordinates of the point to interpolate.
+     * @param[in] points: values to interpolate; since we only compute an
+     * average of all values, their indexes are not needed.
+     * @return interpolated value
+     */
+    template <typename T>
+    static T linear(const std::vector<float> &originalCoords,
+                    const std::set<Point<T>> &points);
+
+    /**
+     * @brief Performs nearest interpolation on given points.
+     * @param[in] coordsToInterpolate: coordinates to interpolate.
+     * @param[in] points: points to interpolate.
+     * @param[in] nearestMode: interpolation method; must be one of the
+     * Nearest... modes, otherwise the function will throw an error.
+     * @return interpolated value
+     */
+    template <typename T>
+    static T nearest(const std::vector<float> &coordsToInterpolate,
+                     const std::set<Point<T>> &points,
+                     const Interpolation::Mode nearestMode);
+
+  private:
+    /**
+     * @brief Actual linear interpolation function. It will:
+     * - Split all points along each dimension depending on whether their
+     * coordinate at index alongDim is above or below coordsToInterpolate,
+     * until the splits are 1-to-1.
+     * - Interpolate between the 2 leftover points and return the interpolated
+     * point to the parent call as a set of size 1.
+     * - Repeat until all dimensions have been interpolated.
+     * @param[in] coordsToInterpolate: coordinates to interpolate.
+     * @param[in] points: points to interpolate.
+     * @param[in] alongDim: dimension along which points are being segregated.
+     * @return set containing the interpolated point(s).
+     */
+    template <typename T>
+    static std::set<Interpolation::Point<T>>
+    linearRecurse(const std::vector<float> &coordsToInterpolate,
+                  const std::set<Point<T>> &points,
+                  const DimIdx_t alongDim = 0);
+};
+
+} // namespace Aidge
+
+#endif // AIDGE_CPU_DATA_INTERPOLATION_H_
diff --git a/include/aidge/backend/cpu/operator/AddImpl.hpp b/include/aidge/backend/cpu/operator/AddImpl.hpp
index 5e795922a67be178dde588e8e5e346ec268efe86..e39c35b42fdb6065aa72aee092cd1cd23b2b1011 100644
--- a/include/aidge/backend/cpu/operator/AddImpl.hpp
+++ b/include/aidge/backend/cpu/operator/AddImpl.hpp
@@ -25,7 +25,7 @@
 namespace Aidge {
 // Operator implementation entry point for the backend
 using AddImpl_cpu = OperatorImpl_cpu<Add_Op,
-    void(const std::vector<const void*>, const std::vector<std::vector<std::size_t>>&, const std::size_t, const std::vector<std::size_t>&, void*)>;
+    void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*)>;
 
 // Implementation entry point registration to Operator
 REGISTRAR(Add_Op, "cpu", Aidge::AddImpl_cpu::create);
diff --git a/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp
index 4a4ba2a8999c4dc33fc743b5a3a7dad023f9e0dd..e6d13fcf3699824a8410015d35ff766adf617c11 100644
--- a/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp
@@ -14,31 +14,137 @@
 
 #include "aidge/utils/Registrar.hpp"
 
-#include <cstdint>     // std::int32_t, std::int64_t
+#include <cstddef>     // std::size_t
+#include <cstdint>     // std::int32_t
+#include <functional>  // std::multiplies
+#include <memory>      // std::make_unique, std::unique_ptr
+#include <numeric>     // std::accumulate
 
 #include "aidge/backend/cpu/data/Broadcasting.hpp"
 #include "aidge/backend/cpu/operator/AddImpl.hpp"
 
 namespace Aidge {
 
+namespace {
+// assumes values are contiguous in memory
 template <class I, class O>
-void AddImpl_cpu_forward_kernel(const std::vector<const void*> inputs_, const std::vector<std::vector<std::size_t>>& inputDims, const std::size_t outputLength, const std::vector<std::size_t>& outDims, void* output_) {
-    // FIXME: missing Add attributes as arguments
-    std::vector<const I*> inputs;
-    for (const auto& input_ : inputs_) {
-        inputs.push_back(static_cast<const I*>(input_));
+void add_contiguous_arrays(const std::size_t input1size,
+                            const std::size_t input2size,
+                            const std::size_t output1size,
+                            const I* input1,
+                            const I* input2,
+                            O* output)
+{
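+    // an input of size 1 is broadcast against every element of the output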
+    for (std::size_t i = 0; i < output1size; ++i)
+    {
+        const std::size_t in1_id = (input1size != 1) ? i : 0;
+        const std::size_t in2_id = (input2size != 1) ? i : 0;
+        output[i] = static_cast<O>(input1[in1_id] + input2[in2_id]);
     }
+}
+}  // namespace
+
+template <class I, class O>
+void AddImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
+                                std::vector<std::size_t> dims1,
+                                const std::vector<std::size_t>& outputDims,
+                                const void* input0_,
+                                const void* input1_,
+                                void* output_) {
+
+    const I* input_0 = static_cast<const I*>(input0_);
+    const I* input_1 = static_cast<const I*>(input1_);
     O* output = static_cast<O*>(output_);
 
-	for (std::size_t oIndex = 0; oIndex < outputLength; ++oIndex)
-	{
-        output[oIndex] = 0;
-		std::vector<size_t> indexes = getMultiDimIndices(outDims, oIndex);
-		for(std::size_t iIndex = 0; iIndex < inputs.size(); ++iIndex) {
-			std::size_t idx = getFlattenedIndex(inputDims[iIndex], indexes);
-            output[oIndex] += inputs[iIndex][idx];
-		}
-	}
+    // [5,2,1,7] & [2,6,7]
+    // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7]
+    // 2. Find the highest equal dimension -> 3
+    //    Exception: if the first diverging dimension is the last one, then -> 4 (dims.size())
+    // 3. Compute the highest number of contiguous data -> 7
+    // 4. Compute stride and offset step for the broadcast mechanism
+    // 5. Call a simple kernel
+
+    // special case for equal dimensions, the kernel is called with the entire arrays at once
+    if (dims0 == dims1) {
+        const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+        for (std::size_t i = 0; i < input0_contiguous_size; ++i)
+        {
+            output[i] = static_cast<O>(input_0[i] + input_1[i]);
+        }
+        return;
+    }
+
+    // set dimensions to be of equal size by filling the smallest one with ones.
+    if (dims0.size() > dims1.size()) {
+        dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
+    }
+    else if (dims1.size() > dims0.size()) {
+        dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
+    }
+
+    const std::size_t nbDims = dims0.size();
+
+    // Find the highest equal dimension
+    std::size_t contiguousIdx = nbDims;
+    while (contiguousIdx-- > 0) {
+        if (dims0[contiguousIdx] != dims1[contiguousIdx]) {
+            if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1
+                const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1;
+                while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) {
+                    --contiguousIdx;
+                }
+            }
+            break;
+        }
+    }
+    ++contiguousIdx;
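+    // dims from contiguousIdx onwards form the contiguous trailing block: both inputs
+    // can be processed there as flat arrays (a size-1 block is broadcast by the helper).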
+
+    // Compute the highest number of contiguous data for each Tensor
+    const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+
+    // Initialize strides used to walk the broadcast inputs: stride_post[i] is the
+    // number of contiguous blocks covered by dims[i+1..]; stride_step[i] rewinds the
+    // offset when dims[i] == 1 (broadcast dimension) and advances it by 1 otherwise.
+    std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    if (contiguousIdx > 0) {
+        stride_post0[contiguousIdx - 1] = 1;
+        stride_post1[contiguousIdx - 1] = 1;
+        for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) {
+            stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]);
+            stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]);
+        }
+        for (std::size_t i = 0; i != contiguousIdx; ++i) {
+            stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1;
+            stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1;
+        }
+    }
+
+    // variables for arrays offsets
+    std::size_t offsetIn0 = 0;
+    std::size_t offsetIn1 = 0;
+    std::size_t offsetOut = 0;
+
+
+    std::size_t dim = contiguousIdx - 1;
+    const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
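+    // Iterate over the output "stacks" of contiguous blocks like an odometer: after
+    // each block, find the highest dimension whose counter rolls over and move each
+    // input offset by its precomputed step (which rewinds on broadcast dimensions).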
+    for (std::size_t stack = 0; stack < nbStacks;) {
+        add_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
+                    input_0 + offsetIn0*input0_contiguous_size,
+                    input_1 + offsetIn1*input1_contiguous_size,
+                    output + offsetOut*output_contiguous_size);
+        if (++stack < nbStacks) {
+            std::size_t tmp_stack = stack;
+            while(tmp_stack % outputDims[dim] == 0) {
+                tmp_stack /= outputDims[dim];
+                dim--;
+            }
+            offsetIn0 += stride_step0[dim];
+            offsetIn1 += stride_step1[dim];
+            ++offsetOut;
+            dim = contiguousIdx - 1;
+        }
+    }
 }
 
 // Kernels registration to implementation entry point
@@ -48,6 +154,12 @@ REGISTRAR(AddImpl_cpu,
 REGISTRAR(AddImpl_cpu,
     {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}},
     {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<double, double>, nullptr});
+REGISTRAR(AddImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int8}},
+    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int8_t, std::int8_t>, nullptr});
+REGISTRAR(AddImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::UInt8}},
+    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::uint8_t, std::uint8_t>, nullptr});
 REGISTRAR(AddImpl_cpu,
     {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}},
     {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr});
diff --git a/include/aidge/backend/cpu/operator/AndImpl.hpp b/include/aidge/backend/cpu/operator/AndImpl.hpp
index 316a2fb922596642088d133a7fec49c988739bb7..8814df2fac36be56332035731679b724b169efe7 100644
--- a/include/aidge/backend/cpu/operator/AndImpl.hpp
+++ b/include/aidge/backend/cpu/operator/AndImpl.hpp
@@ -23,7 +23,7 @@
 namespace Aidge {
 // Operator implementation entry point for the backend
 using AndImpl_cpu = OperatorImpl_cpu<And_Op,
-    void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*,void*)>;
+    void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*)>;
 
 // Implementation entry point registration to Operator
 REGISTRAR(And_Op, "cpu", Aidge::AndImpl_cpu::create);
diff --git a/include/aidge/backend/cpu/operator/AndImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AndImpl_kernels.hpp
index 197e829f3527ce2f36c3ef5ee812a26477633703..73b710e021ac5031923eb1e9a2492502c02a3633 100644
--- a/include/aidge/backend/cpu/operator/AndImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/AndImpl_kernels.hpp
@@ -12,52 +12,152 @@
 #ifndef AIDGE_CPU_OPERATOR_ANDIMPL_KERNELS_H_
 #define AIDGE_CPU_OPERATOR_ANDIMPL_KERNELS_H_
 
-#include "aidge/backend/cpu/data/Broadcasting.hpp"
 #include "aidge/backend/cpu/operator/AndImpl.hpp"
 #include "aidge/utils/Registrar.hpp"
 
 namespace Aidge {
-template <class I1, class I2, class O>
-void AndImpl_cpu_forward_kernel(const std::vector<std::size_t>& input1Dims,
-                                const std::vector<std::size_t>& input2Dims,
+
+namespace {
+// assumes values are contiguous in memory
+template <class I, class O>
+void equal_contiguous_arrays(const std::size_t input1size,
+                            const std::size_t input2size,
+                            const std::size_t output1size,
+                            const I* input1,
+                            const I* input2,
+                            O* output)
+{
+    for (std::size_t i = 0; i < output1size; ++i)
+    {
+        const std::size_t in1_id = (input1size != 1) ? i : 0;
+        const std::size_t in2_id = (input2size != 1) ? i : 0;
+        output[i] = static_cast<O>(input1[in1_id] == input2[in2_id]);
+    }
+}
+}  // namespace
+
+
+template <class I, class O>
+void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
+                                std::vector<std::size_t> dims1,
                                 const std::vector<std::size_t>& outputDims,
+                                const void* input0_,
                                 const void* input1_,
-                                const void* input2_,
                                 void* output_) {
 
-    const I1* input_1 = static_cast<const I1*>(input1_);
-    const I2* input_2 = static_cast<const I2*>(input2_);
+    const I* input_0 = static_cast<const I*>(input0_);
+    const I* input_1 = static_cast<const I*>(input1_);
     O* output = static_cast<O*>(output_);
 
-    size_t totalElements = 1;
-    for (size_t dimSize : outputDims) {
-        totalElements *= dimSize;
+    // [5,2,1,7] & [2,6,7]
+    // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7]
+    // 2. Find the highest equal dimension -> 3
+    //    Exception: if the first diverging dimension is the last one, then -> 4 (dims.size())
+    // 3. Compute the highest number of contiguous data -> 7
+    // 4. Compute stride and offset step for the broadcast mechanism
+    // 5. Call a simple kernel
+
+    // special case for equal dimensions, the kernel is called with the entire arrays at once
+    if (dims0 == dims1) {
+        const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+        for (std::size_t i = 0; i < input0_contiguous_size; ++i)
+        {
+            output[i] = static_cast<O>(input_0[i] == input_1[i]);
+        }
+        return;
     }
 
-	for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex)
-	{
-		std::vector<size_t> indexes = getMultiDimIndices(outputDims, oIndex);
+    // set dimensions to be of equal size by filling the smallest one with ones.
+    if (dims0.size() > dims1.size()) {
+        dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
+    }
+    else if (dims1.size() > dims0.size()) {
+        dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
+    }
 
-		std::size_t idx1 = getFlattenedIndex(input1Dims, indexes);
-		std::size_t idx2 = getFlattenedIndex(input2Dims, indexes);
+    const std::size_t nbDims = dims0.size();
 
-        output[oIndex] = static_cast<O>(input_1[idx1] == input_2[idx2]);
+    // Find the highest equal dimension
+    std::size_t contiguousIdx = nbDims;
+    while (contiguousIdx-- > 0) {
+        if (dims0[contiguousIdx] != dims1[contiguousIdx]) {
+            if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1
+                const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1;
+                while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) {
+                    --contiguousIdx;
+                }
+            }
+            break;
+        }
+    }
+    ++contiguousIdx;
+
+    // Compute the highest number of contiguous data for each Tensor
+    const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+
+    // initialize strides to iterate through data because of broadcasting
+    std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    if (contiguousIdx > 0) {
+        stride_post0[contiguousIdx - 1] = 1;
+        stride_post1[contiguousIdx - 1] = 1;
+        for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) {
+            stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]);
+            stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]);
+        }
+        for (std::size_t i = 0; i != contiguousIdx; ++i) {
+            stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1;
+            stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1;
+        }
+    }
+
+    // variables for arrays offsets
+    std::size_t offsetIn0 = 0;
+    std::size_t offsetIn1 = 0;
+    std::size_t offsetOut = 0;
+
+
+    std::size_t dim = contiguousIdx - 1;
+    const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
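+    // Same odometer-style broadcast iteration as in AddImpl_kernels.hpp.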
+    for (std::size_t stack = 0; stack < nbStacks;) {
+        equal_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
+                    input_0 + offsetIn0*input0_contiguous_size,
+                    input_1 + offsetIn1*input1_contiguous_size,
+                    output + offsetOut*output_contiguous_size);
+        if (++stack < nbStacks) {
+            std::size_t tmp_stack = stack;
+            while(tmp_stack % outputDims[dim] == 0) {
+                tmp_stack /= outputDims[dim];
+                dim--;
+            }
+            offsetIn0 += stride_step0[dim];
+            offsetIn1 += stride_step1[dim];
+            ++offsetOut;
+            dim = contiguousIdx - 1;
+        }
     }
 }
 
 // Kernels registration to implementation entry point
 REGISTRAR(AndImpl_cpu,
     {DataType::Float32},
-    {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<float, float, float>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<float, float>, nullptr});
 REGISTRAR(AndImpl_cpu,
     {DataType::Float64},
-    {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<double, double, double>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<double, double>, nullptr});
 REGISTRAR(AndImpl_cpu,
     {DataType::Int32},
-    {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr});
 REGISTRAR(AndImpl_cpu,
     {DataType::Int64},
-    {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<std::int64_t, std::int64_t, std::int64_t>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, nullptr});
+
 }  // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_ANDIMPL_KERNELS_H_ */
diff --git a/include/aidge/backend/cpu/operator/AtanImpl.hpp b/include/aidge/backend/cpu/operator/AtanImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..2f1b4bf0ad666ff9856c24fa675b70d6f830b07c
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/AtanImpl.hpp
@@ -0,0 +1,33 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_ATAN_H_
+#define AIDGE_CPU_OPERATOR_ATAN_H_
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/Atan.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+#include <memory>
+#include <vector>
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+using AtanImpl_cpu = OperatorImpl_cpu<Atan_Op,
+    void(const std::size_t, const void*, void*),
+    void(const std::size_t, const void*, const void*, void*)>;
+
+// Implementation entry point registration to Operator
+REGISTRAR(Atan_Op, "cpu", Aidge::AtanImpl_cpu::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_ATAN_H_ */
diff --git a/include/aidge/backend/cpu/operator/AtanImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AtanImpl_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..2a786339503354514416705b61cfedfcc0b7c321
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/AtanImpl_kernels.hpp
@@ -0,0 +1,60 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_ATANIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_ATANIMPL_KERNELS_H_
+
+#include "aidge/utils/Registrar.hpp"
+
+#include "aidge/backend/cpu/operator/AtanImpl.hpp"
+#include <cmath>  // std::atan
+
+
+namespace Aidge {
+template <class I, class O>
+void AtanImpl_cpu_forward_kernel(std::size_t inputLength,
+                                    const void* input_,
+                                    void* output_) {
+    const I* input = static_cast<const I*>(input_);
+    O* output = static_cast<O*>(output_);
+
+    for (std::size_t i = 0; i < inputLength; ++i) {
+        output[i] = static_cast<O>(std::atan(input[i]));
+    }
+
+}
+
+template <class O, class GI, class GO>
+void AtanImpl_cpu_backward_kernel(const std::size_t inputLength,
+                                  const void* output_, const void* grad_output_,
+                                  void* grad_input_) {
+    const O* output = static_cast<const O*>(output_);
+    const GO* grad_output = static_cast<const GO*>(grad_output_);
+    GI* grad_input = static_cast<GI*>(grad_input_);
+
+    // Apply the derivative of atan for each element
+    for (std::size_t i = 0; i < inputLength; ++i) {
+        // dx = dy * (1 / (1 + output^2)), computed from the values passed as output_
+        grad_input[i] = grad_output[i] * static_cast<O>(1.0 / (1.0 + output[i] * output[i]));
+    }
+}
+
+
+// Kernels registration to implementation entry point
+REGISTRAR(AtanImpl_cpu,
+    {DataType::Float32},
+    {ProdConso::inPlaceModel, Aidge::AtanImpl_cpu_forward_kernel<float, float>, Aidge::AtanImpl_cpu_backward_kernel<float, float, float>});
+REGISTRAR(AtanImpl_cpu,
+    {DataType::Float64},
+    {ProdConso::inPlaceModel, Aidge::AtanImpl_cpu_forward_kernel<double, double>, Aidge::AtanImpl_cpu_backward_kernel<double, double, double>});
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_ATANIMPL_KERNELS_H_ */
diff --git a/include/aidge/backend/cpu/operator/BatchNormImpl.hpp b/include/aidge/backend/cpu/operator/BatchNormImpl.hpp
index 36a100b21edc6cd63a0176c89f2f1e57c10001c7..03dd5d1d04d5263eb84843925a1ce9ee3263423f 100644
--- a/include/aidge/backend/cpu/operator/BatchNormImpl.hpp
+++ b/include/aidge/backend/cpu/operator/BatchNormImpl.hpp
@@ -29,7 +29,7 @@ using BatchNorm2D_Op = BatchNorm_Op<2>;
 using BatchNormImpl2D_cpu = OperatorImpl_cpu<BatchNorm_Op<2>,
     void(float,
         float,
-        const std::array<DimSize_t, 4> &,
+        const std::vector<DimSize_t> &,
         const void *,
         const void *,
         const void *,
diff --git a/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp b/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp
index ec71e3b8e37e344c551fd643dc7b3957bdddcb67..cf97f7372ac528ef28d0f378beb2650af32bfa30 100644
--- a/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp
@@ -38,7 +38,7 @@ namespace Aidge {
  * @param output_ Output Tensor.
  */
 template <class I, class P, class O>
-void BatchNormImpl2D_cpu_forward_kernel(float epsilon, float momentum, const std::array<DimSize_t, 4> &dims,
+void BatchNormImpl2D_cpu_forward_kernel(float epsilon, float momentum, const std::vector<DimSize_t> &dims,
                                        const void *input_, const void *scale_, const void *shift_, void *batchMean_, void *batchVar_, void *output_, const bool freeze) {
     // FIXME: missing convolution attributes as arguments
     const I *input = static_cast<const I *>(input_);
@@ -49,9 +49,8 @@ void BatchNormImpl2D_cpu_forward_kernel(float epsilon, float momentum, const std
     O *output = static_cast<O *>(output_);
 
     const DimSize_t nbBatch = dims[0];
-    const DimSize_t nbChannels = dims[1];
-    const DimSize_t featureMapSize = dims[2]*dims[3];
-
+    const DimSize_t nbChannels = (dims.size() > 1) ? dims[1] : 1;
+    const DimSize_t featureMapSize = (dims.size() > 2) ? std::accumulate(dims.begin() + 2, dims.end(), DimSize_t(1), std::multiplies<DimSize_t>()) : 1;
 
     if ((freeze == true) || (momentum == 0.0f)) {
         for (std::size_t batch = 0; batch < nbBatch; ++batch) {
diff --git a/include/aidge/backend/cpu/operator/BitShiftImpl.hpp b/include/aidge/backend/cpu/operator/BitShiftImpl.hpp
index 6da67bb7dd4469b6ca609c5aea1ae70dfca3f939..807d2b972ba385f9382d4121173a75207600d098 100644
--- a/include/aidge/backend/cpu/operator/BitShiftImpl.hpp
+++ b/include/aidge/backend/cpu/operator/BitShiftImpl.hpp
@@ -24,13 +24,13 @@ namespace Aidge {
 // Operator implementation entry point for the backend
 using BitShiftImpl_cpu = OperatorImpl_cpu<BitShift_Op,
     void(const BitShift_Op::BitShiftDirection,
-    const std::vector<std::size_t>&, 
-    const std::vector<std::size_t>&, 
-    const std::vector<std::size_t>&, 
-    const void*, 
+    std::vector<std::size_t>,
+    std::vector<std::size_t>,
+    const std::vector<std::size_t>&,
+    const void*,
     const void*,
     void*)>;
-    
+
     // Implementation entry point registration to Operator
     REGISTRAR(BitShift_Op,"cpu",Aidge::BitShiftImpl_cpu::create);
 }  // namespace Aidge
diff --git a/include/aidge/backend/cpu/operator/BitShiftImpl_kernels.hpp b/include/aidge/backend/cpu/operator/BitShiftImpl_kernels.hpp
index f815e946ea2e4abaff48a6e5155368d564e88e8c..1f2561afe0be9997116cbd82f754c485a1760090 100644
--- a/include/aidge/backend/cpu/operator/BitShiftImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/BitShiftImpl_kernels.hpp
@@ -12,47 +12,150 @@
 #ifndef AIDGE_CPU_OPERATOR_BITSHIFTIMPL_KERNELS_H_
 #define AIDGE_CPU_OPERATOR_BITSHIFTIMPL_KERNELS_H_
 
-#include "aidge/utils/Registrar.hpp"
 
-#include <cstdint>     // std::int32_t, std::int64_t
-#include "aidge/operator/BitShift.hpp"
+#include <cstddef>     // std::size_t
+#include <cstdint>     // std::int32_t, std::int64_t
+#include <functional>  // std::multiplies
+#include <memory>      // std::make_unique, std::unique_ptr
+#include <numeric>     // std::accumulate
 
 #include "aidge/backend/cpu/data/Broadcasting.hpp"
 #include "aidge/backend/cpu/operator/BitShiftImpl.hpp"
+#include "aidge/operator/BitShift.hpp"
+#include "aidge/utils/Registrar.hpp"
 
 
+namespace {
+// assumes values are contiguous in memory
+template <class I1, class I2, class O>
+void bitshift_contiguous_arrays(
+    const Aidge::BitShift_Op::BitShiftDirection direction,
+    const std::size_t input1size,
+    const std::size_t input2size,
+    const std::size_t output1size,
+    const I1* input_1,
+    const I2* input_2,
+    O* output)
+{
+    if(direction == Aidge::BitShift_Op::BitShiftDirection::right) {
+        for (std::size_t i = 0; i < output1size; ++i) {
+            const std::size_t idx1 = (input1size != 1) ? i : 0;
+            const std::size_t idx2 = (input2size != 1) ? i : 0;
+            output[i]= input_1[idx1] >> input_2[idx2];
+        }
+
+    } else {
+        for (std::size_t i = 0; i < output1size; ++i) {
+            const std::size_t idx1 = (input1size != 1) ? i : 0;
+            const std::size_t idx2 = (input2size != 1) ? i : 0;
+            output[i] = input_1[idx1] << input_2[idx2];
+        }
+    }
+}
+}  // namespace
 
 namespace Aidge {
 template <class I1, class I2, class O>
 void BitShiftImpl_cpu_forward_kernel(
                                 const BitShift_Op::BitShiftDirection direction,
-                                const std::vector<std::size_t>& input1Dims,
-                                const std::vector<std::size_t>& input2Dims,
+                                std::vector<std::size_t> dims0,
+                                std::vector<std::size_t> dims1,
                                 const std::vector<std::size_t>& outputDims,
+                                const void* input0_,
                                 const void* input1_,
-                                const void* input2_,
                                 void* output_
                                 ) {
 
-    const I1* input_1 = static_cast<const I1*>(input1_);
-    const I2* input_2 = static_cast<const I2*>(input2_);
+    const I1* input_0 = static_cast<const I1*>(input0_);
+    const I2* input_1 = static_cast<const I2*>(input1_);
     O* output = static_cast<O*>(output_);
 
-    const size_t totalElements = std::accumulate(outputDims.begin(), outputDims.end(), std::size_t(1), std::multiplies<std::size_t>());
-    
-    for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex)
-    {
-        std::vector<size_t> indexes = getMultiDimIndices(outputDims, oIndex);
-        std::size_t idx1 = getFlattenedIndex(input1Dims, indexes);
-        std::size_t idx2 = getFlattenedIndex(input2Dims, indexes);
-        if(direction == BitShift_Op::BitShiftDirection::right)
-
-        {
-                output[oIndex]= input_1[idx1] >> input_2[idx2];
+    // [5,2,1,7] & [2,6,7]
+    // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7]
+    // 2. Find the highest equal dimension -> 3
+    //    Exception: if the first diverging dimension is the last one, then -> 4 (dims.size())
+    // 3. Compute the highest number of contiguous data -> 7
+    // 4. Compute stride and offset step for the broadcast mechanism
+    // 5. Call a simple kernel
+
+    // ## Compute compatible input dimensions
+    // special case for equal dimensions, the kernel is called with the entire arrays at once
+    if (dims0 == dims1) {
+        const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+        bitshift_contiguous_arrays(direction, input0_contiguous_size, input0_contiguous_size, input0_contiguous_size, input_0, input_1, output);
+        return;
+    }
+
+    // set dimensions to be of equal size by filling the smallest one with ones.
+    if (dims0.size() > dims1.size()) {
+        dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
+    }
+    else if (dims1.size() > dims0.size()) {
+        dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
+    }
+
+    const std::size_t nbDims = dims0.size();
+
+    // Find the highest equal dimension
+    std::size_t contiguousIdx = nbDims;
+    while (contiguousIdx-- > 0) {
+        if (dims0[contiguousIdx] != dims1[contiguousIdx]) {
+            if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1
+                const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1;
+                while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) {
+                    --contiguousIdx;
+                }
+            }
+            break;
         }
-        else
-        {
-                output[oIndex] = input_1[idx1] << input_2[idx2];
+    }
+    ++contiguousIdx;
+
+    // Compute the highest number of contiguous data for each Tensor
+    const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+
+    // initialize strides to iterate through data because of broadcasting
+    std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    if (contiguousIdx > 0) {
+        stride_post0[contiguousIdx - 1] = 1;
+        stride_post1[contiguousIdx - 1] = 1;
+        for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) {
+            stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]);
+            stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]);
+        }
+        for (std::size_t i = 0; i != contiguousIdx; ++i) {
+            stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1;
+            stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1;
+        }
+    }
+
+    // variables for arrays offsets
+    std::size_t offsetIn0 = 0;
+    std::size_t offsetIn1 = 0;
+    std::size_t offsetOut = 0;
+
+
+    std::size_t dim = contiguousIdx - 1;
+    const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
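+    // Same odometer-style broadcast iteration as in AddImpl_kernels.hpp.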
+    for (std::size_t stack = 0; stack < nbStacks;) {
+        bitshift_contiguous_arrays<I1,I2,O>(direction, input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
+                    input_0 + offsetIn0*input0_contiguous_size,
+                    input_1 + offsetIn1*input1_contiguous_size,
+                    output + offsetOut*output_contiguous_size);
+        if (++stack < nbStacks) {
+            std::size_t tmp_stack = stack;
+            while(tmp_stack % outputDims[dim] == 0) {
+                tmp_stack /= outputDims[dim];
+                dim--;
+            }
+            offsetIn0 += stride_step0[dim];
+            offsetIn1 += stride_step1[dim];
+            ++offsetOut;
+            dim = contiguousIdx - 1;
         }
     }
 }
diff --git a/include/aidge/backend/cpu/operator/ClipImpl.hpp b/include/aidge/backend/cpu/operator/ClipImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c83836d5aa1d6aae27e3fdce1bbb9561b70ec31e
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/ClipImpl.hpp
@@ -0,0 +1,46 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_CLIPIMPL_H_
+#define AIDGE_CPU_OPERATOR_CLIPIMPL_H_
+
+#include <cstddef>  // std::size_t
+#include <memory>
+#include <tuple>    // std::tuple
+#include <vector>
+#include <algorithm>
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/Clip.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+using ClipImpl_cpu = OperatorImpl_cpu<Clip_Op,
+    void(float,             // forward kernel types
+         float,
+         const void*,
+         const std::size_t,
+         void*),
+    void(float,             // backward kernel types
+         float,
+         const std::size_t,
+         const void*,
+         const void*,
+         void*)>;
+
+REGISTRAR(Clip_Op, "cpu", Aidge::ClipImpl_cpu::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_CLIPIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/ClipImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ClipImpl_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1afac4698be2a63790ebac671ecc1e59166c5f94
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/ClipImpl_kernels.hpp
@@ -0,0 +1,77 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_CLIPIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_CLIPIMPL_KERNELS_H_
+
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/backend/cpu/operator/ClipImpl.hpp"
+
+namespace Aidge {
+template <class I, class O>
+void ClipImpl_cpu_forward_kernel(
+        float min_,
+        float max_,
+        const void* input_,
+        const std::size_t length,
+        void* output_) 
+{
+    const I* input = static_cast<const I*>(input_);
+    O* output = static_cast<O*>(output_);
+
+    for (std::size_t i = 0; i < length; ++i) {
+        output[i] = static_cast<O>(std::min(std::max(static_cast<float>(input[i]), min_), max_));
+    }
+}
+
+template <class I, class GI, class GO>
+void ClipImpl_cpu_backward_kernel(
+        float min_,
+        float max_,
+        const std::size_t length,
+        const void* input_,
+        const void* grad_output_,
+        void* grad_input_)
+{
+    const I* input = static_cast<const I*>(input_);
+    const GO* grad_output = static_cast<const GO*>(grad_output_);
+    GI* grad_input = static_cast<GI*>(grad_input_);
+
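+    // The gradient passes through only strictly inside (min_, max_); it is zero at
+    // and beyond the clip boundaries.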
+    for (std::size_t i = 0; i < length; ++i) {
+        grad_input[i] = ((input[i] > min_) && (input[i] < max_)) ? grad_output[i] : 0;
+    }
+}
+
+REGISTRAR(ClipImpl_cpu,
+    {DataType::Float32},
+    {ProdConso::inPlaceModel,
+     Aidge::ClipImpl_cpu_forward_kernel<float, float>,
+     Aidge::ClipImpl_cpu_backward_kernel<float, float, float>});
+REGISTRAR(ClipImpl_cpu,
+    {DataType::Float64},
+    {ProdConso::inPlaceModel,
+     Aidge::ClipImpl_cpu_forward_kernel<double, double>,
+     Aidge::ClipImpl_cpu_backward_kernel<double, double, double>});
+REGISTRAR(ClipImpl_cpu,
+    {DataType::Int32},
+    {ProdConso::inPlaceModel,
+     Aidge::ClipImpl_cpu_forward_kernel<std::int32_t, std::int32_t>,
+     Aidge::ClipImpl_cpu_backward_kernel<std::int32_t, std::int32_t, std::int32_t>});
+REGISTRAR(ClipImpl_cpu,
+    {DataType::Int64},
+    {ProdConso::inPlaceModel,
+     Aidge::ClipImpl_cpu_forward_kernel<std::int64_t, std::int64_t>,
+     Aidge::ClipImpl_cpu_backward_kernel<std::int64_t, std::int64_t, std::int64_t>});
+
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_CLIPIMPL_KERNELS_H_ */
diff --git a/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp
index 59a471aee82f7c706be390d80b5db569bd3c6f1e..46ae59877bee1b87a9a17be242434d3caca7aae2 100644
--- a/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp
@@ -137,6 +137,7 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
     const std::size_t oxSize =
             static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilated_kernel_x + strideDims[0]) /
                                 static_cast<float>(strideDims[0])));
+
     // output W size
     const DimSize_t dilated_kernel_y = dilationDims[1]*(kernelDims[1] - 1) + 1;
     const std::size_t oySize =
@@ -148,54 +149,106 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
     // input  (batch, ch, Xin, Yin)
     // weight (outCh, ch, kernelX, kernelY)
     // does not take Dilation attribute into account
-    using signedsize = std::make_signed<std::size_t>::type;
-    for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
-        for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
-            const std::size_t oIndex = (ch + batch*inputDims[1]) * oxSize * oySize;
-            B biasVal = (biases != nullptr) ? biases[ch] : B(0);
-            std::fill(output + oIndex, output+(oIndex+oxSize*oySize), biasVal);
-            const std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
-            const std::size_t wIndex = ch * kernelDims[0] * kernelDims[1];
-            for (std::size_t ox = 0; ox < oxSize; ++ox) {
-                // const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]);
-                // const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
-                // const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? kernelDims[0] : inputDims[2] + difx);
-                const std::size_t sxMin = 0;
-                const std::size_t sxMax = dilated_kernel_x;
-                for (std::size_t oy = 0; oy < oySize; ++oy) {
-                    // const signedsize dify = static_cast<signedsize>(- oy * strideDims[1]);
-                    // const std::size_t syMin = static_cast<std::size_t>(std::max(dify, signedsize(0)));
-                    // const std::size_t syMax = (static_cast<signedsize>(inputDims[3]) + dify) < 0 ? 0 : ((inputDims[3] + dify) > kernelDims[1] ? kernelDims[1] : inputDims[3] + dify);
-                    const std::size_t syMin = 0;
-                    const std::size_t syMax = dilated_kernel_y;
-                    const std::size_t oIndexFull = oIndex + ox*oySize + oy;
-                    const signedsize ix = static_cast<signedsize>(ox * strideDims[0]);
-                    const signedsize iy = static_cast<signedsize>(oy * strideDims[1]);
-
-                    if (sxMin == 0 && syMin == 0 && sxMax == 3 && syMax == 3) {
-                        output[oIndexFull] +=  (weights[wIndex + 0*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+0)] +
-                                                weights[wIndex + 0*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+1)] +
-                                                weights[wIndex + 0*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+2)] +
-                                                weights[wIndex + 1*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+0)] +
-                                                weights[wIndex + 1*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+1)] +
-                                                weights[wIndex + 1*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+2)] +
-                                                weights[wIndex + 2*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+0)] +
-                                                weights[wIndex + 2*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+1)] +
-                                                weights[wIndex + 2*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+2)]);
-                    } else {
-                        for (std::size_t sx = sxMin; sx*dilationDims[0] < sxMax; ++sx) {
-                            for (std::size_t sy = syMin; sy*dilationDims[1] < syMax; ++sy) {
+    const std::size_t outChannels_s = oxSize * oySize;
+
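+    // Specialized 3x3 path: unroll the nine kernel taps and walk the three kernel
+    // rows explicitly so the inner loops read contiguous input rows.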
+    if (dilated_kernel_x == 3 && dilated_kernel_y == 3) {
+        for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
+            for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
+
+                B biasVal = (biases != nullptr) ? biases[ch] : B(0);
+
+                std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
+                const std::size_t wIndex = ch * 9;
+
+                if (strideDims[0] == 1 && strideDims[1]==1) {
+                    for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=inputDims[3]) {
+                        for (std::size_t oy = 0; oy < oySize; ++oy) {
+                            output[oIndex + oy] = biasVal + weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+1]+weights[wIndex+2]*input[iIndex+oy+2];
+                        }
+                        iIndex+=inputDims[3];
+                        for (std::size_t oy = 0; oy < oySize; ++oy) {
+                            output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy]+weights[wIndex+4]*input[iIndex+oy+1]+weights[wIndex+5]*input[iIndex+oy+2];
+                        }
+                        iIndex+=inputDims[3];
+                        for (std::size_t oy = 0; oy < oySize; ++oy) {
+                            output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy]+weights[wIndex+7]*input[iIndex+oy+1]+weights[wIndex+8]*input[iIndex+oy+2];
+                        }
+                    }
+                } else {
+                    for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize) {
+                        // start row of the input receptive field for this output row
+                        std::size_t iRow = iIndex + ox*strideDims[0]*inputDims[3];
+                        for (std::size_t oy = 0; oy < oySize; ++oy) {
+                            output[oIndex + oy] = biasVal + weights[wIndex+0]*input[iRow+oy*strideDims[1]]+weights[wIndex+1]*input[iRow+oy*strideDims[1]+1]+weights[wIndex+2]*input[iRow+oy*strideDims[1]+2];
+                        }
+                        iRow+=inputDims[3];
+                        for (std::size_t oy = 0; oy < oySize; ++oy) {
+                            output[oIndex + oy] += weights[wIndex+3]*input[iRow+oy*strideDims[1]]+weights[wIndex+4]*input[iRow+oy*strideDims[1]+1]+weights[wIndex+5]*input[iRow+oy*strideDims[1]+2];
+                        }
+                        iRow+=inputDims[3];
+                        for (std::size_t oy = 0; oy < oySize; ++oy) {
+                            output[oIndex + oy] += weights[wIndex+6]*input[iRow+oy*strideDims[1]]+weights[wIndex+7]*input[iRow+oy*strideDims[1]+1]+weights[wIndex+8]*input[iRow+oy*strideDims[1]+2];
+                        }
+                    }
+                }
+                output += outChannels_s;
+            }
+        }
+    } else if (dilated_kernel_x == 1 && dilated_kernel_y == 1) {
+        std::size_t index = 0;
+        for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
+            for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
+
+                B biasVal = (biases != nullptr) ? biases[ch] : B(0);
+
+                const std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
+                const std::size_t wIndex = ch;
+
+                if (strideDims[0] == 1 && strideDims[1] == 1) {
+                    for (; index < iIndex + oxSize*oySize; ++index) {
+                        output[index] = biasVal + weights[wIndex] * input[index];
+                    }
+                } else {
+                    std::size_t oIndex = (ch + batch*inputDims[1]) * oxSize * oySize;
+                    for (std::size_t ox = 0; ox < oxSize; ++ox, oIndex+=oySize) {
+                        index = iIndex + ox*strideDims[0]*inputDims[3];
+                        for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) {
+                            output[oIndex + oy] = biasVal + weights[wIndex]*input[index+iy];
+                        }
+                    }
+                }
+            }
+        }
+    } else {
+        for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
+            for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
+
+                B biasVal = (biases != nullptr) ? biases[ch] : B(0);
+                std::fill(output, output+outChannels_s, biasVal);
+
+                const std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
+                const std::size_t wIndex = ch * kernelDims[0] * kernelDims[1];
+
+                for (std::size_t ox = 0; ox < oxSize; ++ox) {
+                    for (std::size_t oy = 0; oy < oySize; ++oy) {
+
+                        const std::size_t oIndexFull = ox*oySize + oy;
+                        const std::size_t ix = ox * strideDims[0];
+                        const std::size_t iy = oy * strideDims[1];
+
+                        for (std::size_t sx = 0; sx*dilationDims[0] < dilated_kernel_x; ++sx) {
+                            for (std::size_t sy = 0; sy*dilationDims[1] < dilated_kernel_y; ++sy) {
                                 output[oIndexFull] += weights[wIndex + sx*kernelDims[1] + sy] *
-                                                        input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx*dilationDims[0]))*inputDims[3] + static_cast<std::size_t>(iy+static_cast<signedsize>(sy*dilationDims[1]))];
+                                                        input[iIndex + static_cast<std::size_t>(ix + sx*dilationDims[0])*inputDims[3] + static_cast<std::size_t>(iy + sy*dilationDims[1])];
                             }
                         }
                     }
                 }
             }
+            output += outChannels_s;
         }
     }
 }
 
+
 // Kernels registration to implementation entry point
 REGISTRAR(ConvDepthWiseImpl2D_cpu,
     {{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}},
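A note on the structure of the depthwise fast path above: rather than gathering a 3x3 window per output pixel, the stride-1 branch sweeps each output row three times, once per kernel row, so every inner loop reads the input contiguously and auto-vectorizes well. A minimal single-channel sketch of the same row-wise decomposition (illustrative code, not the Aidge kernel itself):

    #include <cstddef>

    // 3x3 valid convolution of one channel, stride 1, processed row by row:
    // out is (H-2) x (W-2), in is H x W, w is the 3x3 kernel in row-major order.
    void conv3x3_rowwise(const float* in, const float* w, float bias,
                         std::size_t H, std::size_t W, float* out) {
        const std::size_t oH = H - 2, oW = W - 2;
        for (std::size_t ox = 0; ox < oH; ++ox) {
            float* orow = out + ox * oW;
            for (std::size_t k = 0; k < 3; ++k) {          // one pass per kernel row
                const float* irow = in + (ox + k) * W;
                for (std::size_t oy = 0; oy < oW; ++oy) {  // contiguous along the row
                    const float acc = (k == 0) ? bias : orow[oy];
                    orow[oy] = acc + w[3*k + 0] * irow[oy]
                                   + w[3*k + 1] * irow[oy + 1]
                                   + w[3*k + 2] * irow[oy + 2];
                }
            }
        }
    }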
diff --git a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
index e800c252676ec5247a776abf458f808289b278c8..e3b709bf308288a93fd72865a2fdef0e58908134 100644
--- a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
@@ -141,15 +141,15 @@ void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
     O *output = static_cast<O *>(output_);
 
     // output H size
+    const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1;
     const std::size_t oxSize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilationDims[0]*(kernelDims[0] - 1) - 1 + strideDims[0]) /
+            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilated_kernel_x + strideDims[0]) /
                                 static_cast<float>(strideDims[0])));
-    const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1;
     // output W size
+    const DimSize_t dilated_kernel_y = dilationDims[1]*(kernelDims[1] - 1) + 1;
     const std::size_t oySize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[3] - dilationDims[1]*(kernelDims[1] - 1) - 1 + strideDims[1]) /
+            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[3] - dilated_kernel_y + strideDims[1]) /
                                 static_cast<float>(strideDims[1])));
-    const DimSize_t dilated_kernel_y = dilationDims[1]*(kernelDims[1] - 1) + 1;
 
 
     // TODO: kernel computation
@@ -157,57 +157,107 @@ void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
     // input  (batch, inCh, Xin, Yin)
     // weight (outCh, inCh, kernelX, kernelY)
     // does not take Dilation attribute into account
-    using signedsize = std::make_signed<std::size_t>::type;
-    for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
-        for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
-            const std::size_t oIndex = (outCh + batch*outChannels) * oxSize * oySize;
-            // If bias = nullptr, set B(0)
-            B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
-            std::fill(output + oIndex, output+(oIndex+oxSize*oySize), biasVal);
-            for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
-                const std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
-                const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0] * kernelDims[1];
-                for (std::size_t ox = 0; ox < oxSize; ++ox) {
-                    // const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]);
-                    // const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
-                    // const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? kernelDims[0] : inputDims[2] + difx);
-                    const std::size_t sxMin = 0;
-                    const std::size_t sxMax = dilated_kernel_x;
-                    for (std::size_t oy = 0; oy < oySize; ++oy) {
-                        // const signedsize dify = static_cast<signedsize>(- oy * strideDims[1]);
-                        // const std::size_t syMin = static_cast<std::size_t>(std::max(dify, signedsize(0)));
-                        // const std::size_t syMax = (static_cast<signedsize>(inputDims[3]) + dify) < 0 ? 0 : ((inputDims[3] + dify) > kernelDims[1] ? kernelDims[1] : inputDims[3] + dify);
-                        const std::size_t syMin = 0;
-                        const std::size_t syMax = dilated_kernel_y;
-                        const std::size_t oIndexFull = oIndex + ox*oySize + oy;
-                        const signedsize ix = static_cast<signedsize>(ox * strideDims[0]);
-                        const signedsize iy = static_cast<signedsize>(oy * strideDims[1]);
-
-                        if (sxMin == 0 && syMin == 0 && sxMax == 3 && syMax == 3) {
-                            output[oIndexFull] += (weights[wIndex + 0*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+0)] +
-                                                   weights[wIndex + 0*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+1)] +
-                                                   weights[wIndex + 0*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+2)] +
-                                                   weights[wIndex + 1*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+0)] +
-                                                   weights[wIndex + 1*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+1)] +
-                                                   weights[wIndex + 1*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+2)] +
-                                                   weights[wIndex + 2*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+0)] +
-                                                   weights[wIndex + 2*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+1)] +
-                                                   weights[wIndex + 2*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+2)]);
-                        } else {
-                            for (std::size_t sx = sxMin; sx*dilationDims[0] < sxMax; ++sx) {
-                                for (std::size_t sy = syMin; sy*dilationDims[1] < syMax; ++sy) {
-                                    output[oIndexFull] += weights[wIndex + sx*kernelDims[1] + sy] *
-                                                            input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx*dilationDims[0]))*inputDims[3] + static_cast<std::size_t>(iy+static_cast<signedsize>(sy*dilationDims[1]))];
+    const std::size_t outChannels_s = oxSize * oySize;
+
+    if (kernelDims[0] == 3 && kernelDims[1] == 3 && dilationDims[0] == 1 && dilationDims[1] == 1) {
+        for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
+            for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
+                // If bias = nullptr, set B(0)
+                B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
+                std::fill(output, output+outChannels_s, biasVal);
+                for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
+                    std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
+                    const std::size_t wIndex = (inCh + outCh*inputDims[1]) * 9;
+                    if (strideDims[0] == 1 && strideDims[1] == 1) {
+                        for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=inputDims[3]) {
+                            for (std::size_t oy = 0; oy < oySize; ++oy) {
+                                output[oIndex + oy] += weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+1]+weights[wIndex+2]*input[iIndex+oy+2];
+                            }
+                            iIndex+=inputDims[3];
+                            for (std::size_t oy = 0; oy < oySize; ++oy) {
+                                output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy]+weights[wIndex+4]*input[iIndex+oy+1]+weights[wIndex+5]*input[iIndex+oy+2];
+                            }
+                            iIndex+=inputDims[3];
+                            for (std::size_t oy = 0; oy < oySize; ++oy) {
+                                output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy]+weights[wIndex+7]*input[iIndex+oy+1]+weights[wIndex+8]*input[iIndex+oy+2];
+                            }
+                        }
+                    } else {
+                        for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize) {
+                            // base of the 3x3 input window for this output row:
+                            // rows advance by the stride, kernel rows are consecutive
+                            const std::size_t ix0 = iIndex + ox*strideDims[0]*inputDims[3];
+                            const std::size_t ix1 = ix0 + inputDims[3];
+                            const std::size_t ix2 = ix1 + inputDims[3];
+                            for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) {
+                                output[oIndex + oy] += weights[wIndex+0]*input[ix0+iy]+weights[wIndex+1]*input[ix0+iy+1]+weights[wIndex+2]*input[ix0+iy+2];
+                            }
+                            for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) {
+                                output[oIndex + oy] += weights[wIndex+3]*input[ix1+iy]+weights[wIndex+4]*input[ix1+iy+1]+weights[wIndex+5]*input[ix1+iy+2];
+                            }
+                            for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) {
+                                output[oIndex + oy] += weights[wIndex+6]*input[ix2+iy]+weights[wIndex+7]*input[ix2+iy+1]+weights[wIndex+8]*input[ix2+iy+2];
+                            }
+                        }
+                    }
+                }
+                output += outChannels_s;
+            }
+        }
+    } else if (dilated_kernel_x == 1 && dilated_kernel_y == 1) {
+        for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
+            for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
+                // If bias = nullptr, set B(0)
+                B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
+                std::fill(output, output+outChannels_s, biasVal);
+                for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
+                    std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
+                    const std::size_t wIndex = (inCh + outCh*inputDims[1]);
+                    if (strideDims[0] == 1 && strideDims[1] == 1) {
+                        for (std::size_t oIndex = 0; oIndex < oxSize*oySize; ++oIndex, ++iIndex) {
+                            output[oIndex] += weights[wIndex] * input[iIndex];
+                        }
+                    } else {
+                        for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=inputDims[3]*strideDims[0]) {
+                            for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) {
+                                output[oIndex + oy] += weights[wIndex+0]*input[iIndex+iy];
+                            }
+                        }
+                    }
+                }
+                output += outChannels_s;
+            }
+        }
+    } else {
+        for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
+            for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
+                // If bias = nullptr, set B(0)
+                B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
+                std::fill(output, output+outChannels_s, biasVal);
+                for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
+                    std::size_t iIndex_channel = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
+                    const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0] * kernelDims[1];
+
+                    // loop over each output line
+                    for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex_channel+=inputDims[3]*strideDims[0]) {
+                        // loop over the kernel rows (each selects one input line)
+                        for (std::size_t ky = 0, ix = 0; ky < kernelDims[0]; ++ky, ix += inputDims[3]*dilationDims[0]) {
+                            // loop over the entire output line
+                            for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) {
+                                const std::size_t iIndex = iIndex_channel + ix + iy;
+                                // loop over the kernel columns associated with one output
+                                for (std::size_t kx = 0; kx < kernelDims[1]; ++kx) {
+                                    output[oIndex + oy] += weights[wIndex+kernelDims[1]*ky+kx]*input[iIndex+kx*dilationDims[1]];
                                 }
                             }
                         }
                     }
                 }
+                output += outChannels_s;
             }
         }
     }
 }
 
+
 // Kernels registration to implementation entry point
 REGISTRAR(ConvImpl2D_cpu,
     {{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}},
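Hoisting `dilated_kernel_x`/`dilated_kernel_y` above does not change the computed sizes: for a valid convolution, floor((I - d*(k-1) - 1 + s) / s) equals floor((I - kd + s) / s) with kd = d*(k-1) + 1, which is the familiar floor((I - kd) / s) + 1. A quick integer-arithmetic check of the identity (illustrative helper, assuming I >= kd):

    #include <cassert>
    #include <cstddef>

    // Output extent of a valid (unpadded) convolution along one axis.
    std::size_t out_size(std::size_t in, std::size_t k, std::size_t stride, std::size_t dilation) {
        const std::size_t dilated_k = dilation * (k - 1) + 1;
        return (in - dilated_k) / stride + 1;  // == floor((in - dilated_k + stride) / stride)
    }

    int main() {
        assert(out_size(7, 3, 1, 1) == 5);  // 7 - 3 + 1
        assert(out_size(7, 3, 2, 1) == 3);  // floor((7 - 3) / 2) + 1
        assert(out_size(9, 3, 1, 2) == 5);  // the dilated kernel spans 5 inputs
    }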
diff --git a/include/aidge/backend/cpu/operator/GridSampleImpl_kernels.hpp b/include/aidge/backend/cpu/operator/GridSampleImpl_kernels.hpp
index ea62fd010db8c155a3ff86ff8396797da5ebb6be..3461b254b7beecf3e7a41e90a7e40d3f6ecf6a36 100644
--- a/include/aidge/backend/cpu/operator/GridSampleImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/GridSampleImpl_kernels.hpp
@@ -65,7 +65,7 @@ static float update_normalized_coord_with_padding(float coord, Aidge::GridSample
     return coord;
 }
 
-static inline std::int64_t update_unnormalized_coord_with_padding(std::int64_t coord, std::int64_t size, Aidge::GridSample_Op::PaddingMode padding_mode) {
+static std::int64_t update_unnormalized_coord_with_padding(std::int64_t coord, std::int64_t size, Aidge::GridSample_Op::PaddingMode padding_mode) {
     if (!in_bound(coord, 0, size)) {
         // out of bound. switch padding mode
         if (padding_mode == Aidge::GridSample_Op::PaddingMode::Border) {
@@ -96,11 +96,11 @@ void GridSampleImpl1D_cpu_forward_kernel(const GridSample_Op& op,
                             const std::shared_ptr<Tensor>& in1,
                             const std::shared_ptr<Tensor>& out)
 {
-    const I* const input = static_cast<const I * const>(in0->getImpl()->rawPtr());
+    const I* const input = static_cast<const I *>(in0->getImpl()->rawPtr());
     const I* input_ptr = input;
-    float* const grid = static_cast<float* const>(in1->getImpl()->rawPtr());
+    float* const grid = static_cast<float*>(in1->getImpl()->rawPtr());
     float* grid_ptr = grid;
-    O* const output = static_cast<O* const>(out->getImpl()->rawPtr());
+    O* const output = static_cast<O*>(out->getImpl()->rawPtr());
     O* output_ptr = output;
 
     const std::size_t N = in0->dim(0);
@@ -243,9 +243,9 @@ void GridSampleImpl2D_cpu_forward_kernel(const GridSample_Op& op,
 {
     const I* input = static_cast<const I *>(in0->getImpl()->rawPtr());
     const I* input_ptr = input;
-    float* const grid = static_cast<float* const>(in0->getImpl()->rawPtr());
+    float* const grid = static_cast<float*>(in1->getImpl()->rawPtr());
     float* grid_ptr = grid;
-    O* const output = static_cast<O* const>(out->getImpl()->rawPtr());
+    O* const output = static_cast<O*>(out->getImpl()->rawPtr());
 
     const std::size_t N = in0->dim(0);
     const std::size_t C = in0->dim(1);
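The cast changes in this file are cosmetic rather than behavioural: a cast yields a prvalue, so a top-level `const` on the destination pointer type, as in `static_cast<float* const>`, is simply ignored (GCC, for instance, flags it under -Wignored-qualifiers). The qualifier belongs on the declared variable, which these lines already have:

    // The const stays on the variable; putting it on the cast's target type has no effect.
    void example(void* raw) {
        float* const grid = static_cast<float*>(raw);
        (void)grid;
    }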
diff --git a/include/aidge/backend/cpu/operator/LRNImpl.hpp b/include/aidge/backend/cpu/operator/LRNImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..81956c8763010d6294bd4a11a943e66fb93a64eb
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/LRNImpl.hpp
@@ -0,0 +1,32 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_LRNIMPL_H_
+#define AIDGE_CPU_OPERATOR_LRNIMPL_H_
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/LRN.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+#include <memory>
+#include <vector>
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+using LRNImpl_cpu = OperatorImpl_cpu<LRN_Op,
+    void(float, float, float, std::size_t, const std::vector<DimSize_t>&, const void*, void*)>;
+
+// Implementation entry point registration to Operator
+REGISTRAR(LRN_Op, "cpu", Aidge::LRNImpl_cpu::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_LRNIMPL_H_ */
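The new header follows the backend's two-level registration pattern: the `OperatorImpl_cpu<Op, Signature>` alias declares the implementation entry point, `REGISTRAR(LRN_Op, "cpu", ...)` binds it to the operator, and the companion `_kernels.hpp` below registers one typed kernel per supported DataType. A toy reduction of the idea, with every name illustrative rather than the real Aidge API:

    #include <functional>
    #include <map>
    #include <string>

    // Toy registry: maps a key (for example a data-type tag) to a type-erased kernel.
    struct ToyRegistry {
        using Kernel = std::function<void(const void*, void*)>;
        static std::map<std::string, Kernel>& table() {
            static std::map<std::string, Kernel> t;
            return t;
        }
        static bool add(const std::string& key, Kernel k) {
            table().emplace(key, std::move(k));
            return true;
        }
    };

    // One typed kernel per supported data type.
    template <class I, class O>
    void toy_kernel(const void* in, void* out) { (void)in; (void)out; }

    // File-scope registration, analogous to the REGISTRAR(...) macros.
    static const bool registered_f32 = ToyRegistry::add("Float32", toy_kernel<float, float>);
    static const bool registered_f64 = ToyRegistry::add("Float64", toy_kernel<double, double>);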
diff --git a/include/aidge/backend/cpu/operator/LRNImpl_kernels.hpp b/include/aidge/backend/cpu/operator/LRNImpl_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..02018c9f8e002965584df38a95364ca10f69f8b7
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/LRNImpl_kernels.hpp
@@ -0,0 +1,69 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_LRNIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_LRNIMPL_KERNELS_H_
+
+#include "aidge/utils/Registrar.hpp"
+#include <cstddef>
+#include <cmath>
+#include "aidge/data/Data.hpp"
+#include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+
+#include "aidge/backend/cpu/operator/LRNImpl.hpp"
+
+namespace Aidge {
+template <class I, class O>
+void LRNImpl_cpu_forward_kernel(float alpha, float beta, float bias, std::size_t size, const std::vector<DimSize_t>& inputDims, const void* input_, void* output_)
+{
+    const I* input = static_cast<const I*>(input_);
+    O* output = static_cast<O*>(output_);
+
+    const DimSize_t nbBatch = inputDims[0];
+    const DimSize_t nbChannels = (inputDims.size() > 1) ? inputDims[1] : 1;
+    const DimSize_t featureMapSize = (inputDims.size() > 2) ? std::accumulate(inputDims.begin() + 2, inputDims.end(), 1, std::multiplies<DimSize_t>()) : 1;
+
+    for (std::size_t batch = 0; batch < nbBatch; ++batch) {
+        for (std::size_t ch = 0; ch < nbChannels; ++ch) {
+            const std::size_t ioIndex = (ch + batch*nbChannels) * featureMapSize;
+            const std::size_t halfWindow = size / 2;
+            const std::size_t channelMin = (ch > halfWindow) ? ch - halfWindow : 0;
+            const std::size_t channelMax = std::min<std::size_t>(nbChannels - 1, ch + halfWindow);
+
+            for (std::size_t feature = 0; feature < featureMapSize; ++feature) {
+                // Accumulate the squared activations across the channel window
+                O accAcrossChannels(0.0);
+
+                for (std::size_t accChannel = channelMin;
+                    accChannel <= channelMax; ++accChannel)
+                {
+                    const O value = static_cast<O>(input[(accChannel + batch*nbChannels) * featureMapSize + feature]);
+                    accAcrossChannels += value * value;
+                }
+
+                // Compute the output signal
+                output[ioIndex + feature] = input[ioIndex + feature]
+                    / std::pow((bias + accAcrossChannels * alpha), beta);
+            }
+        }
+    }
+}
+
+REGISTRAR(LRNImpl_cpu,
+    {DataType::Float32},
+    {ProdConso::inPlaceModel, Aidge::LRNImpl_cpu_forward_kernel<float, float>, nullptr});
+REGISTRAR(LRNImpl_cpu,
+    {DataType::Float64},
+    {ProdConso::inPlaceModel, Aidge::LRNImpl_cpu_forward_kernel<double, double>, nullptr});
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_LRNIMPL_KERNELS_H_ */
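For reference, the kernel above computes cross-channel local response normalization: each activation is divided by (bias + alpha * S)^beta, where S accumulates the squared activations over a window of `size` channels centred on the current one and clamped at the tensor edges. A scalar sketch of one spatial position, assuming the standard sum-of-squares definition (whether alpha should additionally be divided by `size`, as ONNX does, depends on how the attribute was exported):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    // LRN across channels for one spatial position; x holds one value per channel.
    std::vector<float> lrn_1pos(const std::vector<float>& x, float alpha, float beta,
                                float bias, std::size_t size) {
        const std::size_t C = x.size();
        const std::size_t half = size / 2;
        std::vector<float> y(C);
        for (std::size_t c = 0; c < C; ++c) {
            const std::size_t lo = (c > half) ? c - half : 0;
            const std::size_t hi = std::min(C - 1, c + half);
            float sumSq = 0.f;
            for (std::size_t k = lo; k <= hi; ++k)
                sumSq += x[k] * x[k];
            y[c] = x[c] / std::pow(bias + alpha * sumSq, beta);
        }
        return y;
    }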
diff --git a/include/aidge/backend/cpu/operator/MulImpl.hpp b/include/aidge/backend/cpu/operator/MulImpl.hpp
index 05fceba17471229d83d9f8738614b2e747121b49..c927af9ebd4d658c764cc059df9778c273ba178e 100644
--- a/include/aidge/backend/cpu/operator/MulImpl.hpp
+++ b/include/aidge/backend/cpu/operator/MulImpl.hpp
@@ -23,21 +23,21 @@
 namespace Aidge {
 // Operator implementation entry point for the backend
 using MulImpl_cpu = OperatorImpl_cpu<Mul_Op,
-    void(const std::vector<std::size_t>&,
-        const std::vector<std::size_t>&, 
-        const std::vector<std::size_t>&, 
-        const void*, 
+    void(std::vector<std::size_t>,
+        std::vector<std::size_t>,
+        const std::vector<std::size_t>&,
+        const void*,
         const void*,
         void*),
-    void(const std::size_t, 
-        const std::size_t, 
+    void(const std::size_t,
+        const std::size_t,
         const std::size_t,
         const std::vector<std::size_t>,
         const std::vector<std::size_t>,
-        const void*, 
-        const void*, 
-        const void*, 
-        void*, 
+        const void*,
+        const void*,
+        const void*,
+        void*,
         void*)>;
 
 // Implementation entry point registration to Operator
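The switch from `const std::vector<...>&` to by-value parameters for the first two dimension vectors is deliberate: the forward kernel normalizes the two shapes in place, padding the shorter one with leading 1s, before deriving broadcast strides, so it needs mutable copies of its own. A sketch of the idiom (illustrative names):

    #include <cstddef>
    #include <vector>

    // Taking the shapes by value lets the kernel align their ranks in place
    // without touching the caller's vectors.
    void align_ranks(std::vector<std::size_t> a, std::vector<std::size_t> b) {
        if (a.size() < b.size())
            a.insert(a.cbegin(), b.size() - a.size(), std::size_t(1));
        else if (b.size() < a.size())
            b.insert(b.cbegin(), a.size() - b.size(), std::size_t(1));
        // ... both shapes now have the same rank; derive broadcast strides here
    }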
diff --git a/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp
index c015b8f0182608fecd3da94220e9411decfd186c..556dd56cd32f28de14a43d20b97deb0083341fee 100644
--- a/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp
@@ -19,44 +19,143 @@
 #include "aidge/backend/cpu/data/Broadcasting.hpp"
 #include "aidge/backend/cpu/operator/MulImpl.hpp"
 
+namespace {
+// assumes the values are contiguous in memory
+template <class I1, class I2, class O>
+void mul_contiguous_arrays(const std::size_t input1size,
+                            const std::size_t input2size,
+                            const std::size_t output1size,
+                            const I1* input1,
+                            const I2* input2,
+                            O* output)
+{
+    for (std::size_t i = 0; i < output1size; ++i)
+    {
+        const std::size_t in1_id = (input1size != 1) ? i : 0;
+        const std::size_t in2_id = (input2size != 1) ? i : 0;
+        output[i] = static_cast<O>(input1[in1_id] * input2[in2_id]);
+    }
+}
+}
+
 namespace Aidge {
+
 template <class I1, class I2, class O>
-void MulImpl_cpu_forward_kernel(const std::vector<std::size_t>& input1Dims,
-                                const std::vector<std::size_t>& input2Dims,
+void MulImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
+                                std::vector<std::size_t> dims1,
                                 const std::vector<std::size_t>& outputDims,
+                                const void* input0_,
                                 const void* input1_,
-                                const void* input2_,
                                 void* output_) {
-
-    const I1* input_1 = static_cast<const I1*>(input1_);
-    const I2* input_2 = static_cast<const I2*>(input2_);
+    const I1* input_0 = static_cast<const I1*>(input0_);
+    const I2* input_1 = static_cast<const I2*>(input1_);
     O* output = static_cast<O*>(output_);
 
-    size_t totalElements = 1;
-    for (size_t dimSize : outputDims) {
-        totalElements *= dimSize;
+    // Example: [5,2,1,7] & [2,6,7]
+    // 1. Pad to the same number of dimensions -> [5,2,1,7] & [1,2,6,7]
+    // 2. Scan from the end for the first index where the dimensions differ -> contiguousIdx = 3
+    //    Exception: trailing dimensions of size 1 on one input are skipped over
+    // 3. The trailing dimensions form the largest contiguous chunk -> 7 elements
+    // 4. Compute the stride and offset steps that drive the broadcast mechanism
+    // 5. Call a simple contiguous kernel on each chunk
+
+    // ## Compute compatible input dimensions
+    // special case for equal dimensions, the kernel is called with the entire arrays at once
+    if (dims0 == dims1) {
+        const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+        for (std::size_t i = 0; i < input0_contiguous_size; ++i)
+        {
+            output[i] = static_cast<O>(input_0[i] * input_1[i]);
+        }
+        return;
     }
 
-	for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex)
-	{
-		std::vector<size_t> indexes = getMultiDimIndices(outputDims, oIndex);
+    // set dimensions to be of equal size by filling the smallest one with ones.
+    if (dims0.size() > dims1.size()) {
+        dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
+    }
+    else if (dims1.size() > dims0.size()) {
+        dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
+    }
 
-		std::size_t idx1 = getFlattenedIndex(input1Dims, indexes);
-		std::size_t idx2 = getFlattenedIndex(input2Dims, indexes);
+    const std::size_t nbDims = dims0.size();
+
+    // Find the first index (scanning from the end) where the dimensions differ
+    std::size_t contiguousIdx = nbDims;
+    while (contiguousIdx-- > 0) {
+        if (dims0[contiguousIdx] != dims1[contiguousIdx]) {
+            if (contiguousIdx == (nbDims - 1)) { // trailing dimensions of one of the input Tensors are of size 1
+                const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1;
+                while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) {
+                    --contiguousIdx;
+                }
+            }
+            break;
+        }
+    }
+    ++contiguousIdx;
+
+    // Compute the highest number of contiguous data for each Tensor
+    const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+
+    // initialize strides to iterate through data because of broadcasting
+    std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    if (contiguousIdx > 0) {
+        stride_post0[contiguousIdx - 1] = 1;
+        stride_post1[contiguousIdx - 1] = 1;
+        for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) {
+            stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]);
+            stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]);
+        }
+        for (std::size_t i = 0; i != contiguousIdx; ++i) {
+            stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1;
+            stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1;
+        }
+    }
 
-        output[oIndex] = input_1[idx1] * input_2[idx2];
+    // variables for arrays offsets
+    std::size_t offsetIn0 = 0;
+    std::size_t offsetIn1 = 0;
+    std::size_t offsetOut = 0;
+
+
+    std::size_t dim = contiguousIdx - 1;
+    const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
+    for (std::size_t stack = 0; stack < nbStacks;) {
+        mul_contiguous_arrays<I1,I2,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
+                    input_0 + offsetIn0*input0_contiguous_size,
+                    input_1 + offsetIn1*input1_contiguous_size,
+                    output + offsetOut*output_contiguous_size);
+        if (++stack < nbStacks) {
+            std::size_t tmp_stack = stack;
+            while(tmp_stack % outputDims[dim] == 0) {
+                tmp_stack /= outputDims[dim];
+                dim--;
+            }
+            offsetIn0 += stride_step0[dim];
+            offsetIn1 += stride_step1[dim];
+            ++offsetOut;
+            dim = contiguousIdx - 1;
+        }
     }
 }
 
 template <class I1, class I2, class O>
-void MulImpl_cpu_backward_kernel(const std::size_t input0Length, 
+void MulImpl_cpu_backward_kernel(const std::size_t input0Length,
                                  const std::size_t input1Length,
                                  const std::size_t grad0Length,
                                  const std::vector<std::size_t> input0Dims,
                                  const std::vector<std::size_t> input1Dims,
-                                 const void* input0_, 
-                                 const void* input1_, 
-                                 const void* grad_output_, 
+                                 const void* input0_,
+                                 const void* input1_,
+                                 const void* grad_output_,
                                  void* gradientInput0,
                                  void* gradientInput1)
 {
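Worked through on the example in the kernel's comment block: dims0 = [5,2,1,7] and dims1 = [2,6,7] become [5,2,1,7] and [1,2,6,7]; scanning from the back, the shapes first differ at index 2 (1 vs 6), so contiguousIdx = 3 and each contiguous chunk holds 7 elements; the outer loop then runs nbStacks = 5*2*6 = 60 times, and stride_step rewinds an input's offset whenever its dimension is broadcast. The sketch below replays the same stride_post/stride_step bookkeeping on a smaller case; it mirrors the loop above but is not the library code:

    #include <cstddef>
    #include <cstdio>

    // One input of shape [2,1,C] broadcast against an output of shape [2,4,C],
    // where C is the contiguous chunk; only the two leading axes are tracked.
    int main() {
        const std::size_t dims[2]    = {2, 1};  // input axes before the chunk
        const std::size_t outDims[2] = {2, 4};
        int stride_post[2], stride_step[2];
        stride_post[1] = 1;
        stride_post[0] = stride_post[1] * static_cast<int>(dims[1]);   // sub-block size
        for (int i = 0; i < 2; ++i)
            stride_step[i] = (dims[i] == 1) ? 1 - stride_post[i] : 1;  // rewind on broadcast axes

        std::size_t offset = 0, dim = 1;
        const std::size_t nbStacks = outDims[0] * outDims[1];          // 8 chunks in the output
        for (std::size_t stack = 0; stack < nbStacks;) {
            std::printf("stack %zu reads input chunk %zu\n", stack, offset);  // 0,0,0,0,1,1,1,1
            if (++stack < nbStacks) {
                std::size_t tmp = stack;
                while (tmp % outDims[dim] == 0) { tmp /= outDims[dim]; --dim; }
                offset += static_cast<std::size_t>(stride_step[dim]);
                dim = 1;
            }
        }
    }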
diff --git a/include/aidge/backend/cpu/operator/OperatorImpl.hpp b/include/aidge/backend/cpu/operator/OperatorImpl.hpp
index abf94ab9069a07e8f87819cb29c027b1adbfd9c6..45f099e8140395181d8be1600c61024efaa9c6a7 100644
--- a/include/aidge/backend/cpu/operator/OperatorImpl.hpp
+++ b/include/aidge/backend/cpu/operator/OperatorImpl.hpp
@@ -38,8 +38,10 @@ public:
         return impl.prodConso(mOp);
     }
 
-    virtual std::set<ImplSpec> getAvailableImplSpecs() const override {
-        return Registrar<OperatorImpl_cpu>::getKeys();
+    virtual std::vector<ImplSpec> getAvailableImplSpecs() const override {
+        // Note: the std::set returned by the registrar cannot cross the Python bindings, so copy it into a std::vector.
+        std::set<ImplSpec> implSpecsSet = Registrar<OperatorImpl_cpu>::getKeys();
+        return std::vector<ImplSpec>(implSpecsSet.begin(), implSpecsSet.end());
     }
 
     void forward() override;
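Returning a std::vector instead of the registrar's std::set works around the Python-binding limitation noted in the comment: a vector converts to a plain Python list, the least constrained mapping, while keeping the sorted iteration order of the set. The pattern in isolation:

    #include <set>
    #include <vector>

    // Copy a set into a vector for APIs that cannot expose std::set directly;
    // iteration order (and thus the vector's order) stays sorted.
    template <class T>
    std::vector<T> as_vector(const std::set<T>& s) {
        return std::vector<T>(s.begin(), s.end());
    }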
diff --git a/include/aidge/backend/cpu/operator/PadImpl_kernels.hpp b/include/aidge/backend/cpu/operator/PadImpl_kernels.hpp
index a362be0944aa18c36dd74a2f0066aaa21a1fc4c0..6d218cb1d719e8576f6c013ac5a1b9c60a739852 100644
--- a/include/aidge/backend/cpu/operator/PadImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/PadImpl_kernels.hpp
@@ -55,19 +55,19 @@ void PadImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 2>& beginEndBorder
                 O outputValue = static_cast<O>(borderValue);
 
                 if (borderType == PadBorderType::Constant) {
-                    int ix = static_cast<int>(ox) - static_cast<int>(beginEndBorders[1]);
+                    int ix = static_cast<int>(ox) - static_cast<int>(beginEndBorders[0]);
 
                     if (ix >= 0  && ix < static_cast<int>(dims[2])) {
                         outputValue = input[iIndex + static_cast<std::size_t>(ix)];
                     }
                 }
                 else if (borderType == PadBorderType::Edge) {
-                    int ix = std::max(0, std::min(static_cast<int>(dims[2]) - 1, static_cast<int>(ox) - static_cast<int>(beginEndBorders[1])));
+                    int ix = std::max(0, std::min(static_cast<int>(dims[2]) - 1, static_cast<int>(ox) - static_cast<int>(beginEndBorders[0])));
 
                     outputValue = input[iIndex + static_cast<std::size_t>(ix)];
                 }
                 else if (borderType == PadBorderType::Reflect) {
-                    int ix = static_cast<int>(ox) - static_cast<int>(beginEndBorders[1]);
+                    int ix = static_cast<int>(ox) - static_cast<int>(beginEndBorders[0]);
 
                     if (ix < 0)
                         ix = 0 - ix;
@@ -77,7 +77,7 @@ void PadImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 2>& beginEndBorder
                     outputValue = input[iIndex + static_cast<std::size_t>(ix)];
                 }
                 else if (borderType == PadBorderType::Wrap) {
-                    int ix = (static_cast<int>(dims[2]) + static_cast<int>(ox) - static_cast<int>(beginEndBorders[1])) % static_cast<int>(dims[2]);
+                    int ix = (static_cast<int>(dims[2]) + static_cast<int>(ox) - static_cast<int>(beginEndBorders[0])) % static_cast<int>(dims[2]);
 
                     outputValue = input[iIndex + static_cast<std::size_t>(ix)];
                 }
@@ -120,8 +120,8 @@ void PadImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 4>& beginEndBorder
     const I *input = static_cast<const I *>(input_);
     O *output = static_cast<O *>(output_);
 
-    const std::size_t oySize = dims[2] + beginEndBorders[0] + beginEndBorders[1];
-    const std::size_t oxSize = dims[3] + beginEndBorders[2] + beginEndBorders[3];
+    const std::size_t oySize = dims[2] + beginEndBorders[0] + beginEndBorders[2];
+    const std::size_t oxSize = dims[3] + beginEndBorders[1] + beginEndBorders[3];
 
     for (std::size_t batch = 0; batch < dims[0]; ++batch) {
         for (std::size_t ch = 0; ch < dims[1]; ++ch) {
@@ -135,22 +135,22 @@ void PadImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 4>& beginEndBorder
                     O outputValue = static_cast<O>(borderValue);
 
                     if (borderType == PadBorderType::Constant) {
-                        std::int32_t ix = static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[3]);
-                        std::int32_t iy = static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[1]);
+                        std::int32_t ix = static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[1]);
+                        std::int32_t iy = static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[0]);
 
                         if (ix >= 0  && ix < static_cast<std::int32_t>(dims[3]) && iy >= 0  && iy < static_cast<std::int32_t>(dims[2])) {
                             outputValue = input[iIndex + static_cast<std::size_t>(iy)*dims[3] + static_cast<std::size_t>(ix)];
                         }
                     }
                     else if (borderType == PadBorderType::Edge) {
-                        std::int32_t ix = std::max(0, std::min(static_cast<std::int32_t>(dims[3]) - 1, static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[3])));
-                        std::int32_t iy = std::max(0, std::min(static_cast<std::int32_t>(dims[2]) - 1, static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[1])));
+                        std::int32_t ix = std::max(0, std::min(static_cast<std::int32_t>(dims[3]) - 1, static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[1])));
+                        std::int32_t iy = std::max(0, std::min(static_cast<std::int32_t>(dims[2]) - 1, static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[0])));
 
                         outputValue = input[iIndex + static_cast<std::size_t>(iy)*dims[3] + static_cast<std::size_t>(ix)];
                     }
                     else if (borderType == PadBorderType::Reflect) {
-                        std::int32_t ix = static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[3]);
-                        std::int32_t iy = static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[1]);
+                        std::int32_t ix = static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[1]);
+                        std::int32_t iy = static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[0]);
 
                         if (ix < 0)
                             ix = 0 - ix;
@@ -164,8 +164,8 @@ void PadImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 4>& beginEndBorder
                         outputValue = input[iIndex + static_cast<std::size_t>(iy)*dims[3] + static_cast<std::size_t>(ix)];
                     }
                     else if (borderType == PadBorderType::Wrap) {
-                        std::int32_t ix = (static_cast<std::int32_t>(dims[3]) + static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[3])) % static_cast<std::int32_t>(dims[3]);
-                        std::int32_t iy = (static_cast<std::int32_t>(dims[2]) + static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[1])) % static_cast<std::int32_t>(dims[2]);
+                        std::int32_t ix = (static_cast<std::int32_t>(dims[3]) + static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[1])) % static_cast<std::int32_t>(dims[3]);
+                        std::int32_t iy = (static_cast<std::int32_t>(dims[2]) + static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[0])) % static_cast<std::int32_t>(dims[2]);
 
                         outputValue = input[iIndex + static_cast<std::size_t>(iy)*dims[3] + static_cast<std::size_t>(ix)];
                     }
diff --git a/include/aidge/backend/cpu/operator/PaddedConvImpl.hpp b/include/aidge/backend/cpu/operator/PaddedConvImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e1e2a89ceb1356dacae965903eaf405a3d524866
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/PaddedConvImpl.hpp
@@ -0,0 +1,59 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_PADDEDCONVIMPL_H_
+#define AIDGE_CPU_OPERATOR_PADDEDCONVIMPL_H_
+
+#include <array>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/MetaOperatorDefs.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+using PaddedConv1D_Op = MetaOperator_Op;
+using PaddedConvImpl1D_cpu = OperatorImpl_cpu<MetaOperator_Op,
+    void(const std::array<DimSize_t, 2>&,
+                            const std::array<DimSize_t, 1>&,
+                            const std::array<DimSize_t, 1>&,
+                            const std::array<DimSize_t, 1>&,
+                            const std::array<DimSize_t, 3> &,
+                            DimSize_t,
+                            const void *,
+                            const void *,
+                            const void *,
+                            void *)>;
+
+using PaddedConv2D_Op = MetaOperator_Op;
+using PaddedConvImpl2D_cpu = OperatorImpl_cpu<MetaOperator_Op,
+    void(const std::array<DimSize_t, 4>&,
+                            const std::array<DimSize_t, 2>&,
+                            const std::array<DimSize_t, 2>&,
+                            const std::array<DimSize_t, 2>&,
+                            const std::array<DimSize_t, 4> &,
+                            DimSize_t,
+                            const void *,
+                            const void *,
+                            const void *,
+                            void *)>;
+
+// Implementation entry point registration to Operator
+// Uncomment to activate the implementation for PaddedConv. It is currently less efficient than the existing path, so it is left disabled.
+// REGISTRAR(PaddedConv1D_Op, std::array<std::string, 2>({"cpu", "PaddedConv1D"}), Aidge::PaddedConvImpl1D_cpu::create);
+// REGISTRAR(PaddedConv2D_Op, std::array<std::string, 2>({"cpu", "PaddedConv2D"}), Aidge::PaddedConvImpl2D_cpu::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_PADDEDCONVIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/PaddedConvImpl_kernels.hpp b/include/aidge/backend/cpu/operator/PaddedConvImpl_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..85fb72435421f0024f11db6a13c5ddfbae4a0aeb
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/PaddedConvImpl_kernels.hpp
@@ -0,0 +1,228 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_PADDEDCONVIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_PADDEDCONVIMPL_KERNELS_H_
+
+#include <algorithm>  // std::fill, std::max, std::min
+#include <array>
+#include <cmath>      // std::floor
+#include <cstddef>
+#include <vector>
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/backend/cpu/operator/PaddedConvImpl.hpp"
+#include "aidge/operator/Pad.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+namespace Aidge {
+// Note: only constant zero padding is supported.
+/**
+ * @brief Forward kernel for 1D Convolution with zero padding on CPU backend.
+ * @tparam I Input data type.
+ * @tparam W Weight data type.
+ * @tparam B Bias data type.
+ * @tparam O Output data type.
+ * @param beginEndBorders Sizes of the begin/end zero borders.
+ * @param strideDims Array of stride dimensions.
+ * @param dilationDims Array of dilation dimensions.
+ * @param kernelDims Array of kernel dimensions.
+ * @param inputDims Array of input dimensions.
+ * @param outChannels Number of output channels.
+ * @param input_ const input Tensor.
+ * @param weights_ const weight Tensor.
+ * @param biases_ const bias Tensor.
+ * @param output_ Output Tensor.
+ */
+template <class I, class W, class B, class O>
+void PaddedConvImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 2>& beginEndBorders,
+                            const std::array<DimSize_t, 1>& strideDims,
+                            const std::array<DimSize_t, 1>& dilationDims,
+                            const std::array<DimSize_t, 1>& kernelDims,
+                            const std::array<DimSize_t, 3>& inputDims,
+                            DimSize_t outChannels,
+                            const void *input_,
+                            const void *weights_,
+                            const void *biases_,
+                            void *output_)
+{
+    const I *input = static_cast<const I *>(input_);
+    const W *weights = static_cast<const W *>(weights_);
+    const B *biases = static_cast<const B *>(biases_);
+    O *output = static_cast<O *>(output_);
+
+    // output X size (the zero borders are folded into the formula)
+    const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1;
+    const std::size_t oxSize =
+            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] + beginEndBorders[0] + beginEndBorders[1] - dilated_kernel_x + strideDims[0]) /
+                                static_cast<float>(strideDims[0])));
+
+    // output (batch, outCh, Xout)
+    // input  (batch, inCh, Xin)
+    // weight (outCh, inCh, kernelX)
+    for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
+        for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
+            const std::size_t oIndex = (outCh + batch*outChannels) * oxSize;
+            // If bias = nullptr, set B(0)
+            B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
+            std::fill(output + oIndex, output+(oIndex+oxSize), biasVal);
+            for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
+                const std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2];
+                const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0];
+                for (std::size_t ox = 0; ox < oxSize; ++ox) {
+                    const std::size_t difx = ox * strideDims[0];
+                    // first kernel tap that reads inside the input (skipping the begin border)
+                    const std::size_t sxMin = (beginEndBorders[0] > difx) ?
+                                                (beginEndBorders[0] - difx + dilationDims[0] - 1) / dilationDims[0] : 0;
+                    // one past the last valid input offset, clamped to the dilated kernel extent
+                    const std::size_t sxMax = (inputDims[2] + beginEndBorders[0]) < difx ?
+                                                0 :
+                                                ((inputDims[2] + beginEndBorders[0]) > dilated_kernel_x + difx ?
+                                                    dilated_kernel_x :
+                                                    (inputDims[2] + beginEndBorders[0] - difx));
+
+                    const std::size_t oIndexFull = oIndex + ox;
+                    // may wrap inside the begin border; sx*dilation brings it back in range
+                    const std::size_t ix = difx - beginEndBorders[0];
+
+                    for (std::size_t sx = sxMin; sx*dilationDims[0] < sxMax; ++sx) {
+                        output[oIndexFull] += weights[wIndex + sx] *
+                                                input[iIndex + ix + sx*dilationDims[0]];
+                    }
+                }
+            }
+        }
+    }
+}
+
+// Kernels registration to implementation entry point
+REGISTRAR(PaddedConvImpl1D_cpu,
+    {{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}, DynamicAttributes(std::map<std::string, future_std::any>({std::make_pair("type", future_std::any(std::string("PaddedConv1D")))}))},
+    {ProdConso::inPlaceModel, Aidge::PaddedConvImpl1D_cpu_forward_kernel<float, float, float, float>, nullptr});
+REGISTRAR(PaddedConvImpl1D_cpu,
+    {{DataType::Any, DataFormat::NCHW}, {DataType::Float16, DataFormat::NCHW}, DynamicAttributes(std::map<std::string, future_std::any>({std::make_pair("type", future_std::any(std::string("PaddedConv1D")))}))},
+    {ProdConso::inPlaceModel, Aidge::PaddedConvImpl1D_cpu_forward_kernel<half_float::half, half_float::half, half_float::half, half_float::half>, nullptr});
+REGISTRAR(PaddedConvImpl1D_cpu,
+    {{DataType::Any, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}, DynamicAttributes(std::map<std::string, future_std::any>({std::make_pair("type", future_std::any(std::string("PaddedConv1D")))}))},
+    {ProdConso::inPlaceModel, Aidge::PaddedConvImpl1D_cpu_forward_kernel<int32_t, int32_t, int32_t, int32_t>, nullptr});
+REGISTRAR(PaddedConvImpl1D_cpu,
+    {{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}, DynamicAttributes(std::map<std::string, future_std::any>({std::make_pair("type", future_std::any(std::string("PaddedConv1D")))}))},
+    {ProdConso::inPlaceModel, Aidge::PaddedConvImpl1D_cpu_forward_kernel<double, double, double, double>, nullptr});
+
+
+/**
+ * @brief Forward kernel for 2D Convolution with zero padding on CPU backend.
+ * @tparam I Input data type.
+ * @tparam W Weight data type.
+ * @tparam B Bias data type.
+ * @tparam O Output data type.
+ * @param beginEndBorders Sizes of the begin/end zero borders ({begin_axis0, begin_axis1, end_axis0, end_axis1}).
+ * @param strideDims Array of stride dimensions.
+ * @param dilationDims Array of dilation dimensions.
+ * @param kernelDims Array of kernel dimensions.
+ * @param inputDims Array of input dimensions.
+ * @param outChannels Number of output channels.
+ * @param input_ const input Tensor.
+ * @param weights_ const weight Tensor.
+ * @param biases_ const bias Tensor.
+ * @param output_ Output Tensor.
+ */
+template <class I, class W, class B, class O>
+void PaddedConvImpl2D_cpu_forward_kernel(
+                            const std::array<DimSize_t, 4>& beginEndBorders,
+                            const std::array<DimSize_t, 2>& strideDims,
+                            const std::array<DimSize_t, 2>& dilationDims,
+                            const std::array<DimSize_t, 2>& kernelDims,
+                            const std::array<DimSize_t, 4> &inputDims,
+                            DimSize_t outChannels,
+                            const void *input_,
+                            const void *weights_,
+                            const void *biases_,
+                            void *output_)
+{
+    const I *input = static_cast<const I *>(input_);
+    const W *weights = static_cast<const W *>(weights_);
+    const B *biases = static_cast<const B *>(biases_);
+    O *output = static_cast<O *>(output_);
+
+    // output H size
+    const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1;
+    const std::size_t oxSize =
+            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilated_kernel_x + beginEndBorders[0] + beginEndBorders[2] + strideDims[0]) /
+                                static_cast<float>(strideDims[0])));
+    // output W size
+    const DimSize_t dilated_kernel_y = dilationDims[1]*(kernelDims[1] - 1) + 1;
+    const std::size_t oySize =
+            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[3] - dilated_kernel_y + beginEndBorders[1] + beginEndBorders[3] + strideDims[1]) /
+                                static_cast<float>(strideDims[1])));
+
+    for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
+        for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
+            const std::size_t oIndex = (outCh + batch*outChannels) * oxSize * oySize;
+            // If bias = nullptr, set B(0)
+            B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
+            std::fill(output + oIndex, output+(oIndex+oxSize*oySize), biasVal);
+            for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
+                const std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
+                const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0] * kernelDims[1];
+                for (std::size_t ox = 0; ox < oxSize; ++ox) {
+                    const std::size_t difx = ox * strideDims[0];
+                    // first kernel tap that reads inside the input (skipping the begin border)
+                    const std::size_t sxMin = (beginEndBorders[0] > difx) ?
+                                                (beginEndBorders[0] - difx + dilationDims[0] - 1) / dilationDims[0] :
+                                                0;
+                    // one past the last valid input offset along the first axis, clamped to the dilated kernel extent
+                    const std::size_t sxMax = (inputDims[2] + beginEndBorders[0]) < difx ?
+                                                0 :
+                                                ((inputDims[2] + beginEndBorders[0]) > dilated_kernel_x + difx ?
+                                                    dilated_kernel_x :
+                                                    (inputDims[2] + beginEndBorders[0] - difx));
+
+                    for (std::size_t oy = 0; oy < oySize; ++oy) {
+                        const std::size_t dify = oy * strideDims[1];
+                        const std::size_t syMin = (beginEndBorders[1] > dify) ?
+                                                (beginEndBorders[1] - dify + dilationDims[1] - 1) / dilationDims[1] :
+                                                0;
+                        const std::size_t syMax = (inputDims[3] + beginEndBorders[1]) < dify ?
+                                                0 :
+                                                ((inputDims[3] + beginEndBorders[1]) > dilated_kernel_y + dify ?
+                                                    dilated_kernel_y :
+                                                    (inputDims[3] + beginEndBorders[1] - dify));
+                        const std::size_t oIndexFull = oIndex + ox*oySize + oy;
+                        // these may wrap inside the begin borders; sx/sy*dilation bring them back in range
+                        const std::size_t ix = difx - beginEndBorders[0];
+                        const std::size_t iy = dify - beginEndBorders[1];
+
+                        if (kernelDims[0] == 3 && kernelDims[1] == 3 && dilationDims[0] == 1 && dilationDims[1] == 1
+                            && sxMin == 0 && syMin == 0 && sxMax == 3 && syMax == 3) {
+                            output[oIndexFull] += (weights[wIndex + 0*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+0)] +
+                                                   weights[wIndex + 0*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+1)] +
+                                                   weights[wIndex + 0*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+2)] +
+                                                   weights[wIndex + 1*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+0)] +
+                                                   weights[wIndex + 1*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+1)] +
+                                                   weights[wIndex + 1*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+2)] +
+                                                   weights[wIndex + 2*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+0)] +
+                                                   weights[wIndex + 2*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+1)] +
+                                                   weights[wIndex + 2*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+2)]);
+                        } else {
+                            for (std::size_t sx = sxMin; sx*dilationDims[0] < sxMax; ++sx) {
+                                for (std::size_t sy = syMin; sy*dilationDims[1] < syMax; ++sy) {
+                                    output[oIndexFull] += weights[wIndex + sx*kernelDims[1] + sy] *
+                                                            input[iIndex + (sx*dilationDims[0] + ix)*inputDims[3] + sy*dilationDims[1] + iy];
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+
+// Kernels registration to implementation entry point
+REGISTRAR(PaddedConvImpl2D_cpu,
+    // ImplSpec{std::vector<ImplSpec::IOSpec>({ImplSpec::IOSpec{DataType::Any, DataFormat::NCHW}, ImplSpec::IOSpec{DataType::Any, DataFormat::NCHW}}) , std::vector<ImplSpec::IOSpec>({ImplSpec::IOSpec{DataType::Int32, DataFormat::NCHW}})},
+    {{DataType::Any, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}, DynamicAttributes(std::map<std::string, future_std::any>({std::make_pair("type", future_std::any(std::string("PaddedConv2D")))}))},
+    {ProdConso::inPlaceModel, Aidge::PaddedConvImpl2D_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t, std::int32_t>, nullptr});
+REGISTRAR(PaddedConvImpl2D_cpu,
+    {{DataType::Any, DataFormat::NCHW}, {DataType::Float16, DataFormat::NCHW}, DynamicAttributes(std::map<std::string, future_std::any>({std::make_pair("type", future_std::any(std::string("PaddedConv2D")))}))},
+    {ProdConso::inPlaceModel, Aidge::PaddedConvImpl2D_cpu_forward_kernel<half_float::half, half_float::half, half_float::half, half_float::half>, nullptr});
+REGISTRAR(PaddedConvImpl2D_cpu,
+    {{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}, DynamicAttributes(std::map<std::string, future_std::any>({std::make_pair("type", future_std::any(std::string("PaddedConv2D")))}))},
+    {ProdConso::inPlaceModel, Aidge::PaddedConvImpl2D_cpu_forward_kernel<float, float, float, float>, nullptr});
+REGISTRAR(PaddedConvImpl2D_cpu,
+    {{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}, DynamicAttributes(std::map<std::string, future_std::any>({std::make_pair("type", future_std::any(std::string("PaddedConv2D")))}))},
+    {ProdConso::inPlaceModel, Aidge::PaddedConvImpl2D_cpu_forward_kernel<double, double, double, double>, nullptr});
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_PADDEDCONVIMPL_KERNELS_H_ */
diff --git a/include/aidge/backend/cpu/operator/PowImpl.hpp b/include/aidge/backend/cpu/operator/PowImpl.hpp
index cfbb8173d1f83162519016a8f2b3c3166977a5b7..b31ce08c9089df05bd2e711fd87f09690fd2df23 100644
--- a/include/aidge/backend/cpu/operator/PowImpl.hpp
+++ b/include/aidge/backend/cpu/operator/PowImpl.hpp
@@ -12,18 +12,21 @@
 #ifndef AIDGE_CPU_OPERATOR_POWIMPL_H_
 #define AIDGE_CPU_OPERATOR_POWIMPL_H_
 
+#include <cstddef>  // std::size_t
+#include <memory>   // std::unique_ptr, std::make_unique
+#include <string>
+#include <vector>
+
 #include "aidge/backend/cpu/operator/OperatorImpl.hpp"
 #include "aidge/operator/Pow.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
-#include "aidge/backend/cpu/data/GetCPUPtr.h"
-#include <memory>
-#include <vector>
+
 
 namespace Aidge {
 // Operator implementation entry point for the backend
 using PowImpl_cpu = OperatorImpl_cpu<Pow_Op,
-    void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*,void*),
+    void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*),
     void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*, const void*, void*, void*)>;
 
 
diff --git a/include/aidge/backend/cpu/operator/PowImpl_kernels.hpp b/include/aidge/backend/cpu/operator/PowImpl_kernels.hpp
index ab9b2ccc7b823842decd044b90a5c6364cedc9c9..cae106632053366e1370b5ce1d3a2ee4cfd3b62b 100644
--- a/include/aidge/backend/cpu/operator/PowImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/PowImpl_kernels.hpp
@@ -13,36 +13,141 @@
 #define AIDGE_CPU_OPERATOR_POWIMPL_KERNELS_H_
 
 #include "aidge/utils/Registrar.hpp"
-#include <cmath>
+
+#include <cmath>      // std::pow
+#include <cstddef>    // std::size_t
+#include <cstdint>    // std::int32_t
+#include <functional> // std::multiplies
+#include <memory>     // std::make_unique
+#include <numeric>    // std::accumulate
 
 #include "aidge/backend/cpu/data/Broadcasting.hpp"
 #include "aidge/backend/cpu/operator/PowImpl.hpp"
 
 namespace Aidge {
-template <class I1, class I2, class O>
-void PowImpl_cpu_forward_kernel(const std::vector<std::size_t>& input1Dims,
-                                const std::vector<std::size_t>& input2Dims,
+
+namespace {
+// assumes values are contiguous in memory
+template <class I, class O>
+void pow_contiguous_arrays(const std::size_t input1size,
+                            const std::size_t input2size,
+                            const std::size_t output1size,
+                            const I* input1,
+                            const I* input2,
+                            O* output)
+{
+    for (std::size_t i = 0; i < output1size; ++i)
+    {
+        const std::size_t in1_id = (input1size != 1) ? i : 0;
+        const std::size_t in2_id = (input2size != 1) ? i : 0;
+        output[i] = static_cast<O>(std::pow(input1[in1_id], input2[in2_id]));
+    }
+}
+}  // namespace
+
+template <class I, class O>
+void PowImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
+                                std::vector<std::size_t> dims1,
                                 const std::vector<std::size_t>& outputDims,
+                                const void* input0_,
                                 const void* input1_,
-                                const void* input2_,
                                 void* output_) {
 
-    const I1* input_1 = static_cast<const I1*>(input1_);
-    const I2* input_2 = static_cast<const I2*>(input2_);
+    const I* input_0 = static_cast<const I*>(input0_);
+    const I* input_1 = static_cast<const I*>(input1_);
     O* output = static_cast<O*>(output_);
 
-    std::size_t totalElements = std::accumulate(outputDims.cbegin(), outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
-	for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex) 
-	{
-		std::vector<std::size_t> indexes = getMultiDimIndices(outputDims, oIndex);
+    // [5,2,1,7] & [2,6,7]
+    // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7]
+    // 2. Find the highest equal dimension -> 3
+    //    Exception: if the first diverging dimension is the last one, then -> 4 (dims.size())
+    // 3. Compute the highest number of contiguous data -> 7
+    // 4. Compute stride and offset step for the broadcast mechanism
+    // 5. Call a simple kernel
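+    //
+    // Worked example of the steps above: with [5,2,1,7] & [1,2,6,7],
+    // contiguousIdx ends up as 3, each elementary call processes 7 contiguous
+    // values, and input0's stride step along its size-1 dimension is 0, so the
+    // same 7 values are replayed for each of the 6 output slices.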
+
+    // special case for equal dimensions: the whole arrays are processed in one contiguous pass
+    if (dims0 == dims1) {
+        const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+        for (std::size_t i = 0; i < input0_contiguous_size; ++i)
+        {
+            output[i] = static_cast<O>(std::pow(input_0[i], input_1[i]));
+        }
+        return;
+    }
+
+    // set dimensions to be of equal size by filling the smallest one with ones.
+    if (dims0.size() > dims1.size()) {
+        dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
+    }
+    else if (dims1.size() > dims0.size()) {
+        dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
+    }
 
-		std::size_t idx1 = getFlattenedIndex(input1Dims, indexes);
-		std::size_t idx2 = getFlattenedIndex(input2Dims, indexes);
-		
-        output[oIndex] = std::pow(input_1[idx1], input_2[idx2]);
-	}
+    const std::size_t nbDims = dims0.size();
+
+    // Find the highest equal dimension
+    std::size_t contiguousIdx = nbDims;
+    while (contiguousIdx-- > 0) {
+        if (dims0[contiguousIdx] != dims1[contiguousIdx]) {
+            if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1
+                const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1;
+                while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) {
+                    --contiguousIdx;
+                }
+            }
+            break;
+        }
+    }
+    ++contiguousIdx;
+
+    // Compute the highest number of contiguous data for each Tensor
+    const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+
+    // initialize strides to iterate through data because of broadcasting
+    std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    if (contiguousIdx > 0) {
+        stride_post0[contiguousIdx - 1] = 1;
+        stride_post1[contiguousIdx - 1] = 1;
+        for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) {
+            stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]);
+            stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]);
+        }
+        for (std::size_t i = 0; i != contiguousIdx; ++i) {
+            stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1;
+            stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1;
+        }
+    }
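+    // stride_step is +1 along stepped dimensions and a rewind (1 - stride_post)
+    // along broadcast (size-1) dimensions, so the traversal below only needs to
+    // apply the step of the deepest dimension that did not just wrap around.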
+
+    // variables for arrays offsets
+    std::size_t offsetIn0 = 0;
+    std::size_t offsetIn1 = 0;
+    std::size_t offsetOut = 0;
+
+    std::size_t dim = contiguousIdx - 1;
+    const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
+    for (std::size_t stack = 0; stack < nbStacks;) {
+        pow_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
+                    input_0 + offsetIn0*input0_contiguous_size,
+                    input_1 + offsetIn1*input1_contiguous_size,
+                    output + offsetOut*output_contiguous_size);
+        if (++stack < nbStacks) {
+            std::size_t tmp_stack = stack;
+            while(tmp_stack % outputDims[dim] == 0) {
+                tmp_stack /= outputDims[dim];
+                dim--;
+            }
+            offsetIn0 += stride_step0[dim];
+            offsetIn1 += stride_step1[dim];
+            ++offsetOut;
+            dim = contiguousIdx - 1;
+        }
+    }
 }
 
+
 template <class I1, class I2, class O>
 void PowImpl_cpu_backward_kernel(const std::vector<std::size_t>& input0Dims,
                                 const std::vector<std::size_t>& input1Dims,
@@ -82,14 +187,23 @@ void PowImpl_cpu_backward_kernel(const std::vector<std::size_t>& input0Dims,
 
 // Kernels registration to implementation entry point
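+// The first IOSpec describes the inputs (Any data type accepted); the second
+// fixes the output data type, which presumably selects the kernel instantiation.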
 REGISTRAR(PowImpl_cpu,
-    {DataType::Float32},
-    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<float, float, float>, Aidge::PowImpl_cpu_backward_kernel<float, float, float>});
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float32}},
+    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<float, float>, Aidge::PowImpl_cpu_backward_kernel<float, float, float>});
+REGISTRAR(PowImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}},
+    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<double, double>, Aidge::PowImpl_cpu_backward_kernel<double, double, double>});
+REGISTRAR(PowImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}},
+    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<int32_t, int32_t>, Aidge::PowImpl_cpu_backward_kernel<int32_t, int32_t, int32_t>});
+REGISTRAR(PowImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int64}},
+    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, Aidge::PowImpl_cpu_backward_kernel<std::int64_t, std::int64_t, std::int64_t>});
 REGISTRAR(PowImpl_cpu,
-    {DataType::Float64},
-    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<double, double, double>, Aidge::PowImpl_cpu_backward_kernel<double, double, double>});
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int8}},
+    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<std::int8_t, std::int8_t>, Aidge::PowImpl_cpu_backward_kernel<std::int8_t, std::int8_t, std::int8_t>});
 REGISTRAR(PowImpl_cpu,
-    {DataType::Int32},
-    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<int32_t, int32_t, int32_t>, Aidge::PowImpl_cpu_backward_kernel<int32_t, int32_t, int32_t>});
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::UInt8}},
+    {ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<std::uint8_t, std::uint8_t>, Aidge::PowImpl_cpu_backward_kernel<std::uint8_t, std::uint8_t, std::uint8_t>});
 }  // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_POWIMPL_KERNELS_H_ */
diff --git a/include/aidge/backend/cpu/operator/ResizeImpl.hpp b/include/aidge/backend/cpu/operator/ResizeImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..2bf5c1e807c0b0a64ac0dd2d3ac87219ba6349df
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/ResizeImpl.hpp
@@ -0,0 +1,37 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_RESIZEIMPL_H_
+#define AIDGE_CPU_OPERATOR_RESIZEIMPL_H_
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/Resize.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include <aidge/data/Interpolation.hpp>
+#include <aidge/operator/Pad.hpp>
+#include <cstdint>
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+using ResizeImpl_cpu = OperatorImpl_cpu<
+    Resize_Op,
+    void(const void *,                                  // input
+         const std::vector<DimSize_t> &,                // input dims
+         const std::vector<DimSize_t> &,                // output dims
+         const Interpolation::CoordinateTransformation, // coord transfo
+         const Interpolation::Mode,                     // interpolation mode
+         const PadBorderType,                           // padding mode
+         void *)>;                                      // output
+// Implementation entry point registration to Operator
+REGISTRAR(Resize_Op, "cpu", Aidge::ResizeImpl_cpu::create);
+} // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_RESIZEIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/ResizeImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ResizeImpl_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..6a22ff4ec9d7beaf05be3b479b43dd3ad69bc74b
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/ResizeImpl_kernels.hpp
@@ -0,0 +1,160 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_RESIZEIMPL_FORWARD_KERNEL_H_
+#define AIDGE_CPU_OPERATOR_RESIZEIMPL_FORWARD_KERNEL_H_
+
+#include "aidge/backend/cpu/operator/ResizeImpl.hpp"
+
+#include <aidge/data/Data.hpp>
+#include <aidge/data/half.hpp>
+#include <aidge/operator/Pad.hpp>
+#include <cmath>
+#include <cstdint>
+#include <numeric>
+
+#include "aidge/backend/cpu/data/Interpolation.hpp"
+#include "aidge/data/Interpolation.hpp"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+namespace Aidge {
+
+template <typename IO>
+void ResizeImpl_cpu_forward_kernel(
+    const void *input_,
+    const std::vector<DimSize_t> &inputDims,
+    const std::vector<DimSize_t> &outputDims,
+    const Interpolation::CoordinateTransformation coordTransfoMode,
+    const Interpolation::Mode interpMode,
+    const PadBorderType paddingMode,
+    // const double * /*roi*/,
+    // const float * /*scales*/,
+    // const int64_t * /*sizes*/,
+    void *output_) {
+
+    // Cast raw input/output pointers to the typed data
+    const IO *input = static_cast<const IO *>(input_);
+    IO *output = static_cast<IO *>(output_);
+
+    const DimSize_t outputLen = std::accumulate(outputDims.cbegin(),
+                                          outputDims.cend(),
+                                          1,
+                                          std::multiplies<DimSize_t>());
+    std::vector<float> coordInApprox(inputDims.size());
+    std::vector<std::size_t> coordIn(inputDims.size());
+    std::vector<DimSize_t> coordOut;
+    for (DimSize_t idxFlatOut = 0; idxFlatOut < outputLen; ++idxFlatOut) {
+        coordOut = Tensor::toCoord(outputDims, idxFlatOut);
+        coordInApprox =
+            Interpolation::untransformCoordinates(coordOut,
+                                                  inputDims,
+                                                  outputDims,
+                                                  coordTransfoMode);
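+        // Nearest-neighbour modes are resolved inline below; the remaining modes
+        // delegate to InterpolationCPU with the retrieved neighbours.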
+        if ((interpMode == Interpolation::Mode::Ceil) ||
+            (interpMode == Interpolation::Mode::Floor) ||
+            (interpMode == Interpolation::Mode::RoundPreferCeil) ||
+            (interpMode == Interpolation::Mode::RoundPreferFloor)) {
+            for (std::size_t i = 0; i < coordInApprox.size(); ++i) {
+                if (interpMode == Interpolation::Mode::Ceil) {
+                    coordInApprox[i] = std::ceil(coordInApprox[i]);
+                } else if (interpMode == Interpolation::Mode::Floor) {
+                    coordInApprox[i] = std::floor(coordInApprox[i]);
+                } else if (interpMode == Interpolation::Mode::RoundPreferCeil) {
+                    coordInApprox[i] = std::floor(coordInApprox[i] + 0.5f);
+                } else { // (interpMode == Interpolation::Mode::RoundPreferFloor)
+                    coordInApprox[i] = std::ceil(coordInApprox[i] - 0.5f);
+                }
+            }
+            if (Tensor::isInBounds<float>(inputDims, coordInApprox)) {
+                for (std::size_t i = 0; i < coordInApprox.size(); ++i) {
+                    coordIn[i] = static_cast<std::size_t>(coordInApprox[i]);
+                }
+            } else {
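+                // Edge padding clamps out-of-bounds coordinates to the nearest
+                // valid index along each dimension.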
+                if (paddingMode == PadBorderType::Edge) {
+                    for (std::size_t i = 0; i < coordInApprox.size(); ++i) {
+                        coordIn[i] = coordInApprox[i] < 0
+                                         ? 0
+                                         : (coordInApprox[i] >= inputDims[i]
+                                                ? inputDims[i] - 1
+                                                : static_cast<std::size_t>(coordInApprox[i]));
+                    }
+                } else {
+                    AIDGE_THROW_OR_ABORT(std::runtime_error, "Padding mode not supported");
+                }
+            }
+            output[idxFlatOut] = input[Tensor::toIndex(inputDims, coordIn)];
+        } else {
+            std::set<Interpolation::Point<IO>> neighbours =
+                InterpolationCPU::retrieveNeighbours(input,
+                                                     inputDims,
+                                                     coordInApprox,
+                                                     paddingMode);
+            output[idxFlatOut] = InterpolationCPU::interpolate(coordInApprox,
+                                                               neighbours,
+                                                               interpMode);
+        }
+    }
+    return;
+}
+// Kernels registration to implementation entry point
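+// The four input specs presumably follow the ONNX Resize input order
+// (data, roi, scales, sizes); the data type of the first input selects the kernel.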
+REGISTRAR(ResizeImpl_cpu,
+          {{{DataType::Int16},
+            {DataType::Float32},
+            {DataType::Float32},
+            {DataType::UInt64}},
+           {DataType::Int16}},
+          {ProdConso::inPlaceModel,
+           ResizeImpl_cpu_forward_kernel<int16_t>,
+           nullptr});
+REGISTRAR(ResizeImpl_cpu,
+          {{{DataType::Int32},
+            {DataType::Float32},
+            {DataType::Float32},
+            {DataType::UInt64}},
+           {DataType::Int32}},
+          {ProdConso::inPlaceModel,
+           ResizeImpl_cpu_forward_kernel<int32_t>,
+           nullptr});
+REGISTRAR(ResizeImpl_cpu,
+          {{{DataType::Int64},
+            {DataType::Float32},
+            {DataType::Float32},
+            {DataType::UInt64}},
+           {DataType::Int64}},
+          {ProdConso::inPlaceModel,
+           ResizeImpl_cpu_forward_kernel<int64_t>,
+           nullptr});
+
+REGISTRAR(ResizeImpl_cpu,
+          {{{DataType::Float16},
+            {DataType::Float32},
+            {DataType::Float32},
+            {DataType::UInt64}},
+           {DataType::Float16}},
+          {ProdConso::inPlaceModel,
+           ResizeImpl_cpu_forward_kernel<half_float::half>,
+           nullptr});
+REGISTRAR(ResizeImpl_cpu,
+          {{{DataType::Float32},
+            {DataType::Float32},
+            {DataType::Float32},
+            {DataType::UInt64}},
+           {DataType::Float32}},
+          {ProdConso::inPlaceModel,
+           ResizeImpl_cpu_forward_kernel<float>,
+           nullptr});
+REGISTRAR(ResizeImpl_cpu,
+          {{{DataType::Float64},
+            {DataType::Float32},
+            {DataType::Float32},
+            {DataType::UInt64}},
+           {DataType::Float64}},
+          {ProdConso::inPlaceModel,
+           ResizeImpl_cpu_forward_kernel<double>,
+           nullptr});
+} // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_RESIZEIMPL_FORWARD_KERNEL_H_ */
diff --git a/include/aidge/backend/cpu/operator/RoundImpl.hpp b/include/aidge/backend/cpu/operator/RoundImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c595e251cc18348b6f732f1c36a05de54f647204
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/RoundImpl.hpp
@@ -0,0 +1,34 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_ROUNDIMPL_H_
+#define AIDGE_CPU_OPERATOR_ROUNDIMPL_H_
+
+#include <cstddef>  // std::size_t
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/Round.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+using RoundImpl_cpu = OperatorImpl_cpu<Round_Op,
+    void(const std::size_t, const void*, void*)>;
+
+// Implementation entry point registration to Operator
+REGISTRAR(Round_Op, "cpu", Aidge::RoundImpl_cpu::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_ROUNDIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/RoundImpl_kernels.hpp b/include/aidge/backend/cpu/operator/RoundImpl_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ba9c63bc3618ba81e238d7721147c894b54cf832
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/RoundImpl_kernels.hpp
@@ -0,0 +1,46 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_ROUNDIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_ROUNDIMPL_KERNELS_H_
+
+#include <cmath>    // std::nearbyint
+#include <cstddef>  // std::size_t
+
+#include "aidge/utils/Registrar.hpp"
+
+#include "aidge/backend/cpu/operator/RoundImpl.hpp"
+
+namespace Aidge {
+template <class I, class O>
+void RoundImpl_cpu_forward_kernel(const std::size_t inputLength,
+                                     const void* input_,
+                                     void* output_) {
+
+    const I* input = static_cast<const I*>(input_);
+    O* output = static_cast<O*>(output_);
+
+    for (std::size_t i = 0; i < inputLength; ++i) {
+        // std::round() rounds halves away from zero, whereas ONNX Round requires
+        // round-half-to-even (e.g. 2.5 -> 2 and 3.5 -> 4), which std::nearbyint()
+        // provides under the default rounding mode.
+        output[i] = static_cast<O>(std::nearbyint(input[i]));
+    }
+}
+
+
+REGISTRAR(RoundImpl_cpu,
+    {DataType::Float32},
+    {ProdConso::inPlaceModel, Aidge::RoundImpl_cpu_forward_kernel<float, float>,nullptr});
+REGISTRAR(RoundImpl_cpu,
+    {DataType::Float64},
+    {ProdConso::inPlaceModel, Aidge::RoundImpl_cpu_forward_kernel<double, double>,nullptr});
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_ROUNDIMPL_KERNELS_H_ */
diff --git a/include/aidge/backend/cpu/operator/SliceImpl_kernels.hpp b/include/aidge/backend/cpu/operator/SliceImpl_kernels.hpp
index 1bf4c491723c570fa8bfd1774beca1630d2de9be..d290c40f26270a789c2d328f98560c65ecac1559 100644
--- a/include/aidge/backend/cpu/operator/SliceImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/SliceImpl_kernels.hpp
@@ -89,13 +89,13 @@ void SliceImpl_cpu_forward_kernel(const std::vector<std::int64_t>& starts,
 }
 
 REGISTRAR(SliceImpl_cpu,
-    {DataType::Float32},
+    {{DataType::Float32, DataType::Any}, {DataType::Float32}},
     {ProdConso::inPlaceModel, Aidge::SliceImpl_cpu_forward_kernel<float, float>, nullptr});
 REGISTRAR(SliceImpl_cpu,
-    {DataType::Float64},
+    {{DataType::Float64, DataType::Any}, {DataType::Float64}},
     {ProdConso::inPlaceModel, Aidge::SliceImpl_cpu_forward_kernel<double, double>, nullptr});
 REGISTRAR(SliceImpl_cpu,
-    {DataType::Int32},
+    {{DataType::Int32, DataType::Any}, {DataType::Int32}},
     {ProdConso::inPlaceModel, Aidge::SliceImpl_cpu_forward_kernel<int32_t, int32_t>, nullptr});
 }  // namespace Aidge
 
diff --git a/include/aidge/backend/cpu/operator/SubImpl.hpp b/include/aidge/backend/cpu/operator/SubImpl.hpp
index 2bb22bda74edf7db09404fd5613b6714ddcdf513..eed26ddcc9f57b3bb7796049a62f3f6be7de4eb5 100644
--- a/include/aidge/backend/cpu/operator/SubImpl.hpp
+++ b/include/aidge/backend/cpu/operator/SubImpl.hpp
@@ -23,7 +23,7 @@
 namespace Aidge {
 // Operator implementation entry point for the backend
 using SubImpl_cpu = OperatorImpl_cpu<Sub_Op,
-    void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*,void*)>;
+    void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*,void*)>;
 
 // Implementation entry point registration to Operator
 REGISTRAR(Sub_Op, "cpu", Aidge::SubImpl_cpu::create);
diff --git a/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp b/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp
index 0486ed2105b23e95f9cdfcda578e14900fcb2c8e..1d789c3c8886d35ce6597d5704c76060bad196c1 100644
--- a/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp
@@ -21,32 +21,132 @@
 #include "aidge/backend/cpu/data/Broadcasting.hpp"
 #include "aidge/backend/cpu/operator/SubImpl.hpp"
 
+namespace {
+// assumes values are contiguous in memory
+template <class I1, class I2, class O>
+void sub_contiguous_arrays(const std::size_t input1size,
+                            const std::size_t input2size,
+                            const std::size_t output1size,
+                            const I1* input1,
+                            const I2* input2,
+                            O* output)
+{
+    for (std::size_t i = 0; i < output1size; ++i)
+    {
+        const std::size_t in1_id = (input1size != 1) ? i : 0;
+        const std::size_t in2_id = (input2size != 1) ? i : 0;
+        output[i] = static_cast<O>(input1[in1_id] - input2[in2_id]);
+    }
+}
+}  // namespace
+
 
 namespace Aidge {
 template <class I1, class I2, class O>
-void SubImpl_cpu_forward_kernel(const std::vector<std::size_t>& input1Dims,
-                                const std::vector<std::size_t>& input2Dims,
+void SubImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
+                                std::vector<std::size_t> dims1,
                                 const std::vector<std::size_t>& outputDims,
+                                const void* input0_,
                                 const void* input1_,
-                                const void* input2_,
                                 void* output_) {
 
-    const I1* input_1 = static_cast<const I1*>(input1_);
-    const I2* input_2 = static_cast<const I2*>(input2_);
+    const I1* input_0 = static_cast<const I1*>(input0_);
+    const I2* input_1 = static_cast<const I2*>(input1_);
     O* output = static_cast<O*>(output_);
 
-    size_t totalElements = 1;
-    for (size_t dimSize : outputDims) {
-        totalElements *= dimSize;
+    // [5,2,1,7] & [2,6,7]
+    // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7]
+    // 2. Find the highest equal dimension -> 3
+    //    Exception: if the first diverging dimension is the last one, then -> 4 (dims.size())
+    // 3. Compute the highest number of contiguous data -> 7
+    // 4. Compute stride and offset step for the broadcast mechanism
+    // 5. Call a simple kernel
+
+    // special case for equal dimensions: the whole arrays are processed in one contiguous pass
+    if (dims0 == dims1) {
+        const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+        for (std::size_t i = 0; i < input0_contiguous_size; ++i)
+        {
+            output[i] = static_cast<O>(input_0[i] - input_1[i]);
+        }
+        return;
+    }
+
+    // set dimensions to be of equal size by filling the smallest one with ones.
+    if (dims0.size() > dims1.size()) {
+        dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
+    }
+    else if (dims1.size() > dims0.size()) {
+        dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
+    }
+
+    const std::size_t nbDims = dims0.size();
+
+    // Find the highest equal dimension
+    std::size_t contiguousIdx = nbDims;
+    while (contiguousIdx-- > 0) {
+        if (dims0[contiguousIdx] != dims1[contiguousIdx]) {
+            if (contiguousIdx == (nbDims - 1)) { // last dimensions of one of the input Tensors are of size 1
+                const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1;
+                while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) {
+                    --contiguousIdx;
+                }
+            }
+            break;
+        }
+    }
+    ++contiguousIdx;
+
+    // Compute the highest number of contiguous data for each Tensor
+    const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+
+    // initialize strides to iterate through data because of broadcasting
+    std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    if (contiguousIdx > 0) {
+        stride_post0[contiguousIdx - 1] = 1;
+        stride_post1[contiguousIdx - 1] = 1;
+        for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) {
+            stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]);
+            stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]);
+        }
+        for (std::size_t i = 0; i != contiguousIdx; ++i) {
+            stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1;
+            stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1;
+        }
     }
 
-	for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex)
-	{
-		std::vector<size_t> indexes = getMultiDimIndices(outputDims, oIndex);
-		std::size_t idx1 = getFlattenedIndex(input1Dims, indexes);
-		std::size_t idx2 = getFlattenedIndex(input2Dims, indexes);
-        output[oIndex] = input_1[idx1] - input_2[idx2];
-	}
+    // variables for arrays offsets
+    std::size_t offsetIn0 = 0;
+    std::size_t offsetIn1 = 0;
+    std::size_t offsetOut = 0;
+
+    std::size_t dim = contiguousIdx - 1;
+    const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
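+    // Odometer-style traversal (same stride-step scheme as PowImpl_kernels.hpp):
+    // after each contiguous block, only the step of the deepest dimension that
+    // did not just wrap around is applied to the input offsets.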
+    for (std::size_t stack = 0; stack < nbStacks;) {
+        sub_contiguous_arrays<I1,I2,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
+                    input_0 + offsetIn0*input0_contiguous_size,
+                    input_1 + offsetIn1*input1_contiguous_size,
+                    output + offsetOut*output_contiguous_size);
+        if (++stack < nbStacks) {
+            std::size_t tmp_stack = stack;
+            while(tmp_stack % outputDims[dim] == 0) {
+                tmp_stack /= outputDims[dim];
+                dim--;
+            }
+            offsetIn0 += stride_step0[dim];
+            offsetIn1 += stride_step1[dim];
+            ++offsetOut;
+            dim = contiguousIdx - 1;
+        }
+    }
 }
 
 // Kernels registration to implementation entry point
@@ -56,6 +156,12 @@ REGISTRAR(SubImpl_cpu,
 REGISTRAR(SubImpl_cpu,
     {DataType::Float64},
     {ProdConso::inPlaceModel, Aidge::SubImpl_cpu_forward_kernel<double, double, double>, nullptr});
+REGISTRAR(SubImpl_cpu,
+    {DataType::Int8},
+    {ProdConso::inPlaceModel, Aidge::SubImpl_cpu_forward_kernel<std::int8_t, std::int8_t, std::int8_t>, nullptr});
+REGISTRAR(SubImpl_cpu,
+    {DataType::UInt8},
+    {ProdConso::inPlaceModel, Aidge::SubImpl_cpu_forward_kernel<std::uint8_t, std::uint8_t, std::uint8_t>, nullptr});
 REGISTRAR(SubImpl_cpu,
     {DataType::Int32},
     {ProdConso::inPlaceModel, Aidge::SubImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>, nullptr});
diff --git a/src/data/Interpolation.cpp b/src/data/Interpolation.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fbf224d84f65c442e98967783d303605a177d390
--- /dev/null
+++ b/src/data/Interpolation.cpp
@@ -0,0 +1,436 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include "aidge/backend/cpu/data/Interpolation.hpp"
+
+#include <aidge/utils/Log.hpp>
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+
+#include <iterator>
+#include <stdexcept>
+#include <utility>
+#include <vector>
+
+#include <aidge/data/Interpolation.hpp>
+#include <aidge/data/half.hpp>
+#include <aidge/utils/ErrorHandling.hpp>
+#include <aidge/utils/Types.h>
+
+namespace Aidge {
+
+template <typename T>
+std::set<Interpolation::Point<T>>
+InterpolationCPU::linearRecurse(const std::vector<float> &coordToInterpolate,
+                                const std::set<Point<T>> &points,
+                                const DimIdx_t alongDim) {
+
+    // all points have been discriminated properly along the given dimension.
+    if (points.size() == 1) {
+        return points;
+    }
+
+    auto extractPtCoords = [](const std::set<Point<T>> &pts) -> std::set<Coords> {
+        std::set<Coords> result;
+        for (const auto &pt : pts) {
+            result.insert(pt.first);
+        }
+        return result;
+    };
+    ///////////////////
+    // ERROR CHECKING
+    if (alongDim > coordToInterpolate.size() || points.size() == 0) {
+        // retrieving points coords as points values can be in half_float &
+        // this type is not fmt compatible
+        std::vector<Coords> pointsCoords;
+        for (const auto &point : points) {
+            pointsCoords.push_back(point.first);
+        }
+        AIDGE_ASSERT(
+            alongDim <= coordToInterpolate.size(),
+            "InterpolationCPU::linearRecurse: alongDim value "
+            "exceeded the number of dimensions of coordToInterpolate. "
+            "Interpolation has failed. Input values : \n - "
+            "coordToInterpolate {}\n - pointsToInterpolate {}\n - alongDim "
+            "{}",
+            coordToInterpolate,
+            pointsCoords,
+            alongDim);
+        AIDGE_ASSERT(
+            points.size() > 0,
+            "InterpolationCPU::linearRecurse: entering the recursive "
+            "function with 0 points. Interpolation has failed. "
+            "Please file a bug report to the aidge_backend_cpu repo: "
+            "https://gitlab.eclipse.org/eclipse/aidge/aidge_backend_cpu/-/"
+            "issues."
+            "\nInput values : \n - "
+            "coordToInterpolate {}\n - pointsToInterpolate {}\n - alongDim "
+            "{}",
+            coordToInterpolate,
+            pointsCoords,
+            alongDim);
+    }
+    Log::debug("\nEntering linear recurse with {} points.", points.size());
+    Log::debug("Points : {}", extractPtCoords(points));
+    Log::debug("coordsToInterpolate : {}", coordToInterpolate);
+    Log::debug("alongDim : {}", alongDim);
+
+    ///////////////////
+    // COMPUTATION
+    // split  all points along each dimension
+    // depending on if their coords[alongDim] are above or under
+    // coords to interpolate values
+    std::set<Point<T>> lowerPoints;
+    std::set<Point<T>> upperPoints;
+    for (const auto &point : points) {
+        if (point.first[alongDim] <= coordToInterpolate[alongDim]) {
+            lowerPoints.insert(point);
+        } else {
+            upperPoints.insert(point);
+        }
+    }
+    Log::debug("alongDim : {}", alongDim);
+    Log::debug("lowerPoints : {}", extractPtCoords(lowerPoints));
+    Log::debug("upperPoints : {}", extractPtCoords(upperPoints));
+
+    // Here are 3 cases
+    // 1. upper/lowerPoints.size() == 0
+    //        Coordinates to interpolate along current dimension are round.
+    //        That would be equivalent to a linear interpolation with a
+    //        ponderation of 1 for lowerPoints & 0 for upperPoints(or the
+    //        opposite idk), hence we will only take lower/upperPoints values
+    //        from there.
+    //
+    //        Why this happens :
+    //        If coordinates are round, the floor()/ceil() operations called
+    //        in retrieveNeighbours to generate direct neighbours of floating
+    //        coordinates returned the same value.
+    //
+    // 2. lower/upperPoints.size() == 1
+    //        All dimensions have been discriminated, we can proceed to
+    //        weighted interpolation
+    //
+    // 3. lower/upperPoints.size() > 1
+    //        points have not been all discriminated and must be further split
+    //        so we call linearRecurse()
+    switch (lowerPoints.size()) {
+        case 0: {
+            return linearRecurse(coordToInterpolate, upperPoints, alongDim + 1);
+        }
+        case 1: {
+            break;
+        }
+        default: {
+            lowerPoints =
+                linearRecurse(coordToInterpolate, lowerPoints, alongDim + 1);
+            break;
+        }
+    }
+
+    switch (upperPoints.size()) {
+        case 0: {
+            return linearRecurse(coordToInterpolate, lowerPoints, alongDim + 1);
+        }
+        case 1: {
+            break;
+        }
+        default: {
+            upperPoints =
+                linearRecurse(coordToInterpolate, upperPoints, alongDim + 1);
+            break;
+        }
+    }
+
+    // At this point lowerPoints & upperPoints are guaranteed to be
+    // one-element sets
+    AIDGE_ASSERT(lowerPoints.size() == 1,
+                 "LowerPoints Size = {} != 1",
+                 lowerPoints.size());
+    AIDGE_ASSERT(upperPoints.size() == 1,
+                 "upperPoints Size = {} != 1",
+                 upperPoints.size());
+
+    //     ( point[dim] - Pl[dim] )
+    // t = ------------------------
+    //      ( Pu[dim] - Pl[dim] )
+    float weight =
+        (coordToInterpolate[alongDim] - lowerPoints.begin()->first[alongDim]) /
+        (upperPoints.begin()->first[alongDim] -
+         lowerPoints.begin()->first[alongDim]);
+
+    Point<T> interpolatedPoint = std::make_pair(
+        lowerPoints.begin()->first,
+        static_cast<T>((1.F - weight) * lowerPoints.begin()->second +
+                       weight * upperPoints.begin()->second));
+    // Setting the coordinate to 0 is a sanity marker used to check later that
+    // all dims have been interpolated
+    interpolatedPoint.first[alongDim] = 0;
+    Log::debug("successfully returned from alongDim : {}", alongDim);
+    return std::set<Point<T>>({interpolatedPoint});
+}
+
+template <typename T>
+T InterpolationCPU::linear(const std::vector<float> &coordToInterpolate,
+                           const std::set<Point<T>> &pointsToInterpolate) {
+
+    auto result = linearRecurse(coordToInterpolate, pointsToInterpolate, 0);
+    AIDGE_ASSERT(result.size() == 1,
+                 "Result size is not 1 but {}",
+                 result.size());
+    return result.begin()->second;
+}
+
+template <typename T>
+T InterpolationCPU::nearest(const std::vector<float> &coordsToInterpolate,
+                            const std::set<Point<T>> &points,
+                            const Interpolation::Mode nearestMode) {
+
+    AIDGE_ASSERT(
+        coordsToInterpolate.size() == points.begin()->first.size(),
+        "Interpolation::nearest(): dimension mismatch : coordinate "
+        "to interpolate ({}) have not the same number of dimensions than "
+        "the points to interpolate({}).",
+        coordsToInterpolate,
+        points.begin()->first);
+    std::function<int64_t(const float &)> updateCoordinates;
+    switch (nearestMode) {
+    case Interpolation::Mode::Ceil: {
+        updateCoordinates = [](const float &coord) -> int64_t {
+            return ceil(coord);
+        };
+        break;
+    }
+    case Interpolation::Mode::Floor: {
+        updateCoordinates = [](const float &coord) -> int64_t {
+            return floor(coord);
+        };
+        break;
+    }
+    case Interpolation::Mode::RoundPreferFloor: {
+        updateCoordinates = [](const float &coord) -> int64_t {
+            return (coord - floor(coord)) == 0.5 ? floor(coord)
+                                                 : std::round(coord);
+        };
+        break;
+    }
+    case Interpolation::Mode::RoundPreferCeil: {
+        updateCoordinates = [](const float &coord) -> int64_t {
+            return (coord - floor(coord)) == 0.5 ? ceil(coord)
+                                                 : std::round(coord);
+        };
+        break;
+    }
+    default: {
+        AIDGE_THROW_OR_ABORT(
+            std::runtime_error,
+            "Invalid Interpolation mode for "
+            "InterpolationCPU::interpolateNearest. Accepted modes are : "
+            "Ceil({}),Floor({}),RoundPreferCeil({}), "
+            "RoundPreferFloor({}). Got {}.",
+            static_cast<int>(Ceil),
+            static_cast<int>(Floor),
+            static_cast<int>(RoundPreferCeil),
+            static_cast<int>(RoundPreferFloor),
+            static_cast<int>(nearestMode));
+    }
+    }
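+    // e.g. with RoundPreferFloor, a coordinate of 2.5 maps to 2 while 2.6 maps to 3.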
+    Coords nearestCoords;
+    nearestCoords.reserve(coordsToInterpolate.size());
+    for (const auto &coord : coordsToInterpolate) {
+        nearestCoords.push_back(updateCoordinates(coord));
+    }
+    auto it = std::find_if(
+        points.begin(),
+        points.end(),
+        [nearestCoords](auto &point) { return nearestCoords == point.first; });
+    if (it != points.end()) {
+        return it->second;
+    } else {
+        Log::warn("Interpolate::nearest(): did not find a fitting point in "
+                  "the neighbours whose coordinates were {}, returning 0. "
+                  "Available neighbours are at following indexes: ",
+                  coordsToInterpolate);
+        for (const auto &point : points) {
+            Log::warn("idx : [{}]\t\tvalue {}", point.first);
+        }
+        return static_cast<T>(0);
+    }
+}
+
+template <typename T>
+T InterpolationCPU::interpolate(const std::vector<float> &coordsToInterpolate,
+                                const std::set<Point<T>> &points,
+                                const Mode interpMode) {
+
+    T result{0};
+    switch (interpMode) {
+    case Interpolation::Mode::Cubic: {
+        AIDGE_THROW_OR_ABORT(
+            std::runtime_error,
+            "Unsupported interpolation mode selected : Cubic.");
+        break;
+    }
+    case Interpolation::Mode::Linear: {
+        return linear(coordsToInterpolate, points);
+        break;
+    }
+    case Interpolation::Mode::Ceil:
+    case Interpolation::Mode::Floor:
+    case Interpolation::Mode::RoundPreferFloor:
+    case Interpolation::Mode::RoundPreferCeil: {
+        result =
+            InterpolationCPU::nearest(coordsToInterpolate, points, interpMode);
+        break;
+    }
+    default: {
+        AIDGE_THROW_OR_ABORT(std::runtime_error,
+                             "InterpolationCPU::Interpolate({}): Unsupported "
+                             "interpolation mode given as input.",
+                             static_cast<int>(interpMode));
+        break;
+    }
+    }
+    return result;
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////
+// EXPLICIT TEMPLATE INSTANTIATIONS
+//////////////////////////////////////////////////////////////////////////////////////////////////////
+
+//////////////////////////////////
+// INTERPOLATE
+template int8_t InterpolationCPU::interpolate<int8_t>(
+    const std::vector<float> &originalCoords,
+    const std::set<Point<int8_t>> &points,
+    const Mode interpMode);
+template int16_t InterpolationCPU::interpolate<int16_t>(
+    const std::vector<float> &originalCoords,
+    const std::set<Point<int16_t>> &points,
+    const Mode interpMode);
+template int32_t InterpolationCPU::interpolate<int32_t>(
+    const std::vector<float> &originalCoords,
+    const std::set<Point<int32_t>> &points,
+    const Mode interpMode);
+template int64_t InterpolationCPU::interpolate<int64_t>(
+    const std::vector<float> &originalCoords,
+    const std::set<Point<int64_t>> &points,
+    const Mode interpMode);
+
+template half_float::half InterpolationCPU::interpolate<half_float::half>(
+    const std::vector<float> &originalCoords,
+    const std::set<Point<half_float::half>> &points,
+    const Mode interpMode);
+template float InterpolationCPU::interpolate<float>(
+    const std::vector<float> &originalCoords,
+    const std::set<Point<float>> &points,
+    const Mode interpMode);
+template double InterpolationCPU::interpolate<double>(
+    const std::vector<float> &originalCoords,
+    const std::set<Point<double>> &points,
+    const Mode interpMode);
+
+////////////////////////////////////////////////////////////////////
+// INTERPOLATE LINEAR (& its associated recursive function)
+template int8_t
+InterpolationCPU::linear(const std::vector<float> &coordsToInterpolate,
+                         const std::set<Point<int8_t>> &points);
+template std::set<Interpolation::Point<int8_t>>
+InterpolationCPU::linearRecurse(const std::vector<float> &coordsToInterpolate,
+                                const std::set<Point<int8_t>> &points,
+                                DimIdx_t alongDim);
+template int16_t
+InterpolationCPU::linear(const std::vector<float> &coordsToInterpolate,
+                         const std::set<Point<int16_t>> &points);
+template std::set<Interpolation::Point<int16_t>>
+InterpolationCPU::linearRecurse(const std::vector<float> &coordsToInterpolate,
+                                const std::set<Point<int16_t>> &points,
+                                DimIdx_t alongDim);
+template int32_t
+InterpolationCPU::linear(const std::vector<float> &coordsToInterpolate,
+                         const std::set<Point<int32_t>> &points);
+template std::set<Interpolation::Point<int32_t>>
+InterpolationCPU::linearRecurse(const std::vector<float> &coordsToInterpolate,
+                                const std::set<Point<int32_t>> &points,
+                                DimIdx_t alongDim);
+
+template half_float::half
+InterpolationCPU::linear(const std::vector<float> &coordsToInterpolate,
+                         const std::set<Point<half_float::half>> &points);
+template std::set<Interpolation::Point<half_float::half>>
+InterpolationCPU::linearRecurse(
+    const std::vector<float> &coordsToInterpolate,
+    const std::set<Point<half_float::half>> &points,
+    DimIdx_t alongDim);
+template float
+InterpolationCPU::linear(const std::vector<float> &coordsToInterpolate,
+                         const std::set<Point<float>> &points);
+template std::set<Interpolation::Point<float>>
+InterpolationCPU::linearRecurse(const std::vector<float> &coordsToInterpolate,
+                                const std::set<Point<float>> &points,
+                                DimIdx_t alongDim);
+template double
+InterpolationCPU::linear(const std::vector<float> &coordsToInterpolate,
+                         const std::set<Point<double>> &points);
+template std::set<Interpolation::Point<double>>
+InterpolationCPU::linearRecurse(const std::vector<float> &coordsToInterpolate,
+                                const std::set<Point<double>> &points,
+                                DimIdx_t alongDim);
+
+//////////////////////////////////
+// INTERPOLATE NEAREST
+template int8_t
+InterpolationCPU::nearest(const std::vector<float> &originalCoords,
+                          const std::set<Point<int8_t>> &points,
+                          const Interpolation::Mode nearestMode);
+template int16_t
+InterpolationCPU::nearest(const std::vector<float> &originalCoords,
+                          const std::set<Point<int16_t>> &points,
+                          const Interpolation::Mode nearestMode);
+template int32_t
+InterpolationCPU::nearest(const std::vector<float> &originalCoords,
+                          const std::set<Point<int32_t>> &points,
+                          const Interpolation::Mode nearestMode);
+
+template half_float::half
+InterpolationCPU::nearest(const std::vector<float> &originalCoords,
+                          const std::set<Point<half_float::half>> &points,
+                          const Interpolation::Mode nearestMode);
+template float
+InterpolationCPU::nearest(const std::vector<float> &originalCoords,
+                          const std::set<Point<float>> &points,
+                          const Interpolation::Mode nearestMode);
+template double
+InterpolationCPU::nearest(const std::vector<float> &originalCoords,
+                          const std::set<Point<double>> &points,
+                          const Interpolation::Mode nearestMode);
+
+} // namespace Aidge
diff --git a/src/operator/AddImpl.cpp b/src/operator/AddImpl.cpp
index 457a0b17e531fac35ff873f9eedca7bbbe82d459..101743eccb606c998a38f49dd9b89f5ec279bcae 100644
--- a/src/operator/AddImpl.cpp
+++ b/src/operator/AddImpl.cpp
@@ -12,7 +12,6 @@
 #include "aidge/backend/cpu/operator/AddImpl.hpp"
 
 #include <cassert>
-#include <numeric> // std::accumulate
 #include <vector>
 
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
@@ -28,12 +27,11 @@ void  Aidge::AddImpl_cpu::forward() {
     // Check inputs
     AIDGE_ASSERT(op.getInput(0), "missing input in Add operator");
     AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Add forward because the 0-th input has no implementation.");
-    DataType datatypeFirstInput = op.getInput(0)->dataType();
-    for (IOIndex_t i = 1; i < op.nbInputs(); ++i) {
-        AIDGE_ASSERT(op.getInput(i), "missing input in Add operator");
-        AIDGE_ASSERT(op.getInput(i)->hasImpl(), "cannot run Add forward because the {}-th input has no implementation.", i);
-        AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, "Cannot add inputs with two differents data type.");
-    }
+
+    AIDGE_ASSERT(op.getInput(1), "missing input in Add operator");
+    AIDGE_ASSERT(op.getInput(1)->hasImpl(), "cannot run Add forward because the 1st input has no implementation.");
+
+    AIDGE_ASSERT(op.getInput(1)->dataType() == op.getInput(0)->dataType(), "Cannot add inputs with two different data types.");
 
     // Find the correct kernel type
     const auto impl = Registrar<AddImpl_cpu>::create(getBestMatch(getRequiredSpec()));
@@ -42,28 +40,17 @@ void  Aidge::AddImpl_cpu::forward() {
     // TODO: right now, if needed, memory will be allocated/deallocated at each
     // call to forward(). We might put the following shared_ptr as members of
     // this class to avoid that.
-    const std::size_t nbDims = op.getOutput(0)->nbDims();
-    std::vector<std::vector<std::size_t>> inputsDims;
-    std::vector<const void*> opInputs;
-    std::vector<std::shared_ptr<Tensor>> inputsFallback(op.nbInputs());
-    for (IOIndex_t i = 0; i < op.nbInputs(); ++i) {
-        std::vector<std::size_t> inputDims(nbDims, 1);
-        auto dims = op.getInput(i)->dims();
-		for(std::size_t j=dims.size()-1; j+1>0; --j)
-		{
-			std::size_t idx = nbDims - (dims.size()-j);
-			inputDims[idx] = dims[j];
-		}
-        inputsDims.push_back(inputDims);
-        const auto& input = op.getInput(i)->refCastFrom(inputsFallback[i], *op.getOutput(0));
-        opInputs.push_back(input.getImpl()->rawPtr());
-    }
+    std::shared_ptr<Tensor> input0Fallback, input1Fallback;
+    const auto& input0 = op.getInput(0)->refCastFrom(input0Fallback, *op.getInput(0));
+    const auto& input1 = op.getInput(1)->refCastFrom(input1Fallback, *op.getInput(1));
+
 
-    impl.forward(opInputs,
-               inputsDims,
-               op.getOutput(0)->size(),
-               op.getOutput(0)->dims(),
-               getCPUPtr(op.getRawOutput(0)));
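+    // Broadcasting is now handled inside the kernel, so each input's dims are
+    // passed through unchanged.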
+    impl.forward(op.getInput(0)->dims(),
+                op.getInput(1)->dims(),
+                op.getOutput(0)->dims(),
+                input0.getImpl()->rawPtr(),
+                input1.getImpl()->rawPtr(),
+                getCPUPtr(op.getRawOutput(0)));
 }
 
 template <>
diff --git a/src/operator/AndImpl.cpp b/src/operator/AndImpl.cpp
index 2e0f59769ad86f6e4143ab59d089706e34792244..0cff914a4d03f6ef1ef339d7c7b46e48b6f4c293 100644
--- a/src/operator/AndImpl.cpp
+++ b/src/operator/AndImpl.cpp
@@ -25,22 +25,34 @@
 
 template <>
 void Aidge::AndImpl_cpu::forward() {
-    const std::vector<std::size_t> inputDims0 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
-                                                                   std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims());
-    const std::vector<std::size_t> inputDims1 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
-                                                                   std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims());
+    const And_Op& op = static_cast<const And_Op&>(mOp);
+    // Check inputs
+    AIDGE_ASSERT(op.getInput(0), "missing input in And operator");
+    AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run And forward because the 0-th input has no implementation.");
 
+    AIDGE_ASSERT(op.getInput(1), "missing input in And operator");
+    AIDGE_ASSERT(op.getInput(1)->hasImpl(), "cannot run And forward because the 1st input has no implementation.");
+
+    AIDGE_ASSERT(op.getInput(1)->dataType() == op.getInput(0)->dataType(), "Cannot compute And on inputs with two different data types.");
 
     // Find the correct kernel type
     const auto impl = Registrar<AndImpl_cpu>::create(getBestMatch(getRequiredSpec()));
 
-    // Call kernel
-    impl.forward(inputDims0,
-        inputDims1,
-        std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
-        getCPUPtr(mOp.getRawInput(0)),
-        getCPUPtr(mOp.getRawInput(1)),
-        getCPUPtr(mOp.getRawOutput(0)));
+    // Convert input data (no overhead if not needed!)
+    // TODO: right now, if needed, memory will be allocated/deallocated at each
+    // call to forward(). We might put the following shared_ptr as members of
+    // this class to avoid that.
+    std::shared_ptr<Tensor> input0Fallback, input1Fallback;
+    const auto& input0 = op.getInput(0)->refCastFrom(input0Fallback, *op.getInput(0));
+    const auto& input1 = op.getInput(1)->refCastFrom(input1Fallback, *op.getInput(1));
+
+
+    impl.forward(op.getInput(0)->dims(),
+                op.getInput(1)->dims(),
+                op.getOutput(0)->dims(),
+                input0.getImpl()->rawPtr(),
+                input1.getImpl()->rawPtr(),
+                getCPUPtr(op.getRawOutput(0)));
 }
 
 template <>
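
The refCastFrom/fallback pattern introduced here (and in AddImpl.cpp above) deserves a word: the fallback tensor is only materialised when a conversion is actually required, otherwise the returned reference aliases the original input. A toy sketch of the idea, with `Buffer` standing in for Aidge::Tensor (names are illustrative, not the Aidge API):

    #include <memory>

    struct Buffer { int dtype = 0; };

    // Return `src` untouched when it already matches `ref`; otherwise fill
    // `fallback` with a converted copy and return a reference to it. The
    // caller pays for an allocation only in the mismatched case.
    const Buffer& refCastLike(const Buffer& src,
                              std::shared_ptr<Buffer>& fallback,
                              const Buffer& ref) {
        if (src.dtype == ref.dtype)
            return src; // "no overhead if not needed"
        fallback = std::make_shared<Buffer>(src);
        fallback->dtype = ref.dtype; // a real implementation converts the payload
        return *fallback;
    }
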
diff --git a/src/operator/AtanImpl.cpp b/src/operator/AtanImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..af3393e7eb13fad4b414172edc7d1ab32ffcc573
--- /dev/null
+++ b/src/operator/AtanImpl.cpp
@@ -0,0 +1,54 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <memory>  // std::shared_ptr
+
+#include "aidge/operator/Atan.hpp"
+#include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+
+#include "aidge/backend/cpu/operator/AtanImpl.hpp"
+#include "aidge/backend/cpu/operator/AtanImpl_kernels.hpp"
+
+template <>
+void Aidge::AtanImpl_cpu::forward() {
+    const Atan_Op& op_ = dynamic_cast<const Atan_Op&>(mOp);
+    std::shared_ptr<Tensor> in0 = op_.getInput(0);
+    std::shared_ptr<Tensor> out0 = op_.getOutput(0);
+    AIDGE_ASSERT(in0, "missing input #0");
+
+    // Find the correct kernel type
+    const auto impl = Registrar<AtanImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Call kernel
+    impl.forward(in0->size(),
+        getCPUPtr(mOp.getRawInput(0)),
+        getCPUPtr(mOp.getRawOutput(0)));
+}
+
+template <>
+void Aidge::AtanImpl_cpu::backward() {
+    const Atan_Op& op_ = dynamic_cast<const Atan_Op&>(mOp);
+    std::shared_ptr<Tensor> out0  = op_.getOutput(0);
+    std::shared_ptr<Tensor> gra_in0 = op_.getInput(0)->grad();
+    std::shared_ptr<Tensor> gra_out0 = op_.getOutput(0)->grad();
+    AIDGE_ASSERT(out0, "missing output #0 for current {} operator", op_.type());
+
+    // Find the correct kernel type
+    const auto impl = Registrar<AtanImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Call kernel
+    impl.backward(gra_in0->size(), getCPUPtr(out0), getCPUPtr(gra_out0), getCPUPtr(gra_in0));
+}
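
For context, the math the Atan kernels have to implement (scalar sketch only; the real kernels live in AtanImpl_kernels.hpp): the backward pass is fed the forward output y = atan(x), and since x = tan(y), the derivative 1/(1 + x^2) can be evaluated as cos^2(y):

    #include <cmath>
    #include <cstddef>

    void atanForward(std::size_t n, const float* in, float* out) {
        for (std::size_t i = 0; i < n; ++i)
            out[i] = std::atan(in[i]);
    }

    void atanBackward(std::size_t n, const float* out, const float* gradOut,
                      float* gradIn) {
        for (std::size_t i = 0; i < n; ++i) {
            const float c = std::cos(out[i]); // cos(atan(x))
            gradIn[i] = gradOut[i] * c * c;   // == gradOut / (1 + x^2)
        }
    }
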
diff --git a/src/operator/BatchNormImpl.cpp b/src/operator/BatchNormImpl.cpp
index 9f1d986e63f14e6038c80054e5e3bc631ec24224..af59310830a865b496019e7620cfb661721ff39a 100644
--- a/src/operator/BatchNormImpl.cpp
+++ b/src/operator/BatchNormImpl.cpp
@@ -30,15 +30,13 @@ void Aidge::BatchNormImpl2D_cpu::forward() {
     AIDGE_ASSERT(op_.getInput(3), "missing input #3 for BatchNorm Operator");
     AIDGE_ASSERT(op_.getInput(4), "missing input #4 for BatchNorm Operator");
 
-    AIDGE_ASSERT(op_.getOutput(0)->nbDims() == 4, "");
-
     // Find the correct kernel type
     const auto impl = Registrar<BatchNormImpl2D_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Call kernel
     impl.forward(op_.epsilon(),
             op_.momentum(),
-            op_.getInput(0)->template dims<4>(),
+            op_.getInput(0)->dims(),
             getCPUPtr(op_.getRawInput(0)),
             getCPUPtr(op_.getRawInput(1)),
             getCPUPtr(op_.getRawInput(2)),
diff --git a/src/operator/BitShiftImpl.cpp b/src/operator/BitShiftImpl.cpp
index 1e0f79fd29fd140f0b41c64d245b9b240da80028..c6940554dd925905a18de66651707c3d58594ade 100644
--- a/src/operator/BitShiftImpl.cpp
+++ b/src/operator/BitShiftImpl.cpp
@@ -28,27 +28,18 @@ void Aidge::BitShiftImpl_cpu::forward() {
 
     const auto& op_ = dynamic_cast<const BitShift_Op&>(mOp);
 
-
     const auto impl = Registrar<BitShiftImpl_cpu>::create(getBestMatch(getRequiredSpec()));
 
-
-    const std::vector<std::size_t> inputDims0 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
-                                                                   std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims());
-    const std::vector<std::size_t> inputDims1 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
-                                                                   std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims());
-
-    BitShift_Op::BitShiftDirection direction = op_.direction();
-
     // Call kernel
     impl.forward(
-        direction,
-        inputDims0,
-        inputDims1,
-        std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
+        op_.direction(),
+        op_.getInput(0)->dims(),
+        op_.getInput(1)->dims(),
+        op_.getOutput(0)->dims(),
         getCPUPtr(mOp.getRawInput(0)),
         getCPUPtr(mOp.getRawInput(1)),
         getCPUPtr(mOp.getRawOutput(0)));
-        
+
 }
 
 template <>
diff --git a/src/operator/ClipImpl.cpp b/src/operator/ClipImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..931d25426a8f6e08363bfc08d23f1714e934634c
--- /dev/null
+++ b/src/operator/ClipImpl.cpp
@@ -0,0 +1,67 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <memory>
+#include <vector>
+
+#include "aidge/data/Tensor.hpp"
+#include "aidge/operator/Clip.hpp"
+#include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+#include "aidge/utils/ErrorHandling.hpp"
+
+#include "aidge/backend/cpu/operator/ClipImpl.hpp"
+#include "aidge/backend/cpu/operator/ClipImpl_kernels.hpp"
+
+template<>
+void Aidge::ClipImpl_cpu::forward() {
+
+    const Clip_Op& op_ = dynamic_cast<const Clip_Op&>(mOp);
+    std::shared_ptr<Tensor> in0 = op_.getInput(0);
+    std::shared_ptr<Tensor> out0 = op_.getOutput(0);
+    AIDGE_ASSERT(in0, "missing input #0");
+    /*AIDGE_ASSERT(in1, "missing input #1 -> Min value empty shape Tensor");
+    AIDGE_ASSERT(in2, "missing input #2 -> Max value empty shape Tensor");*/
+    // Find the correct kernel type
+    const auto impl = Registrar<ClipImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Call kernel
+    impl.forward(
+       op_.min(),
+       op_.max(),
+       getCPUPtr(mOp.getRawInput(0)),
+       in0->size(),
+       getCPUPtr(mOp.getRawOutput(0))
+    );
+}
+
+template<>
+void Aidge::ClipImpl_cpu::backward() {
+
+    const Clip_Op& op_ = dynamic_cast<const Clip_Op&>(mOp);
+    std::shared_ptr<Tensor> in0  = op_.getInput(0);
+    std::shared_ptr<Tensor> out0  = op_.getOutput(0);
+    std::shared_ptr<Tensor> gra_in0 = op_.getInput(0)->grad();
+    std::shared_ptr<Tensor> gra_out0 = op_.getOutput(0)->grad();
+    AIDGE_ASSERT(out0, "missing output #0 for current {} operator", op_.type());
+
+    // Find the correct kernel type
+    const auto impl = Registrar<ClipImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+    // Call kernel
+    impl.backward(
+        op_.min(),
+        op_.max(),
+        gra_in0->size(),
+        getCPUPtr(in0),
+        getCPUPtr(gra_out0),
+        getCPUPtr(gra_in0)
+    );
+}
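
A scalar sketch of the Clip semantics the unit tests below (Test_ClipImpl.cpp) assume, namely the numpy/ONNX order where the lower bound is applied first, so min > max saturates everything to max; the gradient is passed through inside [min, max] and zeroed outside (names are illustrative, not the Aidge kernel API):

    #include <algorithm>
    #include <cstddef>

    void clipForward(float lo, float hi, const float* in, std::size_t n,
                     float* out) {
        for (std::size_t i = 0; i < n; ++i)
            out[i] = std::min(std::max(in[i], lo), hi); // lo applied first
    }

    void clipBackward(float lo, float hi, std::size_t n, const float* in,
                      const float* gradOut, float* gradIn) {
        for (std::size_t i = 0; i < n; ++i)
            gradIn[i] = (in[i] < lo || in[i] > hi) ? 0.0f : gradOut[i];
    }
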
diff --git a/src/operator/ConvDepthWiseImpl.cpp b/src/operator/ConvDepthWiseImpl.cpp
index d86bba8d1abf348eb25e2d9c69d04b5c33a8a176..9b4ca3ad50d4b1db3367d39381191cf6d8b01314 100644
--- a/src/operator/ConvDepthWiseImpl.cpp
+++ b/src/operator/ConvDepthWiseImpl.cpp
@@ -65,7 +65,6 @@ void Aidge::ConvDepthWiseImpl2D_cpu::forward() {
 
     AIDGE_ASSERT(op_.getInput(0), "missing input #0 in ConvDepthWise Operator");
     AIDGE_ASSERT(op_.getInput(1), "missing input #1 in ConvDepthWise Operator");
-    AIDGE_ASSERT(op_.getInput(2), "missing input #2 in ConvDepthWise Operator");
 
     AIDGE_ASSERT((op_.getInput(0)->nbDims() == 4), "support for 4-dimensions tensors only");
 
diff --git a/src/operator/LRNImpl.cpp b/src/operator/LRNImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b914ffac236e995c58fe2c6a10417c32493b791c
--- /dev/null
+++ b/src/operator/LRNImpl.cpp
@@ -0,0 +1,46 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <memory>  // std::static_pointer_cast
+
+#include "aidge/operator/LRN.hpp"
+#include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+
+#include "aidge/backend/cpu/operator/LRNImpl.hpp"
+#include "aidge/backend/cpu/operator/LRNImpl_kernels.hpp"
+
+template <>
+void Aidge::LRNImpl_cpu::forward() {
+    const auto& op_ = dynamic_cast<const LRN_Op&>(mOp);
+    AIDGE_ASSERT(!op_.getInput(0)->empty(), "LRN input empty");
+
+    // Find the correct kernel type
+    const auto impl = Registrar<LRNImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Call kernel
+    impl.forward(op_.alpha(),
+               op_.beta(),
+               op_.bias(),
+               op_.size(),
+               std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims(),
+               std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->getImpl()->rawPtr(),
+               std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr());
+}
+
+template <>
+void Aidge::LRNImpl_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for LRN_Op on backend cpu");
+}
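
The parameters forwarded above map onto the usual (ONNX-style) LRN formula, y_c = x_c / (bias + alpha/size * sum over the window of x^2)^beta, with a window of `size` channels centred on c. A 1-D sketch over a channel vector, assuming ONNX edge handling (illustrative, not the Aidge kernel):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    std::vector<float> lrn1d(const std::vector<float>& x, std::size_t size,
                             float alpha, float beta, float bias) {
        const std::size_t nbCh = x.size();
        std::vector<float> y(nbCh);
        for (std::size_t c = 0; c < nbCh; ++c) {
            const std::size_t half = (size - 1) / 2;
            const std::size_t lo = (c >= half) ? c - half : 0;
            const std::size_t hi = std::min(nbCh - 1, c + size / 2);
            float sq = 0.f;
            for (std::size_t k = lo; k <= hi; ++k)
                sq += x[k] * x[k]; // sum of squares over the channel window
            y[c] = x[c] / std::pow(bias + (alpha / size) * sq, beta);
        }
        return y;
    }
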
diff --git a/src/operator/MulImpl.cpp b/src/operator/MulImpl.cpp
index ea5e3d3ab8ac24934a0cb6f9042858fa094700af..422bdd005f058fc9200cf5f7962bfc8d5877e6e1 100644
--- a/src/operator/MulImpl.cpp
+++ b/src/operator/MulImpl.cpp
@@ -25,18 +25,15 @@
 
 template <>
 void Aidge::MulImpl_cpu::forward() {
-    const std::vector<std::size_t> inputDims0 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
-                                                                   std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims());
-    const std::vector<std::size_t> inputDims1 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
-                                                                   std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims());
+    const Mul_Op& op_ = dynamic_cast<const Mul_Op&>(mOp);
 
     // Find the correct kernel type
     const auto impl = Registrar<MulImpl_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Call kernel
-    impl.forward(inputDims0,
-        inputDims1,
-        std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
+    impl.forward(op_.getInput(0)->dims(),
+        op_.getInput(1)->dims(),
+        op_.getOutput(0)->dims(),
         getCPUPtr(mOp.getRawInput(0)),
         getCPUPtr(mOp.getRawInput(1)),
         getCPUPtr(mOp.getRawOutput(0)));
@@ -45,7 +42,7 @@ void Aidge::MulImpl_cpu::forward() {
 template <>
 void Aidge::MulImpl_cpu::backward() {
     const Mul_Op& op_ = dynamic_cast<const Mul_Op&>(mOp);
-    
+
     auto in0 = op_.getInput(0);
     auto in1 = op_.getInput(1);
     auto in0grad = op_.getInput(0)->grad();
@@ -56,14 +53,14 @@ void Aidge::MulImpl_cpu::backward() {
     const auto impl = Registrar<MulImpl_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Call kernel
-    impl.backward(/* input0Length */ in0grad->size(), 
+    impl.backward(/* input0Length */ in0grad->size(),
                /* input1Length */ in1grad->size(),
                /* grad0Length  */ out0grad->size(),
                /* input0Dims   */ in0->dims(),
                /* input1Dims   */ in1->dims(),
-               getCPUPtr(in0), 
-               getCPUPtr(in1), 
-               getCPUPtr(out0grad), 
-               getCPUPtr(in0grad), 
+               getCPUPtr(in0),
+               getCPUPtr(in1),
+               getCPUPtr(out0grad),
+               getCPUPtr(in0grad),
                getCPUPtr(in1grad));
 }
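
Why the backward kernel needs both input lengths and both shapes: when an operand was broadcast in the forward pass, its gradient must be reduced (summed) over the broadcast axes. Sketch for the simplest case, a scalar second operand (illustrative only):

    #include <cstddef>

    void mulBackwardScalarB(std::size_t n, const float* a, float b,
                            const float* gradOut, float* gradA, float* gradB) {
        *gradB = 0.f;
        for (std::size_t i = 0; i < n; ++i) {
            gradA[i] = gradOut[i] * b;    // same shape as `a`: direct product
            *gradB  += gradOut[i] * a[i]; // `b` was broadcast: sum over all i
        }
    }
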
diff --git a/src/operator/PaddedConvImpl.cpp b/src/operator/PaddedConvImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b85039d1fb86484e7b7609a0cb335d5e41bbc21f
--- /dev/null
+++ b/src/operator/PaddedConvImpl.cpp
@@ -0,0 +1,128 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include "aidge/backend/cpu/operator/PaddedConvImpl.hpp"
+#include "aidge/backend/cpu/operator/PaddedConvImpl_kernels.hpp"
+
+#include <memory>
+#include <vector>
+
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/operator/MetaOperator.hpp"
+#include "aidge/operator/Conv.hpp"
+#include "aidge/operator/Pad.hpp"
+#include "aidge/utils/ErrorHandling.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+
+template <>
+void Aidge::PaddedConvImpl1D_cpu::forward() {
+    const auto& op_ = dynamic_cast<const MetaOperator_Op&>(mOp);
+
+    // Check inputs
+    AIDGE_ASSERT(op_.getInput(0), "missing input #0 in Conv Operator.");
+    AIDGE_ASSERT(op_.getInput(1), "missing input #1 in Conv Operator.");
+
+    // Find the correct kernel type
+    const auto impl = Registrar<PaddedConvImpl1D_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Convert input data (no overhead if not needed!)
+    // TODO: right now, if needed, memory will be allocated/deallocated at each
+    // call to forward(). We might put the following shared_ptr as members of
+    // this class to avoid that.
+    std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback;
+    const auto& input0 = op_.getInput(0)->refCastFrom(input0Fallback, *op_.getOutput(0));
+    const auto& input1 = op_.getInput(1)->refCastFrom(input1Fallback, *op_.getOutput(0));
+    const auto& input2 = (op_.getInput(2)) ? op_.getInput(2)->refCastFrom(input2Fallback, *op_.getOutput(0)) : Tensor();
+
+    std::shared_ptr<Conv_Op<1>> conv_op;
+    std::shared_ptr<Pad_Op<1>> pad_op;
+    for (const auto& n : op_.getMicroGraph()->getNodes()) {
+        if (n->getOperator()->type() == Conv_Op<1>::Type) {
+            conv_op = std::static_pointer_cast<Conv_Op<1>>(n->getOperator());
+        } else {
+            pad_op =  std::static_pointer_cast<Pad_Op<1>>(n->getOperator());
+        }
+    }
+
+    // Call kernel
+    impl.forward(
+            pad_op->beginEndBorders(),
+            conv_op->strideDims(),
+            conv_op->dilationDims(),
+            conv_op->kernelDims(),
+            op_.getInput(0)->template dims<3>(), // input dimensions
+            conv_op->outChannels(), // outChannels
+            input0.getImpl()->rawPtr(), // input
+            input1.getImpl()->rawPtr(), // weight
+            op_.getInput(2) ? input2.getImpl()->rawPtr() : nullptr, // bias
+            getCPUPtr(mOp.getRawOutput(0)) // output
+            );
+}
+
+template <>
+void Aidge::PaddedConvImpl1D_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Conv_Op<1> on backend cpu");
+}
+
+template <>
+void Aidge::PaddedConvImpl2D_cpu::forward() {
+    const auto& op_ = dynamic_cast<const MetaOperator_Op&>(mOp);
+
+    // Check inputs
+    AIDGE_ASSERT(op_.getInput(0), "missing input #0 in Conv Operator.");
+    AIDGE_ASSERT(op_.getInput(1), "missing input #1 in Conv Operator.");
+
+    // Find the correct kernel type
+    const auto impl = Registrar<PaddedConvImpl2D_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Convert input data (no overhead if not needed!)
+    // TODO: right now, if needed, memory will be allocated/deallocated at each
+    // call to forward(). We might put the following shared_ptr as members of
+    // this class to avoid that.
+    std::shared_ptr<Tensor> input0Fallback, input1Fallback, input2Fallback;
+    const auto& input0 = op_.getInput(0)->refCastFrom(input0Fallback, *op_.getOutput(0));
+    const auto& input1 = op_.getInput(1)->refCastFrom(input1Fallback, *op_.getOutput(0));
+    const auto& input2 = (op_.getInput(2)) ? op_.getInput(2)->refCastFrom(input2Fallback, *op_.getOutput(0)) : Tensor();
+
+    std::shared_ptr<Conv_Op<2>> conv_op;
+    std::shared_ptr<Pad_Op<2>> pad_op;
+
+    for (const auto& n : op_.getMicroGraph()->getNodes()) {
+        if (n->getOperator()->type() == Conv_Op<2>::Type) {
+            conv_op = std::static_pointer_cast<Conv_Op<2>>(n->getOperator());
+        } else {
+            pad_op =  std::static_pointer_cast<Pad_Op<2>>(n->getOperator());
+        }
+    }
+
+    // Call kernel
+    impl.forward(
+            pad_op->beginEndBorders(),
+            conv_op->strideDims(),
+            conv_op->dilationDims(),
+            conv_op->kernelDims(),
+            op_.getInput(0)->template dims<4>(), // input dimensions
+            conv_op->outChannels(), // outChannels
+            input0.getImpl()->rawPtr(), // input
+            input1.getImpl()->rawPtr(), // weight
+            op_.getInput(2) ? input2.getImpl()->rawPtr() : nullptr, // bias
+            getCPUPtr(mOp.getRawOutput(0)) // output
+            );
+}
+
+template <>
+void Aidge::PaddedConvImpl2D_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Conv_Op<2> on backend cpu");
+}
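
What fusing Pad into Conv buys: the kernel never materialises a zero-padded copy of the input; the padding is folded into the loop bounds so taps that fall outside the input are simply skipped. A 1-D, single-channel, stride-1 sketch (illustrative, not the Aidge kernel):

    #include <vector>

    std::vector<float> paddedConv1d(const std::vector<float>& x,
                                    const std::vector<float>& w,
                                    int padBegin, int padEnd) {
        const int n = static_cast<int>(x.size());
        const int k = static_cast<int>(w.size());
        std::vector<float> y(n + padBegin + padEnd - k + 1, 0.f);
        for (int o = 0; o < static_cast<int>(y.size()); ++o)
            for (int j = 0; j < k; ++j) {
                const int i = o + j - padBegin; // position in the unpadded input
                if (i >= 0 && i < n)            // out-of-range taps read implicit zeros
                    y[o] += w[j] * x[i];
            }
        return y;
    }
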
diff --git a/src/operator/PowImpl.cpp b/src/operator/PowImpl.cpp
index 74a7be71e176ba8e1cb8851050e575d6aa7465df..4448c8e9c455e59b584b084d32a8b17e8ae03453 100644
--- a/src/operator/PowImpl.cpp
+++ b/src/operator/PowImpl.cpp
@@ -25,21 +25,36 @@
 
 template <>
 void Aidge::PowImpl_cpu::forward() {
-    const std::vector<std::size_t> inputDims0 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
-                                                                   std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims());
-    const std::vector<std::size_t> inputDims1 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
-                                                                   std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims());
+
+    const Pow_Op& op = static_cast<const Pow_Op&>(mOp);
+    // Check inputs
+    AIDGE_ASSERT(op.getInput(0), "missing input in Pow operator");
+    AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Pow forward because the 0-th input has no implementation.");
+
+    AIDGE_ASSERT(op.getInput(1), "missing input in Pow operator");
+    AIDGE_ASSERT(op.getInput(1)->hasImpl(), "cannot run Pow forward because the 1st input has no implementation.");
+
+    AIDGE_ASSERT(op.getInput(1)->dataType() == op.getInput(0)->dataType(), "Cannot compute Pow with inputs of two different data types.");
 
     // Find the correct kernel type
     const auto impl = Registrar<PowImpl_cpu>::create(getBestMatch(getRequiredSpec()));
 
-    // Call kernel
-    impl.forward(inputDims0,
-        inputDims1,
-        std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
-        getCPUPtr(mOp.getRawInput(0)),
-        getCPUPtr(mOp.getRawInput(1)),
-        getCPUPtr(mOp.getRawOutput(0)));
+    // Convert input data (no overhead if not needed!)
+    // TODO: right now, if needed, memory will be allocated/deallocated at each
+    // call to forward(). We might put the following shared_ptr as members of
+    // this class to avoid that.
+    std::shared_ptr<Tensor> input0Fallback, input1Fallback;
+    const auto& input0 = op.getInput(0)->refCastFrom(input0Fallback, *op.getInput(0));
+    const auto& input1 = op.getInput(1)->refCastFrom(input1Fallback, *op.getInput(1));
+
+
+    impl.forward(op.getInput(0)->dims(),
+                op.getInput(1)->dims(),
+                op.getOutput(0)->dims(),
+                input0.getImpl()->rawPtr(),
+                input1.getImpl()->rawPtr(),
+                getCPUPtr(op.getRawOutput(0)));
+
 }
 
 template <>
@@ -69,4 +84,4 @@ void Aidge::PowImpl_cpu::backward() {
                 getCPUPtr(out0grad),
                 getCPUPtr(in0grad),
                 getCPUPtr(in1grad));
-}
\ No newline at end of file
+}
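
For reference, the elementwise gradients a Pow backward kernel computes (same-shape inputs, no broadcasting, illustrative only): d(a^b)/da = b * a^(b-1) and d(a^b)/db = a^b * ln(a), the latter only defined for a > 0:

    #include <cmath>
    #include <cstddef>

    void powBackward(std::size_t n, const float* a, const float* b,
                     const float* gradOut, float* gradA, float* gradB) {
        for (std::size_t i = 0; i < n; ++i) {
            gradA[i] = gradOut[i] * b[i] * std::pow(a[i], b[i] - 1.0f);
            gradB[i] = gradOut[i] * std::pow(a[i], b[i]) * std::log(a[i]);
        }
    }
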
diff --git a/src/operator/ResizeImpl.cpp b/src/operator/ResizeImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..38e3639312879ed75dac13fd5ed1226620e0cbd9
--- /dev/null
+++ b/src/operator/ResizeImpl.cpp
@@ -0,0 +1,59 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+#include "aidge/backend/cpu/operator/ResizeImpl.hpp"
+#include "aidge/backend/cpu/operator/ResizeImpl_kernels.hpp"
+#include "aidge/operator/Resize.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+#include "aidge/backend/OperatorImpl.hpp"
+#include "aidge/utils/ErrorHandling.hpp"
+
+namespace Aidge {
+
+template <> void ResizeImpl_cpu::forward() {
+    auto &op = dynamic_cast<const Resize_Op &>(mOp);
+
+    // index of the data input (#0)
+    const std::int8_t idxData = 0;
+
+    const bool input0DataPresent =
+        op.getInput(idxData) && !op.getInput(idxData)->undefined();
+
+    ///////////////////////////////////////
+    // CHECKING NODE CONNECTIONS
+    AIDGE_ASSERT(input0DataPresent, "{}: missing data input #0", op.type());
+
+    ///////////////////////////////////////
+    // CALL TO FORWARD
+    const auto impl =
+        Registrar<ResizeImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    impl.forward(op.getInput(idxData)->getImpl()->rawPtr(),
+                 op.getInput(idxData)->dims(),
+                 op.getOutput(0)->dims(),
+
+                 op.coordinateTransformationMode(),
+                 op.interpolationMode(),
+                 op.paddingMode(),
+
+                 op.getOutput(0)->getImpl()->rawPtr() // output pointer
+    );
+}
+
+template <> void Aidge::ResizeImpl_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(
+        std::runtime_error,
+        "Backward not yet implemented for Slice_Op on backend cpu");
+}
+} // namespace Aidge
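
The coordinateTransformationMode passed above decides how an output index is mapped back into input coordinates before the interpolation kernel runs. Sketch of two common ONNX modes, with scale = inSize / outSize (illustrative, not the Aidge enum):

    #include <cstddef>

    // "half_pixel": treats samples as pixel centres
    float halfPixel(std::size_t xOut, float scale) {
        return (static_cast<float>(xOut) + 0.5f) * scale - 0.5f;
    }

    // "asymmetric": plain index scaling from the origin
    float asymmetric(std::size_t xOut, float scale) {
        return static_cast<float>(xOut) * scale;
    }
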
diff --git a/src/operator/RoundImpl.cpp b/src/operator/RoundImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6f19f064cabfaa6bde7b434b0defe53f5c1b78cf
--- /dev/null
+++ b/src/operator/RoundImpl.cpp
@@ -0,0 +1,40 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <memory>
+#include <vector>
+
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/operator/Round.hpp"
+#include "aidge/utils/ErrorHandling.hpp"
+#include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/operator/RoundImpl.hpp"
+#include "aidge/backend/cpu/operator/RoundImpl_kernels.hpp"
+
+template <>
+void Aidge::RoundImpl_cpu::forward() {
+    std::shared_ptr<Tensor> in0 = std::static_pointer_cast<Tensor>(mOp.getRawInput(0));
+    std::shared_ptr<Tensor> out0 = std::static_pointer_cast<Tensor>(mOp.getRawOutput(0));
+    AIDGE_ASSERT(in0, "missing input #0");
+
+    // Find the correct kernel type
+    const auto impl = Registrar<RoundImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Call kernel
+    impl.forward(in0->size(),
+        getCPUPtr(mOp.getRawInput(0)),
+        getCPUPtr(mOp.getRawOutput(0)));
+}
+template <>
+void Aidge::RoundImpl_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Round_Op on backend cpu");
+}
\ No newline at end of file
diff --git a/src/operator/SubImpl.cpp b/src/operator/SubImpl.cpp
index d43771b967889183801cb93418c967ce9d9c8453..e36abe2a9d68a2b56ab1777aa04b0e911df514c8 100644
--- a/src/operator/SubImpl.cpp
+++ b/src/operator/SubImpl.cpp
@@ -25,18 +25,15 @@
 
 template <>
 void Aidge::SubImpl_cpu::forward() {
-    const std::vector<std::size_t> inputDims0 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
-                                                                   std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->dims());
-    const std::vector<std::size_t> inputDims1 = getBroadcastedDims(std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
-                                                                   std::static_pointer_cast<Tensor>(mOp.getRawInput(1))->dims());
+    const Sub_Op& op_ = dynamic_cast<const Sub_Op&>(mOp);
 
     // Find the correct kernel type
     const auto impl = Registrar<SubImpl_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Call kernel
-    impl.forward(inputDims0,
-        inputDims1,
-        std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->dims(),
+    impl.forward(op_.getInput(0)->dims(),
+        op_.getInput(1)->dims(),
+        op_.getOutput(0)->dims(),
         getCPUPtr(mOp.getRawInput(0)),
         getCPUPtr(mOp.getRawInput(1)),
         getCPUPtr(mOp.getRawOutput(0)));
diff --git a/unit_tests/CMakeLists.txt b/unit_tests/CMakeLists.txt
index 8178df93beb96a3a7538dae8d9a706380c06ecf8..5984524fdc8c596641e505897d16e12de78024cc 100644
--- a/unit_tests/CMakeLists.txt
+++ b/unit_tests/CMakeLists.txt
@@ -3,7 +3,7 @@ Include(FetchContent)
 FetchContent_Declare(
   Catch2
   GIT_REPOSITORY https://github.com/catchorg/Catch2.git
-  GIT_TAG        v3.0.1 # or a later release
+  GIT_TAG        v3.7.1 # or a later release
 )
 
 FetchContent_MakeAvailable(Catch2)
diff --git a/unit_tests/data/Test_Interpolation.cpp b/unit_tests/data/Test_Interpolation.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5c3b56f02ab17092a6ba238cc74e1bf75e203718
--- /dev/null
+++ b/unit_tests/data/Test_Interpolation.cpp
@@ -0,0 +1,237 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <aidge/backend/cpu/data/Interpolation.hpp>
+#include <aidge/data/Interpolation.hpp>
+#include <aidge/data/Tensor.hpp>
+#include <aidge/filler/Filler.hpp>
+#include <aidge/utils/Types.h>
+#include <catch2/catch_test_macros.hpp>
+#include <limits>
+
+namespace Aidge {
+
+TEST_CASE("Interpolation", "[Interpolation][Data]") {
+
+    SECTION("Linear") {
+        std::set<Interpolation::Point<int>> pointsToInterpolateInt;
+        std::set<Interpolation::Point<float>> pointsToInterpolateFloat;
+
+        SECTION("1D") {
+            pointsToInterpolateInt =
+                std::set<Interpolation::Point<int>>({{{0}, 10}, {{1}, 20}});
+            CHECK(InterpolationCPU::linear({0.5}, pointsToInterpolateInt) ==
+                  15);
+
+            pointsToInterpolateFloat = std::set<Interpolation::Point<float>>(
+                {{{0}, .0F}, {{1}, 0.2F}});
+            CHECK(fabs(InterpolationCPU::linear({0.3},
+                                                pointsToInterpolateFloat) -
+                       .06F) <= 1e-5);
+        }
+        SECTION("2D") {
+            // example taken from
+            // https://en.wikipedia.org/wiki/Bilinear_interpolation
+            pointsToInterpolateFloat = {{{14, 20}, 91.F},
+                                        {{14, 21}, 162.F},
+                                        {{15, 20}, 210.F},
+                                        {{15, 21}, 95.F}};
+            CHECK(fabs(InterpolationCPU::linear<float>(
+                           {14.5F, 20.2F},
+                           pointsToInterpolateFloat) -
+                       146.1) < 1e-5);
+            // pointsToInterpolateFloat = {{{0, 0}, .10F},
+            //                             {{0, 1}, .20F},
+            //                             {{1, 0}, .30F},
+            //                             {{1, 1}, .40F}};
+            // CHECK(abs(InterpolationCPU::linear<float>({1.5, 0.5},
+            //                                         pointsToInterpolateInt)
+            //                                         -
+            //           25) < std::numeric_limits<int>::epsilon());
+
+            // pointsToInterpolateFloat = std::vector({0.1F, 0.2F, 0.3F,
+            // 0.4F}); CHECK(InterpolationCPU::linear(pointsToInterpolateFloat)
+            // == .25f);
+        }
+        SECTION("3D") {
+            pointsToInterpolateFloat = {{{0, 0, 0}, .1F},
+                                        {{0, 0, 1}, .2F},
+                                        {{0, 1, 0}, .3F},
+                                        {{0, 1, 1}, .4F},
+                                        {{1, 0, 0}, .5F},
+                                        {{1, 0, 1}, .6F},
+                                        {{1, 1, 0}, .7F},
+                                        {{1, 1, 1}, .8F}};
+            CHECK(fabs(InterpolationCPU::linear({.5, .5, .5},
+                                                pointsToInterpolateFloat) -
+                       .45f) < 1e-5);
+        }
+        SECTION("4D") {
+            SECTION("Casual") {
+                pointsToInterpolateFloat = {{{0, 0, 0, 0}, .1F},
+                                            {{0, 0, 0, 1}, .2F},
+                                            {{0, 0, 1, 0}, .3F},
+                                            {{0, 0, 1, 1}, .4F},
+                                            {{0, 1, 0, 0}, .5F},
+                                            {{0, 1, 0, 1}, .6F},
+                                            {{0, 1, 1, 0}, .7F},
+                                            {{0, 1, 1, 1}, .8F},
+                                            {{1, 0, 0, 0}, .9F},
+                                            {{1, 0, 0, 1}, 1.F},
+                                            {{1, 0, 1, 0}, 1.1F},
+                                            {{1, 0, 1, 1}, 1.2F},
+                                            {{1, 1, 0, 0}, 1.3F},
+                                            {{1, 1, 0, 1}, 1.4F},
+                                            {{1, 1, 1, 0}, 1.5F},
+                                            {{1, 1, 1, 1}, 1.6F}};
+                CHECK(fabs(InterpolationCPU::linear<float>(
+                               {.5, .5, .5, .5},
+                               pointsToInterpolateFloat) -
+                           .85f) < 0.0001);
+            }
+        }
+        SECTION("Some of the coords to interpolate were round") {
+            // In this case retrieveNeighbours()
+            //  only retrieved the neighbours against not round dimensions
+            auto tensor =
+                std::make_shared<Tensor>(std::vector<DimSize_t>({10, 10}));
+            tensor->setDataType(DataType::Float32);
+            tensor->setBackend("cpu");
+            Aidge::constantFiller(tensor, 1337.F);
+
+            std::set<Interpolation::Point<float>> expectedResult = {
+                {{0, 0, -1, -1}, 0.F},
+                {{0, 0, 0, -1}, 0.F},
+                {{0, 0, -1, 0}, 0.F},
+                {{0, 0, 0, 0}, 1337.F}};
+
+            pointsToInterpolateFloat = Interpolation::retrieveNeighbours(
+                reinterpret_cast<float *>(tensor->getImpl()->rawPtr()),
+                tensor->dims(),
+                std::vector<float>({0.F, 0.F, -0.25F, -0.25F}));
+
+            CHECK(pointsToInterpolateFloat == expectedResult);
+        }
+    }
+    SECTION("Nearest") {
+        std::set<Interpolation::Point<float>> pointsToInterpolate;
+        std::vector<float> coordToInterpolate;
+        SECTION("1D") {
+            coordToInterpolate = {0.5F};
+            pointsToInterpolate =
+                std::set<Interpolation::Point<float>>{{{0}, 1.0F},
+                                                      {{1}, 2.0F},
+                                                      {{2}, 3.0F},
+                                                      {{3}, 4.0F},
+                                                      {{4}, 5.0F}};
+
+            SECTION("Floor") {
+                CHECK(InterpolationCPU::nearest(
+                          coordToInterpolate,
+                          pointsToInterpolate,
+                          Interpolation::Mode::Floor) == 1);
+            }
+            SECTION("Ceil") {
+                CHECK(InterpolationCPU::nearest(
+                          coordToInterpolate,
+                          pointsToInterpolate,
+                          Interpolation::Mode::Ceil) == 2);
+            }
+            SECTION("RoundPreferFloor") {
+                CHECK(InterpolationCPU::nearest(
+                          coordToInterpolate,
+                          pointsToInterpolate,
+                          Interpolation::Mode::RoundPreferFloor) == 1);
+            }
+            SECTION("RoundPreferCeil") {
+                CHECK(InterpolationCPU::nearest(
+                          coordToInterpolate,
+                          pointsToInterpolate,
+                          Interpolation::Mode::RoundPreferCeil) == 2);
+            }
+        }
+        SECTION("2D") {
+            coordToInterpolate = {2.5F, 3.97F};
+            pointsToInterpolate = {{{0, 0}, 10.0},
+                                   {{1, 1}, 20.0},
+                                   {{2, 3}, 30.0},
+                                   {{2, 4}, 40.0},
+                                   {{3, 3}, 50.0},
+                                   {{3, 4}, 60.0}};
+            SECTION("Floor") {
+                CHECK(InterpolationCPU::nearest(
+                          coordToInterpolate,
+                          pointsToInterpolate,
+                          Interpolation::Mode::Floor) == 30.);
+            }
+            SECTION("Ceil") {
+                CHECK(InterpolationCPU::nearest(
+                          coordToInterpolate,
+                          pointsToInterpolate,
+                          Interpolation::Mode::Ceil) == 60.);
+            }
+            SECTION("RoundPreferFloor") {
+                CHECK(InterpolationCPU::nearest(
+                          coordToInterpolate,
+                          pointsToInterpolate,
+                          Interpolation::Mode::RoundPreferFloor) ==
+                      40.);
+            }
+            SECTION("RoundPreferCeil") {
+                CHECK(InterpolationCPU::nearest(
+                          coordToInterpolate,
+                          pointsToInterpolate,
+                          Interpolation::Mode::RoundPreferCeil) == 60.);
+            }
+        }
+        SECTION("3D") {
+            coordToInterpolate = {1.9f, 2.1f, 3.6f};
+            pointsToInterpolate = {{{0, 0, 0}, 5.0},
+                                   {{1, 2, 3}, 10.0},
+                                   {{2, 1, 4}, 20.0},
+                                   {{2, 2, 4}, 30.0},
+                                   {{2, 3, 3}, 40.0},
+                                   {{2, 3, 4}, 50.0},
+                                   {{3, 3, 4}, 60.0}};
+            SECTION("Floor") {
+                CHECK(InterpolationCPU::nearest(
+                          coordToInterpolate,
+                          pointsToInterpolate,
+                          Interpolation::Mode::Floor) == 10.);
+            }
+            SECTION("Ceil") {
+                CHECK(InterpolationCPU::nearest(
+                          coordToInterpolate,
+                          pointsToInterpolate,
+                          Interpolation::Mode::Ceil) == 50.);
+            }
+            SECTION("RoundPreferFloor") {
+                CHECK(InterpolationCPU::nearest(
+                          coordToInterpolate,
+                          pointsToInterpolate,
+                          Interpolation::Mode::RoundPreferFloor) ==
+                      30.);
+            }
+            SECTION("RoundPreferCeil") {
+                CHECK(InterpolationCPU::nearest(
+                          coordToInterpolate,
+                          pointsToInterpolate,
+                          Interpolation::Mode::RoundPreferCeil) == 30.);
+            }
+        }
+    }
+}
+} // namespace Aidge
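
The 2D expectation above can be checked by hand: bilinear interpolation factorises into two 1-D lerps in x followed by one in y. A standalone check reproducing the 146.1 value (plain C++, independent of Aidge):

    #include <cstdio>

    int main() {
        // corner values f(x, y) from the Wikipedia example
        const float f14_20 = 91.f, f14_21 = 162.f, f15_20 = 210.f, f15_21 = 95.f;
        const float tx = 14.5f - 14.f; // 0.5
        const float ty = 20.2f - 20.f; // 0.2
        const float atY20 = (1 - tx) * f14_20 + tx * f15_20; // 150.5
        const float atY21 = (1 - tx) * f14_21 + tx * f15_21; // 128.5
        std::printf("%f\n", (1 - ty) * atY20 + ty * atY21);  // 146.1
        return 0;
    }
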
diff --git a/unit_tests/data/Test_TensorImpl.cpp b/unit_tests/data/Test_TensorImpl.cpp
index 5f870acfb44366632474b7290228658d7a4701dd..fd938f10a947d1520600a1d00022eeb970cd76e6 100644
--- a/unit_tests/data/Test_TensorImpl.cpp
+++ b/unit_tests/data/Test_TensorImpl.cpp
@@ -25,7 +25,7 @@
 
 namespace Aidge {
 
-TEST_CASE("Test addition of Tensors","[TensorImpl][Add]") {
+TEST_CASE("Test addition of Tensors","[TensorImpl][Add][Data]") {
     constexpr std::uint16_t NBTRIALS = 10;
     // Create a random number generator
     std::random_device rd;
@@ -35,7 +35,7 @@ TEST_CASE("Test addition of Tensors","[TensorImpl][Add]") {
     std::uniform_int_distribution<int> boolDist(0,1);
 
     // Create MatMul Operator
-    std::shared_ptr<Node> mySub = Add(2);
+    std::shared_ptr<Node> mySub = Add();
     auto op = std::static_pointer_cast<OperatorTensor>(mySub-> getOperator());
     op->setDataType(DataType::Float32);
     op->setBackend("cpu");
@@ -193,4 +193,100 @@ TEST_CASE("Test division of Tensors","[TensorImpl][Div]") {
     Tensor T3(T1.dims());
     REQUIRE_THROWS(T0 / T3);
 }
+
+TEST_CASE("Tensor arithmetic operators", "[Tensor][Operator][CPU]") {
+    SECTION("Addition") {
+        const Tensor t = Array1D<std::int32_t, 5>{1,2,3,4,5};
+        const Tensor t2 = Array1D<std::int32_t, 5>{10,20,30,40,50};
+        const Tensor t3 = Tensor(std::int32_t(3));
+
+        SECTION("operator+") {
+            auto a = t.clone();
+            auto b = t2.clone();
+            auto c = t3.clone();
+
+            // simple addition
+            auto r1 = a + b;
+            const Tensor expected_res_simple = Array1D<std::int32_t, 5>{11,22,33,44,55};
+
+            // input tensors are not modified
+            REQUIRE(a == t);
+            REQUIRE(b == t2);
+            // result is right
+            REQUIRE(r1 == expected_res_simple);
+
+            // simple addition of an arithmetic value
+            auto r2 = a + 10;
+            const Tensor expected_res_simple_arithmetic = Array1D<std::int32_t, 5>{11,12,13,14,15};
+
+            // input tensors are not modified
+            REQUIRE(a == t);
+            // result is right
+            REQUIRE(r2 == expected_res_simple_arithmetic);
+
+
+            // chained addition a+b+c
+            auto r3 = a + b + c;
+            const Tensor expected_res_chained = Array1D<std::int32_t, 5>{14,25,36,47,58};
+
+            // input Tensors are not modified
+            REQUIRE(a == t);
+            REQUIRE(b == t2);
+            REQUIRE(c == t3);
+            // result is right
+            REQUIRE(r3 == expected_res_chained);
+        }
+        SECTION("operator+=") {
+            auto a = t.clone();
+            auto b = t2.clone();
+
+            a += b;
+            const Tensor expected_res = Array1D<std::int32_t, 5>{11,22,33,44,55};
+
+            // input tensors are not modified
+            REQUIRE(b == t2);
+            // result is right
+            REQUIRE(a == expected_res);
+
+            // simple addition of an arithmetic value
+            a = t.clone();
+            a += 10;
+            const Tensor expected_res_arithmetic = Array1D<std::int32_t, 5>{11,12,13,14,15};
+
+            // result is right
+            REQUIRE(a == expected_res_arithmetic);
+        }
+    }
+    SECTION("Substraction") {
+        const Tensor t = Array1D<std::int32_t, 5>{1,2,3,4,5};
+        const Tensor t2 = Tensor(std::int32_t(3));
+
+        SECTION("operator-") {
+            auto a = t.clone();
+            auto b = t2.clone();
+
+            // simple subtraction
+            auto r1 = a - b;
+            const Tensor expected_res_simple = Array1D<std::int32_t, 5>{-2,-1,0,1,2};
+
+            // input tensors are not modified
+            REQUIRE(a == t);
+            REQUIRE(b == t2);
+            // result is right
+            REQUIRE(r1 == expected_res_simple);
+        }
+        SECTION("operator-=") {
+            auto a = t.clone();
+            auto b = t2.clone();
+
+            a -= b;
+            const Tensor expected_res = Array1D<std::int32_t, 5>{-2,-1,0,1,2};
+
+            // input tensors are not modified
+            REQUIRE(b == t2);
+            // result is right
+            REQUIRE(a == expected_res);
+        }
+    }
+}
 } // namespace Aidge
diff --git a/unit_tests/operator/Test_AddImpl.cpp b/unit_tests/operator/Test_AddImpl.cpp
index 95a0e96fe6cf8c19beeef2bdbae3c07873996dcf..bca4025705cb1c851dcf3e9accbf016c4535120a 100644
--- a/unit_tests/operator/Test_AddImpl.cpp
+++ b/unit_tests/operator/Test_AddImpl.cpp
@@ -39,17 +39,6 @@ TEST_CASE("[cpu/operator] Add(forward)", "[Add][CPU]") {
         }                                       //
     });                                         //
 
-    SECTION("One input") {
-        std::shared_ptr<Node> myAdd = Add(1);
-        auto op = std::static_pointer_cast<OperatorTensor>(myAdd -> getOperator());
-        op->associateInput(0, input1);
-        op->setBackend("cpu");
-        op->setDataType(DataType::Int32);
-        myAdd->forward();
-
-        REQUIRE(*(op->getOutput(0)) == *input1);
-    }
-
     SECTION("Two inputs") {
         std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
             {
@@ -71,7 +60,7 @@ TEST_CASE("[cpu/operator] Add(forward)", "[Add][CPU]") {
             }
         });
 
-        std::shared_ptr<Node> myAdd = Add(2);
+        std::shared_ptr<Node> myAdd = Add();
         auto op = std::static_pointer_cast<OperatorTensor>(myAdd -> getOperator());
         op->associateInput(0, input1);
         op->associateInput(1, input1);
@@ -82,39 +71,6 @@ TEST_CASE("[cpu/operator] Add(forward)", "[Add][CPU]") {
         REQUIRE(*(op->getOutput(0)) == *expectedOutput);
     }
 
-    SECTION("Three inputs") {
-        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
-            {
-                {
-                    {{ 60, 141},{ 63, 144},{ 66, 147}},
-                    {{ 69, 150},{ 72, 153},{ 75, 156}},
-                    {{ 78, 159},{ 81, 162},{ 84, 165}}
-                },
-                {
-                    {{ 87, 168},{ 90, 171},{ 93, 174}},
-                    {{ 96, 177},{ 99, 180},{102, 183}},
-                    {{105, 186},{108, 189},{111, 192}}
-                },
-                {
-                    {{114, 195},{117, 198},{120, 201}},
-                    {{123, 204},{126, 207},{129, 210}},
-                    {{132, 213},{135, 216},{138, 219}}
-                }
-            }
-        });
-
-        std::shared_ptr<Node> myAdd = Add(3);
-        auto op = std::static_pointer_cast<OperatorTensor>(myAdd -> getOperator());
-        op->associateInput(0, input1);
-        op->associateInput(1, input1);
-        op->associateInput(2, input1);
-        op->setDataType(DataType::Int32);
-        op->setBackend("cpu");
-        myAdd->forward();
-
-        REQUIRE(*op->getOutput(0) == *expectedOutput);
-    }
-
     SECTION("Broadcasting") {
         std::shared_ptr<Tensor> input_0 = std::make_shared<Tensor>(Array4D<int,3,1,3,2> {
         {                                       //
@@ -139,7 +95,7 @@ TEST_CASE("[cpu/operator] Add(forward)", "[Add][CPU]") {
         }                                       //
         });                                     //
 
-        std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<int,2> {{100,200}});  
+        std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<int,2> {{100,200}});
         std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
             {                                               //
                 {                                           //
@@ -160,16 +116,23 @@ TEST_CASE("[cpu/operator] Add(forward)", "[Add][CPU]") {
             }                                               //
         });                                                 //
 
-        std::shared_ptr<Node> myAdd = Add(3);
-        auto op = std::static_pointer_cast<OperatorTensor>(myAdd -> getOperator());
-        op->associateInput(0, input_0);
-        op->associateInput(1, input_1);
-        op->associateInput(2, input_2);
-        op->setDataType(DataType::Int32);
-        op->setBackend("cpu");
-        myAdd->forward();
-        op->getOutput(0)->print();
+        std::shared_ptr<Node> myAdd_0 = Add();
+        std::shared_ptr<Node> myAdd_1 = Add();
+        auto op_0 = std::static_pointer_cast<OperatorTensor>(myAdd_0 -> getOperator());
+        auto op_1 = std::static_pointer_cast<OperatorTensor>(myAdd_1 -> getOperator());
+        op_0->associateInput(0, input_0);
+        op_0->associateInput(1, input_1);
+
+        op_1->associateInput(0, input_2);
+        op_1->associateInput(1, op_0->getOutput(0));
+        op_0->setDataType(DataType::Int32);
+        op_1->setDataType(DataType::Int32);
+        op_0->setBackend("cpu");
+        op_1->setBackend("cpu");
+        myAdd_0->forward();
+        myAdd_1->forward();
+        op_1->getOutput(0)->print();
         expectedOutput->print();
-        REQUIRE(*op->getOutput(0) == *expectedOutput);
+        REQUIRE(*op_1->getOutput(0) == *expectedOutput);
     }
 }
\ No newline at end of file
diff --git a/unit_tests/operator/Test_Atan.cpp b/unit_tests/operator/Test_Atan.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9548e35d81b0423125424a4198d82558c4e57df4
--- /dev/null
+++ b/unit_tests/operator/Test_Atan.cpp
@@ -0,0 +1,77 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/data/Tensor.hpp"
+#include "aidge/operator/Atan.hpp"
+
+#include "aidge/backend/cpu.hpp"
+
+#include <memory>
+
+using namespace Aidge;
+
+TEST_CASE("[cpu/operator] Atan(forward)") {
+  SECTION("1D Tensor") {
+    std::shared_ptr<Tensor> input0 =
+        std::make_shared<Tensor>(Array1D<float, 10>{
+            {0.41384590, 0.43120754, 0.93762982, 0.31049860, 0.77547199,
+             0.09514862, 0.16145366, 0.42776686, 0.43487436, 0.41170865}});
+    std::shared_ptr<Tensor> expectedOutput =
+        std::make_shared<Tensor>(Array1D<float, 10>{
+            {0.39238522, 0.40711672, 0.75322037, 0.30106049, 0.65960488,
+             0.09486303, 0.16007232, 0.40421187, 0.4102045, 0.39055911}});
+
+    std::shared_ptr<Node> myAtan = Atan();
+    auto op = std::static_pointer_cast<OperatorTensor>(myAtan->getOperator());
+    op->associateInput(0, input0);
+    op->setDataType(DataType::Float32);
+    op->setBackend("cpu");
+    myAtan->forward();
+
+    float* resPtr = static_cast<float*>(op->getOutput(0)->getImpl()->rawPtr());
+    float* expectedPtr =
+        static_cast<float*>(expectedOutput->getImpl()->rawPtr());
+    for (std::size_t i = 0; i < expectedOutput->size(); ++i) {
+      REQUIRE(std::abs(resPtr[i] - expectedPtr[i]) < 0.00001);
+    }
+  }
+
+  SECTION("3D Tensor") {
+    std::shared_ptr<Tensor> input0 = std::make_shared<Tensor>(
+        Array3D<float, 2, 2, 3>{{{
+                                     {0.97037154, 0.86208081, 0.77767169},
+                                     {0.38160080, 0.11422747, 0.77284443},
+                                 },
+                                 {{0.51592529, 0.72543722, 0.54641193},
+                                  {0.93866944, 0.97767913, 0.34172094}}}});
+    std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(
+        Array3D<float, 2, 2, 3>{{{{0.77036231, 0.71146592, 0.66097706},
+                                  {0.36454508, 0.11373451, 0.65796196}},
+                                 {{0.47630652, 0.62759472, 0.50008428},
+                                  {0.75377332, 0.77411225, 0.32928031}}}});
+
+    std::shared_ptr<Node> myAtan = Atan();
+    auto op = std::static_pointer_cast<OperatorTensor>(myAtan->getOperator());
+    op->associateInput(0, input0);
+    op->setDataType(DataType::Float32);
+    op->setBackend("cpu");
+    myAtan->forward();
+
+    float* resPtr = static_cast<float*>(op->getOutput(0)->getImpl()->rawPtr());
+    float* expectedPtr =
+        static_cast<float*>(expectedOutput->getImpl()->rawPtr());
+    for (std::size_t i = 0; i < expectedOutput->size(); ++i) {
+      REQUIRE(std::abs(resPtr[i] - expectedPtr[i]) < 0.00001);
+    }
+  }
+}
diff --git a/unit_tests/operator/Test_ClipImpl.cpp b/unit_tests/operator/Test_ClipImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..45c8da5bf7ecc84fad6b3e694fe204540f579af3
--- /dev/null
+++ b/unit_tests/operator/Test_ClipImpl.cpp
@@ -0,0 +1,318 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <catch2/catch_test_macros.hpp>
+#include <cstddef>  // std::size_t
+#include <cstdint>  // std::uint16_t
+#include <chrono>
+#include <iostream>
+#include <vector>
+#include <algorithm>
+#include <iomanip>
+#include <memory>
+#include <random>   // std::random_device, std::mt19937, std::uniform_real_distribution
+
+#include "aidge/data/Tensor.hpp"
+#include "aidge/operator/Clip.hpp"
+#include "aidge/operator/OperatorTensor.hpp"
+#include "aidge/utils/TensorUtils.hpp"
+#include "aidge/backend/cpu.hpp"
+
+void ComputeClipBackward(const std::vector<float>& vec1, std::vector<float>& vec2, float min, float max) {
+    if (vec1.size() != vec2.size()) {
+        std::cerr << "Vectors should have the same sizes." << std::endl;
+        return;
+    }
+
+    for (size_t i = 0; i < vec1.size(); ++i) {
+        if (vec1[i] < min || vec1[i] > max) {
+            vec2[i] = 0.0f;
+        }
+    }
+}
+namespace Aidge {
+
+TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]") {
+    const std::uint16_t NBTRIALS = 10;
+    // Create a random number generator
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_real_distribution<float> dis(0.0, 10.0);
+    std::uniform_real_distribution<float> dismin(0.0, 4.5);
+    std::uniform_real_distribution<float> dismax(5.5, 10.0);
+    std::uniform_int_distribution<std::size_t> distDims(5,15);
+    std::uniform_int_distribution<std::size_t> distNbMatrix(1, 5);
+
+    // Create Clip Operator
+    std::shared_ptr<Node> myClip = Aidge::Clip("nop");
+    auto op = std::static_pointer_cast<OperatorTensor>(myClip -> getOperator());
+
+    // To measure execution time of 'Clip_Op::forward()' member function call
+    std::chrono::time_point<std::chrono::system_clock> start;
+    std::chrono::time_point<std::chrono::system_clock> end;
+    std::chrono::duration<double, std::micro> duration{};  // zero-initialized, accumulated below
+
+    SECTION("Simple clip test [Forward]") {
+        std::size_t totalComputation = 0;
+        for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+            // generate Tensors dimensions
+            const std::size_t dim0 = distDims(gen);
+            const std::size_t dim1 = distDims(gen);
+            totalComputation += dim0*dim1;
+
+            // Create and populate the array with random float values
+            std::vector<float> Array(dim0*dim1);
+            for (std::size_t i = 0; i < dim0*dim1; ++i) {
+                Array[i] = dis(gen); // Generate random float value
+            }
+
+            // Convert Input to Tensor
+            std::shared_ptr<Tensor> TInput = std::make_shared<Tensor>(DataType::Float32);
+            TInput -> resize({dim0,dim1});
+            TInput -> setBackend("cpu");
+            TInput -> getImpl() -> setRawPtr(Array.data(), dim0*dim1);
+            
+            float min = dismin(gen);
+            std::shared_ptr<Tensor> Tmin = std::make_shared<Tensor>(DataType::Float32);
+            Tmin -> resize({});
+            Tmin -> setBackend("cpu");
+            Tmin -> getImpl() -> setRawPtr(&min,1);
+
+            float max = dismax(gen);
+            std::shared_ptr<Tensor> Tmax = std::make_shared<Tensor>(DataType::Float32);
+            Tmax -> resize({});
+            Tmax -> setBackend("cpu");
+            Tmax -> getImpl() -> setRawPtr(&max,1);
+            // Compute the expected (clipped) reference values
+            std::vector<float> GT(Array, Array + (dim0*dim1));
+            for (float& val : GT) {
+                val = std::max(min, std::min(val, max));
+            }
+            std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>(DataType::Float32);
+            Tres -> resize({dim0,dim1});
+            Tres -> setBackend("cpu");
+            Tres -> getImpl() -> setRawPtr(GT.data(), dim0*dim1);
+
+            op->associateInput(0, TInput);
+            op->associateInput(1, Tmin);
+            op->associateInput(2, Tmax);
+            op->setDataType(DataType::Float32);
+            op->setBackend("cpu");
+            op->forwardDims(true);
+            
+            start = std::chrono::system_clock::now();
+            myClip->forward();
+            end = std::chrono::system_clock::now();
+
+            duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+            REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
+
+            delete[] Array;
+        }
+        std::cout << "elements over time spent: " << totalComputation/duration.count() << std::endl;
+        std::cout << "total time: " << duration.count() << std::endl;
+    } 
+    SECTION("Clip test with min >= max [Forward]") {
+        std::size_t totalComputation = 0;
+        for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+            // generate Tensors dimensions
+            const std::size_t dim0 = distDims(gen);
+            const std::size_t dim1 = distDims(gen);
+            totalComputation += dim0*dim1;
+
+            // Create and populate the array with random float values
+            float* Array = new float[dim0*dim1];
+            for (std::size_t i = 0; i < dim0*dim1; ++i) {
+                Array[i] = dis(gen); // Generate random float value
+            }
+
+            // Convert Input to Tensor
+            std::shared_ptr<Tensor> TInput = std::make_shared<Tensor>(DataType::Float32);
+            TInput -> resize({dim0,dim1});
+            TInput -> setBackend("cpu");
+            TInput -> getImpl() -> setRawPtr(Array, dim0*dim1);
+            
+            float min = dismax(gen);
+            std::shared_ptr<Tensor> Tmin = std::make_shared<Tensor>(DataType::Float32);
+            Tmin -> resize({});
+            Tmin -> setBackend("cpu");
+            Tmin -> getImpl() -> setRawPtr(&min,1);
+
+            float max = dismin(gen); // min is drawn from [5.5, 10] and max from [0, 4.5], so min > max in every trial
+            std::shared_ptr<Tensor> Tmax = std::make_shared<Tensor>(DataType::Float32);
+            Tmax -> resize({});
+            Tmax -> setBackend("cpu");
+            Tmax -> getImpl() -> setRawPtr(&max,1);
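+            // With min > max, clip(x) = min(max(x, min), max) saturates every
+            // element to max (the ONNX-style semantics this test assumes).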
+            // Compute the expected reference values
+            std::vector<float> GT(Array, Array + (dim0*dim1));
+            for (float& val : GT) {
+                val = max;
+            }
+            std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>(DataType::Float32);
+            Tres -> resize({dim0,dim1});
+            Tres -> setBackend("cpu");
+            Tres -> getImpl() -> setRawPtr(GT.data(), dim0*dim1);
+
+            op->associateInput(0, TInput);
+            op->associateInput(1, Tmin);
+            op->associateInput(2, Tmax);
+            op->setDataType(DataType::Float32);
+            op->setBackend("cpu");
+            op->forwardDims(true);
+            
+            start = std::chrono::system_clock::now();
+            myClip->forward();
+            end = std::chrono::system_clock::now();
+
+            duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+            REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
+
+            delete[] Array;
+        }
+        std::cout << "elements over time spent: " << totalComputation/duration.count() << std::endl;
+        std::cout << "total time: " << duration.count() << std::endl;
+    } 
+    SECTION("Clip with Clip Attr [Forward]")
+    {
+        std::size_t totalComputation = 0;
+        for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) 
+        {
+
+            float min = dismin(gen);
+            float max = dismax(gen);
+            std::shared_ptr<Node> myCl = Aidge::Clip("", min, max);
+            auto op = std::static_pointer_cast<OperatorTensor>(myCl -> getOperator());
+
+            // generate Tensors dimensions
+            const std::size_t dim0 = 3;
+            const std::size_t dim1 = 3;
+            totalComputation += dim0*dim1;
+
+            // Create and populate the array with random float values
+            float* Array = new float[dim0*dim1];
+            for (std::size_t i = 0; i < dim0*dim1; ++i) {
+                Array[i] = dis(gen); // Generate random float value
+            }
+            // Convert Input to Tensor
+            std::shared_ptr<Tensor> TInput = std::make_shared<Tensor>(DataType::Float32);
+            TInput -> resize({dim0,dim1});
+            TInput -> setBackend("cpu");
+            TInput -> getImpl() -> setRawPtr(Array, dim0*dim1);
+
+            // Compute the expected (clipped) reference values
+            std::vector<float> GT(Array, Array + (dim0*dim1));
+            for (float& val : GT) {
+                val = std::max(min, std::min(val, max));
+            }
+            std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>(DataType::Float32);
+            Tres -> resize({dim0,dim1});
+            Tres -> setBackend("cpu");
+            Tres -> getImpl() -> setRawPtr(GT.data(), dim0*dim1);
+            op->associateInput(0, TInput);
+            op->setDataType(DataType::Float32);
+            op->setBackend("cpu");
+            op->forwardDims(true);
+            start = std::chrono::system_clock::now();
+            myCl->forward();
+            end = std::chrono::system_clock::now();
+
+            duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+            REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
+
+            delete[] Array;
+        }
+        std::cout << "elements over time spent: " << totalComputation/duration.count() << std::endl;
+        std::cout << "total time: " << duration.count() << std::endl;
+    }
+    SECTION("Simple clip test [Backward]") {
+        std::size_t totalComputation = 0;
+        duration = std::chrono::duration<double, std::micro>::zero();
+        for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+            std::size_t totalComputation = 0;
+        for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+            // generate Tensors dimensions
+            const std::size_t dim0 = distDims(gen);
+            const std::size_t dim1 = distDims(gen);
+            totalComputation += dim0*dim1;
+
+            // Create and populate the array with random float values
+            float* Array = new float[dim0*dim1];
+            float* gradArray = new float[dim0*dim1];
+            for (std::size_t i = 0; i < dim0*dim1; ++i) {
+                Array[i] = dis(gen); // Generate random float value
+                gradArray[i] = dis(gen);
+            }
+
+            std::shared_ptr<Tensor> TGrad = std::make_shared<Tensor>(DataType::Float32);
+            TGrad -> resize({dim0,dim1});
+            TGrad -> setBackend("cpu");
+            TGrad -> getImpl() -> setRawPtr(gradArray, dim0*dim1);
+
+            // Convert Input to Tensor
+            std::shared_ptr<Tensor> TInput = std::make_shared<Tensor>(DataType::Float32);
+            TInput -> resize({dim0,dim1});
+            TInput -> setBackend("cpu");
+            TInput -> getImpl() -> setRawPtr(Array, dim0*dim1);
+            
+            float min = dismin(gen);
+            std::shared_ptr<Tensor> Tmin = std::make_shared<Tensor>(DataType::Float32);
+            Tmin -> resize({});
+            Tmin -> setBackend("cpu");
+            Tmin -> getImpl() -> setRawPtr(&min,1);
+
+            float max = dismax(gen);
+            std::shared_ptr<Tensor> Tmax = std::make_shared<Tensor>(DataType::Float32);
+            Tmax -> resize({});
+            Tmax -> setBackend("cpu");
+            Tmax -> getImpl() -> setRawPtr(&max,1);
+            // Compute the expected (clipped) reference values
+            std::vector<float> GT(Array, Array + (dim0*dim1));
+            for (float& val : GT) {
+                val = std::max(min, std::min(val, max)); // Clip operation
+            }
+            std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>(DataType::Float32);
+            Tres -> resize({dim0,dim1});
+            Tres -> setBackend("cpu");
+            Tres -> getImpl() -> setRawPtr(GT.data(), dim0*dim1);
+
+            op->associateInput(0, TInput);
+            op->associateInput(1, Tmin);
+            op->associateInput(2, Tmax);
+            op->setDataType(DataType::Float32);
+            op->setBackend("cpu");
+            op->forwardDims(true);
+            myClip->forward();
+
+            op->getOutput(0)->setGrad(TGrad);
+            
+            start = std::chrono::system_clock::now();
+            REQUIRE_NOTHROW(myClip->backward());
+            end = std::chrono::system_clock::now();
+
+            auto GradTensor = op->getInput(0)->grad();
+            float* BackwardTensor = (float*)GradTensor->getImpl()->rawPtr();
+            std::vector<float> GT0(Array,Array+(dim0*dim1));
+            std::vector<float> GT1(gradArray,gradArray+(dim0*dim1));
+            std::vector<float> BackwardTensorVec(BackwardTensor,BackwardTensor+(dim0*dim1));
+            ComputeClipBackward(GT0,GT1,min,max);
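+            // GT1 now holds the reference gradient: the upstream gradient
+            // masked to zero wherever the input fell outside [min, max].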
+            duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+            REQUIRE(GT1 == BackwardTensorVec);
+
+            delete[] Array;
+            delete[] gradArray;
+        }
+        std::cout << "elements over time spent: " << totalComputation/duration.count() << std::endl;
+        std::cout << "total time: " << duration.count() << std::endl;
+    }
+}
+} // namespace Aidge
\ No newline at end of file
diff --git a/unit_tests/operator/Test_ConvDepthWiseImpl.cpp b/unit_tests/operator/Test_ConvDepthWiseImpl.cpp
index e4e46de91bfbc38f41520f1edfc7e99d197e5c83..f1594ef5a21070803a7b86861eac513708ec03a2 100644
--- a/unit_tests/operator/Test_ConvDepthWiseImpl.cpp
+++ b/unit_tests/operator/Test_ConvDepthWiseImpl.cpp
@@ -11,144 +11,219 @@
 
 #include <catch2/catch_test_macros.hpp>
 #include <memory>
+#include <vector>
 
+#include "aidge/backend/cpu/operator/ConvDepthWiseImpl.hpp"
+#include "aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/ConvDepthWise.hpp"
-
-#include "aidge/backend/cpu.hpp"
+#include "aidge/utils/TensorUtils.hpp"
 
 using namespace Aidge;
 
 TEST_CASE("[cpu/operator] ConvDepthWise(forward)", "[ConvDepthWise][CPU]") {
-    std::shared_ptr<Node> myCDW = ConvDepthWise(4, {3,3}, "mycdw");
-    auto op = std::static_pointer_cast<OperatorTensor>(myCDW -> getOperator());
-    std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array4D<int,4,1,3,3> {
-        {
-            {{
-                {  0,  1,  2},
-                {  3,  4,  5},
-                {  6,  7,  8}
-
-            }},
-            {{
-                { 27, 28, 29},
-                { 30, 31, 32},
-                { 33, 34, 35}
-
-            }},
-            {{
-                { 54, 55, 56},
-                { 57, 58, 59},
-                { 60, 61, 62}
-            }},
-            {{
-                { 81, 82, 83},
-                { 84, 85, 86},
-                { 87, 88, 89}
-            }}
-        }
-    });
-    std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<int,4> {{7,0,9,0}});
-    std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<int,2,4,5,5> { //NCHW
-        {
-            {
-                {{  0,   1,   2,   3,   4},
-                 {  5,   6,   7,   8,   9},
-                 { 10,  11,  12,  13,  14},
-                 { 15,  16,  17,  18,  19},
-                 { 20,  21,  22,  23,  24}},
-
-                {{ 25,  26,  27,  28,  29},
-                 { 30,  31,  32,  33,  34},
-                 { 35,  36,  37,  38,  39},
-                 { 40,  41,  42,  43,  44},
-                 { 45,  46,  47,  48,  49}},
-
-                {{ 50,  51,  52,  53,  54},
-                 { 55,  56,  57,  58,  59},
-                 { 60,  61,  62,  63,  64},
-                 { 65,  66,  67,  68,  69},
-                 { 70,  71,  72,  73,  74}},
-
-                {{ 75,  76,  77,  78,  79},
-                 { 80,  81,  82,  83,  84},
-                 { 85,  86,  87,  88,  89},
-                 { 90,  91,  92,  93,  94},
-                 { 95,  96,  97,  98,  99}}
-            },
+    SECTION("k[3,3]") {
+        std::shared_ptr<Node> myCDW = ConvDepthWise(4, {3,3}, "mycdw");
+        auto op = std::static_pointer_cast<OperatorTensor>(myCDW -> getOperator());
+        std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array4D<int,4,1,3,3> {
             {
-                {{100, 101, 102, 103, 104},
-                 {105, 106, 107, 108, 109},
-                 {110, 111, 112, 113, 114},
-                 {115, 116, 117, 118, 119},
-                 {120, 121, 122, 123, 124}},
-
-                {{125, 126, 127, 128, 129},
-                 {130, 131, 132, 133, 134},
-                 {135, 136, 137, 138, 139},
-                 {140, 141, 142, 143, 144},
-                 {145, 146, 147, 148, 149}},
-
-                {{150, 151, 152, 153, 154},
-                 {155, 156, 157, 158, 159},
-                 {160, 161, 162, 163, 164},
-                 {165, 166, 167, 168, 169},
-                 {170, 171, 172, 173, 174}},
-
-                {{175, 176, 177, 178, 179},
-                 {180, 181, 182, 183, 184},
-                 {185, 186, 187, 188, 189},
-                 {190, 191, 192, 193, 194},
-                 {195, 196, 197, 198, 199}}
+                {{
+                    {  0,  1,  2},
+                    {  3,  4,  5},
+                    {  6,  7,  8}
+
+                }},
+                {{
+                    { 27, 28, 29},
+                    { 30, 31, 32},
+                    { 33, 34, 35}
+
+                }},
+                {{
+                    { 54, 55, 56},
+                    { 57, 58, 59},
+                    { 60, 61, 62}
+                }},
+                {{
+                    { 81, 82, 83},
+                    { 84, 85, 86},
+                    { 87, 88, 89}
+                }}
             }
-        }
-    });
-    std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<int,2,4,3,3> {
-        {
+        });
+        std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<int,4> {{7,0,9,0}});
+        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<int,2,4,5,5> { //NCHW
             {
-                {{   319,    355,    391},
-                 {   499,    535,    571},
-                 {   679,    715,    751}},
-
-                {{  8745,   9024,   9303},
-                 { 10140,  10419,  10698},
-                 { 11535,  11814,  12093}},
-
-                {{ 29337,  29859,  30381},
-                 { 31947,  32469,  32991},
-                 { 34557,  35079,  35601}},
-
-                {{ 62061,  62826,  63591},
-                 { 65886,  66651,  67416},
-                 { 69711,  70476,  71241}}
-            },
+                {
+                    {{  0,   1,   2,   3,   4},
+                    {  5,   6,   7,   8,   9},
+                    { 10,  11,  12,  13,  14},
+                    { 15,  16,  17,  18,  19},
+                    { 20,  21,  22,  23,  24}},
+
+                    {{ 25,  26,  27,  28,  29},
+                    { 30,  31,  32,  33,  34},
+                    { 35,  36,  37,  38,  39},
+                    { 40,  41,  42,  43,  44},
+                    { 45,  46,  47,  48,  49}},
+
+                    {{ 50,  51,  52,  53,  54},
+                    { 55,  56,  57,  58,  59},
+                    { 60,  61,  62,  63,  64},
+                    { 65,  66,  67,  68,  69},
+                    { 70,  71,  72,  73,  74}},
+
+                    {{ 75,  76,  77,  78,  79},
+                    { 80,  81,  82,  83,  84},
+                    { 85,  86,  87,  88,  89},
+                    { 90,  91,  92,  93,  94},
+                    { 95,  96,  97,  98,  99}}
+                },
+                {
+                    {{100, 101, 102, 103, 104},
+                    {105, 106, 107, 108, 109},
+                    {110, 111, 112, 113, 114},
+                    {115, 116, 117, 118, 119},
+                    {120, 121, 122, 123, 124}},
+
+                    {{125, 126, 127, 128, 129},
+                    {130, 131, 132, 133, 134},
+                    {135, 136, 137, 138, 139},
+                    {140, 141, 142, 143, 144},
+                    {145, 146, 147, 148, 149}},
+
+                    {{150, 151, 152, 153, 154},
+                    {155, 156, 157, 158, 159},
+                    {160, 161, 162, 163, 164},
+                    {165, 166, 167, 168, 169},
+                    {170, 171, 172, 173, 174}},
+
+                    {{175, 176, 177, 178, 179},
+                    {180, 181, 182, 183, 184},
+                    {185, 186, 187, 188, 189},
+                    {190, 191, 192, 193, 194},
+                    {195, 196, 197, 198, 199}}
+                }
+            }
+        });
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<int,2,4,3,3> {
             {
-                {{  3919,   3955,   3991},
-                 {  4099,   4135,   4171},
-                 {  4279,   4315,   4351}},
-
-                {{ 36645,  36924,  37203},
-                 { 38040,  38319,  38598},
-                 { 39435,  39714,  39993}},
-
-                {{ 81537,  82059,  82581},
-                 { 84147,  84669,  85191},
-                 { 86757,  87279,  87801}},
-
-                {{138561, 139326, 140091},
-                 {142386, 143151, 143916},
-                 {146211, 146976, 147741}}
+                {
+                    {{   319,    355,    391},
+                    {   499,    535,    571},
+                    {   679,    715,    751}},
+
+                    {{  8745,   9024,   9303},
+                    { 10140,  10419,  10698},
+                    { 11535,  11814,  12093}},
+
+                    {{ 29337,  29859,  30381},
+                    { 31947,  32469,  32991},
+                    { 34557,  35079,  35601}},
+
+                    {{ 62061,  62826,  63591},
+                    { 65886,  66651,  67416},
+                    { 69711,  70476,  71241}}
+                },
+                {
+                    {{  3919,   3955,   3991},
+                    {  4099,   4135,   4171},
+                    {  4279,   4315,   4351}},
+
+                    {{ 36645,  36924,  37203},
+                    { 38040,  38319,  38598},
+                    { 39435,  39714,  39993}},
+
+                    {{ 81537,  82059,  82581},
+                    { 84147,  84669,  85191},
+                    { 86757,  87279,  87801}},
+
+                    {{138561, 139326, 140091},
+                    {142386, 143151, 143916},
+                    {146211, 146976, 147741}}
+                }
             }
-        }
-    });
-    op -> associateInput(0, myInput);
-    op -> associateInput(1, myWeights);
-    op -> associateInput(2, myBias);
-    op->setDataType(DataType::Int32);
-    op->setBackend("cpu");
-    myCDW -> forward();
-    op -> getOutput(0) -> print();
-    REQUIRE(*(op -> getOutput(0)) == *myOutput);
-
-    // std::cout << static_cast<Tensor>((*op)["weight"])[0][0][0][0] << std::endl;
+        });
+        op -> associateInput(0, myInput);
+        op -> associateInput(1, myWeights);
+        op -> associateInput(2, myBias);
+        op->setDataType(DataType::Int32);
+        op->setBackend("cpu");
+        myCDW -> forward();
+        op -> getOutput(0) -> print();
+        REQUIRE(*(op -> getOutput(0)) == *myOutput);
+    }
+    SECTION("point-wise") {
+        ConvDepthWise_Op<2> conv_op = ConvDepthWise_Op<2>({1,1});
+        std::shared_ptr<Tensor> weights = std::make_shared<Tensor>(std::vector<std::size_t>({3,1,1,1}));
+        weights -> setBackend("cpu");
+        std::shared_ptr<Tensor> biases = std::make_shared<Tensor>(std::vector<std::size_t>({3}));
+        biases -> setBackend("cpu");
+        std::shared_ptr<Tensor> input = std::make_shared<Tensor>(std::vector<std::size_t>({2,3,5,5}));
+        input -> setBackend("cpu");
+        std::shared_ptr<Tensor> expected_output = std::make_shared<Tensor>(std::vector<std::size_t>({2,3,5,5}));
+        expected_output -> setBackend("cpu");
+
+        float weights_array[3] {-0.0045, -0.4223, -0.9452};
+        weights->getImpl()->setRawPtr(weights_array, 3);
+
+        float biases_array[3] {-0.8595,  0.7062, -0.0062};
+        biases->getImpl()->setRawPtr(biases_array, 3);
+
+        float input_array[2*3*5*5] {
+            0.6581, 0.2509, 0.2660, 0.8270, 0.8040, 0.3147, 0.5028, 0.2591, 0.8585,
+            0.7762, 0.9972, 0.0305, 0.1202, 0.2682, 0.9306, 0.7927, 0.1494, 0.0678,
+            0.5550, 0.4132, 0.4742, 0.6199, 0.1802, 0.6350, 0.2539, 0.5594, 0.0143,
+            0.8656, 0.7105, 0.1420, 0.2464, 0.7883, 0.5715, 0.7642, 0.5492, 0.6628,
+            0.4922, 0.7941, 0.8421, 0.7914, 0.0237, 0.8081, 0.0174, 0.6018, 0.7402,
+            0.3770, 0.8786, 0.3651, 0.5355, 0.4267, 0.4457, 0.6756, 0.9631, 0.0145,
+            0.4470, 0.5202, 0.2675, 0.5815, 0.3487, 0.3457, 0.7179, 0.0518, 0.1520,
+            0.0573, 0.9219, 0.3615, 0.0866, 0.5237, 0.4725, 0.2565, 0.8726, 0.6434,
+            0.6875, 0.2919, 0.3355, 0.1886, 0.1749, 0.0785, 0.4091, 0.1907, 0.4664,
+            0.2738, 0.4784, 0.7807, 0.0687, 0.3091, 0.4557, 0.2277, 0.2424, 0.8691,
+            0.1893, 0.2918, 0.5691, 0.1926, 0.2866, 0.0097, 0.5445, 0.5085, 0.1110,
+            0.7099, 0.8927, 0.6182, 0.2538, 0.8694, 0.7872, 0.3196, 0.0710, 0.2888,
+            0.0403, 0.1670, 0.6840, 0.7323, 0.4861, 0.3390, 0.1096, 0.5070, 0.3872,
+            0.7473, 0.6224, 0.6910, 0.7530, 0.0149, 0.0866, 0.9022, 0.5027, 0.3849,
+            0.5255, 0.1977, 0.0570, 0.9581, 0.5461, 0.4623, 0.0101, 0.2362, 0.5922,
+            0.8398, 0.1497, 0.5160, 0.2862, 0.5931, 0.9728, 0.1353, 0.7790, 0.9137,
+            0.9351, 0.4036, 0.7638, 0.3873, 0.0494, 0.7450};
+        input->getImpl()->setRawPtr(input_array, 2*3*5*5);
+
+        float expected_output_array[2*3*5*5] {
+            -0.8624, -0.8606, -0.8607, -0.8632, -0.8631, -0.8609, -0.8617, -0.8606,
+            -0.8633, -0.8629, -0.8639, -0.8596, -0.8600, -0.8607, -0.8636, -0.8630,
+            -0.8601, -0.8598, -0.8620, -0.8613, -0.8616, -0.8622, -0.8603, -0.8623,
+            -0.8606,  0.4700,  0.7002,  0.3407,  0.4062,  0.6463,  0.6022,  0.3733,
+            0.4649,  0.3835,  0.4743,  0.4263,  0.4984,  0.3709,  0.3506,  0.3720,
+            0.6962,  0.3650,  0.6989,  0.4521,  0.3936,  0.5470,  0.3352,  0.5520,
+            0.4801,  0.5260, -0.4274, -0.6447, -0.9165, -0.0199, -0.4287, -0.4979,
+            -0.2590, -0.5559, -0.3358, -0.3329, -0.6847, -0.0552, -0.1499, -0.0603,
+            -0.8776, -0.3479, -0.0881, -0.5011, -0.4528, -0.2486, -0.8309, -0.6143,
+            -0.6561, -0.2821, -0.3233, -0.8603, -0.8603, -0.8598, -0.8613, -0.8603,
+            -0.8616, -0.8607, -0.8616, -0.8630, -0.8598, -0.8609, -0.8615, -0.8605,
+            -0.8606, -0.8634, -0.8603, -0.8608, -0.8620, -0.8603, -0.8608, -0.8595,
+            -0.8619, -0.8617, -0.8600, -0.8626,  0.3292,  0.4451,  0.5991,  0.3390,
+            0.3738,  0.5712,  0.6762,  0.5843,  0.6892,  0.6357,  0.4174,  0.3969,
+            0.5009,  0.5631,  0.6599,  0.4921,  0.5427,  0.3906,  0.4434,  0.4144,
+            0.3882,  0.6999,  0.6697,  0.3252,  0.4939, -0.3700, -0.5029, -0.1931,
+            -0.0601, -0.9118, -0.5224, -0.4432, -0.0157, -0.2294, -0.5660, -0.7999,
+            -0.1477, -0.4939, -0.2767, -0.5668, -0.9257, -0.1341, -0.7425, -0.8698,
+            -0.8900, -0.3877, -0.7282, -0.3722, -0.0529, -0.7103};
+        expected_output->getImpl()->setRawPtr(expected_output_array, 2*3*5*5);
+
+        conv_op.associateInput(0, input);
+        conv_op.associateInput(1, weights);
+        conv_op.associateInput(2, biases);
+
+        conv_op.setBackend("cpu");
+        conv_op.setDataType(DataType::Float32);
+        conv_op.forwardDims();
+
+        conv_op.forward();
+
+        conv_op.getOutput(0)->print();
+
+        REQUIRE(approxEq<float>(*(conv_op.getOutput(0)),*expected_output, 1e-3f, 1e-4f));
+    }
 }
\ No newline at end of file
diff --git a/unit_tests/operator/Test_ConvImpl.cpp b/unit_tests/operator/Test_ConvImpl.cpp
index b52085139294021de2fe9d72e173ad74db028ea3..e48d69c89eb0d6d52a834b3f32a41d8621fdd42b 100644
--- a/unit_tests/operator/Test_ConvImpl.cpp
+++ b/unit_tests/operator/Test_ConvImpl.cpp
@@ -15,6 +15,7 @@
 
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/Conv.hpp"
+#include "aidge/utils/TensorUtils.hpp"
 
 #include "aidge/backend/cpu.hpp"
 
@@ -153,7 +154,7 @@ TEST_CASE("[cpu/operator] Conv(forward)", "[Conv][CPU]") {
         op->setDataType(DataType::Int32);
         op->setBackend("cpu");
         myConv->forward();
-        // op->getOutput(0)->print();
+        op->getOutput(0)->print();
         REQUIRE(*(op->getOutput(0)) == *myOutput);
     }
     SECTION("Point-wise") {
@@ -251,4 +252,147 @@ TEST_CASE("[cpu/operator] Conv(forward)", "[Conv][CPU]") {
             REQUIRE(std::abs(resPtr[i]-expectedPtr[i]) < 0.00001);
         }
     }
+    SECTION("Strided and dilated Conv") {
+        std::shared_ptr<Node> myConv = Conv(3,4,{3,3}, "myconv", {3,3},{2,2});
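+        // With an 8x8 input, kernel 3, stride 3 and dilation 2, the output
+        // spatial size is floor((8 - 2*(3-1) - 1)/3) + 1 = 2, hence the 2x2
+        // expected feature maps below.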
+        auto op = std::static_pointer_cast<OperatorTensor>(myConv -> getOperator());
+        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<float,2,3,8,8> {
+            {{{
+                {0.0107F, 0.5076F, 0.2293F, 0.0486F, 0.7375F, 0.2637F, 0.9615F, 0.9138F},
+                {0.0678F, 0.5604F, 0.1940F, 0.0287F, 0.1029F, 0.2059F, 0.5058F, 0.9885F},
+                {0.9904F, 0.2890F, 0.4606F, 0.1055F, 0.9028F, 0.1654F, 0.6499F, 0.4775F},
+                {0.9499F, 0.4695F, 0.1713F, 0.0731F, 0.4913F, 0.8921F, 0.1782F, 0.1111F},
+                {0.2479F, 0.4669F, 0.1078F, 0.6153F, 0.0299F, 0.6484F, 0.2397F, 0.1814F},
+                {0.3779F, 0.9032F, 0.5651F, 0.3896F, 0.8439F, 0.6404F, 0.3813F, 0.0841F},
+                {0.5566F, 0.8950F, 0.1226F, 0.8881F, 0.9870F, 0.6256F, 0.6387F, 0.0628F},
+                {0.2857F, 0.0579F, 0.6247F, 0.1286F, 0.0951F, 0.1268F, 0.9510F, 0.3789F}},
+
+               {{0.7648F, 0.5340F, 0.1024F, 0.4098F, 0.9958F, 0.7941F, 0.1190F, 0.7328F},
+                {0.4532F, 0.6598F, 0.9146F, 0.1690F, 0.6041F, 0.7230F, 0.5719F, 0.9282F},
+                {0.2862F, 0.2329F, 0.7302F, 0.6717F, 0.1983F, 0.1876F, 0.4561F, 0.2126F},
+                {0.7849F, 0.0239F, 0.7977F, 0.5935F, 0.9958F, 0.4703F, 0.4612F, 0.1627F},
+                {0.6393F, 0.3544F, 0.8643F, 0.5039F, 0.8087F, 0.6521F, 0.5086F, 0.9331F},
+                {0.7749F, 0.9798F, 0.6820F, 0.7869F, 0.5144F, 0.2941F, 0.8137F, 0.4561F},
+                {0.6505F, 0.3974F, 0.6909F, 0.7019F, 0.2729F, 0.4240F, 0.0162F, 0.1536F},
+                {0.3529F, 0.8821F, 0.1812F, 0.3426F, 0.3472F, 0.0300F, 0.8841F, 0.8088F}},
+
+               {{0.5099F, 0.3323F, 0.1488F, 0.3424F, 0.1494F, 0.6225F, 0.8103F, 0.5995F},
+                {0.9198F, 0.5635F, 0.8908F, 0.9378F, 0.6689F, 0.3176F, 0.3755F, 0.3883F},
+                {0.0626F, 0.5309F, 0.0307F, 0.3955F, 0.2794F, 0.1420F, 0.4758F, 0.7558F},
+                {0.6154F, 0.5280F, 0.2318F, 0.3832F, 0.4435F, 0.3490F, 0.4043F, 0.5872F},
+                {0.3705F, 0.3848F, 0.2182F, 0.8332F, 0.4559F, 0.5310F, 0.4611F, 0.4236F},
+                {0.6141F, 0.8103F, 0.2260F, 0.9907F, 0.5615F, 0.4520F, 0.6949F, 0.0175F},
+                {0.3969F, 0.5021F, 0.0970F, 0.9937F, 0.9270F, 0.4302F, 0.2868F, 0.3891F},
+                {0.8693F, 0.5170F, 0.5348F, 0.2676F, 0.9769F, 0.3356F, 0.9427F, 0.3908F}}
+            },
+            {
+               {{0.4803F, 0.5223F, 0.6395F, 0.8402F, 0.4442F, 0.6377F, 0.7852F, 0.9063F},
+                {0.0361F, 0.0470F, 0.3104F, 0.6921F, 0.0543F, 0.4490F, 0.9541F, 0.7395F},
+                {0.3832F, 0.3828F, 0.2236F, 0.2068F, 0.4369F, 0.7443F, 0.6952F, 0.6394F},
+                {0.5309F, 0.8483F, 0.1991F, 0.9756F, 0.8969F, 0.7284F, 0.4657F, 0.5486F},
+                {0.8839F, 0.3260F, 0.6892F, 0.4074F, 0.9473F, 0.5526F, 0.4147F, 0.4786F},
+                {0.9674F, 0.0952F, 0.8379F, 0.2163F, 0.9420F, 0.4046F, 0.1339F, 0.5234F},
+                {0.4213F, 0.8392F, 0.3184F, 0.4576F, 0.9349F, 0.8267F, 0.0931F, 0.8009F},
+                {0.5570F, 0.5871F, 0.4175F, 0.5465F, 0.6679F, 0.9224F, 0.0049F, 0.9421F}},
+
+               {{0.3739F, 0.6230F, 0.7613F, 0.1337F, 0.8527F, 0.0557F, 0.6424F, 0.8463F},
+                {0.7179F, 0.5638F, 0.2457F, 0.4579F, 0.0487F, 0.8693F, 0.8216F, 0.0415F},
+                {0.1724F, 0.5108F, 0.9103F, 0.0850F, 0.0080F, 0.8927F, 0.7706F, 0.3600F},
+                {0.7751F, 0.8828F, 0.7872F, 0.4541F, 0.3181F, 0.1855F, 0.2486F, 0.0033F},
+                {0.5558F, 0.3500F, 0.6034F, 0.1763F, 0.7418F, 0.5190F, 0.5147F, 0.4090F},
+                {0.4476F, 0.1249F, 0.8116F, 0.9091F, 0.1738F, 0.6150F, 0.3285F, 0.3133F},
+                {0.5657F, 0.4447F, 0.5049F, 0.3425F, 0.7443F, 0.2718F, 0.2466F, 0.5586F},
+                {0.3684F, 0.7616F, 0.5165F, 0.9621F, 0.2864F, 0.7747F, 0.8110F, 0.7045F}},
+
+               {{0.4570F, 0.4577F, 0.0373F, 0.6084F, 0.4632F, 0.3472F, 0.9917F, 0.2011F},
+                {0.7921F, 0.2202F, 0.9525F, 0.7274F, 0.3357F, 0.0076F, 0.5786F, 0.3034F},
+                {0.6510F, 0.0798F, 0.2757F, 0.1738F, 0.3046F, 0.2197F, 0.3872F, 0.5650F},
+                {0.1532F, 0.3204F, 0.6094F, 0.3287F, 0.8903F, 0.9773F, 0.7950F, 0.2845F},
+                {0.2482F, 0.3395F, 0.8795F, 0.4325F, 0.1395F, 0.2457F, 0.2968F, 0.5424F},
+                {0.8636F, 0.7426F, 0.2151F, 0.6900F, 0.3938F, 0.0062F, 0.4980F, 0.4098F},
+                {0.8026F, 0.0464F, 0.2662F, 0.7835F, 0.8444F, 0.0688F, 0.8796F, 0.7625F},
+                {0.2764F, 0.5341F, 0.1773F, 0.6671F, 0.7555F, 0.5235F, 0.7142F, 0.9423F}}}}
+        });
+        std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float,4> {{ 0.1902F, -0.1789F, -0.0314F, -0.0589F}});
+        std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array4D<float,4,3,3,3> { //NCHW
+            {
+                {
+                    {{ 0.0039F,  0.1098F, -0.0834F},
+                     {-0.0890F,  0.0725F, -0.1178F},
+                     { 0.1056F, -0.0924F, -0.0574F}},
+                    {{ 0.0070F, -0.0730F, -0.0674F},
+                     {-0.0380F, -0.1025F, -0.0085F},
+                     {-0.1451F, -0.0656F,  0.1137F}},
+                    {{ 0.1020F,  0.1025F, -0.0678F},
+                     { 0.0028F,  0.1512F, -0.0871F},
+                     { 0.1563F, -0.1446F, -0.1636F}}
+                },
+                {
+                    {{ 0.1472F,  0.0025F, -0.0281F},
+                     { 0.0350F,  0.0296F, -0.1711F},
+                     {-0.1197F, -0.1198F, -0.1130F}},
+                    {{-0.1492F,  0.1554F, -0.1044F},
+                     { 0.1203F, -0.1596F,  0.0589F},
+                     {-0.0436F, -0.1876F, -0.0816F}},
+                    {{ 0.1572F, -0.0982F,  0.1293F},
+                     { 0.1358F,  0.1559F,  0.1322F},
+                     { 0.0296F, -0.0354F, -0.0632F}}
+                },
+                {
+                    {{-0.0941F, -0.0479F,  0.0908F},
+                     {-0.1319F, -0.1333F,  0.1223F},
+                     {-0.1098F,  0.1924F,  0.1075F}},
+                    {{ 0.1796F,  0.0213F,  0.0626F},
+                     { 0.0275F,  0.1883F, -0.0818F},
+                     { 0.0363F,  0.0684F,  0.1094F}},
+                    {{ 0.1131F,  0.1258F, -0.0558F},
+                     { 0.1498F,  0.0322F, -0.0186F},
+                     {-0.1801F, -0.0358F,  0.1727F}}
+                },
+                {
+                    {{-0.1500F, -0.0554F, -0.0994F},
+                     {-0.0818F, -0.1223F,  0.1365F},
+                     { 0.1281F,  0.1507F, -0.0890F}},
+                    {{-0.0444F, -0.1071F, -0.1632F},
+                     { 0.0757F, -0.1235F,  0.0408F},
+                     { 0.0401F, -0.1914F,  0.1772F}},
+                    {{-0.0714F,  0.1582F, -0.0065F},
+                     {-0.0119F,  0.1375F, -0.0727F},
+                     {-0.1532F, -0.1826F, -0.0417F}}
+                }
+            }
+        });
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,4,2,2> {
+            {
+                {
+                    {{-0.2174F, -0.0778F},
+                     {-0.2584F,  0.2303F}},
+                    {{-0.7686F, -0.3879F},
+                     {-0.1775F,  0.0119F}},
+                    {{ 0.5180F,  0.5087F},
+                     { 0.5398F,  0.3476F}},
+                    {{-0.5258F, -0.3128F},
+                     {-0.6673F, -0.1827F}}
+                },
+                {
+                    {{-0.1902F, -0.0467F},
+                     {-0.3327F, -0.1701F}},
+                    {{-0.5505F, -0.4875F},
+                     {-0.4119F, -0.5726F}},
+                    {{ 0.5777F,  0.4428F},
+                     { 0.6121F,  0.7221F}},
+                    {{-0.6009F, -0.6335F},
+                     {-0.5159F, -0.3353F}}
+                }
+            }
+        });
+        op->associateInput(0,myInput);
+        op->associateInput(1,myWeights);
+        op->associateInput(2,myBias);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cpu");
+        op->forwardDims();
+        myConv->forward();
+        op->getOutput(0)->print();
+        REQUIRE(approxEq<float>(*(op->getOutput(0)),*myOutput, 1e-3f, 1e-4f));
+    }
 }
\ No newline at end of file
diff --git a/unit_tests/operator/Test_GlobalAveragePoolingImpl.cpp b/unit_tests/operator/Test_GlobalAveragePoolingImpl.cpp
index d5f2065b624de431b43edef9a83bf079905129dd..43af544871ad6c2ac319de09f3c6fce5065e60d5 100644
--- a/unit_tests/operator/Test_GlobalAveragePoolingImpl.cpp
+++ b/unit_tests/operator/Test_GlobalAveragePoolingImpl.cpp
@@ -124,7 +124,9 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling",
             dims_in[1]; //  averaging per channel : 1 addition per element in
                         //  the channel + 1 division this for every batch
         // create out nb_elems
-        std::vector<std::size_t> dims_out{dims_in[0], dims_in[1]};
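+        // GlobalAveragePooling keeps the input rank: spatial dims collapse to
+        // 1 (e.g. NCHW -> N,C,1,1), hence out dims are padded with ones.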
+        std::vector<std::size_t> dims_out(dims_in.size(), 1);
+        dims_out[0] = dims_in[0];
+        dims_out[1] = dims_in[1];
         const std::size_t out_nb_elems =
             std::accumulate(dims_out.cbegin(), dims_out.cend(), std::size_t(1),
                             std::multiplies<std::size_t>());
@@ -192,7 +194,9 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling",
                           //  the channel + 1 division this for every batch
 
           // create out nb_elems
-          std::vector<std::size_t> dims_out{dims_in[0], dims_in[1]};
+          std::vector<std::size_t> dims_out(dims_in.size(), 1);
+          dims_out[0] = dims_in[0];
+          dims_out[1] = dims_in[1];
           const std::size_t out_nb_elems =
               std::accumulate(dims_out.cbegin(), dims_out.cend(),
                               std::size_t(1), std::multiplies<std::size_t>());
@@ -253,7 +257,9 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling",
         SECTION("2D_img") {
           const std::vector<DimSize_t> in_dims{batch_size, channels, height,
                                                width};
-          const std::vector<DimSize_t> out_dims{batch_size, channels};
+          std::vector<std::size_t> out_dims(in_dims.size(), 1);
+          out_dims[0] = in_dims[0];
+          out_dims[1] = in_dims[1];
           DimSize_t in_nb_elems = batch_size * channels * height * width;
           DimSize_t out_nb_elems = batch_size * channels;
           number_of_operation +=
@@ -368,7 +374,9 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling",
         SECTION("3D_img") {
           const std::vector<DimSize_t> in_dims{batch_size, channels, height,
                                                width, depth};
-          const std::vector<DimSize_t> out_dims{batch_size, channels};
+          std::vector<std::size_t> out_dims(in_dims.size(), 1);
+          out_dims[0] = in_dims[0];
+          out_dims[1] = in_dims[1];
           DimSize_t in_nb_elems =
               batch_size * channels * height * width * depth;
           number_of_operation +=
diff --git a/unit_tests/operator/Test_PadImpl.cpp b/unit_tests/operator/Test_PadImpl.cpp
index 75233c0b97fc6f9812020d0e3d3c695d8cd388f0..cdd3a5f979085f3782776ce69ddd92c0d53150c4 100644
--- a/unit_tests/operator/Test_PadImpl.cpp
+++ b/unit_tests/operator/Test_PadImpl.cpp
@@ -134,7 +134,7 @@ TEST_CASE("[cpu/operator] Pad(forward)", "[Pad][CPU]") {
     SECTION("Asymmetric Pad") {
         const int pv = 0; // pad value
 
-        std::shared_ptr<Node> myPad = Pad<2>({0, 1, 1, 0}, "mypad", PadBorderType::Constant, static_cast<double>(pv));
+        std::shared_ptr<Node> myPad = Pad<2>({1, 0, 0, 1}, "mypad", PadBorderType::Constant, static_cast<double>(pv));
         auto op = std::static_pointer_cast<OperatorTensor>(myPad -> getOperator());
         std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<int,2,3,5,5> { //NCHW
             {
diff --git a/unit_tests/operator/Test_ResizeImpl.cpp b/unit_tests/operator/Test_ResizeImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6b3520fc88d36660ff44403bd41a47cd7ed96256
--- /dev/null
+++ b/unit_tests/operator/Test_ResizeImpl.cpp
@@ -0,0 +1,249 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <cstdint>
+#include <memory>
+
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/data/Data.hpp"
+#include "aidge/data/half.hpp"
+#include "aidge/data/Interpolation.hpp"
+#include "aidge/operator/Pad.hpp"
+#include "aidge/utils/ArrayHelpers.hpp"
+
+#include "aidge/data/Tensor.hpp"
+#include "aidge/operator/OperatorTensor.hpp"
+#include "aidge/operator/Resize.hpp"
+#include "aidge/utils/TensorUtils.hpp"
+
+namespace Aidge {
+
+TEST_CASE("[cpu/operator] Resize(forward)", "[Resize][CPU]") {
+
+    Log::setConsoleLevel(Log::Level::Debug);
+
+    SECTION("Nearest") {
+        SECTION("Ceil") {
+            std::shared_ptr<Tensor> input_tensor = std::make_shared<Tensor>(Array4D<std::int32_t, 1, 1, 2, 2>{{
+                {
+                    {
+                        { 1, 2},
+                        { 3, 4}
+                    }
+                }
+            }});
+            Tensor expected_out_tensor = Tensor(Array4D<std::int32_t, 1, 1, 4, 4>{{
+                {
+                    {
+                        { 1, 1, 1, 2},
+                        { 1, 1, 1, 2},
+                        { 1, 1, 1, 2},
+                        { 3, 3, 3, 4}
+                    }
+                }
+            }});
+
+            std::vector<float> scales = {1.0f, 1.0f, 2.0f, 2.0f};
+            auto resize_node = Resize(scales, {}, Interpolation::CoordinateTransformation::HalfPixel, Interpolation::Mode::Floor);
+            auto op = std::static_pointer_cast<Resize_Op>(resize_node->getOperator());
+            op->associateInput(0, input_tensor);
+
+
+            op->setDataType(DataType::Int32);
+            op->setBackend("cpu");
+            op->forwardDims(true);
+            op->forward();
+
+            op->getOutput(0)->print();
+            expected_out_tensor.print();
+
+            CHECK(*(op->getOutput(0)) == expected_out_tensor);
+        }
+    }
+
+    SECTION("1-sized input tensor (upscaling)") {
+        std::shared_ptr<Tensor> input_tensor = std::make_shared<Tensor>(Array4D<float, 1, 1, 1, 1>{{{{{0.417022}}}}});
+
+        std::vector<std::size_t> sizes = {1, 1, 2, 2};
+        auto resize_node = Resize({}, sizes, Interpolation::CoordinateTransformation::HalfPixel, Interpolation::Mode::Linear);
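+        // Linear interpolation of a single sample has nothing to interpolate
+        // between, so every output element should equal the input value.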
+        auto op = std::static_pointer_cast<Resize_Op>(resize_node->getOperator());
+        op->associateInput(0, input_tensor);
+
+
+        op->setDataType(DataType::Float32);
+        op->setBackend("cpu");
+        op->forwardDims(true);
+        op->forward();
+        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<float, 1, 1, 2, 2>{
+            {{{{0.417022, 0.417022}, {0.417022, 0.417022}}}}});
+        op->getOutput(0)->print();
+        CHECK(approxEq<float>(*op->getOutput(0), *expectedOutput) == true);
+    }
+    SECTION("Upscaling from 5x5 to 10x10 (linear)") {
+        std::shared_ptr<Tensor> input_tensor = std::make_shared<Tensor>(
+            Array4D<float, 1, 1, 5, 5>{{{{{7.20324516e-01,
+                                               1.14374816e-04,
+                                               3.02332580e-01,
+                                               1.46755889e-01,
+                                               9.23385918e-02},
+                                              {1.86260208e-01,
+                                               3.45560730e-01,
+                                               3.96767467e-01,
+                                               5.38816750e-01,
+                                               4.19194520e-01},
+                                              {6.85219526e-01,
+                                               2.04452246e-01,
+                                               8.78117442e-01,
+                                               2.73875929e-02,
+                                               6.70467496e-01},
+                                              {4.17304814e-01,
+                                               5.58689833e-01,
+                                               1.40386939e-01,
+                                               1.98101491e-01,
+                                               8.00744593e-01},
+                                              {9.68261600e-01,
+                                               3.13424170e-01,
+                                               6.92322612e-01,
+                                               8.76389146e-01,
+                                               8.94606650e-01}}}}}
+        );
+
+        std::vector<std::size_t> sizes = {1, 1, 10, 10};
+        auto resize_node = Resize({}, sizes, Interpolation::CoordinateTransformation::Asymmetric, Interpolation::Mode::Linear);
+        auto op = std::static_pointer_cast<Resize_Op>(resize_node->getOperator());
+        op->associateInput(0, input_tensor);
+
+        op->setDataType(DataType::Float32);
+        op->setBackend("cpu");
+        op->forwardDims(true);
+        op->forward();
+        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(
+            Array4D<float, 1, 1, 10, 10>{{{{{7.20324516e-01,
+                                             3.60219449e-01,
+                                             1.14374816e-04,
+                                             1.51223481e-01,
+                                             3.02332580e-01,
+                                             2.24544227e-01,
+                                             1.46755889e-01,
+                                             1.19547240e-01,
+                                             9.23385918e-02,
+                                             9.23385918e-02},
+
+                                            {4.53292370e-01,
+                                             3.13064963e-01,
+                                             1.72837555e-01,
+                                             2.61193782e-01,
+                                             3.49550009e-01,
+                                             3.46168160e-01,
+                                             3.42786312e-01,
+                                             2.99276441e-01,
+                                             2.55766571e-01,
+                                             2.55766571e-01},
+
+                                            {1.86260208e-01,
+                                             2.65910476e-01,
+                                             3.45560730e-01,
+                                             3.71164083e-01,
+                                             3.96767467e-01,
+                                             4.67792094e-01,
+                                             5.38816750e-01,
+                                             4.79005635e-01,
+                                             4.19194520e-01,
+                                             4.19194520e-01},
+
+                                            {4.35739875e-01,
+                                             3.55373204e-01,
+                                             2.75006473e-01,
+                                             4.56224471e-01,
+                                             6.37442470e-01,
+                                             4.60272312e-01,
+                                             2.83102185e-01,
+                                             4.13966596e-01,
+                                             5.44831038e-01,
+                                             5.44831038e-01},
+
+                                            {6.85219526e-01,
+                                             4.44835901e-01,
+                                             2.04452246e-01,
+                                             5.41284859e-01,
+                                             8.78117442e-01,
+                                             4.52752531e-01,
+                                             2.73875929e-02,
+                                             3.48927557e-01,
+                                             6.70467496e-01,
+                                             6.70467496e-01},
+
+                                            {5.51262140e-01,
+                                             4.66416597e-01,
+                                             3.81571054e-01,
+                                             4.45411623e-01,
+                                             5.09252191e-01,
+                                             3.10998380e-01,
+                                             1.12744540e-01,
+                                             4.24175322e-01,
+                                             7.35606015e-01,
+                                             7.35606015e-01},
+
+                                            {4.17304814e-01,
+                                             4.87997323e-01,
+                                             5.58689833e-01,
+                                             3.49538386e-01,
+                                             1.40386939e-01,
+                                             1.69244215e-01,
+                                             1.98101491e-01,
+                                             4.99423027e-01,
+                                             8.00744593e-01,
+                                             8.00744593e-01},
+
+                                            {6.92783237e-01,
+                                             5.64420104e-01,
+                                             4.36057001e-01,
+                                             4.26205903e-01,
+                                             4.16354775e-01,
+                                             4.76800054e-01,
+                                             5.37245333e-01,
+                                             6.92460477e-01,
+                                             8.47675622e-01,
+                                             8.47675622e-01},
+
+                                            {9.68261600e-01,
+                                             6.40842915e-01,
+                                             3.13424170e-01,
+                                             5.02873421e-01,
+                                             6.92322612e-01,
+                                             7.84355879e-01,
+                                             8.76389146e-01,
+                                             8.85497928e-01,
+                                             8.94606650e-01,
+                                             8.94606650e-01},
+
+                                            {9.68261600e-01,
+                                             6.40842915e-01,
+                                             3.13424170e-01,
+                                             5.02873421e-01,
+                                             6.92322612e-01,
+                                             7.84355879e-01,
+                                             8.76389146e-01,
+                                             8.85497928e-01,
+                                             8.94606650e-01,
+                                             8.94606650e-01}}}}});
+        Log::notice("Expected result : dims = {}", expectedOutput->dims());
+        expectedOutput->print();
+        Log::notice("\nActual result: dims = {}", op->getOutput(0)->dims());
+        op->getOutput(0)->print();
+        CHECK(approxEq<float>(*op->getOutput(0),
+                              *expectedOutput,
+                              1e-5f,
+                              1e-5f) == true);
+    }
+}
+
+} // namespace Aidge
diff --git a/unit_tests/operator/Test_RoundImpl.cpp b/unit_tests/operator/Test_RoundImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b4cf9ffbedc18b35b42ebbc05971f86e0fa584e3
--- /dev/null
+++ b/unit_tests/operator/Test_RoundImpl.cpp
@@ -0,0 +1,115 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <catch2/catch_test_macros.hpp>
+#include <cstddef>   // std::size_t
+#include <cstdint>   // std::uint16_t
+#include <chrono>
+#include <iostream>
+#include <memory>
+#include <numeric>   // std::accumulate
+#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
+#include <iomanip>
+#include "aidge/data/Tensor.hpp"
+#include "aidge/operator/Round.hpp"
+#include "aidge/utils/TensorUtils.hpp"
+
+namespace Aidge {
+
+TEST_CASE("[cpu/operator] Round_Test", "[Round][CPU]") {
+    constexpr std::uint16_t NBTRIALS = 15;
+    // Create a random number generator
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_real_distribution<float> valueDist(-15, 15); 
+    std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(5));
+    std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(3));
+
+    // Create Round Operator
+    std::shared_ptr<Node> myRound = Round();
+    auto op = std::static_pointer_cast<OperatorTensor>(myRound-> getOperator());
+    op->setDataType(DataType::Float32);
+    op->setBackend("cpu");
+
+    // Create the input Tensor
+    std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>();
+    op->associateInput(0,T0);
+    T0->setDataType(DataType::Float32);
+    T0->setBackend("cpu");
+    // Create results Tensor
+    std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>();
+    Tres->setDataType(DataType::Float32);
+    Tres->setBackend("cpu");
+
+    // To measure execution time of 'Round_Op::forward()' member function call
+    std::chrono::time_point<std::chrono::system_clock> start;
+    std::chrono::time_point<std::chrono::system_clock> end;
+    std::chrono::duration<double, std::micro> duration{};
+
+    SECTION("Round [Forward]") {
+        SECTION("Test Forward Kernel") {
+            std::size_t number_of_operation = 0;
+
+            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+                
+                // generate 2 random Tensors
+                const std::size_t nbDims = nbDimsDist(gen);
+                std::vector<std::size_t> dims;
+                for (std::size_t i = 0; i < nbDims; ++i) {
+                    dims.push_back(dimSizeDist(gen));
+                }
+                const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                number_of_operation += nb_elements;
+
+                // without broadcasting
+                float* array0 = new float[nb_elements];
+                float* result = new float[nb_elements];
+                
+                for (std::size_t i = 0; i < nb_elements; ++i) {
+                    array0[i] = valueDist(gen);
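+                    // std::nearbyint follows the current rounding mode
+                    // (round-half-to-even by default): 2.5 -> 2, 3.5 -> 4.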
+                    result[i] = std::nearbyint(array0[i]);
+
+                }
+
+                // input0
+                T0->resize(dims);
+                T0 -> getImpl() -> setRawPtr(array0, nb_elements);
+
+                // results
+                Tres->resize(dims);
+                Tres -> getImpl() -> setRawPtr(result, nb_elements);
+                
+                op->forwardDims();
+                start = std::chrono::system_clock::now();
+                myRound->forward();
+                end = std::chrono::system_clock::now();
+                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+                bool is_eq = approxEq<float>(*(op->getOutput(0)), *Tres);
+                REQUIRE(is_eq);
+
+                delete[] array0;
+                delete[] result;
+            }
+            std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
+            std::cout << "total time: " << duration.count() << "μs" << std::endl;
+        }
+    }
+}
+} // namespace Aidge
\ No newline at end of file
diff --git a/unit_tests/recipies/Test_ConstantFolding.cpp b/unit_tests/recipies/Test_ConstantFolding.cpp
index c4866b1258702b93a1bce80501d9acd094a65741..cd035fd5336d3cb66fc70b1c0a4e5c82c9bef0d8 100644
--- a/unit_tests/recipies/Test_ConstantFolding.cpp
+++ b/unit_tests/recipies/Test_ConstantFolding.cpp
@@ -22,12 +22,12 @@
 
 using namespace Aidge;
 
-TEST_CASE("[ConstantFolding] test") {
+TEST_CASE("[ConstantFolding] forward", "[ConstantFolding][forward][CPU]") {
     // generate the original GraphView
     auto matmul0 = MatMul("matmul0");
-    auto add0 = Add(2, "add0");
+    auto add0 = Add("add0");
     auto matmul1 = MatMul("matmul1");
-    auto add1 = Add(2, "add1");
+    auto add1 = Add("add1");
 
     auto b0 = Producer(std::make_shared<Tensor>(Array1D<float,5>{{1, 2, 3, 4, 5}}), "B0", true);
     auto w0 = Producer(std::make_shared<Tensor>(Array2D<float,5,5>{{{1, 2, 3, 4, 5}, {6, 7, 8, 9, 0}, {1, 2, 3, 4, 5}, {6, 7, 8, 9, 0}, {1, 2, 3, 4, 5}}}), "W0", true);
diff --git a/unit_tests/recipies/Test_MatMulTiling.cpp b/unit_tests/recipies/Test_MatMulTiling.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..46d5418fd557fbb716f7e1d9c54eb76d94b0061e
--- /dev/null
+++ b/unit_tests/recipies/Test_MatMulTiling.cpp
@@ -0,0 +1,107 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <cstddef>
+#include <random>  // std::random_device, std::mt19937, std::uniform_real_distribution
+
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/recipes/Recipes.hpp"
+#include "aidge/operator/MatMul.hpp"
+#include "aidge/operator/AvgPooling.hpp"
+#include "aidge/operator/MaxPooling.hpp"
+#include "aidge/operator/GenericOperator.hpp"
+#include "aidge/operator/Producer.hpp"
+#include "aidge/graph/OpArgs.hpp"
+#include "aidge/scheduler/SequentialScheduler.hpp"
+#include "aidge/graph/Matching.hpp"
+#include "aidge/utils/TensorUtils.hpp"
+
+using namespace Aidge;
+
+TEST_CASE("[MatMulTiling]") {
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_real_distribution<float> valueDist(-1.0f, 1.0f);
+
+    auto dataProvider = Producer({2, 3, 80, 80}, "dataProvider");
+    auto w1 = Producer({2, 3, 80, 80}, "w1");
+    auto matmul1 = MatMul("matmul1");
+    auto w2 = Producer({2, 3, 80, 80}, "w2");
+    auto matmul2 = MatMul("matmul2");
+    auto w3 = Producer({2, 3, 80, 80}, "w3");
+    auto matmul3 = MatMul("matmul3");
+
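+    // Chain the three MatMuls: data feeds matmul1, each weight producer
+    // feeds input #1, and each MatMul output feeds the next MatMul's input #0.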
+    dataProvider->addChild(matmul1, 0, 0);
+    w1->addChild(matmul1, 0, 1);
+    matmul1->addChild(matmul2, 0, 0);
+    w2->addChild(matmul2, 0, 1);
+    matmul2->addChild(matmul3, 0, 0);
+    w3->addChild(matmul3, 0, 1);
+
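+    // Build the GraphView from the connected component around matmul1,
+    // then propagate tensor dimensions through it.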
+    auto g1 = getConnectedGraphView(matmul1);
+    g1->setBackend("cpu");
+    g1->forwardDims();
+    g1->save("MatMulSplitting_graph");
+
+    // Fill random values
+    fmt::println("Fill random values");
+    auto tData = std::static_pointer_cast<OperatorTensor>(dataProvider->getOperator())->getOutput(0);
+    for (size_t i = 0; i < tData->size(); ++i) {
+        tData->set<float>(i, valueDist(gen));
+    }
+    auto tw1 = std::static_pointer_cast<OperatorTensor>(w1->getOperator())->getOutput(0);
+    for (size_t i = 0; i < tw1->size(); ++i) {
+        tw1->set<float>(i, valueDist(gen));
+    }
+    auto tw2 = std::static_pointer_cast<OperatorTensor>(w2->getOperator())->getOutput(0);
+    for (size_t i = 0; i < tw2->size(); ++i) {
+        tw2->set<float>(i, valueDist(gen));
+    }
+    auto tw3 = std::static_pointer_cast<OperatorTensor>(w3->getOperator())->getOutput(0);
+    for (size_t i = 0; i < tw3->size(); ++i) {
+        tw3->set<float>(i, valueDist(gen));
+    }
+
+    fmt::println("Schedule forward graph");
+    auto s1 = SequentialScheduler(g1);
+    s1.forward();
+
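+    // Keep a copy of the untiled output as the reference result.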
+    const auto tOut = std::static_pointer_cast<OperatorTensor>(g1->getOrderedOutputs()[0].first->getOperator())->getOutput(0)->clone();
+
+    // Tiling
+    fmt::println("Tiling");
+    matMulTiling(matmul1, {16, 16});
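+    // removeIdentity drops the Identity nodes the tiling recipe may leave behind.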
+    removeIdentity(g1);
+
+    g1->setBackend("cpu");
+    g1->save("MatMulSplitting_graph_split");
+
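+    // Match every MatMul node whose output is exactly 16x16: tiling the
+    // 80x80 output into 16x16 blocks should yield 5 * 5 = 25 such nodes.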
+    auto gm = SinglePassGraphMatching(g1);
+    gm.addNodeLambda("16x16", [](const NodePtr& node) {
+        const auto op =
+            std::static_pointer_cast<OperatorTensor>(node->getOperator());
+        const auto dims = op->getOutput(0)->dims();
+        return (dims.end()[-2] == 16 && dims.end()[-1] == 16);
+    });
+
+    const auto results = gm.match("MatMul[16x16]");
+    REQUIRE(results.size() == 25);
+
+    // Check result
+    fmt::println("Schedule forward tiled graph");
+    s1 = SequentialScheduler(g1);
+    s1.resetScheduling();
+    s1.forward();
+
+    const auto tOutTiled = std::static_pointer_cast<OperatorTensor>(g1->getOrderedOutputs()[0].first->getOperator())->getOutput(0)->clone();
+    REQUIRE(approxEq<float>(tOut, tOutTiled));
+}
diff --git a/unit_tests/scheduler/Test_Scheduler.cpp b/unit_tests/scheduler/Test_Scheduler.cpp
index 16112628053a35ef71d5819a53aacc85425da88d..78a10c308a60f026b83ea64cfbd25a848099eb90 100644
--- a/unit_tests/scheduler/Test_Scheduler.cpp
+++ b/unit_tests/scheduler/Test_Scheduler.cpp
@@ -147,10 +147,13 @@ TEST_CASE("[cpu/scheduler] SequentialScheduler(forward)") {
         std::shared_ptr<GraphView> g =
                 Sequential({Conv(1, 3, {3, 3}, "inputConv"),
                             Parallel({
-                                Conv(3, 3, {1, 1}, "conv1.1"),
-                                Conv(3, 3, {1, 1}, "conv1.2"),
+                                Sequential({
+                                    Parallel({
+                                        Conv(3, 3, {1, 1}, "conv1.1"),
+                                        Conv(3, 3, {1, 1}, "conv1.2")}),
+                                    Add("add1")}),
                                 Conv(3, 3, {1, 1}, "conv1.3")}),
-                            Add(3, "add1"),
+                            Add("add2"),
                             Conv(3, 2, {1, 1}, "conv2"),
                             FC(18, 5, false, "out")});
 
@@ -216,9 +219,9 @@ TEST_CASE("[cpu/scheduler] SequentialScheduler(forward)") {
         std::shared_ptr<Tensor> biasTensor = std::make_shared<Tensor>(
                 Array2D<int, 2, 3>{{{2, 0, 0}, {1, 0, 0}}});
 
-        auto add1 = Add(2, "add1");
+        auto add1 = Add("add1");
         auto mem = Memorize(3, "mem1");
-        auto add2 = Add(2, "add2");
+        auto add2 = Add("add2");
         auto bias = Producer(biasTensor, "bias");
         auto init = Producer(initTensor, "init");
         auto input = Producer(in, "input");
@@ -260,9 +263,9 @@ TEST_CASE("[cpu/scheduler] SequentialScheduler(forward)") {
         std::shared_ptr<Tensor> biasTensor = std::make_shared<Tensor>(
                 Array2D<int, 2, 3>{{{2, 0, 0}, {1, 0, 0}}});
 
-        auto add1 = Add(2, "add1");
+        auto add1 = Add("add1");
         auto mem = Memorize(3, "mem1");
-        auto add2 = Add(2, "add2");
+        auto add2 = Add("add2");
         auto bias = Producer(biasTensor, "bias");
         auto init = Producer(initTensor, "init");
         auto input = Producer(in, "input");
diff --git a/version.txt b/version.txt
index d15723fbe8de36b1c3ae302c77d8095459ea88e6..1d0ba9ea182b0f7354f3daf12120744ec5e0c2f8 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.3.2
+0.4.0