diff --git a/README.md b/README.md
index 2e309083fb677f82b4adb1e9d5a3b7923bb32c47..ed44c2eaaee074f17c8d7edad059c0891473f272 100644
--- a/README.md
+++ b/README.md
@@ -29,27 +29,15 @@ pip install . -v
 
 ### Standard C++ Compilation
 
-You will need to compile first the Core library before compiling the CPU one.
-The makefile is designed to do it for you.
+You will need to compile and install the [Core Library](https://gitlab.eclipse.org/eclipse/aidge/aidge_core) before compiling the CPU one.
 
-To only compile the CPU library, run
-```
-make cpu_only
-```
+Once this has been done, you'll need to run CMake with the
+`CMAKE_INSTALL_PREFIX:PATH` flag in order to tell CMake where
+`aidge_core` has been installed:
+```sh
+cmake -DCMAKE_INSTALL_PREFIX:PATH=$(path_to_install_folder) $(CMAKE PARAMETERS) $(project_root)
 
-To compile the CPU library + the associated unitary tests, run
-```
-make cpu_tests
+make all
 ```
 
-To compile the CPU library with the python binding, run
-```
-make cpu_with_pybind
-```
-Important: this command can also be run with `make`.
-
-
-To compile the CPU library with the python binding + the associated unitary tests, run
-```
-make cpu_with_pybind_tests
-```
+More detailed information is available in the [Aidge User Guide](https://eclipse.dev/aidge/source/GetStarted/install.html).
diff --git a/include/aidge/backend/cpu.hpp b/include/aidge/backend/cpu.hpp
index 4134f5c50220198840e078dc78b8f37ded09b78e..963895c1adbff22a770851bbc5ba305007415f1e 100644
--- a/include/aidge/backend/cpu.hpp
+++ b/include/aidge/backend/cpu.hpp
@@ -21,6 +21,7 @@
 #include "aidge/backend/cpu/operator/BatchNormImpl.hpp"
 #include "aidge/backend/cpu/operator/ConvDepthWiseImpl.hpp"
 #include "aidge/backend/cpu/operator/ConvImpl.hpp"
+#include "aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp"
 #include "aidge/backend/cpu/operator/DivImpl.hpp"
 #include "aidge/backend/cpu/operator/ErfImpl.hpp"
 #include "aidge/backend/cpu/operator/FCImpl.hpp"
diff --git a/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp
index 141136bb46312d700eaede421c9789e74e2360ad..4a4ba2a8999c4dc33fc743b5a3a7dad023f9e0dd 100644
--- a/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp
@@ -43,16 +43,16 @@ void AddImpl_cpu_forward_kernel(const std::vector<const void*> inputs_, const st
 
 // Kernels registration to implementation entry point
 REGISTRAR(AddImpl_cpu,
-    {{DataType::Any}, {DataType::Float32}},
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float32}},
     {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<float, float>, nullptr});
 REGISTRAR(AddImpl_cpu,
-    {{DataType::Any}, {DataType::Float64}},
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}},
     {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<double, double>, nullptr});
 REGISTRAR(AddImpl_cpu,
-    {{DataType::Any}, {DataType::Int32}},
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}},
     {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr});
 REGISTRAR(AddImpl_cpu,
-    {{DataType::Any}, {DataType::Int64}},
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int64}},
     {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, nullptr});
 }  // namespace Aidge
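
The registrations above (and the analogous ones throughout this patch) replace brace-elided initializers with explicit `ImplSpec::IOSpec` values, making it clear that the first entry constrains the inputs and the second the outputs. As a rough mental model of the dispatch, here is a minimal, self-contained sketch of a spec-keyed kernel registry; it is not the actual Aidge `Registrar`, and `KernelSpec`/`KernelRegistry` are hypothetical names:

```cpp
#include <functional>
#include <map>
#include <stdexcept>
#include <utility>

enum class DataType { Any, Float32, Float64, Int32, Int64 };

// Key: (input spec, output spec); DataType::Any acts as a wildcard at lookup.
using KernelSpec = std::pair<DataType, DataType>;
using Kernel = std::function<void(const void*, const void*, void*)>;

struct KernelRegistry {
    std::map<KernelSpec, Kernel> kernels;

    // Each REGISTRAR(...) line above plays the role of one add() call,
    // executed at static-initialization time.
    void add(const KernelSpec& spec, Kernel k) { kernels[spec] = std::move(k); }

    // Rough analogue of getBestMatch(): exact match first, then fall back
    // to a wildcard on the input side.
    const Kernel& get(DataType in, DataType out) const {
        auto it = kernels.find({in, out});
        if (it == kernels.end()) { it = kernels.find({DataType::Any, out}); }
        if (it == kernels.end()) { throw std::runtime_error("no kernel registered"); }
        return it->second;
    }
};
```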
 
diff --git a/include/aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp b/include/aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..83e7e030f526e0db3cff4741eabe39e287130562
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp
@@ -0,0 +1,34 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_H_
+#define AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_H_
+
+#include <cstddef>
+#include <memory>
+#include <vector>
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/ConstantOfShape.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+using ConstantOfShapeImpl_cpu = OperatorImpl_cpu<ConstantOfShape_Op,
+    void(const std::vector<DimSize_t>, const Tensor&, void *)>;
+
+// Implementation entry point registration to Operator
+REGISTRAR(ConstantOfShape_Op, "cpu", Aidge::ConstantOfShapeImpl_cpu::create);
+} // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_H_ */
+
diff --git a/include/aidge/backend/cpu/operator/ConstantOfShapeImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConstantOfShapeImpl_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..18ab9c0a77c4545c955fc4fe1f1fc1cbcb763bf7
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/ConstantOfShapeImpl_kernels.hpp
@@ -0,0 +1,71 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_KERNELS_H_
+
+#include <aidge/data/Tensor.hpp>
+#include <aidge/data/half.hpp>
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <functional> // std::multiplies
+#include <numeric>    // std::accumulate
+#include <vector>
+
+#include "aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp"
+#include "aidge/data/Data.hpp"
+#include "aidge/utils/ErrorHandling.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+namespace Aidge {
+template <class O>
+void ConstantOfShapeImpl_cpu_forward_kernel(
+    const std::vector<DimSize_t> output_dims, const Tensor &value,
+    void *output_) {
+
+  O *output = static_cast<O *>(output_);
+  const O val = *static_cast<const O *>(value.getImpl()->hostPtr());
+  const std::size_t output_size =
+      std::accumulate(output_dims.begin(), output_dims.end(),
+                      static_cast<std::size_t>(1), std::multiplies<std::size_t>());
+  std::fill(output, output + output_size, val);
+}
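+
+// e.g. output_dims = {2, 3} with a scalar value tensor holding 4.2
+// fills the 2*3 = 6 contiguous output elements with 4.2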
+
+// Kernels registration to implementation entry point
+REGISTRAR(ConstantOfShapeImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Int64}, ImplSpec::IOSpec{DataType::Float16}},
+    {ProdConso::defaultModel, Aidge::ConstantOfShapeImpl_cpu_forward_kernel<half_float::half>, nullptr});
+REGISTRAR(ConstantOfShapeImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Int64}, ImplSpec::IOSpec{DataType::Float32}},
+    {ProdConso::defaultModel, Aidge::ConstantOfShapeImpl_cpu_forward_kernel<float>, nullptr});
+REGISTRAR(ConstantOfShapeImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Int64}, ImplSpec::IOSpec{DataType::Float64}},
+    {ProdConso::defaultModel, Aidge::ConstantOfShapeImpl_cpu_forward_kernel<double>, nullptr});
+REGISTRAR(ConstantOfShapeImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Int64}, ImplSpec::IOSpec{DataType::Int16}},
+    {ProdConso::defaultModel, Aidge::ConstantOfShapeImpl_cpu_forward_kernel<std::int16_t>, nullptr});
+REGISTRAR(ConstantOfShapeImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Int64}, ImplSpec::IOSpec{DataType::Int32}},
+    {ProdConso::defaultModel, Aidge::ConstantOfShapeImpl_cpu_forward_kernel<std::int32_t>, nullptr});
+REGISTRAR(ConstantOfShapeImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Int64}, ImplSpec::IOSpec{DataType::Int64}},
+    {ProdConso::defaultModel, Aidge::ConstantOfShapeImpl_cpu_forward_kernel<std::int64_t>, nullptr});
+} // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_KERNELS_H_ */
+
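
For context, a minimal usage sketch of the operator implemented above, mirroring the unit test added later in this patch (treat the exact calls as illustrative; they are taken from that test): the single Int64 input holds the requested output shape, and every output element is filled with the operator's `value` attribute.

```cpp
#include <cstdint>
#include <memory>

#include "aidge/backend/cpu.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/operator/ConstantOfShape.hpp"

int main() {
    using namespace Aidge;

    // Shape input {2, 3}: the output will hold 2*3 = 6 elements.
    auto shape = std::make_shared<Tensor>(Array1D<std::int64_t, 2>({2, 3}));
    shape->setBackend("cpu");

    // The fill value is carried as a scalar Tensor attribute.
    auto node = ConstantOfShape(Tensor(0.5));
    auto op = std::static_pointer_cast<ConstantOfShape_Op>(node->getOperator());
    op->setDataType(DataType::Float64);
    op->setBackend("cpu");
    op->associateInput(0, shape);

    op->forwardDims(true);   // propagates {2, 3} to the output dims
    op->forward();           // getOutput(0) now holds six elements equal to 0.5
    return 0;
}
```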
diff --git a/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
index ae433261658669deaa33f573369829a9cce08860..c57f86e6ac6e74acebb48f471991e7181920f7c3 100644
--- a/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
@@ -173,13 +173,13 @@ void FCImpl_cpu_backward_kernel(const DimSize_t batchSize,
 
 // Kernels registration to implementation entry point
 REGISTRAR(FCImpl_cpu,
-    {{DataType::Any}, {DataType::Float32}},
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float32}},
     {ProdConso::defaultModel, Aidge::FCImpl_cpu_forward_kernel<float, float, float, float>, Aidge::FCImpl_cpu_backward_kernel<float, float, float, float>});
 REGISTRAR(FCImpl_cpu,
-    {{DataType::Any}, {DataType::Float64}},
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}},
     {ProdConso::defaultModel, Aidge::FCImpl_cpu_forward_kernel<double, double, double, double>, Aidge::FCImpl_cpu_backward_kernel<double, double, double, double>});
 REGISTRAR(FCImpl_cpu,
-    {{DataType::Any}, {DataType::Int32}},
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}},
     {ProdConso::defaultModel, Aidge::FCImpl_cpu_forward_kernel<int32_t, int32_t, int32_t, int32_t>, Aidge::FCImpl_cpu_backward_kernel<int32_t, int32_t, int32_t, int32_t>});
 }  // namespace Aidge
 
diff --git a/include/aidge/backend/cpu/operator/GridSampleImpl.hpp b/include/aidge/backend/cpu/operator/GridSampleImpl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..697bb35a983bc108c2a5d65db3c08ef462ffcdbd
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/GridSampleImpl.hpp
@@ -0,0 +1,38 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_H_
+#define AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_H_
+
+#include <array>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/GridSample.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+using GridSampleImpl_cpu = OperatorImpl_cpu<GridSample_Op,
+    void(const GridSample_Op&,
+        const std::shared_ptr<Tensor>&,
+        const std::shared_ptr<Tensor>&,
+        const std::shared_ptr<Tensor>&)>;
+
+// Implementation entry point registration to Operator
+REGISTRAR(GridSample_Op, "cpu", Aidge::GridSampleImpl_cpu::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/GridSampleImpl_kernels.hpp b/include/aidge/backend/cpu/operator/GridSampleImpl_kernels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ea62fd010db8c155a3ff86ff8396797da5ebb6be
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/GridSampleImpl_kernels.hpp
@@ -0,0 +1,477 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_KERNELS_H_
+
+#include <algorithm>  // std::max, std::min
+#include <cmath>      // std::fabs, std::truncf, std::nearbyint
+#include <cstddef>    // std::size_t
+#include <cstdint>    // std::int64_t
+
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+#include "aidge/backend/cpu/operator/GridSampleImpl.hpp"
+#include "aidge/data/half.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+static bool in_bound(float coord, float lower_bound, float upper_bound) noexcept {
+    return (coord >= lower_bound) && (coord < upper_bound);
+}
+
+static float unnormalized_coord(float coord, float new_lower_bound, float new_upper_bound) noexcept {
+    return (coord + 1) / 2 * (new_upper_bound - new_lower_bound) + new_lower_bound;
+}
+
+// unused
+// static float normalized_coord(float coord, float prev_lower_bound, float prev_upper_bound) noexcept {
+//     return (coord + prev_lower_bound) / (prev_upper_bound-prev_lower_bound) * 2 - 1;
+// }
+
+static float unnormalize_grid_sample_coord(float coord, std::size_t size, bool align_corners) noexcept {
+    return align_corners ? unnormalized_coord(coord, 0.0f, static_cast<float>(size) - 1.0f)
+                         : unnormalized_coord(coord, -0.5f, static_cast<float>(size) - 0.5f);
+}
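+
+// e.g. with size = 4: align_corners = true maps [-1, 1] onto [0, 3]
+// (-1 -> 0, 0 -> 1.5, 1 -> 3), while align_corners = false maps
+// [-1, 1] onto [-0.5, 3.5] (-1 -> -0.5, 0 -> 1.5, 1 -> 3.5)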
+
+// unused
+// static float normalize_grid_sample_coord(float coord, std::size_t size, bool align_corners) noexcept {
+//     return align_corners ? normalized_coord(coord, 0.0f, static_cast<float>(size) - 1.0f)
+//                          : normalized_coord(coord, -0.5f, static_cast<float>(size) - 0.5f);
+// }
+
+static float update_normalized_coord_with_padding(float coord, Aidge::GridSample_Op::PaddingMode padding_mode) {
+    if (!in_bound(coord, -1.0f, 1.0f)) {
+        if (padding_mode == Aidge::GridSample_Op::PaddingMode::Border) {
+            coord = std::min(std::max(-1.0f, coord), 1.0f);
+        }
+        else if (padding_mode == Aidge::GridSample_Op::PaddingMode::Reflection) {
+            float abs_coord = std::fabs(coord);
+            float int_coord = std::truncf(abs_coord);
+            std::int32_t nb_refl = static_cast<std::int32_t>((int_coord - 1) / 2);
+            float res = ((nb_refl + 1)*2) - abs_coord;
+            coord = (coord > 0) ? (nb_refl % 2 == 0 ? res : -res) \
+                            : (nb_refl % 2 == 0 ? -res : res);
+        }
+
+    }
+    return coord;
+}
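+
+// e.g. Reflection folds out-of-range normalized coords back into [-1, 1]:
+// 1.5 -> 0.5 and -1.5 -> -0.5, while Border clamps them to the nearest bound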
+
+static inline std::int64_t update_unnormalized_coord_with_padding(std::int64_t coord, std::int64_t size, Aidge::GridSample_Op::PaddingMode padding_mode) {
+    if (!in_bound(coord, 0, size)) {
+        // out of bound. switch padding mode
+        if (padding_mode == Aidge::GridSample_Op::PaddingMode::Border) {
+            coord = std::min(std::max(std::int64_t(0), coord), size-std::int64_t(1));
+        } else if (padding_mode == Aidge::GridSample_Op::PaddingMode::Reflection) {
+            const std::int64_t quotient = coord / (size-1);
+            const std::int64_t remainer = std::abs(coord - quotient*(size-1));
+            coord = (quotient % 2 == 0) ? remainer : size - 1 - remainer;
+        }
+    }
+    return coord;
+}
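+
+// e.g. with size = 5: Border clamps -2 -> 0 and 7 -> 4, while Reflection
+// maps 5 -> 3 and -1 -> 1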
+
+namespace Aidge {
+/**
+ * @brief Forward kernel for 1D GridSample on CPU backend.
+ * @tparam I Input data type.
+ * @tparam O Output data type.
+ * @param op GridSample Operator, providing the mode, paddingMode and alignCorners attributes.
+ * @param in0 const input Tensor.
+ * @param in1 const grid Tensor.
+ * @param out Output Tensor.
+ */
+template <class I, class O>
+void GridSampleImpl1D_cpu_forward_kernel(const GridSample_Op& op,
+                            const std::shared_ptr<Tensor>& in0,
+                            const std::shared_ptr<Tensor>& in1,
+                            const std::shared_ptr<Tensor>& out)
+{
+    const I* const input = static_cast<const I * const>(in0->getImpl()->rawPtr());
+    const I* input_ptr = input;
+    float* const grid = static_cast<float* const>(in1->getImpl()->rawPtr());
+    float* grid_ptr = grid;
+    O* const output = static_cast<O* const>(out->getImpl()->rawPtr());
+    O* output_ptr = output;
+
+    const std::size_t N = in0->dim(0);
+    const std::size_t C = in0->dim(1);
+    const std::size_t in_H = in0->dim(2);
+    const std::size_t grid_H = in1->dim(1);
+
+    const std::size_t in_N_s = in0->stride(0);
+    const std::size_t in_C_s = in0->stride(1);
+    const std::size_t in_H_s = in0->stride(2);
+    const std::size_t grid_N_s = in1->stride(0);
+    const std::size_t grid_H_s = in1->stride(1);
+    const std::size_t out_N_s = out->stride(0);
+    const std::size_t out_C_s = out->stride(1);
+    const std::size_t out_H_s = out->stride(2);
+
+    float* grid_ptr_N = grid;
+    const I* input_ptr_N = input;
+    O* output_ptr_N = output;
+    for (std::size_t n = 0; n < N; ++n) {
+        grid_ptr = grid_ptr_N;
+        for (std::size_t grid_x = 0; grid_x < grid_H; ++grid_x) {
+            output_ptr = output_ptr_N + grid_x*out_H_s;
+            /*
+            * change grid_x coord to match padding_mode
+            * Change range from [-1, 1] to [0, H-1] or [-0.5, H-0.5] according to align_corners
+            * Handle computation of interpolation
+            *   any value outside bounds is considered 0
+            *   if nearest:
+            *   else if linear:
+            *   else if cubic:
+            *   else : nothing
+            */
+            float x = *grid_ptr;
+            x = update_normalized_coord_with_padding(x, op.paddingMode());
+            x = unnormalize_grid_sample_coord(x, in_H, op.alignCorners());
+            if (op.mode() == GridSample_Op::Mode::Nearest) {
+                const std::int64_t x_rounded = std::nearbyintf(x);
+
+                if (in_bound(x_rounded, 0, in_H)) {
+                    input_ptr = input_ptr_N + x_rounded*in_H_s;
+                    for (std::size_t c = 0; c < C; ++c) {
+                        *output_ptr = *input_ptr;
+                        input_ptr += in_C_s;
+                        output_ptr += out_C_s;
+                    }
+                } else {
+                    for (std::size_t c = 0; c < C; ++c) {
+                        *output_ptr = O(0);
+                        output_ptr += out_C_s;
+                    }
+                }
+            } else if (op.mode() == GridSample_Op::Mode::Linear) {
+                const std::int64_t x_inf = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(x)), in_H, op.paddingMode());
+                const std::int64_t x_sup = update_unnormalized_coord_with_padding(x_inf + 1, in_H, op.paddingMode());
+
+                const I* input_ptr_NC = input_ptr_N;
+                for (std::size_t c = 0; c < C; ++c) {
+                    const I f_inf = in_bound(x_inf, 0, in_H) ?
+                        input_ptr_NC[static_cast<std::size_t>(x_inf)*in_H_s] : I(0);
+                    const I f_sup = in_bound(x_sup, 0, in_H) ?
+                        input_ptr_NC[static_cast<std::size_t>(x_sup)*in_H_s] : I(0);
+
+                    // linear interpolation: f_inf is weighted by the distance
+                    // of x to x_sup, and f_sup by the distance of x to x_inf
+                    *output_ptr = static_cast<O>(static_cast<I>(x_sup - x)*f_inf \
+                            + static_cast<I>(x - x_inf)*f_sup);
+
+                    input_ptr_NC += in_C_s;
+                    output_ptr += out_C_s;
+                }
+            } else if (op.mode() == GridSample_Op::Mode::Cubic) {
+                const std::int64_t x_inf = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(x)), in_H, op.paddingMode());
+                const std::int64_t x_sup = update_unnormalized_coord_with_padding(x_inf + 1, in_H, op.paddingMode());
+                const std::int64_t x_inf_inf = update_unnormalized_coord_with_padding(x_inf - 1, in_H, op.paddingMode());
+                const std::int64_t x_sup_sup = update_unnormalized_coord_with_padding(x_sup + 1, in_H, op.paddingMode());
+
+                const I x1 = static_cast<I>(x - static_cast<float>(x_inf));
+                const I x2 = x1 * x1;
+                const I x3 = x1 * x2;
+
+                const I* input_ptr_NC = input_ptr_N;
+                for (std::size_t c = 0; c < C; ++c) {
+                    const I f_inf_inf = in_bound(x_inf_inf, 0, in_H) ? input_ptr_NC[x_inf_inf*in_H_s] : I(0);
+                    const I f_inf = in_bound(x_inf, 0, in_H) ? input_ptr_NC[x_inf*in_H_s] : I(0);
+                    const I f_sup = in_bound(x_sup, 0, in_H) ? input_ptr_NC[x_sup*in_H_s] : I(0);
+                    const I f_sup_sup = in_bound(x_sup_sup, 0, in_H) ? input_ptr_NC[x_sup_sup*in_H_s] : I(0);
+
+                    const I m_inf = (f_sup - f_inf_inf) / I(2);
+                    const I m_sup = (f_sup_sup - f_inf) / I(2);
+
+                    *output_ptr = f_inf \
+                        + x1 * m_inf \
+                        + x2 * (3 * (f_sup - f_inf) - 2 * m_inf - m_sup) \
+                        + x3 * (2*(f_inf - f_sup) + m_inf + m_sup);
+
+                    input_ptr_NC += in_C_s;
+                    output_ptr += out_C_s;
+                }
+            }
+
+            grid_ptr += grid_H_s;
+        }
+
+        input_ptr_N += in_N_s;
+        grid_ptr_N += grid_N_s;
+        output_ptr_N += out_N_s;
+    }
+}
+
+// Kernels registration to implementation entry point
+// only accepts a first input with a single spatial dimension (nb spatial dims = 1)
+REGISTRAR(GridSampleImpl_cpu,
+    {{{DataType::Any, DataFormat::Any, {{-1, -1}}}, {DataType::Any}}, {{DataType::Float16}}},
+    {ProdConso::defaultModel, Aidge::GridSampleImpl1D_cpu_forward_kernel<half_float::half, half_float::half>, nullptr});
+REGISTRAR(GridSampleImpl_cpu,
+    {{{DataType::Any, DataFormat::Any, {{-1, -1}}}, {DataType::Any}}, {{DataType::Float32}}},
+    {ProdConso::defaultModel, Aidge::GridSampleImpl1D_cpu_forward_kernel<float, float>, nullptr});
+REGISTRAR(GridSampleImpl_cpu,
+    {{{DataType::Any, DataFormat::Any, {{-1, -1}}}, {DataType::Any}}, {{DataType::Float64}}},
+    {ProdConso::defaultModel, Aidge::GridSampleImpl1D_cpu_forward_kernel<double, double>, nullptr});
+REGISTRAR(GridSampleImpl_cpu,
+    {{{DataType::Any, DataFormat::Any, {{-1, -1}}}, {DataType::Any}}, {{DataType::Int32}}},
+    {ProdConso::defaultModel, Aidge::GridSampleImpl1D_cpu_forward_kernel<int32_t, int32_t>, nullptr});
+
+
+/**
+ * @brief Forward kernel for 2D GridSample on CPU backend.
+ * @tparam I Input data type.
+ * @tparam O Output data type.
+ * @param op GridSample Operator, providing the mode, paddingMode and alignCorners attributes.
+ * @param in0 const input Tensor.
+ * @param in1 const grid Tensor.
+ * @param out Output Tensor.
+ */
+template <class I, class O>
+void GridSampleImpl2D_cpu_forward_kernel(const GridSample_Op& op,
+                            const std::shared_ptr<Tensor>& in0,
+                            const std::shared_ptr<Tensor>& in1,
+                            const std::shared_ptr<Tensor>& out)
+{
+    const I* input = static_cast<const I *>(in0->getImpl()->rawPtr());
+    const I* input_ptr = input;
+    float* const grid = static_cast<float* const>(in1->getImpl()->rawPtr());
+    float* grid_ptr = grid;
+    O* const output = static_cast<O* const>(out->getImpl()->rawPtr());
+
+    const std::size_t N = in0->dim(0);
+    const std::size_t C = in0->dim(1);
+    const std::size_t in_H = in0->dim(2);
+    const std::size_t in_W = in0->dim(3);
+    const std::size_t grid_H = in1->dim(1);
+    const std::size_t grid_W = in1->dim(2);
+
+    const std::size_t in_N_s = in0->stride(0);
+    const std::size_t in_C_s = in0->stride(1);
+    const std::size_t in_H_s = in0->stride(2);
+    const std::size_t in_W_s = in0->stride(3);
+    const std::size_t grid_N_s = in1->stride(0);
+    const std::size_t grid_H_s = in1->stride(1);
+    const std::size_t grid_W_s = in1->stride(2);
+    const std::size_t grid_Coord_s = in1->stride(3);
+    const std::size_t out_N_s = out->stride(0);
+    const std::size_t out_C_s = out->stride(1);
+    const std::size_t out_H_s = out->stride(2);
+    const std::size_t out_W_s = out->stride(3);
+
+
+    float* grid_ptr_N = grid;
+    const I* input_ptr_N = input;
+    O* output_ptr_N = output;
+    for (std::size_t n = 0; n < N; ++n) {
+        for (std::size_t grid_y = 0; grid_y < grid_H; ++grid_y) {
+            for (std::size_t grid_x = 0; grid_x < grid_W; ++grid_x) {
+                O* output_ptr = output_ptr_N + grid_y*out_H_s + grid_x*out_W_s;
+                grid_ptr = grid_ptr_N + grid_y*grid_H_s + grid_x*grid_W_s;
+                /*
+                * change grid_x coord to match padding_mode
+                * Change range from [-1, 1] to [0, H-1] or [-0.5, H-0.5] according to align_corners
+                * Handle computation of interpolation
+                *   any value outside bounds is considered 0
+                *   if nearest:
+                *   else if linear:
+                *   else if cubic:
+                *   else : nothing
+                */
+                float x = *grid_ptr;
+                float y = grid_ptr[grid_Coord_s];
+                x = update_normalized_coord_with_padding(x, op.paddingMode());
+                x = unnormalize_grid_sample_coord(x, in_W, op.alignCorners());
+                y = update_normalized_coord_with_padding(y, op.paddingMode());
+                y = unnormalize_grid_sample_coord(y, in_H, op.alignCorners());
+                if (op.mode() == GridSample_Op::Mode::Nearest) {
+                    const std::int64_t x_rounded = std::nearbyintf(x);
+                    const std::int64_t y_rounded = std::nearbyintf(y);
+
+                    if (in_bound(x_rounded, 0, in_W) && in_bound(y_rounded, 0, in_H)) {
+                        input_ptr = input_ptr_N + y_rounded*in_H_s + x_rounded*in_W_s;
+                        for (std::size_t c = 0; c < C; ++c) {
+                            *output_ptr = *input_ptr;
+                            input_ptr += in_C_s;
+                            output_ptr += out_C_s;
+                        }
+                    } else {
+                        for (std::size_t c = 0; c < C; ++c) {
+                            *output_ptr = O(0);
+                            output_ptr += out_C_s;
+                        }
+                    }
+                } else if (op.mode() == GridSample_Op::Mode::Linear) {
+                    const std::int64_t x_l = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(x)), in_W, op.paddingMode()); // left
+                    const std::int64_t x_r = update_unnormalized_coord_with_padding(x_l + 1, in_W, op.paddingMode()); // right
+
+                    const std::int64_t y_t = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(y)), in_H, op.paddingMode()); // top
+                    const std::int64_t y_b = update_unnormalized_coord_with_padding(y_t + 1, in_H, op.paddingMode()); // bottom
+
+                    const I* input_ptr_NC = input_ptr_N;
+                    for (std::size_t c = 0; c < C; ++c) {
+
+                        const I f_tl = (in_bound(x_l, 0, in_W) && in_bound(y_t, 0, in_H)) ?
+                            input_ptr_NC[static_cast<std::size_t>(y_t)*in_H_s
+                                         + static_cast<std::size_t>(x_l)*in_W_s]
+                                : I(0);
+                        const I f_tr = (in_bound(x_r, 0, in_W) && in_bound(y_t, 0, in_H)) ?
+                            input_ptr_NC[static_cast<std::size_t>(y_t)*in_H_s
+                                         + static_cast<std::size_t>(x_r)*in_W_s]
+                                : I(0);
+                        const I f_bl = (in_bound(x_l, 0, in_W) && in_bound(y_b, 0, in_H)) ?
+                            input_ptr_NC[static_cast<std::size_t>(y_b)*in_H_s
+                                         + static_cast<std::size_t>(x_l)*in_W_s]
+                                : I(0);
+                        const I f_br = (in_bound(x_r, 0, in_W) && in_bound(y_b, 0, in_H)) ?
+                            input_ptr_NC[static_cast<std::size_t>(y_b)*in_H_s
+                                         + static_cast<std::size_t>(x_r)*in_W_s]
+                                : I(0);
+
+                        // compute the weighted sum of the 4 corners: each corner
+                        // is weighted by the area of the rectangle opposite to it
+                        const I w_tl = static_cast<I>((static_cast<float>(y_b) - y)*(static_cast<float>(x_r) - x));
+                        const I w_tr = static_cast<I>((static_cast<float>(y_b) - y)*(x - static_cast<float>(x_l)));
+                        const I w_bl = static_cast<I>((y - static_cast<float>(y_t))*(static_cast<float>(x_r) - x));
+                        const I w_br = static_cast<I>((y - static_cast<float>(y_t))*(x - static_cast<float>(x_l)));
+
+                        *output_ptr = static_cast<O>(w_tl*f_tl + w_tr*f_tr + w_bl*f_bl + w_br*f_br);
+
+                        input_ptr_NC += in_C_s;
+                        output_ptr += out_C_s;
+                    }
+                } else if (op.mode() == GridSample_Op::Mode::Cubic) {
+                    /*
+                    *  .. .. .. .. .. ..
+                    *  .. 00 01 02 03 ..
+                    *  .. 10 11 12 13 ..
+                    *  .. 20 21 22 23 ..
+                    *  .. 30 31 32 33 ..
+                    *  .. .. .. .. .. ..
+                    */
+                    const std::int64_t x_1 = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(x)), in_W, op.paddingMode());
+                    const std::int64_t x_0 = update_unnormalized_coord_with_padding(x_1 - 1, in_W, op.paddingMode());
+                    const std::int64_t x_2 = update_unnormalized_coord_with_padding(x_1 + 1, in_W, op.paddingMode());
+                    const std::int64_t x_3 = update_unnormalized_coord_with_padding(x_1 + 2, in_W, op.paddingMode());
+
+                    const std::int64_t y_1 = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(y)), in_H, op.paddingMode());
+                    const std::int64_t y_0 = update_unnormalized_coord_with_padding(y_1 - 1, in_H, op.paddingMode());
+                    const std::int64_t y_2 = update_unnormalized_coord_with_padding(y_1 + 1, in_H, op.paddingMode());
+                    const std::int64_t y_3 = update_unnormalized_coord_with_padding(y_1 + 2, in_H, op.paddingMode());
+
+                    const I* input_ptr_NC = input_ptr_N;
+
+                    for (std::size_t c = 0; c < C; ++c) {
+                        const I f_00 = in_bound(x_0, 0, in_W) && in_bound(y_0, 0, in_H) ?
+                                        input_ptr_NC[x_0*in_W_s + y_0*in_H_s] : I(0);
+                        const I f_01 = in_bound(x_0, 0, in_W) && in_bound(y_1, 0, in_H) ?
+                                        input_ptr_NC[x_0*in_W_s + y_1*in_H_s] : I(0);
+                        const I f_02 = in_bound(x_0, 0, in_W) && in_bound(y_2, 0, in_H) ?
+                                        input_ptr_NC[x_0*in_W_s + y_2*in_H_s] : I(0);
+                        const I f_03 = in_bound(x_0, 0, in_W) && in_bound(y_3, 0, in_H) ?
+                                        input_ptr_NC[x_0*in_W_s + y_3*in_H_s] : I(0);
+                        const I f_10 = in_bound(x_1, 0, in_W) && in_bound(y_0, 0, in_H) ?
+                                        input_ptr_NC[x_1*in_W_s + y_0*in_H_s] : I(0);
+                        const I f_20 = in_bound(x_2, 0, in_W) && in_bound(y_0, 0, in_H) ?
+                                        input_ptr_NC[x_2*in_W_s + y_0*in_H_s] : I(0);
+                        const I f_30 = in_bound(x_3, 0, in_W) && in_bound(y_0, 0, in_H) ?
+                                        input_ptr_NC[x_3*in_W_s + y_0*in_H_s] : I(0);
+                        const I f_11 = in_bound(x_1, 0, in_W) && in_bound(y_1, 0, in_H) ?
+                                        input_ptr_NC[x_1*in_W_s + y_1*in_H_s] : I(0);
+                        const I f_12 = in_bound(x_1, 0, in_W) && in_bound(y_2, 0, in_H) ?
+                                        input_ptr_NC[x_1*in_W_s + y_2*in_H_s] : I(0);
+                        const I f_13 = in_bound(x_1, 0, in_W) && in_bound(y_3, 0, in_H) ?
+                                        input_ptr_NC[x_1*in_W_s + y_3*in_H_s] : I(0);
+                        const I f_21 = in_bound(x_2, 0, in_W) && in_bound(y_1, 0, in_H) ?
+                                        input_ptr_NC[x_2*in_W_s + y_1*in_H_s] : I(0);
+                        const I f_22 = in_bound(x_2, 0, in_W) && in_bound(y_2, 0, in_H) ?
+                                        input_ptr_NC[x_2*in_W_s + y_2*in_H_s] : I(0);
+                        const I f_23 = in_bound(x_2, 0, in_W) && in_bound(y_3, 0, in_H) ?
+                                        input_ptr_NC[x_2*in_W_s + y_3*in_H_s] : I(0);
+                        const I f_31 = in_bound(x_3, 0, in_W) && in_bound(y_1, 0, in_H) ?
+                                        input_ptr_NC[x_3*in_W_s + y_1*in_H_s] : I(0);
+                        const I f_32 = in_bound(x_3, 0, in_W) && in_bound(y_2, 0, in_H) ?
+                                        input_ptr_NC[x_3*in_W_s + y_2*in_H_s] : I(0);
+                        const I f_33 = in_bound(x_3, 0, in_W) && in_bound(y_3, 0, in_H) ?
+                                        input_ptr_NC[x_3*in_W_s + y_3*in_H_s] : I(0);
+
+                        const I mx_11 = (f_21 - f_01) / I(2);
+                        const I mx_12 = (f_22 - f_02) / I(2);
+                        const I mx_21 = (f_31 - f_11) / I(2);
+                        const I mx_22 = (f_32 - f_12) / I(2);
+
+                        const I my_11 = (f_12 - f_10) / I(2);
+                        const I my_12 = (f_13 - f_11) / I(2);
+                        const I my_21 = (f_22 - f_20) / I(2);
+                        const I my_22 = (f_23 - f_21) / I(2);
+
+                        const I mxy_11 = (f_22 - f_20 - f_02 + f_00) / I(4);
+                        const I mxy_12 = (f_23 - f_21 - f_03 + f_01) / I(4);
+                        const I mxy_21 = (f_32 - f_30 - f_12 + f_10) / I(4);
+                        const I mxy_22 = (f_33 - f_31 - f_13 + f_11) / I(4);
+
+                        const I a_00 = f_11;
+                        const I a_10 = mx_11;
+                        const I a_20 = I(3)*(f_21 - f_11) - I(2)*mx_11 - mx_21;
+                        const I a_30 = I(2)*(f_11 - f_21) + mx_11 + mx_21;
+                        const I a_01 = my_11;
+                        const I a_11 = mxy_11;
+                        const I a_21 = I(3)*(my_21 - my_11) - I(2)*mxy_11 - mxy_21;
+                        const I a_31 = I(2)*(my_11 - my_21) + mxy_11 + mxy_21;
+                        const I a_02 = I(3)*(f_12 - f_11) - I(2)*my_11 - my_12;
+                        const I a_12 = I(3)*(mx_12 - mx_11) - I(2)*mxy_11 - mxy_12;
+                        const I a_22 = I(9)*(f_11 + f_22 - f_21 - f_12) + I(3)*(I(2)*(mx_11 - mx_12 + my_11 - my_21) + mx_21 - mx_22 + my_12 - my_22) + mxy_22 + I(2)*(mxy_12 + mxy_21 + I(2)*mxy_11);
+                        const I a_32 = - mxy_12 - mxy_22 + I(2)*(my_22 - my_12 - mxy_11 - mxy_21 + I(2)*(my_21 - my_11) + I(3)*(f_21 + f_12 - f_11 - f_22)) + I(3)*(mx_12 + mx_22 - mx_11 - mx_21);
+                        const I a_03 = I(2)*(f_11 - f_12) + my_11 + my_12;
+                        const I a_13 = I(2)*(mx_11 - mx_12) + mxy_11 + mxy_12;
+                        const I a_23 = - mxy_21 - mxy_22 + I(2)*(-mx_21 + mx_22 - mxy_11 - mxy_12 + I(2)*(mx_12 - mx_11) + I(3)*(f_12 + f_21 - f_11 - f_22)) + I(3)*(my_21 + my_22 - my_11 - my_12);
+                        const I a_33 = mxy_11 + mxy_21 + mxy_12 + mxy_22 + I(2)*(mx_11 + mx_21 - mx_12 - mx_22 + my_11 - my_21 + my_12 - my_22 + I(2)*(f_11 - f_21 - f_12 + f_22));
+
+                        // evaluate the bicubic surface at the local coordinates
+                        // within the unit cell, not at the absolute coordinates
+                        const I lx = static_cast<I>(x - static_cast<float>(x_1));
+                        const I ly = static_cast<I>(y - static_cast<float>(y_1));
+                        const I x2 = lx*lx;
+                        const I x3 = lx*lx*lx;
+                        const I y2 = ly*ly;
+                        const I y3 = ly*ly*ly;
+
+                        *output_ptr = static_cast<O>( \
+                            a_00 + a_10*lx + a_20*x2 + a_30*x3 \
+                            + a_01*ly + a_11*lx*ly + a_21*x2*ly + a_31*x3*ly \
+                            + a_02*y2 + a_12*lx*y2 + a_22*x2*y2 + a_32*x3*y2 \
+                            + a_03*y3 + a_13*lx*y3 + a_23*x2*y3 + a_33*x3*y3);
+
+                        input_ptr_NC += in_C_s;
+                        output_ptr += out_C_s;
+                    }
+                }
+            }
+        }
+
+        input_ptr_N += in_N_s;
+        grid_ptr_N += grid_N_s;
+        output_ptr_N += out_N_s;
+    }
+}
+
+// Kernels registration to implementation entry point
+// only accepts a first input with two spatial dimensions (nb spatial dims = 2)
+REGISTRAR(GridSampleImpl_cpu,
+    {{{DataType::Any, DataFormat::Any, {{-1, -1}, {-1, -1}}}, {DataType::Any}}, {{DataType::Float16}}},
+    {ProdConso::defaultModel, Aidge::GridSampleImpl2D_cpu_forward_kernel<half_float::half, half_float::half>, nullptr});
+REGISTRAR(GridSampleImpl_cpu,
+    {{{DataType::Any, DataFormat::Any, {{-1, -1}, {-1, -1}}}, {DataType::Any}}, {{DataType::Float32}}},
+    {ProdConso::defaultModel, Aidge::GridSampleImpl2D_cpu_forward_kernel<float, float>, nullptr});
+REGISTRAR(GridSampleImpl_cpu,
+    {{{DataType::Any, DataFormat::Any, {{-1, -1}, {-1, -1}}}, {DataType::Any}}, {{DataType::Float64}}},
+    {ProdConso::defaultModel, Aidge::GridSampleImpl2D_cpu_forward_kernel<double, double>, nullptr});
+REGISTRAR(GridSampleImpl_cpu,
+    {{{DataType::Any, DataFormat::Any, {{-1, -1}, {-1, -1}}}, {DataType::Any}}, {{DataType::Int32}}},
+    {ProdConso::defaultModel, Aidge::GridSampleImpl2D_cpu_forward_kernel<int32_t, int32_t>, nullptr});
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_KERNELS_H_ */
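
As a quick numeric check of the bilinear path above, here is a standalone sketch (independent of Aidge) that samples the centre of a 2x2 image with `align_corners = true`. Each corner is weighted by the area of the rectangle opposite to it, so the centre sample comes out as the mean of the four pixels:

```cpp
#include <cmath>
#include <cstdio>

int main() {
    const float img[2][2] = {{0.f, 1.f}, {2.f, 3.f}};
    // The normalized grid coordinate 0 maps, with align_corners = true,
    // to (0 + 1) / 2 * (2 - 1) = 0.5 in pixel space.
    const float x = 0.5f, y = 0.5f;
    const int x_l = static_cast<int>(std::floor(x)), x_r = x_l + 1;  // 0, 1
    const int y_t = static_cast<int>(std::floor(y)), y_b = y_t + 1;  // 0, 1
    // Each corner weighted by the area of the opposite rectangle.
    const float out = img[y_t][x_l]*(y_b - y)*(x_r - x)
                    + img[y_t][x_r]*(y_b - y)*(x - x_l)
                    + img[y_b][x_l]*(y - y_t)*(x_r - x)
                    + img[y_b][x_r]*(y - y_t)*(x - x_l);
    std::printf("%f\n", out);  // 1.500000, the mean of the four pixels
    return 0;
}
```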
diff --git a/include/aidge/backend/cpu/operator/MulImpl.hpp b/include/aidge/backend/cpu/operator/MulImpl.hpp
index f70de8cc5324d32260e6ee757560514c9c6db04b..05fceba17471229d83d9f8738614b2e747121b49 100644
--- a/include/aidge/backend/cpu/operator/MulImpl.hpp
+++ b/include/aidge/backend/cpu/operator/MulImpl.hpp
@@ -23,7 +23,22 @@
 namespace Aidge {
 // Operator implementation entry point for the backend
 using MulImpl_cpu = OperatorImpl_cpu<Mul_Op,
-    void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*,void*)>;
+    void(const std::vector<std::size_t>&,
+        const std::vector<std::size_t>&, 
+        const std::vector<std::size_t>&, 
+        const void*, 
+        const void*,
+        void*),
+    void(const std::size_t, 
+        const std::size_t, 
+        const std::size_t,
+        const std::vector<std::size_t>,
+        const std::vector<std::size_t>,
+        const void*, 
+        const void*, 
+        const void*, 
+        void*, 
+        void*)>;
 
 // Implementation entry point registration to Operator
 REGISTRAR(Mul_Op, "cpu", Aidge::MulImpl_cpu::create);
diff --git a/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp
index c1a976c52aed92f5be4de1a7be42e0b13670c0b8..c015b8f0182608fecd3da94220e9411decfd186c 100644
--- a/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp
@@ -48,19 +48,79 @@ void MulImpl_cpu_forward_kernel(const std::vector<std::size_t>& input1Dims,
     }
 }
 
+template <class I1, class I2, class O>
+void MulImpl_cpu_backward_kernel(const std::size_t input0Length, 
+                                 const std::size_t input1Length,
+                                 const std::size_t grad0Length,
+                                 const std::vector<std::size_t> input0Dims,
+                                 const std::vector<std::size_t> input1Dims,
+                                 const void* input0_, 
+                                 const void* input1_, 
+                                 const void* grad_output_, 
+                                 void* gradientInput0,
+                                 void* gradientInput1)
+{
+    const auto* input0 = static_cast<const I1*>(input0_);
+    const auto* input1 = static_cast<const I2*>(input1_);
+    const auto* grad_output = static_cast<const O*>(grad_output_);
+    auto* grad_input_0 = static_cast<I1*>(gradientInput0);
+    auto* grad_input_1 = static_cast<I2*>(gradientInput1);
+
+
+    if(input0Dims.size() >= input1Dims.size())
+    {
+        AIDGE_ASSERT(input0Length == grad0Length, "Incorrect dimensions between Mul input and output tensors");
+
+        for(auto i = 0U; i < input0Length; ++i)
+        {
+            const auto indices = getMultiDimIndices(input1Dims, i);
+            const auto flattenedIndex = getFlattenedIndex(input1Dims, indices);
+
+            grad_input_0[i] = input1[flattenedIndex] * grad_output[i];
+        }
+
+        for(std::size_t i = 0 ; i < grad0Length; ++i)
+        {
+            const auto indices = getMultiDimIndices(input1Dims, i);
+            const auto flattenedIndex = getFlattenedIndex(input1Dims, indices);
+
+            grad_input_1[flattenedIndex] += input0[i] * grad_output[i];
+        }
+
+    } else {
+        AIDGE_ASSERT(input1Length == grad0Length, "Incorrect dimensions between Mul input and output tensors");
+
+        for(auto i = 0U; i < input1Length; ++i)
+        {
+            const auto indices = getMultiDimIndices(input0Dims, i);
+            const auto flattenedIndex = getFlattenedIndex(input0Dims, indices);
+
+            grad_input_1[i] = input0[flattenedIndex] * grad_output[i];
+        }
+
+        for(std::size_t i = 0 ; i < grad0Length; ++i)
+        {
+            const auto indices = getMultiDimIndices(input0Dims, i);
+            const auto flattenedIndex = getFlattenedIndex(input0Dims, indices);
+
+            grad_input_0[flattenedIndex] += input1[i] * grad_output[i];
+        }
+    }
+}
+
 // Kernels registration to implementation entry point
 REGISTRAR(MulImpl_cpu,
     {DataType::Float32},
-    {ProdConso::inPlaceModel, Aidge::MulImpl_cpu_forward_kernel<float, float, float>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::MulImpl_cpu_forward_kernel<float, float, float>, Aidge::MulImpl_cpu_backward_kernel<float, float, float>});
 REGISTRAR(MulImpl_cpu,
     {DataType::Float64},
-    {ProdConso::inPlaceModel, Aidge::MulImpl_cpu_forward_kernel<double, double, double>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::MulImpl_cpu_forward_kernel<double, double, double>, Aidge::MulImpl_cpu_backward_kernel<double, double, double>});
 REGISTRAR(MulImpl_cpu,
     {DataType::Int32},
-    {ProdConso::inPlaceModel, Aidge::MulImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::MulImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>, Aidge::MulImpl_cpu_backward_kernel<std::int32_t, std::int32_t, std::int32_t>});
 REGISTRAR(MulImpl_cpu,
     {DataType::Int64},
-    {ProdConso::inPlaceModel, Aidge::MulImpl_cpu_forward_kernel<std::int64_t, std::int64_t, std::int64_t>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::MulImpl_cpu_forward_kernel<std::int64_t, std::int64_t, std::int64_t>, Aidge::MulImpl_cpu_backward_kernel<std::int64_t, std::int64_t, std::int64_t>});
 }  // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_MULIMPL_KERNELS_H_ */
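
To make the broadcast logic of `MulImpl_cpu_backward_kernel` concrete, here is a standalone numeric sketch for `A` of shape `{2,3}`, `B` of shape `{3}` and an all-ones output gradient: `dA` picks up `B` broadcast across the batch dimension, while `dB` reduces `A` over it (assuming, as the kernel's `+=` implies, that the accumulated gradient buffer starts at zero). The resulting `dB = {5, 7, 9}` matches the "Case 1" expectations in the unit test added below.

```cpp
#include <cstdio>

int main() {
    const float A[2][3] = {{1.f, 2.f, 3.f}, {4.f, 5.f, 6.f}};
    const float B[3]    = {0.1f, 0.2f, 0.3f};
    float dA[2][3] = {};  // gradient w.r.t. A
    float dB[3]    = {};  // gradient w.r.t. B, accumulated over the batch
    for (int i = 0; i < 2; ++i) {
        for (int j = 0; j < 3; ++j) {
            const float dY = 1.f;        // upstream gradient (all ones)
            dA[i][j] = B[j] * dY;        // d(A*B)/dA = B, broadcast
            dB[j]   += A[i][j] * dY;     // reduce over the broadcast axis
        }
    }
    std::printf("dB = {%g, %g, %g}\n", dB[0], dB[1], dB[2]);  // dB = {5, 7, 9}
    return 0;
}
```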
diff --git a/include/aidge/backend/cpu/operator/PadImpl_kernels.hpp b/include/aidge/backend/cpu/operator/PadImpl_kernels.hpp
index 679c0b17263d92de51584a316e829eda588e3563..a362be0944aa18c36dd74a2f0066aaa21a1fc4c0 100644
--- a/include/aidge/backend/cpu/operator/PadImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/PadImpl_kernels.hpp
@@ -130,7 +130,7 @@ void PadImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 4>& beginEndBorder
 
             for (std::uint32_t oy = 0; oy < oySize; ++oy) {
                 for (std::uint32_t ox = 0; ox < oxSize; ++ox) {
-                    const std::size_t oIndexFull = oIndex + ox*oySize + oy;
+                    const std::size_t oIndexFull = oIndex + oy*oxSize + ox;
 
                     O outputValue = static_cast<O>(borderValue);
 
@@ -139,14 +139,14 @@ void PadImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 4>& beginEndBorder
                         std::int32_t iy = static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[1]);
 
                         if (ix >= 0  && ix < static_cast<std::int32_t>(dims[3]) && iy >= 0  && iy < static_cast<std::int32_t>(dims[2])) {
-                            outputValue = input[iIndex + static_cast<std::size_t>(ix)*dims[2] + static_cast<std::size_t>(iy)];
+                            outputValue = input[iIndex + static_cast<std::size_t>(iy)*dims[3] + static_cast<std::size_t>(ix)];
                         }
                     }
                     else if (borderType == PadBorderType::Edge) {
                         std::int32_t ix = std::max(0, std::min(static_cast<std::int32_t>(dims[3]) - 1, static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[3])));
                         std::int32_t iy = std::max(0, std::min(static_cast<std::int32_t>(dims[2]) - 1, static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[1])));
 
-                        outputValue = input[iIndex + static_cast<std::size_t>(ix)*dims[2] + static_cast<std::size_t>(iy)];
+                        outputValue = input[iIndex + static_cast<std::size_t>(iy)*dims[3] + static_cast<std::size_t>(ix)];
                     }
                     else if (borderType == PadBorderType::Reflect) {
                         std::int32_t ix = static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[3]);
@@ -161,13 +161,13 @@ void PadImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 4>& beginEndBorder
                         if (iy >= static_cast<std::int32_t>(dims[2]))
                             iy = static_cast<std::int32_t>(dims[2]) - iy;
 
-                        outputValue = input[iIndex + static_cast<std::size_t>(ix)*dims[2] + static_cast<std::size_t>(iy)];
+                        outputValue = input[iIndex + static_cast<std::size_t>(iy)*dims[3] + static_cast<std::size_t>(ix)];
                     }
                     else if (borderType == PadBorderType::Wrap) {
                         std::int32_t ix = (static_cast<std::int32_t>(dims[3]) + static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[3])) % static_cast<std::int32_t>(dims[3]);
                         std::int32_t iy = (static_cast<std::int32_t>(dims[2]) + static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[1])) % static_cast<std::int32_t>(dims[2]);
 
-                        outputValue = input[iIndex + static_cast<std::size_t>(ix)*dims[2] + static_cast<std::size_t>(iy)];
+                        outputValue = input[iIndex + static_cast<std::size_t>(iy)*dims[3] + static_cast<std::size_t>(ix)];
                     }
 
                     output[oIndexFull] = outputValue;
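
The hunk above fixes the plane addressing to the row-major convention: element `(iy, ix)` of an `H x W` plane lives at offset `iy*W + ix` (the x and y strides were previously swapped). A standalone sketch of zero padding under that convention, for a 2x3 input with a one-pixel border:

```cpp
#include <cstdio>
#include <vector>

int main() {
    const int H = 2, W = 3, pad = 1;
    const int oH = H + 2*pad, oW = W + 2*pad;
    const std::vector<float> in = {1, 2, 3, 4, 5, 6};  // 2x3, row-major
    std::vector<float> out(oH * oW, 0.f);              // border value 0
    for (int oy = 0; oy < oH; ++oy) {
        for (int ox = 0; ox < oW; ++ox) {
            const int iy = oy - pad, ix = ox - pad;
            if (iy >= 0 && iy < H && ix >= 0 && ix < W)
                out[oy*oW + ox] = in[iy*W + ix];       // row-major on both sides
        }
    }
    for (int oy = 0; oy < oH; ++oy, std::printf("\n"))
        for (int ox = 0; ox < oW; ++ox)
            std::printf("%g ", out[oy*oW + ox]);
    return 0;
}
```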
diff --git a/src/operator/ConstantOfShapeImpl.cpp b/src/operator/ConstantOfShapeImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..16e4b762ba04e5f01bfccf965f6de3650fa2e734
--- /dev/null
+++ b/src/operator/ConstantOfShapeImpl.cpp
@@ -0,0 +1,44 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include "aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp"
+
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "aidge/backend/cpu/operator/ConstantOfShapeImpl_kernels.hpp"
+#include "aidge/data/Data.hpp"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/operator/ConstantOfShape.hpp"
+#include "aidge/utils/ErrorHandling.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+template <>
+void Aidge::ConstantOfShapeImpl_cpu::forward() {
+    const ConstantOfShape_Op &op_ = static_cast<const ConstantOfShape_Op &>(mOp);
+    // Check if input is provided
+    AIDGE_ASSERT(op_.getInput(0), "{} : Missing input 0", __func__);
+
+    // Find the correct kernel type
+    const auto impl = Registrar<ConstantOfShapeImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Call kernel
+    impl.forward(op_.getOutput(0)->dims(),
+                 op_.value(),
+                 op_.getOutput(0)->getImpl()->rawPtr());
+}
+
+template <>
+void Aidge::ConstantOfShapeImpl_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for ConstantOfShape_Op on backend cpu");
+}
diff --git a/src/operator/GridSampleImpl.cpp b/src/operator/GridSampleImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5b87390fc3de21d5d406d893e4827e80cce06c35
--- /dev/null
+++ b/src/operator/GridSampleImpl.cpp
@@ -0,0 +1,48 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include "aidge/backend/cpu/operator/GridSampleImpl.hpp"
+
+#include <functional>
+#include <vector>
+
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+#include "aidge/backend/cpu/operator/GridSampleImpl_kernels.hpp"
+#include "aidge/operator/GridSample.hpp"
+#include "aidge/utils/Types.h"
+
+template <>
+void Aidge::GridSampleImpl_cpu::forward() {
+    const auto& op_ = static_cast<const GridSample_Op&>(mOp);
+
+    // Find the correct kernel type
+    const auto impl = Registrar<GridSampleImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Convert input data (no overhead if not needed!)
+    // TODO: right now, if needed, memory will be allocated/deallocated at each
+    // call to forward(). We might put the following shared_ptr as members of
+    // this class to avoid that.
+    std::shared_ptr<Tensor> input0Fallback, input1Fallback;
+    const auto& input0 = std::make_shared<Tensor>(op_.getInput(0)->refCastFrom(input0Fallback, *op_.getOutput(0)));
+    const auto& input1 = std::make_shared<Tensor>(op_.getInput(1)->refCastFrom(input1Fallback, *op_.getOutput(0)));
+
+    // Call kernel
+    impl.forward(op_,
+            input0, // input
+            input1, // grid
+            op_.getOutput(0) // output
+            );
+}
+
+template <>
+void Aidge::GridSampleImpl_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for GridSample_Op on backend cpu");
+}
diff --git a/src/operator/MulImpl.cpp b/src/operator/MulImpl.cpp
index 541e931302f61c09d7c96435d40dcf5dde10b133..ea5e3d3ab8ac24934a0cb6f9042858fa094700af 100644
--- a/src/operator/MulImpl.cpp
+++ b/src/operator/MulImpl.cpp
@@ -44,5 +44,26 @@ void Aidge::MulImpl_cpu::forward() {
 
 template <>
 void Aidge::MulImpl_cpu::backward() {
-    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Mul_Op on backend cpu");
+    const Mul_Op& op_ = dynamic_cast<const Mul_Op&>(mOp);
+    
+    auto in0 = op_.getInput(0);
+    auto in1 = op_.getInput(1);
+    auto in0grad = op_.getInput(0)->grad();
+    auto in1grad = op_.getInput(1)->grad();
+    auto out0grad = op_.getOutput(0)->grad();
+
+    // Find the correct kernel type
+    const auto impl = Registrar<MulImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Call kernel
+    impl.backward(/* input0Length */ in0grad->size(), 
+               /* input1Length */ in1grad->size(),
+               /* grad0Length  */ out0grad->size(),
+               /* input0Dims   */ in0->dims(),
+               /* input1Dims   */ in1->dims(),
+               getCPUPtr(in0), 
+               getCPUPtr(in1), 
+               getCPUPtr(out0grad), 
+               getCPUPtr(in0grad), 
+               getCPUPtr(in1grad));
 }
diff --git a/unit_tests/operator/Test_ConstantOfShapeImpl.cpp b/unit_tests/operator/Test_ConstantOfShapeImpl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..42505d385fde7e72e09531f1607287ffc6978f75
--- /dev/null
+++ b/unit_tests/operator/Test_ConstantOfShapeImpl.cpp
@@ -0,0 +1,120 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <cstddef> // std::size_t
+#include <cstdint> // std::uint16_t
+#include <iostream>
+#include <memory>
+#include <numeric> // std::accumulate
+#include <ostream>
+#include <random> // std::random_device, std::mt19937, std::uniform_real_distribution
+
+#include "catch2/internal/catch_compiler_capabilities.hpp"
+#include "catch2/internal/catch_enforce.hpp"
+#include <catch2/catch_test_macros.hpp>
+#include <catch2/generators/catch_generators_random.hpp>
+
+#include "aidge/data/Tensor.hpp"
+#include "aidge/operator/ConstantOfShape.hpp"
+#include "aidge/utils/TensorUtils.hpp"
+#include <aidge/data/Data.hpp>
+#include <aidge/data/half.hpp>
+#include <aidge/filler/Filler.hpp>
+#include <aidge/operator/OperatorTensor.hpp>
+#include <aidge/operator/Reshape.hpp>
+#include <aidge/utils/TensorUtils.hpp>
+#include <aidge/utils/Types.h>
+
+namespace Aidge {
+TEST_CASE("[cpu/operator] ConstantOfShape", "[ConstantOfShape][CPU]") {
+  constexpr std::uint16_t NBTRIALS = 10;
+  // Create a random number generator
+  auto random_seed = Catch::Generators::Detail::getSeed;
+  std::mt19937 gen(random_seed());
+  std::uniform_real_distribution<float> valueDist(
+      0.1f, 1.1f); // Random float distribution between 0.1 and 1.1
+  std::uniform_int_distribution<DimSize_t> input_tensor_size_dist(
+      std::size_t(1), std::size_t(10));
+  std::uniform_int_distribution<int64_t> input_tensor_values_dist(
+      std::size_t(1), std::size_t(7));
+  std::uniform_real_distribution<double> operator_attr_value_dist(-100., 100.);
+
+  ///////////////////////////////////////////////
+  // SETUP FUNCTIONS
+  auto generate_input_tensor =
+      [&gen, &input_tensor_size_dist,
+       &input_tensor_values_dist]() -> std::shared_ptr<Tensor> {
+    std::vector<DimSize_t> input_dims;
+    input_dims.push_back(input_tensor_size_dist(gen));
+
+    auto result = std::make_shared<Tensor>(input_dims);
+    result->setDataType(DataType::Int64);
+    result->setBackend("cpu");
+    for (DimSize_t i = 0; i < result->size(); ++i) {
+      result->set<int64_t>(i, input_tensor_values_dist(gen));
+    }
+    return result;
+  };
+
+  auto generate_random_operator =
+      [&gen,
+       &operator_attr_value_dist]() -> std::shared_ptr<ConstantOfShape_Op> {
+    auto node = ConstantOfShape(Tensor(operator_attr_value_dist(gen)));
+    auto op = std::static_pointer_cast<ConstantOfShape_Op>(node->getOperator());
+    op->setDataType(DataType::Float64);
+    op->setBackend("cpu");
+    return op;
+  };
+
+  auto generate_output_tensor = [](std::shared_ptr<Tensor> input_tensor,
+                                   std::shared_ptr<ConstantOfShape_Op> op) {
+    std::vector<DimSize_t> output_dims;
+    output_dims.reserve(input_tensor->size());
+    for (DimSize_t i = 0; i < input_tensor->size(); ++i) {
+      output_dims.push_back(input_tensor->get<int64_t>(i));
+    }
+    auto result = std::make_shared<Tensor>(output_dims);
+    result->setDataType(op->value().dataType());
+    result->setBackend("cpu");
+    constantFiller(result, op->value().get<double>(0));
+    return result;
+  };
+
+  /////////////////////////////////////
+  // BENCHMARKING
+  std::chrono::time_point<std::chrono::system_clock> start;
+  std::chrono::time_point<std::chrono::system_clock> end;
+  std::chrono::duration<double, std::micro> duration{};
+  int number_of_operation{0};
+
+  SECTION("ConstantOfShapeImpl_cpu::forward()") {
+    for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+      auto input_T = generate_input_tensor();
+      std::shared_ptr<ConstantOfShape_Op> op = generate_random_operator();
+      auto output_T = generate_output_tensor(input_T, op);
+      op->associateInput(0, input_T);
+
+      REQUIRE(op->forwardDims(true));
+      REQUIRE_NOTHROW(op->forward());
+
+      CHECK(output_T->nbDims() == op->getOutput(0)->nbDims());
+      for (DimIdx_t i = 0; i < output_T->nbDims(); ++i) {
+        CHECK(output_T->dims().at(i) == op->getOutput(0)->dims().at(i));
+      }
+      CHECK(approxEq<double>(*output_T, *op->getOutput(0)));
+    }
+  }
+}
+} // namespace Aidge
+
diff --git a/unit_tests/operator/Test_MulImpl.cpp b/unit_tests/operator/Test_MulImpl.cpp
index 9d592d31e1999f63fb0ebe3f5ad9d19e85c8645c..3378861d0d3d7e74e7867c2765a0b09069fa8caf 100644
--- a/unit_tests/operator/Test_MulImpl.cpp
+++ b/unit_tests/operator/Test_MulImpl.cpp
@@ -24,6 +24,348 @@
 
 namespace Aidge {
 
+    TEST_CASE("[CPU/Operator] Mul Backward", "[Mul][CPU][Backward]")
+    {
+        std::shared_ptr<Node> myMul = Mul();
+        auto op = std::static_pointer_cast<OperatorTensor>(myMul->getOperator());
+        op->setDataType(DataType::Float32);
+        op->setBackend("cpu");
+
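+        // For Y = A * B with broadcasting, the expected gradients are
+        // dL/dA = dL/dY * B (B broadcast to A's shape) and
+        // dL/dB = dL/dY * A summed over the broadcast dimensions.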
+        SECTION("Case 1: 2D and 1D tensors") {
+            const auto T0 = std::make_shared<Tensor>(Array2D<float,2,3>(
+                {
+                    {
+                        {1,2,3},{4,5,6}
+                    }
+                }
+            ));
+
+            const auto T1 = std::make_shared<Tensor>(Array1D<float,3>(
+                {0.1,0.2,0.3}
+            ));
+
+            T0->setDataType(DataType::Float32);
+            T0->setBackend("cpu");
+            T1->setDataType(DataType::Float32);
+            T1->setBackend("cpu");
+
+            op->getOutput(0)->setGrad(std::make_shared<Tensor>(Array2D<float,2,3>({{{1.0,1.0,1.0},{1.0,1.0,1.0}}})));
+
+            op->associateInput(0,T0);
+            op->associateInput(1,T1);
+            op->forwardDims();
+
+            myMul->forward();
+            myMul->backward();
+
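+            // dL/dT0 broadcasts T1 over the rows; dL/dT1 sums dL/dY * T0
+            // over axis 0: {1+4, 2+5, 3+6} = {5, 7, 9}.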
+            auto T0Grad = std::make_shared<Tensor>(Array2D<float, 2,3>({{{0.1,0.2,0.3},{0.1, 0.2, 0.3}}}));
+            auto T1Grad = std::make_shared<Tensor>(Array1D<float, 3>({5,7,9}));
+
+            REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *T0Grad));
+            REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *T1Grad));
+        }
+
+        SECTION("Case 2: 3D and 1D tensors") {
+            const auto T0 = std::make_shared<Tensor>(Array3D<float,2,2,3>(
+                {
+                    {
+                        {
+                            {1.0, 2.0, 3.0},
+                            {4.0, 5.0, 6.0}
+                        },
+                        {
+                            {7.0, 8.0, 9.0},
+                            {10.0, 11.0, 12.0}
+                        }
+                    }
+                }
+            ));
+
+            const auto T1 = std::make_shared<Tensor>(Array1D<float, 3>({0.3,0.2,0.1}));
+
+            const auto newGrad = std::make_shared<Tensor>(Array3D<float,2,2,3>(
+                    {
+                        {
+                            {
+                                {1, 1, 1},
+                                {1, 1, 1}
+                            },
+                            {
+                                {1, 1, 1},
+                                {1, 1, 1}
+                            }
+                        }
+                    }
+                ));
+
+            const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float,2,2,3>(
+                {
+                    {
+                        {
+                            {0.3, 0.2, 0.1},
+                            {0.3, 0.2, 0.1}
+                        },
+                        {
+                            {0.3, 0.2, 0.1},
+                            {0.3, 0.2, 0.1}
+                        }
+                    }
+                }
+            ));
+
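+            // dL/dT1 sums dL/dY * T0 over the two leading axes, e.g.
+            // 1 + 4 + 7 + 10 = 22 for the first coefficient.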
+            const auto expectedGrad1 = std::make_shared<Tensor>(Array1D<float,3>(
+                {22.0, 26.0, 30.0}
+            ));
+
+            for (const auto& T : {T0, T1, newGrad, expectedGrad0, expectedGrad1})
+            {
+                T->setBackend("cpu");
+                T->setDataType(DataType::Float32);
+            }
+
+            op->associateInput(0, T0);
+            op->associateInput(1, T1);
+            op->getOutput(0)->setGrad(newGrad);
+            op->forwardDims();
+
+            myMul->backward();
+
+            REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
+            REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1));
+        }
+
+        SECTION("Case 3: 4D and 2D tensors") {
+            const auto T0 = std::make_shared<Tensor>(Array4D<float,2, 2, 3, 3>(
+                {
+                    {
+                        {
+                            {
+                                {1.0, 2.0, 3.0},
+                                {4.0, 5.0, 6.0},
+                                {7.0, 8.0, 9.0}
+                            },
+                            {
+                                {10.0, 11.0, 12.0},
+                                {13.0, 14.0, 15.0},
+                                {16.0, 17.0, 18.0}
+                            }
+                        },
+                        {
+                            {
+                                {19.0, 20.0, 21.0},
+                                {22.0, 23.0, 24.0},
+                                {25.0, 26.0, 27.0}
+                            },
+                            {
+                                {28.0, 29.0, 30.0},
+                                {31.0, 32.0, 33.0},
+                                {34.0, 35.0, 36.0}
+                            }
+                        }
+                    }
+                }
+            ));
+
+            const auto T1 = std::make_shared<Tensor>(Array2D<float, 3,3>(
+                {
+                    {
+                        {0.5,0.3,0.1},
+                        {0.4,0.2,0.6},
+                        {0.7,0.8,0.9}
+                    }
+                }
+            ));
+
+            const auto newGrad = std::make_shared<Tensor>(Array4D<float,2, 2, 3, 3>(
+                {
+                    {
+                        {
+                            {
+                                {1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0}
+                            },
+                            {
+                                {1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0}
+                            }
+                        },
+                        {
+                            {
+                                {1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0}
+                            },
+                            {
+                                {1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0},
+                                {1.0, 1.0, 1.0}
+                            }
+                        }
+                    }
+                }
+            ));
+
+            const auto expectedGrad0 = std::make_shared<Tensor>(Array4D<float,2,2,3,3>(
+                {
+                    {
+                        {
+                            {
+                                {0.5, 0.3, 0.1},
+                                {0.4, 0.2, 0.6},
+                                {0.7, 0.8, 0.9}
+                            },
+                            {
+                                {0.5, 0.3, 0.1},
+                                {0.4, 0.2, 0.6},
+                                {0.7, 0.8, 0.9}
+                            }
+                        },
+                        {
+                            {
+                                {0.5, 0.3, 0.1},
+                                {0.4, 0.2, 0.6},
+                                {0.7, 0.8, 0.9}
+                            },
+                            {
+                                {0.5, 0.3, 0.1},
+                                {0.4, 0.2, 0.6},
+                                {0.7, 0.8, 0.9}
+                            }
+                        }
+                    }
+                }
+            ));
+
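+            // dL/dT1 sums dL/dY * T0 over the two leading axes, e.g.
+            // 1 + 10 + 19 + 28 = 58 for the first coefficient.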
+            const auto expectedGrad1 = std::make_shared<Tensor>(Array2D<float,3, 3>(
+                {
+                    {
+                        {58.0, 62.0, 66.0},
+                        {70.0, 74.0, 78.0},
+                        {82.0, 86.0, 90.0}
+                    }
+                }
+            ));
+
+            for (const auto& T : {T0, T1, newGrad, expectedGrad0, expectedGrad1})
+            {
+                T->setBackend("cpu");
+                T->setDataType(DataType::Float32);
+            }
+
+            op->associateInput(0, T0);
+            op->associateInput(1, T1);
+            op->getOutput(0)->setGrad(newGrad);
+            op->forwardDims();
+
+            myMul->backward();
+
+            REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
+            REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1));
+        }
+
+        SECTION("Case 4: 3D and 2D tensors") {
+            const auto T0 = std::make_shared<Tensor>(Array3D<float, 2, 3, 4>(
+                {
+                    {
+                        {
+                            {1.0, 2.0, 3.0, 4.0},
+                            {5.0, 6.0, 7.0, 8.0},
+                            {9.0, 10.0, 11.0, 12.0},
+                        },
+                        {
+                            {13.0, 14.0, 15.0, 16.0},
+                            {17.0, 18.0, 19.0, 20.0},
+                            {21.0, 22.0, 23.0, 24.0},
+                        }
+                    }
+                }
+            ));
+
+            const auto T1 = std::make_shared<Tensor>(Array2D<float, 3, 4>(
+                {
+                    {
+                        {0.1, 0.2, 0.3, 0.4},
+                        {0.5, 0.6, 0.7, 0.8},
+                        {0.9, 1.0, 1.1, 1.2}
+                    }
+                }
+            ));
+
+            const auto newGrad = std::make_shared<Tensor>(Array3D<float, 2,3,4>(
+                {
+                    {
+                        {
+                            {1.0, 1.0, 1.0, 1.0},
+                            {1.0, 1.0, 1.0, 1.0},
+                            {1.0, 1.0, 1.0, 1.0},
+                        },
+                        {
+                            {1.0, 1.0, 1.0, 1.0},
+                            {1.0, 1.0, 1.0, 1.0},
+                            {1.0, 1.0, 1.0, 1.0},
+                        }
+                    }
+                }
+            ));
+
+            const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float,2,3,4>(
+                {
+                    {
+                        {
+                            {0.1, 0.2, 0.3, 0.4},
+                            {0.5, 0.6, 0.7, 0.8},
+                            {0.9, 1.0, 1.1, 1.2}
+                        },
+                        {
+                            {0.1, 0.2, 0.3, 0.4},
+                            {0.5, 0.6, 0.7, 0.8},
+                            {0.9, 1.0, 1.1, 1.2}
+                        }
+                    }
+                }
+            ));
+
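+            // dL/dT1 sums dL/dY * T0 over the leading axis, e.g.
+            // 1 + 13 = 14 for the first coefficient.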
+            const auto expectedGrad1 = std::make_shared<Tensor>(Array2D<float,3, 4>(
+                {
+                    {
+                        {14.0, 16.0, 18.0, 20.0},
+                        {22.0, 24.0, 26.0, 28.0},
+                        {30.0, 32.0, 34.0, 36.0}
+                    }
+                }
+            ));
+
+            for (const auto& T : {T0, T1, newGrad, expectedGrad0, expectedGrad1})
+            {
+                T->setBackend("cpu");
+                T->setDataType(DataType::Float32);
+            }
+
+            op->associateInput(0, T0);
+            op->associateInput(1, T1);
+            op->getOutput(0)->setGrad(newGrad);
+            op->forwardDims();
+
+            myMul->backward();
+
+            REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
+            REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1));
+        }
+    }
+
 TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
     constexpr std::uint16_t NBTRIALS = 10;
     // Create a random number generator
@@ -31,7 +373,7 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
     std::mt19937 gen(rd());
     std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0 and 1
     std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
-    std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
+    std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(3));
     std::uniform_int_distribution<int> boolDist(0,1);
 
     // Create MatMul Operator
@@ -60,6 +402,7 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
     std::chrono::time_point<std::chrono::system_clock> end;
     std::chrono::duration<double, std::micro> duration{};
 
+
     SECTION("MulImpl_cpu::forward()") {
         SECTION("Scalar / Scalar") {
 
@@ -68,16 +411,20 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
 
         }
         SECTION("+1-D Tensor / +1-D Tensor - same dimensions") {
+
             std::size_t number_of_operation = 0;
 
             for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+
                 // generate 2 random Tensors
-                const std::size_t nbDims = nbDimsDist(gen);
-                std::vector<std::size_t> dims;
+                const auto nbDims = nbDimsDist(gen);
+                auto dims = std::vector<std::size_t>{};
+
                 for (std::size_t i = 0; i < nbDims; ++i) {
                     dims.push_back(dimSizeDist(gen));
                 }
-                const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+
+                const auto nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
                 number_of_operation += nb_elements;
 
                 // without broadcasting
@@ -114,67 +461,103 @@ TEST_CASE("[cpu/operator] Mul", "[Mul][CPU]") {
                 delete[] array0;
                 delete[] array1;
                 delete[] result;
-
-                // with broadcasting
             }
             std::cout << "number of elements over time spent: " << (number_of_operation / duration.count())<< std::endl;
             std::cout << "total time: " << duration.count() << "μs" << std::endl;
         }
 
+
         SECTION("+1-D Tensor / +1-D Tensor - broadcasting") {
             std::size_t number_of_operation = 0;
 
             for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+
                 // generate 2 random Tensors
                 // handle dimensions, replace some dimensions with '1' to get broadcasting
+
                 constexpr std::size_t nbDims = 4;
-                std::vector<std::size_t> dims;
-                for (std::size_t i = 0; i < nbDims; ++i) {
-                    dims.push_back(dimSizeDist(gen));
+                std::vector<std::size_t> dimensions;
+
+                for (std::size_t i = 0; i < nbDims; ++i)
+                {
+                    dimensions.push_back(dimSizeDist(gen));
                 }
-                std::vector<std::size_t> dims0 = dims;
-                std::vector<std::size_t> dims1 = dims;
-                std::vector<std::size_t> dimsOut = dims;
-                for (std::size_t i = 0; i < nbDims; ++i) {
-                    if (boolDist(gen)) {
+
+                auto dims0 = dimensions;
+                auto dims1 = dimensions;
+                auto dimsOut = dimensions;
+
+                for (std::size_t i = 0; i < nbDims; ++i)
+                {
+                    if (boolDist(gen))
+                    {
                         dims0[i] = 1;
                     }
-                    if (boolDist(gen)) {
+
+                    if (boolDist(gen))
+                    {
                         dims1[i] = 1;
                     }
+
                     dimsOut[i] = (dims0[i] == 1) ? dims1[i] : dims0[i];
                 }
 
+                for (auto dim : dims0)
+                {
+                    Log::info("Dimension of input 0: {}", dim);
+                }
+
+                for (auto dim : dims1)
+                {
+                    Log::info("Dimension of input 1: {}", dim);
+                }
+
                 // create arrays and fill them with random values
                 float* array0 = new float[dims0[0]*dims0[1]*dims0[2]*dims0[3]];
                 float* array1 = new float[dims1[0]*dims1[1]*dims1[2]*dims1[3]];
                 float* result = new float[dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]];
 
-                for (std::size_t i = 0; i < dims0[0]*dims0[1]*dims0[2]*dims0[3]; ++i) {
+
+                for (std::size_t i = 0; i < dims0[0]*dims0[1]*dims0[2]*dims0[3]; ++i)
+                {
                     array0[i] = valueDist(gen);
                 }
-                for (std::size_t i = 0; i < dims1[0]*dims1[1]*dims1[2]*dims1[3]; ++i) {
+
+                for (std::size_t i = 0; i < dims1[0]*dims1[1]*dims1[2]*dims1[3]; ++i)
+                {
                     array1[i] = valueDist(gen);
                 }
 
                 // compute true result
                 const std::size_t strides0[nbDims] = {dims0[1]*dims0[2]*dims0[3], dims0[2]*dims0[3], dims0[3], 1};
                 const std::size_t strides1[nbDims] = {dims1[1]*dims1[2]*dims1[3], dims1[2]*dims1[3], dims1[3], 1};
-                for (std::size_t a = 0; a < dimsOut[0]; ++a) {
-                    for (std::size_t b = 0; b < dimsOut[1]; ++b) {
+
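+                // Broadcast indexing: an input's stride advances only along
+                // dimensions of size > 1; broadcast (size-1) dims stay at 0.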
+                for (std::size_t a = 0; a < dimsOut[0]; ++a)
+                {
+                    for (std::size_t b = 0; b < dimsOut[1]; ++b)
+                    {
                         const std::size_t idx0_0 = strides0[0] * ((dims0[0] > 1) ? a : 0)
                                                     + strides0[1] * ((dims0[1] > 1) ? b : 0);
+
                         const std::size_t idx1_0 = strides1[0] * ((dims1[0] > 1) ? a : 0)
                                                     + strides1[1] * ((dims1[1] > 1) ? b : 0);
-                        for (std::size_t c = 0; c < dimsOut[2]; ++c) {
+
+                        for (std::size_t c = 0; c < dimsOut[2]; ++c)
+                        {
                             const std::size_t idx_out = dimsOut[3] * (c + dimsOut[2] * (b + dimsOut[1] * a));
-                            for (std::size_t d = 0; d < dimsOut[3]; ++d) {
+
+                            for (std::size_t d = 0; d < dimsOut[3]; ++d)
+                            {
                                 std::size_t idx0 = idx0_0
                                                     + strides0[2] * ((dims0[2] > 1) ? c : 0)
                                                     + ((dims0[3] > 1) ? d : 0);
+
                                 std::size_t idx1 = idx1_0
                                                     + strides1[2] * ((dims1[2] > 1) ? c : 0)
                                                     + ((dims1[3] > 1) ? d : 0);
+
                                 result[idx_out + d] = array0[idx0] * array1[idx1];
                                 // std::cout << "(" << idx0 << ", " << idx1 << ") -> " << array0[idx0] << " * " << array1[idx1] << " -> " << idx_out + d << std::endl;
                             }
diff --git a/unit_tests/operator/Test_PadImpl.cpp b/unit_tests/operator/Test_PadImpl.cpp
index cdd3a5f979085f3782776ce69ddd92c0d53150c4..75233c0b97fc6f9812020d0e3d3c695d8cd388f0 100644
--- a/unit_tests/operator/Test_PadImpl.cpp
+++ b/unit_tests/operator/Test_PadImpl.cpp
@@ -134,7 +134,7 @@ TEST_CASE("[cpu/operator] Pad(forward)", "[Pad][CPU]") {
     SECTION("Asymmetric Pad") {
         const int pv = 0; // pad value
 
-        std::shared_ptr<Node> myPad = Pad<2>({1, 0, 0, 1}, "mypad", PadBorderType::Constant, static_cast<double>(pv));
+        std::shared_ptr<Node> myPad = Pad<2>({0, 1, 1, 0}, "mypad", PadBorderType::Constant, static_cast<double>(pv));
         auto op = std::static_pointer_cast<OperatorTensor>(myPad -> getOperator());
         std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<int,2,3,5,5> { //NCHW
             {