Compare revisions

Showing with 1732 additions and 41 deletions
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_H_
#define AIDGE_CPU_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_H_
#include <memory>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/GlobalAveragePooling.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
// Operator implementation entry point for the backend
using GlobalAveragePoolingImpl_cpu = OperatorImpl_cpu<GlobalAveragePooling_Op,
void(const std::vector<DimSize_t> &, const void *, void *)>;
// Implementation entry point registration to Operator
REGISTRAR(GlobalAveragePooling_Op, "cpu", Aidge::GlobalAveragePoolingImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_KERNELS_H_
#include <cmath>       // std::fma, std::nearbyint
#include <cstddef>
#include <functional>  // std::multiplies
#include <numeric>     // std::accumulate
#include <type_traits> // std::enable_if, std::is_floating_point
#include <vector>
#include "aidge/backend/cpu/operator/GlobalAveragePoolingImpl.hpp"
#include "aidge/data/Data.hpp"
#include "aidge/utils/ErrorHandling.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
template <typename T>
typename std::enable_if<std::is_floating_point<T>::value, T>::type
stableMean(const T* vec, size_t size) {
T mean = 0;
for (size_t i = 0; i < size; ++i) {
mean = std::fma<T>(vec[i] - mean, 1.0f / (i + 1), mean);
}
return mean;
}
// Specialization for integer types: perform the mean computation in double precision
template <typename T>
typename std::enable_if<!std::is_floating_point<T>::value, T>::type
stableMean(const T* vec, size_t size) {
double mean = 0;
for (size_t i = 0; i < size; ++i) {
mean = std::fma<double>(vec[i] - mean, 1.0f / (i + 1), mean);
}
return mean;
}
template <typename T>
typename std::enable_if<std::is_floating_point<T>::value, T>::type
castFromFloat(T value) {
return value;
}
template <typename T>
typename std::enable_if<!std::is_floating_point<T>::value, T>::type
castFromFloat(double value) {
return static_cast<T>(std::nearbyint(value));
}
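The two stableMean overloads above accumulate a running mean with std::fma so the intermediate value never grows with the number of elements. A standalone sketch of the same recurrence (not part of the diff; plain C++ for illustration):

#include <cstdio>
// Running-mean recurrence used by stableMean: mean_{i+1} = mean_i + (x_i - mean_i) / (i + 1)
int main() {
    const float values[4] = {1.f, 2.f, 3.f, 4.f};
    float mean = 0.f;
    for (int i = 0; i < 4; ++i) {
        mean += (values[i] - mean) / static_cast<float>(i + 1);
    }
    std::printf("mean = %f\n", mean); // 2.5
    return 0;
}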
template <class I, class O>
void GlobalAveragePoolingImpl_cpu_forward_kernel(
const std::vector<DimSize_t> &dims, const void *input_, void *output_) {
// error checking
AIDGE_ASSERT(dims.size() >= 3, "GlobalAveragePooling needs an input with at least 3 dimensions, got {}",
dims.size());
// computation
const I *input = static_cast<const I *>(input_);
O *output = static_cast<O *>(output_);
DimSize_t nb_elems = std::accumulate(dims.begin(), dims.end(), std::size_t(1),
std::multiplies<std::size_t>());
const DimSize_t in_batch_nb_elems{nb_elems / dims[0]};
const DimSize_t in_channel_nb_elems{in_batch_nb_elems / dims[1]};
const DimSize_t out_batch_nb_elems{dims[1]};
// parse channel by channel and fill each output with the average of the
// values in the channel
for (DimSize_t batch = 0; batch < dims[0]; ++batch) {
for (DimSize_t channel = 0; channel < dims[1]; ++channel) {
const I *filter_start = std::next(
input, (batch * in_batch_nb_elems) + (channel * in_channel_nb_elems));
output[batch * out_batch_nb_elems + channel] = castFromFloat<O>(stableMean<I>(filter_start, in_channel_nb_elems));
}
}
}
// Kernels registration to implementation entry point
REGISTRAR(GlobalAveragePoolingImpl_cpu,
{DataType::Float32},
{ProdConso::defaultModel, Aidge::GlobalAveragePoolingImpl_cpu_forward_kernel<float, float>, nullptr});
REGISTRAR(GlobalAveragePoolingImpl_cpu,
{DataType::Float64},
{ProdConso::defaultModel, Aidge::GlobalAveragePoolingImpl_cpu_forward_kernel<double, double>, nullptr});
REGISTRAR(GlobalAveragePoolingImpl_cpu,
{DataType::Int32},
{ProdConso::defaultModel, Aidge::GlobalAveragePoolingImpl_cpu_forward_kernel<int32_t, int32_t>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_KERNELS_H_ */
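A minimal usage sketch for the forward kernel above (not part of the diff; assumes this header is included, values and the helper name are illustrative): averaging a 1x2x4 input yields one value per (batch, channel) pair.

// Illustrative only: global average over the spatial dimension of a 1x2x4 input.
void globalAveragePoolingExample() {
    std::vector<Aidge::DimSize_t> dims{1, 2, 4};
    const float input[8] = {1.f, 2.f, 3.f, 4.f, 10.f, 20.f, 30.f, 40.f};
    float output[2] = {0.f, 0.f};
    Aidge::GlobalAveragePoolingImpl_cpu_forward_kernel<float, float>(dims, input, output);
    // output[0] == 2.5f (channel 0), output[1] == 25.f (channel 1)
}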
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_H_
#define AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/GridSample.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
namespace Aidge {
// Operator implementation entry point for the backend
using GridSampleImpl_cpu = OperatorImpl_cpu<GridSample_Op,
void(const GridSample_Op&,
const std::shared_ptr<Tensor>&,
const std::shared_ptr<Tensor>&,
const std::shared_ptr<Tensor>&)>;
// Implementation entry point registration to Operator
REGISTRAR(GridSample_Op, "cpu", Aidge::GridSampleImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_KERNELS_H_
#include <algorithm> // std::max, std::min
#include <cmath>     // std::fabs, std::truncf, std::nearbyint, std::floor
#include <cstddef>   // std::size_t
#include <cstdint>   // std::int32_t, std::int64_t
#include <cstdlib>   // std::abs
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/backend/cpu/operator/GridSampleImpl.hpp"
#include "aidge/data/half.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
static bool in_bound(float coord, float lower_bound, float upper_bound) noexcept {
return (coord > lower_bound) && (coord < upper_bound);
}
static float unnormalized_coord(float coord, float new_lower_bound, float new_upper_bound) noexcept {
return (coord + 1) / 2 * (new_upper_bound - new_lower_bound) + new_lower_bound;
}
// unused
// static float normalized_coord(float coord, float prev_lower_bound, float prev_upper_bound) noexcept {
// return (coord + prev_lower_bound) / (prev_upper_bound-prev_lower_bound) * 2 - 1;
// }
static float unnormalize_grid_sample_coord(float coord, std::size_t size, bool align_corners) noexcept {
return align_corners ? unnormalized_coord(coord, 0.0f, static_cast<float>(size) - 1.0f)
: unnormalized_coord(coord, -0.5f, static_cast<float>(size) - 0.5f);
}
// unused
// static float normalize_grid_sample_coord(float coord, std::size_t size, bool align_corners) noexcept {
// return align_corners ? normalized_coord(coord, 0.0f, static_cast<float>(size) - 1.0f)
// : normalized_coord(coord, -0.5f, static_cast<float>(size) - 0.5f);
// }
static float update_normalized_coord_with_padding(float coord, Aidge::GridSample_Op::PaddingMode padding_mode) {
if (!in_bound(coord, -1.0f, 1.0f)) {
if (padding_mode == Aidge::GridSample_Op::PaddingMode::Border) {
coord = std::min(std::max(-1.0f, coord), 1.0f);
}
else if (padding_mode == Aidge::GridSample_Op::PaddingMode::Reflection) {
float abs_coord = std::fabs(coord);
float int_coord = std::truncf(abs_coord);
std::int32_t nb_refl = static_cast<std::int32_t>((int_coord - 1) / 2);
float res = ((nb_refl + 1)*2) - abs_coord;
coord = (coord > 0) ? (nb_refl % 2 == 0 ? res : -res) \
: (nb_refl % 2 == 0 ? -res : res);
}
}
return coord;
}
static std::int64_t update_unnormalized_coord_with_padding(std::int64_t coord, std::int64_t size, Aidge::GridSample_Op::PaddingMode padding_mode) {
if (!in_bound(coord, 0, size)) {
// out of bound. switch padding mode
if (padding_mode == Aidge::GridSample_Op::PaddingMode::Border) {
coord = std::min(std::max(std::int64_t(0), coord), size-std::int64_t(1));
} else if (padding_mode == Aidge::GridSample_Op::PaddingMode::Reflection) {
const std::int64_t quotient = coord / (size-1);
const std::int64_t remainer = std::abs(coord - quotient*(size-1));
coord = (quotient % 2 == 0) ? remainer : size - 1 - remainer;
}
}
return coord;
}
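The helpers above map a normalized grid coordinate from [-1, 1] into pixel space, either onto [0, size-1] (align_corners) or onto [-0.5, size-0.5]. A standalone worked sketch of that mapping (not part of the diff):

#include <cstdio>
int main() {
    const float coord = 0.5f; // normalized grid coordinate in [-1, 1]
    const float size = 4.0f;  // number of pixels along the axis
    const float aligned     = (coord + 1.f) / 2.f * (size - 1.f); // align_corners == true  -> 2.25
    const float not_aligned = (coord + 1.f) / 2.f * size - 0.5f;  // align_corners == false -> 2.5
    std::printf("aligned=%f not_aligned=%f\n", aligned, not_aligned);
    return 0;
}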
namespace Aidge {
/**
* @brief Forward kernel for 1D GridSample on CPU backend.
* @tparam I Input data type.
* @tparam O Output data type.
* @param op GridSample Operator, providing the mode, padding_mode and align_corners attributes.
* @param in0 const input Tensor.
* @param in1 const grid Tensor.
* @param out Output Tensor.
*/
template <class I, class O>
void GridSampleImpl1D_cpu_forward_kernel(const GridSample_Op& op,
const std::shared_ptr<Tensor>& in0,
const std::shared_ptr<Tensor>& in1,
const std::shared_ptr<Tensor>& out)
{
const I* const input = static_cast<const I *>(in0->getImpl()->rawPtr());
const I* input_ptr = input;
float* const grid = static_cast<float*>(in1->getImpl()->rawPtr());
float* grid_ptr = grid;
O* const output = static_cast<O*>(out->getImpl()->rawPtr());
O* output_ptr = output;
const std::size_t N = in0->dim(0);
const std::size_t C = in0->dim(1);
const std::size_t in_H = in0->dim(2);
const std::size_t grid_H = in1->dim(1);
const std::size_t in_N_s = in0->stride(0);
const std::size_t in_C_s = in0->stride(1);
const std::size_t in_H_s = in0->stride(2);
const std::size_t grid_N_s = in1->stride(0);
const std::size_t grid_H_s = in1->stride(1);
const std::size_t out_N_s = out->stride(0);
const std::size_t out_C_s = out->stride(1);
const std::size_t out_H_s = out->stride(2);
float* grid_ptr_N = grid;
const I* input_ptr_N = input;
O* output_ptr_N = output;
for (std::size_t n = 0; n < N; ++n) {
grid_ptr = grid_ptr_N;
for (std::size_t grid_x = 0; grid_x < grid_H; ++grid_x) {
output_ptr = output_ptr_N + grid_x*out_H_s;
/*
* change grid_x coord to match padding_mode
* Change range from [-1, 1] to [0, H-1] or [-0.5, H-0.5] according to align_corners
* Handle computation of interpolation
* any value outside bounds is considered 0
* if nearest:
* else if linear:
* else if cubic:
* else : nothing
*/
float x = *grid_ptr;
x = update_normalized_coord_with_padding(x, op.paddingMode());
x = unnormalize_grid_sample_coord(x, in_H, op.alignCorners());
if (op.mode() == GridSample_Op::Mode::Nearest) {
const std::int64_t x_rounded = std::nearbyintf(x);
if (in_bound(x_rounded, 0, in_H)) {
input_ptr = input_ptr_N + x_rounded*in_H_s;
for (std::size_t c = 0; c < C; ++c) {
*output_ptr = *input_ptr;
input_ptr += in_C_s;
output_ptr += out_C_s;
}
} else {
for (std::size_t c = 0; c < C; ++c) {
*output_ptr = O(0);
output_ptr += out_C_s;
}
}
} else if (op.mode() == GridSample_Op::Mode::Linear) {
const std::int64_t x_inf = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(x)), in_H, op.paddingMode());
const std::int64_t x_sup = update_unnormalized_coord_with_padding(x_inf + 1, in_H, op.paddingMode());
const I* input_ptr_NC = input_ptr_N;
for (std::size_t c = 0; c < C; ++c) {
const I f_inf = in_bound(x_inf, 0, in_H) ?
input_ptr_NC[static_cast<std::size_t>(x_inf)*in_H_s] : I(0);
const I f_sup = in_bound(x_sup, 0, in_H) ?
input_ptr_NC[static_cast<std::size_t>(x_sup)*in_H_s] : I(0);
*output_ptr = static_cast<O>(static_cast<I>(x_sup - x)*f_inf \
+ static_cast<I>(x - x_inf)*f_sup);
input_ptr_NC += in_C_s;
output_ptr += out_C_s;
}
} else if (op.mode() == GridSample_Op::Mode::Cubic) {
const std::int64_t x_inf = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(x)), in_H, op.paddingMode());
const std::int64_t x_sup = update_unnormalized_coord_with_padding(x_inf + 1, in_H, op.paddingMode());
const std::int64_t x_inf_inf = update_unnormalized_coord_with_padding(x_inf - 1, in_H, op.paddingMode());
const std::int64_t x_sup_sup = update_unnormalized_coord_with_padding(x_sup + 1, in_H, op.paddingMode());
const I x1 = static_cast<I>(x - static_cast<float>(x_inf));
const I x2 = x1 * x1;
const I x3 = x1 * x2;
const I* input_ptr_NC = input_ptr_N;
for (std::size_t c = 0; c < C; ++c) {
const I f_inf_inf = in_bound(x_inf_inf, 0, in_H) ? input_ptr_NC[x_inf_inf*in_H_s] : I(0);
const I f_inf = in_bound(x_inf, 0, in_H) ? input_ptr_NC[x_inf*in_H_s] : I(0);
const I f_sup = in_bound(x_sup, 0, in_H) ? input_ptr_NC[x_sup*in_H_s] : I(0);
const I f_sup_sup = in_bound(x_sup_sup, 0, in_H) ? input_ptr_NC[x_sup_sup*in_H_s] : I(0);
const I m_inf = (f_sup - f_inf_inf) / I(2);
const I m_sup = (f_sup_sup - f_inf) / I(2);
*output_ptr = f_inf \
+ x1 * m_inf \
+ x2 * (3 * (f_sup - f_inf) - 2 * m_inf - m_sup) \
+ x3 * (2*(f_inf - f_sup) + m_inf + m_sup);
input_ptr_NC += in_C_s;
output_ptr += out_C_s;
}
}
grid_ptr += grid_H_s;
}
input_ptr_N += in_N_s;
grid_ptr_N += grid_N_s;
output_ptr_N += out_N_s;
}
}
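The Cubic branch above evaluates a cubic Hermite polynomial between the two inner samples, with tangents taken as central differences (a Catmull-Rom spline). A standalone sketch of that interpolation step (not part of the diff):

#include <cstdio>
// Cubic Hermite step between f1 (t = 0) and f2 (t = 1),
// with Catmull-Rom tangents computed from the outer samples f0 and f3.
static float catmull_rom(float f0, float f1, float f2, float f3, float t) {
    const float m1 = (f2 - f0) / 2.f;
    const float m2 = (f3 - f1) / 2.f;
    const float t2 = t * t;
    const float t3 = t2 * t;
    return f1 + m1 * t
         + t2 * (3.f * (f2 - f1) - 2.f * m1 - m2)
         + t3 * (2.f * (f1 - f2) + m1 + m2);
}
int main() {
    std::printf("%f\n", catmull_rom(1.f, 2.f, 4.f, 8.f, 0.5f)); // value halfway between the samples 2 and 4
    return 0;
}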
// Kernels registration to implementation entry point
// only accept 1st input with only 1 spatial feat. (nb dims = 1)
REGISTRAR(GridSampleImpl_cpu,
{{{DataType::Any, DataFormat::Any, {{-1, -1}}}, {DataType::Any}}, {{DataType::Float16}}},
{ProdConso::defaultModel, Aidge::GridSampleImpl1D_cpu_forward_kernel<half_float::half, half_float::half>, nullptr});
REGISTRAR(GridSampleImpl_cpu,
{{{DataType::Any, DataFormat::Any, {{-1, -1}}}, {DataType::Any}}, {{DataType::Float32}}},
{ProdConso::defaultModel, Aidge::GridSampleImpl1D_cpu_forward_kernel<float, float>, nullptr});
REGISTRAR(GridSampleImpl_cpu,
{{{DataType::Any, DataFormat::Any, {{-1, -1}}}, {DataType::Any}}, {{DataType::Float64}}},
{ProdConso::defaultModel, Aidge::GridSampleImpl1D_cpu_forward_kernel<double, double>, nullptr});
REGISTRAR(GridSampleImpl_cpu,
{{{DataType::Any, DataFormat::Any, {{-1, -1}}}, {DataType::Any}}, {{DataType::Int32}}},
{ProdConso::defaultModel, Aidge::GridSampleImpl1D_cpu_forward_kernel<int32_t, int32_t>, nullptr});
/**
* @brief Forward kernel for 2D GridSample on CPU backend.
* @tparam I Input data type.
* @tparam O Output data type.
* @param op GridSample Operator, providing the mode, padding_mode and align_corners attributes.
* @param in0 const input Tensor.
* @param in1 const grid Tensor.
* @param out Output Tensor.
*/
template <class I, class O>
void GridSampleImpl2D_cpu_forward_kernel(const GridSample_Op& op,
const std::shared_ptr<Tensor>& in0,
const std::shared_ptr<Tensor>& in1,
const std::shared_ptr<Tensor>& out)
{
const I* input = static_cast<const I *>(in0->getImpl()->rawPtr());
const I* input_ptr = input;
float* const grid = static_cast<float*>(in1->getImpl()->rawPtr());
float* grid_ptr = grid;
O* const output = static_cast<O*>(out->getImpl()->rawPtr());
const std::size_t N = in0->dim(0);
const std::size_t C = in0->dim(1);
const std::size_t in_H = in0->dim(2);
const std::size_t in_W = in0->dim(3);
const std::size_t grid_H = in1->dim(1);
const std::size_t grid_W = in1->dim(2);
const std::size_t in_N_s = in0->stride(0);
const std::size_t in_C_s = in0->stride(1);
const std::size_t in_H_s = in0->stride(2);
const std::size_t in_W_s = in0->stride(3);
const std::size_t grid_N_s = in1->stride(0);
const std::size_t grid_H_s = in1->stride(1);
const std::size_t grid_W_s = in1->stride(2);
const std::size_t grid_Coord_s = in1->stride(3);
const std::size_t out_N_s = out->stride(0);
const std::size_t out_C_s = out->stride(1);
const std::size_t out_H_s = out->stride(2);
const std::size_t out_W_s = out->stride(3);
float* grid_ptr_N = grid;
const I* input_ptr_N = input;
O* output_ptr_N = output;
for (std::size_t n = 0; n < N; ++n) {
for (std::size_t grid_y = 0; grid_y < grid_H; ++grid_y) {
for (std::size_t grid_x = 0; grid_x < grid_W; ++grid_x) {
O* output_ptr = output_ptr_N + grid_y*out_H_s + grid_x*out_W_s;
grid_ptr = grid_ptr_N + grid_y*grid_H_s + grid_x*grid_W_s;
/*
* change grid_x coord to match padding_mode
* Change range from [-1, 1] to [0, H-1] or [-0.5, H-0.5] according to align_corners
* Handle computation of interpolation
* any value outside bounds is considered 0
* if nearest:
* else if linear:
* else if cubic:
* else : nothing
*/
float x = *grid_ptr;
float y = grid_ptr[grid_Coord_s];
x = update_normalized_coord_with_padding(x, op.paddingMode());
x = unnormalize_grid_sample_coord(x, in_W, op.alignCorners());
y = update_normalized_coord_with_padding(y, op.paddingMode());
y = unnormalize_grid_sample_coord(y, in_H, op.alignCorners());
if (op.mode() == GridSample_Op::Mode::Nearest) {
const std::int64_t x_rounded = std::nearbyintf(x);
const std::int64_t y_rounded = std::nearbyintf(y);
if (in_bound(x_rounded, 0, in_W) && in_bound(y_rounded, 0, in_H)) {
input_ptr = input_ptr_N + y_rounded*in_H_s + x_rounded*in_W_s;
for (std::size_t c = 0; c < C; ++c) {
*output_ptr = *input_ptr;
input_ptr += in_C_s;
output_ptr += out_C_s;
}
} else {
for (std::size_t c = 0; c < C; ++c) {
*output_ptr = O(0);
output_ptr += out_C_s;
}
}
} else if (op.mode() == GridSample_Op::Mode::Linear) {
const std::int64_t x_l = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(x)), in_W, op.paddingMode()); // left
const std::int64_t x_r = update_unnormalized_coord_with_padding(x_l + 1, in_W, op.paddingMode()); // right
const std::int64_t y_t = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(y)), in_H, op.paddingMode()); // top
const std::int64_t y_b = update_unnormalized_coord_with_padding(y_t + 1, in_H, op.paddingMode()); // bottom
const I* input_ptr_NC = input_ptr_N;
for (std::size_t c = 0; c < C; ++c) {
const I f_tl = (in_bound(x_l, 0, in_W) && in_bound(y_t, 0, in_H)) ?
input_ptr_NC[static_cast<std::size_t>(y_t)*in_H_s
+ static_cast<std::size_t>(x_l)*in_W_s]
: I(0);
const I f_tr = (in_bound(x_r, 0, in_W) && in_bound(y_t, 0, in_H)) ?
input_ptr_NC[static_cast<std::size_t>(y_t)*in_H_s
+ static_cast<std::size_t>(x_r)*in_W_s]
: I(0);
const I f_bl = (in_bound(x_l, 0, in_W) && in_bound(y_b, 0, in_H)) ?
input_ptr_NC[static_cast<std::size_t>(y_b)*in_H_s
+ static_cast<std::size_t>(x_l)*in_W_s]
: I(0);
const I f_br = (in_bound(x_r, 0, in_W) && in_bound(y_b, 0, in_H)) ?
input_ptr_NC[static_cast<std::size_t>(y_b)*in_H_s
+ static_cast<std::size_t>(x_r)*in_W_s]
: I(0);
// bilinear weights: each corner is weighted by the area of the rectangle
// spanned by the sample point and the opposite corner
const I w_tl = static_cast<I>((static_cast<float>(x_r) - x)*(static_cast<float>(y_b) - y));
const I w_tr = static_cast<I>((x - static_cast<float>(x_l))*(static_cast<float>(y_b) - y));
const I w_bl = static_cast<I>((static_cast<float>(x_r) - x)*(y - static_cast<float>(y_t)));
const I w_br = static_cast<I>((x - static_cast<float>(x_l))*(y - static_cast<float>(y_t)));
*output_ptr = static_cast<O>(w_tl*f_tl + w_tr*f_tr + w_bl*f_bl + w_br*f_br);
input_ptr_NC += in_C_s;
output_ptr += out_C_s;
}
} else if (op.mode() == GridSample_Op::Mode::Cubic) {
/*
* .. .. .. .. .. ..
* .. 00 01 02 03 ..
* .. 10 11 12 13 ..
* .. 20 21 22 23 ..
* .. 30 31 32 33 ..
* .. .. .. .. .. ..
*/
const std::int64_t x_1 = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(x)), in_W, op.paddingMode());
const std::int64_t x_0 = update_unnormalized_coord_with_padding(x_1 - 1, in_W, op.paddingMode());
const std::int64_t x_2 = update_unnormalized_coord_with_padding(x_1 + 1, in_W, op.paddingMode());
const std::int64_t x_3 = update_unnormalized_coord_with_padding(x_1 + 2, in_W, op.paddingMode());
const std::int64_t y_1 = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(y)), in_H, op.paddingMode());
const std::int64_t y_0 = update_unnormalized_coord_with_padding(y_1 - 1, in_H, op.paddingMode());
const std::int64_t y_2 = update_unnormalized_coord_with_padding(y_1 + 1, in_H, op.paddingMode());
const std::int64_t y_3 = update_unnormalized_coord_with_padding(y_1 + 2, in_H, op.paddingMode());
const I* input_ptr_NC = input_ptr_N;
for (std::size_t c = 0; c < C; ++c) {
const I f_00 = in_bound(x_0, 0, in_W) && in_bound(y_0, 0, in_H) ?
input_ptr_NC[x_0*in_W_s + y_0*in_H_s] : I(0);
const I f_01 = in_bound(x_0, 0, in_W) && in_bound(y_1, 0, in_H) ?
input_ptr_NC[x_0*in_W_s + y_1*in_H_s] : I(0);
const I f_02 = in_bound(x_0, 0, in_W) && in_bound(y_2, 0, in_H) ?
input_ptr_NC[x_0*in_W_s + y_2*in_H_s] : I(0);
const I f_03 = in_bound(x_0, 0, in_W) && in_bound(y_3, 0, in_H) ?
input_ptr_NC[x_0*in_W_s + y_3*in_H_s] : I(0);
const I f_10 = in_bound(x_1, 0, in_W) && in_bound(y_0, 0, in_H) ?
input_ptr_NC[x_1*in_W_s + y_0*in_H_s] : I(0);
const I f_20 = in_bound(x_2, 0, in_W) && in_bound(y_0, 0, in_H) ?
input_ptr_NC[x_2*in_W_s + y_0*in_H_s] : I(0);
const I f_30 = in_bound(x_3, 0, in_W) && in_bound(y_0, 0, in_H) ?
input_ptr_NC[x_3*in_W_s + y_0*in_H_s] : I(0);
const I f_11 = in_bound(x_1, 0, in_W) && in_bound(y_1, 0, in_H) ?
input_ptr_NC[x_1*in_W_s + y_1*in_H_s] : I(0);
const I f_12 = in_bound(x_1, 0, in_W) && in_bound(y_2, 0, in_H) ?
input_ptr_NC[x_1*in_W_s + y_2*in_H_s] : I(0);
const I f_13 = in_bound(x_1, 0, in_W) && in_bound(y_3, 0, in_H) ?
input_ptr_NC[x_1*in_W_s + y_3*in_H_s] : I(0);
const I f_21 = in_bound(x_2, 0, in_W) && in_bound(y_1, 0, in_H) ?
input_ptr_NC[x_2*in_W_s + y_1*in_H_s] : I(0);
const I f_22 = in_bound(x_2, 0, in_W) && in_bound(y_2, 0, in_H) ?
input_ptr_NC[x_2*in_W_s + y_2*in_H_s] : I(0);
const I f_23 = in_bound(x_2, 0, in_W) && in_bound(y_3, 0, in_H) ?
input_ptr_NC[x_2*in_W_s + y_3*in_H_s] : I(0);
const I f_31 = in_bound(x_3, 0, in_W) && in_bound(y_1, 0, in_H) ?
input_ptr_NC[x_3*in_W_s + y_1*in_H_s] : I(0);
const I f_32 = in_bound(x_3, 0, in_W) && in_bound(y_2, 0, in_H) ?
input_ptr_NC[x_3*in_W_s + y_2*in_H_s] : I(0);
const I f_33 = in_bound(x_3, 0, in_W) && in_bound(y_3, 0, in_H) ?
input_ptr_NC[x_3*in_W_s + y_3*in_H_s] : I(0);
const I mx_11 = (f_21 - f_01) / I(2);
const I mx_12 = (f_22 - f_02) / I(2);
const I mx_21 = (f_31 - f_11) / I(2);
const I mx_22 = (f_32 - f_12) / I(2);
const I my_11 = (f_12 - f_10) / I(2);
const I my_12 = (f_13 - f_11) / I(2);
const I my_21 = (f_22 - f_20) / I(2);
const I my_22 = (f_23 - f_21) / I(2);
const I mxy_11 = (f_22 - f_20 - f_02 + f_00) / I(4);
const I mxy_12 = (f_23 - f_21 - f_03 + f_01) / I(4);
const I mxy_21 = (f_32 - f_30 - f_12 + f_10) / I(4);
const I mxy_22 = (f_33 - f_31 - f_13 + f_11) / I(4);
const I a_00 = f_11;
const I a_10 = mx_11;
const I a_20 = I(3)*(f_21 - f_11) - I(2)*mx_11 - mx_21;
const I a_30 = I(2)*(f_11 - f_21) + mx_11 + mx_21;
const I a_01 = my_11;
const I a_11 = mxy_11;
const I a_21 = I(3)*(my_21 - my_11) - I(2)*mxy_11 - mxy_21;
const I a_31 = I(2)*(my_11 - my_21) + mxy_11 + mxy_21;
const I a_02 = I(3)*(f_12 - f_11) - I(2)*my_11 - my_12;
const I a_12 = I(3)*(mx_12 - mx_11) - I(2)*mxy_11 - mxy_12;
const I a_22 = I(9)*(f_11 + f_22 - f_21 - f_12) + I(3)*(I(2)*(mx_11 - mx_12 + my_11 - my_21) + mx_21 - mx_22 + my_12 - my_22) + mxy_22 + I(2)*(mxy_12 + mxy_21 + I(2)*mxy_11);
const I a_32 = - mxy_12 - mxy_22 + I(2)*(my_22 - my_12 - mxy_11 - mxy_21 + I(2)*(my_21 - my_11) + I(3)*(f_21 + f_12 - f_11 - f_22)) + I(3)*(mx_12 + mx_22 - mx_11 - mx_21);
const I a_03 = I(2)*(f_11 - f_12) + my_11 + my_12;
const I a_13 = I(2)*(mx_11 - mx_12) + mxy_11 + mxy_12;
const I a_23 = - mxy_21 - mxy_22 + I(2)*(-mx_21 + mx_22 - mxy_11 - mxy_12 + I(2)*(mx_12 - mx_11) + I(3)*(f_12 + f_21 - f_11 - f_22)) + I(3)*(my_21 + my_22 - my_11 - my_12);
const I a_33 = mxy_11 + mxy_21 + mxy_12 + mxy_22 + I(2)*(mx_11 + mx_21 - mx_12 - mx_22 + my_11 - my_21 + my_12 - my_22 + I(2)*(f_11 - f_21 - f_12 + f_22));
// evaluate the patch at the fractional offset inside the (x_1, y_1) cell
const I tx = static_cast<I>(x - static_cast<float>(x_1));
const I tx2 = tx * tx;
const I tx3 = tx2 * tx;
const I ty = static_cast<I>(y - static_cast<float>(y_1));
const I ty2 = ty * ty;
const I ty3 = ty2 * ty;
*output_ptr = static_cast<O>( \
a_00 + a_10*tx + a_20*tx2 + a_30*tx3 \
+ a_01*ty + a_11*tx*ty + a_21*tx2*ty + a_31*tx3*ty \
+ a_02*ty2 + a_12*tx*ty2 + a_22*tx2*ty2 + a_32*tx3*ty2 \
+ a_03*ty3 + a_13*tx*ty3 + a_23*tx2*ty3 + a_33*tx3*ty3);
input_ptr_NC += in_C_s;
output_ptr += out_C_s;
}
}
}
}
input_ptr_N += in_N_s;
grid_ptr_N += grid_N_s;
output_ptr_N += out_N_s;
}
}
// Kernels registration to implementation entry point
// only accept 1st input with only 2 spatial feat. (nb dims = 2)
REGISTRAR(GridSampleImpl_cpu,
{{{DataType::Any, DataFormat::Any, {{-1, -1}, {-1, -1}}}, {DataType::Any}}, {{DataType::Float16}}},
{ProdConso::defaultModel, Aidge::GridSampleImpl2D_cpu_forward_kernel<half_float::half, half_float::half>, nullptr});
REGISTRAR(GridSampleImpl_cpu,
{{{DataType::Any, DataFormat::Any, {{-1, -1}, {-1, -1}}}, {DataType::Any}}, {{DataType::Float32}}},
{ProdConso::defaultModel, Aidge::GridSampleImpl2D_cpu_forward_kernel<float, float>, nullptr});
REGISTRAR(GridSampleImpl_cpu,
{{{DataType::Any, DataFormat::Any, {{-1, -1}, {-1, -1}}}, {DataType::Any}}, {{DataType::Float64}}},
{ProdConso::defaultModel, Aidge::GridSampleImpl2D_cpu_forward_kernel<double, double>, nullptr});
REGISTRAR(GridSampleImpl_cpu,
{{{DataType::Any, DataFormat::Any, {{-1, -1}, {-1, -1}}}, {DataType::Any}}, {{DataType::Int32}}},
{ProdConso::defaultModel, Aidge::GridSampleImpl2D_cpu_forward_kernel<int32_t, int32_t>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_KERNELS_H_ */
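The Linear branch of the 2D kernel blends the four neighbouring pixels with bilinear weights, each corner weighted by the area of the rectangle opposite to it. A standalone sketch on a single 2x2 patch (not part of the diff):

#include <cstdio>
int main() {
    // 2x2 patch f[y][x], sampled at (x, y) = (0.25, 0.75) inside the cell.
    const float f[2][2] = {{0.f, 10.f}, {20.f, 30.f}};
    const float x = 0.25f, y = 0.75f;
    const float out = f[0][0] * (1.f - x) * (1.f - y)
                    + f[0][1] * x         * (1.f - y)
                    + f[1][0] * (1.f - x) * y
                    + f[1][1] * x         * y;
    std::printf("%f\n", out); // 17.5
    return 0;
}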
/********************************************************************************
* Copyright (c) 2025 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_HEAVISIDEIMPL_H_
#define AIDGE_CPU_OPERATOR_HEAVISIDEIMPL_H_
#include <cstddef> // std::size_t
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Heaviside.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/future_std/span.hpp"
namespace Aidge {
using HeavisideImplCpu =
OperatorImpl_cpu<Heaviside_Op,
void(std::size_t, const void *, void *, const float),
void(const float, std::size_t, const void *, void *)>;
// Implementation entry point registration for operator Heaviside
REGISTRAR(Heaviside_Op, "cpu", HeavisideImplCpu::create);
} // namespace Aidge
#endif // AIDGE_CPU_OPERATOR_HEAVISIDEIMPL_H_
/********************************************************************************
* Copyright (c) 2025 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_HEAVISIDEIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_HEAVISIDEIMPL_KERNELS_H_
#include "aidge/utils/Registrar.hpp"
#include <cstddef> // std::size_t
#include "aidge/backend/cpu/operator/HeavisideImpl.hpp"
#include "aidge/utils/ErrorHandling.hpp"
namespace Aidge {
template <class I, class O>
void HeavisideImplCpuForwardKernel(std::size_t inputLenght,
const void *input_,
void *output_,
const float value) {
const I *input = static_cast<const I *>(input_);
O *output = static_cast<O *>(output_);
for (std::size_t i = 0; i < inputLenght; ++i) {
output[i] = (input[i] > 0) ? 1 : (input[i] == 0 ? value : 0);
}
}
// Kernels registration to implementation entry point
REGISTRAR(HeavisideImplCpu,
{DataType::Float32},
{ProdConso::inPlaceModel,
Aidge::HeavisideImplCpuForwardKernel<float, float>,
nullptr});
} // namespace Aidge
#endif // AIDGE_CPU_OPERATOR_HEAVISIDEIMPL_KERNELS_H_
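A minimal usage sketch for the Heaviside forward kernel above (not part of the diff; assumes this header is included, values and the helper name are illustrative):

// Illustrative only: Heaviside step returning `value` at the origin.
void heavisideExample() {
    const float input[4] = {-2.f, 0.f, 0.f, 3.f};
    float output[4];
    Aidge::HeavisideImplCpuForwardKernel<float, float>(4, input, output, 0.5f);
    // output == {0.f, 0.5f, 0.5f, 1.f}
}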
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_LRNIMPL_H_
#define AIDGE_CPU_OPERATOR_LRNIMPL_H_
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/LRN.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include <memory>
#include <vector>
namespace Aidge {
// Operator implementation entry point for the backend
using LRNImpl_cpu = OperatorImpl_cpu<LRN_Op,
void(float, float, float, std::size_t, const std::vector<DimSize_t>&, const void*, void*)>;
// Implementation entry point registration to Operator
REGISTRAR(LRN_Op, "cpu", Aidge::LRNImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_LRNIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_LRNIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_LRNIMPL_KERNELS_H_
#include "aidge/utils/Registrar.hpp"
#include <cstddef>
#include <cmath>
#include <functional> // std::multiplies
#include <numeric>    // std::accumulate
#include <vector>
#include "aidge/data/Data.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/backend/cpu/operator/LRNImpl.hpp"
namespace Aidge {
template <class I, class O>
void LRNImpl_cpu_forward_kernel(float alpha, float beta, float bias, std::size_t size, const std::vector<DimSize_t>& inputDims, const void* input_, void* output_)
{
const I* input = static_cast<const I*>(input_);
O* output = static_cast<O*>(output_);
const DimSize_t nbBatch = inputDims[0];
const DimSize_t nbChannels = (inputDims.size() > 1) ? inputDims[1] : 1;
const DimSize_t featureMapSize = (inputDims.size() > 2) ? std::accumulate(inputDims.begin() + 2, inputDims.end(), 1, std::multiplies<DimSize_t>()) : 1;
for (std::size_t batch = 0; batch < nbBatch; ++batch) {
for (std::size_t ch = 0; ch < nbChannels; ++ch) {
const std::size_t ioIndex = (ch + batch*nbChannels) * featureMapSize;
const unsigned int channelMin
= std::max<int>(0, ch - size / 2);
const unsigned int channelMax
= std::min<size_t>(nbChannels - 1, ch + size / 2);
for (std::size_t feature = 0; feature<featureMapSize; ++feature) {
// Accumulate the squared values of the neighbouring channels
O accAccrossChannels(0.0);
for (unsigned int accChannel = channelMin;
accChannel <= channelMax; ++accChannel)
{
const I value = input[(accChannel + batch*nbChannels) * featureMapSize + feature];
accAccrossChannels += value * value;
}
// Compute the output signal
output[ioIndex + feature] = input[ioIndex + feature]
/ std::pow((bias + accAccrossChannels * alpha), beta);
}
}
}
}
REGISTRAR(LRNImpl_cpu,
{DataType::Float32},
{ProdConso::inPlaceModel, Aidge::LRNImpl_cpu_forward_kernel<float, float>, nullptr});
REGISTRAR(LRNImpl_cpu,
{DataType::Float64},
{ProdConso::inPlaceModel, Aidge::LRNImpl_cpu_forward_kernel<double, double>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_LRNIMPL_KERNELS_H_ */
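The kernel above divides each value by a power of the accumulated squared activations of its channel neighbourhood. A standalone sketch of the denominator for a single position (not part of the diff; values illustrative):

#include <cmath>
#include <cstdio>
int main() {
    // One spatial position, 3 channels in the neighbourhood of channel 1.
    const float x[3] = {1.f, 2.f, 3.f};
    const float alpha = 1.0e-4f, beta = 0.75f, bias = 2.f;
    float sq_sum = 0.f;
    for (int c = 0; c < 3; ++c) sq_sum += x[c] * x[c]; // 14
    const float y = x[1] / std::pow(bias + alpha * sq_sum, beta);
    std::printf("%f\n", y);
    return 0;
}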
@@ -12,52 +12,30 @@
#ifndef AIDGE_CPU_OPERATOR_LEAKYRELUIMPL_H_
#define AIDGE_CPU_OPERATOR_LEAKYRELUIMPL_H_
#include "aidge/backend/OperatorImpl.hpp"
#include <memory>
#include <tuple>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/LeakyReLU.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include <memory>
#include <vector>
#include "aidge/backend/cpu/data/GetCPUPtr.h"
namespace Aidge {
// class LeakyReLU_Op;
// compute kernel registry for forward and backward
class LeakyReLUImplForward_cpu
: public Registrable<LeakyReLUImplForward_cpu, std::tuple<DataType, DataType>, void(const LeakyReLU_Op::Attrs&, std::size_t, const void*, void*)> {
};
class LeakyReLUImplBackward_cpu
: public Registrable<LeakyReLUImplBackward_cpu, std::tuple<DataType, DataType>, void(const LeakyReLU_Op::Attrs&, std::size_t, const void*, void*)> {
};
class LeakyReLUImpl_cpu : public OperatorImpl {
private:
const LeakyReLU_Op& mOp;
std::array<NbElts_t, 1> mNbConsumedData;
std::array<NbElts_t, 1> mNbProducedData;
public:
LeakyReLUImpl_cpu(const LeakyReLU_Op& op) : mOp(op), mNbConsumedData({0}), mNbProducedData({0}) {}
static std::unique_ptr<LeakyReLUImpl_cpu> create(const LeakyReLU_Op& op) {
return std::make_unique<LeakyReLUImpl_cpu>(op);
}
public:
NbElts_t getNbRequiredData(const IOIndex_t inputIdx) const override final;
NbElts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
NbElts_t getRequiredMemory(const IOIndex_t /*outputIdx*/, const std::vector<DimSize_t>& /*inputsSize*/) const override final;
NbElts_t getNbConsumedData(const IOIndex_t inputIdx) const override final;
NbElts_t getNbProducedData(const IOIndex_t outputIdx) const override final;
void updateConsummerProducer() override final;
void forward();
void backward();
};
namespace {
static Registrar<LeakyReLU_Op> registrarLeakyReLUImpl_cpu("cpu", Aidge::LeakyReLUImpl_cpu::create);
}
// Operator implementation entry point for the backend
using LeakyReLUImpl_cpu = OperatorImpl_cpu<LeakyReLU_Op,
void(const float,
std::size_t,
const void*,
void*),
void(const float,
std::size_t,
const void*,
void*)>;
// Implementation entry point registration to Operator
REGISTRAR(LeakyReLU_Op, "cpu", Aidge::LeakyReLUImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_LEAKYRELUIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_LEAKYRELUIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_LEAKYRELUIMPL_KERNELS_H_
#include "aidge/utils/Registrar.hpp"
#include "aidge/backend/cpu/operator/LeakyReLUImpl.hpp"
namespace Aidge {
template <class I, class O>
void LeakyReLUImpl_cpu_forward_kernel(const float negativeSlope_,
std::size_t inputLenght,
const void* input_,
void* output_) {
const I* input = static_cast<const I*>(input_);
O* output = static_cast<O*>(output_);
const I negativeSlope = static_cast<const I>(negativeSlope_);
for (std::size_t i = 0; i < inputLenght; ++i) {
output[i] = (input[i] >= 0) ? input[i] : input[i] * negativeSlope;
}
}
template <class I, class O>
void LeakyReLUImpl_cpu_backward_kernel(const float negativeSlope_,
std::size_t inputLenght,
const void* input_,
void* output_) {
const I* input = static_cast<const I*>(input_);
O* output = static_cast<O*>(output_);
const I negativeSlope = static_cast<const I>(negativeSlope_);
for (std::size_t i = 0; i < inputLenght; ++i) {
output[i] = (input[i] > 0) ? input[i] : negativeSlope*input[i];
}
}
// Kernels registration to implementation entry point
REGISTRAR(LeakyReLUImpl_cpu,
{DataType::Float32},
{ProdConso::inPlaceModel, Aidge::LeakyReLUImpl_cpu_forward_kernel<float, float>, Aidge::LeakyReLUImpl_cpu_backward_kernel<float, float>});
REGISTRAR(LeakyReLUImpl_cpu,
{DataType::Float64},
{ProdConso::inPlaceModel, Aidge::LeakyReLUImpl_cpu_forward_kernel<double, double>, Aidge::LeakyReLUImpl_cpu_backward_kernel<double, double>});
REGISTRAR(LeakyReLUImpl_cpu,
{DataType::Int32},
{ProdConso::inPlaceModel, Aidge::LeakyReLUImpl_cpu_forward_kernel<int32_t, int32_t>, Aidge::LeakyReLUImpl_cpu_backward_kernel<int32_t, int32_t>});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_LEAKYRELUIMPL_KERNELS_H_ */
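A minimal usage sketch for the LeakyReLU forward kernel above (not part of the diff; assumes this header is included, values and the helper name are illustrative):

// Illustrative only: LeakyReLU with a negative slope of 0.1.
void leakyReLUExample() {
    const float input[4] = {-2.f, -0.5f, 0.f, 3.f};
    float output[4];
    Aidge::LeakyReLUImpl_cpu_forward_kernel<float, float>(0.1f, 4, input, output);
    // output == {-0.2f, -0.05f, 0.f, 3.f}
}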
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_LNIMPL_H_
#define AIDGE_CPU_OPERATOR_LNIMPL_H_
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Ln.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include <memory>
#include <vector>
namespace Aidge {
// Operator implementation entry point for the backend
using LnImpl_cpu = OperatorImpl_cpu<Ln_Op,
void(const std::size_t, const void*, void*),
void(const std::size_t, const void*, const void*, void*)>;
// Implementation entry point registration to Operator
REGISTRAR(Ln_Op, "cpu", Aidge::LnImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_LNIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_LNIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_LNIMPL_KERNELS_H_
#include "aidge/utils/Registrar.hpp"
#include "aidge/backend/cpu/operator/LnImpl.hpp"
namespace Aidge {
template <class I, class O>
void LnImpl_cpu_forward_kernel(std::size_t inputLenght,
const void* input_,
void* output_) {
const I* input = static_cast<const I*>(input_);
O* output = static_cast<O*>(output_);
const float eps = 1.0e-20f;
//#pragma omp parallel for if (inputLenght > 1024)
for (std::size_t i = 0; i < inputLenght; ++i) {
if (input[i] > I(eps)) {
output[i] = std::log(input[i]);
} else {
output[i] = std::log(I(eps));
}
}
}
template <class I, class GI, class GO>
void LnImpl_cpu_backward_kernel(const std::size_t inputLenght,
const void* input_, const void* grad_output_,
void* grad_input_) {
const I* input = static_cast<const I*>(input_);
const GO* grad_output = static_cast<const GO*>(grad_output_);
GI* grad_input = static_cast<GI*>(grad_input_);
const float eps = 1.0e-20f;
for (std::size_t i = 0; i < inputLenght; ++i) {
if (input[i] > I(eps)) {
grad_input[i] = grad_output[i] / input[i];
} else {
grad_input[i] = GI(0);
}
}
}
// Kernels registration to implementation entry point
REGISTRAR(LnImpl_cpu,
{DataType::Float32},
{ProdConso::inPlaceModel, Aidge::LnImpl_cpu_forward_kernel<float, float>, Aidge::LnImpl_cpu_backward_kernel<float, float, float>});
REGISTRAR(LnImpl_cpu,
{DataType::Float64},
{ProdConso::inPlaceModel, Aidge::LnImpl_cpu_forward_kernel<double, double>, Aidge::LnImpl_cpu_backward_kernel<double, double, double>});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_LNIMPL_KERNELS_H_ */
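A minimal usage sketch for the Ln forward kernel above (not part of the diff; assumes this header is included, the helper name is illustrative). Inputs at or below the eps threshold are clamped, as in the kernel:

// Illustrative only: natural logarithm with eps clamping.
void lnExample() {
    const float input[3] = {1.f, 2.718281828f, 0.f};
    float output[3];
    Aidge::LnImpl_cpu_forward_kernel<float, float>(3, input, output);
    // output[0] == 0.f, output[1] ~= 1.f, output[2] == std::log(1.0e-20f)
}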
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_MATMULIMPL_H_
#define AIDGE_CPU_OPERATOR_MATMULIMPL_H_
#include <array>
#include <memory>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/MatMul.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
namespace Aidge {
// Operator implementation entry point for the backend
using MatMulImpl_cpu = OperatorImpl_cpu<MatMul_Op,
void(const std::size_t, const std::size_t, const std::size_t,
const void *, const void *, void *)>;
// Implementation entry point registration to Operator
REGISTRAR(MatMul_Op, "cpu", Aidge::MatMulImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_MATMULIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_MATMULIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_MATMULIMPL_KERNELS_H_
#include "aidge/backend/cpu/operator/MatMulImpl.hpp"
namespace Aidge {
template <class I, class O>
void MatMulImpl_cpu_forward_kernel(const std::size_t n, const std::size_t k, const std::size_t m,
const void* input1_, const void* input2_, void* __restrict output_) {
// FIXME: missing MatMul parameters as arguments
const I* input1 = static_cast<const I*>(input1_);
const I* input2 = static_cast<const I*>(input2_);
O* __restrict output = static_cast<O* __restrict>(output_);
std::memset(output, O(0), n * m * sizeof(O));
for (std::size_t i = 0; i < n; ++i) {
for (std::size_t l = 0; l < k; ++l) {
for (std::size_t j = 0; j < m; ++j) {
output[i*m + j] += static_cast<O>(input1[i*k + l] * input2[l*m + j]);
}
}
}
}
// Kernels registration to implementation entry point
REGISTRAR(MatMulImpl_cpu,
{DataType::Float32},
{ProdConso::defaultModel, Aidge::MatMulImpl_cpu_forward_kernel<float, float>, nullptr});
REGISTRAR(MatMulImpl_cpu,
{DataType::Float64},
{ProdConso::defaultModel, Aidge::MatMulImpl_cpu_forward_kernel<double, double>, nullptr});
REGISTRAR(MatMulImpl_cpu,
{DataType::Int32},
{ProdConso::defaultModel, Aidge::MatMulImpl_cpu_forward_kernel<int32_t, int32_t>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_MATMULIMPL_KERNELS_H_ */
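A minimal usage sketch for the naive MatMul kernel above: a (2x3) x (3x2) product with n = 2, k = 3, m = 2 (not part of the diff; assumes this header is included, values and the helper name are illustrative):

// Illustrative only: C = A(2x3) * B(3x2), all matrices row-major.
void matMulExample() {
    const float A[6] = {1.f, 2.f, 3.f,
                        4.f, 5.f, 6.f};
    const float B[6] = { 7.f,  8.f,
                         9.f, 10.f,
                        11.f, 12.f};
    float C[4];
    Aidge::MatMulImpl_cpu_forward_kernel<float, float>(2, 3, 2, A, B, C);
    // C == {58.f, 64.f, 139.f, 154.f}
}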
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_MaxPOOLINGIMPL_H_
#define AIDGE_CPU_OPERATOR_MaxPOOLINGIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/MaxPooling.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
namespace Aidge {
// Operator implementation entry point for the backend
using MaxPooling2D_Op = MaxPooling_Op<2>;
using MaxPoolingImpl2D_cpu = OperatorImpl_cpu<MaxPooling_Op<2>,
void(const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 2>&,
const bool,
const std::array<DimSize_t, 4> &,
const void *,
void *)>;
// Implementation entry point registration to Operator
REGISTRAR(MaxPooling2D_Op, "cpu", Aidge::MaxPoolingImpl2D_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_MaxPOOLINGIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_MaxPOOLINGIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_MaxPOOLINGIMPL_KERNELS_H_
#include <algorithm>   // std::max, std::min
#include <array>
#include <cmath>
#include <tuple>
#include <type_traits> // std::make_signed
#include "aidge/backend/cpu/operator/MaxPoolingImpl.hpp"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/data/Data.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
/**
* @brief Forward kernel for 2D MaxPooling on CPU backend.
* @tparam I Input data type.
* @tparam O Output data type.
* @param strideDims Stride along each spatial dimension.
* @param kernelDims Kernel size along each spatial dimension.
* @param dims Array of input dimensions.
* @param input_ const input Tensor.
* @param output_ Output Tensor.
*/
template <class I, class O>
void MaxPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
const std::array<DimSize_t, 2>& kernelDims,
const bool /*ceilMode*/,
const std::array<DimSize_t, 4> &dims,
const void *input_,
void *output_) {
// FIXME: missing pooling parameters as arguments
const I *input = static_cast<const I *>(input_);
O *output = static_cast<O *>(output_);
// output H size
const std::size_t oxSize =
static_cast<std::size_t>(std::floor(static_cast<float>(dims[2] - kernelDims[0] + strideDims[0]) /
static_cast<float>(strideDims[0])));
// output W size
const std::size_t oySize =
static_cast<std::size_t>(std::floor(static_cast<float>(dims[3] - kernelDims[1] + strideDims[1]) /
static_cast<float>(strideDims[1])));
// TODO: kernel computation
// output (batch, outCh, Xout, Yout)
// input (batch, ch, Xin, Yin)
// weight (outCh, ch, kernelX, kernelY)
// does not take Dilation parameter into account
using signedsize = std::make_signed<std::size_t>::type;
for (std::size_t batch = 0; batch < dims[0]; ++batch) {
for (std::size_t ch = 0; ch < dims[1]; ++ch) {
const std::size_t oIndex = (ch + batch*dims[1]) * oxSize * oySize;
const std::size_t iIndex = (ch + batch*dims[1]) * dims[2] * dims[3];
for (std::size_t ox = 0; ox < oxSize; ++ox) {
const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]);
const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
const std::size_t sxMax = (static_cast<signedsize>(dims[2]) + difx) < 0 ? 0 : ((dims[2] + difx) > kernelDims[0] ? kernelDims[0] : dims[2] + difx);
for (std::size_t oy = 0; oy < oySize; ++oy) {
const signedsize dify = static_cast<signedsize>(- oy * strideDims[1]);
const std::size_t syMin = static_cast<std::size_t>(std::max(dify, signedsize(0)));
const std::size_t syMax = (static_cast<signedsize>(dims[3]) + dify) < 0 ? 0 : ((dims[3] + dify) > kernelDims[1] ? kernelDims[1] : dims[3] + dify);
const std::size_t oIndexFull = oIndex + ox*oySize + oy;
const std::size_t ix = ox * strideDims[0];
const std::size_t iy = oy * strideDims[1];
I poolValue(0.0);
bool valid = false;
// the channel is fixed by the outer loop; scan the pooling window only
for (unsigned int sy = syMin; sy < syMax; ++sy) {
for (unsigned int sx = sxMin; sx < sxMax; ++sx)
{
const I value = input[iIndex + (ix+sx)*dims[3] + (iy+sy)];
if (!valid || value > poolValue) {
poolValue = value;
valid = true;
}
}
}
output[oIndexFull] = poolValue;
}
}
}
}
}
//N2D2 version
/*
template <class T>
void N2D2::PoolCell_Frame_Kernels::forwardMax(const T* alpha,
const Tensor<T>&
inputs,
const Descriptor& desc,
const T* beta,
Tensor<T>& outputs,
Tensor<ArgMax>& argMax,
bool useArgMax,
const Tensor<bool>& maps)
{
const unsigned int size = inputs.dimB() * outputs.dimZ();
#if defined(_OPENMP) && _OPENMP >= 200805
#pragma omp parallel for collapse(2) if (size > 16)
#else
#pragma omp parallel for if (inputs.dimB() > 4 && size > 16)
#endif
for (int batchPos = 0; batchPos < (int)inputs.dimB(); ++batchPos) {
for (unsigned int output = 0; output < outputs.dimZ(); ++output) {
for (unsigned int oy = 0; oy < outputs.dimY(); ++oy) {
for (unsigned int ox = 0; ox < outputs.dimX(); ++ox) {
const unsigned int sxMin = (unsigned int)std::max(
desc.padding[0] - (int)(ox * desc.stride[0]), 0);
const unsigned int syMin = (unsigned int)std::max(
desc.padding[1] - (int)(oy * desc.stride[1]), 0);
const unsigned int sxMax = Utils::clamp
<int>(inputs.dimX() + desc.padding[0] - ox * desc.stride[0],
0,
desc.pool[0]);
const unsigned int syMax = Utils::clamp
<int>(inputs.dimY() + desc.padding[1] - oy * desc.stride[1],
0,
desc.pool[1]);
const int ix = (int)(ox * desc.stride[0]) - desc.padding[0];
const int iy = (int)(oy * desc.stride[1]) - desc.padding[1];
T poolValue(0.0);
// For each output, compute the pool value
if (useArgMax) {
const ArgMax inputMax
= argMax(ox, oy, output, batchPos);
if (inputMax.valid) {
poolValue = inputs(inputMax.ix,
inputMax.iy,
inputMax.channel,
batchPos);
}
}
else {
unsigned int ixMax = 0;
unsigned int iyMax = 0;
unsigned int channelMax = 0;
bool valid = false;
for (unsigned int channel = 0; channel < inputs.dimZ();
++channel)
{
if (!maps.empty() && !maps(output, channel))
continue;
for (unsigned int sy = syMin; sy < syMax; ++sy) {
for (unsigned int sx = sxMin; sx < sxMax; ++sx)
{
const T value = inputs(ix + sx,
iy + sy,
channel,
batchPos);
if (!valid || value > poolValue) {
poolValue = value;
valid = true;
ixMax = ix + sx;
iyMax = iy + sy;
channelMax = channel;
}
}
}
}
argMax(ox, oy, output, batchPos)
= ArgMax(ixMax, iyMax, channelMax, valid);
}
outputs(ox, oy, output, batchPos)
= (*alpha) * poolValue
+ (*beta) * outputs(ox, oy, output, batchPos);
}
}
}
}
}
*/
// Kernels registration to implementation entry point
REGISTRAR(MaxPoolingImpl2D_cpu,
{DataType::Float32},
{ProdConso::inPlaceModel, Aidge::MaxPoolingImpl2D_cpu_forward_kernel<float, float>, nullptr});
REGISTRAR(MaxPoolingImpl2D_cpu,
{DataType::Float64},
{ProdConso::inPlaceModel, Aidge::MaxPoolingImpl2D_cpu_forward_kernel<double, double>, nullptr});
REGISTRAR(MaxPoolingImpl2D_cpu,
{DataType::Int32},
{ProdConso::inPlaceModel, Aidge::MaxPoolingImpl2D_cpu_forward_kernel<int32_t, int32_t>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_MaxPOOLINGIMPL_KERNELS_H_ */
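The output spatial size used by the kernel above is floor((in - kernel + stride) / stride) for each dimension. A short worked sketch (not part of the diff):

#include <cmath>
#include <cstddef>
#include <cstdio>
int main() {
    const std::size_t in = 7, kernel = 3, stride = 2;
    const std::size_t out = static_cast<std::size_t>(
        std::floor(static_cast<float>(in - kernel + stride) / static_cast<float>(stride)));
    std::printf("%zu\n", out); // 3 windows, starting at columns 0, 2 and 4
    return 0;
}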
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_MULIMPL_H_
#define AIDGE_CPU_OPERATOR_MULIMPL_H_
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Mul.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include <memory>
#include <vector>
namespace Aidge {
// Operator implementation entry point for the backend
using MulImpl_cpu = OperatorImpl_cpu<Mul_Op,
void(std::vector<std::size_t>,
std::vector<std::size_t>,
const std::vector<std::size_t>&,
const void*,
const void*,
void*),
void(const std::size_t,
const std::size_t,
const std::size_t,
const std::vector<std::size_t>,
const std::vector<std::size_t>,
const std::vector<std::size_t>,
const void*,
const void*,
const void*,
void*,
void*)>;
// Implementation entry point registration to Operator
REGISTRAR(Mul_Op, "cpu", Aidge::MulImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_MULIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_MULIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_MULIMPL_KERNELS_H_
#include "aidge/utils/Registrar.hpp"
#include <algorithm>  // std::fill_n
#include <cstddef>    // std::size_t
#include <cstdint>    // std::int32_t, std::int64_t
#include <functional> // std::multiplies
#include <memory>     // std::unique_ptr
#include <numeric>    // std::accumulate
#include <vector>
#include "aidge/backend/cpu/data/Broadcasting.hpp"
#include "aidge/backend/cpu/operator/MulImpl.hpp"
namespace {
// suppose values are contiguous in memory
template <class I1, class I2, class O>
void mul_contiguous_arrays(const std::size_t input1size,
const std::size_t input2size,
const std::size_t output1size,
const I1* input1,
const I2* input2,
O* output)
{
for (std::size_t i = 0; i < output1size; ++i)
{
const std::size_t in1_id = (input1size != 1) ? i : 0;
const std::size_t in2_id = (input2size != 1) ? i : 0;
output[i] = static_cast<O>(input1[in1_id] * input2[in2_id]);
}
}
}
namespace Aidge {
template <class I1, class I2, class O>
void MulImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
std::vector<std::size_t> dims1,
const std::vector<std::size_t>& outputDims,
const void* input0_,
const void* input1_,
void* output_) {
const I1* input_0 = static_cast<const I1*>(input0_);
const I2* input_1 = static_cast<const I2*>(input1_);
O* output = static_cast<O*>(output_);
// [5,2,1,7] & [2,6,7]
// 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7]
// 2. Find the highest equal dimension -> 3
// Exception: if the first diverging dimension is the last one, then -> 4 (dims.size())
// 3. Compute the highest number of contiguous data -> 7
// 4. Compute stride and offset step for the broadcast mechanism
// 5. Call a simple kernel
// ## Compute compatible input dimensions
// special case for equal dimensions, the kernel is called with the entire arrays at once
if (dims0 == dims1) {
const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
for (std::size_t i = 0; i < input0_contiguous_size; ++i)
{
output[i] = static_cast<O>(input_0[i] * input_1[i]);
}
return;
}
// set dimensions to be of equal size by filling the smallest one with ones.
if (dims0.size() > dims1.size()) {
dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
}
else if (dims1.size() > dims0.size()) {
dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
}
const std::size_t nbDims = dims0.size();
// Find the highest equal dimension
// std::size_t contiguousIdx = nbDims - 1;
std::size_t contiguousIdx = nbDims;
while (contiguousIdx-- > 0) {
// for (; contiguousIdx+1 > 0; --contiguousIdx) {
if (dims0[contiguousIdx] != dims1[contiguousIdx]) {
if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1
const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1;
while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) {
--contiguousIdx;
}
}
break;
}
}
++contiguousIdx;
// Compute the highest number of contiguous data for each Tensor
const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
// initialize strides to iterate through data because of broadcasting
std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx);
if (contiguousIdx > 0) {
stride_post0[contiguousIdx - 1] = 1;
stride_post1[contiguousIdx - 1] = 1;
for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) {
stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]);
stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]);
}
for (std::size_t i = 0; i != contiguousIdx; ++i) {
stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1;
stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1;
}
}
// variables for arrays offsets
std::size_t offsetIn0 = 0;
std::size_t offsetIn1 = 0;
std::size_t offsetOut = 0;
std::size_t dim = contiguousIdx - 1;
const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
for (std::size_t stack = 0; stack < nbStacks;) {
mul_contiguous_arrays<I1,I2,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
input_0 + offsetIn0*input0_contiguous_size,
input_1 + offsetIn1*input1_contiguous_size,
output + offsetOut*output_contiguous_size);
if (++stack < nbStacks) {
std::size_t tmp_stack = stack;
while(tmp_stack % outputDims[dim] == 0) {
tmp_stack /= outputDims[dim];
dim--;
}
offsetIn0 += stride_step0[dim];
offsetIn1 += stride_step1[dim];
++offsetOut;
dim = contiguousIdx - 1;
}
}
}
template <class I1, class I2, class O>
void MulImpl_cpu_backward_kernel(const std::size_t input0Length,
const std::size_t input1Length,
const std::size_t gradOutputLength,
const std::vector<std::size_t>& dims0,
const std::vector<std::size_t>& dims1,
const std::vector<std::size_t>& outputDims,
const void* input0_,
const void* input1_,
const void* grad_output_,
void* gradientInput0_,
void* gradientInput1_)
{
const I1* input0 = static_cast<const I1*>(input0_);
const I2* input1 = static_cast<const I2*>(input1_);
const O* grad_output = static_cast<const O*>(grad_output_);
auto* grad_input_0 = static_cast<I1*>(gradientInput0_);
auto* grad_input_1 = static_cast<I2*>(gradientInput1_);
std::fill_n(grad_input_0, input0Length, static_cast<I1>(0));
std::fill_n(grad_input_1, input1Length, static_cast<I2>(0));
// Broadcast dims0 and dims1 to match the shape of outputDims
auto broadcastedDims0 = getBroadcastedDims(outputDims, dims0);
auto broadcastedDims1 = getBroadcastedDims(outputDims, dims1);
for (std::size_t i = 0; i < gradOutputLength; ++i) {
auto idxOutputGrad = getMultiDimIndices(outputDims, i);
std::vector<std::size_t> idxInput0(broadcastedDims0.size());
std::vector<std::size_t> idxInput1(broadcastedDims1.size());
// Map output indices to input0 indices, considering broadcasting
for (std::size_t dimension = 0; dimension < broadcastedDims0.size(); ++dimension) {
// If input0 is broadcast along this dimension (size 1), the index is 0.
// idxInput0 is the multi-dimensional index of the input0 element that
// contributes to the output element at flat index i
// (a worked standalone example follows at the end of this header).
idxInput0[dimension] = (broadcastedDims0[dimension] == 1) ? 0 : idxOutputGrad[dimension];
}
for (std::size_t dimension = 0; dimension < broadcastedDims1.size(); ++dimension) {
idxInput1[dimension] = (broadcastedDims1[dimension] == 1) ? 0 : idxOutputGrad[dimension];
}
// We have to access tensors with a flat index, hence the conversion
auto idx0 = getFlattenedIndex(broadcastedDims0, idxInput0);
auto idx1 = getFlattenedIndex(broadcastedDims1, idxInput1);
grad_input_0[idx0] += static_cast<I1>(grad_output[i] * input1[idx1]);
grad_input_1[idx1] += static_cast<I2>(grad_output[i] * input0[idx0]);
}
}
// Kernels registration to implementation entry point
REGISTRAR(MulImpl_cpu,
{DataType::Float32},
{ProdConso::inPlaceModel, Aidge::MulImpl_cpu_forward_kernel<float, float, float>, Aidge::MulImpl_cpu_backward_kernel<float, float, float>});
REGISTRAR(MulImpl_cpu,
{{{DataType::Float32}, {DataType::Float64}}, {DataType::Float32}},
{ProdConso::inPlaceModel, Aidge::MulImpl_cpu_forward_kernel<float, double, float>, Aidge::MulImpl_cpu_backward_kernel<float, double, float>});
REGISTRAR(MulImpl_cpu,
{DataType::Float64},
{ProdConso::inPlaceModel, Aidge::MulImpl_cpu_forward_kernel<double, double, double>, Aidge::MulImpl_cpu_backward_kernel<double, double, double>});
REGISTRAR(MulImpl_cpu,
{DataType::Int32},
{ProdConso::inPlaceModel, Aidge::MulImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>, Aidge::MulImpl_cpu_backward_kernel<std::int32_t, std::int32_t, std::int32_t>});
REGISTRAR(MulImpl_cpu,
{DataType::Int64},
{ProdConso::inPlaceModel, Aidge::MulImpl_cpu_forward_kernel<std::int64_t, std::int64_t, std::int64_t>, Aidge::MulImpl_cpu_backward_kernel<std::int64_t, std::int64_t, std::int64_t>});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_MULIMPL_KERNELS_H_ */
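As a quick sanity check of the broadcast handling above, here is a minimal standalone sketch that calls the backward kernel on a {2,1} x {1,3} broadcast. It assumes the kernels header is reachable as "aidge/backend/cpu/operator/MulImpl_kernels.hpp"; the shapes and expected values are illustrative only.

#include <cstddef>
#include <vector>

#include "aidge/backend/cpu/operator/MulImpl_kernels.hpp"

int main() {
    // input0 is broadcast along the columns, input1 along the rows.
    const std::vector<std::size_t> dims0{2, 1}, dims1{1, 3}, outDims{2, 3};
    const std::vector<float> in0{2.f, 3.f};
    const std::vector<float> in1{10.f, 20.f, 30.f};
    const std::vector<float> gradOut(6, 1.f);   // all-ones upstream gradient
    std::vector<float> gradIn0(in0.size()), gradIn1(in1.size());

    Aidge::MulImpl_cpu_backward_kernel<float, float, float>(
        in0.size(), in1.size(), gradOut.size(),
        dims0, dims1, outDims,
        in0.data(), in1.data(), gradOut.data(),
        gradIn0.data(), gradIn1.data());

    // gradIn0 accumulates in1 over the broadcast axis -> {60, 60}
    // gradIn1 accumulates in0 over the broadcast axis -> {5, 5, 5}
    return 0;
}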
@@ -9,44 +9,44 @@
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_PRODUCERIMPL_H_   // previous revision guard (removed)
#define AIDGE_CPU_OPERATOR_PRODUCERIMPL_H_
#ifndef AIDGE_CPU_OPERATOR_IMPL_H_           // current revision guard (added)
#define AIDGE_CPU_OPERATOR_IMPL_H_
#include <cstddef>  // std::size_t
#include <memory>
#include <tuple>    // std::tuple
#include <vector>
#include "aidge/backend/OperatorImpl.hpp"
#include "aidge/operator/Producer.hpp"   // previous revision only
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
// Removed in this revision: the dedicated Producer implementation.
class ProducerImpl_cpu : public OperatorImpl {
private:
    const Producer_Op &mOp;
public:
    ProducerImpl_cpu(const Producer_Op &op) : mOp(op) {}
    static std::unique_ptr<ProducerImpl_cpu> create(const Producer_Op &op) {
        return std::make_unique<ProducerImpl_cpu>(op);
    }
public:
    NbElts_t getNbRequiredData(const IOIndex_t inputIdx) const override final;
    NbElts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
    NbElts_t getRequiredMemory(const IOIndex_t /*outputIdx*/, const std::vector<DimSize_t> &/*inputsSize*/) const override final;
    NbElts_t getNbConsumedData(const IOIndex_t inputIdx) const override final;
    NbElts_t getNbProducedData(const IOIndex_t outputIdx) const override final;
    void updateConsummerProducer() override final;
    void forward();
    void backward();
};
// Added in this revision: a generic, registrable implementation entry point
// shared by all CPU operator backends.
template <class Op, class FwdFunc, class BwdFunc = void()>
class OperatorImpl_cpu : public OperatorImpl,
    public Registrable<OperatorImpl_cpu<Op, FwdFunc, BwdFunc>, ImplSpec, Impl<FwdFunc, BwdFunc>>
{
public:
    OperatorImpl_cpu(const Op& op) : OperatorImpl(op, "cpu") {}
    static std::unique_ptr<OperatorImpl_cpu<Op, FwdFunc, BwdFunc>> create(const Op& op) {
        return std::make_unique<OperatorImpl_cpu<Op, FwdFunc, BwdFunc>>(op);
    }
    virtual std::shared_ptr<ProdConso> getProdConso() const override {
        const auto impl = Registrar<OperatorImpl_cpu>::create(getBestMatch(getRequiredSpec()));
        return impl.prodConso(mOp);
    }
    virtual std::vector<ImplSpec> getAvailableImplSpecs() const override {
        // return Registrar<OperatorImpl_cpu>::getKeys(); // Note: cannot return set due to python binding
        std::set<ImplSpec> implSpecsSet = Registrar<OperatorImpl_cpu>::getKeys();
        return std::vector<ImplSpec>(implSpecsSet.begin(), implSpecsSet.end());
    }
    void forward() override;
    void backward() override;
};
// Removed in this revision: the Producer registrar.
namespace {
static Registrar<Producer_Op> registrarProducer1DImpl_cpu("cpu", Aidge::ProducerImpl_cpu::create);
} // namespace
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_PRODUCERIMPL_H_ */   // previous revision only
#endif /* AIDGE_CPU_OPERATOR_IMPL_H_ */
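To illustrate how this template is meant to be used, here is a hypothetical sketch; MyOp_Op, MyOpImpl_cpu and MyOpImpl_cpu_forward_kernel are placeholder names, not part of the diff, and it only mirrors the registration pattern already used by the other backends in this repository.

// Hypothetical example: entry point tying an operator type to its kernel signature.
using MyOpImpl_cpu = OperatorImpl_cpu<MyOp_Op,
    void(std::size_t /*nbElements*/, const void* /*input*/, void* /*output*/)>;

// Make the "cpu" implementation discoverable from the operator type...
REGISTRAR(MyOp_Op, "cpu", Aidge::MyOpImpl_cpu::create);

// ...and attach a concrete float kernel to the implementation entry point
// (no backward kernel in this sketch, hence nullptr).
REGISTRAR(MyOpImpl_cpu,
    {DataType::Float32},
    {ProdConso::inPlaceModel, Aidge::MyOpImpl_cpu_forward_kernel<float, float>, nullptr});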
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_PADIMPL_H_
#define AIDGE_CPU_OPERATOR_PADIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Pad.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
namespace Aidge {
class Pad_ProdConso_cpu : public ProdConso {
public:
Pad_ProdConso_cpu(const Operator& op): ProdConso(op) {}
static std::unique_ptr<ProdConso> defaultModel(const Operator& op) {
return std::make_unique<Pad_ProdConso_cpu>(op);
}
Elts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
};
// Operator implementation entry point for the backend
using Pad1D_Op = Pad_Op<1>;
using PadImpl1D_cpu = OperatorImpl_cpu<Pad_Op<1>,
void(const std::array<DimSize_t, 2>&,
const PadBorderType,
const double,
const std::array<DimSize_t, 3> &,
const void *,
void *)>;
using Pad2D_Op = Pad_Op<2>;
using PadImpl2D_cpu = OperatorImpl_cpu<Pad_Op<2>,
void(const std::array<DimSize_t, 4>&,
const PadBorderType,
const double,
const std::array<DimSize_t, 4> &,
const void *,
void *)>;
// Implementation entry point registration to Operator
REGISTRAR(Pad1D_Op, "cpu", Aidge::PadImpl1D_cpu::create);
REGISTRAR(Pad2D_Op, "cpu", Aidge::PadImpl2D_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_PADIMPL_H_ */