/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_MaxPOOLINGIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_MaxPOOLINGIMPL_KERNELS_H_
#include <array>
#include <cmath>
#include <cstdint>
#include <tuple>
#include "aidge/backend/cpu/operator/MaxPoolingImpl.hpp"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/data/Data.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
/**
* @brief Forward kernel for 2D MaxPooling on CPU backend.
* @tparam I Input data type.
* @tparam O Output data type.
* @param strideDims Array of stride dimensions.
* @param kernelDims Array of kernel dimensions.
* @param dilations Array of dilation factors.
* @param ceilMode If true, output sizes are rounded up (ceil); otherwise rounded down (floor).
* @param dims Array of input dimensions.
* @param input_ const input Tensor.
* @param output_ Output Tensor.
*/
template <class I, class O>
void MaxPoolingImpl2D_cpu_forward_kernel(
const std::array<DimSize_t, 2>& strideDims,
const std::array<DimSize_t, 2>& kernelDims,
const std::array<DimSize_t, 2>& dilations,
const bool ceilMode,
const std::array<DimSize_t, 4> &dims,
const void *input_,
void *output_)
{
const I *input = static_cast<const I *>(input_);
O *output = static_cast<O *>(output_);
// output H size
auto hOut = static_cast<float>(
dims[2] - (kernelDims[0] - 1) * dilations[0] - 1 + strideDims[0]
) / static_cast<float>(strideDims[0]);
const std::size_t outXSize = ceilMode
? static_cast<std::size_t>(std::ceil(hOut))
: static_cast<std::size_t>(std::floor(hOut));
// output W size
auto wOut = static_cast<float>(
dims[3] - ( kernelDims[1] - 1) * dilations[1] - 1 + strideDims[1]
) / static_cast<float>(strideDims[1]);
const std::size_t outYSize = ceilMode
? static_cast<std::size_t>(std::ceil(wOut))
: static_cast<std::size_t>(std::floor(wOut));
using signedsize = std::make_signed<std::size_t>::type;
#ifdef _OPENMP
#pragma omp parallel for collapse(2) if (dims[0] * dims[1] >= 16)
#endif
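// The (batch, channel) loops are parallelized only when there are at least 16 batch*channel
// pairs, so that small workloads do not pay the OpenMP threading overhead.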
for (int batch = 0; batch < static_cast<int>(dims[0]); ++batch){
for (int channel = 0; channel < static_cast<int>(dims[1]); ++channel){
auto batchChannelIndex = (channel + batch * dims[1]);
const std::size_t outputBaseIndex = batchChannelIndex * outXSize * outYSize;
const std::size_t inputBaseIndex = batchChannelIndex * dims[2] * dims[3];
for (std::size_t outX = 0; outX < outXSize; ++outX) {
const signedsize negStrideX = static_cast<signedsize>(
-outX * strideDims[0]
);
const std::size_t kernelXMin = static_cast<std::size_t>(
std::max(negStrideX, signedsize(0))
);
/* Compute kernelXMax */
std::size_t kernelXMax = dims[2] + negStrideX;
if ((static_cast<signedsize>(dims[2]) + negStrideX) < 0){
kernelXMax = 0;
}
else if (kernelXMax > kernelDims[0]){
kernelXMax = kernelDims[0];
}
for (std::size_t outY = 0; outY < outYSize; ++outY) {
const signedsize negStrideY = static_cast<signedsize>(-outY * strideDims[1]);
const std::size_t kernelYMin = static_cast<std::size_t>(
std::max(negStrideY, signedsize(0))
);
/* Compute kernelYMax */
std::size_t kernelYMax = dims[3] + negStrideY;
const std::size_t outputIndex = outputBaseIndex + outX * outYSize + outY;
const std::size_t strideXoffset = outX * strideDims[0];
const std::size_t strideYoffset = outY * strideDims[1];
I poolValue(0.0);
bool valid = false;
if (static_cast<signedsize>(dims[3]) + negStrideY < 0){
kernelYMax = 0;
}
else if(kernelYMax > kernelDims[1]){
kernelYMax = kernelDims[1];
}
for (unsigned int kY = kernelYMin; kY < kernelYMax ; ++kY){
for (unsigned int kX = kernelXMin; kX < kernelXMax; ++kX){
// Apply dilation factor to kernel indices
const std::size_t dilatedkernelX = kX * dilations[0];
const std::size_t dilatedkernelY = kY * dilations[1];
// Ensure indices are within bounds
auto inputXPostDilation = strideXoffset + dilatedkernelX;
auto inputYPostDilation = strideYoffset + dilatedkernelY;
if (inputXPostDilation < dims[2] && inputYPostDilation < dims[3]){
const I inputValue = input[
inputBaseIndex + inputXPostDilation * dims[3]
+ inputYPostDilation
];
if (!valid || inputValue > poolValue) {
poolValue = inputValue;
valid = true;
}
}
}
}
output[outputIndex] = poolValue;
}
}
}
}
}
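// Illustrative usage sketch (assuming a contiguous NCHW float buffer): applying the forward
// kernel above to a single 4x4 feature map with a 2x2 kernel, stride 2, dilation 1 and floor
// rounding yields a 2x2 output holding the maximum of each non-overlapping 2x2 window:
//
//   std::array<Aidge::DimSize_t, 2> stride{2, 2}, kernel{2, 2}, dilation{1, 1};
//   std::array<Aidge::DimSize_t, 4> dims{1, 1, 4, 4};
//   std::vector<float> in  = { 1,  2,  3,  4,
//                              5,  6,  7,  8,
//                              9, 10, 11, 12,
//                             13, 14, 15, 16};
//   std::vector<float> out(2 * 2);
//   Aidge::MaxPoolingImpl2D_cpu_forward_kernel<float, float>(
//       stride, kernel, dilation, /*ceilMode=*/false, dims, in.data(), out.data());
//   // out == {6, 8, 14, 16}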
template <class I, class O>
void MaxPoolingImpl2D_cpu_backward_kernel(
const std::array<DimSize_t, 2>& strideDims,
const std::array<DimSize_t, 2>& kernelDims,
const std::array<DimSize_t, 2>& dilations,
const bool ceilMode,
const std::array<DimSize_t, 4> &dims,
const void *input_,
void *grad_
)
{
const I *input = static_cast<const I *>(input_);
I *grad = static_cast<I *>(grad_);
// output H size
auto hOut = static_cast<float>(
dims[2] - (kernelDims[0] - 1) * dilations[0] - 1 + strideDims[0]
) / static_cast<float>(strideDims[0]);
const std::size_t outXSize = ceilMode
? static_cast<std::size_t>(std::ceil(hOut))
: static_cast<std::size_t>(std::floor(hOut));
// output W size
auto wOut = static_cast<float>(
dims[3] - ( kernelDims[1] - 1) * dilations[1] - 1 + strideDims[1]
) / static_cast<float>(strideDims[1]);
const std::size_t outYSize = ceilMode
? static_cast<std::size_t>(std::ceil(wOut))
: static_cast<std::size_t>(std::floor(wOut));
using signedsize = std::make_signed<std::size_t>::type;
for (std::size_t batch = 0; batch < dims[0]; ++batch){
for (std::size_t channel = 0; channel < dims[1]; ++channel){
auto batchChannelIndex = (channel + batch * dims[1]);
const std::size_t inputBaseIndex = batchChannelIndex * dims[2] * dims[3];
for (std::size_t outX = 0; outX < outXSize; ++outX) {
const signedsize negStrideX = static_cast<signedsize>(
-outX * strideDims[0]
);
const std::size_t kernelXMin = static_cast<std::size_t>(
std::max(negStrideX, signedsize(0))
);
/* Compute kernelXMax */
std::size_t kernelXMax = dims[2] + negStrideX;
if ((static_cast<signedsize>(dims[2]) + negStrideX) < 0){
kernelXMax = 0;
}
else if (kernelXMax > kernelDims[0]){
kernelXMax = kernelDims[0];
}
for (std::size_t outY = 0; outY < outYSize; ++outY) {
const signedsize negStrideY = static_cast<signedsize>(-outY * strideDims[1]);
const std::size_t kernelYMin = static_cast<std::size_t>(
std::max(negStrideY, signedsize(0))
);
/* Compute kernelYMax */
std::size_t kernelYMax = dims[3] + negStrideY;
const std::size_t strideXoffset = outX * strideDims[0];
const std::size_t strideYoffset = outY * strideDims[1];
I poolValue(0.0);
bool valid = false;
if (static_cast<signedsize>(dims[3]) + negStrideY < 0){
kernelYMax = 0;
}
else if(kernelYMax > kernelDims[1]){
kernelYMax = kernelDims[1];
}
std::size_t saveIndex = 0;
for (unsigned int kY = kernelYMin; kY < kernelYMax ; ++kY){
for (unsigned int kX = kernelXMin; kX < kernelXMax; ++kX){
// Apply dilation factor to kernel indices
const std::size_t dilatedkernelX = kX * dilations[0];
const std::size_t dilatedkernelY = kY * dilations[1];
// Ensure indices are within bounds
auto inputXPostDilation = strideXoffset + dilatedkernelX;
auto inputYPostDilation = strideYoffset + dilatedkernelY;
if (inputXPostDilation < dims[2] && inputYPostDilation < dims[3]){
std::size_t inputIndex =
inputBaseIndex + inputXPostDilation * dims[3]
+ inputYPostDilation;
const I inputValue = input[inputIndex];
if (!valid || inputValue > poolValue) {
poolValue = inputValue;
saveIndex = inputIndex;
valid = true;
}
}
}
}
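// Max pooling routes gradient only to the arg-max location of each window: this backward
// kernel assumes an implicit upstream gradient of 1, in effect counting, for every input
// element, how many output windows selected it as their maximum.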
if (valid){
grad[saveIndex]++;
}
}
}
}
}
}
// Kernels registration to implementation entry point
REGISTRAR(MaxPoolingImpl2D_cpu,
{DataType::Float32},
{
ProdConso::inPlaceModel,
Aidge::MaxPoolingImpl2D_cpu_forward_kernel<float, float>,
Aidge::MaxPoolingImpl2D_cpu_backward_kernel<float, float>,
}
);
REGISTRAR(MaxPoolingImpl2D_cpu,
{DataType::Float64},
{
ProdConso::inPlaceModel,
Aidge::MaxPoolingImpl2D_cpu_forward_kernel<double, double>,
Aidge::MaxPoolingImpl2D_cpu_backward_kernel<double, double>,
}
);
REGISTRAR(MaxPoolingImpl2D_cpu,
{DataType::Int32},
{
ProdConso::inPlaceModel,
Aidge::MaxPoolingImpl2D_cpu_forward_kernel<int32_t, int32_t>,
Aidge::MaxPoolingImpl2D_cpu_backward_kernel<int32_t, int32_t>,
}
);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_MaxPOOLINGIMPL_KERNELS_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_MODIMPL_H_
#define AIDGE_CPU_OPERATOR_MODIMPL_H_
#include <memory>
#include <tuple>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Mod.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
// Operator implementation entry point for the backend
using ModImpl_cpu = OperatorImpl_cpu<Mod_Op,
void(bool, const std::size_t, const std::size_t, const std::size_t, const void*, const void*,void*)>;
// Implementation entry point registration to Operator
REGISTRAR(Mod_Op, "cpu", Aidge::ModImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_MODIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_MODIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_MODIMPL_KERNELS_H_
#include <numeric> // std::accumulate
#include <cstddef> // std::size_t
#include <cstdint> // std::int32_t, std::int64_t
#include <functional> // std::multiplies
#include "aidge/utils/Registrar.hpp"
#include "aidge/backend/cpu/data/Broadcasting.hpp"
#include "aidge/backend/cpu/operator/ModImpl.hpp"
namespace Aidge {
template <typename T,
typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
static inline T modulus(T a, T b) {
return a % b;
}
template <typename T,
typename std::enable_if<!std::is_integral<T>::value>::type* = nullptr>
static inline T modulus(T /*a*/, T /*b*/) {
AIDGE_THROW_OR_ABORT(std::runtime_error, "Mod Operator with fmod attribute set to false only supports integer types.");
}
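// The two overloads above dispatch on std::is_integral: integer types use the native % operator,
// while calling this path with a floating-point type raises a runtime error. Floating-point
// inputs are only supported through the fmod flag handled by std::fmod in the kernel below.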
template <class I1, class I2, class O>
constexpr void ModImpl_cpu_forward_kernel(bool fmod,
const std::size_t input1size_,
const std::size_t input2size_,
const std::size_t output1size_,
const void* input1_,
const void* input2_,
void* output_) {
const I1* input_1 = static_cast<const I1*>(input1_);
const I2* input_2 = static_cast<const I2*>(input2_);
O* output = static_cast<O*>(output_);
// suppose values are contiguous in memory
for (std::size_t i = 0; i < output1size_; ++i) {
const std::size_t in1_id = (input1size_ != 1) ? i : 0;
const std::size_t in2_id = (input2size_ != 1) ? i : 0;
if (fmod) {
output[i] = static_cast<O>(std::fmod(input_1[in1_id], input_2[in2_id]));
}
else {
output[i] = static_cast<O>(modulus(input_1[in1_id], input_2[in2_id]));
}
}
}
// Kernels registration to implementation entry point
REGISTRAR(ModImpl_cpu,
{DataType::Float32},
{ProdConso::inPlaceModel, Aidge::ModImpl_cpu_forward_kernel<float, float, float>, nullptr});
REGISTRAR(ModImpl_cpu,
{DataType::Float64},
{ProdConso::inPlaceModel, Aidge::ModImpl_cpu_forward_kernel<double, double, double>, nullptr});
REGISTRAR(ModImpl_cpu,
{DataType::Int32},
{ProdConso::inPlaceModel, Aidge::ModImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>, nullptr});
REGISTRAR(ModImpl_cpu,
{DataType::UInt64},
{ProdConso::inPlaceModel, Aidge::ModImpl_cpu_forward_kernel<std::uint64_t, std::uint64_t, std::uint64_t>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_MODIMPL_KERNELS_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_MULIMPL_H_
#define AIDGE_CPU_OPERATOR_MULIMPL_H_
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Mul.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include <memory>
#include <vector>
namespace Aidge {
// Operator implementation entry point for the backend
using MulImpl_cpu = OperatorImpl_cpu<Mul_Op,
void(std::vector<std::size_t>,
std::vector<std::size_t>,
const std::vector<std::size_t>&,
const void*,
const void*,
void*),
void(const std::size_t,
const std::size_t,
const std::size_t,
const std::vector<std::size_t>,
const std::vector<std::size_t>,
const std::vector<std::size_t>,
const void*,
const void*,
const void*,
void*,
void*)>;
// Implementation entry point registration to Operator
REGISTRAR(Mul_Op, "cpu", Aidge::MulImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_MULIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_MULIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_MULIMPL_KERNELS_H_
#include "aidge/utils/Registrar.hpp"
#include <cstdint> // std::int32_t, std::int64_t
#include "aidge/backend/cpu/data/Broadcasting.hpp"
#include "aidge/backend/cpu/operator/MulImpl.hpp"
namespace {
// suppose values are contiguous in memory
template <class I1, class I2, class O>
void mul_contiguous_arrays(const std::size_t input1size,
const std::size_t input2size,
const std::size_t output1size,
const I1* input1,
const I2* input2,
O* output)
{
for (std::size_t i = 0; i < output1size; ++i)
{
const std::size_t in1_id = (input1size != 1) ? i : 0;
const std::size_t in2_id = (input2size != 1) ? i : 0;
output[i] = static_cast<O>(input1[in1_id] * input2[in2_id]);
}
}
}
namespace Aidge {
template <class I1, class I2, class O>
void MulImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
std::vector<std::size_t> dims1,
const std::vector<std::size_t>& outputDims,
const void* input0_,
const void* input1_,
void* output_) {
const I1* input_0 = static_cast<const I1*>(input0_);
const I2* input_1 = static_cast<const I2*>(input1_);
O* output = static_cast<O*>(output_);
// [5,2,1,7] & [2,6,7]
// 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7]
// 2. Find the highest equal dimension -> 3
// Exception: if the first diverging dimension is the last one, then -> 4 (dims.size())
// 3. Compute the highest number of contiguous data -> 7
// 4. Compute stride and offset step for the broadcast mechanism
// 5. Call a simple kernel
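// Worked numbers for the example above (illustration, derived from the steps implemented below):
//   padded dims:  dims0 = [5,2,1,7], dims1 = [1,2,6,7], outputDims = [5,2,6,7]
//   contiguousIdx = 3, so each contiguous block holds 7 elements for both inputs and the output
//   stride_post0 = [2,1,1], stride_post1 = [12,6,1]
//   stride_step0 = [1,1,0]   (dims0[2] == 1 is broadcast, so its offset does not advance)
//   stride_step1 = [-11,1,1] (dims1[0] == 1 is broadcast, so its offset rewinds)
//   nbStacks = 5*2*6 = 60 calls to mul_contiguous_arrays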
// ## Compute compatible input dimensions
// special case for equal dimensions, the kernel is called with the entire arrays at once
if (dims0 == dims1) {
const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
for (std::size_t i = 0; i < input0_contiguous_size; ++i)
{
output[i] = static_cast<O>(input_0[i] * input_1[i]);
}
return;
}
// set dimensions to be of equal size by filling the smallest one with ones.
if (dims0.size() > dims1.size()) {
dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
}
else if (dims1.size() > dims0.size()) {
dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
}
const std::size_t nbDims = dims0.size();
// Find the highest equal dimension
// std::size_t contiguousIdx = nbDims - 1;
std::size_t contiguousIdx = nbDims;
while (contiguousIdx-- > 0) {
// for (; contiguousIdx+1 > 0; --contiguousIdx) {
if (dims0[contiguousIdx] != dims1[contiguousIdx]) {
if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1
const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1;
while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) {
--contiguousIdx;
}
}
break;
}
}
++contiguousIdx;
// Compute the highest number of contiguous data for each Tensor
const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
// initialize strides to iterate through data because of broadcasting
std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx);
if (contiguousIdx > 0) {
stride_post0[contiguousIdx - 1] = 1;
stride_post1[contiguousIdx - 1] = 1;
for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) {
stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]);
stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]);
}
for (std::size_t i = 0; i != contiguousIdx; ++i) {
stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1;
stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1;
}
}
// variables for arrays offsets
std::size_t offsetIn0 = 0;
std::size_t offsetIn1 = 0;
std::size_t offsetOut = 0;
std::size_t dim = contiguousIdx - 1;
const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
for (std::size_t stack = 0; stack < nbStacks;) {
mul_contiguous_arrays<I1,I2,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
input_0 + offsetIn0*input0_contiguous_size,
input_1 + offsetIn1*input1_contiguous_size,
output + offsetOut*output_contiguous_size);
if (++stack < nbStacks) {
std::size_t tmp_stack = stack;
while(tmp_stack % outputDims[dim] == 0) {
tmp_stack /= outputDims[dim];
dim--;
}
offsetIn0 += stride_step0[dim];
offsetIn1 += stride_step1[dim];
++offsetOut;
dim = contiguousIdx - 1;
}
}
}
template <class I1, class I2, class O>
void MulImpl_cpu_backward_kernel(const std::size_t /*input0Length*/,
const std::size_t /*input1Length*/,
const std::size_t gradOutputLength,
const std::vector<std::size_t>& dims0,
const std::vector<std::size_t>& dims1,
const std::vector<std::size_t>& outputDims,
const void* input0_,
const void* input1_,
const void* grad_output_,
void* gradientInput0_,
void* gradientInput1_)
{
const I1* input0 = static_cast<const I1*>(input0_);
const I2* input1 = static_cast<const I2*>(input1_);
const O* grad_output = static_cast<const O*>(grad_output_);
auto* grad_input_0 = static_cast<I1*>(gradientInput0_);
auto* grad_input_1 = static_cast<I2*>(gradientInput1_);
// Broadcast dims0 and dims1 to match the shape of outputDims
auto broadcastedDims0 = getBroadcastedDims(outputDims, dims0);
auto broadcastedDims1 = getBroadcastedDims(outputDims, dims1);
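// For an elementwise product o = a * b, do/da = b and do/db = a, so each output gradient element
// is multiplied by the other input and accumulated (+=) into the gradient of the input location
// it was broadcast from.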
for (std::size_t i = 0; i < gradOutputLength; ++i) {
auto idxOutputGrad = getMultiDimIndices(outputDims, i);
std::vector<std::size_t> idxInput0(broadcastedDims0.size());
std::vector<std::size_t> idxInput1(broadcastedDims1.size());
// Map output indices to input0 indices, considering broadcasting
for (std::size_t dimension = 0; dimension < broadcastedDims0.size(); ++dimension) {
// If input0 is broadcasted along this dimension (== 1) or both dimensions are 1, index is 0.
// idxInput0 represent the multi dim index of input0 contributing
// to the output at index i.
idxInput0[dimension] = (broadcastedDims0[dimension] == 1) ? 0 : idxOutputGrad[dimension];
}
for (std::size_t dimension = 0; dimension < broadcastedDims1.size(); ++dimension) {
idxInput1[dimension] = (broadcastedDims1[dimension] == 1) ? 0 : idxOutputGrad[dimension];
}
// We have to access tensors with a flat index, hence the conversion
auto idx0 = getFlattenedIndex(broadcastedDims0, idxInput0);
auto idx1 = getFlattenedIndex(broadcastedDims1, idxInput1);
grad_input_0[idx0] += static_cast<I1>(grad_output[i] * input1[idx1]);
grad_input_1[idx1] += static_cast<I2>(grad_output[i] * input0[idx0]);
}
}
// Kernels registration to implementation entry point
REGISTRAR(MulImpl_cpu,
{DataType::Float32},
{ProdConso::inPlaceModel, Aidge::MulImpl_cpu_forward_kernel<float, float, float>, Aidge::MulImpl_cpu_backward_kernel<float, float, float>});
REGISTRAR(MulImpl_cpu,
{{{DataType::Float32}, {DataType::Float64}}, {DataType::Float32}},
{ProdConso::inPlaceModel, Aidge::MulImpl_cpu_forward_kernel<float, double, float>, Aidge::MulImpl_cpu_backward_kernel<float, double, float>});
REGISTRAR(MulImpl_cpu,
{DataType::Float64},
{ProdConso::inPlaceModel, Aidge::MulImpl_cpu_forward_kernel<double, double, double>, Aidge::MulImpl_cpu_backward_kernel<double, double, double>});
REGISTRAR(MulImpl_cpu,
{DataType::Int32},
{ProdConso::inPlaceModel, Aidge::MulImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>, Aidge::MulImpl_cpu_backward_kernel<std::int32_t, std::int32_t, std::int32_t>});
REGISTRAR(MulImpl_cpu,
{DataType::Int64},
{ProdConso::inPlaceModel, Aidge::MulImpl_cpu_forward_kernel<std::int64_t, std::int64_t, std::int64_t>, Aidge::MulImpl_cpu_backward_kernel<std::int64_t, std::int64_t, std::int64_t>});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_MULIMPL_KERNELS_H_ */
@@ -9,43 +9,44 @@
 *
 ********************************************************************************/
-#ifndef AIDGE_CPU_OPERATOR_PRODUCERIMPL_H_
-#define AIDGE_CPU_OPERATOR_PRODUCERIMPL_H_
+#ifndef AIDGE_CPU_OPERATOR_IMPL_H_
+#define AIDGE_CPU_OPERATOR_IMPL_H_
 #include <cstddef> // std::size_t
 #include <memory>
 #include <tuple> // std::tuple
 #include <vector>
 #include "aidge/backend/OperatorImpl.hpp"
-#include "aidge/operator/Producer.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
 namespace Aidge {
-class ProducerImpl_cpu : public OperatorImpl {
-private:
-    const Producer_Op &mOp;
-public:
-    ProducerImpl_cpu(const Producer_Op &op) : mOp(op) {}
-    static std::unique_ptr<ProducerImpl_cpu> create(const Producer_Op &op) {
-        return std::make_unique<ProducerImpl_cpu>(op);
-    }
-public:
-    NbElts_t getNbRequiredData(const IOIndex_t inputIdx) const override final;
-    NbElts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
-    NbElts_t getRequiredMemory(__attribute__((unused)) const IOIndex_t outputIdx, __attribute__((unused)) const std::vector<DimSize_t> &inputsSize) const override final;
-    NbElts_t getNbConsumedData(const IOIndex_t inputIdx) const override final;
-    NbElts_t getNbProducedData(const IOIndex_t outputIdx) const override final;
-    void forward();
-    void backward();
-};
-namespace {
-static Registrar<Producer_Op> registrarProducer1DImpl_cpu("cpu", Aidge::ProducerImpl_cpu::create);
-} // namespace
+template <class Op, class FwdFunc, class BwdFunc = void()>
+class OperatorImpl_cpu : public OperatorImpl,
+    public Registrable<OperatorImpl_cpu<Op, FwdFunc, BwdFunc>, ImplSpec, Impl<FwdFunc, BwdFunc>>
+{
+public:
+    OperatorImpl_cpu(const Op& op) : OperatorImpl(op, "cpu") {}
+    static std::unique_ptr<OperatorImpl_cpu<Op, FwdFunc, BwdFunc>> create(const Op& op) {
+        return std::make_unique<OperatorImpl_cpu<Op, FwdFunc, BwdFunc>>(op);
+    }
+    virtual std::shared_ptr<ProdConso> getProdConso() const override {
+        const auto impl = Registrar<OperatorImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+        return impl.prodConso(mOp);
+    }
+    virtual std::vector<ImplSpec> getAvailableImplSpecs() const override {
+        // return Registrar<OperatorImpl_cpu>::getKeys(); // Note: cannot return set due to python binding
+        std::set<ImplSpec> implSpecsSet = Registrar<OperatorImpl_cpu>::getKeys();
+        return std::vector<ImplSpec>(implSpecsSet.begin(), implSpecsSet.end());
+    }
+    void forward() override;
+    void backward() override;
+};
 } // namespace Aidge
-#endif /* AIDGE_CPU_OPERATOR_PRODUCERIMPL_H_ */
\ No newline at end of file
+#endif /* AIDGE_CPU_OPERATOR_IMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_PADIMPL_H_
#define AIDGE_CPU_OPERATOR_PADIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Pad.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
namespace Aidge {
class Pad_ProdConso_cpu : public ProdConso {
public:
Pad_ProdConso_cpu(const Operator& op): ProdConso(op) {}
static std::unique_ptr<ProdConso> defaultModel(const Operator& op) {
return std::make_unique<Pad_ProdConso_cpu>(op);
}
Elts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
};
// Operator implementation entry point for the backend
using Pad1D_Op = Pad_Op<1>;
using PadImpl1D_cpu = OperatorImpl_cpu<Pad_Op<1>,
void(const std::array<DimSize_t, 2>&,
const PadBorderType,
const double,
const std::array<DimSize_t, 3> &,
const void *,
void *)>;
using Pad2D_Op = Pad_Op<2>;
using PadImpl2D_cpu = OperatorImpl_cpu<Pad_Op<2>,
void(const std::array<DimSize_t, 4>&,
const PadBorderType,
const double,
const std::array<DimSize_t, 4> &,
const void *,
void *)>;
// Implementation entry point registration to Operator
REGISTRAR(Pad1D_Op, "cpu", Aidge::PadImpl1D_cpu::create);
REGISTRAR(Pad2D_Op, "cpu", Aidge::PadImpl2D_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_PADIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_PADIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_PADIMPL_KERNELS_H_
#include <algorithm> // std::max, std::min
#include <array>
#include <cstddef> // std::size_t
#include <cstdint> // std::int32_t
#include "aidge/backend/cpu/operator/PadImpl.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
/**
* @brief Forward kernel for 1D Padding on CPU backend.
* @tparam I Input data type.
* @tparam O Output data type.
* @param beginEndBorders Array of begin and end padding sizes.
* @param borderType Type of padding (Constant, Edge, Reflect or Wrap).
* @param borderValue Value used when borderType is Constant.
* @param dims Array of input dimensions.
* @param input_ const input Tensor.
* @param output_ Output Tensor.
*/
template <class I, class O>
void PadImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 2>& beginEndBorders,
const PadBorderType borderType,
const double borderValue,
const std::array<DimSize_t, 3>& dims,
const void *input_,
void *output_)
{
const I *input = static_cast<const I *>(input_);
O *output = static_cast<O *>(output_);
const std::size_t oxSize = dims[2] + beginEndBorders[0] + beginEndBorders[1];
for (std::size_t batch = 0; batch < dims[0]; ++batch) {
for (std::size_t ch = 0; ch < dims[1]; ++ch) {
const std::size_t iIndex = (ch + batch*dims[1]) * dims[2];
const std::size_t oIndex = (ch + batch*dims[1]) * oxSize;
for (unsigned int ox = 0; ox < oxSize; ++ox) {
const std::size_t oIndexFull = oIndex + ox;
O outputValue = static_cast<O>(borderValue);
if (borderType == PadBorderType::Constant) {
int ix = static_cast<int>(ox) - static_cast<int>(beginEndBorders[0]);
if (ix >= 0 && ix < static_cast<int>(dims[2])) {
outputValue = input[iIndex + static_cast<std::size_t>(ix)];
}
}
else if (borderType == PadBorderType::Edge) {
int ix = std::max(0, std::min(static_cast<int>(dims[2]) - 1, static_cast<int>(ox) - static_cast<int>(beginEndBorders[0])));
outputValue = input[iIndex + static_cast<std::size_t>(ix)];
}
else if (borderType == PadBorderType::Reflect) {
int ix = static_cast<int>(ox) - static_cast<int>(beginEndBorders[0]);
if (ix < 0)
ix = 0 - ix;
if (ix >= static_cast<int>(dims[2]))
ix = static_cast<int>(dims[2]) - ix;
outputValue = input[iIndex + static_cast<std::size_t>(ix)];
}
else if (borderType == PadBorderType::Wrap) {
int ix = (static_cast<int>(dims[2]) + static_cast<int>(ox) - static_cast<int>(beginEndBorders[0])) % static_cast<int>(dims[2]);
outputValue = input[iIndex + static_cast<std::size_t>(ix)];
}
output[oIndexFull] = outputValue;
}
}
}
}
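// For instance (illustrative), padding the 1D input [1, 2, 3] with beginEndBorders = {2, 1},
// PadBorderType::Constant and borderValue = 0 produces [0, 0, 1, 2, 3, 0]; with
// PadBorderType::Edge the same borders produce [1, 1, 1, 2, 3, 3].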
// Kernels registration to implementation entry point
REGISTRAR(PadImpl1D_cpu,
{{DataType::Float32, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}},
{Pad_ProdConso_cpu::defaultModel, Aidge::PadImpl1D_cpu_forward_kernel<cpptype_t<DataType::Float32>, cpptype_t<DataType::Float32>>, nullptr});
REGISTRAR(PadImpl1D_cpu,
{{DataType::Float64, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}},
{Pad_ProdConso_cpu::defaultModel, Aidge::PadImpl1D_cpu_forward_kernel<cpptype_t<DataType::Float64>, cpptype_t<DataType::Float64>>, nullptr});
REGISTRAR(PadImpl1D_cpu,
{{DataType::Int32, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}},
{Pad_ProdConso_cpu::defaultModel, Aidge::PadImpl1D_cpu_forward_kernel<cpptype_t<DataType::Int32>, cpptype_t<DataType::Int32>>, nullptr});
/**
* @brief Forward kernel for 2D Padding on CPU backend.
* @tparam I Input data type.
* @tparam O Output data type.
* @param beginEndBorders Array of begin and end padding sizes.
* @param borderType Type of padding (Constant, Edge, Reflect or Wrap).
* @param borderValue Value used when borderType is Constant.
* @param dims Array of input dimensions.
* @param input_ const input Tensor.
* @param output_ Output Tensor.
*/
template <class I, class O>
void PadImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 4>& beginEndBorders,
const PadBorderType borderType,
const double borderValue,
const std::array<DimSize_t, 4> &dims,
const void *input_,
void *output_)
{
const I *input = static_cast<const I *>(input_);
O *output = static_cast<O *>(output_);
const std::size_t oySize = dims[2] + beginEndBorders[0] + beginEndBorders[2];
const std::size_t oxSize = dims[3] + beginEndBorders[1] + beginEndBorders[3];
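// beginEndBorders is laid out as {beginH, beginW, endH, endW}: indices 0/2 pad dims[2] (H)
// and indices 1/3 pad dims[3] (W), as used in the size computations above.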
for (std::size_t batch = 0; batch < dims[0]; ++batch) {
for (std::size_t ch = 0; ch < dims[1]; ++ch) {
const std::size_t iIndex = (ch + batch*dims[1]) * dims[2] * dims[3];
const std::size_t oIndex = (ch + batch*dims[1]) * oxSize * oySize;
for (std::uint32_t oy = 0; oy < oySize; ++oy) {
for (std::uint32_t ox = 0; ox < oxSize; ++ox) {
const std::size_t oIndexFull = oIndex + oy*oxSize + ox;
O outputValue = static_cast<O>(borderValue);
if (borderType == PadBorderType::Constant) {
std::int32_t ix = static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[1]);
std::int32_t iy = static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[0]);
if (ix >= 0 && ix < static_cast<std::int32_t>(dims[3]) && iy >= 0 && iy < static_cast<std::int32_t>(dims[2])) {
outputValue = input[iIndex + static_cast<std::size_t>(iy)*dims[3] + static_cast<std::size_t>(ix)];
}
}
else if (borderType == PadBorderType::Edge) {
std::int32_t ix = std::max(0, std::min(static_cast<std::int32_t>(dims[3]) - 1, static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[1])));
std::int32_t iy = std::max(0, std::min(static_cast<std::int32_t>(dims[2]) - 1, static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[0])));
outputValue = input[iIndex + static_cast<std::size_t>(iy)*dims[3] + static_cast<std::size_t>(ix)];
}
else if (borderType == PadBorderType::Reflect) {
std::int32_t ix = static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[1]);
std::int32_t iy = static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[0]);
if (ix < 0)
ix = 0 - ix;
if (iy < 0)
iy = 0 - iy;
if (ix >= static_cast<std::int32_t>(dims[3]))
ix = static_cast<std::int32_t>(dims[3]) - ix;
if (iy >= static_cast<std::int32_t>(dims[2]))
iy = static_cast<std::int32_t>(dims[2]) - iy;
outputValue = input[iIndex + static_cast<std::size_t>(iy)*dims[3] + static_cast<std::size_t>(ix)];
}
else if (borderType == PadBorderType::Wrap) {
std::int32_t ix = (static_cast<std::int32_t>(dims[3]) + static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[1])) % static_cast<std::int32_t>(dims[3]);
std::int32_t iy = (static_cast<std::int32_t>(dims[2]) + static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[0])) % static_cast<std::int32_t>(dims[2]);
outputValue = input[iIndex + static_cast<std::size_t>(iy)*dims[3] + static_cast<std::size_t>(ix)];
}
output[oIndexFull] = outputValue;
}
}
}
}
}
// Kernels registration to implementation entry point
REGISTRAR(PadImpl2D_cpu,
{{DataType::Float32, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}},
{Pad_ProdConso_cpu::defaultModel, Aidge::PadImpl2D_cpu_forward_kernel<cpptype_t<DataType::Float32>, cpptype_t<DataType::Float32>>, nullptr});
REGISTRAR(PadImpl2D_cpu,
{{DataType::Float64, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}},
{Pad_ProdConso_cpu::defaultModel, Aidge::PadImpl2D_cpu_forward_kernel<cpptype_t<DataType::Float64>, cpptype_t<DataType::Float64>>, nullptr});
REGISTRAR(PadImpl2D_cpu,
{{DataType::Int32, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}},
{Pad_ProdConso_cpu::defaultModel, Aidge::PadImpl2D_cpu_forward_kernel<cpptype_t<DataType::Int32>, cpptype_t<DataType::Int32>>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_PADIMPL_KERNELS_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_PADDEDCONVIMPL_H_
#define AIDGE_CPU_OPERATOR_PADDEDCONVIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/MetaOperatorDefs.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
// Operator implementation entry point for the backend
using PaddedConv1D_Op = MetaOperator_Op;
using PaddedConvImpl1D_cpu = OperatorImpl_cpu<MetaOperator_Op,
void(const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 1>&,
const std::array<DimSize_t, 1>&,
const std::array<DimSize_t, 1>&,
const std::array<DimSize_t, 3> &,
DimSize_t,
const void *,
const void *,
const void *,
void *)>;
using PaddedConv2D_Op = MetaOperator_Op;
using PaddedConvImpl2D_cpu = OperatorImpl_cpu<MetaOperator_Op,
void(const std::array<DimSize_t, 4>&,
const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 4> &,
DimSize_t,
const void *,
const void *,
const void *,
void *)>;
// Implementation entry point registration to Operator
// Uncomment to activate the implementation for PaddedConv. It is currently less efficient, which is why it is commented out.
// REGISTRAR(PaddedConv1D_Op, std::array<std::string, 2>({"cpu", "PaddedConv1D"}), Aidge::PaddedConvImpl1D_cpu::create);
// REGISTRAR(PaddedConv2D_Op, std::array<std::string, 2>({"cpu", "PaddedConv2D"}), Aidge::PaddedConvImpl2D_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_PADDEDCONVIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_PADDEDCONVIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_PADDEDCONVIMPL_KERNELS_H_
#include <array>
#include <cstddef>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/backend/cpu/operator/PaddedConvImpl.hpp"
#include "aidge/operator/Pad.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
// Only works for constant padding zero
/**
* @brief Forward kernel for 1D padded Convolution on CPU backend.
* @tparam I Input data type.
* @tparam W Weight data type.
* @tparam B Bias data type.
* @tparam O Output data type.
* @param beginEndBorders Array of begin and end padding sizes.
* @param strideDims Array of stride dimensions.
* @param dilationDims Array of dilation factors.
* @param kernelDims Array of kernel dimensions.
* @param inputDims Array of input dimensions.
* @param outChannels Number of output channels.
* @param input_ const input Tensor.
* @param weights_ const weight Tensor.
* @param biases_ const bias Tensor.
* @param output_ Output Tensor.
*/
template <class I, class W, class B, class O>
void PaddedConvImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 2>& beginEndBorders,
const std::array<DimSize_t, 1>& strideDims,
const std::array<DimSize_t, 1>& dilationDims,
const std::array<DimSize_t, 1>& kernelDims,
const std::array<DimSize_t, 3>& inputDims,
DimSize_t outChannels,
const void *input_,
const void *weights_,
const void *biases_,
void *output_)
{
// FIXME: missing convolution attributes as arguments
const I *input = static_cast<const I *>(input_);
const W *weights = static_cast<const W *>(weights_);
const B *biases = static_cast<const B *>(biases_);
O *output = static_cast<O *>(output_);
// output H size
const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1;
const std::size_t oxSize =
static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilated_kernel_x + strideDims[0]) /
static_cast<float>(strideDims[0])));
// TODO: kernel computation
// output (batch, outCh, Xout, Yout)
// input (batch, inCh, Xin, Yin)
// weight (outCh, inCh, kernelX, kernelY)
// does not take Dilation attribute into account
using signedsize = std::make_signed<std::size_t>::type;
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
const std::size_t oIndex = (outCh + batch*outChannels) * oxSize;
// If bias = nullptr, set B(0)
B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
std::fill(output + oIndex, output+(oIndex+oxSize), biasVal);
for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
const std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2];
const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0];
for (std::size_t ox = 0; ox < oxSize; ++ox) {
const signedsize difx = static_cast<signedsize>(ox * strideDims[0]);
const std::size_t sxMin = static_cast<std::size_t>(std::max(static_cast<signedsize>(beginEndBorders[0]) - difx, signedsize(0)));
const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + static_cast<signedsize>(beginEndBorders[1]) - difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? kernelDims[0] : inputDims[2] + difx);
const std::size_t oIndexFull = oIndex + ox;
const signedsize ix = static_cast<signedsize>(ox * strideDims[0]) - static_cast<signedsize>(beginEndBorders[0]);
for (std::size_t sx = sxMin; sx*dilationDims[0] < sxMax; ++sx) {
output[oIndexFull] += weights[wIndex + sx] *
input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx*dilationDims[0]))];
}
}
}
}
}
}
// Kernels registration to implementation entry point
REGISTRAR(PaddedConvImpl1D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}, DynamicAttributes(std::map<std::string, future_std::any>({std::make_pair("type", future_std::any(std::string("PaddedConv1D")))}))},
{ProdConso::inPlaceModel, Aidge::PaddedConvImpl1D_cpu_forward_kernel<float, float, float, float>, nullptr});
REGISTRAR(PaddedConvImpl1D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float16, DataFormat::NCHW}, DynamicAttributes(std::map<std::string, future_std::any>({std::make_pair("type", future_std::any(std::string("PaddedConv1D")))}))},
{ProdConso::inPlaceModel, Aidge::PaddedConvImpl1D_cpu_forward_kernel<half_float::half, half_float::half, half_float::half, half_float::half>, nullptr});
REGISTRAR(PaddedConvImpl1D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}, DynamicAttributes(std::map<std::string, future_std::any>({std::make_pair("type", future_std::any(std::string("PaddedConv1D")))}))},
{ProdConso::inPlaceModel, Aidge::PaddedConvImpl1D_cpu_forward_kernel<int32_t, int32_t, int32_t, int32_t>, nullptr});
REGISTRAR(PaddedConvImpl1D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}, DynamicAttributes(std::map<std::string, future_std::any>({std::make_pair("type", future_std::any(std::string("PaddedConv1D")))}))},
{ProdConso::inPlaceModel, Aidge::PaddedConvImpl1D_cpu_forward_kernel<double, double, double, double>, nullptr});
/**
* @brief Forward kernel for 2D padded Convolution on CPU backend.
* @tparam I Input data type.
* @tparam W Weight data type.
* @tparam B Bias data type.
* @tparam O Output data type.
* @param beginEndBorders Array of begin and end padding sizes.
* @param strideDims Array of stride dimensions.
* @param dilationDims Array of dilation factors.
* @param kernelDims Array of kernel dimensions.
* @param inputDims Array of input dimensions.
* @param outChannels Number of output channels.
* @param input_ const input Tensor.
* @param weights_ const weight Tensor.
* @param biases_ const bias Tensor.
* @param output_ Output Tensor.
*/
template <class I, class W, class B, class O>
void PaddedConvImpl2D_cpu_forward_kernel(
const std::array<DimSize_t, 4>& beginEndBorders,
const std::array<DimSize_t, 2>& strideDims,
const std::array<DimSize_t, 2>& dilationDims,
const std::array<DimSize_t, 2>& kernelDims,
const std::array<DimSize_t, 4> &inputDims,
DimSize_t outChannels,
const void *input_,
const void *weights_,
const void *biases_,
void *output_)
{
// FIXME: missing convolution attributes as arguments
const I *input = static_cast<const I *>(input_);
const W *weights = static_cast<const W *>(weights_);
const B *biases = static_cast<const B *>(biases_);
O *output = static_cast<O *>(output_);
// output H size
const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1;
const std::size_t oxSize =
static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilated_kernel_x + beginEndBorders[0] + beginEndBorders[2] + strideDims[0]) /
static_cast<float>(strideDims[0])));
// output W size
const DimSize_t dilated_kernel_y = dilationDims[1]*(kernelDims[1] - 1) + 1;
const std::size_t oySize =
static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[3] - dilated_kernel_y + beginEndBorders[1] + beginEndBorders[3] + strideDims[1]) /
static_cast<float>(strideDims[1])));
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
const std::size_t oIndex = (outCh + batch*outChannels) * oxSize * oySize;
// If bias = nullptr, set B(0)
B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
std::fill(output + oIndex, output+(oIndex+oxSize*oySize), biasVal);
for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
const std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0] * kernelDims[1];
for (std::size_t ox = 0; ox < oxSize; ++ox) {
const std::size_t difx = ox * strideDims[0];
const std::size_t sxMin = beginEndBorders[0] < difx ? std::size_t(0) : beginEndBorders[0] - difx;
const std::size_t sxMax = (inputDims[2] + beginEndBorders[2]) < difx ?
0 :
((inputDims[2] + beginEndBorders[2]) > dilated_kernel_x + difx ?
dilated_kernel_x :
(inputDims[2] + beginEndBorders[2] - difx));
for (std::size_t oy = 0; oy < oySize; ++oy) {
const std::size_t dify = oy * strideDims[1];
const std::size_t syMin = beginEndBorders[1] < dify ? std::size_t(0) : beginEndBorders[1] - dify;
const std::size_t syMax = (inputDims[3] + beginEndBorders[3]) < dify ?
0 :
((inputDims[3] + beginEndBorders[3]) > dilated_kernel_y + dify ?
dilated_kernel_y :
(inputDims[3] + beginEndBorders[3] - dify));
const std::size_t oIndexFull = oIndex + ox*oySize + oy;
const std::size_t ix = ox * strideDims[0] - beginEndBorders[0];
const std::size_t iy = oy * strideDims[1] - beginEndBorders[1];
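// Fast path: when the clamped kernel bounds span a full 3x3 window (i.e. a 3x3 kernel with
// dilation 1 whose window is not clipped), the accumulation is fully unrolled; any other
// configuration falls back to the generic dilated loop below.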
if (sxMin == 0 && syMin == 0 && sxMax == 3 && syMax == 3) {
output[oIndexFull] += (weights[wIndex + 0*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+0)] +
weights[wIndex + 0*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+1)] +
weights[wIndex + 0*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+2)] +
weights[wIndex + 1*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+0)] +
weights[wIndex + 1*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+1)] +
weights[wIndex + 1*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+2)] +
weights[wIndex + 2*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+0)] +
weights[wIndex + 2*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+1)] +
weights[wIndex + 2*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+2)]);
} else {
for (std::size_t sx = sxMin; sx*dilationDims[0] < sxMax; ++sx) {
for (std::size_t sy = syMin; sy*dilationDims[1] < syMax; ++sy) {
output[oIndexFull] += weights[wIndex + sx*kernelDims[1] + sy] *
input[iIndex + (sx*dilationDims[0] + ix)*inputDims[3] + sy*dilationDims[1] + iy];
}
}
}
}
}
}
}
}
}
// Kernels registration to implementation entry point
REGISTRAR(PaddedConvImpl2D_cpu,
// ImplSpec{std::vector<ImplSpec::IOSpec>({ImplSpec::IOSpec{DataType::Any, DataFormat::NCHW}, ImplSpec::IOSpec{DataType::Any, DataFormat::NCHW}}) , std::vector<ImplSpec::IOSpec>({ImplSpec::IOSpec{DataType::Int32, DataFormat::NCHW}})},
{{DataType::Any, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}, DynamicAttributes(std::map<std::string, future_std::any>({std::make_pair("type", future_std::any(std::string("PaddedConv2D")))}))},
{ProdConso::inPlaceModel, Aidge::PaddedConvImpl2D_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t, std::int32_t>, nullptr});
REGISTRAR(PaddedConvImpl2D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float16, DataFormat::NCHW}, DynamicAttributes(std::map<std::string, future_std::any>({std::make_pair("type", future_std::any(std::string("PaddedConv2D")))}))},
{ProdConso::inPlaceModel, Aidge::PaddedConvImpl2D_cpu_forward_kernel<half_float::half, half_float::half, half_float::half, half_float::half>, nullptr});
REGISTRAR(PaddedConvImpl2D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}, DynamicAttributes(std::map<std::string, future_std::any>({std::make_pair("type", future_std::any(std::string("PaddedConv2D")))}))},
{ProdConso::inPlaceModel, Aidge::PaddedConvImpl2D_cpu_forward_kernel<float, float, float, float>, nullptr});
REGISTRAR(PaddedConvImpl2D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}, DynamicAttributes(std::map<std::string, future_std::any>({std::make_pair("type", future_std::any(std::string("PaddedConv2D")))}))},
{ProdConso::inPlaceModel, Aidge::PaddedConvImpl2D_cpu_forward_kernel<double, double, double, double>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_PADDEDCONVIMPL_KERNELS_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_POWIMPL_H_
#define AIDGE_CPU_OPERATOR_POWIMPL_H_
#include <cstddef> // std::size_t
#include <memory> // std::unique_ptr, std::make_unique
#include <string>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Pow.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
// Operator implementation entry point for the backend
using PowImpl_cpu = OperatorImpl_cpu<Pow_Op,
void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*),
void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*, const void*, void*, void*)>;
// Implementation entry point registration to Operator
REGISTRAR(Pow_Op, "cpu", Aidge::PowImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_POWIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_POWIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_POWIMPL_KERNELS_H_
#include "aidge/utils/Registrar.hpp"
#include <cstddef> // std::size_t
#include "aidge/backend/cpu/data/Broadcasting.hpp"
#include "aidge/backend/cpu/operator/PowImpl.hpp"
namespace Aidge {
namespace {
// suppose values are contiguous in memory
template <class I, class O>
void pow_contiguous_arrays(const std::size_t input1size,
const std::size_t input2size,
const std::size_t output1size,
const I* input1,
const I* input2,
O* output)
{
for (std::size_t i = 0; i < output1size; ++i)
{
const std::size_t in1_id = (input1size != 1) ? i : 0;
const std::size_t in2_id = (input2size != 1) ? i : 0;
output[i] = static_cast<O>(std::pow(input1[in1_id], input2[in2_id]));
}
}
}
template <class I, class O>
void PowImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
std::vector<std::size_t> dims1,
const std::vector<std::size_t>& outputDims,
const void* input0_,
const void* input1_,
void* output_) {
const I* input_0 = static_cast<const I*>(input0_);
const I* input_1 = static_cast<const I*>(input1_);
O* output = static_cast<O*>(output_);
// [5,2,1,7] & [2,6,7]
// 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7]
// 2. Find the highest equal dimension -> 3
// Exception: if the first diverging dimension is the last one, then -> 4 (dims.size())
// 3. Compute the highest number of contiguous data -> 7
// 4. Compute stride and offset step for the broadcast mechanism
// 5. Call a simple kernel
// special case for equal dimensions, the kernel is called with the entire arrays at once
if (dims0 == dims1) {
const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
for (std::size_t i = 0; i < input0_contiguous_size; ++i)
{
output[i] = static_cast<O>(std::pow(input_0[i], input_1[i]));
}
return;
}
// set dimensions to be of equal size by filling the smallest one with ones.
if (dims0.size() > dims1.size()) {
dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
}
else if (dims1.size() > dims0.size()) {
dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
}
const std::size_t nbDims = dims0.size();
// Find the highest equal dimension
// std::size_t contiguousIdx = nbDims - 1;
std::size_t contiguousIdx = nbDims;
while (contiguousIdx-- > 0) {
// for (; contiguousIdx+1 > 0; --contiguousIdx) {
if (dims0[contiguousIdx] != dims1[contiguousIdx]) {
if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1
const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1;
while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) {
--contiguousIdx;
}
}
break;
}
}
++contiguousIdx;
// Compute the highest number of contiguous data for each Tensor
const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
// initialize strides to iterate through data because of broadcasting
std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx);
if (contiguousIdx > 0) {
stride_post0[contiguousIdx - 1] = 1;
stride_post1[contiguousIdx - 1] = 1;
for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) {
stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]);
stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]);
}
for (std::size_t i = 0; i != contiguousIdx; ++i) {
stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1;
stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1;
}
}
// variables for arrays offsets
std::size_t offsetIn0 = 0;
std::size_t offsetIn1 = 0;
std::size_t offsetOut = 0;
std::size_t dim = contiguousIdx - 1;
const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
for (std::size_t stack = 0; stack < nbStacks;) {
pow_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
input_0 + offsetIn0*input0_contiguous_size,
input_1 + offsetIn1*input1_contiguous_size,
output + offsetOut*output_contiguous_size);
if (++stack < nbStacks) {
std::size_t tmp_stack = stack;
while(tmp_stack % outputDims[dim] == 0) {
tmp_stack /= outputDims[dim];
dim--;
}
offsetIn0 += stride_step0[dim];
offsetIn1 += stride_step1[dim];
++offsetOut;
dim = contiguousIdx - 1;
}
}
}
template <class I1, class I2, class O>
void PowImpl_cpu_backward_kernel(const std::vector<std::size_t>& input0Dims,
const std::vector<std::size_t>& input1Dims,
const std::vector<std::size_t>& outputDims,
const void* input0_,
const void* input1_,
const void* gradOutput_,
void* gradientInput0_,
void* gradientInput1_) {
const I1* input0 = static_cast<const I1*>(input0_);
I1* grad0 = static_cast<I1*>(gradientInput0_);
const I2* input1 = static_cast<const I2*>(input1_);
I2* grad1 = static_cast<I2*>(gradientInput1_);
const O* gradOut = static_cast<const O*>(gradOutput_);
std::size_t totalElements = std::accumulate(outputDims.cbegin(), outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
for (size_t oIndex = 0; oIndex < totalElements; ++oIndex)
{
// Compute indexes in inputs 0 and 1 to support broadcasting
std::vector<std::size_t> indexes = getMultiDimIndices(outputDims, oIndex);
std::size_t idx0 = getFlattenedIndex(input0Dims, indexes);
std::size_t idx1 = getFlattenedIndex(input1Dims, indexes);
// grad0 = grad_output * (input1 * pow(input0, (input1 -1)))
grad0[idx0] += gradOut[oIndex]*input1[idx1]* std::pow(input0[idx0], input1[idx1]-1);
// grad1 = grad_output * (output * ln(input0))
grad1[idx1] += gradOut[oIndex] * std::pow(input0[idx0], input1[idx1]) * std::log(input0[idx0]);
}
}
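// Illustration only (not part of the library): exercising the backward kernel above with a
// broadcast exponent of shape {1, 3} against an input of shape {2, 3}.
//
//   std::vector<std::size_t> dims0{2, 3}, dims1{1, 3}, outDims{2, 3};
//   std::vector<float> in0{1.f, 2.f, 3.f, 4.f, 5.f, 6.f}, in1{2.f, 2.f, 2.f};
//   std::vector<float> gradOut(6, 1.f), grad0(6, 0.f), grad1(3, 0.f);  // grads zero-initialized
//   PowImpl_cpu_backward_kernel<float, float, float>(dims0, dims1, outDims,
//       in0.data(), in1.data(), gradOut.data(), grad0.data(), grad1.data());
//   // grad0[i] == in1 * in0[i]^(in1 - 1) == 2 * in0[i];
//   // grad1[j] accumulates gradOut * in0^in1 * ln(in0) over the broadcast (first) axis.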
// Kernels registration to implementation entry point
REGISTRAR(PowImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float32}},
{ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<float, float>, Aidge::PowImpl_cpu_backward_kernel<float, float, float>});
REGISTRAR(PowImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}},
{ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<double, double>, Aidge::PowImpl_cpu_backward_kernel<double, double, double>});
REGISTRAR(PowImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}},
{ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<int32_t, int32_t>, Aidge::PowImpl_cpu_backward_kernel<int32_t, int32_t, int32_t>});
REGISTRAR(PowImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int64}},
{ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, Aidge::PowImpl_cpu_backward_kernel<std::int64_t, std::int64_t, std::int64_t>});
REGISTRAR(PowImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int8}},
{ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<std::int8_t, std::int8_t>, Aidge::PowImpl_cpu_backward_kernel<std::int8_t, std::int8_t, std::int8_t>});
REGISTRAR(PowImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::UInt8}},
{ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<std::uint8_t, std::uint8_t>, Aidge::PowImpl_cpu_backward_kernel<std::uint8_t, std::uint8_t, std::uint8_t>});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_POWIMPL_KERNELS_H_ */
#ifndef AIDGE_CPU_OPERATOR_RELUIMPL_H_
#define AIDGE_CPU_OPERATOR_RELUIMPL_H_
#include "aidge/backend/OperatorImpl.hpp"
#include <cstddef> // std::size_t
#include <memory>
#include <tuple> // std::tuple
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/ReLU.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include <memory>
#include <vector>
namespace Aidge {
// Operator implementation entry point for the backend
using ReLUImpl_cpu = OperatorImpl_cpu<ReLU_Op,
    void(const std::size_t, const void*, void*),
    void(const std::size_t, const void*, const void*, void*)>;
// Implementation entry point registration to Operator
REGISTRAR(ReLU_Op, "cpu", Aidge::ReLUImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_RELUIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_RELUIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_RELUIMPL_KERNELS_H_
#include <cstddef> // std::size_t
#include <memory>
#include <tuple> // std::tuple
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/backend/cpu/operator/ReLUImpl.hpp"
#include "aidge/operator/ReLU.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
// Kernels
template <class I, class O>
void ReLUImpl_cpu_forward_kernel(std::size_t inputLength,
const void* input_,
void* output_) {
const I* input = static_cast<const I*>(input_);
O* output = static_cast<O*>(output_);
//#pragma omp parallel for if (inputLength > 1024)
for (std::size_t i = 0; i < inputLength; ++i) {
output[i] = (input[i] > 0) ? input[i] : 0;
}
}
template <class I, class GI, class GO>
void ReLUImpl_cpu_backward_kernel(const std::size_t inputLength,
const void* input_, const void* grad_output_,
void* grad_input_) {
const I* input = static_cast<const I*>(input_);
const GO* grad_output = static_cast<const GO*>(grad_output_);
GI* grad_input = static_cast<GI*>(grad_input_);
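// d(ReLU)/dx is 1 where the forward input was strictly positive and 0 elsewhere:
// the incoming gradient is masked accordingly and accumulated into grad_input.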
for (std::size_t i = 0; i < inputLength; ++i) {
grad_input[i] += (input[i] > 0) ? grad_output[i] : 0;
}
}
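// Illustration only (not part of the library): applying the forward kernel to a small buffer.
//
//   std::vector<float> in{-1.f, 0.f, 2.f}, out(3);
//   ReLUImpl_cpu_forward_kernel<float, float>(in.size(), in.data(), out.data());
//   // out == {0.f, 0.f, 2.f}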
// Kernels registration to implementation entry point
REGISTRAR(ReLUImpl_cpu,
{DataType::Float32},
{ProdConso::inPlaceModel, Aidge::ReLUImpl_cpu_forward_kernel<float, float>, Aidge::ReLUImpl_cpu_backward_kernel<float, float, float>});
REGISTRAR(ReLUImpl_cpu,
{DataType::Float64},
{ProdConso::inPlaceModel, Aidge::ReLUImpl_cpu_forward_kernel<double, double>, Aidge::ReLUImpl_cpu_backward_kernel<double, double, double>});
REGISTRAR(ReLUImpl_cpu,
{DataType::Int32},
{ProdConso::inPlaceModel, Aidge::ReLUImpl_cpu_forward_kernel<int32_t, int32_t>, nullptr});
REGISTRAR(ReLUImpl_cpu,
{DataType::Int8},
{ProdConso::inPlaceModel, Aidge::ReLUImpl_cpu_forward_kernel<int8_t, int8_t>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_RELUIMPL_KERNELS_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_REDUCEMEANIMPL_H_
#define AIDGE_CPU_OPERATOR_REDUCEMEANIMPL_H_
#include <memory>
#include <tuple>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/ReduceMean.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
// Operator implementation entry point for the backend
using ReduceMeanImpl_cpu = OperatorImpl_cpu<ReduceMean_Op,
void(const std::vector<std::int32_t>&,
DimSize_t,
const std::vector<DimSize_t>&,
const void *,
void *)>;
// Implementation entry point registration to Operator
REGISTRAR(ReduceMean_Op, "cpu", Aidge::ReduceMeanImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_REDUCEMEANIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_REDUCEMEANIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_REDUCEMEANIMPL_KERNELS_H_
#include <algorithm> // std::for_each
#include <cstddef> // std::size_t
#include <cstdint> // std::int32_t
#include <functional> //std::multiplies
#include <numeric> //std::accumulate
#include <vector>
#include "aidge/backend/cpu/operator/ReduceMeanImpl.hpp"
#include "aidge/data/Data.hpp"
#include "aidge/operator/ReduceMean.hpp"
#include "aidge/utils/Registrar.hpp"
namespace Aidge {
template <typename T>
using Acc_T = typename std::conditional_t<std::is_floating_point<T>::value, T, double>;
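// Incremental (running) mean: m_{i+1} = m_i + (x_{i+1} - m_i) / (i + 1), evaluated with std::fma
// to limit rounding error and avoid the large intermediate values of a naive running sum.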
template <typename T>
typename std::enable_if<std::is_floating_point<T>::value, T>::type
stableMean(const T* vec, std::size_t len, std::size_t stride) {
T mean = 0;
for (std::size_t i = 0; i < len; ++i) {
mean = std::fma(vec[i * stride] - mean, static_cast<T>(1) / static_cast<T>(i + 1), mean);
}
return mean;
}
// Specialization for integers: perform the mean computation in float
template <typename T>
typename std::enable_if_t<!std::is_floating_point<T>::value, double>
stableMean(const T* vec, std::size_t len, std::size_t stride) {
double mean = 0;
for (size_t i = 0; i < len; ++i) {
mean = std::fma<double>(static_cast<double>(vec[i * stride]) - mean, 1.0 / static_cast<double>(i + 1), mean);
}
return mean;
}
template <typename T>
typename std::enable_if_t<std::is_floating_point<T>::value, T>
castFromFloat(T value) {
return value;
}
template <typename T>
typename std::enable_if_t<!std::is_floating_point<T>::value, T>
castFromFloat(double value) {
return static_cast<T>(std::nearbyint(value));
}
template <class I, class O>
void ReduceMeanImpl_cpu_forward_kernel(const std::vector<std::int32_t>& axes,
DimSize_t /*keepDims*/,
const std::vector<DimSize_t>& inputDims,
const void* input_,
void* output_) {
const I* input = static_cast<const I*>(input_);
O* output = static_cast<O*>(output_);
const std::size_t nb_dims = inputDims.size();
const std::size_t totalElements = std::accumulate(inputDims.cbegin(), inputDims.cend(), 1, std::multiplies<std::size_t>());
if (axes.empty()){
std::copy_n(input,totalElements, output);
}
else if (axes.size() == 1) {
const std::size_t stride_pre = std::accumulate(inputDims.cbegin(), inputDims.cbegin() + axes[0], 1, std::multiplies<std::size_t>());
const std::size_t stride_post = std::accumulate(inputDims.crbegin(), inputDims.crbegin() + nb_dims -1 - axes[0], 1, std::multiplies<std::size_t>());
const std::size_t dim_i = inputDims[axes[0]];
for (std::size_t pre = 0; pre < stride_pre; ++pre) {
for (std::size_t post = 0; post < stride_post; ++post) {
const std::size_t idx_i = pre * dim_i * stride_post + post;
const std::size_t idx_o = pre * stride_post + post;
output[idx_o] = castFromFloat<O>(stableMean(input + idx_i, dim_i, stride_post));
}
}
} else {
std::size_t outputElements = totalElements;
auto stride_post = std::unique_ptr<std::size_t[]>(new std::size_t[nb_dims]);
stride_post[nb_dims - 1] = 1;
for (std::size_t i = nb_dims-2; i != static_cast<std::size_t>(-1); --i) {
stride_post[i] = stride_post[i+1]*inputDims[i+1];
}
auto stride_pre = std::unique_ptr<std::size_t[]>(new std::size_t[nb_dims]);
stride_pre[0] = 1;
for (std::size_t i = 1; i < nb_dims; ++i) {
stride_pre[i] = stride_pre[i-1]*inputDims[i-1];
}
// Type should be the return type of stableMean<I>(), which is always floating point
const Acc_T<I>* inputAccumulation = nullptr;
Acc_T<I>* outputAccumulation = nullptr;
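// Reduce one axis per pass: each pass averages along axis 'a' into a freshly allocated buffer,
// which becomes the input of the next pass; the pre-strides of the remaining axes are rescaled
// after each pass since the reduced dimension disappears.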
for (const auto& axisInt : axes) {
const std::size_t a = static_cast<std::size_t>(axisInt);
outputElements /= inputDims[a];
outputAccumulation = new Acc_T<I>[outputElements];
const std::size_t dim_i = inputDims[a];
for (std::size_t pre = 0; pre < stride_pre[a]; ++pre) {
for (std::size_t post = 0; post < stride_post[a]; ++post) {
const std::size_t idx_i = pre * dim_i * stride_post[a] + post;
const std::size_t idx_o = pre * stride_post[a] + post;
if (inputAccumulation == nullptr) {
outputAccumulation[idx_o] = stableMean<I>(input + idx_i, dim_i, stride_post[a]);
}
else {
outputAccumulation[idx_o] = stableMean<Acc_T<I>>(inputAccumulation + idx_i, dim_i, stride_post[a]);
}
}
}
std::for_each(stride_pre.get()+a+1, stride_pre.get()+nb_dims, [dim_i] (std::size_t& val) { val /= dim_i; });
if (inputAccumulation != nullptr) {
delete[] inputAccumulation;
}
inputAccumulation = outputAccumulation;
}
std::transform(inputAccumulation, inputAccumulation + outputElements, output,
[](auto value) { return castFromFloat<O>(value); });
if (outputAccumulation) {
delete[] outputAccumulation;
}
}
}
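// Illustration only (not part of the library): reducing a {2, 3} tensor over axis 1.
//
//   std::vector<DimSize_t> dims{2, 3};
//   std::vector<float> in{1.f, 2.f, 3.f, 4.f, 5.f, 6.f}, out(2);
//   ReduceMeanImpl_cpu_forward_kernel<float, float>({1}, /*keepDims*/ 1, dims, in.data(), out.data());
//   // out == {2.f, 5.f}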
// Kernels registration to implementation entry point
REGISTRAR(ReduceMeanImpl_cpu,
{DataType::Float32},
{ProdConso::inPlaceModel, Aidge::ReduceMeanImpl_cpu_forward_kernel<float, float>, nullptr});
REGISTRAR(ReduceMeanImpl_cpu,
{DataType::Float64},
{ProdConso::inPlaceModel, Aidge::ReduceMeanImpl_cpu_forward_kernel<double, double>, nullptr});
REGISTRAR(ReduceMeanImpl_cpu,
{DataType::Int32},
{ProdConso::inPlaceModel, Aidge::ReduceMeanImpl_cpu_forward_kernel<int32_t, int32_t>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_REDUCEMEANIMPL_KERNELS_H_ */
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_REDUCESUMIMPL_H_
#define AIDGE_CPU_OPERATOR_REDUCESUMIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/ReduceSum.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
// Operator implementation entry point for the backend
using ReduceSumImpl_cpu = OperatorImpl_cpu<ReduceSum_Op,
void(const std::vector<std::int32_t>&,
DimSize_t,
const std::vector<DimSize_t>&,
const void *,
void *)>;
// Implementation entry point registration to Operator
REGISTRAR(ReduceSum_Op, "cpu", Aidge::ReduceSumImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_REDUCESUMIMPL_H_ */
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_REDUCESUMIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_REDUCESUMIMPL_KERNELS_H_
#include <algorithm> // std::for_each
#include <cstddef> // std::size_t
#include <cstdint> // std::int32_t
#include <functional> //std::multiplies
#include <numeric> //std::accumulate
#include <vector>
#include "aidge/backend/cpu/operator/ReduceSumImpl.hpp"
#include "aidge/data/Data.hpp"
#include "aidge/operator/ReduceSum.hpp"
#include "aidge/utils/Registrar.hpp"
namespace Aidge {
template <class I, class O>
void ReduceSumImpl_cpu_forward_kernel(const std::vector<std::int32_t>& axes,
DimSize_t /*keepDims*/,
const std::vector<DimSize_t>& inputDims,
const void* input_,
void* output_) {
const I* input = static_cast<const I*>(input_);
O* output = static_cast<O*>(output_);
const std::size_t nb_dims = inputDims.size();
const std::size_t totalElements = std::accumulate(inputDims.cbegin(), inputDims.cend(), 1, std::multiplies<std::size_t>());
if (axes.empty()){
std::copy_n(input,totalElements, output);
}
else if (axes.size() == 1) {
const std::size_t stride_pre = std::accumulate(inputDims.cbegin(), inputDims.cbegin() + axes[0], 1, std::multiplies<std::size_t>());
const std::size_t stride_post = std::accumulate(inputDims.crbegin(), inputDims.crbegin() + nb_dims -1 - axes[0], 1, std::multiplies<std::size_t>());
const std::size_t dim_i = inputDims[axes[0]];
for (std::size_t pre = 0; pre < stride_pre; ++pre) {
for (std::size_t post = 0; post < stride_post; ++post) {
const std::size_t idx_i = pre * dim_i * stride_post + post;
const std::size_t idx_o = pre * stride_post + post;
O sum = 0;
for (std::size_t i = 0; i < dim_i; ++i) {
sum +=input[idx_i + i*stride_post];
}
output[idx_o] = sum;
}
}
} else {
std::size_t outputElements = totalElements;
auto stride_post = std::unique_ptr<std::size_t[]>(new std::size_t[nb_dims]);
stride_post[nb_dims - 1] = 1;
for (std::size_t i = nb_dims-2; i != static_cast<std::size_t>(-1); --i) {
stride_post[i] = stride_post[i+1]*inputDims[i+1];
}
auto stride_pre = std::unique_ptr<std::size_t[]>(new std::size_t[nb_dims]);
stride_pre[0] = 1;
for (std::size_t i = 1; i < nb_dims; ++i) {
stride_pre[i] = stride_pre[i-1]*inputDims[i-1];
}
const I* inputAccumulation = input;
I* outputAccumulation = nullptr;
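// Same successive-reduction scheme as ReduceMean: sum along one axis per pass into a temporary
// buffer that becomes the input of the next pass.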
for (const auto& axisInt : axes) {
const std::size_t a = static_cast<std::size_t>(axisInt);
outputElements /= inputDims[a];
outputAccumulation = new I[outputElements];
const std::size_t dim_i = inputDims[a];
for (std::size_t pre = 0; pre < stride_pre[a]; ++pre) {
for (std::size_t post = 0; post < stride_post[a]; ++post) {
const std::size_t idx_i = pre * dim_i * stride_post[a] + post;
const std::size_t idx_o = pre * stride_post[a] + post;
I sum = 0;
for (std::size_t i = 0; i < dim_i; ++i) {
sum += inputAccumulation[idx_i + i*stride_post[a]];
}
outputAccumulation[idx_o] = sum;
}
}
std::for_each(stride_pre.get()+a+1, stride_pre.get()+nb_dims, [dim_i] (std::size_t& val) { val /= dim_i; });
if (inputAccumulation != input) {
delete[] inputAccumulation;
}
inputAccumulation = outputAccumulation;
}
// Copy the accumulated sums to the output buffer
std::copy(inputAccumulation, inputAccumulation + outputElements, output);
if (outputAccumulation) {
delete[] outputAccumulation;
}
}
}
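// Illustration only (not part of the library): summing a {2, 3} tensor over axis 1.
//
//   std::vector<DimSize_t> dims{2, 3};
//   std::vector<float> in{1.f, 2.f, 3.f, 4.f, 5.f, 6.f}, out(2);
//   ReduceSumImpl_cpu_forward_kernel<float, float>({1}, /*keepDims*/ 1, dims, in.data(), out.data());
//   // out == {6.f, 15.f}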
// Kernels registration to implementation entry point
REGISTRAR(ReduceSumImpl_cpu,
{DataType::Float32},
{ProdConso::inPlaceModel, Aidge::ReduceSumImpl_cpu_forward_kernel<float, float>, nullptr});
REGISTRAR(ReduceSumImpl_cpu,
{DataType::Float64},
{ProdConso::inPlaceModel, Aidge::ReduceSumImpl_cpu_forward_kernel<double, double>, nullptr});
REGISTRAR(ReduceSumImpl_cpu,
{DataType::Int32},
{ProdConso::inPlaceModel, Aidge::ReduceSumImpl_cpu_forward_kernel<int32_t, int32_t>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_REDUCESUMIMPL_KERNELS_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_RESIZEIMPL_H_
#define AIDGE_CPU_OPERATOR_RESIZEIMPL_H_
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Resize.hpp"
#include "aidge/utils/Registrar.hpp"
#include <aidge/data/Interpolation.hpp>
#include <aidge/operator/Pad.hpp>
#include <cstdint>
namespace Aidge {
// Operator implementation entry point for the backend
using ResizeImpl_cpu = OperatorImpl_cpu<
Resize_Op,
void(const void *, // input
const std::vector<DimSize_t> &, // input dims
const std::vector<DimSize_t> &, // output dims
const Interpolation::CoordinateTransformation, // coord transfo
const Interpolation::Mode, // interpolation mode
const PadBorderType, // padding mode
void *)>; // output
// Implementation entry point registration to Operator
REGISTRAR(Resize_Op, "cpu", Aidge::ResizeImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_RESIZEIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_RESIZEIMPL_FORWARD_KERNEL_H_
#define AIDGE_CPU_OPERATOR_RESIZEIMPL_FORWARD_KERNEL_H_
#include "aidge/backend/cpu/operator/ResizeImpl.hpp"
#include <aidge/data/Data.hpp>
#include <aidge/data/half.hpp>
#include <aidge/operator/Pad.hpp>
#include <cmath>
#include <cstdint>
#include <numeric>
#include "aidge/backend/cpu/data/Interpolation.hpp"
#include "aidge/data/Interpolation.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
template <typename IO>
void ResizeImpl_cpu_forward_kernel(
const void *input_,
const std::vector<DimSize_t> &inputDims,
const std::vector<DimSize_t> &outputDims,
const Interpolation::CoordinateTransformation coordTransfoMode,
const Interpolation::Mode interpMode,
const PadBorderType paddingMode,
// const double * /*roi*/,
// const float * /*scales*/,
// const int64_t * /*sizes*/,
void *output_) {
// Set up typed input/output pointers
const IO *input = static_cast<const IO *>(input_);
IO *output = static_cast<IO *>(output_);
const DimSize_t outputLen = std::accumulate(outputDims.cbegin(),
outputDims.cend(),
1,
std::multiplies<DimSize_t>());
#ifdef _OPENMP
#pragma omp parallel for if (outputLen >= 16)
#endif
for (int idxFlatOut = 0; idxFlatOut < static_cast<int>(outputLen); ++idxFlatOut) {
const auto coordOut = Tensor::toCoord(outputDims, idxFlatOut);
auto coordInApprox =
Interpolation::untransformCoordinates(coordOut,
inputDims,
outputDims,
coordTransfoMode);
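// Nearest-style modes (Ceil, Floor, RoundPreferCeil, RoundPreferFloor) map the output coordinate
// to a single input pixel; the other modes gather neighbouring pixels and interpolate between them.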
if ((interpMode == Interpolation::Mode::Ceil) || (interpMode == Interpolation::Mode::Floor) || (interpMode == Interpolation::Mode::RoundPreferCeil) || (interpMode == Interpolation::Mode::RoundPreferFloor)) {
for (std::size_t i = 0; i < coordInApprox.size(); ++i) {
if (interpMode == Interpolation::Mode::Ceil) {
coordInApprox[i] = std::ceil(coordInApprox[i]);
} else if (interpMode == Interpolation::Mode::Floor) {
coordInApprox[i] = std::floor(coordInApprox[i]);
} else if (interpMode == Interpolation::Mode::RoundPreferCeil) {
coordInApprox[i] = std::floor(coordInApprox[i] + 0.5f);
} else { // (interpMode == Interpolation::Mode::RoundPreferFloor)
coordInApprox[i] = std::ceil(coordInApprox[i] - 0.5f);
}
}
std::vector<std::size_t> coordIn(inputDims.size());
if (Tensor::isInBounds<float>(inputDims, coordInApprox)) {
for (std::size_t i = 0; i < coordInApprox.size(); ++i) {
coordIn[i] = static_cast<std::size_t>(coordInApprox[i]);
}
} else {
if (paddingMode == PadBorderType::Edge) {
for (std::size_t i = 0; i < coordInApprox.size(); ++i) {
coordIn[i] = coordInApprox[i] < 0 ? 0 : (coordInApprox[i] >=inputDims[i] ? inputDims[i] - 1 : static_cast<std::size_t>(coordInApprox[i]));
}
} else {
AIDGE_THROW_OR_ABORT(std::runtime_error, "Padding mode not supported");
}
}
output[idxFlatOut] = input[Tensor::toIndex(inputDims, coordIn)];
} else {
std::set<Interpolation::Point<IO>> neighbours =
InterpolationCPU::retrieveNeighbours(input,
inputDims,
coordInApprox,
paddingMode);
output[idxFlatOut] = InterpolationCPU::interpolate(coordInApprox,
neighbours,
interpMode);
}
}
return;
}
// Kernels registration to implementation entry point
REGISTRAR(ResizeImpl_cpu,
{{{DataType::Int16},
{DataType::Any},
{DataType::Any},
{DataType::Any}},
{DataType::Int16}},
{ProdConso::inPlaceModel,
ResizeImpl_cpu_forward_kernel<int16_t>,
nullptr});
REGISTRAR(ResizeImpl_cpu,
{{{DataType::Int32},
{DataType::Any},
{DataType::Any},
{DataType::Any}},
{DataType::Int32}},
{ProdConso::inPlaceModel,
ResizeImpl_cpu_forward_kernel<int32_t>,
nullptr});
REGISTRAR(ResizeImpl_cpu,
{{{DataType::Int64},
{DataType::Any},
{DataType::Any},
{DataType::Any}},
{DataType::Int64}},
{ProdConso::inPlaceModel,
ResizeImpl_cpu_forward_kernel<int64_t>,
nullptr});
REGISTRAR(ResizeImpl_cpu,
{{{DataType::Float16},
{DataType::Any},
{DataType::Any},
{DataType::Any}},
{DataType::Float16}},
{ProdConso::inPlaceModel,
ResizeImpl_cpu_forward_kernel<half_float::half>,
nullptr});
REGISTRAR(ResizeImpl_cpu,
{{{DataType::Float32},
{DataType::Any},
{DataType::Any},
{DataType::Any}},
{DataType::Float32}},
{ProdConso::inPlaceModel,
ResizeImpl_cpu_forward_kernel<float>,
nullptr});
REGISTRAR(ResizeImpl_cpu,
{{{DataType::Float64},
{DataType::Any},
{DataType::Any},
{DataType::Any}},
{DataType::Float64}},
{ProdConso::inPlaceModel,
ResizeImpl_cpu_forward_kernel<double>,
nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_RESIZEIMPL_FORWARD_KERNEL_H_ */