Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Showing with 1669 additions and 525 deletions
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_KERNELS_H_
#include <aidge/data/Tensor.hpp>
#include <aidge/data/half.hpp>
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <functional> // std::multiplies
#include <numeric> // std::accumulate
#include <vector>
#include "aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp"
#include "aidge/data/Data.hpp"
#include "aidge/utils/ErrorHandling.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
template <class O>
void ConstantOfShapeimpl_cpu_forward_kernel(
const std::vector<DimSize_t> output_dims, const Tensor &value,
void *output_) {
O *output = static_cast<O *>(output_);
O val;
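// Read the single fill value from the "value" input tensor (expected to hold exactly one element of type O).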
std::copy(static_cast<O *>(value.getImpl()->hostPtr()),
static_cast<O *>(value.getImpl()->hostPtr()) +
static_cast<NbElts_t>(1),
&val);
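// Total number of output elements: product of the requested output dimensions.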
const size_t output_size = std::accumulate(
output_dims.begin(), output_dims.end(), 1, std::multiplies<DimSize_t>());
for (size_t i = 0; i < output_size; ++i) {
output[i] = val;
}
}
// Kernels registration to implementation entry point
REGISTRAR(ConstantOfShapeImpl_cpu,
{ImplSpec::IOSpec{DataType::Int64}, ImplSpec::IOSpec{DataType::Float16}},
{ProdConso::defaultModel, Aidge::ConstantOfShapeimpl_cpu_forward_kernel<half_float::half>, nullptr});
REGISTRAR(ConstantOfShapeImpl_cpu,
{ImplSpec::IOSpec{DataType::Int64}, ImplSpec::IOSpec{DataType::Float32}},
{ProdConso::defaultModel, Aidge::ConstantOfShapeimpl_cpu_forward_kernel<float>, nullptr});
REGISTRAR(ConstantOfShapeImpl_cpu,
{ImplSpec::IOSpec{DataType::Int64}, ImplSpec::IOSpec{DataType::Float64}},
{ProdConso::defaultModel, Aidge::ConstantOfShapeimpl_cpu_forward_kernel<double>, nullptr});
REGISTRAR(ConstantOfShapeImpl_cpu,
{ImplSpec::IOSpec{DataType::Int64}, ImplSpec::IOSpec{DataType::Int16}},
{ProdConso::defaultModel, Aidge::ConstantOfShapeimpl_cpu_forward_kernel<std::int16_t>, nullptr});
REGISTRAR(ConstantOfShapeImpl_cpu,
{ImplSpec::IOSpec{DataType::Int64}, ImplSpec::IOSpec{DataType::Int32}},
{ProdConso::defaultModel, Aidge::ConstantOfShapeimpl_cpu_forward_kernel<std::int32_t>, nullptr});
REGISTRAR(ConstantOfShapeImpl_cpu,
{ImplSpec::IOSpec{DataType::Int64}, ImplSpec::IOSpec{DataType::Int64}},
{ProdConso::defaultModel, Aidge::ConstantOfShapeimpl_cpu_forward_kernel<std::int64_t>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_KERNELS_H_ */
@@ -17,43 +17,39 @@
#include <tuple>
#include <vector>
#include "aidge/backend/OperatorImpl.hpp"
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/ConvDepthWise.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
namespace Aidge {
// class ConvDepthWise_Op;
// compute kernel registry for forward and backward
class ConvDepthWiseImpl2DForward_cpu
: public Registrable<ConvDepthWiseImpl2DForward_cpu,
std::tuple<DataType, DataType, DataType, DataType>,
void(const ConvDepthWise_Op<2>::Attrs &, const std::array<DimSize_t, 4> &, const void *,
const void *, const void *, void *)> {};
class ConvDepthWiseImpl2DBackward_cpu
: public Registrable<ConvDepthWiseImpl2DBackward_cpu,
std::tuple<DataType, DataType, DataType, DataType>,
void(const ConvDepthWise_Op<2>::Attrs &, const std::array<DimSize_t, 4> &, const void *,
const void *, const void *, void *)> {};
class ConvDepthWiseImpl2D_cpu : public OperatorImpl {
public:
ConvDepthWiseImpl2D_cpu(const ConvDepthWise_Op<2> &op) : OperatorImpl(op) {}
static std::unique_ptr<ConvDepthWiseImpl2D_cpu> create(const ConvDepthWise_Op<2> &op) {
return std::make_unique<ConvDepthWiseImpl2D_cpu>(op);
}
NbElts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
void forward() override;
};
namespace {
// add cpu backend to ConvDepthWise_Op<2> implementation registry
static Registrar<ConvDepthWise_Op<2>> registrarConvDepthWiseImpl2D_cpu("cpu", Aidge::ConvDepthWiseImpl2D_cpu::create);
} // namespace
// Operator implementation entry point for the backend
using ConvDepthWise1D_Op = ConvDepthWise_Op<1>;
using ConvDepthWiseImpl1D_cpu = OperatorImpl_cpu<ConvDepthWise_Op<1>,
void(const std::array<DimSize_t, 1>&,
const std::array<DimSize_t, 1>&,
const std::array<DimSize_t, 1>&,
const std::array<DimSize_t, 3>&,
const void *,
const void *,
const void *,
void *)>;
using ConvDepthWise2D_Op = ConvDepthWise_Op<2>;
using ConvDepthWiseImpl2D_cpu = OperatorImpl_cpu<ConvDepthWise_Op<2>,
void(const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 4> &,
const void *,
const void *,
const void *,
void *)>;
// Implementation entry point registration to Operator
REGISTRAR(ConvDepthWise1D_Op, "cpu", Aidge::ConvDepthWiseImpl1D_cpu::create);
REGISTRAR(ConvDepthWise2D_Op, "cpu", Aidge::ConvDepthWiseImpl2D_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_CONVDEPTHWISEIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_CONVDEPTHWISEIMPL_FORWARD_KERNEL_H_
#define AIDGE_CPU_OPERATOR_CONVDEPTHWISEIMPL_FORWARD_KERNEL_H_
#include "aidge/utils/Registrar.hpp"
#include "aidge/backend/cpu/operator/ConvDepthWiseImpl.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include <cmath>
#include <cstddef>
#include <array>
#include <algorithm>
namespace Aidge {
/**
* @brief Forward kernel for 2D depthwise convolution (ConvDepthWise) on CPU backend.
* @tparam I Input data type.
* @tparam W Weight data type.
* @tparam B Bias data type.
* @tparam O Output data type.
* @param attrs Tuple of Attributes from the Operator.
* @param dims Array of input dimensions.
* @param input_ const input Tensor.
* @param weights_ const weight Tensor.
* @param biases_ const bias Tensor.
* @param output_ Output Tensor.
*/
template <class I, class W, class B, class O>
void ConvDepthWiseImpl2D_cpu_forward_kernel(const ConvDepthWise_Op<2>::Attrs &attrs, const std::array<DimSize_t, 4> &dims,
const void *input_, const void *weights_, const void *biases_, void *output_) {
// FIXME: missing convolution attributes as arguments
const I *input = static_cast<const I *>(input_);
const W *weights = static_cast<const W *>(weights_);
const B *biases = static_cast<const B *>(biases_);
O *output = static_cast<O *>(output_);
// output H size
const std::size_t oxSize =
static_cast<std::size_t>(std::floor(static_cast<float>(dims[2] - std::get<3>(attrs)[0] + std::get<0>(attrs)[0]) /
static_cast<float>(std::get<0>(attrs)[0])));
// output W size
const std::size_t oySize =
static_cast<std::size_t>(std::floor(static_cast<float>(dims[3] - std::get<3>(attrs)[1] + std::get<0>(attrs)[1]) /
static_cast<float>(std::get<0>(attrs)[1])));
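// Both sizes follow floor((in - kernel + stride) / stride), i.e. floor((in - kernel) / stride) + 1: a "valid" convolution without padding.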
// TODO: kernel computation
// output (batch, outCh, Xout, Yout)
// input (batch, ch, Xin, Yin)
// weight (outCh, ch, kernelX, kernelY)
// does not take Dilation attribute into account
using signedsize = std::make_signed<std::size_t>::type;
for (std::size_t batch = 0; batch < dims[0]; ++batch) {
for (std::size_t ch = 0; ch < std::get<2>(attrs); ++ch) {
const std::size_t oIndex = (ch + batch*std::get<2>(attrs)) * oxSize * oySize;
B biasVal = (biases != nullptr) ? biases[ch] : B(0);
std::fill(output + oIndex, output+(oIndex+oxSize*oySize), biasVal);
const std::size_t iIndex = (ch + batch*dims[1]) * dims[2] * dims[3];
const std::size_t wIndex = ch * std::get<3>(attrs)[0] * std::get<3>(attrs)[1];
for (std::size_t ox = 0; ox < oxSize; ++ox) {
const signedsize difx = static_cast<signedsize>(- ox * std::get<0>(attrs)[0]);
const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
const std::size_t sxMax = (static_cast<signedsize>(dims[2]) + difx) < 0 ? 0 : ((dims[2] + difx) > std::get<3>(attrs)[0] ? std::get<3>(attrs)[0] : dims[2] + difx);
for (std::size_t oy = 0; oy < oySize; ++oy) {
const signedsize dify = static_cast<signedsize>(- oy * std::get<0>(attrs)[1]);
const std::size_t syMin = static_cast<std::size_t>(std::max(dify, signedsize(0)));
const std::size_t syMax = (static_cast<signedsize>(dims[3]) + dify) < 0 ? 0 : ((dims[3] + dify) > std::get<3>(attrs)[1] ? std::get<3>(attrs)[1] : dims[3] + dify);
const std::size_t oIndexFull = oIndex + ox*oySize + oy;
const signedsize ix = static_cast<signedsize>(ox * std::get<0>(attrs)[0]);
const signedsize iy = static_cast<signedsize>(oy * std::get<0>(attrs)[1]);
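// Fast path: the kernel window spans exactly 3x3 input elements (a 3x3 kernel fully inside the input), so the accumulation is fully unrolled.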
if (sxMin == 0 && syMin == 0 && sxMax == 3 && syMax == 3) {
output[oIndexFull] += (weights[wIndex + 0*std::get<3>(attrs)[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+0)*dims[3] + static_cast<std::size_t>(iy+0)] +
weights[wIndex + 0*std::get<3>(attrs)[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+0)*dims[3] + static_cast<std::size_t>(iy+1)] +
weights[wIndex + 0*std::get<3>(attrs)[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+0)*dims[3] + static_cast<std::size_t>(iy+2)] +
weights[wIndex + 1*std::get<3>(attrs)[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+1)*dims[3] + static_cast<std::size_t>(iy+0)] +
weights[wIndex + 1*std::get<3>(attrs)[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+1)*dims[3] + static_cast<std::size_t>(iy+1)] +
weights[wIndex + 1*std::get<3>(attrs)[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+1)*dims[3] + static_cast<std::size_t>(iy+2)] +
weights[wIndex + 2*std::get<3>(attrs)[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+2)*dims[3] + static_cast<std::size_t>(iy+0)] +
weights[wIndex + 2*std::get<3>(attrs)[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+2)*dims[3] + static_cast<std::size_t>(iy+1)] +
weights[wIndex + 2*std::get<3>(attrs)[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+2)*dims[3] + static_cast<std::size_t>(iy+2)]);
} else {
for (std::size_t sx = sxMin; sx < sxMax; ++sx) {
for (std::size_t sy = syMin; sy < syMax; ++sy) {
output[oIndexFull] += weights[wIndex + sx*std::get<3>(attrs)[1] + sy] *
input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx))*dims[3] + static_cast<std::size_t>(iy+static_cast<signedsize>(sy))];
}
}
}
}
}
}
}
}
namespace {
static Registrar<ConvDepthWiseImpl2DForward_cpu> registrarConvDepthWiseImpl2DForward_cpu_Float32(
{DataType::Float32, DataType::Float32, DataType::Float32, DataType::Float32},
Aidge::ConvDepthWiseImpl2D_cpu_forward_kernel<float, float, float, float>);
static Registrar<ConvDepthWiseImpl2DForward_cpu> registrarConvDepthWiseImpl2DForward_cpu_Int32(
{DataType::Int32, DataType::Int32, DataType::Int32, DataType::Int32},
Aidge::ConvDepthWiseImpl2D_cpu_forward_kernel<int, int, int, int>);
static Registrar<ConvDepthWiseImpl2DForward_cpu> registrarConvDepthWiseImpl2DForward_cpu_Float64(
{DataType::Float64, DataType::Float64, DataType::Float64, DataType::Float64},
Aidge::ConvDepthWiseImpl2D_cpu_forward_kernel<double, double, double, double>);
} // namespace
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_CONVDEPTHWISEIMPL_FORWARD_KERNEL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_CONVDEPTHWISEIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_CONVDEPTHWISEIMPL_KERNELS_H_
#include <algorithm>
#include <array>
#include <cmath>
#include <cstddef>
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/backend/cpu/operator/ConvDepthWiseImpl.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
/**
* @brief Forward kernel for 1D depthwise convolution (ConvDepthWise) on CPU backend.
* @tparam I Input data type.
* @tparam W Weight data type.
* @tparam B Bias data type.
* @tparam O Output data type.
* @param strideDims Array of stride dimensions.
* @param dilationDims Array of dilation dimensions.
* @param kernelDims Array of kernel dimensions.
* @param inputDims Array of input dimensions.
* @param input_ const input Tensor.
* @param weights_ const weight Tensor.
* @param biases_ const bias Tensor.
* @param output_ Output Tensor.
*/
template <class I, class W, class B, class O>
void ConvDepthWiseImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& strideDims,
const std::array<DimSize_t, 1>& dilationDims,
const std::array<DimSize_t, 1>& kernelDims,
const std::array<DimSize_t, 3>& inputDims,
const void *input_,
const void *weights_,
const void *biases_,
void *output_) {
// FIXME: missing convolution attributes as arguments
const I *input = static_cast<const I *>(input_);
const W *weights = static_cast<const W *>(weights_);
const B *biases = static_cast<const B *>(biases_);
O *output = static_cast<O *>(output_);
// output H size
const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1;
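// Effective kernel extent with dilation: a kernel of size k with dilation d spans d*(k-1)+1 input samples.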
const std::size_t oxSize =
static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilated_kernel_x + strideDims[0]) /
static_cast<float>(strideDims[0])));
// Data layout:
// output (batch, ch, Xout)
// input (batch, ch, Xin)
// weight (ch, kernelX)
using signedsize = std::make_signed<std::size_t>::type;
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
const std::size_t oIndex = (ch + batch*inputDims[1]) * oxSize;
B biasVal = (biases != nullptr) ? biases[ch] : B(0);
std::fill(output + oIndex, output+(oIndex+oxSize), biasVal);
const std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2];
const std::size_t wIndex = ch * kernelDims[0];
for (std::size_t ox = 0; ox < oxSize; ++ox) {
// const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]);
// const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
// const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? kernelDims[0] : inputDims[2] + difx);
const std::size_t sxMin = 0;
const std::size_t sxMax = dilated_kernel_x;
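// No boundary clipping is needed: oxSize guarantees that the dilated window always fits inside the input.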
const std::size_t oIndexFull = oIndex + ox;
const signedsize ix = static_cast<signedsize>(ox * strideDims[0]);
for (std::size_t sx = sxMin; sx*dilationDims[0] < sxMax; ++sx) {
output[oIndexFull] += weights[wIndex + sx] *
input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx*dilationDims[0]))];
}
}
}
}
}
// Kernels registration to implementation entry point
REGISTRAR(ConvDepthWiseImpl1D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvDepthWiseImpl1D_cpu_forward_kernel<float, float, float, float>, nullptr});
REGISTRAR(ConvDepthWiseImpl1D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvDepthWiseImpl1D_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t, std::int32_t>, nullptr});
REGISTRAR(ConvDepthWiseImpl1D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvDepthWiseImpl1D_cpu_forward_kernel<double, double, double, double>, nullptr});
/**
* @brief Forward kernel for 2D depthwise convolution (ConvDepthWise) on CPU backend.
* @tparam I Input data type.
* @tparam W Weight data type.
* @tparam B Bias data type.
* @tparam O Output data type.
* @param strideDims Array of stride dimensions.
* @param dilationDims Array of dilation dimensions.
* @param kernelDims Array of kernel dimensions.
* @param inputDims Array of input dimensions.
* @param input_ const input Tensor.
* @param weights_ const weight Tensor.
* @param biases_ const bias Tensor.
* @param output_ Output Tensor.
*/
template <class I, class W, class B, class O>
void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
const std::array<DimSize_t, 2>& dilationDims,
const std::array<DimSize_t, 2>& kernelDims,
const std::array<DimSize_t, 4>& inputDims,
const void *input_,
const void *weights_,
const void *biases_,
void *output_)
{
// FIXME: missing convolution attributes as arguments
const I *input = static_cast<const I *>(input_);
const W *weights = static_cast<const W *>(weights_);
const B *biases = static_cast<const B *>(biases_);
O *output = static_cast<O *>(output_);
// output H size
const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1;
const std::size_t oxSize =
static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilated_kernel_x + strideDims[0]) /
static_cast<float>(strideDims[0])));
// output W size
const DimSize_t dilated_kernel_y = dilationDims[1]*(kernelDims[1] - 1) + 1;
const std::size_t oySize =
static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[3] - dilated_kernel_y + strideDims[1]) /
static_cast<float>(strideDims[1])));
// Data layout:
// output (batch, ch, Xout, Yout)
// input (batch, ch, Xin, Yin)
// weight (ch, kernelX, kernelY)
using signedsize = std::make_signed<std::size_t>::type;
const std::size_t outChannels_s = oxSize * oySize;
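// Size of one output feature map (oxSize * oySize), used to advance the output pointer between channels.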
if (dilated_kernel_x ==3 && dilated_kernel_y == 3) {
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
B biasVal = (biases != nullptr) ? biases[ch] : B(0);
std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
const std::size_t wIndex = ch * 9;
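// 9 weights per channel: this branch assumes a 3x3 filter.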
if (strideDims[0] == 1 && strideDims[1]==1) {
for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=inputDims[3]) {
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] = biasVal + weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+1]+weights[wIndex+2]*input[iIndex+oy+2];
}
iIndex+=inputDims[3];
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy]+weights[wIndex+4]*input[iIndex+oy+1]+weights[wIndex+5]*input[iIndex+oy+2];
}
iIndex+=inputDims[3];
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy]+weights[wIndex+7]*input[iIndex+oy+1]+weights[wIndex+8]*input[iIndex+oy+2];
}
}
} else {
for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=strideDims[0]*inputDims[3]) {
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+strideDims[0]]+weights[wIndex+2]*input[iIndex+oy+strideDims[0]*2];
}
iIndex+=strideDims[0]*inputDims[3];
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy]+weights[wIndex+4]*input[iIndex+oy+strideDims[0]]+weights[wIndex+5]*input[iIndex+oy+strideDims[0]*2];
}
iIndex+=strideDims[0]*inputDims[3];
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy]+weights[wIndex+7]*input[iIndex+oy+strideDims[0]]+weights[wIndex+8]*input[iIndex+oy+strideDims[0]*2];
}
}
}
output += outChannels_s;
}
}
} else if (dilated_kernel_x == 1 && dilated_kernel_y == 1) {
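// 1x1 kernel: each output element is the corresponding (strided) input sample scaled by the channel's single weight, plus the bias.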
std::size_t index = 0;
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
B biasVal = (biases != nullptr) ? biases[ch] : B(0);
const std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
const std::size_t wIndex = ch;
if (strideDims[0] == 1 && strideDims[1] == 1) {
for (; index < iIndex + oxSize*oySize; ++index) {
output[index] = biasVal + weights[wIndex] * input[index];
}
} else {
std::size_t oIndex = (ch + batch*inputDims[1]) * oxSize * oySize;
for (std::size_t ox = 0; ox < oxSize; ++ox, oIndex+=oySize) {
index = iIndex + strideDims[0]*inputDims[3];
for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) {
output[oIndex + oy] += weights[wIndex]*input[index+iy];
}
}
}
}
}
} else {
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
B biasVal = (biases != nullptr) ? biases[ch] : B(0);
std::fill(output, output+outChannels_s, biasVal);
const std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
const std::size_t wIndex = ch * kernelDims[0] * kernelDims[1];
for (std::size_t ox = 0; ox < oxSize; ++ox) {
for (std::size_t oy = 0; oy < oySize; ++oy) {
const std::size_t oIndexFull = ox*oySize + oy;
const signedsize ix = static_cast<signedsize>(ox * strideDims[0]);
const signedsize iy = static_cast<signedsize>(oy * strideDims[1]);
for (std::size_t sx = 0; sx*dilationDims[0] < dilated_kernel_x; ++sx) {
for (std::size_t sy = 0; sy*dilationDims[1] < dilated_kernel_y; ++sy) {
output[oIndexFull] += weights[wIndex + sx*kernelDims[1] + sy] *
input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx*dilationDims[0]))*inputDims[3] + static_cast<std::size_t>(iy+static_cast<signedsize>(sy*dilationDims[1]))];
}
}
}
}
}
output += outChannels_s;
}
}
}
// Kernels registration to implementation entry point
REGISTRAR(ConvDepthWiseImpl2D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvDepthWiseImpl2D_cpu_forward_kernel<float, float, float, float>, nullptr});
REGISTRAR(ConvDepthWiseImpl2D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvDepthWiseImpl2D_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t, std::int32_t>, nullptr});
REGISTRAR(ConvDepthWiseImpl2D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvDepthWiseImpl2D_cpu_forward_kernel<double, double, double, double>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_CONVDEPTHWISEIMPL_KERNELS_H_ */
@@ -17,44 +17,41 @@
#include <tuple>
#include <vector>
#include "aidge/backend/OperatorImpl.hpp"
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Conv.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
namespace Aidge {
// class Conv_Op;
// compute kernel registry for forward and backward
class ConvImpl2DForward_cpu
: public Registrable<ConvImpl2DForward_cpu,
std::tuple<DataType, DataType, DataType, DataType>,
void(const Conv_Op<2>::Attrs &, const std::array<DimSize_t, 4> &, const void *,
const void *, const void *, void *)> {};
class ConvImpl2DBackward_cpu
: public Registrable<ConvImpl2DBackward_cpu,
std::tuple<DataType, DataType, DataType, DataType>,
void(const Conv_Op<2>::Attrs &, const std::array<DimSize_t, 4> &, const void *,
const void *, const void *, void *)> {};
class ConvImpl2D_cpu : public OperatorImpl {
public:
ConvImpl2D_cpu(const Conv_Op<2>& op) : OperatorImpl(op) {}
static std::unique_ptr<ConvImpl2D_cpu> create(const Conv_Op<2> &op) {
return std::make_unique<ConvImpl2D_cpu>(op);
}
public:
NbElts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
void forward() override;
};
namespace {
// add cpu backend to Conv_Op<2> implementation registry
static Registrar<Conv_Op<2>> registrarConvImpl2D_cpu("cpu", Aidge::ConvImpl2D_cpu::create);
} // namespace
// Operator implementation entry point for the backend
using Conv1D_Op = Conv_Op<1>;
using ConvImpl1D_cpu = OperatorImpl_cpu<Conv_Op<1>,
void(const std::array<DimSize_t, 1>&,
const std::array<DimSize_t, 1>&,
const std::array<DimSize_t, 1>&,
const std::array<DimSize_t, 3> &,
DimSize_t,
const void *,
const void *,
const void *,
void *)>;
using Conv2D_Op = Conv_Op<2>;
using ConvImpl2D_cpu = OperatorImpl_cpu<Conv_Op<2>,
void(const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 4> &,
DimSize_t,
const void *,
const void *,
const void *,
void *)>;
// Implementation entry point registration to Operator
REGISTRAR(Conv1D_Op, "cpu", Aidge::ConvImpl1D_cpu::create);
REGISTRAR(Conv2D_Op, "cpu", Aidge::ConvImpl2D_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_CONVIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_CONVIMPL_FORWARD_KERNEL_H_
#define AIDGE_CPU_OPERATOR_CONVIMPL_FORWARD_KERNEL_H_
#include "aidge/utils/Registrar.hpp"
#include "aidge/data/half.hpp"
#include "aidge/backend/cpu/operator/ConvImpl.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include <cmath>
#include <array>
#include <algorithm>
namespace Aidge {
/**
* @brief Forward kernel for 2D Convolution on CPU backend.
* @tparam I Input data type.
* @tparam W Weight data type.
* @tparam B Bias data type.
* @tparam O Output data type.
* @param attrs Tuple of Attributes from the Operator.
* @param dims Array of input dimensions.
* @param input_ const input Tensor.
* @param weights_ const weight Tensor.
* @param biases_ const bias Tensor.
* @param output_ Output Tensor.
*/
template <class I, class W, class B, class O>
void ConvImpl2D_cpu_forward_kernel(const Conv_Op<2>::Attrs &attrs, const std::array<DimSize_t, 4> &dims,
const void *input_, const void *weights_, const void *biases_, void *output_) {
// FIXME: missing convolution attributes as arguments
const I *input = static_cast<const I *>(input_);
const W *weights = static_cast<const W *>(weights_);
const B *biases = static_cast<const B *>(biases_);
O *output = static_cast<O *>(output_);
/*
// output H size
const std::size_t oxSize =
static_cast<std::size_t>(static_cast<float>(dims[0] - std::get<4>(attrs)[0] + std::get<0>(attrs)[0]) /
static_cast<float>(std::get<0>(attrs)[0]));
// output W size
const std::size_t oySize =
static_cast<std::size_t>(static_cast<float>(dims[1] - std::get<4>(attrs)[1] + std::get<0>(attrs)[1]) /
static_cast<float>(std::get<0>(attrs)[1]));
// TODO: kernel computation
// output (Xout, Yout, outCh, batch)
// input (Xin, Yin, inCh, batch)
// weight (kernelX, kernelY, inCh, outCh)
// does not take Dilation attribute into account
for (std::size_t ox = 0; ox < oxSize; ++ox) {
for (std::size_t oy = 0; oy < oySize; ++oy) {
const std::size_t ix = ox * std::get<0>(attrs)[0];
const std::size_t iy = oy * std::get<0>(attrs)[1];
for (std::size_t outCh = 0; outCh < std::get<3>(attrs); ++outCh) {
const std::size_t oIndex = dims[3] * (outCh + std::get<3>(attrs) * (oy + oySize * ox));
B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
for (std::size_t batch = 0; batch < dims[3]; ++batch) {
output[oIndex + batch] = biasVal;
}
for (std::size_t inCh = 0; inCh < dims[2]; ++inCh) {
for (std::size_t sx = 0; sx < std::get<4>(attrs)[0]; ++sx) {
for (std::size_t sy = 0; sy < std::get<4>(attrs)[1]; ++sy) {
const std::size_t wIndex =
outCh + std::get<3>(attrs) * (inCh + dims[2] * (sy + std::get<4>(attrs)[1] * sx));
std::size_t iIndex = dims[3] * (inCh + dims[2] * ((iy + sy) + dims[1] * (ix + sx)));
for (std::size_t batch = 0; batch < dims[3]; ++batch) {
output[oIndex + batch] += weights[wIndex] * input[iIndex + batch];
}
}
}
}
}
}
}
*/
// output H size
const std::size_t oxSize =
static_cast<std::size_t>(std::floor(static_cast<float>(dims[2] - std::get<4>(attrs)[0] + std::get<0>(attrs)[0]) /
static_cast<float>(std::get<0>(attrs)[0])));
// output W size
const std::size_t oySize =
static_cast<std::size_t>(std::floor(static_cast<float>(dims[3] - std::get<4>(attrs)[1] + std::get<0>(attrs)[1]) /
static_cast<float>(std::get<0>(attrs)[1])));
// TODO: kernel computation
// output (batch, outCh, Xout, Yout)
// input (batch, inCh, Xin, Yin)
// weight (outCh, inCh, kernelX, kernelY)
// does not take Dilation attribute into account
using signedsize = std::make_signed<std::size_t>::type;
for (std::size_t batch = 0; batch < dims[0]; ++batch) {
for (std::size_t outCh = 0; outCh < std::get<3>(attrs); ++outCh) {
const std::size_t oIndex = (outCh + batch*std::get<3>(attrs)) * oxSize * oySize;
B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
std::fill(output + oIndex, output+(oIndex+oxSize*oySize), biasVal);
for (std::size_t inCh = 0; inCh < dims[1]; ++inCh) {
const std::size_t iIndex = (inCh + batch*dims[1]) * dims[2] * dims[3];
const std::size_t wIndex = (inCh + outCh*dims[1]) * std::get<4>(attrs)[0] * std::get<4>(attrs)[1];
for (std::size_t ox = 0; ox < oxSize; ++ox) {
const signedsize difx = static_cast<signedsize>(- ox * std::get<0>(attrs)[0]);
const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
const std::size_t sxMax = (static_cast<signedsize>(dims[2]) + difx) < 0 ? 0 : ((dims[2] + difx) > std::get<4>(attrs)[0] ? std::get<4>(attrs)[0] : dims[2] + difx);
for (std::size_t oy = 0; oy < oySize; ++oy) {
const signedsize dify = static_cast<signedsize>(- oy * std::get<0>(attrs)[1]);
const std::size_t syMin = static_cast<std::size_t>(std::max(dify, signedsize(0)));
const std::size_t syMax = (static_cast<signedsize>(dims[3]) + dify) < 0 ? 0 : ((dims[3] + dify) > std::get<4>(attrs)[1] ? std::get<4>(attrs)[1] : dims[3] + dify);
const std::size_t oIndexFull = oIndex + ox*oySize + oy;
const signedsize ix = static_cast<signedsize>(ox * std::get<0>(attrs)[0]);
const signedsize iy = static_cast<signedsize>(oy * std::get<0>(attrs)[1]);
if (sxMin == 0 && syMin == 0 && sxMax == 3 && syMax == 3) {
output[oIndexFull] += (weights[wIndex + 0*std::get<4>(attrs)[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+0)*dims[3] + static_cast<std::size_t>(iy+0)] +
weights[wIndex + 0*std::get<4>(attrs)[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+0)*dims[3] + static_cast<std::size_t>(iy+1)] +
weights[wIndex + 0*std::get<4>(attrs)[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+0)*dims[3] + static_cast<std::size_t>(iy+2)] +
weights[wIndex + 1*std::get<4>(attrs)[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+1)*dims[3] + static_cast<std::size_t>(iy+0)] +
weights[wIndex + 1*std::get<4>(attrs)[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+1)*dims[3] + static_cast<std::size_t>(iy+1)] +
weights[wIndex + 1*std::get<4>(attrs)[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+1)*dims[3] + static_cast<std::size_t>(iy+2)] +
weights[wIndex + 2*std::get<4>(attrs)[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+2)*dims[3] + static_cast<std::size_t>(iy+0)] +
weights[wIndex + 2*std::get<4>(attrs)[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+2)*dims[3] + static_cast<std::size_t>(iy+1)] +
weights[wIndex + 2*std::get<4>(attrs)[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+2)*dims[3] + static_cast<std::size_t>(iy+2)]);
} else {
for (std::size_t sx = sxMin; sx < sxMax; ++sx) {
for (std::size_t sy = syMin; sy < syMax; ++sy) {
output[oIndexFull] += weights[wIndex + sx*std::get<4>(attrs)[1] + sy] *
input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx))*dims[3] + static_cast<std::size_t>(iy+static_cast<signedsize>(sy))];
}
}
}
}
}
}
}
}
}
namespace {
static Registrar<ConvImpl2DForward_cpu> registrarConvImpl2DForward_cpu_Float32(
{DataType::Float32, DataType::Float32, DataType::Float32, DataType::Float32},
Aidge::ConvImpl2D_cpu_forward_kernel<float, float, float, float>);
static Registrar<ConvImpl2DForward_cpu> registrarConvImpl2DForward_cpu_Float16(
{DataType::Float16, DataType::Float16, DataType::Float16, DataType::Float16},
Aidge::ConvImpl2D_cpu_forward_kernel<half_float::half, half_float::half, half_float::half, half_float::half>);
static Registrar<ConvImpl2DForward_cpu> registrarConvImpl2DForward_cpu_Int32(
{DataType::Int32, DataType::Int32, DataType::Int32, DataType::Int32},
Aidge::ConvImpl2D_cpu_forward_kernel<int, int, int, int>);
static Registrar<ConvImpl2DForward_cpu> registrarConvImpl2DForward_cpu_Float64(
{DataType::Float64, DataType::Float64, DataType::Float64, DataType::Float64},
Aidge::ConvImpl2D_cpu_forward_kernel<double, double, double, double>);
} // namespace
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_CONVIMPL_FORWARD_KERNEL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_CONVIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_CONVIMPL_KERNELS_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/backend/cpu/operator/ConvImpl.hpp"
#include "aidge/operator/Conv.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
namespace Aidge {
/**
* @brief Forward kernel for 1D Convolution on CPU backend.
* @tparam I Input data type.
* @tparam W Weight data type.
* @tparam B Bias data type.
* @tparam O Output data type.
* @param strideDims Array of stride dimensions.
* @param dilationDims Array of dilation dimensions.
* @param kernelDims Array of kernel dimensions.
* @param inputDims Array of input dimensions.
* @param outChannels Number of output channels.
* @param input_ const input Tensor.
* @param weights_ const weight Tensor.
* @param biases_ const bias Tensor.
* @param output_ Output Tensor.
*/
template <class I, class W, class B, class O>
void ConvImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& strideDims,
const std::array<DimSize_t, 1>& dilationDims,
const std::array<DimSize_t, 1>& kernelDims,
const std::array<DimSize_t, 3>& inputDims,
DimSize_t outChannels,
const void *input_,
const void *weights_,
const void *biases_,
void *output_)
{
// FIXME: missing convolution attributes as arguments
const I *input = static_cast<const I *>(input_);
const W *weights = static_cast<const W *>(weights_);
const B *biases = static_cast<const B *>(biases_);
O *output = static_cast<O *>(output_);
// output H size
const std::size_t oxSize =
static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilationDims[0]*(kernelDims[0] - 1) - 1 + strideDims[0]) /
static_cast<float>(strideDims[0])));
const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1;
// Data layout:
// output (batch, outCh, Xout)
// input (batch, inCh, Xin)
// weight (outCh, inCh, kernelX)
using signedsize = std::make_signed<std::size_t>::type;
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
const std::size_t oIndex = (outCh + batch*outChannels) * oxSize;
// If bias = nullptr, set B(0)
B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
std::fill(output + oIndex, output+(oIndex+oxSize), biasVal);
for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
const std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2];
const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0];
for (std::size_t ox = 0; ox < oxSize; ++ox) {
// const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]);
// const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
// const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? kernelDims[0] : inputDims[2] + difx);
const std::size_t sxMin = 0;
const std::size_t sxMax = dilated_kernel_x;
const std::size_t oIndexFull = oIndex + ox;
const signedsize ix = static_cast<signedsize>(ox * strideDims[0]);
for (std::size_t sx = sxMin; sx*dilationDims[0] < sxMax; ++sx) {
output[oIndexFull] += weights[wIndex + sx] *
input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx*dilationDims[0]))];
}
}
}
}
}
}
// Kernels registration to implementation entry point
REGISTRAR(ConvImpl1D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvImpl1D_cpu_forward_kernel<float, float, float, float>, nullptr});
REGISTRAR(ConvImpl1D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float16, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvImpl1D_cpu_forward_kernel<half_float::half, half_float::half, half_float::half, half_float::half>, nullptr});
REGISTRAR(ConvImpl1D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvImpl1D_cpu_forward_kernel<int32_t, int32_t, int32_t, int32_t>, nullptr});
REGISTRAR(ConvImpl1D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvImpl1D_cpu_forward_kernel<double, double, double, double>, nullptr});
/**
* @brief Forward kernel for 2D Convolution on CPU backend.
* @tparam I Input data type.
* @tparam W Weight data type.
* @tparam B Bias data type.
* @tparam O Output data type.
* @param strideDims Array of stride dimensions.
* @param dilationDims Array of dilation dimensions.
* @param kernelDims Array of kernel dimensions.
* @param inputDims Array of input dimensions.
* @param outChannels Number of output channels.
* @param input_ const input Tensor.
* @param weights_ const weight Tensor.
* @param biases_ const bias Tensor.
* @param output_ Output Tensor.
*/
template <class I, class W, class B, class O>
void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
const std::array<DimSize_t, 2>& dilationDims,
const std::array<DimSize_t, 2>& kernelDims,
const std::array<DimSize_t, 4> &inputDims,
DimSize_t outChannels,
const void *input_,
const void *weights_,
const void *biases_,
void *output_)
{
// FIXME: missing convolution attributes as arguments
const I *input = static_cast<const I *>(input_);
const W *weights = static_cast<const W *>(weights_);
const B *biases = static_cast<const B *>(biases_);
O *output = static_cast<O *>(output_);
// output H size
const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1;
const std::size_t oxSize =
static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilated_kernel_x + strideDims[0]) /
static_cast<float>(strideDims[0])));
// output W size
const DimSize_t dilated_kernel_y = dilationDims[1]*(kernelDims[1] - 1) + 1;
const std::size_t oySize =
static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[3] - dilated_kernel_y + strideDims[1]) /
static_cast<float>(strideDims[1])));
// Data layout:
// output (batch, outCh, Xout, Yout)
// input (batch, inCh, Xin, Yin)
// weight (outCh, inCh, kernelX, kernelY)
const std::size_t outChannels_s = oxSize * oySize;
using signedsize = std::make_signed<std::size_t>::type;
if (dilated_kernel_x == 3 && dilated_kernel_y == 3) {
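// Fast path when the dilated kernel extent is 3x3: the window accumulation is fully unrolled, with a further specialization for unit strides.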
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
// If bias = nullptr, set B(0)
B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
std::fill(output, output+outChannels_s, biasVal);
for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
const std::size_t wIndex = (inCh + outCh*inputDims[1]) * 9;
if (strideDims[0] == 1 && strideDims[1]==1) {
for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=inputDims[3]) {
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+1]+weights[wIndex+2]*input[iIndex+oy+2];
}
iIndex+=inputDims[3];
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy]+weights[wIndex+4]*input[iIndex+oy+1]+weights[wIndex+5]*input[iIndex+oy+2];
}
iIndex+=inputDims[3];
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy]+weights[wIndex+7]*input[iIndex+oy+1]+weights[wIndex+8]*input[iIndex+oy+2];
}
}
} else {
for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=strideDims[0]*inputDims[3]) {
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+strideDims[0]]+weights[wIndex+2]*input[iIndex+oy+strideDims[0]*2];
}
iIndex+=strideDims[0]*inputDims[3];
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy]+weights[wIndex+4]*input[iIndex+oy+strideDims[0]]+weights[wIndex+5]*input[iIndex+oy+strideDims[0]*2];
}
iIndex+=strideDims[0]*inputDims[3];
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy]+weights[wIndex+7]*input[iIndex+oy+strideDims[0]]+weights[wIndex+8]*input[iIndex+oy+strideDims[0]*2];
}
}
}
}
output += outChannels_s;
}
}
} else if (dilated_kernel_x == 1 && dilated_kernel_y == 1) {
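// 1x1 kernel: the convolution reduces to a per-pixel weighted sum over input channels.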
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
// If bias = nullptr, set B(0)
B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
std::fill(output, output+outChannels_s, biasVal);
for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
const std::size_t wIndex = (inCh + outCh*inputDims[1]);
if (strideDims[0] == 1 && strideDims[1] == 1) {
for (std::size_t oIndex = 0; oIndex < oxSize*oySize; ++oIndex, ++iIndex) {
output[oIndex] += weights[wIndex] * input[iIndex];
}
} else {
for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=inputDims[3]*strideDims[0]) {
for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) {
output[oIndex + oy] += weights[wIndex+0]*input[iIndex+iy];
}
}
}
}
output += outChannels_s;
}
}
} else {
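// General path: arbitrary kernel size, stride and dilation.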
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
// If bias = nullptr, set B(0)
B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
std::fill(output, output+outChannels_s, biasVal);
for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
std::size_t iIndex_channel = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0] * kernelDims[1];
// loop over each output line
for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex_channel+=inputDims[3]*strideDims[0]) {
// loop over associated input line
for (std::size_t ky = 0, ix = 0; ky < kernelDims[0]; ++ky, ix += inputDims[3]*dilationDims[0]) {
// loop over the entire line
for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) {
const std::size_t iIndex = iIndex_channel + ix + iy;
// loop over elements associated with one output
for (std::size_t kx = 0; kx < kernelDims[1]; ++kx) {
output[oIndex + oy] += weights[wIndex+kernelDims[1]*ky+kx]*input[iIndex+kx*dilationDims[1]];
}
}
}
}
}
output += outChannels_s;
}
}
}
}
// Kernels registration to implementation entry point
REGISTRAR(ConvImpl2D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvImpl2D_cpu_forward_kernel<float, float, float, float>, nullptr});
REGISTRAR(ConvImpl2D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float16, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvImpl2D_cpu_forward_kernel<half_float::half, half_float::half, half_float::half, half_float::half>, nullptr});
REGISTRAR(ConvImpl2D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvImpl2D_cpu_forward_kernel<int32_t, int32_t, int32_t, int32_t>, nullptr});
REGISTRAR(ConvImpl2D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvImpl2D_cpu_forward_kernel<double, double, double, double>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_CONVIMPL_KERNELS_H_ */
@@ -16,38 +16,18 @@
#include <tuple>
#include <vector>
#include "aidge/backend/OperatorImpl.hpp"
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Div.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
// Operator implementation entry point for the backend
using DivImpl_cpu = OperatorImpl_cpu<Div_Op,
void(const std::size_t, const std::size_t, const std::size_t, const void*, const void*,void*)>;
// compute kernel registry for forward and backward
class DivImplForward_cpu
// : public Registrable<DivImplForward_cpu, std::tuple<DataType, DataType, DataType>, void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*,void*)> {
: public Registrable<DivImplForward_cpu, std::tuple<DataType, DataType, DataType>, void(const std::size_t, const std::size_t, const std::size_t, const void*, const void*,void*)> {
};
class DivImplBackward_cpu
: public Registrable<DivImplBackward_cpu, std::tuple<DataType, DataType, DataType>, void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*, void*)> {
};
class DivImpl_cpu : public OperatorImpl {
public:
DivImpl_cpu(const Div_Op& op) : OperatorImpl(op) {}
static std::unique_ptr<DivImpl_cpu> create(const Div_Op& op) {
return std::make_unique<DivImpl_cpu>(op);
}
NbElts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
void forward() override final;
};
namespace {
static Registrar<Div_Op> registrarDivImpl_cpu("cpu", Aidge::DivImpl_cpu::create);
}
// Implementation entry point registration to Operator
REGISTRAR(Div_Op, "cpu", Aidge::DivImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_DIVIMPL_H_ */
@@ -9,11 +9,12 @@
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_DIVIMPL_FORWARD_KERNEL_H_
#define AIDGE_CPU_OPERATOR_DIVIMPL_FORWARD_KERNEL_H_
#ifndef AIDGE_CPU_OPERATOR_DIVIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_DIVIMPL_KERNELS_H_
#include <numeric> // std::accumulate
#include <cstddef> // std::size_t
#include <cstdint> // std::int32_t, std::int64_t
#include <functional> // std::multiplies
#include "aidge/utils/Registrar.hpp"
@@ -68,19 +69,16 @@ constexpr void DivImpl_cpu_forward_kernel(const std::size_t input1size_,
}
}
namespace {
static Registrar<DivImplForward_cpu> registrarDivImplForward_cpu_Float32(
{DataType::Float32, DataType::Float32, DataType::Float32},
Aidge::DivImpl_cpu_forward_kernel<float, float, float>);
static Registrar<DivImplForward_cpu> registrarDivImplForward_cpu_Int32(
{DataType::Int32, DataType::Int32, DataType::Int32},
Aidge::DivImpl_cpu_forward_kernel<int, int, int>);
static Registrar<DivImplForward_cpu> registrarDivImplForward_cpu_Float64(
{DataType::Float64, DataType::Float64, DataType::Float64},
Aidge::DivImpl_cpu_forward_kernel<double, double, double>);
} // namespace
// Kernels registration to implementation entry point
REGISTRAR(DivImpl_cpu,
{DataType::Float32},
{ProdConso::inPlaceModel, Aidge::DivImpl_cpu_forward_kernel<float, float, float>, nullptr});
REGISTRAR(DivImpl_cpu,
{DataType::Float64},
{ProdConso::inPlaceModel, Aidge::DivImpl_cpu_forward_kernel<double, double, double>, nullptr});
REGISTRAR(DivImpl_cpu,
{DataType::Int32},
{ProdConso::inPlaceModel, Aidge::DivImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_DIVIMPL_FORWARD_KERNEL_H_ */
#endif /* AIDGE_CPU_OPERATOR_DIVIMPL_KERNELS_H_ */
@@ -12,7 +12,7 @@
#ifndef AIDGE_CPU_OPERATOR_ERFIMPL_H_
#define AIDGE_CPU_OPERATOR_ERFIMPL_H_
#include "aidge/backend/OperatorImpl.hpp"
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Erf.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
@@ -20,31 +20,12 @@
#include <vector>
namespace Aidge {
// class Erf_Op;
// Operator implementation entry point for the backend
using ErfImpl_cpu = OperatorImpl_cpu<Erf_Op,
void(const std::size_t, const void*, void*)>;
// compute kernel registry for forward and backward
class ErfImplForward_cpu
: public Registrable<ErfImplForward_cpu, std::tuple<DataType, DataType>, void(const std::size_t, const void*, void*)> {
};
class ErfImplBackward_cpu
: public Registrable<ErfImplBackward_cpu, std::tuple<DataType, DataType>, void(const std::size_t, const void*, void*)> {
};
class ErfImpl_cpu : public OperatorImpl {
public:
ErfImpl_cpu(const Erf_Op& op) : OperatorImpl(op) {}
static std::unique_ptr<ErfImpl_cpu> create(const Erf_Op& op) {
return std::make_unique<ErfImpl_cpu>(op);
}
NbElts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
void forward() override;
};
namespace {
static Registrar<Erf_Op> registrarErfImpl_cpu("cpu", Aidge::ErfImpl_cpu::create);
}
// Implementation entry point registration to Operator
REGISTRAR(Erf_Op, "cpu", Aidge::ErfImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_ERFIMPL_H_ */
@@ -9,8 +9,8 @@
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_ERFIMPL_FORWARD_KERNEL_H_
#define AIDGE_CPU_OPERATOR_ERFIMPL_FORWARD_KERNEL_H_
#ifndef AIDGE_CPU_OPERATOR_ERFIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_ERFIMPL_KERNELS_H_
#include <cmath>
@@ -32,14 +32,16 @@ void ErfImpl_cpu_forward_kernel(std::size_t inputLenght,
}
}
namespace {
static Registrar<ErfImplForward_cpu> registrarErfImplForward_cpu_Float32(
{DataType::Float32, DataType::Float32}, Aidge::ErfImpl_cpu_forward_kernel<float, float>);
static Registrar<ErfImplForward_cpu> registrarErfImplForward_cpu_Int32(
{DataType::Int32, DataType::Int32}, Aidge::ErfImpl_cpu_forward_kernel<int, int>);
static Registrar<ErfImplForward_cpu> registrarErfImplForward_cpu_Float64(
{DataType::Float64, DataType::Float64}, Aidge::ErfImpl_cpu_forward_kernel<double, double>);
} // namespace
// Kernels registration to implementation entry point
REGISTRAR(ErfImpl_cpu,
{DataType::Float32},
{ProdConso::inPlaceModel, Aidge::ErfImpl_cpu_forward_kernel<float, float>, nullptr});
REGISTRAR(ErfImpl_cpu,
{DataType::Float64},
{ProdConso::inPlaceModel, Aidge::ErfImpl_cpu_forward_kernel<double, double>, nullptr});
REGISTRAR(ErfImpl_cpu,
{DataType::Int32},
{ProdConso::inPlaceModel, Aidge::ErfImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_ERFIMPL_FORWARD_KERNEL_H_ */
#endif /* AIDGE_CPU_OPERATOR_ERFIMPL_KERNELS_H_ */
@@ -12,42 +12,37 @@
#ifndef AIDGE_CPU_OPERATOR_FCIMPL_H_
#define AIDGE_CPU_OPERATOR_FCIMPL_H_
#include "aidge/backend/OperatorImpl.hpp"
#include <array>
#include <memory>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/FC.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include <memory>
#include <vector>
#include <array>
namespace Aidge {
// class FC_Op;
// compute kernel registry for forward and backward
class FCImplForward_cpu : public Registrable<FCImplForward_cpu,
std::tuple<DataType, DataType, DataType, DataType>,
void(const FC_Op::Attrs &, const DimSize_t, const DimSize_t,
const void *, const void *, const void *, void *)> {};
class FCImplBackward_cpu : public Registrable<FCImplBackward_cpu,
std::tuple<DataType, DataType, DataType, DataType>,
void(const FC_Op::Attrs &, const DimSize_t, const DimSize_t,
const void *, const void *, const void *, void *)> {};
class FCImpl_cpu : public OperatorImpl {
public:
FCImpl_cpu(const FC_Op &op) : OperatorImpl(op) {}
static std::unique_ptr<FCImpl_cpu> create(const FC_Op &op) {
return std::make_unique<FCImpl_cpu>(op);
}
void forward() override;
};
namespace {
static Registrar<FC_Op> registrarFCImpl_cpu("cpu", Aidge::FCImpl_cpu::create);
}
// Operator implementation entry point for the backend
using FCImpl_cpu = OperatorImpl_cpu<FC_Op,
void(const DimSize_t,
const DimSize_t,
const DimSize_t,
const void *,
const void *,
const void *,
void *),
void(const DimSize_t,
const DimSize_t,
const DimSize_t,
const void *,
const void *,
const void *,
void *,
void *,
void *)>;
// Implementation entry point registration to Operator
REGISTRAR(FC_Op, "cpu", Aidge::FCImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_FCIMPL_H_ */
@@ -9,13 +9,13 @@
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_FCIMPL_FORWARD_KERNEL_H_
#define AIDGE_CPU_OPERATOR_FCIMPL_FORWARD_KERNEL_H_
#ifndef AIDGE_CPU_OPERATOR_FCIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_FCIMPL_KERNELS_H_
#include "aidge/utils/Registrar.hpp"
#include <algorithm>
#include "aidge/backend/cpu/operator/FCImpl.hpp"
#include "aidge/utils/Registrar.hpp"
namespace Aidge {
// template <class I, class W, class B, class O>
@@ -27,9 +27,9 @@ namespace Aidge {
// const B* biases = static_cast<const B*>(biases_);
// O* output = static_cast<O*>(output_);
// for (std::size_t outIdx = 0; outIdx < std::get<0>(attrs); ++outIdx) {
// for (std::size_t outIdx = 0; outIdx < outputFeatureSize; ++outIdx) {
// std::size_t oIndex = outIdx * dims[3];
// const B bias = std::get<1>(attrs) ? B(0) : biases[outIdx];
// const B bias = std::get<0>(attrs) ? B(0) : biases[outIdx];
// for (std::size_t batch = 0; batch < dims[3]; ++batch) {
// output[oIndex + batch] = bias;
// }
@@ -39,10 +39,10 @@ namespace Aidge {
// for (std::size_t iy = 0; iy < dims[1]; ++iy) {
// for (std::size_t inCh = 0; inCh < dims[2]; ++inCh) {
// const std::size_t iIndex = dims[3] * (inCh + dims[2] * (iy + dims[1] * ix));
// for (std::size_t outCh = 0; outCh < std::get<0>(attrs); ++outCh) {
// for (std::size_t outCh = 0; outCh < outputFeatureSize; ++outCh) {
// const std::size_t oIndex = dims[3] * outCh;
// const std::size_t wIndex = (inCh + dims[2] * (iy + dims[1] * ix)) * std::get<0>(attrs) +
// outCh; // (iIndex*std::get<0>(attrs) + oIndex)/dims[3];
// const std::size_t wIndex = (inCh + dims[2] * (iy + dims[1] * ix)) * outputFeatureSize +
// outCh; // (iIndex*outputFeatureSize + oIndex)/dims[3];
// for (std::size_t batch = 0; batch < dims[3]; ++batch) {
// output[oIndex + batch] += weights[wIndex] * input[iIndex + batch];
// }
@@ -63,9 +63,9 @@ namespace Aidge {
// // let's have I.dims() = [N, C, H, W] instead of [H, W, C, N]
// for (std::size_t outIdx = 0; outIdx < std::get<0>(attrs); ++outIdx) {
// for (std::size_t outIdx = 0; outIdx < outputFeatureSize; ++outIdx) {
// std::size_t oIndex = outIdx * dims[0];
// const B bias = std::get<1>(attrs) ? B(0) : biases[outIdx];
// const B bias = std::get<0>(attrs) ? B(0) : biases[outIdx];
// for (std::size_t batch = 0; batch < dims[0]; ++batch) {
// output[oIndex + batch] = bias;
// }
@@ -74,8 +74,8 @@ namespace Aidge {
// for (std::size_t batch = 0; batch < dims[0]; ++batch) {
// const std::size_t oIndex = dims[1] * batch;
// for (std::size_t i = 0; i < dims[1]; ++i) {
// for (std::size_t outCh = 0; outCh < std::get<0>(attrs); ++outCh) {
// std::size_t wIndex = i * std::get<0>(attrs) + outCh; // (iIndex*std::get<0>(attrs) + oIndex)/dims[3];
// for (std::size_t outCh = 0; outCh < outputFeatureSize; ++outCh) {
// std::size_t wIndex = i * outputFeatureSize + outCh; // (iIndex*outputFeatureSize + oIndex)/dims[3];
// output[oIndex + outCh] += weights[wIndex] * input[i + batch];
// }
// }
@@ -83,46 +83,104 @@ namespace Aidge {
// }
template <class I, class W, class B, class O>
void FCImpl_cpu_forward_kernel(const FC_Op::Attrs& attrs, const DimSize_t batchSize, const DimSize_t oneInputSize,
const void* input_, const void* weights_, const void* biases_, void* output_) {
void FCImpl_cpu_forward_kernel(const DimSize_t batchSize,
const DimSize_t inputFeatureSize,
const DimSize_t outputFeatureSize,
const void* input_,
const void* weights_,
const void* biases_,
void* output_) {
// FIXME: missing FC attributes as arguments
const I* input = static_cast<const I*>(input_);
const W* weights = static_cast<const W*>(weights_);
const B* biases = static_cast<const B*>(biases_);
O* output = static_cast<O*>(output_);
if (std::get<1>(attrs)) {
std::fill(output, output+(batchSize*std::get<0>(attrs)), B(0));
if (biases == nullptr) {
std::fill(output, output+(batchSize*outputFeatureSize), B(0));
}
else {
for (std::size_t batch = 0; batch < batchSize; ++batch) {
std::copy(biases, biases+std::get<0>(attrs), output+(batch*std::get<0>(attrs)));
std::copy(biases, biases+outputFeatureSize, output+(batch*outputFeatureSize));
}
}
for (std::size_t batch = 0; batch < batchSize; ++batch) {
for (std::size_t out = 0; out < std::get<0>(attrs); ++out) {
output[out + batch*std::get<0>(attrs)] = std::inner_product(input + batch*oneInputSize,
input + (batch + 1)*oneInputSize,
weights + out*oneInputSize,
output[out + batch*std::get<0>(attrs)]);
for (std::size_t out = 0; out < outputFeatureSize; ++out) {
output[out + batch*outputFeatureSize] = std::inner_product(input + batch*inputFeatureSize,
input + (batch + 1)*inputFeatureSize,
weights + out*inputFeatureSize,
output[out + batch*outputFeatureSize]);
}
}
}
template <class I, class O, class W, class B>
void FCImpl_cpu_backward_kernel(const DimSize_t batchSize,
const DimSize_t inputFeatureSize,
const DimSize_t outputFeatureSize,
const void* input_,
const void* originalInput_,
const void* weight_,
void* output_,
void* weightGrad_,
void* biasesGrad_)
{
// FIXME: missing FC attributes as arguments
const I* input = static_cast<const I*>(input_);
const I* originalInput = static_cast<const I*>(originalInput_);
const W* weight = static_cast<const W*>(weight_);
O* output = static_cast<O*>(output_);
W* weightGrad = static_cast<W*>(weightGrad_);
B* biasesGrad = static_cast<B*>(biasesGrad_);
// bias grad
if (biasesGrad != nullptr) { // bias gradient requested
for (std::size_t o = 0; o < outputFeatureSize; ++o) { // nb outputs
B sum{0};
for (std::size_t b = 0; b < batchSize; ++b) {
sum += input[b*outputFeatureSize + o];
}
biasesGrad[o] = sum;
}
}
namespace {
static Registrar<FCImplForward_cpu> registrarFCImpl2DForward_cpu_Float32(
{DataType::Float32, DataType::Float32, DataType::Float32, DataType::Float32},
Aidge::FCImpl_cpu_forward_kernel<float, float, float, float>);
static Registrar<FCImplForward_cpu> registrarFCImpl2DForward_cpu_Int32(
{DataType::Int32, DataType::Int32, DataType::Int32, DataType::Int32},
Aidge::FCImpl_cpu_forward_kernel<int, int, int, int>);
static Registrar<FCImplForward_cpu> registrarFCImpl2DForward_cpu_Float64(
{DataType::Float64, DataType::Float64, DataType::Float64, DataType::Float64},
Aidge::FCImpl_cpu_forward_kernel<double, double, double, double>);
} // namespace
// weight grad
for (std::size_t o = 0; o < outputFeatureSize; ++o) {
for (std::size_t c = 0; c < inputFeatureSize; ++c) {
W sum{0};
for (std::size_t b = 0; b < batchSize; ++b) {
sum += originalInput[b*inputFeatureSize + c]*input[b*outputFeatureSize + o];
}
weightGrad[o*inputFeatureSize + c] = sum;
}
}
// input grad
for (std::size_t b = 0; b < batchSize; ++b) {
for (std::size_t c = 0; c < inputFeatureSize; ++c) {
O sum{0};
for (std::size_t o = 0; o < outputFeatureSize; ++o) {
sum += weight[o*inputFeatureSize + c] * input[b*outputFeatureSize + o];
}
output[b*inputFeatureSize + c] = sum;
}
}
}
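// Worked example (sketch, not library code): with batchSize = 1,
// inputFeatureSize = 2, outputFeatureSize = 1, weight = {w0, w1},
// originalInput = {x0, x1} and incoming gradient input = {g}, the
// backward kernel above yields:
//   biasesGrad = {g}
//   weightGrad = {g*x0, g*x1}
//   output     = {g*w0, g*w1}   (gradient w.r.t. the FC input)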
// Kernels registration to implementation entry point
REGISTRAR(FCImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float32}},
{ProdConso::defaultModel, Aidge::FCImpl_cpu_forward_kernel<float, float, float, float>, Aidge::FCImpl_cpu_backward_kernel<float, float, float, float>});
REGISTRAR(FCImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}},
{ProdConso::defaultModel, Aidge::FCImpl_cpu_forward_kernel<double, double, double, double>, Aidge::FCImpl_cpu_backward_kernel<double, double, double, double>});
REGISTRAR(FCImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}},
{ProdConso::defaultModel, Aidge::FCImpl_cpu_forward_kernel<int32_t, int32_t, int32_t, int32_t>, Aidge::FCImpl_cpu_backward_kernel<int32_t, int32_t, int32_t, int32_t>});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_FCIMPL_FORWARD_KERNEL_H_ */
#endif /* AIDGE_CPU_OPERATOR_FCIMPL_KERNELS_H_ */
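For reference, a minimal usage sketch of the forward kernel above, calling it directly on raw buffers with weights laid out as [outputFeatureSize, inputFeatureSize]. The include path is an assumption (the renamed kernels header is presumed to be exposed as FCImpl_kernels.hpp); everything else follows the signature shown above.

#include <array>
#include <cstddef>
#include <iostream>
#include "aidge/backend/cpu/operator/FCImpl_kernels.hpp"  // assumed header name

int main() {
    constexpr std::size_t batch = 2, inF = 3, outF = 2;
    std::array<float, batch * inF> input{1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
    std::array<float, outF * inF> weights{1.f, 0.f, 0.f,   // output feature 0
                                          0.f, 1.f, 0.f};  // output feature 1
    std::array<float, outF> biases{0.5f, -0.5f};
    std::array<float, batch * outF> output{};

    // output = input . weights^T + bias, computed per batch
    Aidge::FCImpl_cpu_forward_kernel<float, float, float, float>(
        batch, inF, outF, input.data(), weights.data(), biases.data(), output.data());

    for (float v : output) std::cout << v << ' ';  // expected: 1.5 1.5 4.5 4.5
    return 0;
}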
@@ -9,42 +9,34 @@
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_RESHAPEIMPL_H_
#define AIDGE_CPU_OPERATOR_RESHAPEIMPL_H_
#ifndef AIDGE_CPU_OPERATOR_FOLDIMPL_H_
#define AIDGE_CPU_OPERATOR_FOLDIMPL_H_
#include "aidge/backend/OperatorImpl.hpp"
#include "aidge/operator/Reshape.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include <array>
#include <memory>
#include <tuple>
#include <vector>
namespace Aidge {
// class Reshape_Op;
// compute kernel registry for forward and backward
class ReshapeImplForward_cpu
: public Registrable<ReshapeImplForward_cpu, std::tuple<DataType, DataType>, void(std::size_t, const void*, void*)> {
};
class ReshapeImplBackward_cpu
: public Registrable<ReshapeImplBackward_cpu, std::tuple<DataType, DataType>, void(std::size_t, const void*, void*)> {
};
class ReshapeImpl_cpu : public OperatorImpl {
public:
ReshapeImpl_cpu(const Reshape_Op& op) : OperatorImpl(op) {}
static std::unique_ptr<ReshapeImpl_cpu> create(const Reshape_Op& op) {
return std::make_unique<ReshapeImpl_cpu>(op);
}
NbElts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
void forward() override;
};
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Fold.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
namespace {
static Registrar<Reshape_Op> registrarReshapeImpl_cpu("cpu", Aidge::ReshapeImpl_cpu::create);
}
namespace Aidge {
// Operator implementation entry point for the backend
using Fold2D_Op = Fold_Op<2>;
using FoldImpl2D_cpu = OperatorImpl_cpu<Fold_Op<2>,
void(const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 2>&,
const std::vector<DimSize_t> &,
const void *,
void *)>;
// Implementation entry point registration to Operator
REGISTRAR(Fold2D_Op, "cpu", Aidge::FoldImpl2D_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_RESHAPEIMPL_H_ */
#endif /* AIDGE_CPU_OPERATOR_FOLDIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_FOLDIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_FOLDIMPL_KERNELS_H_
#include "aidge/utils/Registrar.hpp"
#include "aidge/backend/cpu/operator/FoldImpl.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include <cmath>
#include <array>
#include <algorithm>
namespace Aidge {
template <class I, class O>
void FoldImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& outputDims,
const std::array<DimSize_t, 2>& strideDims,
const std::array<DimSize_t, 2>& dilationDims,
const std::array<DimSize_t, 2>& kernelDims,
const std::vector<DimSize_t> &dims,
const void *input_, void *output_)
{
const I *input = static_cast<const I *>(input_);
O *output = static_cast<O *>(output_);
const DimSize_t inHeight = outputDims[0];
const DimSize_t inWidth = outputDims[1];
const DimSize_t kernelExtentHeight = dilationDims[0] *
(kernelDims[0] - 1) + 1;
const DimSize_t outHeight = 1 + static_cast<DimSize_t>(
floor(static_cast<float>(inHeight - kernelExtentHeight) /
static_cast<float>(strideDims[0])));
const DimSize_t kernelExtentWidth = dilationDims[1] *
(kernelDims[1] - 1) + 1;
const DimSize_t outWidth = 1 + static_cast<DimSize_t>(
floor(static_cast<float>(inWidth - kernelExtentWidth) /
static_cast<float>(strideDims[1])));
const DimSize_t outChannels = dims[dims.size() - 2];
const DimSize_t inChannels = outChannels / kernelDims[0] / kernelDims[1];
// zero the full folded output (N * C * H * W) before accumulating patches
std::fill_n(output, dims[0] * inChannels * inHeight * inWidth, O(0));
for (DimSize_t n = 0; n < dims[0]; ++n) {
for (DimSize_t outC = 0; outC < outChannels; ++outC) {
const auto inOffsetW = outC % kernelDims[1];
const auto inOffsetH = (outC / kernelDims[1]) % kernelDims[0];
const auto inC = outC / kernelDims[0] / kernelDims[1];
for (DimSize_t outH = 0; outH < outHeight; ++outH) {
const auto inH = outH * strideDims[0] + inOffsetH * dilationDims[0];
for (DimSize_t outW = 0; outW < outWidth; ++outW) {
const auto inW = outW * strideDims[1] + inOffsetW * dilationDims[1];
output[((n * inChannels + inC) * inHeight + inH) * inWidth + inW] +=
input[((n * outChannels + outC) * outHeight + outH) * outWidth + outW];
}
}
}
}
}
// Kernels registration to implementation entry point
REGISTRAR(FoldImpl2D_cpu,
{DataType::Float32},
{ProdConso::defaultModel, Aidge::FoldImpl2D_cpu_forward_kernel<float, float>, nullptr});
REGISTRAR(FoldImpl2D_cpu,
{DataType::Float64},
{ProdConso::defaultModel, Aidge::FoldImpl2D_cpu_forward_kernel<double, double>, nullptr});
REGISTRAR(FoldImpl2D_cpu,
{DataType::Int32},
{ProdConso::defaultModel, Aidge::FoldImpl2D_cpu_forward_kernel<int32_t, int32_t>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_FOLDIMPL_KERNELS_H_ */
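A minimal, hypothetical usage sketch of the Fold kernel above: folding a single 2x2 patch back into a 2x2 image (col2im with a 2x2 kernel, stride 1, dilation 1). The include path is an assumption; overlapping positions, when they exist, are summed by the kernel.

#include <array>
#include <vector>
#include "aidge/backend/cpu/operator/FoldImpl_kernels.hpp"  // assumed header name

int main() {
    // Unfolded input: [N=1, C*kH*kW=4, L=1], i.e. one 2x2 patch {1, 2, 3, 4}
    std::array<float, 4> input{1.f, 2.f, 3.f, 4.f};
    std::array<float, 4> output{};  // folded result: [N=1, C=1, H=2, W=2]

    Aidge::FoldImpl2D_cpu_forward_kernel<float, float>(
        {2, 2},                                   // folded output spatial dims (H, W)
        {1, 1},                                   // strides
        {1, 1},                                   // dilations
        {2, 2},                                   // kernel dims
        std::vector<Aidge::DimSize_t>{1, 4, 1},   // unfolded input dims
        input.data(), output.data());
    // Expected output: {1, 2, 3, 4}, i.e. the image [[1, 2], [3, 4]]
    return 0;
}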
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_GATHERIMPL_FORWARD_KERNEL_H_
#define AIDGE_CPU_OPERATOR_GATHERIMPL_FORWARD_KERNEL_H_
#include "aidge/utils/Registrar.hpp"
#include <cstddef>
#include <cmath>
#include "aidge/data/Data.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/operator/GatherImpl.hpp"
namespace Aidge {
template <class I, class O>
void GatherImpl_cpu_forward_kernel(const typename Gather_Op::Attrs& attrs, const std::vector<DimSize_t>& inputDims, const void* input_, void* output_)
{
const I* input = static_cast<const I*>(input_);
O* output = static_cast<O*>(output_);
const std::size_t axisIdx = std::get<2>(attrs)>=0 ?
std::get<2>(attrs) :
static_cast<std::size_t>(std::get<2>(attrs)) + inputDims.size();
std::size_t postAxisElems = 1;
for (std::size_t i = axisIdx + 1; i < inputDims.size(); ++i) {
postAxisElems *= inputDims[i];
}
std::size_t preAxisElems = 1;
for (std::size_t i = 0; i < axisIdx; ++i) {
preAxisElems *= inputDims[i];
}
const std::vector<std::int64_t> indices = std::get<0>(attrs);
for (std::size_t i=0; i<preAxisElems; ++i)
{
for(std::size_t j=0; j<indices.size(); ++j)
{
const std::size_t idx = indices[j] >= 0 ? indices[j] : static_cast<std::size_t>(indices[j]) + inputDims[axisIdx];
const I* startPtr = std::next(input, i * postAxisElems * inputDims[axisIdx] + idx * postAxisElems);
std::copy_n(startPtr, postAxisElems, output);
output += postAxisElems;
}
}
}
namespace {
static Registrar<GatherImplForward_cpu> registrarGatherImplForward_cpu_Float32(
{DataType::Float32, DataType::Float32}, Aidge::GatherImpl_cpu_forward_kernel<float, float>);
static Registrar<GatherImplForward_cpu> registrarGatherImplForward_cpu_Int32(
{DataType::Int32, DataType::Int32}, Aidge::GatherImpl_cpu_forward_kernel<int, int>);
static Registrar<GatherImplForward_cpu> registrarGatherImplForward_cpu_Float64(
{DataType::Float64, DataType::Float64}, Aidge::GatherImpl_cpu_forward_kernel<double, double>);
} // namespace
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_GATHERIMPL_FORWARD_KERNEL_H_ */
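As a standalone illustration of the indexing scheme used by GatherImpl_cpu_forward_kernel above (the pre-/post-axis decomposition), the following sketch gathers columns {2, 0} along axis 1 of a 2x3 row-major tensor; it does not construct Gather_Op::Attrs, it only mirrors the loop logic.

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    const std::vector<std::size_t> inputDims{2, 3};
    const std::vector<float> input{0.f, 1.f, 2.f, 3.f, 4.f, 5.f};  // [[0,1,2],[3,4,5]]
    const std::vector<std::int64_t> indices{2, 0};
    const std::size_t axis = 1;

    std::size_t postAxisElems = 1, preAxisElems = 1;
    for (std::size_t i = axis + 1; i < inputDims.size(); ++i) postAxisElems *= inputDims[i];
    for (std::size_t i = 0; i < axis; ++i) preAxisElems *= inputDims[i];

    std::vector<float> output;
    for (std::size_t i = 0; i < preAxisElems; ++i) {
        for (std::int64_t idx : indices) {
            // negative indices wrap around the gathered axis
            const std::size_t j = static_cast<std::size_t>(
                idx >= 0 ? idx : idx + static_cast<std::int64_t>(inputDims[axis]));
            const float* start = input.data() + i * postAxisElems * inputDims[axis] + j * postAxisElems;
            output.insert(output.end(), start, start + postAxisElems);
        }
    }
    for (float v : output) std::cout << v << ' ';  // expected: 2 0 5 3
    return 0;
}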
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_H_
#define AIDGE_CPU_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_H_
#include <memory>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/GlobalAveragePooling.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
// Operator implementation entry point for the backend
using GlobalAveragePoolingImpl_cpu = OperatorImpl_cpu<GlobalAveragePooling_Op,
void(const std::vector<DimSize_t> &, const void *, void *)>;
// Implementation entry point registration to Operator
REGISTRAR(GlobalAveragePooling_Op, "cpu", Aidge::GlobalAveragePoolingImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_KERNELS_H_
#include <cstddef>
#include <functional> // std::multiplies
#include <numeric> // std::accumulate
#include <vector>
#include "aidge/backend/cpu/operator/GlobalAveragePoolingImpl.hpp"
#include "aidge/data/Data.hpp"
#include "aidge/utils/ErrorHandling.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
template <class I, class O>
void GlobalAveragePoolingImpl_cpu_forward_kernel(
const std::vector<DimSize_t> &dims, const void *input_, void *output_) {
// error checking
AIDGE_ASSERT(dims.size() >= 3, "GlobalAveragePooling needs an input with at least "
"3 dimensions, got {} dimension(s)",
dims.size());
// computation
const I *input = static_cast<const I *>(input_);
O *output = static_cast<O *>(output_);
DimSize_t nb_elems = std::accumulate(dims.begin(), dims.end(), std::size_t(1),
std::multiplies<std::size_t>());
const DimSize_t in_batch_nb_elems{nb_elems / dims[0]};
const DimSize_t in_channel_nb_elems{in_batch_nb_elems / dims[1]};
const DimSize_t out_batch_nb_elems{dims[1]};
// parse channel by channel and fill each output with the average of the
// values in the channel
for (DimSize_t batch = 0; batch < dims[0]; ++batch) {
for (DimSize_t channel = 0; channel < dims[1]; ++channel) {
const I *filter_start = std::next(
input, (batch * in_batch_nb_elems) + (channel * in_channel_nb_elems));
I mean = 0;
for (size_t i = 0; i < in_channel_nb_elems; ++i) {
// Single-pass, numerically stable running mean using fmaf
// (note: fmaf accumulates in float precision, even for double instantiations)
mean = fmaf(filter_start[i] - mean, 1.0f/(i+1), mean);
}
output[batch * out_batch_nb_elems + channel] = mean;
}
}
}
// Kernels registration to implementation entry point
REGISTRAR(GlobalAveragePoolingImpl_cpu,
{DataType::Float32},
{ProdConso::defaultModel, Aidge::GlobalAveragePoolingImpl_cpu_forward_kernel<float, float>, nullptr});
REGISTRAR(GlobalAveragePoolingImpl_cpu,
{DataType::Float64},
{ProdConso::defaultModel, Aidge::GlobalAveragePoolingImpl_cpu_forward_kernel<double, double>, nullptr});
REGISTRAR(GlobalAveragePoolingImpl_cpu,
{DataType::Int32},
{ProdConso::defaultModel, Aidge::GlobalAveragePoolingImpl_cpu_forward_kernel<int32_t, int32_t>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_KERNELS_H_ */
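A minimal, hypothetical usage sketch of the kernel above: global average pooling over a 1x2x2x2 (NCHW) tensor, called directly on raw buffers. The include path is an assumption.

#include <array>
#include <iostream>
#include <vector>
#include "aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp"  // assumed header name

int main() {
    const std::vector<Aidge::DimSize_t> dims{1, 2, 2, 2};  // N, C, H, W
    std::array<float, 8> input{1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f};
    std::array<float, 2> output{};  // one value per (batch, channel)

    Aidge::GlobalAveragePoolingImpl_cpu_forward_kernel<float, float>(
        dims, input.data(), output.data());

    std::cout << output[0] << ' ' << output[1] << '\n';  // expected: 2.5 6.5
    return 0;
}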
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_H_
#define AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/GridSample.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
namespace Aidge {
// Operator implementation entry point for the backend
using GridSampleImpl_cpu = OperatorImpl_cpu<GridSample_Op,
void(const GridSample_Op&,
const std::shared_ptr<Tensor>&,
const std::shared_ptr<Tensor>&,
const std::shared_ptr<Tensor>&)>;
// Implementation entry point registration to Operator
REGISTRAR(GridSample_Op, "cpu", Aidge::GridSampleImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_H_ */