Compare revisions
Changes are shown as if the source revision was being merged into the target revision.
Showing 1706 additions and 40 deletions
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_PADIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_PADIMPL_KERNELS_H_
#include <algorithm> // std::max, std::min
#include <array>
#include <cstddef> // std::size_t
#include <cstdint> // std::int32_t
#include "aidge/backend/cpu/operator/PadImpl.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
/**
* @brief Forward kernel for 1D Padding on CPU backend.
* @tparam I Input data type.
* @tparam O Output data type.
* @param beginEndBorders Padding sizes at the beginning and end of the padded axis.
* @param borderType Padding mode (Constant, Edge, Reflect or Wrap).
* @param borderValue Value used when borderType is Constant.
* @param dims Array of input dimensions.
* @param input_ const input Tensor.
* @param output_ Output Tensor.
*/
template <class I, class O>
void PadImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 2>& beginEndBorders,
const PadBorderType borderType,
const double borderValue,
const std::array<DimSize_t, 3>& dims,
const void *input_,
void *output_)
{
const I *input = static_cast<const I *>(input_);
O *output = static_cast<O *>(output_);
const std::size_t oxSize = dims[2] + beginEndBorders[0] + beginEndBorders[1];
for (std::size_t batch = 0; batch < dims[0]; ++batch) {
for (std::size_t ch = 0; ch < dims[1]; ++ch) {
const std::size_t iIndex = (ch + batch*dims[1]) * dims[2];
const std::size_t oIndex = (ch + batch*dims[1]) * oxSize;
for (unsigned int ox = 0; ox < oxSize; ++ox) {
const std::size_t oIndexFull = oIndex + ox;
O outputValue = static_cast<O>(borderValue);
if (borderType == PadBorderType::Constant) {
int ix = static_cast<int>(ox) - static_cast<int>(beginEndBorders[0]);
if (ix >= 0 && ix < static_cast<int>(dims[2])) {
outputValue = input[iIndex + static_cast<std::size_t>(ix)];
}
}
else if (borderType == PadBorderType::Edge) {
int ix = std::max(0, std::min(static_cast<int>(dims[2]) - 1, static_cast<int>(ox) - static_cast<int>(beginEndBorders[0])));
outputValue = input[iIndex + static_cast<std::size_t>(ix)];
}
else if (borderType == PadBorderType::Reflect) {
int ix = static_cast<int>(ox) - static_cast<int>(beginEndBorders[0]);
if (ix < 0)
ix = 0 - ix;
if (ix >= static_cast<int>(dims[2]))
ix = static_cast<int>(dims[2]) - ix;
outputValue = input[iIndex + static_cast<std::size_t>(ix)];
}
else if (borderType == PadBorderType::Wrap) {
int ix = (static_cast<int>(dims[2]) + static_cast<int>(ox) - static_cast<int>(beginEndBorders[0])) % static_cast<int>(dims[2]);
outputValue = input[iIndex + static_cast<std::size_t>(ix)];
}
output[oIndexFull] = outputValue;
}
}
}
}
// Kernels registration to implementation entry point
REGISTRAR(PadImpl1D_cpu,
{{DataType::Float32, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}},
{Pad_ProdConso_cpu::defaultModel, Aidge::PadImpl1D_cpu_forward_kernel<cpptype_t<DataType::Float32>, cpptype_t<DataType::Float32>>, nullptr});
REGISTRAR(PadImpl1D_cpu,
{{DataType::Float64, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}},
{Pad_ProdConso_cpu::defaultModel, Aidge::PadImpl1D_cpu_forward_kernel<cpptype_t<DataType::Float64>, cpptype_t<DataType::Float64>>, nullptr});
REGISTRAR(PadImpl1D_cpu,
{{DataType::Int32, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}},
{Pad_ProdConso_cpu::defaultModel, Aidge::PadImpl1D_cpu_forward_kernel<cpptype_t<DataType::Int32>, cpptype_t<DataType::Int32>>, nullptr});
/**
* @brief Forward kernel for 2D Padding on CPU backend.
* @tparam I Input data type.
* @tparam O Output data type.
* @param beginEndBorders Padding sizes at the beginning and end of each padded axis.
* @param borderType Padding mode (Constant, Edge, Reflect or Wrap).
* @param borderValue Value used when borderType is Constant.
* @param dims Array of input dimensions.
* @param input_ const input Tensor.
* @param output_ Output Tensor.
*/
template <class I, class O>
void PadImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 4>& beginEndBorders,
const PadBorderType borderType,
const double borderValue,
const std::array<DimSize_t, 4> &dims,
const void *input_,
void *output_)
{
const I *input = static_cast<const I *>(input_);
O *output = static_cast<O *>(output_);
const std::size_t oySize = dims[2] + beginEndBorders[0] + beginEndBorders[2];
const std::size_t oxSize = dims[3] + beginEndBorders[1] + beginEndBorders[3];
for (std::size_t batch = 0; batch < dims[0]; ++batch) {
for (std::size_t ch = 0; ch < dims[1]; ++ch) {
const std::size_t iIndex = (ch + batch*dims[1]) * dims[2] * dims[3];
const std::size_t oIndex = (ch + batch*dims[1]) * oxSize * oySize;
for (std::uint32_t oy = 0; oy < oySize; ++oy) {
for (std::uint32_t ox = 0; ox < oxSize; ++ox) {
const std::size_t oIndexFull = oIndex + oy*oxSize + ox;
O outputValue = static_cast<O>(borderValue);
if (borderType == PadBorderType::Constant) {
std::int32_t ix = static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[1]);
std::int32_t iy = static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[0]);
if (ix >= 0 && ix < static_cast<std::int32_t>(dims[3]) && iy >= 0 && iy < static_cast<std::int32_t>(dims[2])) {
outputValue = input[iIndex + static_cast<std::size_t>(iy)*dims[3] + static_cast<std::size_t>(ix)];
}
}
else if (borderType == PadBorderType::Edge) {
std::int32_t ix = std::max(0, std::min(static_cast<std::int32_t>(dims[3]) - 1, static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[1])));
std::int32_t iy = std::max(0, std::min(static_cast<std::int32_t>(dims[2]) - 1, static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[0])));
outputValue = input[iIndex + static_cast<std::size_t>(iy)*dims[3] + static_cast<std::size_t>(ix)];
}
else if (borderType == PadBorderType::Reflect) {
std::int32_t ix = static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[1]);
std::int32_t iy = static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[0]);
if (ix < 0)
ix = 0 - ix;
if (iy < 0)
iy = 0 - iy;
if (ix >= static_cast<std::int32_t>(dims[3]))
ix = static_cast<std::int32_t>(dims[3]) - ix;
if (iy >= static_cast<std::int32_t>(dims[2]))
iy = static_cast<std::int32_t>(dims[2]) - iy;
outputValue = input[iIndex + static_cast<std::size_t>(iy)*dims[3] + static_cast<std::size_t>(ix)];
}
else if (borderType == PadBorderType::Wrap) {
std::int32_t ix = (static_cast<std::int32_t>(dims[3]) + static_cast<std::int32_t>(ox) - static_cast<std::int32_t>(beginEndBorders[1])) % static_cast<std::int32_t>(dims[3]);
std::int32_t iy = (static_cast<std::int32_t>(dims[2]) + static_cast<std::int32_t>(oy) - static_cast<std::int32_t>(beginEndBorders[0])) % static_cast<std::int32_t>(dims[2]);
outputValue = input[iIndex + static_cast<std::size_t>(iy)*dims[3] + static_cast<std::size_t>(ix)];
}
output[oIndexFull] = outputValue;
}
}
}
}
}
// Kernels registration to implementation entry point
REGISTRAR(PadImpl2D_cpu,
{{DataType::Float32, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}},
{Pad_ProdConso_cpu::defaultModel, Aidge::PadImpl2D_cpu_forward_kernel<cpptype_t<DataType::Float32>, cpptype_t<DataType::Float32>>, nullptr});
REGISTRAR(PadImpl2D_cpu,
{{DataType::Float64, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}},
{Pad_ProdConso_cpu::defaultModel, Aidge::PadImpl2D_cpu_forward_kernel<cpptype_t<DataType::Float64>, cpptype_t<DataType::Float64>>, nullptr});
REGISTRAR(PadImpl2D_cpu,
{{DataType::Int32, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}},
{Pad_ProdConso_cpu::defaultModel, Aidge::PadImpl2D_cpu_forward_kernel<cpptype_t<DataType::Int32>, cpptype_t<DataType::Int32>>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_PADIMPL_KERNELS_H_ */
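
For illustration, here is a minimal usage sketch of the 1D padding kernel above. The include path is an assumption (the header's actual install path may differ); the expected outputs follow directly from the Constant and Edge branches of the kernel.

#include <array>
#include <iostream>

#include "aidge/backend/cpu/operator/PadImpl_kernels.hpp" // assumed install path of this header

int main() {
    // One batch, one channel, four values, padded by one element on each side (oxSize = 6).
    const float in[4] = {1.f, 2.f, 3.f, 4.f};
    float out[6] = {};
    Aidge::PadImpl1D_cpu_forward_kernel<float, float>(
        {1, 1},                         // beginEndBorders
        Aidge::PadBorderType::Constant, // border mode
        0.0,                            // borderValue
        {1, 1, 4},                      // dims (batch, channel, length)
        in, out);
    for (float v : out) { std::cout << v << ' '; } // prints: 0 1 2 3 4 0
    std::cout << '\n';                             // Edge mode would give: 1 1 2 3 4 4
    return 0;
}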
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_PADDEDCONVIMPL_H_
#define AIDGE_CPU_OPERATOR_PADDEDCONVIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/MetaOperatorDefs.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
// Operator implementation entry point for the backend
using PaddedConv1D_Op = MetaOperator_Op;
using PaddedConvImpl1D_cpu = OperatorImpl_cpu<MetaOperator_Op,
void(const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 1>&,
const std::array<DimSize_t, 1>&,
const std::array<DimSize_t, 1>&,
const std::array<DimSize_t, 3> &,
DimSize_t,
const void *,
const void *,
const void *,
void *)>;
using PaddedConv2D_Op = MetaOperator_Op;
using PaddedConvImpl2D_cpu = OperatorImpl_cpu<MetaOperator_Op,
void(const std::array<DimSize_t, 4>&,
const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 4> &,
DimSize_t,
const void *,
const void *,
const void *,
void *)>;
// Implementation entry point registration to Operator
// Uncomment to activate the implementation for PaddedConv. It is currently less efficient, which is why it is commented out.
// REGISTRAR(PaddedConv1D_Op, std::array<std::string, 2>({"cpu", "PaddedConv1D"}), Aidge::PaddedConvImpl1D_cpu::create);
// REGISTRAR(PaddedConv2D_Op, std::array<std::string, 2>({"cpu", "PaddedConv2D"}), Aidge::PaddedConvImpl2D_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_PADDEDCONVIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_PADDEDCONVIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_PADDEDCONVIMPL_KERNELS_H_
#include <array>
#include <cstddef>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/backend/cpu/operator/PaddedConvImpl.hpp"
#include "aidge/operator/Pad.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
// Only works for constant padding zero
/**
* @brief Forward kernel for padded 1D Convolution on CPU backend.
* @tparam I Input data type.
* @tparam W Weight data type.
* @tparam B Bias data type.
* @tparam O Output data type.
* @param beginEndBorders Padding sizes at the beginning and end of the spatial axis.
* @param strideDims Stride of the convolution.
* @param dilationDims Dilation of the convolution.
* @param kernelDims Kernel dimensions.
* @param inputDims Array of input dimensions.
* @param outChannels Number of output channels.
* @param input_ const input Tensor.
* @param weights_ const weight Tensor.
* @param biases_ const bias Tensor.
* @param output_ Output Tensor.
*/
template <class I, class W, class B, class O>
void PaddedConvImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 2>& beginEndBorders,
const std::array<DimSize_t, 1>& strideDims,
const std::array<DimSize_t, 1>& dilationDims,
const std::array<DimSize_t, 1>& kernelDims,
const std::array<DimSize_t, 3>& inputDims,
DimSize_t outChannels,
const void *input_,
const void *weights_,
const void *biases_,
void *output_)
{
// FIXME: missing convolution attributes as arguments
const I *input = static_cast<const I *>(input_);
const W *weights = static_cast<const W *>(weights_);
const B *biases = static_cast<const B *>(biases_);
O *output = static_cast<O *>(output_);
// output H size
const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1;
const std::size_t oxSize =
static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilated_kernel_x + strideDims[0]) /
static_cast<float>(strideDims[0])));
// TODO: kernel computation
// output (batch, outCh, Xout)
// input  (batch, inCh, Xin)
// weight (outCh, inCh, kernelX)
// does not take Dilation attribute into account
using signedsize = std::make_signed<std::size_t>::type;
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
const std::size_t oIndex = (outCh + batch*outChannels) * oxSize;
// If bias = nullptr, set B(0)
B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
std::fill(output + oIndex, output+(oIndex+oxSize), biasVal);
for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
const std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2];
const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0];
for (std::size_t ox = 0; ox < oxSize; ++ox) {
const signedsize difx = static_cast<signedsize>(ox * strideDims[0]);
const std::size_t sxMin = static_cast<std::size_t>(std::max(static_cast<signedsize>(beginEndBorders[0]) - difx, signedsize(0)));
const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + static_cast<signedsize>(beginEndBorders[1]) - difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? kernelDims[0] : inputDims[2] + difx);
const std::size_t oIndexFull = oIndex + ox;
const signedsize ix = static_cast<signedsize>(ox * strideDims[0]) - static_cast<signedsize>(beginEndBorders[0]);
for (std::size_t sx = sxMin; sx*dilationDims[0] < sxMax; ++sx) {
output[oIndexFull] += weights[wIndex + sx] *
input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx*dilationDims[0]))];
}
}
}
}
}
}
// Kernels registration to implementation entry point
REGISTRAR(PaddedConvImpl1D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}, DynamicAttributes(std::map<std::string, future_std::any>({std::make_pair("type", future_std::any(std::string("PaddedConv1D")))}))},
{ProdConso::inPlaceModel, Aidge::PaddedConvImpl1D_cpu_forward_kernel<float, float, float, float>, nullptr});
REGISTRAR(PaddedConvImpl1D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float16, DataFormat::NCHW}, DynamicAttributes(std::map<std::string, future_std::any>({std::make_pair("type", future_std::any(std::string("PaddedConv1D")))}))},
{ProdConso::inPlaceModel, Aidge::PaddedConvImpl1D_cpu_forward_kernel<half_float::half, half_float::half, half_float::half, half_float::half>, nullptr});
REGISTRAR(PaddedConvImpl1D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}, DynamicAttributes(std::map<std::string, future_std::any>({std::make_pair("type", future_std::any(std::string("PaddedConv1D")))}))},
{ProdConso::inPlaceModel, Aidge::PaddedConvImpl1D_cpu_forward_kernel<int32_t, int32_t, int32_t, int32_t>, nullptr});
REGISTRAR(PaddedConvImpl1D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}, DynamicAttributes(std::map<std::string, future_std::any>({std::make_pair("type", future_std::any(std::string("PaddedConv1D")))}))},
{ProdConso::inPlaceModel, Aidge::PaddedConvImpl1D_cpu_forward_kernel<double, double, double, double>, nullptr});
/**
* @brief Forward kernel for padded 2D Convolution on CPU backend.
* @tparam I Input data type.
* @tparam W Weight data type.
* @tparam B Bias data type.
* @tparam O Output data type.
* @param beginEndBorders Padding sizes at the beginning and end of each spatial axis.
* @param strideDims Stride of the convolution.
* @param dilationDims Dilation of the convolution.
* @param kernelDims Kernel dimensions.
* @param inputDims Array of input dimensions.
* @param outChannels Number of output channels.
* @param input_ const input Tensor.
* @param weights_ const weight Tensor.
* @param biases_ const bias Tensor.
* @param output_ Output Tensor.
*/
template <class I, class W, class B, class O>
void PaddedConvImpl2D_cpu_forward_kernel(
const std::array<DimSize_t, 4>& beginEndBorders,
const std::array<DimSize_t, 2>& strideDims,
const std::array<DimSize_t, 2>& dilationDims,
const std::array<DimSize_t, 2>& kernelDims,
const std::array<DimSize_t, 4> &inputDims,
DimSize_t outChannels,
const void *input_,
const void *weights_,
const void *biases_,
void *output_)
{
// FIXME: missing convolution attributes as arguments
const I *input = static_cast<const I *>(input_);
const W *weights = static_cast<const W *>(weights_);
const B *biases = static_cast<const B *>(biases_);
O *output = static_cast<O *>(output_);
// output H size
const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1;
const std::size_t oxSize =
static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilated_kernel_x + beginEndBorders[0] + beginEndBorders[2] + strideDims[0]) /
static_cast<float>(strideDims[0])));
// output W size
const DimSize_t dilated_kernel_y = dilationDims[1]*(kernelDims[1] - 1) + 1;
const std::size_t oySize =
static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[3] - dilated_kernel_y + beginEndBorders[1] + beginEndBorders[3] + strideDims[1]) /
static_cast<float>(strideDims[1])));
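// Note: floor((in + padBegin + padEnd - dilatedKernel + stride) / stride) equals the usual
// output-size formula floor((in + padBegin + padEnd - dilatedKernel) / stride) + 1.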
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
const std::size_t oIndex = (outCh + batch*outChannels) * oxSize * oySize;
// If bias = nullptr, set B(0)
B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
std::fill(output + oIndex, output+(oIndex+oxSize*oySize), biasVal);
for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
const std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0] * kernelDims[1];
for (std::size_t ox = 0; ox < oxSize; ++ox) {
const std::size_t difx = ox * strideDims[0];
const std::size_t sxMin = beginEndBorders[0] < difx ? std::size_t(0) : beginEndBorders[0] - difx;
const std::size_t sxMax = (inputDims[2] + beginEndBorders[2]) < difx ?
0 :
((inputDims[2] + beginEndBorders[2]) > dilated_kernel_x + difx ?
dilated_kernel_x :
(inputDims[2] + beginEndBorders[2] - difx));
for (std::size_t oy = 0; oy < oySize; ++oy) {
const std::size_t dify = oy * strideDims[1];
const std::size_t syMin = beginEndBorders[1] < dify ? std::size_t(0) : beginEndBorders[1] - dify;
const std::size_t syMax = (inputDims[3] + beginEndBorders[3]) < dify ?
0 :
((inputDims[3] + beginEndBorders[3]) > dilated_kernel_y + dify ?
dilated_kernel_y :
(inputDims[3] + beginEndBorders[3] - dify));
const std::size_t oIndexFull = oIndex + ox*oySize + oy;
const std::size_t ix = ox * strideDims[0] - beginEndBorders[0];
const std::size_t iy = oy * strideDims[1] - beginEndBorders[1];
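// Unrolled fast path: used when the full 3-wide (dilated) window fits inside the padded input
// in both dimensions with no border clipping, i.e. a 3x3 kernel with unit dilation.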
if (sxMin == 0 && syMin == 0 && sxMax == 3 && syMax == 3) {
output[oIndexFull] += (weights[wIndex + 0*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+0)] +
weights[wIndex + 0*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+1)] +
weights[wIndex + 0*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+0)*inputDims[3] + static_cast<std::size_t>(iy+2)] +
weights[wIndex + 1*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+0)] +
weights[wIndex + 1*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+1)] +
weights[wIndex + 1*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+1)*inputDims[3] + static_cast<std::size_t>(iy+2)] +
weights[wIndex + 2*kernelDims[1] + 0] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+0)] +
weights[wIndex + 2*kernelDims[1] + 1] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+1)] +
weights[wIndex + 2*kernelDims[1] + 2] * input[iIndex + static_cast<std::size_t>(ix+2)*inputDims[3] + static_cast<std::size_t>(iy+2)]);
} else {
for (std::size_t sx = sxMin; sx*dilationDims[0] < sxMax; ++sx) {
for (std::size_t sy = syMin; sy*dilationDims[1] < syMax; ++sy) {
output[oIndexFull] += weights[wIndex + sx*kernelDims[1] + sy] *
input[iIndex + (sx*dilationDims[0] + ix)*inputDims[3] + sy*dilationDims[1] + iy];
}
}
}
}
}
}
}
}
}
// Kernels registration to implementation entry point
REGISTRAR(PaddedConvImpl2D_cpu,
// ImplSpec{std::vector<ImplSpec::IOSpec>({ImplSpec::IOSpec{DataType::Any, DataFormat::NCHW}, ImplSpec::IOSpec{DataType::Any, DataFormat::NCHW}}) , std::vector<ImplSpec::IOSpec>({ImplSpec::IOSpec{DataType::Int32, DataFormat::NCHW}})},
{{DataType::Any, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}, DynamicAttributes(std::map<std::string, future_std::any>({std::make_pair("type", future_std::any(std::string("PaddedConv2D")))}))},
{ProdConso::inPlaceModel, Aidge::PaddedConvImpl2D_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t, std::int32_t>, nullptr});
REGISTRAR(PaddedConvImpl2D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float16, DataFormat::NCHW}, DynamicAttributes(std::map<std::string, future_std::any>({std::make_pair("type", future_std::any(std::string("PaddedConv2D")))}))},
{ProdConso::inPlaceModel, Aidge::PaddedConvImpl2D_cpu_forward_kernel<half_float::half, half_float::half, half_float::half, half_float::half>, nullptr});
REGISTRAR(PaddedConvImpl2D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}, DynamicAttributes(std::map<std::string, future_std::any>({std::make_pair("type", future_std::any(std::string("PaddedConv2D")))}))},
{ProdConso::inPlaceModel, Aidge::PaddedConvImpl2D_cpu_forward_kernel<float, float, float, float>, nullptr});
REGISTRAR(PaddedConvImpl2D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}, DynamicAttributes(std::map<std::string, future_std::any>({std::make_pair("type", future_std::any(std::string("PaddedConv2D")))}))},
{ProdConso::inPlaceModel, Aidge::PaddedConvImpl2D_cpu_forward_kernel<double, double, double, double>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_PADDEDCONVIMPL_KERNELS_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_POWIMPL_H_
#define AIDGE_CPU_OPERATOR_POWIMPL_H_
#include <cstddef> // std::size_t
#include <memory> // std::unique_ptr, std::make_unique
#include <string>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Pow.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
// Operator implementation entry point for the backend
using PowImpl_cpu = OperatorImpl_cpu<Pow_Op,
void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*),
void(const std::vector<std::size_t>&, const std::vector<std::size_t>&, const std::vector<std::size_t>&, const void*, const void*, const void*, void*, void*)>;
// Implementation entry point registration to Operator
REGISTRAR(Pow_Op, "cpu", Aidge::PowImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_POWIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_POWIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_POWIMPL_KERNELS_H_
#include "aidge/utils/Registrar.hpp"
#include <cstddef> // std::size_t
#include "aidge/backend/cpu/data/Broadcasting.hpp"
#include "aidge/backend/cpu/operator/PowImpl.hpp"
namespace Aidge {
namespace {
// assumes values are contiguous in memory
template <class I, class O>
void pow_contiguous_arrays(const std::size_t input1size,
const std::size_t input2size,
const std::size_t output1size,
const I* input1,
const I* input2,
O* output)
{
for (std::size_t i = 0; i < output1size; ++i)
{
const std::size_t in1_id = (input1size != 1) ? i : 0;
const std::size_t in2_id = (input2size != 1) ? i : 0;
output[i] = static_cast<O>(std::pow(input1[in1_id], input2[in2_id]));
}
}
}
template <class I, class O>
void PowImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
std::vector<std::size_t> dims1,
const std::vector<std::size_t>& outputDims,
const void* input0_,
const void* input1_,
void* output_) {
const I* input_0 = static_cast<const I*>(input0_);
const I* input_1 = static_cast<const I*>(input1_);
O* output = static_cast<O*>(output_);
// [5,2,1,7] & [2,6,7]
// 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7]
// 2. Find the highest equal dimension -> 3
// Exception: if the first diverging dimension is the last one, then -> 4 (dims.size())
// 3. Compute the highest number of contiguous data -> 7
// 4. Compute stride and offset step for the broadcast mechanism
// 5. Call a simple kernel
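// For the example above, contiguousIdx = 3 and 5*2*6 = 60 stacks of 7 contiguous values are processed.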
// special case for equal dimensions, the kernel is called with the entire arrays at once
if (dims0 == dims1) {
const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
for (std::size_t i = 0; i < input0_contiguous_size; ++i)
{
output[i] = static_cast<O>(std::pow(input_0[i], input_1[i]));
}
return;
}
// Make both dimension vectors the same length by prepending ones to the shorter one.
if (dims0.size() > dims1.size()) {
dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
}
else if (dims1.size() > dims0.size()) {
dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
}
const std::size_t nbDims = dims0.size();
// Find the highest equal dimension
// std::size_t contiguousIdx = nbDims - 1;
std::size_t contiguousIdx = nbDims;
while (contiguousIdx-- > 0) {
// for (; contiguousIdx+1 > 0; --contiguousIdx) {
if (dims0[contiguousIdx] != dims1[contiguousIdx]) {
if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1
const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1;
while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) {
--contiguousIdx;
}
}
break;
}
}
++contiguousIdx;
// Compute the highest number of contiguous data for each Tensor
const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
// initialize strides to iterate through data because of broadcasting
std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx);
if (contiguousIdx > 0) {
stride_post0[contiguousIdx - 1] = 1;
stride_post1[contiguousIdx - 1] = 1;
for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) {
stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]);
stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]);
}
for (std::size_t i = 0; i != contiguousIdx; ++i) {
stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1;
stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1;
}
}
// variables for arrays offsets
std::size_t offsetIn0 = 0;
std::size_t offsetIn1 = 0;
std::size_t offsetOut = 0;
std::size_t dim = contiguousIdx - 1;
const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
for (std::size_t stack = 0; stack < nbStacks;) {
pow_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
input_0 + offsetIn0*input0_contiguous_size,
input_1 + offsetIn1*input1_contiguous_size,
output + offsetOut*output_contiguous_size);
if (++stack < nbStacks) {
std::size_t tmp_stack = stack;
while(tmp_stack % outputDims[dim] == 0) {
tmp_stack /= outputDims[dim];
dim--;
}
offsetIn0 += stride_step0[dim];
offsetIn1 += stride_step1[dim];
++offsetOut;
dim = contiguousIdx - 1;
}
}
}
template <class I1, class I2, class O>
void PowImpl_cpu_backward_kernel(const std::vector<std::size_t>& input0Dims,
const std::vector<std::size_t>& input1Dims,
const std::vector<std::size_t>& outputDims,
const void* input0_,
const void* input1_,
const void* gradOutput_,
void* gradientInput0_,
void* gradientInput1_) {
const I1* input0 = static_cast<const I1*>(input0_);
I1* grad0 = static_cast<I1*>(gradientInput0_);
const I2* input1 = static_cast<const I2*>(input1_);
I2* grad1 = static_cast<I2*>(gradientInput1_);
const O* gradOut = static_cast<const O*>(gradOutput_);
// Fill input grads with zeros
std::size_t input0Elements = std::accumulate(input0Dims.cbegin(), input0Dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
std::fill(grad0, grad0 + input0Elements, I1(0));
std::size_t input1Elements = std::accumulate(input1Dims.cbegin(), input1Dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
std::fill(grad1, grad1 + input1Elements, I2(0));
std::size_t totalElements = std::accumulate(outputDims.cbegin(), outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
for (size_t oIndex = 0; oIndex < totalElements; ++oIndex)
{
// Compute indexes in inputs 0 and 1 to support broadcasting
std::vector<std::size_t> indexes = getMultiDimIndices(outputDims, oIndex);
std::size_t idx0 = getFlattenedIndex(input0Dims, indexes);
std::size_t idx1 = getFlattenedIndex(input1Dims, indexes);
// grad0 = grad_output * (input1 * pow(input0, (input1 -1)))
grad0[idx0] += gradOut[oIndex]*input1[idx1]* std::pow(input0[idx0], input1[idx1]-1);
// grad1 = grad_output * (output * ln(input0))
grad1[idx1] += gradOut[oIndex] * std::pow(input0[idx0], input1[idx1]) * std::log(input0[idx0]);
}
}
// Kernels registration to implementation entry point
REGISTRAR(PowImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float32}},
{ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<float, float>, Aidge::PowImpl_cpu_backward_kernel<float, float, float>});
REGISTRAR(PowImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}},
{ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<double, double>, Aidge::PowImpl_cpu_backward_kernel<double, double, double>});
REGISTRAR(PowImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}},
{ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<int32_t, int32_t>, Aidge::PowImpl_cpu_backward_kernel<int32_t, int32_t, int32_t>});
REGISTRAR(PowImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int64}},
{ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, Aidge::PowImpl_cpu_backward_kernel<std::int64_t, std::int64_t, std::int64_t>});
REGISTRAR(PowImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int8}},
{ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<std::int8_t, std::int8_t>, Aidge::PowImpl_cpu_backward_kernel<std::int8_t, std::int8_t, std::int8_t>});
REGISTRAR(PowImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::UInt8}},
{ProdConso::inPlaceModel, Aidge::PowImpl_cpu_forward_kernel<std::uint8_t, std::uint8_t>, Aidge::PowImpl_cpu_backward_kernel<std::uint8_t, std::uint8_t, std::uint8_t>});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_POWIMPL_KERNELS_H_ */
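
A minimal sketch of the broadcast path of the Pow forward kernel above, with a {1,3} exponent row broadcast over a {2,3} base tensor. The include path is an assumption.

#include <iostream>
#include <vector>

#include "aidge/backend/cpu/operator/PowImpl_kernels.hpp" // assumed install path of this header

int main() {
    // Base tensor of dims {2, 3}; exponent of dims {1, 3}, broadcast over the first axis.
    const std::vector<float> base{2.f, 3.f, 4.f, 5.f, 6.f, 7.f};
    const std::vector<float> expo{2.f, 1.f, 0.f};
    std::vector<float> out(6, 0.f);
    Aidge::PowImpl_cpu_forward_kernel<float, float>(
        {2, 3}, {1, 3}, {2, 3}, base.data(), expo.data(), out.data());
    for (float v : out) { std::cout << v << ' '; } // prints: 4 3 1 25 6 1
    std::cout << '\n';
    return 0;
}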
@@ -12,52 +12,24 @@
 #ifndef AIDGE_CPU_OPERATOR_RELUIMPL_H_
 #define AIDGE_CPU_OPERATOR_RELUIMPL_H_
-#include "aidge/backend/OperatorImpl.hpp"
+#include <cstddef> // std::size_t
+#include <memory>
+#include <tuple> // std::tuple
+#include <vector>
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
 #include "aidge/operator/ReLU.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
-#include <memory>
-#include <vector>
 namespace Aidge {
-// class ReLU_Op;
-// compute kernel registry for forward and backward
-class ReLUImplForward_cpu
-: public Registrable<ReLUImplForward_cpu, std::tuple<DataType, DataType>, void(const std::size_t, const void*, void*)> {
-};
-class ReLUImplBackward_cpu
-: public Registrable<ReLUImplBackward_cpu, std::tuple<DataType, DataType>, void(const std::size_t, const void*, void*)> {
-};
-class ReLUImpl_cpu : public OperatorImpl {
-protected:
-const ReLU_Op& mOp;
-std::array<NbElts_t, 1> mNbConsumedData;
-std::array<NbElts_t, 1> mNbProducedData;
-public:
-ReLUImpl_cpu(const ReLU_Op& op) : mOp(op), mNbConsumedData({0}), mNbProducedData({0}) {}
-static std::unique_ptr<ReLUImpl_cpu> create(const ReLU_Op& op) {
-return std::make_unique<ReLUImpl_cpu>(op);
-}
-public:
-NbElts_t getNbRequiredData(const IOIndex_t inputIdx) const override final;
-NbElts_t getNbRequiredProtected(const IOIndex_t inputIdx) const override final;
-NbElts_t getRequiredMemory(const IOIndex_t /*outputIdx*/, const std::vector<DimSize_t>& /*inputsSize*/) const override final;
-NbElts_t getNbConsumedData(const IOIndex_t inputIdx) const override final;
-NbElts_t getNbProducedData(const IOIndex_t outputIdx) const override final;
-void updateConsummerProducer() override final;
-void forward();
-void backward();
-};
+// Operator implementation entry point for the backend
+using ReLUImpl_cpu = OperatorImpl_cpu<ReLU_Op,
+void(const std::size_t, const void*, void*),
+void(const std::size_t, const void*, const void*, void*)>;
-namespace {
-static Registrar<ReLU_Op> registrarReLUImpl_cpu("cpu", Aidge::ReLUImpl_cpu::create);
-}
+// Implementation entry point registration to Operator
+REGISTRAR(ReLU_Op, "cpu", Aidge::ReLUImpl_cpu::create);
 } // namespace Aidge
 #endif /* AIDGE_CPU_OPERATOR_RELUIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_RELUIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_RELUIMPL_KERNELS_H_
#include <cstddef> // std::size_t
#include <memory>
#include <tuple> // std::tuple
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/backend/cpu/operator/ReLUImpl.hpp"
#include "aidge/operator/ReLU.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
// Kernels
template <class I, class O>
void ReLUImpl_cpu_forward_kernel(std::size_t inputLength,
const void* input_,
void* output_) {
const I* input = static_cast<const I*>(input_);
O* output = static_cast<O*>(output_);
//#pragma omp parallel for if (inputLength > 1024)
for (std::size_t i = 0; i < inputLength; ++i) {
output[i] = (input[i] > 0) ? input[i] : 0;
}
}
template <class I, class GI, class GO>
void ReLUImpl_cpu_backward_kernel(const std::size_t inputLength,
const void* input_, const void* grad_output_,
void* grad_input_) {
const I* input = static_cast<const I*>(input_);
const GO* grad_output = static_cast<const GO*>(grad_output_);
GI* grad_input = static_cast<GI*>(grad_input_);
for (std::size_t i = 0; i < inputLength; ++i) {
grad_input[i] = (input[i] > 0) ? grad_output[i] : 0;
}
}
// Kernels registration to implementation entry point
REGISTRAR(ReLUImpl_cpu,
{DataType::Float32},
{ProdConso::inPlaceModel, Aidge::ReLUImpl_cpu_forward_kernel<float, float>, Aidge::ReLUImpl_cpu_backward_kernel<float, float, float>});
REGISTRAR(ReLUImpl_cpu,
{DataType::Float64},
{ProdConso::inPlaceModel, Aidge::ReLUImpl_cpu_forward_kernel<double, double>, Aidge::ReLUImpl_cpu_backward_kernel<double, double, double>});
REGISTRAR(ReLUImpl_cpu,
{DataType::Int32},
{ProdConso::inPlaceModel, Aidge::ReLUImpl_cpu_forward_kernel<int32_t, int32_t>, Aidge::ReLUImpl_cpu_backward_kernel<int32_t, int32_t, int32_t>});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_RELUIMPL_KERNELS_H_ */
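
A minimal usage sketch of the ReLU forward and backward kernels above (the include path is an assumption):

#include <cstddef>
#include <iostream>
#include <vector>

#include "aidge/backend/cpu/operator/ReLUImpl_kernels.hpp" // assumed install path of this header

int main() {
    const std::vector<float> in{-2.f, -0.5f, 0.f, 1.5f};
    std::vector<float> out(in.size(), 0.f);
    Aidge::ReLUImpl_cpu_forward_kernel<float, float>(in.size(), in.data(), out.data());
    // out is now {0, 0, 0, 1.5}

    const std::vector<float> gradOut{1.f, 1.f, 1.f, 1.f};
    std::vector<float> gradIn(in.size(), 0.f);
    Aidge::ReLUImpl_cpu_backward_kernel<float, float, float>(in.size(), in.data(), gradOut.data(), gradIn.data());
    // gradIn is now {0, 0, 0, 1}: the gradient only flows where the input was strictly positive
    for (std::size_t i = 0; i < in.size(); ++i) { std::cout << out[i] << '/' << gradIn[i] << ' '; }
    std::cout << '\n';
    return 0;
}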
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_REDUCEMEANIMPL_H_
#define AIDGE_CPU_OPERATOR_REDUCEMEANIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/ReduceMean.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
// Operator implementation entry point for the backend
using ReduceMeanImpl_cpu = OperatorImpl_cpu<ReduceMean_Op,
void(const std::vector<std::int32_t>&,
DimSize_t,
const std::vector<DimSize_t>&,
const void *,
void *)>;
// Implementation entry point registration to Operator
REGISTRAR(ReduceMean_Op, "cpu", Aidge::ReduceMeanImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_REDUCEMEANIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_REDUCEMEANIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_REDUCEMEANIMPL_KERNELS_H_
#include <algorithm> // std::for_each
#include <cstddef> // std::size_t
#include <cstdint> // std::int32_t
#include <functional> //std::multiplies
#include <numeric> //std::accumulate
#include <vector>
#include "aidge/backend/cpu/operator/ReduceMeanImpl.hpp"
#include "aidge/data/Data.hpp"
#include "aidge/operator/ReduceMean.hpp"
#include "aidge/utils/Registrar.hpp"
namespace Aidge {
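// stableMean computes a running (incremental) mean, mean_i = mean_{i-1} + (x_i - mean_{i-1}) / (i + 1),
// with std::fma fusing the multiply-add into a single rounding. This avoids building a large
// intermediate sum that could overflow or lose precision before the final division.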
template <typename T>
typename std::enable_if<std::is_floating_point<T>::value, T>::type
stableMean(const T* vec, size_t len, size_t stride) {
T mean = 0;
for (size_t i = 0; i < len; ++i) {
mean = std::fma<T>(vec[i * stride] - mean, 1.0f / (i + 1), mean);
}
return mean;
}
// Specialization for integers: perform the mean computation in double
template <typename T>
typename std::enable_if<!std::is_floating_point<T>::value, T>::type
stableMean(const T* vec, size_t len, size_t stride) {
double mean = 0;
for (size_t i = 0; i < len; ++i) {
mean = std::fma<double>(vec[i * stride] - mean, 1.0f / (i + 1), mean);
}
return mean;
}
template <typename T>
typename std::enable_if<std::is_floating_point<T>::value, T>::type
castFromFloat(T value) {
return value;
}
template <typename T>
typename std::enable_if<!std::is_floating_point<T>::value, T>::type
castFromFloat(double value) {
return static_cast<T>(std::nearbyint(value));
}
template <class I, class O>
void ReduceMeanImpl_cpu_forward_kernel(const std::vector<std::int32_t>& axes,
DimSize_t /*keepDims*/,
const std::vector<DimSize_t>& inputDims,
const void* input_,
void* output_) {
const I* input = static_cast<const I*>(input_);
O* output = static_cast<O*>(output_);
const std::size_t nb_dims = inputDims.size();
const std::size_t totalElements = std::accumulate(inputDims.cbegin(), inputDims.cend(), 1, std::multiplies<std::size_t>());
if (axes.empty()){
std::copy_n(input,totalElements, output);
}
else if (axes.size() == 1) {
const std::size_t stride_pre = std::accumulate(inputDims.cbegin(), inputDims.cbegin() + axes[0], 1, std::multiplies<std::size_t>());
const std::size_t stride_post = std::accumulate(inputDims.crbegin(), inputDims.crbegin() + nb_dims -1 - axes[0], 1, std::multiplies<std::size_t>());
const std::size_t dim_i = inputDims[axes[0]];
for (std::size_t pre = 0; pre < stride_pre; ++pre) {
for (std::size_t post = 0; post < stride_post; ++post) {
const std::size_t idx_i = pre * dim_i * stride_post + post;
const std::size_t idx_o = pre * stride_post + post;
output[idx_o] = castFromFloat<O>(stableMean(input + idx_i, dim_i, stride_post));
}
}
} else {
std::size_t outputElements = totalElements;
auto stride_post = std::unique_ptr<std::size_t[]>(new std::size_t[nb_dims]);
stride_post[nb_dims - 1] = 1;
for (std::size_t i = nb_dims-2; i != static_cast<std::size_t>(-1); --i) {
stride_post[i] = stride_post[i+1]*inputDims[i+1];
}
auto stride_pre = std::unique_ptr<std::size_t[]>(new std::size_t[nb_dims]);
stride_pre[0] = 1;
for (std::size_t i = 1; i < nb_dims; ++i) {
stride_pre[i] = stride_pre[i-1]*inputDims[i-1];
}
// The accumulation type is the return type of stableMean<I>(), i.e. I itself (for integer types the mean is computed internally in double and converted back)
const decltype(stableMean<I>(input, 0, 0))* inputAccumulation = nullptr;
decltype(stableMean<I>(input, 0, 0))* outputAccumulation = nullptr;
for (const auto& axisInt : axes) {
const std::size_t a = static_cast<std::size_t>(axisInt);
outputElements /= inputDims[a];
outputAccumulation = new I[outputElements];
const std::size_t dim_i = inputDims[a];
for (std::size_t pre = 0; pre < stride_pre[a]; ++pre) {
for (std::size_t post = 0; post < stride_post[a]; ++post) {
const std::size_t idx_i = pre * dim_i * stride_post[a] + post;
const std::size_t idx_o = pre * stride_post[a] + post;
if (inputAccumulation == nullptr) {
outputAccumulation[idx_o] = stableMean<I>(input + idx_i, dim_i, stride_post[a]);
}
else {
outputAccumulation[idx_o] = stableMean<I>(inputAccumulation + idx_i, dim_i, stride_post[a]);
}
}
}
std::for_each(stride_pre.get()+a+1, stride_pre.get()+nb_dims, [dim_i] (std::size_t& val) { val /= dim_i; });
if (inputAccumulation != nullptr) {
delete[] inputAccumulation;
}
inputAccumulation = outputAccumulation;
}
std::transform(inputAccumulation, inputAccumulation + outputElements, output,
[](auto value) { return castFromFloat<O>(value); });
if (outputAccumulation) {
delete[] outputAccumulation;
}
}
}
// Kernels registration to implementation entry point
REGISTRAR(ReduceMeanImpl_cpu,
{DataType::Float32},
{ProdConso::inPlaceModel, Aidge::ReduceMeanImpl_cpu_forward_kernel<float, float>, nullptr});
REGISTRAR(ReduceMeanImpl_cpu,
{DataType::Float64},
{ProdConso::inPlaceModel, Aidge::ReduceMeanImpl_cpu_forward_kernel<double, double>, nullptr});
REGISTRAR(ReduceMeanImpl_cpu,
{DataType::Int32},
{ProdConso::inPlaceModel, Aidge::ReduceMeanImpl_cpu_forward_kernel<int32_t, int32_t>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_REDUCEMEANIMPL_KERNELS_H_ */
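
A minimal sketch of the single-axis path of the ReduceMean forward kernel above (the include path is an assumption):

#include <iostream>
#include <vector>

#include "aidge/backend/cpu/operator/ReduceMeanImpl_kernels.hpp" // assumed install path of this header

int main() {
    // 2x3 input, reduced over axis 1: one mean per row.
    const std::vector<float> in{1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
    std::vector<float> out(2, 0.f);
    Aidge::ReduceMeanImpl_cpu_forward_kernel<float, float>(
        {1},    // axes
        0,      // keepDims (unused by the kernel)
        {2, 3}, // inputDims
        in.data(), out.data());
    std::cout << out[0] << ' ' << out[1] << '\n'; // prints: 2 5
    return 0;
}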
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_REDUCESUMIMPL_H_
#define AIDGE_CPU_OPERATOR_REDUCESUMIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/ReduceSum.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
// Operator implementation entry point for the backend
using ReduceSumImpl_cpu = OperatorImpl_cpu<ReduceSum_Op,
void(const std::vector<std::int32_t>&,
DimSize_t,
const std::vector<DimSize_t>&,
const void *,
void *)>;
// Implementation entry point registration to Operator
REGISTRAR(ReduceSum_Op, "cpu", Aidge::ReduceSumImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_REDUCESUMIMPL_H_ */
/********************************************************************************
* Copyright (c) 2024 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_REDUCESUMIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_REDUCESUMIMPL_KERNELS_H_
#include <algorithm> // std::for_each
#include <cstddef> // std::size_t
#include <cstdint> // std::int32_t
#include <functional> //std::multiplies
#include <numeric> //std::accumulate
#include <vector>
#include "aidge/backend/cpu/operator/ReduceSumImpl.hpp"
#include "aidge/data/Data.hpp"
#include "aidge/operator/ReduceSum.hpp"
#include "aidge/utils/Registrar.hpp"
namespace Aidge {
template <class I, class O>
void ReduceSumImpl_cpu_forward_kernel(const std::vector<std::int32_t>& axes,
DimSize_t /*keepDims*/,
const std::vector<DimSize_t>& inputDims,
const void* input_,
void* output_) {
const I* input = static_cast<const I*>(input_);
O* output = static_cast<O*>(output_);
const std::size_t nb_dims = inputDims.size();
const std::size_t totalElements = std::accumulate(inputDims.cbegin(), inputDims.cend(), 1, std::multiplies<std::size_t>());
if (axes.empty()){
std::copy_n(input,totalElements, output);
}
else if (axes.size() == 1) {
const std::size_t stride_pre = std::accumulate(inputDims.cbegin(), inputDims.cbegin() + axes[0], 1, std::multiplies<std::size_t>());
const std::size_t stride_post = std::accumulate(inputDims.crbegin(), inputDims.crbegin() + nb_dims -1 - axes[0], 1, std::multiplies<std::size_t>());
const std::size_t dim_i = inputDims[axes[0]];
for (std::size_t pre = 0; pre < stride_pre; ++pre) {
for (std::size_t post = 0; post < stride_post; ++post) {
const std::size_t idx_i = pre * dim_i * stride_post + post;
const std::size_t idx_o = pre * stride_post + post;
O sum = 0;
for (std::size_t i = 0; i < dim_i; ++i) {
sum +=input[idx_i + i*stride_post];
}
output[idx_o] = sum;
}
}
} else {
std::size_t outputElements = totalElements;
auto stride_post = std::unique_ptr<std::size_t[]>(new std::size_t[nb_dims]);
stride_post[nb_dims - 1] = 1;
for (std::size_t i = nb_dims-2; i != static_cast<std::size_t>(-1); --i) {
stride_post[i] = stride_post[i+1]*inputDims[i+1];
}
auto stride_pre = std::unique_ptr<std::size_t[]>(new std::size_t[nb_dims]);
stride_pre[0] = 1;
for (std::size_t i = 1; i < nb_dims; ++i) {
stride_pre[i] = stride_pre[i-1]*inputDims[i-1];
}
const I* inputAccumulation = input;
I* outputAccumulation = nullptr;
for (const auto& axisInt : axes) {
const std::size_t a = static_cast<std::size_t>(axisInt);
outputElements /= inputDims[a];
outputAccumulation = new I[outputElements];
const std::size_t dim_i = inputDims[a];
for (std::size_t pre = 0; pre < stride_pre[a]; ++pre) {
for (std::size_t post = 0; post < stride_post[a]; ++post) {
const std::size_t idx_i = pre * dim_i * stride_post[a] + post;
const std::size_t idx_o = pre * stride_post[a] + post;
I sum = 0;
for (std::size_t i = 0; i < dim_i; ++i) {
sum += inputAccumulation[idx_i + i*stride_post[a]];
}
outputAccumulation[idx_o] = sum;
}
}
std::for_each(stride_pre.get()+a+1, stride_pre.get()+nb_dims, [dim_i] (std::size_t& val) { val /= dim_i; });
if (inputAccumulation != input) {
delete[] inputAccumulation;
}
inputAccumulation = outputAccumulation;
}
// Copy the accumulated elements to the output
std::copy(inputAccumulation, inputAccumulation + outputElements, output);
if (outputAccumulation) {
delete[] outputAccumulation;
}
}
}
// Kernels registration to implementation entry point
REGISTRAR(ReduceSumImpl_cpu,
{DataType::Float32},
{ProdConso::inPlaceModel, Aidge::ReduceSumImpl_cpu_forward_kernel<float, float>, nullptr});
REGISTRAR(ReduceSumImpl_cpu,
{DataType::Float64},
{ProdConso::inPlaceModel, Aidge::ReduceSumImpl_cpu_forward_kernel<double, double>, nullptr});
REGISTRAR(ReduceSumImpl_cpu,
{DataType::Int32},
{ProdConso::inPlaceModel, Aidge::ReduceSumImpl_cpu_forward_kernel<int32_t, int32_t>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_REDUCESUMIMPL_KERNELS_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_RESIZEIMPL_H_
#define AIDGE_CPU_OPERATOR_RESIZEIMPL_H_
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Resize.hpp"
#include "aidge/utils/Registrar.hpp"
#include <aidge/data/Interpolation.hpp>
#include <aidge/operator/Pad.hpp>
#include <cstdint>
namespace Aidge {
// Operator implementation entry point for the backend
using ResizeImpl_cpu = OperatorImpl_cpu<
Resize_Op,
void(const void *, // input
const std::vector<DimSize_t> &, // INput dims
const std::vector<DimSize_t> &, // OUTput dims
const Interpolation::CoordinateTransformation, // coord transfo
const Interpolation::Mode, // interpolation mode
const PadBorderType, // padding mode
void *)>; // output
// Implementation entry point registration to Operator
REGISTRAR(Resize_Op, "cpu", Aidge::ResizeImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_RESIZEIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_RESIZEIMPL_FORWARD_KERNEL_H_
#define AIDGE_CPU_OPERATOR_RESIZEIMPL_FORWARD_KERNEL_H_
#include "aidge/backend/cpu/operator/ResizeImpl.hpp"
#include <aidge/data/Data.hpp>
#include <aidge/data/half.hpp>
#include <aidge/operator/Pad.hpp>
#include <cmath>
#include <cstdint>
#include <numeric>
#include "aidge/backend/cpu/data/Interpolation.hpp"
#include "aidge/data/Interpolation.hpp"
#include "aidge/data/Tensor.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
template <typename IO>
void ResizeImpl_cpu_forward_kernel(
const void *input_,
const std::vector<DimSize_t> &inputDims,
const std::vector<DimSize_t> &outputDims,
const Interpolation::CoordinateTransformation coordTransfoMode,
const Interpolation::Mode interpMode,
const PadBorderType paddingMode,
// const double * /*roi*/,
// const float * /*scales*/,
// const int64_t * /*sizes*/,
void *output_) {
// Set data pointers
const IO *input = static_cast<const IO *>(input_);
IO *output = static_cast<IO *>(output_);
const DimSize_t outputLen = std::accumulate(outputDims.cbegin(),
outputDims.cend(),
1,
std::multiplies<DimSize_t>());
std::vector<float> coordInApprox(inputDims.size());
std::vector<std::size_t> coordIn(inputDims.size());
std::vector<DimSize_t> coordOut;
for (DimSize_t idxFlatOut = 0; idxFlatOut < outputLen; ++idxFlatOut) {
coordOut = Tensor::toCoord(outputDims, idxFlatOut);
coordInApprox =
Interpolation::untransformCoordinates(coordOut,
inputDims,
outputDims,
coordTransfoMode);
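// Nearest-neighbour modes: Ceil/Floor round the source coordinate up/down;
// RoundPreferCeil uses floor(x + 0.5) so that halves round up, while
// RoundPreferFloor uses ceil(x - 0.5) so that halves round down.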
if ((interpMode == Interpolation::Mode::Ceil) || (interpMode == Interpolation::Mode::Floor) || (interpMode == Interpolation::Mode::RoundPreferCeil) || (interpMode == Interpolation::Mode::RoundPreferFloor)) {
for (std::size_t i = 0; i < coordInApprox.size(); ++i) {
if (interpMode == Interpolation::Mode::Ceil) {
coordInApprox[i] = std::ceil(coordInApprox[i]);
} else if (interpMode == Interpolation::Mode::Floor) {
coordInApprox[i] = std::floor(coordInApprox[i]);
} else if (interpMode == Interpolation::Mode::RoundPreferCeil) {
coordInApprox[i] = std::floor(coordInApprox[i] + 0.5f);
} else { // (interpMode == Interpolation::Mode::RoundPreferFloor)
coordInApprox[i] = std::ceil(coordInApprox[i] - 0.5f);
}
}
if (Tensor::isInBounds<float>(inputDims, coordInApprox)) {
for (std::size_t i = 0; i < coordInApprox.size(); ++i) {
coordIn[i] = static_cast<std::size_t>(coordInApprox[i]);
}
} else {
if (paddingMode == PadBorderType::Edge) {
for (std::size_t i = 0; i < coordInApprox.size(); ++i) {
coordIn[i] = coordInApprox[i] < 0 ? 0 : (coordInApprox[i] >=inputDims[i] ? inputDims[i] - 1 : static_cast<std::size_t>(coordInApprox[i]));
}
} else {
AIDGE_THROW_OR_ABORT(std::runtime_error, "Padding mode not supported");
}
}
output[idxFlatOut] = input[Tensor::toIndex(inputDims, coordIn)];
} else {
std::set<Interpolation::Point<IO>> neighbours =
InterpolationCPU::retrieveNeighbours(input,
inputDims,
coordInApprox,
paddingMode);
output[idxFlatOut] = InterpolationCPU::interpolate(coordInApprox,
neighbours,
interpMode);
}
}
return;
}
// Kernels registration to implementation entry point
REGISTRAR(ResizeImpl_cpu,
{{{DataType::Int16},
{DataType::Any},
{DataType::Any},
{DataType::Any}},
{DataType::Int16}},
{ProdConso::inPlaceModel,
ResizeImpl_cpu_forward_kernel<int16_t>,
nullptr});
REGISTRAR(ResizeImpl_cpu,
{{{DataType::Int32},
{DataType::Any},
{DataType::Any},
{DataType::Any}},
{DataType::Int32}},
{ProdConso::inPlaceModel,
ResizeImpl_cpu_forward_kernel<int32_t>,
nullptr});
REGISTRAR(ResizeImpl_cpu,
{{{DataType::Int64},
{DataType::Any},
{DataType::Any},
{DataType::Any}},
{DataType::UInt64}},
{ProdConso::inPlaceModel,
ResizeImpl_cpu_forward_kernel<int64_t>,
nullptr});
REGISTRAR(ResizeImpl_cpu,
{{{DataType::Float16},
{DataType::Any},
{DataType::Any},
{DataType::Any}},
{DataType::Float16}},
{ProdConso::inPlaceModel,
ResizeImpl_cpu_forward_kernel<half_float::half>,
nullptr});
REGISTRAR(ResizeImpl_cpu,
{{{DataType::Float32},
{DataType::Any},
{DataType::Any},
{DataType::Any}},
{DataType::Float32}},
{ProdConso::inPlaceModel,
ResizeImpl_cpu_forward_kernel<float>,
nullptr});
REGISTRAR(ResizeImpl_cpu,
{{{DataType::Float64},
{DataType::Any},
{DataType::Any},
{DataType::Any}},
{DataType::Float64}},
{ProdConso::inPlaceModel,
ResizeImpl_cpu_forward_kernel<double>,
nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_RESIZEIMPL_FORWARD_KERNEL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_ROUNDIMPL_H_
#define AIDGE_CPU_OPERATOR_ROUNDIMPL_H_
#include <cstddef> // std::size_t
#include <memory>
#include <tuple>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Round.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
// Operator implementation entry point for the backend
using RoundImpl_cpu = OperatorImpl_cpu<Round_Op,
void(const std::size_t, const void*, void*)>;
// Implementation entry point registration to Operator
REGISTRAR(Round_Op, "cpu", Aidge::RoundImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_ROUNDIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_ROUNDIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_ROUNDIMPL_KERNELS_H_
#include <cmath> //std::round
#include <cstddef> // std::size_t
#include "aidge/utils/Registrar.hpp"
#include "aidge/backend/cpu/operator/RoundImpl.hpp"
namespace Aidge {
template <class I, class O>
void RoundImpl_cpu_forward_kernel(const std::size_t inputLength,
                                  const void* input_,
                                  void* output_) {
    const I* input = static_cast<const I*>(input_);
    O* output = static_cast<O*>(output_);
    for (std::size_t i = 0; i < inputLength; ++i) {
        // std::round is not used here: it rounds halves away from zero, whereas
        // ONNX Round requires round-half-to-even, which std::nearbyint provides
        // in the default rounding mode.
        output[i] = static_cast<O>(std::nearbyint(input[i]));
    }
}
REGISTRAR(RoundImpl_cpu,
    {DataType::Float32},
    {ProdConso::inPlaceModel, Aidge::RoundImpl_cpu_forward_kernel<float, float>, nullptr});
REGISTRAR(RoundImpl_cpu,
    {DataType::Float64},
    {ProdConso::inPlaceModel, Aidge::RoundImpl_cpu_forward_kernel<double, double>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_ROUNDIMPL_KERNELS_H_ */
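// --- Illustrative example (not part of the Aidge sources) ---
// Why the Round kernel above uses std::nearbyint: in the default
// FE_TONEAREST rounding mode it rounds halves to the nearest even value,
// matching ONNX Round, whereas std::round always rounds halves away from
// zero. A minimal check:
#include <cmath>
#include <iostream>

int main() {
    for (float x : {0.5f, 1.5f, 2.5f, -0.5f, -1.5f}) {
        std::cout << x << ": nearbyint=" << std::nearbyint(x)
                  << " round=" << std::round(x) << '\n';
    }
    // nearbyint: 0 2 2 -0 -2   (half to even)
    // round:     1 2 3 -1 -2   (half away from zero)
    return 0;
}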
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_SCALINGIMPL_H_
#define AIDGE_CPU_OPERATOR_SCALINGIMPL_H_
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Scaling.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include <memory>
#include <vector>
#include <array>
namespace Aidge {
// Operator implementation entry point for the backend
using ScalingImpl_cpu = OperatorImpl_cpu<Scaling_Op,
void(const float,
const std::size_t,
const bool,
std::size_t,
const void*,
void*)>;
// Implementation entry point registration to Operator
REGISTRAR(Scaling_Op, "cpu", Aidge::ScalingImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_SCALINGIMPL_H_ */
\ No newline at end of file
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_SCALINGIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_SCALINGIMPL_KERNELS_H_
#include <cassert>
#include <cmath>   // std::round
#include <cstddef> // std::size_t
#include <cstdint> // std::int32_t
#include "aidge/utils/Registrar.hpp"
#include "aidge/backend/cpu/operator/ScalingImpl.hpp"
// TODO: improve this propagate kernel; the N2D2 reference implementation is kept below for comparison:
/*
template<typename T>
void N2D2::floatingPointScaling_propagate(const Tensor<T>& input, Tensor<T>& output,
std::size_t batchSize, std::size_t nbChannels,
std::size_t height, std::size_t width,
bool isClipped,
const std::vector<Float_T>& clippingFactorPerChannel,
const std::vector<Float_T>& scalingFactorPerChannel,
std::size_t quantizedNbBits, bool isOutputUnsigned)
{
std::size_t index = 0;
for (std::size_t batch = 0; batch < batchSize; batch++) {
for(std::size_t ch = 0; ch < nbChannels; ch++) {
for(std::size_t y = 0; y < height; y++) {
for(std::size_t x = 0; x < width; x++) {
T res = isClipped ? Clip(input(index), clippingFactorPerChannel[ch])
: input(index);
res = Scale(res, scalingFactorPerChannel[ch]);
if(quantizedNbBits > 0) {
res = saturate(std::round(res), quantizedNbBits, isOutputUnsigned);
}
output(index) = (T) res;
index++;
}
}
}
}
}
*/
namespace Aidge {
template <class O>
const O& clamp(const O& x, const O& min, const O& max)
{
return (x < min) ? min : (x > max) ? max : x;
}
template<class O>
O saturate(const O value, const std::size_t quantizedNbBits, const bool isOutputUnsigned) {
// TODO: no assertions in kernel
assert(quantizedNbBits > 0);
const O min = isOutputUnsigned ? 0 :
-(1ll << (quantizedNbBits - 1ll));
const O max = isOutputUnsigned ? (1ll << quantizedNbBits) - 1ll :
(1ll << (quantizedNbBits - 1ll)) - 1ll;
return clamp(value, min, max);
}
template <class I, class O>
void ScalingImpl_cpu_forward_kernel(const float scalingFactor,
                                    const std::size_t quantizedNbBits,
                                    const bool isOutputUnsigned,
                                    const std::size_t inputLength,
                                    const void* input_,
                                    void* output_) {
    const I* input = static_cast<const I*>(input_);
    O* output = static_cast<O*>(output_);
    for (std::size_t i = 0; i < inputLength; ++i) {
        output[i] = static_cast<O>(input[i] * static_cast<I>(scalingFactor));
        if (quantizedNbBits > 0) {
            output[i] = saturate(std::round(output[i]), quantizedNbBits, isOutputUnsigned);
        }
    }
}
// Kernels registration to implementation entry point
REGISTRAR(ScalingImpl_cpu,
{DataType::Float32},
{ProdConso::inPlaceModel, Aidge::ScalingImpl_cpu_forward_kernel<float, float>, nullptr});
REGISTRAR(ScalingImpl_cpu,
{DataType::Float64},
{ProdConso::inPlaceModel, Aidge::ScalingImpl_cpu_forward_kernel<double, double>, nullptr});
REGISTRAR(ScalingImpl_cpu,
{DataType::Int32},
{ProdConso::inPlaceModel, Aidge::ScalingImpl_cpu_forward_kernel<int32_t, int32_t>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_SCALINGIMPL_KERNELS_H_ */
\ No newline at end of file
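// --- Illustrative example (not part of the Aidge sources) ---
// Numeric ranges produced by the saturate() helper of the Scaling kernel
// above, for 8-bit quantization: signed outputs clamp to [-128, 127] and
// unsigned outputs to [0, 255]. saturateDemo is a standalone copy of that
// logic, reproduced here only for demonstration.
#include <cstddef>
#include <iostream>

static long long saturateDemo(long long value, std::size_t nbBits, bool isUnsigned) {
    const long long min = isUnsigned ? 0 : -(1ll << (nbBits - 1));
    const long long max = isUnsigned ? (1ll << nbBits) - 1 : (1ll << (nbBits - 1)) - 1;
    return value < min ? min : (value > max ? max : value);
}

int main() {
    std::cout << saturateDemo(300, 8, false) << '\n';  // 127
    std::cout << saturateDemo(300, 8, true) << '\n';   // 255
    std::cout << saturateDemo(-200, 8, false) << '\n'; // -128
    std::cout << saturateDemo(-200, 8, true) << '\n';  // 0
    return 0;
}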
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_SIGMOIDIMPL_H_
#define AIDGE_CPU_OPERATOR_SIGMOIDIMPL_H_
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Sigmoid.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include <memory>
#include <vector>
namespace Aidge {
// Operator implementation entry point for the backend
using SigmoidImpl_cpu = OperatorImpl_cpu<Sigmoid_Op,
void(const std::size_t, const void*, void*),
void(const std::size_t, const void*, const void*, void*)>;
// Implementation entry point registration to Operator
REGISTRAR(Sigmoid_Op, "cpu", Aidge::SigmoidImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_SIGMOIDIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_SIGMOIDIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_SIGMOIDIMPL_KERNELS_H_
#include "aidge/utils/Registrar.hpp"
#include "aidge/backend/cpu/operator/SigmoidImpl.hpp"
namespace Aidge {
template <class I, class O>
void SigmoidImpl_cpu_forward_kernel(std::size_t inputLength,
                                    const void* input_,
                                    void* output_) {
    const I* input = static_cast<const I*>(input_);
    O* output = static_cast<O*>(output_);
    //#pragma omp parallel for if (inputLength > 1024)
    for (std::size_t i = 0; i < inputLength; ++i) {
        if (input[i] > I(0)) {
            output[i] = O(1) / (O(1) + std::exp(-input[i]));
        } else {
            output[i] = std::exp(input[i]) / (O(1) + std::exp(input[i]));
        }
    }
}
template <class O, class GI, class GO>
void SigmoidImpl_cpu_backward_kernel(const std::size_t inputLength,
                                     const void* output_, const void* grad_output_,
                                     void* grad_input_) {
    const O* output = static_cast<const O*>(output_);
    const GO* grad_output = static_cast<const GO*>(grad_output_);
    GI* grad_input = static_cast<GI*>(grad_input_);
    for (std::size_t i = 0; i < inputLength; ++i) {
        grad_input[i] = output[i] * (O(1) - output[i]) * grad_output[i];
    }
}
// Kernels registration to implementation entry point
REGISTRAR(SigmoidImpl_cpu,
{DataType::Float32},
{ProdConso::inPlaceModel, Aidge::SigmoidImpl_cpu_forward_kernel<float, float>, Aidge::SigmoidImpl_cpu_backward_kernel<float, float, float>});
REGISTRAR(SigmoidImpl_cpu,
{DataType::Float64},
{ProdConso::inPlaceModel, Aidge::SigmoidImpl_cpu_forward_kernel<double, double>, Aidge::SigmoidImpl_cpu_backward_kernel<double, double, double>});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_SIGMOIDIMPL_KERNELS_H_ */
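// --- Illustrative example (not part of the Aidge sources) ---
// The two-branch form used by the Sigmoid forward kernel above keeps the
// argument of std::exp non-positive, so it never overflows (a naive
// exp(x) / (1 + exp(x)) would give inf/inf = NaN for large positive x).
// The backward kernel reuses the forward output since d(sigmoid)/dx = y*(1-y).
#include <cmath>
#include <iostream>

static double stableSigmoid(double x) {
    return x > 0.0 ? 1.0 / (1.0 + std::exp(-x))
                   : std::exp(x) / (1.0 + std::exp(x));
}

int main() {
    const double y  = stableSigmoid(1000.0);     // ~1.0, no overflow
    const double dy = y * (1.0 - y);             // gradient factor, ~0.0
    std::cout << y << ' ' << dy << '\n';
    std::cout << stableSigmoid(-1000.0) << '\n'; // ~0.0, no overflow either
    return 0;
}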
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_SLICEIMPL_H_
#define AIDGE_CPU_OPERATOR_SLICEIMPL_H_
#include <memory>
#include <vector>
#include <array>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Slice.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
namespace Aidge {
// Operator implementation entry point for the backend
using SliceImpl_cpu = OperatorImpl_cpu<Slice_Op,
void(const std::vector<std::int64_t>&,
const std::vector<std::int64_t>&,
const std::vector<std::int8_t>&,
const std::vector<std::int64_t>&,
const std::vector<DimSize_t>&,
const void*,
void*)>;
// Implementation entry point registration to Operator
REGISTRAR(Slice_Op, "cpu", Aidge::SliceImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_SLICEIMPL_H_ */
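// --- Illustrative example (not part of the Aidge sources) ---
// The Slice kernel signature above takes four integer vectors plus the input
// dimensions; assuming they follow ONNX Slice semantics (starts, ends, axes,
// steps), selecting starts=1, ends=7, steps=2 on an axis of size 8 yields
// indices 1, 3, 5. This parameter interpretation is an assumption made only
// for this demonstration.
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    const std::int64_t start = 1, end = 7, step = 2;
    std::vector<std::int64_t> picked;
    for (std::int64_t i = start; i < end; i += step) { picked.push_back(i); }
    for (std::int64_t i : picked) { std::cout << i << ' '; } // 1 3 5
    std::cout << '\n';
    return 0;
}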