Compare revisions

Showing with 2212 additions and 0 deletions
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_H_
#define AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_H_
#include <cstddef>
#include <memory>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/ConstantOfShape.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
// Operator implementation entry point for the backend
using ConstantOfShapeImpl_cpu = OperatorImpl_cpu<ConstantOfShape_Op,
void(const std::vector<DimSize_t>, const Tensor&, void *)>;
// Implementation entry point registration to Operator
REGISTRAR(ConstantOfShape_Op, "cpu", Aidge::ConstantOfShapeImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_KERNELS_H_
#include <aidge/data/Tensor.hpp>
#include <aidge/data/half.hpp>
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <functional> // std::multiplies
#include <numeric> // std::accumulate
#include <vector>
#include "aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp"
#include "aidge/data/Data.hpp"
#include "aidge/utils/ErrorHandling.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
template <class O>
void ConstantOfShapeimpl_cpu_forward_kernel(
const std::vector<DimSize_t> output_dims, const Tensor &value,
void *output_) {
O *output = static_cast<O *>(output_);
// read the single fill value from the value input Tensor
const O val = *static_cast<const O *>(value.getImpl()->hostPtr());
const std::size_t output_size = std::accumulate(
output_dims.begin(), output_dims.end(), static_cast<std::size_t>(1),
std::multiplies<DimSize_t>());
for (size_t i = 0; i < output_size; ++i) {
output[i] = val;
}
}
// Kernels registration to implementation entry point
REGISTRAR(ConstantOfShapeImpl_cpu,
{ImplSpec::IOSpec{DataType::Int64}, ImplSpec::IOSpec{DataType::Float16}},
{ProdConso::defaultModel, Aidge::ConstantOfShapeimpl_cpu_forward_kernel<half_float::half>, nullptr});
REGISTRAR(ConstantOfShapeImpl_cpu,
{ImplSpec::IOSpec{DataType::Int64}, ImplSpec::IOSpec{DataType::Float32}},
{ProdConso::defaultModel, Aidge::ConstantOfShapeimpl_cpu_forward_kernel<float>, nullptr});
REGISTRAR(ConstantOfShapeImpl_cpu,
{ImplSpec::IOSpec{DataType::Int64}, ImplSpec::IOSpec{DataType::Float64}},
{ProdConso::defaultModel, Aidge::ConstantOfShapeimpl_cpu_forward_kernel<double>, nullptr});
REGISTRAR(ConstantOfShapeImpl_cpu,
{ImplSpec::IOSpec{DataType::Int64}, ImplSpec::IOSpec{DataType::Int16}},
{ProdConso::defaultModel, Aidge::ConstantOfShapeimpl_cpu_forward_kernel<std::int16_t>, nullptr});
REGISTRAR(ConstantOfShapeImpl_cpu,
{ImplSpec::IOSpec{DataType::Int64}, ImplSpec::IOSpec{DataType::Int32}},
{ProdConso::defaultModel, Aidge::ConstantOfShapeimpl_cpu_forward_kernel<std::int32_t>, nullptr});
REGISTRAR(ConstantOfShapeImpl_cpu,
{ImplSpec::IOSpec{DataType::Int64}, ImplSpec::IOSpec{DataType::Int64}},
{ProdConso::defaultModel, Aidge::ConstantOfShapeimpl_cpu_forward_kernel<std::int64_t>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_KERNELS_H_ */
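As a sanity check, a minimal standalone sketch (hypothetical shape and fill value, plain C++ only, no Aidge types) of what the kernel above computes: the element count is the product of the requested dimensions, and every element receives the single fill value.

#include <cstddef>
#include <functional>
#include <numeric>
#include <vector>

int main() {
    const std::vector<std::size_t> output_dims{2, 3, 4}; // requested output shape
    const float fill_value = 0.5f;                       // the one value read from the value Tensor
    // total number of elements = 2 * 3 * 4 = 24
    const std::size_t output_size = std::accumulate(
        output_dims.begin(), output_dims.end(), static_cast<std::size_t>(1),
        std::multiplies<std::size_t>());
    std::vector<float> output(output_size, fill_value);  // every element set to the fill value
    return 0;
}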
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_CONVDEPTHWISEIMPL_H_
#define AIDGE_CPU_OPERATOR_CONVDEPTHWISEIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/ConvDepthWise.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
namespace Aidge {
// Operator implementation entry point for the backend
using ConvDepthWise1D_Op = ConvDepthWise_Op<1>;
using ConvDepthWiseImpl1D_cpu = OperatorImpl_cpu<ConvDepthWise_Op<1>,
void(const std::array<DimSize_t, 1>&,
const std::array<DimSize_t, 1>&,
const std::array<DimSize_t, 1>&,
const std::array<DimSize_t, 3>&,
const void *,
const void *,
const void *,
void *)>;
using ConvDepthWise2D_Op = ConvDepthWise_Op<2>;
using ConvDepthWiseImpl2D_cpu = OperatorImpl_cpu<ConvDepthWise_Op<2>,
void(const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 4> &,
const void *,
const void *,
const void *,
void *)>;
// Implementation entry point registration to Operator
REGISTRAR(ConvDepthWise1D_Op, "cpu", Aidge::ConvDepthWiseImpl1D_cpu::create);
REGISTRAR(ConvDepthWise2D_Op, "cpu", Aidge::ConvDepthWiseImpl2D_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_CONVDEPTHWISEIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_CONVDEPTHWISEIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_CONVDEPTHWISEIMPL_KERNELS_H_
#include <algorithm>
#include <array>
#include <cmath>
#include <cstddef>
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/backend/cpu/operator/ConvDepthWiseImpl.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
/**
* @brief Forward kernel for 1D depthwise convolution (ConvDepthWise) on CPU backend.
* @tparam I Input data type.
* @tparam W Weight data type.
* @tparam B Bias data type.
* @tparam O Output data type.
* @param strideDims Array of stride dimensions.
* @param dilationDims Array of dilation dimensions.
* @param kernelDims Array of kernel dimensions.
* @param inputDims Array of input dimensions.
* @param input_ const input Tensor.
* @param weights_ const weight Tensor.
* @param biases_ const bias Tensor.
* @param output_ Output Tensor.
*/
template <class I, class W, class B, class O>
void ConvDepthWiseImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& strideDims,
const std::array<DimSize_t, 1>& dilationDims,
const std::array<DimSize_t, 1>& kernelDims,
const std::array<DimSize_t, 3>& inputDims,
const void *input_,
const void *weights_,
const void *biases_,
void *output_) {
// FIXME: missing convolution attributes as arguments
const I *input = static_cast<const I *>(input_);
const W *weights = static_cast<const W *>(weights_);
const B *biases = static_cast<const B *>(biases_);
O *output = static_cast<O *>(output_);
// output H size
const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1;
const std::size_t oxSize =
static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilated_kernel_x + strideDims[0]) /
static_cast<float>(strideDims[0])));
// TODO: kernel computation
// output (batch, outCh, Xout, Yout)
// input (batch, ch, Xin, Yin)
// weight (outCh, ch, kernelX, kernelY)
using signedsize = std::make_signed<std::size_t>::type;
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
const std::size_t oIndex = (ch + batch*inputDims[1]) * oxSize;
B biasVal = (biases != nullptr) ? biases[ch] : B(0);
std::fill(output + oIndex, output+(oIndex+oxSize), biasVal);
const std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2];
const std::size_t wIndex = ch * kernelDims[0];
for (std::size_t ox = 0; ox < oxSize; ++ox) {
// const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]);
// const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
// const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? kernelDims[0] : inputDims[2] + difx);
const std::size_t sxMin = 0;
const std::size_t sxMax = dilated_kernel_x;
const std::size_t oIndexFull = oIndex + ox;
const signedsize ix = static_cast<signedsize>(ox * strideDims[0]);
for (std::size_t sx = sxMin; sx*dilationDims[0] < sxMax; ++sx) {
output[oIndexFull] += weights[wIndex + sx] *
input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx*dilationDims[0]))];
}
}
}
}
}
// Kernels registration to implementation entry point
REGISTRAR(ConvDepthWiseImpl1D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvDepthWiseImpl1D_cpu_forward_kernel<float, float, float, float>, nullptr});
REGISTRAR(ConvDepthWiseImpl1D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvDepthWiseImpl1D_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t, std::int32_t>, nullptr});
REGISTRAR(ConvDepthWiseImpl1D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvDepthWiseImpl1D_cpu_forward_kernel<double, double, double, double>, nullptr});
/**
* @brief Forward kernel for 2D depthwise convolution (ConvDepthWise) on CPU backend.
* @tparam I Input data type.
* @tparam W Weight data type.
* @tparam B Bias data type.
* @tparam O Output data type.
* @param strideDims Array of stride dimensions.
* @param dilationDims Array of dilation dimensions.
* @param kernelDims Array of kernel dimensions.
* @param inputDims Array of input dimensions.
* @param input_ const input Tensor.
* @param weights_ const weight Tensor.
* @param biases_ const bias Tensor.
* @param output_ Output Tensor.
*/
template <class I, class W, class B, class O>
void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
const std::array<DimSize_t, 2>& dilationDims,
const std::array<DimSize_t, 2>& kernelDims,
const std::array<DimSize_t, 4>& inputDims,
const void *input_,
const void *weights_,
const void *biases_,
void *output_)
{
// FIXME: missing convolution attributes as arguments
const I *input = static_cast<const I *>(input_);
const W *weights = static_cast<const W *>(weights_);
const B *biases = static_cast<const B *>(biases_);
O *output = static_cast<O *>(output_);
// output H size
const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1;
const std::size_t oxSize =
static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilated_kernel_x + strideDims[0]) /
static_cast<float>(strideDims[0])));
// output W size
const DimSize_t dilated_kernel_y = dilationDims[1]*(kernelDims[1] - 1) + 1;
const std::size_t oySize =
static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[3] - dilated_kernel_y + strideDims[1]) /
static_cast<float>(strideDims[1])));
// TODO: kernel computation
// output (batch, outCh, Xout, Yout)
// input (batch, ch, Xin, Yin)
// weight (outCh, ch, kernelX, kernelY)
const std::size_t outChannels_s = oxSize * oySize;
if (kernelDims[0] == 3 && kernelDims[1] == 3 && dilationDims[0] == 1 && dilationDims[1] == 1) {
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
B biasVal = (biases != nullptr) ? biases[ch] : B(0);
std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
const std::size_t wIndex = ch * 9;
if (strideDims[0] == 1 && strideDims[1]==1) {
for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=inputDims[3]) {
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] = biasVal + weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+1]+weights[wIndex+2]*input[iIndex+oy+2];
}
iIndex+=inputDims[3];
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy]+weights[wIndex+4]*input[iIndex+oy+1]+weights[wIndex+5]*input[iIndex+oy+2];
}
iIndex+=inputDims[3];
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy]+weights[wIndex+7]*input[iIndex+oy+1]+weights[wIndex+8]*input[iIndex+oy+2];
}
}
} else {
for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=(strideDims[0]-2)*inputDims[3]) {
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] = biasVal + weights[wIndex+0]*input[iIndex+oy*strideDims[1]]+weights[wIndex+1]*input[iIndex+oy*strideDims[1]+1]+weights[wIndex+2]*input[iIndex+oy*strideDims[1]+2];
}
iIndex+=inputDims[3];
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy*strideDims[1]]+weights[wIndex+4]*input[iIndex+oy*strideDims[1]+1]+weights[wIndex+5]*input[iIndex+oy*strideDims[1]+2];
}
iIndex+=inputDims[3];
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy*strideDims[1]]+weights[wIndex+7]*input[iIndex+oy*strideDims[1]+1]+weights[wIndex+8]*input[iIndex+oy*strideDims[1]+2];
}
}
}
output += outChannels_s;
}
}
} else if (dilated_kernel_x == 1 && dilated_kernel_y == 1) {
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
B biasVal = (biases != nullptr) ? biases[ch] : B(0);
std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
const std::size_t wIndex = ch;
if (strideDims[0] == 1 && strideDims[1] == 1) {
for (std::size_t i = iIndex; i < iIndex + oxSize*oySize; ++i) {
output[i] = biasVal + weights[wIndex] * input[i];
}
} else {
std::size_t oIndex = (ch + batch*inputDims[1]) * oxSize * oySize;
for (std::size_t ox = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=strideDims[0]*inputDims[3]) {
for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) {
output[oIndex + oy] = biasVal + weights[wIndex]*input[iIndex+iy];
}
}
}
}
}
} else {
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
B biasVal = (biases != nullptr) ? biases[ch] : B(0);
std::fill(output, output+outChannels_s, biasVal);
const std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
const std::size_t wIndex = ch * kernelDims[0] * kernelDims[1];
for (std::size_t ox = 0; ox < oxSize; ++ox) {
for (std::size_t oy = 0; oy < oySize; ++oy) {
const std::size_t oIndexFull = ox*oySize + oy;
const std::size_t ix = ox * strideDims[0];
const std::size_t iy = oy * strideDims[1];
for (std::size_t kx = 0; kx*dilationDims[0] < dilated_kernel_x; ++kx) {
for (std::size_t ky = 0; ky*dilationDims[1] < dilated_kernel_y; ++ky) {
output[oIndexFull] += weights[wIndex + kx*kernelDims[1] + ky] *
input[iIndex + (ix + kx*dilationDims[0])*inputDims[3] + (iy + ky*dilationDims[1])];
}
}
}
}
output += outChannels_s;
}
}
}
}
// Kernels registration to implementation entry point
REGISTRAR(ConvDepthWiseImpl2D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvDepthWiseImpl2D_cpu_forward_kernel<float, float, float, float>, nullptr});
REGISTRAR(ConvDepthWiseImpl2D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvDepthWiseImpl2D_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t, std::int32_t>, nullptr});
REGISTRAR(ConvDepthWiseImpl2D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvDepthWiseImpl2D_cpu_forward_kernel<double, double, double, double>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_CONVDEPTHWISEIMPL_KERNELS_H_ */
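Both kernels above derive the output spatial size from the no-padding formula oxSize = floor((in - dilated_kernel + stride) / stride), with dilated_kernel = dilation * (kernel - 1) + 1. A small standalone check with made-up dimensions:

#include <cstddef>

// No-padding output size, as computed in the kernels above.
std::size_t convOutputSize(std::size_t in, std::size_t kernel,
                           std::size_t stride, std::size_t dilation) {
    const std::size_t dilated = dilation * (kernel - 1) + 1;
    return (in - dilated + stride) / stride; // integer division is floor here
}

// e.g. in=7, kernel=3, stride=2, dilation=1 -> dilated=3, (7-3+2)/2 = 3 outputs
// e.g. in=7, kernel=3, stride=1, dilation=2 -> dilated=5, (7-5+1)/1 = 3 outputs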
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_CONVIMPL_H_
#define AIDGE_CPU_OPERATOR_CONVIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Conv.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
namespace Aidge {
// Operator implementation entry point for the backend
using Conv1D_Op = Conv_Op<1>;
using ConvImpl1D_cpu = OperatorImpl_cpu<Conv_Op<1>,
void(const std::array<DimSize_t, 1>&,
const std::array<DimSize_t, 1>&,
const std::array<DimSize_t, 1>&,
const std::array<DimSize_t, 3> &,
DimSize_t,
const void *,
const void *,
const void *,
void *)>;
using Conv2D_Op = Conv_Op<2>;
using ConvImpl2D_cpu = OperatorImpl_cpu<Conv_Op<2>,
void(const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 4> &,
DimSize_t,
const void *,
const void *,
const void *,
void *)>;
// Implementation entry point registration to Operator
REGISTRAR(Conv1D_Op, "cpu", Aidge::ConvImpl1D_cpu::create);
REGISTRAR(Conv2D_Op, "cpu", Aidge::ConvImpl2D_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_CONVIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_CONVIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_CONVIMPL_KERNELS_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/backend/cpu/operator/ConvImpl.hpp"
#include "aidge/operator/Conv.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
namespace Aidge {
/**
* @brief Forward kernel for 1D Convolution on CPU backend.
* @tparam I Input data type.
* @tparam W Weight data type.
* @tparam B Bias data type.
* @tparam O Output data type.
* @param strideDims Array of stride dimensions.
* @param dilationDims Array of dilation dimensions.
* @param kernelDims Array of kernel dimensions.
* @param inputDims Array of input dimensions.
* @param outChannels Number of output channels.
* @param input_ const input Tensor.
* @param weights_ const weight Tensor.
* @param biases_ const bias Tensor.
* @param output_ Output Tensor.
*/
template <class I, class W, class B, class O>
void ConvImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& strideDims,
const std::array<DimSize_t, 1>& dilationDims,
const std::array<DimSize_t, 1>& kernelDims,
const std::array<DimSize_t, 3>& inputDims,
DimSize_t outChannels,
const void *input_,
const void *weights_,
const void *biases_,
void *output_)
{
// FIXME: missing convolution attributes as arguments
const I *input = static_cast<const I *>(input_);
const W *weights = static_cast<const W *>(weights_);
const B *biases = static_cast<const B *>(biases_);
O *output = static_cast<O *>(output_);
// output H size
const std::size_t oxSize =
static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilationDims[0]*(kernelDims[0] - 1) - 1 + strideDims[0]) /
static_cast<float>(strideDims[0])));
const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1;
// TODO: kernel computation
// output (batch, outCh, Xout, Yout)
// input (batch, inCh, Xin, Yin)
// weight (outCh, inCh, kernelX, kernelY)
using signedsize = std::make_signed<std::size_t>::type;
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
const std::size_t oIndex = (outCh + batch*outChannels) * oxSize;
// If bias = nullptr, set B(0)
B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
std::fill(output + oIndex, output+(oIndex+oxSize), biasVal);
for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
const std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2];
const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0];
for (std::size_t ox = 0; ox < oxSize; ++ox) {
// const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]);
// const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
// const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? kernelDims[0] : inputDims[2] + difx);
const std::size_t sxMin = 0;
const std::size_t sxMax = dilated_kernel_x;
const std::size_t oIndexFull = oIndex + ox;
const signedsize ix = static_cast<signedsize>(ox * strideDims[0]);
for (std::size_t sx = sxMin; sx*dilationDims[0] < sxMax; ++sx) {
output[oIndexFull] += weights[wIndex + sx] *
input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx*dilationDims[0]))];
}
}
}
}
}
}
// Kernels registration to implementation entry point
REGISTRAR(ConvImpl1D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvImpl1D_cpu_forward_kernel<float, float, float, float>, nullptr});
REGISTRAR(ConvImpl1D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float16, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvImpl1D_cpu_forward_kernel<half_float::half, half_float::half, half_float::half, half_float::half>, nullptr});
REGISTRAR(ConvImpl1D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvImpl1D_cpu_forward_kernel<int32_t, int32_t, int32_t, int32_t>, nullptr});
REGISTRAR(ConvImpl1D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvImpl1D_cpu_forward_kernel<double, double, double, double>, nullptr});
/**
* @brief Forward kernel for 2D Convolution on CPU backend.
* @tparam I Input data type.
* @tparam W Weight data type.
* @tparam B Bias data type.
* @tparam O Output data type.
* @param strideDims Array of stride dimensions.
* @param dilationDims Array of dilation dimensions.
* @param kernelDims Array of kernel dimensions.
* @param inputDims Array of input dimensions.
* @param outChannels Number of output channels.
* @param input_ const input Tensor.
* @param weights_ const weight Tensor.
* @param biases_ const bias Tensor.
* @param output_ Output Tensor.
*/
template <class I, class W, class B, class O>
void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
const std::array<DimSize_t, 2>& dilationDims,
const std::array<DimSize_t, 2>& kernelDims,
const std::array<DimSize_t, 4> &inputDims,
DimSize_t outChannels,
const void *input_,
const void *weights_,
const void *biases_,
void *output_)
{
// FIXME: missing convolution attributes as arguments
const I *input = static_cast<const I *>(input_);
const W *weights = static_cast<const W *>(weights_);
const B *biases = static_cast<const B *>(biases_);
O *output = static_cast<O *>(output_);
// output H size
const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1;
const std::size_t oxSize =
static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilated_kernel_x + strideDims[0]) /
static_cast<float>(strideDims[0])));
// output W size
const DimSize_t dilated_kernel_y = dilationDims[1]*(kernelDims[1] - 1) + 1;
const std::size_t oySize =
static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[3] - dilated_kernel_y + strideDims[1]) /
static_cast<float>(strideDims[1])));
// TODO: kernel computation
// output (batch, outCh, Xout, Yout)
// input (batch, inCh, Xin, Yin)
// weight (outCh, inCh, kernelX, kernelY)
const std::size_t outChannels_s = oxSize * oySize;
if (kernelDims[0] == 3 && kernelDims[1] == 3 && dilationDims[0] == 1 && dilationDims[1] == 1) {
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
// If bias = nullptr, set B(0)
B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
std::fill(output, output+outChannels_s, biasVal);
for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
const std::size_t wIndex = (inCh + outCh*inputDims[1]) * 9;
if (strideDims[0] == 1 && strideDims[1]==1) {
for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=inputDims[3]) {
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+1]+weights[wIndex+2]*input[iIndex+oy+2];
}
iIndex+=inputDims[3];
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy]+weights[wIndex+4]*input[iIndex+oy+1]+weights[wIndex+5]*input[iIndex+oy+2];
}
iIndex+=inputDims[3];
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy]+weights[wIndex+7]*input[iIndex+oy+1]+weights[wIndex+8]*input[iIndex+oy+2];
}
}
} else {
for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=(strideDims[0]-2)*inputDims[3]) {
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+0]*input[iIndex+oy*strideDims[1]]+weights[wIndex+1]*input[iIndex+oy*strideDims[1]+1]+weights[wIndex+2]*input[iIndex+oy*strideDims[1]+2];
}
iIndex+=inputDims[3];
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy*strideDims[1]]+weights[wIndex+4]*input[iIndex+oy*strideDims[1]+1]+weights[wIndex+5]*input[iIndex+oy*strideDims[1]+2];
}
iIndex+=inputDims[3];
for (std::size_t oy = 0; oy < oySize; ++oy) {
output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy*strideDims[1]]+weights[wIndex+7]*input[iIndex+oy*strideDims[1]+1]+weights[wIndex+8]*input[iIndex+oy*strideDims[1]+2];
}
}
}
}
output += outChannels_s;
}
}
} else if (dilated_kernel_x == 1 && dilated_kernel_y == 1) {
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
// If bias = nullptr, set B(0)
B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
std::fill(output, output+outChannels_s, biasVal);
for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
const std::size_t wIndex = (inCh + outCh*inputDims[1]);
if (strideDims[0] == 1 && strideDims[1] == 1) {
for (std::size_t oIndex = 0; oIndex < oxSize*oySize; ++oIndex, ++iIndex) {
output[oIndex] += weights[wIndex] * input[iIndex];
}
} else {
for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=inputDims[3]*strideDims[0]) {
for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) {
output[oIndex + oy] += weights[wIndex+0]*input[iIndex+iy];
}
}
}
}
output += outChannels_s;
}
}
} else {
for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
// If bias = nullptr, set B(0)
B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
std::fill(output, output+outChannels_s, biasVal);
for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
std::size_t iIndex_channel = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0] * kernelDims[1];
// loop over each output line
for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex_channel+=inputDims[3]*strideDims[0]) {
// loop over kernel rows of the associated input lines
for (std::size_t ky = 0, ix = 0; ky < kernelDims[0]; ++ky, ix += inputDims[3]*dilationDims[0]) {
// loop over the entire output line
for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) {
const std::size_t iIndex = iIndex_channel + ix + iy;
// loop over kernel columns associated with one output
for (std::size_t kx = 0; kx < kernelDims[1]; ++kx) {
output[oIndex + oy] += weights[wIndex+kernelDims[1]*ky+kx]*input[iIndex+kx*dilationDims[1]];
}
}
}
}
}
output += outChannels_s;
}
}
}
}
// Kernels registration to implementation entry point
REGISTRAR(ConvImpl2D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvImpl2D_cpu_forward_kernel<float, float, float, float>, nullptr});
REGISTRAR(ConvImpl2D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float16, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvImpl2D_cpu_forward_kernel<half_float::half, half_float::half, half_float::half, half_float::half>, nullptr});
REGISTRAR(ConvImpl2D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvImpl2D_cpu_forward_kernel<int32_t, int32_t, int32_t, int32_t>, nullptr});
REGISTRAR(ConvImpl2D_cpu,
{{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}},
{ProdConso::inPlaceModel, Aidge::ConvImpl2D_cpu_forward_kernel<double, double, double, double>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_CONVIMPL_KERNELS_H_ */
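Since the kernels are plain function templates over raw buffers, the 1D convolution can be driven directly; a sketch (the include path and the main wrapper are assumptions for illustration) with one batch, one channel, kernel 3, stride 1, no dilation:

#include <array>
#include "aidge/backend/cpu/operator/ConvImpl_kernels.hpp" // assumed install path

int main() {
    const float input[5]   = {1.f, 2.f, 3.f, 4.f, 5.f};  // (batch=1, inCh=1, X=5)
    const float weights[3] = {1.f, 0.f, -1.f};           // (outCh=1, inCh=1, kX=3)
    const float bias[1]    = {0.f};
    float output[3]        = {};                         // oxSize = (5 - 3 + 1) / 1 = 3
    Aidge::ConvImpl1D_cpu_forward_kernel<float, float, float, float>(
        {1},         // stride
        {1},         // dilation
        {3},         // kernel size
        {1, 1, 5},   // input dims (N, C, X)
        1,           // outChannels
        input, weights, bias, output);
    // output == {in[0]-in[2], in[1]-in[3], in[2]-in[4]} == {-2.f, -2.f, -2.f}
    return 0;
}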
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_DIVIMPL_H_
#define AIDGE_CPU_OPERATOR_DIVIMPL_H_
#include <memory>
#include <tuple>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Div.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
// Operator implementation entry point for the backend
using DivImpl_cpu = OperatorImpl_cpu<Div_Op,
void(const std::size_t, const std::size_t, const std::size_t, const void*, const void*,void*)>;
// Implementation entry point registration to Operator
REGISTRAR(Div_Op, "cpu", Aidge::DivImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_DIVIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_DIVIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_DIVIMPL_KERNELS_H_
#include <numeric> // std::accumulate
#include <cstddef> // std::size_t
#include <cstdint> // std::int32_t, std::int64_t
#include <functional> // std::multiplies
#include "aidge/utils/Registrar.hpp"
#include "aidge/backend/cpu/data/Broadcasting.hpp"
#include "aidge/backend/cpu/operator/DivImpl.hpp"
namespace Aidge {
// template <class I1, class I2, class O>
// void DivImpl_cpu_forward_kernel(const std::vector<std::size_t>& input1Dims,
// const std::vector<std::size_t>& input2Dims,
// const std::vector<std::size_t>& outputDims,
// const void* input1_,
// const void* input2_,
// void* output_) {
// const I1* input_1 = static_cast<const I1*>(input1_);
// const I2* input_2 = static_cast<const I2*>(input2_);
// O* output = static_cast<O*>(output_);
// const std::size_t totalElements = std::accumulate(outputDims.cbegin(), outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
// for (std::size_t oIndex = 0; oIndex < totalElements; ++oIndex)
// {
// std::vector<std::size_t> indexes = getMultiDimIndices(outputDims, oIndex);
// std::size_t idx1 = getFlattenedIndex(input1Dims, indexes);
// std::size_t idx2 = getFlattenedIndex(input2Dims, indexes);
// // TODO assert if input_2 is bad?
// output[oIndex] = input_1[idx1] / input_2[idx2];
// }
// }
template <class I1, class I2, class O>
constexpr void DivImpl_cpu_forward_kernel(const std::size_t input1size_,
const std::size_t input2size_,
const std::size_t output1size_,
const void* input1_,
const void* input2_,
void* output_) {
const I1* input_1 = static_cast<const I1*>(input1_);
const I2* input_2 = static_cast<const I2*>(input2_);
O* output = static_cast<O*>(output_);
// suppose values are contiguous in memory
for (std::size_t i = 0; i < output1size_; ++i) {
const std::size_t in1_id = (input1size_ != 1) ? i : 0;
const std::size_t in2_id = (input2size_ != 1) ? i : 0;
output[i] = static_cast<O>(input_1[in1_id] / input_2[in2_id]);
}
}
// Kernels registration to implementation entry point
REGISTRAR(DivImpl_cpu,
{DataType::Float32},
{ProdConso::inPlaceModel, Aidge::DivImpl_cpu_forward_kernel<float, float, float>, nullptr});
REGISTRAR(DivImpl_cpu,
{DataType::Float64},
{ProdConso::inPlaceModel, Aidge::DivImpl_cpu_forward_kernel<double, double, double>, nullptr});
REGISTRAR(DivImpl_cpu,
{DataType::Int32},
{ProdConso::inPlaceModel, Aidge::DivImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_DIVIMPL_KERNELS_H_ */
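Because the registered kernel is an ordinary function template over raw pointers, its contiguous-broadcast behavior is easy to check in isolation (fragment; assumes the header above is included):

// input_2 has size 1, so its single value divides every element of input_1
const float a[4] = {2.f, 4.f, 6.f, 8.f};
const float b[1] = {2.f};
float out[4] = {};
Aidge::DivImpl_cpu_forward_kernel<float, float, float>(4, 1, 4, a, b, out);
// out == {1.f, 2.f, 3.f, 4.f}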
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_ERFIMPL_H_
#define AIDGE_CPU_OPERATOR_ERFIMPL_H_
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Erf.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include <memory>
#include <vector>
namespace Aidge {
// Operator implementation entry point for the backend
using ErfImpl_cpu = OperatorImpl_cpu<Erf_Op,
void(const std::size_t, const void*, void*)>;
// Implementation entry point registration to Operator
REGISTRAR(Erf_Op, "cpu", Aidge::ErfImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_ERFIMPL_H_ */
/********************************************************************************
 * Copyright (c) 2023 CEA-List
 *
 * This program and the accompanying materials are made available under the
 * terms of the Eclipse Public License 2.0 which is available at
 * http://www.eclipse.org/legal/epl-2.0.
 *
 * SPDX-License-Identifier: EPL-2.0
 *
 ********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_ERFIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_ERFIMPL_KERNELS_H_
#include <cmath>
#include "aidge/utils/Registrar.hpp"
#include "aidge/backend/cpu/operator/ErfImpl.hpp"
namespace Aidge {
template <class I, class O>
void ErfImpl_cpu_forward_kernel(std::size_t inputLenght,
                                const void* input_,
                                void* output_) {
    const I* input = static_cast<const I*>(input_);
    O* output = static_cast<O*>(output_);
    for (std::size_t i = 0; i < inputLenght; ++i) {
        output[i] = std::erf(input[i]);
    }
}
// Kernels registration to implementation entry point
REGISTRAR(ErfImpl_cpu,
    {DataType::Float32},
    {ProdConso::inPlaceModel, Aidge::ErfImpl_cpu_forward_kernel<float, float>, nullptr});
REGISTRAR(ErfImpl_cpu,
    {DataType::Float64},
    {ProdConso::inPlaceModel, Aidge::ErfImpl_cpu_forward_kernel<double, double>, nullptr});
REGISTRAR(ErfImpl_cpu,
    {DataType::Int32},
    {ProdConso::inPlaceModel, Aidge::ErfImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_ERFIMPL_KERNELS_H_ */
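A quick sketch exercising the element-wise kernel above on a small buffer (fragment; assumes the header is included):

const float in[3] = {0.f, 1.f, -1.f};
float out[3] = {};
Aidge::ErfImpl_cpu_forward_kernel<float, float>(3, in, out);
// out is approximately {0.f, 0.8427f, -0.8427f}, i.e. std::erf applied element-wise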
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_EXPANDIMPL_H_
#define AIDGE_CPU_OPERATOR_EXPANDIMPL_H_
#include <memory>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Expand.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
// Operator implementation entry point for the backend
using ExpandImpl_cpu = OperatorImpl_cpu<Expand_Op,
void(const std::shared_ptr<Tensor> &,
const std::shared_ptr<Tensor> &,
void *,
const std::vector<DimSize_t> &)>;
// Implementation entry point registration to Operator
REGISTRAR(Expand_Op, "cpu", Aidge::ExpandImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_EXPANDIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_EXPANDIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_EXPANDIMPL_KERNELS_H_
#include "aidge/backend/cpu/operator/ExpandImpl.hpp"
#include "aidge/utils/Registrar.hpp"
#include <aidge/data/Data.hpp>
#include <aidge/data/Tensor.hpp>
#include <aidge/data/half.hpp>
#include <aidge/scheduler/ProdConso.hpp>
#include <aidge/utils/Types.h>
#include <cmath>
#include <cstdint> // std::int32_t, std::int64_t
#include <memory>
#include <numeric>
namespace {
// suppose values are contiguous in memory
template <class IO>
void expandContiguousArray(const std::size_t inputStackSize,
const std::size_t outputStackSize,
const IO *input,
IO *output) {
for (std::size_t i = 0; i < outputStackSize; ++i) {
output[i] = (inputStackSize == 1) ? input[0] : input[i];
}
return;
}
} // namespace
namespace Aidge {
template <class IO>
void ExpandImpl_cpu_forward_kernel(
const std::shared_ptr<Tensor> &inData,
const std::shared_ptr<Tensor> &_inExpandShape,
void *_output,
const std::vector<DimSize_t> &outputDims) {
// Retrieve the expand-shape values and the input dimensions,
// as both may need to be modified below.
IO *output = static_cast<IO *>(_output);
std::vector<DimSize_t> inExpandShape(_inExpandShape->size());
for (DimSize_t i = 0; i < _inExpandShape->size(); ++i) {
inExpandShape[i] = _inExpandShape->get<std::int64_t>(i);
}
std::vector<DimSize_t> inDataDims = inData->dims();
// Example with 2 tensors
// [5,2,1,7] & [2,6,7]
// 1. Same number of dimensions but adding 1s to the left of the "smallest"
// tensor -> [5,2,1,7] & [1,2,6,7]
// 2. Find the highest equal dimension -> 3
// Exception: if the first diverging dimension is the last one, then ->
// 4 (dims.size())
// 3. Compute the highest number of contiguous data -> 7
// 4. Compute stride and offset step for the broadcast mechanism
// 5. Call a simple kernel
// ## Compute compatible input dimensions
// special case for equal dimensions, the kernel is called with the entire
// arrays at once
if (inDataDims == inExpandShape) {
const std::size_t input0ContiguousSize =
std::accumulate(inDataDims.cbegin(),
inDataDims.cend(),
static_cast<std::size_t>(1),
std::multiplies<std::size_t>());
for (std::size_t i = 0; i < input0ContiguousSize; ++i) {
output[i] = inData->get<IO>(i);
}
return;
}
// set dimensions to be of equal size by filling the smallest one with
// ones.
if (inDataDims.size() > inExpandShape.size()) {
inExpandShape.insert(inExpandShape.cbegin(),
inDataDims.size() - inExpandShape.size(),
static_cast<DimSize_t>(1));
} else if (_inExpandShape->size() > inDataDims.size()) {
inDataDims.insert(inDataDims.cbegin(),
inExpandShape.size() - inDataDims.size(),
static_cast<DimSize_t>(1));
}
const std::size_t nbDims = inDataDims.size();
// Find the highest equal dimension
// std::size_t contiguousIdx = nbDims - 1;
std::size_t contiguousIdx = nbDims;
while (contiguousIdx-- > 0) {
// for (; contiguousIdx+1 > 0; --contiguousIdx) {
if (inDataDims[contiguousIdx] != inExpandShape[contiguousIdx]) {
break;
}
}
if (contiguousIdx == (nbDims - 1)) {
// last dimensions of one of the input Tensors are of size 1
const std::vector<std::size_t> &dims =
(inDataDims[contiguousIdx] == 1) ? inDataDims : inExpandShape;
while ((contiguousIdx + 1 > 0) && (dims[contiguousIdx] == 1)) {
--contiguousIdx;
}
}
++contiguousIdx;
// Compute the highest number of contiguous data for each Tensor
const std::size_t inputDataContiguousSize =
std::accumulate(inDataDims.cbegin() + contiguousIdx,
inDataDims.cend(),
static_cast<std::size_t>(1),
std::multiplies<std::size_t>());
const std::size_t outputContiguousSize =
std::accumulate(outputDims.cbegin() + contiguousIdx,
outputDims.cend(),
static_cast<std::size_t>(1),
std::multiplies<std::size_t>());
// initialize strides to iterate through data because of broadcasting
std::unique_ptr<std::int32_t[]> stridePostIn =
std::make_unique<std::int32_t[]>(contiguousIdx);
std::unique_ptr<std::int32_t[]> strideStepIn =
std::make_unique<std::int32_t[]>(contiguousIdx);
if (contiguousIdx > 0) {
stridePostIn[contiguousIdx - 1] = 1;
for (std::size_t i = contiguousIdx - 2;
i != static_cast<std::size_t>(-1);
--i) {
stridePostIn[i] = stridePostIn[i + 1] *
static_cast<std::int32_t>(inDataDims[i + 1]);
}
for (std::size_t i = 0; i != contiguousIdx; ++i) {
strideStepIn[i] = (inDataDims[i] == 1) ? 1 - stridePostIn[i] : 1;
}
}
// variables for arrays offsets
std::size_t offsetInData = 0;
std::size_t offsetOut = 0;
std::size_t dim = contiguousIdx - 1;
const std::size_t nbStacks =
std::accumulate(outputDims.cbegin(),
outputDims.cbegin() + contiguousIdx,
static_cast<std::size_t>(1),
std::multiplies<std::size_t>());
for (std::size_t stack = 0; stack < nbStacks;) {
expandContiguousArray<IO>(
inputDataContiguousSize,
outputContiguousSize,
&static_cast<const IO *>(
inData->getImpl()
->rawPtr())[offsetInData * inputDataContiguousSize],
&output[offsetOut * outputContiguousSize]);
if (++stack < nbStacks) {
std::size_t tmpStack = stack;
while (tmpStack % outputDims[dim] == 0) {
tmpStack /= outputDims[dim];
dim--;
}
offsetInData += strideStepIn[dim];
++offsetOut;
dim = contiguousIdx - 1;
}
}
}
REGISTRAR(ExpandImpl_cpu,
{{DataType::Int16, DataType::Int64}, {DataType::Int16}},
{ProdConso::inPlaceModel,
Aidge::ExpandImpl_cpu_forward_kernel<std::int16_t>,
nullptr});
REGISTRAR(ExpandImpl_cpu,
{{DataType::Int32, DataType::Int64}, {DataType::Int32}},
{ProdConso::inPlaceModel,
Aidge::ExpandImpl_cpu_forward_kernel<std::int32_t>,
nullptr});
REGISTRAR(ExpandImpl_cpu,
{{DataType::Int64, DataType::Int64}, {DataType::Int64}},
{ProdConso::inPlaceModel,
Aidge::ExpandImpl_cpu_forward_kernel<std::int64_t>,
nullptr});
REGISTRAR(ExpandImpl_cpu,
{{DataType::Float16, DataType::Int64}, {DataType::Float16}},
{ProdConso::inPlaceModel,
Aidge::ExpandImpl_cpu_forward_kernel<half_float::half>,
nullptr});
REGISTRAR(ExpandImpl_cpu,
{{DataType::Float32, DataType::Int64}, {DataType::Float32}},
{ProdConso::inPlaceModel,
Aidge::ExpandImpl_cpu_forward_kernel<float>,
nullptr});
REGISTRAR(ExpandImpl_cpu,
{{DataType::Float64, DataType::Int64}, {DataType::Float64}},
{ProdConso::inPlaceModel,
Aidge::ExpandImpl_cpu_forward_kernel<double>,
nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_EXPANDIMPL_KERNELS_H_ */
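To make the index bookkeeping concrete, a small sketch of the shape scan, using the [5,2,1,7] / [2,6,7] example from the comments above (this mirrors only the first scan in the kernel; the trailing-ones special case is omitted):

#include <cstddef>
#include <vector>

// Find the first position, from the right, where the left-padded shapes
// disagree; everything after it forms one contiguous run to copy.
std::size_t contiguousIndex(const std::vector<std::size_t>& dataDims,
                            const std::vector<std::size_t>& expandDims) {
    std::size_t idx = dataDims.size();
    while (idx-- > 0) {
        if (dataDims[idx] != expandDims[idx]) {
            break;
        }
    }
    return idx + 1;
}

// contiguousIndex({5,2,1,7}, {1,2,6,7}) == 3: the kernel copies contiguous
// runs of 7 elements, re-reading the input stack where a size-1 axis is broadcast.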
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_FCIMPL_H_
#define AIDGE_CPU_OPERATOR_FCIMPL_H_
#include <array>
#include <memory>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/FC.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
// Operator implementation entry point for the backend
using FCImpl_cpu = OperatorImpl_cpu<FC_Op,
void(const DimSize_t,
const DimSize_t,
const DimSize_t,
const void *,
const void *,
const void *,
void *),
void(const DimSize_t,
const DimSize_t,
const DimSize_t,
const void *,
const void *,
const void *,
void *,
void *,
void *)>;
// Implementation entry point registration to Operator
REGISTRAR(FC_Op, "cpu", Aidge::FCImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_FCIMPL_H_ */
/******************************************************************************** /********************************************************************************
* Copyright (c) 2023 CEA-List * Copyright (c) 2023 CEA-List
* *
* This program and the accompanying materials are made available under the * This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at * terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0. * http://www.eclipse.org/legal/epl-2.0.
* *
* SPDX-License-Identifier: EPL-2.0 * SPDX-License-Identifier: EPL-2.0
* *
********************************************************************************/ ********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_FCIMPL_FORWARD_KERNEL_H_ #ifndef AIDGE_CPU_OPERATOR_FCIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_FCIMPL_FORWARD_KERNEL_H_ #define AIDGE_CPU_OPERATOR_FCIMPL_KERNELS_H_
#include "aidge/utils/Registrar.hpp" #include <algorithm>
#include <algorithm>
#include "aidge/backend/cpu/operator/FCImpl.hpp"
#include "aidge/operator/FCImpl.hpp" #include "aidge/utils/Registrar.hpp"
namespace Aidge { namespace Aidge {
// template <class I, class W, class B, class O> // template <class I, class W, class B, class O>
// void FCImpl_cpu_forward_kernel(const FC_Op::Parameters& params, const std::array<DimSize_t, 4>& dims, // void FCImpl_cpu_forward_kernel(const FC_Op::Attrs& attrs, const std::array<DimSize_t, 4>& dims,
// const void* input_, const void* weights_, const void* biases_, void* output_) { // const void* input_, const void* weights_, const void* biases_, void* output_) {
// // FIXME: missing FC parameters as arguments // // FIXME: missing FC attributes as arguments
// const I* input = static_cast<const I*>(input_); // const I* input = static_cast<const I*>(input_);
// const W* weights = static_cast<const W*>(weights_); // const W* weights = static_cast<const W*>(weights_);
// const B* biases = static_cast<const B*>(biases_); // const B* biases = static_cast<const B*>(biases_);
// O* output = static_cast<O*>(output_); // O* output = static_cast<O*>(output_);
// for (std::size_t outIdx = 0; outIdx < std::get<0>(params); ++outIdx) { // for (std::size_t outIdx = 0; outIdx < outputFeatureSize; ++outIdx) {
// std::size_t oIndex = outIdx * dims[3]; // std::size_t oIndex = outIdx * dims[3];
// const B bias = std::get<1>(params) ? B(0) : biases[outIdx]; // const B bias = std::get<0>(attrs) ? B(0) : biases[outIdx];
// for (std::size_t batch = 0; batch < dims[3]; ++batch) { // for (std::size_t batch = 0; batch < dims[3]; ++batch) {
// output[oIndex + batch] = bias; // output[oIndex + batch] = bias;
// } // }
// } // }
// for (std::size_t ix = 0; ix < dims[0]; ++ix) { // for (std::size_t ix = 0; ix < dims[0]; ++ix) {
// for (std::size_t iy = 0; iy < dims[1]; ++iy) { // for (std::size_t iy = 0; iy < dims[1]; ++iy) {
// for (std::size_t inCh = 0; inCh < dims[2]; ++inCh) { // for (std::size_t inCh = 0; inCh < dims[2]; ++inCh) {
// const std::size_t iIndex = dims[3] * (inCh + dims[2] * (iy + dims[1] * ix)); // const std::size_t iIndex = dims[3] * (inCh + dims[2] * (iy + dims[1] * ix));
// for (std::size_t outCh = 0; outCh < std::get<0>(params); ++outCh) { // for (std::size_t outCh = 0; outCh < outputFeatureSize; ++outCh) {
// const std::size_t oIndex = dims[3] * outCh; // const std::size_t oIndex = dims[3] * outCh;
// const std::size_t wIndex = (inCh + dims[2] * (iy + dims[1] * ix)) * std::get<0>(params) + // const std::size_t wIndex = (inCh + dims[2] * (iy + dims[1] * ix)) * outputFeatureSize +
// outCh; // (iIndex*std::get<0>(params) + oIndex)/dims[3]; // outCh; // (iIndex*outputFeatureSize + oIndex)/dims[3];
// for (std::size_t batch = 0; batch < dims[3]; ++batch) { // for (std::size_t batch = 0; batch < dims[3]; ++batch) {
// output[oIndex + batch] += weights[wIndex] * input[iIndex + batch]; // output[oIndex + batch] += weights[wIndex] * input[iIndex + batch];
// } // }
// } // }
// } // }
// } // }
// } // }
// } // }
// template <class I, class W, class B, class O> // template <class I, class W, class B, class O>
// void FCImpl_cpu_forward_kernel(const FC_Op::Parameters& params, const std::array<DimSize_t, 2>& dims, // void FCImpl_cpu_forward_kernel(const FC_Op::Attrs& attrs, const std::array<DimSize_t, 2>& dims,
// const void* input_, const void* weights_, const void* biases_, void* output_) { // const void* input_, const void* weights_, const void* biases_, void* output_) {
// // FIXME: missing FC parameters as arguments // // FIXME: missing FC attributes as arguments
// const I* input = static_cast<const I*>(input_); // const I* input = static_cast<const I*>(input_);
// const W* weights = static_cast<const W*>(weights_); // const W* weights = static_cast<const W*>(weights_);
// const B* biases = static_cast<const B*>(biases_); // const B* biases = static_cast<const B*>(biases_);
// O* output = static_cast<O*>(output_); // O* output = static_cast<O*>(output_);
// // let's have I.dims() = [N, C, H, W] instead of [H, W, C, N] // // let's have I.dims() = [N, C, H, W] instead of [H, W, C, N]
// for (std::size_t outIdx = 0; outIdx < std::get<0>(params); ++outIdx) { // for (std::size_t outIdx = 0; outIdx < outputFeatureSize; ++outIdx) {
// std::size_t oIndex = outIdx * dims[0]; // std::size_t oIndex = outIdx * dims[0];
// const B bias = std::get<1>(params) ? B(0) : biases[outIdx]; // const B bias = std::get<0>(attrs) ? B(0) : biases[outIdx];
// for (std::size_t batch = 0; batch < dims[0]; ++batch) { // for (std::size_t batch = 0; batch < dims[0]; ++batch) {
// output[oIndex + batch] = bias;
// }
// }
// for (std::size_t batch = 0; batch < dims[0]; ++batch) {
//     const std::size_t oIndex = dims[1] * batch;
//     for (std::size_t i = 0; i < dims[1]; ++i) {
//         for (std::size_t outCh = 0; outCh < outputFeatureSize; ++outCh) {
//             std::size_t wIndex = i * outputFeatureSize + outCh; // (iIndex*outputFeatureSize + oIndex)/dims[3];
//             output[oIndex + outCh] += weights[wIndex] * input[i + batch];
//         }
//     }
// }

template <class I, class W, class B, class O>
void FCImpl_cpu_forward_kernel(const DimSize_t batchSize,
                               const DimSize_t inputFeatureSize,
                               const DimSize_t outputFeatureSize,
                               const void* input_,
                               const void* weights_,
                               const void* biases_,
                               void* output_) {
    // FIXME: missing FC attributes as arguments
    const I* input = static_cast<const I*>(input_);
    const W* weights = static_cast<const W*>(weights_);
    const B* biases = static_cast<const B*>(biases_);
    O* output = static_cast<O*>(output_);

    if (biases == nullptr) {
        std::fill(output, output+(batchSize*outputFeatureSize), B(0));
    }
    else {
        for (std::size_t batch = 0; batch < batchSize; ++batch) {
            std::copy(biases, biases+outputFeatureSize, output+(batch*outputFeatureSize));
        }
    }

    for (std::size_t batch = 0; batch < batchSize; ++batch) {
        for (std::size_t out = 0; out < outputFeatureSize; ++out) {
            output[out + batch*outputFeatureSize] = std::inner_product(input + batch*inputFeatureSize,
                                                        input + (batch + 1)*inputFeatureSize,
                                                        weights + out*inputFeatureSize,
                                                        output[out + batch*outputFeatureSize]);
        }
    }
}

template <class I, class O, class W, class B>
void FCImpl_cpu_backward_kernel(const DimSize_t batchSize,
                                const DimSize_t inputFeatureSize,
                                const DimSize_t outputFeatureSize,
                                const void* input_,
                                const void* originalInput_,
                                const void* weight_,
                                void* output_,
                                void* weightGrad_,
                                void* biasesGrad_)
{
// FIXME: missing FC attributes as arguments
const I* input = static_cast<const I*>(input_);
const I* originalInput = static_cast<const I*>(originalInput_);
const W* weight = static_cast<const W*>(weight_);
O* output = static_cast<O*>(output_);
W* weightGrad = static_cast<W*>(weightGrad_);
B* biasesGrad = static_cast<B*>(biasesGrad_);
    // bias grad
    if (biasesGrad != nullptr) { // the bias gradient is only computed when a bias is present
        for (std::size_t o = 0; o < outputFeatureSize; ++o) { // nb outputs
            B sum{0};
            for (std::size_t b = 0; b < batchSize; ++b) {
                sum += input[b*outputFeatureSize + o];
            }
            biasesGrad[o] = sum;
        }
    }
// weight grad
for (std::size_t o = 0; o < outputFeatureSize; ++o) {
for (std::size_t c = 0; c < inputFeatureSize; ++c) {
W sum{0};
for (std::size_t b = 0; b < batchSize; ++b) {
sum += originalInput[b*inputFeatureSize + c]*input[b*outputFeatureSize + o];
}
weightGrad[o*inputFeatureSize + c] = sum;
}
}
// input grad
for (std::size_t b = 0; b < batchSize; ++b) {
for (std::size_t c = 0; c < inputFeatureSize; ++c) {
O sum{0};
for (std::size_t o = 0; o < outputFeatureSize; ++o) {
sum += weight[o*inputFeatureSize + c] * input[b*outputFeatureSize + o];
}
output[b*inputFeatureSize + c] = sum;
}
}
}
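    // Reading aid, in matrix terms: with X = [batchSize, inputFeatureSize],
    // W = [outputFeatureSize, inputFeatureSize] and the incoming gradient
    // dY = [batchSize, outputFeatureSize] (passed here as `input`), the loops
    // above compute
    //   dB[o]   = sum_b dY[b,o]
    //   dW[o,c] = sum_b X[b,c] * dY[b,o]   (dW = dY^T X)
    //   dX[b,c] = sum_o W[o,c] * dY[b,o]   (dX = dY W), written to `output`.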
// Kernels registration to implementation entry point
REGISTRAR(FCImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float32}},
{ProdConso::defaultModel, Aidge::FCImpl_cpu_forward_kernel<float, float, float, float>, Aidge::FCImpl_cpu_backward_kernel<float, float, float, float>});
REGISTRAR(FCImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}},
{ProdConso::defaultModel, Aidge::FCImpl_cpu_forward_kernel<double, double, double, double>, Aidge::FCImpl_cpu_backward_kernel<double, double, double, double>});
REGISTRAR(FCImpl_cpu,
{ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}},
{ProdConso::defaultModel, Aidge::FCImpl_cpu_forward_kernel<int32_t, int32_t, int32_t, int32_t>, Aidge::FCImpl_cpu_backward_kernel<int32_t, int32_t, int32_t, int32_t>});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_FCIMPL_KERNELS_H_ */
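To make the calling convention above concrete, here is a minimal standalone sketch of driving the forward kernel directly, outside the Aidge graph; the include path and the direct kernel call are illustrative assumptions, and the expected output follows from y = x·Wᵀ + b.
#include <iostream>
#include <vector>

#include "aidge/backend/cpu/operator/FCImpl_kernels.hpp"  // assumed install path

int main() {
    const Aidge::DimSize_t batchSize = 1, inF = 2, outF = 3;
    const std::vector<float> x = {1.f, 2.f};              // input   [batchSize, inF]
    const std::vector<float> w = {1.f, 0.f,               // weights [outF, inF], row-major
                                  0.f, 1.f,
                                  1.f, 1.f};
    const std::vector<float> b = {0.5f, 0.5f, 0.5f};      // biases  [outF]
    std::vector<float> y(batchSize * outF);
    Aidge::FCImpl_cpu_forward_kernel<float, float, float, float>(
        batchSize, inF, outF, x.data(), w.data(), b.data(), y.data());
    for (float v : y) std::cout << v << ' ';              // prints: 1.5 2.5 3.5
    return 0;
}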
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_FOLDIMPL_H_
#define AIDGE_CPU_OPERATOR_FOLDIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/Fold.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
namespace Aidge {
// Operator implementation entry point for the backend
using Fold2D_Op = Fold_Op<2>;
using FoldImpl2D_cpu = OperatorImpl_cpu<Fold_Op<2>,
void(const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 2>&,
const std::array<DimSize_t, 2>&,
const std::vector<DimSize_t> &,
const void *,
void *)>;
// Implementation entry point registration to Operator
REGISTRAR(Fold2D_Op, "cpu", Aidge::FoldImpl2D_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_FOLDIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_FOLDIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_FOLDIMPL_KERNELS_H_
#include "aidge/utils/Registrar.hpp"
#include "aidge/backend/cpu/operator/FoldImpl.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include <cmath>
#include <array>
#include <algorithm>
namespace Aidge {
template <class I, class O>
void FoldImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& outputDims,
const std::array<DimSize_t, 2>& strideDims,
const std::array<DimSize_t, 2>& dilationDims,
const std::array<DimSize_t, 2>& kernelDims,
const std::vector<DimSize_t> &dims,
const void *input_, void *output_)
{
const I *input = static_cast<const I *>(input_);
O *output = static_cast<O *>(output_);
const DimSize_t inHeight = outputDims[0];
const DimSize_t inWidth = outputDims[1];
const DimSize_t kernelExtentHeight = dilationDims[0] *
(kernelDims[0] - 1) + 1;
const DimSize_t outHeight = 1 + static_cast<DimSize_t>(
floor(static_cast<float>(inHeight - kernelExtentHeight) /
static_cast<float>(strideDims[0])));
const DimSize_t kernelExtentWidth = dilationDims[1] *
(kernelDims[1] - 1) + 1;
const DimSize_t outWidth = 1 + static_cast<DimSize_t>(
floor(static_cast<float>(inWidth - kernelExtentWidth) /
static_cast<float>(strideDims[1])));
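    // Reading aid: outputDims is the folded spatial size of the *output*
    // tensor, while outHeight/outWidth count the sliding-block positions,
    // i.e. the column length of the *input*. Example: inHeight = 5,
    // kernelDims[0] = 3, dilationDims[0] = 2, strideDims[0] = 1 gives
    // kernelExtentHeight = 2*(3-1)+1 = 5 and outHeight = 1 + floor((5-5)/1) = 1.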
const DimSize_t outChannels = dims[dims.size() - 2];
const DimSize_t inChannels = outChannels / kernelDims[0] / kernelDims[1];
std::fill_n(output, dims[0] * outHeight * outWidth * outChannels, O(0));
for (DimSize_t n = 0; n < dims[0]; ++n) {
for (DimSize_t outC = 0; outC < outChannels; ++outC) {
const auto inOffsetW = outC % kernelDims[1];
const auto inOffsetH = (outC / kernelDims[1]) % kernelDims[0];
const auto inC = outC / kernelDims[0] / kernelDims[1];
for (DimSize_t outH = 0; outH < outHeight; ++outH) {
const auto inH = outH * strideDims[0] + inOffsetH * dilationDims[0];
for (DimSize_t outW = 0; outW < outWidth; ++outW) {
const auto inW = outW * strideDims[1] + inOffsetW * dilationDims[1];
output[((n * inChannels + inC) * inHeight + inH) * inWidth + inW] +=
input[((n * outChannels + outC) * outHeight + outH) * outWidth + outW];
}
}
}
}
}
// Kernels registration to implementation entry point
REGISTRAR(FoldImpl2D_cpu,
{DataType::Float32},
{ProdConso::defaultModel, Aidge::FoldImpl2D_cpu_forward_kernel<float, float>, nullptr});
REGISTRAR(FoldImpl2D_cpu,
{DataType::Float64},
{ProdConso::defaultModel, Aidge::FoldImpl2D_cpu_forward_kernel<double, double>, nullptr});
REGISTRAR(FoldImpl2D_cpu,
{DataType::Int32},
{ProdConso::defaultModel, Aidge::FoldImpl2D_cpu_forward_kernel<int32_t, int32_t>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_FOLDIMPL_KERNELS_H_ */
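As a sanity check of the indexing above, a minimal sketch (include path assumed): with a 1x1 kernel and unit stride/dilation, Fold degenerates to a plain reshape of the [N, C, L] columns back to [N, C, H, W].
#include <array>
#include <iostream>
#include <vector>

#include "aidge/backend/cpu/operator/FoldImpl_kernels.hpp"  // assumed install path

int main() {
    const std::array<Aidge::DimSize_t, 2> outDims{2, 2}, stride{1, 1}, dilation{1, 1}, kernel{1, 1};
    const std::vector<Aidge::DimSize_t> inDims = {1, 1, 4};  // N, C*kH*kW, L = outH*outW
    const std::vector<float> in = {1.f, 2.f, 3.f, 4.f};
    std::vector<float> out(4);
    Aidge::FoldImpl2D_cpu_forward_kernel<float, float>(
        outDims, stride, dilation, kernel, inDims, in.data(), out.data());
    for (float v : out) std::cout << v << ' ';               // prints: 1 2 3 4
    return 0;
}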
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_H_
#define AIDGE_CPU_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_H_
#include <memory>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/GlobalAveragePooling.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
// Operator implementation entry point for the backend
using GlobalAveragePoolingImpl_cpu = OperatorImpl_cpu<GlobalAveragePooling_Op,
void(const std::vector<DimSize_t> &, const void *, void *)>;
// Implementation entry point registration to Operator
REGISTRAR(GlobalAveragePooling_Op, "cpu", Aidge::GlobalAveragePoolingImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_KERNELS_H_
#include <cstddef>
#include <functional> // std::multiplies
#include <numeric> // std::accumulate
#include <vector>
#include "aidge/backend/cpu/operator/GlobalAveragePoolingImpl.hpp"
#include "aidge/data/Data.hpp"
#include "aidge/utils/ErrorHandling.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
namespace Aidge {
template <typename T>
typename std::enable_if<std::is_floating_point<T>::value, T>::type
stableMean(const T* vec, size_t size) {
T mean = 0;
for (size_t i = 0; i < size; ++i) {
mean = std::fma<T>(vec[i] - mean, 1.0f / (i + 1), mean);
}
return mean;
}
// Specialization for integer types: perform the mean computation in double
template <typename T>
typename std::enable_if<!std::is_floating_point<T>::value, T>::type
stableMean(const T* vec, size_t size) {
  double mean = 0;
  for (size_t i = 0; i < size; ++i) {
    mean = std::fma<double>(vec[i] - mean, 1.0 / (i + 1), mean);
  }
  return mean;
}
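// Reading aid: both overloads implement the running (Welford-style) mean,
//   mean_{i+1} = mean_i + (vec[i] - mean_i) / (i + 1),
// with std::fma so the multiply-add rounds only once. For {1, 2, 3, 4} the
// running value goes 1 -> 1.5 -> 2 -> 2.5, matching sum/size without ever
// forming a large intermediate sum that could overflow or lose precision.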
template <typename T>
typename std::enable_if<std::is_floating_point<T>::value, T>::type
castFromFloat(T value) {
return value;
}
template <typename T>
typename std::enable_if<!std::is_floating_point<T>::value, T>::type
castFromFloat(double value) {
return static_cast<T>(std::nearbyint(value));
}
template <class I, class O>
void GlobalAveragePoolingImpl_cpu_forward_kernel(
const std::vector<DimSize_t> &dims, const void *input_, void *output_) {
// error checking
  AIDGE_ASSERT(dims.size() >= 3, "GlobalAveragePooling needs an input with at least 3 dimensions, got {}",
               dims.size());
// computation
const I *input = static_cast<const I *>(input_);
O *output = static_cast<O *>(output_);
DimSize_t nb_elems = std::accumulate(dims.begin(), dims.end(), std::size_t(1),
std::multiplies<std::size_t>());
const DimSize_t in_batch_nb_elems{nb_elems / dims[0]};
const DimSize_t in_channel_nb_elems{in_batch_nb_elems / dims[1]};
const DimSize_t out_batch_nb_elems{dims[1]};
// parse channel by channel and fill each output with the average of the
// values in the channel
for (DimSize_t batch = 0; batch < dims[0]; ++batch) {
for (DimSize_t channel = 0; channel < dims[1]; ++channel) {
const I *filter_start = std::next(
input, (batch * in_batch_nb_elems) + (channel * in_channel_nb_elems));
output[batch * out_batch_nb_elems + channel] = castFromFloat<O>(stableMean<I>(filter_start, in_channel_nb_elems));
}
}
}
// Kernels registration to implementation entry point
REGISTRAR(GlobalAveragePoolingImpl_cpu,
{DataType::Float32},
{ProdConso::defaultModel, Aidge::GlobalAveragePoolingImpl_cpu_forward_kernel<float, float>, nullptr});
REGISTRAR(GlobalAveragePoolingImpl_cpu,
{DataType::Float64},
{ProdConso::defaultModel, Aidge::GlobalAveragePoolingImpl_cpu_forward_kernel<double, double>, nullptr});
REGISTRAR(GlobalAveragePoolingImpl_cpu,
{DataType::Int32},
{ProdConso::defaultModel, Aidge::GlobalAveragePoolingImpl_cpu_forward_kernel<int32_t, int32_t>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_KERNELS_H_ */
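A minimal sketch of calling this kernel standalone (the include path is an assumption): each channel of a [N=1, C=2, L=2] input is reduced to its mean.
#include <iostream>
#include <vector>

#include "aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp"  // assumed install path

int main() {
    const std::vector<Aidge::DimSize_t> dims = {1, 2, 2};  // N, C, spatial
    const std::vector<float> in = {1.f, 3.f, 10.f, 20.f};
    std::vector<float> out(dims[0] * dims[1]);
    Aidge::GlobalAveragePoolingImpl_cpu_forward_kernel<float, float>(dims, in.data(), out.data());
    std::cout << out[0] << ' ' << out[1];                  // prints: 2 15
    return 0;
}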
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_H_
#define AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_H_
#include <array>
#include <memory>
#include <tuple>
#include <vector>
#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
#include "aidge/operator/GridSample.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
#include "aidge/backend/cpu/data/GetCPUPtr.h"
namespace Aidge {
// Operator implementation entry point for the backend
using GridSampleImpl_cpu = OperatorImpl_cpu<GridSample_Op,
void(const GridSample_Op&,
const std::shared_ptr<Tensor>&,
const std::shared_ptr<Tensor>&,
const std::shared_ptr<Tensor>&)>;
// Implementation entry point registration to Operator
REGISTRAR(GridSample_Op, "cpu", Aidge::GridSampleImpl_cpu::create);
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_H_ */
/********************************************************************************
* Copyright (c) 2023 CEA-List
*
* This program and the accompanying materials are made available under the
* terms of the Eclipse Public License 2.0 which is available at
* http://www.eclipse.org/legal/epl-2.0.
*
* SPDX-License-Identifier: EPL-2.0
*
********************************************************************************/
#ifndef AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_KERNELS_H_
#define AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_KERNELS_H_
#include <algorithm> // std::max, std::min
#include <cmath>     // std::fabs, std::truncf, std::nearbyint
#include <cstddef> // std::size_t
#include <cstdint> // std::int64_t
#include "aidge/backend/cpu/data/GetCPUPtr.h"
#include "aidge/backend/cpu/operator/GridSampleImpl.hpp"
#include "aidge/data/half.hpp"
#include "aidge/utils/Registrar.hpp"
#include "aidge/utils/Types.h"
static bool in_bound(float coord, float lower_bound, float upper_bound) noexcept {
return (coord > lower_bound) && (coord < upper_bound);
}
static float unnormalized_coord(float coord, float new_lower_bound, float new_upper_bound) noexcept {
return (coord + 1) / 2 * (new_upper_bound - new_lower_bound) + new_lower_bound;
}
// unused
// static float normalized_coord(float coord, float prev_lower_bound, float prev_upper_bound) noexcept {
// return (coord + prev_lower_bound) / (prev_upper_bound-prev_lower_bound) * 2 - 1;
// }
static float unnormalize_grid_sample_coord(float coord, std::size_t size, bool align_corners) noexcept {
return align_corners ? unnormalized_coord(coord, 0.0f, static_cast<float>(size) - 1.0f)
: unnormalized_coord(coord, -0.5f, static_cast<float>(size) - 0.5f);
}
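// Example: for size = 5, coord = 0.0f maps to 2.0f under both conventions;
// coord = 1.0f maps to 4.0f with align_corners (range [0, 4], corners on the
// first/last pixel centers) and to 4.5f without (range [-0.5, 4.5]).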
// unused
// static float normalize_grid_sample_coord(float coord, std::size_t size, bool align_corners) noexcept {
// return align_corners ? normalized_coord(coord, 0.0f, static_cast<float>(size) - 1.0f)
// : normalized_coord(coord, -0.5f, static_cast<float>(size) - 0.5f);
// }
static float update_normalized_coord_with_padding(float coord, Aidge::GridSample_Op::PaddingMode padding_mode) {
if (!in_bound(coord, -1.0f, 1.0f)) {
if (padding_mode == Aidge::GridSample_Op::PaddingMode::Border) {
coord = std::min(std::max(-1.0f, coord), 1.0f);
}
else if (padding_mode == Aidge::GridSample_Op::PaddingMode::Reflection) {
float abs_coord = std::fabs(coord);
float int_coord = std::truncf(abs_coord);
std::int32_t nb_refl = static_cast<std::int32_t>((int_coord - 1) / 2);
float res = ((nb_refl + 1)*2) - abs_coord;
coord = (coord > 0) ? (nb_refl % 2 == 0 ? res : -res) \
: (nb_refl % 2 == 0 ? -res : res);
}
}
return coord;
}
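// Example: a normalized coord of 1.3 is clamped to 1.0 with Border padding,
// and folded back to 0.7 with Reflection (reflected about the +1 edge).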
static std::int64_t update_unnormalized_coord_with_padding(std::int64_t coord, std::int64_t size, Aidge::GridSample_Op::PaddingMode padding_mode) {
if (!in_bound(coord, 0, size)) {
// out of bound. switch padding mode
if (padding_mode == Aidge::GridSample_Op::PaddingMode::Border) {
coord = std::min(std::max(std::int64_t(0), coord), size-std::int64_t(1));
} else if (padding_mode == Aidge::GridSample_Op::PaddingMode::Reflection) {
            const std::int64_t quotient = coord / (size-1);
            const std::int64_t remainder = std::abs(coord - quotient*(size-1));
            coord = (quotient % 2 == 0) ? remainder : size - 1 - remainder;
}
}
return coord;
}
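// Example: for size = 5 (valid indices 0..4), coord = 6 becomes 4 with
// Border padding and 2 with Reflection (quotient = 1, remainder = 2).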
namespace Aidge {
/**
 * @brief Forward kernel for 1D GridSample on CPU backend.
 * @tparam I Input data type.
 * @tparam O Output data type.
 * @param op Reference to the GridSample_Op, used to read its attributes.
 * @param in0 const input Tensor.
 * @param in1 const grid Tensor.
 * @param out Output Tensor.
 */
template <class I, class O>
void GridSampleImpl1D_cpu_forward_kernel(const GridSample_Op& op,
const std::shared_ptr<Tensor>& in0,
const std::shared_ptr<Tensor>& in1,
const std::shared_ptr<Tensor>& out)
{
const I* const input = static_cast<const I *>(in0->getImpl()->rawPtr());
const I* input_ptr = input;
float* const grid = static_cast<float*>(in1->getImpl()->rawPtr());
float* grid_ptr = grid;
O* const output = static_cast<O*>(out->getImpl()->rawPtr());
O* output_ptr = output;
const std::size_t N = in0->dim(0);
const std::size_t C = in0->dim(1);
const std::size_t in_H = in0->dim(2);
const std::size_t grid_H = in1->dim(1);
const std::size_t in_N_s = in0->stride(0);
const std::size_t in_C_s = in0->stride(1);
const std::size_t in_H_s = in0->stride(2);
const std::size_t grid_N_s = in1->stride(0);
const std::size_t grid_H_s = in1->stride(1);
const std::size_t out_N_s = out->stride(0);
const std::size_t out_C_s = out->stride(1);
const std::size_t out_H_s = out->stride(2);
float* grid_ptr_N = grid;
const I* input_ptr_N = input;
O* output_ptr_N = output;
for (std::size_t n = 0; n < N; ++n) {
grid_ptr = grid_ptr_N;
for (std::size_t grid_x = 0; grid_x < grid_H; ++grid_x) {
output_ptr = output_ptr_N + grid_x*out_H_s;
            /*
             * 1. remap the grid coordinate according to padding_mode
             * 2. unnormalize it from [-1, 1] to [0, H-1] (align_corners) or [-0.5, H-0.5]
             * 3. interpolate according to the mode (nearest / linear / cubic);
             *    any sample falling outside the input bounds contributes 0
             */
float x = *grid_ptr;
x = update_normalized_coord_with_padding(x, op.paddingMode());
x = unnormalize_grid_sample_coord(x, in_H, op.alignCorners());
if (op.mode() == GridSample_Op::Mode::Nearest) {
const std::int64_t x_rounded = std::nearbyintf(x);
if (in_bound(x_rounded, 0, in_H)) {
input_ptr = input_ptr_N + x_rounded*in_H_s;
for (std::size_t c = 0; c < C; ++c) {
*output_ptr = *input_ptr;
input_ptr += in_C_s;
output_ptr += out_C_s;
}
} else {
for (std::size_t c = 0; c < C; ++c) {
*output_ptr = O(0);
output_ptr += out_C_s;
}
}
} else if (op.mode() == GridSample_Op::Mode::Linear) {
const std::int64_t x_inf = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(x)), in_H, op.paddingMode());
const std::int64_t x_sup = update_unnormalized_coord_with_padding(x_inf + 1, in_H, op.paddingMode());
const I* input_ptr_NC = input_ptr_N;
for (std::size_t c = 0; c < C; ++c) {
const I f_inf = in_bound(x_inf, 0, in_H) ?
input_ptr_NC[static_cast<std::size_t>(x_inf)*in_H_s] : I(0);
const I f_sup = in_bound(x_sup, 0, in_H) ?
input_ptr_NC[static_cast<std::size_t>(x_sup)*in_H_s] : I(0);
                    // linear interpolation: each neighbor is weighted by the distance to the opposite node
                    *output_ptr = static_cast<O>(static_cast<I>(x_sup - x)*f_inf \
                        + static_cast<I>(x - x_inf)*f_sup);
input_ptr_NC += in_C_s;
output_ptr += out_C_s;
}
} else if (op.mode() == GridSample_Op::Mode::Cubic) {
const std::int64_t x_inf = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(x)), in_H, op.paddingMode());
const std::int64_t x_sup = update_unnormalized_coord_with_padding(x_inf + 1, in_H, op.paddingMode());
const std::int64_t x_inf_inf = update_unnormalized_coord_with_padding(x_inf - 1, in_H, op.paddingMode());
const std::int64_t x_sup_sup = update_unnormalized_coord_with_padding(x_sup + 1, in_H, op.paddingMode());
const I x1 = static_cast<I>(x - static_cast<float>(x_inf));
const I x2 = x1 * x1;
const I x3 = x1 * x2;
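                // Cubic Hermite interpolation on [x_inf, x_sup]: m_inf/m_sup below are
                // central-difference slope estimates at the two inner nodes, making the
                // curve a Catmull-Rom spline through the four neighboring samples.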
const I* input_ptr_NC = input_ptr_N;
for (std::size_t c = 0; c < C; ++c) {
const I f_inf_inf = in_bound(x_inf_inf, 0, in_H) ? input_ptr_NC[x_inf_inf*in_H_s] : I(0);
const I f_inf = in_bound(x_inf, 0, in_H) ? input_ptr_NC[x_inf*in_H_s] : I(0);
const I f_sup = in_bound(x_sup, 0, in_H) ? input_ptr_NC[x_sup*in_H_s] : I(0);
const I f_sup_sup = in_bound(x_sup_sup, 0, in_H) ? input_ptr_NC[x_sup_sup*in_H_s] : I(0);
const I m_inf = (f_sup - f_inf_inf) / I(2);
const I m_sup = (f_sup_sup - f_inf) / I(2);
*output_ptr = f_inf \
+ x1 * m_inf \
+ x2 * (3 * (f_sup - f_inf) - 2 * m_inf - m_sup) \
+ x3 * (2*(f_inf - f_sup) + m_inf + m_sup);
input_ptr_NC += in_C_s;
output_ptr += out_C_s;
}
}
grid_ptr += grid_H_s;
}
input_ptr_N += in_N_s;
grid_ptr_N += grid_N_s;
output_ptr_N += out_N_s;
}
}
// Kernels registration to implementation entry point
// only accepts a 1st input with a single spatial dimension (1D case)
REGISTRAR(GridSampleImpl_cpu,
{{{DataType::Any, DataFormat::Any, {{-1, -1}}}, {DataType::Any}}, {{DataType::Float16}}},
{ProdConso::defaultModel, Aidge::GridSampleImpl1D_cpu_forward_kernel<half_float::half, half_float::half>, nullptr});
REGISTRAR(GridSampleImpl_cpu,
{{{DataType::Any, DataFormat::Any, {{-1, -1}}}, {DataType::Any}}, {{DataType::Float32}}},
{ProdConso::defaultModel, Aidge::GridSampleImpl1D_cpu_forward_kernel<float, float>, nullptr});
REGISTRAR(GridSampleImpl_cpu,
{{{DataType::Any, DataFormat::Any, {{-1, -1}}}, {DataType::Any}}, {{DataType::Float64}}},
{ProdConso::defaultModel, Aidge::GridSampleImpl1D_cpu_forward_kernel<double, double>, nullptr});
REGISTRAR(GridSampleImpl_cpu,
{{{DataType::Any, DataFormat::Any, {{-1, -1}}}, {DataType::Any}}, {{DataType::Int32}}},
{ProdConso::defaultModel, Aidge::GridSampleImpl1D_cpu_forward_kernel<int32_t, int32_t>, nullptr});
/**
 * @brief Forward kernel for 2D GridSample on CPU backend.
 * @tparam I Input data type.
 * @tparam O Output data type.
 * @param op Reference to the GridSample_Op, used to read its attributes.
 * @param in0 const input Tensor.
 * @param in1 const grid Tensor.
 * @param out Output Tensor.
 */
template <class I, class O>
void GridSampleImpl2D_cpu_forward_kernel(const GridSample_Op& op,
const std::shared_ptr<Tensor>& in0,
const std::shared_ptr<Tensor>& in1,
const std::shared_ptr<Tensor>& out)
{
    const I* const input = static_cast<const I *>(in0->getImpl()->rawPtr());
    const I* input_ptr = input;
    float* const grid = static_cast<float*>(in1->getImpl()->rawPtr());
    float* grid_ptr = grid;
O* const output = static_cast<O*>(out->getImpl()->rawPtr());
const std::size_t N = in0->dim(0);
const std::size_t C = in0->dim(1);
const std::size_t in_H = in0->dim(2);
const std::size_t in_W = in0->dim(3);
const std::size_t grid_H = in1->dim(1);
const std::size_t grid_W = in1->dim(2);
const std::size_t in_N_s = in0->stride(0);
const std::size_t in_C_s = in0->stride(1);
const std::size_t in_H_s = in0->stride(2);
const std::size_t in_W_s = in0->stride(3);
const std::size_t grid_N_s = in1->stride(0);
const std::size_t grid_H_s = in1->stride(1);
const std::size_t grid_W_s = in1->stride(2);
const std::size_t grid_Coord_s = in1->stride(3);
const std::size_t out_N_s = out->stride(0);
const std::size_t out_C_s = out->stride(1);
const std::size_t out_H_s = out->stride(2);
const std::size_t out_W_s = out->stride(3);
float* grid_ptr_N = grid;
const I* input_ptr_N = input;
O* output_ptr_N = output;
for (std::size_t n = 0; n < N; ++n) {
for (std::size_t grid_y = 0; grid_y < grid_H; ++grid_y) {
for (std::size_t grid_x = 0; grid_x < grid_W; ++grid_x) {
                O* output_ptr = output_ptr_N + grid_y*out_H_s + grid_x*out_W_s;
grid_ptr = grid_ptr_N + grid_y*grid_H_s + grid_x*grid_W_s;
                /*
                 * 1. remap the (x, y) grid coordinates according to padding_mode
                 * 2. unnormalize them from [-1, 1] to [0, W-1] / [0, H-1] (align_corners)
                 *    or [-0.5, W-0.5] / [-0.5, H-0.5]
                 * 3. interpolate according to the mode (nearest / linear / cubic);
                 *    any sample falling outside the input bounds contributes 0
                 */
float x = *grid_ptr;
float y = grid_ptr[grid_Coord_s];
x = update_normalized_coord_with_padding(x, op.paddingMode());
x = unnormalize_grid_sample_coord(x, in_W, op.alignCorners());
y = update_normalized_coord_with_padding(y, op.paddingMode());
y = unnormalize_grid_sample_coord(y, in_H, op.alignCorners());
if (op.mode() == GridSample_Op::Mode::Nearest) {
const std::int64_t x_rounded = std::nearbyintf(x);
const std::int64_t y_rounded = std::nearbyintf(y);
if (in_bound(x_rounded, 0, in_W) && in_bound(y_rounded, 0, in_H)) {
input_ptr = input_ptr_N + y_rounded*in_H_s + x_rounded*in_W_s;
for (std::size_t c = 0; c < C; ++c) {
*output_ptr = *input_ptr;
input_ptr += in_C_s;
output_ptr += out_C_s;
}
} else {
for (std::size_t c = 0; c < C; ++c) {
*output_ptr = O(0);
output_ptr += out_C_s;
}
}
} else if (op.mode() == GridSample_Op::Mode::Linear) {
                    const std::int64_t x_r = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(x)), in_W, op.paddingMode()); // floor of x (left column, despite the name)
                    const std::int64_t x_l = update_unnormalized_coord_with_padding(x_r + 1, in_W, op.paddingMode()); // floor + 1 (right column)
                    const std::int64_t y_t = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(y)), in_H, op.paddingMode()); // top
                    const std::int64_t y_b = update_unnormalized_coord_with_padding(y_t + 1, in_H, op.paddingMode()); // bottom
const I* input_ptr_NC = input_ptr_N;
for (std::size_t c = 0; c < C; ++c) {
const I f_tr = (in_bound(x_r, 0, in_W) && in_bound(y_t, 0, in_H)) ?
input_ptr_NC[static_cast<std::size_t>(y_t)*in_H_s
+ static_cast<std::size_t>(x_r)*in_W_s]
: I(0);
const I f_tl = (in_bound(x_l, 0, in_W) && in_bound(y_t, 0, in_H)) ?
input_ptr_NC[static_cast<std::size_t>(y_t)*in_H_s
+ static_cast<std::size_t>(x_l)*in_W_s]
: I(0);
const I f_br = (in_bound(x_r, 0, in_W) && in_bound(y_b, 0, in_H)) ?
input_ptr_NC[static_cast<std::size_t>(y_b)*in_H_s
+ static_cast<std::size_t>(x_r)*in_W_s]
: I(0);
const I f_bl = (in_bound(x_l, 0, in_W) && in_bound(y_b, 0, in_H)) ?
input_ptr_NC[static_cast<std::size_t>(y_b)*in_H_s
+ static_cast<std::size_t>(x_l)*in_W_s]
: I(0);
                        // compute the weighted sum of the 4 corners: each corner is weighted by
                        // the area of the cell opposite to it, so the weights sum to 1 (bilinear)
                        const I w_tr = static_cast<I>((static_cast<float>(y_b) - y)*(static_cast<float>(x_l) - x));
                        const I w_tl = static_cast<I>((static_cast<float>(y_b) - y)*(x - static_cast<float>(x_r)));
                        const I w_br = static_cast<I>((y - static_cast<float>(y_t))*(static_cast<float>(x_l) - x));
                        const I w_bl = static_cast<I>((y - static_cast<float>(y_t))*(x - static_cast<float>(x_r)));
*output_ptr = static_cast<O>(w_tr*f_tr + w_tl*f_tl + w_br*f_br + w_bl*f_bl);
input_ptr_NC += in_C_s;
output_ptr += out_C_s;
}
} else if (op.mode() == GridSample_Op::Mode::Cubic) {
/*
* .. .. .. .. .. ..
* .. 00 01 02 03 ..
* .. 10 11 12 13 ..
* .. 20 21 22 23 ..
* .. 30 31 32 33 ..
* .. .. .. .. .. ..
*/
const std::int64_t x_1 = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(x)), in_W, op.paddingMode());
const std::int64_t x_0 = update_unnormalized_coord_with_padding(x_1 - 1, in_W, op.paddingMode());
const std::int64_t x_2 = update_unnormalized_coord_with_padding(x_1 + 1, in_W, op.paddingMode());
const std::int64_t x_3 = update_unnormalized_coord_with_padding(x_1 + 2, in_W, op.paddingMode());
const std::int64_t y_1 = update_unnormalized_coord_with_padding(static_cast<std::int64_t>(std::floor(y)), in_H, op.paddingMode());
const std::int64_t y_0 = update_unnormalized_coord_with_padding(y_1 - 1, in_H, op.paddingMode());
const std::int64_t y_2 = update_unnormalized_coord_with_padding(y_1 + 1, in_H, op.paddingMode());
const std::int64_t y_3 = update_unnormalized_coord_with_padding(y_1 + 2, in_H, op.paddingMode());
const I* input_ptr_NC = input_ptr_N;
for (std::size_t c = 0; c < C; ++c) {
const I f_00 = in_bound(x_0, 0, in_W) && in_bound(y_0, 0, in_H) ?
input_ptr_NC[x_0*in_W_s + y_0*in_H_s] : I(0);
const I f_01 = in_bound(x_0, 0, in_W) && in_bound(y_1, 0, in_H) ?
input_ptr_NC[x_0*in_W_s + y_1*in_H_s] : I(0);
const I f_02 = in_bound(x_0, 0, in_W) && in_bound(y_2, 0, in_H) ?
input_ptr_NC[x_0*in_W_s + y_2*in_H_s] : I(0);
const I f_03 = in_bound(x_0, 0, in_W) && in_bound(y_3, 0, in_H) ?
input_ptr_NC[x_0*in_W_s + y_3*in_H_s] : I(0);
const I f_10 = in_bound(x_1, 0, in_W) && in_bound(y_0, 0, in_H) ?
input_ptr_NC[x_1*in_W_s + y_0*in_H_s] : I(0);
const I f_20 = in_bound(x_2, 0, in_W) && in_bound(y_0, 0, in_H) ?
input_ptr_NC[x_2*in_W_s + y_0*in_H_s] : I(0);
const I f_30 = in_bound(x_3, 0, in_W) && in_bound(y_0, 0, in_H) ?
input_ptr_NC[x_3*in_W_s + y_0*in_H_s] : I(0);
const I f_11 = in_bound(x_1, 0, in_W) && in_bound(y_1, 0, in_H) ?
input_ptr_NC[x_1*in_W_s + y_1*in_H_s] : I(0);
const I f_12 = in_bound(x_1, 0, in_W) && in_bound(y_2, 0, in_H) ?
input_ptr_NC[x_1*in_W_s + y_2*in_H_s] : I(0);
const I f_13 = in_bound(x_1, 0, in_W) && in_bound(y_3, 0, in_H) ?
input_ptr_NC[x_1*in_W_s + y_3*in_H_s] : I(0);
const I f_21 = in_bound(x_2, 0, in_W) && in_bound(y_1, 0, in_H) ?
input_ptr_NC[x_2*in_W_s + y_1*in_H_s] : I(0);
const I f_22 = in_bound(x_2, 0, in_W) && in_bound(y_2, 0, in_H) ?
input_ptr_NC[x_2*in_W_s + y_2*in_H_s] : I(0);
const I f_23 = in_bound(x_2, 0, in_W) && in_bound(y_3, 0, in_H) ?
input_ptr_NC[x_2*in_W_s + y_3*in_H_s] : I(0);
const I f_31 = in_bound(x_3, 0, in_W) && in_bound(y_1, 0, in_H) ?
input_ptr_NC[x_3*in_W_s + y_1*in_H_s] : I(0);
const I f_32 = in_bound(x_3, 0, in_W) && in_bound(y_2, 0, in_H) ?
input_ptr_NC[x_3*in_W_s + y_2*in_H_s] : I(0);
const I f_33 = in_bound(x_3, 0, in_W) && in_bound(y_3, 0, in_H) ?
input_ptr_NC[x_3*in_W_s + y_3*in_H_s] : I(0);
const I mx_11 = (f_21 - f_01) / I(2);
const I mx_12 = (f_22 - f_02) / I(2);
const I mx_21 = (f_31 - f_11) / I(2);
const I mx_22 = (f_32 - f_12) / I(2);
const I my_11 = (f_12 - f_10) / I(2);
const I my_12 = (f_13 - f_11) / I(2);
const I my_21 = (f_22 - f_20) / I(2);
const I my_22 = (f_23 - f_21) / I(2);
                        const I mxy_11 = (f_22 - f_20 - f_02 + f_00) / I(4);
                        const I mxy_12 = (f_23 - f_21 - f_03 + f_01) / I(4);
                        const I mxy_21 = (f_32 - f_30 - f_12 + f_10) / I(4);
                        const I mxy_22 = (f_33 - f_31 - f_13 + f_11) / I(4);
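                        // mx_*, my_* and mxy_* are central-difference estimates of df/dx,
                        // df/dy and d2f/dxdy at the four inner nodes; a_00..a_33 below are
                        // the coefficients of the resulting bicubic surface patch.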
const I a_00 = f_11;
const I a_10 = mx_11;
const I a_20 = I(3)*(f_21 - f_11) - I(2)*mx_11 - mx_21;
const I a_30 = I(2)*(f_11 - f_21) + mx_11 + mx_21;
const I a_01 = my_11;
const I a_11 = mxy_11;
const I a_21 = I(3)*(my_21 - my_11) - I(2)*mxy_11 - mxy_21;
const I a_31 = I(2)*(my_11 - my_21) + mxy_11 + mxy_21;
const I a_02 = I(3)*(f_12 - f_11) - I(2)*my_11 - my_12;
const I a_12 = I(3)*(mx_12 - mx_11) - I(2)*mxy_11 - mxy_12;
const I a_22 = I(9)*(f_11 + f_22 - f_21 - f_12) + I(3)*(I(2)*(mx_11 - mx_12 + my_11 - my_21) + mx_21 - mx_22 + my_12 - my_22) + mxy_22 + I(2)*(mxy_12 + mxy_21 + I(2)*mxy_11);
const I a_32 = - mxy_12 - mxy_22 + I(2)*(my_22 - my_12 - mxy_11 - mxy_21 + I(2)*(my_21 - my_11) + I(3)*(f_21 + f_12 - f_11 - f_22)) + I(3)*(mx_12 + mx_22 - mx_11 - mx_21);
const I a_03 = I(2)*(f_11 - f_12) + my_11 + my_12;
const I a_13 = I(2)*(mx_11 - mx_12) + mxy_11 + mxy_12;
const I a_23 = - mxy_21 - mxy_22 + I(2)*(-mx_21 + mx_22 - mxy_11 - mxy_12 + I(2)*(mx_12 - mx_11) + I(3)*(f_12 + f_21 - f_11 - f_22)) + I(3)*(my_21 + my_22 - my_11 - my_12);
const I a_33 = mxy_11 + mxy_21 + mxy_12 + mxy_22 + I(2)*(mx_11 + mx_21 - mx_12 - mx_22 + my_11 - my_21 + my_12 - my_22 + I(2)*(f_11 - f_21 - f_12 + f_22));
const I x2 = static_cast<I>(x*x);
const I x3 = static_cast<I>(x*x*x);
const I y2 = static_cast<I>(y*y);
const I y3 = static_cast<I>(y*y*y);
*output_ptr = static_cast<O>( \
a_00 + a_10*x + a_20*x2 + a_30*x3 \
+ a_01*y + a_11*x*y + a_21*x2*y + a_31*x3*y \
+ a_02*y2 + a_12*x*y2 + a_22*x2*y2 + a_32*x3*y2 \
+ a_03*y3 + a_13*x*y3 + a_23*x2*y3 + a_33*x3*y3);
input_ptr_NC += in_C_s;
output_ptr += out_C_s;
}
}
}
}
input_ptr_N += in_N_s;
grid_ptr_N += grid_N_s;
output_ptr_N += out_N_s;
}
}
// Kernels registration to implementation entry point
// only accepts a 1st input with two spatial dimensions (2D case)
REGISTRAR(GridSampleImpl_cpu,
{{{DataType::Any, DataFormat::Any, {{-1, -1}, {-1, -1}}}, {DataType::Any}}, {{DataType::Float16}}},
{ProdConso::defaultModel, Aidge::GridSampleImpl2D_cpu_forward_kernel<half_float::half, half_float::half>, nullptr});
REGISTRAR(GridSampleImpl_cpu,
{{{DataType::Any, DataFormat::Any, {{-1, -1}, {-1, -1}}}, {DataType::Any}}, {{DataType::Float32}}},
{ProdConso::defaultModel, Aidge::GridSampleImpl2D_cpu_forward_kernel<float, float>, nullptr});
REGISTRAR(GridSampleImpl_cpu,
{{{DataType::Any, DataFormat::Any, {{-1, -1}, {-1, -1}}}, {DataType::Any}}, {{DataType::Float64}}},
{ProdConso::defaultModel, Aidge::GridSampleImpl2D_cpu_forward_kernel<double, double>, nullptr});
REGISTRAR(GridSampleImpl_cpu,
{{{DataType::Any, DataFormat::Any, {{-1, -1}, {-1, -1}}}, {DataType::Any}}, {{DataType::Int32}}},
{ProdConso::defaultModel, Aidge::GridSampleImpl2D_cpu_forward_kernel<int32_t, int32_t>, nullptr});
} // namespace Aidge
#endif /* AIDGE_CPU_OPERATOR_GRIDSAMPLEIMPL_KERNELS_H_ */