From f43bebf77f0286f0c9d065f9b332f6871290416c Mon Sep 17 00:00:00 2001
From: Vincent TEMPLIER <vincent.templier@cea.fr>
Date: Wed, 26 Jul 2023 13:47:17 +0000
Subject: [PATCH] Add other operators in CPU library

---
 aidge/_CPU/include/operator/ConvImpl.hpp      |  58 ++++++++
 .../include/operator/ConvImpl_kernels.hpp     | 132 ++++++++++++++++++
 aidge/_CPU/include/operator/FCImpl.hpp        |  51 +++++++
 .../_CPU/include/operator/FCImpl_kernels.hpp  | 117 ++++++++++++++++
 aidge/_CPU/include/operator/ProducerImpl.hpp  |  45 ++++++
 aidge/_CPU/include/operator/ReLUImpl.hpp      |  51 +++++++
 .../include/operator/ReLUImpl_kernels.hpp     |  33 +++++
 aidge/_CPU/src/operator/ConvImpl.cpp          |  75 ++++++++++
 aidge/_CPU/src/operator/FCImpl.cpp            | 114 +++++++++++++++
 aidge/_CPU/src/operator/ProducerImpl.cpp      |  62 ++++++++
 aidge/_CPU/src/operator/ReLUImpl.cpp          |  68 +++++++++
 11 files changed, 806 insertions(+)
 create mode 100644 aidge/_CPU/include/operator/ConvImpl.hpp
 create mode 100644 aidge/_CPU/include/operator/ConvImpl_kernels.hpp
 create mode 100644 aidge/_CPU/include/operator/FCImpl.hpp
 create mode 100644 aidge/_CPU/include/operator/FCImpl_kernels.hpp
 create mode 100644 aidge/_CPU/include/operator/ProducerImpl.hpp
 create mode 100644 aidge/_CPU/include/operator/ReLUImpl.hpp
 create mode 100644 aidge/_CPU/include/operator/ReLUImpl_kernels.hpp
 create mode 100644 aidge/_CPU/src/operator/ConvImpl.cpp
 create mode 100644 aidge/_CPU/src/operator/FCImpl.cpp
 create mode 100644 aidge/_CPU/src/operator/ProducerImpl.cpp
 create mode 100644 aidge/_CPU/src/operator/ReLUImpl.cpp

diff --git a/aidge/_CPU/include/operator/ConvImpl.hpp b/aidge/_CPU/include/operator/ConvImpl.hpp
new file mode 100644
index 00000000..5b7f7dca
--- /dev/null
+++ b/aidge/_CPU/include/operator/ConvImpl.hpp
@@ -0,0 +1,58 @@
+#ifndef ConvImpl2D_ref_cpp_H_
+#define ConvImpl2D_ref_cpp_H_
+
+#include <array>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include "backend/OperatorImpl.hpp"
+#include "operator/Conv.hpp"
+#include "utils/Registrar.hpp"
+#include "utils/Types.h"
+
+namespace Aidge {
+// class Conv_Op;
+
+// compute kernel registry for forward and backward
+class ConvImpl2DForward_ref_cpp
+    : public Registrable<std::tuple<DataType, DataType, DataType, DataType>,
+                         void(const Conv_Op<2>::Parameters &, const std::array<DimSize_t, 4> &, const void *,
+                              const void *, const void *, void *)> {};
+class ConvImpl2DBackward_ref_cpp
+    : public Registrable<std::tuple<DataType, DataType, DataType, DataType>,
+                         void(const Conv_Op<2>::Parameters &, const std::array<DimSize_t, 4> &, const void *,
+                              const void *, const void *, void *)> {};
+
+class ConvImpl2D_ref_cpp : public OperatorImpl {
+   private:
+    const Conv_Op<2> &mOp;
+    std::array<NbElts_t, 3> mNbConsumedData;
+    std::array<NbElts_t, 1> mNbProducedData;
+
+   public:
+    ConvImpl2D_ref_cpp(const Conv_Op<2> &op) : mOp(op), mNbConsumedData({0, 0, 0}), mNbProducedData({0}) {}
+
+    static std::unique_ptr<ConvImpl2D_ref_cpp> create(const Conv_Op<2> &op) {
+        return std::make_unique<ConvImpl2D_ref_cpp>(op);
+    }
+
+   public:
+    NbElts_t getNbRequiredData(IOIndex_t inputIdx) const override final;
+    NbElts_t getNbRequiredProtected(IOIndex_t inputIdx) const override final;
+    NbElts_t getRequiredMemory(IOIndex_t outputIdx, const std::vector<DimSize_t> &inputsSize) const override final;
+    NbElts_t getNbConsumedData(IOIndex_t inputIdx) const override final;
+    NbElts_t getNbProducedData(IOIndex_t outputIdx) const override final;
+
+    void forward();
+
+    void backward();
+};
+
+namespace {
+// add ref_cpp backend to Conv_Op<2> implementation registry
+static Registrar<Conv_Op<2>> registrarConvImpl2D_ref_cpp("cpu", Aidge::ConvImpl2D_ref_cpp::create);
+}  // namespace
+}  // namespace Aidge
+
+#endif /* ConvImpl2D_ref_cpp_H_ */
diff --git a/aidge/_CPU/include/operator/ConvImpl_kernels.hpp b/aidge/_CPU/include/operator/ConvImpl_kernels.hpp
new file mode 100644
index 00000000..ddd1764d
--- /dev/null
+++ b/aidge/_CPU/include/operator/ConvImpl_kernels.hpp
@@ -0,0 +1,132 @@
+
+
+#ifndef ConvImpl2D_ref_cpp_forward_kernel_H_
+#define ConvImpl2D_ref_cpp_forward_kernel_H_
+
+#include "utils/Registrar.hpp"
+
+#include "operator/ConvImpl.hpp"
+#include "utils/Types.h"
+#include <array>
+#include <algorithm>
+
+namespace Aidge {
+/**
+ * @brief Forward kernel for 2D Convolution on CPU backend.
+ * @tparam I Input data type.
+ * @tparam W Weight data type.
+ * @tparam B Bias data type.
+ * @tparam O Output data type.
+ * @param params tuple of Parameters from the Operator
+ * @param dims Array of input dimensions.
+ * @param input_ const input Tensor.
+ * @param weights_ const weight Tensor.
+ * @param biases_ const bias Tensor.
+ * @param output_ Output Tensor.
+ */
+template <class I, class W, class B, class O>
+void ConvImpl2D_ref_cpp_forward_kernel(const Conv_Op<2>::Parameters &params, const std::array<DimSize_t, 4> &dims,
+                                       const void *input_, const void *weights_, const void *biases_, void *output_) {
+    // FIXME: missing convolution parameters as arguments
+    const I *input = static_cast<const I *>(input_);
+    const W *weights = static_cast<const W *>(weights_);
+    const B *biases = static_cast<const B *>(biases_);
+    O *output = static_cast<O *>(output_);
+/*
+    // output H size
+    const std::size_t oxSize =
+            static_cast<std::size_t>(static_cast<float>(dims[0] - std::get<4>(params)[0] + std::get<0>(params)[0]) /
+                                static_cast<float>(std::get<0>(params)[0]));
+    // output W size
+    const std::size_t oySize =
+            static_cast<std::size_t>(static_cast<float>(dims[1] - std::get<4>(params)[1] + std::get<0>(params)[1]) /
+                                static_cast<float>(std::get<0>(params)[1]));
+
+    // TODO: kernel computation
+    // output (Xout, Yout, outCh, batch)
+    // input  (Xin, Yin, inCh, batch)
+    // weight (kernelX, kernelY, inCh, outCh)
+    // does not take Dilation parameter into account
+    for (std::size_t ox = 0; ox < oxSize; ++ox) {
+        for (std::size_t oy = 0; oy < oySize; ++oy) {
+            const std::size_t ix = ox * std::get<0>(params)[0];
+            const std::size_t iy = oy * std::get<0>(params)[1];
+
+            for (std::size_t outCh = 0; outCh < std::get<3>(params); ++outCh) {
+                const std::size_t oIndex = dims[3] * (outCh + std::get<3>(params) * (oy + oySize * ox));
+                B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
+                for (std::size_t batch = 0; batch < dims[3]; ++batch) {
+                    output[oIndex + batch] = biasVal;
+                }
+                for (std::size_t inCh = 0; inCh < dims[2]; ++inCh) {
+                    for (std::size_t sx = 0; sx < std::get<4>(params)[0]; ++sx) {
+                        for (std::size_t sy = 0; sy < std::get<4>(params)[1]; ++sy) {
+                            const std::size_t wIndex =
+                                    outCh + std::get<3>(params) * (inCh + dims[2] * (sy + std::get<4>(params)[1] * sx));
+                            std::size_t iIndex = dims[3] * (inCh + dims[2] * ((iy + sy) + dims[1] * (ix + sx)));
+                            for (std::size_t batch = 0; batch < dims[3]; ++batch) {
+                                output[oIndex + batch] += weights[wIndex] * input[iIndex + batch];
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+*/
+
+    
+    // output H size
+    const std::size_t oxSize =
+            static_cast<std::size_t>(static_cast<float>(dims[2] - std::get<4>(params)[0] + std::get<0>(params)[0]) /
+                                static_cast<float>(std::get<0>(params)[0]));
+    // output W size
+    const std::size_t oySize =
+            static_cast<std::size_t>(static_cast<float>(dims[3] - std::get<4>(params)[1] + std::get<0>(params)[1]) /
+                                static_cast<float>(std::get<0>(params)[1]));
+
+    // TODO: kernel computation
+    // output (batch, outCh, Xout, Yout)
+    // input  (batch, inCh, Xin, Yin)
+    // weight (outCh, inCh, kernelX, kernelY)
+    // does not take Dilation parameter into account
+    for (std::size_t batch = 0; batch < dims[0]; ++batch) {
+        for (std::size_t outCh = 0; outCh < std::get<3>(params); ++outCh) {
+            const std::size_t oIndex = (outCh + batch*std::get<3>(params)) * oxSize * oySize;
+            B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
+            std::fill(output + oIndex, output+(oIndex+oxSize*oySize), biasVal);
+            for (std::size_t inCh = 0; inCh < dims[1]; ++inCh) {
+                const std::size_t iIndex = (inCh + batch*dims[1]) * dims[2] * dims[3];
+                const std::size_t wIndex = (inCh + outCh*dims[1]) * std::get<4>(params)[0] * std::get<4>(params)[1];
+                for (std::size_t ox = 0; ox < oxSize; ++ox) {
+                    for (std::size_t oy = 0; oy < oySize; ++oy) {
+                        const std::size_t oIndexFull = oIndex + ox*oySize + oy;
+                        const std::size_t ix = ox * std::get<0>(params)[0];
+                        const std::size_t iy = oy * std::get<0>(params)[1];
+                        for (std::size_t sx = 0; sx < std::get<4>(params)[0]; ++sx) {
+                            for (std::size_t sy = 0; sy < std::get<4>(params)[1]; ++sy) {
+                                output[oIndexFull] += weights[wIndex + sx*std::get<4>(params)[1] + sy] * 
+                                                        input[iIndex + (ix+sx)*dims[3] + (iy+sy)];
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+namespace {
+static Registrar<ConvImpl2DForward_ref_cpp> registrarConvImpl2DForward_ref_cpp_Float32(
+        {DataType::Float32, DataType::Float32, DataType::Float32, DataType::Float32},
+        Aidge::ConvImpl2D_ref_cpp_forward_kernel<float, float, float, float>);
+static Registrar<ConvImpl2DForward_ref_cpp> registrarConvImpl2DForward_ref_cpp_Int32(
+        {DataType::Int32, DataType::Int32, DataType::Int32, DataType::Int32},
+        Aidge::ConvImpl2D_ref_cpp_forward_kernel<int, int, int, int>);
+static Registrar<ConvImpl2DForward_ref_cpp> registrarConvImpl2DForward_ref_cpp_Float64(
+        {DataType::Float64, DataType::Float64, DataType::Float64, DataType::Float64},
+        Aidge::ConvImpl2D_ref_cpp_forward_kernel<double, double, double, double>);
+}  // namespace
+}  // namespace Aidge
+
+#endif /* ConvImpl2D_ref_cpp_forward_kernel_H_ */
diff --git a/aidge/_CPU/include/operator/FCImpl.hpp b/aidge/_CPU/include/operator/FCImpl.hpp
new file mode 100644
index 00000000..dd973409
--- /dev/null
+++ b/aidge/_CPU/include/operator/FCImpl.hpp
@@ -0,0 +1,51 @@
+#ifndef FCImpl_ref_cpp_H_
+#define FCImpl_ref_cpp_H_
+
+#include "backend/OperatorImpl.hpp"
+#include "operator/FC.hpp"
+#include "utils/Registrar.hpp"
+#include "utils/Types.h"
+#include <memory>
+#include <vector>
+#include <array>
+
+namespace Aidge {
+// class FC_Op;
+
+// compute kernel registry for forward and backward
+class FCImplForward_ref_cpp : public Registrable<std::tuple<DataType, DataType, DataType, DataType>,
+                                                 void(const FC_Op::Parameters &, const DimSize_t, const DimSize_t,
+                                                      const void *, const void *, const void *, void *)> {};
+class FCImplBackward_ref_cpp : public Registrable<std::tuple<DataType, DataType, DataType, DataType>,
+                                                  void(const FC_Op::Parameters &, const DimSize_t, const DimSize_t,
+                                                       const void *, const void *, const void *, void *)> {};
+
+class FCImpl_ref_cpp : public OperatorImpl {
+   private:
+    const FC_Op &mOp;
+    std::array<NbElts_t, 3> mNbConsumedData;
+    std::array<NbElts_t, 1> mNbProducedData;
+
+   public:
+    FCImpl_ref_cpp(const FC_Op &op) : mOp(op), mNbConsumedData({0, 0, 0}), mNbProducedData({0}) {}
+
+    static std::unique_ptr<FCImpl_ref_cpp> create(const FC_Op &op) { return std::make_unique<FCImpl_ref_cpp>(op); }
+
+   public:
+    NbElts_t getNbRequiredData(IOIndex_t inputIdx) const override final;
+    NbElts_t getNbRequiredProtected(IOIndex_t inputIdx) const override final;
+    NbElts_t getRequiredMemory(IOIndex_t outputIdx, const std::vector<DimSize_t> &inputsSize) const override final;
+    NbElts_t getNbConsumedData(IOIndex_t inputIdx) const override final;
+    NbElts_t getNbProducedData(IOIndex_t outputIdx) const override final;
+
+    void forward();
+
+    void backward();
+};
+
+namespace {
+static Registrar<FC_Op> registrarFCImpl_ref_cpp("cpu", Aidge::FCImpl_ref_cpp::create);
+}
+}  // namespace Aidge
+
+#endif /* FCImpl_ref_cpp_H_ */
diff --git a/aidge/_CPU/include/operator/FCImpl_kernels.hpp b/aidge/_CPU/include/operator/FCImpl_kernels.hpp
new file mode 100644
index 00000000..9b0f9ef6
--- /dev/null
+++ b/aidge/_CPU/include/operator/FCImpl_kernels.hpp
@@ -0,0 +1,118 @@
+#ifndef FCImpl_ref_cpp_forward_kernel_H_
+#define FCImpl_ref_cpp_forward_kernel_H_
+
+#include "utils/Registrar.hpp"
+#include <algorithm>
+#include <numeric>
+
+#include "operator/FCImpl.hpp"
+
+namespace Aidge {
+// template <class I, class W, class B, class O>
+// void FCImpl_ref_cpp_forward_kernel(const FC_Op::Parameters& params, const std::array<DimSize_t, 4>& dims,
+//                                    const void* input_, const void* weights_, const void* biases_, void* output_) {
+//     // FIXME: missing FC parameters as arguments
+//     const I* input = static_cast<const I*>(input_);
+//     const W* weights = static_cast<const W*>(weights_);
+//     const B* biases = static_cast<const B*>(biases_);
+//     O* output = static_cast<O*>(output_);
+
+//     for (std::size_t outIdx = 0; outIdx < std::get<0>(params); ++outIdx) {
+//         std::size_t oIndex = outIdx * dims[3];
+//         const B bias = std::get<1>(params) ? B(0) : biases[outIdx];
+//         for (std::size_t batch = 0; batch < dims[3]; ++batch) {
+//             output[oIndex + batch] = bias;
+//         }
+//     }
+
+//     for (std::size_t ix = 0; ix < dims[0]; ++ix) {
+//         for (std::size_t iy = 0; iy < dims[1]; ++iy) {
+//             for (std::size_t inCh = 0; inCh < dims[2]; ++inCh) {
+//                 const std::size_t iIndex = dims[3] * (inCh + dims[2] * (iy + dims[1] * ix));
+//                 for (std::size_t outCh = 0; outCh < std::get<0>(params); ++outCh) {
+//                     const std::size_t oIndex = dims[3] * outCh;
+//                     const std::size_t wIndex = (inCh + dims[2] * (iy + dims[1] * ix)) * std::get<0>(params) +
+//                                           outCh;  // (iIndex*std::get<0>(params) + oIndex)/dims[3];
+//                     for (std::size_t batch = 0; batch < dims[3]; ++batch) {
+//                         output[oIndex + batch] += weights[wIndex] * input[iIndex + batch];
+//                     }
+//                 }
+//             }
+//         }
+//     }
+// }
+
+// template <class I, class W, class B, class O>
+// void FCImpl_ref_cpp_forward_kernel(const FC_Op::Parameters& params, const std::array<DimSize_t, 2>& dims,
+//                                    const void* input_, const void* weights_, const void* biases_, void* output_) {
+//     // FIXME: missing FC parameters as arguments
+//     const I* input = static_cast<const I*>(input_);
+//     const W* weights = static_cast<const W*>(weights_);
+//     const B* biases = static_cast<const B*>(biases_);
+//     O* output = static_cast<O*>(output_);
+
+//     // let's have I.dims() = [N, C, H, W] instead of [H, W, C, N]
+
+//     for (std::size_t outIdx = 0; outIdx < std::get<0>(params); ++outIdx) {
+//         std::size_t oIndex = outIdx * dims[0];
+//         const B bias = std::get<1>(params) ? B(0) : biases[outIdx];
+//         for (std::size_t batch = 0; batch < dims[0]; ++batch) {
+//             output[oIndex + batch] = bias;
+//         }
+//     }
+
+//     for (std::size_t batch = 0; batch < dims[0]; ++batch) {
+//         const std::size_t oIndex = dims[1] * batch;
+//         for (std::size_t i = 0; i < dims[1]; ++i) {
+//             for (std::size_t outCh = 0; outCh < std::get<0>(params); ++outCh) {
+//                 std::size_t wIndex = i * std::get<0>(params) + outCh;  // (iIndex*std::get<0>(params) + oIndex)/dims[3];
+//                 output[oIndex + outCh] += weights[wIndex] * input[i + batch];
+//             }
+//         }
+//     }
+// }
+
+template <class I, class W, class B, class O>
+void FCImpl_ref_cpp_forward_kernel(const FC_Op::Parameters& params, const DimSize_t batchSize, const DimSize_t oneInputSize,
+                                   const void* input_, const void* weights_, const void* biases_, void* output_) {
+    // FIXME: missing FC parameters as arguments
+    const I* input = static_cast<const I*>(input_);
+    const W* weights = static_cast<const W*>(weights_);
+    const B* biases = static_cast<const B*>(biases_);
+    O* output = static_cast<O*>(output_);
+
+    if (std::get<1>(params)) {
+        std::fill(output, output+(batchSize*std::get<0>(params)), B(0));
+    }
+    else {
+        for (std::size_t batch = 0; batch < batchSize; ++batch) {
+            std::copy(biases, biases+std::get<0>(params), output+(batch*std::get<0>(params)));
+        }
+    }
+
+    for (std::size_t batch = 0; batch < batchSize; ++batch) {
+        for (std::size_t out = 0; out < std::get<0>(params); ++out) {
+            output[out + batch*std::get<0>(params)] = std::inner_product(input + batch*oneInputSize,
+                                                        input + (batch + 1)*oneInputSize,
+                                                        weights + out*oneInputSize,
+                                                        output[out + batch*std::get<0>(params)]);
+        }
+    }
+}
+
+
+namespace {
+static Registrar<FCImplForward_ref_cpp> registrarFCImpl2DForward_ref_cpp_Float32(
+        {DataType::Float32, DataType::Float32, DataType::Float32, DataType::Float32},
+        Aidge::FCImpl_ref_cpp_forward_kernel<float, float, float, float>);
+static Registrar<FCImplForward_ref_cpp> registrarFCImpl2DForward_ref_cpp_Int32(
+        {DataType::Int32, DataType::Int32, DataType::Int32, DataType::Int32},
+        Aidge::FCImpl_ref_cpp_forward_kernel<int, int, int, int>);
+static Registrar<FCImplForward_ref_cpp> registrarFCImpl2DForward_ref_cpp_Float64(
+        {DataType::Float64, DataType::Float64, DataType::Float64, DataType::Float64},
+        Aidge::FCImpl_ref_cpp_forward_kernel<double, double, double, double>);
+}  // namespace
+
+}  // namespace Aidge
+
+#endif /* FCImpl_ref_cpp_forward_kernel_H_ */
diff --git a/aidge/_CPU/include/operator/ProducerImpl.hpp b/aidge/_CPU/include/operator/ProducerImpl.hpp
new file mode 100644
index 00000000..71cef1d4
--- /dev/null
+++ b/aidge/_CPU/include/operator/ProducerImpl.hpp
@@ -0,0 +1,45 @@
+#ifndef ProducerImpl_ref_cpp_H_
+#define ProducerImpl_ref_cpp_H_
+
+#include "utils/Types.h"
+
+#include "backend/OperatorImpl.hpp"
+#include "operator/Producer.hpp"
+#include "utils/Registrar.hpp"
+#include <memory>
+
+namespace Aidge {
+template <DimIdx_t DIM>
+class ProducerImpl_ref_cpp : public OperatorImpl {
+   private:
+    const Producer_Op<DIM> &mOp;
+
+   public:
+    ProducerImpl_ref_cpp(const Producer_Op<DIM> &op) : mOp(op) {}
+
+    static std::unique_ptr<ProducerImpl_ref_cpp> create(const Producer_Op<DIM> &op) {
+        return std::make_unique<ProducerImpl_ref_cpp>(op);
+    }
+
+   public:
+    NbElts_t getNbRequiredData(IOIndex_t inputIdx) const override final;
+    NbElts_t getNbRequiredProtected(IOIndex_t inputIdx) const override final;
+    NbElts_t getRequiredMemory(IOIndex_t outputIdx, const std::vector<DimSize_t> &inputsSize) const override final;
+    NbElts_t getNbConsumedData(IOIndex_t inputIdx) const override final;
+    NbElts_t getNbProducedData(IOIndex_t outputIdx) const override final;
+
+    void forward();
+
+    void backward();
+};
+
+namespace {
+static Registrar<Producer_Op<1>> registrarProducer1DImpl_ref_cpp("cpu", Aidge::ProducerImpl_ref_cpp<1>::create);
+static Registrar<Producer_Op<2>> registrarProducer2DImpl_ref_cpp("cpu", Aidge::ProducerImpl_ref_cpp<2>::create);
+static Registrar<Producer_Op<3>> registrarProducer3DImpl_ref_cpp("cpu", Aidge::ProducerImpl_ref_cpp<3>::create);
+static Registrar<Producer_Op<4>> registrarProducer4DImpl_ref_cpp("cpu", Aidge::ProducerImpl_ref_cpp<4>::create);
+static Registrar<Producer_Op<5>> registrarProducer5DImpl_ref_cpp("cpu", Aidge::ProducerImpl_ref_cpp<5>::create);
+}  // namespace
+}  // namespace Aidge
+
+#endif /* ProducerImpl_ref_cpp_H_ */
diff --git a/aidge/_CPU/include/operator/ReLUImpl.hpp b/aidge/_CPU/include/operator/ReLUImpl.hpp
new file mode 100644
index 00000000..f3001626
--- /dev/null
+++ b/aidge/_CPU/include/operator/ReLUImpl.hpp
@@ -0,0 +1,51 @@
+#ifndef ReLUImpl_ref_cpp_H_
+#define ReLUImpl_ref_cpp_H_
+
+#include "backend/OperatorImpl.hpp"
+#include "operator/ReLU.hpp"
+#include "utils/Registrar.hpp"
+#include <memory>
+#include <vector>
+
+namespace Aidge {
+// class ReLU_Op;
+
+// compute kernel registry for forward and backward
+class ReLUImplForward_ref_cpp
+    : public Registrable<std::tuple<DataType, DataType>, void(const ReLU_Op::Parameters&, std::size_t, const void*, void*)> {
+};
+class ReLUImplBackward_ref_cpp
+    : public Registrable<std::tuple<DataType, DataType>, void(const ReLU_Op::Parameters&, std::size_t, const void*, void*)> {
+};
+
+class ReLUImpl_ref_cpp : public OperatorImpl {
+   private:
+    const ReLU_Op& mOp;
+    std::array<NbElts_t, 1> mNbConsumedData;
+    std::array<NbElts_t, 1> mNbProducedData;
+
+   public:
+    ReLUImpl_ref_cpp(const ReLU_Op& op) : mOp(op), mNbConsumedData({0}), mNbProducedData({0}) {}
+
+    static std::unique_ptr<ReLUImpl_ref_cpp> create(const ReLU_Op& op) {
+        return std::make_unique<ReLUImpl_ref_cpp>(op);
+    }
+
+   public:
+    NbElts_t getNbRequiredData(IOIndex_t inputIdx) const override final;
+    NbElts_t getNbRequiredProtected(IOIndex_t inputIdx) const override final;
+    NbElts_t getRequiredMemory(IOIndex_t outputIdx, const std::vector<DimSize_t>& inputsSize) const override final;
+    NbElts_t getNbConsumedData(IOIndex_t inputIdx) const override final;
+    NbElts_t getNbProducedData(IOIndex_t outputIdx) const override final;
+
+    void forward();
+
+    void backward();
+};
+
+namespace {
+static Registrar<ReLU_Op> registrarReLUImpl_ref_cpp("cpu", Aidge::ReLUImpl_ref_cpp::create);
+}
+}  // namespace Aidge
+
+#endif /* ReLUImpl_ref_cpp_H_ */
diff --git a/aidge/_CPU/include/operator/ReLUImpl_kernels.hpp b/aidge/_CPU/include/operator/ReLUImpl_kernels.hpp
new file mode 100644
index 00000000..11be9714
--- /dev/null
+++ b/aidge/_CPU/include/operator/ReLUImpl_kernels.hpp
@@ -0,0 +1,33 @@
+#ifndef ReLUImpl_ref_cpp_forward_kernel_H_
+#define ReLUImpl_ref_cpp_forward_kernel_H_
+
+#include "utils/Registrar.hpp"
+
+#include "operator/ReLUImpl.hpp"
+
+namespace Aidge {
+template <class I, class O>
+void ReLUImpl_ref_cpp_forward_kernel(const ReLU_Op::Parameters& params,
+                                     std::size_t inputLenght,
+                                     const void* input_,
+                                     void* output_) {
+    // FIXME: missing ReLU parameters as arguments
+    const I* input = static_cast<const I*>(input_);
+    O* output = static_cast<O*>(output_);
+
+    for (std::size_t i = 0; i < inputLenght; ++i) {
+        output[i] = input[i] >= 0 ? input[i] : input[i] * static_cast<I>(std::get<0>(params));
+    }
+}
+
+namespace {
+static Registrar<ReLUImplForward_ref_cpp> registrarReLUImplForward_ref_cpp_Float32(
+        {DataType::Float32, DataType::Float32}, Aidge::ReLUImpl_ref_cpp_forward_kernel<float, float>);
+static Registrar<ReLUImplForward_ref_cpp> registrarReLUImplForward_ref_cpp_Int32(
+        {DataType::Int32, DataType::Int32}, Aidge::ReLUImpl_ref_cpp_forward_kernel<int, int>);
+static Registrar<ReLUImplForward_ref_cpp> registrarReLUImplForward_ref_cpp_Float64(
+        {DataType::Float64, DataType::Float64}, Aidge::ReLUImpl_ref_cpp_forward_kernel<double, double>);
+}  // namespace
+}  // namespace Aidge
+
+#endif /* ReLUImpl_ref_cpp_forward_kernel_H_ */
diff --git a/aidge/_CPU/src/operator/ConvImpl.cpp b/aidge/_CPU/src/operator/ConvImpl.cpp
new file mode 100644
index 00000000..eb90a3af
--- /dev/null
+++ b/aidge/_CPU/src/operator/ConvImpl.cpp
@@ -0,0 +1,75 @@
+
+#include "operator/ConvImpl.hpp"
+
+#include <cassert>
+#include <chrono>
+#include <numeric>
+#include <thread>
+#include <vector>
+
+#include "operator/ConvImpl_kernels.hpp"
+#include "operator/Conv.hpp"
+#include "utils/Types.h"
+
+Aidge::NbElts_t Aidge::ConvImpl2D_ref_cpp::getNbRequiredData(Aidge::IOIndex_t inputIdx) const {
+    assert(mOp.getInput(inputIdx) && "requires valid input");
+
+    // Requires the whole tensors
+    const auto &inputDims = std::static_pointer_cast<Tensor>(mOp.getInput(inputIdx))->dims();
+
+    return std::accumulate(inputDims.begin(), inputDims.end(), Aidge::NbElts_t(1), std::multiplies<NbElts_t>());
+}
+
+Aidge::NbElts_t Aidge::ConvImpl2D_ref_cpp::getNbRequiredProtected(IOIndex_t /*inputIdx*/) const {
+    // for the direct convolution algorithm, convolutions can be in-place, if
+    // there is no padding!
+    return 0;
+}
+
+Aidge::NbElts_t Aidge::ConvImpl2D_ref_cpp::getRequiredMemory(Aidge::IOIndex_t outputIdx,
+                                                           const std::vector<Aidge::DimSize_t> & /*inputsSize*/) const {
+    // Requires the whole tensors, regardless of available data on inputs
+    assert(outputIdx == 0 && "operator has only one output");
+
+    const auto &outputDims = std::static_pointer_cast<Tensor>(mOp.getOutput(0))->dims();
+    return std::accumulate(outputDims.begin(), outputDims.end(), NbElts_t(1), std::multiplies<NbElts_t>());
+}
+
+Aidge::NbElts_t Aidge::ConvImpl2D_ref_cpp::getNbConsumedData(Aidge::IOIndex_t inputIdx) const {
+    assert(static_cast<std::size_t>(inputIdx) < mNbConsumedData.size());
+    return mNbConsumedData[static_cast<std::size_t>(inputIdx)];
+}
+
+Aidge::NbElts_t Aidge::ConvImpl2D_ref_cpp::getNbProducedData(Aidge::IOIndex_t outputIdx) const {
+    assert((outputIdx == 0) && (static_cast<std::size_t>(outputIdx) < mNbProducedData.size()));
+    return mNbProducedData[static_cast<std::size_t>(outputIdx)];
+}
+
+void Aidge::ConvImpl2D_ref_cpp::forward() {
+    // FIXME: uncomment the following code once memory handling will work
+    assert(mOp.mInputs[0] && "missing input #0");
+    assert(mOp.mInputs[1] && "missing input #1");
+    assert(mOp.mInputs[2] && "missing input #2");
+
+    // Find the correct kernel type
+    auto kernelFunc =
+            Registrar<ConvImpl2DForward_ref_cpp>::create({mOp.mInputs[0]->dataType(), mOp.mInputs[1]->dataType(),
+                                                          mOp.mInputs[2]->dataType(), mOp.mOutput->dataType()});
+
+    // Call kernel
+    kernelFunc(mOp.getParams(), std::static_pointer_cast<Tensor>(mOp.mInputs[0])->dims<4>(),
+               mOp.mInputs[0]->getImpl()->rawPtr(), mOp.mInputs[1]->getImpl()->rawPtr(),
+               mOp.mInputs[2]->getImpl()->rawPtr(), mOp.mOutput->getImpl()->rawPtr());
+
+    // FIXME: Dummy wait for some earlier scheduler tests
+    std::this_thread::sleep_for(std::chrono::milliseconds(mOp.get<ConvParam::OutChannels>()));
+
+    // Update producer-consumer data
+    for (std::size_t inputIdx = 0; inputIdx < mNbConsumedData.size(); ++inputIdx)
+        mNbConsumedData[inputIdx] += getNbRequiredData(static_cast<IOIndex_t>(inputIdx));  // each input is consumed by the minimum
+                                                                   // amount for a forward pass
+
+    mNbProducedData[0] += getRequiredMemory(0, {});
+}
+
+void Aidge::ConvImpl2D_ref_cpp::backward() { printf("Not implemented yet.\n"); }
diff --git a/aidge/_CPU/src/operator/FCImpl.cpp b/aidge/_CPU/src/operator/FCImpl.cpp
new file mode 100644
index 00000000..63b530df
--- /dev/null
+++ b/aidge/_CPU/src/operator/FCImpl.cpp
@@ -0,0 +1,114 @@
+
+#include <cassert>
+#include <chrono>
+#include <numeric>
+#include <thread>
+#include <vector>
+
+#include "operator/FC.hpp"
+
+#include "operator/FCImpl.hpp"
+#include "operator/FCImpl_kernels.hpp"
+#include "utils/Types.h"
+
+Aidge::NbElts_t Aidge::FCImpl_ref_cpp::getNbRequiredData(Aidge::IOIndex_t inputIdx) const
+{
+    assert(mOp.getInput(inputIdx) && "requires valid input");
+
+    // Requires the whole tensors
+    const auto &inputDims
+        = std::static_pointer_cast<Tensor>(mOp.getInput(inputIdx))->dims();
+
+    return std::accumulate(
+        inputDims.begin(),
+        inputDims.end(),
+        Aidge::NbElts_t(1),
+        std::multiplies<Aidge::NbElts_t>());
+}
+
+Aidge::NbElts_t
+    Aidge::FCImpl_ref_cpp::getNbRequiredProtected(Aidge::IOIndex_t /*inputIdx*/) const
+{
+    // for the direct convolution algorithm, convolutions can be in-place, if
+    // there is no padding!
+    return 0;
+}
+
+Aidge::NbElts_t Aidge::FCImpl_ref_cpp::getRequiredMemory(
+    IOIndex_t outputIdx, const std::vector<DimSize_t> & /*inputsSize*/) const
+{
+    // Requires the whole tensors, regardless of available data on inputs
+    assert(outputIdx == 0 && "operator has only one output");
+
+    const auto &outputDims = std::static_pointer_cast<Tensor>(mOp.getOutput(0))->dims();
+    return std::accumulate(
+        outputDims.begin(),
+        outputDims.end(),
+        static_cast<NbElts_t>(1),
+        std::multiplies<NbElts_t>());
+}
+
+Aidge::NbElts_t Aidge::FCImpl_ref_cpp::getNbConsumedData(Aidge::IOIndex_t inputIdx) const
+{
+    assert(static_cast<std::size_t>(inputIdx) < mNbConsumedData.size());
+    return mNbConsumedData[static_cast<std::size_t>(inputIdx)];
+}
+
+Aidge::NbElts_t Aidge::FCImpl_ref_cpp::getNbProducedData(Aidge::IOIndex_t outputIdx) const
+{
+    assert(static_cast<std::size_t>(outputIdx) < mNbProducedData.size());
+    return mNbProducedData[static_cast<std::size_t>(outputIdx)];
+}
+
+void Aidge::FCImpl_ref_cpp::forward()
+{
+    // FIXME: uncomment the following code once memory handling will work
+    assert(mOp.mInputs[0] && "missing input #0");
+    assert(mOp.mInputs[1] && "missing input #1");
+    assert(mOp.mInputs[2] && "missing input #2");
+
+    // Find the correct kernel type
+    auto kernelFunc = Registrar<FCImplForward_ref_cpp>::create(
+        {mOp.mInputs[0]->dataType(),
+         mOp.mInputs[1]->dataType(),
+         mOp.mInputs[2]->dataType(),
+         mOp.mOutput->dataType()});
+
+    // Call kernel
+    // if (mOp.mInputs[0]->nbDims() == 4) {
+    //     kernelFunc(
+    //         mOp.getParams(),
+    //         std::static_pointer_cast<Tensor>(mOp.mInputs[0])->dims<4>(),
+    //         mOp.mInputs[0]->getImpl()->rawPtr(),
+    //         mOp.mInputs[1]->getImpl()->rawPtr(),
+    //         mOp.mInputs[2]->getImpl()->rawPtr(),
+    //         mOp.mOutput->getImpl()->rawPtr());
+    // }
+    // else 
+    kernelFunc(
+        mOp.getParams(),
+        mOp.mInputs[0]->dims()[0],
+        mOp.mInputs[0]->sizeM1(),
+        mOp.mInputs[0]->getImpl()->rawPtr(),
+        mOp.mInputs[1]->getImpl()->rawPtr(),
+        mOp.mInputs[2]->getImpl()->rawPtr(),
+        mOp.mOutput->getImpl()->rawPtr());
+    
+    
+
+    // FIXME: Dummy wait for some earlier scheduler tests
+    std::this_thread::sleep_for(std::chrono::milliseconds(mOp.get<FCParam::OutChannels>()));
+
+    // Update producer-consumer data
+    for (IOIndex_t inputIdx = 0; static_cast<std::size_t>(inputIdx) < mNbConsumedData.size(); ++inputIdx)
+        mNbConsumedData[inputIdx]
+            += getNbRequiredData(static_cast<std::size_t>(inputIdx)); // each input is consumed by the minimum
+                                              // amount for a forward pass
+
+    mNbProducedData[0] += getRequiredMemory(0, {});
+}
+
+void Aidge::FCImpl_ref_cpp::backward()
+{
+    printf("Not implemented yet.\n");
+}
diff --git a/aidge/_CPU/src/operator/ProducerImpl.cpp b/aidge/_CPU/src/operator/ProducerImpl.cpp
new file mode 100644
index 00000000..5db6ae51
--- /dev/null
+++ b/aidge/_CPU/src/operator/ProducerImpl.cpp
@@ -0,0 +1,62 @@
+
+#include <cassert>
+#include <cstdio>
+#include <numeric>
+#include <vector>
+
+#include "data/Tensor.hpp"
+#include "operator/Producer.hpp"
+#include "utils/Types.h"
+
+#include "operator/ProducerImpl.hpp"
+
+// A Producer has no inputs, so it never requires any input data before
+// it can run: always zero.
+template<Aidge::DimIdx_t DIM>
+std::size_t Aidge::ProducerImpl_ref_cpp<DIM>::getNbRequiredData(
+    Aidge::IOIndex_t /*inputIdx*/) const
+{
+    return 0;
+}
+
+// No inputs means nothing is ever consumed: always zero.
+template<Aidge::DimIdx_t DIM>
+Aidge::DimSize_t Aidge::ProducerImpl_ref_cpp<DIM>::getNbConsumedData(
+    Aidge::IOIndex_t /*inputIdx*/) const
+{
+    return 0;
+}
+
+// No input memory exists, hence none needs to be protected from being
+// overwritten: always zero.
+template<Aidge::DimIdx_t DIM>
+std::size_t Aidge::ProducerImpl_ref_cpp<DIM>::getNbRequiredProtected(
+    Aidge::IOIndex_t /*inputIdx*/) const
+{
+    return 0;
+}
+
+// Memory needed for output #0: the full element count of the output tensor
+// (product of all its dimensions), independent of any input availability.
+template<Aidge::DimIdx_t DIM>
+std::size_t Aidge::ProducerImpl_ref_cpp<DIM>::getRequiredMemory(
+    IOIndex_t outputIdx, const std::vector<DimSize_t> & /*inputsSize*/) const
+{
+    // Requires the whole tensors, regardless of available data on inputs
+    assert(outputIdx == 0 && "operator has only one output");
+
+    const auto &outputDims = std::static_pointer_cast<Tensor>(mOp.getOutput(0))->dims();
+    return std::accumulate(
+        outputDims.begin(),
+        outputDims.end(),
+        NbElts_t(1),
+        std::multiplies<NbElts_t>());
+}
+
+// The producer always exposes its whole output tensor, so the produced
+// amount equals the total output memory reported by getRequiredMemory().
+template<Aidge::DimIdx_t DIM>
+Aidge::DimSize_t Aidge::ProducerImpl_ref_cpp<DIM>::getNbProducedData(
+    Aidge::IOIndex_t /*outputIdx*/) const
+{
+    return getRequiredMemory(0, {});
+}
+
+// Intentionally empty: presumably the Producer's output tensor already holds
+// its data before scheduling -- TODO confirm against operator/Producer.hpp.
+template<Aidge::DimIdx_t DIM> void Aidge::ProducerImpl_ref_cpp<DIM>::forward()
+{
+}
+
+// TODO: backward pass is not implemented; currently only reports the fact.
+template<Aidge::DimIdx_t DIM> void Aidge::ProducerImpl_ref_cpp<DIM>::backward()
+{
+    printf("Not implemented yet.\n");
+}
\ No newline at end of file
diff --git a/aidge/_CPU/src/operator/ReLUImpl.cpp b/aidge/_CPU/src/operator/ReLUImpl.cpp
new file mode 100644
index 00000000..a345ee92
--- /dev/null
+++ b/aidge/_CPU/src/operator/ReLUImpl.cpp
@@ -0,0 +1,68 @@
+
+#include <cassert>
+#include <numeric>
+#include <chrono>
+#include <thread>
+
+#include "operator/ReLU.hpp"
+
+#include "operator/ReLUImpl.hpp"
+#include "operator/ReLUImpl_kernels.hpp"
+#include "utils/Types.h"
+#include <numeric>
+#include <vector>
+
+// FIXME: replace whole Tensor with minimum needed data quantity
+// Amount of input #0 data needed for one forward pass: the whole tensor,
+// i.e. the product of all its dimensions.
+Aidge::NbElts_t Aidge::ReLUImpl_ref_cpp::getNbRequiredData(Aidge::IOIndex_t /*inputIdx*/) const {
+    assert(mOp.getInput(0) && "requires valid input");
+
+    // Requires the whole tensors
+    const auto& inputDims = std::static_pointer_cast<Tensor>(mOp.getInput(0))->dims();
+
+    return std::accumulate(inputDims.begin(), inputDims.end(),
+                        static_cast<NbElts_t>(1), std::multiplies<NbElts_t>());
+}
+
+Aidge::NbElts_t Aidge::ReLUImpl_ref_cpp::getNbRequiredProtected(Aidge::IOIndex_t /*inputIdx*/) const {
+    // ReLU is an element-wise operation, so it can be done in-place: no
+    // input data needs to be protected from being overwritten by the output.
+    // (Previous comment mentioned convolution -- copy/paste leftover.)
+    return 0;
+}
+
+// Output memory needed: total element count of the output tensor (product
+// of all dimensions). outputIdx is ignored: the operator has one output.
+Aidge::NbElts_t Aidge::ReLUImpl_ref_cpp::getRequiredMemory(Aidge::IOIndex_t /*outputIdx*/, const std::vector<Aidge::DimSize_t>& /*inputsSize*/) const {
+    const auto& outputDims = std::static_pointer_cast<Tensor>(mOp.getOutput(0))->dims();
+    return std::accumulate(outputDims.begin(), outputDims.end(),
+                        static_cast<NbElts_t>(1), std::multiplies<NbElts_t>());
+}
+
+// Running total of data consumed on input #0 (inputIdx ignored: single input).
+Aidge::NbElts_t Aidge::ReLUImpl_ref_cpp::getNbConsumedData(Aidge::IOIndex_t /*inputIdx*/) const {
+    return mNbConsumedData[0];
+}
+
+// Running total of data produced on output #0 (outputIdx ignored: single output).
+Aidge::NbElts_t Aidge::ReLUImpl_ref_cpp::getNbProducedData(Aidge::IOIndex_t /*outputIdx*/) const {
+    return mNbProducedData[0];
+}
+
+// Runs the ReLU kernel over the whole input tensor, then updates the
+// producer-consumer bookkeeping used by the scheduler.
+void Aidge::ReLUImpl_ref_cpp::forward() {
+    // FIXME: uncomment the following code once memory handling will work
+    assert(mOp.mInputs[0] && "missing input #0");
+
+    // Find the correct kernel type: dispatched at runtime on the
+    // (input dtype, output dtype) pair via the registrar.
+    auto kernelFunc = Registrar<ReLUImplForward_ref_cpp>::create({
+        mOp.mInputs[0]->dataType(),
+        mOp.mOutput->dataType()});
+
+    // Call kernel on the flattened tensor: ReLU is element-wise, so only the
+    // total element count matters, not the shape.
+    // NOTE(review): input is accessed both as mOp.mInputs[0] and
+    // mOp.getInput(0) here -- presumably equivalent; confirm and unify.
+    kernelFunc(mOp.getParams(),
+        std::static_pointer_cast<Tensor>(mOp.getInput(0))->size(),
+        mOp.mInputs[0]->getImpl()->rawPtr(),
+        mOp.mOutput->getImpl()->rawPtr());
+
+
+    mNbConsumedData[0]+= getNbRequiredData(0); // each input is consumed by the minimum amount for a forward pass
+
+    mNbProducedData[0]+= getRequiredMemory(0, {});
+}
+
+// TODO: backward pass is not implemented; currently only reports the fact.
+// NOTE(review): printf is used but <cstdio> is not included directly in this
+// file -- confirm it is pulled in transitively, or add the include.
+void Aidge::ReLUImpl_ref_cpp::backward() {
+    printf("Not implemented yet.\n");
+}
-- 
GitLab