From 742b763f233004df607c59a65a5d881f41ab0f6d Mon Sep 17 00:00:00 2001
From: cmoineau <cyril.moineau@cea.fr>
Date: Mon, 10 Feb 2025 09:47:43 +0000
Subject: [PATCH 01/22] Add back MR
 https://gitlab.eclipse.org/eclipse/aidge/aidge_backend_cpu/-/merge_requests/131.

---
 unit_tests/operator/Test_MetaOperator.cpp | 765 ++++++++++++++++------
 1 file changed, 573 insertions(+), 192 deletions(-)
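
Note for the Leaky sections of the test: they validate the CPU Leaky
(LIF) meta-operator against a hand-rolled reference. Writing U for the
membrane potential, I for the input, beta for the decay and theta = 1
for the threshold (the conventions the test uses), the recurrence the
reference implements is

    U_t = \beta \, U_{t-1} + I_t - \mathbf{1}[U_{t-1} > \theta]

i.e. a leaky accumulation of the input with a hard reset that subtracts
the threshold whenever the previous potential crossed it.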

diff --git a/unit_tests/operator/Test_MetaOperator.cpp b/unit_tests/operator/Test_MetaOperator.cpp
index 271a1e2f..4fe39630 100644
--- a/unit_tests/operator/Test_MetaOperator.cpp
+++ b/unit_tests/operator/Test_MetaOperator.cpp
@@ -9,70 +9,79 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
 #include <cmath>
 #include <cstdlib>
 #include <memory>
+#include <random>
+
+#include <catch2/catch_test_macros.hpp>
 
-#include "aidge/utils/TensorUtils.hpp"
 #include "aidge/backend/cpu/operator/ConvImpl.hpp"
 #include "aidge/backend/cpu/operator/PadImpl.hpp"
 #include "aidge/data/Tensor.hpp"
+#include "aidge/filler/Filler.hpp"
 #include "aidge/operator/Conv.hpp"
+#include "aidge/operator/FC.hpp"
+#include "aidge/operator/Identity.hpp"
 #include "aidge/operator/MetaOperator.hpp"
 #include "aidge/operator/MetaOperatorDefs.hpp"
 #include "aidge/operator/Pad.hpp"
 #include "aidge/operator/Pop.hpp"
-#include "aidge/scheduler/SequentialScheduler.hpp"
+#include "aidge/operator/Stack.hpp"
 #include "aidge/scheduler/ParallelScheduler.hpp"
+#include "aidge/scheduler/SequentialScheduler.hpp"
+#include "aidge/utils/TensorUtils.hpp"
 
 using namespace Aidge;
 
 TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
-  SECTION("PaddedConv(forward)") {
-    std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(
-            Array4D<double, 4, 3, 3, 3>{{{{{6.20986394e-01, 1.19775136e-03, 7.22876095e-02},
-                                          {1.16492919e-01, 8.21634093e-02, 1.17413265e-01},
-                                          {2.23743494e-01, 3.99495413e-01, 5.55552411e-01}},
-                                         {{6.64970077e-01, 9.62199940e-01, 4.87531967e-01},
-                                          {6.12586558e-01, 8.09918671e-02, 8.40649383e-01},
-                                          {4.15264406e-01, 8.28247138e-01, 1.52301135e-01}},
-                                         {{1.76992844e-02, 7.78697112e-01, 8.14531592e-01},
-                                          {1.36960611e-01, 4.64806728e-01, 4.85150000e-01},
-                                          {4.34776520e-01, 9.51740977e-01, 9.05793799e-01}}},
-
-                                        {{{1.71925246e-02, 1.91082720e-01, 3.67982644e-01},
-                                          {1.56806559e-01, 6.22280998e-01, 3.15827594e-01},
-                                          {6.04359038e-01, 2.83095947e-01, 6.11168892e-01}},
-                                         {{2.76942832e-01, 1.89768419e-01, 8.07988176e-01},
-                                          {1.67925807e-01, 2.68356150e-01, 6.28875602e-01},
-                                          {1.69093357e-04, 9.64788636e-01, 7.29254981e-01}},
-                                         {{6.34030122e-01, 1.32087038e-01, 3.33857107e-01},
-                                          {7.63047502e-01, 5.12539506e-02, 9.77400493e-01},
-                                          {8.06151288e-01, 2.60237147e-01, 3.93729313e-01}}},
-
-                                        {{{5.84605240e-01, 4.74648725e-01, 8.54111741e-01},
-                                          {7.10897067e-02, 5.02579011e-01, 3.35236224e-01},
-                                          {9.08637408e-01, 8.02903830e-01, 2.83929907e-01}},
-                                         {{3.68206999e-01, 9.18579021e-02, 7.33168098e-01},
-                                          {1.59875539e-01, 9.13163381e-01, 3.59806060e-01},
-                                          {1.41295882e-01, 7.00312185e-01, 5.63728289e-01}},
-                                         {{9.39513546e-01, 1.91704891e-01, 1.11454944e-01},
-                                          {5.46298282e-01, 2.89698587e-01, 2.62612651e-01},
-                                          {1.18554992e-01, 4.32147376e-02, 7.53016994e-01}}},
-
-                                        {{{9.53179175e-01, 2.05041054e-02, 1.11318451e-01},
-                                          {8.67878485e-01, 2.93263422e-01, 8.03912714e-01},
-                                          {8.93620255e-01, 1.37831128e-01, 3.83640583e-01}},
-                                         {{3.96020188e-01, 6.24959320e-01, 1.90709175e-01},
-                                          {5.80538620e-01, 6.63031275e-01, 2.07247191e-01},
-                                          {5.65672171e-01, 5.57014317e-01, 9.26909496e-01}},
-                                         {{3.43901418e-01, 4.47741636e-01, 6.59249367e-01},
-                                          {7.34639028e-01, 2.84957200e-02, 9.70225217e-01},
-                                          {1.33578790e-02, 6.12054702e-01, 9.36685235e-02}}}}});
-    std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(
-            Array1D<double, 4>{{0.16884905, 0.27994487, 0.57227465, 0.06435205}});
-    std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<double, 2, 3, 5, 5>{
+    SECTION("PaddedConv(forward)") {
+        std::shared_ptr<Tensor> myWeights =
+            std::make_shared<Tensor>(Array4D<double, 4, 3, 3, 3>{
+                {{{{6.20986394e-01, 1.19775136e-03, 7.22876095e-02},
+                   {1.16492919e-01, 8.21634093e-02, 1.17413265e-01},
+                   {2.23743494e-01, 3.99495413e-01, 5.55552411e-01}},
+                  {{6.64970077e-01, 9.62199940e-01, 4.87531967e-01},
+                   {6.12586558e-01, 8.09918671e-02, 8.40649383e-01},
+                   {4.15264406e-01, 8.28247138e-01, 1.52301135e-01}},
+                  {{1.76992844e-02, 7.78697112e-01, 8.14531592e-01},
+                   {1.36960611e-01, 4.64806728e-01, 4.85150000e-01},
+                   {4.34776520e-01, 9.51740977e-01, 9.05793799e-01}}},
+
+                 {{{1.71925246e-02, 1.91082720e-01, 3.67982644e-01},
+                   {1.56806559e-01, 6.22280998e-01, 3.15827594e-01},
+                   {6.04359038e-01, 2.83095947e-01, 6.11168892e-01}},
+                  {{2.76942832e-01, 1.89768419e-01, 8.07988176e-01},
+                   {1.67925807e-01, 2.68356150e-01, 6.28875602e-01},
+                   {1.69093357e-04, 9.64788636e-01, 7.29254981e-01}},
+                  {{6.34030122e-01, 1.32087038e-01, 3.33857107e-01},
+                   {7.63047502e-01, 5.12539506e-02, 9.77400493e-01},
+                   {8.06151288e-01, 2.60237147e-01, 3.93729313e-01}}},
+
+                 {{{5.84605240e-01, 4.74648725e-01, 8.54111741e-01},
+                   {7.10897067e-02, 5.02579011e-01, 3.35236224e-01},
+                   {9.08637408e-01, 8.02903830e-01, 2.83929907e-01}},
+                  {{3.68206999e-01, 9.18579021e-02, 7.33168098e-01},
+                   {1.59875539e-01, 9.13163381e-01, 3.59806060e-01},
+                   {1.41295882e-01, 7.00312185e-01, 5.63728289e-01}},
+                  {{9.39513546e-01, 1.91704891e-01, 1.11454944e-01},
+                   {5.46298282e-01, 2.89698587e-01, 2.62612651e-01},
+                   {1.18554992e-01, 4.32147376e-02, 7.53016994e-01}}},
+
+                 {{{9.53179175e-01, 2.05041054e-02, 1.11318451e-01},
+                   {8.67878485e-01, 2.93263422e-01, 8.03912714e-01},
+                   {8.93620255e-01, 1.37831128e-01, 3.83640583e-01}},
+                  {{3.96020188e-01, 6.24959320e-01, 1.90709175e-01},
+                   {5.80538620e-01, 6.63031275e-01, 2.07247191e-01},
+                   {5.65672171e-01, 5.57014317e-01, 9.26909496e-01}},
+                  {{3.43901418e-01, 4.47741636e-01, 6.59249367e-01},
+                   {7.34639028e-01, 2.84957200e-02, 9.70225217e-01},
+                   {1.33578790e-02, 6.12054702e-01, 9.36685235e-02}}}}});
+        std::shared_ptr<Tensor> myBias =
+            std::make_shared<Tensor>(Array1D<double, 4>{
+                {0.16884905, 0.27994487, 0.57227465, 0.06435205}});
+        std::shared_ptr<Tensor> myInput = std::make_shared<
+            Tensor>(Array4D<double, 2, 3, 5, 5>{
             // NCHW
             {{{{0.43224481, 0.9047832, 0.18402257, 0.06162838, 0.52490127},
                {0.27773404, 0.55402353, 0.9485062, 0.31197083, 0.80328607},
@@ -108,93 +117,107 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
                {0.95873236, 0.6742374, 0.55679676, 0.6323497, 0.34072958},
                {0.49694061, 0.79173045, 0.19738225, 0.14755281, 0.80818177},
                {0.02332061, 0.74270703, 0.59415632, 0.08195934, 0.46295434},
-               {0.71426058, 0.85032931, 0.90750818, 0.28768431, 0.4401146}}}}});
-
-    std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(
-            Array4D<double, 2, 4, 5, 5>{{{{{3.40294218, 3.74021220, 4.02050114, 4.07054710, 2.46286273},
-                {4.61770582, 6.70517588, 6.50356627, 6.29688787, 3.53332567},
-                {5.47480106, 5.92094421, 6.64605665, 7.95090199, 4.28721523},
-                {4.01485729, 6.06748962, 7.52447891, 7.37980652, 5.28401136},
-                {2.83065438, 3.62033439, 3.56222963, 5.56103945, 3.23335814}},
-
-                {{3.30230498, 4.92814112, 4.34710836, 3.96262765, 2.97987890},
-                {4.49693012, 6.68929291, 5.53603029, 5.68874264, 4.28756475},
-                {4.20528078, 6.82776880, 6.70569849, 7.12809610, 4.40845442},
-                {4.31169367, 6.73352146, 6.30962515, 7.45826864, 4.99164438},
-                {2.18136287, 4.28968000, 4.20080042, 4.89814138, 2.87394023}},
-
-                {{3.54787683, 4.35851812, 4.63881302, 4.23359537, 3.16992092},
-                {5.25099468, 7.54282856, 6.69849157, 5.64309788, 4.56919575},
-                {4.71914101, 7.52830601, 6.71450949, 7.81113863, 5.84658146},
-                {4.97893143, 7.39293909, 6.89905310, 8.14430809, 5.62998581},
-                {2.79735112, 4.80967140, 5.57630205, 5.38828325, 4.57078695}},
-
-                {{3.03048635, 5.04540300, 4.21824932, 4.87323284, 2.35113740},
-                {4.45167351, 6.47721338, 7.40922976, 6.70445728, 3.60700107},
-                {3.77927423, 6.82826376, 7.41777134, 7.57402420, 5.13131523},
-                {4.08747244, 7.07994175, 7.57206821, 8.51897335, 5.26987123},
-                {2.34426999, 4.60127831, 4.86486769, 6.01579571, 3.97803569}}},
-
-
-                {{{3.84700942, 4.25972605, 3.05269003, 3.78043652, 2.08771229},
-                {6.00459957, 6.05633259, 4.45951605, 4.54089880, 4.03066444},
-                {5.41579390, 7.29543972, 6.18680000, 5.58812714, 3.45964241},
-                {6.04531050, 7.70924091, 5.52207708, 5.02131319, 4.09403706},
-                {3.18092418, 4.45422697, 4.04294252, 3.86577177, 2.18776536}},
-
-                {{4.02600670, 4.27603531, 3.81011319, 4.03631020, 2.57254648},
-                {5.33471155, 5.72588634, 5.12079763, 5.11733150, 3.76836705},
-                {5.62947607, 5.92492962, 6.24170446, 6.44130468, 3.44276404},
-                {5.38414621, 6.02679539, 5.88985586, 5.90263271, 3.15044069},
-                {3.31261086, 4.44371319, 3.47660780, 4.15411520, 1.48961508}},
-
-                {{3.95879412, 4.17324543, 3.70114422, 3.27447152, 3.09713888},
-                {5.78258181, 6.57920837, 4.99913597, 6.20961237, 4.98552179},
-                {5.84685421, 7.19971228, 6.66386652, 6.68013430, 4.90963316},
-                {5.24417877, 7.06430531, 6.58512402, 6.02492285, 4.48986387},
-                {3.64294529, 5.00678444, 5.04760027, 4.72895622, 2.67990756}},
-
-                {{3.48610687, 4.12853813, 4.07563591, 3.51327014, 2.44217038},
-                {4.80529881, 7.33211374, 5.14774036, 4.77281189, 4.44612408},
-                {5.11703110, 7.55168772, 7.14374542, 6.43696356, 4.10621357},
-                {5.41270018, 6.85949135, 6.73503923, 5.74601364, 4.46150303},
-                {3.16612267, 4.38248920, 5.23248482, 4.21292210, 2.86031270}}}}});
-
-    std::shared_ptr<Node> myConv = Conv<2>(3, 4, {3, 3}, "myconv");
-    auto convOp = std::static_pointer_cast<OperatorTensor>(myConv->getOperator());
-
-    std::shared_ptr<Node> myPad =
+               {0.71426058,
+                0.85032931,
+                0.90750818,
+                0.28768431,
+                0.4401146}}}}});
+
+        std::shared_ptr<Tensor> myOutput = std::make_shared<
+            Tensor>(Array4D<double, 2, 4, 5, 5>{
+            {{{{3.40294218, 3.74021220, 4.02050114, 4.07054710, 2.46286273},
+               {4.61770582, 6.70517588, 6.50356627, 6.29688787, 3.53332567},
+               {5.47480106, 5.92094421, 6.64605665, 7.95090199, 4.28721523},
+               {4.01485729, 6.06748962, 7.52447891, 7.37980652, 5.28401136},
+               {2.83065438, 3.62033439, 3.56222963, 5.56103945, 3.23335814}},
+
+              {{3.30230498, 4.92814112, 4.34710836, 3.96262765, 2.97987890},
+               {4.49693012, 6.68929291, 5.53603029, 5.68874264, 4.28756475},
+               {4.20528078, 6.82776880, 6.70569849, 7.12809610, 4.40845442},
+               {4.31169367, 6.73352146, 6.30962515, 7.45826864, 4.99164438},
+               {2.18136287, 4.28968000, 4.20080042, 4.89814138, 2.87394023}},
+
+              {{3.54787683, 4.35851812, 4.63881302, 4.23359537, 3.16992092},
+               {5.25099468, 7.54282856, 6.69849157, 5.64309788, 4.56919575},
+               {4.71914101, 7.52830601, 6.71450949, 7.81113863, 5.84658146},
+               {4.97893143, 7.39293909, 6.89905310, 8.14430809, 5.62998581},
+               {2.79735112, 4.80967140, 5.57630205, 5.38828325, 4.57078695}},
+
+              {{3.03048635, 5.04540300, 4.21824932, 4.87323284, 2.35113740},
+               {4.45167351, 6.47721338, 7.40922976, 6.70445728, 3.60700107},
+               {3.77927423, 6.82826376, 7.41777134, 7.57402420, 5.13131523},
+               {4.08747244, 7.07994175, 7.57206821, 8.51897335, 5.26987123},
+               {2.34426999, 4.60127831, 4.86486769, 6.01579571, 3.97803569}}},
+
+             {{{3.84700942, 4.25972605, 3.05269003, 3.78043652, 2.08771229},
+               {6.00459957, 6.05633259, 4.45951605, 4.54089880, 4.03066444},
+               {5.41579390, 7.29543972, 6.18680000, 5.58812714, 3.45964241},
+               {6.04531050, 7.70924091, 5.52207708, 5.02131319, 4.09403706},
+               {3.18092418, 4.45422697, 4.04294252, 3.86577177, 2.18776536}},
+
+              {{4.02600670, 4.27603531, 3.81011319, 4.03631020, 2.57254648},
+               {5.33471155, 5.72588634, 5.12079763, 5.11733150, 3.76836705},
+               {5.62947607, 5.92492962, 6.24170446, 6.44130468, 3.44276404},
+               {5.38414621, 6.02679539, 5.88985586, 5.90263271, 3.15044069},
+               {3.31261086, 4.44371319, 3.47660780, 4.15411520, 1.48961508}},
+
+              {{3.95879412, 4.17324543, 3.70114422, 3.27447152, 3.09713888},
+               {5.78258181, 6.57920837, 4.99913597, 6.20961237, 4.98552179},
+               {5.84685421, 7.19971228, 6.66386652, 6.68013430, 4.90963316},
+               {5.24417877, 7.06430531, 6.58512402, 6.02492285, 4.48986387},
+               {3.64294529, 5.00678444, 5.04760027, 4.72895622, 2.67990756}},
+
+              {{3.48610687, 4.12853813, 4.07563591, 3.51327014, 2.44217038},
+               {4.80529881, 7.33211374, 5.14774036, 4.77281189, 4.44612408},
+               {5.11703110, 7.55168772, 7.14374542, 6.43696356, 4.10621357},
+               {5.41270018, 6.85949135, 6.73503923, 5.74601364, 4.46150303},
+               {3.16612267,
+                4.38248920,
+                5.23248482,
+                4.21292210,
+                2.86031270}}}}});
+
+        std::shared_ptr<Node> myConv = Conv<2>(3, 4, {3, 3}, "myconv");
+        auto convOp =
+            std::static_pointer_cast<OperatorTensor>(myConv->getOperator());
+
+        std::shared_ptr<Node> myPad =
             Pad<2>({1, 1, 1, 1}, "myPad", PadBorderType::Constant, 0.0);
-    auto padOp = std::static_pointer_cast<OperatorTensor>(myPad->getOperator());
-
-    convOp->setInput(1, myWeights);
-    convOp->setInput(2, myBias);
-
-    myPad->addChild(myConv, 0, 0);
-    padOp->setInput(0, myInput);
-
-    padOp->setDataType(DataType::Float64);
-    padOp->setBackend("cpu");
-    convOp->setDataType(DataType::Float64);
-    convOp->setBackend("cpu");
-
-    myPad->forward();
-    myConv->forward();
-    convOp -> getOutput(0) -> print();
-
-    double* computedOutput = static_cast<double*>(convOp->getOutput(0)->getImpl()->rawPtr());
-    double* expectedOutput = static_cast<double*>(myOutput->getImpl()->rawPtr());
-    for (std::size_t i = 0; i < myOutput->size(); ++i) {
-        REQUIRE(std::abs(computedOutput[i] - expectedOutput[i]) < 1e-5);
-    }
+        auto padOp =
+            std::static_pointer_cast<OperatorTensor>(myPad->getOperator());
+
+        convOp->setInput(1, myWeights);
+        convOp->setInput(2, myBias);
+
+        myPad->addChild(myConv, 0, 0);
+        padOp->setInput(0, myInput);
+
+        padOp->setDataType(DataType::Float64);
+        padOp->setBackend("cpu");
+        convOp->setDataType(DataType::Float64);
+        convOp->setBackend("cpu");
+
+        myPad->forward();
+        myConv->forward();
+        convOp->getOutput(0)->print();
+
+        double *computedOutput =
+            static_cast<double *>(convOp->getOutput(0)->getImpl()->rawPtr());
+        double *expectedOutput =
+            static_cast<double *>(myOutput->getImpl()->rawPtr());
+        for (std::size_t i = 0; i < myOutput->size(); ++i) {
+            REQUIRE(std::abs(computedOutput[i] - expectedOutput[i]) < 1e-5);
+        }
 
-    std::shared_ptr<Node> myPaddedConv =
+        std::shared_ptr<Node> myPaddedConv =
             PaddedConv(3, 4, {3, 3}, "myPaddedConv", {1, 1}, {1, 1, 1, 1});
-  }
+    }
     SECTION("LSTM(forward)") {
+
         auto pop = Pop();
         auto myLSTM = LSTM(32, 64, 0, true, "ltsm");
-        auto op = std::dynamic_pointer_cast<MetaOperator_Op>(myLSTM->getOperator());
+        auto op =
+            std::dynamic_pointer_cast<MetaOperator_Op>(myLSTM->getOperator());
 
         auto microGraph = op->getMicroGraph();
         microGraph->save("lstm", false, true);
@@ -209,14 +232,14 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
         }
         REQUIRE(myLSTM->nbOutputs() == 2);
 
-        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(
-            Array2D<float, 16, 32>{});
-        std::shared_ptr<Tensor> myInit = std::make_shared<Tensor>(
-            Array2D<float, 32, 64>{});
-        std::shared_ptr<Tensor> myInitW = std::make_shared<Tensor>(
-            Array2D<float, 64, 32>{});
-        std::shared_ptr<Tensor> myInitR = std::make_shared<Tensor>(
-            Array2D<float, 64, 64>{});
+        std::shared_ptr<Tensor> myInput =
+            std::make_shared<Tensor>(Array2D<float, 16, 32>{});
+        std::shared_ptr<Tensor> myInit =
+            std::make_shared<Tensor>(Array2D<float, 32, 64>{});
+        std::shared_ptr<Tensor> myInitW =
+            std::make_shared<Tensor>(Array2D<float, 64, 32>{});
+        std::shared_ptr<Tensor> myInitR =
+            std::make_shared<Tensor>(Array2D<float, 64, 64>{});
 
         pop->addChild(myLSTM, 0, 0);
         pop->getOperator()->associateInput(0, myInput);
@@ -246,7 +269,9 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
         microGraph->save("lstm_dims", true, true);
         REQUIRE(op->dimsForwarded());
 
-        auto microGraphScheduler = std::dynamic_pointer_cast<MetaOperator_Op>(op)->getMicroGraphScheduler();
+        auto microGraphScheduler =
+            std::dynamic_pointer_cast<MetaOperator_Op>(op)
+                ->getMicroGraphScheduler();
         microGraphScheduler->saveSchedulingDiagram("lstm_scheduling");
 
         REQUIRE(op->getNbConsumedData(0).data == 512);
@@ -257,11 +282,14 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
         REQUIRE(microGraphScheduler->getStaticScheduling(1).size() == 24);
         REQUIRE(microGraphScheduler->getStaticScheduling(15).size() == 24);
     }
+
     SECTION("LSTM(forward_values)") {
         auto myLSTM = LSTM(2, 3, 0, true, "ltsm");
-        auto op = std::static_pointer_cast<OperatorTensor>(myLSTM->getOperator());
+        auto op =
+            std::static_pointer_cast<OperatorTensor>(myLSTM->getOperator());
 
-        auto microGraph = std::dynamic_pointer_cast<MetaOperator_Op>(op)->getMicroGraph();
+        auto microGraph =
+            std::dynamic_pointer_cast<MetaOperator_Op>(op)->getMicroGraph();
         microGraph->save("lstm", false, false);
 
         REQUIRE(myLSTM->nbInputs() == 3 + 8 + 8);
@@ -276,12 +304,14 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
 
         std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(
             Array2D<float, 3, 2>{{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}}});
-        std::shared_ptr<Tensor> myInit = std::make_shared<Tensor>(
-            Array2D<float, 3, 3>{{{0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}}});
+        std::shared_ptr<Tensor> myInit =
+            std::make_shared<Tensor>(Array2D<float, 3, 3>{
+                {{0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}}});
         std::shared_ptr<Tensor> myInitW = std::make_shared<Tensor>(
             Array2D<float, 3, 2>{{{0.1, 0.1}, {0.1, 0.1}, {0.1, 0.1}}});
-        std::shared_ptr<Tensor> myInitR = std::make_shared<Tensor>(
-            Array2D<float, 3, 3>{{{0.1, 0.1, 0.1}, {0.1, 0.1, 0.1}, {0.1, 0.1, 0.1}}});
+        std::shared_ptr<Tensor> myInitR =
+            std::make_shared<Tensor>(Array2D<float, 3, 3>{
+                {{0.1, 0.1, 0.1}, {0.1, 0.1, 0.1}, {0.1, 0.1, 0.1}}});
 
         op->associateInput(0, myInput);
         op->associateInput(17, myInit);
@@ -308,12 +338,13 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
         microGraph->save("lstm_values_dims", false, true);
 
         std::shared_ptr<Tensor> myHiddenState = std::make_shared<Tensor>(
-                Array2D<float, 3, 3>{{{0.0952412, 0.0952412, 0.0952412},
-                                     {0.25606447, 0.25606447, 0.25606447},
-                                     {0.40323776, 0.40323776, 0.40323776}}});
+            Array2D<float, 3, 3>{{{0.0952412, 0.0952412, 0.0952412},
+                                  {0.25606447, 0.25606447, 0.25606447},
+                                  {0.40323776, 0.40323776, 0.40323776}}});
 
-
-        auto microGraphScheduler = std::dynamic_pointer_cast<MetaOperator_Op>(op)->getMicroGraphScheduler();
+        auto microGraphScheduler =
+            std::dynamic_pointer_cast<MetaOperator_Op>(op)
+                ->getMicroGraphScheduler();
         microGraphScheduler->saveSchedulingDiagram("lstm_values_scheduling");
 
         op->getOutput(0)->print();
@@ -321,11 +352,13 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
 
         REQUIRE(approxEq<float>(*(op->getOutput(0)), *myHiddenState));
     }
+
     SECTION("LSTM(forward_values_seq)") {
         auto pop = Pop();
         auto myLSTM = LSTM(2, 3, 2, true, "ltsm");
         auto myGraph = Sequential({pop, myLSTM});
-        auto op = std::static_pointer_cast<OperatorTensor>(myLSTM->getOperator());
+        auto op =
+            std::static_pointer_cast<OperatorTensor>(myLSTM->getOperator());
 
         REQUIRE(myLSTM->nbInputs() == 3 + 8 + 8);
         REQUIRE(myLSTM->inputCategory(0) == InputCategory::Data);
@@ -338,13 +371,16 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
         REQUIRE(myLSTM->nbOutputs() == 2);
 
         std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(
-            Array3D<float, 2, 3, 2>{{{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}}, {{2.0, 3.0}, {4.0, 5.0}, {6.0, 7.0}}}});
-        std::shared_ptr<Tensor> myInit = std::make_shared<Tensor>(
-            Array2D<float, 3, 3>{{{0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}}});
+            Array3D<float, 2, 3, 2>{{{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}},
+                                     {{2.0, 3.0}, {4.0, 5.0}, {6.0, 7.0}}}});
+        std::shared_ptr<Tensor> myInit =
+            std::make_shared<Tensor>(Array2D<float, 3, 3>{
+                {{0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}}});
         std::shared_ptr<Tensor> myInitW = std::make_shared<Tensor>(
             Array2D<float, 3, 2>{{{0.1, 0.1}, {0.1, 0.1}, {0.1, 0.1}}});
-        std::shared_ptr<Tensor> myInitR = std::make_shared<Tensor>(
-            Array2D<float, 3, 3>{{{0.1, 0.1, 0.1}, {0.1, 0.1, 0.1}, {0.1, 0.1, 0.1}}});
+        std::shared_ptr<Tensor> myInitR =
+            std::make_shared<Tensor>(Array2D<float, 3, 3>{
+                {{0.1, 0.1, 0.1}, {0.1, 0.1, 0.1}, {0.1, 0.1, 0.1}}});
 
         pop->getOperator()->associateInput(0, myInput);
         op->associateInput(17, myInit);
@@ -371,9 +407,9 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
         scheduler.saveSchedulingDiagram("lstm_seq_schedule");
 
         std::shared_ptr<Tensor> myHiddenState = std::make_shared<Tensor>(
-                Array2D<float, 3, 3>{{{0.24439372, 0.24439372, 0.24439372},
-                                     {0.49801484, 0.49801484, 0.49801484},
-                                     {0.67162132, 0.67162132, 0.67162132}}});
+            Array2D<float, 3, 3>{{{0.24439372, 0.24439372, 0.24439372},
+                                  {0.49801484, 0.49801484, 0.49801484},
+                                  {0.67162132, 0.67162132, 0.67162132}}});
 
         myGraph->save("lstm_seq_mygraph", true, true);
 
@@ -382,10 +418,12 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
 
         REQUIRE(approxEq<float>(*(op->getOutput(0)), *myHiddenState));
     }
+
     SECTION("LSTM(forward_values_seq_flatten)(sequential)") {
         auto pop = Pop();
         auto myLSTM = LSTM(2, 3, 2, true, "ltsm");
-        auto op = std::static_pointer_cast<MetaOperator_Op>(myLSTM->getOperator());
+        auto op =
+            std::static_pointer_cast<MetaOperator_Op>(myLSTM->getOperator());
 
         // Here we test the LSTM as if it were flattened in the graph:
         // we borrow its micro-graph into our larger myGraph graph.
@@ -405,13 +443,16 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
         REQUIRE(myLSTM->nbOutputs() == 2);
 
         std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(
-            Array3D<float, 2, 3, 2>{{{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}}, {{2.0, 3.0}, {4.0, 5.0}, {6.0, 7.0}}}});
-        std::shared_ptr<Tensor> myInit = std::make_shared<Tensor>(
-            Array2D<float, 3, 3>{{{0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}}});
+            Array3D<float, 2, 3, 2>{{{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}},
+                                     {{2.0, 3.0}, {4.0, 5.0}, {6.0, 7.0}}}});
+        std::shared_ptr<Tensor> myInit =
+            std::make_shared<Tensor>(Array2D<float, 3, 3>{
+                {{0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}}});
         std::shared_ptr<Tensor> myInitW = std::make_shared<Tensor>(
             Array2D<float, 3, 2>{{{0.1, 0.1}, {0.1, 0.1}, {0.1, 0.1}}});
-        std::shared_ptr<Tensor> myInitR = std::make_shared<Tensor>(
-            Array2D<float, 3, 3>{{{0.1, 0.1, 0.1}, {0.1, 0.1, 0.1}, {0.1, 0.1, 0.1}}});
+        std::shared_ptr<Tensor> myInitR =
+            std::make_shared<Tensor>(Array2D<float, 3, 3>{
+                {{0.1, 0.1, 0.1}, {0.1, 0.1, 0.1}, {0.1, 0.1, 0.1}}});
 
         pop->getOperator()->associateInput(0, myInput);
         op->associateInput(17, myInit);
@@ -419,16 +460,32 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
 
         // Weights X
         auto prodX = Producer(myInitW);
-        prodX->addChild(op->getMicroGraph()->getOrderedInputs()[1].first, 0, 1);
-        prodX->addChild(op->getMicroGraph()->getOrderedInputs()[2].first, 0, 1);
-        prodX->addChild(op->getMicroGraph()->getOrderedInputs()[3].first, 0, 1);
-        prodX->addChild(op->getMicroGraph()->getOrderedInputs()[4].first, 0, 1);
+        prodX->addChild(op->getMicroGraph()->getOrderedInputs()[1].first,
+                        0,
+                        1);
+        prodX->addChild(op->getMicroGraph()->getOrderedInputs()[2].first,
+                        0,
+                        1);
+        prodX->addChild(op->getMicroGraph()->getOrderedInputs()[3].first,
+                        0,
+                        1);
+        prodX->addChild(op->getMicroGraph()->getOrderedInputs()[4].first,
+                        0,
+                        1);
         // Weights H
         auto prodH = Producer(myInitR);
-        prodH->addChild(op->getMicroGraph()->getOrderedInputs()[5].first, 0, 1);
-        prodH->addChild(op->getMicroGraph()->getOrderedInputs()[6].first, 0, 1);
-        prodH->addChild(op->getMicroGraph()->getOrderedInputs()[7].first, 0, 1);
-        prodH->addChild(op->getMicroGraph()->getOrderedInputs()[8].first, 0, 1);
+        prodH->addChild(op->getMicroGraph()->getOrderedInputs()[5].first,
+                        0,
+                        1);
+        prodH->addChild(op->getMicroGraph()->getOrderedInputs()[6].first,
+                        0,
+                        1);
+        prodH->addChild(op->getMicroGraph()->getOrderedInputs()[7].first,
+                        0,
+                        1);
+        prodH->addChild(op->getMicroGraph()->getOrderedInputs()[8].first,
+                        0,
+                        1);
         myGraph->add({prodX, prodH});
 
         myGraph->setDataType(DataType::Float32);
@@ -436,9 +493,9 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
         myGraph->save("lstm_seq_flatten", true, true);
 
         std::shared_ptr<Tensor> myHiddenState = std::make_shared<Tensor>(
-                Array2D<float, 3, 3>{{{0.24439372, 0.24439372, 0.24439372},
-                                     {0.49801484, 0.49801484, 0.49801484},
-                                     {0.67162132, 0.67162132, 0.67162132}}});
+            Array2D<float, 3, 3>{{{0.24439372, 0.24439372, 0.24439372},
+                                  {0.49801484, 0.49801484, 0.49801484},
+                                  {0.67162132, 0.67162132, 0.67162132}}});
 
         auto scheduler = SequentialScheduler(myGraph);
         scheduler.generateScheduling();
@@ -454,7 +511,8 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
     SECTION("LSTM(forward_values_seq_flatten)(parallel)") {
         auto pop = Pop();
         auto myLSTM = LSTM(2, 3, 2, true, "ltsm");
-        auto op = std::static_pointer_cast<MetaOperator_Op>(myLSTM->getOperator());
+        auto op =
+            std::static_pointer_cast<MetaOperator_Op>(myLSTM->getOperator());
 
         // Here we test the LSTM as if it were flattened in the graph:
         // we borrow its micro-graph into our larger myGraph graph.
@@ -474,13 +532,16 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
         REQUIRE(myLSTM->nbOutputs() == 2);
 
         std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(
-            Array3D<float, 2, 3, 2>{{{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}}, {{2.0, 3.0}, {4.0, 5.0}, {6.0, 7.0}}}});
-        std::shared_ptr<Tensor> myInit = std::make_shared<Tensor>(
-            Array2D<float, 3, 3>{{{0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}}});
+            Array3D<float, 2, 3, 2>{{{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}},
+                                     {{2.0, 3.0}, {4.0, 5.0}, {6.0, 7.0}}}});
+        std::shared_ptr<Tensor> myInit =
+            std::make_shared<Tensor>(Array2D<float, 3, 3>{
+                {{0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}}});
         std::shared_ptr<Tensor> myInitW = std::make_shared<Tensor>(
             Array2D<float, 3, 2>{{{0.1, 0.1}, {0.1, 0.1}, {0.1, 0.1}}});
-        std::shared_ptr<Tensor> myInitR = std::make_shared<Tensor>(
-            Array2D<float, 3, 3>{{{0.1, 0.1, 0.1}, {0.1, 0.1, 0.1}, {0.1, 0.1, 0.1}}});
+        std::shared_ptr<Tensor> myInitR =
+            std::make_shared<Tensor>(Array2D<float, 3, 3>{
+                {{0.1, 0.1, 0.1}, {0.1, 0.1, 0.1}, {0.1, 0.1, 0.1}}});
 
         pop->getOperator()->associateInput(0, myInput);
         op->associateInput(17, myInit);
@@ -488,16 +549,32 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
 
         // Weights X
         auto prodX = Producer(myInitW);
-        prodX->addChild(op->getMicroGraph()->getOrderedInputs()[1].first, 0, 1);
-        prodX->addChild(op->getMicroGraph()->getOrderedInputs()[2].first, 0, 1);
-        prodX->addChild(op->getMicroGraph()->getOrderedInputs()[3].first, 0, 1);
-        prodX->addChild(op->getMicroGraph()->getOrderedInputs()[4].first, 0, 1);
+        prodX->addChild(op->getMicroGraph()->getOrderedInputs()[1].first,
+                        0,
+                        1);
+        prodX->addChild(op->getMicroGraph()->getOrderedInputs()[2].first,
+                        0,
+                        1);
+        prodX->addChild(op->getMicroGraph()->getOrderedInputs()[3].first,
+                        0,
+                        1);
+        prodX->addChild(op->getMicroGraph()->getOrderedInputs()[4].first,
+                        0,
+                        1);
         // Weights H
         auto prodH = Producer(myInitR);
-        prodH->addChild(op->getMicroGraph()->getOrderedInputs()[5].first, 0, 1);
-        prodH->addChild(op->getMicroGraph()->getOrderedInputs()[6].first, 0, 1);
-        prodH->addChild(op->getMicroGraph()->getOrderedInputs()[7].first, 0, 1);
-        prodH->addChild(op->getMicroGraph()->getOrderedInputs()[8].first, 0, 1);
+        prodH->addChild(op->getMicroGraph()->getOrderedInputs()[5].first,
+                        0,
+                        1);
+        prodH->addChild(op->getMicroGraph()->getOrderedInputs()[6].first,
+                        0,
+                        1);
+        prodH->addChild(op->getMicroGraph()->getOrderedInputs()[7].first,
+                        0,
+                        1);
+        prodH->addChild(op->getMicroGraph()->getOrderedInputs()[8].first,
+                        0,
+                        1);
         myGraph->add({prodX, prodH});
 
         myGraph->setDataType(DataType::Float32);
@@ -505,9 +582,9 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
         myGraph->save("lstm_seq_flatten", true, true);
 
         std::shared_ptr<Tensor> myHiddenState = std::make_shared<Tensor>(
-                Array2D<float, 3, 3>{{{0.24439372, 0.24439372, 0.24439372},
-                                     {0.49801484, 0.49801484, 0.49801484},
-                                     {0.67162132, 0.67162132, 0.67162132}}});
+            Array2D<float, 3, 3>{{{0.24439372, 0.24439372, 0.24439372},
+                                  {0.49801484, 0.49801484, 0.49801484},
+                                  {0.67162132, 0.67162132, 0.67162132}}});
 
         auto scheduler = ParallelScheduler(myGraph);
         scheduler.generateScheduling();
@@ -519,4 +596,308 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
 
         REQUIRE(approxEq<float>(*(op->getOutput(0)), *myHiddenState));
     }
-}
\ No newline at end of file
+
+    SECTION("Leaky(forward)(fixed)") {
+
+        constexpr auto inChannels = 10;
+        constexpr auto outChannels = 5;
+
+        constexpr auto beta = 0.95;
+        constexpr auto threshold = 1.0;
+        constexpr auto nbTimeSteps = 2;
+
+        auto myWeights =
+            std::make_shared<Tensor>(Array2D<float, outChannels, inChannels>{{
+                {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0},
+                {1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1},
+                {0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 0.1, 0.2, 0.3, 0.4},
+                {0.4, 0.3, 0.2, 0.1, 0.0, 0.1, 0.2, 0.3, 0.4, 0.5},
+                {0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0.0},
+            }});
+
+        auto myWeights2 =
+            std::make_shared<Tensor>(Array2D<float, inChannels, outChannels>{{
+                {0.1, 0.2, 0.3, 0.4, 0.5},
+                {0.6, 0.7, 0.8, 0.9, 1.0},
+                {1.0, 0.9, 0.8, 0.7, 0.6},
+                {0.5, 0.4, 0.3, 0.2, 0.1},
+                {0.5, 0.6, 0.7, 0.8, 0.9},
+                {1.0, 0.1, 0.2, 0.3, 0.4},
+                {0.4, 0.3, 0.2, 0.1, 0.0},
+                {0.1, 0.2, 0.3, 0.4, 0.5},
+                {0.9, 0.8, 0.7, 0.6, 0.5},
+                {0.4, 0.3, 0.2, 0.1, 0.0},
+            }});
+
+        auto myInput = std::make_shared<Tensor>(Array2D<float, 2, 10>{{
+            {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0},
+            {1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1},
+        }});
+
+        // Reference values computed with snnTorch (PyTorch): output of fc1 at time step 1
+        auto expectedOutputlif1ts1 =
+            std::make_shared<Tensor>(Array2D<float, 2, 5>{{
+                {3.850, 2.2000, 2.6500, 1.5000, 1.6500},
+                {2.200, 3.8500, 3.4000, 1.2500, 3.3000},
+            }});
+
+        auto expectedOutputfc2ts1 =
+            std::make_shared<Tensor>(Array2D<float, 2, 10>{{
+                {1.5000,
+                 4.0000,
+                 4.0000,
+                 1.5000,
+                 3.5000,
+                 2.0000,
+                 1.0000,
+                 1.5000,
+                 3.5000,
+                 1.0000},
+                {1.5000,
+                 4.0000,
+                 4.0000,
+                 1.5000,
+                 3.5000,
+                 2.0000,
+                 1.0000,
+                 1.5000,
+                 3.5000,
+                 1.0000},
+            }});
+
+        auto expectedOutputlif1ts2 =
+            std::make_shared<Tensor>(Array2D<float, 2, 5>{{
+                {6.5075, 3.2900, 4.1675, 1.9250, 2.2175},
+                {3.2900, 6.5075, 5.6300, 1.4375, 5.4350},
+            }});
+
+        // NOTE: Same output as before: for every channel the potential is
+        // above the threshold, so the LIF neuron fires at every time step
+        // for every channel.
+        auto expectedOutputfc2ts2 =
+            std::make_shared<Tensor>(Array2D<float, 2, 10>{{
+                {1.5000,
+                 4.0000,
+                 4.0000,
+                 1.5000,
+                 3.5000,
+                 2.0000,
+                 1.0000,
+                 1.5000,
+                 3.5000,
+                 1.0000},
+                {1.5000,
+                 4.0000,
+                 4.0000,
+                 1.5000,
+                 3.5000,
+                 2.0000,
+                 1.0000,
+                 1.5000,
+                 3.5000,
+                 1.0000},
+            }});
+
+        auto init = std::make_shared<Tensor>(Array2D<float, 2, 5>{});
+        uniformFiller<float>(init, 0.0, 0.0);
+
+        auto fc1 = FC(inChannels, outChannels, true, "myfc");
+        auto fc2 = FC(outChannels, inChannels, true, "fc2");
+        // NOTE: Account for init step by adding 1 to the max timestep
+        // parameter.
+        auto lif1 = Leaky(nbTimeSteps + 1, beta, threshold, "leaky");
+
+        // associateInput() does not work here, so set the weights via the producers' outputs.
+        fc1->input(1).first->getOperator()->setOutput(0, myWeights);
+        fc2->input(1).first->getOperator()->setOutput(0, myWeights2);
+
+        auto fc1Op =
+            std::static_pointer_cast<OperatorTensor>(fc1->getOperator());
+        auto lif1Op =
+            std::static_pointer_cast<MetaOperator_Op>(lif1->getOperator());
+        auto fc2Op =
+            std::static_pointer_cast<OperatorTensor>(fc2->getOperator());
+
+        fc1Op->associateInput(0, myInput);
+        lif1Op->associateInput(1, init);
+        lif1Op->associateInput(2, init);
+
+        fc1->addChild(lif1, 0, 0);
+        lif1->addChild(fc2, 1, 0);
+
+        auto g = std::make_shared<GraphView>();
+        g->add({fc1, lif1, fc2});
+        g->compile("cpu", DataType::Float32);
+        auto scheduler = SequentialScheduler(g);
+
+        // Forward 1 (simulate timestep 0)
+        scheduler.forward(true);
+        REQUIRE(approxEq<float>(*(lif1Op->getOutput(0)),
+                                *(expectedOutputlif1ts1)));
+        REQUIRE(
+            approxEq<float>(*(fc2Op->getOutput(0)), *(expectedOutputfc2ts1)));
+
+        // Forward 2 (simulate timestep 1)
+        scheduler.forward(true);
+        REQUIRE(approxEq<float>(*(lif1Op->getOutput(0)),
+                                *(expectedOutputlif1ts2)));
+        REQUIRE(
+            approxEq<float>(*(fc2Op->getOutput(0)), *(expectedOutputfc2ts2)));
+    }
+
+    SECTION("Leaky(forward)") {
+
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_real_distribution<float> valueDist(
+            0.1f,
+            1.1f); // Random float distribution between 0.1 and 1.1
+        std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2),
+                                                               std::size_t(4));
+        std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(3),
+                                                              std::size_t(3));
+        std::uniform_int_distribution<int> boolDist(0, 1);
+        std::uniform_real_distribution<float> betaDist(0, 1);
+
+        const std::size_t nbDims = nbDimsDist(gen);
+        Log::info("nbDims: {}", nbDims);
+        std::vector<std::size_t> dims;
+        for (std::size_t i = 0; i < nbDims; ++i) {
+            dims.push_back(dimSizeDist(gen));
+        }
+        Log::info("timesteps: {}", dims[0]);
+        Log::info("dimensions:");
+        for (auto dim : dims) {
+            Log::info("{}", dim);
+        }
+
+        const auto nbTimeSteps = dims[0];
+        const auto beta = betaDist(gen);
+
+        auto myLeaky = Leaky(nbTimeSteps, beta, 1.0, "leaky");
+        auto op =
+            std::static_pointer_cast<MetaOperator_Op>(myLeaky->getOperator());
+        // auto stack = Stack(2);
+        auto mem_rec = Stack(nbTimeSteps, "mem_rec");
+        auto spk_rec = Stack(nbTimeSteps, "spk_rec");
+        auto pop = Pop("popinput");
+
+        // Here we test Leaky as if it were flattened in the graph:
+        // we borrow its micro-graph into our larger myGraph graph.
+        auto myGraph = std::make_shared<GraphView>();
+
+        pop->addChild(op->getMicroGraph()->getOrderedInputs()[0].first, 0, 0);
+        // Micro-graph output 1 feeds mem_rec, output 0 feeds spk_rec
+        op->getMicroGraph()->getOrderedOutputs()[1].first->addChild(mem_rec,
+                                                                    0,
+                                                                    0);
+        op->getMicroGraph()->getOrderedOutputs()[0].first->addChild(spk_rec,
+                                                                    0,
+                                                                    0);
+        for (auto node : op->getMicroGraph()->getOrderedOutputs()) {
+            Log::info("name of output: {}", node.first->name());
+        }
+
+        myGraph->add(pop);
+        myGraph->add(op->getMicroGraph());
+        myGraph->add(mem_rec);
+        myGraph->add(spk_rec);
+        myGraph->save("mg", true, true);
+
+        // 3 inputs
+        REQUIRE(myLeaky->nbInputs() == 3);
+        REQUIRE(myLeaky->inputCategory(0) == InputCategory::Data);
+        // Two spike outputs connected to nothing, plus the Add node's real output
+        REQUIRE(myLeaky->nbOutputs() == 4);
+
+        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(
+            Array3D<float, 2, 3, 2>{{{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}},
+                                     {{2.0, 3.0}, {4.0, 5.0}, {6.0, 7.0}}}});
+
+        // std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(
+        //     Array3D<float, 2, 3, 2>{{{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}},
+        //                              {{2.0, 3.0}, {4.0, 5.0},
+        //                              {6.0, 7.0}}}});
+
+        // Generate input
+        std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>();
+        T0->setDataType(DataType::Float32);
+        T0->setBackend("cpu");
+
+        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>();
+        expectedOutput->setDataType(DataType::Float32);
+        expectedOutput->setBackend("cpu");
+
+        const auto nb_elements =
+            std::accumulate(dims.cbegin(),
+                            dims.cend(),
+                            std::size_t(1),
+                            std::multiplies<std::size_t>());
+        float *input = new float[nb_elements];
+        float *result = new float[nb_elements];
+
+        for (std::size_t i = 0; i < nb_elements; ++i) {
+            input[i] = valueDist(gen);
+        }
+        T0->resize(dims);
+        T0->getImpl()->setRawPtr(input, nb_elements);
+        T0->print();
+
+        // Elements popped at each time step
+        auto nbElementsPerTimeStep = nb_elements / dims[0];
+
+        // Init: the first time step is just the input
+        for (std::size_t i = 0; i < nbElementsPerTimeStep; ++i) {
+            result[i] = input[i];
+        }
+
+        // Recurrence: U[t] = beta * U[t-1] + I[t] - (U[t-1] > threshold)
+        for (std::size_t i = 1; i < dims[0]; ++i) {
+            auto offset = nbElementsPerTimeStep * i;
+            auto prev = nbElementsPerTimeStep * (i - 1);
+            for (std::size_t j = 0; j < nbElementsPerTimeStep; ++j) {
+                auto reset = (result[prev + j] > 1.0 ? 1 : 0);
+                result[offset + j] =
+                    result[prev + j] * beta + input[offset + j] - reset;
+            }
+        }
+
+        expectedOutput->resize(dims);
+        expectedOutput->getImpl()->setRawPtr(result, nb_elements);
+        Log::info("Expected output:");
+        expectedOutput->print();
+
+        std::shared_ptr<Tensor> myInit =
+            std::make_shared<Tensor>(Array2D<float, 3, 3>{
+                {{0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}}});
+
+        auto initMemdims =
+            std::vector<std::size_t>(dims.begin() + 1, dims.end());
+        Log::info("dimensions : ");
+        for (auto dim : initMemdims) {
+            Log::info("{}", dim);
+        }
+        std::shared_ptr<Tensor> myInitW = std::make_shared<Tensor>(
+            Array2D<float, 3, 2>{{{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}});
+
+        std::shared_ptr<Tensor> myInitR =
+            std::make_shared<Tensor>(initMemdims);
+        myInitR->setDataType(DataType::Float32);
+        myInitR->setBackend("cpu");
+        uniformFiller<float>(myInitR, 0, 0);
+
+        pop->getOperator()->associateInput(0, T0);
+        op->associateInput(1, myInitR);
+        op->associateInput(2, myInitR);
+
+        myGraph->compile("cpu", DataType::Float32);
+
+        auto scheduler = SequentialScheduler(myGraph);
+        REQUIRE_NOTHROW(scheduler.generateScheduling());
+        REQUIRE_NOTHROW(scheduler.forward(true));
+
+        auto memOp =
+            std::static_pointer_cast<OperatorTensor>(spk_rec->getOperator());
+        REQUIRE(approxEq<float>(*(memOp->getOutput(0)), *(expectedOutput)));
+    }
+}
-- 
GitLab


From ff3a3ed7ad4fc32ba3f0d80b35aeaaf5c1420a61 Mon Sep 17 00:00:00 2001
From: Jerome Hue <jerome.hue@cea.fr>
Date: Thu, 6 Feb 2025 11:59:50 +0100
Subject: [PATCH 02/22] Implement backward function for Div operator

---
 .../aidge/backend/cpu/operator/DivImpl.hpp    |  13 +-
 .../backend/cpu/operator/DivImpl_kernels.hpp  |  61 +++-
 src/operator/DivImpl.cpp                      |  23 +-
 unit_tests/operator/Test_DivImpl.cpp          | 271 ++++++++++++++++++
 4 files changed, 363 insertions(+), 5 deletions(-)
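
Note for the backward kernel below: for an elementwise z = a / b, the
gradients follow the quotient rule,

    \frac{\partial z}{\partial a} = \frac{1}{b}, \qquad
    \frac{\partial z}{\partial b} = -\frac{a}{b^{2}}

so the kernel accumulates grad_output / b into the gradient of a, and
-grad_output * a / b^2 into the gradient of b, summing contributions
across broadcast dimensions via the index mapping.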

diff --git a/include/aidge/backend/cpu/operator/DivImpl.hpp b/include/aidge/backend/cpu/operator/DivImpl.hpp
index 40c1b678..a507690b 100644
--- a/include/aidge/backend/cpu/operator/DivImpl.hpp
+++ b/include/aidge/backend/cpu/operator/DivImpl.hpp
@@ -24,7 +24,18 @@
 namespace Aidge {
 // Operator implementation entry point for the backend
 using DivImpl_cpu = OperatorImpl_cpu<Div_Op,
-    void(const std::size_t, const std::size_t, const std::size_t, const void*, const void*,void*)>;
+    void(const std::size_t, const std::size_t, const std::size_t, const void*, const void*,void*),
+    void(const std::size_t,
+        const std::size_t,
+        const std::size_t,
+        const std::vector<std::size_t>,
+        const std::vector<std::size_t>,
+        const std::vector<std::size_t>,
+        const void*,
+        const void*,
+        const void*,
+        void*,
+        void*)>;
 
 // Implementation entry point registration to Operator
 REGISTRAR(Div_Op, "cpu", Aidge::DivImpl_cpu::create);
diff --git a/include/aidge/backend/cpu/operator/DivImpl_kernels.hpp b/include/aidge/backend/cpu/operator/DivImpl_kernels.hpp
index ed6e55a7..5d3ee7f6 100644
--- a/include/aidge/backend/cpu/operator/DivImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/DivImpl_kernels.hpp
@@ -17,6 +17,7 @@
 #include <cstdint>     // std::int32_t, std::int64_t
 #include <functional>  // std::multiplies
 
+#include "aidge/backend/cpu/operator/MulImpl_kernels.hpp"
 #include "aidge/utils/Registrar.hpp"
 
 #include "aidge/backend/cpu/data/Broadcasting.hpp"
@@ -69,16 +70,70 @@ constexpr void DivImpl_cpu_forward_kernel(const std::size_t input1size_,
     }
 }
 
+
+template <class I1, class I2, class O>
+void DivImpl_cpu_backward_kernel(const std::size_t input0Length,
+                               const std::size_t input1Length,
+                               const std::size_t gradOutputLength,
+                               const std::vector<std::size_t>& dims0,
+                               const std::vector<std::size_t>& dims1,
+                               const std::vector<std::size_t>& outputDims,
+                               const void* input0_,
+                               const void* input1_,
+                               const void* grad_output_,
+                               void* gradientInput0_,
+                               void* gradientInput1_)
+{
+    const I1* input0 = static_cast<const I1*>(input0_);  // a
+    const I2* input1 = static_cast<const I2*>(input1_);  // b
+    const O* grad_output = static_cast<const O*>(grad_output_);
+    auto* grad_input_0 = static_cast<I1*>(gradientInput0_);  // gradient w.r.t. a
+    auto* grad_input_1 = static_cast<I2*>(gradientInput1_);  // gradient w.r.t. b
+
+    std::fill_n(grad_input_0, input0Length, static_cast<I1>(0));
+    std::fill_n(grad_input_1, input1Length, static_cast<I2>(0));
+
+    // Broadcast dims0 and dims1 to match the shape of outputDims
+    auto broadcastedDims0 = getBroadcastedDims(outputDims, dims0);
+    auto broadcastedDims1 = getBroadcastedDims(outputDims, dims1);
+
+    for (std::size_t i = 0; i < gradOutputLength; ++i) {
+        auto idxOutputGrad = getMultiDimIndices(outputDims, i);
+        std::vector<std::size_t> idxInput0(broadcastedDims0.size());
+        std::vector<std::size_t> idxInput1(broadcastedDims1.size());
+
+        // Map output indices to input indices, considering broadcasting
+        for (std::size_t dimension = 0; dimension < broadcastedDims0.size(); ++dimension) {
+            idxInput0[dimension] = (broadcastedDims0[dimension] == 1) ? 0 : idxOutputGrad[dimension];
+        }
+
+        for (std::size_t dimension = 0; dimension < broadcastedDims1.size(); ++dimension) {
+            idxInput1[dimension] = (broadcastedDims1[dimension] == 1) ? 0 : idxOutputGrad[dimension];
+        }
+
+        auto idx0 = getFlattenedIndex(broadcastedDims0, idxInput0);
+        auto idx1 = getFlattenedIndex(broadcastedDims1, idxInput1);
+
+        // grad_a = grad_output * (1/b)
+        grad_input_0[idx0] += static_cast<I1>(grad_output[i] / input1[idx1]);
+        
+
+        grad_input_1[idx1] += static_cast<I2>(grad_output[i] * (-input0[idx0] / (input1[idx1] * input1[idx1])));
+    }
+}
+
+
 // Kernels registration to implementation entry point
 REGISTRAR(DivImpl_cpu,
     {DataType::Float32},
-    {ProdConso::inPlaceModel, Aidge::DivImpl_cpu_forward_kernel<float, float, float>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::DivImpl_cpu_forward_kernel<float, float, float>, Aidge::DivImpl_cpu_backward_kernel<float, float, float>});
 REGISTRAR(DivImpl_cpu,
     {DataType::Float64},
-    {ProdConso::inPlaceModel, Aidge::DivImpl_cpu_forward_kernel<double, double, double>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::DivImpl_cpu_forward_kernel<double, double, double>, Aidge::DivImpl_cpu_backward_kernel<double, double, double>});
 REGISTRAR(DivImpl_cpu,
     {DataType::Int32},
-    {ProdConso::inPlaceModel, Aidge::DivImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::DivImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>,
+        Aidge::DivImpl_cpu_backward_kernel<std::int32_t, std::int32_t, std::int32_t>});
 }  // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_DIVIMPL_KERNELS_H_ */
diff --git a/src/operator/DivImpl.cpp b/src/operator/DivImpl.cpp
index 135b32b5..67444cb8 100644
--- a/src/operator/DivImpl.cpp
+++ b/src/operator/DivImpl.cpp
@@ -152,5 +152,26 @@ void Aidge::DivImpl_cpu::forward() {
 
 template <>
 void Aidge::DivImpl_cpu::backward() {
-    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Div_Op on backend cpu");
+    const Div_Op& op_ = dynamic_cast<const Div_Op&>(mOp);
+
+    auto in0 = op_.getInput(0);
+    auto in1 = op_.getInput(1);
+    auto in0grad = op_.getInput(0)->grad();
+    auto in1grad = op_.getInput(1)->grad();
+    auto out0grad = op_.getOutput(0)->grad();
+
+    const auto impl = Registrar<DivImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    impl.backward(in0grad->size(),
+               in1grad->size(),
+               out0grad->size(),
+               in0->dims(),
+               in1->dims(),
+               out0grad->dims(),
+               getCPUPtr(in0),
+               getCPUPtr(in1),
+               getCPUPtr(out0grad),
+               getCPUPtr(in0grad),
+               getCPUPtr(in1grad));
 }
+
diff --git a/unit_tests/operator/Test_DivImpl.cpp b/unit_tests/operator/Test_DivImpl.cpp
index 4037b2ad..4e7657ed 100644
--- a/unit_tests/operator/Test_DivImpl.cpp
+++ b/unit_tests/operator/Test_DivImpl.cpp
@@ -322,4 +322,275 @@ TEST_CASE("[cpu/operator] Div", "[Div][CPU]") {
         }
     }
 }
+
+TEST_CASE("[CPU/Operator] Div(Backward)", "[Div][CPU][Backward]") {
+    std::shared_ptr<Div_Op> op = std::make_shared<Div_Op>();
+    op->setDataType(DataType::Float32);
+    op->setBackend("cpu");
+
+    // NOTE: The first four tests use fixed values; the last one uses random values with static dimensions.
+
+    SECTION("Case 1: 1D and 2D Tensors") {
+        const auto T0 = std::make_shared<Tensor>(
+            Array2D<cpptype_t<DataType::Float32>, 2, 3>({{{1, 2, 3}, {4, 5, 6}}}));
+
+        const auto T1 =
+            std::make_shared<Tensor>(Array1D<cpptype_t<DataType::Float32>, 3>({0.1, 0.2, 0.3}));
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>(
+            Array2D<float, 2, 3>({{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}})));
+        op->forwardDims();
+
+        op->backward();
+
+        const Tensor expectedGrad0 =
+            Array2D<cpptype_t<DataType::Float32>, 2, 3>({{{10, 5, 3.3333}, {10, 5, 3.3333}}});
+
+        const Tensor expectedGrad1 = Array1D<cpptype_t<DataType::Float32>, 3>({-500, -175, -100});
+
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1));
+    }
+
+    SECTION("Case 2: 3D and 1D tensors") {
+        const auto T0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+            {{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}},
+              {{7.0, 8.0, 9.0}, {10.0, 11.0, 12.0}}}}));
+
+        const auto T1 =
+            std::make_shared<Tensor>(Array1D<float, 3>({0.3, 0.2, 0.1}));
+
+        const auto newGrad = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+            {{{{1, 1, 1}, {1, 1, 1}}, {{1, 1, 1}, {1, 1, 1}}}}));
+
+        const Tensor expectedGrad0 =
+            Array3D<float, 2, 2, 3>({{{{3.3333, 5.0, 10}, {3.3333, 5.0, 10}},
+                                      {{3.3333, 5.0, 10}, {3.3333, 5.0, 10}}}});
+
+        const Tensor expectedGrad1 = Array1D<cpptype_t<DataType::Float32>, 3>({-244.4444, -650.0, -3000.0});
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(newGrad);
+        op->forwardDims();
+
+        op->backward();
+
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1));
+    }
+
+    SECTION("Case 3: 4D and 2D tensors") {
+        const auto T0 = std::make_shared<Tensor>(Array4D<cpptype_t<DataType::Float32>, 2, 2, 3, 3>(
+            {{{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}, {7.0, 8.0, 9.0}},
+               {{10.0, 11.0, 12.0}, {13.0, 14.0, 15.0}, {16.0, 17.0, 18.0}}},
+              {{{19.0, 20.0, 21.0}, {22.0, 23.0, 24.0}, {25.0, 26.0, 27.0}},
+               {{28.0, 29.0, 30.0},
+                {31.0, 32.0, 33.0},
+                {34.0, 35.0, 36.0}}}}}));
+
+        const auto T1 = std::make_shared<Tensor>(Array2D<cpptype_t<DataType::Float32>, 3, 3>(
+            {{{0.5, 0.3, 0.1}, {0.4, 0.2, 0.6}, {0.7, 0.8, 0.9}}}));
+
+        const auto newGrad =
+            std::make_shared<Tensor>(Array4D<cpptype_t<DataType::Float32>, 2, 2, 3, 3>(
+                {{{{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}},
+                   {{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}},
+                  {{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}},
+                   {{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}}}}));
+
+        const Tensor expectedGrad0 =
+            Array4D<cpptype_t<DataType::Float32>, 2, 2, 3, 3>(
+                {{{{{2, 3.3333, 10}, {2.5, 5.0, 1.66667}, {1.42857, 1.2500, 1.11111}},
+                   {{2, 3.3333, 10}, {2.5, 5.0, 1.66667}, {1.42857, 1.2500, 1.11111}}},
+                  {{{2, 3.3333, 10}, {2.5, 5.0, 1.66667}, {1.42857, 1.2500, 1.11111}},
+                   {{2, 3.3333, 10}, {2.5, 5.0, 1.66667}, {1.42857, 1.2500, 1.11111}}}}});
+
+        const Tensor expectedGrad1 =
+            Array2D<cpptype_t<DataType::Float32>, 3, 3>({{{-232.0, -688.888, -6600.0},
+                                   {-437.5, -1850.0, -216.66667},
+                                   {-167.3469, -134.3750, -111.111}}});
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(newGrad);
+        op->forwardDims();
+
+        op->backward();
+
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1));
+    }
+
+    SECTION("Case 4: 3D and 2D tensors") {
+        const auto T0 = std::make_shared<Tensor>(
+            Array3D<float, 2, 3, 4>({{{
+                                          {1.0, 2.0, 3.0, 4.0},
+                                          {5.0, 6.0, 7.0, 8.0},
+                                          {9.0, 10.0, 11.0, 12.0},
+                                      },
+                                      {
+                                          {13.0, 14.0, 15.0, 16.0},
+                                          {17.0, 18.0, 19.0, 20.0},
+                                          {21.0, 22.0, 23.0, 24.0},
+                                      }}}));
+
+        const auto T1 = std::make_shared<Tensor>(
+            Array2D<cpptype_t<DataType::Float32>, 3, 4>({{{0.1, 0.2, 0.3, 0.4},
+                                   {0.5, 0.6, 0.7, 0.8},
+                                   {0.9, 1.0, 1.1, 1.2}}}));
+
+        const auto newGrad = std::make_shared<Tensor>(
+            Array3D<cpptype_t<DataType::Float32>, 2, 3, 4>({{{
+                                          {1.0, 1.0, 1.0, 1.0},
+                                          {1.0, 1.0, 1.0, 1.0},
+                                          {1.0, 1.0, 1.0, 1.0},
+                                      },
+                                      {
+                                          {1.0, 1.0, 1.0, 1.0},
+                                          {1.0, 1.0, 1.0, 1.0},
+                                          {1.0, 1.0, 1.0, 1.0},
+                                      }}}));
+
+        const Tensor expectedGrad0 =
+            Array3D<cpptype_t<DataType::Float32>, 2, 3, 4>({{{
+                                       {10, 5, 3.33333, 2.5},
+                                       {2, 1.66667, 1.42857, 1.2500},
+                                       {1.11111, 1.0, 0.90909, 0.83333}},
+                                      {{10, 5, 3.33333, 2.5},
+                                       {2, 1.66667, 1.42857, 1.2500},
+                                       {1.11111, 1.0, 0.90909, 0.83333}}}});
+
+        const Tensor expectedGrad1 =
+            Array2D<cpptype_t<DataType::Float32>, 3, 4>({{
+                                   {-1400.0, -400.0, -200.0, -125.0},
+                                   {-88.0, -66.66667, -53.0612, -43.750},
+                                   {-37.0370, -32.0, -28.0992, -25.00}}});
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(newGrad);
+        op->forwardDims();
+
+        op->backward();
+
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1));
+    }
+
+    SECTION("Case 5: Tensors with random values") {
+
+        // Use random values
+        const std::vector<std::size_t> dims0 = {5, 2, 1, 7}; // First tensor
+        const std::vector<std::size_t> dims1 = {2, 6, 7};    // Second tensor
+        const std::vector<std::size_t> outputDims = {5, 2, 6, 7};
+
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_real_distribution<float> dist(0.1f, 1.0f);
+
+        auto T0 = std::make_shared<Tensor>(dims0);
+        T0->setDataType(DataType::Float32);
+        T0->setBackend("cpu");
+        float* input0Data = static_cast<float*>(T0->getImpl()->rawPtr());
+        // Fill with random values
+        for (std::size_t i = 0; i < T0->size(); ++i) {
+            input0Data[i] = dist(gen);
+        }
+
+        auto T1 = std::make_shared<Tensor>(dims1);
+        T1->setDataType(DataType::Float32);
+        T1->setBackend("cpu");
+        float* input1Data = static_cast<float*>(T1->getImpl()->rawPtr());
+        // Fill with random values
+        for (std::size_t i = 0; i < T1->size(); ++i) {
+            input1Data[i] = dist(gen);
+        }
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+
+        op->forwardDims();
+        op->forward();
+
+        Tensor expectedOutput{outputDims};
+        expectedOutput.setBackend("cpu");
+        float* expectedOutputData = static_cast<float*>(expectedOutput.getImpl()->rawPtr());
+
+        for (std::size_t n = 0; n < 5; ++n) {
+            for (std::size_t c = 0; c < 2; ++c) {
+                for (std::size_t h = 0; h < 6; ++h) {
+                    for (std::size_t w = 0; w < 7; ++w) {
+                        std::size_t outIdx = w + 7 * (h + 6 * (c + 2 * n));
+                        std::size_t in0Idx =
+                            w + 7 * (0 + 1 * (c + 2 * n)); // middle dim is 1
+                        std::size_t in1Idx =
+                            w + 7 * (h + 6 * c);           // no n dimension
+
+                        expectedOutputData[outIdx] = input0Data[in0Idx] / input1Data[in1Idx];
+                    }
+                }
+            }
+        }
+
+        auto outputTensor = op->getOutput(0);
+
+        REQUIRE(approxEq<float>(*outputTensor, expectedOutput));
+
+        // Backward pass
+        std::vector<float> gradOutputData(expectedOutput.size());
+        for (auto &val : gradOutputData) {
+            val = dist(gen);
+        }
+
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>());
+        op->getOutput(0)->grad()->resize(outputDims);
+        op->getOutput(0)->grad()->getImpl()->setRawPtr(gradOutputData.data(),
+                                                       expectedOutput.size());
+
+        // Compute reference gradients
+        std::vector<float> expectedGrad0(T0->size(), 0.0f);
+        std::vector<float> expectedGrad1(T1->size(), 0.0f);
+
+        for (std::size_t n = 0; n < 5; ++n) {
+            for (std::size_t c = 0; c < 2; ++c) {
+                for (std::size_t h = 0; h < 6; ++h) {
+                    for (std::size_t w = 0; w < 7; ++w) {
+                        std::size_t outIdx = w + 7 * (h + 6 * (c + 2 * n));
+                        std::size_t in0Idx = w + 7 * (0 + 1 * (c + 2 * n));
+                        std::size_t in1Idx = w + 7 * (h + 6 * c);
+
+                        expectedGrad0[in0Idx] +=
+                            gradOutputData[outIdx] * (1.0f / input1Data[in1Idx]);
+
+                        expectedGrad1[in1Idx] +=
+                            gradOutputData[outIdx] * (-input0Data[in0Idx] / (input1Data[in1Idx] * input1Data[in1Idx]));
+                    }
+                }
+            }
+        }
+
+        // Perform backward pass
+        op->backward();
+
+        auto expectedGrad0Tensor = std::make_shared<Tensor>();
+        expectedGrad0Tensor->resize(T0->dims());
+        expectedGrad0Tensor->setBackend("cpu");
+        expectedGrad0Tensor->setDataType(DataType::Float32);
+        expectedGrad0Tensor->getImpl()->setRawPtr(expectedGrad0.data(),
+                                                    expectedGrad0.size());
+
+        auto expectedGrad1Tensor = std::make_shared<Tensor>(T1->dims());
+        expectedGrad1Tensor->setBackend("cpu");
+        expectedGrad1Tensor->setDataType(DataType::Float32);
+        expectedGrad1Tensor->getImpl()->setRawPtr(expectedGrad1.data(),
+                                                    expectedGrad1.size());
+
+        // Verify backward pass
+        REQUIRE(approxEq<float>(*T0->grad(), *expectedGrad0Tensor));
+        REQUIRE(approxEq<float>(*T1->grad(), *expectedGrad1Tensor));
+    }
+}
 } // namespace Aidge
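The manual indexing in Case 5 is plain row-major flattening, with a broadcast (size-1) axis pinned to index 0. A standalone restatement of that mapping (dimension values taken from the test; the helper name is illustrative):

    #include <cstddef>

    // Row-major flat index for shape {N, C, H, W}; pass 0 for a coordinate
    // on a broadcast axis of size 1, as the test does for in0Idx and in1Idx.
    std::size_t flatIndex(std::size_t n, std::size_t c, std::size_t h, std::size_t w,
                          std::size_t C, std::size_t H, std::size_t W) {
        return w + W * (h + H * (c + C * n));
    }
    // dims0 = {5, 2, 1, 7}: flatIndex(n, c, 0, w, 2, 1, 7) == w + 7 * (0 + 1 * (c + 2 * n))
    // dims1 = {2, 6, 7}:    flatIndex(0, c, h, w, 2, 6, 7) == w + 7 * (h + 6 * c)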
-- 
GitLab


From 8a6699936cc68401f588760e51b7382dfef32fc7 Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Thu, 20 Feb 2025 11:10:50 +0100
Subject: [PATCH 03/22] Added /bigobj for unit tests on Windows

---
 unit_tests/CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/unit_tests/CMakeLists.txt b/unit_tests/CMakeLists.txt
index 6c7af9c3..e1f261d0 100644
--- a/unit_tests/CMakeLists.txt
+++ b/unit_tests/CMakeLists.txt
@@ -25,6 +25,10 @@ target_link_libraries(tests${module_name} PRIVATE ${module_name})
 
 target_link_libraries(tests${module_name} PRIVATE Catch2::Catch2WithMain)
 
+target_compile_options(tests${module_name} PRIVATE
+    $<$<CXX_COMPILER_ID:MSVC>:
+    /bigobj>)
+
 list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras)
 include(CTest)
 include(Catch)
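For context: MSVC's default COFF object format caps an object file at roughly 65k addressable sections, a limit that heavily templated Catch2 test translation units can exceed; /bigobj raises the cap to 2^32 sections. The generator expression applies the flag only when compiling with MSVC, leaving other compilers untouched.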
-- 
GitLab


From 97d0996af09c481aec835064ec6ad30027a10c40 Mon Sep 17 00:00:00 2001
From: hrouis <houssemeddine.rouis92@gmail.com>
Date: Fri, 24 Jan 2025 16:06:22 +0100
Subject: [PATCH 04/22] add Equal operator

---
 include/aidge/backend/cpu.hpp                 |   1 +
 .../aidge/backend/cpu/operator/EqualImpl.hpp  |  32 +++
 .../cpu/operator/EqualImpl_kernels.hpp        | 163 ++++++++++++++
 src/operator/EqualImpl.cpp                    |  61 ++++++
 unit_tests/operator/Test_EqualImpl.cpp        | 205 ++++++++++++++++++
 5 files changed, 462 insertions(+)
 create mode 100644 include/aidge/backend/cpu/operator/EqualImpl.hpp
 create mode 100644 include/aidge/backend/cpu/operator/EqualImpl_kernels.hpp
 create mode 100644 src/operator/EqualImpl.cpp
 create mode 100644 unit_tests/operator/Test_EqualImpl.cpp

diff --git a/include/aidge/backend/cpu.hpp b/include/aidge/backend/cpu.hpp
index 5db19a2b..ffc03ae5 100644
--- a/include/aidge/backend/cpu.hpp
+++ b/include/aidge/backend/cpu.hpp
@@ -29,6 +29,7 @@
 #include "aidge/backend/cpu/operator/ConvImpl.hpp"
 #include "aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp"
 #include "aidge/backend/cpu/operator/DivImpl.hpp"
+#include "aidge/backend/cpu/operator/EqualImpl.hpp"
 #include "aidge/backend/cpu/operator/ErfImpl.hpp"
 #include "aidge/backend/cpu/operator/ExpandImpl.hpp"
 #include "aidge/backend/cpu/operator/FCImpl.hpp"
diff --git a/include/aidge/backend/cpu/operator/EqualImpl.hpp b/include/aidge/backend/cpu/operator/EqualImpl.hpp
new file mode 100644
index 00000000..e2489096
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/EqualImpl.hpp
@@ -0,0 +1,32 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_EQUALIMPL_H_
+#define AIDGE_CPU_OPERATOR_EQUALIMPL_H_
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/Equal.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+#include <memory>
+#include <vector>
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+using EqualImpl_cpu = OperatorImpl_cpu<Equal_Op,
+    void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*)>;
+
+// Implementation entry point registration to Operator
+REGISTRAR(Equal_Op, "cpu", Aidge::EqualImpl_cpu::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_EQUALIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/EqualImpl_kernels.hpp b/include/aidge/backend/cpu/operator/EqualImpl_kernels.hpp
new file mode 100644
index 00000000..3c8ff0f4
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/EqualImpl_kernels.hpp
@@ -0,0 +1,163 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_EQUALIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_EQUALIMPL_KERNELS_H_
+
+#include "aidge/backend/cpu/operator/EqualImpl.hpp"
+#include "aidge/utils/Registrar.hpp"
+
+namespace Aidge {
+
+namespace {
+// suppose values are contiguous in memory
+template <class I, class O>
+void equal_contiguous_arrays(const std::size_t input1size,
+                            const std::size_t input2size,
+                            const std::size_t output1size,
+                            const I* input1,
+                            const I* input2,
+                            O* output)
+{
+    for (std::size_t i = 0; i < output1size; ++i)
+    {
+        const std::size_t in1_id = (input1size != 1) ? i : 0;
+        const std::size_t in2_id = (input2size != 1) ? i : 0;
+        output[i] = static_cast<O>(input1[in1_id] == input2[in2_id]);
+    }
+}
+}  // namespace
+
+
+template <class I, class O>
+void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
+                                std::vector<std::size_t> dims1,
+                                const std::vector<std::size_t>& outputDims,
+                                const void* input0_,
+                                const void* input1_,
+                                void* output_) {
+
+    const I* input_0 = static_cast<const I*>(input0_);
+    const I* input_1 = static_cast<const I*>(input1_);
+    O* output = static_cast<O*>(output_);
+
+    // [5,2,1,7] & [2,6,7]
+    // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7]
+    // 2. Find the highest equal dimension -> 3
+    //    Exception: if the first diverging dimension is the last one, then -> 4 (dims.size())
+    // 3. Compute the highest number of contiguous data -> 7
+    // 4. Compute stride and offset step for the broadcast mechanism
+    // 5. Call a simple kernel
+
+    // special case for equal dimensions, the kernel is called with the entire arrays at once
+    if (dims0 == dims1) {
+        const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+        for (std::size_t i = 0; i < input0_contiguous_size; ++i)
+        {
+            output[i] = static_cast<O>(input_0[i] == input_1[i]);
+        }
+        return;
+    }
+
+    // set dimensions to be of equal size by filling the smallest one with ones.
+    if (dims0.size() > dims1.size()) {
+        dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
+    }
+    else if (dims1.size() > dims0.size()) {
+        dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
+    }
+
+    const std::size_t nbDims = dims0.size();
+
+    // Find the highest equal dimension
+    // std::size_t contiguousIdx = nbDims - 1;
+    std::size_t contiguousIdx = nbDims;
+    while (contiguousIdx-- > 0) {
+    // for (; contiguousIdx+1 > 0; --contiguousIdx) {
+        if (dims0[contiguousIdx] != dims1[contiguousIdx]) {
+            if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1
+                const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1;
+                while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) {
+                    --contiguousIdx;
+                }
+            }
+            break;
+        }
+    }
+    ++contiguousIdx;
+
+    // Compute the highest number of contiguous data for each Tensor
+    const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+
+    // initialize strides to iterate through data because of broadcasting
+    std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    if (contiguousIdx > 0) {
+        stride_post0[contiguousIdx - 1] = 1;
+        stride_post1[contiguousIdx - 1] = 1;
+        for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) {
+            stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]);
+            stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]);
+        }
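+        // stride_step is +1 on a regular axis; on a broadcast (size-1) axis it
+        // is (1 - stride_post), i.e. a backward jump that re-reads the same
+        // data instead of advancing past it.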
+        for (std::size_t i = 0; i != contiguousIdx; ++i) {
+            stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1;
+            stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1;
+        }
+    }
+
+    // variables for arrays offsets
+    std::size_t offsetIn0 = 0;
+    std::size_t offsetIn1 = 0;
+    std::size_t offsetOut = 0;
+
+    std::size_t dim = contiguousIdx - 1;
+    const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
+    for (std::size_t stack = 0; stack < nbStacks;) {
+        equal_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
+                    input_0 + offsetIn0*input0_contiguous_size,
+                    input_1 + offsetIn1*input1_contiguous_size,
+                    output + offsetOut*output_contiguous_size);
+        if (++stack < nbStacks) {
+            std::size_t tmp_stack = stack;
+            while(tmp_stack % outputDims[dim] == 0) {
+                tmp_stack /= outputDims[dim];
+                dim--;
+            }
+            offsetIn0 += stride_step0[dim];
+            offsetIn1 += stride_step1[dim];
+            ++offsetOut;
+            dim = contiguousIdx - 1;
+        }
+    }
+}
+
+// Kernels registration to implementation entry point
+REGISTRAR(EqualImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float32}},
+    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<float, float>, nullptr});
+REGISTRAR(EqualImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}},
+    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<double, double>, nullptr});
+REGISTRAR(EqualImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}},
+    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr});
+REGISTRAR(EqualImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int64}},
+    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, nullptr});
+
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_EQUALIMPL_KERNELS_H_ */
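To make the broadcast bookkeeping concrete, here is the [5,2,1,7] vs [2,6,7] example from the kernel's header comment worked through by hand (the numbers follow from the code above; this walkthrough is editorial, not part of the patch):

    // dims0 = {5,2,1,7}, dims1 = {2,6,7}, outputDims = {5,2,6,7}
    // 1. pad to equal rank:      dims0 = {5,2,1,7}, dims1 = {1,2,6,7}
    // 2. scanning from the back, axis 3 matches (7 == 7) and axis 2 diverges
    //    (1 vs 6), so contiguousIdx = 3
    // 3. contiguous block sizes: input0 -> 7, input1 -> 7, output -> 7
    // 4. nbStacks = 5 * 2 * 6 = 60 contiguous blocks of 7 elements each
    // 5. stride_step advances offsetIn0/offsetIn1 per stack, rewinding on the
    //    size-1 axes so their data is reused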
diff --git a/src/operator/EqualImpl.cpp b/src/operator/EqualImpl.cpp
new file mode 100644
index 00000000..5926212e
--- /dev/null
+++ b/src/operator/EqualImpl.cpp
@@ -0,0 +1,61 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <cassert>
+#include <chrono>  // std::chrono::milliseconds
+#include <numeric> // std::accumulate
+#include <thread>  // std::this_thread::sleep_for
+#include <vector>
+
+#include "aidge/operator/Equal.hpp"
+#include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/data/Broadcasting.hpp"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+
+#include "aidge/backend/cpu/operator/EqualImpl.hpp"
+#include "aidge/backend/cpu/operator/EqualImpl_kernels.hpp"
+
+template <>
+void Aidge::EqualImpl_cpu::forward() {
+    const Equal_Op& op = static_cast<const Equal_Op&>(mOp);
+    // Check inputs
+    AIDGE_ASSERT(op.getInput(0), "missing input in Equal operator");
+    AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Equal forward because the 0-th input has no implementation.");
+
+    AIDGE_ASSERT(op.getInput(1), "missing input in Equal operator");
+    AIDGE_ASSERT(op.getInput(1)->hasImpl(), "cannot run Equal forward because the 1st input has no implementation.");
+
+    AIDGE_ASSERT(op.getInput(1)->dataType() == op.getInput(0)->dataType(), "Cannot Equal inputs with two different data types.");
+
+    // Find the correct kernel type
+    const auto impl = Registrar<EqualImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Convert input data (no overhead if not needed!)
+    // TODO: right now, if needed, memory will be allocated/deallocated at each
+    // call to forward(). We might put the following shared_ptr as members of
+    // this class to avoid that.
+    std::shared_ptr<Tensor> input0Fallback, input1Fallback;
+    const auto& input0 = op.getInput(0)->refCastFrom(input0Fallback, *op.getInput(0));
+    const auto& input1 = op.getInput(1)->refCastFrom(input1Fallback, *op.getInput(1));
+
+    impl.forward(op.getInput(0)->dims(),
+                op.getInput(1)->dims(),
+                op.getOutput(0)->dims(),
+                input0.getImpl()->rawPtr(),
+                input1.getImpl()->rawPtr(),
+                getCPUPtr(op.getRawOutput(0)));
+}
+
+template <>
+void Aidge::EqualImpl_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Equal_Op on backend cpu");
+}
diff --git a/unit_tests/operator/Test_EqualImpl.cpp b/unit_tests/operator/Test_EqualImpl.cpp
new file mode 100644
index 00000000..a229b8ce
--- /dev/null
+++ b/unit_tests/operator/Test_EqualImpl.cpp
@@ -0,0 +1,205 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <catch2/catch_test_macros.hpp>
+#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
+
+#include "aidge/data/Tensor.hpp"
+#include "aidge/operator/Equal.hpp"
+
+#include "aidge/backend/cpu.hpp"
+
+using namespace Aidge;
+
+TEST_CASE("[cpu/operator] Equal(forward)", "[Equal][CPU]") {
+    SECTION("ForwardDims") {
+        constexpr std::uint16_t NBTRIALS = 10;
+        // Create a random number generator
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
+        std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
+        std::uniform_int_distribution<int> boolDist(0,1);
+
+        SECTION("Same dimensions") {
+            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+                DimSize_t nbDims = nbDimsDist(gen);
+                std::vector<DimSize_t> dims(nbDims);
+                for (std::size_t i = 0; i < nbDims; i++) {
+                    dims[i] = dimSizeDist(gen);
+                }
+
+                std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims);
+                myInput1->setBackend("cpu");
+                myInput1->setDataType(DataType::Float32);
+                myInput1->zeros();
+                std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims);
+                myInput2->setBackend("cpu");
+                myInput2->setDataType(DataType::Float32);
+                myInput2->zeros();
+                std::shared_ptr<Node> myEqual = Equal();
+                auto op = std::static_pointer_cast<OperatorTensor>(myEqual->getOperator());
+                op->associateInput(0, myInput1);
+                op->associateInput(1, myInput2);
+                op->setDataType(DataType::Float32);
+                op->setBackend("cpu");
+                op->forwardDims();
+
+                const auto outputDims = op->getOutput(0)->dims();
+                REQUIRE(outputDims == dims);
+            }
+        }
+        SECTION("Broadcasting") {
+            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+                DimSize_t nbDims = nbDimsDist(gen);
+                std::vector<DimSize_t> dims1(nbDims, 1);
+                std::vector<DimSize_t> dims2(nbDims, 1);
+                std::vector<DimSize_t> expectedOutDims;
+                for (std::size_t i = 0; i < nbDims; i++) {
+                    DimSize_t dim = dimSizeDist(gen);
+                    if (boolDist(gen)) {
+                        dims1[i] = dim;
+                    }
+                    if (boolDist(gen)) {
+                        dims2[i] = dim;
+                    }
+                    expectedOutDims.push_back(std::max(dims1[i],dims2[i]));
+                }
+
+
+                std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims1);
+                myInput1->setBackend("cpu");
+                myInput1->setDataType(DataType::Float32);
+                myInput1->zeros();
+                std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims2);
+                myInput2->setBackend("cpu");
+                myInput2->setDataType(DataType::Float32);
+                myInput2->zeros();
+                std::shared_ptr<Node> myEqual = Equal();
+                auto op = std::static_pointer_cast<OperatorTensor>(myEqual->getOperator());
+                op->associateInput(0, myInput1);
+                op->associateInput(1, myInput2);
+                op->setDataType(DataType::Float32);
+                op->setBackend("cpu");
+
+                op->forwardDims();
+
+                const auto outputDims = op->getOutput(0)->dims();
+                REQUIRE(outputDims == expectedOutDims);
+            }
+        }
+    }
+    SECTION("Same size inputs") {
+        std::shared_ptr<Tensor> input1 = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
+        {                                       //
+            {                                   //
+                {{20, 15},{31, 11},{22, 49}},   //
+                {{41, 10},{24, 51},{27, 52}},   //
+                {{26, 53},{27, 54},{28, 55}}    //
+            },                                  //
+            {                                   //
+                {{29, 56},{30, 57},{31, 58}},   //
+                {{32, 59},{33, 60},{34, 61}},   //
+                {{35, 62},{36, 63},{37, 64}}    //
+            },                                  //
+            {                                   //
+                {{38, 65},{39, 66},{40, 67}},   //
+                {{41, 68},{42, 69},{43, 70}},   //
+                {{44, 71},{45, 72},{46, 73}}    //
+            }                                   //
+        }                                       //
+    });                                         //
+        std::shared_ptr<Tensor> input2 = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
+            {                                       //
+                {                                   //
+                    {{20, 47},{21, 48},{22, 49}},   //
+                    {{23, 50},{24, 51},{25, 52}},   //
+                    {{17, 53},{27, 26},{14, 33}}    //
+                },                                  //
+                {                                   //
+                    {{29, 56},{30, 57},{31, 58}},   //
+                    {{72, 44},{33, 20},{27, 55}},   //
+                    {{35, 24},{25, 63},{28, 64}}    //
+                },                                  //
+                {                                   //
+                    {{32, 65},{39, 66},{40, 70}},   //
+                    {{41, 53},{42, 60},{34, 70}},   //
+                    {{44, 71},{30, 12},{46, 73}}    //
+                }                                   //
+            }                                       //
+        });                                         //
+        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
+            {
+                {
+                    {{1, 0},{0, 0},{1, 1}},
+                    {{0, 0},{1, 1},{0, 1}},
+                    {{0, 1},{1, 0},{0, 0}}
+                },
+                {
+                    {{1, 1},{1, 1},{1, 1}},
+                    {{0, 0},{1, 0},{0, 0}},
+                    {{1, 0},{0, 1},{0, 1}}
+                },
+                {
+                    {{0, 1},{1, 1},{1, 0}},
+                    {{1, 0},{1, 0},{0, 1}},
+                    {{1, 1},{0, 0},{1, 1}}
+                }
+            }
+        });
+
+        std::shared_ptr<Node> myEqual = Equal();
+        auto op = std::static_pointer_cast<OperatorTensor>(myEqual->getOperator());
+        op->associateInput(0, input1);
+        op->associateInput(1, input2);
+        op->setBackend("cpu");
+        op->setDataType(DataType::Int32);
+        myEqual->forward();
+
+        REQUIRE(*(op->getOutput(0)) == *expectedOutput);
+    }
+
+    SECTION("Broadcasting") {
+        std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<int,1,3,3,2> {
+        {                                       //
+            {                                   //
+                {{10, 20},{22, 23},{20, 20}},   //
+                {{10, 15},{10, 29},{20, 20}},   //
+                {{26, 25},{33, 20},{10, 20}}    //
+            }                                   //
+        }                                       //
+        });                                     //
+
+        std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<int,2> {{10, 20}});
+        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,1,3,3,2> {
+            {                                   //
+                {                               //
+                    {{ 1, 1},{ 0, 0},{ 0, 1}},  //
+                    {{ 1, 0},{ 1, 0},{ 0, 1}},  //
+                    {{ 0, 0},{ 0, 1},{ 1, 1}}   //
+                }                               //
+            }                                   //
+        });                                     //
+
+        std::shared_ptr<Node> myEqual = Equal();
+        auto op = std::static_pointer_cast<OperatorTensor>(myEqual->getOperator());
+        op->associateInput(0, input_1);
+        op->associateInput(1, input_2);
+        op->setDataType(DataType::Int32);
+        op->setBackend("cpu");
+        myEqual->forward();
+        REQUIRE(*op->getOutput(0) == *expectedOutput);
+    }
+}
\ No newline at end of file
-- 
GitLab


From 8701618c638707e45478b781a78a1d64ec16f407 Mon Sep 17 00:00:00 2001
From: hrouis <houssemeddine.rouis92@gmail.com>
Date: Fri, 24 Jan 2025 16:07:13 +0100
Subject: [PATCH 05/22] fix And operator

---
 .../backend/cpu/operator/AndImpl_kernels.hpp  |  29 ++-
 unit_tests/operator/Test_AndImpl.cpp          | 191 +++++++++---------
 2 files changed, 108 insertions(+), 112 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/AndImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AndImpl_kernels.hpp
index 73b710e0..d7c8ebcf 100644
--- a/include/aidge/backend/cpu/operator/AndImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/AndImpl_kernels.hpp
@@ -20,7 +20,7 @@ namespace Aidge {
 namespace {
 // suppose values are contiguous in memory
 template <class I, class O>
-void equal_contiguous_arrays(const std::size_t input1size,
+void and_contiguous_arrays(const std::size_t input1size,
                             const std::size_t input2size,
                             const std::size_t output1size,
                             const I* input1,
@@ -31,14 +31,14 @@ void equal_contiguous_arrays(const std::size_t input1size,
     {
         const std::size_t in1_id = (input1size != 1) ? i : 0;
         const std::size_t in2_id = (input2size != 1) ? i : 0;
-        output[i] = static_cast<O>(input1[in1_id] == input2[in2_id]);
+        output[i] = static_cast<O>(input1[in1_id] && input2[in2_id]);
     }
 }
 }
 
 
 template <class I, class O>
-void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
+void AndImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
                                 std::vector<std::size_t> dims1,
                                 const std::vector<std::size_t>& outputDims,
                                 const void* input0_,
@@ -60,9 +60,8 @@ void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
     // special case for equal dimensions, the kernel is called with the entire arrays at once
     if (dims0 == dims1) {
         const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
-        for (std::size_t i = 0; i < input0_contiguous_size; ++i)
-        {
-            output[i] = static_cast<O>(input_0[i] == input_1[i]);
+        for (std::size_t i = 0; i < input0_contiguous_size; ++i) {
+            output[i] = static_cast<O>(input_0[i] && input_1[i]);
         }
         return;
     }
@@ -126,7 +125,7 @@ void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
     std::size_t dim = contiguousIdx - 1;
     const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
     for (std::size_t stack = 0; stack < nbStacks;) {
-        equal_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
+        and_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
                     input_0 + offsetIn0*input0_contiguous_size,
                     input_1 + offsetIn1*input1_contiguous_size,
                     output + offsetOut*output_contiguous_size);
@@ -146,17 +145,17 @@ void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
 
 // Kernels registration to implementation entry point
 REGISTRAR(AndImpl_cpu,
-    {DataType::Float32},
-    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<float, float>, nullptr});
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float32}},
+    {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<float, float>, nullptr});
 REGISTRAR(AndImpl_cpu,
-    {DataType::Float64},
-    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<double, double>, nullptr});
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}},
+    {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<double, double>, nullptr});
 REGISTRAR(AndImpl_cpu,
-    {DataType::Int32},
-    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr});
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}},
+    {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr});
 REGISTRAR(AndImpl_cpu,
-    {DataType::Int64},
-    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, nullptr});
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int64}},
+    {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, nullptr});
 
 }  // namespace Aidge
 
diff --git a/unit_tests/operator/Test_AndImpl.cpp b/unit_tests/operator/Test_AndImpl.cpp
index c2309dce..978a89e5 100644
--- a/unit_tests/operator/Test_AndImpl.cpp
+++ b/unit_tests/operator/Test_AndImpl.cpp
@@ -26,75 +26,92 @@
 using namespace Aidge;
 
 TEST_CASE("[cpu/operator] And(forward)", "[And][CPU]") {
-        SECTION("ForwardDims")
-    {
+    SECTION("ForwardDims") {
         constexpr std::uint16_t NBTRIALS = 10;
         // Create a random number generator
         std::random_device rd;
         std::mt19937 gen(rd());
-        std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0 and 1
-        std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
-        std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
-        std::uniform_int_distribution<int> boolDist(0,1);
+        std::uniform_int_distribution<int> boolDist(0, 1); // Use 0 for false, 1 for true
+        std::uniform_int_distribution<std::size_t> dimSizeDist(2, 10);
+        std::uniform_int_distribution<std::size_t> nbDimsDist(1, 5);
 
         SECTION("Same dimensions") {
             for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
                 DimSize_t nbDims = nbDimsDist(gen);
                 std::vector<DimSize_t> dims(nbDims);
-                for (std::size_t i = 0; i < nbDims; i++) {
+                for (std::size_t i = 0; i < nbDims; ++i) {
                     dims[i] = dimSizeDist(gen);
                 }
-
+                const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                float* array0 = new float[nb_elements];
+                float* array1 = new float[nb_elements];
+                for (std::size_t i = 0; i < nb_elements; ++i) {
+                    array0[i] = boolDist(gen);
+                    array1[i] = boolDist(gen);
+                }
                 std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims);
-                myInput1->setBackend("cpu");
-                myInput1->setDataType(DataType::Float32);
-                myInput1->zeros();
                 std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims);
-                myInput2->setBackend("cpu");
+                myInput1->setDataType(DataType::Float32);
                 myInput2->setDataType(DataType::Float32);
-                myInput2->zeros();
+                myInput1->setBackend("cpu");
+                myInput2->setBackend("cpu");
+
+                myInput1->getImpl()->setRawPtr(array0, nb_elements);
+                myInput2->getImpl()->setRawPtr(array1, nb_elements);
+
                 std::shared_ptr<Node> myAnd = And();
-                auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator());
-                op->associateInput(0,myInput1);
-                op->associateInput(1,myInput2);
+                auto op = std::static_pointer_cast<OperatorTensor>(myAnd->getOperator());
+                op->associateInput(0, myInput1);
+                op->associateInput(1, myInput2);
                 op->setDataType(DataType::Float32);
                 op->setBackend("cpu");
                 op->forwardDims();
 
                 const auto outputDims = op->getOutput(0)->dims();
                 REQUIRE(outputDims == dims);
+                delete[] array0;
+                delete[] array1;
             }
         }
+
         SECTION("Broadcasting") {
             for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
                 DimSize_t nbDims = nbDimsDist(gen);
                 std::vector<DimSize_t> dims1(nbDims, 1);
                 std::vector<DimSize_t> dims2(nbDims, 1);
                 std::vector<DimSize_t> expectedOutDims;
-                for (std::size_t i = 0; i < nbDims; i++) {
+                for (std::size_t i = 0; i < nbDims; ++i) {
                     DimSize_t dim = dimSizeDist(gen);
-                    if (boolDist(gen)) {
-                        dims1[i] = dim;
-                    }
-                    if (boolDist(gen)) {
-                        dims2[i] = dim;
-                    }
-                    expectedOutDims.push_back(std::max(dims1[i],dims2[i]));
+                    if (boolDist(gen)) dims1[i] = dim;
+                    if (boolDist(gen)) dims2[i] = dim;
+                    expectedOutDims.push_back(std::max(dims1[i], dims2[i]));
                 }
 
+                const std::size_t nb_elements0 = std::accumulate(dims1.cbegin(), dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                const std::size_t nb_elements1 = std::accumulate(dims2.cbegin(), dims2.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                float* array0 = new float[nb_elements0];
+                float* array1 = new float[nb_elements1];
+                for (std::size_t i = 0; i < nb_elements0; ++i) {
+                    array0[i] = boolDist(gen);
+                }
+                for (std::size_t i = 0; i < nb_elements1; ++i) {
+                    array1[i] = boolDist(gen);
+                }
 
                 std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims1);
-                myInput1->setBackend("cpu");
-                myInput1->setDataType(DataType::Float32);
-                myInput1->zeros();
                 std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims2);
-                myInput2->setBackend("cpu");
+                myInput1->setDataType(DataType::Float32);
                 myInput2->setDataType(DataType::Float32);
-                myInput2->zeros();
+                myInput1->setBackend("cpu");
+                myInput2->setBackend("cpu");
+                myInput1->getImpl()->setRawPtr(array0, nb_elements0);
+                myInput2->getImpl()->setRawPtr(array1, nb_elements1);
+
                 std::shared_ptr<Node> myAnd = And();
-                auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator());
-                op->associateInput(0,myInput1);
-                op->associateInput(1,myInput2);
+                auto op = std::static_pointer_cast<OperatorTensor>(myAnd->getOperator());
+                op->associateInput(0, myInput1);
+                op->associateInput(1, myInput2);
                 op->setDataType(DataType::Float32);
                 op->setBackend("cpu");
 
@@ -102,80 +119,48 @@ TEST_CASE("[cpu/operator] And(forward)", "[And][CPU]") {
 
                 const auto outputDims = op->getOutput(0)->dims();
                 REQUIRE(outputDims == expectedOutDims);
+                delete[] array0;
+                delete[] array1;
             }
         }
     }
+
     SECTION("Same size inputs") {
-        std::shared_ptr<Tensor> input1 = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
-        {                                       //
-            {                                   //
-                {{20, 15},{31, 11},{22, 49}},   //
-                {{41, 10},{24, 51},{27, 52}},   //
-                {{26, 53},{27, 54},{28, 55}}    //
-            },                                  //
-            {                                   //
-                {{29, 56},{30, 57},{31, 58}},   //
-                {{32, 59},{33, 60},{34, 61}},   //
-                {{35, 62},{36, 63},{37, 64}}    //
-            },                                  //
-            {                                   //
-                {{38, 65},{39, 66},{40, 67}},   //
-                {{41, 68},{42, 69},{43, 70}},   //
-                {{44, 71},{45, 72},{46, 73}}    //
-            }                                   //
-        }                                       //
-    });                                         //
-        std::shared_ptr<Tensor> input2 = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
-            {                                       //
-                {                                   //
-                    {{20, 47},{21, 48},{22, 49}},   //
-                    {{23, 50},{24, 51},{25, 52}},   //
-                    {{17, 53},{27, 26},{14, 33}}    //
-                },                                  //
-                {                                   //
-                    {{29, 56},{30, 57},{31, 58}},   //
-                    {{72, 44},{33, 20},{27, 55}},   //
-                    {{35, 24},{25, 63},{28, 64}}    //
-                },                                  //
-                {                                   //
-                    {{32, 65},{39, 66},{40, 70}},   //
-                    {{41, 53},{42, 60},{34, 70}},   //
-                    {{44, 71},{30, 12},{46, 73}}    //
-                }                                   //
-            }                                       //
-        });                                         //
-        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
+        std::shared_ptr<Tensor> input1 = std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 2>{
             {
-                {
-                    {{1, 0},{0, 0},{1, 1}},
-                    {{0, 0},{1, 1},{0, 1}},
-                    {{0, 1},{1, 0},{0, 0}}
-                },
-                {
-                    {{1, 1},{1, 1},{1, 1}},
-                    {{0, 0},{1, 0},{0, 0}},
-                    {{1, 0},{0, 1},{0, 1}}
-                },
-                {
-                    {{0, 1},{1, 1},{1, 0}},
-                    {{1, 0},{1, 0},{0, 1}},
-                    {{1, 1},{0, 0},{1, 1}}
-                }
-            }
-        });
+                {{{1, 0}, {0, 1}},
+                {{1, 1}, {0, 0}}},
+                {{{0, 1}, {1, 0}},
+                {{1, 0}, {0, 1}}}}
+            });
+        std::shared_ptr<Tensor> input2 = std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 2>{
+            {
+                {{{1, 1}, {0, 0}},
+                {{0, 1}, {1, 1}}},
+                {{{1, 1}, {0, 0}},
+                {{0, 1}, {1, 0}}}}
+            });
+        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 2>{
+            {
+                {{{1, 0}, {0, 0}},
+                {{0, 1}, {0, 0}}},
+                {{{0, 1}, {0, 0}},
+                {{0, 0}, {0, 0}}}}
+            });
 
         std::shared_ptr<Node> myAnd = And();
-        auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator());
+        auto op = std::static_pointer_cast<OperatorTensor>(myAnd->getOperator());
         op->associateInput(0, input1);
         op->associateInput(1, input2);
         op->setBackend("cpu");
-        op->setDataType(DataType::Int32);
+        op->setDataType(DataType::Float32);
         myAnd->forward();
-
         REQUIRE(*(op->getOutput(0)) == *expectedOutput);
     }
 
     SECTION("Broadcasting") {
-        std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<int,1,3,3,2> {
-        {                                       //
-            {                                   //
@@ -196,16 +181,28 @@ TEST_CASE("[cpu/operator] And(forward)", "[And][CPU]") {
-                }                               //
-            }                                   //
-        });                                     //
+        std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<float, 1, 2, 2, 2>{
+            {
+                {{{1, 0}, {1, 0}},
+                {{1, 1}, {0, 0}}}}
+            });
+        std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<float, 2>{{1, 0}});
+        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<float, 1, 2, 2, 2>{
+            {
+                {{{1, 0}, {1, 0}},
+                {{1, 0}, {0, 0}}}}
+            });
 
         std::shared_ptr<Node> myAnd = And();
-        auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator());
+        auto op = std::static_pointer_cast<OperatorTensor>(myAnd->getOperator());
         op->associateInput(0, input_1);
         op->associateInput(1, input_2);
-        op->setDataType(DataType::Int32);
+        op->setDataType(DataType::Float32);
         op->setBackend("cpu");
         myAnd->forward();
-        op->getOutput(0)->print();
-        expectedOutput->print();
-        REQUIRE(*op->getOutput(0) == *expectedOutput);
+
+        REQUIRE(*(op->getOutput(0)) == *expectedOutput);
     }
-}
\ No newline at end of file
+}
-- 
GitLab


From 0fbfb571b52f38f02c1a691cecac20d3843cbf22 Mon Sep 17 00:00:00 2001
From: hrouis <houssemeddine.rouis92@gmail.com>
Date: Fri, 24 Jan 2025 16:08:17 +0100
Subject: [PATCH 06/22] add dilations to maxpool

---
 .../backend/cpu/operator/MaxPoolingImpl.hpp   |   1 +
 .../cpu/operator/MaxPoolingImpl_kernels.hpp   | 126 ++----------------
 src/operator/MaxPoolingImpl.cpp               |   1 +
 unit_tests/operator/Test_MaxPoolingImpl.cpp   |  35 +++++
 4 files changed, 49 insertions(+), 114 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/MaxPoolingImpl.hpp b/include/aidge/backend/cpu/operator/MaxPoolingImpl.hpp
index 68cc3621..062088a1 100644
--- a/include/aidge/backend/cpu/operator/MaxPoolingImpl.hpp
+++ b/include/aidge/backend/cpu/operator/MaxPoolingImpl.hpp
@@ -28,6 +28,7 @@ namespace Aidge {
 using MaxPooling2D_Op = MaxPooling_Op<2>;
 using MaxPoolingImpl2D_cpu = OperatorImpl_cpu<MaxPooling_Op<2>,
     void(const std::array<DimSize_t, 2>&,
+                            const std::array<DimSize_t, 2>&,
                             const std::array<DimSize_t, 2>&,
                             const bool,
                             const std::array<DimSize_t, 4> &,
diff --git a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
index 7b6f04f1..250b11b0 100644
--- a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
@@ -35,28 +35,23 @@ namespace Aidge {
 template <class I, class O>
 void MaxPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
                                         const std::array<DimSize_t, 2>& kernelDims,
+                                        const std::array<DimSize_t, 2>& dilations,
                                         const bool /*ceilMode*/,
                                         const std::array<DimSize_t, 4> &dims,
                                         const void *input_,
                                         void *output_) {
-    // FIXME: missing convolution parameters as arguments
     const I *input = static_cast<const I *>(input_);
     O *output = static_cast<O *>(output_);
 
     // output H size
     const std::size_t oxSize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(dims[2] - kernelDims[0] + strideDims[0]) /
+            static_cast<std::size_t>(std::floor(static_cast<float>(dims[2] - (kernelDims[0] - 1) * dilations[0] - 1 + strideDims[0]) /
                                 static_cast<float>(strideDims[0])));
     // output W size
     const std::size_t oySize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(dims[3] - kernelDims[1] + strideDims[1]) /
+            static_cast<std::size_t>(std::floor(static_cast<float>(dims[3] - (kernelDims[1] - 1) * dilations[1] - 1 + strideDims[1]) /
                                 static_cast<float>(strideDims[1])));
 
-    // TODO: kernel computation
-    // output (batch, outCh, Xout, Yout)
-    // input  (batch, ch, Xin, Yin)
-    // weight (outCh, ch, kernelX, kernelY)
-    // does not take Dilation parameter into account
     using signedsize = std::make_signed<std::size_t>::type;
     for (std::size_t batch = 0; batch < dims[0]; ++batch) {
         for (std::size_t ch = 0; ch < dims[1]; ++ch) {
@@ -77,12 +72,15 @@ void MaxPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideD
                     I poolValue(0.0);
                     bool valid = false;
 
-                    for (unsigned int channel = 0; channel < dims[1];
-                            ++channel){
-                        for (unsigned int sy = syMin; sy < syMax; ++sy) {
-                            for (unsigned int sx = sxMin; sx < sxMax; ++sx)
-                            {
-                                const I value = input[iIndex + (ix+sx)*dims[3] + (iy+sy)];
+                    for (unsigned int sy = syMin; sy < syMax; ++sy) {
+                        for (unsigned int sx = sxMin; sx < sxMax; ++sx) {
+                            // Apply dilation factor to kernel indices
+                            const std::size_t dilated_sx = sx * dilations[0];
+                            const std::size_t dilated_sy = sy * dilations[1];
+
+                            // Ensure indices are within bounds
+                            if ((ix + dilated_sx) < dims[2] && (iy + dilated_sy) < dims[3]) {
+                                const I value = input[iIndex + (ix + dilated_sx) * dims[3] + (iy + dilated_sy)];
 
                                 if (!valid || value > poolValue) {
                                     poolValue = value;
@@ -98,106 +96,6 @@ void MaxPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideD
     }
 }
 
-//N2D2 version
-/*
-template <class T>
-void N2D2::PoolCell_Frame_Kernels::forwardMax(const T* alpha,
-                                              const Tensor<T>&
-                                              inputs,
-                                              const Descriptor& desc,
-                                              const T* beta,
-                                              Tensor<T>& outputs,
-                                              Tensor<ArgMax>& argMax,
-                                              bool useArgMax,
-                                              const Tensor<bool>& maps)
-{
-    const unsigned int size = inputs.dimB() * outputs.dimZ();
-
-#if defined(_OPENMP) && _OPENMP >= 200805
-#pragma omp parallel for collapse(2) if (size > 16)
-#else
-#pragma omp parallel for if (inputs.dimB() > 4 && size > 16)
-#endif
-    for (int batchPos = 0; batchPos < (int)inputs.dimB(); ++batchPos) {
-        for (unsigned int output = 0; output < outputs.dimZ(); ++output) {
-            for (unsigned int oy = 0; oy < outputs.dimY(); ++oy) {
-                for (unsigned int ox = 0; ox < outputs.dimX(); ++ox) {
-                    const unsigned int sxMin = (unsigned int)std::max(
-                        desc.padding[0] - (int)(ox * desc.stride[0]), 0);
-                    const unsigned int syMin = (unsigned int)std::max(
-                        desc.padding[1] - (int)(oy * desc.stride[1]), 0);
-                    const unsigned int sxMax = Utils::clamp
-                        <int>(inputs.dimX() + desc.padding[0] - ox * desc.stride[0],
-                              0,
-                              desc.pool[0]);
-                    const unsigned int syMax = Utils::clamp
-                        <int>(inputs.dimY() + desc.padding[1] - oy * desc.stride[1],
-                              0,
-                              desc.pool[1]);
-
-                    const int ix = (int)(ox * desc.stride[0]) - desc.padding[0];
-                    const int iy = (int)(oy * desc.stride[1]) - desc.padding[1];
-
-                    T poolValue(0.0);
-
-                    // For each output, compute the pool value
-                    if (useArgMax) {
-                        const ArgMax inputMax
-                            = argMax(ox, oy, output, batchPos);
-
-                        if (inputMax.valid) {
-                            poolValue = inputs(inputMax.ix,
-                                               inputMax.iy,
-                                               inputMax.channel,
-                                               batchPos);
-                        }
-                    }
-                    else {
-                        unsigned int ixMax = 0;
-                        unsigned int iyMax = 0;
-                        unsigned int channelMax = 0;
-                        bool valid = false;
-
-                        for (unsigned int channel = 0; channel < inputs.dimZ();
-                             ++channel)
-                        {
-                            if (!maps.empty() && !maps(output, channel))
-                                continue;
-
-                            for (unsigned int sy = syMin; sy < syMax; ++sy) {
-                                for (unsigned int sx = sxMin; sx < sxMax; ++sx)
-                                {
-                                    const T value = inputs(ix + sx,
-                                                                 iy + sy,
-                                                                 channel,
-                                                                 batchPos);
-
-                                    if (!valid || value > poolValue) {
-                                        poolValue = value;
-                                        valid = true;
-
-                                        ixMax = ix + sx;
-                                        iyMax = iy + sy;
-                                        channelMax = channel;
-                                    }
-                                }
-                            }
-                        }
-
-                        argMax(ox, oy, output, batchPos)
-                            = ArgMax(ixMax, iyMax, channelMax, valid);
-                    }
-
-                    outputs(ox, oy, output, batchPos)
-                        = (*alpha) * poolValue
-                          + (*beta) * outputs(ox, oy, output, batchPos);
-                }
-            }
-        }
-    }
-}
-
-*/
 
 // Kernels registration to implementation entry point
 REGISTRAR(MaxPoolingImpl2D_cpu,
diff --git a/src/operator/MaxPoolingImpl.cpp b/src/operator/MaxPoolingImpl.cpp
index 90075a39..13ef75b0 100644
--- a/src/operator/MaxPoolingImpl.cpp
+++ b/src/operator/MaxPoolingImpl.cpp
@@ -30,6 +30,7 @@ void Aidge::MaxPoolingImpl2D_cpu::forward() {
     // Call kernel
     impl.forward(op_.strideDims(),
                 op_.kernelDims(),
+                op_.dilations(),
                 op_.ceilMode(),
                 op_.getInput(0)->template dims<4>(),
                 getCPUPtr(mOp.getRawInput(0)),
diff --git a/unit_tests/operator/Test_MaxPoolingImpl.cpp b/unit_tests/operator/Test_MaxPoolingImpl.cpp
index de02df2b..6b7e6d2f 100644
--- a/unit_tests/operator/Test_MaxPoolingImpl.cpp
+++ b/unit_tests/operator/Test_MaxPoolingImpl.cpp
@@ -80,4 +80,39 @@ TEST_CASE("[cpu/operator] MaxPooling(forward)", "[MaxPooling][CPU]") {
         op->getOutput(0)->print();
         REQUIRE(*(op->getOutput(0)) == myOutput);
     }
+    SECTION("Dilation") {
+        std::shared_ptr<Node> myMaxPool = MaxPooling({2,2}, "mycdw", {2,2}, {2,2}); // Dilation 2x2
+        auto op = std::static_pointer_cast<OperatorTensor>(myMaxPool -> getOperator());
+
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,2,2,2> {
+            {
+                {
+                    {
+                        {0.71470, 0.52770},
+                        {0.71470, 0.48740}
+                    },
+                    {
+                        {2.23290, 0.48590},
+                        {2.23290, 0.07000}
+                    }
+                },
+                {
+                    {
+                        {1.76530, 1.20710},
+                        {1.76530, 1.20710}
+                    },
+                    {
+                        {1.04290, 0.67760},
+                        {1.72170, 0.67760}
+                    }
+                }
+            }
+        });
+        myMaxPool->getOperator()->associateInput(0,myInput);
+        myMaxPool->getOperator()->setDataType(DataType::Float32);
+        myMaxPool->getOperator()->setBackend("cpu");
+        myMaxPool->forward();
+        op->getOutput(0)->print();
+        REQUIRE(*(op->getOutput(0)) == *myOutput);
+    }
 }
\ No newline at end of file
-- 
GitLab
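
For reference, a minimal standalone sketch of the floor-mode output-size rule this patch adopts (the helper name is illustrative, not part of the patch): with dilation d, the effective kernel extent becomes (k - 1) * d + 1.

#include <cmath>
#include <cstddef>
#include <cstdio>

std::size_t outSize(std::size_t in, std::size_t k, std::size_t s, std::size_t d) {
    // Effective kernel extent with dilation is (k - 1) * d + 1.
    return static_cast<std::size_t>(
        std::floor(static_cast<float>(in - (k - 1) * d - 1 + s) / static_cast<float>(s)));
}

int main() {
    // 5-wide input, kernel 2, stride 2, dilation 2: effective kernel 3 -> output 2.
    std::printf("%zu\n", outSize(5, 2, 2, 2));
    return 0;
}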


From 96cea9f4cbbb9d07d8d1cf2ac439c04d860613fa Mon Sep 17 00:00:00 2001
From: hrouis <houssemeddine.rouis92@gmail.com>
Date: Mon, 27 Jan 2025 15:21:08 +0100
Subject: [PATCH 07/22] add dilations and ceil_mode to AvgPooling

---
 .../backend/cpu/operator/AvgPoolingImpl.hpp   |  2 +
 .../cpu/operator/AvgPoolingImpl_kernels.hpp   | 76 ++++++++-----------
 src/operator/AvgPoolingImpl.cpp               |  2 +
 unit_tests/operator/Test_AndImpl.cpp          | 23 ------
 4 files changed, 36 insertions(+), 67 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/AvgPoolingImpl.hpp b/include/aidge/backend/cpu/operator/AvgPoolingImpl.hpp
index adea96ca..7c76657f 100644
--- a/include/aidge/backend/cpu/operator/AvgPoolingImpl.hpp
+++ b/include/aidge/backend/cpu/operator/AvgPoolingImpl.hpp
@@ -28,8 +28,10 @@ namespace Aidge {
 using AvgPooling2D_Op = AvgPooling_Op<2>;
 using AvgPoolingImpl2D_cpu = OperatorImpl_cpu<AvgPooling_Op<2>,
     void(const std::array<DimSize_t, 2>&,
+        const std::array<DimSize_t, 2>&,
         const std::array<DimSize_t, 2>&,
         const std::array<DimSize_t, 4>&,
+        bool,
         const void *,
         void *)>;
 
diff --git a/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
index f6da9dcb..68dbfbe7 100644
--- a/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
@@ -35,66 +35,54 @@ namespace Aidge {
 template <class I, class O>
 void AvgPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
                                         const std::array<DimSize_t, 2>& kernelDims,
+                                        const std::array<DimSize_t, 2>& dilations,
                                         const std::array<DimSize_t, 4> &dims,
+                                        bool ceilMode,
                                         const void *input_,
                                         void *output_) {
-    // FIXME: missing convolution attributes as arguments
     const I *input = static_cast<const I *>(input_);
     O *output = static_cast<O *>(output_);
 
+    // Calculate output dimensions based on ceilMode and dilations
+    auto compute_output_size = [&](DimSize_t inputDim, DimSize_t kernelDim, DimSize_t stride, DimSize_t dilation) {
+        DimSize_t effectiveKernelDim = (kernelDim - 1) * dilation + 1;
+        float result = static_cast<float>(inputDim - effectiveKernelDim + stride) / static_cast<float>(stride);
+        return ceilMode ? static_cast<DimSize_t>(std::ceil(result)) : static_cast<DimSize_t>(std::floor(result));
+    };
 
-    // output H size
-    const std::size_t oxSize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(dims[2] - kernelDims[0] + strideDims[0]) /
-                                static_cast<float>(strideDims[0])));
-    // output W size
-    const std::size_t oySize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(dims[3] - kernelDims[1] + strideDims[1]) /
-                                static_cast<float>(strideDims[1])));
+    const std::size_t oxSize = compute_output_size(dims[2], kernelDims[0], strideDims[0], dilations[0]);
+    const std::size_t oySize = compute_output_size(dims[3], kernelDims[1], strideDims[1], dilations[1]);
 
-    // TODO: kernel computation
-    // output (batch, outCh, Xout, Yout)
-    // input  (batch, ch, Xin, Yin)
-    // weight (outCh, ch, kernelX, kernelY)
-    // does not take Dilation attribute into account
     using signedsize = std::make_signed<std::size_t>::type;
+
     for (std::size_t batch = 0; batch < dims[0]; ++batch) {
         for (std::size_t ch = 0; ch < dims[1]; ++ch) {
-            const std::size_t oIndex = (ch + batch*dims[1]) * oxSize * oySize;
-            const std::size_t iIndex = (ch + batch*dims[1]) * dims[2] * dims[3];
-            std::fill(output + oIndex, output+(oIndex+oxSize*oySize), 0);
+            const std::size_t oIndex = (ch + batch * dims[1]) * oxSize * oySize;
+            const std::size_t iIndex = (ch + batch * dims[1]) * dims[2] * dims[3];
+            std::fill(output + oIndex, output + (oIndex + oxSize * oySize), 0);
+
             for (std::size_t ox = 0; ox < oxSize; ++ox) {
-                const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]);
-                const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
-                const std::size_t sxMax = (static_cast<signedsize>(dims[2]) + difx) < 0 ? 0 : ((dims[2] + difx) > kernelDims[0] ? kernelDims[0] : dims[2] + difx);
+                const signedsize startx = static_cast<signedsize>(ox * strideDims[0]) - (dilations[0] - 1);
+                const std::size_t sxMin = static_cast<std::size_t>(std::max(startx, signedsize(0)));
+                const std::size_t sxMax = std::min(dims[2], static_cast<std::size_t>(startx + kernelDims[0] * dilations[0]));
+
                 for (std::size_t oy = 0; oy < oySize; ++oy) {
-                    const signedsize dify = static_cast<signedsize>(- oy * strideDims[1]);
-                    const std::size_t syMin = static_cast<std::size_t>(std::max(dify, signedsize(0)));
-                    const std::size_t syMax = (static_cast<signedsize>(dims[3]) + dify) < 0 ? 0 : ((dims[3] + dify) > kernelDims[1] ? kernelDims[1] : dims[3] + dify);
-                    const std::size_t oIndexFull = oIndex + ox*oySize + oy;
-                    const std::size_t ix = ox * strideDims[0];
-                    const std::size_t iy = oy * strideDims[1];
+                    const signedsize starty = static_cast<signedsize>(oy * strideDims[1]) - (dilations[1] - 1);
+                    const std::size_t syMin = static_cast<std::size_t>(std::max(starty, signedsize(0)));
+                    const std::size_t syMax = std::min(dims[3], static_cast<std::size_t>(starty + kernelDims[1] * dilations[1]));
 
-                    if (sxMin == 0 && syMin == 0 && sxMax == 3 && syMax == 3) {
-                        output[oIndexFull] += static_cast<O>(
-                                               input[iIndex + (ix+0)*dims[3] + (iy+0)] +
-                                               input[iIndex + (ix+0)*dims[3] + (iy+1)] +
-                                               input[iIndex + (ix+0)*dims[3] + (iy+2)] +
-                                               input[iIndex + (ix+1)*dims[3] + (iy+0)] +
-                                               input[iIndex + (ix+1)*dims[3] + (iy+1)] +
-                                               input[iIndex + (ix+1)*dims[3] + (iy+2)] +
-                                               input[iIndex + (ix+2)*dims[3] + (iy+0)] +
-                                               input[iIndex + (ix+2)*dims[3] + (iy+1)] +
-                                               input[iIndex + (ix+2)*dims[3] + (iy+2)]) / O(9);
-                    } else {
-                        for (std::size_t sx = sxMin; sx < sxMax; ++sx) {
-                            for (std::size_t sy = syMin; sy < syMax; ++sy) {
-                                output[oIndexFull] += input[iIndex + (ix+sx)*dims[3] + (iy+sy)];
-                            }
+                    const std::size_t oIndexFull = oIndex + ox * oySize + oy;
+                    O sum = static_cast<O>(0);
+                    std::size_t count = 0;
+
+                    for (std::size_t sx = sxMin; sx < sxMax; sx += dilations[0]) {
+                        for (std::size_t sy = syMin; sy < syMax; sy += dilations[1]) {
+                            sum += static_cast<O>(input[iIndex + sx * dims[3] + sy]);
+                            ++count;
                         }
-                        // padding not used
-                        output[oIndexFull] /= (sxMax - sxMin) * (syMax - syMin);
                     }
+
+                    output[oIndexFull] = sum / static_cast<O>(count);
                 }
             }
         }
diff --git a/src/operator/AvgPoolingImpl.cpp b/src/operator/AvgPoolingImpl.cpp
index 01a5e8cf..eb5ef87b 100644
--- a/src/operator/AvgPoolingImpl.cpp
+++ b/src/operator/AvgPoolingImpl.cpp
@@ -32,7 +32,9 @@ void Aidge::AvgPoolingImpl2D_cpu::forward() {
     // Call kernel
     impl.forward(op_.strideDims(),
                op_.kernelDims(),
+               op_.dilations(),
                op_.getInput(0)->template dims<4>(),
+               op_.ceilMode(),
                getCPUPtr(op_.getInput(0)),
                getCPUPtr(op_.getOutput(0)));
 }
diff --git a/unit_tests/operator/Test_AndImpl.cpp b/unit_tests/operator/Test_AndImpl.cpp
index 978a89e5..148298d5 100644
--- a/unit_tests/operator/Test_AndImpl.cpp
+++ b/unit_tests/operator/Test_AndImpl.cpp
@@ -160,28 +160,6 @@ TEST_CASE("[cpu/operator] And(forward)", "[And][CPU]") {
     }
 
     SECTION("Broadcasting") {
-<<<<<<< HEAD
-        std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<int,1,3,3,2> {
-        {                                       //
-            {                                   //
-                {{10, 20},{22, 23},{20, 20}},   //
-                {{10, 15},{10, 29},{20, 20}},   //
-                {{26, 25},{33, 20},{10, 20}}    //
-            }                                   //
-        }                                       //
-        });                                     //
-
-        std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<int,2> {{10, 20}});
-        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,1,3,3,2> {
-            {                                   //
-                {                               //
-                    {{ 1, 1},{ 0, 0},{ 0, 1}},  //
-                    {{ 1, 0},{ 1, 0},{ 0, 1}},  //
-                    {{ 0, 0},{ 0, 1},{ 1, 1}}   //
-                }                               //
-            }                                   //
-        });                                     //
-=======
         std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<float, 1, 2, 2, 2>{
             {
                 {{{1, 0}, {1, 0}},
@@ -193,7 +171,6 @@ TEST_CASE("[cpu/operator] And(forward)", "[And][CPU]") {
                 {{{1, 0}, {1, 0}},
                 {{1, 0}, {0, 0}}}}
             });
->>>>>>> fix and kernel and unit tests
 
         std::shared_ptr<Node> myAnd = And();
         auto op = std::static_pointer_cast<OperatorTensor>(myAnd->getOperator());
-- 
GitLab
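
The rewritten AvgPooling kernel averages only the taps that fall inside the input; a 1-D sketch of that counting behaviour (assumed helper, not taken from the patch):

#include <cstddef>
#include <cstdio>

float avgWindow(const float* in, std::size_t len, std::size_t start,
                std::size_t k, std::size_t d) {
    float sum = 0.f;
    std::size_t count = 0;
    for (std::size_t t = 0; t < k; ++t) {
        const std::size_t idx = start + t * d;      // taps spaced by the dilation
        if (idx < len) { sum += in[idx]; ++count; } // only in-bounds taps counted
    }
    return count > 0 ? sum / static_cast<float>(count) : 0.f;
}

int main() {
    const float in[5] = {1, 2, 3, 4, 5};
    // Kernel 2, dilation 2, window starting at index 3: taps at 3 and 5;
    // only index 3 is in bounds, so the average is in[3] = 4.
    std::printf("%f\n", avgWindow(in, 5, 3, 2, 2));
    return 0;
}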


From c12d2826171a88b4c87ce5b222363256a222b262 Mon Sep 17 00:00:00 2001
From: hrouis <houssemeddine.rouis92@gmail.com>
Date: Mon, 3 Feb 2025 10:11:02 +0100
Subject: [PATCH 08/22] handle ceil_mode in pooling kernels

---
 .../cpu/operator/AvgPoolingImpl_kernels.hpp   | 56 ++++++++++++-------
 .../cpu/operator/MaxPoolingImpl_kernels.hpp   | 20 ++++---
 unit_tests/operator/Test_AvgPoolingImpl.cpp   | 35 +++++++++++-
 3 files changed, 82 insertions(+), 29 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
index 68dbfbe7..78f8446a 100644
--- a/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
@@ -43,15 +43,20 @@ void AvgPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideD
     const I *input = static_cast<const I *>(input_);
     O *output = static_cast<O *>(output_);
 
-    // Calculate output dimensions based on ceilMode and dilations
-    auto compute_output_size = [&](DimSize_t inputDim, DimSize_t kernelDim, DimSize_t stride, DimSize_t dilation) {
-        DimSize_t effectiveKernelDim = (kernelDim - 1) * dilation + 1;
-        float result = static_cast<float>(inputDim - effectiveKernelDim + stride) / static_cast<float>(stride);
-        return ceilMode ? static_cast<DimSize_t>(std::ceil(result)) : static_cast<DimSize_t>(std::floor(result));
-    };
-
-    const std::size_t oxSize = compute_output_size(dims[2], kernelDims[0], strideDims[0], dilations[0]);
-    const std::size_t oySize = compute_output_size(dims[3], kernelDims[1], strideDims[1], dilations[1]);
+    // output H size
+    const std::size_t oxSize = 
+        ceilMode 
+        ? static_cast<std::size_t>(std::ceil(static_cast<float>(dims[2] - (kernelDims[0] - 1) * dilations[0] - 1 + strideDims[0]) /
+                                            static_cast<float>(strideDims[0])))
+        : static_cast<std::size_t>(std::floor(static_cast<float>(dims[2] - (kernelDims[0] - 1) * dilations[0] - 1 + strideDims[0]) /
+                                            static_cast<float>(strideDims[0])));
+    // output W size
+    const std::size_t oySize = 
+        ceilMode 
+        ? static_cast<std::size_t>(std::ceil(static_cast<float>(dims[3] - (kernelDims[1] - 1) * dilations[1] - 1 + strideDims[1]) /
+                                            static_cast<float>(strideDims[1])))
+        : static_cast<std::size_t>(std::floor(static_cast<float>(dims[3] - (kernelDims[1] - 1) * dilations[1] - 1 + strideDims[1]) /
+                                            static_cast<float>(strideDims[1])));
 
     using signedsize = std::make_signed<std::size_t>::type;
 
@@ -59,30 +64,39 @@ void AvgPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideD
         for (std::size_t ch = 0; ch < dims[1]; ++ch) {
             const std::size_t oIndex = (ch + batch * dims[1]) * oxSize * oySize;
             const std::size_t iIndex = (ch + batch * dims[1]) * dims[2] * dims[3];
-            std::fill(output + oIndex, output + (oIndex + oxSize * oySize), 0);
 
             for (std::size_t ox = 0; ox < oxSize; ++ox) {
-                const signedsize startx = static_cast<signedsize>(ox * strideDims[0]) - (dilations[0] - 1);
-                const std::size_t sxMin = static_cast<std::size_t>(std::max(startx, signedsize(0)));
-                const std::size_t sxMax = std::min(dims[2], static_cast<std::size_t>(startx + kernelDims[0] * dilations[0]));
+                const signedsize difx = static_cast<signedsize>(-ox * strideDims[0]);
+                const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
+                const std::size_t sxMax = (static_cast<signedsize>(dims[2]) + difx) < 0 ? 0 : ((dims[2] + difx) > kernelDims[0] ? kernelDims[0] : dims[2] + difx);
 
                 for (std::size_t oy = 0; oy < oySize; ++oy) {
-                    const signedsize starty = static_cast<signedsize>(oy * strideDims[1]) - (dilations[1] - 1);
-                    const std::size_t syMin = static_cast<std::size_t>(std::max(starty, signedsize(0)));
-                    const std::size_t syMax = std::min(dims[3], static_cast<std::size_t>(starty + kernelDims[1] * dilations[1]));
+                    const signedsize dify = static_cast<signedsize>(-oy * strideDims[1]);
+                    const std::size_t syMin = static_cast<std::size_t>(std::max(dify, signedsize(0)));
+                    const std::size_t syMax = (static_cast<signedsize>(dims[3]) + dify) < 0 ? 0 : ((dims[3] + dify) > kernelDims[1] ? kernelDims[1] : dims[3] + dify);
 
                     const std::size_t oIndexFull = oIndex + ox * oySize + oy;
+                    const std::size_t ix = ox * strideDims[0];
+                    const std::size_t iy = oy * strideDims[1];
+
                     O sum = static_cast<O>(0);
                     std::size_t count = 0;
 
-                    for (std::size_t sx = sxMin; sx < sxMax; sx += dilations[0]) {
-                        for (std::size_t sy = syMin; sy < syMax; sy += dilations[1]) {
-                            sum += static_cast<O>(input[iIndex + sx * dims[3] + sy]);
-                            ++count;
+                    for (unsigned int sy = syMin; sy < syMax; ++sy) {
+                        for (unsigned int sx = sxMin; sx < sxMax; ++sx) {
+                            // Apply dilation factor
+                            const std::size_t dilated_sx = sx * dilations[0];
+                            const std::size_t dilated_sy = sy * dilations[1];
+
+                            // Ensure within bounds
+                            if ((ix + dilated_sx) < dims[2] && (iy + dilated_sy) < dims[3]) {
+                                sum += static_cast<O>(input[iIndex + (ix + dilated_sx) * dims[3] + (iy + dilated_sy)]);
+                                ++count;
+                            }
                         }
                     }
 
-                    output[oIndexFull] = sum / static_cast<O>(count);
+                    output[oIndexFull] = count > 0 ? sum / static_cast<O>(count) : 0;
                 }
             }
         }
diff --git a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
index 250b11b0..d5ac02fe 100644
--- a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
@@ -36,7 +36,7 @@ template <class I, class O>
 void MaxPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
                                         const std::array<DimSize_t, 2>& kernelDims,
                                         const std::array<DimSize_t, 2>& dilations,
-                                        const bool /*ceilMode*/,
+                                        const bool ceilMode,
                                         const std::array<DimSize_t, 4> &dims,
                                         const void *input_,
                                         void *output_) {
@@ -44,13 +44,19 @@ void MaxPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideD
     O *output = static_cast<O *>(output_);
 
     // output H size
-    const std::size_t oxSize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(dims[2] - (kernelDims[0] - 1) * dilations[0] - 1 + strideDims[0]) /
-                                static_cast<float>(strideDims[0])));
+    const std::size_t oxSize = 
+        ceilMode 
+        ? static_cast<std::size_t>(std::ceil(static_cast<float>(dims[2] - (kernelDims[0] - 1) * dilations[0] - 1 + strideDims[0]) /
+                                            static_cast<float>(strideDims[0])))
+        : static_cast<std::size_t>(std::floor(static_cast<float>(dims[2] - (kernelDims[0] - 1) * dilations[0] - 1 + strideDims[0]) /
+                                            static_cast<float>(strideDims[0])));
     // output W size
-    const std::size_t oySize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(dims[3] - (kernelDims[1] - 1) * dilations[1] - 1 + strideDims[1]) /
-                                static_cast<float>(strideDims[1])));
+    const std::size_t oySize = 
+        ceilMode 
+        ? static_cast<std::size_t>(std::ceil(static_cast<float>(dims[3] - (kernelDims[1] - 1) * dilations[1] - 1 + strideDims[1]) /
+                                            static_cast<float>(strideDims[1])))
+        : static_cast<std::size_t>(std::floor(static_cast<float>(dims[3] - (kernelDims[1] - 1) * dilations[1] - 1 + strideDims[1]) /
+                                            static_cast<float>(strideDims[1])));
 
     using signedsize = std::make_signed<std::size_t>::type;
     for (std::size_t batch = 0; batch < dims[0]; ++batch) {
diff --git a/unit_tests/operator/Test_AvgPoolingImpl.cpp b/unit_tests/operator/Test_AvgPoolingImpl.cpp
index 372febc6..21a7a680 100644
--- a/unit_tests/operator/Test_AvgPoolingImpl.cpp
+++ b/unit_tests/operator/Test_AvgPoolingImpl.cpp
@@ -110,5 +110,38 @@ TEST_CASE("[cpu/operator] AvgPooling(forward)", "[AvgPooling][CPU]") {
             REQUIRE(std::abs(outPtr[i] - expectedOutPtr[i]) < 0.00001);
         }
     }
-    // std::cout << static_cast<Tensor>((*op)["weight"])[0][0][0][0] << std::endl;
+    SECTION("Dilations") {
+        std::shared_ptr<Tensor> myInput3 = std::make_shared<Tensor>(Array4D<float,1,1,5,5> { // NCHW
+        {
+            {
+                {{ 1,  2,  3,  4,  5},
+                { 6,  7,  8,  9, 10},
+                {11, 12, 13, 14, 15},
+                {16, 17, 18, 19, 20},
+                {21, 22, 23, 24, 25}}
+            }
+        }
+        });
+
+        // Dilation of 2 means we take every second element in the window
+        std::shared_ptr<Node> myAvgPool = AvgPooling({2,2}, "mycdw", {1,1}, {2,2}); 
+        auto op = std::static_pointer_cast<AvgPooling_Op<2>>(myAvgPool -> getOperator());
+
+        std::shared_ptr<Tensor> myOutput3 = std::make_shared<Tensor>(Array4D<float,1,1,3,3> {
+            {
+                {
+                    {{  7,  8,  9},
+                    { 12, 13, 14},
+                    { 17, 18, 19}}
+                }
+            }
+        });
+
+        op->associateInput(0, myInput3);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cpu");
+        myAvgPool->forward();
+        op->getOutput(0)->print();
+        REQUIRE(*(op->getOutput(0)) == *myOutput3);
+    }
 }
\ No newline at end of file
-- 
GitLab
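
With ceil_mode enabled the same size expression is rounded up instead of down; a small sketch of both modes under the formula used above (helper name illustrative):

#include <cmath>
#include <cstddef>
#include <cstdio>

std::size_t outSize(std::size_t in, std::size_t k, std::size_t s,
                    std::size_t d, bool ceilMode) {
    const float r = static_cast<float>(in - (k - 1) * d - 1 + s) / static_cast<float>(s);
    return static_cast<std::size_t>(ceilMode ? std::ceil(r) : std::floor(r));
}

int main() {
    // 5-wide input, kernel 2, stride 2, dilation 1: (5 - 1 - 1 + 2) / 2 = 2.5,
    // so floor mode yields 2 and ceil mode yields 3 (the 2x2 vs 3x3 tests that follow).
    std::printf("%zu %zu\n", outSize(5, 2, 2, 1, false), outSize(5, 2, 2, 1, true));
    return 0;
}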


From b3ae66f75c1dfbc3b4ae3e08f198889cbf837937 Mon Sep 17 00:00:00 2001
From: hrouis <houssemeddine.rouis92@gmail.com>
Date: Mon, 3 Feb 2025 15:09:26 +0100
Subject: [PATCH 09/22] add ceil_mode tests for Avg and Max Pooling

---
 .../cpu/operator/MaxPoolingImpl_kernels.hpp   |  1 +
 unit_tests/operator/Test_AvgPoolingImpl.cpp   | 57 +++++++++++++++++++
 unit_tests/operator/Test_MaxPoolingImpl.cpp   | 57 +++++++++++++++++++
 3 files changed, 115 insertions(+)

diff --git a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
index d5ac02fe..027fc02a 100644
--- a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
@@ -16,6 +16,7 @@
 #include <cmath>
 #include <tuple>
 
+
 #include "aidge/backend/cpu/operator/MaxPoolingImpl.hpp"
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
 #include "aidge/data/Data.hpp"
diff --git a/unit_tests/operator/Test_AvgPoolingImpl.cpp b/unit_tests/operator/Test_AvgPoolingImpl.cpp
index 21a7a680..f116934c 100644
--- a/unit_tests/operator/Test_AvgPoolingImpl.cpp
+++ b/unit_tests/operator/Test_AvgPoolingImpl.cpp
@@ -144,4 +144,61 @@ TEST_CASE("[cpu/operator] AvgPooling(forward)", "[AvgPooling][CPU]") {
         op->getOutput(0)->print();
         REQUIRE(*(op->getOutput(0)) == *myOutput3);
     }
+    SECTION("Ceil Mode") {
+        std::shared_ptr<Tensor> myInput4 = std::make_shared<Tensor>(Array4D<float,1,1,5,5> { // NCHW
+        {
+            {
+                {
+                    { 1,  2,  3,  4,  5},
+                    { 6,  7,  8,  9, 10},
+                    {11, 12, 13, 14, 15},
+                    {16, 17, 18, 19, 20},
+                    {21, 22, 23, 24, 25}
+                }
+            }
+        }
+        });
+
+        // AvgPool with ceil_mode = true
+        std::shared_ptr<Node> myAvgPool1 = AvgPooling({2,2}, "mycdw", {2,2}, {1,1}, true);
+        auto op1 = std::static_pointer_cast<AvgPooling_Op<2>>(myAvgPool1 -> getOperator());
+
+        std::shared_ptr<Tensor> myOutput4 = std::make_shared<Tensor>(Array4D<float,1,1,3,3> {
+            {
+                {
+                    {
+                        {  4.0,  6.0,  7.5 },
+                        { 14.0, 16.0, 17.5 },
+                        { 21.5, 23.5, 25.0 }
+                    }
+                }
+            }
+        });
+        op1->associateInput(0, myInput4);
+        op1->setDataType(DataType::Float32);
+        op1->setBackend("cpu");
+        myAvgPool1->forward();
+        op1->getOutput(0)->print();
+        REQUIRE(*(op1->getOutput(0)) == *myOutput4);
+
+        // AvgPool with ceil_mode = false
+        std::shared_ptr<Node> myAvgPool2 = AvgPooling({2,2}, "mycdw", {2,2}, {1,1}, false);
+        auto op2 = std::static_pointer_cast<AvgPooling_Op<2>>(myAvgPool2 -> getOperator());
+        std::shared_ptr<Tensor> myOutput5 = std::make_shared<Tensor>(Array4D<float,1,1,2,2> {
+            {
+                {
+                    {
+                        {  4.0,  6.0 },
+                        { 14.0, 16.0 }
+                    }
+                }
+            }
+        });
+        op2->associateInput(0, myInput4);
+        op2->setDataType(DataType::Float32);
+        op2->setBackend("cpu");
+        myAvgPool2->forward();
+        op2->getOutput(0)->print();
+        REQUIRE(*(op2->getOutput(0)) == *myOutput5);
+    }
 }
\ No newline at end of file
diff --git a/unit_tests/operator/Test_MaxPoolingImpl.cpp b/unit_tests/operator/Test_MaxPoolingImpl.cpp
index 6b7e6d2f..d480fc30 100644
--- a/unit_tests/operator/Test_MaxPoolingImpl.cpp
+++ b/unit_tests/operator/Test_MaxPoolingImpl.cpp
@@ -115,4 +115,61 @@ TEST_CASE("[cpu/operator] MaxPooling(forward)", "[MaxPooling][CPU]") {
         op->getOutput(0)->print();
         REQUIRE(*(op->getOutput(0)) == *myOutput);
     }
+    SECTION("Ceil Mode") {
+        std::shared_ptr<Tensor> myInput4 = std::make_shared<Tensor>(Array4D<float,1,1,5,5> { // NCHW
+        {
+            {
+                {
+                    { 1,  2,  3,  4,  5},
+                    { 6,  7,  8,  9, 10},
+                    {11, 12, 13, 14, 15},
+                    {16, 17, 18, 19, 20},
+                    {21, 22, 23, 24, 25}
+                }
+            }
+        }
+        });
+
+        // MaxPool with ceil_mode = true
+        std::shared_ptr<Node> myMaxPool1 = MaxPooling({2,2}, "mycdw", {2,2}, {1,1}, true);
+        auto op1 = std::static_pointer_cast<OperatorTensor>(myMaxPool1 -> getOperator());
+
+        std::shared_ptr<Tensor> myOutput4 = std::make_shared<Tensor>(Array4D<float,1,1,3,3> {
+            {
+                {
+                    {
+                        {  7.0,  9.0, 10.0 },
+                        { 17.0, 19.0, 20.0 },
+                        { 22.0, 24.0, 25.0 }
+                    }
+                }
+            }
+        });
+        op1->associateInput(0, myInput4);
+        op1->setDataType(DataType::Float32);
+        op1->setBackend("cpu");
+        myMaxPool1->forward();
+        op1->getOutput(0)->print();
+        REQUIRE(*(op1->getOutput(0)) == *myOutput4);
+
+        // MaxPool with ceil_mode = false
+        std::shared_ptr<Node> myMaxPool2 = MaxPooling({2,2}, "mycdw", {2,2}, {1,1}, false);
+        auto op2 = std::static_pointer_cast<OperatorTensor>(myMaxPool2 -> getOperator());
+        std::shared_ptr<Tensor> myOutput5 = std::make_shared<Tensor>(Array4D<float,1,1,2,2> {
+            {
+                {
+                    {
+                        {  7.0,  9.0 },
+                        { 17.0, 19.0 }
+                    }
+                }
+            }
+        });
+        op2->associateInput(0, myInput4);
+        op2->setDataType(DataType::Float32);
+        op2->setBackend("cpu");
+        myMaxPool2->forward();
+        op2->getOutput(0)->print();
+        REQUIRE(*(op2->getOutput(0)) == *myOutput5);
+    }
 }
\ No newline at end of file
-- 
GitLab
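
The ceil-mode expectations above follow from partial windows using only in-bounds elements: on the 5x5 input with a 2x2 kernel and stride 2, the top-right AvgPooling window covers only column 4 of rows 0-1, giving (5 + 10) / 2 = 7.5, and the bottom-right window covers only element 25, giving 25.0; MaxPooling likewise takes the maximum of whatever the partial window covers, e.g. max(5, 10) = 10.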


From 40a34dc2c1910bba8e983adc28929204a4e62f45 Mon Sep 17 00:00:00 2001
From: hrouis <houssemeddine.rouis92@gmail.com>
Date: Wed, 19 Feb 2025 10:32:39 +0100
Subject: [PATCH 10/22] separate fwdDims tests section from fwd section

---
 unit_tests/operator/Test_EqualImpl.cpp | 145 ++++++++++++-------------
 1 file changed, 72 insertions(+), 73 deletions(-)

diff --git a/unit_tests/operator/Test_EqualImpl.cpp b/unit_tests/operator/Test_EqualImpl.cpp
index a229b8ce..013e16eb 100644
--- a/unit_tests/operator/Test_EqualImpl.cpp
+++ b/unit_tests/operator/Test_EqualImpl.cpp
@@ -19,86 +19,85 @@
 
 using namespace Aidge;
 
-TEST_CASE("[cpu/operator] Equal(forward)", "[Equal][CPU]") {
-        SECTION("ForwardDims")
-    {
-        constexpr std::uint16_t NBTRIALS = 10;
-        // Create a random number generator
-        std::random_device rd;
-        std::mt19937 gen(rd());
-        std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0 and 1
-        std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
-        std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
-        std::uniform_int_distribution<int> boolDist(0,1);
-
-        SECTION("Same dimensions") {
-            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
-                DimSize_t nbDims = nbDimsDist(gen);
-                std::vector<DimSize_t> dims(nbDims);
-                for (std::size_t i = 0; i < nbDims; i++) {
-                    dims[i] = dimSizeDist(gen);
-                }
-
-                std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims);
-                myInput1->setBackend("cpu");
-                myInput1->setDataType(DataType::Float32);
-                myInput1->zeros();
-                std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims);
-                myInput2->setBackend("cpu");
-                myInput2->setDataType(DataType::Float32);
-                myInput2->zeros();
-                std::shared_ptr<Node> myEqual = Equal();
-                auto op = std::static_pointer_cast<OperatorTensor>(myEqual -> getOperator());
-                op->associateInput(0,myInput1);
-                op->associateInput(1,myInput2);
-                op->setDataType(DataType::Float32);
-                op->setBackend("cpu");
-                op->forwardDims();
-
-                const auto outputDims = op->getOutput(0)->dims();
-                REQUIRE(outputDims == dims);
+TEST_CASE("[cpu/operator] Equal(forwardDims)", "[Equal][CPU]") {
+    constexpr std::uint16_t NBTRIALS = 10;
+    // Create a random number generator
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0.1 and 1.1
+    std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
+    std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
+    std::uniform_int_distribution<int> boolDist(0,1);
+
+    SECTION("Same dimensions") {
+        for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+            DimSize_t nbDims = nbDimsDist(gen);
+            std::vector<DimSize_t> dims(nbDims);
+            for (std::size_t i = 0; i < nbDims; i++) {
+                dims[i] = dimSizeDist(gen);
             }
+
+            std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims);
+            myInput1->setBackend("cpu");
+            myInput1->setDataType(DataType::Float32);
+            myInput1->zeros();
+            std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims);
+            myInput2->setBackend("cpu");
+            myInput2->setDataType(DataType::Float32);
+            myInput2->zeros();
+            std::shared_ptr<Node> myEqual = Equal();
+            auto op = std::static_pointer_cast<OperatorTensor>(myEqual -> getOperator());
+            op->associateInput(0,myInput1);
+            op->associateInput(1,myInput2);
+            op->setDataType(DataType::Float32);
+            op->setBackend("cpu");
+            op->forwardDims();
+
+            const auto outputDims = op->getOutput(0)->dims();
+            REQUIRE(outputDims == dims);
         }
-        SECTION("Broadcasting") {
-            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
-                DimSize_t nbDims = nbDimsDist(gen);
-                std::vector<DimSize_t> dims1(nbDims, 1);
-                std::vector<DimSize_t> dims2(nbDims, 1);
-                std::vector<DimSize_t> expectedOutDims;
-                for (std::size_t i = 0; i < nbDims; i++) {
-                    DimSize_t dim = dimSizeDist(gen);
-                    if (boolDist(gen)) {
-                        dims1[i] = dim;
-                    }
-                    if (boolDist(gen)) {
-                        dims2[i] = dim;
-                    }
-                    expectedOutDims.push_back(std::max(dims1[i],dims2[i]));
+    }
+    SECTION("Broadcasting") {
+        for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+            DimSize_t nbDims = nbDimsDist(gen);
+            std::vector<DimSize_t> dims1(nbDims, 1);
+            std::vector<DimSize_t> dims2(nbDims, 1);
+            std::vector<DimSize_t> expectedOutDims;
+            for (std::size_t i = 0; i < nbDims; i++) {
+                DimSize_t dim = dimSizeDist(gen);
+                if (boolDist(gen)) {
+                    dims1[i] = dim;
+                }
+                if (boolDist(gen)) {
+                    dims2[i] = dim;
                 }
+                expectedOutDims.push_back(std::max(dims1[i],dims2[i]));
+            }
 
 
-                std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims1);
-                myInput1->setBackend("cpu");
-                myInput1->setDataType(DataType::Float32);
-                myInput1->zeros();
-                std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims2);
-                myInput2->setBackend("cpu");
-                myInput2->setDataType(DataType::Float32);
-                myInput2->zeros();
-                std::shared_ptr<Node> myEqual = Equal();
-                auto op = std::static_pointer_cast<OperatorTensor>(myEqual -> getOperator());
-                op->associateInput(0,myInput1);
-                op->associateInput(1,myInput2);
-                op->setDataType(DataType::Float32);
-                op->setBackend("cpu");
-
-                op->forwardDims();
-
-                const auto outputDims = op->getOutput(0)->dims();
-                REQUIRE(outputDims == expectedOutDims);
-            }
+            std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims1);
+            myInput1->setBackend("cpu");
+            myInput1->setDataType(DataType::Float32);
+            myInput1->zeros();
+            std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims2);
+            myInput2->setBackend("cpu");
+            myInput2->setDataType(DataType::Float32);
+            myInput2->zeros();
+            std::shared_ptr<Node> myEqual = Equal();
+            auto op = std::static_pointer_cast<OperatorTensor>(myEqual -> getOperator());
+            op->associateInput(0,myInput1);
+            op->associateInput(1,myInput2);
+            op->setDataType(DataType::Float32);
+            op->setBackend("cpu");
+
+            op->forwardDims();
+
+            const auto outputDims = op->getOutput(0)->dims();
+            REQUIRE(outputDims == expectedOutDims);
         }
     }
+}
+TEST_CASE("[cpu/operator] Equal(forward)", "[Equal][CPU]") {
     SECTION("Same size inputs") {
         std::shared_ptr<Tensor> input1 = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
         {                                       //
-- 
GitLab
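
The Broadcasting section encodes the usual rule: once both shapes have the same rank, each output dimension is the larger of the two (the smaller one being 1). A compact sketch of that expectation:

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    // Where the two inputs disagree, one of the dims is 1, as boolDist arranges.
    const std::vector<std::size_t> a{2, 1, 4};
    const std::vector<std::size_t> b{1, 3, 4};
    std::vector<std::size_t> out(a.size());
    for (std::size_t i = 0; i < a.size(); ++i)
        out[i] = std::max(a[i], b[i]);  // broadcast rule: max of the pair
    std::printf("%zu %zu %zu\n", out[0], out[1], out[2]); // 2 3 4
    return 0;
}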


From f3de3e10f342b3dd573609d8f835ed822950e5d7 Mon Sep 17 00:00:00 2001
From: hrouis <houssemeddine.rouis92@gmail.com>
Date: Thu, 20 Feb 2025 11:35:10 +0100
Subject: [PATCH 11/22] remove unnecessary header in Equal tests

---
 unit_tests/operator/Test_EqualImpl.cpp | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/unit_tests/operator/Test_EqualImpl.cpp b/unit_tests/operator/Test_EqualImpl.cpp
index 013e16eb..bd9fa94f 100644
--- a/unit_tests/operator/Test_EqualImpl.cpp
+++ b/unit_tests/operator/Test_EqualImpl.cpp
@@ -15,8 +15,6 @@
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/Equal.hpp"
 
-#include "aidge/backend/cpu.hpp"
-
 using namespace Aidge;
 
 TEST_CASE("[cpu/operator] Equal(forwardDims)", "[Equal][CPU]") {
@@ -137,7 +135,7 @@ TEST_CASE("[cpu/operator] Equal(forward)", "[Equal][CPU]") {
                 }                                   //
             }                                       //
         });                                         //
-        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
+        Tensor expectedOutput = Tensor(Array4D<int,3,3,3,2> {
             {
                 {
                     {{1, 0},{0, 0},{1, 1}},
@@ -165,7 +163,7 @@ TEST_CASE("[cpu/operator] Equal(forward)", "[Equal][CPU]") {
         op->setDataType(DataType::Int32);
         myEqual->forward();
 
-        REQUIRE(*(op->getOutput(0)) == *expectedOutput);
+        REQUIRE(*(op->getOutput(0)) == expectedOutput);
     }
 
     SECTION("Broadcasting") {
@@ -180,7 +178,7 @@ TEST_CASE("[cpu/operator] Equal(forward)", "[Equal][CPU]") {
         });                                     //
 
         std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<int,2> {{10, 20}});  
-        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,1,3,3,2> {
+        Tensor expectedOutput = Tensor(Array4D<int,1,3,3,2> {
             {                                   //
                 {                               //
                     {{ 1, 1},{ 0, 0},{ 0, 1}},  //
@@ -198,7 +196,7 @@ TEST_CASE("[cpu/operator] Equal(forward)", "[Equal][CPU]") {
         op->setBackend("cpu");
         myEqual->forward();
         op->getOutput(0)->print();
-        expectedOutput->print();
-        REQUIRE(*op->getOutput(0) == *expectedOutput);
+
+        REQUIRE(*op->getOutput(0) == expectedOutput);
     }
 }
\ No newline at end of file
-- 
GitLab


From dcbd4ebd1fe6d4eb065496f8ab0b62b771b42589 Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Fri, 14 Feb 2025 17:57:38 +0100
Subject: [PATCH 12/22] Add Mod

---
 .../aidge/backend/cpu/operator/ModImpl.hpp    |  33 +++++
 .../backend/cpu/operator/ModImpl_kernels.hpp  |  77 ++++++++++
 src/operator/ModImpl.cpp                      | 131 ++++++++++++++++++
 3 files changed, 241 insertions(+)
 create mode 100644 include/aidge/backend/cpu/operator/ModImpl.hpp
 create mode 100644 include/aidge/backend/cpu/operator/ModImpl_kernels.hpp
 create mode 100644 src/operator/ModImpl.cpp

diff --git a/include/aidge/backend/cpu/operator/ModImpl.hpp b/include/aidge/backend/cpu/operator/ModImpl.hpp
new file mode 100644
index 00000000..96ff599b
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/ModImpl.hpp
@@ -0,0 +1,33 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_MODIMPL_H_
+#define AIDGE_CPU_OPERATOR_MODIMPL_H_
+
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/Mod.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+using ModImpl_cpu = OperatorImpl_cpu<Mod_Op,
+    void(bool, const std::size_t, const std::size_t, const std::size_t, const void*, const void*,void*)>;
+
+// Implementation entry point registration to Operator
+REGISTRAR(Mod_Op, "cpu", Aidge::ModImpl_cpu::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_MODIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/ModImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ModImpl_kernels.hpp
new file mode 100644
index 00000000..940fa482
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/ModImpl_kernels.hpp
@@ -0,0 +1,77 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_MODIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_MODIMPL_KERNELS_H_
+
+#include <numeric>     // std::accumulate
+#include <cstddef>     // std::size_t
+#include <cstdint>     // std::int32_t, std::int64_t
+#include <functional>  // std::multiplies
+
+#include "aidge/utils/Registrar.hpp"
+
+#include "aidge/backend/cpu/data/Broadcasting.hpp"
+#include "aidge/backend/cpu/operator/ModImpl.hpp"
+
+namespace Aidge {
+
+template <typename T,  
+    typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
+static inline T modulus(T a, T b) {
+    return a % b;
+}
+
+template <typename T,  
+    typename std::enable_if<!std::is_integral<T>::value>::type* = nullptr>
+static inline T modulus(T /*a*/, T /*b*/) {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Mod Operator with fmod attribute set to false only supports integer types.");
+}
+
+template <class I1, class I2, class O>
+constexpr void ModImpl_cpu_forward_kernel(bool fmod,
+                                const std::size_t input1size_,
+                                const std::size_t input2size_,
+                                const std::size_t output1size_,
+                                const void* input1_,
+                                const void* input2_,
+                                void* output_) {
+
+    const I1* input_1 = static_cast<const I1*>(input1_);
+    const I2* input_2 = static_cast<const I2*>(input2_);
+    O* output = static_cast<O*>(output_);
+
+    // Suppose values are contiguous in memory.
+    for (std::size_t i = 0; i < output1size_; ++i) {
+        const std::size_t in1_id = (input1size_ != 1) ? i : 0;
+        const std::size_t in2_id = (input2size_ != 1) ? i : 0;
+        if (fmod) {
+            output[i] = static_cast<O>(std::fmod(input_1[in1_id], input_2[in2_id]));
+        }
+        else {
+            output[i] = static_cast<O>(modulus(input_1[in1_id], input_2[in2_id]));
+        }
+    }
+}
+
+// Kernels registration to implementation entry point
+REGISTRAR(ModImpl_cpu,
+    {DataType::Float32},
+    {ProdConso::inPlaceModel, Aidge::ModImpl_cpu_forward_kernel<float, float, float>, nullptr});
+REGISTRAR(ModImpl_cpu,
+    {DataType::Float64},
+    {ProdConso::inPlaceModel, Aidge::ModImpl_cpu_forward_kernel<double, double, double>, nullptr});
+REGISTRAR(ModImpl_cpu,
+    {DataType::Int32},
+    {ProdConso::inPlaceModel, Aidge::ModImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>, nullptr});
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_MODIMPL_KERNELS_H_ */
diff --git a/src/operator/ModImpl.cpp b/src/operator/ModImpl.cpp
new file mode 100644
index 00000000..161f7bc1
--- /dev/null
+++ b/src/operator/ModImpl.cpp
@@ -0,0 +1,131 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <memory>
+#include <vector>
+
+#include "aidge/backend/cpu/data/Broadcasting.hpp"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+#include "aidge/backend/cpu/operator/ModImpl.hpp"
+#include "aidge/backend/cpu/operator/ModImpl_kernels.hpp"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/utils/Types.h"
+
+template <>
+void Aidge::ModImpl_cpu::forward() {
+    // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7]
+    // 2. Find the highest equal dimension -> 3
+    //    Exception: if the first diverging dimension is the last one, then -> 4 (dims.size())
+    // 3. Compute the highest number of contiguous data -> 7
+    // 4. Compute stride and offset step for the broadcast mechanism
+    // 5. Call a simple kernel
+    const auto& opTensor = static_cast<const Mod_Op&>(mOp);
+
+    // Find the correct kernel type
+    const auto impl = Registrar<ModImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Compute compatible input dimensions
+    std::vector<std::size_t>        dims0   = opTensor.getInput(0)->dims();
+    std::vector<std::size_t>        dims1   = opTensor.getInput(1)->dims();
+    const std::vector<std::size_t>& outDims = opTensor.getOutput(0)->dims();
+
+    // special case for equal dimensions, the kernel is called with the entire arrays at once
+    if (dims0 == dims1) {
+        const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+        impl.forward(opTensor.fmod(),
+                    input0_contiguous_size, input0_contiguous_size, input0_contiguous_size,
+                    getCPUPtr(mOp.getRawInput(0)),
+                    getCPUPtr(mOp.getRawInput(1)),
+                    getCPUPtr(mOp.getRawOutput(0)));
+        return;
+    }
+
+    // set dimensions to be of equal size by filling the smallest one with ones.
+    if (dims0.size() > dims1.size()) {
+        dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
+    }
+    else if (dims1.size() > dims0.size()) {
+        dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
+    }
+
+    const std::size_t nbDims = dims0.size();
+
+    // Find the highest equal dimension
+    // std::size_t contiguousIdx = nbDims - 1;
+    std::size_t contiguousIdx = nbDims;
+    while (contiguousIdx-- > 0) {
+    // for (; contiguousIdx+1 > 0; --contiguousIdx) {
+        if (dims0[contiguousIdx] != dims1[contiguousIdx]) {
+            if (contiguousIdx == (nbDims - 1)) { // last dimensions of one of the input Tensors are of size 1
+                const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1;
+                while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) {
+                    --contiguousIdx;
+                }
+            }
+            break;
+        }
+    }
+    ++contiguousIdx;
+
+    // Compute the highest number of contiguous data for each Tensor
+    const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    const std::size_t output_contiguous_size = std::accumulate(outDims.cbegin()+contiguousIdx, outDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+
+    // initialize strides to iterate through data because of broadcasting
+    std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    if (contiguousIdx > 0) {
+        stride_post0[contiguousIdx - 1] = 1;
+        stride_post1[contiguousIdx - 1] = 1;
+        for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) {
+            stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]);
+            stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]);
+        }
+        for (std::size_t i = 0; i != contiguousIdx; ++i) {
+            stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1;
+            stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1;
+        }
+    }
+
+    // variables for arrays offsets
+    std::size_t offsetIn0 = 0;
+    std::size_t offsetIn1 = 0;
+    std::size_t offsetOut = 0;
+
+
+    std::size_t dim = contiguousIdx - 1;
+    const std::size_t nbStacks = std::accumulate(outDims.cbegin(), outDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
+    for (std::size_t stack = 0; stack < nbStacks;) {
+        impl.forward(opTensor.fmod(), input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
+                    getCPUPtr(mOp.getRawInput(0), offsetIn0*input0_contiguous_size),
+                    getCPUPtr(mOp.getRawInput(1), offsetIn1*input1_contiguous_size),
+                    getCPUPtr(mOp.getRawOutput(0), offsetOut*output_contiguous_size));
+        if (++stack < nbStacks) {
+            std::size_t tmp_stack = stack;
+            while(tmp_stack % outDims[dim] == 0) {
+                tmp_stack /= outDims[dim];
+                dim--;
+            }
+            offsetIn0 += stride_step0[dim];
+            offsetIn1 += stride_step1[dim];
+            ++offsetOut;
+            dim = contiguousIdx - 1;
+        }
+    }
+}
+
+template <>
+void Aidge::ModImpl_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Mod_Op on backend cpu");
+}
-- 
GitLab
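
The kernel's fmod/% split can be exercised in isolation; a minimal standalone sketch of the dispatch (mirroring, not reusing, the patch's templates):

#include <cmath>
#include <cstdio>
#include <type_traits>

// Integral operands use operator%, floating-point operands go through
// std::fmod, matching the intent of the fmod attribute above.
template <typename T,
          typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
T mod(T a, T b) { return a % b; }

template <typename T,
          typename std::enable_if<std::is_floating_point<T>::value>::type* = nullptr>
T mod(T a, T b) { return std::fmod(a, b); }

int main() {
    std::printf("%d\n", mod(7, 3));     // 1
    std::printf("%f\n", mod(7.5, 2.0)); // 1.5
    return 0;
}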


From 1c023cb2c416ad660acef30a6e913b8b40c1c8d5 Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Sun, 16 Feb 2025 16:39:05 +0100
Subject: [PATCH 13/22] Fixed typo

---
 include/aidge/backend/cpu/operator/AbsImpl_kernels.hpp |  4 ++--
 .../aidge/backend/cpu/operator/AtanImpl_kernels.hpp    |  8 ++++----
 include/aidge/backend/cpu/operator/ErfImpl_kernels.hpp |  4 ++--
 .../backend/cpu/operator/HeavisideImpl_kernels.hpp     |  4 ++--
 .../backend/cpu/operator/LeakyReLUImpl_kernels.hpp     |  8 ++++----
 include/aidge/backend/cpu/operator/LnImpl_kernels.hpp  | 10 +++++-----
 .../aidge/backend/cpu/operator/ReLUImpl_kernels.hpp    | 10 +++++-----
 .../aidge/backend/cpu/operator/RoundImpl_kernels.hpp   |  4 ++--
 .../aidge/backend/cpu/operator/ScalingImpl_kernels.hpp |  4 ++--
 .../aidge/backend/cpu/operator/SigmoidImpl_kernels.hpp | 10 +++++-----
 .../aidge/backend/cpu/operator/SqrtImpl_kernels.hpp    |  8 ++++----
 .../aidge/backend/cpu/operator/TanhImpl_kernels.hpp    | 10 +++++-----
 12 files changed, 42 insertions(+), 42 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/AbsImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AbsImpl_kernels.hpp
index 16e5f9de..e6474cf2 100644
--- a/include/aidge/backend/cpu/operator/AbsImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/AbsImpl_kernels.hpp
@@ -20,14 +20,14 @@
 
 namespace Aidge {
 template <class I, class O>
-void AbsImpl_cpu_forward_kernel(std::size_t inputLenght,
+void AbsImpl_cpu_forward_kernel(std::size_t inputLength,
                                      const void* input_,
                                      void* output_) {
 
     const I* input = static_cast<const I*>(input_);
     O* output = static_cast<O*>(output_);
 
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+    for (std::size_t i = 0; i < inputLength; ++i) {
         output[i] = std::abs(input[i]);
     }
 }
diff --git a/include/aidge/backend/cpu/operator/AtanImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AtanImpl_kernels.hpp
index 2a786339..141e5b60 100644
--- a/include/aidge/backend/cpu/operator/AtanImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/AtanImpl_kernels.hpp
@@ -20,20 +20,20 @@
 
 namespace Aidge {
 template <class I, class O>
-void AtanImpl_cpu_forward_kernel(std::size_t inputLenght,
+void AtanImpl_cpu_forward_kernel(std::size_t inputLength,
                                     const void* input_,
                                     void* output_) {
     const I* input = static_cast<const I*>(input_);
     O* output = static_cast<O*>(output_);
 
-    for (size_t i = 0; i < inputLenght; ++i) {
+    for (size_t i = 0; i < inputLength; ++i) {
         output[i] = static_cast<O>(atan(input[i]));
     }
 
 }
 
 template <class O, class GI, class GO>
-void AtanImpl_cpu_backward_kernel(const std::size_t inputLenght,
+void AtanImpl_cpu_backward_kernel(const std::size_t inputLength,
                                      const void* output_, const void* grad_output_,
 				     void* grad_input_) {
     const O* output = static_cast<const O*>(output_);
@@ -41,7 +41,7 @@ void AtanImpl_cpu_backward_kernel(const std::size_t inputLenght,
     GI* grad_input = static_cast<GI*>(grad_input_);
 
     // Apply the derivative of atan for each element in the input array
-    for (size_t i = 0; i < inputLenght; ++i) {
+    for (size_t i = 0; i < inputLength; ++i) {
         // dx = dy * (1 / (1 + x^2))
         grad_input[i] = grad_output[i] * static_cast<O>(1.0 / (1.0 + output[i] * output[i]));
     }
diff --git a/include/aidge/backend/cpu/operator/ErfImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ErfImpl_kernels.hpp
index 02041f55..709f4a6f 100644
--- a/include/aidge/backend/cpu/operator/ErfImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ErfImpl_kernels.hpp
@@ -20,14 +20,14 @@
 
 namespace Aidge {
 template <class I, class O>
-void ErfImpl_cpu_forward_kernel(std::size_t inputLenght,
+void ErfImpl_cpu_forward_kernel(std::size_t inputLength,
                                      const void* input_,
                                      void* output_) {
 
     const I* input = static_cast<const I*>(input_);
     O* output = static_cast<O*>(output_);
 
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+    for (std::size_t i = 0; i < inputLength; ++i) {
         output[i] = std::erf(input[i]);
     }
 }
diff --git a/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp b/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp
index 3fd6ca7d..06d7fff8 100644
--- a/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp
@@ -23,14 +23,14 @@
 namespace Aidge {
 
 template <class I, class O>
-void HeavisideImplCpuForwardKernel(std::size_t inputLenght,
+void HeavisideImplCpuForwardKernel(std::size_t inputLength,
                                    const void *input_,
                                    void *output_,
                                    const float value) {
     const I *input = static_cast<const I *>(input_);
     O *output = static_cast<O *>(output_);
 
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+    for (std::size_t i = 0; i < inputLength; ++i) {
         output[i] = (input[i] > 0) ? 1 : (input[i] == 0 ? value : 0);
     }
 }
diff --git a/include/aidge/backend/cpu/operator/LeakyReLUImpl_kernels.hpp b/include/aidge/backend/cpu/operator/LeakyReLUImpl_kernels.hpp
index bc856f70..7afd8298 100644
--- a/include/aidge/backend/cpu/operator/LeakyReLUImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/LeakyReLUImpl_kernels.hpp
@@ -19,7 +19,7 @@
 namespace Aidge {
 template <class I, class O>
 void LeakyReLUImpl_cpu_forward_kernel(const float negativeSlope_,
-                                     std::size_t inputLenght,
+                                     std::size_t inputLength,
                                      const void* input_,
                                      void* output_) {
 
@@ -27,14 +27,14 @@ void LeakyReLUImpl_cpu_forward_kernel(const float negativeSlope_,
     O* output = static_cast<O*>(output_);
     const I negativeSlope = static_cast<const I>(negativeSlope_);
 
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+    for (std::size_t i = 0; i < inputLength; ++i) {
         output[i] = (input[i] >= 0) ? input[i] : input[i] * negativeSlope;
     }
 }
 
 template <class I, class O>
 void LeakyReLUImpl_cpu_backward_kernel(const float negativeSlope_,
-                                     std::size_t inputLenght,
+                                     std::size_t inputLength,
                                      const void* input_,
                                      void* output_) {
 
@@ -42,7 +42,7 @@ void LeakyReLUImpl_cpu_backward_kernel(const float negativeSlope_,
     O* output = static_cast<O*>(output_);
     const I negativeSlope = static_cast<const I>(negativeSlope_);
 
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+    for (std::size_t i = 0; i < inputLength; ++i) {
         output[i] = (input[i] > 0) ? input[i] : negativeSlope*input[i];
     }
 }
diff --git a/include/aidge/backend/cpu/operator/LnImpl_kernels.hpp b/include/aidge/backend/cpu/operator/LnImpl_kernels.hpp
index b30b05bb..ee2864b6 100755
--- a/include/aidge/backend/cpu/operator/LnImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/LnImpl_kernels.hpp
@@ -18,7 +18,7 @@
 
 namespace Aidge {
 template <class I, class O>
-void LnImpl_cpu_forward_kernel(std::size_t inputLenght,
+void LnImpl_cpu_forward_kernel(std::size_t inputLength,
                                const void* input_,
                                void* output_) {
 
@@ -26,8 +26,8 @@ void LnImpl_cpu_forward_kernel(std::size_t inputLenght,
     O* output = static_cast<O*>(output_);
 	const float eps = 1.0e-20f;
 
-//#pragma omp parallel for if (inputLenght > 1024)
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+//#pragma omp parallel for if (inputLength > 1024)
+    for (std::size_t i = 0; i < inputLength; ++i) {
 		if (input[i] > I(eps)) {
 			output[i] = std::log(input[i]);
 		} else {
@@ -37,7 +37,7 @@ void LnImpl_cpu_forward_kernel(std::size_t inputLenght,
 }
 
 template <class I, class GI, class GO>
-void LnImpl_cpu_backward_kernel(const std::size_t inputLenght,
+void LnImpl_cpu_backward_kernel(const std::size_t inputLength,
                                 const void* input_, const void* grad_output_,
 	                            void* grad_input_) {
 						 
@@ -46,7 +46,7 @@ void LnImpl_cpu_backward_kernel(const std::size_t inputLenght,
     GI* grad_input = static_cast<GI*>(grad_input_);
 	const float eps = 1.0e-20f;
 	
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+    for (std::size_t i = 0; i < inputLength; ++i) {
 		if (input[i] > I(eps)) {
 			grad_input[i] = grad_output[i] / input[i];
 		} else {
diff --git a/include/aidge/backend/cpu/operator/ReLUImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ReLUImpl_kernels.hpp
index e39e9b7d..bb5d7cc3 100644
--- a/include/aidge/backend/cpu/operator/ReLUImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ReLUImpl_kernels.hpp
@@ -26,27 +26,27 @@
 namespace Aidge {
 // Kernels
 template <class I, class O>
-void ReLUImpl_cpu_forward_kernel(std::size_t inputLenght,
+void ReLUImpl_cpu_forward_kernel(std::size_t inputLength,
                                      const void* input_,
                                      void* output_) {
 
     const I* input = static_cast<const I*>(input_);
     O* output = static_cast<O*>(output_);
 
-//#pragma omp parallel for if (inputLenght > 1024)
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+//#pragma omp parallel for if (inputLength > 1024)
+    for (std::size_t i = 0; i < inputLength; ++i) {
         output[i] = (input[i] > 0) ? input[i] : 0;
     }
 }
 
 template <class I, class GI, class GO>
-void ReLUImpl_cpu_backward_kernel(const std::size_t inputLenght,
+void ReLUImpl_cpu_backward_kernel(const std::size_t inputLength,
                                   const void* input_, const void* grad_output_,
 				  void* grad_input_) {
     const I* input = static_cast<const I*>(input_);
     const GO* grad_output = static_cast<const GO*>(grad_output_);
     GI* grad_input = static_cast<GI*>(grad_input_);
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+    for (std::size_t i = 0; i < inputLength; ++i) {
         grad_input[i] = (input[i] > 0) ? grad_output[i] : 0;
     }
 }
diff --git a/include/aidge/backend/cpu/operator/RoundImpl_kernels.hpp b/include/aidge/backend/cpu/operator/RoundImpl_kernels.hpp
index ba9c63bc..7ac4319b 100644
--- a/include/aidge/backend/cpu/operator/RoundImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/RoundImpl_kernels.hpp
@@ -21,14 +21,14 @@
 
 namespace Aidge {
 template <class I, class O>
-void RoundImpl_cpu_forward_kernel(const std::size_t inputLenght,
+void RoundImpl_cpu_forward_kernel(const std::size_t inputLength,
                                      const void* input_,
                                      void* output_) {
 
     const I* input = static_cast<const I*>(input_);
     O* output = static_cast<O*>(output_);
 
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+    for (std::size_t i = 0; i < inputLength; ++i) {
         //std::round would not work since it doesn't follow the halves rules (See ONNX Round)
         output[i] = static_cast<O>(std::nearbyint(static_cast<float>(input[i])));
     }
diff --git a/include/aidge/backend/cpu/operator/ScalingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ScalingImpl_kernels.hpp
index c758c9cf..f9ca00b7 100644
--- a/include/aidge/backend/cpu/operator/ScalingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ScalingImpl_kernels.hpp
@@ -76,14 +76,14 @@ template <class I, class O>
 void ScalingImpl_cpu_forward_kernel(const float scalingFactor,
                                     const std::size_t quantizedNbBits,
                                     const bool isOutputUnsigned,
-                                    std::size_t inputLenght,
+                                    std::size_t inputLength,
                                     const void* input_,
                                     void* output_) {
 
     const I* input = static_cast<const I*>(input_);
     O* output = static_cast<O*>(output_);
 
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+    for (std::size_t i = 0; i < inputLength; ++i) {
         output[i] = static_cast<O>(input[i] * static_cast<I>(scalingFactor));
 
         if(quantizedNbBits > 0) {
diff --git a/include/aidge/backend/cpu/operator/SigmoidImpl_kernels.hpp b/include/aidge/backend/cpu/operator/SigmoidImpl_kernels.hpp
index dfd71ce0..83ad4575 100644
--- a/include/aidge/backend/cpu/operator/SigmoidImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/SigmoidImpl_kernels.hpp
@@ -18,15 +18,15 @@
 
 namespace Aidge {
 template <class I, class O>
-void SigmoidImpl_cpu_forward_kernel(std::size_t inputLenght,
+void SigmoidImpl_cpu_forward_kernel(std::size_t inputLength,
                                     const void* input_,
                                     void* output_) {
 
     const I* input = static_cast<const I*>(input_);
     O* output = static_cast<O*>(output_);
 
-//#pragma omp parallel for if (inputLenght > 1024)
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+//#pragma omp parallel for if (inputLength > 1024)
+    for (std::size_t i = 0; i < inputLength; ++i) {
 		if (input[i] > I(0)) {
 			output[i] = O(1) / (O(1) + std::exp(-input[i]));
 		} else {
@@ -36,13 +36,13 @@ void SigmoidImpl_cpu_forward_kernel(std::size_t inputLenght,
 }
 
 template <class O, class GI, class GO>
-void SigmoidImpl_cpu_backward_kernel(const std::size_t inputLenght,
+void SigmoidImpl_cpu_backward_kernel(const std::size_t inputLength,
                                      const void* output_, const void* grad_output_,
 				     void* grad_input_) {
     const O* output = static_cast<const O*>(output_);
     const GO* grad_output = static_cast<const GO*>(grad_output_);
     GI* grad_input = static_cast<GI*>(grad_input_);
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+    for (std::size_t i = 0; i < inputLength; ++i) {
         grad_input[i] = output[i] * (O(1) - output[i]) * grad_output[i];
     }
 }
diff --git a/include/aidge/backend/cpu/operator/SqrtImpl_kernels.hpp b/include/aidge/backend/cpu/operator/SqrtImpl_kernels.hpp
index 0464119c..1ce1ef9b 100644
--- a/include/aidge/backend/cpu/operator/SqrtImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/SqrtImpl_kernels.hpp
@@ -21,27 +21,27 @@
 
 namespace Aidge {
 template <class I, class O>
-void SqrtImpl_cpu_forward_kernel(const std::size_t inputLenght,
+void SqrtImpl_cpu_forward_kernel(const std::size_t inputLength,
                                      const void* input_,
                                      void* output_) {
 
     const I* input = static_cast<const I*>(input_);
     O* output = static_cast<O*>(output_);
 
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+    for (std::size_t i = 0; i < inputLength; ++i) {
         output[i] = static_cast<O>(std::sqrt(static_cast<float>(input[i])));
     }
 }
 
 template <class I, class O>
-void SqrtImpl_cpu_backward_kernel(const std::size_t inputLenght,
+void SqrtImpl_cpu_backward_kernel(const std::size_t inputLength,
                                      const void* input_,
                                      void* output_) {
 
     const I* input = static_cast<const I*>(input_);
     O* output = static_cast<O*>(output_);
 
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+    for (std::size_t i = 0; i < inputLength; ++i) {
         output[i] = static_cast<O>(0.5/(std::sqrt(static_cast<float>(input[i]))));
     }
 }
diff --git a/include/aidge/backend/cpu/operator/TanhImpl_kernels.hpp b/include/aidge/backend/cpu/operator/TanhImpl_kernels.hpp
index fdcac210..49cfe9cb 100644
--- a/include/aidge/backend/cpu/operator/TanhImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/TanhImpl_kernels.hpp
@@ -18,27 +18,27 @@
 
 namespace Aidge {
 template <class I, class O>
-void TanhImpl_cpu_forward_kernel(std::size_t inputLenght,
+void TanhImpl_cpu_forward_kernel(std::size_t inputLength,
                                      const void* input_,
                                      void* output_) {
 
     const I* input = static_cast<const I*>(input_);
     O* output = static_cast<O*>(output_);
 
-//#pragma omp parallel for if (inputLenght > 1024)
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+//#pragma omp parallel for if (inputLength > 1024)
+    for (std::size_t i = 0; i < inputLength; ++i) {
         output[i] = std::tanh(input[i]);
     }
 }
 
 template <class O, class GI, class GO>
-void TanhImpl_cpu_backward_kernel(const std::size_t inputLenght,
+void TanhImpl_cpu_backward_kernel(const std::size_t inputLength,
                                   const void* output_, const void* grad_output_,
 			          void* grad_input_) {
     const O* output = static_cast<const O*>(output_);
     const GO* grad_output = static_cast<const GO*>(grad_output_);
     GI* grad_input = static_cast<GI*>(grad_input_);
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+    for (std::size_t i = 0; i < inputLength; ++i) {
         grad_input[i] = (O(1) - output[i] * output[i]) * grad_output[i];
     }
 }
-- 
GitLab


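For reference, the element-wise backward kernels renamed in the patch above implement the textbook derivatives (x the forward input, y the forward output, dL/dy the incoming gradient):

\[
\frac{\partial L}{\partial x} =
\begin{cases}
\frac{\partial L}{\partial y}\cdot\dfrac{1}{1+x^{2}} & \text{Atan}\\[4pt]
\frac{\partial L}{\partial y}\cdot y\,(1-y) & \text{Sigmoid},\ y=\sigma(x)\\[4pt]
\frac{\partial L}{\partial y}\cdot(1-y^{2}) & \text{Tanh},\ y=\tanh(x)\\[4pt]
\frac{\partial L}{\partial y}\cdot\dfrac{1}{x} & \text{Ln},\ x>\varepsilon\\[4pt]
\frac{\partial L}{\partial y}\cdot\mathbf{1}[x>0] & \text{ReLU}
\end{cases}
\]

The Sqrt backward kernel is the one exception in form: it returns the derivative itself, d(sqrt(x))/dx = 1/(2*sqrt(x)), rather than a product with the incoming gradient.
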
From 06c6f8b120b9cbc772ae367ea9827a0e7f8bf040 Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Sun, 16 Feb 2025 16:42:03 +0100
Subject: [PATCH 14/22] Fixed missing include

---
 unit_tests/operator/Test_MetaOperator.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/unit_tests/operator/Test_MetaOperator.cpp b/unit_tests/operator/Test_MetaOperator.cpp
index 4fe39630..adc548b9 100644
--- a/unit_tests/operator/Test_MetaOperator.cpp
+++ b/unit_tests/operator/Test_MetaOperator.cpp
@@ -18,6 +18,7 @@
 
 #include "aidge/backend/cpu/operator/ConvImpl.hpp"
 #include "aidge/backend/cpu/operator/PadImpl.hpp"
+#include "aidge/backend/cpu/operator/TanhImpl.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/filler/Filler.hpp"
 #include "aidge/operator/Conv.hpp"
-- 
GitLab


From 79e60036680239d2ee9d41c5e56bb6742f740585 Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Sun, 16 Feb 2025 17:53:50 +0100
Subject: [PATCH 15/22] Working concept

---
 CMakeLists.txt                                | 12 ++++
 include/aidge/backend/cpu.hpp                 |  2 +
 .../backend/cpu/operator/CryptoHashImpl.hpp   | 36 +++++++++++
 .../cpu/operator/CryptoHashImpl_kernels.hpp   | 52 ++++++++++++++++
 .../backend/cpu/operator/ModImpl_kernels.hpp  |  3 +
 src/operator/CryptoHashImpl.cpp               | 46 +++++++++++++++
 unit_tests/operator/Test_CryptoHash.cpp       | 56 ++++++++++++++++++
 unit_tests/scheduler/Test_Scheduler.cpp       | 59 +++++++++++++++++++
 8 files changed, 266 insertions(+)
 create mode 100644 include/aidge/backend/cpu/operator/CryptoHashImpl.hpp
 create mode 100644 include/aidge/backend/cpu/operator/CryptoHashImpl_kernels.hpp
 create mode 100644 src/operator/CryptoHashImpl.cpp
 create mode 100644 unit_tests/operator/Test_CryptoHash.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 66ef8ff2..2d4bc8ec 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -64,6 +64,14 @@ if(NOT $ENV{AIDGE_INSTALL} STREQUAL "")
 endif()
 find_package(aidge_core REQUIRED)
 
+find_package(OpenSSL QUIET)
+if(OpenSSL_FOUND)
+    message(STATUS "OpenSSL found: ${OPENSSL_VERSION}")
+    add_definitions(-DWITH_OPENSSL)
+else()
+    message(WARNING "OpenSSL not found, SHA256 will not be available.")
+endif()
+
 ##############################################
 # Create target and set properties
 file(GLOB_RECURSE src_files "src/*.cpp")
@@ -112,6 +120,10 @@ target_include_directories(${module_name}
         ${CMAKE_CURRENT_SOURCE_DIR}/src
 )
 
+if(OpenSSL_FOUND)
+    target_link_libraries(${module_name} PRIVATE OpenSSL::SSL OpenSSL::Crypto)
+endif()
+
 target_compile_features(${module_name} PRIVATE cxx_std_14)
 
 target_compile_options(${module_name} PRIVATE
diff --git a/include/aidge/backend/cpu.hpp b/include/aidge/backend/cpu.hpp
index ffc03ae5..80574b4a 100644
--- a/include/aidge/backend/cpu.hpp
+++ b/include/aidge/backend/cpu.hpp
@@ -28,6 +28,7 @@
 #include "aidge/backend/cpu/operator/ConvDepthWiseImpl.hpp"
 #include "aidge/backend/cpu/operator/ConvImpl.hpp"
 #include "aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp"
+#include "aidge/backend/cpu/operator/CryptoHashImpl.hpp"
 #include "aidge/backend/cpu/operator/DivImpl.hpp"
 #include "aidge/backend/cpu/operator/EqualImpl.hpp"
 #include "aidge/backend/cpu/operator/ErfImpl.hpp"
@@ -40,6 +41,7 @@
 #include "aidge/backend/cpu/operator/LeakyReLUImpl.hpp"
 #include "aidge/backend/cpu/operator/LnImpl.hpp"
 #include "aidge/backend/cpu/operator/MatMulImpl.hpp"
+#include "aidge/backend/cpu/operator/ModImpl.hpp"
 #include "aidge/backend/cpu/operator/MulImpl.hpp"
 #include "aidge/backend/cpu/operator/PadImpl.hpp"
 #include "aidge/backend/cpu/operator/PaddedConvImpl.hpp"
diff --git a/include/aidge/backend/cpu/operator/CryptoHashImpl.hpp b/include/aidge/backend/cpu/operator/CryptoHashImpl.hpp
new file mode 100644
index 00000000..d7f07f99
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/CryptoHashImpl.hpp
@@ -0,0 +1,36 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_CRYPTOHASHIMPL_H_
+#define AIDGE_CPU_OPERATOR_CRYPTOHASHIMPL_H_
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/CryptoHash.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+#include <memory>
+#include <vector>
+
+#ifdef WITH_OPENSSL
+#include <openssl/sha.h>
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+using CryptoHashImpl_cpu = OperatorImpl_cpu<CryptoHash_Op,
+    void(const std::size_t, const void*, void*)>;
+
+// Implementation entry point registration to Operator
+REGISTRAR(CryptoHash_Op, "cpu", Aidge::CryptoHashImpl_cpu::create);
+}  // namespace Aidge
+#endif
+
+#endif /* AIDGE_CPU_OPERATOR_CRYPTOHASHIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/CryptoHashImpl_kernels.hpp b/include/aidge/backend/cpu/operator/CryptoHashImpl_kernels.hpp
new file mode 100644
index 00000000..cd596b69
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/CryptoHashImpl_kernels.hpp
@@ -0,0 +1,52 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_CRYPTOHASHIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_CRYPTOHASHIMPL_KERNELS_H_
+
+#include "aidge/utils/Registrar.hpp"
+
+#include "aidge/backend/cpu/operator/CryptoHashImpl.hpp"
+
+#ifdef WITH_OPENSSL
+namespace Aidge {
+template <class I, class O>
+void CryptoHashImpl_cpu_forward_kernel(std::size_t inputLength,
+                                     const void* input_,
+                                     void* output_) {
+
+    const I* input = static_cast<const I*>(input_);
+    O* output = static_cast<O*>(output_);
+
+    // output must be at least SHA256_DIGEST_LENGTH bytes long
+    SHA256(reinterpret_cast<const uint8_t*>(input), inputLength * sizeof(I), reinterpret_cast<uint8_t*>(output));
+}
+
+// Kernels registration to implementation entry point
+REGISTRAR(CryptoHashImpl_cpu,
+    {{DataType::UInt8, DataFormat::Any}, {DataType::UInt8}},
+    {ProdConso::inPlaceModel, Aidge::CryptoHashImpl_cpu_forward_kernel<uint8_t, uint8_t>, nullptr});
+REGISTRAR(CryptoHashImpl_cpu,
+    {{DataType::UInt8, DataFormat::Any}, {DataType::UInt64}},
+    {ProdConso::inPlaceModel, Aidge::CryptoHashImpl_cpu_forward_kernel<uint8_t, uint64_t>, nullptr});
+REGISTRAR(CryptoHashImpl_cpu,
+    {{DataType::Float32, DataFormat::Any}, {DataType::UInt8}},
+    {ProdConso::inPlaceModel, Aidge::CryptoHashImpl_cpu_forward_kernel<float, uint8_t>, nullptr});
+REGISTRAR(CryptoHashImpl_cpu,
+    {{DataType::Float32, DataFormat::Any}, {DataType::UInt64}},
+    {ProdConso::inPlaceModel, Aidge::CryptoHashImpl_cpu_forward_kernel<float, uint64_t>, nullptr});
+REGISTRAR(CryptoHashImpl_cpu,
+    {{DataType::Float64, DataFormat::Any}, {DataType::UInt8}},
+    {ProdConso::inPlaceModel, Aidge::CryptoHashImpl_cpu_forward_kernel<double, uint8_t>, nullptr});
+}  // namespace Aidge
+#endif
+
+#endif /* AIDGE_CPU_OPERATOR_CRYPTOHASHIMPL_KERNELS_H_ */
diff --git a/include/aidge/backend/cpu/operator/ModImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ModImpl_kernels.hpp
index 940fa482..15d18bf4 100644
--- a/include/aidge/backend/cpu/operator/ModImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ModImpl_kernels.hpp
@@ -72,6 +72,9 @@ REGISTRAR(ModImpl_cpu,
 REGISTRAR(ModImpl_cpu,
     {DataType::Int32},
     {ProdConso::inPlaceModel, Aidge::ModImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>, nullptr});
+REGISTRAR(ModImpl_cpu,
+    {DataType::UInt64},
+    {ProdConso::inPlaceModel, Aidge::ModImpl_cpu_forward_kernel<std::uint64_t, std::uint64_t, std::uint64_t>, nullptr});
 }  // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_MODIMPL_KERNELS_H_ */
diff --git a/src/operator/CryptoHashImpl.cpp b/src/operator/CryptoHashImpl.cpp
new file mode 100644
index 00000000..10d82dd0
--- /dev/null
+++ b/src/operator/CryptoHashImpl.cpp
@@ -0,0 +1,46 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <cassert>
+#include <chrono>  // std::chrono::milliseconds
+#include <numeric> // std::accumulate
+#include <thread>  // std::this_thread::sleep_for
+#include <vector>
+
+#include "aidge/operator/CryptoHash.hpp"
+#include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+
+#include "aidge/backend/cpu/operator/CryptoHashImpl.hpp"
+#include "aidge/backend/cpu/operator/CryptoHashImpl_kernels.hpp"
+
+#ifdef WITH_OPENSSL
+template <>
+void Aidge::CryptoHashImpl_cpu::forward() {
+    const CryptoHash_Op& op_ = dynamic_cast<const CryptoHash_Op&>(mOp);
+    std::shared_ptr<Tensor> in0 = op_.getInput(0);
+    std::shared_ptr<Tensor> out0 = op_.getOutput(0);
+    AIDGE_ASSERT(in0, "missing input #0");
+
+    // Find the correct kernel type
+    const auto impl = Registrar<CryptoHashImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Call kernel
+    impl.forward(in0->size(),
+        getCPUPtr(mOp.getRawInput(0)),
+        getCPUPtr(mOp.getRawOutput(0)));
+}
+
+template <>
+void Aidge::CryptoHashImpl_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not available for CryptoHash_Op");
+}
+#endif
diff --git a/unit_tests/operator/Test_CryptoHash.cpp b/unit_tests/operator/Test_CryptoHash.cpp
new file mode 100644
index 00000000..7453ea19
--- /dev/null
+++ b/unit_tests/operator/Test_CryptoHash.cpp
@@ -0,0 +1,56 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <cmath>    // std::abs
+#include <cstddef>  // std::size_t
+#include <memory>
+
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/backend/cpu/operator/CryptoHashImpl.hpp"
+#include "aidge/data/Data.hpp"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/graph/Node.hpp"
+#include "aidge/operator/CryptoHash.hpp"
+#include "aidge/utils/ArrayHelpers.hpp"
+
+using namespace Aidge;
+
+#ifdef WITH_OPENSSL
+TEST_CASE("[cpu/operator] CryptoHash(forward)") {
+  SECTION("1D Tensor") {
+    std::shared_ptr<Tensor> input0 =
+        std::make_shared<Tensor>(Array1D<uint8_t, 5>{
+            {'a', 'b', 'c', 'd', 'e'}});
+    std::shared_ptr<Tensor> expectedOutput =
+        std::make_shared<Tensor>(Array1D<uint8_t, 32>{
+            {0x36, 0xbb, 0xe5, 0x0e, 0xd9, 0x68, 0x41, 0xd1,
+             0x04, 0x43, 0xbc, 0xb6, 0x70, 0xd6, 0x55, 0x4f,
+             0x0a, 0x34, 0xb7, 0x61, 0xbe, 0x67, 0xec, 0x9c,
+             0x4a, 0x8a, 0xd2, 0xc0, 0xc4, 0x4c, 0xa4, 0x2c}});
+
+    std::shared_ptr<Node> myCryptoHash = CryptoHash();
+    auto op = std::static_pointer_cast<CryptoHash_Op>(myCryptoHash->getOperator());
+    op->associateInput(0, input0);
+    op->setDataType(DataType::UInt8);
+    op->setBackend("cpu");
+    myCryptoHash->forward();
+
+    REQUIRE(op->getOutput(0)->size() == 32);
+
+    uint8_t* resPtr = static_cast<uint8_t*>(op->getOutput(0)->getImpl()->rawPtr());
+    uint8_t* expectedPtr = static_cast<uint8_t*>(expectedOutput->getImpl()->rawPtr());
+    for (std::size_t i = 0; i < expectedOutput->size(); ++i) {
+      REQUIRE(resPtr[i] == expectedPtr[i]);
+    }
+  }
+}
+#endif
diff --git a/unit_tests/scheduler/Test_Scheduler.cpp b/unit_tests/scheduler/Test_Scheduler.cpp
index 956169c3..5bd86eec 100644
--- a/unit_tests/scheduler/Test_Scheduler.cpp
+++ b/unit_tests/scheduler/Test_Scheduler.cpp
@@ -21,6 +21,10 @@
 #include "aidge/operator/Pop.hpp"
 #include "aidge/operator/Stack.hpp"
 #include "aidge/operator/Identity.hpp"
+#include "aidge/operator/CryptoHash.hpp"
+#include "aidge/operator/Mod.hpp"
+#include "aidge/operator/Tanh.hpp"
+#include "aidge/operator/Select.hpp"
 #include "aidge/operator/MetaOperator.hpp"
 #include "aidge/scheduler/SequentialScheduler.hpp"
 #include "aidge/scheduler/ParallelScheduler.hpp"
@@ -30,6 +34,9 @@
 #include "aidge/backend/cpu/operator/ReLUImpl.hpp"
 #include "aidge/backend/cpu/operator/SqrtImpl.hpp"
 #include "aidge/backend/cpu/operator/AddImpl.hpp"
+#include "aidge/backend/cpu/operator/CryptoHashImpl.hpp"
+#include "aidge/backend/cpu/operator/ModImpl.hpp"
+#include "aidge/backend/cpu/operator/TanhImpl.hpp"
 
 #include "aidge/recipes/GraphViewHelper.hpp"
 
@@ -512,4 +519,56 @@ TEST_CASE("[cpu/scheduler] Accumulate", "[scheduler]") {
     std::shared_ptr<Tensor> output = std::static_pointer_cast<OperatorTensor>(pop_o->getOperator())->getOutput(0);
     REQUIRE(*output == *expectedOutput);
 }
+
+#ifdef WITH_OPENSSL
+TEST_CASE("[cpu/scheduler] Select", "[scheduler]") {
+    std::shared_ptr<Tensor> in = std::make_shared<Tensor>(
+            Array2D<float, 2, 3>{{{1, 2, 3}, {4, 5, 6}}});
+
+    std::shared_ptr<GraphView> g = Sequential({
+        Producer(in, "input"),
+        Parallel({
+            Sequential({
+                CryptoHash("hash"),
+                Mod("mod")
+            }),
+            ReLU("relu"),
+            Tanh("tanh"),
+            Sqrt("sqrt")
+        }),
+        Select(3, "select")
+    });
+
+    auto modProd = Producer(std::make_shared<Tensor>(Array1D<uint64_t, 1>{{3}}));
+    modProd->addChild(g->getNode("mod"), 0, 1);
+    g->add(modProd);
+
+    g->getNode("hash")->getOperator()->setDataType(DataType::UInt64);
+    g->getNode("mod")->getOperator()->setDataType(DataType::UInt64);
+    g->setBackend("cpu");
+    g->save("select");
+
+    auto scheduler = SequentialScheduler(g);
+    scheduler.generateScheduling();
+    scheduler.saveStaticSchedulingDiagram("select_scheduling");
+    REQUIRE_NOTHROW(scheduler.forward(true));
+
+    g->save("select_forwarded");
+
+    auto expectedOutputHash = std::make_shared<Tensor>(
+        Array1D<uint64_t, 4>{{0x1b7cf58dfe2dae24, 0x3bac903def4ce580, 0x5f5a347389d97f41, 0x2c2dc759abc6b61}});
+    auto outputHash = std::static_pointer_cast<OperatorTensor>(g->getNode("hash")->getOperator())->getOutput(0);
+    REQUIRE(*outputHash == *expectedOutputHash);
+
+    auto expectedOutputMod = std::make_shared<Tensor>(
+        Array1D<uint64_t, 4>{{2, 1, 1, 2}});
+    auto outputMod = std::static_pointer_cast<OperatorTensor>(g->getNode("mod")->getOperator())->getOutput(0);
+    REQUIRE(*outputMod == *expectedOutputMod);
+
+    auto expectedOutput = std::make_shared<Tensor>(
+        Array2D<float, 2, 3>{{{std::sqrt(1), std::sqrt(2), std::sqrt(3)}, {std::sqrt(4), std::sqrt(5), std::sqrt(6)}}});
+    auto output = std::static_pointer_cast<OperatorTensor>(g->getNode("select")->getOperator())->getOutput(0);
+    REQUIRE(*output == *expectedOutput);
+}
+#endif
 } // namespace Aidge
-- 
GitLab


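The CryptoHash kernel introduced above delegates to OpenSSL's one-shot SHA256() and writes the 32-byte digest straight into the output tensor; with a UInt64 output the digest is read back as four native-endian 64-bit words, which the Select test above then reduces to a branch index via Mod. A minimal sketch of that data flow (compile with -lcrypto); the selector arithmetic mirrors the test and is not a dedicated Aidge API:

// sha_demo.cpp -- illustrative only; assumes OpenSSL headers are available.
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <openssl/sha.h>

int main() {
    const unsigned char input[5] = {'a', 'b', 'c', 'd', 'e'};
    unsigned char digest[SHA256_DIGEST_LENGTH];  // 32 bytes
    SHA256(input, sizeof(input), digest);

    // Matches the Test_CryptoHash expected vector: 36 bb e5 0e d9 68 41 d1 ...
    for (int i = 0; i < SHA256_DIGEST_LENGTH; ++i) {
        std::printf("%02x", digest[i]);
    }
    std::printf("\n");

    // With a UInt64 output tensor, the digest becomes four native-endian
    // 64-bit words; Mod then maps a word to a branch index for Select.
    std::uint64_t words[4];
    std::memcpy(words, digest, sizeof(words));
    std::printf("selector = %llu\n",
                static_cast<unsigned long long>(words[0] % 3));
    return 0;
}
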
From 652eb2810fd6a0a0360f8881cd3e9b41343d8340 Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Sun, 16 Feb 2025 23:46:45 +0100
Subject: [PATCH 16/22] Working concept with tagConditionalNodes()

---
 unit_tests/scheduler/Test_Scheduler.cpp | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/unit_tests/scheduler/Test_Scheduler.cpp b/unit_tests/scheduler/Test_Scheduler.cpp
index 5bd86eec..54e57ec4 100644
--- a/unit_tests/scheduler/Test_Scheduler.cpp
+++ b/unit_tests/scheduler/Test_Scheduler.cpp
@@ -569,6 +569,24 @@ TEST_CASE("[cpu/scheduler] Select", "[scheduler]") {
         Array2D<float, 2, 3>{{{std::sqrt(1), std::sqrt(2), std::sqrt(3)}, {std::sqrt(4), std::sqrt(5), std::sqrt(6)}}});
     auto output = std::static_pointer_cast<OperatorTensor>(g->getNode("select")->getOperator())->getOutput(0);
     REQUIRE(*output == *expectedOutput);
+
+    scheduler.resetScheduling();
+    scheduler.tagConditionalNodes();
+
+    REQUIRE(g->getNode("relu")->attributes()->hasAttr("schedule.cond"));
+    REQUIRE(g->getNode("relu")->attributes()->getAttr<std::set<std::pair<NodePtr, size_t>>>("schedule.cond")
+        == std::set<std::pair<NodePtr, size_t>>{{g->getNode("select"), 0}});
+    REQUIRE(g->getNode("tanh")->attributes()->hasAttr("schedule.cond"));
+    REQUIRE(g->getNode("tanh")->attributes()->getAttr<std::set<std::pair<NodePtr, size_t>>>("schedule.cond")
+        == std::set<std::pair<NodePtr, size_t>>{{g->getNode("select"), 1}});
+    REQUIRE(g->getNode("sqrt")->attributes()->hasAttr("schedule.cond"));
+    REQUIRE(g->getNode("sqrt")->attributes()->getAttr<std::set<std::pair<NodePtr, size_t>>>("schedule.cond")
+        == std::set<std::pair<NodePtr, size_t>>{{g->getNode("select"), 2}});
+    REQUIRE(!g->getNode("input")->attributes()->hasAttr("schedule.cond"));
+
+    scheduler.generateScheduling();
+    scheduler.saveStaticSchedulingDiagram("select_scheduling_tag");
+    REQUIRE_NOTHROW(scheduler.forward(true));
 }
 #endif
 } // namespace Aidge
-- 
GitLab


From e13b5fa521c5f979602e41236ece5795eaed8635 Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Thu, 20 Feb 2025 09:09:30 +0100
Subject: [PATCH 17/22] Export OpenSSL dependency

---
 CMakeLists.txt                    | 2 ++
 aidge_backend_cpu-config.cmake.in | 4 ++++
 2 files changed, 6 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2d4bc8ec..729853ee 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -120,8 +120,10 @@ target_include_directories(${module_name}
         ${CMAKE_CURRENT_SOURCE_DIR}/src
 )
 
+set(AIDGE_REQUIRES_OPENSSL FALSE)
 if(OpenSSL_FOUND)
     target_link_libraries(${module_name} PRIVATE OpenSSL::SSL OpenSSL::Crypto)
+    set(AIDGE_REQUIRES_OPENSSL TRUE)
 endif()
 
 target_compile_features(${module_name} PRIVATE cxx_std_14)
diff --git a/aidge_backend_cpu-config.cmake.in b/aidge_backend_cpu-config.cmake.in
index d8e1372b..7582102c 100644
--- a/aidge_backend_cpu-config.cmake.in
+++ b/aidge_backend_cpu-config.cmake.in
@@ -2,6 +2,10 @@
 
 include(CMakeFindDependencyMacro)
 find_dependency(aidge_core)
+set(AIDGE_REQUIRES_OPENSSL @AIDGE_REQUIRES_OPENSSL@)
+if (AIDGE_REQUIRES_OPENSSL)
+    find_dependency(OpenSSL)
+endif()
 
 include(CMakeFindDependencyMacro)
 
-- 
GitLab


From f53364a0301fc933ec2e48d6c7f5488a76470d77 Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Fri, 21 Feb 2025 14:49:00 +0100
Subject: [PATCH 18/22] Renaming

---
 aidge_backend_cpu/unit_tests/test_scheduler.py | 8 ++++----
 unit_tests/operator/Test_MetaOperator.cpp      | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/aidge_backend_cpu/unit_tests/test_scheduler.py b/aidge_backend_cpu/unit_tests/test_scheduler.py
index 494f3456..b60ff3f0 100644
--- a/aidge_backend_cpu/unit_tests/test_scheduler.py
+++ b/aidge_backend_cpu/unit_tests/test_scheduler.py
@@ -57,9 +57,9 @@ class test_scheduler(unittest.TestCase):
         scheduler = aidge_core.SequentialScheduler(graph_view)
         scheduler.generate_scheduling()
 
-        self.assertEqual(len(scheduler.get_static_scheduling()), 10)
+        self.assertEqual(len(scheduler.get_sequential_static_scheduling()), 10)
         # Do not care about the order of execution of the producers
-        self.assertListEqual([i.name() for i in scheduler.get_static_scheduling()[-3:]], EXPECTED_SCHEDULE)
+        self.assertListEqual([i.name() for i in scheduler.get_sequential_static_scheduling()[-3:]], EXPECTED_SCHEDULE)
 
 
     def test_parallel_scheduling(self):
@@ -83,9 +83,9 @@ class test_scheduler(unittest.TestCase):
         scheduler = aidge_core.SequentialScheduler(graph_view)
         scheduler.generate_scheduling()
 
-        self.assertEqual(len(scheduler.get_static_scheduling()), 11)
+        self.assertEqual(len(scheduler.get_sequential_static_scheduling()), 11)
         # Do not care about the order of execution of the producers
-        self.assertTrue([i.name() for i in scheduler.get_static_scheduling()[-4:]] in EXPECTED_SCHEDULE)
+        self.assertTrue([i.name() for i in scheduler.get_sequential_static_scheduling()[-4:]] in EXPECTED_SCHEDULE)
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/unit_tests/operator/Test_MetaOperator.cpp b/unit_tests/operator/Test_MetaOperator.cpp
index adc548b9..bb9027d3 100644
--- a/unit_tests/operator/Test_MetaOperator.cpp
+++ b/unit_tests/operator/Test_MetaOperator.cpp
@@ -279,9 +279,9 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
         REQUIRE(op->getNbConsumedData(1).data == 32768);
         REQUIRE(op->getNbProducedData(0).data == 34816);
         REQUIRE(op->getNbProducedData(1).data == 34816);
-        REQUIRE(microGraphScheduler->getStaticScheduling(0).size() == 26);
-        REQUIRE(microGraphScheduler->getStaticScheduling(1).size() == 24);
-        REQUIRE(microGraphScheduler->getStaticScheduling(15).size() == 24);
+        REQUIRE(microGraphScheduler->getSequentialStaticScheduling(0).size() == 26);
+        REQUIRE(microGraphScheduler->getSequentialStaticScheduling(1).size() == 24);
+        REQUIRE(microGraphScheduler->getSequentialStaticScheduling(15).size() == 24);
     }
 
     SECTION("LSTM(forward_values)") {
-- 
GitLab


From 47bb2b3f69b7642f5d22b62d81559347ebd4b6ff Mon Sep 17 00:00:00 2001
From: NAUD Maxence <maxence.naud@cea.fr>
Date: Mon, 24 Feb 2025 13:49:17 +0000
Subject: [PATCH 19/22] Fix some imports following aidge_core update

---
 src/operator/PadImpl.cpp       | 6 +++---
 src/operator/ReduceSumImpl.cpp | 7 +++++--
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/operator/PadImpl.cpp b/src/operator/PadImpl.cpp
index cdae21f8..9a54437f 100644
--- a/src/operator/PadImpl.cpp
+++ b/src/operator/PadImpl.cpp
@@ -9,14 +9,14 @@
  *
  ********************************************************************************/
 
+#include <cstddef>
 #include <vector>
 
-#include "aidge/utils/Types.h"
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
-#include "aidge/operator/Conv.hpp"
-
 #include "aidge/backend/cpu/operator/PadImpl.hpp"
 #include "aidge/backend/cpu/operator/PadImpl_kernels.hpp"
+#include "aidge/operator/Pad.hpp"
+#include "aidge/utils/Types.h"
 
 Aidge::Elts_t Aidge::Pad_ProdConso_cpu::getNbRequiredProtected(Aidge::IOIndex_t inputIdx) const {
     AIDGE_ASSERT(inputIdx == 0, "input index out of range."
diff --git a/src/operator/ReduceSumImpl.cpp b/src/operator/ReduceSumImpl.cpp
index aad08018..93a89a34 100644
--- a/src/operator/ReduceSumImpl.cpp
+++ b/src/operator/ReduceSumImpl.cpp
@@ -12,11 +12,14 @@
 #include "aidge/backend/cpu/operator/ReduceSumImpl.hpp"
 
 #include <memory>
+#include <stdexcept>
 #include <vector>
 
-#include "aidge/utils/Types.h"
-#include "aidge/operator/ReduceSum.hpp"
 #include "aidge/backend/cpu/operator/ReduceSumImpl_kernels.hpp"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/operator/ReduceSum.hpp"
+#include "aidge/utils/ErrorHandling.hpp"
+#include "aidge/utils/Types.h"
 
 template <>
 void Aidge::ReduceSumImpl_cpu::forward() {
-- 
GitLab


From 3f4cd6e77aae54e674ab3f9aec0e0675cbd6860d Mon Sep 17 00:00:00 2001
From: Jerome Hue <jerome.hue@cea.fr>
Date: Thu, 6 Feb 2025 10:49:15 +0100
Subject: [PATCH 20/22] Implement backward function of Add operator

---
 .../aidge/backend/cpu/operator/AddImpl.hpp    |  14 +-
 .../backend/cpu/operator/AddImpl_kernels.hpp  |  64 +++-
 src/operator/AddImpl.cpp                      |  25 +-
 unit_tests/operator/Test_AddImpl.cpp          | 275 +++++++++++++++++-
 4 files changed, 368 insertions(+), 10 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/AddImpl.hpp b/include/aidge/backend/cpu/operator/AddImpl.hpp
index e39c35b4..ca04dff9 100644
--- a/include/aidge/backend/cpu/operator/AddImpl.hpp
+++ b/include/aidge/backend/cpu/operator/AddImpl.hpp
@@ -25,7 +25,19 @@
 namespace Aidge {
 // Operator implementation entry point for the backend
 using AddImpl_cpu = OperatorImpl_cpu<Add_Op,
-    void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*)>;
+    void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*),
+    void(const std::size_t, 
+         const std::size_t, 
+         const std::size_t, 
+         const std::vector<std::size_t>&, 
+         const std::vector<std::size_t>&, 
+         const std::vector<std::size_t>&, 
+         const void*, 
+         const void*, 
+         const void*, 
+         void*, 
+         void*)
+>;
 
 // Implementation entry point registration to Operator
 REGISTRAR(Add_Op, "cpu", Aidge::AddImpl_cpu::create);
diff --git a/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp
index e6d13fcf..d6fff9b5 100644
--- a/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp
@@ -147,25 +147,75 @@ void AddImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
     }
 }
 
+template <class I, class O>
+void AddImpl_cpu_backward_kernel(const std::size_t input0Length,
+                               const std::size_t input1Length,
+                               const std::size_t gradOutputLength,
+                               const std::vector<std::size_t>& dims0,
+                               const std::vector<std::size_t>& dims1,
+                               const std::vector<std::size_t>& outputDims,
+                               const void* input0_,
+                               const void* input1_,
+                               const void* grad_output_,
+                               void* gradientInput0_,
+                               void* gradientInput1_)
+{
+    // TODO: Remove input0/1 from the function
+    const I* input0 = static_cast<const I*>(input0_);
+    const I* input1 = static_cast<const I*>(input1_);
+    const O* gradOutput = static_cast<const O*>(grad_output_);
+    auto* gradInput0 = static_cast<I*>(gradientInput0_);
+    auto* gradInput1 = static_cast<I*>(gradientInput1_);
+
+    std::fill_n(gradInput0, input0Length, static_cast<I>(0));
+    std::fill_n(gradInput1, input1Length, static_cast<I>(0));
+
+    auto broadcastedDims0 = getBroadcastedDims(outputDims, dims0);
+    auto broadcastedDims1 = getBroadcastedDims(outputDims, dims1);
+
+    for (std::size_t i = 0; i < gradOutputLength; ++i) {
+        auto idxOutputGrad = getMultiDimIndices(outputDims, i);
+        std::vector<std::size_t> idxInput0(broadcastedDims0.size());
+        std::vector<std::size_t> idxInput1(broadcastedDims1.size());
+
+        for (std::size_t dimension = 0; dimension < broadcastedDims0.size(); ++dimension) {
+            idxInput0[dimension] = (broadcastedDims0[dimension] == 1) ? 0 : idxOutputGrad[dimension];
+        }
+
+        for (std::size_t dimension = 0; dimension < broadcastedDims1.size(); ++dimension) {
+            idxInput1[dimension] = (broadcastedDims1[dimension] == 1) ? 0 : idxOutputGrad[dimension];
+        }
+
+        auto idx0 = getFlattenedIndex(broadcastedDims0, idxInput0);
+        auto idx1 = getFlattenedIndex(broadcastedDims1, idxInput1);
+
+        // For addition: gradient of both inputs is just the output gradient
+        // (unlike multiplication where we need to multiply by the other input,
+        // or subtraction where we need to negate one of them)
+        gradInput0[idx0] += static_cast<I>(gradOutput[i]);
+        gradInput1[idx1] += static_cast<I>(gradOutput[i]);
+    }
+}
+
 // Kernels registration to implementation entry point
 REGISTRAR(AddImpl_cpu,
     {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float32}},
-    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<float, float>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<float, float>, Aidge::AddImpl_cpu_backward_kernel<float, float>});
 REGISTRAR(AddImpl_cpu,
     {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}},
-    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<double, double>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<double, double>, Aidge::AddImpl_cpu_backward_kernel<double, double>});
 REGISTRAR(AddImpl_cpu,
     {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int8}},
-    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int8_t, std::int8_t>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int8_t, std::int8_t>, Aidge::AddImpl_cpu_backward_kernel<std::int8_t, std::int8_t>});
 REGISTRAR(AddImpl_cpu,
     {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::UInt8}},
-    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::uint8_t, std::uint8_t>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::uint8_t, std::uint8_t>, Aidge::AddImpl_cpu_backward_kernel<std::uint8_t, std::uint8_t>});
 REGISTRAR(AddImpl_cpu,
     {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}},
-    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, Aidge::AddImpl_cpu_backward_kernel<std::int32_t, std::int32_t>});
 REGISTRAR(AddImpl_cpu,
     {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int64}},
-    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, Aidge::AddImpl_cpu_backward_kernel<std::int64_t, std::int64_t>});
 }  // namespace Aidge
 
-#endif /* AIDGE_CPU_OPERATOR_ADDIMPL_CPU_KERNELS_H_ */
\ No newline at end of file
+#endif /* AIDGE_CPU_OPERATOR_ADDIMPL_CPU_KERNELS_H_ */
diff --git a/src/operator/AddImpl.cpp b/src/operator/AddImpl.cpp
index 101743ec..b027fb87 100644
--- a/src/operator/AddImpl.cpp
+++ b/src/operator/AddImpl.cpp
@@ -55,5 +55,28 @@ void  Aidge::AddImpl_cpu::forward() {
 
 template <>
 void Aidge::AddImpl_cpu::backward() {
-    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Add_Op on backend cpu");
+    const Add_Op& op_ = dynamic_cast<const Add_Op&>(mOp);
+
+    auto in0 = op_.getInput(0);
+    auto in1 = op_.getInput(1);
+    auto in0grad = op_.getInput(0)->grad();
+    auto in1grad = op_.getInput(1)->grad();
+    auto out0grad = op_.getOutput(0)->grad();
+
+    // Find the correct kernel type
+    const auto impl = Registrar<AddImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Call kernel
+    impl.backward(in0grad->size(),
+               in1grad->size(),
+               out0grad->size(),
+               in0->dims(),
+               in1->dims(),
+               out0grad->dims(),
+               getCPUPtr(in0),
+               getCPUPtr(in1),
+               getCPUPtr(out0grad),
+               getCPUPtr(in0grad),
+               getCPUPtr(in1grad));
+
 }
diff --git a/unit_tests/operator/Test_AddImpl.cpp b/unit_tests/operator/Test_AddImpl.cpp
index bff9629b..4538b322 100644
--- a/unit_tests/operator/Test_AddImpl.cpp
+++ b/unit_tests/operator/Test_AddImpl.cpp
@@ -10,6 +10,7 @@
  ********************************************************************************/
 
 #include <memory>
+#include <random>
 
 #include <catch2/catch_test_macros.hpp>
 
@@ -19,6 +20,7 @@
 #include "aidge/graph/Node.hpp"
 #include "aidge/operator/Add.hpp"
 #include "aidge/utils/ArrayHelpers.hpp"
+#include "aidge/utils/TensorUtils.hpp"
 
 using namespace Aidge;
 
@@ -139,4 +141,275 @@ TEST_CASE("[cpu/operator] Add(forward)", "[Add][CPU]") {
         Log::info("Expected Add_1 Tensor:\n{}", expectedOutput);
         REQUIRE(*op_1->getOutput(0) == expectedOutput);
     }
-}
\ No newline at end of file
+}
+
+TEST_CASE("[cpu/operator] Add(backward)", "[Add][CPU]") {
+    std::shared_ptr<Add_Op> op = std::make_shared<Add_Op>();
+    op->setDataType(DataType::Float32);
+    op->setBackend("cpu");
+
+    // NOTE: The first four tests use fixed values; the last one uses random values with static dimensions.
+
+    SECTION("Case 1: 1D and 2D Tensors") {
+        const auto T0 = std::make_shared<Tensor>(
+            Array2D<cpptype_t<DataType::Float32>, 2, 3>({{{1, 2, 3}, {4, 5, 6}}}));
+
+        const auto T1 =
+            std::make_shared<Tensor>(Array1D<cpptype_t<DataType::Float32>, 3>({0.1, 0.2, 0.3}));
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>(
+            Array2D<float, 2, 3>({{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}})));
+        op->forwardDims();
+
+        op->backward();
+
+        const Tensor expectedGrad0 =
+            Array2D<cpptype_t<DataType::Float32>, 2, 3>({{{1, 1, 1}, {1, 1, 1}}});
+
+        const Tensor expectedGrad1 = Array1D<cpptype_t<DataType::Float32>, 3>({2, 2, 2});
+
+
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1));
+    }
+
+    SECTION("Case 2: 3D and 1D tensors") {
+        const auto T0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+            {{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}},
+              {{7.0, 8.0, 9.0}, {10.0, 11.0, 12.0}}}}));
+
+        const auto T1 =
+            std::make_shared<Tensor>(Array1D<float, 3>({0.3, 0.2, 0.1}));
+
+        const auto newGrad = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+            {{{{1, 1, 1}, {1, 1, 1}}, {{1, 1, 1}, {1, 1, 1}}}}));
+
+        const Tensor expectedGrad0 =
+            Array3D<float, 2, 2, 3>({{{{1, 1, 1}, {1, 1, 1}},
+                                      {{1, 1, 1}, {1, 1, 1}}}});
+
+        const Tensor expectedGrad1 = Array1D<cpptype_t<DataType::Float32>, 3>({4, 4, 4});
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(newGrad);
+        op->forwardDims();
+        op->backward();
+
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1));
+    }
+
+    SECTION("Case 3: 4D and 2D tensors") {
+        const auto T0 = std::make_shared<Tensor>(Array4D<cpptype_t<DataType::Float32>, 2, 2, 3, 3>(
+            {{{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}, {7.0, 8.0, 9.0}},
+               {{10.0, 11.0, 12.0}, {13.0, 14.0, 15.0}, {16.0, 17.0, 18.0}}},
+              {{{19.0, 20.0, 21.0}, {22.0, 23.0, 24.0}, {25.0, 26.0, 27.0}},
+               {{28.0, 29.0, 30.0},
+                {31.0, 32.0, 33.0},
+                {34.0, 35.0, 36.0}}}}}));
+
+        const auto T1 = std::make_shared<Tensor>(Array2D<cpptype_t<DataType::Float32>, 3, 3>(
+            {{{0.5, 0.3, 0.1}, {0.4, 0.2, 0.6}, {0.7, 0.8, 0.9}}}));
+
+        const auto newGrad =
+            std::make_shared<Tensor>(Array4D<cpptype_t<DataType::Float32>, 2, 2, 3, 3>(
+                {{{{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}},
+                   {{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}},
+                  {{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}},
+                   {{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}}}}));
+
+        const Tensor expectedGrad0 =
+            Array4D<cpptype_t<DataType::Float32>, 2, 2, 3, 3>(
+                {{{{{1, 1, 1}, {1, 1, 1}, {1, 1, 1}},
+                   {{1, 1, 1}, {1, 1, 1}, {1, 1, 1}}},
+                  {{{1, 1, 1}, {1, 1, 1}, {1, 1, 1}},
+                   {{1, 1, 1}, {1, 1, 1}, {1, 1, 1}}}}});
+
+        const Tensor expectedGrad1 =
+            Array2D<cpptype_t<DataType::Float32>, 3, 3>({{
+                                   {4.0, 4.0, 4.0},
+                                   {4.0, 4.0, 4.0},
+                                   {4.0, 4.0, 4.0}}});
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(newGrad);
+        op->forwardDims();
+
+        op->backward();
+
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1));
+    }
+
+    SECTION("Case 4: 3D and 2D tensors") {
+        const auto T0 = std::make_shared<Tensor>(
+            Array3D<float, 2, 3, 4>({{{
+                                          {1.0, 2.0, 3.0, 4.0},
+                                          {5.0, 6.0, 7.0, 8.0},
+                                          {9.0, 10.0, 11.0, 12.0},
+                                      },
+                                      {
+                                          {13.0, 14.0, 15.0, 16.0},
+                                          {17.0, 18.0, 19.0, 20.0},
+                                          {21.0, 22.0, 23.0, 24.0},
+                                      }}}));
+
+        const auto T1 = std::make_shared<Tensor>(
+            Array2D<cpptype_t<DataType::Float32>, 3, 4>({{{0.1, 0.2, 0.3, 0.4},
+                                   {0.5, 0.6, 0.7, 0.8},
+                                   {0.9, 1.0, 1.1, 1.2}}}));
+
+        const auto newGrad = std::make_shared<Tensor>(
+            Array3D<cpptype_t<DataType::Float32>, 2, 3, 4>({{{
+                                          {1.0, 1.0, 1.0, 1.0},
+                                          {1.0, 1.0, 1.0, 1.0},
+                                          {1.0, 1.0, 1.0, 1.0},
+                                      },
+                                      {
+                                          {1.0, 1.0, 1.0, 1.0},
+                                          {1.0, 1.0, 1.0, 1.0},
+                                          {1.0, 1.0, 1.0, 1.0},
+                                      }}}));
+
+        const Tensor expectedGrad0 =
+            Array3D<cpptype_t<DataType::Float32>, 2, 3, 4>({{{{1, 1, 1, 1},
+                                       {1, 1, 1, 1},
+                                       {1, 1, 1, 1}},
+                                      {{1, 1, 1, 1},
+                                       {1, 1, 1, 1},
+                                       {1, 1, 1, 1}}}});
+
+        const Tensor expectedGrad1 =
+            Array2D<cpptype_t<DataType::Float32>, 3, 4>({{{2.0, 2.0, 2.0, 2.0},
+                                   {2.0, 2.0, 2.0, 2.0},
+                                   {2.0, 2.0, 2.0, 2.0}}});
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(newGrad);
+        op->forwardDims();
+
+        op->backward();
+
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1));
+    }
+
+    SECTION("Case 5: Tensors with random values") {
+
+        // Use random values
+        const std::vector<std::size_t> dims0 = {5, 2, 1, 7}; // First tensor
+        const std::vector<std::size_t> dims1 = {2, 6, 7};    // Second tensor
+        const std::vector<std::size_t> outputDims = {5, 2, 6, 7};
+
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_real_distribution<float> dist(0.1f, 1.0f);
+
+        auto T0 = std::make_shared<Tensor>(dims0);
+        T0->setDataType(DataType::Float32);
+        T0->setBackend("cpu");
+        float* input0Data = static_cast<float*>(T0->getImpl()->rawPtr());
+        // Fill with random values
+        for (std::size_t i = 0; i < T0->size(); ++i) {
+            input0Data[i] = dist(gen);
+        }
+
+        auto T1 = std::make_shared<Tensor>(dims1);
+        T1->setDataType(DataType::Float32);
+        T1->setBackend("cpu");
+        float* input1Data = static_cast<float*>(T1->getImpl()->rawPtr());
+        // Fill with random values
+        for (std::size_t i = 0; i < T1->size(); ++i) {
+            input1Data[i] = dist(gen);
+        }
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+
+        op->forwardDims();
+        op->forward();
+
+        Tensor expectedOutput{outputDims};
+        expectedOutput.setBackend("cpu");
+        float* expectedOutputData = static_cast<float*>(expectedOutput.getImpl()->rawPtr());
+
+        for (std::size_t n = 0; n < 5; ++n) {
+            for (std::size_t c = 0; c < 2; ++c) {
+                for (std::size_t h = 0; h < 6; ++h) {
+                    for (std::size_t w = 0; w < 7; ++w) {
+                        std::size_t outIdx = w + 7 * (h + 6 * (c + 2 * n));
+                        std::size_t in0Idx =
+                            w + 7 * (0 + 1 * (c + 2 * n)); // middle dim is 1
+                        std::size_t in1Idx =
+                            w + 7 * (h + 6 * c);           // no n dimension
+
+                        expectedOutputData[outIdx] = input0Data[in0Idx] + input1Data[in1Idx];
+                    }
+                }
+            }
+        }
+
+        auto outputTensor = op->getOutput(0);
+
+        REQUIRE(approxEq<float>(*outputTensor, expectedOutput));
+
+        // Backward pass
+        std::vector<float> gradOutputData(expectedOutput.size());
+        for (auto &val : gradOutputData) {
+            val = dist(gen);
+        }
+
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>());
+        op->getOutput(0)->grad()->resize(outputDims);
+        op->getOutput(0)->grad()->getImpl()->setRawPtr(gradOutputData.data(),
+                                                       expectedOutput.size());
+
+        // Compute reference gradients
+        std::vector<float> expectedGrad0(T0->size(), 0.0f);
+        std::vector<float> expectedGrad1(T1->size(), 0.0f);
+
+        for (std::size_t n = 0; n < 5; ++n) {
+            for (std::size_t c = 0; c < 2; ++c) {
+                for (std::size_t h = 0; h < 6; ++h) {
+                    for (std::size_t w = 0; w < 7; ++w) {
+                        std::size_t outIdx = w + 7 * (h + 6 * (c + 2 * n));
+                        std::size_t in0Idx = w + 7 * (0 + 1 * (c + 2 * n));
+                        std::size_t in1Idx = w + 7 * (h + 6 * c);
+
+                        // Gradient for input0: accumulate grad_output over
+                        // its broadcast dim (h, where input0 has size 1)
+                        expectedGrad0[in0Idx] += gradOutputData[outIdx];
+
+                        // Gradient for input1: accumulate grad_output over
+                        // the leading dim n, which input1 lacks
+                        expectedGrad1[in1Idx] += gradOutputData[outIdx];
+                    }
+                }
+            }
+        }
+
+        // Perform backward pass
+        op->backward();
+
+        auto expectedGrad0Tensor = std::make_shared<Tensor>();
+        expectedGrad0Tensor->resize(T0->dims());
+        expectedGrad0Tensor->setBackend("cpu");
+        expectedGrad0Tensor->setDataType(DataType::Float32);
+        expectedGrad0Tensor->getImpl()->setRawPtr(expectedGrad0.data(),
+                                                    expectedGrad0.size());
+
+        auto expectedGrad1Tensor = std::make_shared<Tensor>(T1->dims());
+        expectedGrad1Tensor->setBackend("cpu");
+        expectedGrad1Tensor->setDataType(DataType::Float32);
+        expectedGrad1Tensor->getImpl()->setRawPtr(expectedGrad1.data(),
+                                                    expectedGrad1.size());
+
+        // Verify backward pass
+        REQUIRE(approxEq<float>(*T0->grad(), *expectedGrad0Tensor));
+        REQUIRE(approxEq<float>(*T1->grad(), *expectedGrad1Tensor));
+    }
+}
+
-- 
GitLab


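Note on the index arithmetic hard-coded in Case 5 of the test above: it is the
standard row-major broadcast mapping, where a dimension of size 1 contributes
no stride and its coordinate is clamped to 0. A minimal stand-alone sketch
(broadcastOffset is a hypothetical helper, not part of the patch; input dims
are assumed left-padded with 1s to the output rank):

    #include <cstddef>
    #include <vector>

    // Flat row-major offset into a possibly-broadcast input, given an output
    // coordinate. For inDims {5, 2, 1, 7} and coordinate (n, c, h, w) this
    // yields w + 7 * (0 + 1 * (c + 2 * n)), i.e. the in0Idx used in the test.
    std::size_t broadcastOffset(const std::vector<std::size_t>& inDims,
                                const std::vector<std::size_t>& outCoord) {
        std::size_t offset = 0;
        for (std::size_t d = 0; d < inDims.size(); ++d) {
            // A size-1 input dim is broadcast: its coordinate stays at 0.
            const std::size_t coord = (inDims[d] == 1) ? 0 : outCoord[d];
            offset = offset * inDims[d] + coord;
        }
        return offset;
    }

The backward reference loops use the same mapping in reverse: accumulating
grad_output through this offset sums the gradient over every output position a
broadcast element fed, which is also why expectedGrad1 in Case 4 is uniformly 2.
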
From 393fb207a6599cdfbbbe141e3cb29a3a5cae8246 Mon Sep 17 00:00:00 2001
From: NAUD Maxence <maxence.naud@cea.fr>
Date: Wed, 26 Feb 2025 14:48:17 +0000
Subject: [PATCH 21/22] [upd] ConstantOfShape kernel to take Tensors as inputs
 and avoid redundant size computation

---
 .../cpu/operator/ConstantOfShapeImpl.hpp        |  8 +++-----
 .../operator/ConstantOfShapeImpl_kernels.hpp    | 17 ++++-------------
 src/operator/ConstantOfShapeImpl.cpp            |  9 +++------
 3 files changed, 10 insertions(+), 24 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp b/include/aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp
index 83e7e030..b595ec93 100644
--- a/include/aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp
+++ b/include/aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp
@@ -12,23 +12,21 @@
 #ifndef AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_H_
 #define AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_H_
 
-#include <cstddef>
 #include <memory>
-#include <vector>
 
 #include "aidge/backend/cpu/operator/OperatorImpl.hpp"
 #include "aidge/operator/ConstantOfShape.hpp"
 #include "aidge/utils/Registrar.hpp"
-#include "aidge/utils/Types.h"
 
 namespace Aidge {
+
+class Tensor;
 // Operator implementation entry point for the backend
 using ConstantOfShapeImpl_cpu = OperatorImpl_cpu<ConstantOfShape_Op,
-    void(const std::vector<DimSize_t>, const Tensor&, void *)>;
+    void(const std::shared_ptr<Tensor>&, const Tensor&)>;
 
 // Implementation entry point registration to Operator
 REGISTRAR(ConstantOfShape_Op, "cpu", Aidge::ConstantOfShapeImpl_cpu::create);
 } // namespace Aidge
 
 #endif /* _AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_H_ */
-
diff --git a/include/aidge/backend/cpu/operator/ConstantOfShapeImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConstantOfShapeImpl_kernels.hpp
index 18ab9c0a..c42cc76a 100644
--- a/include/aidge/backend/cpu/operator/ConstantOfShapeImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ConstantOfShapeImpl_kernels.hpp
@@ -30,20 +30,11 @@
 namespace Aidge {
 template <class O>
 void ConstantOfShapeimpl_cpu_forward_kernel(
-    const std::vector<DimSize_t> output_dims, const Tensor &value,
-    void *output_) {
+    const std::shared_ptr<Tensor>& output_, const Tensor &value) {
 
-  O *output = static_cast<O *>(output_);
-  O val;
-  std::copy(static_cast<O *>(value.getImpl()->hostPtr()),
-            static_cast<O *>(value.getImpl()->hostPtr()) +
-                static_cast<NbElts_t>(1),
-            &val);
-  const size_t output_size = std::accumulate(
-      output_dims.begin(), output_dims.end(), 1, std::multiplies<DimSize_t>());
-  for (size_t i = 0; i < output_size; ++i) {
-    output[i] = val;
-  }
+  O* output = static_cast<O*>(output_->getImpl()->hostPtr());
+  const O val = *reinterpret_cast<O*>(value.getImpl()->hostPtr());
+  std::fill_n(output, output_->size(), val);
 }
 
 // Kernels registration to implementation entry point
diff --git a/src/operator/ConstantOfShapeImpl.cpp b/src/operator/ConstantOfShapeImpl.cpp
index 16e4b762..1d41160b 100644
--- a/src/operator/ConstantOfShapeImpl.cpp
+++ b/src/operator/ConstantOfShapeImpl.cpp
@@ -13,15 +13,14 @@
 
 #include <functional>
 #include <memory>
-#include <vector>
+#include <stdexcept>   // std::runtime_error
 
 #include "aidge/backend/cpu/operator/ConstantOfShapeImpl_kernels.hpp"
-#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/ConstantOfShape.hpp"
+#include "aidge/backend/OperatorImpl.hpp"  // Aidge::getBestMatch, Aidge::getRequiredSpec
 #include "aidge/utils/ErrorHandling.hpp"
 #include "aidge/utils/Registrar.hpp"
-#include "aidge/utils/Types.h"
 
 template <>
 void Aidge::ConstantOfShapeImpl_cpu::forward() {
@@ -33,9 +32,7 @@ void Aidge::ConstantOfShapeImpl_cpu::forward() {
     const auto impl = Registrar<ConstantOfShapeImpl_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Call kernel
-    impl.forward(op_.getOutput(0)->dims(),
-             op_.value(), 
-             op_.getOutput(0)->getImpl()->rawPtr());
+    impl.forward(op_.getOutput(0), op_.value());
 }
 
 template <>
-- 
GitLab


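The kernel rewrite in the patch above collapses the copy-one-value-then-loop
pattern, and the std::accumulate size recomputation, into a single std::fill_n
over the tensor's already-known size. The same pattern in isolation (plain
pointers, no Aidge types; fillConstant is an illustrative name):

    #include <algorithm>  // std::fill_n
    #include <cstddef>

    // Read the scalar once, then fill the whole output buffer in one call.
    template <class O>
    void fillConstant(O* output, std::size_t outputSize, const void* value) {
        const O val = *static_cast<const O*>(value);
        std::fill_n(output, outputSize, val);
    }

Passing the output as a Tensor also lets the kernel use Tensor::size()
directly instead of re-multiplying the dimensions on every call.
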
From 9d9647aa0f91f637c5cd063b78b8a68075c2294e Mon Sep 17 00:00:00 2001
From: NAUD Maxence <maxence.naud@cea.fr>
Date: Wed, 26 Feb 2025 14:51:38 +0000
Subject: [PATCH 22/22] [upd] tests following 'aidge_core' changes

---
 .../operator/Test_ConstantOfShapeImpl.cpp     | 139 +++++++++---------
 .../recipies/Test_FoldConstantOfShape.cpp     |  50 +++++++
 2 files changed, 119 insertions(+), 70 deletions(-)
 create mode 100644 unit_tests/recipies/Test_FoldConstantOfShape.cpp

diff --git a/unit_tests/operator/Test_ConstantOfShapeImpl.cpp b/unit_tests/operator/Test_ConstantOfShapeImpl.cpp
index 8ec1669b..6833d836 100644
--- a/unit_tests/operator/Test_ConstantOfShapeImpl.cpp
+++ b/unit_tests/operator/Test_ConstantOfShapeImpl.cpp
@@ -27,89 +27,88 @@
 #include "aidge/data/Tensor.hpp"
 #include "aidge/filler/Filler.hpp"
 #include "aidge/operator/ConstantOfShape.hpp"
-#include "aidge/operator/OperatorTensor.hpp"
 #include "aidge/utils/TensorUtils.hpp"
 #include "aidge/utils/Types.h"
 
 namespace Aidge {
-TEST_CASE("[cpu/operator] ConstantOfShape", "[ConstantOfShape][CPU]") {
-  constexpr std::uint16_t NBTRIALS = 10;
-  // Create a random number generator
-  auto random_seed = Catch::Generators::Detail::getSeed;
-  std::mt19937 gen(random_seed());
-  std::uniform_real_distribution<float> valueDist(
-      0.1f, 1.1f); // Random float distribution between 0 and 1
-  std::uniform_int_distribution<DimSize_t> input_tensor_size_dist(
-      std::size_t(1), std::size_t(10));
-  std::uniform_int_distribution<int64_t> input_tensor_values_dist(
-      std::size_t(1), std::size_t(7));
-  std::uniform_real_distribution<double> operator_attr_value_dist(-100., 100.);
 
-  ///////////////////////////////////////////////
-  // SETUP FUNCTIONS
-  auto generate_input_tensor =
-      [&gen, &input_tensor_size_dist,
-       &input_tensor_values_dist]() -> std::shared_ptr<Tensor> {
-    std::vector<DimSize_t> input_dims;
-    input_dims.push_back(input_tensor_size_dist(gen));
+TEST_CASE("[cpu/operator] ConstantOfShape(forward)", "[ConstantOfShape][CPU][forward]") {
+    constexpr std::uint16_t NBTRIALS = 10;
+    // Create a random number generator
+    auto random_seed = Catch::Generators::Detail::getSeed;
+    std::mt19937 gen(random_seed());
+    std::uniform_real_distribution<float> valueDist(
+            0.1f, 1.1f); // Random float distribution between 0 and 1
+    std::uniform_int_distribution<DimSize_t> input_tensor_size_dist(
+            std::size_t(1), std::size_t(10));
+    std::uniform_int_distribution<int64_t> input_tensor_values_dist(
+            std::size_t(1), std::size_t(7));
+    std::uniform_real_distribution<double> operator_attr_value_dist(-100., 100.);
 
-    auto result = std::make_shared<Tensor>(input_dims);
-    result->setDataType(DataType::Int64);
-    result->setBackend("cpu");
-    for (DimSize_t i = 0; i < result->size(); ++i) {
-      result->set<std::int64_t>(i, input_tensor_values_dist(gen));
-    }
-    return result;
-  };
+    ///////////////////////////////////////////////
+    // SETUP FUNCTIONS
+    auto generate_input_tensor =
+            [&gen, &input_tensor_size_dist,
+             &input_tensor_values_dist]() -> std::shared_ptr<Tensor> {
+        std::vector<DimSize_t> input_dims;
+        input_dims.push_back(input_tensor_size_dist(gen));
 
-  auto generate_random_operator =
-      [&gen,
-       &operator_attr_value_dist]() -> std::shared_ptr<ConstantOfShape_Op> {
-    auto node = ConstantOfShape(Tensor(operator_attr_value_dist(gen)));
-    auto op = std::static_pointer_cast<ConstantOfShape_Op>(node->getOperator());
-    op->setDataType(DataType::Float64);
-    op->setBackend("cpu");
-    return op;
-  };
+        auto result = std::make_shared<Tensor>(input_dims);
+        result->setDataType(DataType::Int64);
+        result->setBackend("cpu");
+        for (DimSize_t i = 0; i < result->size(); ++i) {
+            result->set<std::int64_t>(i, input_tensor_values_dist(gen));
+        }
+        return result;
+    };
 
-  auto generate_output_tensor = [](std::shared_ptr<Tensor> input_tensor,
-                                   std::shared_ptr<ConstantOfShape_Op> op) {
-    std::vector<DimSize_t> output_dims;
-    output_dims.reserve(input_tensor->size());
-    for (DimSize_t i = 0; i < input_tensor->size(); ++i) {
-      output_dims.push_back(input_tensor->get<int64_t>(i));
-    }
-    auto result = std::make_shared<Tensor>(output_dims);
-    result->setDataType(op->value().dataType());
-    result->setBackend("cpu");
-    constantFiller(result, op->value().get<double>(0));
-    return result;
-  };
+    auto generate_random_operator =
+            [&gen,
+             &operator_attr_value_dist]() -> std::shared_ptr<ConstantOfShape_Op> {
+        std::shared_ptr<ConstantOfShape_Op> op = std::make_shared<ConstantOfShape_Op>(Tensor(operator_attr_value_dist(gen)));
+        op->setDataType(DataType::Float64);
+        op->setBackend("cpu");
+        return op;
+    };
+
+    auto generate_output_tensor = [](std::shared_ptr<Tensor> input_tensor,
+                                      std::shared_ptr<ConstantOfShape_Op> op) {
+        std::vector<DimSize_t> output_dims;
+        output_dims.reserve(input_tensor->size());
+        for (DimSize_t i = 0; i < input_tensor->size(); ++i) {
+            output_dims.push_back(input_tensor->get<std::int64_t>(i));
+        }
+        auto result = std::make_shared<Tensor>(output_dims);
+        result->setDataType(op->value().dataType());
+        result->setBackend("cpu");
+        constantFiller(result, op->value().get<double>(0));
+        return result;
+    };
 
-  /////////////////////////////////////
-  // BENCHMARKING
-  std::chrono::time_point<std::chrono::system_clock> start;
-  std::chrono::time_point<std::chrono::system_clock> end;
-  std::chrono::duration<double, std::micro> duration{};
-  int number_of_operation{0};
+    /////////////////////////////////////
+    // BENCHMARKING
+    std::chrono::time_point<std::chrono::system_clock> start;
+    std::chrono::time_point<std::chrono::system_clock> end;
+    std::chrono::duration<double, std::micro> duration{};
+    int number_of_operation{0};
 
-  SECTION("ConstantOfShapeImpl_cpu::forward()") {
-    for (int i = 0; i < NBTRIALS; ++i) {
-      auto input_T = generate_input_tensor();
-      std::shared_ptr<ConstantOfShape_Op> op = generate_random_operator();
-      auto output_T = generate_output_tensor(input_T, op);
-      op->associateInput(0, input_T);
+    SECTION("ConstantOfShapeImpl_cpu::forward()") {
+        for (int i = 0; i < NBTRIALS; ++i) {
+            auto input_T = generate_input_tensor();
+            std::shared_ptr<ConstantOfShape_Op> op = generate_random_operator();
+            auto output_T = generate_output_tensor(input_T, op);
+            op->associateInput(0, input_T);
 
-      REQUIRE(op->forwardDims(true));
-      REQUIRE_NOTHROW(op->forward());
+            REQUIRE(op->forwardDims(true));
+            REQUIRE_NOTHROW(op->forward());
 
-      CHECK(output_T->nbDims() == op->getOutput(0)->nbDims());
-      for (DimIdx_t i = 0; i < output_T->nbDims(); ++i) {
-        CHECK(output_T->dims().at(i) == op->getOutput(0)->dims().at(i));
-      }
-      CHECK(approxEq<double>(*output_T, *op->getOutput(0)));
+            CHECK(output_T->nbDims() == op->getOutput(0)->nbDims());
+            for (DimIdx_t i = 0; i < output_T->nbDims(); ++i) {
+                CHECK(output_T->dims().at(i) == op->getOutput(0)->dims().at(i));
+            }
+            CHECK(approxEq<double>(*output_T, *op->getOutput(0)));
+        }
     }
-  }
 }
 } // namespace Aidge
 
diff --git a/unit_tests/recipies/Test_FoldConstantOfShape.cpp b/unit_tests/recipies/Test_FoldConstantOfShape.cpp
new file mode 100644
index 00000000..a1c09b15
--- /dev/null
+++ b/unit_tests/recipies/Test_FoldConstantOfShape.cpp
@@ -0,0 +1,50 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+ #include "aidge/graph/GraphView.hpp"
+ #include "aidge/operator/Identity.hpp"
+ #include "aidge/recipes/Recipes.hpp"
+
+ #include <cstdint>  // std::int64_t
+ #include <memory>
+
+ #include <catch2/catch_test_macros.hpp>
+
+ #include "aidge/graph/OpArgs.hpp"
+ #include "aidge/operator/ConstantOfShape.hpp"
+ #include "aidge/operator/Conv.hpp"
+ #include "aidge/operator/Producer.hpp"
+ #include "aidge/operator/ReLU.hpp"
+ #include "aidge/recipes/Recipes.hpp"
+ #include "aidge/utils/ArrayHelpers.hpp"
+ #include "aidge/utils/Types.h"
+
+ namespace Aidge {
+
+ TEST_CASE("[cpu/recipes] foldConstantOfShape",
+           "[ConstantOfShape][foldConstantOfShape][recipes]") {
+   auto input_T = std::make_shared<Tensor>(Array1D<std::int64_t, 4>({1, 1, 3, 3}));
+
+   auto model = std::make_shared<GraphView>();
+   SECTION("Sequential model") {
+     model = Sequential({
+         Producer(input_T, "prod_0", true),
+         ConstantOfShape(3, "constantOfShape_0"),
+         Conv(1, 1, {3, 3}, "Conv_0"),
+         ReLU("ReLU_1")
+     });
+     // aidge_backend_cpu loaded. Recipe should work
+     REQUIRE(foldConstantOfShape(model) == 1);
+     CHECK(model->forwardDims());
+   }
+ }
+
+ }  // namespace Aidge
-- 
GitLab
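
If the fold succeeds, the ConstantOfShape node in the test graph is replaced
by a Producer holding the pre-filled tensor. A sketch of that reference tensor
built by hand (assumptions: foldedReference is an illustrative helper, and the
output dtype follows the operator's value tensor, taken here as Float32):

    #include <memory>
    #include <vector>

    #include "aidge/data/Tensor.hpp"
    #include "aidge/filler/Filler.hpp"
    #include "aidge/utils/Types.h"

    using namespace Aidge;

    // Shape input {1, 1, 3, 3} with fill value 3 should fold into a
    // 1x1x3x3 tensor uniformly filled with 3.
    std::shared_ptr<Tensor> foldedReference() {
        auto folded = std::make_shared<Tensor>(
            std::vector<DimSize_t>{1, 1, 3, 3});
        folded->setDataType(DataType::Float32); // assumed dtype
        folded->setBackend("cpu");
        constantFiller(folded, 3.0f);
        return folded;
    }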