From 742b763f233004df607c59a65a5d881f41ab0f6d Mon Sep 17 00:00:00 2001
From: cmoineau <cyril.moineau@cea.fr>
Date: Mon, 10 Feb 2025 09:47:43 +0000
Subject: [PATCH 001/108] Add back MR
 https://gitlab.eclipse.org/eclipse/aidge/aidge_backend_cpu/-/merge_requests/131.

---
 unit_tests/operator/Test_MetaOperator.cpp | 765 ++++++++++++++++------
 1 file changed, 573 insertions(+), 192 deletions(-)

diff --git a/unit_tests/operator/Test_MetaOperator.cpp b/unit_tests/operator/Test_MetaOperator.cpp
index 271a1e2f..4fe39630 100644
--- a/unit_tests/operator/Test_MetaOperator.cpp
+++ b/unit_tests/operator/Test_MetaOperator.cpp
@@ -9,70 +9,79 @@
  *
  ********************************************************************************/
 
-#include <catch2/catch_test_macros.hpp>
 #include <cmath>
 #include <cstdlib>
 #include <memory>
+#include <random>
+
+#include <catch2/catch_test_macros.hpp>
 
-#include "aidge/utils/TensorUtils.hpp"
 #include "aidge/backend/cpu/operator/ConvImpl.hpp"
 #include "aidge/backend/cpu/operator/PadImpl.hpp"
 #include "aidge/data/Tensor.hpp"
+#include "aidge/filler/Filler.hpp"
 #include "aidge/operator/Conv.hpp"
+#include "aidge/operator/FC.hpp"
+#include "aidge/operator/Identity.hpp"
 #include "aidge/operator/MetaOperator.hpp"
 #include "aidge/operator/MetaOperatorDefs.hpp"
 #include "aidge/operator/Pad.hpp"
 #include "aidge/operator/Pop.hpp"
-#include "aidge/scheduler/SequentialScheduler.hpp"
+#include "aidge/operator/Stack.hpp"
 #include "aidge/scheduler/ParallelScheduler.hpp"
+#include "aidge/scheduler/SequentialScheduler.hpp"
+#include "aidge/utils/TensorUtils.hpp"
 
 using namespace Aidge;
 
 TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
-  SECTION("PaddedConv(forward)") {
-    std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(
-            Array4D<double, 4, 3, 3, 3>{{{{{6.20986394e-01, 1.19775136e-03, 7.22876095e-02},
-                                          {1.16492919e-01, 8.21634093e-02, 1.17413265e-01},
-                                          {2.23743494e-01, 3.99495413e-01, 5.55552411e-01}},
-                                         {{6.64970077e-01, 9.62199940e-01, 4.87531967e-01},
-                                          {6.12586558e-01, 8.09918671e-02, 8.40649383e-01},
-                                          {4.15264406e-01, 8.28247138e-01, 1.52301135e-01}},
-                                         {{1.76992844e-02, 7.78697112e-01, 8.14531592e-01},
-                                          {1.36960611e-01, 4.64806728e-01, 4.85150000e-01},
-                                          {4.34776520e-01, 9.51740977e-01, 9.05793799e-01}}},
-
-                                        {{{1.71925246e-02, 1.91082720e-01, 3.67982644e-01},
-                                          {1.56806559e-01, 6.22280998e-01, 3.15827594e-01},
-                                          {6.04359038e-01, 2.83095947e-01, 6.11168892e-01}},
-                                         {{2.76942832e-01, 1.89768419e-01, 8.07988176e-01},
-                                          {1.67925807e-01, 2.68356150e-01, 6.28875602e-01},
-                                          {1.69093357e-04, 9.64788636e-01, 7.29254981e-01}},
-                                         {{6.34030122e-01, 1.32087038e-01, 3.33857107e-01},
-                                          {7.63047502e-01, 5.12539506e-02, 9.77400493e-01},
-                                          {8.06151288e-01, 2.60237147e-01, 3.93729313e-01}}},
-
-                                        {{{5.84605240e-01, 4.74648725e-01, 8.54111741e-01},
-                                          {7.10897067e-02, 5.02579011e-01, 3.35236224e-01},
-                                          {9.08637408e-01, 8.02903830e-01, 2.83929907e-01}},
-                                         {{3.68206999e-01, 9.18579021e-02, 7.33168098e-01},
-                                          {1.59875539e-01, 9.13163381e-01, 3.59806060e-01},
-                                          {1.41295882e-01, 7.00312185e-01, 5.63728289e-01}},
-                                         {{9.39513546e-01, 1.91704891e-01, 1.11454944e-01},
-                                          {5.46298282e-01, 2.89698587e-01, 2.62612651e-01},
-                                          {1.18554992e-01, 4.32147376e-02, 7.53016994e-01}}},
-
-                                        {{{9.53179175e-01, 2.05041054e-02, 1.11318451e-01},
-                                          {8.67878485e-01, 2.93263422e-01, 8.03912714e-01},
-                                          {8.93620255e-01, 1.37831128e-01, 3.83640583e-01}},
-                                         {{3.96020188e-01, 6.24959320e-01, 1.90709175e-01},
-                                          {5.80538620e-01, 6.63031275e-01, 2.07247191e-01},
-                                          {5.65672171e-01, 5.57014317e-01, 9.26909496e-01}},
-                                         {{3.43901418e-01, 4.47741636e-01, 6.59249367e-01},
-                                          {7.34639028e-01, 2.84957200e-02, 9.70225217e-01},
-                                          {1.33578790e-02, 6.12054702e-01, 9.36685235e-02}}}}});
-    std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(
-            Array1D<double, 4>{{0.16884905, 0.27994487, 0.57227465, 0.06435205}});
-    std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array4D<double, 2, 3, 5, 5>{
+    SECTION("PaddedConv(forward)") {
+        std::shared_ptr<Tensor> myWeights =
+            std::make_shared<Tensor>(Array4D<double, 4, 3, 3, 3>{
+                {{{{6.20986394e-01, 1.19775136e-03, 7.22876095e-02},
+                   {1.16492919e-01, 8.21634093e-02, 1.17413265e-01},
+                   {2.23743494e-01, 3.99495413e-01, 5.55552411e-01}},
+                  {{6.64970077e-01, 9.62199940e-01, 4.87531967e-01},
+                   {6.12586558e-01, 8.09918671e-02, 8.40649383e-01},
+                   {4.15264406e-01, 8.28247138e-01, 1.52301135e-01}},
+                  {{1.76992844e-02, 7.78697112e-01, 8.14531592e-01},
+                   {1.36960611e-01, 4.64806728e-01, 4.85150000e-01},
+                   {4.34776520e-01, 9.51740977e-01, 9.05793799e-01}}},
+
+                 {{{1.71925246e-02, 1.91082720e-01, 3.67982644e-01},
+                   {1.56806559e-01, 6.22280998e-01, 3.15827594e-01},
+                   {6.04359038e-01, 2.83095947e-01, 6.11168892e-01}},
+                  {{2.76942832e-01, 1.89768419e-01, 8.07988176e-01},
+                   {1.67925807e-01, 2.68356150e-01, 6.28875602e-01},
+                   {1.69093357e-04, 9.64788636e-01, 7.29254981e-01}},
+                  {{6.34030122e-01, 1.32087038e-01, 3.33857107e-01},
+                   {7.63047502e-01, 5.12539506e-02, 9.77400493e-01},
+                   {8.06151288e-01, 2.60237147e-01, 3.93729313e-01}}},
+
+                 {{{5.84605240e-01, 4.74648725e-01, 8.54111741e-01},
+                   {7.10897067e-02, 5.02579011e-01, 3.35236224e-01},
+                   {9.08637408e-01, 8.02903830e-01, 2.83929907e-01}},
+                  {{3.68206999e-01, 9.18579021e-02, 7.33168098e-01},
+                   {1.59875539e-01, 9.13163381e-01, 3.59806060e-01},
+                   {1.41295882e-01, 7.00312185e-01, 5.63728289e-01}},
+                  {{9.39513546e-01, 1.91704891e-01, 1.11454944e-01},
+                   {5.46298282e-01, 2.89698587e-01, 2.62612651e-01},
+                   {1.18554992e-01, 4.32147376e-02, 7.53016994e-01}}},
+
+                 {{{9.53179175e-01, 2.05041054e-02, 1.11318451e-01},
+                   {8.67878485e-01, 2.93263422e-01, 8.03912714e-01},
+                   {8.93620255e-01, 1.37831128e-01, 3.83640583e-01}},
+                  {{3.96020188e-01, 6.24959320e-01, 1.90709175e-01},
+                   {5.80538620e-01, 6.63031275e-01, 2.07247191e-01},
+                   {5.65672171e-01, 5.57014317e-01, 9.26909496e-01}},
+                  {{3.43901418e-01, 4.47741636e-01, 6.59249367e-01},
+                   {7.34639028e-01, 2.84957200e-02, 9.70225217e-01},
+                   {1.33578790e-02, 6.12054702e-01, 9.36685235e-02}}}}});
+        std::shared_ptr<Tensor> myBias =
+            std::make_shared<Tensor>(Array1D<double, 4>{
+                {0.16884905, 0.27994487, 0.57227465, 0.06435205}});
+        std::shared_ptr<Tensor> myInput = std::make_shared<
+            Tensor>(Array4D<double, 2, 3, 5, 5>{
             // NCHW
             {{{{0.43224481, 0.9047832, 0.18402257, 0.06162838, 0.52490127},
                {0.27773404, 0.55402353, 0.9485062, 0.31197083, 0.80328607},
@@ -108,93 +117,107 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
                {0.95873236, 0.6742374, 0.55679676, 0.6323497, 0.34072958},
                {0.49694061, 0.79173045, 0.19738225, 0.14755281, 0.80818177},
                {0.02332061, 0.74270703, 0.59415632, 0.08195934, 0.46295434},
-               {0.71426058, 0.85032931, 0.90750818, 0.28768431, 0.4401146}}}}});
-
-    std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(
-            Array4D<double, 2, 4, 5, 5>{{{{{3.40294218, 3.74021220, 4.02050114, 4.07054710, 2.46286273},
-                {4.61770582, 6.70517588, 6.50356627, 6.29688787, 3.53332567},
-                {5.47480106, 5.92094421, 6.64605665, 7.95090199, 4.28721523},
-                {4.01485729, 6.06748962, 7.52447891, 7.37980652, 5.28401136},
-                {2.83065438, 3.62033439, 3.56222963, 5.56103945, 3.23335814}},
-
-                {{3.30230498, 4.92814112, 4.34710836, 3.96262765, 2.97987890},
-                {4.49693012, 6.68929291, 5.53603029, 5.68874264, 4.28756475},
-                {4.20528078, 6.82776880, 6.70569849, 7.12809610, 4.40845442},
-                {4.31169367, 6.73352146, 6.30962515, 7.45826864, 4.99164438},
-                {2.18136287, 4.28968000, 4.20080042, 4.89814138, 2.87394023}},
-
-                {{3.54787683, 4.35851812, 4.63881302, 4.23359537, 3.16992092},
-                {5.25099468, 7.54282856, 6.69849157, 5.64309788, 4.56919575},
-                {4.71914101, 7.52830601, 6.71450949, 7.81113863, 5.84658146},
-                {4.97893143, 7.39293909, 6.89905310, 8.14430809, 5.62998581},
-                {2.79735112, 4.80967140, 5.57630205, 5.38828325, 4.57078695}},
-
-                {{3.03048635, 5.04540300, 4.21824932, 4.87323284, 2.35113740},
-                {4.45167351, 6.47721338, 7.40922976, 6.70445728, 3.60700107},
-                {3.77927423, 6.82826376, 7.41777134, 7.57402420, 5.13131523},
-                {4.08747244, 7.07994175, 7.57206821, 8.51897335, 5.26987123},
-                {2.34426999, 4.60127831, 4.86486769, 6.01579571, 3.97803569}}},
-
-
-                {{{3.84700942, 4.25972605, 3.05269003, 3.78043652, 2.08771229},
-                {6.00459957, 6.05633259, 4.45951605, 4.54089880, 4.03066444},
-                {5.41579390, 7.29543972, 6.18680000, 5.58812714, 3.45964241},
-                {6.04531050, 7.70924091, 5.52207708, 5.02131319, 4.09403706},
-                {3.18092418, 4.45422697, 4.04294252, 3.86577177, 2.18776536}},
-
-                {{4.02600670, 4.27603531, 3.81011319, 4.03631020, 2.57254648},
-                {5.33471155, 5.72588634, 5.12079763, 5.11733150, 3.76836705},
-                {5.62947607, 5.92492962, 6.24170446, 6.44130468, 3.44276404},
-                {5.38414621, 6.02679539, 5.88985586, 5.90263271, 3.15044069},
-                {3.31261086, 4.44371319, 3.47660780, 4.15411520, 1.48961508}},
-
-                {{3.95879412, 4.17324543, 3.70114422, 3.27447152, 3.09713888},
-                {5.78258181, 6.57920837, 4.99913597, 6.20961237, 4.98552179},
-                {5.84685421, 7.19971228, 6.66386652, 6.68013430, 4.90963316},
-                {5.24417877, 7.06430531, 6.58512402, 6.02492285, 4.48986387},
-                {3.64294529, 5.00678444, 5.04760027, 4.72895622, 2.67990756}},
-
-                {{3.48610687, 4.12853813, 4.07563591, 3.51327014, 2.44217038},
-                {4.80529881, 7.33211374, 5.14774036, 4.77281189, 4.44612408},
-                {5.11703110, 7.55168772, 7.14374542, 6.43696356, 4.10621357},
-                {5.41270018, 6.85949135, 6.73503923, 5.74601364, 4.46150303},
-                {3.16612267, 4.38248920, 5.23248482, 4.21292210, 2.86031270}}}}});
-
-    std::shared_ptr<Node> myConv = Conv<2>(3, 4, {3, 3}, "myconv");
-    auto convOp = std::static_pointer_cast<OperatorTensor>(myConv->getOperator());
-
-    std::shared_ptr<Node> myPad =
+               {0.71426058,
+                0.85032931,
+                0.90750818,
+                0.28768431,
+                0.4401146}}}}});
+
+        std::shared_ptr<Tensor> myOutput = std::make_shared<
+            Tensor>(Array4D<double, 2, 4, 5, 5>{
+            {{{{3.40294218, 3.74021220, 4.02050114, 4.07054710, 2.46286273},
+               {4.61770582, 6.70517588, 6.50356627, 6.29688787, 3.53332567},
+               {5.47480106, 5.92094421, 6.64605665, 7.95090199, 4.28721523},
+               {4.01485729, 6.06748962, 7.52447891, 7.37980652, 5.28401136},
+               {2.83065438, 3.62033439, 3.56222963, 5.56103945, 3.23335814}},
+
+              {{3.30230498, 4.92814112, 4.34710836, 3.96262765, 2.97987890},
+               {4.49693012, 6.68929291, 5.53603029, 5.68874264, 4.28756475},
+               {4.20528078, 6.82776880, 6.70569849, 7.12809610, 4.40845442},
+               {4.31169367, 6.73352146, 6.30962515, 7.45826864, 4.99164438},
+               {2.18136287, 4.28968000, 4.20080042, 4.89814138, 2.87394023}},
+
+              {{3.54787683, 4.35851812, 4.63881302, 4.23359537, 3.16992092},
+               {5.25099468, 7.54282856, 6.69849157, 5.64309788, 4.56919575},
+               {4.71914101, 7.52830601, 6.71450949, 7.81113863, 5.84658146},
+               {4.97893143, 7.39293909, 6.89905310, 8.14430809, 5.62998581},
+               {2.79735112, 4.80967140, 5.57630205, 5.38828325, 4.57078695}},
+
+              {{3.03048635, 5.04540300, 4.21824932, 4.87323284, 2.35113740},
+               {4.45167351, 6.47721338, 7.40922976, 6.70445728, 3.60700107},
+               {3.77927423, 6.82826376, 7.41777134, 7.57402420, 5.13131523},
+               {4.08747244, 7.07994175, 7.57206821, 8.51897335, 5.26987123},
+               {2.34426999, 4.60127831, 4.86486769, 6.01579571, 3.97803569}}},
+
+             {{{3.84700942, 4.25972605, 3.05269003, 3.78043652, 2.08771229},
+               {6.00459957, 6.05633259, 4.45951605, 4.54089880, 4.03066444},
+               {5.41579390, 7.29543972, 6.18680000, 5.58812714, 3.45964241},
+               {6.04531050, 7.70924091, 5.52207708, 5.02131319, 4.09403706},
+               {3.18092418, 4.45422697, 4.04294252, 3.86577177, 2.18776536}},
+
+              {{4.02600670, 4.27603531, 3.81011319, 4.03631020, 2.57254648},
+               {5.33471155, 5.72588634, 5.12079763, 5.11733150, 3.76836705},
+               {5.62947607, 5.92492962, 6.24170446, 6.44130468, 3.44276404},
+               {5.38414621, 6.02679539, 5.88985586, 5.90263271, 3.15044069},
+               {3.31261086, 4.44371319, 3.47660780, 4.15411520, 1.48961508}},
+
+              {{3.95879412, 4.17324543, 3.70114422, 3.27447152, 3.09713888},
+               {5.78258181, 6.57920837, 4.99913597, 6.20961237, 4.98552179},
+               {5.84685421, 7.19971228, 6.66386652, 6.68013430, 4.90963316},
+               {5.24417877, 7.06430531, 6.58512402, 6.02492285, 4.48986387},
+               {3.64294529, 5.00678444, 5.04760027, 4.72895622, 2.67990756}},
+
+              {{3.48610687, 4.12853813, 4.07563591, 3.51327014, 2.44217038},
+               {4.80529881, 7.33211374, 5.14774036, 4.77281189, 4.44612408},
+               {5.11703110, 7.55168772, 7.14374542, 6.43696356, 4.10621357},
+               {5.41270018, 6.85949135, 6.73503923, 5.74601364, 4.46150303},
+               {3.16612267,
+                4.38248920,
+                5.23248482,
+                4.21292210,
+                2.86031270}}}}});
+
+        std::shared_ptr<Node> myConv = Conv<2>(3, 4, {3, 3}, "myconv");
+        auto convOp =
+            std::static_pointer_cast<OperatorTensor>(myConv->getOperator());
+
+        std::shared_ptr<Node> myPad =
             Pad<2>({1, 1, 1, 1}, "myPad", PadBorderType::Constant, 0.0);
-    auto padOp = std::static_pointer_cast<OperatorTensor>(myPad->getOperator());
-
-    convOp->setInput(1, myWeights);
-    convOp->setInput(2, myBias);
-
-    myPad->addChild(myConv, 0, 0);
-    padOp->setInput(0, myInput);
-
-    padOp->setDataType(DataType::Float64);
-    padOp->setBackend("cpu");
-    convOp->setDataType(DataType::Float64);
-    convOp->setBackend("cpu");
-
-    myPad->forward();
-    myConv->forward();
-    convOp -> getOutput(0) -> print();
-
-    double* computedOutput = static_cast<double*>(convOp->getOutput(0)->getImpl()->rawPtr());
-    double* expectedOutput = static_cast<double*>(myOutput->getImpl()->rawPtr());
-    for (std::size_t i = 0; i < myOutput->size(); ++i) {
-        REQUIRE(std::abs(computedOutput[i] - expectedOutput[i]) < 1e-5);
-    }
+        auto padOp =
+            std::static_pointer_cast<OperatorTensor>(myPad->getOperator());
+
+        convOp->setInput(1, myWeights);
+        convOp->setInput(2, myBias);
+
+        myPad->addChild(myConv, 0, 0);
+        padOp->setInput(0, myInput);
+
+        padOp->setDataType(DataType::Float64);
+        padOp->setBackend("cpu");
+        convOp->setDataType(DataType::Float64);
+        convOp->setBackend("cpu");
+
+        myPad->forward();
+        myConv->forward();
+        convOp->getOutput(0)->print();
+
+        double *computedOutput =
+            static_cast<double *>(convOp->getOutput(0)->getImpl()->rawPtr());
+        double *expectedOutput =
+            static_cast<double *>(myOutput->getImpl()->rawPtr());
+        for (std::size_t i = 0; i < myOutput->size(); ++i) {
+            REQUIRE(std::abs(computedOutput[i] - expectedOutput[i]) < 1e-5);
+        }
 
-    std::shared_ptr<Node> myPaddedConv =
+        std::shared_ptr<Node> myPaddedConv =
             PaddedConv(3, 4, {3, 3}, "myPaddedConv", {1, 1}, {1, 1, 1, 1});
-  }
+    }
     SECTION("LSTM(forward)") {
+
         auto pop = Pop();
         auto myLSTM = LSTM(32, 64, 0, true, "ltsm");
-        auto op = std::dynamic_pointer_cast<MetaOperator_Op>(myLSTM->getOperator());
+        auto op =
+            std::dynamic_pointer_cast<MetaOperator_Op>(myLSTM->getOperator());
 
         auto microGraph = op->getMicroGraph();
         microGraph->save("lstm", false, true);
@@ -209,14 +232,14 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
         }
         REQUIRE(myLSTM->nbOutputs() == 2);
 
-        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(
-            Array2D<float, 16, 32>{});
-        std::shared_ptr<Tensor> myInit = std::make_shared<Tensor>(
-            Array2D<float, 32, 64>{});
-        std::shared_ptr<Tensor> myInitW = std::make_shared<Tensor>(
-            Array2D<float, 64, 32>{});
-        std::shared_ptr<Tensor> myInitR = std::make_shared<Tensor>(
-            Array2D<float, 64, 64>{});
+        std::shared_ptr<Tensor> myInput =
+            std::make_shared<Tensor>(Array2D<float, 16, 32>{});
+        std::shared_ptr<Tensor> myInit =
+            std::make_shared<Tensor>(Array2D<float, 32, 64>{});
+        std::shared_ptr<Tensor> myInitW =
+            std::make_shared<Tensor>(Array2D<float, 64, 32>{});
+        std::shared_ptr<Tensor> myInitR =
+            std::make_shared<Tensor>(Array2D<float, 64, 64>{});
 
         pop->addChild(myLSTM, 0, 0);
         pop->getOperator()->associateInput(0, myInput);
@@ -246,7 +269,9 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
         microGraph->save("lstm_dims", true, true);
         REQUIRE(op->dimsForwarded());
 
-        auto microGraphScheduler = std::dynamic_pointer_cast<MetaOperator_Op>(op)->getMicroGraphScheduler();
+        auto microGraphScheduler =
+            std::dynamic_pointer_cast<MetaOperator_Op>(op)
+                ->getMicroGraphScheduler();
         microGraphScheduler->saveSchedulingDiagram("lstm_scheduling");
 
         REQUIRE(op->getNbConsumedData(0).data == 512);
@@ -257,11 +282,14 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
         REQUIRE(microGraphScheduler->getStaticScheduling(1).size() == 24);
         REQUIRE(microGraphScheduler->getStaticScheduling(15).size() == 24);
     }
+
     SECTION("LSTM(forward_values)") {
         auto myLSTM = LSTM(2, 3, 0, true, "ltsm");
-        auto op = std::static_pointer_cast<OperatorTensor>(myLSTM->getOperator());
+        auto op =
+            std::static_pointer_cast<OperatorTensor>(myLSTM->getOperator());
 
-        auto microGraph = std::dynamic_pointer_cast<MetaOperator_Op>(op)->getMicroGraph();
+        auto microGraph =
+            std::dynamic_pointer_cast<MetaOperator_Op>(op)->getMicroGraph();
         microGraph->save("lstm", false, false);
 
         REQUIRE(myLSTM->nbInputs() == 3 + 8 + 8);
@@ -276,12 +304,14 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
 
         std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(
             Array2D<float, 3, 2>{{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}}});
-        std::shared_ptr<Tensor> myInit = std::make_shared<Tensor>(
-            Array2D<float, 3, 3>{{{0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}}});
+        std::shared_ptr<Tensor> myInit =
+            std::make_shared<Tensor>(Array2D<float, 3, 3>{
+                {{0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}}});
         std::shared_ptr<Tensor> myInitW = std::make_shared<Tensor>(
             Array2D<float, 3, 2>{{{0.1, 0.1}, {0.1, 0.1}, {0.1, 0.1}}});
-        std::shared_ptr<Tensor> myInitR = std::make_shared<Tensor>(
-            Array2D<float, 3, 3>{{{0.1, 0.1, 0.1}, {0.1, 0.1, 0.1}, {0.1, 0.1, 0.1}}});
+        std::shared_ptr<Tensor> myInitR =
+            std::make_shared<Tensor>(Array2D<float, 3, 3>{
+                {{0.1, 0.1, 0.1}, {0.1, 0.1, 0.1}, {0.1, 0.1, 0.1}}});
 
         op->associateInput(0, myInput);
         op->associateInput(17, myInit);
@@ -308,12 +338,13 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
         microGraph->save("lstm_values_dims", false, true);
 
         std::shared_ptr<Tensor> myHiddenState = std::make_shared<Tensor>(
-                Array2D<float, 3, 3>{{{0.0952412, 0.0952412, 0.0952412},
-                                     {0.25606447, 0.25606447, 0.25606447},
-                                     {0.40323776, 0.40323776, 0.40323776}}});
+            Array2D<float, 3, 3>{{{0.0952412, 0.0952412, 0.0952412},
+                                  {0.25606447, 0.25606447, 0.25606447},
+                                  {0.40323776, 0.40323776, 0.40323776}}});
 
-
-        auto microGraphScheduler = std::dynamic_pointer_cast<MetaOperator_Op>(op)->getMicroGraphScheduler();
+        auto microGraphScheduler =
+            std::dynamic_pointer_cast<MetaOperator_Op>(op)
+                ->getMicroGraphScheduler();
         microGraphScheduler->saveSchedulingDiagram("lstm_values_scheduling");
 
         op->getOutput(0)->print();
@@ -321,11 +352,13 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
 
         REQUIRE(approxEq<float>(*(op->getOutput(0)), *myHiddenState));
     }
+
     SECTION("LSTM(forward_values_seq)") {
         auto pop = Pop();
         auto myLSTM = LSTM(2, 3, 2, true, "ltsm");
         auto myGraph = Sequential({pop, myLSTM});
-        auto op = std::static_pointer_cast<OperatorTensor>(myLSTM->getOperator());
+        auto op =
+            std::static_pointer_cast<OperatorTensor>(myLSTM->getOperator());
 
         REQUIRE(myLSTM->nbInputs() == 3 + 8 + 8);
         REQUIRE(myLSTM->inputCategory(0) == InputCategory::Data);
@@ -338,13 +371,16 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
         REQUIRE(myLSTM->nbOutputs() == 2);
 
         std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(
-            Array3D<float, 2, 3, 2>{{{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}}, {{2.0, 3.0}, {4.0, 5.0}, {6.0, 7.0}}}});
-        std::shared_ptr<Tensor> myInit = std::make_shared<Tensor>(
-            Array2D<float, 3, 3>{{{0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}}});
+            Array3D<float, 2, 3, 2>{{{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}},
+                                     {{2.0, 3.0}, {4.0, 5.0}, {6.0, 7.0}}}});
+        std::shared_ptr<Tensor> myInit =
+            std::make_shared<Tensor>(Array2D<float, 3, 3>{
+                {{0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}}});
         std::shared_ptr<Tensor> myInitW = std::make_shared<Tensor>(
             Array2D<float, 3, 2>{{{0.1, 0.1}, {0.1, 0.1}, {0.1, 0.1}}});
-        std::shared_ptr<Tensor> myInitR = std::make_shared<Tensor>(
-            Array2D<float, 3, 3>{{{0.1, 0.1, 0.1}, {0.1, 0.1, 0.1}, {0.1, 0.1, 0.1}}});
+        std::shared_ptr<Tensor> myInitR =
+            std::make_shared<Tensor>(Array2D<float, 3, 3>{
+                {{0.1, 0.1, 0.1}, {0.1, 0.1, 0.1}, {0.1, 0.1, 0.1}}});
 
         pop->getOperator()->associateInput(0, myInput);
         op->associateInput(17, myInit);
@@ -371,9 +407,9 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
         scheduler.saveSchedulingDiagram("lstm_seq_schedule");
 
         std::shared_ptr<Tensor> myHiddenState = std::make_shared<Tensor>(
-                Array2D<float, 3, 3>{{{0.24439372, 0.24439372, 0.24439372},
-                                     {0.49801484, 0.49801484, 0.49801484},
-                                     {0.67162132, 0.67162132, 0.67162132}}});
+            Array2D<float, 3, 3>{{{0.24439372, 0.24439372, 0.24439372},
+                                  {0.49801484, 0.49801484, 0.49801484},
+                                  {0.67162132, 0.67162132, 0.67162132}}});
 
         myGraph->save("lstm_seq_mygraph", true, true);
 
@@ -382,10 +418,12 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
 
         REQUIRE(approxEq<float>(*(op->getOutput(0)), *myHiddenState));
     }
+
     SECTION("LSTM(forward_values_seq_flatten)(sequential)") {
         auto pop = Pop();
         auto myLSTM = LSTM(2, 3, 2, true, "ltsm");
-        auto op = std::static_pointer_cast<MetaOperator_Op>(myLSTM->getOperator());
+        auto op =
+            std::static_pointer_cast<MetaOperator_Op>(myLSTM->getOperator());
 
         // Here we test the LSTM as if it had been flattened into the graph.
         // We just borrow its micro-graph into our larger myGraph graph.
@@ -405,13 +443,16 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
         REQUIRE(myLSTM->nbOutputs() == 2);
 
         std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(
-            Array3D<float, 2, 3, 2>{{{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}}, {{2.0, 3.0}, {4.0, 5.0}, {6.0, 7.0}}}});
-        std::shared_ptr<Tensor> myInit = std::make_shared<Tensor>(
-            Array2D<float, 3, 3>{{{0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}}});
+            Array3D<float, 2, 3, 2>{{{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}},
+                                     {{2.0, 3.0}, {4.0, 5.0}, {6.0, 7.0}}}});
+        std::shared_ptr<Tensor> myInit =
+            std::make_shared<Tensor>(Array2D<float, 3, 3>{
+                {{0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}}});
         std::shared_ptr<Tensor> myInitW = std::make_shared<Tensor>(
             Array2D<float, 3, 2>{{{0.1, 0.1}, {0.1, 0.1}, {0.1, 0.1}}});
-        std::shared_ptr<Tensor> myInitR = std::make_shared<Tensor>(
-            Array2D<float, 3, 3>{{{0.1, 0.1, 0.1}, {0.1, 0.1, 0.1}, {0.1, 0.1, 0.1}}});
+        std::shared_ptr<Tensor> myInitR =
+            std::make_shared<Tensor>(Array2D<float, 3, 3>{
+                {{0.1, 0.1, 0.1}, {0.1, 0.1, 0.1}, {0.1, 0.1, 0.1}}});
 
         pop->getOperator()->associateInput(0, myInput);
         op->associateInput(17, myInit);
@@ -419,16 +460,32 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
 
         // Weights X
         auto prodX = Producer(myInitW);
-        prodX->addChild(op->getMicroGraph()->getOrderedInputs()[1].first, 0, 1);
-        prodX->addChild(op->getMicroGraph()->getOrderedInputs()[2].first, 0, 1);
-        prodX->addChild(op->getMicroGraph()->getOrderedInputs()[3].first, 0, 1);
-        prodX->addChild(op->getMicroGraph()->getOrderedInputs()[4].first, 0, 1);
+        prodX->addChild(op->getMicroGraph()->getOrderedInputs()[1].first,
+                        0,
+                        1);
+        prodX->addChild(op->getMicroGraph()->getOrderedInputs()[2].first,
+                        0,
+                        1);
+        prodX->addChild(op->getMicroGraph()->getOrderedInputs()[3].first,
+                        0,
+                        1);
+        prodX->addChild(op->getMicroGraph()->getOrderedInputs()[4].first,
+                        0,
+                        1);
         // Weights H
         auto prodH = Producer(myInitR);
-        prodH->addChild(op->getMicroGraph()->getOrderedInputs()[5].first, 0, 1);
-        prodH->addChild(op->getMicroGraph()->getOrderedInputs()[6].first, 0, 1);
-        prodH->addChild(op->getMicroGraph()->getOrderedInputs()[7].first, 0, 1);
-        prodH->addChild(op->getMicroGraph()->getOrderedInputs()[8].first, 0, 1);
+        prodH->addChild(op->getMicroGraph()->getOrderedInputs()[5].first,
+                        0,
+                        1);
+        prodH->addChild(op->getMicroGraph()->getOrderedInputs()[6].first,
+                        0,
+                        1);
+        prodH->addChild(op->getMicroGraph()->getOrderedInputs()[7].first,
+                        0,
+                        1);
+        prodH->addChild(op->getMicroGraph()->getOrderedInputs()[8].first,
+                        0,
+                        1);
         myGraph->add({prodX, prodH});
 
         myGraph->setDataType(DataType::Float32);
@@ -436,9 +493,9 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
         myGraph->save("lstm_seq_flatten", true, true);
 
         std::shared_ptr<Tensor> myHiddenState = std::make_shared<Tensor>(
-                Array2D<float, 3, 3>{{{0.24439372, 0.24439372, 0.24439372},
-                                     {0.49801484, 0.49801484, 0.49801484},
-                                     {0.67162132, 0.67162132, 0.67162132}}});
+            Array2D<float, 3, 3>{{{0.24439372, 0.24439372, 0.24439372},
+                                  {0.49801484, 0.49801484, 0.49801484},
+                                  {0.67162132, 0.67162132, 0.67162132}}});
 
         auto scheduler = SequentialScheduler(myGraph);
         scheduler.generateScheduling();
@@ -454,7 +511,8 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
     SECTION("LSTM(forward_values_seq_flatten)(parallel)") {
         auto pop = Pop();
         auto myLSTM = LSTM(2, 3, 2, true, "ltsm");
-        auto op = std::static_pointer_cast<MetaOperator_Op>(myLSTM->getOperator());
+        auto op =
+            std::static_pointer_cast<MetaOperator_Op>(myLSTM->getOperator());
 
         // Here we test the LSTM as if it had been flattened into the graph.
         // We just borrow its micro-graph into our larger myGraph graph.
@@ -474,13 +532,16 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
         REQUIRE(myLSTM->nbOutputs() == 2);
 
         std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(
-            Array3D<float, 2, 3, 2>{{{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}}, {{2.0, 3.0}, {4.0, 5.0}, {6.0, 7.0}}}});
-        std::shared_ptr<Tensor> myInit = std::make_shared<Tensor>(
-            Array2D<float, 3, 3>{{{0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}}});
+            Array3D<float, 2, 3, 2>{{{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}},
+                                     {{2.0, 3.0}, {4.0, 5.0}, {6.0, 7.0}}}});
+        std::shared_ptr<Tensor> myInit =
+            std::make_shared<Tensor>(Array2D<float, 3, 3>{
+                {{0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}}});
         std::shared_ptr<Tensor> myInitW = std::make_shared<Tensor>(
             Array2D<float, 3, 2>{{{0.1, 0.1}, {0.1, 0.1}, {0.1, 0.1}}});
-        std::shared_ptr<Tensor> myInitR = std::make_shared<Tensor>(
-            Array2D<float, 3, 3>{{{0.1, 0.1, 0.1}, {0.1, 0.1, 0.1}, {0.1, 0.1, 0.1}}});
+        std::shared_ptr<Tensor> myInitR =
+            std::make_shared<Tensor>(Array2D<float, 3, 3>{
+                {{0.1, 0.1, 0.1}, {0.1, 0.1, 0.1}, {0.1, 0.1, 0.1}}});
 
         pop->getOperator()->associateInput(0, myInput);
         op->associateInput(17, myInit);
@@ -488,16 +549,32 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
 
         // Weights X
         auto prodX = Producer(myInitW);
-        prodX->addChild(op->getMicroGraph()->getOrderedInputs()[1].first, 0, 1);
-        prodX->addChild(op->getMicroGraph()->getOrderedInputs()[2].first, 0, 1);
-        prodX->addChild(op->getMicroGraph()->getOrderedInputs()[3].first, 0, 1);
-        prodX->addChild(op->getMicroGraph()->getOrderedInputs()[4].first, 0, 1);
+        prodX->addChild(op->getMicroGraph()->getOrderedInputs()[1].first,
+                        0,
+                        1);
+        prodX->addChild(op->getMicroGraph()->getOrderedInputs()[2].first,
+                        0,
+                        1);
+        prodX->addChild(op->getMicroGraph()->getOrderedInputs()[3].first,
+                        0,
+                        1);
+        prodX->addChild(op->getMicroGraph()->getOrderedInputs()[4].first,
+                        0,
+                        1);
         // Weights H
         auto prodH = Producer(myInitR);
-        prodH->addChild(op->getMicroGraph()->getOrderedInputs()[5].first, 0, 1);
-        prodH->addChild(op->getMicroGraph()->getOrderedInputs()[6].first, 0, 1);
-        prodH->addChild(op->getMicroGraph()->getOrderedInputs()[7].first, 0, 1);
-        prodH->addChild(op->getMicroGraph()->getOrderedInputs()[8].first, 0, 1);
+        prodH->addChild(op->getMicroGraph()->getOrderedInputs()[5].first,
+                        0,
+                        1);
+        prodH->addChild(op->getMicroGraph()->getOrderedInputs()[6].first,
+                        0,
+                        1);
+        prodH->addChild(op->getMicroGraph()->getOrderedInputs()[7].first,
+                        0,
+                        1);
+        prodH->addChild(op->getMicroGraph()->getOrderedInputs()[8].first,
+                        0,
+                        1);
         myGraph->add({prodX, prodH});
 
         myGraph->setDataType(DataType::Float32);
@@ -505,9 +582,9 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
         myGraph->save("lstm_seq_flatten", true, true);
 
         std::shared_ptr<Tensor> myHiddenState = std::make_shared<Tensor>(
-                Array2D<float, 3, 3>{{{0.24439372, 0.24439372, 0.24439372},
-                                     {0.49801484, 0.49801484, 0.49801484},
-                                     {0.67162132, 0.67162132, 0.67162132}}});
+            Array2D<float, 3, 3>{{{0.24439372, 0.24439372, 0.24439372},
+                                  {0.49801484, 0.49801484, 0.49801484},
+                                  {0.67162132, 0.67162132, 0.67162132}}});
 
         auto scheduler = ParallelScheduler(myGraph);
         scheduler.generateScheduling();
@@ -519,4 +596,308 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
 
         REQUIRE(approxEq<float>(*(op->getOutput(0)), *myHiddenState));
     }
-}
\ No newline at end of file
+
+    SECTION("Leaky(forward)(fixed)") {
+
+        constexpr auto inChannels = 10;
+        constexpr auto outChannels = 5;
+
+        constexpr auto beta = 0.95;
+        constexpr auto threshold = 1.0;
+        constexpr auto nbTimeSteps = 2;
+
+        auto myWeights =
+            std::make_shared<Tensor>(Array2D<float, outChannels, inChannels>{{
+                {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0},
+                {1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1},
+                {0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 0.1, 0.2, 0.3, 0.4},
+                {0.4, 0.3, 0.2, 0.1, 0.0, 0.1, 0.2, 0.3, 0.4, 0.5},
+                {0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0.0},
+            }});
+
+        auto myWeights2 =
+            std::make_shared<Tensor>(Array2D<float, inChannels, outChannels>{{
+                {0.1, 0.2, 0.3, 0.4, 0.5},
+                {0.6, 0.7, 0.8, 0.9, 1.0},
+                {1.0, 0.9, 0.8, 0.7, 0.6},
+                {0.5, 0.4, 0.3, 0.2, 0.1},
+                {0.5, 0.6, 0.7, 0.8, 0.9},
+                {1.0, 0.1, 0.2, 0.3, 0.4},
+                {0.4, 0.3, 0.2, 0.1, 0.0},
+                {0.1, 0.2, 0.3, 0.4, 0.5},
+                {0.9, 0.8, 0.7, 0.6, 0.5},
+                {0.4, 0.3, 0.2, 0.1, 0.0},
+            }});
+
+        auto myInput = std::make_shared<Tensor>(Array2D<float, 2, 10>{{
+            {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0},
+            {1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1},
+        }});
+
+        // Reference computed with snnTorch (PyTorch): output of fc1 at time step 1
+        auto expectedOutputlif1ts1 =
+            std::make_shared<Tensor>(Array2D<float, 2, 5>{{
+                {3.850, 2.2000, 2.6500, 1.5000, 1.6500},
+                {2.200, 3.8500, 3.4000, 1.2500, 3.3000},
+            }});
+
+        auto expectedOutputfc2ts1 =
+            std::make_shared<Tensor>(Array2D<float, 2, 10>{{
+                {1.5000,
+                 4.0000,
+                 4.0000,
+                 1.5000,
+                 3.5000,
+                 2.0000,
+                 1.0000,
+                 1.5000,
+                 3.5000,
+                 1.0000},
+                {1.5000,
+                 4.0000,
+                 4.0000,
+                 1.5000,
+                 3.5000,
+                 2.0000,
+                 1.0000,
+                 1.5000,
+                 3.5000,
+                 1.0000},
+            }});
+
+        auto expectedOutputlif1ts2 =
+            std::make_shared<Tensor>(Array2D<float, 2, 5>{{
+                {6.5075, 3.2900, 4.1675, 1.9250, 2.2175},
+                {3.2900, 6.5075, 5.6300, 1.4375, 5.4350},
+            }});
+
+        // NOTE: Same output as before, because for every channel the membrane
+        // potential stays above the threshold, so the LIF neuron fires at
+        // every timestep for every channel.
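+        // (Update rule, matching the reference recurrence in the
+        // "Leaky(forward)" section below:
+        //     U[t] = beta * U[t-1] + I[t] - (U[t-1] > threshold ? 1 : 0);
+        // with U above threshold at both steps, the spike output is all ones,
+        // so fc2 sees the same input at each timestep.)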
+        auto expectedOutputfc2ts2 =
+            std::make_shared<Tensor>(Array2D<float, 2, 10>{{
+                {1.5000,
+                 4.0000,
+                 4.0000,
+                 1.5000,
+                 3.5000,
+                 2.0000,
+                 1.0000,
+                 1.5000,
+                 3.5000,
+                 1.0000},
+                {1.5000,
+                 4.0000,
+                 4.0000,
+                 1.5000,
+                 3.5000,
+                 2.0000,
+                 1.0000,
+                 1.5000,
+                 3.5000,
+                 1.0000},
+            }});
+
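+        // Zero-fill the initial state tensor, reused below for both state
+        // inputs of the Leaky operator.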
+        auto init = std::make_shared<Tensor>(Array2D<float, 2, 5>{});
+        uniformFiller<float>(init, 0.0, 0.0);
+
+        auto fc1 = FC(inChannels, outChannels, true, "myfc");
+        auto fc2 = FC(outChannels, inChannels, true, "fc2");
+        // NOTE: Account for init step by adding 1 to the max timestep
+        // parameter.
+        auto lif1 = Leaky(nbTimeSteps + 1, beta, threshold, "leaky");
+
+        // associateInput() does not work here, so set the weights through the
+        // Producer nodes' setOutput() instead.
+        fc1->input(1).first->getOperator()->setOutput(0, myWeights);
+        fc2->input(1).first->getOperator()->setOutput(0, myWeights2);
+
+        auto fc1Op =
+            std::static_pointer_cast<OperatorTensor>(fc1->getOperator());
+        auto lif1Op =
+            std::static_pointer_cast<MetaOperator_Op>(lif1->getOperator());
+        auto fc2Op =
+            std::static_pointer_cast<OperatorTensor>(fc2->getOperator());
+
+        fc1Op->associateInput(0, myInput);
+        lif1Op->associateInput(1, init);
+        lif1Op->associateInput(2, init);
+
+        fc1->addChild(lif1, 0, 0);
+        lif1->addChild(fc2, 1, 0);
+
+        auto g = std::make_shared<GraphView>();
+        g->add({fc1, lif1, fc2});
+        g->compile("cpu", DataType::Float32);
+        auto scheduler = SequentialScheduler(g);
+
+        // Forward 1 (simulate timestep 0)
+        scheduler.forward(true);
+        REQUIRE(approxEq<float>(*(lif1Op->getOutput(0)),
+                                *(expectedOutputlif1ts1)));
+        REQUIRE(
+            approxEq<float>(*(fc2Op->getOutput(0)), *(expectedOutputfc2ts1)));
+
+        // Forward 2 (simulate timestep 1)
+        scheduler.forward(true);
+        REQUIRE(approxEq<float>(*(lif1Op->getOutput(0)),
+                                *(expectedOutputlif1ts2)));
+        REQUIRE(
+            approxEq<float>(*(fc2Op->getOutput(0)), *(expectedOutputfc2ts2)));
+    }
+
+    SECTION("Leaky(forward)") {
+
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_real_distribution<float> valueDist(
+            0.1f,
+            1.1f); // random floats in [0.1, 1.1)
+        std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2),
+                                                               std::size_t(4));
+        std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(3),
+                                                              std::size_t(3));
+        std::uniform_int_distribution<int> boolDist(0, 1);
+        std::uniform_real_distribution<float> betaDist(0.0f, 1.0f);
+
+        const std::size_t nbDims = nbDimsDist(gen);
+        Log::info("Nbdims : {}", nbDims);
+        std::vector<std::size_t> dims;
+        for (std::size_t i = 0; i < nbDims; ++i) {
+            dims.push_back(dimSizeDist(gen));
+        }
+        Log::info("timesteps : {}", dims[0]);
+        Log::info("dimensions : ");
+        for (auto dim : dims) {
+            Log::info("{}", dim);
+        }
+
+        const auto nbTimeSteps = dims[0];
+        const auto beta = betaDist(gen);
+
+        auto myLeaky = Leaky(nbTimeSteps, beta, 1.0, "leaky");
+        auto op =
+            std::static_pointer_cast<MetaOperator_Op>(myLeaky->getOperator());
+        // auto stack = Stack(2);
+        auto mem_rec = Stack(nbTimeSteps, "mem_rec");
+        auto spk_rec = Stack(nbTimeSteps, "spk_rec");
+        auto pop = Pop("popinput");
+
+        // Here we test the Leaky operator as if it had been flattened into
+        // the graph. We just borrow its micro-graph into our larger myGraph
+        // graph.
+        auto myGraph = std::make_shared<GraphView>();
+
+        pop->addChild(op->getMicroGraph()->getOrderedInputs()[0].first, 0, 0);
+        // Micro-graph output 0 carries the membrane potential, output 1 the
+        // spikes.
+        op->getMicroGraph()->getOrderedOutputs()[1].first->addChild(mem_rec,
+                                                                    0,
+                                                                    0);
+        op->getMicroGraph()->getOrderedOutputs()[0].first->addChild(spk_rec,
+                                                                    0,
+                                                                    0);
+        for (auto node : op->getMicroGraph()->getOrderedOutputs()) {
+            Log::info("name  of output {}", node.first->name());
+        }
+
+        myGraph->add(pop);
+        myGraph->add(op->getMicroGraph());
+        myGraph->add(mem_rec);
+        myGraph->add(spk_rec);
+        myGraph->save("mg", true, true);
+
+        // 3 inputs: the data input plus the two initial states
+        REQUIRE(myLeaky->nbInputs() == 3);
+        REQUIRE(myLeaky->inputCategory(0) == InputCategory::Data);
+        // Two spike outputs connected to nothing, plus the Add node's real
+        // output
+        REQUIRE(myLeaky->nbOutputs() == 4);
+
+        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(
+            Array3D<float, 2, 3, 2>{{{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}},
+                                     {{2.0, 3.0}, {4.0, 5.0}, {6.0, 7.0}}}});
+
+        // std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(
+        //     Array3D<float, 2, 3, 2>{{{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}},
+        //                              {{2.0, 3.0}, {4.0, 5.0},
+        //                              {6.0, 7.0}}}});
+
+        // Generate input
+        std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>();
+        T0->setDataType(DataType::Float32);
+        T0->setBackend("cpu");
+
+        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>();
+        expectedOutput->setDataType(DataType::Float32);
+        expectedOutput->setBackend("cpu");
+
+        const auto nb_elements =
+            std::accumulate(dims.cbegin(),
+                            dims.cend(),
+                            std::size_t(1),
+                            std::multiplies<std::size_t>());
+        float *input = new float[nb_elements];
+        float *result = new float[nb_elements];
+
+        for (std::size_t i = 0; i < nb_elements; ++i) {
+            input[i] = valueDist(gen);
+        }
+        T0->resize(dims);
+        T0->getImpl()->setRawPtr(input, nb_elements);
+        T0->print();
+
+        // Elements popped at each time step
+        auto nbElementsPerTimeStep = nb_elements / dims[0];
+
+        // Init
+        for (int i = 0; i < nbElementsPerTimeStep; ++i) {
+            result[i] = input[i];
+        }
+
+        // Recurrence:
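+        // result[t] = beta * result[t-1] + input[t] - (result[t-1] > 1.0),
+        // i.e. leaky integration with a soft reset of 1.0 whenever the
+        // previous membrane potential crossed the threshold.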
+        for (int i = 1; i < dims[0]; ++i) {
+            auto offset = nbElementsPerTimeStep * i;
+            auto prev = nbElementsPerTimeStep * (i - 1);
+            for (int j = 0; j < nbElementsPerTimeStep; ++j) {
+                auto reset = (result[prev + j] > 1.0 ? 1 : 0);
+                result[offset + j] =
+                    result[prev + j] * beta + input[offset + j] - reset;
+            }
+        }
+
+        expectedOutput->resize(dims);
+        expectedOutput->getImpl()->setRawPtr(result, nb_elements);
+        Log::info("Expected output : ");
+        expectedOutput->print();
+
+        std::shared_ptr<Tensor> myInit =
+            std::make_shared<Tensor>(Array2D<float, 3, 3>{
+                {{0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}}});
+
+        auto initMemdims =
+            std::vector<std::size_t>(dims.begin() + 1, dims.end());
+        Log::info("dimensions : ");
+        for (auto dim : initMemdims) {
+            Log::info("{}", dim);
+        }
+        std::shared_ptr<Tensor> myInitW = std::make_shared<Tensor>(
+            Array2D<float, 3, 2>{{{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}});
+
+        std::shared_ptr<Tensor> myInitR =
+            std::make_shared<Tensor>(initMemdims);
+        myInitR->setDataType(DataType::Float32);
+        myInitR->setBackend("cpu");
+        uniformFiller<float>(myInitR, 0, 0);
+
+        pop->getOperator()->associateInput(0, T0);
+        op->associateInput(1, myInitR);
+        op->associateInput(2, myInitR);
+
+        myGraph->compile("cpu", DataType::Float32);
+
+        auto scheduler = SequentialScheduler(myGraph);
+        REQUIRE_NOTHROW(scheduler.generateScheduling());
+        REQUIRE_NOTHROW(scheduler.forward(true));
+
+        auto memOp =
+            std::static_pointer_cast<OperatorTensor>(spk_rec->getOperator());
+        REQUIRE(approxEq<float>(*(memOp->getOutput(0)), *(expectedOutput)));
+    }
+}
-- 
GitLab


From ff3a3ed7ad4fc32ba3f0d80b35aeaaf5c1420a61 Mon Sep 17 00:00:00 2001
From: Jerome Hue <jerome.hue@cea.fr>
Date: Thu, 6 Feb 2025 11:59:50 +0100
Subject: [PATCH 002/108] Implement backward function for Div operator
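
For z = a / b, the backward kernel accumulates

    dL/da = dL/dz * (1 / b)
    dL/db = dL/dz * (-a / b^2)

reducing each gradient over the broadcast dimensions of its input.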

---
 .../aidge/backend/cpu/operator/DivImpl.hpp    |  13 +-
 .../backend/cpu/operator/DivImpl_kernels.hpp  |  61 +++-
 src/operator/DivImpl.cpp                      |  23 +-
 unit_tests/operator/Test_DivImpl.cpp          | 271 ++++++++++++++++++
 4 files changed, 363 insertions(+), 5 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/DivImpl.hpp b/include/aidge/backend/cpu/operator/DivImpl.hpp
index 40c1b678..a507690b 100644
--- a/include/aidge/backend/cpu/operator/DivImpl.hpp
+++ b/include/aidge/backend/cpu/operator/DivImpl.hpp
@@ -24,7 +24,18 @@
 namespace Aidge {
 // Operator implementation entry point for the backend
 using DivImpl_cpu = OperatorImpl_cpu<Div_Op,
-    void(const std::size_t, const std::size_t, const std::size_t, const void*, const void*,void*)>;
+    void(const std::size_t, const std::size_t, const std::size_t, const void*, const void*,void*),
+    void(const std::size_t,
+        const std::size_t,
+        const std::size_t,
+        const std::vector<std::size_t>,
+        const std::vector<std::size_t>,
+        const std::vector<std::size_t>,
+        const void*,
+        const void*,
+        const void*,
+        void*,
+        void*)>;
 
 // Implementation entry point registration to Operator
 REGISTRAR(Div_Op, "cpu", Aidge::DivImpl_cpu::create);
diff --git a/include/aidge/backend/cpu/operator/DivImpl_kernels.hpp b/include/aidge/backend/cpu/operator/DivImpl_kernels.hpp
index ed6e55a7..5d3ee7f6 100644
--- a/include/aidge/backend/cpu/operator/DivImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/DivImpl_kernels.hpp
@@ -17,6 +17,7 @@
 #include <cstdint>     // std::int32_t, std::int64_t
 #include <functional>  // std::multiplies
 
+#include "aidge/backend/cpu/operator/MulImpl_kernels.hpp"
 #include "aidge/utils/Registrar.hpp"
 
 #include "aidge/backend/cpu/data/Broadcasting.hpp"
@@ -69,16 +70,70 @@ constexpr void DivImpl_cpu_forward_kernel(const std::size_t input1size_,
     }
 }
 
+
+template <class I1, class I2, class O>
+void DivImpl_cpu_backward_kernel(const std::size_t input0Length,
+                               const std::size_t input1Length,
+                               const std::size_t gradOutputLength,
+                               const std::vector<std::size_t>& dims0,
+                               const std::vector<std::size_t>& dims1,
+                               const std::vector<std::size_t>& outputDims,
+                               const void* input0_,
+                               const void* input1_,
+                               const void* grad_output_,
+                               void* gradientInput0_,
+                               void* gradientInput1_)
+{
+    const I1* input0 = static_cast<const I1*>(input0_);  // a
+    const I2* input1 = static_cast<const I2*>(input1_);  // b
+    const O* grad_output = static_cast<const O*>(grad_output_);
+    auto* grad_input_0 = static_cast<I1*>(gradientInput0_);  // gradient w.r.t. a
+    auto* grad_input_1 = static_cast<I2*>(gradientInput1_);  // gradient w.r.t. b
+
+    std::fill_n(grad_input_0, input0Length, static_cast<I1>(0));
+    std::fill_n(grad_input_1, input1Length, static_cast<I2>(0));
+
+    // Broadcast dims0 and dims1 to match the shape of outputDims
+    auto broadcastedDims0 = getBroadcastedDims(outputDims, dims0);
+    auto broadcastedDims1 = getBroadcastedDims(outputDims, dims1);
+
+    for (std::size_t i = 0; i < gradOutputLength; ++i) {
+        auto idxOutputGrad = getMultiDimIndices(outputDims, i);
+        std::vector<std::size_t> idxInput0(broadcastedDims0.size());
+        std::vector<std::size_t> idxInput1(broadcastedDims1.size());
+
+        // Map output indices to input indices, considering broadcasting
+        for (std::size_t dimension = 0; dimension < broadcastedDims0.size(); ++dimension) {
+            idxInput0[dimension] = (broadcastedDims0[dimension] == 1) ? 0 : idxOutputGrad[dimension];
+        }
+
+        for (std::size_t dimension = 0; dimension < broadcastedDims1.size(); ++dimension) {
+            idxInput1[dimension] = (broadcastedDims1[dimension] == 1) ? 0 : idxOutputGrad[dimension];
+        }
+
+        auto idx0 = getFlattenedIndex(broadcastedDims0, idxInput0);
+        auto idx1 = getFlattenedIndex(broadcastedDims1, idxInput1);
+
+        // grad_a = grad_output * (1/b)
+        grad_input_0[idx0] += static_cast<I1>(grad_output[i] / input1[idx1]);
+        
+        // grad_b = grad_output * (-a/b²)
+        grad_input_1[idx1] += static_cast<I2>(grad_output[i] * (-input0[idx0] / (input1[idx1] * input1[idx1])));
+    }
+}
+
+
 // Kernels registration to implementation entry point
 REGISTRAR(DivImpl_cpu,
     {DataType::Float32},
-    {ProdConso::inPlaceModel, Aidge::DivImpl_cpu_forward_kernel<float, float, float>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::DivImpl_cpu_forward_kernel<float, float, float>, Aidge::DivImpl_cpu_backward_kernel<float, float, float>});
 REGISTRAR(DivImpl_cpu,
     {DataType::Float64},
-    {ProdConso::inPlaceModel, Aidge::DivImpl_cpu_forward_kernel<double, double, double>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::DivImpl_cpu_forward_kernel<double, double, double>, Aidge::DivImpl_cpu_backward_kernel<double, double, double>});
 REGISTRAR(DivImpl_cpu,
     {DataType::Int32},
-    {ProdConso::inPlaceModel, Aidge::DivImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::DivImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>, 
+          Aidge::DivImpl_cpu_backward_kernel<std::int32_t, std::int32_t, std::int32_t>});
 }  // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_DIVIMPL_KERNELS_H_ */
diff --git a/src/operator/DivImpl.cpp b/src/operator/DivImpl.cpp
index 135b32b5..67444cb8 100644
--- a/src/operator/DivImpl.cpp
+++ b/src/operator/DivImpl.cpp
@@ -152,5 +152,26 @@ void Aidge::DivImpl_cpu::forward() {
 
 template <>
 void Aidge::DivImpl_cpu::backward() {
-    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Div_Op on backend cpu");
+    const Div_Op& op_ = dynamic_cast<const Div_Op&>(mOp);
+
+    auto in0 = op_.getInput(0);
+    auto in1 = op_.getInput(1);
+    auto in0grad = op_.getInput(0)->grad();
+    auto in1grad = op_.getInput(1)->grad();
+    auto out0grad = op_.getOutput(0)->grad();
+
+    const auto impl = Registrar<DivImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    impl.backward(in0grad->size(),
+               in1grad->size(),
+               out0grad->size(),
+               in0->dims(),
+               in1->dims(),
+               out0grad->dims(),
+               getCPUPtr(in0),
+               getCPUPtr(in1),
+               getCPUPtr(out0grad),
+               getCPUPtr(in0grad),
+               getCPUPtr(in1grad));
 }
+
diff --git a/unit_tests/operator/Test_DivImpl.cpp b/unit_tests/operator/Test_DivImpl.cpp
index 4037b2ad..4e7657ed 100644
--- a/unit_tests/operator/Test_DivImpl.cpp
+++ b/unit_tests/operator/Test_DivImpl.cpp
@@ -322,4 +322,275 @@ TEST_CASE("[cpu/operator] Div", "[Div][CPU]") {
         }
     }
 }
+
+TEST_CASE("[CPU/Operator] Div(Backward)", "[Div][CPU][Backward]") {
+    std::shared_ptr<Div_Op> op = std::make_shared<Div_Op>();
+    op->setDataType(DataType::Float32);
+    op->setBackend("cpu");
+
+    // NOTE: The first four tests use fixed values, the last one uses random values but static dimensions.
+
+    SECTION("Case 1: 1D and 2D Tensors") {
+        const auto T0 = std::make_shared<Tensor>(
+            Array2D<cpptype_t<DataType::Float32>, 2, 3>({{{1, 2, 3}, {4, 5, 6}}}));
+
+        const auto T1 =
+            std::make_shared<Tensor>(Array1D<cpptype_t<DataType::Float32>, 3>({0.1, 0.2, 0.3}));
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>(
+            Array2D<float, 2, 3>({{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}})));
+        op->forwardDims();
+
+        op->backward();
+
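+        // With a unit output gradient: grad0[i][j] = 1 / b[j], and
+        // grad1[j] = -sum_i a[i][j] / b[j]^2, reduced over the broadcast axis.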
+        const Tensor expectedGrad0 =
+            Array2D<cpptype_t<DataType::Float32>, 2, 3>({{{10, 5, 3.3333}, {10, 5, 3.3333}}});
+
+        const Tensor expectedGrad1 = Array1D<cpptype_t<DataType::Float32>, 3>({-500, -175, -100});
+
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1));
+    }
+
+    SECTION("Case 2: 3D and 1D tensors") {
+        const auto T0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+            {{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}},
+              {{7.0, 8.0, 9.0}, {10.0, 11.0, 12.0}}}}));
+
+        const auto T1 =
+            std::make_shared<Tensor>(Array1D<float, 3>({0.3, 0.2, 0.1}));
+
+        const auto newGrad = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+            {{{{1, 1, 1}, {1, 1, 1}}, {{1, 1, 1}, {1, 1, 1}}}}));
+
+        const Tensor expectedGrad0 =
+            Array3D<float, 2, 2, 3>({{{{3.3333, 5.0, 10}, {3.3333, 5.0, 10}},
+                                      {{3.3333, 5.0, 10}, {3.3333, 5.0, 10}}}});
+
+        const Tensor expectedGrad1 = Array1D<cpptype_t<DataType::Float32>, 3>({-244.4444, -650.0, -3000.0});
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(newGrad);
+        op->forwardDims();
+
+        op->backward();
+
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1));
+    }
+
+    SECTION("Case 3: 4D and 2D tensors") {
+        const auto T0 = std::make_shared<Tensor>(Array4D<cpptype_t<DataType::Float32>, 2, 2, 3, 3>(
+            {{{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}, {7.0, 8.0, 9.0}},
+               {{10.0, 11.0, 12.0}, {13.0, 14.0, 15.0}, {16.0, 17.0, 18.0}}},
+              {{{19.0, 20.0, 21.0}, {22.0, 23.0, 24.0}, {25.0, 26.0, 27.0}},
+               {{28.0, 29.0, 30.0},
+                {31.0, 32.0, 33.0},
+                {34.0, 35.0, 36.0}}}}}));
+
+        const auto T1 = std::make_shared<Tensor>(Array2D<cpptype_t<DataType::Float32>, 3, 3>(
+            {{{0.5, 0.3, 0.1}, {0.4, 0.2, 0.6}, {0.7, 0.8, 0.9}}}));
+
+        const auto newGrad =
+            std::make_shared<Tensor>(Array4D<cpptype_t<DataType::Float32>, 2, 2, 3, 3>(
+                {{{{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}},
+                   {{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}},
+                  {{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}},
+                   {{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}}}}));
+
+        const Tensor expectedGrad0 =
+            Array4D<cpptype_t<DataType::Float32>, 2, 2, 3, 3>(
+                {{{{{2, 3.3333, 10}, {2.5, 5.0, 1.66667}, {1.42857, 1.2500, 1.11111}},
+                   {{2, 3.3333, 10}, {2.5, 5.0, 1.66667}, {1.42857, 1.2500, 1.11111}}},
+                  {{{2, 3.3333, 10}, {2.5, 5.0, 1.66667}, {1.42857, 1.2500, 1.11111}},
+                   {{2, 3.3333, 10}, {2.5, 5.0, 1.66667}, {1.42857, 1.2500, 1.11111}}}}});
+
+        const Tensor expectedGrad1 =
+            Array2D<cpptype_t<DataType::Float32>, 3, 3>({{{-232.0, -688.888, -6600.0},
+                                   {-437.5, -1850.0, -216.66667},
+                                   {-167.3469, -134.3750, -111.111}}});
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(newGrad);
+        op->forwardDims();
+
+        op->backward();
+
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1));
+    }
+
+    SECTION("Case 4: 3D and 2D tensors") {
+        const auto T0 = std::make_shared<Tensor>(
+            Array3D<float, 2, 3, 4>({{{
+                                          {1.0, 2.0, 3.0, 4.0},
+                                          {5.0, 6.0, 7.0, 8.0},
+                                          {9.0, 10.0, 11.0, 12.0},
+                                      },
+                                      {
+                                          {13.0, 14.0, 15.0, 16.0},
+                                          {17.0, 18.0, 19.0, 20.0},
+                                          {21.0, 22.0, 23.0, 24.0},
+                                      }}}));
+
+        const auto T1 = std::make_shared<Tensor>(
+            Array2D<cpptype_t<DataType::Float32>, 3, 4>({{{0.1, 0.2, 0.3, 0.4},
+                                   {0.5, 0.6, 0.7, 0.8},
+                                   {0.9, 1.0, 1.1, 1.2}}}));
+
+        const auto newGrad = std::make_shared<Tensor>(
+            Array3D<cpptype_t<DataType::Float32>, 2, 3, 4>({{{
+                                          {1.0, 1.0, 1.0, 1.0},
+                                          {1.0, 1.0, 1.0, 1.0},
+                                          {1.0, 1.0, 1.0, 1.0},
+                                      },
+                                      {
+                                          {1.0, 1.0, 1.0, 1.0},
+                                          {1.0, 1.0, 1.0, 1.0},
+                                          {1.0, 1.0, 1.0, 1.0},
+                                      }}}));
+
+        const Tensor expectedGrad0 =
+            Array3D<cpptype_t<DataType::Float32>, 2, 3, 4>({{{
+                                       {10, 5, 3.33333, 2.5},
+                                       {2, 1.66667, 1.42857, 1.2500},
+                                       {1.11111, 1.0, 0.90909, 0.83333}},
+                                      {{10, 5, 3.33333, 2.5},
+                                       {2, 1.66667, 1.42857, 1.2500},
+                                       {1.11111, 1.0, 0.90909, 0.83333}}}});
+
+        const Tensor expectedGrad1 =
+            Array2D<cpptype_t<DataType::Float32>, 3, 4>({{
+                                   {-1400.0, -400.0, -200.0, -125.0},
+                                   {-88.0, -66.66667, -53.0612, -43.750},
+                                   {-37.0370, -32.0, -28.0992, -25.00}}});
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(newGrad);
+        op->forwardDims();
+
+        op->backward();
+
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1));
+    }
+
+    SECTION("Case 5: Tensors with random values") {
+
+        // Use random values
+        const std::vector<std::size_t> dims0 = {5, 2, 1, 7}; // First tensor
+        const std::vector<std::size_t> dims1 = {2, 6, 7};    // Second tensor
+        const std::vector<std::size_t> outputDims = {5, 2, 6, 7};
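+        // Right-aligned broadcasting: [5,2,1,7] vs [2,6,7] -> [5,2,1,7] vs
+        // [1,2,6,7]; the output takes the max of each pair -> [5,2,6,7].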
+
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_real_distribution<float> dist(0.1f, 1.0f);
+
+        auto T0 = std::make_shared<Tensor>(dims0);
+        T0->setDataType(DataType::Float32);
+        T0->setBackend("cpu");
+        float* input0Data = static_cast<float*>(T0->getImpl()->rawPtr());
+        // Fill with random values
+        for (std::size_t i = 0; i < T0->size(); ++i) {
+            input0Data[i] = dist(gen);
+        }
+
+        auto T1 = std::make_shared<Tensor>(dims1);
+        T1->setDataType(DataType::Float32);
+        T1->setBackend("cpu");
+        float* input1Data = static_cast<float*>(T1->getImpl()->rawPtr());
+        // Fill with random values
+        for (std::size_t i = 0; i < T1->size(); ++i) {
+            input1Data[i] = dist(gen);
+        }
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+
+        op->forwardDims();
+        op->forward();
+
+        Tensor expectedOutput{outputDims};
+        expectedOutput.setBackend("cpu");
+        float* expectedOutputData = static_cast<float*>(expectedOutput.getImpl()->rawPtr());
+
+        for (std::size_t n = 0; n < 5; ++n) {
+            for (std::size_t c = 0; c < 2; ++c) {
+                for (std::size_t h = 0; h < 6; ++h) {
+                    for (std::size_t w = 0; w < 7; ++w) {
+                        std::size_t outIdx = w + 7 * (h + 6 * (c + 2 * n));
+                        std::size_t in0Idx =
+                            w + 7 * (0 + 1 * (c + 2 * n)); // middle dim is 1
+                        std::size_t in1Idx =
+                            w + 7 * (h + 6 * c);           // no n dimension
+
+                        expectedOutputData[outIdx] = input0Data[in0Idx] / input1Data[in1Idx];
+                    }
+                }
+            }
+        }
+
+        auto outputTensor = op->getOutput(0);
+
+        REQUIRE(approxEq<float>(*outputTensor, expectedOutput));
+
+        // Backward pass
+        std::vector<float> gradOutputData(expectedOutput.size());
+        for (auto &val : gradOutputData) {
+            val = dist(gen);
+        }
+
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>());
+        op->getOutput(0)->grad()->resize(outputDims);
+        op->getOutput(0)->grad()->setBackend("cpu");
+        op->getOutput(0)->grad()->setDataType(DataType::Float32);
+        op->getOutput(0)->grad()->getImpl()->setRawPtr(gradOutputData.data(),
+                                                       expectedOutput.size());
+
+        // Compute reference gradients
+        std::vector<float> expectedGrad0(T0->size(), 0.0f);
+        std::vector<float> expectedGrad1(T1->size(), 0.0f);
+
+        for (std::size_t n = 0; n < 5; ++n) {
+            for (std::size_t c = 0; c < 2; ++c) {
+                for (std::size_t h = 0; h < 6; ++h) {
+                    for (std::size_t w = 0; w < 7; ++w) {
+                        std::size_t outIdx = w + 7 * (h + 6 * (c + 2 * n));
+                        std::size_t in0Idx = w + 7 * (0 + 1 * (c + 2 * n));
+                        std::size_t in1Idx = w + 7 * (h + 6 * c);
+
+                        expectedGrad0[in0Idx] += 
+                            gradOutputData[outIdx] * (1.0f / input1Data[in1Idx]);
+
+                        expectedGrad1[in1Idx] += 
+                            gradOutputData[outIdx] * (-input0Data[in0Idx] / (input1Data[in1Idx] * input1Data[in1Idx]));
+                    }
+                }
+            }
+        }
+
+        // Perform backward pass
+        op->backward();
+
+        auto expectedGrad0Tensor = std::make_shared<Tensor>();
+        expectedGrad0Tensor->resize(T0->dims());
+        expectedGrad0Tensor->setBackend("cpu");
+        expectedGrad0Tensor->setDataType(DataType::Float32);
+        expectedGrad0Tensor->getImpl()->setRawPtr(expectedGrad0.data(),
+                                                    expectedGrad0.size());
+
+        auto expectedGrad1Tensor = std::make_shared<Tensor>(T1->dims());
+        expectedGrad1Tensor->setBackend("cpu");
+        expectedGrad1Tensor->setDataType(DataType::Float32);
+        expectedGrad1Tensor->getImpl()->setRawPtr(expectedGrad1.data(),
+                                                    expectedGrad1.size());
+
+        // Verify backward pass
+        REQUIRE(approxEq<float>(*T0->grad(), *expectedGrad0Tensor));
+        REQUIRE(approxEq<float>(*T1->grad(), *expectedGrad1Tensor));
+    }
+}
 } // namespace Aidge
-- 
GitLab


From 8a6699936cc68401f588760e51b7382dfef32fc7 Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Thu, 20 Feb 2025 11:10:50 +0100
Subject: [PATCH 003/108] Added /bigobj for unit tests on Windows

---
 unit_tests/CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/unit_tests/CMakeLists.txt b/unit_tests/CMakeLists.txt
index 6c7af9c3..e1f261d0 100644
--- a/unit_tests/CMakeLists.txt
+++ b/unit_tests/CMakeLists.txt
@@ -25,6 +25,10 @@ target_link_libraries(tests${module_name} PRIVATE ${module_name})
 
 target_link_libraries(tests${module_name} PRIVATE Catch2::Catch2WithMain)
 
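+# MSVC caps the number of sections per object file (~65k by default); the
+# template-heavy Catch2 test units can exceed it, so /bigobj lifts the cap.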
+target_compile_options(tests${module_name} PRIVATE
+    $<$<CXX_COMPILER_ID:MSVC>:/bigobj>
+)
+
 list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras)
 include(CTest)
 include(Catch)
-- 
GitLab


From 97d0996af09c481aec835064ec6ad30027a10c40 Mon Sep 17 00:00:00 2001
From: hrouis <houssemeddine.rouis92@gmail.com>
Date: Fri, 24 Jan 2025 16:06:22 +0100
Subject: [PATCH 004/108] add Equal operator

---
 include/aidge/backend/cpu.hpp                 |   1 +
 .../aidge/backend/cpu/operator/EqualImpl.hpp  |  32 +++
 .../cpu/operator/EqualImpl_kernels.hpp        | 163 ++++++++++++++
 src/operator/EqualImpl.cpp                    |  61 ++++++
 unit_tests/operator/Test_EqualImpl.cpp        | 205 ++++++++++++++++++
 5 files changed, 462 insertions(+)
 create mode 100644 include/aidge/backend/cpu/operator/EqualImpl.hpp
 create mode 100644 include/aidge/backend/cpu/operator/EqualImpl_kernels.hpp
 create mode 100644 src/operator/EqualImpl.cpp
 create mode 100644 unit_tests/operator/Test_EqualImpl.cpp

diff --git a/include/aidge/backend/cpu.hpp b/include/aidge/backend/cpu.hpp
index 5db19a2b..ffc03ae5 100644
--- a/include/aidge/backend/cpu.hpp
+++ b/include/aidge/backend/cpu.hpp
@@ -29,6 +29,7 @@
 #include "aidge/backend/cpu/operator/ConvImpl.hpp"
 #include "aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp"
 #include "aidge/backend/cpu/operator/DivImpl.hpp"
+#include "aidge/backend/cpu/operator/EqualImpl.hpp"
 #include "aidge/backend/cpu/operator/ErfImpl.hpp"
 #include "aidge/backend/cpu/operator/ExpandImpl.hpp"
 #include "aidge/backend/cpu/operator/FCImpl.hpp"
diff --git a/include/aidge/backend/cpu/operator/EqualImpl.hpp b/include/aidge/backend/cpu/operator/EqualImpl.hpp
new file mode 100644
index 00000000..e2489096
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/EqualImpl.hpp
@@ -0,0 +1,32 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_EQUALIMPL_H_
+#define AIDGE_CPU_OPERATOR_EQUALIMPL_H_
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/Equal.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+#include <memory>
+#include <vector>
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+using EqualImpl_cpu = OperatorImpl_cpu<Equal_Op,
+    void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*)>;
+
+// Implementation entry point registration to Operator
+REGISTRAR(Equal_Op, "cpu", Aidge::EqualImpl_cpu::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_EQUALIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/EqualImpl_kernels.hpp b/include/aidge/backend/cpu/operator/EqualImpl_kernels.hpp
new file mode 100644
index 00000000..3c8ff0f4
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/EqualImpl_kernels.hpp
@@ -0,0 +1,163 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_EQUALIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_EQUALIMPL_KERNELS_H_
+
+#include "aidge/backend/cpu/operator/EqualImpl.hpp"
+#include "aidge/utils/Registrar.hpp"
+
+namespace Aidge {
+
+namespace {
+// assumes values are contiguous in memory
+template <class I, class O>
+void equal_contiguous_arrays(const std::size_t input1size,
+                            const std::size_t input2size,
+                            const std::size_t output1size,
+                            const I* input1,
+                            const I* input2,
+                            O* output)
+{
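+    // input1size and input2size are each either output1size or 1; a size of 1
+    // means that input is a broadcast scalar and its single value is reused.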
+    for (std::size_t i = 0; i < output1size; ++i)
+    {
+        const std::size_t in1_id = (input1size != 1) ? i : 0;
+        const std::size_t in2_id = (input2size != 1) ? i : 0;
+        output[i] = static_cast<O>(input1[in1_id] == input2[in2_id]);
+    }
+}
+}
+
+
+template <class I, class O>
+void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
+                                std::vector<std::size_t> dims1,
+                                const std::vector<std::size_t>& outputDims,
+                                const void* input0_,
+                                const void* input1_,
+                                void* output_) {
+
+    const I* input_0 = static_cast<const I*>(input0_);
+    const I* input_1 = static_cast<const I*>(input1_);
+    O* output = static_cast<O*>(output_);
+
+    // [5,2,1,7] & [2,6,7]
+    // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7]
+    // 2. Find the highest equal dimension -> 3
+    //    Exception: if the first diverging dimension is the last one, then -> 4 (dims.size())
+    // 3. Compute the highest number of contiguous data -> 7
+    // 4. Compute stride and offset step for the broadcast mechanism
+    // 5. Call a simple kernel
+
+    // Special case: the dimensions are identical, so the comparison runs over the entire arrays at once.
+    if (dims0 == dims1) {
+        const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+        for (std::size_t i = 0; i < input0_contiguous_size; ++i)
+        {
+            output[i] = static_cast<O>(input_0[i] == input_1[i]);
+        }
+        return;
+    }
+
+    // set dimensions to be of equal size by filling the smallest one with ones.
+    if (dims0.size() > dims1.size()) {
+        dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
+    }
+    else if (dims1.size() > dims0.size()) {
+        dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
+    }
+
+    const std::size_t nbDims = dims0.size();
+
+    // Find the highest equal dimension
+    std::size_t contiguousIdx = nbDims;
+    while (contiguousIdx-- > 0) {
+        if (dims0[contiguousIdx] != dims1[contiguousIdx]) {
+            if (contiguousIdx == (nbDims - 1)) { // the last dimensions of one of the input Tensors are of size 1
+                const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1;
+                while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) {
+                    --contiguousIdx;
+                }
+            }
+            break;
+        }
+    }
+    ++contiguousIdx;
+
+    // Compute the highest number of contiguous data for each Tensor
+    const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+
+    // initialize strides to iterate through data because of broadcasting
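+    // stride_post[i]: number of contiguous blocks spanned by dimension i.
+    // stride_step[i]: block offset applied when dimension i advances: +1 for a
+    // regular dimension, 1 - stride_post[i] (a rewind) for a broadcast
+    // dimension of size 1, so the same data gets reused.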
+    std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    if (contiguousIdx > 0) {
+        stride_post0[contiguousIdx - 1] = 1;
+        stride_post1[contiguousIdx - 1] = 1;
+        for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) {
+            stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]);
+            stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]);
+        }
+        for (std::size_t i = 0; i != contiguousIdx; ++i) {
+            stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1;
+            stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1;
+        }
+    }
+
+    // variables for arrays offsets
+    std::size_t offsetIn0 = 0;
+    std::size_t offsetIn1 = 0;
+    std::size_t offsetOut = 0;
+
+
+    std::size_t dim = contiguousIdx - 1;
+    const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
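+    // Each "stack" is one combination of the leading (non-contiguous) output
+    // dimensions; compare one contiguous block per stack, then advance the
+    // input offsets according to the deepest dimension that rolled over.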
+    for (std::size_t stack = 0; stack < nbStacks;) {
+        equal_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
+                    input_0 + offsetIn0*input0_contiguous_size,
+                    input_1 + offsetIn1*input1_contiguous_size,
+                    output + offsetOut*output_contiguous_size);
+        if (++stack < nbStacks) {
+            std::size_t tmp_stack = stack;
+            while(tmp_stack % outputDims[dim] == 0) {
+                tmp_stack /= outputDims[dim];
+                dim--;
+            }
+            offsetIn0 += stride_step0[dim];
+            offsetIn1 += stride_step1[dim];
+            ++offsetOut;
+            dim = contiguousIdx - 1;
+        }
+    }
+}
+
+// Kernels registration to implementation entry point
+REGISTRAR(EqualImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float32}},
+    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<float, float>, nullptr});
+REGISTRAR(EqualImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}},
+    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<double, double>, nullptr});
+REGISTRAR(EqualImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}},
+    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr});
+REGISTRAR(EqualImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int64}},
+    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, nullptr});
+
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_EQUALIMPL_KERNELS_H_ */
diff --git a/src/operator/EqualImpl.cpp b/src/operator/EqualImpl.cpp
new file mode 100644
index 00000000..5926212e
--- /dev/null
+++ b/src/operator/EqualImpl.cpp
@@ -0,0 +1,61 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <cassert>
+#include <chrono>  // std::chrono::milliseconds
+#include <numeric> // std::accumulate
+#include <thread>  // std::this_thread::sleep_for
+#include <vector>
+
+#include "aidge/operator/Equal.hpp"
+#include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/data/Broadcasting.hpp"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+
+#include "aidge/backend/cpu/operator/EqualImpl.hpp"
+#include "aidge/backend/cpu/operator/EqualImpl_kernels.hpp"
+
+template <>
+void Aidge::EqualImpl_cpu::forward() {
+    const Equal_Op& op = static_cast<const Equal_Op&>(mOp);
+    // Check inputs
+    AIDGE_ASSERT(op.getInput(0), "missing input in Equal operator");
+    AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Equal forward because the 0-th input has no implementation.");
+
+    AIDGE_ASSERT(op.getInput(1), "missing input in Equal operator");
+    AIDGE_ASSERT(op.getInput(1)->hasImpl(), "cannot run Equal forward because the 1st input has no implementation.");
+
+    AIDGE_ASSERT(op.getInput(1)->dataType() == op.getInput(0)->dataType(), "Cannot compute Equal on inputs with two different data types.");
+
+    // Find the correct kernel type
+    const auto impl = Registrar<EqualImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Convert input data (no overhead if not needed!)
+    // TODO: right now, if needed, memory will be allocated/deallocated at each
+    // call to forward(). We might put the following shared_ptr as members of
+    // this class to avoid that.
+    std::shared_ptr<Tensor> input0Fallback, input1Fallback;
+    const auto& input0 = op.getInput(0)->refCastFrom(input0Fallback, *op.getInput(0));
+    const auto& input1 = op.getInput(1)->refCastFrom(input1Fallback, *op.getInput(1));
+
+    impl.forward(op.getInput(0)->dims(),
+                op.getInput(1)->dims(),
+                op.getOutput(0)->dims(),
+                input0.getImpl()->rawPtr(),
+                input1.getImpl()->rawPtr(),
+                getCPUPtr(op.getRawOutput(0)));
+}
+
+template <>
+void Aidge::EqualImpl_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Equal_Op on backend cpu");
+}
diff --git a/unit_tests/operator/Test_EqualImpl.cpp b/unit_tests/operator/Test_EqualImpl.cpp
new file mode 100644
index 00000000..a229b8ce
--- /dev/null
+++ b/unit_tests/operator/Test_EqualImpl.cpp
@@ -0,0 +1,205 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <catch2/catch_test_macros.hpp>
+#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
+
+#include "aidge/data/Tensor.hpp"
+#include "aidge/operator/Equal.hpp"
+
+#include "aidge/backend/cpu.hpp"
+
+using namespace Aidge;
+
+TEST_CASE("[cpu/operator] Equal(forward)", "[Equal][CPU]") {
+    SECTION("ForwardDims")
+    {
+        constexpr std::uint16_t NBTRIALS = 10;
+        // Create a random number generator
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0.1 and 1.1
+        std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
+        std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
+        std::uniform_int_distribution<int> boolDist(0,1);
+
+        SECTION("Same dimensions") {
+            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+                DimSize_t nbDims = nbDimsDist(gen);
+                std::vector<DimSize_t> dims(nbDims);
+                for (std::size_t i = 0; i < nbDims; i++) {
+                    dims[i] = dimSizeDist(gen);
+                }
+
+                std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims);
+                myInput1->setBackend("cpu");
+                myInput1->setDataType(DataType::Float32);
+                myInput1->zeros();
+                std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims);
+                myInput2->setBackend("cpu");
+                myInput2->setDataType(DataType::Float32);
+                myInput2->zeros();
+                std::shared_ptr<Node> myEqual = Equal();
+                auto op = std::static_pointer_cast<OperatorTensor>(myEqual->getOperator());
+                op->associateInput(0,myInput1);
+                op->associateInput(1,myInput2);
+                op->setDataType(DataType::Float32);
+                op->setBackend("cpu");
+                op->forwardDims();
+
+                const auto outputDims = op->getOutput(0)->dims();
+                REQUIRE(outputDims == dims);
+            }
+        }
+        SECTION("Broadcasting") {
+            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+                DimSize_t nbDims = nbDimsDist(gen);
+                std::vector<DimSize_t> dims1(nbDims, 1);
+                std::vector<DimSize_t> dims2(nbDims, 1);
+                std::vector<DimSize_t> expectedOutDims;
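+                // Each dimension is independently left at 1 (broadcast) or set
+                // to a common random size; the output dimension is their max.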
+                for (std::size_t i = 0; i < nbDims; i++) {
+                    DimSize_t dim = dimSizeDist(gen);
+                    if (boolDist(gen)) {
+                        dims1[i] = dim;
+                    }
+                    if (boolDist(gen)) {
+                        dims2[i] = dim;
+                    }
+                    expectedOutDims.push_back(std::max(dims1[i],dims2[i]));
+                }
+
+
+                std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims1);
+                myInput1->setBackend("cpu");
+                myInput1->setDataType(DataType::Float32);
+                myInput1->zeros();
+                std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims2);
+                myInput2->setBackend("cpu");
+                myInput2->setDataType(DataType::Float32);
+                myInput2->zeros();
+                std::shared_ptr<Node> myEqual = Equal();
+                auto op = std::static_pointer_cast<OperatorTensor>(myEqual->getOperator());
+                op->associateInput(0,myInput1);
+                op->associateInput(1,myInput2);
+                op->setDataType(DataType::Float32);
+                op->setBackend("cpu");
+
+                op->forwardDims();
+
+                const auto outputDims = op->getOutput(0)->dims();
+                REQUIRE(outputDims == expectedOutDims);
+            }
+        }
+    }
+    SECTION("Same size inputs") {
+        std::shared_ptr<Tensor> input1 = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
+        {                                       //
+            {                                   //
+                {{20, 15},{31, 11},{22, 49}},   //
+                {{41, 10},{24, 51},{27, 52}},   //
+                {{26, 53},{27, 54},{28, 55}}    //
+            },                                  //
+            {                                   //
+                {{29, 56},{30, 57},{31, 58}},   //
+                {{32, 59},{33, 60},{34, 61}},   //
+                {{35, 62},{36, 63},{37, 64}}    //
+            },                                  //
+            {                                   //
+                {{38, 65},{39, 66},{40, 67}},   //
+                {{41, 68},{42, 69},{43, 70}},   //
+                {{44, 71},{45, 72},{46, 73}}    //
+            }                                   //
+        }                                       //
+    });                                         //
+        std::shared_ptr<Tensor> input2 = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
+            {                                       //
+                {                                   //
+                    {{20, 47},{21, 48},{22, 49}},   //
+                    {{23, 50},{24, 51},{25, 52}},   //
+                    {{17, 53},{27, 26},{14, 33}}    //
+                },                                  //
+                {                                   //
+                    {{29, 56},{30, 57},{31, 58}},   //
+                    {{72, 44},{33, 20},{27, 55}},   //
+                    {{35, 24},{25, 63},{28, 64}}    //
+                },                                  //
+                {                                   //
+                    {{32, 65},{39, 66},{40, 70}},   //
+                    {{41, 53},{42, 60},{34, 70}},   //
+                    {{44, 71},{30, 12},{46, 73}}    //
+                }                                   //
+            }                                       //
+        });                                         //
+        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
+            {
+                {
+                    {{1, 0},{0, 0},{1, 1}},
+                    {{0, 0},{1, 1},{0, 1}},
+                    {{0, 1},{1, 0},{0, 0}}
+                },
+                {
+                    {{1, 1},{1, 1},{1, 1}},
+                    {{0, 0},{1, 0},{0, 0}},
+                    {{1, 0},{0, 1},{0, 1}}
+                },
+                {
+                    {{0, 1},{1, 1},{1, 0}},
+                    {{1, 0},{1, 0},{0, 1}},
+                    {{1, 1},{0, 0},{1, 1}}
+                }
+            }
+        });
+
+        std::shared_ptr<Node> myEqual = Equal();
+        auto op = std::static_pointer_cast<OperatorTensor>(myEqual->getOperator());
+        op->associateInput(0, input1);
+        op->associateInput(1, input2);
+        op->setBackend("cpu");
+        op->setDataType(DataType::Int32);
+        myEqual->forward();
+
+        REQUIRE(*(op->getOutput(0)) == *expectedOutput);
+    }
+
+    SECTION("Broadcasting") {
+        std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<int,1,3,3,2> {
+        {                                       //
+            {                                   //
+                {{10, 20},{22, 23},{20, 20}},   //
+                {{10, 15},{10, 29},{20, 20}},   //
+                {{26, 25},{33, 20},{10, 20}}    //
+            }                                   //
+        }                                       //
+        });                                     //
+
+        std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<int,2> {{10, 20}});  
+        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,1,3,3,2> {
+            {                                   //
+                {                               //
+                    {{ 1, 1},{ 0, 0},{ 0, 1}},  //
+                    {{ 1, 0},{ 1, 0},{ 0, 1}},  //
+                    {{ 0, 0},{ 0, 1},{ 1, 1}}   //
+                }                               //
+            }                                   //
+        });                                     //
+
+        std::shared_ptr<Node> myEqual = Equal();
+        auto op = std::static_pointer_cast<OperatorTensor>(myEqual->getOperator());
+        op->associateInput(0, input_1);
+        op->associateInput(1, input_2);
+        op->setDataType(DataType::Int32);
+        op->setBackend("cpu");
+        myEqual->forward();
+        REQUIRE(*op->getOutput(0) == *expectedOutput);
+    }
+}
\ No newline at end of file
-- 
GitLab


From 8701618c638707e45478b781a78a1d64ec16f407 Mon Sep 17 00:00:00 2001
From: hrouis <houssemeddine.rouis92@gmail.com>
Date: Fri, 24 Jan 2025 16:07:13 +0100
Subject: [PATCH 005/108] fix And operator

---
 .../backend/cpu/operator/AndImpl_kernels.hpp  |  29 ++-
 unit_tests/operator/Test_AndImpl.cpp          | 191 +++++++++---------
 2 files changed, 108 insertions(+), 112 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/AndImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AndImpl_kernels.hpp
index 73b710e0..d7c8ebcf 100644
--- a/include/aidge/backend/cpu/operator/AndImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/AndImpl_kernels.hpp
@@ -20,7 +20,7 @@ namespace Aidge {
 namespace {
 // suppose values are contiguous in memory
 template <class I, class O>
-void equal_contiguous_arrays(const std::size_t input1size,
+void and_contiguous_arrays(const std::size_t input1size,
                             const std::size_t input2size,
                             const std::size_t output1size,
                             const I* input1,
@@ -31,14 +31,14 @@ void equal_contiguous_arrays(const std::size_t input1size,
     {
         const std::size_t in1_id = (input1size != 1) ? i : 0;
         const std::size_t in2_id = (input2size != 1) ? i : 0;
-        output[i] = static_cast<O>(input1[in1_id] == input2[in2_id]);
+        output[i] = static_cast<O>(input1[in1_id] && input2[in2_id]);
     }
 }
 }
 
 
 template <class I, class O>
-void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
+void AndImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
                                 std::vector<std::size_t> dims1,
                                 const std::vector<std::size_t>& outputDims,
                                 const void* input0_,
@@ -60,9 +60,8 @@ void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
     // special case for equal dimensions, the kernel is called with the entire arrays at once
     if (dims0 == dims1) {
         const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
-        for (std::size_t i = 0; i < input0_contiguous_size; ++i)
-        {
-            output[i] = static_cast<O>(input_0[i] == input_1[i]);
+        for (std::size_t i = 0; i < input0_contiguous_size; ++i) {
+            output[i] = static_cast<O>(input_0[i] && input_1[i]);
         }
         return;
     }
@@ -126,7 +125,7 @@ void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
     std::size_t dim = contiguousIdx - 1;
     const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
     for (std::size_t stack = 0; stack < nbStacks;) {
-        equal_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
+        and_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
                     input_0 + offsetIn0*input0_contiguous_size,
                     input_1 + offsetIn1*input1_contiguous_size,
                     output + offsetOut*output_contiguous_size);
@@ -146,17 +145,17 @@ void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
 
 // Kernels registration to implementation entry point
 REGISTRAR(AndImpl_cpu,
-    {DataType::Float32},
-    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<float, float>, nullptr});
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float32}},
+    {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<float, float>, nullptr});
 REGISTRAR(AndImpl_cpu,
-    {DataType::Float64},
-    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<double, double>, nullptr});
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}},
+    {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<double, double>, nullptr});
 REGISTRAR(AndImpl_cpu,
-    {DataType::Int32},
-    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr});
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}},
+    {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr});
 REGISTRAR(AndImpl_cpu,
-    {DataType::Int64},
-    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, nullptr});
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int64}},
+    {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, nullptr});
 
 }  // namespace Aidge
 
diff --git a/unit_tests/operator/Test_AndImpl.cpp b/unit_tests/operator/Test_AndImpl.cpp
index c2309dce..978a89e5 100644
--- a/unit_tests/operator/Test_AndImpl.cpp
+++ b/unit_tests/operator/Test_AndImpl.cpp
@@ -26,75 +26,92 @@
 using namespace Aidge;
 
 TEST_CASE("[cpu/operator] And(forward)", "[And][CPU]") {
-        SECTION("ForwardDims")
-    {
+    SECTION("ForwardDims") {
         constexpr std::uint16_t NBTRIALS = 10;
         // Create a random number generator
         std::random_device rd;
         std::mt19937 gen(rd());
-        std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0 and 1
-        std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
-        std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
-        std::uniform_int_distribution<int> boolDist(0,1);
+        std::uniform_int_distribution<int> boolDist(0, 1); // Use 0 for false, 1 for true
+        std::uniform_int_distribution<std::size_t> dimSizeDist(2, 10);
+        std::uniform_int_distribution<std::size_t> nbDimsDist(1, 5);
 
         SECTION("Same dimensions") {
             for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
                 DimSize_t nbDims = nbDimsDist(gen);
                 std::vector<DimSize_t> dims(nbDims);
-                for (std::size_t i = 0; i < nbDims; i++) {
+                for (std::size_t i = 0; i < nbDims; ++i) {
                     dims[i] = dimSizeDist(gen);
                 }
-
+                const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                float* array0 = new float[nb_elements];
+                float* array1 = new float[nb_elements];
+                for (std::size_t i = 0; i < nb_elements; ++i) {
+                    array0[i] = boolDist(gen);
+                    array1[i] = boolDist(gen);
+                }
                 std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims);
-                myInput1->setBackend("cpu");
-                myInput1->setDataType(DataType::Float32);
-                myInput1->zeros();
                 std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims);
-                myInput2->setBackend("cpu");
+                myInput1->setDataType(DataType::Float32);
                 myInput2->setDataType(DataType::Float32);
-                myInput2->zeros();
+                myInput1->setBackend("cpu");
+                myInput2->setBackend("cpu");
+
+                myInput1->getImpl()->setRawPtr(array0, nb_elements);
+                myInput2->getImpl()->setRawPtr(array1, nb_elements);
+
                 std::shared_ptr<Node> myAnd = And();
-                auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator());
-                op->associateInput(0,myInput1);
-                op->associateInput(1,myInput2);
+                auto op = std::static_pointer_cast<OperatorTensor>(myAnd->getOperator());
+                op->associateInput(0, myInput1);
+                op->associateInput(1, myInput2);
                 op->setDataType(DataType::Float32);
                 op->setBackend("cpu");
                 op->forwardDims();
 
                 const auto outputDims = op->getOutput(0)->dims();
                 REQUIRE(outputDims == dims);
+                delete[] array0;
+                delete[] array1;
             }
         }
+
         SECTION("Broadcasting") {
             for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
                 DimSize_t nbDims = nbDimsDist(gen);
                 std::vector<DimSize_t> dims1(nbDims, 1);
                 std::vector<DimSize_t> dims2(nbDims, 1);
                 std::vector<DimSize_t> expectedOutDims;
-                for (std::size_t i = 0; i < nbDims; i++) {
+                for (std::size_t i = 0; i < nbDims; ++i) {
                     DimSize_t dim = dimSizeDist(gen);
-                    if (boolDist(gen)) {
-                        dims1[i] = dim;
-                    }
-                    if (boolDist(gen)) {
-                        dims2[i] = dim;
-                    }
-                    expectedOutDims.push_back(std::max(dims1[i],dims2[i]));
+                    if (boolDist(gen)) dims1[i] = dim;
+                    if (boolDist(gen)) dims2[i] = dim;
+                    expectedOutDims.push_back(std::max(dims1[i], dims2[i]));
                 }
 
+                const std::size_t nb_elements0 = std::accumulate(dims1.cbegin(), dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                const std::size_t nb_elements1 = std::accumulate(dims2.cbegin(), dims2.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                float* array0 = new float[nb_elements0];
+                float* array1 = new float[nb_elements1];
+                for (std::size_t i = 0; i < nb_elements0; ++i) {
+                    array0[i] = boolDist(gen);
+                }
+                for (std::size_t i = 0; i < nb_elements1; ++i) {
+                    array1[i] = boolDist(gen);
+                }
 
                 std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims1);
-                myInput1->setBackend("cpu");
-                myInput1->setDataType(DataType::Float32);
-                myInput1->zeros();
                 std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims2);
-                myInput2->setBackend("cpu");
+                myInput1->setDataType(DataType::Float32);
                 myInput2->setDataType(DataType::Float32);
-                myInput2->zeros();
+                myInput1->setBackend("cpu");
+                myInput2->setBackend("cpu");
+                myInput1->getImpl()->setRawPtr(array0, nb_elements0);
+                myInput2->getImpl()->setRawPtr(array1, nb_elements1);
+
                 std::shared_ptr<Node> myAnd = And();
-                auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator());
-                op->associateInput(0,myInput1);
-                op->associateInput(1,myInput2);
+                auto op = std::static_pointer_cast<OperatorTensor>(myAnd->getOperator());
+                op->associateInput(0, myInput1);
+                op->associateInput(1, myInput2);
                 op->setDataType(DataType::Float32);
                 op->setBackend("cpu");
 
@@ -102,80 +119,48 @@ TEST_CASE("[cpu/operator] And(forward)", "[And][CPU]") {
 
                 const auto outputDims = op->getOutput(0)->dims();
                 REQUIRE(outputDims == expectedOutDims);
+                delete[] array0;
+                delete[] array1;
             }
         }
     }
+
     SECTION("Same size inputs") {
-        std::shared_ptr<Tensor> input1 = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
-        {                                       //
-            {                                   //
-                {{20, 15},{31, 11},{22, 49}},   //
-                {{41, 10},{24, 51},{27, 52}},   //
-                {{26, 53},{27, 54},{28, 55}}    //
-            },                                  //
-            {                                   //
-                {{29, 56},{30, 57},{31, 58}},   //
-                {{32, 59},{33, 60},{34, 61}},   //
-                {{35, 62},{36, 63},{37, 64}}    //
-            },                                  //
-            {                                   //
-                {{38, 65},{39, 66},{40, 67}},   //
-                {{41, 68},{42, 69},{43, 70}},   //
-                {{44, 71},{45, 72},{46, 73}}    //
-            }                                   //
-        }                                       //
-    });                                         //
-        std::shared_ptr<Tensor> input2 = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
-            {                                       //
-                {                                   //
-                    {{20, 47},{21, 48},{22, 49}},   //
-                    {{23, 50},{24, 51},{25, 52}},   //
-                    {{17, 53},{27, 26},{14, 33}}    //
-                },                                  //
-                {                                   //
-                    {{29, 56},{30, 57},{31, 58}},   //
-                    {{72, 44},{33, 20},{27, 55}},   //
-                    {{35, 24},{25, 63},{28, 64}}    //
-                },                                  //
-                {                                   //
-                    {{32, 65},{39, 66},{40, 70}},   //
-                    {{41, 53},{42, 60},{34, 70}},   //
-                    {{44, 71},{30, 12},{46, 73}}    //
-                }                                   //
-            }                                       //
-        });                                         //
-        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
+        std::shared_ptr<Tensor> input1 = std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 2>{
             {
-                {
-                    {{1, 0},{0, 0},{1, 1}},
-                    {{0, 0},{1, 1},{0, 1}},
-                    {{0, 1},{1, 0},{0, 0}}
-                },
-                {
-                    {{1, 1},{1, 1},{1, 1}},
-                    {{0, 0},{1, 0},{0, 0}},
-                    {{1, 0},{0, 1},{0, 1}}
-                },
-                {
-                    {{0, 1},{1, 1},{1, 0}},
-                    {{1, 0},{1, 0},{0, 1}},
-                    {{1, 1},{0, 0},{1, 1}}
-                }
-            }
-        });
+                {{{1, 0}, {0, 1}},
+                {{1, 1}, {0, 0}}},
+                {{{0, 1}, {1, 0}},
+                {{1, 0}, {0, 1}}}}
+            });
+        std::shared_ptr<Tensor> input2 = std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 2>{
+            {
+                {{{1, 1}, {0, 0}},
+                {{0, 1}, {1, 1}}},
+                {{{1, 1}, {0, 0}},
+                {{0, 1}, {1, 0}}}}
+            });
+        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 2>{
+            {
+                {{{1, 0}, {0, 0}},
+                {{0, 1}, {0, 0}}},
+                {{{0, 1}, {0, 0}},
+                {{0, 0}, {0, 0}}}}
+            });
 
         std::shared_ptr<Node> myAnd = And();
-        auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator());
+        auto op = std::static_pointer_cast<OperatorTensor>(myAnd->getOperator());
         op->associateInput(0, input1);
         op->associateInput(1, input2);
         op->setBackend("cpu");
-        op->setDataType(DataType::Int32);
+        op->setDataType(DataType::Float32);
         myAnd->forward();
         REQUIRE(*(op->getOutput(0)) == *expectedOutput);
     }
 
     SECTION("Broadcasting") {
-        std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<int,1,3,3,2> {
-        {                                       //
-            {                                   //
@@ -196,16 +181,28 @@ TEST_CASE("[cpu/operator] And(forward)", "[And][CPU]") {
-                }                               //
-            }                                   //
-        });                                     //
+        std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<float, 1, 2, 2, 2>{
+            {
+                {{{1, 0}, {1, 0}},
+                {{1, 1}, {0, 0}}}}
+            });
+        std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<float, 2>{{1, 0}});
+        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<float, 1, 2, 2, 2>{
+            {
+                {{{1, 0}, {1, 0}},
+                {{1, 0}, {0, 0}}}}
+            });
 
         std::shared_ptr<Node> myAnd = And();
-        auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator());
+        auto op = std::static_pointer_cast<OperatorTensor>(myAnd->getOperator());
         op->associateInput(0, input_1);
         op->associateInput(1, input_2);
-        op->setDataType(DataType::Int32);
+        op->setDataType(DataType::Float32);
         op->setBackend("cpu");
         myAnd->forward();
-        op->getOutput(0)->print();
-        expectedOutput->print();
-        REQUIRE(*op->getOutput(0) == *expectedOutput);
+
+        REQUIRE(*(op->getOutput(0)) == *expectedOutput);
     }
-}
\ No newline at end of file
+}
-- 
GitLab


From 0fbfb571b52f38f02c1a691cecac20d3843cbf22 Mon Sep 17 00:00:00 2001
From: hrouis <houssemeddine.rouis92@gmail.com>
Date: Fri, 24 Jan 2025 16:08:17 +0100
Subject: [PATCH 006/108] add dilations to maxpool

---
 .../backend/cpu/operator/MaxPoolingImpl.hpp   |   1 +
 .../cpu/operator/MaxPoolingImpl_kernels.hpp   | 126 ++----------------
 src/operator/MaxPoolingImpl.cpp               |   1 +
 unit_tests/operator/Test_MaxPoolingImpl.cpp   |  35 +++++
 4 files changed, 49 insertions(+), 114 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/MaxPoolingImpl.hpp b/include/aidge/backend/cpu/operator/MaxPoolingImpl.hpp
index 68cc3621..062088a1 100644
--- a/include/aidge/backend/cpu/operator/MaxPoolingImpl.hpp
+++ b/include/aidge/backend/cpu/operator/MaxPoolingImpl.hpp
@@ -28,6 +28,7 @@ namespace Aidge {
 using MaxPooling2D_Op = MaxPooling_Op<2>;
 using MaxPoolingImpl2D_cpu = OperatorImpl_cpu<MaxPooling_Op<2>,
     void(const std::array<DimSize_t, 2>&,
+                            const std::array<DimSize_t, 2>&,
                             const std::array<DimSize_t, 2>&,
                             const bool,
                             const std::array<DimSize_t, 4> &,
diff --git a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
index 7b6f04f1..250b11b0 100644
--- a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
@@ -35,28 +35,23 @@ namespace Aidge {
 template <class I, class O>
 void MaxPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
                                         const std::array<DimSize_t, 2>& kernelDims,
+                                        const std::array<DimSize_t, 2>& dilations,
                                         const bool /*ceilMode*/,
                                         const std::array<DimSize_t, 4> &dims,
                                         const void *input_,
                                         void *output_) {
-    // FIXME: missing convolution parameters as arguments
     const I *input = static_cast<const I *>(input_);
     O *output = static_cast<O *>(output_);
 
     // output H size
     const std::size_t oxSize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(dims[2] - kernelDims[0] + strideDims[0]) /
+            static_cast<std::size_t>(std::floor(static_cast<float>(dims[2] - (kernelDims[0] - 1) * dilations[0] - 1 + strideDims[0]) /
                                 static_cast<float>(strideDims[0])));
     // output W size
     const std::size_t oySize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(dims[3] - kernelDims[1] + strideDims[1]) /
+            static_cast<std::size_t>(std::floor(static_cast<float>(dims[3] - (kernelDims[1] - 1) * dilations[1] - 1 + strideDims[1]) /
                                 static_cast<float>(strideDims[1])));
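+    // The dilated kernel spans (kernelDims - 1) * dilations + 1 input cells;
+    // e.g. dims[2] = 5, kernelDims[0] = 2, dilations[0] = 2, strideDims[0] = 1
+    // gives oxSize = floor((5 - 2 - 1 + 1) / 1) = 3.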
 
-    // TODO: kernel computation
-    // output (batch, outCh, Xout, Yout)
-    // input  (batch, ch, Xin, Yin)
-    // weight (outCh, ch, kernelX, kernelY)
-    // does not take Dilation parameter into account
     using signedsize = std::make_signed<std::size_t>::type;
     for (std::size_t batch = 0; batch < dims[0]; ++batch) {
         for (std::size_t ch = 0; ch < dims[1]; ++ch) {
@@ -77,12 +72,15 @@ void MaxPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideD
                     I poolValue(0.0);
                     bool valid = false;
 
-                    for (unsigned int channel = 0; channel < dims[1];
-                            ++channel){
-                        for (unsigned int sy = syMin; sy < syMax; ++sy) {
-                            for (unsigned int sx = sxMin; sx < sxMax; ++sx)
-                            {
-                                const I value = input[iIndex + (ix+sx)*dims[3] + (iy+sy)];
+                    for (unsigned int sy = syMin; sy < syMax; ++sy) {
+                        for (unsigned int sx = sxMin; sx < sxMax; ++sx) {
+                            // Apply dilation factor to kernel indices
+                            const std::size_t dilated_sx = sx * dilations[0];
+                            const std::size_t dilated_sy = sy * dilations[1];
+
+                            // Ensure indices are within bounds
+                            if ((ix + dilated_sx) < dims[2] && (iy + dilated_sy) < dims[3]) {
+                                const I value = input[iIndex + (ix + dilated_sx) * dims[3] + (iy + dilated_sy)];
 
                                 if (!valid || value > poolValue) {
                                     poolValue = value;
@@ -98,106 +96,6 @@ void MaxPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideD
     }
 }
 
-//N2D2 version
-/*
-template <class T>
-void N2D2::PoolCell_Frame_Kernels::forwardMax(const T* alpha,
-                                              const Tensor<T>&
-                                              inputs,
-                                              const Descriptor& desc,
-                                              const T* beta,
-                                              Tensor<T>& outputs,
-                                              Tensor<ArgMax>& argMax,
-                                              bool useArgMax,
-                                              const Tensor<bool>& maps)
-{
-    const unsigned int size = inputs.dimB() * outputs.dimZ();
-
-#if defined(_OPENMP) && _OPENMP >= 200805
-#pragma omp parallel for collapse(2) if (size > 16)
-#else
-#pragma omp parallel for if (inputs.dimB() > 4 && size > 16)
-#endif
-    for (int batchPos = 0; batchPos < (int)inputs.dimB(); ++batchPos) {
-        for (unsigned int output = 0; output < outputs.dimZ(); ++output) {
-            for (unsigned int oy = 0; oy < outputs.dimY(); ++oy) {
-                for (unsigned int ox = 0; ox < outputs.dimX(); ++ox) {
-                    const unsigned int sxMin = (unsigned int)std::max(
-                        desc.padding[0] - (int)(ox * desc.stride[0]), 0);
-                    const unsigned int syMin = (unsigned int)std::max(
-                        desc.padding[1] - (int)(oy * desc.stride[1]), 0);
-                    const unsigned int sxMax = Utils::clamp
-                        <int>(inputs.dimX() + desc.padding[0] - ox * desc.stride[0],
-                              0,
-                              desc.pool[0]);
-                    const unsigned int syMax = Utils::clamp
-                        <int>(inputs.dimY() + desc.padding[1] - oy * desc.stride[1],
-                              0,
-                              desc.pool[1]);
-
-                    const int ix = (int)(ox * desc.stride[0]) - desc.padding[0];
-                    const int iy = (int)(oy * desc.stride[1]) - desc.padding[1];
-
-                    T poolValue(0.0);
-
-                    // For each output, compute the pool value
-                    if (useArgMax) {
-                        const ArgMax inputMax
-                            = argMax(ox, oy, output, batchPos);
-
-                        if (inputMax.valid) {
-                            poolValue = inputs(inputMax.ix,
-                                               inputMax.iy,
-                                               inputMax.channel,
-                                               batchPos);
-                        }
-                    }
-                    else {
-                        unsigned int ixMax = 0;
-                        unsigned int iyMax = 0;
-                        unsigned int channelMax = 0;
-                        bool valid = false;
-
-                        for (unsigned int channel = 0; channel < inputs.dimZ();
-                             ++channel)
-                        {
-                            if (!maps.empty() && !maps(output, channel))
-                                continue;
-
-                            for (unsigned int sy = syMin; sy < syMax; ++sy) {
-                                for (unsigned int sx = sxMin; sx < sxMax; ++sx)
-                                {
-                                    const T value = inputs(ix + sx,
-                                                                 iy + sy,
-                                                                 channel,
-                                                                 batchPos);
-
-                                    if (!valid || value > poolValue) {
-                                        poolValue = value;
-                                        valid = true;
-
-                                        ixMax = ix + sx;
-                                        iyMax = iy + sy;
-                                        channelMax = channel;
-                                    }
-                                }
-                            }
-                        }
-
-                        argMax(ox, oy, output, batchPos)
-                            = ArgMax(ixMax, iyMax, channelMax, valid);
-                    }
-
-                    outputs(ox, oy, output, batchPos)
-                        = (*alpha) * poolValue
-                          + (*beta) * outputs(ox, oy, output, batchPos);
-                }
-            }
-        }
-    }
-}
-
-*/
 
 // Kernels registration to implementation entry point
 REGISTRAR(MaxPoolingImpl2D_cpu,
diff --git a/src/operator/MaxPoolingImpl.cpp b/src/operator/MaxPoolingImpl.cpp
index 90075a39..13ef75b0 100644
--- a/src/operator/MaxPoolingImpl.cpp
+++ b/src/operator/MaxPoolingImpl.cpp
@@ -30,6 +30,7 @@ void Aidge::MaxPoolingImpl2D_cpu::forward() {
     // Call kernel
     impl.forward(op_.strideDims(),
                 op_.kernelDims(),
+                op_.dilations(),
                 op_.ceilMode(),
                 op_.getInput(0)->template dims<4>(),
                 getCPUPtr(mOp.getRawInput(0)),
diff --git a/unit_tests/operator/Test_MaxPoolingImpl.cpp b/unit_tests/operator/Test_MaxPoolingImpl.cpp
index de02df2b..6b7e6d2f 100644
--- a/unit_tests/operator/Test_MaxPoolingImpl.cpp
+++ b/unit_tests/operator/Test_MaxPoolingImpl.cpp
@@ -80,4 +80,39 @@ TEST_CASE("[cpu/operator] MaxPooling(forward)", "[MaxPooling][CPU]") {
         op->getOutput(0)->print();
         REQUIRE(*(op->getOutput(0)) == myOutput);
     }
+    SECTION("Dilation") {
+        std::shared_ptr<Node> myMaxPool = MaxPooling({2,2}, "mycdw", {2,2}, {2,2}); // Dilation 2x2
+        auto op = std::static_pointer_cast<OperatorTensor>(myMaxPool -> getOperator());
+
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,2,2,2> {
+            {
+                {
+                    {
+                        {0.71470, 0.52770},
+                        {0.71470, 0.48740}
+                    },
+                    {
+                        {2.23290, 0.48590},
+                        {2.23290, 0.07000}
+                    }
+                },
+                {
+                    {
+                        {1.76530, 1.20710},
+                        {1.76530, 1.20710}
+                    },
+                    {
+                        {1.04290, 0.67760},
+                        {1.72170, 0.67760}
+                    }
+                }
+            }
+        });
+        myMaxPool->getOperator()->associateInput(0,myInput);
+        myMaxPool->getOperator()->setDataType(DataType::Float32);
+        myMaxPool->getOperator()->setBackend("cpu");
+        myMaxPool->forward();
+        op->getOutput(0)->print();
+        REQUIRE(*(op->getOutput(0)) == *myOutput);
+    }
 }
\ No newline at end of file
-- 
GitLab
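
Note on the revised size formula: with dilation d, a k-tap kernel spans (k - 1) * d + 1 input elements, and the new oxSize/oySize expressions account for exactly that extent. A minimal, self-contained sketch of the floor-mode computation (the function name and numbers are illustrative, not part of the patch):

    #include <cstddef>

    // Floor-mode pooled output size with dilation, assuming the input is at
    // least as large as the effective kernel extent, as in the kernel above.
    std::size_t pooledSizeFloor(std::size_t in, std::size_t k,
                                std::size_t s, std::size_t d) {
        const std::size_t effKernel = (k - 1) * d + 1; // dilated kernel extent
        return (in - effKernel) / s + 1;               // integer div == floor
    }

    // pooledSizeFloor(5, 2, 2, 2) == 2: a 2x2 kernel at dilation 2 spans 3
    // elements, so a 5-wide axis gives 2 output positions per dimension,
    // consistent with the 2x2 spatial output the Dilation test above expects.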


From 96cea9f4cbbb9d07d8d1cf2ac439c04d860613fa Mon Sep 17 00:00:00 2001
From: hrouis <houssemeddine.rouis92@gmail.com>
Date: Mon, 27 Jan 2025 15:21:08 +0100
Subject: [PATCH 007/108] add dilations and ceil_mode to AvgPooling

---
 .../backend/cpu/operator/AvgPoolingImpl.hpp   |  2 +
 .../cpu/operator/AvgPoolingImpl_kernels.hpp   | 76 ++++++++-----------
 src/operator/AvgPoolingImpl.cpp               |  2 +
 unit_tests/operator/Test_AndImpl.cpp          | 23 ------
 4 files changed, 36 insertions(+), 67 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/AvgPoolingImpl.hpp b/include/aidge/backend/cpu/operator/AvgPoolingImpl.hpp
index adea96ca..7c76657f 100644
--- a/include/aidge/backend/cpu/operator/AvgPoolingImpl.hpp
+++ b/include/aidge/backend/cpu/operator/AvgPoolingImpl.hpp
@@ -28,8 +28,10 @@ namespace Aidge {
 using AvgPooling2D_Op = AvgPooling_Op<2>;
 using AvgPoolingImpl2D_cpu = OperatorImpl_cpu<AvgPooling_Op<2>,
     void(const std::array<DimSize_t, 2>&,
+        const std::array<DimSize_t, 2>&,
         const std::array<DimSize_t, 2>&,
         const std::array<DimSize_t, 4>&,
+        bool,
         const void *,
         void *)>;
 
diff --git a/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
index f6da9dcb..68dbfbe7 100644
--- a/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
@@ -35,66 +35,54 @@ namespace Aidge {
 template <class I, class O>
 void AvgPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
                                         const std::array<DimSize_t, 2>& kernelDims,
+                                        const std::array<DimSize_t, 2>& dilations,
                                         const std::array<DimSize_t, 4> &dims,
+                                        bool ceilMode,
                                         const void *input_,
                                         void *output_) {
-    // FIXME: missing convolution attributes as arguments
     const I *input = static_cast<const I *>(input_);
     O *output = static_cast<O *>(output_);
 
+    // Calculate output dimensions based on ceilMode and dilations
+    auto compute_output_size = [&](DimSize_t inputDim, DimSize_t kernelDim, DimSize_t stride, DimSize_t dilation) {
+        DimSize_t effectiveKernelDim = (kernelDim - 1) * dilation + 1;
+        float result = static_cast<float>(inputDim - effectiveKernelDim + stride) / static_cast<float>(stride);
+        return ceilMode ? static_cast<DimSize_t>(std::ceil(result)) : static_cast<DimSize_t>(std::floor(result));
+    };
 
-    // output H size
-    const std::size_t oxSize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(dims[2] - kernelDims[0] + strideDims[0]) /
-                                static_cast<float>(strideDims[0])));
-    // output W size
-    const std::size_t oySize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(dims[3] - kernelDims[1] + strideDims[1]) /
-                                static_cast<float>(strideDims[1])));
+    const std::size_t oxSize = compute_output_size(dims[2], kernelDims[0], strideDims[0], dilations[0]);
+    const std::size_t oySize = compute_output_size(dims[3], kernelDims[1], strideDims[1], dilations[1]);
 
-    // TODO: kernel computation
-    // output (batch, outCh, Xout, Yout)
-    // input  (batch, ch, Xin, Yin)
-    // weight (outCh, ch, kernelX, kernelY)
-    // does not take Dilation attribute into account
     using signedsize = std::make_signed<std::size_t>::type;
+
     for (std::size_t batch = 0; batch < dims[0]; ++batch) {
         for (std::size_t ch = 0; ch < dims[1]; ++ch) {
-            const std::size_t oIndex = (ch + batch*dims[1]) * oxSize * oySize;
-            const std::size_t iIndex = (ch + batch*dims[1]) * dims[2] * dims[3];
-            std::fill(output + oIndex, output+(oIndex+oxSize*oySize), 0);
+            const std::size_t oIndex = (ch + batch * dims[1]) * oxSize * oySize;
+            const std::size_t iIndex = (ch + batch * dims[1]) * dims[2] * dims[3];
+            std::fill(output + oIndex, output + (oIndex + oxSize * oySize), 0);
+
             for (std::size_t ox = 0; ox < oxSize; ++ox) {
-                const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]);
-                const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
-                const std::size_t sxMax = (static_cast<signedsize>(dims[2]) + difx) < 0 ? 0 : ((dims[2] + difx) > kernelDims[0] ? kernelDims[0] : dims[2] + difx);
+                const signedsize startx = static_cast<signedsize>(ox * strideDims[0]) - (dilations[0] - 1);
+                const std::size_t sxMin = static_cast<std::size_t>(std::max(startx, signedsize(0)));
+                const std::size_t sxMax = std::min(dims[2], static_cast<std::size_t>(startx + kernelDims[0] * dilations[0]));
+
                 for (std::size_t oy = 0; oy < oySize; ++oy) {
-                    const signedsize dify = static_cast<signedsize>(- oy * strideDims[1]);
-                    const std::size_t syMin = static_cast<std::size_t>(std::max(dify, signedsize(0)));
-                    const std::size_t syMax = (static_cast<signedsize>(dims[3]) + dify) < 0 ? 0 : ((dims[3] + dify) > kernelDims[1] ? kernelDims[1] : dims[3] + dify);
-                    const std::size_t oIndexFull = oIndex + ox*oySize + oy;
-                    const std::size_t ix = ox * strideDims[0];
-                    const std::size_t iy = oy * strideDims[1];
+                    const signedsize starty = static_cast<signedsize>(oy * strideDims[1]) - (dilations[1] - 1);
+                    const std::size_t syMin = static_cast<std::size_t>(std::max(starty, signedsize(0)));
+                    const std::size_t syMax = std::min(dims[3], static_cast<std::size_t>(starty + kernelDims[1] * dilations[1]));
 
-                    if (sxMin == 0 && syMin == 0 && sxMax == 3 && syMax == 3) {
-                        output[oIndexFull] += static_cast<O>(
-                                               input[iIndex + (ix+0)*dims[3] + (iy+0)] +
-                                               input[iIndex + (ix+0)*dims[3] + (iy+1)] +
-                                               input[iIndex + (ix+0)*dims[3] + (iy+2)] +
-                                               input[iIndex + (ix+1)*dims[3] + (iy+0)] +
-                                               input[iIndex + (ix+1)*dims[3] + (iy+1)] +
-                                               input[iIndex + (ix+1)*dims[3] + (iy+2)] +
-                                               input[iIndex + (ix+2)*dims[3] + (iy+0)] +
-                                               input[iIndex + (ix+2)*dims[3] + (iy+1)] +
-                                               input[iIndex + (ix+2)*dims[3] + (iy+2)]) / O(9);
-                    } else {
-                        for (std::size_t sx = sxMin; sx < sxMax; ++sx) {
-                            for (std::size_t sy = syMin; sy < syMax; ++sy) {
-                                output[oIndexFull] += input[iIndex + (ix+sx)*dims[3] + (iy+sy)];
-                            }
+                    const std::size_t oIndexFull = oIndex + ox * oySize + oy;
+                    O sum = static_cast<O>(0);
+                    std::size_t count = 0;
+
+                    for (std::size_t sx = sxMin; sx < sxMax; sx += dilations[0]) {
+                        for (std::size_t sy = syMin; sy < syMax; sy += dilations[1]) {
+                            sum += static_cast<O>(input[iIndex + sx * dims[3] + sy]);
+                            ++count;
                         }
-                        // padding not used
-                        output[oIndexFull] /= (sxMax - sxMin) * (syMax - syMin);
                     }
+
+                    output[oIndexFull] = sum / static_cast<O>(count);
                 }
             }
         }
diff --git a/src/operator/AvgPoolingImpl.cpp b/src/operator/AvgPoolingImpl.cpp
index 01a5e8cf..eb5ef87b 100644
--- a/src/operator/AvgPoolingImpl.cpp
+++ b/src/operator/AvgPoolingImpl.cpp
@@ -32,7 +32,9 @@ void Aidge::AvgPoolingImpl2D_cpu::forward() {
     // Call kernel
     impl.forward(op_.strideDims(),
                op_.kernelDims(),
+               op_.dilations(),
                op_.getInput(0)->template dims<4>(),
+               op_.ceilMode(),
                getCPUPtr(op_.getInput(0)),
                getCPUPtr(op_.getOutput(0)));
 }
diff --git a/unit_tests/operator/Test_AndImpl.cpp b/unit_tests/operator/Test_AndImpl.cpp
index 978a89e5..148298d5 100644
--- a/unit_tests/operator/Test_AndImpl.cpp
+++ b/unit_tests/operator/Test_AndImpl.cpp
@@ -160,28 +160,6 @@ TEST_CASE("[cpu/operator] And(forward)", "[And][CPU]") {
     }
 
     SECTION("Broadcasting") {
-<<<<<<< HEAD
-        std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<int,1,3,3,2> {
-        {                                       //
-            {                                   //
-                {{10, 20},{22, 23},{20, 20}},   //
-                {{10, 15},{10, 29},{20, 20}},   //
-                {{26, 25},{33, 20},{10, 20}}    //
-            }                                   //
-        }                                       //
-        });                                     //
-
-        std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<int,2> {{10, 20}});
-        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,1,3,3,2> {
-            {                                   //
-                {                               //
-                    {{ 1, 1},{ 0, 0},{ 0, 1}},  //
-                    {{ 1, 0},{ 1, 0},{ 0, 1}},  //
-                    {{ 0, 0},{ 0, 1},{ 1, 1}}   //
-                }                               //
-            }                                   //
-        });                                     //
-=======
         std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<float, 1, 2, 2, 2>{
             {
                 {{{1, 0}, {1, 0}},
@@ -193,7 +171,6 @@ TEST_CASE("[cpu/operator] And(forward)", "[And][CPU]") {
                 {{{1, 0}, {1, 0}},
                 {{1, 0}, {0, 0}}}}
             });
->>>>>>> fix and kernel and unit tests
 
         std::shared_ptr<Node> myAnd = And();
         auto op = std::static_pointer_cast<OperatorTensor>(myAnd->getOperator());
-- 
GitLab
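
For a concrete feel for the ceil_mode branch added here: with input 5, kernel 2, stride 2, dilation 1, the shared expression gives (5 - 1 - 1 + 2) / 2 = 2.5, so floor mode yields 2 and ceil mode 3. A minimal sketch of the selection (illustrative name, same arithmetic as the compute_output_size lambda above):

    #include <cmath>
    #include <cstddef>

    // Pooled output size with optional ceil rounding.
    std::size_t pooledSize(std::size_t in, std::size_t k, std::size_t s,
                           std::size_t d, bool ceilMode) {
        const float raw = static_cast<float>(in - (k - 1) * d - 1 + s) /
                          static_cast<float>(s);
        return static_cast<std::size_t>(ceilMode ? std::ceil(raw)
                                                 : std::floor(raw));
    }

    // pooledSize(5, 2, 2, 1, false) == 2, pooledSize(5, 2, 2, 1, true) == 3,
    // which is why the ceil-mode pooling tests added later expect 3x3 outputs
    // where floor mode produces 2x2.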


From c12d2826171a88b4c87ce5b222363256a222b262 Mon Sep 17 00:00:00 2001
From: hrouis <houssemeddine.rouis92@gmail.com>
Date: Mon, 3 Feb 2025 10:11:02 +0100
Subject: [PATCH 008/108] handle ceil_mode in pooling kernels

---
 .../cpu/operator/AvgPoolingImpl_kernels.hpp   | 56 ++++++++++++-------
 .../cpu/operator/MaxPoolingImpl_kernels.hpp   | 20 ++++---
 unit_tests/operator/Test_AvgPoolingImpl.cpp   | 35 +++++++++++-
 3 files changed, 82 insertions(+), 29 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
index 68dbfbe7..78f8446a 100644
--- a/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
@@ -43,15 +43,20 @@ void AvgPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideD
     const I *input = static_cast<const I *>(input_);
     O *output = static_cast<O *>(output_);
 
-    // Calculate output dimensions based on ceilMode and dilations
-    auto compute_output_size = [&](DimSize_t inputDim, DimSize_t kernelDim, DimSize_t stride, DimSize_t dilation) {
-        DimSize_t effectiveKernelDim = (kernelDim - 1) * dilation + 1;
-        float result = static_cast<float>(inputDim - effectiveKernelDim + stride) / static_cast<float>(stride);
-        return ceilMode ? static_cast<DimSize_t>(std::ceil(result)) : static_cast<DimSize_t>(std::floor(result));
-    };
-
-    const std::size_t oxSize = compute_output_size(dims[2], kernelDims[0], strideDims[0], dilations[0]);
-    const std::size_t oySize = compute_output_size(dims[3], kernelDims[1], strideDims[1], dilations[1]);
+    // output H size
+    const std::size_t oxSize = 
+        ceilMode 
+        ? static_cast<std::size_t>(std::ceil(static_cast<float>(dims[2] - (kernelDims[0] - 1) * dilations[0] - 1 + strideDims[0]) /
+                                            static_cast<float>(strideDims[0])))
+        : static_cast<std::size_t>(std::floor(static_cast<float>(dims[2] - (kernelDims[0] - 1) * dilations[0] - 1 + strideDims[0]) /
+                                            static_cast<float>(strideDims[0])));
+    // output W size
+    const std::size_t oySize = 
+        ceilMode 
+        ? static_cast<std::size_t>(std::ceil(static_cast<float>(dims[3] - (kernelDims[1] - 1) * dilations[1] - 1 + strideDims[1]) /
+                                            static_cast<float>(strideDims[1])))
+        : static_cast<std::size_t>(std::floor(static_cast<float>(dims[3] - (kernelDims[1] - 1) * dilations[1] - 1 + strideDims[1]) /
+                                            static_cast<float>(strideDims[1])));
 
     using signedsize = std::make_signed<std::size_t>::type;
 
@@ -59,30 +64,39 @@ void AvgPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideD
         for (std::size_t ch = 0; ch < dims[1]; ++ch) {
             const std::size_t oIndex = (ch + batch * dims[1]) * oxSize * oySize;
             const std::size_t iIndex = (ch + batch * dims[1]) * dims[2] * dims[3];
-            std::fill(output + oIndex, output + (oIndex + oxSize * oySize), 0);
 
             for (std::size_t ox = 0; ox < oxSize; ++ox) {
-                const signedsize startx = static_cast<signedsize>(ox * strideDims[0]) - (dilations[0] - 1);
-                const std::size_t sxMin = static_cast<std::size_t>(std::max(startx, signedsize(0)));
-                const std::size_t sxMax = std::min(dims[2], static_cast<std::size_t>(startx + kernelDims[0] * dilations[0]));
+                const signedsize difx = static_cast<signedsize>(-ox * strideDims[0]);
+                const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
+                const std::size_t sxMax = (static_cast<signedsize>(dims[2]) + difx) < 0 ? 0 : ((dims[2] + difx) > kernelDims[0] ? kernelDims[0] : dims[2] + difx);
 
                 for (std::size_t oy = 0; oy < oySize; ++oy) {
-                    const signedsize starty = static_cast<signedsize>(oy * strideDims[1]) - (dilations[1] - 1);
-                    const std::size_t syMin = static_cast<std::size_t>(std::max(starty, signedsize(0)));
-                    const std::size_t syMax = std::min(dims[3], static_cast<std::size_t>(starty + kernelDims[1] * dilations[1]));
+                    const signedsize dify = static_cast<signedsize>(-oy * strideDims[1]);
+                    const std::size_t syMin = static_cast<std::size_t>(std::max(dify, signedsize(0)));
+                    const std::size_t syMax = (static_cast<signedsize>(dims[3]) + dify) < 0 ? 0 : ((dims[3] + dify) > kernelDims[1] ? kernelDims[1] : dims[3] + dify);
 
                     const std::size_t oIndexFull = oIndex + ox * oySize + oy;
+                    const std::size_t ix = ox * strideDims[0];
+                    const std::size_t iy = oy * strideDims[1];
+
                     O sum = static_cast<O>(0);
                     std::size_t count = 0;
 
-                    for (std::size_t sx = sxMin; sx < sxMax; sx += dilations[0]) {
-                        for (std::size_t sy = syMin; sy < syMax; sy += dilations[1]) {
-                            sum += static_cast<O>(input[iIndex + sx * dims[3] + sy]);
-                            ++count;
+                    for (unsigned int sy = syMin; sy < syMax; ++sy) {
+                        for (unsigned int sx = sxMin; sx < sxMax; ++sx) {
+                            // Apply dilation factor
+                            const std::size_t dilated_sx = sx * dilations[0];
+                            const std::size_t dilated_sy = sy * dilations[1];
+
+                            // Ensure within bounds
+                            if ((ix + dilated_sx) < dims[2] && (iy + dilated_sy) < dims[3]) {
+                                sum += static_cast<O>(input[iIndex + (ix + dilated_sx) * dims[3] + (iy + dilated_sy)]);
+                                ++count;
+                            }
                         }
                     }
 
-                    output[oIndexFull] = sum / static_cast<O>(count);
+                    output[oIndexFull] = count > 0 ? sum / static_cast<O>(count) : 0;
                 }
             }
         }
diff --git a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
index 250b11b0..d5ac02fe 100644
--- a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
@@ -36,7 +36,7 @@ template <class I, class O>
 void MaxPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
                                         const std::array<DimSize_t, 2>& kernelDims,
                                         const std::array<DimSize_t, 2>& dilations,
-                                        const bool /*ceilMode*/,
+                                        const bool ceilMode,
                                         const std::array<DimSize_t, 4> &dims,
                                         const void *input_,
                                         void *output_) {
@@ -44,13 +44,19 @@ void MaxPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideD
     O *output = static_cast<O *>(output_);
 
     // output H size
-    const std::size_t oxSize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(dims[2] - (kernelDims[0] - 1) * dilations[0] - 1 + strideDims[0]) /
-                                static_cast<float>(strideDims[0])));
+    const std::size_t oxSize = 
+        ceilMode 
+        ? static_cast<std::size_t>(std::ceil(static_cast<float>(dims[2] - (kernelDims[0] - 1) * dilations[0] - 1 + strideDims[0]) /
+                                            static_cast<float>(strideDims[0])))
+        : static_cast<std::size_t>(std::floor(static_cast<float>(dims[2] - (kernelDims[0] - 1) * dilations[0] - 1 + strideDims[0]) /
+                                            static_cast<float>(strideDims[0])));
     // output W size
-    const std::size_t oySize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(dims[3] - (kernelDims[1] - 1) * dilations[1] - 1 + strideDims[1]) /
-                                static_cast<float>(strideDims[1])));
+    const std::size_t oySize = 
+        ceilMode 
+        ? static_cast<std::size_t>(std::ceil(static_cast<float>(dims[3] - (kernelDims[1] - 1) * dilations[1] - 1 + strideDims[1]) /
+                                            static_cast<float>(strideDims[1])))
+        : static_cast<std::size_t>(std::floor(static_cast<float>(dims[3] - (kernelDims[1] - 1) * dilations[1] - 1 + strideDims[1]) /
+                                            static_cast<float>(strideDims[1])));
 
     using signedsize = std::make_signed<std::size_t>::type;
     for (std::size_t batch = 0; batch < dims[0]; ++batch) {
diff --git a/unit_tests/operator/Test_AvgPoolingImpl.cpp b/unit_tests/operator/Test_AvgPoolingImpl.cpp
index 372febc6..21a7a680 100644
--- a/unit_tests/operator/Test_AvgPoolingImpl.cpp
+++ b/unit_tests/operator/Test_AvgPoolingImpl.cpp
@@ -110,5 +110,38 @@ TEST_CASE("[cpu/operator] AvgPooling(forward)", "[AvgPooling][CPU]") {
             REQUIRE(std::abs(outPtr[i] - expectedOutPtr[i]) < 0.00001);
         }
     }
-    // std::cout << static_cast<Tensor>((*op)["weight"])[0][0][0][0] << std::endl;
+    SECTION("Dilations") {
+        std::shared_ptr<Tensor> myInput3 = std::make_shared<Tensor>(Array4D<float,1,1,5,5> { // NCHW
+        {
+            {
+                {{ 1,  2,  3,  4,  5},
+                { 6,  7,  8,  9, 10},
+                {11, 12, 13, 14, 15},
+                {16, 17, 18, 19, 20},
+                {21, 22, 23, 24, 25}}
+            }
+        }
+        });
+
+        // Dilation of 2 means we take every second element in the window
+        std::shared_ptr<Node> myAvgPool = AvgPooling({2,2}, "mycdw", {1,1}, {2,2}); 
+        auto op = std::static_pointer_cast<AvgPooling_Op<2>>(myAvgPool -> getOperator());
+
+        std::shared_ptr<Tensor> myOutput3 = std::make_shared<Tensor>(Array4D<float,1,1,3,3> {
+            {
+                {
+                    {{  7,  8,  9},
+                    { 12, 13, 14},
+                    { 17, 18, 19}}
+                }
+            }
+        });
+
+        op->associateInput(0, myInput3);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cpu");
+        myAvgPool->forward();
+        op->getOutput(0)->print();
+        REQUIRE(*(op->getOutput(0)) == *myOutput3);
+    }
 }
\ No newline at end of file
-- 
GitLab
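
The reworked average kernel divides by the number of taps that actually land inside the input rather than by the full kernel area. A hand-checkable sketch of that sum/count logic on a single row-major H x W plane (all names illustrative):

    #include <cstddef>

    // Average of a dilated kH x kW window anchored at (ix, iy); taps dilated
    // past the border are skipped and excluded from the divisor, mirroring
    // the bounds check in the kernel above.
    float dilatedAvg(const float* in, std::size_t H, std::size_t W,
                     std::size_t ix, std::size_t iy,
                     std::size_t kH, std::size_t kW,
                     std::size_t dH, std::size_t dW) {
        float sum = 0.0f;
        std::size_t count = 0;
        for (std::size_t sx = 0; sx < kH; ++sx) {
            for (std::size_t sy = 0; sy < kW; ++sy) {
                const std::size_t x = ix + sx * dH;
                const std::size_t y = iy + sy * dW;
                if (x < H && y < W) {
                    sum += in[x * W + y];
                    ++count;
                }
            }
        }
        return count > 0 ? sum / static_cast<float>(count) : 0.0f;
    }

On the 5x5 ramp 1..25 of the Dilations test above, dilatedAvg(in, 5, 5, 0, 0, 2, 2, 2, 2) averages {1, 3, 11, 13} and returns 7, the first value of the expected 3x3 output.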


From b3ae66f75c1dfbc3b4ae3e08f198889cbf837937 Mon Sep 17 00:00:00 2001
From: hrouis <houssemeddine.rouis92@gmail.com>
Date: Mon, 3 Feb 2025 15:09:26 +0100
Subject: [PATCH 009/108] add ceil_mode tests for Avg and Max Pooling

---
 .../cpu/operator/MaxPoolingImpl_kernels.hpp   |  1 +
 unit_tests/operator/Test_AvgPoolingImpl.cpp   | 57 +++++++++++++++++++
 unit_tests/operator/Test_MaxPoolingImpl.cpp   | 57 +++++++++++++++++++
 3 files changed, 115 insertions(+)

diff --git a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
index d5ac02fe..027fc02a 100644
--- a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
@@ -16,6 +16,7 @@
 #include <cmath>
 #include <tuple>
 
+
 #include "aidge/backend/cpu/operator/MaxPoolingImpl.hpp"
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
 #include "aidge/data/Data.hpp"
diff --git a/unit_tests/operator/Test_AvgPoolingImpl.cpp b/unit_tests/operator/Test_AvgPoolingImpl.cpp
index 21a7a680..f116934c 100644
--- a/unit_tests/operator/Test_AvgPoolingImpl.cpp
+++ b/unit_tests/operator/Test_AvgPoolingImpl.cpp
@@ -144,4 +144,61 @@ TEST_CASE("[cpu/operator] AvgPooling(forward)", "[AvgPooling][CPU]") {
         op->getOutput(0)->print();
         REQUIRE(*(op->getOutput(0)) == *myOutput3);
     }
+    SECTION("Ceil Mode") {
+        std::shared_ptr<Tensor> myInput4 = std::make_shared<Tensor>(Array4D<float,1,1,5,5> { // NCHW
+        {
+            {
+                {
+                    { 1,  2,  3,  4,  5},
+                    { 6,  7,  8,  9, 10},
+                    {11, 12, 13, 14, 15},
+                    {16, 17, 18, 19, 20},
+                    {21, 22, 23, 24, 25}
+                }
+            }
+        }
+        });
+
+        // AvgPool with ceil_mode = true
+        std::shared_ptr<Node> myAvgPool1 = AvgPooling({2,2}, "mycdw", {2,2}, {1,1}, true);
+        auto op1 = std::static_pointer_cast<AvgPooling_Op<2>>(myAvgPool1 -> getOperator());
+
+        std::shared_ptr<Tensor> myOutput4 = std::make_shared<Tensor>(Array4D<float,1,1,3,3> {
+            {
+                {
+                    {
+                        {  4.0,  6.0,  7.5 },
+                        { 14.0, 16.0, 17.5 },
+                        { 21.5, 23.5, 25.0 }
+                    }
+                }
+            }
+        });
+        op1->associateInput(0, myInput4);
+        op1->setDataType(DataType::Float32);
+        op1->setBackend("cpu");
+        myAvgPool1->forward();
+        op1->getOutput(0)->print();
+        REQUIRE(*(op1->getOutput(0)) == *myOutput4);
+
+        // AvgPool with ceil_mode = false
+        std::shared_ptr<Node> myAvgPool2 = AvgPooling({2,2}, "mycdw", {2,2}, {1,1}, false);
+        auto op2 = std::static_pointer_cast<AvgPooling_Op<2>>(myAvgPool2 -> getOperator());
+        std::shared_ptr<Tensor> myOutput5 = std::make_shared<Tensor>(Array4D<float,1,1,2,2> {
+            {
+                {
+                    {
+                        {  4.0,  6.0 },
+                        { 14.0, 16.0 }
+                    }
+                }
+            }
+        });
+        op2->associateInput(0, myInput4);
+        op2->setDataType(DataType::Float32);
+        op2->setBackend("cpu");
+        myAvgPool2->forward();
+        op2->getOutput(0)->print();
+        REQUIRE(*(op2->getOutput(0)) == *myOutput5);
+    }
 }
\ No newline at end of file
diff --git a/unit_tests/operator/Test_MaxPoolingImpl.cpp b/unit_tests/operator/Test_MaxPoolingImpl.cpp
index 6b7e6d2f..d480fc30 100644
--- a/unit_tests/operator/Test_MaxPoolingImpl.cpp
+++ b/unit_tests/operator/Test_MaxPoolingImpl.cpp
@@ -115,4 +115,61 @@ TEST_CASE("[cpu/operator] MaxPooling(forward)", "[MaxPooling][CPU]") {
         op->getOutput(0)->print();
         REQUIRE(*(op->getOutput(0)) == *myOutput);
     }
+    SECTION("Ceil Mode") {
+        std::shared_ptr<Tensor> myInput4 = std::make_shared<Tensor>(Array4D<float,1,1,5,5> { // NCHW
+        {
+            {
+                {
+                    { 1,  2,  3,  4,  5},
+                    { 6,  7,  8,  9, 10},
+                    {11, 12, 13, 14, 15},
+                    {16, 17, 18, 19, 20},
+                    {21, 22, 23, 24, 25}
+                }
+            }
+        }
+        });
+
+        // MaxPool with ceil_mode = true
+        std::shared_ptr<Node> myMaxPool1 = MaxPooling({2,2}, "mycdw", {2,2}, {1,1}, true);
+        auto op1 = std::static_pointer_cast<OperatorTensor>(myMaxPool1 -> getOperator());
+
+        std::shared_ptr<Tensor> myOutput4 = std::make_shared<Tensor>(Array4D<float,1,1,3,3> {
+            {
+                {
+                    {
+                        {  7.0,  9.0, 10.0 },
+                        { 17.0, 19.0, 20.0 },
+                        { 22.0, 24.0, 25.0 }
+                    }
+                }
+            }
+        });
+        op1->associateInput(0, myInput4);
+        op1->setDataType(DataType::Float32);
+        op1->setBackend("cpu");
+        myMaxPool1->forward();
+        op1->getOutput(0)->print();
+        REQUIRE(*(op1->getOutput(0)) == *myOutput4);
+
+        // MaxPool with ceil_mode = false
+        std::shared_ptr<Node> myMaxPool2 = MaxPooling({2,2}, "mycdw", {2,2}, {1,1}, false);
+        auto op2 = std::static_pointer_cast<OperatorTensor>(myMaxPool2 -> getOperator());
+        std::shared_ptr<Tensor> myOutput5 = std::make_shared<Tensor>(Array4D<float,1,1,2,2> {
+            {
+                {
+                    {
+                        {  7.0,  9.0 },
+                        { 17.0, 19.0 }
+                    }
+                }
+            }
+        });
+        op2->associateInput(0, myInput4);
+        op2->setDataType(DataType::Float32);
+        op2->setBackend("cpu");
+        myMaxPool2->forward();
+        op2->getOutput(0)->print();
+        REQUIRE(*(op2->getOutput(0)) == *myOutput5);
+    }
 }
\ No newline at end of file
-- 
GitLab
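
A quick hand check of the ceil-mode expectations above: on the 5x5 ramp with a 2x2 kernel and stride 2, the third window along each axis starts at index 4 and is clipped to a single row or column. The max-pool top row {7, 9, 10} is max{1, 2, 6, 7}, max{3, 4, 8, 9} and max{5, 10}, and the avg-pool value 7.5 is (5 + 10) / 2, averaged over the two in-bounds taps only.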


From 40a34dc2c1910bba8e983adc28929204a4e62f45 Mon Sep 17 00:00:00 2001
From: hrouis <houssemeddine.rouis92@gmail.com>
Date: Wed, 19 Feb 2025 10:32:39 +0100
Subject: [PATCH 010/108] separate fwdDims test section from fwd section

---
 unit_tests/operator/Test_EqualImpl.cpp | 145 ++++++++++++-------------
 1 file changed, 72 insertions(+), 73 deletions(-)

diff --git a/unit_tests/operator/Test_EqualImpl.cpp b/unit_tests/operator/Test_EqualImpl.cpp
index a229b8ce..013e16eb 100644
--- a/unit_tests/operator/Test_EqualImpl.cpp
+++ b/unit_tests/operator/Test_EqualImpl.cpp
@@ -19,86 +19,85 @@
 
 using namespace Aidge;
 
-TEST_CASE("[cpu/operator] Equal(forward)", "[Equal][CPU]") {
-        SECTION("ForwardDims")
-    {
-        constexpr std::uint16_t NBTRIALS = 10;
-        // Create a random number generator
-        std::random_device rd;
-        std::mt19937 gen(rd());
-        std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0 and 1
-        std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
-        std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
-        std::uniform_int_distribution<int> boolDist(0,1);
-
-        SECTION("Same dimensions") {
-            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
-                DimSize_t nbDims = nbDimsDist(gen);
-                std::vector<DimSize_t> dims(nbDims);
-                for (std::size_t i = 0; i < nbDims; i++) {
-                    dims[i] = dimSizeDist(gen);
-                }
-
-                std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims);
-                myInput1->setBackend("cpu");
-                myInput1->setDataType(DataType::Float32);
-                myInput1->zeros();
-                std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims);
-                myInput2->setBackend("cpu");
-                myInput2->setDataType(DataType::Float32);
-                myInput2->zeros();
-                std::shared_ptr<Node> myEqual = Equal();
-                auto op = std::static_pointer_cast<OperatorTensor>(myEqual -> getOperator());
-                op->associateInput(0,myInput1);
-                op->associateInput(1,myInput2);
-                op->setDataType(DataType::Float32);
-                op->setBackend("cpu");
-                op->forwardDims();
-
-                const auto outputDims = op->getOutput(0)->dims();
-                REQUIRE(outputDims == dims);
+TEST_CASE("[cpu/operator] Equal(forwardDims)", "[Equal][CPU]") {
+    constexpr std::uint16_t NBTRIALS = 10;
+    // Create a random number generator
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0.1 and 1.1
+    std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
+    std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
+    std::uniform_int_distribution<int> boolDist(0,1);
+
+    SECTION("Same dimensions") {
+        for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+            DimSize_t nbDims = nbDimsDist(gen);
+            std::vector<DimSize_t> dims(nbDims);
+            for (std::size_t i = 0; i < nbDims; i++) {
+                dims[i] = dimSizeDist(gen);
             }
+
+            std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims);
+            myInput1->setBackend("cpu");
+            myInput1->setDataType(DataType::Float32);
+            myInput1->zeros();
+            std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims);
+            myInput2->setBackend("cpu");
+            myInput2->setDataType(DataType::Float32);
+            myInput2->zeros();
+            std::shared_ptr<Node> myEqual = Equal();
+            auto op = std::static_pointer_cast<OperatorTensor>(myEqual -> getOperator());
+            op->associateInput(0,myInput1);
+            op->associateInput(1,myInput2);
+            op->setDataType(DataType::Float32);
+            op->setBackend("cpu");
+            op->forwardDims();
+
+            const auto outputDims = op->getOutput(0)->dims();
+            REQUIRE(outputDims == dims);
         }
-        SECTION("Broadcasting") {
-            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
-                DimSize_t nbDims = nbDimsDist(gen);
-                std::vector<DimSize_t> dims1(nbDims, 1);
-                std::vector<DimSize_t> dims2(nbDims, 1);
-                std::vector<DimSize_t> expectedOutDims;
-                for (std::size_t i = 0; i < nbDims; i++) {
-                    DimSize_t dim = dimSizeDist(gen);
-                    if (boolDist(gen)) {
-                        dims1[i] = dim;
-                    }
-                    if (boolDist(gen)) {
-                        dims2[i] = dim;
-                    }
-                    expectedOutDims.push_back(std::max(dims1[i],dims2[i]));
+    }
+    SECTION("Broadcasting") {
+        for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+            DimSize_t nbDims = nbDimsDist(gen);
+            std::vector<DimSize_t> dims1(nbDims, 1);
+            std::vector<DimSize_t> dims2(nbDims, 1);
+            std::vector<DimSize_t> expectedOutDims;
+            for (std::size_t i = 0; i < nbDims; i++) {
+                DimSize_t dim = dimSizeDist(gen);
+                if (boolDist(gen)) {
+                    dims1[i] = dim;
+                }
+                if (boolDist(gen)) {
+                    dims2[i] = dim;
                 }
+                expectedOutDims.push_back(std::max(dims1[i],dims2[i]));
+            }
 
 
-                std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims1);
-                myInput1->setBackend("cpu");
-                myInput1->setDataType(DataType::Float32);
-                myInput1->zeros();
-                std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims2);
-                myInput2->setBackend("cpu");
-                myInput2->setDataType(DataType::Float32);
-                myInput2->zeros();
-                std::shared_ptr<Node> myEqual = Equal();
-                auto op = std::static_pointer_cast<OperatorTensor>(myEqual -> getOperator());
-                op->associateInput(0,myInput1);
-                op->associateInput(1,myInput2);
-                op->setDataType(DataType::Float32);
-                op->setBackend("cpu");
-
-                op->forwardDims();
-
-                const auto outputDims = op->getOutput(0)->dims();
-                REQUIRE(outputDims == expectedOutDims);
-            }
+            std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims1);
+            myInput1->setBackend("cpu");
+            myInput1->setDataType(DataType::Float32);
+            myInput1->zeros();
+            std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims2);
+            myInput2->setBackend("cpu");
+            myInput2->setDataType(DataType::Float32);
+            myInput2->zeros();
+            std::shared_ptr<Node> myEqual = Equal();
+            auto op = std::static_pointer_cast<OperatorTensor>(myEqual -> getOperator());
+            op->associateInput(0,myInput1);
+            op->associateInput(1,myInput2);
+            op->setDataType(DataType::Float32);
+            op->setBackend("cpu");
+
+            op->forwardDims();
+
+            const auto outputDims = op->getOutput(0)->dims();
+            REQUIRE(outputDims == expectedOutDims);
         }
     }
+}
+TEST_CASE("[cpu/operator] Equal(forward)", "[Equal][CPU]") {
     SECTION("Same size inputs") {
         std::shared_ptr<Tensor> input1 = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
         {                                       //
-- 
GitLab


From f3de3e10f342b3dd573609d8f835ed822950e5d7 Mon Sep 17 00:00:00 2001
From: hrouis <houssemeddine.rouis92@gmail.com>
Date: Thu, 20 Feb 2025 11:35:10 +0100
Subject: [PATCH 011/108] remove unnecessary header in Equal tests

---
 unit_tests/operator/Test_EqualImpl.cpp | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/unit_tests/operator/Test_EqualImpl.cpp b/unit_tests/operator/Test_EqualImpl.cpp
index 013e16eb..bd9fa94f 100644
--- a/unit_tests/operator/Test_EqualImpl.cpp
+++ b/unit_tests/operator/Test_EqualImpl.cpp
@@ -15,8 +15,6 @@
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/Equal.hpp"
 
-#include "aidge/backend/cpu.hpp"
-
 using namespace Aidge;
 
 TEST_CASE("[cpu/operator] Equal(forwardDims)", "[Equal][CPU]") {
@@ -137,7 +135,7 @@ TEST_CASE("[cpu/operator] Equal(forward)", "[Equal][CPU]") {
                 }                                   //
             }                                       //
         });                                         //
-        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
+        Tensor expectedOutput = Tensor(Array4D<int,3,3,3,2> {
             {
                 {
                     {{1, 0},{0, 0},{1, 1}},
@@ -165,7 +163,7 @@ TEST_CASE("[cpu/operator] Equal(forward)", "[Equal][CPU]") {
         op->setDataType(DataType::Int32);
         myEqual->forward();
 
-        REQUIRE(*(op->getOutput(0)) == *expectedOutput);
+        REQUIRE(*(op->getOutput(0)) == expectedOutput);
     }
 
     SECTION("Broadcasting") {
@@ -180,7 +178,7 @@ TEST_CASE("[cpu/operator] Equal(forward)", "[Equal][CPU]") {
         });                                     //
 
         std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<int,2> {{10, 20}});  
-        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,1,3,3,2> {
+        Tensor expectedOutput = Tensor(Array4D<int,1,3,3,2> {
             {                                   //
                 {                               //
                     {{ 1, 1},{ 0, 0},{ 0, 1}},  //
@@ -198,7 +196,7 @@ TEST_CASE("[cpu/operator] Equal(forward)", "[Equal][CPU]") {
         op->setBackend("cpu");
         myEqual->forward();
         op->getOutput(0)->print();
-        expectedOutput->print();
-        REQUIRE(*op->getOutput(0) == *expectedOutput);
+
+        REQUIRE(*op->getOutput(0) == expectedOutput);
     }
 }
\ No newline at end of file
-- 
GitLab


From dcbd4ebd1fe6d4eb065496f8ab0b62b771b42589 Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Fri, 14 Feb 2025 17:57:38 +0100
Subject: [PATCH 012/108] Add Mod

---
 .../aidge/backend/cpu/operator/ModImpl.hpp    |  33 +++++
 .../backend/cpu/operator/ModImpl_kernels.hpp  |  77 ++++++++++
 src/operator/ModImpl.cpp                      | 131 ++++++++++++++++++
 3 files changed, 241 insertions(+)
 create mode 100644 include/aidge/backend/cpu/operator/ModImpl.hpp
 create mode 100644 include/aidge/backend/cpu/operator/ModImpl_kernels.hpp
 create mode 100644 src/operator/ModImpl.cpp

diff --git a/include/aidge/backend/cpu/operator/ModImpl.hpp b/include/aidge/backend/cpu/operator/ModImpl.hpp
new file mode 100644
index 00000000..96ff599b
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/ModImpl.hpp
@@ -0,0 +1,33 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_MODIMPL_H_
+#define AIDGE_CPU_OPERATOR_MODIMPL_H_
+
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/Mod.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+using ModImpl_cpu = OperatorImpl_cpu<Mod_Op,
+    void(bool, const std::size_t, const std::size_t, const std::size_t, const void*, const void*,void*)>;
+
+// Implementation entry point registration to Operator
+REGISTRAR(Mod_Op, "cpu", Aidge::ModImpl_cpu::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_MODIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/ModImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ModImpl_kernels.hpp
new file mode 100644
index 00000000..940fa482
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/ModImpl_kernels.hpp
@@ -0,0 +1,77 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_MODIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_MODIMPL_KERNELS_H_
+
+#include <numeric>     // std::accumulate
+#include <cstddef>     // std::size_t
+#include <cstdint>     // std::int32_t, std::int64_t
+#include <functional>  // std::multiplies
+
+#include "aidge/utils/Registrar.hpp"
+
+#include "aidge/backend/cpu/data/Broadcasting.hpp"
+#include "aidge/backend/cpu/operator/ModImpl.hpp"
+
+namespace Aidge {
+
+template <typename T,  
+    typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
+static inline T modulus(T a, T b) {
+    return a % b;
+}
+
+template <typename T,  
+    typename std::enable_if<!std::is_integral<T>::value>::type* = nullptr>
+static inline T modulus(T /*a*/, T /*b*/) {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Mod Operator with fmod attribute set to false only supports integer types.");
+}
+
+template <class I1, class I2, class O>
+constexpr void ModImpl_cpu_forward_kernel(bool fmod,
+                                const std::size_t input1size_,
+                                const std::size_t input2size_,
+                                const std::size_t output1size_,
+                                const void* input1_,
+                                const void* input2_,
+                                void* output_) {
+
+    const I1* input_1 = static_cast<const I1*>(input1_);
+    const I2* input_2 = static_cast<const I2*>(input2_);
+    O* output = static_cast<O*>(output_);
+
+    // Suppose values are contiguous in memory
+    for (std::size_t i = 0; i < output1size_; ++i) {
+        const std::size_t in1_id = (input1size_ != 1) ? i : 0;
+        const std::size_t in2_id = (input2size_ != 1) ? i : 0;
+        if (fmod) {
+            output[i] = static_cast<O>(std::fmod(input_1[in1_id], input_2[in2_id]));
+        }
+        else {
+            output[i] = static_cast<O>(modulus(input_1[in1_id], input_2[in2_id]));
+        }
+    }
+}
+
+// Kernels registration to implementation entry point
+REGISTRAR(ModImpl_cpu,
+    {DataType::Float32},
+    {ProdConso::inPlaceModel, Aidge::ModImpl_cpu_forward_kernel<float, float, float>, nullptr});
+REGISTRAR(ModImpl_cpu,
+    {DataType::Float64},
+    {ProdConso::inPlaceModel, Aidge::ModImpl_cpu_forward_kernel<double, double, double>, nullptr});
+REGISTRAR(ModImpl_cpu,
+    {DataType::Int32},
+    {ProdConso::inPlaceModel, Aidge::ModImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>, nullptr});
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_MODIMPL_KERNELS_H_ */
diff --git a/src/operator/ModImpl.cpp b/src/operator/ModImpl.cpp
new file mode 100644
index 00000000..161f7bc1
--- /dev/null
+++ b/src/operator/ModImpl.cpp
@@ -0,0 +1,131 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <memory>
+#include <vector>
+
+#include "aidge/backend/cpu/data/Broadcasting.hpp"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+#include "aidge/backend/cpu/operator/ModImpl.hpp"
+#include "aidge/backend/cpu/operator/ModImpl_kernels.hpp"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/utils/Types.h"
+
+template <>
+void Aidge::ModImpl_cpu::forward() {
+    // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7]
+    // 2. Find the highest equal dimension -> 3
+    //    Exception: if the first diverging dimension is the last one, then -> 4 (dims.size())
+    // 3. Compute the highest number of contiguous data -> 7
+    // 4. Compute stride and offset step for the broadcast mechanism
+    // 5. Call a simple kernel
+    const auto& opTensor = static_cast<const Mod_Op&>(mOp);
+
+    // Find the correct kernel type
+    const auto impl = Registrar<ModImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Compute compatible input dimensions
+    std::vector<std::size_t>        dims0   = opTensor.getInput(0)->dims();
+    std::vector<std::size_t>        dims1   = opTensor.getInput(1)->dims();
+    const std::vector<std::size_t>& outDims = opTensor.getOutput(0)->dims();
+
+    // special case for equal dimensions, the kernel is called with the entire arrays at once
+    if (dims0 == dims1) {
+        const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+        impl.forward(opTensor.fmod(),
+                    input0_contiguous_size, input0_contiguous_size, input0_contiguous_size,
+                    getCPUPtr(mOp.getRawInput(0)),
+                    getCPUPtr(mOp.getRawInput(1)),
+                    getCPUPtr(mOp.getRawOutput(0)));
+        return;
+    }
+
+    // set dimensions to be of equal size by filling the smallest one with ones.
+    if (dims0.size() > dims1.size()) {
+        dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
+    }
+    else if (dims1.size() > dims0.size()) {
+        dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
+    }
+
+    const std::size_t nbDims = dims0.size();
+
+    // Find the highest equal dimension
+    // std::size_t contiguousIdx = nbDims - 1;
+    std::size_t contiguousIdx = nbDims;
+    while (contiguousIdx-- > 0) {
+    // for (; contiguousIdx+1 > 0; --contiguousIdx) {
+        if (dims0[contiguousIdx] != dims1[contiguousIdx]) {
+            if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1
+                const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1;
+                while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) {
+                    --contiguousIdx;
+                }
+            }
+            break;
+        }
+    }
+    ++contiguousIdx;
+
+    // Compute the highest number of contiguous data for each Tensor
+    const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    const std::size_t output_contiguous_size = std::accumulate(outDims.cbegin()+contiguousIdx, outDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+
+    // initialize strides to iterate through data because of broadcasting
+    std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    if (contiguousIdx > 0) {
+        stride_post0[contiguousIdx - 1] = 1;
+        stride_post1[contiguousIdx - 1] = 1;
+        for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) {
+            stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]);
+            stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]);
+        }
+        for (std::size_t i = 0; i != contiguousIdx; ++i) {
+            stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1;
+            stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1;
+        }
+    }
+
+    // variables for arrays offsets
+    std::size_t offsetIn0 = 0;
+    std::size_t offsetIn1 = 0;
+    std::size_t offsetOut = 0;
+
+
+    std::size_t dim = contiguousIdx - 1;
+    const std::size_t nbStacks = std::accumulate(outDims.cbegin(), outDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
+    for (std::size_t stack = 0; stack < nbStacks;) {
+        impl.forward(opTensor.fmod(), input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
+                    getCPUPtr(mOp.getRawInput(0), offsetIn0*input0_contiguous_size),
+                    getCPUPtr(mOp.getRawInput(1), offsetIn1*input1_contiguous_size),
+                    getCPUPtr(mOp.getRawOutput(0), offsetOut*output_contiguous_size));
+        if (++stack < nbStacks) {
+            std::size_t tmp_stack = stack;
+            while(tmp_stack % outDims[dim] == 0) {
+                tmp_stack /= outDims[dim];
+                dim--;
+            }
+            offsetIn0 += stride_step0[dim];
+            offsetIn1 += stride_step1[dim];
+            ++offsetOut;
+            dim = contiguousIdx - 1;
+        }
+    }
+}
+
+template <>
+void Aidge::ModImpl_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Mod_Op on backend cpu");
+}
-- 
GitLab
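
The broadcast walkthrough in ModImpl_cpu::forward() is easiest to follow on the dims from its own comment. A stripped-down sketch of steps 2 and 3 (it omits the special case where the first diverging dimension is the last one; everything here is illustrative):

    #include <cstddef>
    #include <cstdio>
    #include <functional>
    #include <numeric>
    #include <vector>

    int main() {
        // Same-rank inputs, as in the comment: [5,2,1,7] and [1,2,6,7].
        const std::vector<std::size_t> dims0{5, 2, 1, 7}, dims1{1, 2, 6, 7};

        // Step 2: scan from the back for the first diverging dimension.
        std::size_t contiguousIdx = dims0.size();
        while (contiguousIdx-- > 0) {
            if (dims0[contiguousIdx] != dims1[contiguousIdx]) break;
        }
        ++contiguousIdx; // first index of the matching tail -> 3

        // Step 3: the matching tail is contiguous in both inputs, so this
        // many values can go to the elementwise kernel per call -> 7.
        const std::size_t block = std::accumulate(
            dims0.cbegin() + contiguousIdx, dims0.cend(), std::size_t(1),
            std::multiplies<std::size_t>());

        std::printf("contiguousIdx=%zu block=%zu\n", contiguousIdx, block);
        return 0;
    }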


From 1c023cb2c416ad660acef30a6e913b8b40c1c8d5 Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Sun, 16 Feb 2025 16:39:05 +0100
Subject: [PATCH 013/108] Fixed typo

---
 include/aidge/backend/cpu/operator/AbsImpl_kernels.hpp |  4 ++--
 .../aidge/backend/cpu/operator/AtanImpl_kernels.hpp    |  8 ++++----
 include/aidge/backend/cpu/operator/ErfImpl_kernels.hpp |  4 ++--
 .../backend/cpu/operator/HeavisideImpl_kernels.hpp     |  4 ++--
 .../backend/cpu/operator/LeakyReLUImpl_kernels.hpp     |  8 ++++----
 include/aidge/backend/cpu/operator/LnImpl_kernels.hpp  | 10 +++++-----
 .../aidge/backend/cpu/operator/ReLUImpl_kernels.hpp    | 10 +++++-----
 .../aidge/backend/cpu/operator/RoundImpl_kernels.hpp   |  4 ++--
 .../aidge/backend/cpu/operator/ScalingImpl_kernels.hpp |  4 ++--
 .../aidge/backend/cpu/operator/SigmoidImpl_kernels.hpp | 10 +++++-----
 .../aidge/backend/cpu/operator/SqrtImpl_kernels.hpp    |  8 ++++----
 .../aidge/backend/cpu/operator/TanhImpl_kernels.hpp    | 10 +++++-----
 12 files changed, 42 insertions(+), 42 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/AbsImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AbsImpl_kernels.hpp
index 16e5f9de..e6474cf2 100644
--- a/include/aidge/backend/cpu/operator/AbsImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/AbsImpl_kernels.hpp
@@ -20,14 +20,14 @@
 
 namespace Aidge {
 template <class I, class O>
-void AbsImpl_cpu_forward_kernel(std::size_t inputLenght,
+void AbsImpl_cpu_forward_kernel(std::size_t inputLength,
                                      const void* input_,
                                      void* output_) {
 
     const I* input = static_cast<const I*>(input_);
     O* output = static_cast<O*>(output_);
 
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+    for (std::size_t i = 0; i < inputLength; ++i) {
         output[i] = std::abs(input[i]);
     }
 }
diff --git a/include/aidge/backend/cpu/operator/AtanImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AtanImpl_kernels.hpp
index 2a786339..141e5b60 100644
--- a/include/aidge/backend/cpu/operator/AtanImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/AtanImpl_kernels.hpp
@@ -20,20 +20,20 @@
 
 namespace Aidge {
 template <class I, class O>
-void AtanImpl_cpu_forward_kernel(std::size_t inputLenght,
+void AtanImpl_cpu_forward_kernel(std::size_t inputLength,
                                     const void* input_,
                                     void* output_) {
     const I* input = static_cast<const I*>(input_);
     O* output = static_cast<O*>(output_);
 
-    for (size_t i = 0; i < inputLenght; ++i) {
+    for (size_t i = 0; i < inputLength; ++i) {
         output[i] = static_cast<O>(atan(input[i]));
     }
 
 }
 
 template <class O, class GI, class GO>
-void AtanImpl_cpu_backward_kernel(const std::size_t inputLenght,
+void AtanImpl_cpu_backward_kernel(const std::size_t inputLength,
                                      const void* output_, const void* grad_output_,
 				     void* grad_input_) {
     const O* output = static_cast<const O*>(output_);
@@ -41,7 +41,7 @@ void AtanImpl_cpu_backward_kernel(const std::size_t inputLenght,
     GI* grad_input = static_cast<GI*>(grad_input_);
 
     // Apply the derivative of atan for each element in the input array
-    for (size_t i = 0; i < inputLenght; ++i) {
+    for (size_t i = 0; i < inputLength; ++i) {
         // dx = dy * (1 / (1 + x^2))
         grad_input[i] = grad_output[i] * static_cast<O>(1.0 / (1.0 + output[i] * output[i]));
     }
diff --git a/include/aidge/backend/cpu/operator/ErfImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ErfImpl_kernels.hpp
index 02041f55..709f4a6f 100644
--- a/include/aidge/backend/cpu/operator/ErfImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ErfImpl_kernels.hpp
@@ -20,14 +20,14 @@
 
 namespace Aidge {
 template <class I, class O>
-void ErfImpl_cpu_forward_kernel(std::size_t inputLenght,
+void ErfImpl_cpu_forward_kernel(std::size_t inputLength,
                                      const void* input_,
                                      void* output_) {
 
     const I* input = static_cast<const I*>(input_);
     O* output = static_cast<O*>(output_);
 
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+    for (std::size_t i = 0; i < inputLength; ++i) {
         output[i] = std::erf(input[i]);
     }
 }
diff --git a/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp b/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp
index 3fd6ca7d..06d7fff8 100644
--- a/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp
@@ -23,14 +23,14 @@
 namespace Aidge {
 
 template <class I, class O>
-void HeavisideImplCpuForwardKernel(std::size_t inputLenght,
+void HeavisideImplCpuForwardKernel(std::size_t inputLength,
                                    const void *input_,
                                    void *output_,
                                    const float value) {
     const I *input = static_cast<const I *>(input_);
     O *output = static_cast<O *>(output_);
 
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+    for (std::size_t i = 0; i < inputLength; ++i) {
         output[i] = (input[i] > 0) ? 1 : (input[i] == 0 ? value : 0);
     }
 }
diff --git a/include/aidge/backend/cpu/operator/LeakyReLUImpl_kernels.hpp b/include/aidge/backend/cpu/operator/LeakyReLUImpl_kernels.hpp
index bc856f70..7afd8298 100644
--- a/include/aidge/backend/cpu/operator/LeakyReLUImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/LeakyReLUImpl_kernels.hpp
@@ -19,7 +19,7 @@
 namespace Aidge {
 template <class I, class O>
 void LeakyReLUImpl_cpu_forward_kernel(const float negativeSlope_,
-                                     std::size_t inputLenght,
+                                     std::size_t inputLength,
                                      const void* input_,
                                      void* output_) {
 
@@ -27,14 +27,14 @@ void LeakyReLUImpl_cpu_forward_kernel(const float negativeSlope_,
     O* output = static_cast<O*>(output_);
     const I negativeSlope = static_cast<const I>(negativeSlope_);
 
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+    for (std::size_t i = 0; i < inputLength; ++i) {
         output[i] = (input[i] >= 0) ? input[i] : input[i] * negativeSlope;
     }
 }
 
 template <class I, class O>
 void LeakyReLUImpl_cpu_backward_kernel(const float negativeSlope_,
-                                     std::size_t inputLenght,
+                                     std::size_t inputLength,
                                      const void* input_,
                                      void* output_) {
 
@@ -42,7 +42,7 @@ void LeakyReLUImpl_cpu_backward_kernel(const float negativeSlope_,
     O* output = static_cast<O*>(output_);
     const I negativeSlope = static_cast<const I>(negativeSlope_);
 
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+    for (std::size_t i = 0; i < inputLength; ++i) {
         output[i] = (input[i] > 0) ? input[i] : negativeSlope*input[i];
     }
 }
diff --git a/include/aidge/backend/cpu/operator/LnImpl_kernels.hpp b/include/aidge/backend/cpu/operator/LnImpl_kernels.hpp
index b30b05bb..ee2864b6 100755
--- a/include/aidge/backend/cpu/operator/LnImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/LnImpl_kernels.hpp
@@ -18,7 +18,7 @@
 
 namespace Aidge {
 template <class I, class O>
-void LnImpl_cpu_forward_kernel(std::size_t inputLenght,
+void LnImpl_cpu_forward_kernel(std::size_t inputLength,
                                const void* input_,
                                void* output_) {
 
@@ -26,8 +26,8 @@ void LnImpl_cpu_forward_kernel(std::size_t inputLenght,
     O* output = static_cast<O*>(output_);
 	const float eps = 1.0e-20f;
 
-//#pragma omp parallel for if (inputLenght > 1024)
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+//#pragma omp parallel for if (inputLength > 1024)
+    for (std::size_t i = 0; i < inputLength; ++i) {
 		if (input[i] > I(eps)) {
 			output[i] = std::log(input[i]);
 		} else {
@@ -37,7 +37,7 @@ void LnImpl_cpu_forward_kernel(std::size_t inputLenght,
 }
 
 template <class I, class GI, class GO>
-void LnImpl_cpu_backward_kernel(const std::size_t inputLenght,
+void LnImpl_cpu_backward_kernel(const std::size_t inputLength,
                                 const void* input_, const void* grad_output_,
 	                            void* grad_input_) {
 						 
@@ -46,7 +46,7 @@ void LnImpl_cpu_backward_kernel(const std::size_t inputLenght,
     GI* grad_input = static_cast<GI*>(grad_input_);
 	const float eps = 1.0e-20f;
 	
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+    for (std::size_t i = 0; i < inputLength; ++i) {
 		if (input[i] > I(eps)) {
 			grad_input[i] = grad_output[i] / input[i];
 		} else {
diff --git a/include/aidge/backend/cpu/operator/ReLUImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ReLUImpl_kernels.hpp
index e39e9b7d..bb5d7cc3 100644
--- a/include/aidge/backend/cpu/operator/ReLUImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ReLUImpl_kernels.hpp
@@ -26,27 +26,27 @@
 namespace Aidge {
 // Kernels
 template <class I, class O>
-void ReLUImpl_cpu_forward_kernel(std::size_t inputLenght,
+void ReLUImpl_cpu_forward_kernel(std::size_t inputLength,
                                      const void* input_,
                                      void* output_) {
 
     const I* input = static_cast<const I*>(input_);
     O* output = static_cast<O*>(output_);
 
-//#pragma omp parallel for if (inputLenght > 1024)
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+//#pragma omp parallel for if (inputLength > 1024)
+    for (std::size_t i = 0; i < inputLength; ++i) {
         output[i] = (input[i] > 0) ? input[i] : 0;
     }
 }
 
 template <class I, class GI, class GO>
-void ReLUImpl_cpu_backward_kernel(const std::size_t inputLenght,
+void ReLUImpl_cpu_backward_kernel(const std::size_t inputLength,
                                   const void* input_, const void* grad_output_,
 				  void* grad_input_) {
     const I* input = static_cast<const I*>(input_);
     const GO* grad_output = static_cast<const GO*>(grad_output_);
     GI* grad_input = static_cast<GI*>(grad_input_);
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+    for (std::size_t i = 0; i < inputLength; ++i) {
         grad_input[i] = (input[i] > 0) ? grad_output[i] : 0;
     }
 }
diff --git a/include/aidge/backend/cpu/operator/RoundImpl_kernels.hpp b/include/aidge/backend/cpu/operator/RoundImpl_kernels.hpp
index ba9c63bc..7ac4319b 100644
--- a/include/aidge/backend/cpu/operator/RoundImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/RoundImpl_kernels.hpp
@@ -21,14 +21,14 @@
 
 namespace Aidge {
 template <class I, class O>
-void RoundImpl_cpu_forward_kernel(const std::size_t inputLenght,
+void RoundImpl_cpu_forward_kernel(const std::size_t inputLength,
                                      const void* input_,
                                      void* output_) {
 
     const I* input = static_cast<const I*>(input_);
     O* output = static_cast<O*>(output_);
 
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+    for (std::size_t i = 0; i < inputLength; ++i) {
         // std::round would not work since it does not follow the round-half-to-even rule for exact halves (see ONNX Round)
         output[i] = static_cast<O>(std::nearbyint(static_cast<float>(input[i])));
     }
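
The comment deserves a precise statement: ONNX Round requires
round-half-to-even, std::round always rounds halves away from zero, and
std::nearbyint honours the current floating-point rounding mode, whose default
FE_TONEAREST means ties-to-even. A self-contained check of the difference
(the rounding mode is assumed to be the default and is set explicitly here):

    // Sketch: why the kernel uses std::nearbyint rather than std::round.
    #include <cfenv>
    #include <cmath>
    #include <cstdio>

    int main() {
        std::fesetround(FE_TONEAREST);   // the default mode, made explicit
        for (float v : {0.5f, 1.5f, 2.5f, 3.5f}) {
            // std::round rounds halves away from zero: 1, 2, 3, 4
            // std::nearbyint rounds halves to even:    0, 2, 2, 4
            std::printf("%.1f  round=%.0f  nearbyint=%.0f\n",
                        v, std::round(v), std::nearbyint(v));
        }
        return 0;
    }
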
diff --git a/include/aidge/backend/cpu/operator/ScalingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ScalingImpl_kernels.hpp
index c758c9cf..f9ca00b7 100644
--- a/include/aidge/backend/cpu/operator/ScalingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ScalingImpl_kernels.hpp
@@ -76,14 +76,14 @@ template <class I, class O>
 void ScalingImpl_cpu_forward_kernel(const float scalingFactor,
                                     const std::size_t quantizedNbBits,
                                     const bool isOutputUnsigned,
-                                    std::size_t inputLenght,
+                                    std::size_t inputLength,
                                     const void* input_,
                                     void* output_) {
 
     const I* input = static_cast<const I*>(input_);
     O* output = static_cast<O*>(output_);
 
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+    for (std::size_t i = 0; i < inputLength; ++i) {
         output[i] = static_cast<O>(input[i] * static_cast<I>(scalingFactor));
 
         if(quantizedNbBits > 0) {
diff --git a/include/aidge/backend/cpu/operator/SigmoidImpl_kernels.hpp b/include/aidge/backend/cpu/operator/SigmoidImpl_kernels.hpp
index dfd71ce0..83ad4575 100644
--- a/include/aidge/backend/cpu/operator/SigmoidImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/SigmoidImpl_kernels.hpp
@@ -18,15 +18,15 @@
 
 namespace Aidge {
 template <class I, class O>
-void SigmoidImpl_cpu_forward_kernel(std::size_t inputLenght,
+void SigmoidImpl_cpu_forward_kernel(std::size_t inputLength,
                                     const void* input_,
                                     void* output_) {
 
     const I* input = static_cast<const I*>(input_);
     O* output = static_cast<O*>(output_);
 
-//#pragma omp parallel for if (inputLenght > 1024)
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+//#pragma omp parallel for if (inputLength > 1024)
+    for (std::size_t i = 0; i < inputLength; ++i) {
 		if (input[i] > I(0)) {
 			output[i] = O(1) / (O(1) + std::exp(-input[i]));
 		} else {
@@ -36,13 +36,13 @@ void SigmoidImpl_cpu_forward_kernel(std::size_t inputLenght,
 }
 
 template <class O, class GI, class GO>
-void SigmoidImpl_cpu_backward_kernel(const std::size_t inputLenght,
+void SigmoidImpl_cpu_backward_kernel(const std::size_t inputLength,
                                      const void* output_, const void* grad_output_,
 				     void* grad_input_) {
     const O* output = static_cast<const O*>(output_);
     const GO* grad_output = static_cast<const GO*>(grad_output_);
     GI* grad_input = static_cast<GI*>(grad_input_);
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+    for (std::size_t i = 0; i < inputLength; ++i) {
         grad_input[i] = output[i] * (O(1) - output[i]) * grad_output[i];
     }
 }
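
The sign branch in the forward kernel is the standard numerically stable
sigmoid: for x > 0 the direct 1/(1+e^(-x)) cannot overflow, while for x <= 0
the algebraically equal e^x/(1+e^x) keeps the exponent non-positive (the else
branch is cut off by the hunk above, so its exact form is presumed here). The
backward kernel then needs only the saved output, since dσ/dx = σ(x)(1-σ(x)).
A small sketch of both pieces:

    // Sketch of the stable-sigmoid split and its output-only derivative.
    // Not the Aidge kernel; the else branch is assumed to mirror it.
    #include <cmath>
    #include <cstdio>

    static double sigmoid(double x) {
        if (x > 0.0) {
            return 1.0 / (1.0 + std::exp(-x));   // exp(-x) <= 1 here
        }
        const double e = std::exp(x);            // x <= 0, so e <= 1: no overflow
        return e / (1.0 + e);
    }

    int main() {
        for (double x : {-1000.0, -1.0, 0.0, 1.0, 1000.0}) {
            const double y = sigmoid(x);
            // The backward kernel only needs y: d(sigmoid)/dx = y * (1 - y).
            std::printf("x=%8.1f  y=%.6f  dy/dx=%.6f\n", x, y, y * (1.0 - y));
        }
        return 0;
    }
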
diff --git a/include/aidge/backend/cpu/operator/SqrtImpl_kernels.hpp b/include/aidge/backend/cpu/operator/SqrtImpl_kernels.hpp
index 0464119c..1ce1ef9b 100644
--- a/include/aidge/backend/cpu/operator/SqrtImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/SqrtImpl_kernels.hpp
@@ -21,27 +21,27 @@
 
 namespace Aidge {
 template <class I, class O>
-void SqrtImpl_cpu_forward_kernel(const std::size_t inputLenght,
+void SqrtImpl_cpu_forward_kernel(const std::size_t inputLength,
                                      const void* input_,
                                      void* output_) {
 
     const I* input = static_cast<const I*>(input_);
     O* output = static_cast<O*>(output_);
 
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+    for (std::size_t i = 0; i < inputLength; ++i) {
         output[i] = static_cast<O>(std::sqrt(static_cast<float>(input[i])));
     }
 }
 
 template <class I, class O>
-void SqrtImpl_cpu_backward_kernel(const std::size_t inputLenght,
+void SqrtImpl_cpu_backward_kernel(const std::size_t inputLength,
                                      const void* input_,
                                      void* output_) {
 
     const I* input = static_cast<const I*>(input_);
     O* output = static_cast<O*>(output_);
 
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+    for (std::size_t i = 0; i < inputLength; ++i) {
         output[i] = static_cast<O>(0.5/(std::sqrt(static_cast<float>(input[i]))));
     }
 }
diff --git a/include/aidge/backend/cpu/operator/TanhImpl_kernels.hpp b/include/aidge/backend/cpu/operator/TanhImpl_kernels.hpp
index fdcac210..49cfe9cb 100644
--- a/include/aidge/backend/cpu/operator/TanhImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/TanhImpl_kernels.hpp
@@ -18,27 +18,27 @@
 
 namespace Aidge {
 template <class I, class O>
-void TanhImpl_cpu_forward_kernel(std::size_t inputLenght,
+void TanhImpl_cpu_forward_kernel(std::size_t inputLength,
                                      const void* input_,
                                      void* output_) {
 
     const I* input = static_cast<const I*>(input_);
     O* output = static_cast<O*>(output_);
 
-//#pragma omp parallel for if (inputLenght > 1024)
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+//#pragma omp parallel for if (inputLength > 1024)
+    for (std::size_t i = 0; i < inputLength; ++i) {
         output[i] = std::tanh(input[i]);
     }
 }
 
 template <class O, class GI, class GO>
-void TanhImpl_cpu_backward_kernel(const std::size_t inputLenght,
+void TanhImpl_cpu_backward_kernel(const std::size_t inputLength,
                                   const void* output_, const void* grad_output_,
 			          void* grad_input_) {
     const O* output = static_cast<const O*>(output_);
     const GO* grad_output = static_cast<const GO*>(grad_output_);
     GI* grad_input = static_cast<GI*>(grad_input_);
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+    for (std::size_t i = 0; i < inputLength; ++i) {
         grad_input[i] = (O(1) - output[i] * output[i]) * grad_output[i];
     }
 }
-- 
GitLab


From 06c6f8b120b9cbc772ae367ea9827a0e7f8bf040 Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Sun, 16 Feb 2025 16:42:03 +0100
Subject: [PATCH 014/108] Fixed missing include

---
 unit_tests/operator/Test_MetaOperator.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/unit_tests/operator/Test_MetaOperator.cpp b/unit_tests/operator/Test_MetaOperator.cpp
index 4fe39630..adc548b9 100644
--- a/unit_tests/operator/Test_MetaOperator.cpp
+++ b/unit_tests/operator/Test_MetaOperator.cpp
@@ -18,6 +18,7 @@
 
 #include "aidge/backend/cpu/operator/ConvImpl.hpp"
 #include "aidge/backend/cpu/operator/PadImpl.hpp"
+#include "aidge/backend/cpu/operator/TanhImpl.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/filler/Filler.hpp"
 #include "aidge/operator/Conv.hpp"
-- 
GitLab


From 79e60036680239d2ee9d41c5e56bb6742f740585 Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Sun, 16 Feb 2025 17:53:50 +0100
Subject: [PATCH 015/108] Working concept

---
 CMakeLists.txt                                | 12 ++++
 include/aidge/backend/cpu.hpp                 |  2 +
 .../backend/cpu/operator/CryptoHashImpl.hpp   | 36 +++++++++++
 .../cpu/operator/CryptoHashImpl_kernels.hpp   | 52 ++++++++++++++++
 .../backend/cpu/operator/ModImpl_kernels.hpp  |  3 +
 src/operator/CryptoHashImpl.cpp               | 46 +++++++++++++++
 unit_tests/operator/Test_CryptoHash.cpp       | 56 ++++++++++++++++++
 unit_tests/scheduler/Test_Scheduler.cpp       | 59 +++++++++++++++++++
 8 files changed, 266 insertions(+)
 create mode 100644 include/aidge/backend/cpu/operator/CryptoHashImpl.hpp
 create mode 100644 include/aidge/backend/cpu/operator/CryptoHashImpl_kernels.hpp
 create mode 100644 src/operator/CryptoHashImpl.cpp
 create mode 100644 unit_tests/operator/Test_CryptoHash.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 66ef8ff2..2d4bc8ec 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -64,6 +64,14 @@ if(NOT $ENV{AIDGE_INSTALL} STREQUAL "")
 endif()
 find_package(aidge_core REQUIRED)
 
+find_package(OpenSSL QUIET)
+if(OpenSSL_FOUND)
+    message(STATUS "OpenSSL found: ${OPENSSL_VERSION}")
+    add_definitions(-DWITH_OPENSSL)
+else()
+    message(WARNING "OpenSSL not found, SHA256 will not be available.")
+endif()
+
 ##############################################
 # Create target and set properties
 file(GLOB_RECURSE src_files "src/*.cpp")
@@ -112,6 +120,10 @@ target_include_directories(${module_name}
         ${CMAKE_CURRENT_SOURCE_DIR}/src
 )
 
+if(OpenSSL_FOUND)
+    target_link_libraries(${module_name} PRIVATE OpenSSL::SSL OpenSSL::Crypto)
+endif()
+
 target_compile_features(${module_name} PRIVATE cxx_std_14)
 
 target_compile_options(${module_name} PRIVATE
diff --git a/include/aidge/backend/cpu.hpp b/include/aidge/backend/cpu.hpp
index ffc03ae5..80574b4a 100644
--- a/include/aidge/backend/cpu.hpp
+++ b/include/aidge/backend/cpu.hpp
@@ -28,6 +28,7 @@
 #include "aidge/backend/cpu/operator/ConvDepthWiseImpl.hpp"
 #include "aidge/backend/cpu/operator/ConvImpl.hpp"
 #include "aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp"
+#include "aidge/backend/cpu/operator/CryptoHashImpl.hpp"
 #include "aidge/backend/cpu/operator/DivImpl.hpp"
 #include "aidge/backend/cpu/operator/EqualImpl.hpp"
 #include "aidge/backend/cpu/operator/ErfImpl.hpp"
@@ -40,6 +41,7 @@
 #include "aidge/backend/cpu/operator/LeakyReLUImpl.hpp"
 #include "aidge/backend/cpu/operator/LnImpl.hpp"
 #include "aidge/backend/cpu/operator/MatMulImpl.hpp"
+#include "aidge/backend/cpu/operator/ModImpl.hpp"
 #include "aidge/backend/cpu/operator/MulImpl.hpp"
 #include "aidge/backend/cpu/operator/PadImpl.hpp"
 #include "aidge/backend/cpu/operator/PaddedConvImpl.hpp"
diff --git a/include/aidge/backend/cpu/operator/CryptoHashImpl.hpp b/include/aidge/backend/cpu/operator/CryptoHashImpl.hpp
new file mode 100644
index 00000000..d7f07f99
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/CryptoHashImpl.hpp
@@ -0,0 +1,36 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_CRYPTOHASHIMPL_H_
+#define AIDGE_CPU_OPERATOR_CRYPTOHASHIMPL_H_
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/CryptoHash.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+#include <memory>
+#include <vector>
+
+#ifdef WITH_OPENSSL
+#include <openssl/sha.h>
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+using CryptoHashImpl_cpu = OperatorImpl_cpu<CryptoHash_Op,
+    void(const std::size_t, const void*, void*)>;
+
+// Implementation entry point registration to Operator
+REGISTRAR(CryptoHash_Op, "cpu", Aidge::CryptoHashImpl_cpu::create);
+}  // namespace Aidge
+#endif
+
+#endif /* AIDGE_CPU_OPERATOR_CRYPTOHASHIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/CryptoHashImpl_kernels.hpp b/include/aidge/backend/cpu/operator/CryptoHashImpl_kernels.hpp
new file mode 100644
index 00000000..cd596b69
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/CryptoHashImpl_kernels.hpp
@@ -0,0 +1,52 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_CRYPTOHASHIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_CRYPTOHASHIMPL_KERNELS_H_
+
+#include "aidge/utils/Registrar.hpp"
+
+#include "aidge/backend/cpu/operator/CryptoHashImpl.hpp"
+
+#ifdef WITH_OPENSSL
+namespace Aidge {
+template <class I, class O>
+void CryptoHashImpl_cpu_forward_kernel(std::size_t inputLength,
+                                     const void* input_,
+                                     void* output_) {
+
+    const I* input = static_cast<const I*>(input_);
+    O* output = static_cast<O*>(output_);
+
+    // output must be at least SHA256_DIGEST_LENGTH (32) bytes long
+    SHA256(reinterpret_cast<const uint8_t*>(input), inputLength * sizeof(I), reinterpret_cast<uint8_t*>(output));
+}
+
+// Kernels registration to implementation entry point
+REGISTRAR(CryptoHashImpl_cpu,
+    {{DataType::UInt8, DataFormat::Any}, {DataType::UInt8}},
+    {ProdConso::inPlaceModel, Aidge::CryptoHashImpl_cpu_forward_kernel<uint8_t, uint8_t>, nullptr});
+REGISTRAR(CryptoHashImpl_cpu,
+    {{DataType::UInt8, DataFormat::Any}, {DataType::UInt64}},
+    {ProdConso::inPlaceModel, Aidge::CryptoHashImpl_cpu_forward_kernel<uint8_t, uint64_t>, nullptr});
+REGISTRAR(CryptoHashImpl_cpu,
+    {{DataType::Float32, DataFormat::Any}, {DataType::UInt8}},
+    {ProdConso::inPlaceModel, Aidge::CryptoHashImpl_cpu_forward_kernel<float, uint8_t>, nullptr});
+REGISTRAR(CryptoHashImpl_cpu,
+    {{DataType::Float32, DataFormat::Any}, {DataType::UInt64}},
+    {ProdConso::inPlaceModel, Aidge::CryptoHashImpl_cpu_forward_kernel<float, uint64_t>, nullptr});
+REGISTRAR(CryptoHashImpl_cpu,
+    {{DataType::Float64, DataFormat::Any}, {DataType::UInt8}},
+    {ProdConso::inPlaceModel, Aidge::CryptoHashImpl_cpu_forward_kernel<double, uint8_t>, nullptr});
+}  // namespace Aidge
+#endif
+
+#endif /* AIDGE_CPU_OPERATOR_CRYPTOHASHIMPL_KERNELS_H_ */
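
The kernel is a thin wrapper over OpenSSL's one-shot SHA256(), which always
writes SHA256_DIGEST_LENGTH (32) bytes; on the float and double paths it
hashes the raw IEEE-754 bytes, so only bit-identical tensors produce identical
digests. A standalone sketch of the same call (link with -lcrypto; SHA256()
remains available in OpenSSL 3.x, though the EVP interface is the
non-legacy route):

    // Sketch of the one-shot SHA256() call the kernel relies on.
    // Data values are illustrative.
    #include <cstdio>
    #include <openssl/sha.h>

    int main() {
        const float data[4] = {1.0f, 2.0f, 3.0f, 4.0f};
        unsigned char digest[SHA256_DIGEST_LENGTH];   // always 32 bytes

        // One-shot digest over the raw bytes of the buffer, as in the kernel.
        SHA256(reinterpret_cast<const unsigned char*>(data), sizeof(data), digest);

        for (unsigned char b : digest) { std::printf("%02x", b); }
        std::printf("\n");
        return 0;
    }
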
diff --git a/include/aidge/backend/cpu/operator/ModImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ModImpl_kernels.hpp
index 940fa482..15d18bf4 100644
--- a/include/aidge/backend/cpu/operator/ModImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ModImpl_kernels.hpp
@@ -72,6 +72,9 @@ REGISTRAR(ModImpl_cpu,
 REGISTRAR(ModImpl_cpu,
     {DataType::Int32},
     {ProdConso::inPlaceModel, Aidge::ModImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>, nullptr});
+REGISTRAR(ModImpl_cpu,
+    {DataType::UInt64},
+    {ProdConso::inPlaceModel, Aidge::ModImpl_cpu_forward_kernel<std::uint64_t, std::uint64_t, std::uint64_t>, nullptr});
 }  // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_MODIMPL_KERNELS_H_ */
diff --git a/src/operator/CryptoHashImpl.cpp b/src/operator/CryptoHashImpl.cpp
new file mode 100644
index 00000000..10d82dd0
--- /dev/null
+++ b/src/operator/CryptoHashImpl.cpp
@@ -0,0 +1,46 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <cassert>
+#include <chrono>  // std::chrono::milliseconds
+#include <numeric> // std::accumulate
+#include <thread>  // std::this_thread::sleep_for
+#include <vector>
+
+#include "aidge/operator/CryptoHash.hpp"
+#include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+
+#include "aidge/backend/cpu/operator/CryptoHashImpl.hpp"
+#include "aidge/backend/cpu/operator/CryptoHashImpl_kernels.hpp"
+
+#ifdef WITH_OPENSSL
+template <>
+void Aidge::CryptoHashImpl_cpu::forward() {
+	const CryptoHash_Op& op_ = dynamic_cast<const CryptoHash_Op&>(mOp);
+    std::shared_ptr<Tensor> in0 = op_.getInput(0);
+    std::shared_ptr<Tensor> out0 = op_.getOutput(0);
+    AIDGE_ASSERT(in0, "missing input #0");
+
+    // Find the correct kernel type
+    const auto impl = Registrar<CryptoHashImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Call kernel
+    impl.forward(in0->size(),
+        getCPUPtr(mOp.getRawInput(0)),
+        getCPUPtr(mOp.getRawOutput(0)));
+}
+
+template <>
+void Aidge::CryptoHashImpl_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not available for CryptoHash_Op");
+}
+#endif
diff --git a/unit_tests/operator/Test_CryptoHash.cpp b/unit_tests/operator/Test_CryptoHash.cpp
new file mode 100644
index 00000000..7453ea19
--- /dev/null
+++ b/unit_tests/operator/Test_CryptoHash.cpp
@@ -0,0 +1,56 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <cmath>    // std::abs
+#include <cstddef>  // std::size_t
+#include <memory>
+
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/backend/cpu/operator/CryptoHashImpl.hpp"
+#include "aidge/data/Data.hpp"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/graph/Node.hpp"
+#include "aidge/operator/CryptoHash.hpp"
+#include "aidge/utils/ArrayHelpers.hpp"
+
+using namespace Aidge;
+
+#ifdef WITH_OPENSSL
+TEST_CASE("[cpu/operator] CryptoHash(forward)") {
+  SECTION("1D Tensor") {
+    std::shared_ptr<Tensor> input0 =
+        std::make_shared<Tensor>(Array1D<uint8_t, 5>{
+            {'a', 'b', 'c', 'd', 'e'}});
+    std::shared_ptr<Tensor> expectedOutput =
+        std::make_shared<Tensor>(Array1D<uint8_t, 32>{
+            {0x36, 0xbb, 0xe5, 0x0e, 0xd9, 0x68, 0x41, 0xd1,
+             0x04, 0x43, 0xbc, 0xb6, 0x70, 0xd6, 0x55, 0x4f,
+             0x0a, 0x34, 0xb7, 0x61, 0xbe, 0x67, 0xec, 0x9c,
+             0x4a, 0x8a, 0xd2, 0xc0, 0xc4, 0x4c, 0xa4, 0x2c}});
+
+    std::shared_ptr<Node> myCryptoHash = CryptoHash();
+    auto op = std::static_pointer_cast<CryptoHash_Op>(myCryptoHash->getOperator());
+    op->associateInput(0, input0);
+    op->setDataType(DataType::UInt8);
+    op->setBackend("cpu");
+    myCryptoHash->forward();
+
+    REQUIRE(op->getOutput(0)->size() == 32);
+
+    uint8_t* resPtr = static_cast<uint8_t*>(op->getOutput(0)->getImpl()->rawPtr());
+    uint8_t* expectedPtr = static_cast<uint8_t*>(expectedOutput->getImpl()->rawPtr());
+    for (std::size_t i = 0; i < expectedOutput->size(); ++i) {
+      REQUIRE(resPtr[i] == expectedPtr[i]);
+    }
+  }
+}
+#endif
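
The expected bytes are simply the SHA-256 digest of the five input bytes
"abcde", so the vector can be cross-checked against any independent SHA-256
implementation.
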
diff --git a/unit_tests/scheduler/Test_Scheduler.cpp b/unit_tests/scheduler/Test_Scheduler.cpp
index 956169c3..5bd86eec 100644
--- a/unit_tests/scheduler/Test_Scheduler.cpp
+++ b/unit_tests/scheduler/Test_Scheduler.cpp
@@ -21,6 +21,10 @@
 #include "aidge/operator/Pop.hpp"
 #include "aidge/operator/Stack.hpp"
 #include "aidge/operator/Identity.hpp"
+#include "aidge/operator/CryptoHash.hpp"
+#include "aidge/operator/Mod.hpp"
+#include "aidge/operator/Tanh.hpp"
+#include "aidge/operator/Select.hpp"
 #include "aidge/operator/MetaOperator.hpp"
 #include "aidge/scheduler/SequentialScheduler.hpp"
 #include "aidge/scheduler/ParallelScheduler.hpp"
@@ -30,6 +34,9 @@
 #include "aidge/backend/cpu/operator/ReLUImpl.hpp"
 #include "aidge/backend/cpu/operator/SqrtImpl.hpp"
 #include "aidge/backend/cpu/operator/AddImpl.hpp"
+#include "aidge/backend/cpu/operator/CryptoHashImpl.hpp"
+#include "aidge/backend/cpu/operator/ModImpl.hpp"
+#include "aidge/backend/cpu/operator/TanhImpl.hpp"
 
 #include "aidge/recipes/GraphViewHelper.hpp"
 
@@ -512,4 +519,56 @@ TEST_CASE("[cpu/scheduler] Accumulate", "[scheduler]") {
     std::shared_ptr<Tensor> output = std::static_pointer_cast<OperatorTensor>(pop_o->getOperator())->getOutput(0);
     REQUIRE(*output == *expectedOutput);
 }
+
+#ifdef WITH_OPENSSL
+TEST_CASE("[cpu/scheduler] Select", "[scheduler]") {
+    std::shared_ptr<Tensor> in = std::make_shared<Tensor>(
+            Array2D<float, 2, 3>{{{1, 2, 3}, {4, 5, 6}}});
+
+    std::shared_ptr<GraphView> g = Sequential({
+        Producer(in, "input"),
+        Parallel({
+            Sequential({
+                CryptoHash("hash"),
+                Mod("mod")
+            }),
+            ReLU("relu"),
+            Tanh("tanh"),
+            Sqrt("sqrt")
+        }),
+        Select(3, "select")
+    });
+
+    auto modProd = Producer(std::make_shared<Tensor>(Array1D<uint64_t, 1>{{3}}));
+    modProd->addChild(g->getNode("mod"), 0, 1);
+    g->add(modProd);
+
+    g->getNode("hash")->getOperator()->setDataType(DataType::UInt64);
+    g->getNode("mod")->getOperator()->setDataType(DataType::UInt64);
+    g->setBackend("cpu");
+    g->save("select");
+
+    auto scheduler = SequentialScheduler(g);
+    scheduler.generateScheduling();
+    scheduler.saveStaticSchedulingDiagram("select_scheduling");
+    REQUIRE_NOTHROW(scheduler.forward(true));
+
+    g->save("select_forwarded");
+
+    auto expectedOutputHash = std::make_shared<Tensor>(
+        Array1D<uint64_t, 4>{{0x1b7cf58dfe2dae24, 0x3bac903def4ce580, 0x5f5a347389d97f41, 0x2c2dc759abc6b61}});
+    auto outputHash = std::static_pointer_cast<OperatorTensor>(g->getNode("hash")->getOperator())->getOutput(0);
+    REQUIRE(*outputHash == *expectedOutputHash);
+
+    auto expectedOutputMod = std::make_shared<Tensor>(
+        Array1D<uint64_t, 4>{{2, 1, 1, 2}});
+    auto outputMod = std::static_pointer_cast<OperatorTensor>(g->getNode("mod")->getOperator())->getOutput(0);
+    REQUIRE(*outputMod == *expectedOutputMod);
+
+    auto expectedOutput = std::make_shared<Tensor>(
+        Array2D<float, 2, 3>{{{std::sqrt(1), std::sqrt(2), std::sqrt(3)}, {std::sqrt(4), std::sqrt(5), std::sqrt(6)}}});
+    auto output = std::static_pointer_cast<OperatorTensor>(g->getNode("select")->getOperator())->getOutput(0);
+    REQUIRE(*output == *expectedOutput);
+}
+#endif
 } // namespace Aidge
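
The branch routing works as follows: CryptoHash consumes the six input floats
and emits its 32-byte digest typed as four UInt64 words; Mod, fed the constant
3 through the extra producer, reduces each word modulo 3 (hence the
four-element expected tensors above); and Select(3) appears to use that
value, presumably its first word (2 here), to route the third branch, sqrt,
to its output, which matches the expected final tensor. The exact word Select
consumes is not visible in this test, so that last detail is inferred from the
expected tensors.
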
-- 
GitLab


From 652eb2810fd6a0a0360f8881cd3e9b41343d8340 Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Sun, 16 Feb 2025 23:46:45 +0100
Subject: [PATCH 016/108] Working concept with tagConditionalNodes()

---
 unit_tests/scheduler/Test_Scheduler.cpp | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/unit_tests/scheduler/Test_Scheduler.cpp b/unit_tests/scheduler/Test_Scheduler.cpp
index 5bd86eec..54e57ec4 100644
--- a/unit_tests/scheduler/Test_Scheduler.cpp
+++ b/unit_tests/scheduler/Test_Scheduler.cpp
@@ -569,6 +569,24 @@ TEST_CASE("[cpu/scheduler] Select", "[scheduler]") {
         Array2D<float, 2, 3>{{{std::sqrt(1), std::sqrt(2), std::sqrt(3)}, {std::sqrt(4), std::sqrt(5), std::sqrt(6)}}});
     auto output = std::static_pointer_cast<OperatorTensor>(g->getNode("select")->getOperator())->getOutput(0);
     REQUIRE(*output == *expectedOutput);
+
+    scheduler.resetScheduling();
+    scheduler.tagConditionalNodes();
+
+    REQUIRE(g->getNode("relu")->attributes()->hasAttr("schedule.cond"));
+    REQUIRE(g->getNode("relu")->attributes()->getAttr<std::set<std::pair<NodePtr, size_t>>>("schedule.cond")
+        == std::set<std::pair<NodePtr, size_t>>{{g->getNode("select"), 0}});
+    REQUIRE(g->getNode("tanh")->attributes()->hasAttr("schedule.cond"));
+    REQUIRE(g->getNode("tanh")->attributes()->getAttr<std::set<std::pair<NodePtr, size_t>>>("schedule.cond")
+        == std::set<std::pair<NodePtr, size_t>>{{g->getNode("select"), 1}});
+    REQUIRE(g->getNode("sqrt")->attributes()->hasAttr("schedule.cond"));
+    REQUIRE(g->getNode("sqrt")->attributes()->getAttr<std::set<std::pair<NodePtr, size_t>>>("schedule.cond")
+        == std::set<std::pair<NodePtr, size_t>>{{g->getNode("select"), 2}});
+    REQUIRE(!g->getNode("input")->attributes()->hasAttr("schedule.cond"));
+
+    scheduler.generateScheduling();
+    scheduler.saveStaticSchedulingDiagram("select_scheduling_tag");
+    REQUIRE_NOTHROW(scheduler.forward(true));
 }
 #endif
 } // namespace Aidge
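
In other words, tagConditionalNodes() stamps every node that only contributes
to a conditional branch with a schedule.cond attribute listing the
(Select node, input index) pairs it feeds, while unconditionally needed nodes
such as the input producer stay untagged; the assertions above pin down
exactly that contract before scheduling is regenerated with the tags in place.
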
-- 
GitLab


From e13b5fa521c5f979602e41236ece5795eaed8635 Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Thu, 20 Feb 2025 09:09:30 +0100
Subject: [PATCH 017/108] Export OpenSSL dependency

---
 CMakeLists.txt                    | 2 ++
 aidge_backend_cpu-config.cmake.in | 4 ++++
 2 files changed, 6 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2d4bc8ec..729853ee 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -120,8 +120,10 @@ target_include_directories(${module_name}
         ${CMAKE_CURRENT_SOURCE_DIR}/src
 )
 
+set(AIDGE_REQUIRES_OPENSSL FALSE)
 if(OpenSSL_FOUND)
     target_link_libraries(${module_name} PRIVATE OpenSSL::SSL OpenSSL::Crypto)
+    set(AIDGE_REQUIRES_OPENSSL TRUE)
 endif()
 
 target_compile_features(${module_name} PRIVATE cxx_std_14)
diff --git a/aidge_backend_cpu-config.cmake.in b/aidge_backend_cpu-config.cmake.in
index d8e1372b..7582102c 100644
--- a/aidge_backend_cpu-config.cmake.in
+++ b/aidge_backend_cpu-config.cmake.in
@@ -2,6 +2,10 @@
 
 include(CMakeFindDependencyMacro)
 find_dependency(aidge_core)
+set(AIDGE_REQUIRES_OPENSSL @AIDGE_REQUIRES_OPENSSL@)
+if (AIDGE_REQUIRES_OPENSSL)
+    find_dependency(OpenSSL)
+endif()
 
 include(CMakeFindDependencyMacro)
 
-- 
GitLab
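
Since OpenSSL is linked privately, consumers of an aidge_backend_cpu built
with it still need the OpenSSL::Crypto target resolvable when importing the
package (notably for static builds); exporting AIDGE_REQUIRES_OPENSSL into the
package config makes the find_dependency(OpenSSL) call conditional on how the
backend was actually configured.
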


From f53364a0301fc933ec2e48d6c7f5488a76470d77 Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Fri, 21 Feb 2025 14:49:00 +0100
Subject: [PATCH 018/108] Renaming

---
 aidge_backend_cpu/unit_tests/test_scheduler.py | 8 ++++----
 unit_tests/operator/Test_MetaOperator.cpp      | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/aidge_backend_cpu/unit_tests/test_scheduler.py b/aidge_backend_cpu/unit_tests/test_scheduler.py
index 494f3456..b60ff3f0 100644
--- a/aidge_backend_cpu/unit_tests/test_scheduler.py
+++ b/aidge_backend_cpu/unit_tests/test_scheduler.py
@@ -57,9 +57,9 @@ class test_scheduler(unittest.TestCase):
         scheduler = aidge_core.SequentialScheduler(graph_view)
         scheduler.generate_scheduling()
 
-        self.assertEqual(len(scheduler.get_static_scheduling()), 10)
+        self.assertEqual(len(scheduler.get_sequential_static_scheduling()), 10)
         # Do not care about the order of execution of the producers
-        self.assertListEqual([i.name() for i in scheduler.get_static_scheduling()[-3:]], EXPECTED_SCHEDULE)
+        self.assertListEqual([i.name() for i in scheduler.get_sequential_static_scheduling()[-3:]], EXPECTED_SCHEDULE)
 
 
     def test_parallel_scheduling(self):
@@ -83,9 +83,9 @@ class test_scheduler(unittest.TestCase):
         scheduler = aidge_core.SequentialScheduler(graph_view)
         scheduler.generate_scheduling()
 
-        self.assertEqual(len(scheduler.get_static_scheduling()), 11)
+        self.assertEqual(len(scheduler.get_sequential_static_scheduling()), 11)
         # Do not care about the order of execution of the producers
-        self.assertTrue([i.name() for i in scheduler.get_static_scheduling()[-4:]] in EXPECTED_SCHEDULE)
+        self.assertTrue([i.name() for i in scheduler.get_sequential_static_scheduling()[-4:]] in EXPECTED_SCHEDULE)
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/unit_tests/operator/Test_MetaOperator.cpp b/unit_tests/operator/Test_MetaOperator.cpp
index adc548b9..bb9027d3 100644
--- a/unit_tests/operator/Test_MetaOperator.cpp
+++ b/unit_tests/operator/Test_MetaOperator.cpp
@@ -279,9 +279,9 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
         REQUIRE(op->getNbConsumedData(1).data == 32768);
         REQUIRE(op->getNbProducedData(0).data == 34816);
         REQUIRE(op->getNbProducedData(1).data == 34816);
-        REQUIRE(microGraphScheduler->getStaticScheduling(0).size() == 26);
-        REQUIRE(microGraphScheduler->getStaticScheduling(1).size() == 24);
-        REQUIRE(microGraphScheduler->getStaticScheduling(15).size() == 24);
+        REQUIRE(microGraphScheduler->getSequentialStaticScheduling(0).size() == 26);
+        REQUIRE(microGraphScheduler->getSequentialStaticScheduling(1).size() == 24);
+        REQUIRE(microGraphScheduler->getSequentialStaticScheduling(15).size() == 24);
     }
 
     SECTION("LSTM(forward_values)") {
-- 
GitLab


From 47bb2b3f69b7642f5d22b62d81559347ebd4b6ff Mon Sep 17 00:00:00 2001
From: NAUD Maxence <maxence.naud@cea.fr>
Date: Mon, 24 Feb 2025 13:49:17 +0000
Subject: [PATCH 019/108] Fix some imports following aidge_core update

---
 src/operator/PadImpl.cpp       | 6 +++---
 src/operator/ReduceSumImpl.cpp | 7 +++++--
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/operator/PadImpl.cpp b/src/operator/PadImpl.cpp
index cdae21f8..9a54437f 100644
--- a/src/operator/PadImpl.cpp
+++ b/src/operator/PadImpl.cpp
@@ -9,14 +9,14 @@
  *
  ********************************************************************************/
 
+#include <cstddef>
 #include <vector>
 
-#include "aidge/utils/Types.h"
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
-#include "aidge/operator/Conv.hpp"
-
 #include "aidge/backend/cpu/operator/PadImpl.hpp"
 #include "aidge/backend/cpu/operator/PadImpl_kernels.hpp"
+#include "aidge/operator/Pad.hpp"
+#include "aidge/utils/Types.h"
 
 Aidge::Elts_t Aidge::Pad_ProdConso_cpu::getNbRequiredProtected(Aidge::IOIndex_t inputIdx) const {
     AIDGE_ASSERT(inputIdx == 0, "input index out of range."
diff --git a/src/operator/ReduceSumImpl.cpp b/src/operator/ReduceSumImpl.cpp
index aad08018..93a89a34 100644
--- a/src/operator/ReduceSumImpl.cpp
+++ b/src/operator/ReduceSumImpl.cpp
@@ -12,11 +12,14 @@
 #include "aidge/backend/cpu/operator/ReduceSumImpl.hpp"
 
 #include <memory>
+#include <stdexcept>
 #include <vector>
 
-#include "aidge/utils/Types.h"
-#include "aidge/operator/ReduceSum.hpp"
 #include "aidge/backend/cpu/operator/ReduceSumImpl_kernels.hpp"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/operator/ReduceSum.hpp"
+#include "aidge/utils/ErrorHandling.hpp"
+#include "aidge/utils/Types.h"
 
 template <>
 void Aidge::ReduceSumImpl_cpu::forward() {
-- 
GitLab


From 3f4cd6e77aae54e674ab3f9aec0e0675cbd6860d Mon Sep 17 00:00:00 2001
From: Jerome Hue <jerome.hue@cea.fr>
Date: Thu, 6 Feb 2025 10:49:15 +0100
Subject: [PATCH 020/108] Implement backward function of Add operator

---
 .../aidge/backend/cpu/operator/AddImpl.hpp    |  14 +-
 .../backend/cpu/operator/AddImpl_kernels.hpp  |  64 +++-
 src/operator/AddImpl.cpp                      |  25 +-
 unit_tests/operator/Test_AddImpl.cpp          | 275 +++++++++++++++++-
 4 files changed, 368 insertions(+), 10 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/AddImpl.hpp b/include/aidge/backend/cpu/operator/AddImpl.hpp
index e39c35b4..ca04dff9 100644
--- a/include/aidge/backend/cpu/operator/AddImpl.hpp
+++ b/include/aidge/backend/cpu/operator/AddImpl.hpp
@@ -25,7 +25,19 @@
 namespace Aidge {
 // Operator implementation entry point for the backend
 using AddImpl_cpu = OperatorImpl_cpu<Add_Op,
-    void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*)>;
+    void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*),
+    void(const std::size_t, 
+         const std::size_t, 
+         const std::size_t, 
+         const std::vector<std::size_t>&, 
+         const std::vector<std::size_t>&, 
+         const std::vector<std::size_t>&, 
+         const void*, 
+         const void*, 
+         const void*, 
+         void*, 
+         void*)
+>;
 
 // Implementation entry point registration to Operator
 REGISTRAR(Add_Op, "cpu", Aidge::AddImpl_cpu::create);
diff --git a/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp
index e6d13fcf..d6fff9b5 100644
--- a/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp
@@ -147,25 +147,75 @@ void AddImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
     }
 }
 
+template <class I, class O>
+void AddImpl_cpu_backward_kernel(const std::size_t input0Length,
+                               const std::size_t input1Length,
+                               const std::size_t gradOutputLength,
+                               const std::vector<std::size_t>& dims0,
+                               const std::vector<std::size_t>& dims1,
+                               const std::vector<std::size_t>& outputDims,
+                               const void* input0_,
+                               const void* input1_,
+                               const void* grad_output_,
+                               void* gradientInput0_,
+                               void* gradientInput1_)
+{
+    // TODO: Remove input0/1 from the function
+    const I* input0 = static_cast<const I*>(input0_);
+    const I* input1 = static_cast<const I*>(input1_);
+    const O* gradOutput = static_cast<const O*>(grad_output_);
+    auto* gradInput0 = static_cast<I*>(gradientInput0_);
+    auto* gradInput1 = static_cast<I*>(gradientInput1_);
+
+    std::fill_n(gradInput0, input0Length, static_cast<I>(0));
+    std::fill_n(gradInput1, input1Length, static_cast<I>(0));
+
+    auto broadcastedDims0 = getBroadcastedDims(outputDims, dims0);
+    auto broadcastedDims1 = getBroadcastedDims(outputDims, dims1);
+
+    for (std::size_t i = 0; i < gradOutputLength; ++i) {
+        auto idxOutputGrad = getMultiDimIndices(outputDims, i);
+        std::vector<std::size_t> idxInput0(broadcastedDims0.size());
+        std::vector<std::size_t> idxInput1(broadcastedDims1.size());
+
+        for (std::size_t dimension = 0; dimension < broadcastedDims0.size(); ++dimension) {
+            idxInput0[dimension] = (broadcastedDims0[dimension] == 1) ? 0 : idxOutputGrad[dimension];
+        }
+
+        for (std::size_t dimension = 0; dimension < broadcastedDims1.size(); ++dimension) {
+            idxInput1[dimension] = (broadcastedDims1[dimension] == 1) ? 0 : idxOutputGrad[dimension];
+        }
+
+        auto idx0 = getFlattenedIndex(broadcastedDims0, idxInput0);
+        auto idx1 = getFlattenedIndex(broadcastedDims1, idxInput1);
+
+        // For addition: gradient of both inputs is just the output gradient
+        // (unlike multiplication where we need to multiply by the other input,
+        // or subtraction where we need to negate one of them)
+        gradInput0[idx0] += static_cast<I>(gradOutput[i]);
+        gradInput1[idx1] += static_cast<I>(gradOutput[i]);
+    }
+}
+
 // Kernels registration to implementation entry point
 REGISTRAR(AddImpl_cpu,
     {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float32}},
-    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<float, float>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<float, float>, Aidge::AddImpl_cpu_backward_kernel<float, float>});
 REGISTRAR(AddImpl_cpu,
     {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}},
-    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<double, double>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<double, double>, Aidge::AddImpl_cpu_backward_kernel<double, double>});
 REGISTRAR(AddImpl_cpu,
     {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int8}},
-    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int8_t, std::int8_t>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int8_t, std::int8_t>, Aidge::AddImpl_cpu_backward_kernel<std::int8_t, std::int8_t>});
 REGISTRAR(AddImpl_cpu,
     {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::UInt8}},
-    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::uint8_t, std::uint8_t>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::uint8_t, std::uint8_t>, Aidge::AddImpl_cpu_backward_kernel<std::uint8_t, std::uint8_t>});
 REGISTRAR(AddImpl_cpu,
     {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}},
-    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, Aidge::AddImpl_cpu_backward_kernel<std::int32_t, std::int32_t>});
 REGISTRAR(AddImpl_cpu,
     {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int64}},
-    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, Aidge::AddImpl_cpu_backward_kernel<std::int64_t, std::int64_t>});
 }  // namespace Aidge
 
-#endif /* AIDGE_CPU_OPERATOR_ADDIMPL_CPU_KERNELS_H_ */
\ No newline at end of file
+#endif /* AIDGE_CPU_OPERATOR_ADDIMPL_CPU_KERNELS_H_ */
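
Because a broadcast input feeds several output positions, its gradient is the
sum of the output gradient over every broadcast axis; the kernel realizes this
by zero-filling both gradients and accumulating through the reflected indices.
A hand-sized sketch of that reduction, with illustrative shapes and data
rather than the Aidge index helpers:

    // Sketch: gradient of Add with broadcasting reduces over size-1 axes.
    #include <cstdio>

    int main() {
        // Output gradient, shape (2, 3); input0 is (2, 3), input1 is (1, 3).
        const float gradOut[2][3] = {{1.f, 2.f, 3.f},
                                     {4.f, 5.f, 6.f}};
        float gradIn0[2][3] = {};   // same shape as output: pass-through
        float gradIn1[3] = {};      // broadcast over rows: reduce over axis 0

        for (int r = 0; r < 2; ++r) {
            for (int c = 0; c < 3; ++c) {
                gradIn0[r][c] += gradOut[r][c];   // indices map 1:1
                gradIn1[c] += gradOut[r][c];      // size-1 axis maps to index 0
            }
        }
        std::printf("%g %g %g\n", gradIn1[0], gradIn1[1], gradIn1[2]);   // 5 7 9
        return 0;
    }
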
diff --git a/src/operator/AddImpl.cpp b/src/operator/AddImpl.cpp
index 101743ec..b027fb87 100644
--- a/src/operator/AddImpl.cpp
+++ b/src/operator/AddImpl.cpp
@@ -55,5 +55,28 @@ void  Aidge::AddImpl_cpu::forward() {
 
 template <>
 void Aidge::AddImpl_cpu::backward() {
-    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Add_Op on backend cpu");
+    const Add_Op& op_ = dynamic_cast<const Add_Op&>(mOp);
+
+    auto in0 = op_.getInput(0);
+    auto in1 = op_.getInput(1);
+    auto in0grad = op_.getInput(0)->grad();
+    auto in1grad = op_.getInput(1)->grad();
+    auto out0grad = op_.getOutput(0)->grad();
+
+    // Find the correct kernel type
+    const auto impl = Registrar<AddImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Call kernel
+    impl.backward(in0grad->size(),
+               in1grad->size(),
+               out0grad->size(),
+               in0->dims(),
+               in1->dims(),
+               out0grad->dims(),
+               getCPUPtr(in0),
+               getCPUPtr(in1),
+               getCPUPtr(out0grad),
+               getCPUPtr(in0grad),
+               getCPUPtr(in1grad));
+
 }
diff --git a/unit_tests/operator/Test_AddImpl.cpp b/unit_tests/operator/Test_AddImpl.cpp
index bff9629b..4538b322 100644
--- a/unit_tests/operator/Test_AddImpl.cpp
+++ b/unit_tests/operator/Test_AddImpl.cpp
@@ -10,6 +10,7 @@
  ********************************************************************************/
 
 #include <memory>
+#include <random>
 
 #include <catch2/catch_test_macros.hpp>
 
@@ -19,6 +20,7 @@
 #include "aidge/graph/Node.hpp"
 #include "aidge/operator/Add.hpp"
 #include "aidge/utils/ArrayHelpers.hpp"
+#include "aidge/utils/TensorUtils.hpp"
 
 using namespace Aidge;
 
@@ -139,4 +141,275 @@ TEST_CASE("[cpu/operator] Add(forward)", "[Add][CPU]") {
         Log::info("Expected Add_1 Tensor:\n{}", expectedOutput);
         REQUIRE(*op_1->getOutput(0) == expectedOutput);
     }
-}
\ No newline at end of file
+}
+
+TEST_CASE("[cpu/operator] Add(backward)", "[Add][CPU]") {
+    std::shared_ptr<Add_Op> op = std::make_shared<Add_Op>();
+    op->setDataType(DataType::Float32);
+    op->setBackend("cpu");
+
+    // NOTE: The first four tests use fixed values; the last one uses random values with static dimensions.
+
+    SECTION("Case 1: 1D and 2D Tensors") {
+        const auto T0 = std::make_shared<Tensor>(
+            Array2D<cpptype_t<DataType::Float32>, 2, 3>({{{1, 2, 3}, {4, 5, 6}}}));
+
+        const auto T1 =
+            std::make_shared<Tensor>(Array1D<cpptype_t<DataType::Float32>, 3>({0.1, 0.2, 0.3}));
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>(
+            Array2D<float, 2, 3>({{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}})));
+        op->forwardDims();
+
+        op->backward();
+
+        const Tensor expectedGrad0 =
+            Array2D<cpptype_t<DataType::Float32>, 2, 3>({{{1, 1, 1}, {1, 1, 1}}});
+
+        const Tensor expectedGrad1 = Array1D<cpptype_t<DataType::Float32>, 3>({2, 2, 2});
+
+
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1));
+    }
+
+    SECTION("Case 2: 3D and 1D tensors") {
+        const auto T0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+            {{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}},
+              {{7.0, 8.0, 9.0}, {10.0, 11.0, 12.0}}}}));
+
+        const auto T1 =
+            std::make_shared<Tensor>(Array1D<float, 3>({0.3, 0.2, 0.1}));
+
+        const auto newGrad = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+            {{{{1, 1, 1}, {1, 1, 1}}, {{1, 1, 1}, {1, 1, 1}}}}));
+
+        const Tensor expectedGrad0 =
+            Array3D<float, 2, 2, 3>({{{{1, 1, 1}, {1, 1, 1}},
+                                      {{1, 1, 1}, {1, 1, 1}}}});
+
+        const Tensor expectedGrad1 = Array1D<cpptype_t<DataType::Float32>, 3>({4, 4, 4});
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(newGrad);
+        op->forwardDims();
+        op->backward();
+
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1));
+    }
+
+    SECTION("Case 3: 4D and 2D tensors") {
+        const auto T0 = std::make_shared<Tensor>(Array4D<cpptype_t<DataType::Float32>, 2, 2, 3, 3>(
+            {{{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}, {7.0, 8.0, 9.0}},
+               {{10.0, 11.0, 12.0}, {13.0, 14.0, 15.0}, {16.0, 17.0, 18.0}}},
+              {{{19.0, 20.0, 21.0}, {22.0, 23.0, 24.0}, {25.0, 26.0, 27.0}},
+               {{28.0, 29.0, 30.0},
+                {31.0, 32.0, 33.0},
+                {34.0, 35.0, 36.0}}}}}));
+
+        const auto T1 = std::make_shared<Tensor>(Array2D<cpptype_t<DataType::Float32>, 3, 3>(
+            {{{0.5, 0.3, 0.1}, {0.4, 0.2, 0.6}, {0.7, 0.8, 0.9}}}));
+
+        const auto newGrad =
+            std::make_shared<Tensor>(Array4D<cpptype_t<DataType::Float32>, 2, 2, 3, 3>(
+                {{{{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}},
+                   {{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}},
+                  {{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}},
+                   {{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}}}}));
+
+        const Tensor expectedGrad0 =
+            Array4D<cpptype_t<DataType::Float32>, 2, 2, 3, 3>(
+                {{{{{1, 1, 1}, {1, 1, 1}, {1, 1, 1}},
+                   {{1, 1, 1}, {1, 1, 1}, {1, 1, 1}}},
+                  {{{1, 1, 1}, {1, 1, 1}, {1, 1, 1}},
+                   {{1, 1, 1}, {1, 1, 1}, {1, 1, 1}}}}});
+
+        const Tensor expectedGrad1 =
+            Array2D<cpptype_t<DataType::Float32>, 3, 3>({{
+                                   {4.0, 4.0, 4.0},
+                                   {4.0, 4.0, 4.0},
+                                   {4.0, 4.0, 4.0}}});
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(newGrad);
+        op->forwardDims();
+
+        op->backward();
+
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1));
+    }
+
+    SECTION("Case 4: 3D and 2D tensors") {
+        const auto T0 = std::make_shared<Tensor>(
+            Array3D<float, 2, 3, 4>({{{
+                                          {1.0, 2.0, 3.0, 4.0},
+                                          {5.0, 6.0, 7.0, 8.0},
+                                          {9.0, 10.0, 11.0, 12.0},
+                                      },
+                                      {
+                                          {13.0, 14.0, 15.0, 16.0},
+                                          {17.0, 18.0, 19.0, 20.0},
+                                          {21.0, 22.0, 23.0, 24.0},
+                                      }}}));
+
+        const auto T1 = std::make_shared<Tensor>(
+            Array2D<cpptype_t<DataType::Float32>, 3, 4>({{{0.1, 0.2, 0.3, 0.4},
+                                   {0.5, 0.6, 0.7, 0.8},
+                                   {0.9, 1.0, 1.1, 1.2}}}));
+
+        const auto newGrad = std::make_shared<Tensor>(
+            Array3D<cpptype_t<DataType::Float32>, 2, 3, 4>({{{
+                                          {1.0, 1.0, 1.0, 1.0},
+                                          {1.0, 1.0, 1.0, 1.0},
+                                          {1.0, 1.0, 1.0, 1.0},
+                                      },
+                                      {
+                                          {1.0, 1.0, 1.0, 1.0},
+                                          {1.0, 1.0, 1.0, 1.0},
+                                          {1.0, 1.0, 1.0, 1.0},
+                                      }}}));
+
+        const Tensor expectedGrad0 =
+            Array3D<cpptype_t<DataType::Float32>, 2, 3, 4>({{{{1, 1, 1, 1},
+                                       {1, 1, 1, 1},
+                                       {1, 1, 1, 1}},
+                                      {{1, 1, 1, 1},
+                                       {1, 1, 1, 1},
+                                       {1, 1, 1, 1}}}});
+
+        const Tensor expectedGrad1 =
+            Array2D<cpptype_t<DataType::Float32>, 3, 4>({{{2.0, 2.0, 2.0, 2.0},
+                                   {2.0, 2.0, 2.0, 2.0},
+                                   {2.0, 2.0, 2.0, 2.0}}});
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(newGrad);
+        op->forwardDims();
+
+        op->backward();
+
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1));
+    }
+
+    SECTION("Case 5: Tensors with random values") {
+
+        // Use random values
+        const std::vector<std::size_t> dims0 = {5, 2, 1, 7}; // First tensor
+        const std::vector<std::size_t> dims1 = {2, 6, 7};    // Second tensor
+        const std::vector<std::size_t> outputDims = {5, 2, 6, 7};
+
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_real_distribution<float> dist(0.1f, 1.0f);
+
+        auto T0 = std::make_shared<Tensor>(dims0);
+        T0->setDataType(DataType::Float32);
+        T0->setBackend("cpu");
+        float* input0Data = static_cast<float*>(T0->getImpl()->rawPtr());
+        // Fill with random values
+        for (std::size_t i = 0; i < T0->size(); ++i) {
+            input0Data[i] = dist(gen);
+        }
+
+        auto T1 = std::make_shared<Tensor>(dims1);
+        T1->setDataType(DataType::Float32);
+        T1->setBackend("cpu");
+        float* input1Data = static_cast<float*>(T1->getImpl()->rawPtr());
+        // Fill with random values
+        for (std::size_t i = 0; i < T1->size(); ++i) {
+            input1Data[i] = dist(gen);
+        }
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+
+        op->forwardDims();
+        op->forward();
+
+        Tensor expectedOutput{outputDims};
+        expectedOutput.setBackend("cpu");
+        float* expectedOutputData = static_cast<float*>(expectedOutput.getImpl()->rawPtr());
+
+        for (std::size_t n = 0; n < 5; ++n) {
+            for (std::size_t c = 0; c < 2; ++c) {
+                for (std::size_t h = 0; h < 6; ++h) {
+                    for (std::size_t w = 0; w < 7; ++w) {
+                        std::size_t outIdx = w + 7 * (h + 6 * (c + 2 * n));
+                        std::size_t in0Idx =
+                            w + 7 * (0 + 1 * (c + 2 * n)); // middle dim is 1
+                        std::size_t in1Idx =
+                            w + 7 * (h + 6 * c);           // no n dimension
+
+                        expectedOutputData[outIdx] = input0Data[in0Idx] + input1Data[in1Idx];
+                    }
+                }
+            }
+        }
+
+        auto outputTensor = op->getOutput(0);
+
+        REQUIRE(approxEq<float>(*outputTensor, expectedOutput));
+
+        // Backward pass
+        std::vector<float> gradOutputData(expectedOutput.size());
+        for (auto &val : gradOutputData) {
+            val = dist(gen);
+        }
+
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>());
+        op->getOutput(0)->grad()->resize(outputDims);
+        op->getOutput(0)->grad()->getImpl()->setRawPtr(gradOutputData.data(),
+                                                       expectedOutput.size());
+
+        // Compute reference gradients
+        std::vector<float> expectedGrad0(T0->size(), 0.0f);
+        std::vector<float> expectedGrad1(T1->size(), 0.0f);
+
+        for (std::size_t n = 0; n < 5; ++n) {
+            for (std::size_t c = 0; c < 2; ++c) {
+                for (std::size_t h = 0; h < 6; ++h) {
+                    for (std::size_t w = 0; w < 7; ++w) {
+                        std::size_t outIdx = w + 7 * (h + 6 * (c + 2 * n));
+                        std::size_t in0Idx = w + 7 * (0 + 1 * (c + 2 * n));
+                        std::size_t in1Idx = w + 7 * (h + 6 * c);
+
+                        // Gradient for input0: just accumulate grad_output
+                        expectedGrad0[in0Idx] += gradOutputData[outIdx];
+
+                        // Gradient for input1: just accumulate grad_output
+                        expectedGrad1[in1Idx] += gradOutputData[outIdx];
+                    }
+                }
+            }
+        }
+
+        // Perform backward pass
+        op->backward();
+
+        auto expectedGrad0Tensor = std::make_shared<Tensor>();
+        expectedGrad0Tensor->resize(T0->dims());
+        expectedGrad0Tensor->setBackend("cpu");
+        expectedGrad0Tensor->setDataType(DataType::Float32);
+        expectedGrad0Tensor->getImpl()->setRawPtr(expectedGrad0.data(),
+                                                    expectedGrad0.size());
+
+        auto expectedGrad1Tensor = std::make_shared<Tensor>(T1->dims());
+        expectedGrad1Tensor->setBackend("cpu");
+        expectedGrad1Tensor->setDataType(DataType::Float32);
+        expectedGrad1Tensor->getImpl()->setRawPtr(expectedGrad1.data(),
+                                                    expectedGrad1.size());
+
+        // Verify backward pass
+        REQUIRE(approxEq<float>(*T0->grad(), *expectedGrad0Tensor));
+        REQUIRE(approxEq<float>(*T1->grad(), *expectedGrad1Tensor));
+    }
+}
+
-- 
GitLab
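
As a standalone illustration of the index arithmetic used by the reference
loops in the random-value section above (a minimal sketch, independent of
Aidge; flatIndex is a hypothetical helper, not an Aidge API):

#include <cassert>
#include <cstddef>
#include <vector>

// Row-major flattening: for shape {N, C, H, W}, the flat index of
// (n, c, h, w) is w + W*(h + H*(c + C*n)). Size-1 (broadcast) dimensions
// are pinned to index 0, exactly as in the test's in0Idx/in1Idx formulas.
std::size_t flatIndex(const std::vector<std::size_t>& dims,
                      const std::vector<std::size_t>& idx) {
    std::size_t flat = 0;
    for (std::size_t d = 0; d < dims.size(); ++d) {
        flat = flat * dims[d] + (dims[d] == 1 ? 0 : idx[d]);
    }
    return flat;
}

int main() {
    // Mirrors the test: output {5,2,6,7}, input0 {5,2,1,7}, input1 {2,6,7}.
    const std::vector<std::size_t> out{5, 2, 6, 7};
    const std::vector<std::size_t> in0{5, 2, 1, 7};
    const std::vector<std::size_t> in1{1, 2, 6, 7}; // left-padded with 1
    const std::vector<std::size_t> pos{3, 1, 4, 2}; // (n, c, h, w)
    assert(flatIndex(out, pos) == 2 + 7 * (4 + 6 * (1 + 2 * 3))); // outIdx
    assert(flatIndex(in0, pos) == 2 + 7 * (0 + 1 * (1 + 2 * 3))); // in0Idx
    assert(flatIndex(in1, pos) == 2 + 7 * (4 + 6 * 1));           // in1Idx
    return 0;
}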


From 393fb207a6599cdfbbbe141e3cb29a3a5cae8246 Mon Sep 17 00:00:00 2001
From: NAUD Maxence <maxence.naud@cea.fr>
Date: Wed, 26 Feb 2025 14:48:17 +0000
Subject: [PATCH 021/108] [upd] ConstantOfShape kernel to use Tensors as inputs
 and avoid redundant size computation

---
 .../cpu/operator/ConstantOfShapeImpl.hpp        |  8 +++-----
 .../operator/ConstantOfShapeImpl_kernels.hpp    | 17 ++++-------------
 src/operator/ConstantOfShapeImpl.cpp            |  9 +++------
 3 files changed, 10 insertions(+), 24 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp b/include/aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp
index 83e7e030..b595ec93 100644
--- a/include/aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp
+++ b/include/aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp
@@ -12,23 +12,21 @@
 #ifndef AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_H_
 #define AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_H_
 
-#include <cstddef>
 #include <memory>
-#include <vector>
 
 #include "aidge/backend/cpu/operator/OperatorImpl.hpp"
 #include "aidge/operator/ConstantOfShape.hpp"
 #include "aidge/utils/Registrar.hpp"
-#include "aidge/utils/Types.h"
 
 namespace Aidge {
+
+class Tensor;
 // Operator implementation entry point for the backend
 using ConstantOfShapeImpl_cpu = OperatorImpl_cpu<ConstantOfShape_Op,
-    void(const std::vector<DimSize_t>, const Tensor&, void *)>;
+    void(const std::shared_ptr<Tensor>&, const Tensor&)>;
 
 // Implementation entry point registration to Operator
 REGISTRAR(ConstantOfShape_Op, "cpu", Aidge::ConstantOfShapeImpl_cpu::create);
 } // namespace Aidge
 
 #endif /* _AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_H_ */
-
diff --git a/include/aidge/backend/cpu/operator/ConstantOfShapeImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConstantOfShapeImpl_kernels.hpp
index 18ab9c0a..c42cc76a 100644
--- a/include/aidge/backend/cpu/operator/ConstantOfShapeImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ConstantOfShapeImpl_kernels.hpp
@@ -30,20 +30,11 @@
 namespace Aidge {
 template <class O>
 void ConstantOfShapeimpl_cpu_forward_kernel(
-    const std::vector<DimSize_t> output_dims, const Tensor &value,
-    void *output_) {
+    const std::shared_ptr<Tensor>& output_, const Tensor &value) {
 
-  O *output = static_cast<O *>(output_);
-  O val;
-  std::copy(static_cast<O *>(value.getImpl()->hostPtr()),
-            static_cast<O *>(value.getImpl()->hostPtr()) +
-                static_cast<NbElts_t>(1),
-            &val);
-  const size_t output_size = std::accumulate(
-      output_dims.begin(), output_dims.end(), 1, std::multiplies<DimSize_t>());
-  for (size_t i = 0; i < output_size; ++i) {
-    output[i] = val;
-  }
+  O* output = static_cast<O*>(output_->getImpl()->hostPtr());
+  const O val = *static_cast<O*>(value.getImpl()->hostPtr());
+  std::fill_n(output, output_->size(), val);
 }
 
 // Kernels registration to implementation entry point
diff --git a/src/operator/ConstantOfShapeImpl.cpp b/src/operator/ConstantOfShapeImpl.cpp
index 16e4b762..1d41160b 100644
--- a/src/operator/ConstantOfShapeImpl.cpp
+++ b/src/operator/ConstantOfShapeImpl.cpp
@@ -13,15 +13,14 @@
 
 #include <functional>
 #include <memory>
-#include <vector>
+#include <stdexcept>   // std::runtime_error
 
 #include "aidge/backend/cpu/operator/ConstantOfShapeImpl_kernels.hpp"
-#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/ConstantOfShape.hpp"
+#include "aidge/backend/OperatorImpl.hpp"  // Aidge::getBestMatch, Aidge::getRequiredSpec
 #include "aidge/utils/ErrorHandling.hpp"
 #include "aidge/utils/Registrar.hpp"
-#include "aidge/utils/Types.h"
 
 template <>
 void Aidge::ConstantOfShapeImpl_cpu::forward() {
@@ -33,9 +32,7 @@ void Aidge::ConstantOfShapeImpl_cpu::forward() {
     const auto impl = Registrar<ConstantOfShapeImpl_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Call kernel
-    impl.forward(op_.getOutput(0)->dims(),
-             op_.value(), 
-             op_.getOutput(0)->getImpl()->rawPtr());
+    impl.forward(op_.getOutput(0), op_.value());
 }
 
 template <>
-- 
GitLab
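
The rewritten kernel boils down to reading one scalar from the value tensor
and filling the whole output buffer. A minimal self-contained sketch of that
logic (plain C++, without the Aidge Tensor types; the helper name is
illustrative only):

#include <algorithm> // std::fill_n
#include <cstddef>
#include <iostream>
#include <vector>

// Same two steps as the patched kernel: read the single value, fill output.
template <class O>
void constant_of_shape_fill(O* output, std::size_t size, const void* value) {
    const O val = *static_cast<const O*>(value);
    std::fill_n(output, size, val);
}

int main() {
    std::vector<float> out(6);
    const float v = 3.14f;
    constant_of_shape_fill(out.data(), out.size(), &v);
    for (float x : out) std::cout << x << ' '; // prints 3.14 six times
    std::cout << '\n';
    return 0;
}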


From 9d9647aa0f91f637c5cd063b78b8a68075c2294e Mon Sep 17 00:00:00 2001
From: NAUD Maxence <maxence.naud@cea.fr>
Date: Wed, 26 Feb 2025 14:51:38 +0000
Subject: [PATCH 022/108] [upd] tests following 'aidge_core' changes

---
 .../operator/Test_ConstantOfShapeImpl.cpp     | 139 +++++++++---------
 .../recipies/Test_FoldConstantOfShape.cpp     |  50 +++++++
 2 files changed, 119 insertions(+), 70 deletions(-)
 create mode 100644 unit_tests/recipies/Test_FoldConstantOfShape.cpp

diff --git a/unit_tests/operator/Test_ConstantOfShapeImpl.cpp b/unit_tests/operator/Test_ConstantOfShapeImpl.cpp
index 8ec1669b..6833d836 100644
--- a/unit_tests/operator/Test_ConstantOfShapeImpl.cpp
+++ b/unit_tests/operator/Test_ConstantOfShapeImpl.cpp
@@ -27,89 +27,88 @@
 #include "aidge/data/Tensor.hpp"
 #include "aidge/filler/Filler.hpp"
 #include "aidge/operator/ConstantOfShape.hpp"
-#include "aidge/operator/OperatorTensor.hpp"
 #include "aidge/utils/TensorUtils.hpp"
 #include "aidge/utils/Types.h"
 
 namespace Aidge {
-TEST_CASE("[cpu/operator] ConstantOfShape", "[ConstantOfShape][CPU]") {
-  constexpr std::uint16_t NBTRIALS = 10;
-  // Create a random number generator
-  auto random_seed = Catch::Generators::Detail::getSeed;
-  std::mt19937 gen(random_seed());
-  std::uniform_real_distribution<float> valueDist(
-      0.1f, 1.1f); // Random float distribution between 0 and 1
-  std::uniform_int_distribution<DimSize_t> input_tensor_size_dist(
-      std::size_t(1), std::size_t(10));
-  std::uniform_int_distribution<int64_t> input_tensor_values_dist(
-      std::size_t(1), std::size_t(7));
-  std::uniform_real_distribution<double> operator_attr_value_dist(-100., 100.);
 
-  ///////////////////////////////////////////////
-  // SETUP FUNCTIONS
-  auto generate_input_tensor =
-      [&gen, &input_tensor_size_dist,
-       &input_tensor_values_dist]() -> std::shared_ptr<Tensor> {
-    std::vector<DimSize_t> input_dims;
-    input_dims.push_back(input_tensor_size_dist(gen));
+TEST_CASE("[cpu/operator] ConstantOfShape(forward)", "[ConstantOfShape][CPU][forward]") {
+    constexpr std::uint16_t NBTRIALS = 10;
+    // Create a random number generator
+    auto random_seed = Catch::Generators::Detail::getSeed;
+    std::mt19937 gen(random_seed());
+    std::uniform_real_distribution<float> valueDist(
+            0.1f, 1.1f); // Random float distribution between 0 and 1
+    std::uniform_int_distribution<DimSize_t> input_tensor_size_dist(
+            std::size_t(1), std::size_t(10));
+    std::uniform_int_distribution<int64_t> input_tensor_values_dist(
+            std::size_t(1), std::size_t(7));
+    std::uniform_real_distribution<double> operator_attr_value_dist(-100., 100.);
 
-    auto result = std::make_shared<Tensor>(input_dims);
-    result->setDataType(DataType::Int64);
-    result->setBackend("cpu");
-    for (DimSize_t i = 0; i < result->size(); ++i) {
-      result->set<std::int64_t>(i, input_tensor_values_dist(gen));
-    }
-    return result;
-  };
+    ///////////////////////////////////////////////
+    // SETUP FUNCTIONS
+    auto generate_input_tensor =
+            [&gen, &input_tensor_size_dist,
+             &input_tensor_values_dist]() -> std::shared_ptr<Tensor> {
+        std::vector<DimSize_t> input_dims;
+        input_dims.push_back(input_tensor_size_dist(gen));
 
-  auto generate_random_operator =
-      [&gen,
-       &operator_attr_value_dist]() -> std::shared_ptr<ConstantOfShape_Op> {
-    auto node = ConstantOfShape(Tensor(operator_attr_value_dist(gen)));
-    auto op = std::static_pointer_cast<ConstantOfShape_Op>(node->getOperator());
-    op->setDataType(DataType::Float64);
-    op->setBackend("cpu");
-    return op;
-  };
+        auto result = std::make_shared<Tensor>(input_dims);
+        result->setDataType(DataType::Int64);
+        result->setBackend("cpu");
+        for (DimSize_t i = 0; i < result->size(); ++i) {
+            result->set<std::int64_t>(i, input_tensor_values_dist(gen));
+        }
+        return result;
+    };
 
-  auto generate_output_tensor = [](std::shared_ptr<Tensor> input_tensor,
-                                   std::shared_ptr<ConstantOfShape_Op> op) {
-    std::vector<DimSize_t> output_dims;
-    output_dims.reserve(input_tensor->size());
-    for (DimSize_t i = 0; i < input_tensor->size(); ++i) {
-      output_dims.push_back(input_tensor->get<int64_t>(i));
-    }
-    auto result = std::make_shared<Tensor>(output_dims);
-    result->setDataType(op->value().dataType());
-    result->setBackend("cpu");
-    constantFiller(result, op->value().get<double>(0));
-    return result;
-  };
+    auto generate_random_operator =
+            [&gen,
+             &operator_attr_value_dist]() -> std::shared_ptr<ConstantOfShape_Op> {
+        std::shared_ptr<ConstantOfShape_Op> op = std::make_shared<ConstantOfShape_Op>(Tensor(operator_attr_value_dist(gen)));
+        op->setDataType(DataType::Float64);
+        op->setBackend("cpu");
+        return op;
+    };
+
+    auto generate_output_tensor = [](std::shared_ptr<Tensor> input_tensor,
+                                      std::shared_ptr<ConstantOfShape_Op> op) {
+        std::vector<DimSize_t> output_dims;
+        output_dims.reserve(input_tensor->size());
+        for (DimSize_t i = 0; i < input_tensor->size(); ++i) {
+            output_dims.push_back(input_tensor->get<std::int64_t>(i));
+        }
+        auto result = std::make_shared<Tensor>(output_dims);
+        result->setDataType(op->value().dataType());
+        result->setBackend("cpu");
+        constantFiller(result, op->value().get<double>(0));
+        return result;
+    };
 
-  /////////////////////////////////////
-  // BENCHMARKING
-  std::chrono::time_point<std::chrono::system_clock> start;
-  std::chrono::time_point<std::chrono::system_clock> end;
-  std::chrono::duration<double, std::micro> duration{};
-  int number_of_operation{0};
+    /////////////////////////////////////
+    // BENCHMARKING
+    std::chrono::time_point<std::chrono::system_clock> start;
+    std::chrono::time_point<std::chrono::system_clock> end;
+    std::chrono::duration<double, std::micro> duration{};
+    int number_of_operation{0};
 
-  SECTION("ConstantOfShapeImpl_cpu::forward()") {
-    for (int i = 0; i < NBTRIALS; ++i) {
-      auto input_T = generate_input_tensor();
-      std::shared_ptr<ConstantOfShape_Op> op = generate_random_operator();
-      auto output_T = generate_output_tensor(input_T, op);
-      op->associateInput(0, input_T);
+    SECTION("ConstantOfShapeImpl_cpu::forward()") {
+        for (int i = 0; i < NBTRIALS; ++i) {
+            auto input_T = generate_input_tensor();
+            std::shared_ptr<ConstantOfShape_Op> op = generate_random_operator();
+            auto output_T = generate_output_tensor(input_T, op);
+            op->associateInput(0, input_T);
 
-      REQUIRE(op->forwardDims(true));
-      REQUIRE_NOTHROW(op->forward());
+            REQUIRE(op->forwardDims(true));
+            REQUIRE_NOTHROW(op->forward());
 
-      CHECK(output_T->nbDims() == op->getOutput(0)->nbDims());
-      for (DimIdx_t i = 0; i < output_T->nbDims(); ++i) {
-        CHECK(output_T->dims().at(i) == op->getOutput(0)->dims().at(i));
-      }
-      CHECK(approxEq<double>(*output_T, *op->getOutput(0)));
+            CHECK(output_T->nbDims() == op->getOutput(0)->nbDims());
+            for (DimIdx_t i = 0; i < output_T->nbDims(); ++i) {
+                CHECK(output_T->dims().at(i) == op->getOutput(0)->dims().at(i));
+            }
+            CHECK(approxEq<double>(*output_T, *op->getOutput(0)));
+        }
     }
-  }
 }
 } // namespace Aidge
 
diff --git a/unit_tests/recipies/Test_FoldConstantOfShape.cpp b/unit_tests/recipies/Test_FoldConstantOfShape.cpp
new file mode 100644
index 00000000..a1c09b15
--- /dev/null
+++ b/unit_tests/recipies/Test_FoldConstantOfShape.cpp
@@ -0,0 +1,50 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+ #include "aidge/graph/GraphView.hpp"
+ #include "aidge/operator/Identity.hpp"
+ #include "aidge/recipes/Recipes.hpp"
+
+ #include <cstdint>  // std::int64_t
+ #include <memory>
+
+ #include <catch2/catch_test_macros.hpp>
+
+ #include "aidge/graph/OpArgs.hpp"
+ #include "aidge/operator/ConstantOfShape.hpp"
+ #include "aidge/operator/Conv.hpp"
+ #include "aidge/operator/Producer.hpp"
+ #include "aidge/operator/ReLU.hpp"
+ #include "aidge/recipes/Recipes.hpp"
+ #include "aidge/utils/ArrayHelpers.hpp"
+ #include "aidge/utils/Types.h"
+
+ namespace Aidge {
+
+ TEST_CASE("[cpu/recipes] foldConstantOfShape",
+           "[ConstantOfShape][foldConstantOfShape][recipes]") {
+   auto input_T = std::make_shared<Tensor>(Array1D<std::int64_t, 4>({1, 1, 3, 3}));
+
+   auto model = std::make_shared<GraphView>();
+   SECTION("Sequential model") {
+     model = Sequential({
+         Producer(input_T, "prod_0", true),
+         ConstantOfShape(3, "constantOfShape_0"),
+         Conv(1, 1, {3, 3}, "Conv_0"),
+         ReLU("ReLU_1")
+     });
+     // aidge_backend_cpu loaded. Recipe should work
+     REQUIRE(foldConstantOfShape(model) == 1);
+     CHECK(model->forwardDims());
+   }
+ }
+
+ }  // namespace Aidge
-- 
GitLab
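
For reference, the contract that both the test's generate_output_tensor
lambda and the fold recipe rely on: the operator's single input is a 1-D
Int64 tensor whose values become the output dimensions, and every output
element equals the value attribute. A minimal sketch under those assumptions
(plain C++, no Aidge types):

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    // Shape input as in Test_FoldConstantOfShape.cpp: {1, 1, 3, 3};
    // fill value 3.0 as passed to ConstantOfShape(3, "constantOfShape_0").
    const std::vector<std::int64_t> shapeInput{1, 1, 3, 3};
    const double fillValue = 3.0;

    const std::vector<std::size_t> outputDims(shapeInput.begin(), shapeInput.end());
    std::size_t size = 1;
    for (const auto d : outputDims) size *= d;

    const std::vector<double> output(size, fillValue); // 9 elements, all 3.0
    std::cout << "size=" << output.size() << " value=" << output.front() << '\n';
    return 0;
}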


From 4453b5fb94c0c2f5f5ea7de5b2519c4a788eb9e5 Mon Sep 17 00:00:00 2001
From: Jerome Hue <jerome.hue@cea.fr>
Date: Thu, 6 Feb 2025 11:59:50 +0100
Subject: [PATCH 023/108] Implement backward function for Div operator

---
 .../aidge/backend/cpu/operator/DivImpl.hpp    |  13 +-
 .../backend/cpu/operator/DivImpl_kernels.hpp  |  61 +++-
 src/operator/DivImpl.cpp                      |  23 +-
 unit_tests/operator/Test_DivImpl.cpp          | 271 ++++++++++++++++++
 4 files changed, 363 insertions(+), 5 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/DivImpl.hpp b/include/aidge/backend/cpu/operator/DivImpl.hpp
index 40c1b678..a507690b 100644
--- a/include/aidge/backend/cpu/operator/DivImpl.hpp
+++ b/include/aidge/backend/cpu/operator/DivImpl.hpp
@@ -24,7 +24,18 @@
 namespace Aidge {
 // Operator implementation entry point for the backend
 using DivImpl_cpu = OperatorImpl_cpu<Div_Op,
-    void(const std::size_t, const std::size_t, const std::size_t, const void*, const void*,void*)>;
+    void(const std::size_t, const std::size_t, const std::size_t, const void*, const void*,void*),
+    void(const std::size_t,
+        const std::size_t,
+        const std::size_t,
+        const std::vector<std::size_t>&,
+        const std::vector<std::size_t>&,
+        const std::vector<std::size_t>&,
+        const void*,
+        const void*,
+        const void*,
+        void*,
+        void*)>;
 
 // Implementation entry point registration to Operator
 REGISTRAR(Div_Op, "cpu", Aidge::DivImpl_cpu::create);
diff --git a/include/aidge/backend/cpu/operator/DivImpl_kernels.hpp b/include/aidge/backend/cpu/operator/DivImpl_kernels.hpp
index ed6e55a7..5d3ee7f6 100644
--- a/include/aidge/backend/cpu/operator/DivImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/DivImpl_kernels.hpp
@@ -17,6 +17,7 @@
 #include <cstdint>     // std::int32_t, std::int64_t
 #include <functional>  // std::multiplies
 
+#include "aidge/backend/cpu/operator/MulImpl_kernels.hpp"
 #include "aidge/utils/Registrar.hpp"
 
 #include "aidge/backend/cpu/data/Broadcasting.hpp"
@@ -69,16 +70,70 @@ constexpr void DivImpl_cpu_forward_kernel(const std::size_t input1size_,
     }
 }
 
+
+template <class I1, class I2, class O>
+void DivImpl_cpu_backward_kernel(const std::size_t input0Length,
+                               const std::size_t input1Length,
+                               const std::size_t gradOutputLength,
+                               const std::vector<std::size_t>& dims0,
+                               const std::vector<std::size_t>& dims1,
+                               const std::vector<std::size_t>& outputDims,
+                               const void* input0_,
+                               const void* input1_,
+                               const void* grad_output_,
+                               void* gradientInput0_,
+                               void* gradientInput1_)
+{
+    const I1* input0 = static_cast<const I1*>(input0_);  // a
+    const I2* input1 = static_cast<const I2*>(input1_);  // b
+    const O* grad_output = static_cast<const O*>(grad_output_);
+    auto* grad_input_0 = static_cast<I1*>(gradientInput0_);  // gradient w.r.t. a
+    auto* grad_input_1 = static_cast<I2*>(gradientInput1_);  // gradient w.r.t. b
+
+    std::fill_n(grad_input_0, input0Length, static_cast<I1>(0));
+    std::fill_n(grad_input_1, input1Length, static_cast<I2>(0));
+
+    // Broadcast dims0 and dims1 to match the shape of outputDims
+    auto broadcastedDims0 = getBroadcastedDims(outputDims, dims0);
+    auto broadcastedDims1 = getBroadcastedDims(outputDims, dims1);
+
+    for (std::size_t i = 0; i < gradOutputLength; ++i) {
+        auto idxOutputGrad = getMultiDimIndices(outputDims, i);
+        std::vector<std::size_t> idxInput0(broadcastedDims0.size());
+        std::vector<std::size_t> idxInput1(broadcastedDims1.size());
+
+        // Map output indices to input indices, considering broadcasting
+        for (std::size_t dimension = 0; dimension < broadcastedDims0.size(); ++dimension) {
+            idxInput0[dimension] = (broadcastedDims0[dimension] == 1) ? 0 : idxOutputGrad[dimension];
+        }
+
+        for (std::size_t dimension = 0; dimension < broadcastedDims1.size(); ++dimension) {
+            idxInput1[dimension] = (broadcastedDims1[dimension] == 1) ? 0 : idxOutputGrad[dimension];
+        }
+
+        auto idx0 = getFlattenedIndex(broadcastedDims0, idxInput0);
+        auto idx1 = getFlattenedIndex(broadcastedDims1, idxInput1);
+
+        // grad_a = grad_output * (1/b)
+        grad_input_0[idx0] += static_cast<I1>(grad_output[i] / input1[idx1]);
+        
+        // grad_b = grad_output * (-a/b²)
+        grad_input_1[idx1] += static_cast<I2>(grad_output[i] * (-input0[idx0] / (input1[idx1] * input1[idx1])));
+    }
+}
+
+
 // Kernels registration to implementation entry point
 REGISTRAR(DivImpl_cpu,
     {DataType::Float32},
-    {ProdConso::inPlaceModel, Aidge::DivImpl_cpu_forward_kernel<float, float, float>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::DivImpl_cpu_forward_kernel<float, float, float>, Aidge::DivImpl_cpu_backward_kernel<float, float, float>});
 REGISTRAR(DivImpl_cpu,
     {DataType::Float64},
-    {ProdConso::inPlaceModel, Aidge::DivImpl_cpu_forward_kernel<double, double, double>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::DivImpl_cpu_forward_kernel<double, double, double>, Aidge::DivImpl_cpu_backward_kernel<double, double, double>});
 REGISTRAR(DivImpl_cpu,
     {DataType::Int32},
-    {ProdConso::inPlaceModel, Aidge::DivImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::DivImpl_cpu_forward_kernel<std::int32_t, std::int32_t, std::int32_t>, 
+          Aidge::DivImpl_cpu_backward_kernel<std::int32_t, std::int32_t, std::int32_t>});
 }  // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_DIVIMPL_KERNELS_H_ */
diff --git a/src/operator/DivImpl.cpp b/src/operator/DivImpl.cpp
index 135b32b5..67444cb8 100644
--- a/src/operator/DivImpl.cpp
+++ b/src/operator/DivImpl.cpp
@@ -152,5 +152,26 @@ void Aidge::DivImpl_cpu::forward() {
 
 template <>
 void Aidge::DivImpl_cpu::backward() {
-    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Div_Op on backend cpu");
+    const Div_Op& op_ = dynamic_cast<const Div_Op&>(mOp);
+
+    auto in0 = op_.getInput(0);
+    auto in1 = op_.getInput(1);
+    auto in0grad = op_.getInput(0)->grad();
+    auto in1grad = op_.getInput(1)->grad();
+    auto out0grad = op_.getOutput(0)->grad();
+
+    const auto impl = Registrar<DivImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    impl.backward(in0grad->size(),
+               in1grad->size(),
+               out0grad->size(),
+               in0->dims(),
+               in1->dims(),
+               out0grad->dims(),
+               getCPUPtr(in0),
+               getCPUPtr(in1),
+               getCPUPtr(out0grad),
+               getCPUPtr(in0grad),
+               getCPUPtr(in1grad));
 }
+
diff --git a/unit_tests/operator/Test_DivImpl.cpp b/unit_tests/operator/Test_DivImpl.cpp
index 4037b2ad..4e7657ed 100644
--- a/unit_tests/operator/Test_DivImpl.cpp
+++ b/unit_tests/operator/Test_DivImpl.cpp
@@ -322,4 +322,275 @@ TEST_CASE("[cpu/operator] Div", "[Div][CPU]") {
         }
     }
 }
+
+TEST_CASE("[CPU/Operator] Div(Backward)", "[Div][CPU][Backward]") {
+    std::shared_ptr<Div_Op> op = std::make_shared<Div_Op>();
+    op->setDataType(DataType::Float32);
+    op->setBackend("cpu");
+
+    // NOTE: The first four tests use fixed values; the last one uses random values with static dimensions.
+
+    SECTION("Case 1: 1D and 2D Tensors") {
+        const auto T0 = std::make_shared<Tensor>(
+            Array2D<cpptype_t<DataType::Float32>, 2, 3>({{{1, 2, 3}, {4, 5, 6}}}));
+
+        const auto T1 =
+            std::make_shared<Tensor>(Array1D<cpptype_t<DataType::Float32>, 3>({0.1, 0.2, 0.3}));
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>(
+            Array2D<float, 2, 3>({{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}})));
+        op->forwardDims();
+
+        op->backward();
+
+        const Tensor expectedGrad0 =
+            Array2D<cpptype_t<DataType::Float32>, 2, 3>({{{10, 5, 3.3333}, {10, 5, 3.3333}}});
+
+        const Tensor expectedGrad1 = Array1D<cpptype_t<DataType::Float32>, 3>({-500, -175, -100});
+
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1));
+    }
+
+    SECTION("Case 2: 3D and 1D tensors") {
+        const auto T0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+            {{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}},
+              {{7.0, 8.0, 9.0}, {10.0, 11.0, 12.0}}}}));
+
+        const auto T1 =
+            std::make_shared<Tensor>(Array1D<float, 3>({0.3, 0.2, 0.1}));
+
+        const auto newGrad = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+            {{{{1, 1, 1}, {1, 1, 1}}, {{1, 1, 1}, {1, 1, 1}}}}));
+
+        const Tensor expectedGrad0 =
+            Array3D<float, 2, 2, 3>({{{{3.3333, 5.0, 10}, {3.3333, 5.0, 10}},
+                                      {{3.3333, 5.0, 10}, {3.3333, 5.0, 10}}}});
+
+        const Tensor expectedGrad1 = Array1D<cpptype_t<DataType::Float32>, 3>({-244.4444, -650.0, -3000.0});
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(newGrad);
+        op->forwardDims();
+
+        op->backward();
+
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1));
+    }
+
+    SECTION("Case 3: 4D and 2D tensors") {
+        const auto T0 = std::make_shared<Tensor>(Array4D<cpptype_t<DataType::Float32>, 2, 2, 3, 3>(
+            {{{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}, {7.0, 8.0, 9.0}},
+               {{10.0, 11.0, 12.0}, {13.0, 14.0, 15.0}, {16.0, 17.0, 18.0}}},
+              {{{19.0, 20.0, 21.0}, {22.0, 23.0, 24.0}, {25.0, 26.0, 27.0}},
+               {{28.0, 29.0, 30.0},
+                {31.0, 32.0, 33.0},
+                {34.0, 35.0, 36.0}}}}}));
+
+        const auto T1 = std::make_shared<Tensor>(Array2D<cpptype_t<DataType::Float32>, 3, 3>(
+            {{{0.5, 0.3, 0.1}, {0.4, 0.2, 0.6}, {0.7, 0.8, 0.9}}}));
+
+        const auto newGrad =
+            std::make_shared<Tensor>(Array4D<cpptype_t<DataType::Float32>, 2, 2, 3, 3>(
+                {{{{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}},
+                   {{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}},
+                  {{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}},
+                   {{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}}}}));
+
+        const Tensor expectedGrad0 =
+            Array4D<cpptype_t<DataType::Float32>, 2, 2, 3, 3>(
+                {{{{{2, 3.3333, 10}, {2.5, 5.0, 1.66667}, {1.42857, 1.2500, 1.11111}},
+                   {{2, 3.3333, 10}, {2.5, 5.0, 1.66667}, {1.42857, 1.2500, 1.11111}}},
+                  {{{2, 3.3333, 10}, {2.5, 5.0, 1.66667}, {1.42857, 1.2500, 1.11111}},
+                   {{2, 3.3333, 10}, {2.5, 5.0, 1.66667}, {1.42857, 1.2500, 1.11111}}}}});
+
+        const Tensor expectedGrad1 =
+            Array2D<cpptype_t<DataType::Float32>, 3, 3>({{{-232.0, -688.888, -6600.0},
+                                   {-437.5, -1850.0, -216.66667},
+                                   {-167.3469, -134.3750, -111.111}}});
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(newGrad);
+        op->forwardDims();
+
+        op->backward();
+
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1));
+    }
+
+    SECTION("Case 4: 3D and 2D tensors") {
+        const auto T0 = std::make_shared<Tensor>(
+            Array3D<float, 2, 3, 4>({{{
+                                          {1.0, 2.0, 3.0, 4.0},
+                                          {5.0, 6.0, 7.0, 8.0},
+                                          {9.0, 10.0, 11.0, 12.0},
+                                      },
+                                      {
+                                          {13.0, 14.0, 15.0, 16.0},
+                                          {17.0, 18.0, 19.0, 20.0},
+                                          {21.0, 22.0, 23.0, 24.0},
+                                      }}}));
+
+        const auto T1 = std::make_shared<Tensor>(
+            Array2D<cpptype_t<DataType::Float32>, 3, 4>({{{0.1, 0.2, 0.3, 0.4},
+                                   {0.5, 0.6, 0.7, 0.8},
+                                   {0.9, 1.0, 1.1, 1.2}}}));
+
+        const auto newGrad = std::make_shared<Tensor>(
+            Array3D<cpptype_t<DataType::Float32>, 2, 3, 4>({{{
+                                          {1.0, 1.0, 1.0, 1.0},
+                                          {1.0, 1.0, 1.0, 1.0},
+                                          {1.0, 1.0, 1.0, 1.0},
+                                      },
+                                      {
+                                          {1.0, 1.0, 1.0, 1.0},
+                                          {1.0, 1.0, 1.0, 1.0},
+                                          {1.0, 1.0, 1.0, 1.0},
+                                      }}}));
+
+        const Tensor expectedGrad0 =
+            Array3D<cpptype_t<DataType::Float32>, 2, 3, 4>({{{
+                                       {10, 5, 3.33333, 2.5},
+                                       {2, 1.66667, 1.42857, 1.2500},
+                                       {1.11111, 1.0, 0.90909, 0.83333}},
+                                      {{10, 5, 3.33333, 2.5},
+                                       {2, 1.66667, 1.42857, 1.2500},
+                                       {1.11111, 1.0, 0.90909, 0.83333}}}});
+
+        const Tensor expectedGrad1 =
+            Array2D<cpptype_t<DataType::Float32>, 3, 4>({{
+                                   {-1400.0, -400.0, -200.0, -125.0},
+                                   {-88.0, -66.66667, -53.0612, -43.750},
+                                   {-37.0370, -32.0, -28.0992, -25.00}}});
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(newGrad);
+        op->forwardDims();
+
+        op->backward();
+
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1));
+    }
+
+    SECTION("Case 5: Tensors with random values") {
+
+        // Use random values
+        const std::vector<std::size_t> dims0 = {5, 2, 1, 7}; // First tensor
+        const std::vector<std::size_t> dims1 = {2, 6, 7};    // Second tensor
+        const std::vector<std::size_t> outputDims = {5, 2, 6, 7};
+
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_real_distribution<float> dist(0.1f, 1.0f);
+
+        auto T0 = std::make_shared<Tensor>(dims0);
+        T0->setDataType(DataType::Float32);
+        T0->setBackend("cpu");
+        float* input0Data = static_cast<float*>(T0->getImpl()->rawPtr());
+        // Fill with random values
+        for (std::size_t i = 0; i < T0->size(); ++i) {
+            input0Data[i] = dist(gen);
+        }
+
+        auto T1 = std::make_shared<Tensor>(dims1);
+        T1->setDataType(DataType::Float32);
+        T1->setBackend("cpu");
+        float* input1Data = static_cast<float*>(T1->getImpl()->rawPtr());
+        // Fill with random values
+        for (std::size_t i = 0; i < T1->size(); ++i) {
+            input1Data[i] = dist(gen);
+        }
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+
+        op->forwardDims();
+        op->forward();
+
+        Tensor expectedOutput{outputDims};
+        expectedOutput.setBackend("cpu");
+        float* expectedOutputData = static_cast<float*>(expectedOutput.getImpl()->rawPtr());
+
+        for (std::size_t n = 0; n < 5; ++n) {
+            for (std::size_t c = 0; c < 2; ++c) {
+                for (std::size_t h = 0; h < 6; ++h) {
+                    for (std::size_t w = 0; w < 7; ++w) {
+                        std::size_t outIdx = w + 7 * (h + 6 * (c + 2 * n));
+                        std::size_t in0Idx =
+                            w + 7 * (0 + 1 * (c + 2 * n)); // middle dim is 1
+                        std::size_t in1Idx =
+                            w + 7 * (h + 6 * c);           // no n dimension
+
+                        expectedOutputData[outIdx] = input0Data[in0Idx] / input1Data[in1Idx];
+                    }
+                }
+            }
+        }
+
+        auto outputTensor = op->getOutput(0);
+
+        REQUIRE(approxEq<float>(*outputTensor, expectedOutput));
+
+        // Backward pass
+        std::vector<float> gradOutputData(expectedOutput.size());
+        for (auto &val : gradOutputData) {
+            val = dist(gen);
+        }
+
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>());
+        op->getOutput(0)->grad()->resize(outputDims);
+        op->getOutput(0)->grad()->getImpl()->setRawPtr(gradOutputData.data(),
+                                                       expectedOutput.size());
+
+        // Compute reference gradients
+        std::vector<float> expectedGrad0(T0->size(), 0.0f);
+        std::vector<float> expectedGrad1(T1->size(), 0.0f);
+
+        for (std::size_t n = 0; n < 5; ++n) {
+            for (std::size_t c = 0; c < 2; ++c) {
+                for (std::size_t h = 0; h < 6; ++h) {
+                    for (std::size_t w = 0; w < 7; ++w) {
+                        std::size_t outIdx = w + 7 * (h + 6 * (c + 2 * n));
+                        std::size_t in0Idx = w + 7 * (0 + 1 * (c + 2 * n));
+                        std::size_t in1Idx = w + 7 * (h + 6 * c);
+
+                        expectedGrad0[in0Idx] += 
+                            gradOutputData[outIdx] * (1.0f / input1Data[in1Idx]);
+
+                        expectedGrad1[in1Idx] += 
+                            gradOutputData[outIdx] * (-input0Data[in0Idx] / (input1Data[in1Idx] * input1Data[in1Idx]));
+                    }
+                }
+            }
+        }
+
+        // Perform backward pass
+        op->backward();
+
+        auto expectedGrad0Tensor = std::make_shared<Tensor>();
+        expectedGrad0Tensor->resize(T0->dims());
+        expectedGrad0Tensor->setBackend("cpu");
+        expectedGrad0Tensor->setDataType(DataType::Float32);
+        expectedGrad0Tensor->getImpl()->setRawPtr(expectedGrad0.data(),
+                                                    expectedGrad0.size());
+
+        auto expectedGrad1Tensor = std::make_shared<Tensor>(T1->dims());
+        expectedGrad1Tensor->setBackend("cpu");
+        expectedGrad1Tensor->setDataType(DataType::Float32);
+        expectedGrad1Tensor->getImpl()->setRawPtr(expectedGrad1.data(),
+                                                    expectedGrad1.size());
+
+        // Verify backward pass
+        REQUIRE(approxEq<float>(*T0->grad(), *expectedGrad0Tensor));
+        REQUIRE(approxEq<float>(*T1->grad(), *expectedGrad1Tensor));
+    }
+}
 } // namespace Aidge
-- 
GitLab
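
The backward kernel applies the quotient-rule gradients grad_a = g / b and
grad_b = -g * a / b^2 (accumulated over broadcast positions). A quick
standalone finite-difference check of those formulas (a sketch, independent
of Aidge):

#include <cassert>
#include <cmath>

int main() {
    // For y = a / b: dL/da = g / b and dL/db = -g * a / (b * b),
    // matching DivImpl_cpu_backward_kernel above.
    const float a = 2.0f, b = 0.5f, g = 1.0f; // upstream gradient g = dL/dy

    const float grad_a = g / b;            // 2.0
    const float grad_b = -g * a / (b * b); // -8.0

    // Central finite differences; tolerances chosen loosely for float.
    const float eps = 1e-3f;
    const float num_a = ((a + eps) / b - (a - eps) / b) / (2 * eps);
    const float num_b = (a / (b + eps) - a / (b - eps)) / (2 * eps);
    assert(std::fabs(grad_a - num_a) < 1e-2f);
    assert(std::fabs(grad_b - num_b) < 1e-1f);
    return 0;
}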


From a30d9359a29a8c8b2b56129ac6c2324681bfa975 Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Thu, 20 Feb 2025 11:10:50 +0100
Subject: [PATCH 024/108] Added /bigobj for unit tests on Windows

---
 unit_tests/CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/unit_tests/CMakeLists.txt b/unit_tests/CMakeLists.txt
index 6c7af9c3..e1f261d0 100644
--- a/unit_tests/CMakeLists.txt
+++ b/unit_tests/CMakeLists.txt
@@ -25,6 +25,10 @@ target_link_libraries(tests${module_name} PRIVATE ${module_name})
 
 target_link_libraries(tests${module_name} PRIVATE Catch2::Catch2WithMain)
 
+target_compile_options(tests${module_name} PRIVATE
+    $<$<CXX_COMPILER_ID:MSVC>:
+    /bigobj>)
+
 list(APPEND CMAKE_MODULE_PATH ${catch2_SOURCE_DIR}/extras)
 include(CTest)
 include(Catch)
-- 
GitLab


From 45bedc954160b4456a2ed0f4784d731fe34e3b9e Mon Sep 17 00:00:00 2001
From: hrouis <houssemeddine.rouis92@gmail.com>
Date: Fri, 24 Jan 2025 16:06:22 +0100
Subject: [PATCH 025/108] add Equal operator

---
 include/aidge/backend/cpu.hpp                 |   1 +
 .../aidge/backend/cpu/operator/EqualImpl.hpp  |  32 +++
 .../cpu/operator/EqualImpl_kernels.hpp        | 163 ++++++++++++++
 src/operator/EqualImpl.cpp                    |  61 ++++++
 unit_tests/operator/Test_EqualImpl.cpp        | 205 ++++++++++++++++++
 5 files changed, 462 insertions(+)
 create mode 100644 include/aidge/backend/cpu/operator/EqualImpl.hpp
 create mode 100644 include/aidge/backend/cpu/operator/EqualImpl_kernels.hpp
 create mode 100644 src/operator/EqualImpl.cpp
 create mode 100644 unit_tests/operator/Test_EqualImpl.cpp

diff --git a/include/aidge/backend/cpu.hpp b/include/aidge/backend/cpu.hpp
index 5db19a2b..ffc03ae5 100644
--- a/include/aidge/backend/cpu.hpp
+++ b/include/aidge/backend/cpu.hpp
@@ -29,6 +29,7 @@
 #include "aidge/backend/cpu/operator/ConvImpl.hpp"
 #include "aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp"
 #include "aidge/backend/cpu/operator/DivImpl.hpp"
+#include "aidge/backend/cpu/operator/EqualImpl.hpp"
 #include "aidge/backend/cpu/operator/ErfImpl.hpp"
 #include "aidge/backend/cpu/operator/ExpandImpl.hpp"
 #include "aidge/backend/cpu/operator/FCImpl.hpp"
diff --git a/include/aidge/backend/cpu/operator/EqualImpl.hpp b/include/aidge/backend/cpu/operator/EqualImpl.hpp
new file mode 100644
index 00000000..e2489096
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/EqualImpl.hpp
@@ -0,0 +1,32 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_EQUALIMPL_H_
+#define AIDGE_CPU_OPERATOR_EQUALIMPL_H_
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/Equal.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+#include <memory>
+#include <vector>
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+using EqualImpl_cpu = OperatorImpl_cpu<Equal_Op,
+    void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*)>;
+
+// Implementation entry point registration to Operator
+REGISTRAR(Equal_Op, "cpu", Aidge::EqualImpl_cpu::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_EQUALIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/EqualImpl_kernels.hpp b/include/aidge/backend/cpu/operator/EqualImpl_kernels.hpp
new file mode 100644
index 00000000..3c8ff0f4
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/EqualImpl_kernels.hpp
@@ -0,0 +1,163 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_EQUALIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_EQUALIMPL_KERNELS_H_
+
+#include "aidge/backend/cpu/operator/EqualImpl.hpp"
+#include "aidge/utils/Registrar.hpp"
+
+namespace Aidge {
+
+namespace {
+// assumes values are contiguous in memory
+template <class I, class O>
+void equal_contiguous_arrays(const std::size_t input1size,
+                            const std::size_t input2size,
+                            const std::size_t output1size,
+                            const I* input1,
+                            const I* input2,
+                            O* output)
+{
+    for (std::size_t i = 0; i < output1size; ++i)
+    {
+        const std::size_t in1_id = (input1size != 1) ? i : 0;
+        const std::size_t in2_id = (input2size != 1) ? i : 0;
+        output[i] = static_cast<O>(input1[in1_id] == input2[in2_id]);
+    }
+}
+} // anonymous namespace
+
+
+template <class I, class O>
+void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
+                                std::vector<std::size_t> dims1,
+                                const std::vector<std::size_t>& outputDims,
+                                const void* input0_,
+                                const void* input1_,
+                                void* output_) {
+
+    const I* input_0 = static_cast<const I*>(input0_);
+    const I* input_1 = static_cast<const I*>(input1_);
+    O* output = static_cast<O*>(output_);
+
+    // [5,2,1,7] & [2,6,7]
+    // 1. Same number of dimensions -> [5,2,1,7] & [1,2,6,7]
+    // 2. Find the highest equal dimension -> 3
+    //    Exception: if the first diverging dimension is the last one, then -> 4 (dims.size())
+    // 3. Compute the highest number of contiguous data -> 7
+    // 4. Compute stride and offset step for the broadcast mechanism
+    // 5. Call a simple kernel
+
+    // special case for equal dimensions, the kernel is called with the entire arrays at once
+    if (dims0 == dims1) {
+        const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+        for (std::size_t i = 0; i < input0_contiguous_size; ++i)
+        {
+            output[i] = static_cast<O>(input_0[i] == input_1[i]);
+        }
+        return;
+    }
+
+    // set dimensions to be of equal size by filling the smallest one with ones.
+    if (dims0.size() > dims1.size()) {
+        dims1.insert(dims1.cbegin(), dims0.size() - dims1.size(), std::size_t(1));
+    }
+    else if (dims1.size() > dims0.size()) {
+        dims0.insert(dims0.cbegin(), dims1.size() - dims0.size(), std::size_t(1));
+    }
+
+    const std::size_t nbDims = dims0.size();
+
+    // Find the highest equal dimension
+    // std::size_t contiguousIdx = nbDims - 1;
+    std::size_t contiguousIdx = nbDims;
+    while (contiguousIdx-- > 0) {
+    // for (; contiguousIdx+1 > 0; --contiguousIdx) {
+        if (dims0[contiguousIdx] != dims1[contiguousIdx]) {
+            if (contiguousIdx == (nbDims -1)) { // last dimensions of one of the input Tensor are of size 1
+                const std::vector<std::size_t>& dims = (dims0[contiguousIdx] == 1) ? dims0 : dims1;
+                while ((contiguousIdx+1 > 0) && (dims[contiguousIdx] == 1)) {
+                    --contiguousIdx;
+                }
+            }
+            break;
+        }
+    }
+    ++contiguousIdx;
+
+    // Compute the highest number of contiguous data for each Tensor
+    const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin()+contiguousIdx, dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    const std::size_t input1_contiguous_size = std::accumulate(dims1.cbegin()+contiguousIdx, dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
+    const std::size_t output_contiguous_size = std::accumulate(outputDims.cbegin()+contiguousIdx, outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+
+    // initialize strides to iterate through data because of broadcasting
+    std::unique_ptr<std::int32_t[]> stride_post0 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_post1 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_step0 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    std::unique_ptr<std::int32_t[]> stride_step1 = std::make_unique<std::int32_t[]>(contiguousIdx);
+    if (contiguousIdx > 0) {
+        stride_post0[contiguousIdx - 1] = 1;
+        stride_post1[contiguousIdx - 1] = 1;
+        for (std::size_t i = contiguousIdx - 2; i != static_cast<std::size_t>(-1); --i) {
+            stride_post0[i] = stride_post0[i+1]*static_cast<std::int32_t>(dims0[i+1]);
+            stride_post1[i] = stride_post1[i+1]*static_cast<std::int32_t>(dims1[i+1]);
+        }
+        for (std::size_t i = 0; i != contiguousIdx; ++i) {
+            stride_step0[i] = (dims0[i] == 1) ? 1 - stride_post0[i] : 1;
+            stride_step1[i] = (dims1[i] == 1) ? 1 - stride_post1[i] : 1;
+        }
+    }
+
+    // variables for arrays offsets
+    std::size_t offsetIn0 = 0;
+    std::size_t offsetIn1 = 0;
+    std::size_t offsetOut = 0;
+
+
+    std::size_t dim = contiguousIdx - 1;
+    const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
+    for (std::size_t stack = 0; stack < nbStacks;) {
+        equal_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
+                    input_0 + offsetIn0*input0_contiguous_size,
+                    input_1 + offsetIn1*input1_contiguous_size,
+                    output + offsetOut*output_contiguous_size);
+        if (++stack < nbStacks) {
+            std::size_t tmp_stack = stack;
+            while(tmp_stack % outputDims[dim] == 0) {
+                tmp_stack /= outputDims[dim];
+                dim--;
+            }
+            offsetIn0 += stride_step0[dim];
+            offsetIn1 += stride_step1[dim];
+            ++offsetOut;
+            dim = contiguousIdx - 1;
+        }
+    }
+}
+
+// Kernels registration to implementation entry point
+REGISTRAR(EqualImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float32}},
+    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<float, float>, nullptr});
+REGISTRAR(EqualImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}},
+    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<double, double>, nullptr});
+REGISTRAR(EqualImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}},
+    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr});
+REGISTRAR(EqualImpl_cpu,
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int64}},
+    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, nullptr});
+
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_EQUALIMPL_KERNELS_H_ */
diff --git a/src/operator/EqualImpl.cpp b/src/operator/EqualImpl.cpp
new file mode 100644
index 00000000..5926212e
--- /dev/null
+++ b/src/operator/EqualImpl.cpp
@@ -0,0 +1,61 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <cassert>
+#include <chrono>  // std::chrono::milliseconds
+#include <numeric> // std::accumulate
+#include <thread>  // std::this_thread::sleep_for
+#include <vector>
+
+#include "aidge/operator/Equal.hpp"
+#include "aidge/utils/Types.h"
+#include "aidge/backend/cpu/data/Broadcasting.hpp"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+
+#include "aidge/backend/cpu/operator/EqualImpl.hpp"
+#include "aidge/backend/cpu/operator/EqualImpl_kernels.hpp"
+
+template <>
+void Aidge::EqualImpl_cpu::forward() {
+    const Equal_Op& op = static_cast<const Equal_Op&>(mOp);
+    // Check inputs
+    AIDGE_ASSERT(op.getInput(0), "missing input in Equal operator");
+    AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Equal forward because the 0-th input has no implementation.");
+
+    AIDGE_ASSERT(op.getInput(1), "missing input in Equal operator");
+    AIDGE_ASSERT(op.getInput(1)->hasImpl(), "cannot run Equal forward because the 1st input has no implementation.");
+
+    AIDGE_ASSERT(op.getInput(1)->dataType() == op.getInput(0)->dataType(), "Cannot run Equal on inputs of two different data types.");
+
+    // Find the correct kernel type
+    const auto impl = Registrar<EqualImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Convert input data (no overhead if not needed!)
+    // TODO: right now, if needed, memory will be allocated/deallocated at each
+    // call to forward(). We might put the following shared_ptr as members of
+    // this class to avoid that.
+    std::shared_ptr<Tensor> input0Fallback, input1Fallback;
+    const auto& input0 = op.getInput(0)->refCastFrom(input0Fallback, *op.getInput(0));
+    const auto& input1 = op.getInput(1)->refCastFrom(input1Fallback, *op.getInput(1));
+
+
+    impl.forward(op.getInput(0)->dims(),
+                op.getInput(1)->dims(),
+                op.getOutput(0)->dims(),
+                input0.getImpl()->rawPtr(),
+                input1.getImpl()->rawPtr(),
+                getCPUPtr(op.getRawOutput(0)));
+}
+
+template <>
+void Aidge::EqualImpl_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Equal_Op on backend cpu");
+}
diff --git a/unit_tests/operator/Test_EqualImpl.cpp b/unit_tests/operator/Test_EqualImpl.cpp
new file mode 100644
index 00000000..a229b8ce
--- /dev/null
+++ b/unit_tests/operator/Test_EqualImpl.cpp
@@ -0,0 +1,205 @@
+/********************************************************************************
+ * Copyright (c) 2024 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <catch2/catch_test_macros.hpp>
+#include <random>    // std::random_device, std::mt19937, std::uniform_real_distribution
+
+#include "aidge/data/Tensor.hpp"
+#include "aidge/operator/Equal.hpp"
+
+#include "aidge/backend/cpu.hpp"
+
+using namespace Aidge;
+
+TEST_CASE("[cpu/operator] Equal(forward)", "[Equal][CPU]") {
+        SECTION("ForwardDims")
+    {
+        constexpr std::uint16_t NBTRIALS = 10;
+        // Create a random number generator
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0 and 1
+        std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
+        std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
+        std::uniform_int_distribution<int> boolDist(0,1);
+
+        SECTION("Same dimensions") {
+            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+                DimSize_t nbDims = nbDimsDist(gen);
+                std::vector<DimSize_t> dims(nbDims);
+                for (std::size_t i = 0; i < nbDims; i++) {
+                    dims[i] = dimSizeDist(gen);
+                }
+
+                std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims);
+                myInput1->setBackend("cpu");
+                myInput1->setDataType(DataType::Float32);
+                myInput1->zeros();
+                std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims);
+                myInput2->setBackend("cpu");
+                myInput2->setDataType(DataType::Float32);
+                myInput2->zeros();
+                std::shared_ptr<Node> myEqual = Equal();
+                auto op = std::static_pointer_cast<OperatorTensor>(myEqual -> getOperator());
+                op->associateInput(0,myInput1);
+                op->associateInput(1,myInput2);
+                op->setDataType(DataType::Float32);
+                op->setBackend("cpu");
+                op->forwardDims();
+
+                const auto outputDims = op->getOutput(0)->dims();
+                REQUIRE(outputDims == dims);
+            }
+        }
+        SECTION("Broadcasting") {
+            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+                DimSize_t nbDims = nbDimsDist(gen);
+                std::vector<DimSize_t> dims1(nbDims, 1);
+                std::vector<DimSize_t> dims2(nbDims, 1);
+                std::vector<DimSize_t> expectedOutDims;
+                for (std::size_t i = 0; i < nbDims; i++) {
+                    DimSize_t dim = dimSizeDist(gen);
+                    if (boolDist(gen)) {
+                        dims1[i] = dim;
+                    }
+                    if (boolDist(gen)) {
+                        dims2[i] = dim;
+                    }
+                    expectedOutDims.push_back(std::max(dims1[i],dims2[i]));
+                }
+
+
+                std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims1);
+                myInput1->setBackend("cpu");
+                myInput1->setDataType(DataType::Float32);
+                myInput1->zeros();
+                std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims2);
+                myInput2->setBackend("cpu");
+                myInput2->setDataType(DataType::Float32);
+                myInput2->zeros();
+                std::shared_ptr<Node> myEqual = Equal();
+                auto op = std::static_pointer_cast<OperatorTensor>(myEqual -> getOperator());
+                op->associateInput(0,myInput1);
+                op->associateInput(1,myInput2);
+                op->setDataType(DataType::Float32);
+                op->setBackend("cpu");
+
+                op->forwardDims();
+
+                const auto outputDims = op->getOutput(0)->dims();
+                REQUIRE(outputDims == expectedOutDims);
+            }
+        }
+    }
+    SECTION("Same size inputs") {
+        std::shared_ptr<Tensor> input1 = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
+            {                                       //
+                {                                   //
+                    {{20, 15},{31, 11},{22, 49}},   //
+                    {{41, 10},{24, 51},{27, 52}},   //
+                    {{26, 53},{27, 54},{28, 55}}    //
+                },                                  //
+                {                                   //
+                    {{29, 56},{30, 57},{31, 58}},   //
+                    {{32, 59},{33, 60},{34, 61}},   //
+                    {{35, 62},{36, 63},{37, 64}}    //
+                },                                  //
+                {                                   //
+                    {{38, 65},{39, 66},{40, 67}},   //
+                    {{41, 68},{42, 69},{43, 70}},   //
+                    {{44, 71},{45, 72},{46, 73}}    //
+                }                                   //
+            }                                       //
+        });                                         //
+        std::shared_ptr<Tensor> input2 = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
+            {                                       //
+                {                                   //
+                    {{20, 47},{21, 48},{22, 49}},   //
+                    {{23, 50},{24, 51},{25, 52}},   //
+                    {{17, 53},{27, 26},{14, 33}}    //
+                },                                  //
+                {                                   //
+                    {{29, 56},{30, 57},{31, 58}},   //
+                    {{72, 44},{33, 20},{27, 55}},   //
+                    {{35, 24},{25, 63},{28, 64}}    //
+                },                                  //
+                {                                   //
+                    {{32, 65},{39, 66},{40, 70}},   //
+                    {{41, 53},{42, 60},{34, 70}},   //
+                    {{44, 71},{30, 12},{46, 73}}    //
+                }                                   //
+            }                                       //
+        });                                         //
+        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
+            {
+                {
+                    {{1, 0},{0, 0},{1, 1}},
+                    {{0, 0},{1, 1},{0, 1}},
+                    {{0, 1},{1, 0},{0, 0}}
+                },
+                {
+                    {{1, 1},{1, 1},{1, 1}},
+                    {{0, 0},{1, 0},{0, 0}},
+                    {{1, 0},{0, 1},{0, 1}}
+                },
+                {
+                    {{0, 1},{1, 1},{1, 0}},
+                    {{1, 0},{1, 0},{0, 1}},
+                    {{1, 1},{0, 0},{1, 1}}
+                }
+            }
+        });
+
+        std::shared_ptr<Node> myEqual = Equal();
+        auto op = std::static_pointer_cast<OperatorTensor>(myEqual -> getOperator());
+        op->associateInput(0, input1);
+        op->associateInput(1, input2);
+        op->setBackend("cpu");
+        op->setDataType(DataType::Int32);
+        myEqual->forward();
+
+        REQUIRE(*(op->getOutput(0)) == *expectedOutput);
+    }
+
+    SECTION("Broadcasting") {
+        std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<int,1,3,3,2> {
+        {                                       //
+            {                                   //
+                {{10, 20},{22, 23},{20, 20}},   //
+                {{10, 15},{10, 29},{20, 20}},   //
+                {{26, 25},{33, 20},{10, 20}}    //
+            }                                   //
+        }                                       //
+        });                                     //
+
+        std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<int,2> {{10, 20}});  
+        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,1,3,3,2> {
+            {                                   //
+                {                               //
+                    {{ 1, 1},{ 0, 0},{ 0, 1}},  //
+                    {{ 1, 0},{ 1, 0},{ 0, 1}},  //
+                    {{ 0, 0},{ 0, 1},{ 1, 1}}   //
+                }                               //
+            }                                   //
+        });                                     //
+
+        std::shared_ptr<Node> myEqual = Equal();
+        auto op = std::static_pointer_cast<OperatorTensor>(myEqual -> getOperator());
+        op->associateInput(0, input_1);
+        op->associateInput(1, input_2);
+        op->setDataType(DataType::Int32);
+        op->setBackend("cpu");
+        myEqual->forward();
+        op->getOutput(0)->print();
+        expectedOutput->print();
+        REQUIRE(*op->getOutput(0) == *expectedOutput);
+    }
+}
\ No newline at end of file
-- 
GitLab
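
The Broadcasting sections in this Equal test derive the expected output shape as the element-wise maximum of the two input shapes, which is only valid because each randomized axis is either 1 or a shared size. A minimal standalone sketch of that rule (plain C++; the helper name is illustrative, not part of Aidge):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Broadcast two equal-rank shapes where, per axis, the sizes are
    // either equal or one of them is 1 (the convention the test relies on).
    std::vector<std::size_t> broadcastShape(const std::vector<std::size_t>& a,
                                            const std::vector<std::size_t>& b) {
        std::vector<std::size_t> out(a.size());
        for (std::size_t i = 0; i < a.size(); ++i) {
            out[i] = std::max(a[i], b[i]); // max() is safe only under that convention
        }
        return out;
    }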


From 0f05e5fbcddbe69708d41afb7a41e34f132b134c Mon Sep 17 00:00:00 2001
From: hrouis <houssemeddine.rouis92@gmail.com>
Date: Fri, 24 Jan 2025 16:07:13 +0100
Subject: [PATCH 026/108] fix And operator

---
 .../backend/cpu/operator/AndImpl_kernels.hpp  |  29 ++-
 unit_tests/operator/Test_AndImpl.cpp          | 191 +++++++++---------
 2 files changed, 108 insertions(+), 112 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/AndImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AndImpl_kernels.hpp
index 73b710e0..d7c8ebcf 100644
--- a/include/aidge/backend/cpu/operator/AndImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/AndImpl_kernels.hpp
@@ -20,7 +20,7 @@ namespace Aidge {
 namespace {
 // suppose values are contiguous in memory
 template <class I, class O>
-void equal_contiguous_arrays(const std::size_t input1size,
+void and_contiguous_arrays(const std::size_t input1size,
                             const std::size_t input2size,
                             const std::size_t output1size,
                             const I* input1,
@@ -31,14 +31,14 @@ void equal_contiguous_arrays(const std::size_t input1size,
     {
         const std::size_t in1_id = (input1size != 1) ? i : 0;
         const std::size_t in2_id = (input2size != 1) ? i : 0;
-        output[i] = static_cast<O>(input1[in1_id] == input2[in2_id]);
+        output[i] = static_cast<O>(input1[in1_id] && input2[in2_id]);
     }
 }
 }
 
 
 template <class I, class O>
-void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
+void AndImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
                                 std::vector<std::size_t> dims1,
                                 const std::vector<std::size_t>& outputDims,
                                 const void* input0_,
@@ -60,9 +60,8 @@ void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
     // special case for equal dimensions, the kernel is called with the entire arrays at once
     if (dims0 == dims1) {
         const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
-        for (std::size_t i = 0; i < input0_contiguous_size; ++i)
-        {
-            output[i] = static_cast<O>(input_0[i] == input_1[i]);
+        for (std::size_t i = 0; i < input0_contiguous_size; ++i) {
+            output[i] = static_cast<O>(input_0[i] && input_1[i]);
         }
         return;
     }
@@ -126,7 +125,7 @@ void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
     std::size_t dim = contiguousIdx - 1;
     const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
     for (std::size_t stack = 0; stack < nbStacks;) {
-        equal_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
+        and_contiguous_arrays<I,O>(input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
                     input_0 + offsetIn0*input0_contiguous_size,
                     input_1 + offsetIn1*input1_contiguous_size,
                     output + offsetOut*output_contiguous_size);
@@ -146,17 +145,17 @@ void EqualImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
 
 // Kernels registration to implementation entry point
 REGISTRAR(AndImpl_cpu,
-    {DataType::Float32},
-    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<float, float>, nullptr});
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float32}},
+    {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<float, float>, nullptr});
 REGISTRAR(AndImpl_cpu,
-    {DataType::Float64},
-    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<double, double>, nullptr});
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}},
+    {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<double, double>, nullptr});
 REGISTRAR(AndImpl_cpu,
-    {DataType::Int32},
-    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr});
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}},
+    {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr});
 REGISTRAR(AndImpl_cpu,
-    {DataType::Int64},
-    {ProdConso::inPlaceModel, Aidge::EqualImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, nullptr});
+    {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int64}},
+    {ProdConso::inPlaceModel, Aidge::AndImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, nullptr});
 
 }  // namespace Aidge
 
diff --git a/unit_tests/operator/Test_AndImpl.cpp b/unit_tests/operator/Test_AndImpl.cpp
index c2309dce..978a89e5 100644
--- a/unit_tests/operator/Test_AndImpl.cpp
+++ b/unit_tests/operator/Test_AndImpl.cpp
@@ -26,75 +26,92 @@
 using namespace Aidge;
 
 TEST_CASE("[cpu/operator] And(forward)", "[And][CPU]") {
-        SECTION("ForwardDims")
-    {
+    SECTION("ForwardDims") {
         constexpr std::uint16_t NBTRIALS = 10;
         // Create a random number generator
         std::random_device rd;
         std::mt19937 gen(rd());
-        std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0 and 1
-        std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
-        std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
-        std::uniform_int_distribution<int> boolDist(0,1);
+        std::uniform_int_distribution<int> boolDist(0, 1); // Use 0 for false, 1 for true
+        std::uniform_int_distribution<std::size_t> dimSizeDist(2, 10);
+        std::uniform_int_distribution<std::size_t> nbDimsDist(1, 5);
 
         SECTION("Same dimensions") {
             for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
                 DimSize_t nbDims = nbDimsDist(gen);
                 std::vector<DimSize_t> dims(nbDims);
-                for (std::size_t i = 0; i < nbDims; i++) {
+                for (std::size_t i = 0; i < nbDims; ++i) {
                     dims[i] = dimSizeDist(gen);
                 }
-
+                const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                float* array0 = new float[nb_elements];
+                float* array1 = new float[nb_elements];
+                for (std::size_t i = 0; i < nb_elements; ++i) {
+                    array0[i] = boolDist(gen);
+                    array1[i] = boolDist(gen);
+                }
                 std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims);
-                myInput1->setBackend("cpu");
-                myInput1->setDataType(DataType::Float32);
-                myInput1->zeros();
                 std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims);
-                myInput2->setBackend("cpu");
+                myInput1->setDataType(DataType::Float32);
                 myInput2->setDataType(DataType::Float32);
-                myInput2->zeros();
+                myInput1->setBackend("cpu");
+                myInput2->setBackend("cpu");
+
+                myInput1 -> getImpl() -> setRawPtr(array0, nb_elements);
+                myInput2 -> getImpl() -> setRawPtr(array1, nb_elements);
+
                 std::shared_ptr<Node> myAnd = And();
-                auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator());
-                op->associateInput(0,myInput1);
-                op->associateInput(1,myInput2);
+                auto op = std::static_pointer_cast<OperatorTensor>(myAnd->getOperator());
+                op->associateInput(0, myInput1);
+                op->associateInput(1, myInput2);
                 op->setDataType(DataType::Float32);
                 op->setBackend("cpu");
                 op->forwardDims();
 
                 const auto outputDims = op->getOutput(0)->dims();
                 REQUIRE(outputDims == dims);
+                delete[] array0;
+                delete[] array1;
             }
         }
+
         SECTION("Broadcasting") {
             for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
                 DimSize_t nbDims = nbDimsDist(gen);
                 std::vector<DimSize_t> dims1(nbDims, 1);
                 std::vector<DimSize_t> dims2(nbDims, 1);
                 std::vector<DimSize_t> expectedOutDims;
-                for (std::size_t i = 0; i < nbDims; i++) {
+                for (std::size_t i = 0; i < nbDims; ++i) {
                     DimSize_t dim = dimSizeDist(gen);
-                    if (boolDist(gen)) {
-                        dims1[i] = dim;
-                    }
-                    if (boolDist(gen)) {
-                        dims2[i] = dim;
-                    }
-                    expectedOutDims.push_back(std::max(dims1[i],dims2[i]));
+                    if (boolDist(gen)) dims1[i] = dim;
+                    if (boolDist(gen)) dims2[i] = dim;
+                    expectedOutDims.push_back(std::max(dims1[i], dims2[i]));
                 }
 
+                const std::size_t nb_elements0 = std::accumulate(dims1.cbegin(), dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                const std::size_t nb_elements1 = std::accumulate(dims2.cbegin(), dims2.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                float* array0 = new float[nb_elements0];
+                float* array1 = new float[nb_elements1];
+                for (std::size_t i = 0; i < nb_elements0; ++i) {
+                    array0[i] = boolDist(gen);
+                }
+                for (std::size_t i = 0; i < nb_elements1; ++i) {
+                    array1[i] = boolDist(gen);
+                }
 
                 std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims1);
-                myInput1->setBackend("cpu");
-                myInput1->setDataType(DataType::Float32);
-                myInput1->zeros();
                 std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims2);
-                myInput2->setBackend("cpu");
+                myInput1->setDataType(DataType::Float32);
                 myInput2->setDataType(DataType::Float32);
-                myInput2->zeros();
+                myInput1->setBackend("cpu");
+                myInput2->setBackend("cpu");
+                myInput1 -> getImpl() -> setRawPtr(array0, nb_elements0);
+                myInput2 -> getImpl() -> setRawPtr(array1, nb_elements1);
+
+
                 std::shared_ptr<Node> myAnd = And();
-                auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator());
-                op->associateInput(0,myInput1);
-                op->associateInput(1,myInput2);
+                auto op = std::static_pointer_cast<OperatorTensor>(myAnd->getOperator());
+                op->associateInput(0, myInput1);
+                op->associateInput(1, myInput2);
                 op->setDataType(DataType::Float32);
                 op->setBackend("cpu");
 
@@ -102,80 +119,48 @@ TEST_CASE("[cpu/operator] And(forward)", "[And][CPU]") {
 
                 const auto outputDims = op->getOutput(0)->dims();
                 REQUIRE(outputDims == expectedOutDims);
+                delete[] array0;
+                delete[] array1;
             }
         }
     }
+
     SECTION("Same size inputs") {
-        std::shared_ptr<Tensor> input1 = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
-        {                                       //
-            {                                   //
-                {{20, 15},{31, 11},{22, 49}},   //
-                {{41, 10},{24, 51},{27, 52}},   //
-                {{26, 53},{27, 54},{28, 55}}    //
-            },                                  //
-            {                                   //
-                {{29, 56},{30, 57},{31, 58}},   //
-                {{32, 59},{33, 60},{34, 61}},   //
-                {{35, 62},{36, 63},{37, 64}}    //
-            },                                  //
-            {                                   //
-                {{38, 65},{39, 66},{40, 67}},   //
-                {{41, 68},{42, 69},{43, 70}},   //
-                {{44, 71},{45, 72},{46, 73}}    //
-            }                                   //
-        }                                       //
-    });                                         //
-        std::shared_ptr<Tensor> input2 = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
-            {                                       //
-                {                                   //
-                    {{20, 47},{21, 48},{22, 49}},   //
-                    {{23, 50},{24, 51},{25, 52}},   //
-                    {{17, 53},{27, 26},{14, 33}}    //
-                },                                  //
-                {                                   //
-                    {{29, 56},{30, 57},{31, 58}},   //
-                    {{72, 44},{33, 20},{27, 55}},   //
-                    {{35, 24},{25, 63},{28, 64}}    //
-                },                                  //
-                {                                   //
-                    {{32, 65},{39, 66},{40, 70}},   //
-                    {{41, 53},{42, 60},{34, 70}},   //
-                    {{44, 71},{30, 12},{46, 73}}    //
-                }                                   //
-            }                                       //
-        });                                         //
-        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
+        std::shared_ptr<Tensor> input1 = std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 2>{
             {
-                {
-                    {{1, 0},{0, 0},{1, 1}},
-                    {{0, 0},{1, 1},{0, 1}},
-                    {{0, 1},{1, 0},{0, 0}}
-                },
-                {
-                    {{1, 1},{1, 1},{1, 1}},
-                    {{0, 0},{1, 0},{0, 0}},
-                    {{1, 0},{0, 1},{0, 1}}
-                },
-                {
-                    {{0, 1},{1, 1},{1, 0}},
-                    {{1, 0},{1, 0},{0, 1}},
-                    {{1, 1},{0, 0},{1, 1}}
-                }
-            }
-        });
+                {{{1, 0}, {0, 1}},
+                {{1, 1}, {0, 0}}},
+                {{{0, 1}, {1, 0}},
+                {{1, 0}, {0, 1}}}}
+            });
+        std::shared_ptr<Tensor> input2 = std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 2>{
+            {
+                {{{1, 1}, {0, 0}},
+                {{0, 1}, {1, 1}}},
+                {{{1, 1}, {0, 0}},
+                {{0, 1}, {1, 0}}}}
+            });
+        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<float, 2, 2, 2, 2>{
+            {
+                {{{1, 0}, {0, 0}},
+                {{0, 1}, {0, 0}}},
+                {{{0, 1}, {0, 0}},
+                {{0, 0}, {0, 0}}}}
+            });
 
         std::shared_ptr<Node> myAnd = And();
-        auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator());
+        auto op = std::static_pointer_cast<OperatorTensor>(myAnd->getOperator());
         op->associateInput(0, input1);
         op->associateInput(1, input2);
         op->setBackend("cpu");
-        op->setDataType(DataType::Int32);
+        op->setDataType(DataType::Float32);
         myAnd->forward();
-
+        op->getOutput(0)->print();
         REQUIRE(*(op->getOutput(0)) == *expectedOutput);
     }
 
     SECTION("Broadcasting") {
+<<<<<<< HEAD
         std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<int,1,3,3,2> {
         {                                       //
             {                                   //
@@ -196,16 +181,28 @@ TEST_CASE("[cpu/operator] And(forward)", "[And][CPU]") {
                 }                               //
             }                                   //
         });                                     //
+=======
+        std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<float, 1, 2, 2, 2>{
+            {
+                {{{1, 0}, {1, 0}},
+                {{1, 1}, {0, 0}}}}
+            });
+        std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<float, 2>{{1, 0}});
+        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<float, 1, 2, 2, 2>{
+            {
+                {{{1, 0}, {1, 0}},
+                {{1, 0}, {0, 0}}}}
+            });
+>>>>>>> fix and kernel and unit tests
 
         std::shared_ptr<Node> myAnd = And();
-        auto op = std::static_pointer_cast<OperatorTensor>(myAnd -> getOperator());
+        auto op = std::static_pointer_cast<OperatorTensor>(myAnd->getOperator());
         op->associateInput(0, input_1);
         op->associateInput(1, input_2);
-        op->setDataType(DataType::Int32);
+        op->setDataType(DataType::Float32);
         op->setBackend("cpu");
         myAnd->forward();
-        op->getOutput(0)->print();
-        expectedOutput->print();
-        REQUIRE(*op->getOutput(0) == *expectedOutput);
+
+        REQUIRE(*(op->getOutput(0)) == *expectedOutput);
     }
-}
\ No newline at end of file
+}
-- 
GitLab
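
The kernel fix above replaces the copied Equal comparison with a genuine logical AND: any nonzero input counts as true, and a size-1 operand is broadcast against the other. A minimal sketch of the contiguous case, stripped of the Aidge registrar machinery (the function name here is illustrative):

    #include <cstddef>

    // Logical AND over two contiguous buffers; an operand of size 1 is
    // broadcast against the other, as in and_contiguous_arrays above.
    template <class I, class O>
    void andContiguous(std::size_t in0Size, std::size_t in1Size,
                       std::size_t outSize,
                       const I* in0, const I* in1, O* out) {
        for (std::size_t i = 0; i < outSize; ++i) {
            const std::size_t i0 = (in0Size != 1) ? i : 0;
            const std::size_t i1 = (in1Size != 1) ? i : 0;
            out[i] = static_cast<O>(in0[i0] && in1[i1]); // nonzero counts as true
        }
    }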


From 39117d5af7bca55f0aa55a13db0770479378ed90 Mon Sep 17 00:00:00 2001
From: hrouis <houssemeddine.rouis92@gmail.com>
Date: Fri, 24 Jan 2025 16:08:17 +0100
Subject: [PATCH 027/108] add dilations to maxpool

---
 .../backend/cpu/operator/MaxPoolingImpl.hpp   |   1 +
 .../cpu/operator/MaxPoolingImpl_kernels.hpp   | 126 ++----------------
 src/operator/MaxPoolingImpl.cpp               |   1 +
 unit_tests/operator/Test_MaxPoolingImpl.cpp   |  35 +++++
 4 files changed, 49 insertions(+), 114 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/MaxPoolingImpl.hpp b/include/aidge/backend/cpu/operator/MaxPoolingImpl.hpp
index 68cc3621..062088a1 100644
--- a/include/aidge/backend/cpu/operator/MaxPoolingImpl.hpp
+++ b/include/aidge/backend/cpu/operator/MaxPoolingImpl.hpp
@@ -28,6 +28,7 @@ namespace Aidge {
 using MaxPooling2D_Op = MaxPooling_Op<2>;
 using MaxPoolingImpl2D_cpu = OperatorImpl_cpu<MaxPooling_Op<2>,
     void(const std::array<DimSize_t, 2>&,
+                            const std::array<DimSize_t, 2>&,
                             const std::array<DimSize_t, 2>&,
                             const bool,
                             const std::array<DimSize_t, 4> &,
diff --git a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
index 7b6f04f1..250b11b0 100644
--- a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
@@ -35,28 +35,23 @@ namespace Aidge {
 template <class I, class O>
 void MaxPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
                                         const std::array<DimSize_t, 2>& kernelDims,
+                                        const std::array<DimSize_t, 2>& dilations,
                                         const bool /*ceilMode*/,
                                         const std::array<DimSize_t, 4> &dims,
                                         const void *input_,
                                         void *output_) {
-    // FIXME: missing convolution parameters as arguments
     const I *input = static_cast<const I *>(input_);
     O *output = static_cast<O *>(output_);
 
     // output H size
     const std::size_t oxSize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(dims[2] - kernelDims[0] + strideDims[0]) /
+            static_cast<std::size_t>(std::floor(static_cast<float>(dims[2] - (kernelDims[0] - 1) * dilations[0] - 1 + strideDims[0]) /
                                 static_cast<float>(strideDims[0])));
     // output W size
     const std::size_t oySize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(dims[3] - kernelDims[1] + strideDims[1]) /
+            static_cast<std::size_t>(std::floor(static_cast<float>(dims[3] - (kernelDims[1] - 1) * dilations[1] - 1 + strideDims[1]) /
                                 static_cast<float>(strideDims[1])));
 
-    // TODO: kernel computation
-    // output (batch, outCh, Xout, Yout)
-    // input  (batch, ch, Xin, Yin)
-    // weight (outCh, ch, kernelX, kernelY)
-    // does not take Dilation parameter into account
     using signedsize = std::make_signed<std::size_t>::type;
     for (std::size_t batch = 0; batch < dims[0]; ++batch) {
         for (std::size_t ch = 0; ch < dims[1]; ++ch) {
@@ -77,12 +72,15 @@ void MaxPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideD
                     I poolValue(0.0);
                     bool valid = false;
 
-                    for (unsigned int channel = 0; channel < dims[1];
-                            ++channel){
-                        for (unsigned int sy = syMin; sy < syMax; ++sy) {
-                            for (unsigned int sx = sxMin; sx < sxMax; ++sx)
-                            {
-                                const I value = input[iIndex + (ix+sx)*dims[3] + (iy+sy)];
+                    for (unsigned int sy = syMin; sy < syMax; ++sy) {
+                        for (unsigned int sx = sxMin; sx < sxMax; ++sx) {
+                            // Apply dilation factor to kernel indices
+                            const std::size_t dilated_sx = sx * dilations[0];
+                            const std::size_t dilated_sy = sy * dilations[1];
+
+                            // Ensure indices are within bounds
+                            if ((ix + dilated_sx) < dims[2] && (iy + dilated_sy) < dims[3]) {
+                                const I value = input[iIndex + (ix + dilated_sx) * dims[3] + (iy + dilated_sy)];
 
                                 if (!valid || value > poolValue) {
                                     poolValue = value;
@@ -98,106 +96,6 @@ void MaxPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideD
     }
 }
 
-//N2D2 version
-/*
-template <class T>
-void N2D2::PoolCell_Frame_Kernels::forwardMax(const T* alpha,
-                                              const Tensor<T>&
-                                              inputs,
-                                              const Descriptor& desc,
-                                              const T* beta,
-                                              Tensor<T>& outputs,
-                                              Tensor<ArgMax>& argMax,
-                                              bool useArgMax,
-                                              const Tensor<bool>& maps)
-{
-    const unsigned int size = inputs.dimB() * outputs.dimZ();
-
-#if defined(_OPENMP) && _OPENMP >= 200805
-#pragma omp parallel for collapse(2) if (size > 16)
-#else
-#pragma omp parallel for if (inputs.dimB() > 4 && size > 16)
-#endif
-    for (int batchPos = 0; batchPos < (int)inputs.dimB(); ++batchPos) {
-        for (unsigned int output = 0; output < outputs.dimZ(); ++output) {
-            for (unsigned int oy = 0; oy < outputs.dimY(); ++oy) {
-                for (unsigned int ox = 0; ox < outputs.dimX(); ++ox) {
-                    const unsigned int sxMin = (unsigned int)std::max(
-                        desc.padding[0] - (int)(ox * desc.stride[0]), 0);
-                    const unsigned int syMin = (unsigned int)std::max(
-                        desc.padding[1] - (int)(oy * desc.stride[1]), 0);
-                    const unsigned int sxMax = Utils::clamp
-                        <int>(inputs.dimX() + desc.padding[0] - ox * desc.stride[0],
-                              0,
-                              desc.pool[0]);
-                    const unsigned int syMax = Utils::clamp
-                        <int>(inputs.dimY() + desc.padding[1] - oy * desc.stride[1],
-                              0,
-                              desc.pool[1]);
-
-                    const int ix = (int)(ox * desc.stride[0]) - desc.padding[0];
-                    const int iy = (int)(oy * desc.stride[1]) - desc.padding[1];
-
-                    T poolValue(0.0);
-
-                    // For each output, compute the pool value
-                    if (useArgMax) {
-                        const ArgMax inputMax
-                            = argMax(ox, oy, output, batchPos);
-
-                        if (inputMax.valid) {
-                            poolValue = inputs(inputMax.ix,
-                                               inputMax.iy,
-                                               inputMax.channel,
-                                               batchPos);
-                        }
-                    }
-                    else {
-                        unsigned int ixMax = 0;
-                        unsigned int iyMax = 0;
-                        unsigned int channelMax = 0;
-                        bool valid = false;
-
-                        for (unsigned int channel = 0; channel < inputs.dimZ();
-                             ++channel)
-                        {
-                            if (!maps.empty() && !maps(output, channel))
-                                continue;
-
-                            for (unsigned int sy = syMin; sy < syMax; ++sy) {
-                                for (unsigned int sx = sxMin; sx < sxMax; ++sx)
-                                {
-                                    const T value = inputs(ix + sx,
-                                                                 iy + sy,
-                                                                 channel,
-                                                                 batchPos);
-
-                                    if (!valid || value > poolValue) {
-                                        poolValue = value;
-                                        valid = true;
-
-                                        ixMax = ix + sx;
-                                        iyMax = iy + sy;
-                                        channelMax = channel;
-                                    }
-                                }
-                            }
-                        }
-
-                        argMax(ox, oy, output, batchPos)
-                            = ArgMax(ixMax, iyMax, channelMax, valid);
-                    }
-
-                    outputs(ox, oy, output, batchPos)
-                        = (*alpha) * poolValue
-                          + (*beta) * outputs(ox, oy, output, batchPos);
-                }
-            }
-        }
-    }
-}
-
-*/
 
 // Kernels registration to implementation entry point
 REGISTRAR(MaxPoolingImpl2D_cpu,
diff --git a/src/operator/MaxPoolingImpl.cpp b/src/operator/MaxPoolingImpl.cpp
index 90075a39..13ef75b0 100644
--- a/src/operator/MaxPoolingImpl.cpp
+++ b/src/operator/MaxPoolingImpl.cpp
@@ -30,6 +30,7 @@ void Aidge::MaxPoolingImpl2D_cpu::forward() {
     // Call kernel
     impl.forward(op_.strideDims(),
                 op_.kernelDims(),
+                op_.dilations(),
                 op_.ceilMode(),
                 op_.getInput(0)->template dims<4>(),
                 getCPUPtr(mOp.getRawInput(0)),
diff --git a/unit_tests/operator/Test_MaxPoolingImpl.cpp b/unit_tests/operator/Test_MaxPoolingImpl.cpp
index de02df2b..6b7e6d2f 100644
--- a/unit_tests/operator/Test_MaxPoolingImpl.cpp
+++ b/unit_tests/operator/Test_MaxPoolingImpl.cpp
@@ -80,4 +80,39 @@ TEST_CASE("[cpu/operator] MaxPooling(forward)", "[MaxPooling][CPU]") {
         op->getOutput(0)->print();
         REQUIRE(*(op->getOutput(0)) == myOutput);
     }
+    SECTION("Dilation") {
+        std::shared_ptr<Node> myMaxPool = MaxPooling({2,2}, "mycdw", {2,2}, {2,2}); // Dilation 2x2
+        auto op = std::static_pointer_cast<OperatorTensor>(myMaxPool -> getOperator());
+
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array4D<float,2,2,2,2> {
+            {
+                {
+                    {
+                        {0.71470, 0.52770},
+                        {0.71470, 0.48740}
+                    },
+                    {
+                        {2.23290, 0.48590},
+                        {2.23290, 0.07000}
+                    }
+                },
+                {
+                    {
+                        {1.76530, 1.20710},
+                        {1.76530, 1.20710}
+                    },
+                    {
+                        {1.04290, 0.67760},
+                        {1.72170, 0.67760}
+                    }
+                }
+            }
+        });
+        myMaxPool->getOperator()->associateInput(0,myInput);
+        myMaxPool->getOperator()->setDataType(DataType::Float32);
+        myMaxPool->getOperator()->setBackend("cpu");
+        myMaxPool->forward();
+        op->getOutput(0)->print();
+        REQUIRE(*(op->getOutput(0)) == *myOutput);
+    }
 }
\ No newline at end of file
-- 
GitLab
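
In the dilated MaxPooling kernel above, tap (sx, sy) of the window samples the input at an offset scaled by the dilation, and only in-bounds taps compete for the maximum. A simplified 1-D sketch of that sampling scheme (illustrative helper, not the Aidge kernel signature):

    #include <cstddef>

    // Max over a dilated 1-D window: tap i samples position start + i * d;
    // out-of-bounds taps are skipped, like the dilated_sx/dilated_sy checks.
    template <class T>
    T dilatedWindowMax(const T* in, std::size_t len,
                       std::size_t start, std::size_t k, std::size_t d) {
        T best = T(0);
        bool valid = false;
        for (std::size_t i = 0; i < k; ++i) {
            const std::size_t pos = start + i * d;
            if (pos < len && (!valid || in[pos] > best)) {
                best = in[pos];
                valid = true;
            }
        }
        return best; // T(0) if every tap fell out of bounds
    }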


From 1096664a72bad887267fc06324368b026a699a8e Mon Sep 17 00:00:00 2001
From: hrouis <houssemeddine.rouis92@gmail.com>
Date: Mon, 27 Jan 2025 15:21:08 +0100
Subject: [PATCH 028/108] add dilations and ceil_mode to AvgPooling

---
 .../backend/cpu/operator/AvgPoolingImpl.hpp   |  2 +
 .../cpu/operator/AvgPoolingImpl_kernels.hpp   | 76 ++++++++-----------
 src/operator/AvgPoolingImpl.cpp               |  2 +
 unit_tests/operator/Test_AndImpl.cpp          | 23 ------
 4 files changed, 36 insertions(+), 67 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/AvgPoolingImpl.hpp b/include/aidge/backend/cpu/operator/AvgPoolingImpl.hpp
index adea96ca..7c76657f 100644
--- a/include/aidge/backend/cpu/operator/AvgPoolingImpl.hpp
+++ b/include/aidge/backend/cpu/operator/AvgPoolingImpl.hpp
@@ -28,8 +28,10 @@ namespace Aidge {
 using AvgPooling2D_Op = AvgPooling_Op<2>;
 using AvgPoolingImpl2D_cpu = OperatorImpl_cpu<AvgPooling_Op<2>,
     void(const std::array<DimSize_t, 2>&,
+        const std::array<DimSize_t, 2>&,
         const std::array<DimSize_t, 2>&,
         const std::array<DimSize_t, 4>&,
+        bool,
         const void *,
         void *)>;
 
diff --git a/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
index f6da9dcb..68dbfbe7 100644
--- a/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
@@ -35,66 +35,54 @@ namespace Aidge {
 template <class I, class O>
 void AvgPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
                                         const std::array<DimSize_t, 2>& kernelDims,
+                                        const std::array<DimSize_t, 2>& dilations,
                                         const std::array<DimSize_t, 4> &dims,
+                                        bool ceilMode,
                                         const void *input_,
                                         void *output_) {
-    // FIXME: missing convolution attributes as arguments
     const I *input = static_cast<const I *>(input_);
     O *output = static_cast<O *>(output_);
 
+    // Calculate output dimensions based on ceilMode and dilations
+    auto compute_output_size = [&](DimSize_t inputDim, DimSize_t kernelDim, DimSize_t stride, DimSize_t dilation) {
+        DimSize_t effectiveKernelDim = (kernelDim - 1) * dilation + 1;
+        float result = static_cast<float>(inputDim - effectiveKernelDim + stride) / static_cast<float>(stride);
+        return ceilMode ? static_cast<DimSize_t>(std::ceil(result)) : static_cast<DimSize_t>(std::floor(result));
+    };
 
-    // output H size
-    const std::size_t oxSize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(dims[2] - kernelDims[0] + strideDims[0]) /
-                                static_cast<float>(strideDims[0])));
-    // output W size
-    const std::size_t oySize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(dims[3] - kernelDims[1] + strideDims[1]) /
-                                static_cast<float>(strideDims[1])));
+    const std::size_t oxSize = compute_output_size(dims[2], kernelDims[0], strideDims[0], dilations[0]);
+    const std::size_t oySize = compute_output_size(dims[3], kernelDims[1], strideDims[1], dilations[1]);
 
-    // TODO: kernel computation
-    // output (batch, outCh, Xout, Yout)
-    // input  (batch, ch, Xin, Yin)
-    // weight (outCh, ch, kernelX, kernelY)
-    // does not take Dilation attribute into account
     using signedsize = std::make_signed<std::size_t>::type;
+
     for (std::size_t batch = 0; batch < dims[0]; ++batch) {
         for (std::size_t ch = 0; ch < dims[1]; ++ch) {
-            const std::size_t oIndex = (ch + batch*dims[1]) * oxSize * oySize;
-            const std::size_t iIndex = (ch + batch*dims[1]) * dims[2] * dims[3];
-            std::fill(output + oIndex, output+(oIndex+oxSize*oySize), 0);
+            const std::size_t oIndex = (ch + batch * dims[1]) * oxSize * oySize;
+            const std::size_t iIndex = (ch + batch * dims[1]) * dims[2] * dims[3];
+            std::fill(output + oIndex, output + (oIndex + oxSize * oySize), 0);
+
             for (std::size_t ox = 0; ox < oxSize; ++ox) {
-                const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]);
-                const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
-                const std::size_t sxMax = (static_cast<signedsize>(dims[2]) + difx) < 0 ? 0 : ((dims[2] + difx) > kernelDims[0] ? kernelDims[0] : dims[2] + difx);
+                const signedsize startx = static_cast<signedsize>(ox * strideDims[0]) - (dilations[0] - 1);
+                const std::size_t sxMin = static_cast<std::size_t>(std::max(startx, signedsize(0)));
+                const std::size_t sxMax = std::min(dims[2], static_cast<std::size_t>(startx + kernelDims[0] * dilations[0]));
+
                 for (std::size_t oy = 0; oy < oySize; ++oy) {
-                    const signedsize dify = static_cast<signedsize>(- oy * strideDims[1]);
-                    const std::size_t syMin = static_cast<std::size_t>(std::max(dify, signedsize(0)));
-                    const std::size_t syMax = (static_cast<signedsize>(dims[3]) + dify) < 0 ? 0 : ((dims[3] + dify) > kernelDims[1] ? kernelDims[1] : dims[3] + dify);
-                    const std::size_t oIndexFull = oIndex + ox*oySize + oy;
-                    const std::size_t ix = ox * strideDims[0];
-                    const std::size_t iy = oy * strideDims[1];
+                    const signedsize starty = static_cast<signedsize>(oy * strideDims[1]) - (dilations[1] - 1);
+                    const std::size_t syMin = static_cast<std::size_t>(std::max(starty, signedsize(0)));
+                    const std::size_t syMax = std::min(dims[3], static_cast<std::size_t>(starty + kernelDims[1] * dilations[1]));
 
-                    if (sxMin == 0 && syMin == 0 && sxMax == 3 && syMax == 3) {
-                        output[oIndexFull] += static_cast<O>(
-                                               input[iIndex + (ix+0)*dims[3] + (iy+0)] +
-                                               input[iIndex + (ix+0)*dims[3] + (iy+1)] +
-                                               input[iIndex + (ix+0)*dims[3] + (iy+2)] +
-                                               input[iIndex + (ix+1)*dims[3] + (iy+0)] +
-                                               input[iIndex + (ix+1)*dims[3] + (iy+1)] +
-                                               input[iIndex + (ix+1)*dims[3] + (iy+2)] +
-                                               input[iIndex + (ix+2)*dims[3] + (iy+0)] +
-                                               input[iIndex + (ix+2)*dims[3] + (iy+1)] +
-                                               input[iIndex + (ix+2)*dims[3] + (iy+2)]) / O(9);
-                    } else {
-                        for (std::size_t sx = sxMin; sx < sxMax; ++sx) {
-                            for (std::size_t sy = syMin; sy < syMax; ++sy) {
-                                output[oIndexFull] += input[iIndex + (ix+sx)*dims[3] + (iy+sy)];
-                            }
+                    const std::size_t oIndexFull = oIndex + ox * oySize + oy;
+                    O sum = static_cast<O>(0);
+                    std::size_t count = 0;
+
+                    for (std::size_t sx = sxMin; sx < sxMax; sx += dilations[0]) {
+                        for (std::size_t sy = syMin; sy < syMax; sy += dilations[1]) {
+                            sum += static_cast<O>(input[iIndex + sx * dims[3] + sy]);
+                            ++count;
                         }
-                        // padding not used
-                        output[oIndexFull] /= (sxMax - sxMin) * (syMax - syMin);
                     }
+
+                    output[oIndexFull] = sum / static_cast<O>(count);
                 }
             }
         }
diff --git a/src/operator/AvgPoolingImpl.cpp b/src/operator/AvgPoolingImpl.cpp
index 01a5e8cf..eb5ef87b 100644
--- a/src/operator/AvgPoolingImpl.cpp
+++ b/src/operator/AvgPoolingImpl.cpp
@@ -32,7 +32,9 @@ void Aidge::AvgPoolingImpl2D_cpu::forward() {
     // Call kernel
     impl.forward(op_.strideDims(),
                op_.kernelDims(),
+               op_.dilations(),
                op_.getInput(0)->template dims<4>(),
+               op_.ceilMode(),
                getCPUPtr(op_.getInput(0)),
                getCPUPtr(op_.getOutput(0)));
 }
diff --git a/unit_tests/operator/Test_AndImpl.cpp b/unit_tests/operator/Test_AndImpl.cpp
index 978a89e5..148298d5 100644
--- a/unit_tests/operator/Test_AndImpl.cpp
+++ b/unit_tests/operator/Test_AndImpl.cpp
@@ -160,28 +160,6 @@ TEST_CASE("[cpu/operator] And(forward)", "[And][CPU]") {
     }
 
     SECTION("Broadcasting") {
-<<<<<<< HEAD
-        std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<int,1,3,3,2> {
-        {                                       //
-            {                                   //
-                {{10, 20},{22, 23},{20, 20}},   //
-                {{10, 15},{10, 29},{20, 20}},   //
-                {{26, 25},{33, 20},{10, 20}}    //
-            }                                   //
-        }                                       //
-        });                                     //
-
-        std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<int,2> {{10, 20}});
-        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,1,3,3,2> {
-            {                                   //
-                {                               //
-                    {{ 1, 1},{ 0, 0},{ 0, 1}},  //
-                    {{ 1, 0},{ 1, 0},{ 0, 1}},  //
-                    {{ 0, 0},{ 0, 1},{ 1, 1}}   //
-                }                               //
-            }                                   //
-        });                                     //
-=======
         std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<float, 1, 2, 2, 2>{
             {
                 {{{1, 0}, {1, 0}},
@@ -193,7 +171,6 @@ TEST_CASE("[cpu/operator] And(forward)", "[And][CPU]") {
                 {{{1, 0}, {1, 0}},
                 {{1, 0}, {0, 0}}}}
             });
->>>>>>> fix and kernel and unit tests
 
         std::shared_ptr<Node> myAnd = And();
         auto op = std::static_pointer_cast<OperatorTensor>(myAnd->getOperator());
-- 
GitLab
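
The rewritten AvgPooling kernel sizes its output from the effective (dilated) kernel extent (k - 1) * d + 1. A quick standalone check of that formula against the Dilations test added in this patch (helper name is illustrative; unsigned integer division already floors):

    #include <cassert>
    #include <cstddef>

    // Floor-mode pooled output length with dilation d:
    // out = floor((in - ((k - 1) * d + 1)) / s) + 1.
    std::size_t pooledOutFloor(std::size_t in, std::size_t k,
                               std::size_t s, std::size_t d) {
        const std::size_t eff = (k - 1) * d + 1;
        return (in - eff) / s + 1; // assumes in >= eff
    }

    int main() {
        // Dilations test above: 5x5 input, 2x2 kernel, stride 1,
        // dilation 2 -> 3x3 output.
        assert(pooledOutFloor(5, 2, 1, 2) == 3);
        return 0;
    }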


From a814fc02df5cb6f5850a99a350d2e6c986da5838 Mon Sep 17 00:00:00 2001
From: hrouis <houssemeddine.rouis92@gmail.com>
Date: Mon, 3 Feb 2025 10:11:02 +0100
Subject: [PATCH 029/108] handle ceil_mode in pooling kernels

---
 .../cpu/operator/AvgPoolingImpl_kernels.hpp   | 56 ++++++++++++-------
 .../cpu/operator/MaxPoolingImpl_kernels.hpp   | 20 ++++---
 unit_tests/operator/Test_AvgPoolingImpl.cpp   | 35 +++++++++++-
 3 files changed, 82 insertions(+), 29 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
index 68dbfbe7..78f8446a 100644
--- a/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
@@ -43,15 +43,20 @@ void AvgPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideD
     const I *input = static_cast<const I *>(input_);
     O *output = static_cast<O *>(output_);
 
-    // Calculate output dimensions based on ceilMode and dilations
-    auto compute_output_size = [&](DimSize_t inputDim, DimSize_t kernelDim, DimSize_t stride, DimSize_t dilation) {
-        DimSize_t effectiveKernelDim = (kernelDim - 1) * dilation + 1;
-        float result = static_cast<float>(inputDim - effectiveKernelDim + stride) / static_cast<float>(stride);
-        return ceilMode ? static_cast<DimSize_t>(std::ceil(result)) : static_cast<DimSize_t>(std::floor(result));
-    };
-
-    const std::size_t oxSize = compute_output_size(dims[2], kernelDims[0], strideDims[0], dilations[0]);
-    const std::size_t oySize = compute_output_size(dims[3], kernelDims[1], strideDims[1], dilations[1]);
+    // output H size
+    const std::size_t oxSize = 
+        ceilMode 
+        ? static_cast<std::size_t>(std::ceil(static_cast<float>(dims[2] - (kernelDims[0] - 1) * dilations[0] - 1 + strideDims[0]) /
+                                            static_cast<float>(strideDims[0])))
+        : static_cast<std::size_t>(std::floor(static_cast<float>(dims[2] - (kernelDims[0] - 1) * dilations[0] - 1 + strideDims[0]) /
+                                            static_cast<float>(strideDims[0])));
+    // output W size
+    const std::size_t oySize = 
+        ceilMode 
+        ? static_cast<std::size_t>(std::ceil(static_cast<float>(dims[3] - (kernelDims[1] - 1) * dilations[1] - 1 + strideDims[1]) /
+                                            static_cast<float>(strideDims[1])))
+        : static_cast<std::size_t>(std::floor(static_cast<float>(dims[3] - (kernelDims[1] - 1) * dilations[1] - 1 + strideDims[1]) /
+                                            static_cast<float>(strideDims[1])));
 
     using signedsize = std::make_signed<std::size_t>::type;
 
@@ -59,30 +64,39 @@ void AvgPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideD
         for (std::size_t ch = 0; ch < dims[1]; ++ch) {
             const std::size_t oIndex = (ch + batch * dims[1]) * oxSize * oySize;
             const std::size_t iIndex = (ch + batch * dims[1]) * dims[2] * dims[3];
-            std::fill(output + oIndex, output + (oIndex + oxSize * oySize), 0);
 
             for (std::size_t ox = 0; ox < oxSize; ++ox) {
-                const signedsize startx = static_cast<signedsize>(ox * strideDims[0]) - (dilations[0] - 1);
-                const std::size_t sxMin = static_cast<std::size_t>(std::max(startx, signedsize(0)));
-                const std::size_t sxMax = std::min(dims[2], static_cast<std::size_t>(startx + kernelDims[0] * dilations[0]));
+                const signedsize difx = static_cast<signedsize>(-ox * strideDims[0]);
+                const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
+                const std::size_t sxMax = (static_cast<signedsize>(dims[2]) + difx) < 0 ? 0 : ((dims[2] + difx) > kernelDims[0] ? kernelDims[0] : dims[2] + difx);
 
                 for (std::size_t oy = 0; oy < oySize; ++oy) {
-                    const signedsize starty = static_cast<signedsize>(oy * strideDims[1]) - (dilations[1] - 1);
-                    const std::size_t syMin = static_cast<std::size_t>(std::max(starty, signedsize(0)));
-                    const std::size_t syMax = std::min(dims[3], static_cast<std::size_t>(starty + kernelDims[1] * dilations[1]));
+                    const signedsize dify = static_cast<signedsize>(-oy * strideDims[1]);
+                    const std::size_t syMin = static_cast<std::size_t>(std::max(dify, signedsize(0)));
+                    const std::size_t syMax = (static_cast<signedsize>(dims[3]) + dify) < 0 ? 0 : ((dims[3] + dify) > kernelDims[1] ? kernelDims[1] : dims[3] + dify);
 
                     const std::size_t oIndexFull = oIndex + ox * oySize + oy;
+                    const std::size_t ix = ox * strideDims[0];
+                    const std::size_t iy = oy * strideDims[1];
+
                     O sum = static_cast<O>(0);
                     std::size_t count = 0;
 
-                    for (std::size_t sx = sxMin; sx < sxMax; sx += dilations[0]) {
-                        for (std::size_t sy = syMin; sy < syMax; sy += dilations[1]) {
-                            sum += static_cast<O>(input[iIndex + sx * dims[3] + sy]);
-                            ++count;
+                    for (unsigned int sy = syMin; sy < syMax; ++sy) {
+                        for (unsigned int sx = sxMin; sx < sxMax; ++sx) {
+                            // Apply dilation factor
+                            const std::size_t dilated_sx = sx * dilations[0];
+                            const std::size_t dilated_sy = sy * dilations[1];
+
+                            // Ensure within bounds
+                            if ((ix + dilated_sx) < dims[2] && (iy + dilated_sy) < dims[3]) {
+                                sum += static_cast<O>(input[iIndex + (ix + dilated_sx) * dims[3] + (iy + dilated_sy)]);
+                                ++count;
+                            }
                         }
                     }
 
-                    output[oIndexFull] = sum / static_cast<O>(count);
+                    output[oIndexFull] = count > 0 ? sum / static_cast<O>(count) : 0;
                 }
             }
         }
diff --git a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
index 250b11b0..d5ac02fe 100644
--- a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
@@ -36,7 +36,7 @@ template <class I, class O>
 void MaxPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
                                         const std::array<DimSize_t, 2>& kernelDims,
                                         const std::array<DimSize_t, 2>& dilations,
-                                        const bool /*ceilMode*/,
+                                        const bool ceilMode,
                                         const std::array<DimSize_t, 4> &dims,
                                         const void *input_,
                                         void *output_) {
@@ -44,13 +44,19 @@ void MaxPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideD
     O *output = static_cast<O *>(output_);
 
     // output H size
-    const std::size_t oxSize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(dims[2] - (kernelDims[0] - 1) * dilations[0] - 1 + strideDims[0]) /
-                                static_cast<float>(strideDims[0])));
+    const std::size_t oxSize = 
+        ceilMode 
+        ? static_cast<std::size_t>(std::ceil(static_cast<float>(dims[2] - (kernelDims[0] - 1) * dilations[0] - 1 + strideDims[0]) /
+                                            static_cast<float>(strideDims[0])))
+        : static_cast<std::size_t>(std::floor(static_cast<float>(dims[2] - (kernelDims[0] - 1) * dilations[0] - 1 + strideDims[0]) /
+                                            static_cast<float>(strideDims[0])));
     // output W size
-    const std::size_t oySize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(dims[3] - (kernelDims[1] - 1) * dilations[1] - 1 + strideDims[1]) /
-                                static_cast<float>(strideDims[1])));
+    const std::size_t oySize = 
+        ceilMode 
+        ? static_cast<std::size_t>(std::ceil(static_cast<float>(dims[3] - (kernelDims[1] - 1) * dilations[1] - 1 + strideDims[1]) /
+                                            static_cast<float>(strideDims[1])))
+        : static_cast<std::size_t>(std::floor(static_cast<float>(dims[3] - (kernelDims[1] - 1) * dilations[1] - 1 + strideDims[1]) /
+                                            static_cast<float>(strideDims[1])));
 
     using signedsize = std::make_signed<std::size_t>::type;
     for (std::size_t batch = 0; batch < dims[0]; ++batch) {
diff --git a/unit_tests/operator/Test_AvgPoolingImpl.cpp b/unit_tests/operator/Test_AvgPoolingImpl.cpp
index 372febc6..21a7a680 100644
--- a/unit_tests/operator/Test_AvgPoolingImpl.cpp
+++ b/unit_tests/operator/Test_AvgPoolingImpl.cpp
@@ -110,5 +110,38 @@ TEST_CASE("[cpu/operator] AvgPooling(forward)", "[AvgPooling][CPU]") {
             REQUIRE(std::abs(outPtr[i] - expectedOutPtr[i]) < 0.00001);
         }
     }
-    // std::cout << static_cast<Tensor>((*op)["weight"])[0][0][0][0] << std::endl;
+    SECTION("Dilations") {
+        std::shared_ptr<Tensor> myInput3 = std::make_shared<Tensor>(Array4D<float,1,1,5,5> { // NCHW
+        {
+            {
+                {{ 1,  2,  3,  4,  5},
+                { 6,  7,  8,  9, 10},
+                {11, 12, 13, 14, 15},
+                {16, 17, 18, 19, 20},
+                {21, 22, 23, 24, 25}}
+            }
+        }
+        });
+
+        // Dilation of 2 means we take every second element in the window
+        std::shared_ptr<Node> myAvgPool = AvgPooling({2,2}, "mycdw", {1,1}, {2,2}); 
+        auto op = std::static_pointer_cast<AvgPooling_Op<2>>(myAvgPool -> getOperator());
+
+        std::shared_ptr<Tensor> myOutput3 = std::make_shared<Tensor>(Array4D<float,1,1,3,3> {
+            {
+                {
+                    {{  7,  8,  9},
+                    { 12, 13, 14},
+                    { 17, 18, 19}}
+                }
+            }
+        });
+
+        op->associateInput(0, myInput3);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cpu");
+        myAvgPool->forward();
+        op->getOutput(0)->print();
+        REQUIRE(*(op->getOutput(0)) == *myOutput3);
+    }
 }
\ No newline at end of file
-- 
GitLab
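
Alongside the ceil-mode output sizes, the AvgPooling kernel above now guards its divisor: only taps that land in bounds are summed, and a window that samples nothing yields 0 rather than dividing by zero. A simplified 1-D sketch of that sum/count scheme (illustrative, not the Aidge signature):

    #include <cstddef>

    // Mean over a dilated 1-D window, dividing by the number of taps
    // that actually landed in bounds (0 if none did).
    template <class T>
    T dilatedWindowMean(const T* in, std::size_t len,
                        std::size_t start, std::size_t k, std::size_t d) {
        T sum = T(0);
        std::size_t count = 0;
        for (std::size_t i = 0; i < k; ++i) {
            const std::size_t pos = start + i * d;
            if (pos < len) { sum += in[pos]; ++count; }
        }
        return count > 0 ? sum / static_cast<T>(count) : T(0);
    }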


From 54988d11af0e7e7dfb3ea8c56bc54eb03216bda0 Mon Sep 17 00:00:00 2001
From: hrouis <houssemeddine.rouis92@gmail.com>
Date: Mon, 3 Feb 2025 15:09:26 +0100
Subject: [PATCH 030/108] add ceil_mode tests for Avg and Max Pooling

---
 .../cpu/operator/MaxPoolingImpl_kernels.hpp   |  1 +
 unit_tests/operator/Test_AvgPoolingImpl.cpp   | 57 +++++++++++++++++++
 unit_tests/operator/Test_MaxPoolingImpl.cpp   | 57 +++++++++++++++++++
 3 files changed, 115 insertions(+)

diff --git a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
index d5ac02fe..027fc02a 100644
--- a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
@@ -16,6 +16,7 @@
 #include <cmath>
 #include <tuple>
 
+
 #include "aidge/backend/cpu/operator/MaxPoolingImpl.hpp"
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
 #include "aidge/data/Data.hpp"
diff --git a/unit_tests/operator/Test_AvgPoolingImpl.cpp b/unit_tests/operator/Test_AvgPoolingImpl.cpp
index 21a7a680..f116934c 100644
--- a/unit_tests/operator/Test_AvgPoolingImpl.cpp
+++ b/unit_tests/operator/Test_AvgPoolingImpl.cpp
@@ -144,4 +144,61 @@ TEST_CASE("[cpu/operator] AvgPooling(forward)", "[AvgPooling][CPU]") {
         op->getOutput(0)->print();
         REQUIRE(*(op->getOutput(0)) == *myOutput3);
     }
+    SECTION("Ceil Mode") {
+        std::shared_ptr<Tensor> myInput4 = std::make_shared<Tensor>(Array4D<float,1,1,5,5> { // NCHW
+        {
+            {
+                {
+                    { 1,  2,  3,  4,  5},
+                    { 6,  7,  8,  9, 10},
+                    {11, 12, 13, 14, 15},
+                    {16, 17, 18, 19, 20},
+                    {21, 22, 23, 24, 25}
+                }
+            }
+        }
+        });
+
+        // AvgPool with ceil_mode = true
+        std::shared_ptr<Node> myAvgPool1 = AvgPooling({2,2}, "mycdw", {2,2}, {1,1}, true);
+        auto op1 = std::static_pointer_cast<AvgPooling_Op<2>>(myAvgPool1 -> getOperator());
+
+        std::shared_ptr<Tensor> myOutput4 = std::make_shared<Tensor>(Array4D<float,1,1,3,3> {
+            {
+                {
+                    {
+                        {  4.0,  6.0,  7.5 },
+                        { 14.0, 16.0, 17.5 },
+                        { 21.5, 23.5, 25.0 }
+                    }
+                }
+            }
+        });
+        op1->associateInput(0, myInput4);
+        op1->setDataType(DataType::Float32);
+        op1->setBackend("cpu");
+        myAvgPool1->forward();
+        op1->getOutput(0)->print();
+        REQUIRE(*(op1->getOutput(0)) == *myOutput4);
+
+        // AvgPool with ceil_mode = false
+        std::shared_ptr<Node> myAvgPool2 = AvgPooling({2,2}, "mycdw", {2,2}, {1,1}, false);
+        auto op2 = std::static_pointer_cast<AvgPooling_Op<2>>(myAvgPool2 -> getOperator());
+        std::shared_ptr<Tensor> myOutput5 = std::make_shared<Tensor>(Array4D<float,1,1,2,2> {
+            {
+                {
+                    {
+                        {  4.0,  6.0 },
+                        { 14.0, 16.0 }
+                    }
+                }
+            }
+        });
+        op2->associateInput(0, myInput4);
+        op2->setDataType(DataType::Float32);
+        op2->setBackend("cpu");
+        myAvgPool2->forward();
+        op2->getOutput(0)->print();
+        REQUIRE(*(op2->getOutput(0)) == *myOutput5);
+    }
 }
\ No newline at end of file
diff --git a/unit_tests/operator/Test_MaxPoolingImpl.cpp b/unit_tests/operator/Test_MaxPoolingImpl.cpp
index 6b7e6d2f..d480fc30 100644
--- a/unit_tests/operator/Test_MaxPoolingImpl.cpp
+++ b/unit_tests/operator/Test_MaxPoolingImpl.cpp
@@ -115,4 +115,61 @@ TEST_CASE("[cpu/operator] MaxPooling(forward)", "[MaxPooling][CPU]") {
         op->getOutput(0)->print();
         REQUIRE(*(op->getOutput(0)) == *myOutput);
     }
+    SECTION("Ceil Mode") {
+        std::shared_ptr<Tensor> myInput4 = std::make_shared<Tensor>(Array4D<float,1,1,5,5> { // NCHW
+        {
+            {
+                {
+                    { 1,  2,  3,  4,  5},
+                    { 6,  7,  8,  9, 10},
+                    {11, 12, 13, 14, 15},
+                    {16, 17, 18, 19, 20},
+                    {21, 22, 23, 24, 25}
+                }
+            }
+        }
+        });
+
+        // MaxPool with ceil_mode = true
+        std::shared_ptr<Node> myMaxPool1 = MaxPooling({2,2}, "mycdw", {2,2}, {1,1}, true);
+        auto op1 = std::static_pointer_cast<OperatorTensor>(myMaxPool1 -> getOperator());
+
+        std::shared_ptr<Tensor> myOutput4 = std::make_shared<Tensor>(Array4D<float,1,1,3,3> {
+            {
+                {
+                    {
+                        {  7.0,  9.0, 10.0 },
+                        { 17.0, 19.0, 20.0 },
+                        { 22.0, 24.0, 25.0 }
+                    }
+                }
+            }
+        });
+        op1->associateInput(0, myInput4);
+        op1->setDataType(DataType::Float32);
+        op1->setBackend("cpu");
+        myMaxPool1->forward();
+        op1->getOutput(0)->print();
+        REQUIRE(*(op1->getOutput(0)) == *myOutput4);
+
+        // MaxPool with ceil_mode = false
+        std::shared_ptr<Node> myMaxPool2 = MaxPooling({2,2}, "mycdw", {2,2}, {1,1}, false);
+        auto op2 = std::static_pointer_cast<OperatorTensor>(myMaxPool2 -> getOperator());
+        std::shared_ptr<Tensor> myOutput5 = std::make_shared<Tensor>(Array4D<float,1,1,2,2> {
+            {
+                {
+                    {
+                        {  7.0,  9.0 },
+                        { 17.0, 19.0 }
+                    }
+                }
+            }
+        });
+        op2->associateInput(0, myInput4);
+        op2->setDataType(DataType::Float32);
+        op2->setBackend("cpu");
+        myMaxPool2->forward();
+        op2->getOutput(0)->print();
+        REQUIRE(*(op2->getOutput(0)) == *myOutput5);
+    }
 }
\ No newline at end of file
-- 
GitLab

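Note on the ceil_mode arithmetic exercised by the two tests above: the flag
only changes how the output spatial extent is derived from the input size,
kernel size, and stride. A minimal sketch, assuming the usual ONNX-style
pooling formula (which matches the 5x5 / 2x2 / stride-2 expectations in the
tests):

    #include <cmath>
    #include <cstddef>

    // Pooled extent along one spatial axis. ceil_mode = true keeps a final,
    // partially covered window; ceil_mode = false (floor) drops it.
    std::size_t pooledExtent(std::size_t in, std::size_t kernel,
                             std::size_t stride, bool ceilMode) {
        const double span = static_cast<double>(in - kernel) / stride;
        return static_cast<std::size_t>(ceilMode ? std::ceil(span)
                                                 : std::floor(span)) + 1;
    }

    // For the tests: pooledExtent(5, 2, 2, true)  == 3  ->  3x3 output
    //                pooledExtent(5, 2, 2, false) == 2  ->  2x2 output

For the partial windows kept by ceil_mode, the expected values above show
that AvgPooling averages only the elements actually inside the input (e.g.
the top-right window covers {5, 10}, giving 7.5), while MaxPooling simply
takes the max of the covered elements.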

From e2adb2e41735464c7cacf09c828e5974af92a166 Mon Sep 17 00:00:00 2001
From: hrouis <houssemeddine.rouis92@gmail.com>
Date: Wed, 19 Feb 2025 10:32:39 +0100
Subject: [PATCH 031/108] Separate forwardDims test section from forward section

---
 unit_tests/operator/Test_EqualImpl.cpp | 145 ++++++++++++-------------
 1 file changed, 72 insertions(+), 73 deletions(-)

diff --git a/unit_tests/operator/Test_EqualImpl.cpp b/unit_tests/operator/Test_EqualImpl.cpp
index a229b8ce..013e16eb 100644
--- a/unit_tests/operator/Test_EqualImpl.cpp
+++ b/unit_tests/operator/Test_EqualImpl.cpp
@@ -19,86 +19,85 @@
 
 using namespace Aidge;
 
-TEST_CASE("[cpu/operator] Equal(forward)", "[Equal][CPU]") {
-        SECTION("ForwardDims")
-    {
-        constexpr std::uint16_t NBTRIALS = 10;
-        // Create a random number generator
-        std::random_device rd;
-        std::mt19937 gen(rd());
-        std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0 and 1
-        std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
-        std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
-        std::uniform_int_distribution<int> boolDist(0,1);
-
-        SECTION("Same dimensions") {
-            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
-                DimSize_t nbDims = nbDimsDist(gen);
-                std::vector<DimSize_t> dims(nbDims);
-                for (std::size_t i = 0; i < nbDims; i++) {
-                    dims[i] = dimSizeDist(gen);
-                }
-
-                std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims);
-                myInput1->setBackend("cpu");
-                myInput1->setDataType(DataType::Float32);
-                myInput1->zeros();
-                std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims);
-                myInput2->setBackend("cpu");
-                myInput2->setDataType(DataType::Float32);
-                myInput2->zeros();
-                std::shared_ptr<Node> myEqual = Equal();
-                auto op = std::static_pointer_cast<OperatorTensor>(myEqual -> getOperator());
-                op->associateInput(0,myInput1);
-                op->associateInput(1,myInput2);
-                op->setDataType(DataType::Float32);
-                op->setBackend("cpu");
-                op->forwardDims();
-
-                const auto outputDims = op->getOutput(0)->dims();
-                REQUIRE(outputDims == dims);
+TEST_CASE("[cpu/operator] Equal(forwardDims)", "[Equal][CPU]") {
+    constexpr std::uint16_t NBTRIALS = 10;
+    // Create a random number generator
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0.1 and 1.1
+    std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
+    std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
+    std::uniform_int_distribution<int> boolDist(0,1);
+
+    SECTION("Same dimensions") {
+        for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+            DimSize_t nbDims = nbDimsDist(gen);
+            std::vector<DimSize_t> dims(nbDims);
+            for (std::size_t i = 0; i < nbDims; i++) {
+                dims[i] = dimSizeDist(gen);
             }
+
+            std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims);
+            myInput1->setBackend("cpu");
+            myInput1->setDataType(DataType::Float32);
+            myInput1->zeros();
+            std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims);
+            myInput2->setBackend("cpu");
+            myInput2->setDataType(DataType::Float32);
+            myInput2->zeros();
+            std::shared_ptr<Node> myEqual = Equal();
+            auto op = std::static_pointer_cast<OperatorTensor>(myEqual -> getOperator());
+            op->associateInput(0,myInput1);
+            op->associateInput(1,myInput2);
+            op->setDataType(DataType::Float32);
+            op->setBackend("cpu");
+            op->forwardDims();
+
+            const auto outputDims = op->getOutput(0)->dims();
+            REQUIRE(outputDims == dims);
         }
-        SECTION("Broadcasting") {
-            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
-                DimSize_t nbDims = nbDimsDist(gen);
-                std::vector<DimSize_t> dims1(nbDims, 1);
-                std::vector<DimSize_t> dims2(nbDims, 1);
-                std::vector<DimSize_t> expectedOutDims;
-                for (std::size_t i = 0; i < nbDims; i++) {
-                    DimSize_t dim = dimSizeDist(gen);
-                    if (boolDist(gen)) {
-                        dims1[i] = dim;
-                    }
-                    if (boolDist(gen)) {
-                        dims2[i] = dim;
-                    }
-                    expectedOutDims.push_back(std::max(dims1[i],dims2[i]));
+    }
+    SECTION("Broadcasting") {
+        for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+            DimSize_t nbDims = nbDimsDist(gen);
+            std::vector<DimSize_t> dims1(nbDims, 1);
+            std::vector<DimSize_t> dims2(nbDims, 1);
+            std::vector<DimSize_t> expectedOutDims;
+            for (std::size_t i = 0; i < nbDims; i++) {
+                DimSize_t dim = dimSizeDist(gen);
+                if (boolDist(gen)) {
+                    dims1[i] = dim;
+                }
+                if (boolDist(gen)) {
+                    dims2[i] = dim;
                 }
+                expectedOutDims.push_back(std::max(dims1[i],dims2[i]));
+            }
 
 
-                std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims1);
-                myInput1->setBackend("cpu");
-                myInput1->setDataType(DataType::Float32);
-                myInput1->zeros();
-                std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims2);
-                myInput2->setBackend("cpu");
-                myInput2->setDataType(DataType::Float32);
-                myInput2->zeros();
-                std::shared_ptr<Node> myEqual = Equal();
-                auto op = std::static_pointer_cast<OperatorTensor>(myEqual -> getOperator());
-                op->associateInput(0,myInput1);
-                op->associateInput(1,myInput2);
-                op->setDataType(DataType::Float32);
-                op->setBackend("cpu");
-
-                op->forwardDims();
-
-                const auto outputDims = op->getOutput(0)->dims();
-                REQUIRE(outputDims == expectedOutDims);
-            }
+            std::shared_ptr<Tensor> myInput1 = std::make_shared<Tensor>(dims1);
+            myInput1->setBackend("cpu");
+            myInput1->setDataType(DataType::Float32);
+            myInput1->zeros();
+            std::shared_ptr<Tensor> myInput2 = std::make_shared<Tensor>(dims2);
+            myInput2->setBackend("cpu");
+            myInput2->setDataType(DataType::Float32);
+            myInput2->zeros();
+            std::shared_ptr<Node> myEqual = Equal();
+            auto op = std::static_pointer_cast<OperatorTensor>(myEqual -> getOperator());
+            op->associateInput(0,myInput1);
+            op->associateInput(1,myInput2);
+            op->setDataType(DataType::Float32);
+            op->setBackend("cpu");
+
+            op->forwardDims();
+
+            const auto outputDims = op->getOutput(0)->dims();
+            REQUIRE(outputDims == expectedOutDims);
         }
     }
+}
+TEST_CASE("[cpu/operator] Equal(forward)", "[Equal][CPU]") {
     SECTION("Same size inputs") {
         std::shared_ptr<Tensor> input1 = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
         {                                       //
-- 
GitLab

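The "Broadcasting" section above relies on numpy-style shape broadcasting for
tensors of equal rank: a dimension of size 1 stretches to match the other
operand, so each output dimension is the maximum of the two input dimensions.
A minimal sketch of that rule, assuming both shapes already have the same
rank (which the test guarantees by construction):

    #include <algorithm>  // std::max
    #include <cstddef>
    #include <vector>

    // dims1 = {1, 5, 1}, dims2 = {4, 1, 3}  ->  out = {4, 5, 3}
    std::vector<std::size_t> broadcastDims(const std::vector<std::size_t>& a,
                                           const std::vector<std::size_t>& b) {
        std::vector<std::size_t> out(a.size());
        for (std::size_t i = 0; i < a.size(); ++i) {
            out[i] = std::max(a[i], b[i]);
        }
        return out;
    }

This is exactly the expectedOutDims.push_back(std::max(dims1[i], dims2[i]))
line of the test, factored out for clarity.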

From 94f05103807bb751069ac3b9c2867844245e146d Mon Sep 17 00:00:00 2001
From: hrouis <houssemeddine.rouis92@gmail.com>
Date: Thu, 20 Feb 2025 11:35:10 +0100
Subject: [PATCH 032/108] Remove unnecessary header in Equal tests

---
 unit_tests/operator/Test_EqualImpl.cpp | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/unit_tests/operator/Test_EqualImpl.cpp b/unit_tests/operator/Test_EqualImpl.cpp
index 013e16eb..bd9fa94f 100644
--- a/unit_tests/operator/Test_EqualImpl.cpp
+++ b/unit_tests/operator/Test_EqualImpl.cpp
@@ -15,8 +15,6 @@
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/Equal.hpp"
 
-#include "aidge/backend/cpu.hpp"
-
 using namespace Aidge;
 
 TEST_CASE("[cpu/operator] Equal(forwardDims)", "[Equal][CPU]") {
@@ -137,7 +135,7 @@ TEST_CASE("[cpu/operator] Equal(forward)", "[Equal][CPU]") {
                 }                                   //
             }                                       //
         });                                         //
-        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
+        Tensor expectedOutput = Tensor(Array4D<int,3,3,3,2> {
             {
                 {
                     {{1, 0},{0, 0},{1, 1}},
@@ -165,7 +163,7 @@ TEST_CASE("[cpu/operator] Equal(forward)", "[Equal][CPU]") {
         op->setDataType(DataType::Int32);
         myEqual->forward();
 
-        REQUIRE(*(op->getOutput(0)) == *expectedOutput);
+        REQUIRE(*(op->getOutput(0)) == expectedOutput);
     }
 
     SECTION("Broadcasting") {
@@ -180,7 +178,7 @@ TEST_CASE("[cpu/operator] Equal(forward)", "[Equal][CPU]") {
         });                                     //
 
         std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<int,2> {{10, 20}});  
-        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,1,3,3,2> {
+        Tensor expectedOutput = Tensor(Array4D<int,1,3,3,2> {
             {                                   //
                 {                               //
                     {{ 1, 1},{ 0, 0},{ 0, 1}},  //
@@ -198,7 +196,7 @@ TEST_CASE("[cpu/operator] Equal(forward)", "[Equal][CPU]") {
         op->setBackend("cpu");
         myEqual->forward();
         op->getOutput(0)->print();
-        expectedOutput->print();
-        REQUIRE(*op->getOutput(0) == *expectedOutput);
+
+        REQUIRE(*op->getOutput(0) == expectedOutput);
     }
 }
\ No newline at end of file
-- 
GitLab

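The switch from std::shared_ptr<Tensor> to a plain Tensor for the expected
outputs is purely an ownership simplification: a reference result that is
only compared against never needs shared ownership or a heap allocation.
A minimal sketch of the resulting pattern (values shortened for illustration):

    // Expected results can live on the stack as plain values...
    Tensor expected = Tensor(Array1D<int, 2>{{1, 0}});
    // ...while operator outputs remain shared_ptr<Tensor>, so dereference once:
    REQUIRE(*op->getOutput(0) == expected);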

From 128b735f5e2888d0597bfe1d66b1fa6aa80cad02 Mon Sep 17 00:00:00 2001
From: NAUD Maxence <maxence.naud@cea.fr>
Date: Mon, 24 Feb 2025 13:49:17 +0000
Subject: [PATCH 033/108] Fix some imports following aidge_core update

---
 src/operator/PadImpl.cpp       | 6 +++---
 src/operator/ReduceSumImpl.cpp | 7 +++++--
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/operator/PadImpl.cpp b/src/operator/PadImpl.cpp
index cdae21f8..9a54437f 100644
--- a/src/operator/PadImpl.cpp
+++ b/src/operator/PadImpl.cpp
@@ -9,14 +9,14 @@
  *
  ********************************************************************************/
 
+#include <cstddef>
 #include <vector>
 
-#include "aidge/utils/Types.h"
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
-#include "aidge/operator/Conv.hpp"
-
 #include "aidge/backend/cpu/operator/PadImpl.hpp"
 #include "aidge/backend/cpu/operator/PadImpl_kernels.hpp"
+#include "aidge/operator/Pad.hpp"
+#include "aidge/utils/Types.h"
 
 Aidge::Elts_t Aidge::Pad_ProdConso_cpu::getNbRequiredProtected(Aidge::IOIndex_t inputIdx) const {
     AIDGE_ASSERT(inputIdx == 0, "input index out of range."
diff --git a/src/operator/ReduceSumImpl.cpp b/src/operator/ReduceSumImpl.cpp
index aad08018..93a89a34 100644
--- a/src/operator/ReduceSumImpl.cpp
+++ b/src/operator/ReduceSumImpl.cpp
@@ -12,11 +12,14 @@
 #include "aidge/backend/cpu/operator/ReduceSumImpl.hpp"
 
 #include <memory>
+#include <stdexcept>
 #include <vector>
 
-#include "aidge/utils/Types.h"
-#include "aidge/operator/ReduceSum.hpp"
 #include "aidge/backend/cpu/operator/ReduceSumImpl_kernels.hpp"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/operator/ReduceSum.hpp"
+#include "aidge/utils/ErrorHandling.hpp"
+#include "aidge/utils/Types.h"
 
 template <>
 void Aidge::ReduceSumImpl_cpu::forward() {
-- 
GitLab


From 4fa8bf81f05ad7f5c9110522ea5131e4d06da3f4 Mon Sep 17 00:00:00 2001
From: Jerome Hue <jerome.hue@cea.fr>
Date: Thu, 6 Feb 2025 10:49:15 +0100
Subject: [PATCH 034/108] Implement backward function of Add operator

---
 .../aidge/backend/cpu/operator/AddImpl.hpp    |  14 +-
 .../backend/cpu/operator/AddImpl_kernels.hpp  |  64 +++-
 src/operator/AddImpl.cpp                      |  25 +-
 unit_tests/operator/Test_AddImpl.cpp          | 275 +++++++++++++++++-
 4 files changed, 368 insertions(+), 10 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/AddImpl.hpp b/include/aidge/backend/cpu/operator/AddImpl.hpp
index e39c35b4..ca04dff9 100644
--- a/include/aidge/backend/cpu/operator/AddImpl.hpp
+++ b/include/aidge/backend/cpu/operator/AddImpl.hpp
@@ -25,7 +25,19 @@
 namespace Aidge {
 // Operator implementation entry point for the backend
 using AddImpl_cpu = OperatorImpl_cpu<Add_Op,
-    void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*)>;
+    void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*),
+    void(const std::size_t, 
+         const std::size_t, 
+         const std::size_t, 
+         const std::vector<std::size_t>&, 
+         const std::vector<std::size_t>&, 
+         const std::vector<std::size_t>&, 
+         const void*, 
+         const void*, 
+         const void*, 
+         void*, 
+         void*)
+>;
 
 // Implementation entry point registration to Operator
 REGISTRAR(Add_Op, "cpu", Aidge::AddImpl_cpu::create);
diff --git a/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp
index e6d13fcf..d6fff9b5 100644
--- a/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp
@@ -147,25 +147,75 @@ void AddImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
     }
 }
 
+template <class I, class O>
+void AddImpl_cpu_backward_kernel(const std::size_t input0Length,
+                               const std::size_t input1Length,
+                               const std::size_t gradOutputLength,
+                               const std::vector<std::size_t>& dims0,
+                               const std::vector<std::size_t>& dims1,
+                               const std::vector<std::size_t>& outputDims,
+                               const void* input0_,
+                               const void* input1_,
+                               const void* grad_output_,
+                               void* gradientInput0_,
+                               void* gradientInput1_)
+{
+    // TODO: Remove input0/1 from the function
+    const I* input0 = static_cast<const I*>(input0_);
+    const I* input1 = static_cast<const I*>(input1_);
+    const O* gradOutput = static_cast<const O*>(grad_output_);
+    auto* gradInput0 = static_cast<I*>(gradientInput0_);
+    auto* gradInput1 = static_cast<I*>(gradientInput1_);
+
+    std::fill_n(gradInput0, input0Length, static_cast<I>(0));
+    std::fill_n(gradInput1, input1Length, static_cast<I>(0));
+
+    auto broadcastedDims0 = getBroadcastedDims(outputDims, dims0);
+    auto broadcastedDims1 = getBroadcastedDims(outputDims, dims1);
+
+    for (std::size_t i = 0; i < gradOutputLength; ++i) {
+        auto idxOutputGrad = getMultiDimIndices(outputDims, i);
+        std::vector<std::size_t> idxInput0(broadcastedDims0.size());
+        std::vector<std::size_t> idxInput1(broadcastedDims1.size());
+
+        for (std::size_t dimension = 0; dimension < broadcastedDims0.size(); ++dimension) {
+            idxInput0[dimension] = (broadcastedDims0[dimension] == 1) ? 0 : idxOutputGrad[dimension];
+        }
+
+        for (std::size_t dimension = 0; dimension < broadcastedDims1.size(); ++dimension) {
+            idxInput1[dimension] = (broadcastedDims1[dimension] == 1) ? 0 : idxOutputGrad[dimension];
+        }
+
+        auto idx0 = getFlattenedIndex(broadcastedDims0, idxInput0);
+        auto idx1 = getFlattenedIndex(broadcastedDims1, idxInput1);
+
+        // For addition: gradient of both inputs is just the output gradient
+        // (unlike multiplication where we need to multiply by the other input,
+        // or subtraction where we need to negate one of them)
+        gradInput0[idx0] += static_cast<I>(gradOutput[i]);
+        gradInput1[idx1] += static_cast<I>(gradOutput[i]);
+    }
+}
+
 // Kernels registration to implementation entry point
 REGISTRAR(AddImpl_cpu,
     {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float32}},
-    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<float, float>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<float, float>, Aidge::AddImpl_cpu_backward_kernel<float, float>});
 REGISTRAR(AddImpl_cpu,
     {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}},
-    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<double, double>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<double, double>, Aidge::AddImpl_cpu_backward_kernel<double, double>});
 REGISTRAR(AddImpl_cpu,
     {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int8}},
-    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int8_t, std::int8_t>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int8_t, std::int8_t>, Aidge::AddImpl_cpu_backward_kernel<std::int8_t, std::int8_t>});
 REGISTRAR(AddImpl_cpu,
     {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::UInt8}},
-    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::uint8_t, std::uint8_t>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::uint8_t, std::uint8_t>, Aidge::AddImpl_cpu_backward_kernel<std::uint8_t, std::uint8_t>});
 REGISTRAR(AddImpl_cpu,
     {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}},
-    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, Aidge::AddImpl_cpu_backward_kernel<std::int32_t, std::int32_t>});
 REGISTRAR(AddImpl_cpu,
     {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int64}},
-    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::AddImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, Aidge::AddImpl_cpu_backward_kernel<std::int64_t, std::int64_t>});
 }  // namespace Aidge
 
-#endif /* AIDGE_CPU_OPERATOR_ADDIMPL_CPU_KERNELS_H_ */
\ No newline at end of file
+#endif /* AIDGE_CPU_OPERATOR_ADDIMPL_CPU_KERNELS_H_ */
diff --git a/src/operator/AddImpl.cpp b/src/operator/AddImpl.cpp
index 101743ec..b027fb87 100644
--- a/src/operator/AddImpl.cpp
+++ b/src/operator/AddImpl.cpp
@@ -55,5 +55,28 @@ void  Aidge::AddImpl_cpu::forward() {
 
 template <>
 void Aidge::AddImpl_cpu::backward() {
-    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Add_Op on backend cpu");
+    const Add_Op& op_ = dynamic_cast<const Add_Op&>(mOp);
+
+    auto in0 = op_.getInput(0);
+    auto in1 = op_.getInput(1);
+    auto in0grad = op_.getInput(0)->grad();
+    auto in1grad = op_.getInput(1)->grad();
+    auto out0grad = op_.getOutput(0)->grad();
+
+    // Find the correct kernel type
+    const auto impl = Registrar<AddImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Call kernel
+    impl.backward(in0grad->size(),
+                  in1grad->size(),
+                  out0grad->size(),
+                  in0->dims(),
+                  in1->dims(),
+                  out0grad->dims(),
+                  getCPUPtr(in0),
+                  getCPUPtr(in1),
+                  getCPUPtr(out0grad),
+                  getCPUPtr(in0grad),
+                  getCPUPtr(in1grad));
+
 }
diff --git a/unit_tests/operator/Test_AddImpl.cpp b/unit_tests/operator/Test_AddImpl.cpp
index bff9629b..4538b322 100644
--- a/unit_tests/operator/Test_AddImpl.cpp
+++ b/unit_tests/operator/Test_AddImpl.cpp
@@ -10,6 +10,7 @@
  ********************************************************************************/
 
 #include <memory>
+#include <random>
 
 #include <catch2/catch_test_macros.hpp>
 
@@ -19,6 +20,7 @@
 #include "aidge/graph/Node.hpp"
 #include "aidge/operator/Add.hpp"
 #include "aidge/utils/ArrayHelpers.hpp"
+#include "aidge/utils/TensorUtils.hpp"
 
 using namespace Aidge;
 
@@ -139,4 +141,275 @@ TEST_CASE("[cpu/operator] Add(forward)", "[Add][CPU]") {
         Log::info("Expected Add_1 Tensor:\n{}", expectedOutput);
         REQUIRE(*op_1->getOutput(0) == expectedOutput);
     }
-}
\ No newline at end of file
+}
+
+TEST_CASE("[cpu/operator] Add(backward)", "[Add][CPU]") {
+    std::shared_ptr<Add_Op> op = std::make_shared<Add_Op>();
+    op->setDataType(DataType::Float32);
+    op->setBackend("cpu");
+
+    // NOTE: The first four tests use fixed values; the last one uses random values with static dimensions.
+
+    SECTION("Case 1: 1D and 2D Tensors") {
+        const auto T0 = std::make_shared<Tensor>(
+            Array2D<cpptype_t<DataType::Float32>, 2, 3>({{{1, 2, 3}, {4, 5, 6}}}));
+
+        const auto T1 =
+            std::make_shared<Tensor>(Array1D<cpptype_t<DataType::Float32>, 3>({0.1, 0.2, 0.3}));
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>(
+            Array2D<float, 2, 3>({{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}})));
+        op->forwardDims();
+
+        op->backward();
+
+        const Tensor expectedGrad0 =
+            Array2D<cpptype_t<DataType::Float32>, 2, 3>({{{1, 1, 1}, {1, 1, 1}}});
+
+        const Tensor expectedGrad1 = Array1D<cpptype_t<DataType::Float32>, 3>({2, 2, 2});
+
+
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1));
+    }
+
+    SECTION("Case 2: 3D and 1D tensors") {
+        const auto T0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+            {{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}},
+              {{7.0, 8.0, 9.0}, {10.0, 11.0, 12.0}}}}));
+
+        const auto T1 =
+            std::make_shared<Tensor>(Array1D<float, 3>({0.3, 0.2, 0.1}));
+
+        const auto newGrad = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+            {{{{1, 1, 1}, {1, 1, 1}}, {{1, 1, 1}, {1, 1, 1}}}}));
+
+        const Tensor expectedGrad0 =
+            Array3D<float, 2, 2, 3>({{{{1, 1, 1}, {1, 1, 1}},
+                                      {{1, 1, 1}, {1, 1, 1}}}});
+
+        const Tensor expectedGrad1 = Array1D<cpptype_t<DataType::Float32>, 3>({4, 4, 4});
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(newGrad);
+        op->forwardDims();
+        op->backward();
+
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1));
+    }
+
+    SECTION("Case 3: 4D and 2D tensors") {
+        const auto T0 = std::make_shared<Tensor>(Array4D<cpptype_t<DataType::Float32>, 2, 2, 3, 3>(
+            {{{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}, {7.0, 8.0, 9.0}},
+               {{10.0, 11.0, 12.0}, {13.0, 14.0, 15.0}, {16.0, 17.0, 18.0}}},
+              {{{19.0, 20.0, 21.0}, {22.0, 23.0, 24.0}, {25.0, 26.0, 27.0}},
+               {{28.0, 29.0, 30.0},
+                {31.0, 32.0, 33.0},
+                {34.0, 35.0, 36.0}}}}}));
+
+        const auto T1 = std::make_shared<Tensor>(Array2D<cpptype_t<DataType::Float32>, 3, 3>(
+            {{{0.5, 0.3, 0.1}, {0.4, 0.2, 0.6}, {0.7, 0.8, 0.9}}}));
+
+        const auto newGrad =
+            std::make_shared<Tensor>(Array4D<cpptype_t<DataType::Float32>, 2, 2, 3, 3>(
+                {{{{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}},
+                   {{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}},
+                  {{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}},
+                   {{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}}}}));
+
+        const Tensor expectedGrad0 =
+            Array4D<cpptype_t<DataType::Float32>, 2, 2, 3, 3>(
+                {{{{{1, 1, 1}, {1, 1, 1}, {1, 1, 1}},
+                   {{1, 1, 1}, {1, 1, 1}, {1, 1, 1}}},
+                  {{{1, 1, 1}, {1, 1, 1}, {1, 1, 1}},
+                   {{1, 1, 1}, {1, 1, 1}, {1, 1, 1}}}}});
+
+        const Tensor expectedGrad1 =
+            Array2D<cpptype_t<DataType::Float32>, 3, 3>({{
+                                   {4.0, 4.0, 4.0},
+                                   {4.0, 4.0, 4.0},
+                                   {4.0, 4.0, 4.0}}});
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(newGrad);
+        op->forwardDims();
+
+        op->backward();
+
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1));
+    }
+
+    SECTION("Case 4: 3D and 2D tensors") {
+        const auto T0 = std::make_shared<Tensor>(
+            Array3D<float, 2, 3, 4>({{{
+                                          {1.0, 2.0, 3.0, 4.0},
+                                          {5.0, 6.0, 7.0, 8.0},
+                                          {9.0, 10.0, 11.0, 12.0},
+                                      },
+                                      {
+                                          {13.0, 14.0, 15.0, 16.0},
+                                          {17.0, 18.0, 19.0, 20.0},
+                                          {21.0, 22.0, 23.0, 24.0},
+                                      }}}));
+
+        const auto T1 = std::make_shared<Tensor>(
+            Array2D<cpptype_t<DataType::Float32>, 3, 4>({{{0.1, 0.2, 0.3, 0.4},
+                                   {0.5, 0.6, 0.7, 0.8},
+                                   {0.9, 1.0, 1.1, 1.2}}}));
+
+        const auto newGrad = std::make_shared<Tensor>(
+            Array3D<cpptype_t<DataType::Float32>, 2, 3, 4>({{{
+                                          {1.0, 1.0, 1.0, 1.0},
+                                          {1.0, 1.0, 1.0, 1.0},
+                                          {1.0, 1.0, 1.0, 1.0},
+                                      },
+                                      {
+                                          {1.0, 1.0, 1.0, 1.0},
+                                          {1.0, 1.0, 1.0, 1.0},
+                                          {1.0, 1.0, 1.0, 1.0},
+                                      }}}));
+
+        const Tensor expectedGrad0 =
+            Array3D<cpptype_t<DataType::Float32>, 2, 3, 4>({{{{1, 1, 1, 1},
+                                       {1, 1, 1, 1},
+                                       {1, 1, 1, 1}},
+                                      {{1, 1, 1, 1},
+                                       {1, 1, 1, 1},
+                                       {1, 1, 1, 1}}}});
+
+        const Tensor expectedGrad1 =
+            Array2D<cpptype_t<DataType::Float32>, 3, 4>({{{2.0, 2.0, 2.0, 2.0},
+                                   {2.0, 2.0, 2.0, 2.0},
+                                   {2.0, 2.0, 2.0, 2.0}}});
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(newGrad);
+        op->forwardDims();
+
+        op->backward();
+
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
+        REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(1)->grad()), expectedGrad1));
+    }
+
+    SECTION("Case 5: Tensors with random values") {
+
+        // Use random values
+        const std::vector<std::size_t> dims0 = {5, 2, 1, 7}; // First tensor
+        const std::vector<std::size_t> dims1 = {2, 6, 7};    // Second tensor
+        const std::vector<std::size_t> outputDims = {5, 2, 6, 7};
+
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_real_distribution<float> dist(0.1f, 1.0f);
+
+        auto T0 = std::make_shared<Tensor>(dims0);
+        T0->setDataType(DataType::Float32);
+        T0->setBackend("cpu");
+        float* input0Data = static_cast<float*>(T0->getImpl()->rawPtr());
+        // Fill with random values
+        for (std::size_t i = 0; i < T0->size(); ++i) {
+            input0Data[i] = dist(gen);
+        }
+
+        auto T1 = std::make_shared<Tensor>(dims1);
+        T1->setDataType(DataType::Float32);
+        T1->setBackend("cpu");
+        float* input1Data = static_cast<float*>(T1->getImpl()->rawPtr());
+        // Fill with random values
+        for (std::size_t i = 0; i < T1->size(); ++i) {
+            input1Data[i] = dist(gen);
+        }
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+
+        op->forwardDims();
+        op->forward();
+
+        Tensor expectedOutput{outputDims};
+        expectedOutput.setBackend("cpu");
+        float* expectedOutputData = static_cast<float*>(expectedOutput.getImpl()->rawPtr());
+
+        for (std::size_t n = 0; n < 5; ++n) {
+            for (std::size_t c = 0; c < 2; ++c) {
+                for (std::size_t h = 0; h < 6; ++h) {
+                    for (std::size_t w = 0; w < 7; ++w) {
+                        std::size_t outIdx = w + 7 * (h + 6 * (c + 2 * n));
+                        std::size_t in0Idx =
+                            w + 7 * (0 + 1 * (c + 2 * n)); // middle dim is 1
+                        std::size_t in1Idx =
+                            w + 7 * (h + 6 * c);           // no n dimension
+
+                        expectedOutputData[outIdx] = input0Data[in0Idx] + input1Data[in1Idx];
+                    }
+                }
+            }
+        }
+
+        auto outputTensor = op->getOutput(0);
+
+        REQUIRE(approxEq<float>(*outputTensor, expectedOutput));
+
+        // Backward pass
+        std::vector<float> gradOutputData(expectedOutput.size());
+        for (auto &val : gradOutputData) {
+            val = dist(gen);
+        }
+
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>());
+        op->getOutput(0)->grad()->resize(outputDims);
+        op->getOutput(0)->grad()->getImpl()->setRawPtr(gradOutputData.data(),
+                                                       expectedOutput.size());
+
+        // Compute reference gradients
+        std::vector<float> expectedGrad0(T0->size(), 0.0f);
+        std::vector<float> expectedGrad1(T1->size(), 0.0f);
+
+        for (std::size_t n = 0; n < 5; ++n) {
+            for (std::size_t c = 0; c < 2; ++c) {
+                for (std::size_t h = 0; h < 6; ++h) {
+                    for (std::size_t w = 0; w < 7; ++w) {
+                        std::size_t outIdx = w + 7 * (h + 6 * (c + 2 * n));
+                        std::size_t in0Idx = w + 7 * (0 + 1 * (c + 2 * n));
+                        std::size_t in1Idx = w + 7 * (h + 6 * c);
+
+                        // Gradient for input0: just accumulate grad_output
+                        expectedGrad0[in0Idx] += gradOutputData[outIdx];
+
+                        // Gradient for input1: just accumulate grad_output
+                        expectedGrad1[in1Idx] += gradOutputData[outIdx];
+                    }
+                }
+            }
+        }
+
+        // Perform backward pass
+        op->backward();
+
+        auto expectedGrad0Tensor = std::make_shared<Tensor>();
+        expectedGrad0Tensor->resize(T0->dims());
+        expectedGrad0Tensor->setBackend("cpu");
+        expectedGrad0Tensor->setDataType(DataType::Float32);
+        expectedGrad0Tensor->getImpl()->setRawPtr(expectedGrad0.data(),
+                                                    expectedGrad0.size());
+
+        auto expectedGrad1Tensor = std::make_shared<Tensor>(T1->dims());
+        expectedGrad1Tensor->setBackend("cpu");
+        expectedGrad1Tensor->setDataType(DataType::Float32);
+        expectedGrad1Tensor->getImpl()->setRawPtr(expectedGrad1.data(),
+                                                    expectedGrad1.size());
+
+        // Verify backward pass
+        REQUIRE(approxEq<float>(*T0->grad(), *expectedGrad0Tensor));
+        REQUIRE(approxEq<float>(*T1->grad(), *expectedGrad1Tensor));
+    }
+}
+
-- 
GitLab

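The backward kernel introduced above implements the standard rule for a
broadcast addition: since d(x + y)/dx and d(x + y)/dy are both 1 elementwise,
each input's gradient is the output gradient accumulated over the axes that
were broadcast. A minimal sketch of that reduction for the 1-D case, matching
"Case 1" of the test (x: 2x3, y: 3, dL/dz all ones):

    #include <cstddef>
    #include <vector>

    // Reduce an output-shaped gradient onto a broadcast 1-D input: every row
    // reused y[j], so gradY[j] accumulates the whole column j of gradOut.
    std::vector<float> addGradY(const std::vector<float>& gradOut,
                                std::size_t rows, std::size_t cols) {
        std::vector<float> gradY(cols, 0.0f);
        for (std::size_t i = 0; i < rows; ++i) {
            for (std::size_t j = 0; j < cols; ++j) {
                gradY[j] += gradOut[i * cols + j];
            }
        }
        return gradY;
    }

    // rows = 2, cols = 3, gradOut all ones -> {2, 2, 2},
    // which is expectedGrad1 in "Case 1" above.

The kernel generalizes this with getMultiDimIndices / getFlattenedIndex so
the same accumulation works for arbitrary ranks and broadcast patterns.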

From 5c480cffebd20cd497476cef233c5f1eefef762b Mon Sep 17 00:00:00 2001
From: NAUD Maxence <maxence.naud@cea.fr>
Date: Wed, 26 Feb 2025 14:48:17 +0000
Subject: [PATCH 035/108] [upd] ConstantOfShape kernel to take Tensors as inputs
 and avoid redundant size computation

---
 .../cpu/operator/ConstantOfShapeImpl.hpp        |  8 +++-----
 .../operator/ConstantOfShapeImpl_kernels.hpp    | 17 ++++-------------
 src/operator/ConstantOfShapeImpl.cpp            |  9 +++------
 3 files changed, 10 insertions(+), 24 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp b/include/aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp
index 83e7e030..b595ec93 100644
--- a/include/aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp
+++ b/include/aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp
@@ -12,23 +12,21 @@
 #ifndef AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_H_
 #define AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_H_
 
-#include <cstddef>
 #include <memory>
-#include <vector>
 
 #include "aidge/backend/cpu/operator/OperatorImpl.hpp"
 #include "aidge/operator/ConstantOfShape.hpp"
 #include "aidge/utils/Registrar.hpp"
-#include "aidge/utils/Types.h"
 
 namespace Aidge {
+
+class Tensor;
 // Operator implementation entry point for the backend
 using ConstantOfShapeImpl_cpu = OperatorImpl_cpu<ConstantOfShape_Op,
-    void(const std::vector<DimSize_t>, const Tensor&, void *)>;
+    void(const std::shared_ptr<Tensor>&, const Tensor&)>;
 
 // Implementation entry point registration to Operator
 REGISTRAR(ConstantOfShape_Op, "cpu", Aidge::ConstantOfShapeImpl_cpu::create);
 } // namespace Aidge
 
 #endif /* _AIDGE_CPU_OPERATOR_CONSTANTOFSHAPEIMPL_H_ */
-
diff --git a/include/aidge/backend/cpu/operator/ConstantOfShapeImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConstantOfShapeImpl_kernels.hpp
index 18ab9c0a..c42cc76a 100644
--- a/include/aidge/backend/cpu/operator/ConstantOfShapeImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ConstantOfShapeImpl_kernels.hpp
@@ -30,20 +30,11 @@
 namespace Aidge {
 template <class O>
 void ConstantOfShapeimpl_cpu_forward_kernel(
-    const std::vector<DimSize_t> output_dims, const Tensor &value,
-    void *output_) {
+    const std::shared_ptr<Tensor>& output_, const Tensor &value) {
 
-  O *output = static_cast<O *>(output_);
-  O val;
-  std::copy(static_cast<O *>(value.getImpl()->hostPtr()),
-            static_cast<O *>(value.getImpl()->hostPtr()) +
-                static_cast<NbElts_t>(1),
-            &val);
-  const size_t output_size = std::accumulate(
-      output_dims.begin(), output_dims.end(), 1, std::multiplies<DimSize_t>());
-  for (size_t i = 0; i < output_size; ++i) {
-    output[i] = val;
-  }
+  O* output = static_cast<O*>(output_->getImpl()->hostPtr());
+  const O val = *reinterpret_cast<O*>(value.getImpl()->hostPtr());
+  std::fill_n(output, output_->size(), val);
 }
 
 // Kernels registration to implementation entry point
diff --git a/src/operator/ConstantOfShapeImpl.cpp b/src/operator/ConstantOfShapeImpl.cpp
index 16e4b762..1d41160b 100644
--- a/src/operator/ConstantOfShapeImpl.cpp
+++ b/src/operator/ConstantOfShapeImpl.cpp
@@ -13,15 +13,14 @@
 
 #include <functional>
 #include <memory>
-#include <vector>
+#include <stdexcept>   // std::runtime_error
 
 #include "aidge/backend/cpu/operator/ConstantOfShapeImpl_kernels.hpp"
-#include "aidge/data/Data.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/ConstantOfShape.hpp"
+#include "aidge/backend/OperatorImpl.hpp"  // Aidge::getBestMatch, Aidge::getRequiredSpec
 #include "aidge/utils/ErrorHandling.hpp"
 #include "aidge/utils/Registrar.hpp"
-#include "aidge/utils/Types.h"
 
 template <>
 void Aidge::ConstantOfShapeImpl_cpu::forward() {
@@ -33,9 +32,7 @@ void Aidge::ConstantOfShapeImpl_cpu::forward() {
     const auto impl = Registrar<ConstantOfShapeImpl_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Call kernel
-    impl.forward(op_.getOutput(0)->dims(),
-             op_.value(), 
-             op_.getOutput(0)->getImpl()->rawPtr());
+    impl.forward(op_.getOutput(0), op_.value());
 }
 
 template <>
-- 
GitLab

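The refactored kernel reads the fill value once through the typed host
pointer and writes it with std::fill_n, using the Tensor's already-computed
size() instead of re-accumulating the output dims on every call. For
reference, the operator's semantics in isolation: the input is a 1-D shape
tensor, and the output takes that shape with every element set to the value
attribute. A minimal standalone sketch (names hypothetical):

    #include <algorithm>  // std::fill_n
    #include <cstddef>
    #include <vector>

    std::vector<double> constantOfShape(const std::vector<std::size_t>& shape,
                                        double value) {
        std::size_t size = 1;
        for (std::size_t d : shape) {
            size *= d;  // the Tensor caches this product as size()
        }
        std::vector<double> out(size);
        std::fill_n(out.data(), size, value);  // same pattern as the kernel
        return out;
    }

    // constantOfShape({2, 3}, 5.0) -> six elements, all equal to 5.0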

From e8e3f535a7ffb953feb7d11a912dd0424ab6be4f Mon Sep 17 00:00:00 2001
From: NAUD Maxence <maxence.naud@cea.fr>
Date: Wed, 26 Feb 2025 14:51:38 +0000
Subject: [PATCH 036/108] [upd] tests following 'aidge_core' changes

---
 .../operator/Test_ConstantOfShapeImpl.cpp     | 139 +++++++++---------
 .../recipies/Test_FoldConstantOfShape.cpp     |  50 +++++++
 2 files changed, 119 insertions(+), 70 deletions(-)
 create mode 100644 unit_tests/recipies/Test_FoldConstantOfShape.cpp

diff --git a/unit_tests/operator/Test_ConstantOfShapeImpl.cpp b/unit_tests/operator/Test_ConstantOfShapeImpl.cpp
index 8ec1669b..6833d836 100644
--- a/unit_tests/operator/Test_ConstantOfShapeImpl.cpp
+++ b/unit_tests/operator/Test_ConstantOfShapeImpl.cpp
@@ -27,89 +27,88 @@
 #include "aidge/data/Tensor.hpp"
 #include "aidge/filler/Filler.hpp"
 #include "aidge/operator/ConstantOfShape.hpp"
-#include "aidge/operator/OperatorTensor.hpp"
 #include "aidge/utils/TensorUtils.hpp"
 #include "aidge/utils/Types.h"
 
 namespace Aidge {
-TEST_CASE("[cpu/operator] ConstantOfShape", "[ConstantOfShape][CPU]") {
-  constexpr std::uint16_t NBTRIALS = 10;
-  // Create a random number generator
-  auto random_seed = Catch::Generators::Detail::getSeed;
-  std::mt19937 gen(random_seed());
-  std::uniform_real_distribution<float> valueDist(
-      0.1f, 1.1f); // Random float distribution between 0 and 1
-  std::uniform_int_distribution<DimSize_t> input_tensor_size_dist(
-      std::size_t(1), std::size_t(10));
-  std::uniform_int_distribution<int64_t> input_tensor_values_dist(
-      std::size_t(1), std::size_t(7));
-  std::uniform_real_distribution<double> operator_attr_value_dist(-100., 100.);
 
-  ///////////////////////////////////////////////
-  // SETUP FUNCTIONS
-  auto generate_input_tensor =
-      [&gen, &input_tensor_size_dist,
-       &input_tensor_values_dist]() -> std::shared_ptr<Tensor> {
-    std::vector<DimSize_t> input_dims;
-    input_dims.push_back(input_tensor_size_dist(gen));
+TEST_CASE("[cpu/operator] ConstantOfShape(forward)", "[ConstantOfShape][CPU][forward]") {
+    constexpr std::uint16_t NBTRIALS = 10;
+    // Create a random number generator
+    auto random_seed = Catch::Generators::Detail::getSeed;
+    std::mt19937 gen(random_seed());
+    std::uniform_real_distribution<float> valueDist(
+            0.1f, 1.1f); // Random float distribution between 0.1 and 1.1
+    std::uniform_int_distribution<DimSize_t> input_tensor_size_dist(
+            std::size_t(1), std::size_t(10));
+    std::uniform_int_distribution<int64_t> input_tensor_values_dist(
+            std::size_t(1), std::size_t(7));
+    std::uniform_real_distribution<double> operator_attr_value_dist(-100., 100.);
 
-    auto result = std::make_shared<Tensor>(input_dims);
-    result->setDataType(DataType::Int64);
-    result->setBackend("cpu");
-    for (DimSize_t i = 0; i < result->size(); ++i) {
-      result->set<std::int64_t>(i, input_tensor_values_dist(gen));
-    }
-    return result;
-  };
+    ///////////////////////////////////////////////
+    // SETUP FUNCTIONS
+    auto generate_input_tensor =
+            [&gen, &input_tensor_size_dist,
+             &input_tensor_values_dist]() -> std::shared_ptr<Tensor> {
+        std::vector<DimSize_t> input_dims;
+        input_dims.push_back(input_tensor_size_dist(gen));
 
-  auto generate_random_operator =
-      [&gen,
-       &operator_attr_value_dist]() -> std::shared_ptr<ConstantOfShape_Op> {
-    auto node = ConstantOfShape(Tensor(operator_attr_value_dist(gen)));
-    auto op = std::static_pointer_cast<ConstantOfShape_Op>(node->getOperator());
-    op->setDataType(DataType::Float64);
-    op->setBackend("cpu");
-    return op;
-  };
+        auto result = std::make_shared<Tensor>(input_dims);
+        result->setDataType(DataType::Int64);
+        result->setBackend("cpu");
+        for (DimSize_t i = 0; i < result->size(); ++i) {
+            result->set<std::int64_t>(i, input_tensor_values_dist(gen));
+        }
+        return result;
+    };
 
-  auto generate_output_tensor = [](std::shared_ptr<Tensor> input_tensor,
-                                   std::shared_ptr<ConstantOfShape_Op> op) {
-    std::vector<DimSize_t> output_dims;
-    output_dims.reserve(input_tensor->size());
-    for (DimSize_t i = 0; i < input_tensor->size(); ++i) {
-      output_dims.push_back(input_tensor->get<int64_t>(i));
-    }
-    auto result = std::make_shared<Tensor>(output_dims);
-    result->setDataType(op->value().dataType());
-    result->setBackend("cpu");
-    constantFiller(result, op->value().get<double>(0));
-    return result;
-  };
+    auto generate_random_operator =
+            [&gen,
+             &operator_attr_value_dist]() -> std::shared_ptr<ConstantOfShape_Op> {
+        std::shared_ptr<ConstantOfShape_Op> op = std::make_shared<ConstantOfShape_Op>(Tensor(operator_attr_value_dist(gen)));
+        op->setDataType(DataType::Float64);
+        op->setBackend("cpu");
+        return op;
+    };
+
+    auto generate_output_tensor = [](std::shared_ptr<Tensor> input_tensor,
+                                      std::shared_ptr<ConstantOfShape_Op> op) {
+        std::vector<DimSize_t> output_dims;
+        output_dims.reserve(input_tensor->size());
+        for (DimSize_t i = 0; i < input_tensor->size(); ++i) {
+            output_dims.push_back(input_tensor->get<std::int64_t>(i));
+        }
+        auto result = std::make_shared<Tensor>(output_dims);
+        result->setDataType(op->value().dataType());
+        result->setBackend("cpu");
+        constantFiller(result, op->value().get<double>(0));
+        return result;
+    };
 
-  /////////////////////////////////////
-  // BENCHMARKING
-  std::chrono::time_point<std::chrono::system_clock> start;
-  std::chrono::time_point<std::chrono::system_clock> end;
-  std::chrono::duration<double, std::micro> duration{};
-  int number_of_operation{0};
+    /////////////////////////////////////
+    // BENCHMARKING
+    std::chrono::time_point<std::chrono::system_clock> start;
+    std::chrono::time_point<std::chrono::system_clock> end;
+    std::chrono::duration<double, std::micro> duration{};
+    int number_of_operation{0};
 
-  SECTION("ConstantOfShapeImpl_cpu::forward()") {
-    for (int i = 0; i < NBTRIALS; ++i) {
-      auto input_T = generate_input_tensor();
-      std::shared_ptr<ConstantOfShape_Op> op = generate_random_operator();
-      auto output_T = generate_output_tensor(input_T, op);
-      op->associateInput(0, input_T);
+    SECTION("ConstantOfShapeImpl_cpu::forward()") {
+        for (int i = 0; i < NBTRIALS; ++i) {
+            auto input_T = generate_input_tensor();
+            std::shared_ptr<ConstantOfShape_Op> op = generate_random_operator();
+            auto output_T = generate_output_tensor(input_T, op);
+            op->associateInput(0, input_T);
 
-      REQUIRE(op->forwardDims(true));
-      REQUIRE_NOTHROW(op->forward());
+            REQUIRE(op->forwardDims(true));
+            REQUIRE_NOTHROW(op->forward());
 
-      CHECK(output_T->nbDims() == op->getOutput(0)->nbDims());
-      for (DimIdx_t i = 0; i < output_T->nbDims(); ++i) {
-        CHECK(output_T->dims().at(i) == op->getOutput(0)->dims().at(i));
-      }
-      CHECK(approxEq<double>(*output_T, *op->getOutput(0)));
+            CHECK(output_T->nbDims() == op->getOutput(0)->nbDims());
+            for (DimIdx_t i = 0; i < output_T->nbDims(); ++i) {
+                CHECK(output_T->dims().at(i) == op->getOutput(0)->dims().at(i));
+            }
+            CHECK(approxEq<double>(*output_T, *op->getOutput(0)));
+        }
     }
-  }
 }
 } // namespace Aidge
 
diff --git a/unit_tests/recipies/Test_FoldConstantOfShape.cpp b/unit_tests/recipies/Test_FoldConstantOfShape.cpp
new file mode 100644
index 00000000..a1c09b15
--- /dev/null
+++ b/unit_tests/recipies/Test_FoldConstantOfShape.cpp
@@ -0,0 +1,50 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+ #include "aidge/graph/GraphView.hpp"
+ #include "aidge/operator/Identity.hpp"
+ #include "aidge/recipes/Recipes.hpp"
+
+ #include <cstdint>  // std::int64_t
+ #include <memory>
+
+ #include <catch2/catch_test_macros.hpp>
+
+ #include "aidge/graph/OpArgs.hpp"
+ #include "aidge/operator/ConstantOfShape.hpp"
+ #include "aidge/operator/Conv.hpp"
+ #include "aidge/operator/Producer.hpp"
+ #include "aidge/operator/ReLU.hpp"
+ #include "aidge/recipes/Recipes.hpp"
+ #include "aidge/utils/ArrayHelpers.hpp"
+ #include "aidge/utils/Types.h"
+
+ namespace Aidge {
+
+ TEST_CASE("[cpu/recipes] foldConstantOfShape",
+           "[ConstantOfShape][foldConstantOfShape][recipes]") {
+   auto input_T = std::make_shared<Tensor>(Array1D<std::int64_t, 4>({1, 1, 3, 3}));
+
+   auto model = std::make_shared<GraphView>();
+   SECTION("Sequential model") {
+     model = Sequential({
+         Producer(input_T, "prod_0", true),
+         ConstantOfShape(3, "constantOfShape_0"),
+         Conv(1, 1, {3, 3}, "Conv_0"),
+         ReLU("ReLU_1")
+     });
+     // aidge_backend_cpu loaded. Recipe should work
+     REQUIRE(foldConstantOfShape(model) == 1);
+     CHECK(model->forwardDims());
+   }
+ }
+
+ }  // namespace Aidge
-- 
GitLab

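The new foldConstantOfShape recipe test above matters for shape inference:
forwardDims() needs static dimensions, and once the ConstantOfShape node fed
by the constant Producer is folded away, the Conv output dims follow from the
usual convolution arithmetic. A worked check, assuming the Conv defaults used
in the test (stride 1, no padding, no dilation):

    #include <cstddef>

    // Conv output extent along one spatial axis, stride 1, no padding.
    std::size_t convExtent(std::size_t in, std::size_t kernel) {
        return in - kernel + 1;
    }

    // Folded graph: ConstantOfShape produces a 1x1x3x3 tensor, and
    // convExtent(3, 3) == 1, so forwardDims() resolves the Conv_0
    // output to 1x1x1x1.

The REQUIRE(foldConstantOfShape(model) == 1) line checks that exactly one
node was folded.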

From 50a2eb35d99fbc4336f6ecedb5eb746604edd8af Mon Sep 17 00:00:00 2001
From: Jerome Hue <jerome.hue@cea.fr>
Date: Wed, 22 Jan 2025 15:16:35 +0100
Subject: [PATCH 037/108] Add an (unregistered) backward kernel function for
 Sub

---
 .../backend/cpu/operator/SubImpl_kernels.hpp  | 50 +++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp b/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp
index 1d789c3c..a1954776 100644
--- a/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp
@@ -42,6 +42,7 @@ void sub_contiguous_arrays(const std::size_t input1size,
 
 
 namespace Aidge {
+
 template <class I1, class I2, class O>
 void SubImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
                                 std::vector<std::size_t> dims1,
@@ -149,6 +150,55 @@ void SubImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
     }
 }
 
+template <class I1, class I2, class O>
+void SubImpl_cpu_backward_kernel(const std::size_t input0Length,
+                               const std::size_t input1Length,
+                               const std::size_t gradOutputLength,
+                               const std::vector<std::size_t>& dims0,
+                               const std::vector<std::size_t>& dims1,
+                               const std::vector<std::size_t>& outputDims,
+                               const void* input0_,
+                               const void* input1_,
+                               const void* grad_output_,
+                               void* gradientInput0_,
+                               void* gradientInput1_)
+{
+    const I1* input0 = static_cast<const I1*>(input0_);
+    const I2* input1 = static_cast<const I2*>(input1_);
+    const O* grad_output = static_cast<const O*>(grad_output_);
+    auto* grad_input_0 = static_cast<I1*>(gradientInput0_);
+    auto* grad_input_1 = static_cast<I2*>(gradientInput1_);
+
+    std::fill_n(grad_input_0, input0Length, static_cast<I1>(0));
+    std::fill_n(grad_input_1, input1Length, static_cast<I2>(0));
+
+    auto broadcastedDims0 = getBroadcastedDims(outputDims, dims0);
+    auto broadcastedDims1 = getBroadcastedDims(outputDims, dims1);
+
+    for (std::size_t i = 0; i < gradOutputLength; ++i) {
+        auto idxOutputGrad = getMultiDimIndices(outputDims, i);
+        std::vector<std::size_t> idxInput0(broadcastedDims0.size());
+        std::vector<std::size_t> idxInput1(broadcastedDims1.size());
+
+        for (std::size_t dimension = 0; dimension < broadcastedDims0.size(); ++dimension) {
+            idxInput0[dimension] = (broadcastedDims0[dimension] == 1) ? 0 : idxOutputGrad[dimension];
+        }
+
+        for (std::size_t dimension = 0; dimension < broadcastedDims1.size(); ++dimension) {
+            idxInput1[dimension] = (broadcastedDims1[dimension] == 1) ? 0 : idxOutputGrad[dimension];
+        }
+
+        auto idx0 = getFlattenedIndex(broadcastedDims0, idxInput0);
+        auto idx1 = getFlattenedIndex(broadcastedDims1, idxInput1);
+
+        // For subtraction: gradient of first input is 1 * grad_output
+        grad_input_0[idx0] += static_cast<I1>(grad_output[i]);
+        // For subtraction: gradient of second input is -1 * grad_output
+        grad_input_1[idx1] += static_cast<I2>(-grad_output[i]);
+    }
+}
+
+
 // Kernels registration to implementation entry point
 REGISTRAR(SubImpl_cpu,
     {DataType::Float32},
-- 
GitLab


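The kernel encodes the subtraction derivatives directly: d(x - y)/dx = 1 and
d(x - y)/dy = -1, with the same broadcast-reduction as the Add backward
kernel, except that the second input's contribution is negated. (Note that
input0 and input1 are cast here but never used; the follow-up patch removes
those casts.) A minimal sketch for the 1-D broadcast case (x: 2x3, y: 3,
dL/dz all ones), matching the expectations of the test added in the next
patch:

    #include <cstddef>
    #include <vector>

    // Gradient of z = x - y onto the broadcast operand y: identical to the
    // Add reduction up to the sign flip.
    std::vector<float> subGradY(const std::vector<float>& gradOut,
                                std::size_t rows, std::size_t cols) {
        std::vector<float> gradY(cols, 0.0f);
        for (std::size_t i = 0; i < rows; ++i) {
            for (std::size_t j = 0; j < cols; ++j) {
                gradY[j] -= gradOut[i * cols + j];  // d(x - y)/dy = -1
            }
        }
        return gradY;
    }

    // rows = 2, cols = 3, gradOut all ones -> {-2, -2, -2}, which is
    // expectedGrad1 in "Case 1" of the Sub backward test below.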
From 1b7c5b5b0a7c3d7f80c0ddec7e8a5a311648c7ee Mon Sep 17 00:00:00 2001
From: Jerome Hue <jerome.hue@cea.fr>
Date: Wed, 22 Jan 2025 15:39:38 +0100
Subject: [PATCH 038/108] Add test and register Sub backward

---
 .../aidge/backend/cpu/operator/SubImpl.hpp    |  14 +-
 .../backend/cpu/operator/SubImpl_kernels.hpp  |   4 +-
 src/operator/SubImpl.cpp                      |  26 ++-
 unit_tests/operator/Test_SubImpl.cpp          | 161 ++++++++++++++++++
 4 files changed, 200 insertions(+), 5 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/SubImpl.hpp b/include/aidge/backend/cpu/operator/SubImpl.hpp
index eed26ddc..064b5329 100644
--- a/include/aidge/backend/cpu/operator/SubImpl.hpp
+++ b/include/aidge/backend/cpu/operator/SubImpl.hpp
@@ -23,7 +23,19 @@
 namespace Aidge {
 // Operator implementation entry point for the backend
 using SubImpl_cpu = OperatorImpl_cpu<Sub_Op,
-    void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*,void*)>;
+    void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*),
+    void(const std::size_t,
+         const std::size_t,
+         const std::size_t,
+         const std::vector<std::size_t>&,
+         const std::vector<std::size_t>&,
+         const std::vector<std::size_t>&,
+         const void*,
+         const void*,
+         const void*,
+         void*,
+         void*)
+>;
 
 // Implementation entry point registration to Operator
 REGISTRAR(Sub_Op, "cpu", Aidge::SubImpl_cpu::create);
diff --git a/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp b/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp
index a1954776..cb16c037 100644
--- a/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp
@@ -163,8 +163,6 @@ void SubImpl_cpu_backward_kernel(const std::size_t input0Length,
                                void* gradientInput0_,
                                void* gradientInput1_)
 {
-    const I1* input0 = static_cast<const I1*>(input0_);
-    const I2* input1 = static_cast<const I2*>(input1_);
     const O* grad_output = static_cast<const O*>(grad_output_);
     auto* grad_input_0 = static_cast<I1*>(gradientInput0_);
     auto* grad_input_1 = static_cast<I2*>(gradientInput1_);
@@ -202,7 +200,7 @@ void SubImpl_cpu_backward_kernel(const std::size_t input0Length,
 // Kernels registration to implementation entry point
 REGISTRAR(SubImpl_cpu,
     {DataType::Float32},
-    {ProdConso::inPlaceModel, Aidge::SubImpl_cpu_forward_kernel<float, float, float>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::SubImpl_cpu_forward_kernel<float, float, float>, Aidge::SubImpl_cpu_backward_kernel<float,float,float>});
 REGISTRAR(SubImpl_cpu,
     {DataType::Float64},
     {ProdConso::inPlaceModel, Aidge::SubImpl_cpu_forward_kernel<double, double, double>, nullptr});
diff --git a/src/operator/SubImpl.cpp b/src/operator/SubImpl.cpp
index e36abe2a..cce4e27a 100644
--- a/src/operator/SubImpl.cpp
+++ b/src/operator/SubImpl.cpp
@@ -41,5 +41,29 @@ void Aidge::SubImpl_cpu::forward() {
 
 template <>
 void Aidge::SubImpl_cpu::backward() {
-    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Sub_Op on backend cpu");
+
+    const Sub_Op& op_ = dynamic_cast<const Sub_Op&>(mOp);
+
+    auto in0 = op_.getInput(0);
+    auto in1 = op_.getInput(1);
+    auto in0grad = op_.getInput(0)->grad();
+    auto in1grad = op_.getInput(1)->grad();
+    auto out0grad = op_.getOutput(0)->grad();
+
+    // Find the correct kernel type
+    const auto impl = Registrar<SubImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Call kernel
+    impl.backward(/* input0Length */ in0grad->size(),
+                  /* input1Length */ in1grad->size(),
+                  /* grad0Length  */ out0grad->size(),
+                  /* input0Dims   */ in0->dims(),
+                  /* input1Dims   */ in1->dims(),
+                  /* outputDims   */ out0grad->dims(),
+                  /* input0       */ getCPUPtr(in0),
+                  /* input1       */ getCPUPtr(in1),
+                  /* gradOutput   */ getCPUPtr(out0grad),
+                  /* gradInput0   */ getCPUPtr(in0grad),
+                  /* gradInput1   */ getCPUPtr(in1grad));
+
 }
diff --git a/unit_tests/operator/Test_SubImpl.cpp b/unit_tests/operator/Test_SubImpl.cpp
index 1317e88a..d9b6207b 100644
--- a/unit_tests/operator/Test_SubImpl.cpp
+++ b/unit_tests/operator/Test_SubImpl.cpp
@@ -322,4 +322,165 @@ TEST_CASE("[cpu/operator] Sub", "[Sub][CPU]") {
         }
     }
 }
+
+
+TEST_CASE("[CPU/Operator] Sub(Backward)", "[Sub][CPU][Backward]") {
+    std::shared_ptr<Node> mySub = Sub();
+    auto op = std::static_pointer_cast<OperatorTensor>(mySub->getOperator());
+    op->setDataType(DataType::Float32);
+    op->setBackend("cpu");
+
+    SECTION("Case 1: 1D and 2D Tensors") {
+        const auto T0 = std::make_shared<Tensor>(
+            Array2D<float, 2, 3>({{{1, 2, 3}, {4, 5, 6}}}));
+
+        const auto T1 =
+            std::make_shared<Tensor>(Array1D<float, 3>({0.1, 0.2, 0.3}));
+
+        T0->setDataType(DataType::Float32);
+        T0->setBackend("cpu");
+        T1->setDataType(DataType::Float32);
+        T1->setBackend("cpu");
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>(
+            Array2D<float, 2, 3>({{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}})));
+        op->forwardDims();
+
+        mySub->backward();
+
+        // For subtraction: grad_input0 = grad_output
+        const auto expectedGrad0 = std::make_shared<Tensor>(
+            Array2D<float, 2, 3>({{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}}));
+
+        // For subtraction: grad_input1 = -grad_output (summed across broadcast dimensions)
+        const auto expectedGrad1 =
+            std::make_shared<Tensor>(Array1D<float, 3>({-2, -2, -2}));
+
+        REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
+        REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1));
+    }
+
+    SECTION("Case 2: 3D and 1D tensors") {
+        const auto T0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+            {{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}},
+              {{7.0, 8.0, 9.0}, {10.0, 11.0, 12.0}}}}));
+
+        const auto T1 =
+            std::make_shared<Tensor>(Array1D<float, 3>({0.3, 0.2, 0.1}));
+
+        const auto newGrad = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+            {{{{1, 1, 1}, {1, 1, 1}}, {{1, 1, 1}, {1, 1, 1}}}}));
+
+        const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+            {{{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}},
+              {{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}}}));
+
+        const auto expectedGrad1 =
+            std::make_shared<Tensor>(Array1D<float, 3>({-4.0, -4.0, -4.0}));
+
+        for (auto T : {T0, T1, newGrad, expectedGrad0, expectedGrad1}) {
+            T->setBackend("cpu");
+            T->setDataType(DataType::Float32);
+        }
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(newGrad);
+        op->forwardDims();
+
+        mySub->backward();
+
+        REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
+        REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1));
+    }
+
+    SECTION("Case 3: Random values with broadcasting") {
+        // Use random values
+        std::vector<std::size_t> dims0 = {5, 2, 1, 7}; // First tensor
+        std::vector<std::size_t> dims1 = {2, 6, 7};    // Second tensor
+        std::vector<std::size_t> outputDims = {5, 2, 6, 7};
+
+        const auto input0Size = 5 * 2 * 1 * 7;
+        const auto input1Size = 2 * 6 * 7;
+        const auto outputSize = 5 * 2 * 6 * 7;
+
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_real_distribution<float> dist(0.1f, 1.0f);
+
+        std::vector<float> input0Data(input0Size);
+        std::vector<float> input1Data(input1Size);
+        std::vector<float> gradOutputData(outputSize);
+
+        // Fill with random values
+        for (auto &val : input0Data) val = dist(gen);
+        for (auto &val : input1Data) val = dist(gen);
+        for (auto &val : gradOutputData) val = dist(gen);
+
+        auto T0 = std::make_shared<Tensor>();
+        auto T1 = std::make_shared<Tensor>();
+
+        T0->setDataType(DataType::Float32);
+        T0->setBackend("cpu");
+        T0->resize(dims0);
+        T0->getImpl()->setRawPtr(input0Data.data(), input0Size);
+
+        T1->setDataType(DataType::Float32);
+        T1->setBackend("cpu");
+        T1->resize(dims1);
+        T1->getImpl()->setRawPtr(input1Data.data(), input1Size);
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+
+        // Set gradient of output
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>());
+        op->getOutput(0)->grad()->resize(outputDims);
+        op->getOutput(0)->grad()->getImpl()->setRawPtr(gradOutputData.data(), outputSize);
+
+        op->forwardDims();
+
+        // Compute reference gradients
+        std::vector<float> expectedGrad0(input0Size, 0.0f);
+        std::vector<float> expectedGrad1(input1Size, 0.0f);
+
+        for (std::size_t n = 0; n < 5; ++n) {
+            for (std::size_t c = 0; c < 2; ++c) {
+                for (std::size_t h = 0; h < 6; ++h) {
+                    for (std::size_t w = 0; w < 7; ++w) {
+                        std::size_t outIdx = w + 7 * (h + 6 * (c + 2 * n));
+                        std::size_t in0Idx = w + 7 * (0 + 1 * (c + 2 * n));
+                        std::size_t in1Idx = w + 7 * (h + 6 * c);
+
+                        // Gradient for input0: grad_output
+                        expectedGrad0[in0Idx] += gradOutputData[outIdx];
+                        // Gradient for input1: -grad_output
+                        expectedGrad1[in1Idx] += -gradOutputData[outIdx];
+                    }
+                }
+            }
+        }
+
+        // Perform backward pass
+        mySub->backward();
+
+        auto expectedGrad0Tensor = std::make_shared<Tensor>();
+        expectedGrad0Tensor->resize(T0->dims());
+        expectedGrad0Tensor->setBackend("cpu");
+        expectedGrad0Tensor->setDataType(DataType::Float32);
+        expectedGrad0Tensor->getImpl()->setRawPtr(expectedGrad0.data(), expectedGrad0.size());
+
+        auto expectedGrad1Tensor = std::make_shared<Tensor>();
+        expectedGrad1Tensor->resize(T1->dims());
+        expectedGrad1Tensor->setBackend("cpu");
+        expectedGrad1Tensor->setDataType(DataType::Float32);
+        expectedGrad1Tensor->getImpl()->setRawPtr(expectedGrad1.data(), expectedGrad1.size());
+
+        // Verify backward pass
+        REQUIRE(approxEq<float>(*T0->grad(), *expectedGrad0Tensor));
+        REQUIRE(approxEq<float>(*T1->grad(), *expectedGrad1Tensor));
+    }
+}
 } // namespace Aidge
-- 
GitLab
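
A note on the gradient rule exercised by the new test: for z = a - b the local derivatives are dz/da = +1 and dz/db = -1, and when an input was broadcast, its gradient is the output gradient accumulated over the broadcast axes. A minimal standalone sketch of that reduction, mirroring Case 1 above (illustrative C++ only, independent of the Aidge kernels):

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    int main() {
        // grad of z = a - b for a {2,3} output, with b of shape {3}
        // broadcast along the first axis; all output grads are 1 here,
        // exactly as in Case 1 of the test above.
        const std::vector<float> gradOut(2 * 3, 1.0f);
        std::vector<float> gradA(2 * 3, 0.0f); // same shape as the output
        std::vector<float> gradB(3, 0.0f);     // reduced over the broadcast axis

        for (std::size_t i = 0; i < 2; ++i) {
            for (std::size_t j = 0; j < 3; ++j) {
                const float g = gradOut[i * 3 + j];
                gradA[i * 3 + j] += g; // dz/da = +1
                gradB[j] -= g;         // dz/db = -1, summed over axis 0
            }
        }
        std::printf("%g %g %g\n", gradB[0], gradB[1], gradB[2]); // -2 -2 -2
    }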


From 39d17c4ca8d050ee600f7acfffd11f79870eba80 Mon Sep 17 00:00:00 2001
From: Jerome Hue <jerome.hue@cea.fr>
Date: Mon, 3 Mar 2025 09:08:01 +0100
Subject: [PATCH 039/108] Fix warnings by removing unused parameters in Sub
 backward kernel

---
 include/aidge/backend/cpu/operator/SubImpl.hpp         |  6 +-----
 include/aidge/backend/cpu/operator/SubImpl_kernels.hpp |  2 --
 src/operator/SubImpl.cpp                               | 10 ++++------
 unit_tests/operator/Test_MetaOperator.cpp              |  4 ++--
 4 files changed, 7 insertions(+), 15 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/SubImpl.hpp b/include/aidge/backend/cpu/operator/SubImpl.hpp
index 064b5329..1f94ff13 100644
--- a/include/aidge/backend/cpu/operator/SubImpl.hpp
+++ b/include/aidge/backend/cpu/operator/SubImpl.hpp
@@ -15,9 +15,7 @@
 #include "aidge/backend/cpu/operator/OperatorImpl.hpp"
 #include "aidge/operator/Sub.hpp"
 #include "aidge/utils/Registrar.hpp"
-#include "aidge/utils/Types.h"
-#include "aidge/backend/cpu/data/GetCPUPtr.h"
-#include <memory>
+
 #include <vector>
 
 namespace Aidge {
@@ -31,8 +29,6 @@ using SubImpl_cpu = OperatorImpl_cpu<Sub_Op,
          const std::vector<std::size_t>&,
          const std::vector<std::size_t>&,
          const void*,
-         const void*,
-         const void*,
          void*,
          void*)
 >;
diff --git a/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp b/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp
index cb16c037..8d3d80e9 100644
--- a/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp
@@ -157,8 +157,6 @@ void SubImpl_cpu_backward_kernel(const std::size_t input0Length,
                                const std::vector<std::size_t>& dims0,
                                const std::vector<std::size_t>& dims1,
                                const std::vector<std::size_t>& outputDims,
-                               const void* input0_,
-                               const void* input1_,
                                const void* grad_output_,
                                void* gradientInput0_,
                                void* gradientInput1_)
diff --git a/src/operator/SubImpl.cpp b/src/operator/SubImpl.cpp
index cce4e27a..7f57bf2f 100644
--- a/src/operator/SubImpl.cpp
+++ b/src/operator/SubImpl.cpp
@@ -59,11 +59,9 @@ void Aidge::SubImpl_cpu::backward() {
                   /* grad0Length  */ out0grad->size(),
                   /* input0Dims   */ in0->dims(),
                   /* input1Dims   */ in1->dims(),
-               out0grad->dims(),
-               getCPUPtr(in0),
-               getCPUPtr(in1),
-               getCPUPtr(out0grad),
-               getCPUPtr(in0grad),
-               getCPUPtr(in1grad));
+                  /* outputDims   */ out0grad->dims(),
+                  /* gradOutput   */ getCPUPtr(out0grad),
+                  /* gradInput0   */ getCPUPtr(in0grad),
+                  /* gradInput1   */ getCPUPtr(in1grad));
 
 }
diff --git a/unit_tests/operator/Test_MetaOperator.cpp b/unit_tests/operator/Test_MetaOperator.cpp
index 23bacda5..64c6886a 100644
--- a/unit_tests/operator/Test_MetaOperator.cpp
+++ b/unit_tests/operator/Test_MetaOperator.cpp
@@ -705,7 +705,7 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
         auto fc2 = FC(outChannels, inChannels, true, "fc2");
         // NOTE: Account for init step by adding 1 to the max timestep
         // parameter.
-        auto lif1 = Leaky(nbTimeSteps + 1, beta, threshold, "leaky");
+        auto lif1 = Leaky(nbTimeSteps + 1, beta, threshold, LeakyReset::Subtraction, "leaky");
 
         // associateInput() does not work
         fc1->input(1).first->getOperator()->setOutput(0, myWeights);
@@ -774,7 +774,7 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
         const auto nbTimeSteps = dims[0];
         const auto beta = betaDist(gen); 
 
-        auto myLeaky = Leaky(nbTimeSteps, beta, 1.0, "leaky");
+        auto myLeaky = Leaky(nbTimeSteps, beta, 1.0, LeakyReset::Subtraction, "leaky");
         auto op =
             std::static_pointer_cast<MetaOperator_Op>(myLeaky->getOperator());
         // auto stack = Stack(2);
-- 
GitLab
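
Why the input pointers could be removed at all: for z = a - b the local derivatives are the constants dz/da = +1 and dz/db = -1, so the backward kernel needs only the output gradient plus the shape information that drives the broadcast reduction; the input values themselves never enter the computation. The same reasoning applies to the Add kernel cleaned up in the next patch, where both local derivatives are +1.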


From d7cafea1be49987ceb7eda9d60d0ed5769f530c9 Mon Sep 17 00:00:00 2001
From: Jerome Hue <jerome.hue@cea.fr>
Date: Mon, 3 Mar 2025 09:21:44 +0100
Subject: [PATCH 040/108] Fix warnings by removing unused parameters in Add
 backward kernel

---
 include/aidge/backend/cpu/operator/AddImpl.hpp         | 2 --
 include/aidge/backend/cpu/operator/AddImpl_kernels.hpp | 4 ----
 src/operator/AddImpl.cpp                               | 2 --
 unit_tests/operator/Test_MetaOperator.cpp              | 2 +-
 4 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/AddImpl.hpp b/include/aidge/backend/cpu/operator/AddImpl.hpp
index ca04dff9..cfb85ecf 100644
--- a/include/aidge/backend/cpu/operator/AddImpl.hpp
+++ b/include/aidge/backend/cpu/operator/AddImpl.hpp
@@ -33,8 +33,6 @@ using AddImpl_cpu = OperatorImpl_cpu<Add_Op,
          const std::vector<std::size_t>&, 
          const std::vector<std::size_t>&, 
          const void*, 
-         const void*, 
-         const void*, 
          void*, 
          void*)
 >;
diff --git a/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp
index d6fff9b5..4be47849 100644
--- a/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp
@@ -154,15 +154,11 @@ void AddImpl_cpu_backward_kernel(const std::size_t input0Length,
                                const std::vector<std::size_t>& dims0,
                                const std::vector<std::size_t>& dims1,
                                const std::vector<std::size_t>& outputDims,
-                               const void* input0_,
-                               const void* input1_,
                                const void* grad_output_,
                                void* gradientInput0_,
                                void* gradientInput1_)
 {
     // TODO: Remove input0/1 from the function
-    const I* input0 = static_cast<const I*>(input0_);
-    const I* input1 = static_cast<const I*>(input1_);
     const O* gradOutput = static_cast<const O*>(grad_output_);
     auto* gradInput0 = static_cast<I*>(gradientInput0_);
     auto* gradInput1 = static_cast<I*>(gradientInput1_);
diff --git a/src/operator/AddImpl.cpp b/src/operator/AddImpl.cpp
index b027fb87..cff61287 100644
--- a/src/operator/AddImpl.cpp
+++ b/src/operator/AddImpl.cpp
@@ -73,8 +73,6 @@ void Aidge::AddImpl_cpu::backward() {
                in0->dims(),
                in1->dims(),
                out0grad->dims(),
-               getCPUPtr(in0),
-               getCPUPtr(in1),
                getCPUPtr(out0grad),
                getCPUPtr(in0grad),
                getCPUPtr(in1grad));
diff --git a/unit_tests/operator/Test_MetaOperator.cpp b/unit_tests/operator/Test_MetaOperator.cpp
index 64c6886a..0c4a64bb 100644
--- a/unit_tests/operator/Test_MetaOperator.cpp
+++ b/unit_tests/operator/Test_MetaOperator.cpp
@@ -772,7 +772,7 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
         }
 
         const auto nbTimeSteps = dims[0];
-        const auto beta = betaDist(gen); 
+        const auto beta = betaDist(gen);
 
         auto myLeaky = Leaky(nbTimeSteps, beta, 1.0, LeakyReset::Subtraction, "leaky");
         auto op =
-- 
GitLab


From 0d7ea89558f4f9433cd8f5c97178c2a47da7b811 Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Tue, 4 Mar 2025 10:40:45 +0100
Subject: [PATCH 041/108] Fixed typo

---
 include/aidge/backend/cpu/operator/AbsImpl_kernels.hpp |  4 ++--
 .../aidge/backend/cpu/operator/AtanImpl_kernels.hpp    |  8 ++++----
 include/aidge/backend/cpu/operator/ErfImpl_kernels.hpp |  4 ++--
 .../backend/cpu/operator/HeavisideImpl_kernels.hpp     |  4 ++--
 .../backend/cpu/operator/LeakyReLUImpl_kernels.hpp     |  8 ++++----
 include/aidge/backend/cpu/operator/LnImpl_kernels.hpp  | 10 +++++-----
 .../aidge/backend/cpu/operator/ReLUImpl_kernels.hpp    | 10 +++++-----
 .../aidge/backend/cpu/operator/RoundImpl_kernels.hpp   |  4 ++--
 .../aidge/backend/cpu/operator/ScalingImpl_kernels.hpp |  4 ++--
 .../aidge/backend/cpu/operator/SigmoidImpl_kernels.hpp | 10 +++++-----
 .../aidge/backend/cpu/operator/SqrtImpl_kernels.hpp    |  8 ++++----
 .../aidge/backend/cpu/operator/TanhImpl_kernels.hpp    | 10 +++++-----
 12 files changed, 42 insertions(+), 42 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/AbsImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AbsImpl_kernels.hpp
index 16e5f9de..e6474cf2 100644
--- a/include/aidge/backend/cpu/operator/AbsImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/AbsImpl_kernels.hpp
@@ -20,14 +20,14 @@
 
 namespace Aidge {
 template <class I, class O>
-void AbsImpl_cpu_forward_kernel(std::size_t inputLenght,
+void AbsImpl_cpu_forward_kernel(std::size_t inputLength,
                                      const void* input_,
                                      void* output_) {
 
     const I* input = static_cast<const I*>(input_);
     O* output = static_cast<O*>(output_);
 
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+    for (std::size_t i = 0; i < inputLength; ++i) {
         output[i] = std::abs(input[i]);
     }
 }
diff --git a/include/aidge/backend/cpu/operator/AtanImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AtanImpl_kernels.hpp
index 2a786339..141e5b60 100644
--- a/include/aidge/backend/cpu/operator/AtanImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/AtanImpl_kernels.hpp
@@ -20,20 +20,20 @@
 
 namespace Aidge {
 template <class I, class O>
-void AtanImpl_cpu_forward_kernel(std::size_t inputLenght,
+void AtanImpl_cpu_forward_kernel(std::size_t inputLength,
                                     const void* input_,
                                     void* output_) {
     const I* input = static_cast<const I*>(input_);
     O* output = static_cast<O*>(output_);
 
-    for (size_t i = 0; i < inputLenght; ++i) {
+    for (size_t i = 0; i < inputLength; ++i) {
         output[i] = static_cast<O>(atan(input[i]));
     }
 
 }
 
 template <class O, class GI, class GO>
-void AtanImpl_cpu_backward_kernel(const std::size_t inputLenght,
+void AtanImpl_cpu_backward_kernel(const std::size_t inputLength,
                                      const void* output_, const void* grad_output_,
 				     void* grad_input_) {
     const O* output = static_cast<const O*>(output_);
@@ -41,7 +41,7 @@ void AtanImpl_cpu_backward_kernel(const std::size_t inputLenght,
     GI* grad_input = static_cast<GI*>(grad_input_);
 
     // Apply the derivative of atan for each element in the input array
-    for (size_t i = 0; i < inputLenght; ++i) {
+    for (size_t i = 0; i < inputLength; ++i) {
         // dx = dy * (1 / (1 + x^2))
         grad_input[i] = grad_output[i] * static_cast<O>(1.0 / (1.0 + output[i] * output[i]));
     }
diff --git a/include/aidge/backend/cpu/operator/ErfImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ErfImpl_kernels.hpp
index 02041f55..709f4a6f 100644
--- a/include/aidge/backend/cpu/operator/ErfImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ErfImpl_kernels.hpp
@@ -20,14 +20,14 @@
 
 namespace Aidge {
 template <class I, class O>
-void ErfImpl_cpu_forward_kernel(std::size_t inputLenght,
+void ErfImpl_cpu_forward_kernel(std::size_t inputLength,
                                      const void* input_,
                                      void* output_) {
 
     const I* input = static_cast<const I*>(input_);
     O* output = static_cast<O*>(output_);
 
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+    for (std::size_t i = 0; i < inputLength; ++i) {
         output[i] = std::erf(input[i]);
     }
 }
diff --git a/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp b/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp
index 3fd6ca7d..06d7fff8 100644
--- a/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp
@@ -23,14 +23,14 @@
 namespace Aidge {
 
 template <class I, class O>
-void HeavisideImplCpuForwardKernel(std::size_t inputLenght,
+void HeavisideImplCpuForwardKernel(std::size_t inputLength,
                                    const void *input_,
                                    void *output_,
                                    const float value) {
     const I *input = static_cast<const I *>(input_);
     O *output = static_cast<O *>(output_);
 
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+    for (std::size_t i = 0; i < inputLength; ++i) {
         output[i] = (input[i] > 0) ? 1 : (input[i] == 0 ? value : 0);
     }
 }
diff --git a/include/aidge/backend/cpu/operator/LeakyReLUImpl_kernels.hpp b/include/aidge/backend/cpu/operator/LeakyReLUImpl_kernels.hpp
index bc856f70..7afd8298 100644
--- a/include/aidge/backend/cpu/operator/LeakyReLUImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/LeakyReLUImpl_kernels.hpp
@@ -19,7 +19,7 @@
 namespace Aidge {
 template <class I, class O>
 void LeakyReLUImpl_cpu_forward_kernel(const float negativeSlope_,
-                                     std::size_t inputLenght,
+                                     std::size_t inputLength,
                                      const void* input_,
                                      void* output_) {
 
@@ -27,14 +27,14 @@ void LeakyReLUImpl_cpu_forward_kernel(const float negativeSlope_,
     O* output = static_cast<O*>(output_);
     const I negativeSlope = static_cast<const I>(negativeSlope_);
 
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+    for (std::size_t i = 0; i < inputLength; ++i) {
         output[i] = (input[i] >= 0) ? input[i] : input[i] * negativeSlope;
     }
 }
 
 template <class I, class O>
 void LeakyReLUImpl_cpu_backward_kernel(const float negativeSlope_,
-                                     std::size_t inputLenght,
+                                     std::size_t inputLength,
                                      const void* input_,
                                      void* output_) {
 
@@ -42,7 +42,7 @@ void LeakyReLUImpl_cpu_backward_kernel(const float negativeSlope_,
     O* output = static_cast<O*>(output_);
     const I negativeSlope = static_cast<const I>(negativeSlope_);
 
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+    for (std::size_t i = 0; i < inputLength; ++i) {
         output[i] = (input[i] > 0) ? input[i] : negativeSlope*input[i];
     }
 }
diff --git a/include/aidge/backend/cpu/operator/LnImpl_kernels.hpp b/include/aidge/backend/cpu/operator/LnImpl_kernels.hpp
index b30b05bb..ee2864b6 100755
--- a/include/aidge/backend/cpu/operator/LnImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/LnImpl_kernels.hpp
@@ -18,7 +18,7 @@
 
 namespace Aidge {
 template <class I, class O>
-void LnImpl_cpu_forward_kernel(std::size_t inputLenght,
+void LnImpl_cpu_forward_kernel(std::size_t inputLength,
                                const void* input_,
                                void* output_) {
 
@@ -26,8 +26,8 @@ void LnImpl_cpu_forward_kernel(std::size_t inputLenght,
     O* output = static_cast<O*>(output_);
 	const float eps = 1.0e-20f;
 
-//#pragma omp parallel for if (inputLenght > 1024)
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+//#pragma omp parallel for if (inputLength > 1024)
+    for (std::size_t i = 0; i < inputLength; ++i) {
 		if (input[i] > I(eps)) {
 			output[i] = std::log(input[i]);
 		} else {
@@ -37,7 +37,7 @@ void LnImpl_cpu_forward_kernel(std::size_t inputLenght,
 }
 
 template <class I, class GI, class GO>
-void LnImpl_cpu_backward_kernel(const std::size_t inputLenght,
+void LnImpl_cpu_backward_kernel(const std::size_t inputLength,
                                 const void* input_, const void* grad_output_,
 	                            void* grad_input_) {
 						 
@@ -46,7 +46,7 @@ void LnImpl_cpu_backward_kernel(const std::size_t inputLenght,
     GI* grad_input = static_cast<GI*>(grad_input_);
 	const float eps = 1.0e-20f;
 	
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+    for (std::size_t i = 0; i < inputLength; ++i) {
 		if (input[i] > I(eps)) {
 			grad_input[i] = grad_output[i] / input[i];
 		} else {
diff --git a/include/aidge/backend/cpu/operator/ReLUImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ReLUImpl_kernels.hpp
index e39e9b7d..bb5d7cc3 100644
--- a/include/aidge/backend/cpu/operator/ReLUImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ReLUImpl_kernels.hpp
@@ -26,27 +26,27 @@
 namespace Aidge {
 // Kernels
 template <class I, class O>
-void ReLUImpl_cpu_forward_kernel(std::size_t inputLenght,
+void ReLUImpl_cpu_forward_kernel(std::size_t inputLength,
                                      const void* input_,
                                      void* output_) {
 
     const I* input = static_cast<const I*>(input_);
     O* output = static_cast<O*>(output_);
 
-//#pragma omp parallel for if (inputLenght > 1024)
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+//#pragma omp parallel for if (inputLength > 1024)
+    for (std::size_t i = 0; i < inputLength; ++i) {
         output[i] = (input[i] > 0) ? input[i] : 0;
     }
 }
 
 template <class I, class GI, class GO>
-void ReLUImpl_cpu_backward_kernel(const std::size_t inputLenght,
+void ReLUImpl_cpu_backward_kernel(const std::size_t inputLength,
                                   const void* input_, const void* grad_output_,
 				  void* grad_input_) {
     const I* input = static_cast<const I*>(input_);
     const GO* grad_output = static_cast<const GO*>(grad_output_);
     GI* grad_input = static_cast<GI*>(grad_input_);
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+    for (std::size_t i = 0; i < inputLength; ++i) {
         grad_input[i] = (input[i] > 0) ? grad_output[i] : 0;
     }
 }
diff --git a/include/aidge/backend/cpu/operator/RoundImpl_kernels.hpp b/include/aidge/backend/cpu/operator/RoundImpl_kernels.hpp
index ba9c63bc..7ac4319b 100644
--- a/include/aidge/backend/cpu/operator/RoundImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/RoundImpl_kernels.hpp
@@ -21,14 +21,14 @@
 
 namespace Aidge {
 template <class I, class O>
-void RoundImpl_cpu_forward_kernel(const std::size_t inputLenght,
+void RoundImpl_cpu_forward_kernel(const std::size_t inputLength,
                                      const void* input_,
                                      void* output_) {
 
     const I* input = static_cast<const I*>(input_);
     O* output = static_cast<O*>(output_);
 
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+    for (std::size_t i = 0; i < inputLength; ++i) {
         //std::round would not work since it doesn't follow the halves rules (See ONNX Round)
         output[i] = static_cast<O>(std::nearbyint(static_cast<float>(input[i])));
     }
diff --git a/include/aidge/backend/cpu/operator/ScalingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ScalingImpl_kernels.hpp
index c758c9cf..f9ca00b7 100644
--- a/include/aidge/backend/cpu/operator/ScalingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ScalingImpl_kernels.hpp
@@ -76,14 +76,14 @@ template <class I, class O>
 void ScalingImpl_cpu_forward_kernel(const float scalingFactor,
                                     const std::size_t quantizedNbBits,
                                     const bool isOutputUnsigned,
-                                    std::size_t inputLenght,
+                                    std::size_t inputLength,
                                     const void* input_,
                                     void* output_) {
 
     const I* input = static_cast<const I*>(input_);
     O* output = static_cast<O*>(output_);
 
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+    for (std::size_t i = 0; i < inputLength; ++i) {
         output[i] = static_cast<O>(input[i] * static_cast<I>(scalingFactor));
 
         if(quantizedNbBits > 0) {
diff --git a/include/aidge/backend/cpu/operator/SigmoidImpl_kernels.hpp b/include/aidge/backend/cpu/operator/SigmoidImpl_kernels.hpp
index dfd71ce0..83ad4575 100644
--- a/include/aidge/backend/cpu/operator/SigmoidImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/SigmoidImpl_kernels.hpp
@@ -18,15 +18,15 @@
 
 namespace Aidge {
 template <class I, class O>
-void SigmoidImpl_cpu_forward_kernel(std::size_t inputLenght,
+void SigmoidImpl_cpu_forward_kernel(std::size_t inputLength,
                                     const void* input_,
                                     void* output_) {
 
     const I* input = static_cast<const I*>(input_);
     O* output = static_cast<O*>(output_);
 
-//#pragma omp parallel for if (inputLenght > 1024)
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+//#pragma omp parallel for if (inputLength > 1024)
+    for (std::size_t i = 0; i < inputLength; ++i) {
 		if (input[i] > I(0)) {
 			output[i] = O(1) / (O(1) + std::exp(-input[i]));
 		} else {
@@ -36,13 +36,13 @@ void SigmoidImpl_cpu_forward_kernel(std::size_t inputLenght,
 }
 
 template <class O, class GI, class GO>
-void SigmoidImpl_cpu_backward_kernel(const std::size_t inputLenght,
+void SigmoidImpl_cpu_backward_kernel(const std::size_t inputLength,
                                      const void* output_, const void* grad_output_,
 				     void* grad_input_) {
     const O* output = static_cast<const O*>(output_);
     const GO* grad_output = static_cast<const GO*>(grad_output_);
     GI* grad_input = static_cast<GI*>(grad_input_);
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+    for (std::size_t i = 0; i < inputLength; ++i) {
         grad_input[i] = output[i] * (O(1) - output[i]) * grad_output[i];
     }
 }
diff --git a/include/aidge/backend/cpu/operator/SqrtImpl_kernels.hpp b/include/aidge/backend/cpu/operator/SqrtImpl_kernels.hpp
index 0464119c..1ce1ef9b 100644
--- a/include/aidge/backend/cpu/operator/SqrtImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/SqrtImpl_kernels.hpp
@@ -21,27 +21,27 @@
 
 namespace Aidge {
 template <class I, class O>
-void SqrtImpl_cpu_forward_kernel(const std::size_t inputLenght,
+void SqrtImpl_cpu_forward_kernel(const std::size_t inputLength,
                                      const void* input_,
                                      void* output_) {
 
     const I* input = static_cast<const I*>(input_);
     O* output = static_cast<O*>(output_);
 
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+    for (std::size_t i = 0; i < inputLength; ++i) {
         output[i] = static_cast<O>(std::sqrt(static_cast<float>(input[i])));
     }
 }
 
 template <class I, class O>
-void SqrtImpl_cpu_backward_kernel(const std::size_t inputLenght,
+void SqrtImpl_cpu_backward_kernel(const std::size_t inputLength,
                                      const void* input_,
                                      void* output_) {
 
     const I* input = static_cast<const I*>(input_);
     O* output = static_cast<O*>(output_);
 
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+    for (std::size_t i = 0; i < inputLength; ++i) {
         output[i] = static_cast<O>(0.5/(std::sqrt(static_cast<float>(input[i]))));
     }
 }
diff --git a/include/aidge/backend/cpu/operator/TanhImpl_kernels.hpp b/include/aidge/backend/cpu/operator/TanhImpl_kernels.hpp
index fdcac210..49cfe9cb 100644
--- a/include/aidge/backend/cpu/operator/TanhImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/TanhImpl_kernels.hpp
@@ -18,27 +18,27 @@
 
 namespace Aidge {
 template <class I, class O>
-void TanhImpl_cpu_forward_kernel(std::size_t inputLenght,
+void TanhImpl_cpu_forward_kernel(std::size_t inputLength,
                                      const void* input_,
                                      void* output_) {
 
     const I* input = static_cast<const I*>(input_);
     O* output = static_cast<O*>(output_);
 
-//#pragma omp parallel for if (inputLenght > 1024)
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+//#pragma omp parallel for if (inputLength > 1024)
+    for (std::size_t i = 0; i < inputLength; ++i) {
         output[i] = std::tanh(input[i]);
     }
 }
 
 template <class O, class GI, class GO>
-void TanhImpl_cpu_backward_kernel(const std::size_t inputLenght,
+void TanhImpl_cpu_backward_kernel(const std::size_t inputLength,
                                   const void* output_, const void* grad_output_,
 			          void* grad_input_) {
     const O* output = static_cast<const O*>(output_);
     const GO* grad_output = static_cast<const GO*>(grad_output_);
     GI* grad_input = static_cast<GI*>(grad_input_);
-    for (std::size_t i = 0; i < inputLenght; ++i) {
+    for (std::size_t i = 0; i < inputLength; ++i) {
         grad_input[i] = (O(1) - output[i] * output[i]) * grad_output[i];
     }
 }
-- 
GitLab
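
Aside from the inputLenght -> inputLength rename, the SigmoidImpl hunk above makes the kernel's structure visible: it branches on the sign of the input, which is the standard overflow-safe way to evaluate the logistic function. A minimal sketch of the trick (the else-branch is elided in the diff; the form below is its usual counterpart and is an assumption here):

    #include <cmath>
    #include <cstdio>

    // Overflow-safe sigmoid: exp() is only ever called on a non-positive
    // argument, so it cannot overflow for large |x|.
    float stableSigmoid(float x) {
        if (x > 0.0f) {
            return 1.0f / (1.0f + std::exp(-x));
        }
        const float e = std::exp(x); // x <= 0, so 0 < e <= 1
        return e / (1.0f + e);
    }

    int main() {
        std::printf("%g %g\n", stableSigmoid(100.0f), stableSigmoid(-100.0f));
    }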


From d4d09c91b42e17b0628db6af9fb0b056c04c4235 Mon Sep 17 00:00:00 2001
From: Jerome Hue <jerome.hue@cea.fr>
Date: Fri, 7 Mar 2025 14:20:11 +0100
Subject: [PATCH 042/108] chore: Clean and improve the Leaky MetaOperator test

---
 unit_tests/operator/Test_MetaOperator.cpp | 166 +++++++---------------
 1 file changed, 55 insertions(+), 111 deletions(-)

diff --git a/unit_tests/operator/Test_MetaOperator.cpp b/unit_tests/operator/Test_MetaOperator.cpp
index de720f5b..f781e5e2 100644
--- a/unit_tests/operator/Test_MetaOperator.cpp
+++ b/unit_tests/operator/Test_MetaOperator.cpp
@@ -750,155 +750,99 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
 
         std::random_device rd;
         std::mt19937 gen(rd());
-        std::uniform_real_distribution<float> valueDist(
-            0.1f,
-            1.1f); // Random float distribution between 0 and 1
-        std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2),
-                                                               std::size_t(4));
-        std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(3),
-                                                              std::size_t(3));
+        std::uniform_real_distribution<float> valueDist(0.1f,1.1f);
+        std::uniform_int_distribution<std::size_t> dimSizeDist(2,4);
+        std::uniform_int_distribution<std::size_t> nbDimsDist(3,3); // fixed to 3.
         std::uniform_int_distribution<int> boolDist(0, 1);
         std::uniform_real_distribution<float> betaDist(0,1);
+        std::uniform_real_distribution<float> thresholDist(0.1,3);
 
-        const std::size_t nbDims = nbDimsDist(gen);
-        Log::info("Nbdims : {}", nbDims);
-        std::vector<std::size_t> dims;
-        for (std::size_t i = 0; i < nbDims; ++i) {
-            dims.push_back(dimSizeDist(gen));
-        }
-        Log::info("timesteps : {}", dims[0]);
-        Log::info("dimensions : ");
-        for (auto dim : dims) {
-            Log::info("{}", dim);
-        }
-
-        const auto nbTimeSteps = dims[0];
         const auto beta = betaDist(gen);
+        const auto threshold = thresholdDist(gen);
+        const auto nbDims = nbDimsDist(gen);
+        std::vector<std::size_t> dims(nbDims);
+        std::generate(dims.begin(), dims.end(), [&]() { return dimSizeDist(gen); });
+        const auto nbTimeSteps = dims[0];
 
-        auto myLeaky = Leaky(nbTimeSteps, beta, 1.0, LeakyReset::Subtraction, "leaky");
-        auto op =
-            std::static_pointer_cast<MetaOperator_Op>(myLeaky->getOperator());
-        // auto stack = Stack(2);
-        auto mem_rec = Stack(nbTimeSteps, "mem_rec");
-        auto spk_rec = Stack(nbTimeSteps, "spk_rec");
-        auto pop = Pop("popinput");
+        auto myLeaky = Leaky(nbTimeSteps, beta, threshold, LeakyReset::Subtraction, "leaky");
+        auto op = std::static_pointer_cast<MetaOperator_Op>(myLeaky->getOperator());
+        auto memoryRecord = Stack(nbTimeSteps, "mem_rec");
+        auto spikeRecord = Stack(nbTimeSteps, "spk_rec");
+        auto pop = Pop("input");
 
-        // Here we test LSTM as it is was flatten in the graph.
-        // We just borrow its micro-graph into our larger myGraph graph.
-        auto myGraph = std::make_shared<GraphView>();
+        auto leakyOutputs = op->getMicroGraph()->getOrderedOutputs();
+        auto leakyInputs = op->getMicroGraph()->getOrderedInputs();
+        pop->addChild(leakyInputs[0].first, 0, 0);
+        leakyOutputs[1].first->addChild(memoryRecord,0,0);
+        leakyOutputs[0].first->addChild(spikeRecord,0,0);
 
-        pop->addChild(op->getMicroGraph()->getOrderedInputs()[0].first, 0, 0);
-        // 0 for mem 1 for stack
-        op->getMicroGraph()->getOrderedOutputs()[1].first->addChild(mem_rec,
-                                                                    0,
-                                                                    0);
-        op->getMicroGraph()->getOrderedOutputs()[0].first->addChild(spk_rec,
-                                                                    0,
-                                                                    0);
-        for (auto node : op->getMicroGraph()->getOrderedOutputs()) {
-            Log::info("name  of output {}", node.first->name());
-        }
-
-        myGraph->add(pop);
+        auto myGraph = std::make_shared<GraphView>();
         myGraph->add(op->getMicroGraph());
-        myGraph->add(mem_rec);
-        myGraph->add(spk_rec);
-        myGraph->save("mg", true, true);
+        myGraph->add({pop, memoryRecord, spikeRecord});
 
-        // 3 outputs
         REQUIRE(myLeaky->nbInputs() == 3);
         REQUIRE(myLeaky->inputCategory(0) == InputCategory::Data);
-        // Two spikes connected to nothing, + the Add node real output
         REQUIRE(myLeaky->nbOutputs() == 4);
 
-        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(
-            Array3D<float, 2, 3, 2>{{{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}},
-                                     {{2.0, 3.0}, {4.0, 5.0}, {6.0, 7.0}}}});
-
-        // std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(
-        //     Array3D<float, 2, 3, 2>{{{{1.0, 2.0}, {3.0, 4.0}, {5.0, 6.0}},
-        //                              {{2.0, 3.0}, {4.0, 5.0},
-        //                              {6.0, 7.0}}}});
-
-        // Generate input
-        std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>();
-        T0->setDataType(DataType::Float32);
-        T0->setBackend("cpu");
-
-        std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>();
-        expectedOutput->setDataType(DataType::Float32);
-        expectedOutput->setBackend("cpu");
-
         const auto nb_elements =
             std::accumulate(dims.cbegin(),
                             dims.cend(),
                             std::size_t(1),
                             std::multiplies<std::size_t>());
-        float *input = new float[nb_elements];
-        float *result = new float[nb_elements];
+        const auto nbElementsPerTimeStep = nb_elements / dims[0];
 
-        for (std::size_t i = 0; i < nb_elements; ++i) {
-            input[i] = valueDist(gen);
-        }
-        T0->resize(dims);
-        T0->getImpl()->setRawPtr(input, nb_elements);
-        T0->print();
 
-        // Elements popped at each time step
-        auto nbElementsPerTimeStep = nb_elements / dims[0];
+        // Compute the expected result with an ad-hoc reference implementation
 
         // Init
-        for (int i = 0; i < nbElementsPerTimeStep; ++i) {
-            result[i] = input[i];
-        }
-
-        // Reccurence
-        for (int i = 1; i < dims[0]; ++i) {
-            auto offset = nbElementsPerTimeStep * i;
-            auto prev = nbElementsPerTimeStep * (i - 1);
-            for (int j = 0; j < nbElementsPerTimeStep; ++j) {
-                auto reset = (result[prev + j] > 1.0 ? 1 : 0);
-                result[offset + j] =
-                    result[prev + j] * beta + input[offset + j] - reset;
+        auto *input = new float[nb_elements];
+        std::generate_n(input, nb_elements, [&]() { return valueDist(gen); });
+        auto *result = new float[nb_elements];
+        std::copy(input, input + nbElementsPerTimeStep, result);
+
+        // Recurrence calculation for each timestep
+        for (int timestep = 1; timestep < nbTimeSteps; ++timestep) {
+            const auto currentOffset = nbElementsPerTimeStep * timestep;
+            const auto previousOffset = nbElementsPerTimeStep * (timestep - 1);
+
+            for (int element = 0; element < nbElementsPerTimeStep; ++element) {
+                const auto previousValue = result[previousOffset + element];
+                const auto resetValue = (previousValue > threshold) ? threshold : 0;
+
+                result[currentOffset + element] =
+                    previousValue * beta + input[currentOffset + element] - resetValue;
             }
         }
 
+        auto expectedOutput = std::make_shared<Tensor>(DataType::Float32);
+        expectedOutput->setBackend("cpu");
         expectedOutput->resize(dims);
         expectedOutput->getImpl()->setRawPtr(result, nb_elements);
-        Log::info("Expected ouptut : ");
-        expectedOutput->print();
 
-        std::shared_ptr<Tensor> myInit =
-            std::make_shared<Tensor>(Array2D<float, 3, 3>{
-                {{0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}}});
 
-        auto initMemdims =
-            std::vector<std::size_t>(dims.begin() + 1, dims.end());
-        Log::info("dimensions : ");
-        for (auto dim : initMemdims) {
-            Log::info("{}", dim);
-        }
-        std::shared_ptr<Tensor> myInitW = std::make_shared<Tensor>(
-            Array2D<float, 3, 2>{{{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}});
+        // Compute the actual result using our operator implementation
+        auto inputTensor = std::make_shared<Tensor>(DataType::Float32);
+        inputTensor->setBackend("cpu");
+        inputTensor->resize(dims);
+        inputTensor->getImpl()->setRawPtr(input, nb_elements);
 
-        std::shared_ptr<Tensor> myInitR =
-            std::make_shared<Tensor>(initMemdims);
-        myInitR->setDataType(DataType::Float32);
-        myInitR->setBackend("cpu");
-        uniformFiller<float>(myInitR, 0, 0);
+        auto memoryInit = std::make_shared<Tensor>(DataType::Float32);
+        memoryInit->setBackend("cpu");
+        memoryInit->resize(std::vector<std::size_t>(dims.begin() + 1, dims.end()));
+        memoryInit->zeros();
 
-        pop->getOperator()->associateInput(0, T0);
-        op->associateInput(1, myInitR);
-        op->associateInput(2, myInitR);
+        pop->getOperator()->associateInput(0, inputTensor);
+        op->associateInput(1, memoryInit);
+        op->associateInput(2, memoryInit);
 
         myGraph->compile("cpu", DataType::Float32);
-
         auto scheduler = SequentialScheduler(myGraph);
         REQUIRE_NOTHROW(scheduler.generateScheduling());
         REQUIRE_NOTHROW(scheduler.forward(true));
 
+        // Compare expected output with actual output
         auto memOp =
-            std::static_pointer_cast<OperatorTensor>(spk_rec->getOperator());
+            std::static_pointer_cast<OperatorTensor>(spikeRecord->getOperator());
         REQUIRE(approxEq<float>(*(memOp->getOutput(0)), *(expectedOutput)));
     }
 }
-- 
GitLab
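
The reference loop in the cleaned-up test is a leaky integrate-and-fire recurrence with subtractive reset. Factored out of the test's flattened-array indexing, the per-element rule reduces to the sketch below (beta and threshold stand for the test's randomly drawn parameters):

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // v[t] = beta * v[t-1] + x[t] - (v[t-1] > threshold ? threshold : 0),
    // with v[0] = x[0] to account for the init step.
    std::vector<float> leakySubtractiveReset(const std::vector<float>& x,
                                             float beta, float threshold) {
        std::vector<float> v(x.size());
        v[0] = x[0];
        for (std::size_t t = 1; t < x.size(); ++t) {
            const float reset = (v[t - 1] > threshold) ? threshold : 0.0f;
            v[t] = beta * v[t - 1] + x[t] - reset;
        }
        return v;
    }

    int main() {
        for (float m : leakySubtractiveReset({1.0f, 2.0f, 3.0f}, 0.5f, 1.0f)) {
            std::printf("%g ", m); // prints: 1 2.5 3.25
        }
    }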


From 142e6e345c503675dc8aedfc28f6c40bf37c5c09 Mon Sep 17 00:00:00 2001
From: Jerome Hue <jerome.hue@cea.fr>
Date: Mon, 10 Mar 2025 10:22:49 +0100
Subject: [PATCH 043/108] Use nodes instead of Tensors

---
 unit_tests/operator/Test_MetaOperator.cpp | 43 ++++++++++++-----------
 1 file changed, 22 insertions(+), 21 deletions(-)

diff --git a/unit_tests/operator/Test_MetaOperator.cpp b/unit_tests/operator/Test_MetaOperator.cpp
index f781e5e2..7b0b80d8 100644
--- a/unit_tests/operator/Test_MetaOperator.cpp
+++ b/unit_tests/operator/Test_MetaOperator.cpp
@@ -745,7 +745,9 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
         REQUIRE(
             approxEq<float>(*(fc2Op->getOutput(0)), *(expectedOutputfc2ts2)));
     }
+}
 
+TEST_CASE("[cpu/operator] MetaOperator", "[Leaky][CPU]") {
     SECTION("Leaky(forward)") {
 
         std::random_device rd;
@@ -764,25 +766,15 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
         std::generate(dims.begin(), dims.end(), [&]() { return dimSizeDist(gen); });
         const auto nbTimeSteps = dims[0];
 
-        auto myLeaky = Leaky(nbTimeSteps, beta, threshold, LeakyReset::Subtraction, "leaky");
-        auto op = std::static_pointer_cast<MetaOperator_Op>(myLeaky->getOperator());
+        auto leakyNode = Leaky(nbTimeSteps, beta, threshold, LeakyReset::Subtraction, "leaky");
+        auto leakyOp = std::static_pointer_cast<MetaOperator_Op>(leakyNode->getOperator());
         auto memoryRecord = Stack(nbTimeSteps, "mem_rec");
         auto spikeRecord = Stack(nbTimeSteps, "spk_rec");
-        auto pop = Pop("input");
-
-        auto leakyOutputs = op->getMicroGraph()->getOrderedOutputs();
-        auto leakyInputs = op->getMicroGraph()->getOrderedInputs();
-        pop->addChild(leakyInputs[0].first, 0, 0);
-        leakyOutputs[1].first->addChild(memoryRecord,0,0);
-        leakyOutputs[0].first->addChild(spikeRecord,0,0);
-
-        auto myGraph = std::make_shared<GraphView>();
-        myGraph->add(op->getMicroGraph());
-        myGraph->add({pop, memoryRecord, spikeRecord});
+        auto popNode = Pop("input");
 
-        REQUIRE(myLeaky->nbInputs() == 3);
-        REQUIRE(myLeaky->inputCategory(0) == InputCategory::Data);
-        REQUIRE(myLeaky->nbOutputs() == 4);
+        REQUIRE(leakyNode->nbInputs() == 3);
+        REQUIRE(leakyNode->inputCategory(0) == InputCategory::Data);
+        REQUIRE(leakyNode->nbOutputs() == 4);
 
         const auto nb_elements =
             std::accumulate(dims.cbegin(),
@@ -830,19 +822,28 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
         memoryInit->setBackend("cpu");
         memoryInit->resize(std::vector<std::size_t>(dims.begin() + 1, dims.end()));
         memoryInit->zeros();
+        auto memoryInitNode = Producer(memoryInit);
 
-        pop->getOperator()->associateInput(0, inputTensor);
-        op->associateInput(1, memoryInit);
-        op->associateInput(2, memoryInit);
+        popNode->getOperator()->associateInput(0, inputTensor);
+        popNode->addChild(leakyNode,0, 0);
+        memoryInitNode->addChild(leakyNode, 0, 1);
+        memoryInitNode->addChild(leakyNode, 0, 2);
+        leakyNode->addChild(memoryRecord, 1, 0);
+        leakyNode->addChild(spikeRecord, 0, 0);
 
-        myGraph->compile("cpu", DataType::Float32);
-        auto scheduler = SequentialScheduler(myGraph);
+        auto g = std::make_shared<GraphView>();
+        g->add({popNode, leakyNode, memoryRecord, spikeRecord, memoryInitNode});
+        g->setDataType(DataType::Float32);
+        g->setBackend("cpu");
+
+        auto scheduler = SequentialScheduler(g);
         REQUIRE_NOTHROW(scheduler.generateScheduling());
         REQUIRE_NOTHROW(scheduler.forward(true));
 
         // Compare expected output with actual output
         auto memOp =
             std::static_pointer_cast<OperatorTensor>(spikeRecord->getOperator());
+        //memOp->getOutput(0)->print();
         REQUIRE(approxEq<float>(*(memOp->getOutput(0)), *(expectedOutput)));
     }
 }
-- 
GitLab
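
For readers following the rewiring: Node::addChild(otherNode, outId, inId) connects output #outId of the parent to input #inId of the child. The new version therefore builds the same topology as the removed micro-graph code: popNode drives the Leaky data input 0, the zero-initialized Producer feeds the two recurrent state inputs 1 and 2, and Leaky output 1 is recorded by the memory Stack while output 0 is recorded by the spike Stack.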


From e1322151a810d230f34675545183e97559a71d86 Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Tue, 4 Mar 2025 15:39:44 +0100
Subject: [PATCH 044/108] Added new scheduling policies

---
 unit_tests/scheduler/Test_Scheduler.cpp | 53 +++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/unit_tests/scheduler/Test_Scheduler.cpp b/unit_tests/scheduler/Test_Scheduler.cpp
index 54e57ec4..eb725ff3 100644
--- a/unit_tests/scheduler/Test_Scheduler.cpp
+++ b/unit_tests/scheduler/Test_Scheduler.cpp
@@ -17,6 +17,7 @@
 #include "aidge/graph/Node.hpp"
 #include "aidge/graph/GraphView.hpp"
 #include "aidge/graph/OpArgs.hpp"
+#include "aidge/operator/GenericOperator.hpp"
 #include "aidge/operator/Memorize.hpp"
 #include "aidge/operator/Pop.hpp"
 #include "aidge/operator/Stack.hpp"
@@ -28,6 +29,7 @@
 #include "aidge/operator/MetaOperator.hpp"
 #include "aidge/scheduler/SequentialScheduler.hpp"
 #include "aidge/scheduler/ParallelScheduler.hpp"
+#include "aidge/graph/Testing.hpp"
 
 #include "aidge/backend/cpu/operator/FCImpl.hpp"
 #include "aidge/backend/cpu/operator/ConvImpl.hpp"
@@ -520,6 +522,57 @@ TEST_CASE("[cpu/scheduler] Accumulate", "[scheduler]") {
     REQUIRE(*output == *expectedOutput);
 }
 
+TEST_CASE("[cpu/scheduler] Branch", "[scheduler]") {
+    std::shared_ptr<Tensor> in = std::make_shared<Tensor>(
+            Array2D<float, 2, 3>{{{1, 2, 3}, {4, 5, 6}}});
+
+    std::shared_ptr<GraphView> g = Sequential({
+        Producer(in, "input"),
+        Parallel({
+            Sequential({
+                GenericOperator("b0_op1", {InputCategory::Data}, 1),
+                GenericOperator("b0_op2", {InputCategory::Data}, 1),
+                GenericOperator("b0_op3", {InputCategory::Data}, 1),
+                GenericOperator("b0_op4", {InputCategory::Data}, 1),
+                GenericOperator("b0_op5", {InputCategory::Data}, 1)
+            }),
+            Sequential({
+                GenericOperator("b1_op1", {InputCategory::Data}, 1),
+                GenericOperator("b1_op2", {InputCategory::Data}, 1),
+                GenericOperator("b1_op3", {InputCategory::Data}, 1)
+            }),
+            Sequential({
+                GenericOperator("b2_op1", {InputCategory::Data}, 1)
+            })
+        }),
+        GenericOperator("op1", {InputCategory::Data, InputCategory::Data, InputCategory::Data}, 1),
+        GenericOperator("op2", {InputCategory::Data}, 1),
+        GenericOperator("op3", {InputCategory::Data}, 1)
+    });
+
+    g->save("branch_forwarded");
+
+    auto scheduler = SequentialScheduler(g);
+    scheduler.generateScheduling();
+    scheduler.saveStaticSchedulingDiagram("branch_scheduling");
+
+    // Default scheduling order is not necessarily deterministic, but it is guaranteed to be correct in every case.
+    // This behavior might change in the future.
+    auto seqSchedule = scheduler.getSequentialStaticScheduling(0, Scheduler::SchedulingPolicy::Default);
+    fmt::println("seqSchedule = {}", seqSchedule);
+
+    scheduler.tagForkBranches();
+    g->save("branch_forwarded_tag");
+
+    seqSchedule = scheduler.getSequentialStaticScheduling(0, Scheduler::SchedulingPolicy::ShortestBranchFirst);
+    REQUIRE(nodePtrTo(seqSchedule, nodePtrToType) == std::vector<std::string>{
+        "Producer", "b2_op1", "b1_op1", "b1_op2", "b1_op3", "b0_op1", "b0_op2", "b0_op3", "b0_op4", "b0_op5", "op1", "op2", "op3"});
+
+    seqSchedule = scheduler.getSequentialStaticScheduling(0, Scheduler::SchedulingPolicy::LonguestBranchFirst);
+    REQUIRE(nodePtrTo(seqSchedule, nodePtrToType) == std::vector<std::string>{
+        "Producer", "b0_op1", "b0_op2", "b0_op3", "b0_op4", "b0_op5", "b1_op1", "b1_op2", "b1_op3", "b2_op1", "op1", "op2", "op3"});
+}
+
 #ifdef WITH_OPENSSL
 TEST_CASE("[cpu/scheduler] Select", "[scheduler]") {
     std::shared_ptr<Tensor> in = std::make_shared<Tensor>(
-- 
GitLab
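
The two asserted orders follow from flattening the fork's branches by length before scheduling the join. A toy sketch of that ordering rule (this models only the policy; it is not Aidge's scheduler implementation):

    #include <algorithm>
    #include <cstdio>
    #include <string>
    #include <vector>

    // Flatten fork branches in length order: ascending models
    // ShortestBranchFirst, descending the longest-branch-first policy.
    std::vector<std::string>
    flattenBranches(std::vector<std::vector<std::string>> branches,
                    bool shortestFirst) {
        std::sort(branches.begin(), branches.end(),
                  [shortestFirst](const std::vector<std::string>& a,
                                  const std::vector<std::string>& b) {
                      return shortestFirst ? a.size() < b.size()
                                           : a.size() > b.size();
                  });
        std::vector<std::string> order;
        for (const auto& branch : branches) {
            order.insert(order.end(), branch.begin(), branch.end());
        }
        return order;
    }

    int main() {
        const auto order = flattenBranches(
            {{"b0_op1", "b0_op2", "b0_op3", "b0_op4", "b0_op5"},
             {"b1_op1", "b1_op2", "b1_op3"},
             {"b2_op1"}},
            /*shortestFirst=*/true);
        for (const auto& n : order) std::printf("%s ", n.c_str());
        // prints b2_op1 first, then b1_op1..b1_op3, then b0_op1..b0_op5:
        // the ShortestBranchFirst order asserted in the test above.
    }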


From 616115446b9b4ede126cf88a124e5c40e61bddbb Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Tue, 11 Mar 2025 17:45:20 +0100
Subject: [PATCH 045/108] Fixed scheduler test

---
 unit_tests/scheduler/Test_Scheduler.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/unit_tests/scheduler/Test_Scheduler.cpp b/unit_tests/scheduler/Test_Scheduler.cpp
index eb725ff3..be87e8ac 100644
--- a/unit_tests/scheduler/Test_Scheduler.cpp
+++ b/unit_tests/scheduler/Test_Scheduler.cpp
@@ -558,17 +558,17 @@ TEST_CASE("[cpu/scheduler] Branch", "[scheduler]") {
 
     // Default scheduling order is not necessarily deterministic, but it is guaranteed to be correct in every case.
     // This behavior might change in the future.
-    auto seqSchedule = scheduler.getSequentialStaticScheduling(0, Scheduler::SchedulingPolicy::Default);
+    auto seqSchedule = scheduler.Scheduler::getSequentialStaticScheduling(0, Scheduler::SchedulingPolicy::Default);
     fmt::println("seqSchedule = {}", seqSchedule);
 
     scheduler.tagForkBranches();
     g->save("branch_forwarded_tag");
 
-    seqSchedule = scheduler.getSequentialStaticScheduling(0, Scheduler::SchedulingPolicy::ShortestBranchFirst);
+    seqSchedule = scheduler.Scheduler::getSequentialStaticScheduling(0, Scheduler::SchedulingPolicy::ShortestBranchFirst);
     REQUIRE(nodePtrTo(seqSchedule, nodePtrToType) == std::vector<std::string>{
         "Producer", "b2_op1", "b1_op1", "b1_op2", "b1_op3", "b0_op1", "b0_op2", "b0_op3", "b0_op4", "b0_op5", "op1", "op2", "op3"});
 
-    seqSchedule = scheduler.getSequentialStaticScheduling(0, Scheduler::SchedulingPolicy::LonguestBranchFirst);
+    seqSchedule = scheduler.Scheduler::getSequentialStaticScheduling(0, Scheduler::SchedulingPolicy::LonguestBranchFirst);
     REQUIRE(nodePtrTo(seqSchedule, nodePtrToType) == std::vector<std::string>{
         "Producer", "b0_op1", "b0_op2", "b0_op3", "b0_op4", "b0_op5", "b1_op1", "b1_op2", "b1_op3", "b2_op1", "op1", "op2", "op3"});
 }
-- 
GitLab
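
The fix prefixes the calls with "Scheduler::": explicitly qualifying a member-function call selects that class's version of the function, bypassing both virtual dispatch and any name hiding in the derived class. A minimal, generic illustration of the syntax (Base/Derived are placeholders, unrelated to the Aidge classes):

    #include <cstdio>

    struct Base {
        virtual ~Base() = default;
        virtual void run() { std::puts("Base::run"); }
    };

    struct Derived : Base {
        void run() override { std::puts("Derived::run"); }
    };

    int main() {
        Derived d;
        d.run();       // virtual dispatch: prints "Derived::run"
        d.Base::run(); // qualified call:   prints "Base::run"
    }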


From 2407e191c6a10e6bca0eb5d07493948dc6bf913c Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Tue, 11 Mar 2025 20:11:38 +0100
Subject: [PATCH 046/108] Fixed wrong computation in int

---
 .../cpu/operator/AvgPoolingImpl_kernels.hpp   | 22 ++++++++++++++++---
 .../GlobalAveragePoolingImpl_kernels.hpp      |  2 +-
 .../cpu/operator/ReduceMeanImpl_kernels.hpp   | 13 ++++++-----
 unit_tests/operator/Test_AvgPoolingImpl.cpp   | 21 ++++++++++++++++++
 .../Test_GlobalAveragePoolingImpl.cpp         | 21 ++++++++++++++++++
 5 files changed, 70 insertions(+), 9 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
index 78f8446a..1671759d 100644
--- a/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
@@ -23,6 +23,22 @@
 #include "aidge/utils/Types.h"
 
 namespace Aidge {
+
+template <typename T>
+using Acc_T = typename std::conditional<std::is_floating_point<T>::value, T, double>::type;
+
+template <typename T>
+typename std::enable_if<std::is_floating_point<T>::value, T>::type
+castFromFloat(T value) {
+  return value;
+}
+
+template <typename T>
+typename std::enable_if<!std::is_floating_point<T>::value, T>::type
+castFromFloat(double value) {
+  return static_cast<T>(std::nearbyint(value));
+}
+
 /**
  * @brief Forward kernel for 2D AvgPoolingolution on CPU backend.
  * @tparam I Input data type.
@@ -79,7 +95,7 @@ void AvgPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideD
                     const std::size_t ix = ox * strideDims[0];
                     const std::size_t iy = oy * strideDims[1];
 
-                    O sum = static_cast<O>(0);
+                    Acc_T<I> sum = static_cast<Acc_T<I>>(0);
                     std::size_t count = 0;
 
                     for (unsigned int sy = syMin; sy < syMax; ++sy) {
@@ -90,13 +106,13 @@ void AvgPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideD
 
                             // Ensure within bounds
                             if ((ix + dilated_sx) < dims[2] && (iy + dilated_sy) < dims[3]) {
-                                sum += static_cast<O>(input[iIndex + (ix + dilated_sx) * dims[3] + (iy + dilated_sy)]);
+                                sum += static_cast<Acc_T<I>>(input[iIndex + (ix + dilated_sx) * dims[3] + (iy + dilated_sy)]);
                                 ++count;
                             }
                         }
                     }
 
-                    output[oIndexFull] = count > 0 ? sum / static_cast<O>(count) : 0;
+                    output[oIndexFull] = count > 0 ? castFromFloat<O>(sum / count) : 0;
                 }
             }
         }
diff --git a/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp
index d5e5561d..7a47ccf3 100644
--- a/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp
@@ -38,7 +38,7 @@ stableMean(const T* vec, size_t size) {
 
 // Specialization for integers: perform the mean computation in float
 template <typename T>
-typename std::enable_if<!std::is_floating_point<T>::value, T>::type
+typename std::enable_if<!std::is_floating_point<T>::value, double>::type
 stableMean(const T* vec, size_t size) {
   double mean = 0;
   for (size_t i = 0; i < size; ++i) {
diff --git a/include/aidge/backend/cpu/operator/ReduceMeanImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ReduceMeanImpl_kernels.hpp
index 864b89c4..a1562322 100644
--- a/include/aidge/backend/cpu/operator/ReduceMeanImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ReduceMeanImpl_kernels.hpp
@@ -25,6 +25,9 @@
 #include "aidge/utils/Registrar.hpp"
 
 namespace Aidge {
+    
+template <typename T>
+using Acc_T = typename std::conditional<std::is_floating_point<T>::value, T, double>::type;
 
 template <typename T>
 typename std::enable_if<std::is_floating_point<T>::value, T>::type
@@ -38,7 +41,7 @@ stableMean(const T* vec, size_t len, size_t stride) {
 
 // Specialization for integers: perform the mean computation in float
 template <typename T>
-typename std::enable_if<!std::is_floating_point<T>::value, T>::type
+typename std::enable_if<!std::is_floating_point<T>::value, double>::type
 stableMean(const T* vec, size_t len, size_t stride) {
   double mean = 0;
   for (size_t i = 0; i < len; ++i) {
@@ -102,13 +105,13 @@ void ReduceMeanImpl_cpu_forward_kernel(const std::vector<std::int32_t>& axes,
         }
 
         // Type should be the return type of stableMean<I>(), which is always floating point
-        const decltype(stableMean<I>(input, 0, 0))* inputAccumulation = nullptr;
-        decltype(stableMean<I>(input, 0, 0))* outputAccumulation = nullptr;
+        const Acc_T<I>* inputAccumulation = nullptr;
+        Acc_T<I>* outputAccumulation = nullptr;
 
         for (const auto& axisInt : axes) {
             const std::size_t a = static_cast<std::size_t>(axisInt);
             outputElements /= inputDims[a];
-            outputAccumulation = new I[outputElements];
+            outputAccumulation = new Acc_T<I>[outputElements];
             const std::size_t dim_i = inputDims[a];
             for (std::size_t pre = 0; pre < stride_pre[a]; ++pre) {
                 for (std::size_t post = 0; post < stride_post[a]; ++post) {
@@ -118,7 +121,7 @@ void ReduceMeanImpl_cpu_forward_kernel(const std::vector<std::int32_t>& axes,
                         outputAccumulation[idx_o] = stableMean<I>(input + idx_i, dim_i, stride_post[a]);
                     }
                     else {
-                        outputAccumulation[idx_o] = stableMean<I>(inputAccumulation + idx_i, dim_i, stride_post[a]);
+                        outputAccumulation[idx_o] = stableMean<Acc_T<I>>(inputAccumulation + idx_i, dim_i, stride_post[a]);
                     }
                 }
             }
diff --git a/unit_tests/operator/Test_AvgPoolingImpl.cpp b/unit_tests/operator/Test_AvgPoolingImpl.cpp
index f116934c..d0299ab5 100644
--- a/unit_tests/operator/Test_AvgPoolingImpl.cpp
+++ b/unit_tests/operator/Test_AvgPoolingImpl.cpp
@@ -201,4 +201,25 @@ TEST_CASE("[cpu/operator] AvgPooling(forward)", "[AvgPooling][CPU]") {
         op2->getOutput(0)->print();
         REQUIRE(*(op2->getOutput(0)) == *myOutput5);
     }
+
+    SECTION("Simple test") {
+      std::shared_ptr<Tensor> tensor =
+          std::make_shared<Tensor>(Array4D<int32_t, 1, 1, 7, 7>{{{{
+              {0, 8, 26, 35, 49, 45, 22},
+              {2, 24, 48, 66, 60, 46, 26},
+              {8, 41, 64, 68, 39, 18, 9},
+              {10, 48, 72, 76, 42, 14, 9},
+              {6, 29, 52, 65, 27, 7, 3},
+              {1, 9, 24, 31, 18, 7, 1},
+              {0, 0, 4, 6, 7, 1, 1}}}}});
+
+        auto op = AvgPooling2D_Op({7, 7});
+        op.setDataType(DataType::Int32);
+        op.setBackend("cpu");
+
+        op.associateInput(0, tensor);
+        op.forwardDims();
+        op.forward();
+        REQUIRE(op.getOutput(0)->get<int32_t>(0) == 26);
+    }
 }
\ No newline at end of file
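
Side note (editorial): the expected value checks out by hand. The 7x7 grid sums to 1274 and 1274 / 49 = 26 exactly, so the integer result needs no rounding; the same grid is reused by the GlobalAveragePooling test below.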
diff --git a/unit_tests/operator/Test_GlobalAveragePoolingImpl.cpp b/unit_tests/operator/Test_GlobalAveragePoolingImpl.cpp
index 8e8536ac..0fd7d84b 100644
--- a/unit_tests/operator/Test_GlobalAveragePoolingImpl.cpp
+++ b/unit_tests/operator/Test_GlobalAveragePoolingImpl.cpp
@@ -558,6 +558,27 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling",
       Log::info("Number of operations : {}\n", number_of_operation);
       Log::info("Operation / µs = {}\n", number_of_operation / duration.count());
     }
+
+    SECTION("Simple test") {
+      std::shared_ptr<Tensor> tensor =
+          std::make_shared<Tensor>(Array4D<int32_t, 1, 1, 7, 7>{{{{
+              {0, 8, 26, 35, 49, 45, 22},
+              {2, 24, 48, 66, 60, 46, 26},
+              {8, 41, 64, 68, 39, 18, 9},
+              {10, 48, 72, 76, 42, 14, 9},
+              {6, 29, 52, 65, 27, 7, 3},
+              {1, 9, 24, 31, 18, 7, 1},
+              {0, 0, 4, 6, 7, 1, 1}}}}});
+
+        auto op = GlobalAveragePooling_Op();
+        op.setDataType(DataType::Int32);
+        op.setBackend("cpu");
+
+        op.associateInput(0, tensor);
+        op.forwardDims();
+        op.forward();
+        REQUIRE(op.getOutput(0)->get<int32_t>(0) == 26);
+    }
   }
 }
 } // namespace Aidge
-- 
GitLab


From ea9a0a70e58900bbc54aeded143e7a37f62bcf92 Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Tue, 11 Mar 2025 20:14:53 +0100
Subject: [PATCH 047/108] Removed unrelated change

---
 unit_tests/scheduler/Test_Scheduler.cpp | 53 -------------------------
 1 file changed, 53 deletions(-)

diff --git a/unit_tests/scheduler/Test_Scheduler.cpp b/unit_tests/scheduler/Test_Scheduler.cpp
index be87e8ac..54e57ec4 100644
--- a/unit_tests/scheduler/Test_Scheduler.cpp
+++ b/unit_tests/scheduler/Test_Scheduler.cpp
@@ -17,7 +17,6 @@
 #include "aidge/graph/Node.hpp"
 #include "aidge/graph/GraphView.hpp"
 #include "aidge/graph/OpArgs.hpp"
-#include "aidge/operator/GenericOperator.hpp"
 #include "aidge/operator/Memorize.hpp"
 #include "aidge/operator/Pop.hpp"
 #include "aidge/operator/Stack.hpp"
@@ -29,7 +28,6 @@
 #include "aidge/operator/MetaOperator.hpp"
 #include "aidge/scheduler/SequentialScheduler.hpp"
 #include "aidge/scheduler/ParallelScheduler.hpp"
-#include "aidge/graph/Testing.hpp"
 
 #include "aidge/backend/cpu/operator/FCImpl.hpp"
 #include "aidge/backend/cpu/operator/ConvImpl.hpp"
@@ -522,57 +520,6 @@ TEST_CASE("[cpu/scheduler] Accumulate", "[scheduler]") {
     REQUIRE(*output == *expectedOutput);
 }
 
-TEST_CASE("[cpu/scheduler] Branch", "[scheduler]") {
-    std::shared_ptr<Tensor> in = std::make_shared<Tensor>(
-            Array2D<float, 2, 3>{{{1, 2, 3}, {4, 5, 6}}});
-
-    std::shared_ptr<GraphView> g = Sequential({
-        Producer(in, "input"),
-        Parallel({
-            Sequential({
-                GenericOperator("b0_op1", {InputCategory::Data}, 1),
-                GenericOperator("b0_op2", {InputCategory::Data}, 1),
-                GenericOperator("b0_op3", {InputCategory::Data}, 1),
-                GenericOperator("b0_op4", {InputCategory::Data}, 1),
-                GenericOperator("b0_op5", {InputCategory::Data}, 1)
-            }),
-            Sequential({
-                GenericOperator("b1_op1", {InputCategory::Data}, 1),
-                GenericOperator("b1_op2", {InputCategory::Data}, 1),
-                GenericOperator("b1_op3", {InputCategory::Data}, 1)
-            }),
-            Sequential({
-                GenericOperator("b2_op1", {InputCategory::Data}, 1)
-            })
-        }),
-        GenericOperator("op1", {InputCategory::Data, InputCategory::Data, InputCategory::Data}, 1),
-        GenericOperator("op2", {InputCategory::Data}, 1),
-        GenericOperator("op3", {InputCategory::Data}, 1)
-    });
-
-    g->save("branch_forwarded");
-
-    auto scheduler = SequentialScheduler(g);
-    scheduler.generateScheduling();
-    scheduler.saveStaticSchedulingDiagram("branch_scheduling");
-
-    // Default scheduling order is not necessarily deterministic, but is guaranteed to be correct in every case.
-    // This behavior might change in the future.
-    auto seqSchedule = scheduler.Scheduler::getSequentialStaticScheduling(0, Scheduler::SchedulingPolicy::Default);
-    fmt::println("seqSchedule = {}", seqSchedule);
-
-    scheduler.tagForkBranches();
-    g->save("branch_forwarded_tag");
-
-    seqSchedule = scheduler.Scheduler::getSequentialStaticScheduling(0, Scheduler::SchedulingPolicy::ShortestBranchFirst);
-    REQUIRE(nodePtrTo(seqSchedule, nodePtrToType) == std::vector<std::string>{
-        "Producer", "b2_op1", "b1_op1", "b1_op2", "b1_op3", "b0_op1", "b0_op2", "b0_op3", "b0_op4", "b0_op5", "op1", "op2", "op3"});
-
-    seqSchedule = scheduler.Scheduler::getSequentialStaticScheduling(0, Scheduler::SchedulingPolicy::LonguestBranchFirst);
-    REQUIRE(nodePtrTo(seqSchedule, nodePtrToType) == std::vector<std::string>{
-        "Producer", "b0_op1", "b0_op2", "b0_op3", "b0_op4", "b0_op5", "b1_op1", "b1_op2", "b1_op3", "b2_op1", "op1", "op2", "op3"});
-}
-
 #ifdef WITH_OPENSSL
 TEST_CASE("[cpu/scheduler] Select", "[scheduler]") {
     std::shared_ptr<Tensor> in = std::make_shared<Tensor>(
-- 
GitLab


From d34c46219694ac7084073c097a2a6de04b223af9 Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Sun, 16 Mar 2025 11:30:52 +0100
Subject: [PATCH 048/108] Revert "Removed unrelated change"

This reverts commit ea9a0a70e58900bbc54aeded143e7a37f62bcf92.
---
 unit_tests/scheduler/Test_Scheduler.cpp | 53 +++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/unit_tests/scheduler/Test_Scheduler.cpp b/unit_tests/scheduler/Test_Scheduler.cpp
index 54e57ec4..be87e8ac 100644
--- a/unit_tests/scheduler/Test_Scheduler.cpp
+++ b/unit_tests/scheduler/Test_Scheduler.cpp
@@ -17,6 +17,7 @@
 #include "aidge/graph/Node.hpp"
 #include "aidge/graph/GraphView.hpp"
 #include "aidge/graph/OpArgs.hpp"
+#include "aidge/operator/GenericOperator.hpp"
 #include "aidge/operator/Memorize.hpp"
 #include "aidge/operator/Pop.hpp"
 #include "aidge/operator/Stack.hpp"
@@ -28,6 +29,7 @@
 #include "aidge/operator/MetaOperator.hpp"
 #include "aidge/scheduler/SequentialScheduler.hpp"
 #include "aidge/scheduler/ParallelScheduler.hpp"
+#include "aidge/graph/Testing.hpp"
 
 #include "aidge/backend/cpu/operator/FCImpl.hpp"
 #include "aidge/backend/cpu/operator/ConvImpl.hpp"
@@ -520,6 +522,57 @@ TEST_CASE("[cpu/scheduler] Accumulate", "[scheduler]") {
     REQUIRE(*output == *expectedOutput);
 }
 
+TEST_CASE("[cpu/scheduler] Branch", "[scheduler]") {
+    std::shared_ptr<Tensor> in = std::make_shared<Tensor>(
+            Array2D<float, 2, 3>{{{1, 2, 3}, {4, 5, 6}}});
+
+    std::shared_ptr<GraphView> g = Sequential({
+        Producer(in, "input"),
+        Parallel({
+            Sequential({
+                GenericOperator("b0_op1", {InputCategory::Data}, 1),
+                GenericOperator("b0_op2", {InputCategory::Data}, 1),
+                GenericOperator("b0_op3", {InputCategory::Data}, 1),
+                GenericOperator("b0_op4", {InputCategory::Data}, 1),
+                GenericOperator("b0_op5", {InputCategory::Data}, 1)
+            }),
+            Sequential({
+                GenericOperator("b1_op1", {InputCategory::Data}, 1),
+                GenericOperator("b1_op2", {InputCategory::Data}, 1),
+                GenericOperator("b1_op3", {InputCategory::Data}, 1)
+            }),
+            Sequential({
+                GenericOperator("b2_op1", {InputCategory::Data}, 1)
+            })
+        }),
+        GenericOperator("op1", {InputCategory::Data, InputCategory::Data, InputCategory::Data}, 1),
+        GenericOperator("op2", {InputCategory::Data}, 1),
+        GenericOperator("op3", {InputCategory::Data}, 1)
+    });
+
+    g->save("branch_forwarded");
+
+    auto scheduler = SequentialScheduler(g);
+    scheduler.generateScheduling();
+    scheduler.saveStaticSchedulingDiagram("branch_scheduling");
+
+    // Default scheduling order is not necessarily deterministic, but is guaranteed to be correct in every case.
+    // This behavior might change in the future.
+    auto seqSchedule = scheduler.Scheduler::getSequentialStaticScheduling(0, Scheduler::SchedulingPolicy::Default);
+    fmt::println("seqSchedule = {}", seqSchedule);
+
+    scheduler.tagForkBranches();
+    g->save("branch_forwarded_tag");
+
+    seqSchedule = scheduler.Scheduler::getSequentialStaticScheduling(0, Scheduler::SchedulingPolicy::ShortestBranchFirst);
+    REQUIRE(nodePtrTo(seqSchedule, nodePtrToType) == std::vector<std::string>{
+        "Producer", "b2_op1", "b1_op1", "b1_op2", "b1_op3", "b0_op1", "b0_op2", "b0_op3", "b0_op4", "b0_op5", "op1", "op2", "op3"});
+
+    seqSchedule = scheduler.Scheduler::getSequentialStaticScheduling(0, Scheduler::SchedulingPolicy::LonguestBranchFirst);
+    REQUIRE(nodePtrTo(seqSchedule, nodePtrToType) == std::vector<std::string>{
+        "Producer", "b0_op1", "b0_op2", "b0_op3", "b0_op4", "b0_op5", "b1_op1", "b1_op2", "b1_op3", "b2_op1", "op1", "op2", "op3"});
+}
+
 #ifdef WITH_OPENSSL
 TEST_CASE("[cpu/scheduler] Select", "[scheduler]") {
     std::shared_ptr<Tensor> in = std::make_shared<Tensor>(
-- 
GitLab


From 504790997a45d583b77ee2187c553a2c17f932de Mon Sep 17 00:00:00 2001
From: Noam ZERAH <noam.zerah@cea.fr>
Date: Tue, 25 Feb 2025 14:52:14 +0000
Subject: [PATCH 049/108] Updating cpu backend for bitshift with the new
 rounding attribute

---
 .../backend/cpu/operator/BitShiftImpl.hpp     |  1 +
 .../cpu/operator/BitShiftImpl_kernels.hpp     | 17 ++--
 src/operator/BitShiftImpl.cpp                 |  1 +
 unit_tests/operator/Test_BitShift.cpp         | 77 ++++++++++++++++++-
 4 files changed, 90 insertions(+), 6 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/BitShiftImpl.hpp b/include/aidge/backend/cpu/operator/BitShiftImpl.hpp
index 807d2b97..79b0c5a3 100644
--- a/include/aidge/backend/cpu/operator/BitShiftImpl.hpp
+++ b/include/aidge/backend/cpu/operator/BitShiftImpl.hpp
@@ -24,6 +24,7 @@ namespace Aidge {
 // Operator implementation entry point for the backend
 using BitShiftImpl_cpu = OperatorImpl_cpu<BitShift_Op,
     void(const BitShift_Op::BitShiftDirection,
+    const bool,
     std::vector<std::size_t>,
     std::vector<std::size_t>,
     const std::vector<std::size_t>&,
diff --git a/include/aidge/backend/cpu/operator/BitShiftImpl_kernels.hpp b/include/aidge/backend/cpu/operator/BitShiftImpl_kernels.hpp
index 1f2561af..89921d36 100644
--- a/include/aidge/backend/cpu/operator/BitShiftImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/BitShiftImpl_kernels.hpp
@@ -27,6 +27,7 @@ namespace {
 template <class I1, class I2, class O>
 void bitshift_contiguous_arrays(
     const Aidge::BitShift_Op::BitShiftDirection direction,
+    const bool rounding,
     const std::size_t input1size,
     const std::size_t input2size,
     const std::size_t output1size,
@@ -34,13 +35,18 @@ void bitshift_contiguous_arrays(
     const I2* input_2,
     O* output)
 {
-    if(direction == Aidge::BitShift_Op::BitShiftDirection::right) {
+    if (direction == Aidge::BitShift_Op::BitShiftDirection::right) {
         for (std::size_t i = 0; i < output1size; ++i) {
             const std::size_t idx1 = (input1size != 1) ? i : 0;
             const std::size_t idx2 = (input2size != 1) ? i : 0;
-            output[i]= input_1[idx1] >> input_2[idx2];
+            const int shift = input_2[idx2];
+
+            if (rounding && shift > 0) {
+                output[i] = ((input_1[idx1] >> (shift - 1)) + 1) >> 1;
+            } else {
+                output[i] = input_1[idx1] >> shift;
+            }
         }
-
     } else {
         for (std::size_t i = 0; i < output1size; ++i) {
             const std::size_t idx1 = (input1size != 1) ? i : 0;
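
Side note (editorial, not part of the patch): the expression ((x >> (s - 1)) + 1) >> 1 shifts one bit less than requested, adds half a unit of the final shift, then performs the last shift, which rounds the result to the nearest integer for non-negative inputs:

    x = 14, s = 2 : plain 14 >> 2 = 3, but 14 / 4 = 3.5  and ((14 >> 1) + 1) >> 1 = (7 + 1) >> 1 = 4
    x = 13, s = 2 : plain 13 >> 2 = 3,     13 / 4 = 3.25 and ((13 >> 1) + 1) >> 1 = (6 + 1) >> 1 = 3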
@@ -55,6 +61,7 @@ namespace Aidge {
 template <class I1, class I2, class O>
 void BitShiftImpl_cpu_forward_kernel(
                                 const BitShift_Op::BitShiftDirection direction,
+                                const bool rounding,
                                 std::vector<std::size_t> dims0,
                                 std::vector<std::size_t> dims1,
                                 const std::vector<std::size_t>& outputDims,
@@ -79,7 +86,7 @@ void BitShiftImpl_cpu_forward_kernel(
     // special case for equal dimensions, the kernel is called with the entire arrays at once
     if (dims0 == dims1) {
         const std::size_t input0_contiguous_size = std::accumulate(dims0.cbegin(), dims0.cend(), std::size_t(1), std::multiplies<std::size_t>());
-        bitshift_contiguous_arrays(direction, input0_contiguous_size, input0_contiguous_size, input0_contiguous_size, input_0, input_1, output);
+        bitshift_contiguous_arrays(direction, rounding, input0_contiguous_size, input0_contiguous_size, input0_contiguous_size, input_0, input_1, output);
         return;
     }
 
@@ -142,7 +149,7 @@ void BitShiftImpl_cpu_forward_kernel(
     std::size_t dim = contiguousIdx - 1;
     const std::size_t nbStacks = std::accumulate(outputDims.cbegin(), outputDims.cbegin() + contiguousIdx, std::size_t(1), std::multiplies<std::size_t>());
     for (std::size_t stack = 0; stack < nbStacks;) {
-        bitshift_contiguous_arrays<I1,I2,O>(direction, input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
+        bitshift_contiguous_arrays<I1,I2,O>(direction, rounding, input0_contiguous_size, input1_contiguous_size, output_contiguous_size,
                     input_0 + offsetIn0*input0_contiguous_size,
                     input_1 + offsetIn1*input1_contiguous_size,
                     output + offsetOut*output_contiguous_size);
diff --git a/src/operator/BitShiftImpl.cpp b/src/operator/BitShiftImpl.cpp
index c6940554..ad41cb15 100644
--- a/src/operator/BitShiftImpl.cpp
+++ b/src/operator/BitShiftImpl.cpp
@@ -33,6 +33,7 @@ void Aidge::BitShiftImpl_cpu::forward() {
     // Call kernel
     impl.forward(
         op_.direction(),
+        op_.rounding(),
         op_.getInput(0)->dims(),
         op_.getInput(1)->dims(),
         op_.getOutput(0)->dims(),
diff --git a/unit_tests/operator/Test_BitShift.cpp b/unit_tests/operator/Test_BitShift.cpp
index 33ab932e..9cce9d6d 100644
--- a/unit_tests/operator/Test_BitShift.cpp
+++ b/unit_tests/operator/Test_BitShift.cpp
@@ -8,7 +8,6 @@
  * SPDX-License-Identifier: EPL-2.0
  *
  ********************************************************************************/
-
 #include <chrono>      // std::micro, std::chrono::time_point,
                        // std::chrono::system_clock
 #include <cstddef>   // std::size_t
@@ -139,6 +138,82 @@ TEST_CASE("[cpu/operator] BitShift_TEST", "[BitShift][CPU]") {
             Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
             Log::info("total time: {}μs\n", duration.count());
         }
+        SECTION("Test Forward Kernel with same dimensions and applying rounding") {
+            std::shared_ptr<Node> RoundBitShift = BitShift(BitShift_Op::BitShiftDirection::right, true);
+            auto op_r = std::static_pointer_cast<OperatorTensor>(RoundBitShift->getOperator());
+            op_r->setDataType(DataType::Int32);
+            op_r->setBackend("cpu");
+
+            // Create 2 input Tensors
+            std::shared_ptr<Tensor> T0_r = std::make_shared<Tensor>();
+            op_r->associateInput(0, T0_r);
+            T0_r->setDataType(DataType::Int32);
+            T0_r->setBackend("cpu");
+            std::shared_ptr<Tensor> T1_r = std::make_shared<Tensor>();
+            op_r->associateInput(1, T1_r);
+            T1_r->setDataType(DataType::Int32);
+            T1_r->setBackend("cpu");
+
+            // Create results Tensor
+            std::shared_ptr<Tensor> Tres_r = std::make_shared<Tensor>();
+            Tres_r->setDataType(DataType::Int32);
+            Tres_r->setBackend("cpu");
+            std::size_t number_of_operation = 0;
+
+            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+                // generate 2 random Tensors
+                const std::size_t nbDims = nbDimsDist(gen);
+                std::vector<std::size_t> dims;
+                for (std::size_t i = 0; i < nbDims; ++i) {
+                    dims.push_back(dimSizeDist(gen));
+                }
+                const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                number_of_operation += nb_elements;
+
+                // without broadcasting
+                int* array0 = new int[nb_elements];
+                int* array1 = new int[nb_elements];
+                int* result = new int[nb_elements];
+                for (std::size_t i = 0; i < nb_elements; ++i) {
+                    array0[i] = valueDist(gen);
+                    array1[i] = std::abs(valueDist(gen)); // shifting by a negative value is undefined
+                    result[i] = array0[i] >> array1[i];
+                    if (array1[i] > 0) // rounding only applies when the shift value is > 0
+                        result[i] = ((array0[i] >> (array1[i] - 1)) + 1) >> 1;
+                }
+
+                // input0
+                T0_r->resize(dims);
+                T0_r->getImpl()->setRawPtr(array0, nb_elements);
+
+                // input1
+                T1_r->resize(dims);
+                T1_r->getImpl()->setRawPtr(array1, nb_elements);
+
+                // results
+                Tres_r->resize(dims);
+                Tres_r->getImpl()->setRawPtr(result, nb_elements);
+
+                op_r->forwardDims();
+                start = std::chrono::system_clock::now();
+                RoundBitShift->forward();
+                end = std::chrono::system_clock::now();
+                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+                bool is_eq_round = approxEq<int>(*(op_r->getOutput(0)), *Tres_r);
+
+                REQUIRE(is_eq_round);
+
+                delete[] array0;
+                delete[] array1;
+                delete[] result;
+            }
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {}μs\n", duration.count());
+        }
         SECTION("Test BitShift kernels with Broadcasting") {
             std::size_t number_of_operation = 0;
 
-- 
GitLab


From c0dbb037196d29bb45311af53e7d2892fa621a50 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gr=C3=A9goire=20KUBLER?= <gregoire.kubler@proton.me>
Date: Tue, 28 Jan 2025 11:00:52 +0100
Subject: [PATCH 050/108] feat : Added convolution 1/2D backward kernels

---
 .../aidge/backend/cpu/operator/ConvImpl.hpp   |  66 +-
 .../backend/cpu/operator/ConvImpl_kernels.hpp | 717 ++++++++++++-
 src/operator/ConvImpl.cpp                     |  93 +-
 unit_tests/operator/Test_ConvImpl.cpp         | 998 +++++++++++++++++-
 4 files changed, 1821 insertions(+), 53 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/ConvImpl.hpp b/include/aidge/backend/cpu/operator/ConvImpl.hpp
index c06d0912..8bf11ac0 100644
--- a/include/aidge/backend/cpu/operator/ConvImpl.hpp
+++ b/include/aidge/backend/cpu/operator/ConvImpl.hpp
@@ -13,45 +13,63 @@
 #define AIDGE_CPU_OPERATOR_CONVIMPL_H_
 
 #include <array>
-#include <memory>
-#include <tuple>
-#include <vector>
 
 #include "aidge/backend/cpu/operator/OperatorImpl.hpp"
 #include "aidge/operator/Conv.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
-#include "aidge/backend/cpu/data/GetCPUPtr.h"
 
 namespace Aidge {
 // Operator implementation entry point for the backend
 using Conv1D_Op = Conv_Op<1>;
 using ConvImpl1D_cpu = OperatorImpl_cpu<Conv_Op<1>,
-    void(const std::array<DimSize_t, 1>&,
-        const std::array<DimSize_t, 1>&,
-        const std::array<DimSize_t, 1>&,
-        const std::array<DimSize_t, 3> &,
-        DimSize_t,
-        const void *,
-        const void *,
-        const void *,
-        void *)>;
+                                        void(const std::array<DimSize_t, 1> &,
+                                             const std::array<DimSize_t, 1> &,
+                                             const std::array<DimSize_t, 1> &,
+                                             const std::array<DimSize_t, 3> &,
+                                             DimSize_t,
+                                             const void *,
+                                             const void *,
+                                             const void *,
+                                             void *),
+                                        void(const std::array<DimSize_t, 1> &,
+                                             const std::array<DimSize_t, 1> &,
+                                             const std::array<DimSize_t, 1> &,
+                                             const std::array<DimSize_t, 3> &,
+                                             const std::array<DimSize_t, 3> &,
+                                             const void *,
+                                             const void *,
+                                             const void *,
+                                             void *,
+                                             void *,
+                                             void *)>;
 
 using Conv2D_Op = Conv_Op<2>;
-using ConvImpl2D_cpu = OperatorImpl_cpu<Conv_Op<2>,
-    void(const std::array<DimSize_t, 2>&,
-        const std::array<DimSize_t, 2>&,
-        const std::array<DimSize_t, 2>&,
-        const std::array<DimSize_t, 4> &,
-        DimSize_t,
-        const void *,
-        const void *,
-        const void *,
-        void *)>;
+using ConvImpl2D_cpu = OperatorImpl_cpu<Conv2D_Op,
+                                        void(const std::array<DimSize_t, 2> &,
+                                             const std::array<DimSize_t, 2> &,
+                                             const std::array<DimSize_t, 2> &,
+                                             const std::array<DimSize_t, 4> &,
+                                             DimSize_t,
+                                             const void *,
+                                             const void *,
+                                             const void *,
+                                             void *),
+                                        void(const std::array<DimSize_t, 2> &,
+                                             const std::array<DimSize_t, 2> &,
+                                             const std::array<DimSize_t, 2> &,
+                                             const std::array<DimSize_t, 4> &,
+                                             const std::array<DimSize_t, 4> &,
+                                             const void *,
+                                             const void *,
+                                             const void *,
+                                             void *,
+                                             void *,
+                                             void *)>;
 
 // Implementation entry point registration to Operator
 REGISTRAR(Conv1D_Op, "cpu", Aidge::ConvImpl1D_cpu::create);
 REGISTRAR(Conv2D_Op, "cpu", Aidge::ConvImpl2D_cpu::create);
-}  // namespace Aidge
+} // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_CONVIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
index 1229d571..70377260 100644
--- a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
@@ -25,6 +25,8 @@
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
 
 namespace Aidge {
+using std::array;
+
 /**
  * @brief Forward kernel for 1D Convolution on CPU backend.
  * @tparam I Input data type.
@@ -85,9 +87,80 @@ void ConvImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& strideDims,
                     const std::size_t oIndexFull = oIndex + ox;
                     const signedsize ix = static_cast<signedsize>(ox * strideDims[0]);
 
-                    for (std::size_t sx = sxMin; sx*dilationDims[0] < sxMax; ++sx) {
-                        output[oIndexFull] += weights[wIndex + sx] *
-                                                input[iIndex + static_cast<std::size_t>(ix+static_cast<signedsize>(sx*dilationDims[0]))];
+/**
+ * @brief perform 1D backpropagation for the data input
+ * @note INPUT & OUTPUT convention is the same as in the
+ * forward function
+ * @note formula :
+ * for i in 0..input_size:
+ *  for n in 0..weight_size:
+ *    dL     dYn  dL
+ *   ---- = ---- ----
+ *    dXi    dXi  dYn
+ * with : dYn / dXi = w_k
+ * i.e. for each output value and each kernel weight, accumulate
+ * weight * output-gradient into the matching input gradient
+ * @note kernel, stride & dilation are arrays of length 1, so only
+ * index 0 is ever accessed
+ * @note reminder that kernel dimensions are
+ * {outChannels, inChannels, {kernelDims}}
+ * <=> {oDims[1], iDims[1], kernelDim}
+ * @tparam I Input data type.
+ * @tparam W Weight data type.
+ * @tparam O Output data type.
+ * @param[in] stride stride parameter of the convolution operator
+ * @param[in] dilation dilation parameter of the convolution operator
+ * @param[in] kDim dimension of the kernel
+ * @param[in] kStrides nb of elements contained per dimension of the kernel
+ * @param[in] weights kernel weights
+ * @param[in] oDims dimensions of the output
+ * @param[in] oStrides nb of elements contained per dimension of the output
+ * @param[in] oGrad output gradient
+ * @param[in] iDims input dimensions
+ * @param[in] iStrides nb of elements contained per dimension of the input
+ * @param[inout] iGrad gradients of the input to update
+ */
+template <class I, class W, class O>
+void conv1DBackwardInput(const array<DimSize_t, 1> &stride,
+                         const array<DimSize_t, 1> &dilation,
+                         const array<DimSize_t, 1> &kDim,
+                         const array<DimSize_t, 2> &kStrides,
+                         const W *weights,
+                         const array<DimSize_t, 3> &oDims,
+                         const array<DimSize_t, 2> &oStrides,
+                         const O *oGrad,
+                         const array<DimSize_t, 3> &iDims,
+                         const array<DimSize_t, 2> &iStrides,
+                         I *iGrad) {
+
+    array<DimSize_t, 2> iOffsets{0, 0};
+    array<DimSize_t, 2> oOffsets{0, 0};
+    array<DimSize_t, 2> kOffsets{0, 0};
+
+    for (std::size_t batch = 0; batch < iDims[0]; ++batch) {
+        iOffsets[0] = batch * iStrides[0];
+        oOffsets[0] = batch * oStrides[0];
+
+        for (DimSize_t oChannel = 0; oChannel < oDims[1]; oChannel++) {
+            oOffsets[1] = (oChannel * oStrides[1]) + oOffsets[0];
+            kOffsets[0] = oChannel * kStrides[0];
+
+            for (std::size_t iChannel = 0; iChannel < iDims[1]; ++iChannel) {
+                iOffsets[1] = (iChannel * iStrides[1]) + iOffsets[0];
+                kOffsets[1] = iChannel * kStrides[1] + kOffsets[0];
+
+                for (DimSize_t oX = 0; oX < oDims[2]; ++oX) {
+                    auto iX = oX * stride[0];
+                    auto inIdx = iX + iOffsets[1];
+
+                    for (DimSize_t kX = 0; kX < kDim[0]; ++kX) {
+                        auto dilatedKernelIdx = kX * dilation[0];
+
+                        iGrad[inIdx + dilatedKernelIdx] +=
+                            weights[kOffsets[1] + kX] *
+                            oGrad[oOffsets[1] + oX];
                     }
                 }
             }
@@ -95,20 +168,261 @@ void ConvImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& strideDims,
     }
 }
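
Side note (editorial, not part of the patch): a worked instance of what the loops above compute, for one batch and one channel pair. With input x = (x0, x1, x2), kernel w = (w0, w1), stride 1 and dilation 1, the forward pass produces y0 = w0*x0 + w1*x1 and y1 = w0*x1 + w1*x2. Writing g = dL/dy, the kernel accumulates

    dL/dx0 = w0*g0
    dL/dx1 = w1*g0 + w0*g1
    dL/dx2 = w1*g1

that is, each output gradient is scattered back over the receptive field that produced it.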
 
+/**
+ * @brief computes weight backpropagation for conv1D
+ * @note INPUT & OUTPUT convention is the same as in the
+ * forward function
+ * weight grad
+ * for i in 0..weight_size:
+ *  for n in 0..output_size:
+ *    dL     dYn  dL
+ *   ---- = ---- ----
+ *   dwi     dwi  dYn
+ * with : dYn / dwi = x_k
+ * @tparam I Input data type.
+ * @tparam W Weight data type.
+ * @tparam O Output data type.
+ * @param[in] stride stride parameter of the convolution operator
+ * @param[in] dilation dilation parameter of the convolution operator
+ * @param[in] iDims input dimensions
+ * @param[in] iStrides nb of elements contained per dimension of the input
+ * @param[in] input input values
+ * @param[in] oDims dimensions of the output
+ * @param[in] oStrides nb of elements contained per dimension of the output
+ * @param[in] oGrad output gradient
+ * @param[in] kDim dimension of the kernel
+ * @param[in] kStrides nb of elements contained per dimension of the kernel
+ * @param[inout] weightsGrad gradients of the weights to update
+ */
+template <class I, class W, class O>
+static void conv1DBackwardWeights(const array<DimSize_t, 1> &stride,
+                                  const array<DimSize_t, 1> &dilation,
+                                  const array<DimSize_t, 3> &iDims,
+                                  const array<DimSize_t, 2> &iStrides,
+                                  const I *input,
+                                  const array<DimSize_t, 3> &oDims,
+                                  const array<DimSize_t, 2> &oStrides,
+                                  const O *oGrad,
+                                  const array<DimSize_t, 1> &kDim,
+                                  const array<DimSize_t, 2> &kStrides,
+                                  W *weightsGrad) {
+
+    array<DimSize_t, 2> iOffsets{0, 0};
+    array<DimSize_t, 2> oOffsets{0, 0};
+    array<DimSize_t, 2> kOffsets{0, 0};
+
+    for (DimSize_t batch = 0; batch < oDims[0]; ++batch) {
+        iOffsets[0] = batch * iStrides[0];
+        oOffsets[0] = batch * oStrides[0];
+
+        for (DimSize_t oChannel = 0; oChannel < oDims[1]; ++oChannel) {
+            oOffsets[1] = oChannel * oStrides[1] + oOffsets[0];
+            kOffsets[0] = oChannel * kStrides[0];
+
+            for (DimSize_t iChannel = 0; iChannel < iDims[1]; ++iChannel) {
+                kOffsets[1] = iChannel * kStrides[1] + kOffsets[0];
+                iOffsets[1] = iChannel * iStrides[1] + iOffsets[0];
+                oOffsets[1] = oChannel * oStrides[1] + oOffsets[0];
+
+                for (DimSize_t kX = 0; kX < kDim[0]; ++kX) {
+
+                    for (DimSize_t oX = 0; oX < oDims[2]; ++oX) {
+                        const DimSize_t iX = oX * stride[0] + kX * dilation[0];
+
+                        weightsGrad[kOffsets[1] + kX] +=
+                            input[iOffsets[1] + iX] * oGrad[oOffsets[1] + oX];
+                    }
+                }
+            }
+        }
+    }
+}
+
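
Side note (editorial): continuing the same toy example (x = (x0, x1, x2), w = (w0, w1), stride 1, dilation 1, g = dL/dy), the loops above accumulate

    dL/dw0 = x0*g0 + x1*g1
    dL/dw1 = x1*g0 + x2*g1

i.e. each weight gradient sums input * output-gradient over every output position where that weight was applied.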
+/**
+ * @brief computes bias backpropagation for conv1D operation
+ * @note INPUT & OUTPUT convention is the same as in the
+ * forward function
+ * @note formula :
+ * Bias grad:
+ * for i in 0..bias_size:
+ *  for n in 0..output_size:
+ *    dL     dYn  dL
+ *   ---- = ---- ----
+ *   dbi     dbi  Yn
+ * with : dYn / dbi = 1
+ *
+ * Hence the partial derivative of the loss wrt bias is the
+ * output loss. Hence the bias grad is just the sum of the
+ * loss values over the batch
+ * @tparam I Input data type.
+ * @tparam W Weight data type.
+ * @tparam B Bias data type.
+ * @tparam O Output data type.
+ * @param[in] oDims output tensor dimensions
+ * @param[in] oStrides nb of elements contained per dimension of the output
+ * tensor
+ * @param[in] oGrad output tensor gradients
+ * @param[inout] biasesGrad biases gradients
+ */
+template <class B, class O>
+static void conv1DBackwardBias(const array<DimSize_t, 3> &oDims,
+                               const array<DimSize_t, 2> &oStrides,
+                               const O *oGrad,
+                               B *biasesGrad) {
+    array<DimSize_t, 2> oOffsets{0, 0};
+
+    for (DimSize_t batchIdx = 0; batchIdx < oDims[0]; ++batchIdx) {
+        oOffsets[0] = batchIdx * oStrides[0];
+
+        for (DimSize_t oChannel = 0; oChannel < oDims[1]; ++oChannel) {
+            oOffsets[1] = oChannel * oStrides[1] + oOffsets[0];
+
+            for (DimSize_t oIdx = 0; oIdx < oDims[2]; oIdx++) {
+                biasesGrad[oChannel] += oGrad[oOffsets[1] + oIdx];
+            }
+        }
+    }
+}
+
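
Side note (editorial): in the same toy example the bias gradient reduces to dL/db = g0 + g1. Since dYn/db = 1, every output-gradient element of a channel contributes exactly once, summed over batch and spatial positions.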
+/**
+ * @brief Backward kernel for 1D Convolution on CPU backend.
+ * @note INPUT & OUTPUT convention is the same as in the
+ * forward function
+ *
+ * @tparam I Input data type.
+ * @tparam W Weight data type.
+ * @tparam B Bias data type.
+ * @tparam O Output data type.
+ * @param[in] stride stride attribute of the conv operator
+ * @param[in] dilation dilation attribute of the conv operator
+ * @param[in] kernelDim dimension of the kernel
+ * @param[in] inputDims input data dimensions
+ * @param[in] outputDims output data dimensions
+ * @param[in] input_ const input Tensor
+ * @param[in] weights_ const weight Tensor
+ * @param[in] oGrad_ gradients of output data
+ * @param[inout] iGrad_ gradients of input data
+ * @param[inout] weightsGrad_ gradients of the kernel weights
+ * @param[inout] biasesGrad_ gradients of the kernel biases
+ */
+template <class I, class W, class B, class O>
+void ConvImpl1D_cpu_backward_kernel(const array<DimSize_t,1> &stride,
+                                    const array<DimSize_t,1> &dilation,
+                                    const array<DimSize_t,1> &kernelDim,
+                                    const array<DimSize_t, 3> &inputDims,
+                                    const array<DimSize_t, 3> &outputDims,
+                                    const void *input_,
+                                    const void *weights_,
+                                    const void *oGrad_,
+                                    void *iGrad_,
+                                    void *weightsGrad_,
+                                    void *biasesGrad_) {
+
+    const I *input = static_cast<const I *>(input_);
+    I *iGrad = static_cast<I *>(iGrad_);
+    const O *oGrad = static_cast<const O *>(oGrad_);
+    const W *weights = static_cast<const W *>(weights_);
+    W *weightsGrad = static_cast<W *>(weightsGrad_);
+
+    //////////////////////////////
+    // COMPUTING STRIDES
+    //////////////////////////////
+    // NOTE: The ...Strides variables hold the number of values contained in
+    // each dimension; they are used to compute index offsets while iterating
+    // over each tensor.
+    // NOTE: They hold one entry less than the rank of their tensor, as the
+    // total element count is only needed for gradient initialization.
+
+    // {batch_stride, channel_stride}
+    const array<DimSize_t, 2> inputStrides{inputDims[1] * inputDims[2],
+                                           inputDims[2]};
+    const DimSize_t nbEltsInput = inputDims[0] * inputStrides[0];
+
+    // {batch_stride, channel_stride}
+    const array<DimSize_t, 2> outputStrides{outputDims[1] * outputDims[2],
+                                            outputDims[2]};
+
+    // NOTE: kernel dims = {oChannel, iChannel, kernelDim0}
+    // kernel_strides = {oChannel_stride, iChannel_stride}
+    const array<DimSize_t, 2> kernelStrides{
+        inputDims[1] * kernelDim[0],
+        kernelDim[0],
+    };
+    const DimSize_t nbEltsKernel = outputDims[1] * kernelStrides[0];
+
+    std::fill(iGrad, iGrad + nbEltsInput, I(0));
+    std::fill(weightsGrad, weightsGrad + nbEltsKernel, W(0));
+
+    conv1DBackwardInput(stride,
+                        dilation,
+                        kernelDim,
+                        kernelStrides,
+                        weights,
+                        outputDims,
+                        outputStrides,
+                        oGrad,
+                        inputDims,
+                        inputStrides,
+                        iGrad);
+
+    conv1DBackwardWeights(stride,
+                          dilation,
+                          inputDims,
+                          inputStrides,
+                          input,
+                          outputDims,
+                          outputStrides,
+                          oGrad,
+                          kernelDim,
+                          kernelStrides,
+                          weightsGrad);
+
+    if (biasesGrad_ != nullptr) {
+        B *biasesGrad = static_cast<B *>(biasesGrad_);
+        std::fill(biasesGrad, biasesGrad + outputDims[1], B(0));
+        conv1DBackwardBias(outputDims, outputStrides, oGrad, biasesGrad);
+    }
+}
+
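
Side note (editorial): concrete values for the stride computation above. For inputDims {N, C, L} = {2, 3, 5}, inputStrides = {3 * 5, 5} = {15, 5}, so element (n, c, l) sits at flat index n * 15 + c * 5 + l, and nbEltsInput = 2 * 15 = 30 input-gradient elements are zeroed before accumulation.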
 // Kernels registration to implementation entry point
 REGISTRAR(ConvImpl1D_cpu,
-    {{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}},
-    {ProdConso::inPlaceModel, Aidge::ConvImpl1D_cpu_forward_kernel<float, float, float, float>, nullptr});
+          {{DataType::Any, DataFormat::NCHW},
+           {DataType::Float32, DataFormat::NCHW}},
+          {ProdConso::inPlaceModel,
+           ConvImpl1D_cpu_forward_kernel<float, float, float, float>,
+           ConvImpl1D_cpu_backward_kernel<float, float, float, float>});
 REGISTRAR(ConvImpl1D_cpu,
-    {{DataType::Any, DataFormat::NCHW}, {DataType::Float16, DataFormat::NCHW}},
-    {ProdConso::inPlaceModel, Aidge::ConvImpl1D_cpu_forward_kernel<half_float::half, half_float::half, half_float::half, half_float::half>, nullptr});
+          {{DataType::Any, DataFormat::NCHW},
+           {DataType::Float16, DataFormat::NCHW}},
+          {ProdConso::inPlaceModel,
+           ConvImpl1D_cpu_forward_kernel<half_float::half,
+                                         half_float::half,
+                                         half_float::half,
+                                         half_float::half>,
+           ConvImpl1D_cpu_backward_kernel<half_float::half,
+                                          half_float::half,
+                                          half_float::half,
+                                          half_float::half>});
 REGISTRAR(ConvImpl1D_cpu,
-    {{DataType::Any, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}},
-    {ProdConso::inPlaceModel, Aidge::ConvImpl1D_cpu_forward_kernel<int32_t, int32_t, int32_t, int32_t>, nullptr});
+          {{DataType::Any, DataFormat::NCHW},
+           {DataType::Float64, DataFormat::NCHW}},
+          {ProdConso::inPlaceModel,
+           ConvImpl1D_cpu_forward_kernel<double, double, double, double>,
+           ConvImpl1D_cpu_backward_kernel<double, double, double, double>});
 REGISTRAR(ConvImpl1D_cpu,
-    {{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}},
-    {ProdConso::inPlaceModel, Aidge::ConvImpl1D_cpu_forward_kernel<double, double, double, double>, nullptr});
-
+          {{DataType::Any, DataFormat::NCHW},
+           {DataType::Int32, DataFormat::NCHW}},
+          {ProdConso::inPlaceModel,
+           ConvImpl1D_cpu_forward_kernel<std::int32_t,
+                                         std::int32_t,
+                                         std::int32_t,
+                                         std::int32_t>,
+           ConvImpl1D_cpu_backward_kernel<std::int32_t,
+                                          std::int32_t,
+                                          std::int32_t,
+                                          std::int32_t>});
 
 /**
  * @brief Forward kernel for 2D Convolution on CPU backend.
@@ -256,21 +570,380 @@ void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
     }
 }
 
+/**
+ * @brief perform backpropagation for the input
+ * @note INPUT & OUTPUT convention is the same as in the
+ * forward function
+ * @note formula :
+ * for i in 0..input_size:
+ *  for n in 0..weight_size:
+ *    dL     dYn  dL
+ *   ---- = ---- ----
+ *    dXi    dXi  dYn
+ * with : dYn / dXi = w_k
+ * i.e. for each output value and each kernel weight, accumulate
+ * weight * output-gradient into the matching input gradient
+ * @note reminder that kernel dimensions are
+ * {outChannels, inChannels, {kernelDims}}
+ * <=> {oDims[1], iDims[1], kernelDim}
+ * @tparam I Input data type.
+ * @tparam W Weight data type.
+ * @tparam O Output data type.
+ * @param[in] stride stride parameter of the convolution operator
+ * @param[in] dilation dilation parameter of the convolution operator
+ * @param[in] kDims dimension of the kernel
+ * @param[in] kStrides nb of elements contained per dimension of the kernel
+ * @param[in] weights weights values
+ * @param[in] oDims dimensions of the output
+ * @param[in] oStrides nb of elements contained per dimension of the output
+ * @param[in] oGrad output gradient
+ * @param[in] iDims input dimensions
+ * @param[in] iStrides nb of elements contained per dimension of the input
+ * @param[inout] iGrad gradients of the input to update
+ */
+template <class I, class W, class O>
+void conv2DBackwardInput(const array<DimSize_t, 2> &stride,
+                         const array<DimSize_t, 2> &dilation,
+                         const array<DimSize_t, 2> &kDims,
+                         const array<DimSize_t, 3> &kStrides,
+                         const W *weights,
+                         const array<DimSize_t, 4> &oDims,
+                         const array<DimSize_t, 3> &oStrides,
+                         const O *oGrad,
+                         const array<DimSize_t, 4> &iDims,
+                         const array<DimSize_t, 3> &iStrides,
+                         I *iGrad) {
+    // records index offsets for each dimension that has a stride (== all
+    // dimensions except the last) for every parsed tensor
+    array<DimSize_t, 3> kOffset{};
+    array<DimSize_t, 3> iOffset{};
+    array<DimSize_t, 3> oOffset{};
+
+    for (std::size_t batch = 0; batch < iDims[0]; ++batch) {
+        iOffset[0] = batch * iStrides[0];
+        oOffset[0] = batch * oStrides[0];
+
+        for (DimSize_t oChannel = 0; oChannel < oDims[1]; oChannel++) {
+            oOffset[1] = (oChannel * oStrides[1]) + oOffset[0];
+            kOffset[0] = (oChannel * kStrides[0]);
 
+            for (std::size_t iChannel = 0; iChannel < iDims[1]; ++iChannel) {
+                iOffset[1] = (iChannel * iStrides[1]) + iOffset[0];
+                kOffset[1] = iChannel * kStrides[1] + kOffset[0];
+
+                for (DimSize_t oX = 0; oX < oDims[2]; ++oX) {
+                    oOffset[2] = (oX * oStrides[2]) + oOffset[1];
+
+                    auto iX = oX * stride[0];
+                    iOffset[2] = (iX * iStrides[2]) + iOffset[1];
+
+                    for (DimSize_t oY = 0; oY < oDims[3]; ++oY) {
+                        auto oIdx = oOffset[2] + oY;
+
+                        auto iY = oY * stride[1];
+                        auto iIdx = iOffset[2] + iY;
+
+                        for (DimSize_t kX = 0; kX < kDims[0]; ++kX) {
+                            auto kDilX = kX * dilation[0];
+                            auto iDilKXOffset = kDilX * iStrides[2];
+
+                            kOffset[2] = (kX * kStrides[2]) + kOffset[1];
+
+                            for (DimSize_t kY = 0; kY < kDims[1]; ++kY) {
+                                auto kDilY = kY * dilation[1];
+
+                                iGrad[iIdx + iDilKXOffset + kDilY] +=
+                                    weights[kOffset[2] + kY] * oGrad[oIdx];
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
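
Side note (editorial): the 2D index arithmetic above generalizes the 1D case. The gradient of output position (oX, oY) is scattered to input positions (oX * stride[0] + kX * dilation[0], oY * stride[1] + kY * dilation[1]), weighted by w(kX, kY). For a 1x1 output and a 2x2 kernel this collapses to dL/dx(kX, kY) = w(kX, kY) * g(0, 0).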
+/**
+ * @brief computes weight backpropagation for conv2D operation
+ * @note INPUT & OUTPUT convention is the same as in the
+ * forward function
+ * weight grad
+ * for i in 0..weight_size:
+ *  for n in 0..output_size:
+ *    dL     dYn  dL
+ *   ---- = ---- ----
+ *   dwi     dwi  dYn
+ * with : dYn / dwi = x_k
+ * @tparam I input dtype
+ * @tparam W weight dtype
+ * @tparam O output dtype
+ * @param[in] iDims input data dimensions
+ * @param[in] iStrides nb of elements contained per dimension of the input
+ * @param[in] input input data
+ * @param[in] oDims output data dimensions
+ * @param[in] oStrides nb of elements contained per dimension of the output
+ * @param[in] oGrad gradients of output data
+ * @param[in] kDim dimensions of the kernel
+ * @param[in] kStrides nb of elements contained per dimension of the kernel
+ * @param[in] stride stride parameter of the convolution operator
+ * @param[in] dilation dilation parameter of the convolution operator
+ * @param[inout] weightsGrad gradients of the kernel weights
+ */
+template <class I, class W, class O>
+void conv2DBackwardWeights(const array<DimSize_t, 4> &iDims,
+                           const array<DimSize_t, 3> &iStrides,
+                           const I *input,
+                           const array<DimSize_t, 4> &oDims,
+                           const array<DimSize_t, 3> &oStrides,
+                           const O *oGrad,
+                           const array<DimSize_t, 2> &kDim,
+                           const array<DimSize_t, 3> &kStrides,
+                           const array<DimSize_t, 2> &stride,
+                           const array<DimSize_t, 2> &dilation,
+                           W *weightsGrad) {
+    // records index offsets for each dimension that has a stride (== all
+    // dimensions except the last) for every parsed tensor
+    array<DimSize_t, 3> iOffsets{0, 0, 0};
+    array<DimSize_t, 3> oOffsets{0, 0, 0};
+    array<DimSize_t, 3> kOffsets{0, 0, 0};
+
+    for (DimSize_t batchIdx = 0; batchIdx < oDims[0]; ++batchIdx) {
+        iOffsets[0] = batchIdx * iStrides[0];
+        oOffsets[0] = batchIdx * oStrides[0];
+
+        for (DimSize_t iChannel = 0; iChannel < iDims[1]; ++iChannel) {
+            iOffsets[1] = iChannel * iStrides[1] + iOffsets[0];
+            kOffsets[0] = iChannel * kStrides[1]; // iChannel stride in the {oC, iC, kH, kW} layout
+
+            for (DimSize_t oChannel = 0; oChannel < oDims[1]; ++oChannel) {
+                oOffsets[1] = oChannel * oStrides[1] + oOffsets[0];
+                kOffsets[1] = oChannel * kStrides[0] + kOffsets[0];
+
+                for (DimSize_t kX = 0; kX < kDim[0]; ++kX) {
+                    kOffsets[2] = kX * kStrides[2] + kOffsets[1];
+                    for (DimSize_t kY = 0; kY < kDim[1]; ++kY) {
+
+                        for (DimSize_t oX = 0; oX < oDims[2]; ++oX) {
+                            const DimSize_t iX =
+                                oX * stride[0] + kX * dilation[0];
+
+                            oOffsets[2] = oX * oStrides[2] + oOffsets[1];
+                            iOffsets[2] = iX * iStrides[2] + iOffsets[1];
+
+                            for (DimSize_t oY = 0; oY < oDims[3]; ++oY) {
+                                const DimSize_t iY =
+                                    oY * stride[1] + kY * dilation[1];
+
+                                weightsGrad[kOffsets[2] + kY] +=
+                                    input[iOffsets[2] + iY] *
+                                    oGrad[oOffsets[2] + oY];
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
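
Side note (editorial): this is the mirror image of the input backward pass: dL/dw(kX, kY) accumulates input(oX * stride[0] + kX * dilation[0], oY * stride[1] + kY * dilation[1]) * g(oX, oY) over every batch and output position, which is exactly what the two innermost loops above iterate over.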
+/**
+ * @brief computes bias backpropagation for conv2D operation
+ * @note INPUT & OUTPUT convention is the same as in the
+ * forward function
+ * @note formula :
+ * Bias grad:
+ * for i in 0..bias_size:
+ *  for n in 0..output_size:
+ *    dL     dYn  dL
+ *   ---- = ---- ----
+ *   dbi     dbi  Yn
+ * with : dYn / dbi = 1
+ *
+ * Hence the partial derivative of the loss wrt bias is the
+ * output loss Hence the bias grad is just the sum of the
+ * loss values over the batch
+ * @tparam I Input data type.
+ * @tparam W Weight data type.
+ * @tparam B Bias data type.
+ * @tparam O Output data type.
+ * @param[in] oDims output tensor dimensions
+ * @param[in] oStrides nb of elements contained per dimension of the
+ * output
+ * @param[in] oGrad output tensor gradients
+ * @param[inout] biasesGrad biases gradients
+ */
+template <class B, class O>
+static void conv2DBackwardBias(const array<DimSize_t, 4> &oDims,
+                               const array<DimSize_t, 3> &oStrides,
+                               const O *oGrad,
+                               B *biasesGrad) {
+    // records all index offsets for output tensor
+    array<DimSize_t, 3> oOffsets{};
+    for (DimSize_t batchIdx = 0; batchIdx < oDims[0]; ++batchIdx) {
+        oOffsets[0] = batchIdx * oStrides[0];
+
+        for (DimSize_t oChannel = 0; oChannel < oDims[1]; ++oChannel) {
+            oOffsets[1] = oChannel * oStrides[1] + oOffsets[0];
+
+            for (DimSize_t oX = 0; oX < oDims[2]; ++oX) {
+                oOffsets[2] = oX * oStrides[2] + oOffsets[1];
+
+                for (DimSize_t oY = 0; oY < oDims[3]; ++oY) {
+                    biasesGrad[oChannel] += oGrad[oOffsets[2] + oY];
+                }
+            }
+        }
+    }
+}
+
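
Side note (editorial): a quick numeric check. For an output gradient of shape {N, C, H, W} = {1, 2, 2, 2} filled with ones, the loops above yield biasesGrad = {4, 4}: each channel sums its 2 * 2 spatial gradient values over the single batch.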
+/**
+ * @brief Backward kernel for 2D Convolution on CPU backend.
+ * @note INPUT & OUTPUT convention is the same as in the
+ * forward function
+ *
+ * @tparam I Input data type.
+ * @tparam W Weight data type.
+ * @tparam B Bias data type.
+ * @tparam O Output data type.
+ * @param[in] stride stride attribute of the conv operator
+ * @param[in] dilation dilation attribute of the conv operator
+ * @param[in] kernelDims dimensions of the kernel
+ * @param[in] inputDims input data dimensions
+ * @param[in] outputDims output data dimensions
+ * @param[in] input_ input tensor
+ * @param[in] weights_ kernel tensor
+ * @param[in] oGrad_ output tensor gradient
+ * @param[inout] iGrad_ input tensor gradient
+ * @param[inout] weightsGrad_ kernel weights tensor gradients
+ * @param[inout] biasesGrad_ kernel biases tensor gradients
+ */
+template <class I, class W, class B, class O>
+void ConvImpl2D_cpu_backward_kernel(const array<DimSize_t, 2> &stride,
+                                    const array<DimSize_t, 2> &dilation,
+                                    const array<DimSize_t, 2> &kernelDims,
+                                    const array<DimSize_t, 4> &inputDims,
+                                    const array<DimSize_t, 4> &outputDims,
+                                    const void *input_,
+                                    const void *weights_,
+                                    const void *oGrad_,
+                                    void *iGrad_,
+                                    void *weightsGrad_,
+                                    void *biasesGrad_) {
+
+    const I *input = static_cast<const I *>(input_);
+    I *iGrad = static_cast<I *>(iGrad_);
+    const O *outputGrad = static_cast<const O *>(oGrad_);
+    const W *weights = static_cast<const W *>(weights_);
+    W *weightsGrad = static_cast<W *>(weightsGrad_);
+
+    //////////////////////////////
+    // COMPUTING STRIDES
+    //////////////////////////////
+    // NOTE: The ...Strides variables hold the number of values contained in
+    // each dimension; they are used to compute index offsets while iterating
+    // over each tensor.
+    // NOTE: They hold one entry less than the rank of their tensor, as the
+    // total element count is only needed for gradient initialization.
+
+    // {batch_stride, channel_stride, dim0_stride}
+    const array<DimSize_t, 3> inputStrides{
+        inputDims[1] * inputDims[2] * inputDims[3],
+        inputDims[2] * inputDims[3],
+        inputDims[3]};
+    const DimSize_t nbEltsInput = inputDims[0] * inputStrides[0];
+
+    // {batch_stride, channel_stride, dim0_stride}
+    const array<DimSize_t, 3> outputStrides{
+        outputDims[1] * outputDims[2] * outputDims[3],
+        outputDims[2] * outputDims[3],
+        outputDims[3]};
+
+    // NOTE: kernel dims = {oChannel, iChannel, kernelDim0, kernelDim1}
+    // kernel_strides = {oChannel_stride, iChannel_stride, kernelDim0_stride}
+    const array<DimSize_t, 3> kernelStrides{
+        inputDims[1] * kernelDims[0] * kernelDims[1],
+        kernelDims[0] * kernelDims[1],
+        kernelDims[1]};
+
+    const DimSize_t nbEltsKernel = outputDims[1] * kernelStrides[0];
+
+    ////////////////////////////
+    // prepping gradient arrays
+    std::fill(iGrad, iGrad + nbEltsInput, I(0));
+    std::fill(weightsGrad, weightsGrad + nbEltsKernel, W(0));
+
+    conv2DBackwardInput(stride,
+                        dilation,
+                        kernelDims,
+                        kernelStrides,
+                        weights,
+                        outputDims,
+                        outputStrides,
+                        outputGrad,
+                        inputDims,
+                        inputStrides,
+                        iGrad);
+
+    conv2DBackwardWeights(inputDims,
+                          inputStrides,
+                          input,
+                          outputDims,
+                          outputStrides,
+                          outputGrad,
+                          kernelDims,
+                          kernelStrides,
+                          stride,
+                          dilation,
+                          weightsGrad);
+
+    if (biasesGrad_ != nullptr) {
+        B *biasesGrad = static_cast<B *>(biasesGrad_);
+        std::fill(biasesGrad, biasesGrad + outputDims[1], B(0));
+        conv2DBackwardBias(outputDims, outputStrides, outputGrad, biasesGrad);
+    }
+}
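
Side note (editorial): concrete values for the NCHW strides above. With inputDims {N, C, H, W} = {1, 2, 4, 4}, inputStrides = {2 * 4 * 4, 4 * 4, 4} = {32, 16, 4}, so element (n, c, h, w) sits at flat index n * 32 + c * 16 + h * 4 + w.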
 
 // Kernels registration to implementation entry point
 REGISTRAR(ConvImpl2D_cpu,
-    {{DataType::Any, DataFormat::NCHW}, {DataType::Float32, DataFormat::NCHW}},
-    {ProdConso::inPlaceModel, Aidge::ConvImpl2D_cpu_forward_kernel<float, float, float, float>, nullptr});
-REGISTRAR(ConvImpl2D_cpu,
-    {{DataType::Any, DataFormat::NCHW}, {DataType::Float16, DataFormat::NCHW}},
-    {ProdConso::inPlaceModel, Aidge::ConvImpl2D_cpu_forward_kernel<half_float::half, half_float::half, half_float::half, half_float::half>, nullptr});
-REGISTRAR(ConvImpl2D_cpu,
-    {{DataType::Any, DataFormat::NCHW}, {DataType::Int32, DataFormat::NCHW}},
-    {ProdConso::inPlaceModel, Aidge::ConvImpl2D_cpu_forward_kernel<int32_t, int32_t, int32_t, int32_t>, nullptr});
+          {{DataType::Any, DataFormat::NCHW},
+           {DataType::Float32, DataFormat::NCHW}},
+          {ProdConso::inPlaceModel,
+           Aidge::ConvImpl2D_cpu_forward_kernel<float, float, float, float>,
+           Aidge::ConvImpl2D_cpu_backward_kernel<float, float, float, float>});
 REGISTRAR(ConvImpl2D_cpu,
+          {{DataType::Any, DataFormat::NCHW},
+           {DataType::Float16, DataFormat::NCHW}},
+          {ProdConso::inPlaceModel,
+           Aidge::ConvImpl2D_cpu_forward_kernel<half_float::half,
+                                                half_float::half,
+                                                half_float::half,
+                                                half_float::half>,
+           Aidge::ConvImpl2D_cpu_backward_kernel<half_float::half,
+                                                 half_float::half,
+                                                 half_float::half,
+                                                 half_float::half>});
+REGISTRAR(
+    ConvImpl2D_cpu,
     {{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}},
-    {ProdConso::inPlaceModel, Aidge::ConvImpl2D_cpu_forward_kernel<double, double, double, double>, nullptr});
-}  // namespace Aidge
+    {ProdConso::inPlaceModel,
+     Aidge::ConvImpl2D_cpu_forward_kernel<double, double, double, double>,
+     Aidge::ConvImpl2D_cpu_backward_kernel<double, double, double, double>});
+REGISTRAR(ConvImpl2D_cpu,
+          {{DataType::Any, DataFormat::NCHW},
+           {DataType::Int32, DataFormat::NCHW}},
+          {ProdConso::inPlaceModel,
+           ConvImpl2D_cpu_forward_kernel<std::int32_t,
+                                         std::int32_t,
+                                         std::int32_t,
+                                         std::int32_t>,
+           ConvImpl2D_cpu_backward_kernel<std::int32_t,
+                                          std::int32_t,
+                                          std::int32_t,
+                                          std::int32_t>});
+} // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_CONVIMPL_KERNELS_H_ */
diff --git a/src/operator/ConvImpl.cpp b/src/operator/ConvImpl.cpp
index fdfe19fb..782a58d3 100644
--- a/src/operator/ConvImpl.cpp
+++ b/src/operator/ConvImpl.cpp
@@ -22,6 +22,8 @@
 #include "aidge/operator/Conv.hpp"
 #include "aidge/utils/Types.h"
 
+namespace Aidge {
+
 template <>
 void Aidge::ConvImpl1D_cpu::forward() {
     const auto& op_ = static_cast<const Conv_Op<1>&>(mOp);
@@ -55,9 +57,47 @@ void Aidge::ConvImpl1D_cpu::forward() {
             );
 }
 
-template <>
-void Aidge::ConvImpl1D_cpu::backward() {
-    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Conv_Op<1> on backend cpu");
+template <> void ConvImpl1D_cpu::backward() {
+    const auto &op = dynamic_cast<const Conv1D_Op &>(mOp);
+    const auto &outputGrad = op.getOutput(0)->grad();
+    AIDGE_ASSERT(outputGrad, "{}: missing output #0 gradient", op.type());
+    AIDGE_ASSERT(op.getInput(0)->grad(),
+                 "{}: missing data input(#0) gradient",
+                 op.type());
+    AIDGE_ASSERT(op.getInput(1)->grad(),
+                 "{}: missing weight input(#1) gradient",
+                 op.type());
+
+    std::shared_ptr<Tensor> inputDataGradFallback, inputWeightGradFallback,
+        inputBiasGradFallback;
+    const auto &inputDataGrad =
+        op.getInput(0)->grad()->refCastFrom(inputDataGradFallback,
+                                            *(op.getOutput(0)));
+    const auto &inputWeightGrad =
+        op.getInput(1)->grad()->refCastFrom(inputWeightGradFallback,
+                                            *(op.getOutput(0)));
+    const auto &inputBiasGrad =
+        (op.getInput(2) && op.getInput(2)->grad())
+            ? op.getInput(2)->grad()->refCastFrom(inputBiasGradFallback,
+                                                  *(op.getOutput(0)))
+            : Tensor();
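+    // NOTE: refCastFrom() returns each gradient cast to match the reference
+    // tensor (here output #0), using the fallback when a copy is needed; an
+    // empty Tensor stands in when the optional bias is absent.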
+
+    // Call kernel
+    const auto impl =
+        Registrar<ConvImpl1D_cpu>::create(getBestMatch(getRequiredSpec()));
+    impl.backward(
+        op.strideDims(),
+        op.dilationDims(),
+        op.kernelDims(),
+        op.getInput(0)->template dims<3>(),
+        op.getOutput(0)->template dims<3>(),
+
+        getCPUPtr(op.getInput(0)),
+        getCPUPtr(op.getInput(1)),
+        getCPUPtr(outputGrad),
+        inputDataGrad.getImpl()->rawPtr(),
+        inputWeightGrad.getImpl()->rawPtr(),
+        op.getInput(2) ? inputBiasGrad.getImpl()->rawPtr() : nullptr);
 }
 
 template <>
@@ -93,7 +133,48 @@ void Aidge::ConvImpl2D_cpu::forward() {
             );
 }
 
-template <>
-void Aidge::ConvImpl2D_cpu::backward() {
-    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Conv_Op<2> on backend cpu");
+
+template <> void ConvImpl2D_cpu::backward() {
+    const auto &op = dynamic_cast<const Conv2D_Op &>(mOp);
+    const auto &outputGrad = op.getOutput(0)->grad();
+    AIDGE_ASSERT(outputGrad, "{}: missing output #0 gradient", op.type());
+    AIDGE_ASSERT(op.getInput(0)->grad(),
+                 "{}: missing data input(#0) gradient",
+                 op.type());
+    AIDGE_ASSERT(op.getInput(1)->grad(),
+                 "{}: missing weight input(#1) gradient",
+                 op.type());
+
+    std::shared_ptr<Tensor> inputDataGradFallback, inputWeightGradFallback,
+        inputBiasGradFallback;
+    const auto &inputDataGrad =
+        op.getInput(0)->grad()->refCastFrom(inputDataGradFallback,
+                                            *(op.getOutput(0)));
+    const auto &inputWeightGrad =
+        op.getInput(1)->grad()->refCastFrom(inputWeightGradFallback,
+                                            *(op.getOutput(0)));
+    const auto &inputBiasGrad =
+        (op.getInput(2) && op.getInput(2)->grad())
+            ? op.getInput(2)->grad()->refCastFrom(inputBiasGradFallback,
+                                                  *(op.getOutput(0)))
+            : Tensor();
+
+    // Call kernel
+    const auto impl =
+        Registrar<ConvImpl2D_cpu>::create(getBestMatch(getRequiredSpec()));
+    impl.backward(
+        op.strideDims(),
+        op.dilationDims(),
+        op.kernelDims(),
+        op.getInput(0)->template dims<4>(),
+        op.getOutput(0)->template dims<4>(),
+
+        getCPUPtr(op.getInput(0)),
+        getCPUPtr(op.getInput(1)),
+        getCPUPtr(outputGrad),
+        inputDataGrad.getImpl()->rawPtr(),
+        inputWeightGrad.getImpl()->rawPtr(),
+        op.getInput(2) ? inputBiasGrad.getImpl()->rawPtr() : nullptr);
 }
+
+} // namespace Aidge
diff --git a/unit_tests/operator/Test_ConvImpl.cpp b/unit_tests/operator/Test_ConvImpl.cpp
index f7be338c..69e806cb 100644
--- a/unit_tests/operator/Test_ConvImpl.cpp
+++ b/unit_tests/operator/Test_ConvImpl.cpp
@@ -1645,4 +1645,1000 @@ TEST_CASE("[cpu/operator] Conv(forward)", "[Conv][CPU]") {
             REQUIRE(approxEq<float>(*(conv_op.getOutput(0)),*expectedOutput, 1e-5f, 1e-6f));
         }
     }
-}
\ No newline at end of file
+}
+
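+// Test helper: builds a Conv<DIM> node ("myconv") with the given stride and
+// dilation, associates the input/weight/bias tensors, sets Float32 on the
+// CPU backend and runs forwardDims().
+// NOTE: batchSize, dataSize and padding are taken for call-site readability
+// but are not forwarded to the operator (every test below uses zero padding).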
+template <DimSize_t DIM>
+std::shared_ptr<OperatorTensor>
+setupTestConv(const DimSize_t batchSize,
+              const DimSize_t inChannels,
+              const DimSize_t outChannels,
+              const std::array<DimSize_t, DIM> kernelSize,
+              const std::array<DimSize_t, DIM> dataSize,
+              const std::array<DimSize_t, DIM> stride,
+              const std::array<DimSize_t, DIM> dilation,
+              const std::array<DimSize_t, 2 * DIM> padding,
+              const std::shared_ptr<Tensor> input,
+              const std::shared_ptr<Tensor> weights,
+              const std::shared_ptr<Tensor> biases) {
+    input->setBackend("cpu");
+    weights->setBackend("cpu");
+    biases->setBackend("cpu");
+    std::shared_ptr<Node> convNode =
+        Conv(inChannels, outChannels, kernelSize, "myconv", stride, dilation);
+    auto op =
+        std::static_pointer_cast<OperatorTensor>(convNode->getOperator());
+
+    op->setDataType(DataType::Float32);
+    op->setBackend("cpu");
+
+    op->associateInput(0, input);
+    op->associateInput(1, weights);
+    op->associateInput(2, biases);
+
+    REQUIRE_NOTHROW(op->forwardDims(true));
+
+    return op;
+}
+
+TEST_CASE("[cpu/operator] Conv(backward)", "[Conv][CPU]") {
+    SECTION("1D") {
+        const DimSize_t DIM = 1;
+        SECTION("no stride & no dilation, outChannels > inChannels") {
+
+            const DimSize_t batchSize = 1;
+            const DimSize_t inChannels = 2;
+            const DimSize_t outChannels = 3;
+            const DimSize_t kernelSize = 4;
+            const DimSize_t inDataSize = 12;
+
+            const DimSize_t stride = 1;
+            const DimSize_t dilation = 1;
+            const std::array<DimSize_t, 2 * DIM> padding({0, 0});
+
+            auto inputSize =
+                std::vector<DimSize_t>({batchSize, inChannels, inDataSize});
+
+            auto input = std::make_shared<Tensor>(
+                Array3D<float, batchSize, inChannels, inDataSize>(
+                    {{{{1.000000,
+                        1.000000,
+                        1.000000,
+                        1.000000,
+                        1.000000,
+                        1.000000,
+                        1.000000,
+                        1.000000,
+                        1.000000,
+                        1.000000,
+                        1.000000,
+                        1.000000},
+                       {1.000000,
+                        1.000000,
+                        1.000000,
+                        1.000000,
+                        1.000000,
+                        1.000000,
+                        1.000000,
+                        1.000000,
+                        1.000000,
+                        1.000000,
+                        1.000000,
+                        1.000000}}}}));
+
+            auto weights = std::make_shared<Tensor>(
+                Array3D<float, outChannels, inChannels, kernelSize>(
+                    {{{{0.100000, 0.100000, 0.100000, 0.100000},
+                       {0.100000, 0.100000, 0.100000, 0.100000}},
+                      {{0.100000, 0.100000, 0.100000, 0.100000},
+                       {0.100000, 0.100000, 0.100000, 0.100000}},
+                      {{0.100000, 0.100000, 0.100000, 0.100000},
+                       {0.100000, 0.100000, 0.100000, 0.100000}}}
+
+                    }));
+
+            auto biases = std::make_shared<Tensor>(
+                Array1D<float, outChannels>({0.010000, 0.010000, 0.010000}));
+
+            auto op = setupTestConv<DIM>(
+                batchSize,
+                inChannels,
+                outChannels,
+                std::array<DimSize_t, DIM>({kernelSize}),
+                std::array<DimSize_t, DIM>({inDataSize}),
+                std::array<DimSize_t, DIM>({stride}),
+                std::array<DimSize_t, DIM>({dilation}),
+                padding,
+                input,
+                weights,
+                biases);
+
+            ////////////////////////////////////
+            // setup gradients for backward
+            auto outputGrad =
+                std::make_shared<Tensor>(op->getOutput(0)->dims());
+            outputGrad->setDataType(DataType::Float32);
+            outputGrad->setBackend("cpu");
+            constantFiller(outputGrad, 1.f);
+            op->getOutput(0)->setGrad(outputGrad);
+
+            ////////////////////////////////////
+            // run the backward pass
+            REQUIRE_NOTHROW(op->backward());
+
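+            // With all weights at 0.1 and an all-ones outputGrad, each input
+            // element receives 0.1 * outChannels(3) * (number of kernel
+            // windows covering it): 1 window at the borders (0.3), up to 4
+            // windows in the bulk (1.2).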
+            SECTION("Input Grad") {
+                auto expectedInputGrad = std::make_shared<Tensor>(
+                    Array3D<float, batchSize, inChannels, inDataSize>(
+                        {{{{0.3000,
+                            0.6000,
+                            0.9000,
+                            1.2000,
+                            1.2000,
+                            1.2000,
+                            1.2000,
+                            1.2000,
+                            1.2000,
+                            0.9000,
+                            0.6000,
+                            0.3000},
+                           {0.3000,
+                            0.6000,
+                            0.9000,
+                            1.2000,
+                            1.2000,
+                            1.2000,
+                            1.2000,
+                            1.2000,
+                            1.2000,
+                            0.9000,
+                            0.6000,
+                            0.3000}}}}));
+                CHECK(approxEq<float, float>(*op->getInput(0)->grad(),
+                                             *expectedInputGrad));
+            }
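+            // outDataSize = 12 - (4 - 1) = 9, so every weight and bias
+            // gradient accumulates 9 contributions of 1 * 1 = 1, hence 9.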
+            SECTION("Weight grad") {
+                std::vector<DimSize_t> weightsSize(
+                    {outChannels, inChannels, kernelSize});
+                auto expectedWeightsGrad =
+                    std::make_shared<Tensor>(weightsSize);
+                expectedWeightsGrad->setBackend("cpu");
+                expectedWeightsGrad->setDataType(DataType::Float32);
+                constantFiller<float>(expectedWeightsGrad, 9.);
+
+                CHECK(approxEq<float, float>(*op->getInput(1)->grad(),
+                                             *expectedWeightsGrad));
+            }
+            SECTION("Bias Grad") {
+                std::vector<DimSize_t> biasesSize({outChannels});
+                auto expectedBiasGrad = std::make_shared<Tensor>(biasesSize);
+                expectedBiasGrad->setBackend("cpu");
+                expectedBiasGrad->setDataType(DataType::Float32);
+                constantFiller<float>(expectedBiasGrad, 9.);
+                CHECK(approxEq<float, float>(*op->getInput(2)->grad(),
+                                             *expectedBiasGrad));
+            }
+        }
+
+        SECTION("stride and no dilation, inChannel > outChannels") {
+            const DimSize_t batchSize = 2;
+            const DimSize_t inChannels = 3;
+            const DimSize_t outChannels = 1;
+            const DimSize_t kernelSize = 2;
+            const DimSize_t inDataSize = 8;
+            const DimSize_t stride = 3;
+            const DimSize_t dilation = 1;
+            const std::array<DimSize_t, 2 * DIM> padding({0, 0});
+
+            auto inputSize =
+                std::vector<DimSize_t>({batchSize, inChannels, inDataSize});
+
+            auto input = std::make_shared<Tensor>(
+                Array3D<float, batchSize, inChannels, inDataSize>(
+                    {{{{1., 1., 1., 1., 1., 1., 1., 1.},
+                       {1., 1., 1., 1., 1., 1., 1., 1.},
+                       {1., 1., 1., 1., 1., 1., 1., 1.}},
+
+                      {{1., 1., 1., 1., 1., 1., 1., 1.},
+                       {1., 1., 1., 1., 1., 1., 1., 1.},
+                       {1., 1., 1., 1., 1., 1., 1., 1.}}}}));
+            auto weights = std::make_shared<Tensor>(
+                Array3D<float, outChannels, inChannels, kernelSize>(
+                    {{{{0.1000, 0.1000},
+                       {0.1000, 0.1000},
+                       {0.1000, 0.1000}}}}));
+
+            auto biases = std::make_shared<Tensor>(
+                Array1D<float, outChannels>({0.060000}));
+
+            auto op = setupTestConv<DIM>(
+                batchSize,
+                inChannels,
+                outChannels,
+                std::array<DimSize_t, DIM>({kernelSize}),
+                std::array<DimSize_t, DIM>({inDataSize}),
+                std::array<DimSize_t, DIM>({stride}),
+                std::array<DimSize_t, DIM>({dilation}),
+                padding,
+                input,
+                weights,
+                biases);
+
+            ////////////////////////////////////
+            // setup gradients for backward
+            auto outputGrad =
+                std::make_shared<Tensor>(op->getOutput(0)->dims());
+            outputGrad->setDataType(DataType::Float32);
+            outputGrad->setBackend("cpu");
+            constantFiller(outputGrad, 1.f);
+            op->getOutput(0)->setGrad(outputGrad);
+
+            ////////////////////////////////////
+            // run the backward pass
+            REQUIRE_NOTHROW(op->backward());
+
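+            // Windows start at inputs 0, 3 and 6 (stride 3, kernel 2), so
+            // every third element is never covered (0.0) and covered ones get
+            // 0.1 * outChannels(1) = 0.1; each weight/bias grad accumulates
+            // batchSize(2) * outDataSize(3) = 6 ones.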
+            SECTION("Input Grad") {
+                auto expectedInputGrad = std::make_shared<Tensor>(
+                    Array3D<float, batchSize, inChannels, inDataSize>(
+                        {{{{0.1000,
+                            0.1000,
+                            0.0000,
+                            0.1000,
+                            0.1000,
+                            0.0000,
+                            0.1000,
+                            0.1000},
+                           {0.1000,
+                            0.1000,
+                            0.0000,
+                            0.1000,
+                            0.1000,
+                            0.0000,
+                            0.1000,
+                            0.1000},
+                           {0.1000,
+                            0.1000,
+                            0.0000,
+                            0.1000,
+                            0.1000,
+                            0.0000,
+                            0.1000,
+                            0.1000}},
+
+                          {{0.1000,
+                            0.1000,
+                            0.0000,
+                            0.1000,
+                            0.1000,
+                            0.0000,
+                            0.1000,
+                            0.1000},
+                           {0.1000,
+                            0.1000,
+                            0.0000,
+                            0.1000,
+                            0.1000,
+                            0.0000,
+                            0.1000,
+                            0.1000},
+                           {0.1000,
+                            0.1000,
+                            0.0000,
+                            0.1000,
+                            0.1000,
+                            0.0000,
+                            0.1000,
+                            0.1000}}}}));
+                CHECK(approxEq<float, float>(*op->getInput(0)->grad(),
+                                             *expectedInputGrad));
+            }
+            SECTION("Weight grad") {
+                auto expectedWeightsGrad = std::make_shared<Tensor>(
+                    Array3D<float, outChannels, inChannels, kernelSize>(
+                        {{{{6., 6.}, {6., 6.}, {6., 6.}}}}));
+                CHECK(approxEq<float, float>(*op->getInput(1)->grad(),
+                                             *expectedWeightsGrad));
+            }
+            SECTION("Bias Grad") {
+                auto expectedBiasesGrad = std::make_shared<Tensor>(
+                    Array1D<float, outChannels>({6.}));
+                CHECK(approxEq<float, float>(*op->getInput(2)->grad(),
+                                             *expectedBiasesGrad));
+            }
+        }
+
+        SECTION("dilation, no stride") {
+            const DimSize_t batchSize = 2;
+            const DimSize_t inChannels = 3;
+            const DimSize_t outChannels = 1;
+            const DimSize_t kernelSize = 2;
+            const DimSize_t inDataSize = 8;
+
+            const DimSize_t stride = 1;
+            const DimSize_t dilation = 2;
+            const std::array<DimSize_t, 2 * DIM> padding({0, 0});
+
+            auto inputSize =
+                std::vector<DimSize_t>({batchSize, inChannels, inDataSize});
+
+            auto input = std::make_shared<Tensor>(
+                Array3D<float, batchSize, inChannels, inDataSize>(
+                    {{{{1., 1., 1., 1., 1., 1., 1., 1.},
+                       {1., 1., 1., 1., 1., 1., 1., 1.},
+                       {1., 1., 1., 1., 1., 1., 1., 1.}},
+
+                      {{1., 1., 1., 1., 1., 1., 1., 1.},
+                       {1., 1., 1., 1., 1., 1., 1., 1.},
+                       {1., 1., 1., 1., 1., 1., 1., 1.}}}}));
+            auto weights = std::make_shared<Tensor>(
+                Array3D<float, outChannels, inChannels, kernelSize>(
+                    {{{{0.1000, 0.1000},
+                       {0.1000, 0.1000},
+                       {0.1000, 0.1000}}}}));
+
+            auto biases = std::make_shared<Tensor>(
+                Array1D<float, outChannels>({0.060000}));
+
+            auto op = setupTestConv<DIM>(
+                batchSize,
+                inChannels,
+                outChannels,
+                std::array<DimSize_t, DIM>({kernelSize}),
+                std::array<DimSize_t, DIM>({inDataSize}),
+                std::array<DimSize_t, DIM>({stride}),
+                std::array<DimSize_t, DIM>({dilation}),
+                padding,
+                input,
+                weights,
+                biases);
+
+            ////////////////////////////////////
+            // setup gradients for backward
+            auto outputGrad =
+                std::make_shared<Tensor>(op->getOutput(0)->dims());
+            outputGrad->setDataType(DataType::Float32);
+            outputGrad->setBackend("cpu");
+            constantFiller(outputGrad, 1.f);
+            op->getOutput(0)->setGrad(outputGrad);
+
+            ////////////////////////////////////
+            // run the backward pass
+            REQUIRE_NOTHROW(op->backward());
+
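+            // With dilation 2, each window taps inputs {s, s+2} for
+            // s = 0..5: border elements are covered once (0.1), inner ones
+            // twice (0.2); each weight/bias grad accumulates batchSize(2) *
+            // outDataSize(6) = 12 ones.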
+            SECTION("Input Grad") {
+                auto expectedInputGrad = std::make_shared<Tensor>(
+                    Array3D<float, batchSize, inChannels, inDataSize>(
+                        {{{{0.1000,
+                            0.1000,
+                            0.2000,
+                            0.2000,
+                            0.2000,
+                            0.2000,
+                            0.1000,
+                            0.1000},
+                           {0.1000,
+                            0.1000,
+                            0.2000,
+                            0.2000,
+                            0.2000,
+                            0.2000,
+                            0.1000,
+                            0.1000},
+                           {0.1000,
+                            0.1000,
+                            0.2000,
+                            0.2000,
+                            0.2000,
+                            0.2000,
+                            0.1000,
+                            0.1000}},
+
+                          {{0.1000,
+                            0.1000,
+                            0.2000,
+                            0.2000,
+                            0.2000,
+                            0.2000,
+                            0.1000,
+                            0.1000},
+                           {0.1000,
+                            0.1000,
+                            0.2000,
+                            0.2000,
+                            0.2000,
+                            0.2000,
+                            0.1000,
+                            0.1000},
+                           {0.1000,
+                            0.1000,
+                            0.2000,
+                            0.2000,
+                            0.2000,
+                            0.2000,
+                            0.1000,
+                            0.1000}}}}));
+                CHECK(approxEq<float, float>(*op->getInput(0)->grad(),
+                                             *expectedInputGrad));
+            }
+            SECTION("Weight grad") {
+                auto expectedWeightsGrad = std::make_shared<Tensor>(
+                    Array3D<float, outChannels, inChannels, kernelSize>(
+                        {{{{12., 12.}, {12., 12.}, {12., 12.}}}}));
+                CHECK(approxEq<float, float>(*op->getInput(1)->grad(),
+                                             *expectedWeightsGrad));
+            }
+            SECTION("Bias Grad") {
+                auto expectedBiasesGrad = std::make_shared<Tensor>(
+                    Array1D<float, outChannels>({12.}));
+                CHECK(approxEq<float, float>(*op->getInput(2)->grad(),
+                                             *expectedBiasesGrad));
+            }
+        }
+        SECTION("stride & dilation") {
+            const DimSize_t batchSize = 1;
+            const DimSize_t inChannels = 4;
+            const DimSize_t outChannels = 4;
+            const DimSize_t kernelSize = 3;
+            const DimSize_t inDataSize = 13;
+
+            const DimSize_t stride = 4;
+            const DimSize_t dilation = 3;
+            const std::array<DimSize_t, 2 * DIM> padding({0, 0});
+
+            auto inputSize =
+                std::vector<DimSize_t>({batchSize, inChannels, inDataSize});
+
+            auto input = std::make_shared<
+                Tensor>(Array3D<float, batchSize, inChannels, inDataSize>(
+                {{{{1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.},
+                   {1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.},
+                   {1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.},
+                   {1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.}}}}));
+            auto weights = std::make_shared<Tensor>(
+                Array3D<float, outChannels, inChannels, kernelSize>(
+                    {{{{0.1000, 0.1000, 0.1000},
+                       {0.1000, 0.1000, 0.1000},
+                       {0.1000, 0.1000, 0.1000},
+                       {0.1000, 0.1000, 0.1000}},
+
+                      {{0.1000, 0.1000, 0.1000},
+                       {0.1000, 0.1000, 0.1000},
+                       {0.1000, 0.1000, 0.1000},
+                       {0.1000, 0.1000, 0.1000}},
+
+                      {{0.1000, 0.1000, 0.1000},
+                       {0.1000, 0.1000, 0.1000},
+                       {0.1000, 0.1000, 0.1000},
+                       {0.1000, 0.1000, 0.1000}},
+
+                      {{0.1000, 0.1000, 0.1000},
+                       {0.1000, 0.1000, 0.1000},
+                       {0.1000, 0.1000, 0.1000},
+                       {0.1000, 0.1000, 0.1000}}}}));
+
+            auto biases = std::make_shared<Tensor>(Array1D<float, outChannels>(
+                {{0.0100, 0.0100, 0.0100, 0.0100}}));
+
+            auto op = setupTestConv<DIM>(
+                batchSize,
+                inChannels,
+                outChannels,
+                std::array<DimSize_t, DIM>({kernelSize}),
+                std::array<DimSize_t, DIM>({inDataSize}),
+                std::array<DimSize_t, DIM>({stride}),
+                std::array<DimSize_t, DIM>({dilation}),
+                padding,
+                input,
+                weights,
+                biases);
+
+            ////////////////////////////////////
+            // setup gradients for backward
+            auto outputGrad =
+                std::make_shared<Tensor>(op->getOutput(0)->dims());
+            outputGrad->setDataType(DataType::Float32);
+            outputGrad->setBackend("cpu");
+            constantFiller(outputGrad, 1.f);
+            op->getOutput(0)->setGrad(outputGrad);
+
+            ////////////////////////////////////
+            // run the backward pass
+            REQUIRE_NOTHROW(op->backward());
+
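+            // outDataSize = (13 - 3*(3-1) - 1)/4 + 1 = 2; the two windows tap
+            // inputs {0, 3, 6} and {4, 7, 10}, each receiving
+            // 0.1 * outChannels(4) = 0.4; weight/bias grads accumulate 2 ones.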
+            SECTION("Input Grad") {
+                auto expectedInputGrad = std::make_shared<Tensor>(
+                    Array3D<float, batchSize, inChannels, inDataSize>(
+                        {{{{0.4000,
+                            0.0000,
+                            0.0000,
+                            0.4000,
+                            0.4000,
+                            0.0000,
+                            0.4000,
+                            0.4000,
+                            0.0000,
+                            0.0000,
+                            0.4000,
+                            0.0000,
+                            0.0000},
+                           {0.4000,
+                            0.0000,
+                            0.0000,
+                            0.4000,
+                            0.4000,
+                            0.0000,
+                            0.4000,
+                            0.4000,
+                            0.0000,
+                            0.0000,
+                            0.4000,
+                            0.0000,
+                            0.0000},
+                           {0.4000,
+                            0.0000,
+                            0.0000,
+                            0.4000,
+                            0.4000,
+                            0.0000,
+                            0.4000,
+                            0.4000,
+                            0.0000,
+                            0.0000,
+                            0.4000,
+                            0.0000,
+                            0.0000},
+                           {0.4000,
+                            0.0000,
+                            0.0000,
+                            0.4000,
+                            0.4000,
+                            0.0000,
+                            0.4000,
+                            0.4000,
+                            0.0000,
+                            0.0000,
+                            0.4000,
+                            0.0000,
+                            0.0000}}}}));
+                CHECK(approxEq<float, float>(*op->getInput(0)->grad(),
+                                             *expectedInputGrad));
+            }
+            SECTION("Weight grad") {
+                auto expectedWeightsGrad = std::make_shared<Tensor>(
+                    Array3D<float, outChannels, inChannels, kernelSize>(
+                        {{{{2., 2., 2.},
+                           {2., 2., 2.},
+                           {2., 2., 2.},
+                           {2., 2., 2.}},
+
+                          {{2., 2., 2.},
+                           {2., 2., 2.},
+                           {2., 2., 2.},
+                           {2., 2., 2.}},
+
+                          {{2., 2., 2.},
+                           {2., 2., 2.},
+                           {2., 2., 2.},
+                           {2., 2., 2.}},
+
+                          {{2., 2., 2.},
+                           {2., 2., 2.},
+                           {2., 2., 2.},
+                           {2., 2., 2.}}}}));
+                CHECK(approxEq<float, float>(*op->getInput(1)->grad(),
+                                             *expectedWeightsGrad));
+            }
+            SECTION("Bias Grad") {
+                auto expectedBiasesGrad = std::make_shared<Tensor>(
+                    Array1D<float, outChannels>({{2., 2., 2., 2.}}));
+                CHECK(approxEq<float, float>(*op->getInput(2)->grad(),
+                                             *expectedBiasesGrad));
+            }
+        }
+
+        // Denser values; refer to the previous tests if something is unclear.
+        SECTION("Sequential values") {
+            const DimSize_t batchSize = 1;
+            const DimSize_t inChannels = 2;
+            const DimSize_t outChannels = 2;
+            const DimSize_t kernelSize = 3;
+            const DimSize_t inDataSize = 8;
+
+            const DimSize_t stride = 2;
+            const DimSize_t dilation = 2;
+            const std::array<DimSize_t, 2 * DIM> padding({0, 0});
+
+            const DimSize_t outDataSize = 2;
+
+            auto inputSize =
+                std::vector<DimSize_t>({batchSize, inChannels, inDataSize});
+
+            auto input = std::make_shared<Tensor>(
+                Array3D<float, batchSize, inChannels, inDataSize>(
+                    {{{{1., 2., 3., 4., 5., 6., 7., 8.},
+                       {9., 10., 11., 12., 13., 14., 15., 16.}}}}));
+            auto weights = std::make_shared<Tensor>(
+                Array3D<float, outChannels, inChannels, kernelSize>(
+                    {{{{0.1000, 0.2000, 0.3000}, {0.4000, 0.5000, 0.6000}},
+
+                      {{0.7000, 0.8000, 0.9000}, {1.0000, 1.1000, 1.2000}}}}));
+
+            auto biases = std::make_shared<Tensor>(
+                Array1D<float, outChannels>({{0.0100, 0.0200}}));
+
+            auto outputGrad = std::make_shared<Tensor>(
+                Array3D<float, batchSize, outChannels, outDataSize>(
+                    {{{{1., 2.}, {3., 4.}}}}));
+
+            auto op = setupTestConv<DIM>(
+                batchSize,
+                inChannels,
+                outChannels,
+                std::array<DimSize_t, DIM>({kernelSize}),
+                std::array<DimSize_t, DIM>({inDataSize}),
+                std::array<DimSize_t, DIM>({stride}),
+                std::array<DimSize_t, DIM>({dilation}),
+                padding,
+                input,
+                weights,
+                biases);
+
+            ////////////////////////////////////
+            // setup gradients for backward
+            op->getOutput(0)->setGrad(outputGrad);
+
+            REQUIRE_NOTHROW(op->backward());
+
+            SECTION("Input Grad") {
+                auto expectedInputGrad = std::make_shared<Tensor>(
+                    Array3D<float, batchSize, inChannels, inDataSize>(
+                        {{{{2.2000,
+                            0.0000,
+                            5.6000,
+                            0.0000,
+                            6.6000,
+                            0.0000,
+                            4.2000,
+                            0.0000},
+                           {3.4000,
+                            0.0000,
+                            8.6000,
+                            0.0000,
+                            9.6000,
+                            0.0000,
+                            6.0000,
+                            0.0000}}}}));
+                CHECK(approxEq<float, float>(*op->getInput(0)->grad(),
+                                             *expectedInputGrad));
+            }
+            SECTION("Weight grad") {
+                auto expectedWeightsGrad = std::make_shared<Tensor>(
+                    Array3D<float, outChannels, inChannels, kernelSize>(
+                        {{{{7., 13., 19.}, {31., 37., 43.}},
+
+                          {{15., 29., 43.}, {71., 85., 99.}}}}));
+                CHECK(approxEq<float, float>(*op->getInput(1)->grad(),
+                                             *expectedWeightsGrad));
+            }
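+            // The bias gradient is outputGrad summed over batch and spatial
+            // dims: channel 0: 1 + 2 = 3, channel 1: 3 + 4 = 7.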
+            SECTION("Bias Grad") {
+                auto expectedBiasesGrad = std::make_shared<Tensor>(
+                    Array1D<float, outChannels>({{3., 7.}}));
+                CHECK(approxEq<float, float>(*op->getInput(2)->grad(),
+                                             *expectedBiasesGrad));
+            }
+        }
+        SECTION("random values testing") {
+            const DimSize_t batchSize = 1;
+            const DimSize_t inChannels = 4;
+            const DimSize_t outChannels = 4;
+            const DimSize_t kernelSize = 3;
+            const DimSize_t inDataSize = 13;
+            const DimSize_t outDataSize = 2;
+
+            const DimSize_t stride = 4;
+            const DimSize_t dilation = 3;
+            const std::array<DimSize_t, 2 * DIM> padding({0, 0});
+
+            auto inputSize =
+                std::vector<DimSize_t>({batchSize, inChannels, inDataSize});
+
+            auto input = std::make_shared<Tensor>(
+                Array3D<float, batchSize, inChannels, inDataSize>(
+                    {{{{0.180772,
+                        -0.069988,
+                        -0.359623,
+                        -0.915204,
+                        0.625765,
+                        0.025510,
+                        0.954514,
+                        0.064349,
+                        0.361151,
+                        1.167878,
+                        -1.349893,
+                        -0.510177,
+                        0.235958},
+                       {-0.239778,
+                        -0.921115,
+                        1.543297,
+                        1.348826,
+                        -0.139642,
+                        0.285797,
+                        0.965120,
+                        -2.037150,
+                        0.493136,
+                        1.486999,
+                        0.591033,
+                        0.126030,
+                        -1.562687},
+                       {-1.160103,
+                        -0.334841,
+                        0.447772,
+                        -0.801645,
+                        1.523611,
+                        2.508587,
+                        -0.663096,
+                        -0.251275,
+                        1.010145,
+                        0.121547,
+                        -1.510835,
+                        2.104773,
+                        2.762959},
+                       {-1.746529,
+                        0.410919,
+                        -0.242185,
+                        0.420812,
+                        0.277596,
+                        0.778898,
+                        1.533269,
+                        1.609736,
+                        -0.403228,
+                        -0.274928,
+                        1.473840,
+                        0.068826,
+                        1.332708}}}}));
+            auto weights = std::make_shared<Tensor>(
+                Array3D<float, outChannels, inChannels, kernelSize>(
+                    {{{{0.587285, 0.286069, 0.008287},
+                       {-0.252325, -1.324722, 0.189178},
+                       {0.021100, 0.940420, -0.557690},
+                       {-0.693927, -0.325247, 1.243933}},
+
+                      {{-1.167186, -0.409124, 1.260062},
+                       {-1.563006, 1.134614, -0.082384},
+                       {0.289316, 0.835773, -0.244991},
+                       {0.271223, 0.093636, -0.883432}},
+
+                      {{-0.327417, 0.078394, -0.380766},
+                       {0.377508, 0.111912, 2.314279},
+                       {-0.798906, -0.564303, -1.134660},
+                       {0.170527, 0.994665, 1.262572}},
+
+                      {{1.621816, 1.077471, 0.594781},
+                       {-1.529087, 2.043707, -0.165627},
+                       {0.087070, -0.527656, -0.100288},
+                       {1.053922, -0.623074, -1.590572}}}}));
+
+            auto biases = std::make_shared<Tensor>(Array1D<float, outChannels>(
+                {{1.285940, -0.051787, -0.968103, -0.586324}}));
+
+            auto op = setupTestConv<DIM>(
+                batchSize,
+                inChannels,
+                outChannels,
+                std::array<DimSize_t, DIM>({kernelSize}),
+                std::array<DimSize_t, DIM>({inDataSize}),
+                std::array<DimSize_t, DIM>({stride}),
+                std::array<DimSize_t, DIM>({dilation}),
+                padding,
+                input,
+                weights,
+                biases);
+
+            ////////////////////////////////////
+            // setup gradients for backward
+            auto outputGrad = std::make_shared<Tensor>(
+                Array3D<float, batchSize, outChannels, outDataSize>(
+                    {{{{0.053156, 1.189073},
+                       {0.100228, 1.042344},
+                       {-1.468991, 0.581337},
+                       {1.330418, 0.487802}}}}));
+            op->getOutput(0)->setGrad(outputGrad);
+
+            ////////////////////////////////////
+            // run the backward pass
+            REQUIRE_NOTHROW(op->backward());
+
+            SECTION("Input Grad") {
+                auto expectedInputGrad = std::make_shared<Tensor>(
+                    Array3D<float, batchSize, inChannels, inDataSize>(
+                        {{{{2.552898,
+                            0.000000,
+                            0.000000,
+                            1.292528,
+                            0.082501,
+                            0.000000,
+                            1.477383,
+                            0.484875,
+                            0.000000,
+                            0.000000,
+                            1.392054,
+                            0.000000,
+                            0.000000},
+                           {-2.758950,
+                            0.000000,
+                            0.000000,
+                            2.597889,
+                            -2.455656,
+                            0.000000,
+                            -3.618210,
+                            0.669449,
+                            0.000000,
+                            0.000000,
+                            1.403657,
+                            0.000000,
+                            0.000000},
+                           {1.319545,
+                            0.000000,
+                            0.000000,
+                            0.260710,
+                            -0.095303,
+                            0.000000,
+                            1.479181,
+                            1.403949,
+                            0.000000,
+                            0.000000,
+                            -1.627040,
+                            0.000000,
+                            0.000000},
+                           {1.141951,
+                            0.000000,
+                            0.000000,
+                            -2.298007,
+                            0.070817,
+                            0.000000,
+                            -3.993255,
+                            -0.014843,
+                            0.000000,
+                            0.000000,
+                            0.516383,
+                            0.000000,
+                            0.000000}}}}));
+                CHECK(approxEq<float, float>(*op->getInput(0)->grad(),
+                                             *expectedInputGrad,
+                                             1e-5,
+                                             1e-6));
+            }
+            SECTION("Weight grad") {
+                auto expectedWeightsGrad = std::make_shared<Tensor>(
+                    Array3D<float, outChannels, inChannels, kernelSize>(
+                        {{{{0.753690, 0.027866, -1.554383},
+                           {-0.178790, -2.350622, 0.754084},
+                           {1.750019, -0.341397, -1.831741},
+                           {0.237243, 1.936463, 1.834007}},
+
+                          {{0.670381, -0.024656, -1.311384},
+                           {-0.169587, -1.988220, 0.712792},
+                           {1.471852, -0.342263, -1.641270},
+                           {0.114300, 1.720076, 1.689925}},
+
+                          {{0.098228, 1.381835, -2.186914},
+                           {0.271054, -3.165683, -1.074165},
+                           {2.589912, 1.031534, 0.095779},
+                           {2.727013, 0.317630, -1.395561}},
+
+                          {{0.545751, -1.186215, 0.611421},
+                           {-0.387123, 0.800776, 1.572321},
+                           {-0.800201, -1.189095, -1.619183},
+                           {-2.188202, 1.345088, 2.758830}}}
+
+                        }));
+                CHECK(approxEq<float, float>(*op->getInput(1)->grad(),
+                                             *expectedWeightsGrad,
+                                             1e-5,
+                                             1e-6));
+            }
+            SECTION("Bias Grad") {
+                auto expectedBiasesGrad =
+                    std::make_shared<Tensor>(Array1D<float, outChannels>(
+                        {{1.242230, 1.142572, -0.887655, 1.818220}}));
+                CHECK(approxEq<float, float>(*op->getInput(2)->grad(),
+                                             *expectedBiasesGrad));
+            }
+        }
+    }
+    SECTION("2D") {
+        const DimSize_t DIM = 2;
+        SECTION("Sequential values") {
+            constexpr DimSize_t batchSize = 1;
+            constexpr DimSize_t inChannels = 1;
+            constexpr DimSize_t outChannels = 2;
+            constexpr std::array<DimSize_t, DIM> kernelSize = {1, 2};
+            constexpr std::array<DimSize_t, DIM> inDataSize = {3, 4};
+
+            constexpr std::array<DimSize_t, DIM> stride = {1, 2};
+            constexpr std::array<DimSize_t, DIM> dilation = {1, 2};
+            constexpr std::array<DimSize_t, 2 * DIM> padding({0, 0});
+
+            constexpr std::array<DimSize_t, DIM> outDataSize = {3, 1};
+
+            auto inputSize = std::vector<DimSize_t>(
+                {batchSize, inChannels, inDataSize[0], inDataSize[1]});
+
+            auto input = std::make_shared<Tensor>(
+                Array4D<float,
+                        batchSize,
+                        inChannels,
+                        inDataSize[0],
+                        inDataSize[1]>({{{{{1., 2., 3., 4.},
+                                           {5., 6., 7., 8.},
+                                           {9., 10., 11., 12.}}}}}));
+            auto weights = std::make_shared<Tensor>(
+                Array4D<float,
+                        outChannels,
+                        inChannels,
+                        kernelSize[0],
+                        kernelSize[1]>({{{{{1., 2.}}}, {{{3., 4.}}}}}));
+
+            auto biases = std::make_shared<Tensor>(
+                Array1D<float, outChannels>({{1., 2.}}));
+
+            auto outputGrad = std::make_shared<Tensor>(Array4D<float,
+                                                               batchSize,
+                                                               outChannels,
+                                                               outDataSize[0],
+                                                               outDataSize[1]>(
+                {{{{{1.}, {2.}, {3.}}, {{4.}, {5.}, {6.}}}}}));
+
+            auto op = setupTestConv<DIM>(batchSize,
+                                         inChannels,
+                                         outChannels,
+                                         kernelSize,
+                                         inDataSize,
+                                         stride,
+                                         dilation,
+                                         padding,
+                                         input,
+                                         weights,
+                                         biases);
+
+            ////////////////////////////////////
+            // setup gradients for backward
+            op->getOutput(0)->setGrad(outputGrad);
+
+            REQUIRE_NOTHROW(op->backward());
+
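+            // The input gradient scatters outputGrad through the weights:
+            // only input columns 0 and 2 are reachable, e.g.
+            // iGrad[0][0] = 1*1 + 4*3 = 13 and iGrad[0][2] = 1*2 + 4*4 = 18.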
+            SECTION("Input Grad") {
+                auto expectedInputGrad = std::make_shared<Tensor>(
+                    Array4D<float,
+                            batchSize,
+                            inChannels,
+                            inDataSize[0],
+                            inDataSize[1]>({{{{{13., 0., 18., 0.},
+                                               {17., 0., 24., 0.},
+                                               {21., 0., 30., 0.}}}}}));
+                CHECK(approxEq<float, float>(*op->getInput(0)->grad(),
+                                             *expectedInputGrad));
+            }
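+            // Each output column sees input columns 0 and 2 only (kernel 1x2,
+            // dilation {1,2}), e.g. wGrad[0][0][0][0] = 1*1 + 2*5 + 3*9 = 38.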
+            SECTION("Weight grad") {
+                auto expectedWeightsGrad =
+                    std::make_shared<Tensor>(Array4D<float,
+                                                     outChannels,
+                                                     inChannels,
+                                                     kernelSize[0],
+                                                     kernelSize[1]>(
+                        {{{{{38., 50.}}}, {{{83., 113.}}}}}));
+                CHECK(approxEq<float, float>(*op->getInput(1)->grad(),
+                                             *expectedWeightsGrad));
+            }
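+            // Bias gradient: channel 0: 1 + 2 + 3 = 6,
+            // channel 1: 4 + 5 + 6 = 15.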
+            SECTION("Bias Grad") {
+                auto expectedBiasesGrad = std::make_shared<Tensor>(
+                    Array1D<float, outChannels>({{6., 15.}}));
+                CHECK(approxEq<float, float>(*op->getInput(2)->grad(),
+                                             *expectedBiasesGrad));
+            }
+        }
+    }
+}
-- 
GitLab


From c368906dc8093f1b2ddea8df07dd36f5caaebb8e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gr=C3=A9goire=20KUBLER?= <gregoire.kubler@proton.me>
Date: Mon, 27 Jan 2025 16:14:20 +0100
Subject: [PATCH 051/108] feat : [ADD] convtranspose forward 1D & 2D

---
 include/aidge/backend/cpu.hpp                 |    1 +
 .../cpu/operator/ConvTransposeImpl.hpp        |   59 +
 .../operator/ConvTransposeImpl_kernels.hpp    |  305 +++
 src/operator/ConvTransposeImpl.cpp            |   91 +
 unit_tests/operator/Test_ConvTranspose.cpp    | 2298 +++++++++++++++++
 5 files changed, 2754 insertions(+)
 create mode 100644 include/aidge/backend/cpu/operator/ConvTransposeImpl.hpp
 create mode 100644 include/aidge/backend/cpu/operator/ConvTransposeImpl_kernels.hpp
 create mode 100644 src/operator/ConvTransposeImpl.cpp
 create mode 100644 unit_tests/operator/Test_ConvTranspose.cpp

diff --git a/include/aidge/backend/cpu.hpp b/include/aidge/backend/cpu.hpp
index 80574b4a..5c1f9b11 100644
--- a/include/aidge/backend/cpu.hpp
+++ b/include/aidge/backend/cpu.hpp
@@ -27,6 +27,7 @@
 #include "aidge/backend/cpu/operator/ClipImpl.hpp"
 #include "aidge/backend/cpu/operator/ConvDepthWiseImpl.hpp"
 #include "aidge/backend/cpu/operator/ConvImpl.hpp"
+#include "aidge/backend/cpu/operator/ConvTransposeImpl.hpp"
 #include "aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp"
 #include "aidge/backend/cpu/operator/CryptoHashImpl.hpp"
 #include "aidge/backend/cpu/operator/DivImpl.hpp"
diff --git a/include/aidge/backend/cpu/operator/ConvTransposeImpl.hpp b/include/aidge/backend/cpu/operator/ConvTransposeImpl.hpp
new file mode 100644
index 00000000..7604a96a
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/ConvTransposeImpl.hpp
@@ -0,0 +1,59 @@
+
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_CONVTRANSPOSEIMPL_H_
+#define AIDGE_CPU_OPERATOR_CONVTRANSPOSEIMPL_H_
+
+#include <array>
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/ConvTranspose.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+namespace Aidge {
+
+using std::array;
+
+// Operator implementation entry point for the backend
+using ConvTranspose1D_Op = ConvTranspose_Op<1>;
+using ConvTransposeImpl1D_cpu =
+    OperatorImpl_cpu<ConvTranspose1D_Op,
+                     void(const array<DimSize_t, 1> &,
+                          const array<DimSize_t, 1> &,
+                          const array<DimSize_t, 1> &,
+                          const array<DimSize_t, 3> &,
+                          const array<DimSize_t, 3> &,
+                          const void *,
+                          const void *,
+                          const void *,
+                          void *)>;
+
+using ConvTranspose2D_Op = ConvTranspose_Op<2>;
+using ConvTransposeImpl2D_cpu =
+    OperatorImpl_cpu<ConvTranspose2D_Op,
+                     void(const array<DimSize_t, 2> &,
+                          const array<DimSize_t, 2> &,
+                          const array<DimSize_t, 2> &,
+                          const array<DimSize_t, 4> &,
+                          const array<DimSize_t, 4> &,
+                          const void *,
+                          const void *,
+                          const void *,
+                          void *)>;
+
+// Implementation entry point registration to Operator
+REGISTRAR(ConvTranspose1D_Op, "cpu", ConvTransposeImpl1D_cpu::create);
+REGISTRAR(ConvTranspose2D_Op, "cpu", ConvTransposeImpl2D_cpu::create);
+} // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_CONVTRANSPOSEIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/ConvTransposeImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConvTransposeImpl_kernels.hpp
new file mode 100644
index 00000000..e11dd262
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/ConvTransposeImpl_kernels.hpp
@@ -0,0 +1,305 @@
+/********************************************************************************
+ * Copyright (c) 2025 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_CONVTRANSPOSEIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_CONVTRANSPOSEIMPL_KERNELS_H_
+
+#include <array>
+
+#include "aidge/backend/cpu/operator/ConvTransposeImpl.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include <aidge/backend/cpu/operator/ConvImpl_kernels.hpp>
+#include <aidge/data/Data.hpp>
+#include <aidge/data/half.hpp>
+#include <aidge/scheduler/ProdConso.hpp>
+#include <aidge/utils/Types.h>
+
+namespace Aidge {
+
+using std::array;
+
+////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////
+// 1D
+////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////
+
+/**
+ * @brief Performs the forward bias operation of the ConvTranspose operator.
+ *
+ * @tparam B Bias data type.
+ * @tparam O Output data type.
+ * @param[in] biases bias values
+ * @param[in] oDims dimensions of the output
+ * @param[in] oStrides number of elements contained per dimension of the output
+ * @param[out] output output values
+ */
+template <class B, class O>
+static void convTranspose1DForwardBias(const B *biases,
+                                       const array<DimSize_t, 3> &oDims,
+                                       const array<DimSize_t, 2> &oStrides,
+                                       O *output) {
+    array<DimSize_t, 2> outOffsets{0, 0};
+    for (DimSize_t batch = 0; batch < oDims[0]; ++batch) {
+        outOffsets[0] = batch * oStrides[0];
+        for (DimSize_t outCh = 0; outCh < oDims[1]; ++outCh) {
+            outOffsets[1] = outCh * oStrides[1] + outOffsets[0];
+            // Use B(0) when no bias is provided
+            B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
+            std::fill(output + outOffsets[1],
+                      output + (outOffsets[1] + oDims[2]),
+                      biasVal);
+        }
+    }
+}
+
+/**
+ * @brief forward kernel for ConvTranspose
+ * @note The ConvTranspose forward pass is simply the convolution backward
+ * (input-gradient) kernel. Check the convolution functions for more
+ * in-depth details on how the subfunctions are built.
+ * @tparam I Input data type.
+ * @tparam W Weight data type.
+ * @tparam B Bias data type.
+ * @tparam O Output data type.
+ * @param[in] stride stride parameter of the ConvTranspose operator
+ * @param[in] dilation dilation parameter of the ConvTranspose operator
+ * @param[in] kernelDim dimensions of the kernel
+ * @param[in] inputDims input tensor dimensions
+ * @param[in] outputDims output tensor dimensions
+ * @param[in] input_ input values
+ * @param[in] weights_ weight values
+ * @param[in] biases_ bias values (may be nullptr)
+ * @param[out] output_ output values
+ */
+template <class I, class W, class B, class O>
+void ConvTransposeImpl1D_cpu_forward_kernel(
+    const array<DimSize_t, 1> &stride,
+    const array<DimSize_t, 1> &dilation,
+    const array<DimSize_t, 1> &kernelDim,
+    const array<DimSize_t, 3> &inputDims,
+    const array<DimSize_t, 3> &outputDims,
+    const void *input_,
+    const void *weights_,
+    const void *biases_,
+    void *output_) {
+
+    const I *input = static_cast<const I *>(input_);
+    const W *weights = static_cast<const W *>(weights_);
+    O *output = static_cast<O *>(output_);
+
+    // {batch_stride, channel_stride}
+    const array<DimSize_t, 2> inputStrides{inputDims[1] * inputDims[2],
+                                           inputDims[2]};
+
+    // {batch_stride, channel_stride}
+    const array<DimSize_t, 2> outputStrides{outputDims[1] * outputDims[2],
+                                            outputDims[2]};
+
+    // NOTE: kernel dims = {inChannels, outChannels, kernelDims[0]}
+    const array<DimSize_t, 2> kernelStrides{
+        outputDims[1] * kernelDim[0],
+        kernelDim[0],
+    };
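+    // With this layout, the weight linking input channel ic to output channel
+    // oc at kernel position k sits at
+    // weights[ic * kernelStrides[0] + oc * kernelStrides[1] + k].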
+
+    // Initialize the output with the bias, or with zeros when biases_ is
+    // null; conv1DBackwardInput() then accumulates on top of it.
+    convTranspose1DForwardBias(static_cast<const B *>(biases_),
+                               outputDims,
+                               outputStrides,
+                               output);
+
+    conv1DBackwardInput(stride,
+                        dilation,
+                        kernelDim,
+                        kernelStrides,
+                        weights,
+                        inputDims,
+                        inputStrides,
+                        input,
+                        outputDims,
+                        outputStrides,
+                        output);
+}
+
+REGISTRAR(ConvTransposeImpl1D_cpu,
+          {{DataType::Any, DataFormat::NCHW},
+           {DataType::Int32, DataFormat::NCHW}},
+          {ProdConso::inPlaceModel,
+           ConvTransposeImpl1D_cpu_forward_kernel<std::int32_t,
+                                                  std::int32_t,
+                                                  std::int32_t,
+                                                  std::int32_t>,
+           nullptr});
+REGISTRAR(ConvTransposeImpl1D_cpu,
+          {{DataType::Any, DataFormat::NCHW},
+           {DataType::Float32, DataFormat::NCHW}},
+          {ProdConso::inPlaceModel,
+           ConvTransposeImpl1D_cpu_forward_kernel<float, float, float, float>,
+           nullptr});
+REGISTRAR(ConvTransposeImpl1D_cpu,
+          {{DataType::Any, DataFormat::NCHW},
+           {DataType::Float16, DataFormat::NCHW}},
+          {ProdConso::inPlaceModel,
+           ConvTransposeImpl1D_cpu_forward_kernel<half_float::half,
+                                                  half_float::half,
+                                                  half_float::half,
+                                                  half_float::half>,
+           nullptr});
+REGISTRAR(
+    ConvTransposeImpl1D_cpu,
+    {{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}},
+    {ProdConso::inPlaceModel,
+     ConvTransposeImpl1D_cpu_forward_kernel<double, double, double, double>,
+     nullptr});
+
+////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////
+// 2D
+////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////
+
+/**
+ * @brief performs the forward bias operation for the convtranspose operator
+ *
+ * @tparam B Bias data type.
+ * @tparam O Output data type.
+ * @param[in] biases bias values (a null pointer is treated as a zero bias)
+ * @param[in] oDims dimensions of the output
+ * @param[in] oStrides number of elements contained per dimension of the output
+ * @param[out] output output values
+ */
+template <class B, class O>
+static void convTranspose2DForwardBias(const B *biases,
+                                       const array<DimSize_t, 4> &oDims,
+                                       const array<DimSize_t, 3> &oStrides,
+                                       O *output) {
+    array<DimSize_t, 2> outOffsets{0, 0};
+
+    for (DimSize_t batch = 0; batch < oDims[0]; ++batch) {
+        outOffsets[0] = batch * oStrides[0];
+
+        for (DimSize_t outCh = 0; outCh < oDims[1]; ++outCh) {
+            outOffsets[1] = outCh * oStrides[1] + outOffsets[0];
+            // If biases is nullptr, use a zero bias
+            B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
+            std::fill(output + outOffsets[1],
+                      (output + outOffsets[1]) + oStrides[1],
+                      biasVal);
+        }
+    }
+}
+
+/**
+ * @brief forward kernel for convtranspose
+ * @note ConvTranspose forward is simply the convolution backward kernel.
+ * Check the convolution functions for more in-depth details on how the
+ * subfunctions are built.
+ * @tparam I Input data type.
+ * @tparam W Weight data type.
+ * @tparam B Bias data type.
+ * @tparam O Output data type.
+ * @param[in] stride stride parameter of the convTranspose operator
+ * @param[in] dilation dilation parameter of the convTranspose operator
+ * @param[in] kernelDims dimensions of the kernel
+ * @param[in] inputDims input tensor dimensions
+ * @param[in] outputDims output tensor dimensions
+ * @param[in] input_ input values
+ * @param[in] weights_ weight values
+ * @param[in] biases_ bias values (may be nullptr)
+ * @param[out] output_ output values
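+ *
+ * A minimal usage sketch (illustrative dimensions; the float specialization
+ * is assumed, and input/weights/biases/output stand for raw host pointers):
+ * @code
+ * // Per spatial dim: out = (in - 1) * stride + dilation * (kernel - 1) + 1
+ * constexpr std::array<DimSize_t, 2> stride{1, 1}, dilation{1, 1};
+ * constexpr std::array<DimSize_t, 2> kernel{1, 2};
+ * constexpr std::array<DimSize_t, 4> inDims{1, 1, 2, 3};  // {N, C_in, H, W}
+ * constexpr std::array<DimSize_t, 4> outDims{1, 2, 2, 4}; // {N, C_out, H_out, W_out}
+ * ConvTransposeImpl2D_cpu_forward_kernel<float, float, float, float>(
+ *     stride, dilation, kernel, inDims, outDims,
+ *     input, weights, biases, output);
+ * @endcode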
+ */
+template <class I, class W, class B, class O>
+void ConvTransposeImpl2D_cpu_forward_kernel(
+    const array<DimSize_t, 2> &stride,
+    const array<DimSize_t, 2> &dilation,
+    const array<DimSize_t, 2> &kernelDims,
+    const array<DimSize_t, 4> &inputDims,
+    const array<DimSize_t, 4> &outputDims,
+    const void *input_,
+    const void *weights_,
+    const void *biases_,
+    void *output_) {
+
+    auto input = static_cast<const I *>(input_);
+    auto weights = static_cast<const W *>(weights_);
+    auto output = static_cast<O *>(output_);
+
+    // {batch_stride, channel_stride, dim0_stride}
+    const array<DimSize_t, 3> inputStrides{
+        inputDims[1] * inputDims[2] * inputDims[3],
+        inputDims[2] * inputDims[3],
+        inputDims[3]};
+
+    // {batch_stride, channel_stride, dim0_stride}
+    const array<DimSize_t, 3> outputStrides{
+        outputDims[1] * outputDims[2] * outputDims[3],
+        outputDims[2] * outputDims[3],
+        outputDims[3]};
+
+    // NOTE: kernel dims = {inChannels, outChannels, kernelDims[0],
+    // kernelDims[1]}
+    const array<DimSize_t, 3> kernelStrides{
+        outputDims[1] * kernelDims[0] * kernelDims[1],
+        kernelDims[0] * kernelDims[1],
+        kernelDims[1],
+    };
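+    // With this layout, the weight linking input channel ic to output channel
+    // oc at kernel position (kh, kw) sits at
+    // weights[ic * kernelStrides[0] + oc * kernelStrides[1]
+    //         + kh * kernelStrides[2] + kw].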
+
+    // Initialize the output with the bias, or with zeros when biases_ is
+    // null; conv2DBackwardInput() then accumulates on top of it.
+    convTranspose2DForwardBias(static_cast<const B *>(biases_),
+                               outputDims,
+                               outputStrides,
+                               output);
+
+    conv2DBackwardInput(stride,
+                        dilation,
+                        kernelDims,
+                        kernelStrides,
+                        weights,
+                        inputDims,
+                        inputStrides,
+                        input,
+                        outputDims,
+                        outputStrides,
+                        output);
+}
+
+REGISTRAR(ConvTransposeImpl2D_cpu,
+          {{DataType::Any, DataFormat::NCHW},
+           {DataType::Int32, DataFormat::NCHW}},
+          {ProdConso::inPlaceModel,
+           ConvTransposeImpl2D_cpu_forward_kernel<std::int32_t,
+                                                  std::int32_t,
+                                                  std::int32_t,
+                                                  std::int32_t>,
+           nullptr});
+REGISTRAR(ConvTransposeImpl2D_cpu,
+          {{DataType::Any, DataFormat::NCHW},
+           {DataType::Float16, DataFormat::NCHW}},
+          {ProdConso::inPlaceModel,
+           ConvTransposeImpl2D_cpu_forward_kernel<half_float::half,
+                                                  half_float::half,
+                                                  half_float::half,
+                                                  half_float::half>,
+           nullptr});
+REGISTRAR(ConvTransposeImpl2D_cpu,
+          {{DataType::Any, DataFormat::NCHW},
+           {DataType::Float32, DataFormat::NCHW}},
+          {ProdConso::inPlaceModel,
+           ConvTransposeImpl2D_cpu_forward_kernel<float, float, float, float>,
+           nullptr});
+REGISTRAR(
+    ConvTransposeImpl2D_cpu,
+    {{DataType::Any, DataFormat::NCHW}, {DataType::Float64, DataFormat::NCHW}},
+    {ProdConso::inPlaceModel,
+     ConvTransposeImpl2D_cpu_forward_kernel<double, double, double, double>,
+     nullptr});
+
+} // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_CONVTRANSPOSEIMPL_KERNELS_H_ */
diff --git a/src/operator/ConvTransposeImpl.cpp b/src/operator/ConvTransposeImpl.cpp
new file mode 100644
index 00000000..d1135cc9
--- /dev/null
+++ b/src/operator/ConvTransposeImpl.cpp
@@ -0,0 +1,91 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include "aidge/backend/cpu/operator/ConvTransposeImpl.hpp"
+#include "aidge/backend/cpu/operator/ConvTransposeImpl_kernels.hpp"
+
+template <> void Aidge::ConvTransposeImpl1D_cpu::forward() {
+    const auto &op = static_cast<const ConvTranspose_Op<1> &>(mOp);
+
+    AIDGE_ASSERT(op.getInput(0), "{}: missing data input (#0).", op.type());
+    AIDGE_ASSERT(op.getInput(1), "{}: missing weight input (#1).", op.type());
+    AIDGE_ASSERT(op.getInput(2), "{}: missing bias input (#2).", op.type());
+
+    std::shared_ptr<Tensor> inputDataFallback, inputWeightFallback,
+        inputBiasFallback;
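+    // refCastFrom() returns each input expressed in the output's data type
+    // and backend, converting into the fallback tensor only when needed.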
+    const auto &inputData =
+        op.getInput(0)->refCastFrom(inputDataFallback, *op.getOutput(0));
+    const auto &inputWeight =
+        op.getInput(1)->refCastFrom(inputWeightFallback, *op.getOutput(0));
+    const auto &inputBias =
+        (op.getInput(2))
+            ? op.getInput(2)->refCastFrom(inputBiasFallback, *op.getOutput(0))
+            : Tensor();
+
+    // Call kernel
+    const auto impl = Registrar<ConvTransposeImpl1D_cpu>::create(
+        getBestMatch(getRequiredSpec()));
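+    // getBestMatch() selects the registered kernel whose {DataType,
+    // DataFormat} spec best fits the operator's current inputs and outputs.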
+    impl.forward(op.strideDims(),
+                 op.dilationDims(),
+                 op.kernelDims(),
+                 op.getInput(0)->template dims<3>(),
+                 op.getOutput(0)->template dims<3>(),
+                 inputData.getImpl()->hostPtr(),
+                 inputWeight.getImpl()->hostPtr(),
+                 op.getInput(2) ? inputBias.getImpl()->hostPtr() : nullptr,
+                 op.getOutput(0)->getImpl()->rawPtr());
+}
+
+template <> void Aidge::ConvTransposeImpl1D_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(
+        std::runtime_error,
+        "Backward not yet implemented for Conv_Op<1> on backend cpu");
+}
+
+template <> void Aidge::ConvTransposeImpl2D_cpu::forward() {
+    const auto &op = static_cast<const ConvTranspose_Op<2> &>(mOp);
+
+    AIDGE_ASSERT(op.getInput(0), "{}: missing data input (#0).", op.type());
+    AIDGE_ASSERT(op.getInput(1), "{}: missing weight input (#1).", op.type());
+    AIDGE_ASSERT(op.getInput(2), "{}: missing bias input (#2).", op.type());
+
+    std::shared_ptr<Tensor> inputDataFallback, inputWeightFallback,
+        inputBiasFallback;
+    const auto &inputData =
+        op.getInput(0)->refCastFrom(inputDataFallback, *op.getOutput(0));
+    const auto &inputWeight =
+        op.getInput(1)->refCastFrom(inputWeightFallback, *op.getOutput(0));
+    const auto &inputBias =
+        (op.getInput(2))
+            ? op.getInput(2)->refCastFrom(inputBiasFallback, *op.getOutput(0))
+            : Tensor();
+
+    // Call kernel
+    const auto impl = Registrar<ConvTransposeImpl2D_cpu>::create(
+        getBestMatch(getRequiredSpec()));
+
+    impl.forward(op.strideDims(),
+                 op.dilationDims(),
+                 op.kernelDims(),
+                 op.getInput(0)->template dims<4>(),
+                 op.getOutput(0)->template dims<4>(),
+                 inputData.getImpl()->hostPtr(),
+                 inputWeight.getImpl()->hostPtr(),
+                 op.getInput(2) ? inputBias.getImpl()->hostPtr() : nullptr,
+                 op.getOutput(0)->getImpl()->rawPtr());
+}
+
+template <> void Aidge::ConvTransposeImpl2D_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(
+        std::runtime_error,
+        "Backward not yet implemented for Conv_Op<2> on backend cpu");
+}
+
diff --git a/unit_tests/operator/Test_ConvTranspose.cpp b/unit_tests/operator/Test_ConvTranspose.cpp
new file mode 100644
index 00000000..6e889e80
--- /dev/null
+++ b/unit_tests/operator/Test_ConvTranspose.cpp
@@ -0,0 +1,2298 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <memory>
+
+#include <catch2/catch_test_macros.hpp>
+#include <fmt/core.h>
+
+#include "aidge/backend/cpu/operator/ConvTransposeImpl.hpp"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/operator/ConvTranspose.hpp"
+#include "aidge/utils/TensorUtils.hpp"
+#include "aidge/utils/Types.h"
+
+namespace Aidge {
+
+template <DimSize_t DIM>
+static std::shared_ptr<OperatorTensor>
+setupTestConvTranspose(const DimSize_t batchSize,
+                       const DimSize_t inChannels,
+                       const DimSize_t outChannels,
+                       const std::array<DimSize_t, DIM> kernelSize,
+                       const std::array<DimSize_t, DIM> dataSize,
+                       const std::array<DimSize_t, DIM> stride,
+                       const std::array<DimSize_t, DIM> dilation,
+                       const std::shared_ptr<Tensor> input,
+                       const std::shared_ptr<Tensor> weights,
+                       const std::shared_ptr<Tensor> biases) {
+    std::shared_ptr<Node> convTransposeNode;
+    convTransposeNode = ConvTranspose(inChannels,
+                                      outChannels,
+                                      kernelSize,
+                                      stride,
+                                      dilation,
+                                      false,
+                                      "myconv");
+    auto op = std::static_pointer_cast<OperatorTensor>(
+        convTransposeNode->getOperator());
+
+    op->associateInput(0, input);
+    op->setDataType(DataType::Float32);
+
+    input->setBackend("cpu");
+    op->setBackend("cpu");
+
+    weights->setBackend("cpu");
+    op->associateInput(1, weights);
+
+    biases->setBackend("cpu");
+    op->associateInput(2, biases);
+
+    REQUIRE_NOTHROW(op->forwardDims(true));
+
+    return op;
+}
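+
+// Expected spatial output sizes in the sections below follow the usual
+// ConvTranspose relation along each dimension (no padding involved here):
+//   out = (in - 1) * stride + dilation * (kernel - 1) + 1
+// e.g. in = 4, kernel = 2, stride = 1, dilation = 1  ->  out = 5.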
+
+TEST_CASE("[cpu/operator] ConvTranspose(forward)", "[ConvTranspose][CPU]") {
+    SECTION("1D") {
+        constexpr DimSize_t DIM = 1;
+        SECTION("kernel = 2 , in/outChannels = 1") {
+            constexpr DimSize_t batchSize = 1;
+            constexpr DimSize_t inChannels = 1;
+            constexpr DimSize_t outChannels = 1;
+
+            constexpr std::array<DimSize_t, DIM> kernelSize{2};
+
+            constexpr std::array<DimSize_t, DIM> inDataSize{4};
+            constexpr std::array<DimSize_t, DIM> outDataSize{5};
+
+            constexpr std::array<DimSize_t, DIM> stride{1};
+            constexpr std::array<DimSize_t, DIM> dilation{1};
+
+            auto input = std::make_shared<Tensor>(
+                Array3D<float, batchSize, inChannels, inDataSize[0]>(
+                    {{{{1.000000, 2.000000, 3.000000, 4.000000}}}}));
+
+            auto weights = std::make_shared<Tensor>(
+                Array3D<float, inChannels, outChannels, kernelSize[0]>(
+                    {{{{0.100000, 0.200000}}}}));
+
+            auto biases = std::make_shared<Tensor>(
+                Array1D<float, outChannels>({{0.010000}}));
+
+            auto op = setupTestConvTranspose<DIM>(batchSize,
+                                                  inChannels,
+                                                  outChannels,
+                                                  kernelSize,
+                                                  inDataSize,
+                                                  stride,
+                                                  dilation,
+                                                  input,
+                                                  weights,
+                                                  biases);
+
+            REQUIRE_NOTHROW(op->forward());
+
+            auto expectedOutput = std::make_shared<Tensor>(
+                Array3D<float, batchSize, outChannels, outDataSize[0]>(
+                    {{{{0.110000, 0.410000, 0.710000, 1.010000, 0.810000}}}}));
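+            // Hand-check of a few values, using
+            // out[j] = sum_i input[i] * w[j - i] + bias:
+            //   out[0] = 0.1 * 1 + 0.01           = 0.11
+            //   out[1] = 0.2 * 1 + 0.1 * 2 + 0.01 = 0.41
+            //   out[4] = 0.2 * 4 + 0.01           = 0.81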
+            CHECK(approxEq<float, float>(*op->getOutput(0), *expectedOutput));
+        }
+        SECTION("kernel = 2, inChannel = 2, outChannels = 1") {
+            constexpr DimSize_t batchSize = 1;
+            constexpr DimSize_t inChannels = 2;
+            constexpr DimSize_t outChannels = 1;
+
+            constexpr std::array<DimSize_t, DIM> kernelSize{2};
+
+            constexpr std::array<DimSize_t, DIM> inDataSize{4};
+            constexpr std::array<DimSize_t, DIM> outDataSize{5};
+
+            constexpr std::array<DimSize_t, DIM> stride{1};
+            constexpr std::array<DimSize_t, DIM> dilation{1};
+
+            auto input = std::make_shared<Tensor>(
+                Array3D<float, batchSize, inChannels, inDataSize[0]>(
+                    {{{{1.000000, 2.000000, 3.000000, 4.000000},
+                       {5.000000, 6.000000, 7.000000, 8.000000}}}}));
+
+            auto weights = std::make_shared<Tensor>(
+                Array3D<float, inChannels, outChannels, kernelSize[0]>(
+                    {{{{0.100000, 0.200000}}, {{0.300000, 0.400000}}}}));
+
+            auto biases = std::make_shared<Tensor>(
+                Array1D<float, outChannels>({{0.010000}}));
+
+            auto op = setupTestConvTranspose<DIM>(batchSize,
+                                                  inChannels,
+                                                  outChannels,
+                                                  kernelSize,
+                                                  inDataSize,
+                                                  stride,
+                                                  dilation,
+                                                  input,
+                                                  weights,
+                                                  biases);
+
+            REQUIRE_NOTHROW(op->forward());
+
+            auto expectedOutput = std::make_shared<Tensor>(
+                Array3D<float, batchSize, outChannels, outDataSize[0]>(
+                    {{{{1.610000, 4.210000, 5.210000, 6.210001, 4.010000}}}}));
+            CHECK(approxEq<float, float>(*op->getOutput(0), *expectedOutput));
+        }
+        SECTION("kernel = 2, inChannel = 1, outChannels = 2") {
+            constexpr DimSize_t batchSize = 1;
+            constexpr DimSize_t inChannels = 1;
+            constexpr DimSize_t outChannels = 2;
+
+            constexpr std::array<DimSize_t, DIM> kernelSize{2};
+
+            constexpr std::array<DimSize_t, DIM> inDataSize{4};
+            constexpr std::array<DimSize_t, DIM> outDataSize{5};
+
+            constexpr std::array<DimSize_t, DIM> stride{1};
+            constexpr std::array<DimSize_t, DIM> dilation{1};
+
+            auto input = std::make_shared<Tensor>(
+                Array3D<float, batchSize, inChannels, inDataSize[0]>(
+                    {{{{1., 2., 3., 4.}}}}));
+
+            auto weights = std::make_shared<Tensor>(
+                Array3D<float, inChannels, outChannels, kernelSize[0]>(
+                    {{{{0.1, 0.2}, {0.3, 0.4}}}}));
+
+            auto biases = std::make_shared<Tensor>(
+                Array1D<float, outChannels>({{0.01, 0.02}}));
+
+            auto op = setupTestConvTranspose<DIM>(batchSize,
+                                                  inChannels,
+                                                  outChannels,
+                                                  kernelSize,
+                                                  inDataSize,
+                                                  stride,
+                                                  dilation,
+                                                  input,
+                                                  weights,
+                                                  biases);
+
+            REQUIRE_NOTHROW(op->forward());
+
+            auto expectedOutput = std::make_shared<Tensor>(
+                Array3D<float, batchSize, outChannels, outDataSize[0]>(
+                    {{{{0.11, 0.41, 0.71, 1.01, 0.81},
+                       {0.32, 1.02, 1.72, 2.42, 1.62}}}}));
+
+            CHECK(approxEq<float, float>(*op->getOutput(0), *expectedOutput));
+        }
+        SECTION("kernel = 1, inChannel = 2, outChannels = 2") {
+            constexpr DimSize_t batchSize = 1;
+            constexpr DimSize_t inChannels = 2;
+            constexpr DimSize_t outChannels = 2;
+
+            constexpr std::array<DimSize_t, DIM> kernelSize{1};
+
+            constexpr std::array<DimSize_t, DIM> inDataSize{4};
+            constexpr std::array<DimSize_t, DIM> outDataSize{4};
+
+            constexpr std::array<DimSize_t, DIM> stride{1};
+            constexpr std::array<DimSize_t, DIM> dilation{1};
+
+            auto input = std::make_shared<Tensor>(
+                Array3D<float, batchSize, inChannels, inDataSize[0]>(
+                    {{{{1.000000, 2.000000, 3.000000, 4.000000},
+                       {5.000000, 6.000000, 7.000000, 8.000000}}}}));
+
+            auto weights = std::make_shared<Tensor>(
+                Array3D<float, inChannels, outChannels, kernelSize[0]>(
+                    {{{{0.100000}, {0.200000}},
+
+                      {{0.300000}, {0.400000}}}}));
+
+            auto biases = std::make_shared<Tensor>(
+                Array1D<float, outChannels>({{0.010000, 0.020000}}));
+
+            auto op = setupTestConvTranspose<DIM>(batchSize,
+                                                  inChannels,
+                                                  outChannels,
+                                                  kernelSize,
+                                                  inDataSize,
+                                                  stride,
+                                                  dilation,
+                                                  input,
+                                                  weights,
+                                                  biases);
+
+            REQUIRE_NOTHROW(op->forward());
+
+            auto expectedOutput = std::make_shared<Tensor>(
+                Array3D<float, batchSize, outChannels, outDataSize[0]>(
+                    {{{{1.610000, 2.010000, 2.410000, 2.810000},
+                       {2.220000, 2.820000, 3.420000, 4.020000}}}}));
+
+            CHECK(approxEq<float, float>(*op->getOutput(0), *expectedOutput));
+        }
+        SECTION("kernel = 2, inChannels = 2, outChannels = 3") {
+            constexpr DimSize_t batchSize = 1;
+            constexpr DimSize_t inChannels = 2;
+            constexpr DimSize_t outChannels = 3;
+
+            constexpr std::array<DimSize_t, DIM> kernelSize{2};
+
+            constexpr std::array<DimSize_t, DIM> inDataSize{4};
+            constexpr std::array<DimSize_t, DIM> outDataSize{5};
+
+            constexpr std::array<DimSize_t, DIM> stride{1};
+            constexpr std::array<DimSize_t, DIM> dilation{1};
+
+            auto input = std::make_shared<Tensor>(
+                Array3D<float, batchSize, inChannels, inDataSize[0]>(
+                    {{{{1., 2., 3., 4.}, {5., 6., 7., 8.}}}}));
+
+            auto weights = std::make_shared<Tensor>(
+                Array3D<float, inChannels, outChannels, kernelSize[0]>(
+                    {{{{0.10, 0.20}, {0.30, 0.40}, {0.50, 0.60}},
+
+                      {{0.70, 0.80}, {0.90, 1.}, {1.10, 1.20}}}}));
+
+            auto biases = std::make_shared<Tensor>(
+                Array1D<float, outChannels>({{0.010000, 0.020000, 0.030000}}));
+
+            auto op = setupTestConvTranspose<DIM>(batchSize,
+                                                  inChannels,
+                                                  outChannels,
+                                                  kernelSize,
+                                                  inDataSize,
+                                                  stride,
+                                                  dilation,
+                                                  input,
+                                                  weights,
+                                                  biases);
+
+            REQUIRE_NOTHROW(op->forward());
+
+            auto expectedOutput = std::make_shared<
+                Tensor>(Array3D<float, batchSize, outChannels, outDataSize[0]>(
+                {{{{3.610000, 8.610001, 10.410000, 12.210001, 7.210001},
+                   {4.820000, 11.420000, 14.020000, 16.620001, 9.620001},
+                   {6.030000, 14.230000, 17.630001, 21.030001, 12.030000}}}}));
+
+            CHECK(approxEq<float, float>(*op->getOutput(0), *expectedOutput));
+        }
+
+        SECTION("Big test to ensure kernel capabilities") {
+            constexpr DimSize_t batchSize = 2;
+            constexpr DimSize_t inChannels = 3;
+            constexpr DimSize_t outChannels = 4;
+
+            constexpr std::array<DimSize_t, DIM> kernelSize{6};
+
+            constexpr std::array<DimSize_t, DIM> inDataSize{6};
+            constexpr std::array<DimSize_t, DIM> outDataSize{11};
+
+            constexpr std::array<DimSize_t, DIM> stride{1};
+            constexpr std::array<DimSize_t, DIM> dilation{1};
+
+            auto input = std::make_shared<Tensor>(
+                Array3D<float, batchSize, inChannels, inDataSize[0]>(
+                    {{{{1., 2., 3., 4., 5., 6.},
+                       {7., 8., 9., 10., 11., 12.},
+                       {13., 14., 15., 16., 17., 18.}},
+
+                      {{19., 20., 21., 22., 23., 24.},
+                       {25., 26., 27., 28., 29., 30.},
+                       {31., 32., 33., 34., 35., 36.}}}}));
+
+            auto weights = std::make_shared<Tensor>(
+                Array3D<float, inChannels, outChannels, kernelSize[0]>(
+                    {{{{0.1, 0.2, 0.3, 0.4, 0.5, 0.6},
+                       {0.7, 0.8, 0.9, 1., 1.1, 1.2},
+                       {1.3, 1.4, 1.5, 1.6, 1.7, 1.8},
+                       {1.9, 2., 2.1, 2.2, 2.3, 2.4}},
+
+                      {{2.5, 2.6, 2.7, 2.8, 2.9, 3.},
+                       {3.1, 3.2, 3.3, 3.4, 3.5, 3.6},
+                       {3.7, 3.8, 3.9, 4., 4.1, 4.2},
+                       {4.3, 4.4, 4.5, 4.6, 4.7, 4.8}},
+
+                      {{4.9, 5., 5.1, 5.2, 5.3, 5.4},
+                       {5.5, 5.6, 5.7, 5.8, 5.9, 6.},
+                       {6.1, 6.2, 6.3, 6.4, 6.5, 6.6},
+                       {6.7, 6.8, 6.9, 7., 7.1, 7.2}}}}));
+
+            auto biases = std::make_shared<Tensor>(
+                Array1D<float, outChannels>({{0.01, 0.02, 0.03, 0.04}}));
+
+            auto op = setupTestConvTranspose<DIM>(batchSize,
+                                                  inChannels,
+                                                  outChannels,
+                                                  kernelSize,
+                                                  inDataSize,
+                                                  stride,
+                                                  dilation,
+                                                  input,
+                                                  weights,
+                                                  biases);
+
+            REQUIRE_NOTHROW(op->forward());
+
+            auto expectedOutput = std::make_shared<Tensor>(
+                Array3D<float, batchSize, outChannels, outDataSize[0]>(
+                    {{{{81.310005,
+                        172.210007,
+                        273.010010,
+                        384.010040,
+                        505.509979,
+                        637.810059,
+                        561.010010,
+                        472.809998,
+                        372.910004,
+                        261.010010,
+                        136.809998},
+                       {93.919998,
+                        199.220001,
+                        316.219971,
+                        445.220001,
+                        586.520081,
+                        740.420044,
+                        651.020020,
+                        548.420044,
+                        432.319977,
+                        302.420013,
+                        158.419998},
+                       {106.529999,
+                        226.230011,
+                        359.429993,
+                        506.430054,
+                        667.530090,
+                        843.030029,
+                        741.030029,
+                        624.030029,
+                        491.730042,
+                        343.829987,
+                        180.029999},
+                       {119.140007,
+                        253.240005,
+                        402.640045,
+                        567.640076,
+                        748.539978,
+                        945.639954,
+                        831.039978,
+                        699.640015,
+                        551.140015,
+                        385.239990,
+                        201.639999}},
+
+                      {{216.309998,
+                        447.610016,
+                        694.210022,
+                        956.410034,
+                        1234.510132,
+                        1528.810059,
+                        1317.010010,
+                        1088.410034,
+                        842.710022,
+                        579.610046,
+                        298.810028},
+                       {261.319977,
+                        539.420044,
+                        834.619995,
+                        1147.220093,
+                        1477.520142,
+                        1825.820068,
+                        1569.019897,
+                        1293.619995,
+                        999.320068,
+                        685.820007,
+                        352.819977},
+                       {306.329987,
+                        631.230042,
+                        975.030029,
+                        1338.030151,
+                        1720.530029,
+                        2122.829834,
+                        1821.029785,
+                        1498.830200,
+                        1155.930054,
+                        792.030029,
+                        406.830017},
+                       {351.340027,
+                        723.039978,
+                        1115.440063,
+                        1528.840210,
+                        1963.539917,
+                        2419.839844,
+                        2073.040283,
+                        1704.040039,
+                        1312.540039,
+                        898.239990,
+                        460.840027}}}}));
+            CHECK(approxEq<float, float>(*op->getOutput(0), *expectedOutput));
+        }
+    }
+
+    SECTION("2D") {
+        constexpr DimSize_t DIM = 2;
+        SECTION("inChannels = 1, outChannels = 2, kernelSize = {1,2}, "
+                "inDataSize = {2,3}") {
+            constexpr DimSize_t batchSize = 1;
+            constexpr DimSize_t inChannels = 1;
+            constexpr DimSize_t outChannels = 2;
+
+            constexpr std::array<DimSize_t, DIM> kernelSize{1, 2};
+
+            constexpr std::array<DimSize_t, DIM> inDataSize{2, 3};
+            constexpr std::array<DimSize_t, DIM> outDataSize{2, 4};
+
+            constexpr std::array<DimSize_t, DIM> stride{1, 1};
+            constexpr std::array<DimSize_t, DIM> dilation{1, 1};
+
+            auto input = std::make_shared<Tensor>(Array4D<float,
+                                                          batchSize,
+                                                          inChannels,
+                                                          inDataSize[0],
+                                                          inDataSize[1]>(
+                {{{{{1.000000, 2.000000, 3.000000},
+                    {4.000000, 5.000000, 6.000000}}}}}));
+
+            auto weights = std::make_shared<Tensor>(
+                Array4D<float,
+                        inChannels,
+                        outChannels,
+                        kernelSize[0],
+                        kernelSize[1]>({{{{{0.100000, 0.200000}},
+
+                                          {{0.300000, 0.400000}}}}}));
+
+            auto biases = std::make_shared<Tensor>(
+                Array1D<float, outChannels>({{0.010000, 0.020000}}));
+
+            auto op = setupTestConvTranspose<DIM>(batchSize,
+                                                  inChannels,
+                                                  outChannels,
+                                                  kernelSize,
+                                                  inDataSize,
+                                                  stride,
+                                                  dilation,
+                                                  input,
+                                                  weights,
+                                                  biases);
+
+            REQUIRE_NOTHROW(op->forward());
+
+            auto expectedOutput =
+                std::make_shared<Tensor>(Array4D<float,
+                                                 batchSize,
+                                                 outChannels,
+                                                 outDataSize[0],
+                                                 outDataSize[1]>(
+                    {{{{{0.110000, 0.410000, 0.710000, 0.610000},
+                        {0.410000, 1.310000, 1.610000, 1.210000}},
+
+                       {{0.320000, 1.020000, 1.720000, 1.220000},
+                        {1.220000, 3.120000, 3.820000, 2.420000}}}}}));
+
+            CHECK(approxEq<float, float>(*op->getOutput(0), *expectedOutput));
+        }
+        SECTION("inChannels = 1, outChannels = 2, kernelSize = {2,3}, "
+                "inDataSize = {2,3}") {
+            constexpr DimSize_t batchSize = 1;
+            constexpr DimSize_t inChannels = 1;
+            constexpr DimSize_t outChannels = 2;
+
+            constexpr std::array<DimSize_t, DIM> kernelSize{2, 3};
+
+            constexpr std::array<DimSize_t, DIM> inDataSize{2, 3};
+            constexpr std::array<DimSize_t, DIM> outDataSize{3, 5};
+
+            constexpr std::array<DimSize_t, DIM> stride{1, 1};
+            constexpr std::array<DimSize_t, DIM> dilation{1, 1};
+
+            auto input = std::make_shared<Tensor>(Array4D<float,
+                                                          batchSize,
+                                                          inChannels,
+                                                          inDataSize[0],
+                                                          inDataSize[1]>(
+                {{{{{1.000000, 2.000000, 3.000000},
+                    {4.000000, 5.000000, 6.000000}}}}}));
+
+            auto weights = std::make_shared<Tensor>(Array4D<float,
+                                                            inChannels,
+                                                            outChannels,
+                                                            kernelSize[0],
+                                                            kernelSize[1]>(
+                {{{{{0.100000, 0.200000, 0.300000},
+                    {0.400000, 0.500000, 0.600000}},
+
+                   {{0.700000, 0.800000, 0.900000},
+                    {1.000000, 1.100000, 1.200000}}}}}));
+
+            auto biases = std::make_shared<Tensor>(
+                Array1D<float, outChannels>({{0.010000, 0.020000}}));
+
+            auto op = setupTestConvTranspose<DIM>(batchSize,
+                                                  inChannels,
+                                                  outChannels,
+                                                  kernelSize,
+                                                  inDataSize,
+                                                  stride,
+                                                  dilation,
+                                                  input,
+                                                  weights,
+                                                  biases);
+
+            REQUIRE_NOTHROW(op->forward());
+
+            auto expectedOutput = std::make_shared<
+                Tensor>(Array4D<float,
+                                batchSize,
+                                outChannels,
+                                outDataSize[0],
+                                outDataSize[1]>(
+                {{{{{0.110000, 0.410000, 1.010000, 1.210000, 0.910000},
+                    {0.810000, 2.610000, 5.610000, 5.410000, 3.610000},
+                    {1.610000, 4.010000, 7.310000, 6.010000, 3.610000}},
+
+                   {{0.720000, 2.220000, 4.620000, 4.220000, 2.720000},
+                    {3.820000, 9.820001, 18.220001, 15.020000, 9.020000},
+                    {4.020000, 9.420000, 16.320000, 12.620001, 7.220000}}}}}));
+
+            CHECK(approxEq<float, float>(*op->getOutput(0), *expectedOutput));
+        }
+        SECTION("inChannels = 1, outChannels = 2, kernelSize = {2,3}, "
+                "inDataSize = {6,6}, stride = {2,  2}, dilation = {2,  2}") {
+            constexpr DimSize_t batchSize = 1;
+            constexpr DimSize_t inChannels = 1;
+            constexpr DimSize_t outChannels = 2;
+
+            constexpr std::array<DimSize_t, DIM> kernelSize{2, 3};
+
+            constexpr std::array<DimSize_t, DIM> inDataSize{4, 4};
+            constexpr std::array<DimSize_t, DIM> outDataSize{9, 11};
+
+            constexpr std::array<DimSize_t, DIM> stride{2, 2};
+            constexpr std::array<DimSize_t, DIM> dilation{2, 2};
+
+            auto input = std::make_shared<Tensor>(Array4D<float,
+                                                          batchSize,
+                                                          inChannels,
+                                                          inDataSize[0],
+                                                          inDataSize[1]>(
+                {{{{{1.00, 2.00, 3.00, 4.000000},
+                    {5.00, 6.00, 7.00, 8.000000},
+                    {9.00, 10.00, 11.00, 12.000000},
+                    {13.00, 14.00, 15.00, 16.000000}}}}}));
+
+            auto weights = std::make_shared<Tensor>(Array4D<float,
+                                                            inChannels,
+                                                            outChannels,
+                                                            kernelSize[0],
+                                                            kernelSize[1]>(
+                {{{{{0.10, 0.20, 0.300000}, {0.40, 0.50, 0.600000}},
+
+                   {{0.70, 0.80, 0.900000}, {1.00, 1.10, 1.200000}}}}}));
+
+            auto biases = std::make_shared<Tensor>(
+                Array1D<float, outChannels>({{0.01, 0.020000}}));
+
+            auto op = setupTestConvTranspose<DIM>(batchSize,
+                                                  inChannels,
+                                                  outChannels,
+                                                  kernelSize,
+                                                  inDataSize,
+                                                  stride,
+                                                  dilation,
+                                                  input,
+                                                  weights,
+                                                  biases);
+
+            REQUIRE_NOTHROW(op->forward());
+
+            auto expectedOutput = std::make_shared<Tensor>(
+                Array4D<float,
+                        batchSize,
+                        outChannels,
+                        outDataSize[0],
+                        outDataSize[1]>({{{{{0.11,
+                                             0.01,
+                                             0.41,
+                                             0.01,
+                                             1.01,
+                                             0.01,
+                                             1.61,
+                                             0.01,
+                                             1.71,
+                                             0.01,
+                                             1.210000},
+                                            {0.01,
+                                             0.01,
+                                             0.01,
+                                             0.01,
+                                             0.01,
+                                             0.01,
+                                             0.01,
+                                             0.01,
+                                             0.01,
+                                             0.01,
+                                             0.010000},
+                                            {0.91,
+                                             0.01,
+                                             2.91,
+                                             0.01,
+                                             6.210001,
+                                             0.01,
+                                             8.31,
+                                             0.01,
+                                             7.510001,
+                                             0.01,
+                                             4.810000},
+                                            {0.01,
+                                             0.01,
+                                             0.01,
+                                             0.01,
+                                             0.01,
+                                             0.01,
+                                             0.01,
+                                             0.01,
+                                             0.01,
+                                             0.01,
+                                             0.010000},
+                                            {2.91,
+                                             0.01,
+                                             7.710001,
+                                             0.01,
+                                             14.610001,
+                                             0.01,
+                                             16.710001,
+                                             0.01,
+                                             13.910002,
+                                             0.01,
+                                             8.410001},
+                                            {0.01,
+                                             0.01,
+                                             0.01,
+                                             0.01,
+                                             0.01,
+                                             0.01,
+                                             0.01,
+                                             0.01,
+                                             0.01,
+                                             0.01,
+                                             0.010000},
+                                            {4.91,
+                                             0.01,
+                                             12.51,
+                                             0.01,
+                                             23.01,
+                                             0.01,
+                                             25.110001,
+                                             0.01,
+                                             20.309999,
+                                             0.01,
+                                             12.010000},
+                                            {0.01,
+                                             0.01,
+                                             0.01,
+                                             0.01,
+                                             0.01,
+                                             0.01,
+                                             0.01,
+                                             0.01,
+                                             0.01,
+                                             0.01,
+                                             0.010000},
+                                            {5.210001,
+                                             0.01,
+                                             12.110001,
+                                             0.01,
+                                             20.809999,
+                                             0.01,
+                                             22.309999,
+                                             0.01,
+                                             17.01,
+                                             0.01,
+                                             9.610001}},
+
+                                           {{0.72,
+                                             0.02,
+                                             2.22,
+                                             0.02,
+                                             4.62,
+                                             0.02,
+                                             7.02,
+                                             0.02,
+                                             5.92,
+                                             0.02,
+                                             3.620000},
+                                            {0.02,
+                                             0.02,
+                                             0.02,
+                                             0.02,
+                                             0.02,
+                                             0.02,
+                                             0.02,
+                                             0.02,
+                                             0.02,
+                                             0.02,
+                                             0.020000},
+                                            {4.52,
+                                             0.02,
+                                             11.320001,
+                                             0.02,
+                                             20.620003,
+                                             0.02,
+                                             26.320002,
+                                             0.02,
+                                             20.720001,
+                                             0.02,
+                                             12.020000},
+                                            {0.02,
+                                             0.02,
+                                             0.02,
+                                             0.02,
+                                             0.02,
+                                             0.02,
+                                             0.02,
+                                             0.02,
+                                             0.02,
+                                             0.02,
+                                             0.020000},
+                                            {11.32,
+                                             0.02,
+                                             25.720001,
+                                             0.02,
+                                             43.420002,
+                                             0.02,
+                                             49.120003,
+                                             0.02,
+                                             36.720001,
+                                             0.02,
+                                             20.420002},
+                                            {0.02,
+                                             0.02,
+                                             0.02,
+                                             0.02,
+                                             0.02,
+                                             0.02,
+                                             0.02,
+                                             0.02,
+                                             0.02,
+                                             0.02,
+                                             0.020000},
+                                            {18.119999,
+                                             0.02,
+                                             40.120003,
+                                             0.02,
+                                             66.220001,
+                                             0.02,
+                                             71.919998,
+                                             0.02,
+                                             52.720001,
+                                             0.02,
+                                             28.820002},
+                                            {0.02,
+                                             0.02,
+                                             0.02,
+                                             0.02,
+                                             0.02,
+                                             0.02,
+                                             0.02,
+                                             0.02,
+                                             0.02,
+                                             0.02,
+                                             0.020000},
+                                            {13.02,
+                                             0.02,
+                                             28.32,
+                                             0.02,
+                                             46.02,
+                                             0.02,
+                                             49.320004,
+                                             0.02,
+                                             35.619999,
+                                             0.02,
+                                             19.220001}}}}}));
+
+            CHECK(approxEq<float, float>(*op->getOutput(0), *expectedOutput));
+        }
+        SECTION("inChannels = 4, outChannels = 3, kernelSize = {2,2}, "
+                "inDataSize = {3,3}, stride = {2,  2}, dilation = {2,  2}") {
+            constexpr DimSize_t batchSize = 1;
+            constexpr DimSize_t inChannels = 4;
+            constexpr DimSize_t outChannels = 3;
+
+            constexpr std::array<DimSize_t, DIM> kernelSize{2, 2};
+
+            constexpr std::array<DimSize_t, DIM> inDataSize{3, 3};
+            constexpr std::array<DimSize_t, DIM> outDataSize{7, 7};
+
+            constexpr std::array<DimSize_t, DIM> stride{2, 2};
+            constexpr std::array<DimSize_t, DIM> dilation{2, 2};
+
+            auto input = std::make_shared<Tensor>(Array4D<float,
+                                                          batchSize,
+                                                          inChannels,
+                                                          inDataSize[0],
+                                                          inDataSize[1]>(
+                {{{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}, {7.0, 8.0, 9.0}},
+
+                   {{10.0, 11.0, 12.0},
+                    {13.0, 14.0, 15.0},
+                    {16.0, 17.0, 18.0}},
+
+                   {{19.0, 20.0, 21.0},
+                    {22.0, 23.0, 24.0},
+                    {25.0, 26.0, 27.0}},
+
+                   {{28.0, 29.0, 30.0},
+                    {31.0, 32.0, 33.0},
+                    {34.0, 35.0, 36.0}}}}}));
+
+            auto weights = std::make_shared<Tensor>(
+                Array4D<float,
+                        inChannels,
+                        outChannels,
+                        kernelSize[0],
+                        kernelSize[1]>({{{{{0.1, 0.2}, {0.3, 0.4}},
+
+                                          {{0.5, 0.6}, {0.7, 0.8}},
+
+                                          {{0.9, 1.0}, {1.1, 1.2}}},
+
+                                         {{{1.3, 1.4}, {1.5, 1.6}},
+
+                                          {{1.7, 1.8}, {1.9, 2.0}},
+
+                                          {{2.1, 2.2}, {2.3, 2.4}}},
+
+                                         {{{2.5, 2.6}, {2.7, 2.8}},
+
+                                          {{2.9, 3.0}, {3.1, 3.2}},
+
+                                          {{3.3, 3.4}, {3.5, 3.6}}},
+
+                                         {{{3.7, 3.8}, {3.9, 4.0}},
+
+                                          {{4.1, 4.2}, {4.3, 4.4}},
+
+                                          {{4.5, 4.6}, {4.7, 4.8}}}}}));
+
+            auto biases = std::make_shared<Tensor>(
+                Array1D<float, outChannels>({{0.010000, 0.020000, 0.030000}}));
+
+            auto op = setupTestConvTranspose<DIM>(batchSize,
+                                                  inChannels,
+                                                  outChannels,
+                                                  kernelSize,
+                                                  inDataSize,
+                                                  stride,
+                                                  dilation,
+                                                  input,
+                                                  weights,
+                                                  biases);
+
+            REQUIRE_NOTHROW(op->forward());
+
+            auto expectedOutput = std::make_shared<Tensor>(
+                Array4D<float,
+                        batchSize,
+                        outChannels,
+                        outDataSize[0],
+                        outDataSize[1]>({{{{{164.209991,
+                                             0.010000,
+                                             341.809998,
+                                             0.010000,
+                                             357.410034,
+                                             0.010000,
+                                             186.009995},
+                                            {0.010000,
+                                             0.010000,
+                                             0.010000,
+                                             0.010000,
+                                             0.010000,
+                                             0.010000,
+                                             0.010000},
+                                            {362.809998,
+                                             0.010000,
+                                             754.410034,
+                                             0.010000,
+                                             787.210083,
+                                             0.010000,
+                                             409.210022},
+                                            {0.010000,
+                                             0.010000,
+                                             0.010000,
+                                             0.010000,
+                                             0.010000,
+                                             0.010000,
+                                             0.010000},
+                                            {410.809998,
+                                             0.010000,
+                                             852.810059,
+                                             0.010000,
+                                             885.609985,
+                                             0.010000,
+                                             459.610016},
+                                            {0.010000,
+                                             0.010000,
+                                             0.010000,
+                                             0.010000,
+                                             0.010000,
+                                             0.010000,
+                                             0.010000},
+                                            {226.209991,
+                                             0.010000,
+                                             469.010010,
+                                             0.010000,
+                                             486.210022,
+                                             0.010000,
+                                             252.009995}},
+
+                                           {{187.419998,
+                                             0.020000,
+                                             389.820007,
+                                             0.020000,
+                                             408.619995,
+                                             0.020000,
+                                             212.420013},
+                                            {0.020000,
+                                             0.020000,
+                                             0.020000,
+                                             0.020000,
+                                             0.020000,
+                                             0.020000,
+                                             0.020000},
+                                            {414.019989,
+                                             0.020000,
+                                             860.020020,
+                                             0.020000,
+                                             899.220032,
+                                             0.020000,
+                                             466.820007},
+                                            {0.020000,
+                                             0.020000,
+                                             0.020000,
+                                             0.020000,
+                                             0.020000,
+                                             0.020000,
+                                             0.020000},
+                                            {471.620026,
+                                             0.020000,
+                                             977.619995,
+                                             0.020000,
+                                             1016.820068,
+                                             0.020000,
+                                             526.820007},
+                                            {0.020000,
+                                             0.020000,
+                                             0.020000,
+                                             0.020000,
+                                             0.020000,
+                                             0.020000,
+                                             0.020000},
+                                            {259.019989,
+                                             0.020000,
+                                             536.220032,
+                                             0.020000,
+                                             556.619995,
+                                             0.020000,
+                                             288.019989}},
+
+                                           {{210.630005,
+                                             0.030000,
+                                             437.829987,
+                                             0.030000,
+                                             459.829987,
+                                             0.030000,
+                                             238.830002},
+                                            {0.030000,
+                                             0.030000,
+                                             0.030000,
+                                             0.030000,
+                                             0.030000,
+                                             0.030000,
+                                             0.030000},
+                                            {465.230011,
+                                             0.030000,
+                                             965.630005,
+                                             0.030000,
+                                             1011.230103,
+                                             0.030000,
+                                             524.430054},
+                                            {0.030000,
+                                             0.030000,
+                                             0.030000,
+                                             0.030000,
+                                             0.030000,
+                                             0.030000,
+                                             0.030000},
+                                            {532.430054,
+                                             0.030000,
+                                             1102.430054,
+                                             0.030000,
+                                             1148.030029,
+                                             0.030000,
+                                             594.030029},
+                                            {0.030000,
+                                             0.030000,
+                                             0.030000,
+                                             0.030000,
+                                             0.030000,
+                                             0.030000,
+                                             0.030000},
+                                            {291.830017,
+                                             0.030000,
+                                             603.430054,
+                                             0.030000,
+                                             627.030029,
+                                             0.030000,
+                                             324.029999}}}}}));
+            CHECK(approxEq<float, float>(*op->getOutput(0), *expectedOutput));
+        }
+        SECTION("Big test to ensure kernel capabilities 1") {
+            constexpr DimSize_t batchSize = 1;
+            constexpr DimSize_t inChannels = 3;
+            constexpr DimSize_t outChannels = 4;
+
+            constexpr std::array<DimSize_t, DIM> kernelSize{2, 2};
+
+            constexpr std::array<DimSize_t, DIM> inDataSize{6, 5};
+            constexpr std::array<DimSize_t, DIM> outDataSize{8, 17};
+
+            constexpr std::array<DimSize_t, DIM> stride{1, 3};
+            constexpr std::array<DimSize_t, DIM> dilation{2, 4};
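+
+            // With no padding, the transposed convolution output size per
+            // dimension is (in - 1) * stride + dilation * (kernel - 1) + 1:
+            // dim 0: (6 - 1) * 1 + 2 * (2 - 1) + 1 = 8
+            // dim 1: (5 - 1) * 3 + 4 * (2 - 1) + 1 = 17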
+
+            auto input = std::make_shared<Tensor>(
+                Array4D<float,
+                        batchSize,
+                        inChannels,
+                        inDataSize[0],
+                        inDataSize[1]>({{{{{1., 2., 3., 4., 5.},
+                                           {6., 7., 8., 9., 10.},
+                                           {11., 12., 13., 14., 15.},
+                                           {16., 17., 18., 19., 20.},
+                                           {21., 22., 23., 24., 25.},
+                                           {26., 27., 28., 29., 30.}},
+
+                                          {{31., 32., 33., 34., 35.},
+                                           {36., 37., 38., 39., 40.},
+                                           {41., 42., 43., 44., 45.},
+                                           {46., 47., 48., 49., 50.},
+                                           {51., 52., 53., 54., 55.},
+                                           {56., 57., 58., 59., 60.}},
+
+                                          {{61., 62., 63., 64., 65.},
+                                           {66., 67., 68., 69., 70.},
+                                           {71., 72., 73., 74., 75.},
+                                           {76., 77., 78., 79., 80.},
+                                           {81., 82., 83., 84., 85.},
+                                           {86., 87., 88., 89., 90.}}}}}));
+
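+            // ConvTranspose weights are laid out as
+            // [inChannels, outChannels, kH, kW] (see the Array4D template
+            // arguments below), i.e. the channel order is the reverse of
+            // Conv's [outChannels, inChannels, kH, kW].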
+            auto weights = std::make_shared<Tensor>(Array4D<float,
+                                                            inChannels,
+                                                            outChannels,
+                                                            kernelSize[0],
+                                                            kernelSize[1]>(
+                {{{{{0.100000, 0.200000}, {0.300000, 0.400000}},
+
+                   {{0.500000, 0.600000}, {0.700000, 0.800000}},
+
+                   {{0.900000, 1.000000}, {1.100000, 1.200000}},
+
+                   {{1.300000, 1.400000}, {1.500000, 1.600000}}},
+
+                  {{{1.700000, 1.800000}, {1.900000, 2.000000}},
+
+                   {{2.100000, 2.200000}, {2.300000, 2.400000}},
+
+                   {{2.500000, 2.600000}, {2.700000, 2.800000}},
+
+                   {{2.900000, 3.000000}, {3.100000, 3.200000}}},
+
+                  {{{3.300000, 3.400000}, {3.500000, 3.600000}},
+
+                   {{3.700000, 3.800000}, {3.900000, 4.000000}},
+
+                   {{4.100000, 4.200000}, {4.300000, 4.400000}},
+
+                   {{4.500000, 4.600000}, {4.700000, 4.800000}}}}}));
+
+            auto biases = std::make_shared<Tensor>(
+                Array1D<float, outChannels>({{0.01, 0.02, 0.03, 0.04}}));
+
+            auto op = setupTestConvTranspose<DIM>(batchSize,
+                                                  inChannels,
+                                                  outChannels,
+                                                  kernelSize,
+                                                  inDataSize,
+                                                  stride,
+                                                  dilation,
+                                                  input,
+                                                  weights,
+                                                  biases);
+
+            REQUIRE_NOTHROW(op->forward());
+
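+            // Entries equal to the channel bias (0.010000, 0.020000, ...)
+            // mark output positions that no input/kernel pair reaches given
+            // the stride and dilation gaps, so they hold only the bias.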
+            auto expectedOutput = std::make_shared<Tensor>(
+                Array4D<float,
+                        batchSize,
+                        outChannels,
+                        outDataSize[0],
+                        outDataSize[1]>({{{{{254.110001,
+                                             0.010000,
+                                             0.010000,
+                                             259.210022,
+                                             263.410034,
+                                             0.010000,
+                                             264.309998,
+                                             268.810028,
+                                             0.010000,
+                                             269.410004,
+                                             274.210022,
+                                             0.010000,
+                                             274.510010,
+                                             279.610016,
+                                             0.010000,
+                                             0.010000,
+                                             285.010010},
+                                            {279.610016,
+                                             0.010000,
+                                             0.010000,
+                                             284.710022,
+                                             290.410004,
+                                             0.010000,
+                                             289.809998,
+                                             295.810028,
+                                             0.010000,
+                                             294.910004,
+                                             301.210022,
+                                             0.010000,
+                                             300.010010,
+                                             306.610016,
+                                             0.010000,
+                                             0.010000,
+                                             312.010010},
+                                            {577.810059,
+                                             0.010000,
+                                             0.010000,
+                                             588.609985,
+                                             599.410034,
+                                             0.010000,
+                                             599.410034,
+                                             610.810059,
+                                             0.010000,
+                                             610.209961,
+                                             622.210022,
+                                             0.010000,
+                                             621.010010,
+                                             633.609985,
+                                             0.010000,
+                                             0.010000,
+                                             645.010010},
+                                            {631.810059,
+                                             0.010000,
+                                             0.010000,
+                                             642.609985,
+                                             656.410034,
+                                             0.010000,
+                                             653.410034,
+                                             667.810059,
+                                             0.010000,
+                                             664.209961,
+                                             679.210022,
+                                             0.010000,
+                                             675.010010,
+                                             690.609985,
+                                             0.010000,
+                                             0.010000,
+                                             702.010010},
+                                            {685.810059,
+                                             0.010000,
+                                             0.010000,
+                                             696.609985,
+                                             713.410034,
+                                             0.010000,
+                                             707.410034,
+                                             724.810059,
+                                             0.010000,
+                                             718.209961,
+                                             736.210022,
+                                             0.010000,
+                                             729.010010,
+                                             747.609985,
+                                             0.010000,
+                                             0.010000,
+                                             759.010010},
+                                            {739.810059,
+                                             0.010000,
+                                             0.010000,
+                                             750.609985,
+                                             770.410034,
+                                             0.010000,
+                                             761.410034,
+                                             781.810059,
+                                             0.010000,
+                                             772.209961,
+                                             793.210022,
+                                             0.010000,
+                                             783.010010,
+                                             804.609985,
+                                             0.010000,
+                                             0.010000,
+                                             816.010010},
+                                            {386.710022,
+                                             0.010000,
+                                             0.010000,
+                                             392.410004,
+                                             402.010010,
+                                             0.010000,
+                                             398.110016,
+                                             408.010010,
+                                             0.010000,
+                                             403.809998,
+                                             414.010010,
+                                             0.010000,
+                                             409.510010,
+                                             420.010010,
+                                             0.010000,
+                                             0.010000,
+                                             426.010010},
+                                            {415.210022,
+                                             0.010000,
+                                             0.010000,
+                                             420.910004,
+                                             432.010010,
+                                             0.010000,
+                                             426.610016,
+                                             438.010040,
+                                             0.010000,
+                                             432.309998,
+                                             444.010010,
+                                             0.010000,
+                                             438.010010,
+                                             450.010040,
+                                             0.010000,
+                                             0.010000,
+                                             456.010010}},
+
+                                           {{291.320007,
+                                             0.020000,
+                                             0.020000,
+                                             297.619995,
+                                             300.619995,
+                                             0.020000,
+                                             303.919983,
+                                             307.219971,
+                                             0.020000,
+                                             310.220001,
+                                             313.819977,
+                                             0.020000,
+                                             316.519989,
+                                             320.419983,
+                                             0.020000,
+                                             0.020000,
+                                             327.019989},
+                                            {322.820007,
+                                             0.020000,
+                                             0.020000,
+                                             329.119995,
+                                             333.619995,
+                                             0.020000,
+                                             335.419983,
+                                             340.219971,
+                                             0.020000,
+                                             341.720001,
+                                             346.819977,
+                                             0.020000,
+                                             348.019989,
+                                             353.419983,
+                                             0.020000,
+                                             0.020000,
+                                             360.019989},
+                                            {664.220032,
+                                             0.020000,
+                                             0.020000,
+                                             677.420044,
+                                             685.820068,
+                                             0.020000,
+                                             690.619995,
+                                             699.619995,
+                                             0.020000,
+                                             703.820068,
+                                             713.420044,
+                                             0.020000,
+                                             717.020020,
+                                             727.219971,
+                                             0.020000,
+                                             0.020000,
+                                             741.020020},
+                                            {730.220032,
+                                             0.020000,
+                                             0.020000,
+                                             743.420044,
+                                             754.820068,
+                                             0.020000,
+                                             756.619995,
+                                             768.619995,
+                                             0.020000,
+                                             769.820068,
+                                             782.420044,
+                                             0.020000,
+                                             783.020020,
+                                             796.219971,
+                                             0.020000,
+                                             0.020000,
+                                             810.020020},
+                                            {796.220032,
+                                             0.020000,
+                                             0.020000,
+                                             809.420044,
+                                             823.820068,
+                                             0.020000,
+                                             822.620056,
+                                             837.619995,
+                                             0.020000,
+                                             835.820068,
+                                             851.420044,
+                                             0.020000,
+                                             849.020020,
+                                             865.219971,
+                                             0.020000,
+                                             0.020000,
+                                             879.020020},
+                                            {862.220032,
+                                             0.020000,
+                                             0.020000,
+                                             875.420044,
+                                             892.820068,
+                                             0.020000,
+                                             888.619995,
+                                             906.619995,
+                                             0.020000,
+                                             901.820068,
+                                             920.420044,
+                                             0.020000,
+                                             915.020020,
+                                             934.219971,
+                                             0.020000,
+                                             0.020000,
+                                             948.020020},
+                                            {447.919983,
+                                             0.020000,
+                                             0.020000,
+                                             454.820007,
+                                             463.220001,
+                                             0.020000,
+                                             461.720001,
+                                             470.420013,
+                                             0.020000,
+                                             468.619995,
+                                             477.619995,
+                                             0.020000,
+                                             475.519989,
+                                             484.819977,
+                                             0.020000,
+                                             0.020000,
+                                             492.019989},
+                                            {482.419983,
+                                             0.020000,
+                                             0.020000,
+                                             489.320007,
+                                             499.220001,
+                                             0.020000,
+                                             496.220001,
+                                             506.420013,
+                                             0.020000,
+                                             503.119995,
+                                             513.619995,
+                                             0.020000,
+                                             510.019989,
+                                             520.820007,
+                                             0.020000,
+                                             0.020000,
+                                             528.020020}},
+
+                                           {{328.529999,
+                                             0.030000,
+                                             0.030000,
+                                             336.029999,
+                                             337.830017,
+                                             0.030000,
+                                             343.529999,
+                                             345.630035,
+                                             0.030000,
+                                             351.029999,
+                                             353.430023,
+                                             0.030000,
+                                             358.529999,
+                                             361.230011,
+                                             0.030000,
+                                             0.030000,
+                                             369.030029},
+                                            {366.029999,
+                                             0.030000,
+                                             0.030000,
+                                             373.529999,
+                                             376.830017,
+                                             0.030000,
+                                             381.029999,
+                                             384.630035,
+                                             0.030000,
+                                             388.529999,
+                                             392.430023,
+                                             0.030000,
+                                             396.029999,
+                                             400.230042,
+                                             0.030000,
+                                             0.030000,
+                                             408.030029},
+                                            {750.630005,
+                                             0.030000,
+                                             0.030000,
+                                             766.230042,
+                                             772.230042,
+                                             0.030000,
+                                             781.830078,
+                                             788.430054,
+                                             0.030000,
+                                             797.430054,
+                                             804.630066,
+                                             0.030000,
+                                             813.030029,
+                                             820.830078,
+                                             0.030000,
+                                             0.030000,
+                                             837.030029},
+                                            {828.630005,
+                                             0.030000,
+                                             0.030000,
+                                             844.230042,
+                                             853.230042,
+                                             0.030000,
+                                             859.830078,
+                                             869.430054,
+                                             0.030000,
+                                             875.430054,
+                                             885.630066,
+                                             0.030000,
+                                             891.030029,
+                                             901.830078,
+                                             0.030000,
+                                             0.030000,
+                                             918.030029},
+                                            {906.630005,
+                                             0.030000,
+                                             0.030000,
+                                             922.230042,
+                                             934.230042,
+                                             0.030000,
+                                             937.830078,
+                                             950.430054,
+                                             0.030000,
+                                             953.430054,
+                                             966.630066,
+                                             0.030000,
+                                             969.030029,
+                                             982.830078,
+                                             0.030000,
+                                             0.030000,
+                                             999.030090},
+                                            {984.630005,
+                                             0.030000,
+                                             0.030000,
+                                             1000.230042,
+                                             1015.230103,
+                                             0.030000,
+                                             1015.830078,
+                                             1031.430054,
+                                             0.030000,
+                                             1031.430054,
+                                             1047.630127,
+                                             0.030000,
+                                             1047.030029,
+                                             1063.830078,
+                                             0.030000,
+                                             0.030000,
+                                             1080.030029},
+                                            {509.130005,
+                                             0.030000,
+                                             0.030000,
+                                             517.230042,
+                                             524.430054,
+                                             0.030000,
+                                             525.330078,
+                                             532.830017,
+                                             0.030000,
+                                             533.430054,
+                                             541.230042,
+                                             0.030000,
+                                             541.530029,
+                                             549.630066,
+                                             0.030000,
+                                             0.030000,
+                                             558.030029},
+                                            {549.630066,
+                                             0.030000,
+                                             0.030000,
+                                             557.730042,
+                                             566.430054,
+                                             0.030000,
+                                             565.830078,
+                                             574.830017,
+                                             0.030000,
+                                             573.930054,
+                                             583.230042,
+                                             0.030000,
+                                             582.030029,
+                                             591.630066,
+                                             0.030000,
+                                             0.030000,
+                                             600.030029}},
+
+                                           {{365.740021,
+                                             0.040000,
+                                             0.040000,
+                                             374.440002,
+                                             375.040009,
+                                             0.040000,
+                                             383.140015,
+                                             384.040009,
+                                             0.040000,
+                                             391.839996,
+                                             393.040009,
+                                             0.040000,
+                                             400.540009,
+                                             402.040009,
+                                             0.040000,
+                                             0.040000,
+                                             411.040009},
+                                            {409.240021,
+                                             0.040000,
+                                             0.040000,
+                                             417.940002,
+                                             420.040009,
+                                             0.040000,
+                                             426.640015,
+                                             429.040009,
+                                             0.040000,
+                                             435.339996,
+                                             438.040009,
+                                             0.040000,
+                                             444.040009,
+                                             447.040009,
+                                             0.040000,
+                                             0.040000,
+                                             456.040009},
+                                            {837.039978,
+                                             0.040000,
+                                             0.040000,
+                                             855.040039,
+                                             858.639954,
+                                             0.040000,
+                                             873.039978,
+                                             877.239990,
+                                             0.040000,
+                                             891.039978,
+                                             895.840027,
+                                             0.040000,
+                                             909.039978,
+                                             914.440002,
+                                             0.040000,
+                                             0.040000,
+                                             933.039978},
+                                            {927.039978,
+                                             0.040000,
+                                             0.040000,
+                                             945.040039,
+                                             951.639954,
+                                             0.040000,
+                                             963.039978,
+                                             970.239990,
+                                             0.040000,
+                                             981.039978,
+                                             988.840027,
+                                             0.040000,
+                                             999.039978,
+                                             1007.440002,
+                                             0.040000,
+                                             0.040000,
+                                             1026.040039},
+                                            {1017.039978,
+                                             0.040000,
+                                             0.040000,
+                                             1035.040039,
+                                             1044.640015,
+                                             0.040000,
+                                             1053.040039,
+                                             1063.239990,
+                                             0.040000,
+                                             1071.040039,
+                                             1081.840088,
+                                             0.040000,
+                                             1089.040039,
+                                             1100.440063,
+                                             0.040000,
+                                             0.040000,
+                                             1119.040039},
+                                            {1107.040039,
+                                             0.040000,
+                                             0.040000,
+                                             1125.040039,
+                                             1137.640137,
+                                             0.040000,
+                                             1143.040039,
+                                             1156.239990,
+                                             0.040000,
+                                             1161.040039,
+                                             1174.840088,
+                                             0.040000,
+                                             1179.040039,
+                                             1193.440063,
+                                             0.040000,
+                                             0.040000,
+                                             1212.040039},
+                                            {570.340027,
+                                             0.040000,
+                                             0.040000,
+                                             579.640015,
+                                             585.640015,
+                                             0.040000,
+                                             588.940002,
+                                             595.239990,
+                                             0.040000,
+                                             598.239990,
+                                             604.840027,
+                                             0.040000,
+                                             607.540039,
+                                             614.440002,
+                                             0.040000,
+                                             0.040000,
+                                             624.039978},
+                                            {616.840027,
+                                             0.040000,
+                                             0.040000,
+                                             626.140015,
+                                             633.640015,
+                                             0.040000,
+                                             635.440002,
+                                             643.239990,
+                                             0.040000,
+                                             644.739990,
+                                             652.840027,
+                                             0.040000,
+                                             654.040039,
+                                             662.440002,
+                                             0.040000,
+                                             0.040000,
+                                             672.039978}}}}}));
+            CHECK(approxEq<float, float>(*op->getOutput(0), *expectedOutput));
+        }
+        SECTION("Big test to ensure kernel capabilities") {
+            constexpr DimSize_t batchSize = 1;
+            constexpr DimSize_t inChannels = 3;
+            constexpr DimSize_t outChannels = 4;
+
+            constexpr std::array<DimSize_t, DIM> kernelSize{6, 4};
+
+            constexpr std::array<DimSize_t, DIM> inDataSize{6, 5};
+            constexpr std::array<DimSize_t, DIM> outDataSize{16, 25};
+
+            constexpr std::array<DimSize_t, DIM> stride{1, 3};
+            constexpr std::array<DimSize_t, DIM> dilation{2, 4};
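+
+            // Same output-size formula as in the previous section, with a
+            // larger kernel:
+            // dim 0: (6 - 1) * 1 + 2 * (6 - 1) + 1 = 16
+            // dim 1: (5 - 1) * 3 + 4 * (4 - 1) + 1 = 25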
+
+            auto input = std::make_shared<Tensor>(
+                Array4D<float,
+                        batchSize,
+                        inChannels,
+                        inDataSize[0],
+                        inDataSize[1]>({{{{{1., 2., 3., 4., 5.},
+                                           {6., 7., 8., 9., 10.},
+                                           {11., 12., 13., 14., 15.},
+                                           {16., 17., 18., 19., 20.},
+                                           {21., 22., 23., 24., 25.},
+                                           {26., 27., 28., 29., 30.}},
+
+                                          {{31., 32., 33., 34., 35.},
+                                           {36., 37., 38., 39., 40.},
+                                           {41., 42., 43., 44., 45.},
+                                           {46., 47., 48., 49., 50.},
+                                           {51., 52., 53., 54., 55.},
+                                           {56., 57., 58., 59., 60.}},
+
+                                          {{61., 62., 63., 64., 65.},
+                                           {66., 67., 68., 69., 70.},
+                                           {71., 72., 73., 74., 75.},
+                                           {76., 77., 78., 79., 80.},
+                                           {81., 82., 83., 84., 85.},
+                                           {86., 87., 88., 89., 90.}}}}}));
+
+            auto weights = std::make_shared<Tensor>(Array4D<float,
+                                                            inChannels,
+                                                            outChannels,
+                                                            kernelSize[0],
+                                                            kernelSize[1]>(
+                {{{{{0.100000, 0.200000, 0.300000, 0.400000},
+                    {0.500000, 0.600000, 0.700000, 0.800000},
+                    {0.900000, 1.000000, 1.100000, 1.200000},
+                    {1.300000, 1.400000, 1.500000, 1.600000},
+                    {1.700000, 1.800000, 1.900000, 2.000000},
+                    {2.100000, 2.200000, 2.300000, 2.400000}},
+
+                   {{2.500000, 2.600000, 2.700000, 2.800000},
+                    {2.900000, 3.000000, 3.100000, 3.200000},
+                    {3.300000, 3.400000, 3.500000, 3.600000},
+                    {3.700000, 3.800000, 3.900000, 4.000000},
+                    {4.100000, 4.200000, 4.300000, 4.400000},
+                    {4.500000, 4.600000, 4.700000, 4.800000}},
+
+                   {{4.900000, 5.000000, 5.100000, 5.200000},
+                    {5.300000, 5.400000, 5.500000, 5.600000},
+                    {5.700000, 5.800000, 5.900000, 6.000000},
+                    {6.100000, 6.200000, 6.300000, 6.400000},
+                    {6.500000, 6.600000, 6.700000, 6.800000},
+                    {6.900000, 7.000000, 7.100000, 7.200000}},
+
+                   {{7.300000, 7.400000, 7.500000, 7.600000},
+                    {7.700000, 7.800000, 7.900000, 8.000000},
+                    {8.100000, 8.200000, 8.300000, 8.400001},
+                    {8.500000, 8.600000, 8.700000, 8.800000},
+                    {8.900001, 9.000000, 9.100000, 9.200000},
+                    {9.300000, 9.400001, 9.500000, 9.600000}}},
+
+                  {{{9.700000, 9.800000, 9.900001, 10.000000},
+                    {10.100000, 10.200000, 10.300000, 10.400001},
+                    {10.500000, 10.600000, 10.700000, 10.800000},
+                    {10.900001, 11.000000, 11.100000, 11.200000},
+                    {11.300000, 11.400001, 11.500000, 11.600000},
+                    {11.700000, 11.800000, 11.900001, 12.000000}},
+
+                   {{12.100000, 12.200000, 12.300000, 12.400001},
+                    {12.500000, 12.600000, 12.700000, 12.800000},
+                    {12.900001, 13.000000, 13.100000, 13.200000},
+                    {13.300000, 13.400001, 13.500000, 13.600000},
+                    {13.700000, 13.800000, 13.900001, 14.000000},
+                    {14.100000, 14.200000, 14.300000, 14.400001}},
+
+                   {{14.500000, 14.600000, 14.700000, 14.800000},
+                    {14.900001, 15.000000, 15.100000, 15.200000},
+                    {15.300000, 15.400001, 15.500000, 15.600000},
+                    {15.700000, 15.800000, 15.900001, 16.000000},
+                    {16.100000, 16.200001, 16.300001, 16.400000},
+                    {16.500000, 16.600000, 16.700001, 16.800001}},
+
+                   {{16.900000, 17.000000, 17.100000, 17.200001},
+                    {17.300001, 17.400000, 17.500000, 17.600000},
+                    {17.700001, 17.800001, 17.900000, 18.000000},
+                    {18.100000, 18.200001, 18.300001, 18.400000},
+                    {18.500000, 18.600000, 18.700001, 18.800001},
+                    {18.900000, 19.000000, 19.100000, 19.200001}}},
+
+                  {{{19.300001, 19.400000, 19.500000, 19.600000},
+                    {19.700001, 19.800001, 19.900000, 20.000000},
+                    {20.100000, 20.200001, 20.300001, 20.400000},
+                    {20.500000, 20.600000, 20.700001, 20.800001},
+                    {20.900000, 21.000000, 21.100000, 21.200001},
+                    {21.300001, 21.400000, 21.500000, 21.600000}},
+
+                   {{21.700001, 21.800001, 21.900000, 22.000000},
+                    {22.100000, 22.200001, 22.300001, 22.400000},
+                    {22.500000, 22.600000, 22.700001, 22.800001},
+                    {22.900000, 23.000000, 23.100000, 23.200001},
+                    {23.300001, 23.400000, 23.500000, 23.600000},
+                    {23.700001, 23.800001, 23.900000, 24.000000}},
+
+                   {{24.100000, 24.200001, 24.300001, 24.400000},
+                    {24.500000, 24.600000, 24.700001, 24.800001},
+                    {24.900000, 25.000000, 25.100000, 25.200001},
+                    {25.300001, 25.400000, 25.500000, 25.600000},
+                    {25.700001, 25.800001, 25.900000, 26.000000},
+                    {26.100000, 26.200001, 26.300001, 26.400000}},
+
+                   {{26.500000, 26.600000, 26.700001, 26.800001},
+                    {26.900000, 27.000000, 27.100000, 27.200001},
+                    {27.300001, 27.400000, 27.500000, 27.600000},
+                    {27.700001, 27.800001, 27.900000, 28.000000},
+                    {28.100000, 28.200001, 28.300001, 28.400000},
+                    {28.500000, 28.600000, 28.700001, 28.800001}}}}}));
+
+            auto biases = std::make_shared<Tensor>(
+                Array1D<float, outChannels>({{0.01, 0.02, 0.03, 0.04}}));
+
+            auto op = setupTestConvTranspose<DIM>(batchSize,
+                                                  inChannels,
+                                                  outChannels,
+                                                  kernelSize,
+                                                  inDataSize,
+                                                  stride,
+                                                  dilation,
+                                                  input,
+                                                  weights,
+                                                  biases);
+
+            REQUIRE_NOTHROW(op->forward());
+
+            auto expectedOutput =
+                std::make_shared<Tensor>(Array4D<float,
+                                                 batchSize,
+                                                 outChannels,
+                                                 outDataSize[0],
+                                                 outDataSize[1]>(
+                    {{{{{1478.110107, 0.010000,    0.010000,    1507.210083,
+                         1487.410034, 0.010000,    1536.310059, 1516.809937,
+                         1496.709961, 1565.410034, 1546.209961, 1526.410034,
+                         3100.510010, 1575.609985, 1556.109985, 1536.010010,
+                         1605.010010, 1585.810059, 1566.010010, 0.010000,
+                         1615.510010, 1596.010010, 0.010000,    0.010000,
+                         1626.010010},
+                        {1623.610107, 0.010000,    0.010000,    1652.710083,
+                         1634.410034, 0.010000,    1681.810059, 1663.809937,
+                         1645.209961, 1710.910034, 1693.209961, 1674.910034,
+                         3396.010010, 1722.609985, 1704.610107, 1686.010010,
+                         1752.010010, 1734.310059, 1716.010010, 0.010000,
+                         1764.010010, 1746.010010, 0.010000,    0.010000,
+                         1776.010010},
+                        {3284.410156, 0.010000,    0.010000,    3343.810303,
+                         3306.010010, 0.010000,    3403.210205, 3366.010010,
+                         3327.610107, 3462.610107, 3426.010010, 3388.209961,
+                         6871.209961, 3486.010010, 3448.810059, 3410.409912,
+                         3546.010010, 3509.409912, 3471.610107, 0.010000,
+                         3570.010010, 3532.810059, 0.010000,    0.010000,
+                         3594.010010},
+                        {3581.410156, 0.010000,    0.010000,    3640.810303,
+                         3606.010010, 0.010000,    3700.210205, 3666.010010,
+                         3630.610107, 3759.610107, 3726.010010, 3691.209961,
+                         7474.209961, 3786.010010, 3751.810059, 3716.409912,
+                         3846.010010, 3812.409912, 3777.610107, 0.010000,
+                         3873.010010, 3838.810059, 0.010000,    0.010000,
+                         3900.010010},
+                        {5430.910156,  0.010000,    0.010000,    5521.809570,
+                         5467.809570,  0.010000,    5612.709961, 5559.609863,
+                         5504.709961,  5703.609863, 5651.409668, 5597.409668,
+                         11336.110352, 5743.209961, 5690.109863, 5635.209473,
+                         5835.009766,  5782.809570, 5728.809570, 0.010000,
+                         5875.509766,  5822.409668, 0.010000,    0.010000,
+                         5916.009766},
+                        {5885.410156,  0.010000,    0.010000,    5976.310059,
+                         5926.809570,  0.010000,    6067.209961, 6018.609863,
+                         5968.209961,  6158.110352, 6110.409668, 6060.909668,
+                         12258.610352, 6202.209961, 6153.609375, 6103.209473,
+                         6294.009766,  6246.309570, 6196.809570, 0.010000,
+                         6339.009766,  6290.409668, 0.010000,    0.010000,
+                         6384.009766},
+                        {5578.509766,  0.010000,    0.010000,    5673.009766,
+                         5615.410156,  0.010000,    5767.510254, 5710.809570,
+                         5652.309570,  5862.009766, 5806.209961, 5748.609863,
+                         11645.710938, 5901.609863, 5844.909668, 5786.409668,
+                         5997.009766,  5941.209961, 5883.609375, 0.010000,
+                         6037.509766,  5980.809570, 0.010000,    0.010000,
+                         6078.009766},
+                        {6051.009766,  0.010000,    0.010000,    6145.509766,
+                         6092.410156,  0.010000,    6240.010254, 6187.810059,
+                         6133.809570,  6334.509766, 6283.209961, 6230.109863,
+                         12604.208984, 6378.610352, 6326.409668, 6272.410156,
+                         6474.009766,  6422.709961, 6369.609375, 0.010000,
+                         6519.009766,  6466.809570, 0.010000,    0.010000,
+                         6564.009766},
+                        {5726.109863,  0.010000,    0.010000,    5824.209473,
+                         5763.009766,  0.010000,    5922.309570, 5862.009766,
+                         5799.910156,  6020.409668, 5961.010254, 5899.809570,
+                         11955.309570, 6060.009766, 5999.709961, 5937.609863,
+                         6159.009766,  6099.609863, 6038.409668, 0.010000,
+                         6199.509766,  6139.209961, 0.010000,    0.010000,
+                         6240.009766},
+                        {6216.609863,  0.010000,    0.010000,    6314.709473,
+                         6258.009766,  0.010000,    6412.809570, 6357.009766,
+                         6299.410156,  6510.909668, 6456.010254, 6399.310059,
+                         12949.809570, 6555.009766, 6499.209961, 6441.609863,
+                         6654.009766,  6599.110352, 6542.409668, 0.010000,
+                         6699.009766,  6643.209961, 0.010000,    0.010000,
+                         6744.009766},
+                        {5873.709961,  0.010000,    0.010000,    5975.409668,
+                         5910.609863,  0.010000,    6077.109375, 6013.209473,
+                         5947.509766,  6178.809570, 6115.809570, 6051.009766,
+                         12264.910156, 6218.409668, 6154.510254, 6088.809570,
+                         6321.009766,  6258.009766, 6193.209961, 0.010000,
+                         6361.509766,  6297.610352, 0.010000,    0.010000,
+                         6402.009766},
+                        {6382.209473,  0.010000,    0.010000,    6483.910156,
+                         6423.609863,  0.010000,    6585.609375, 6526.209473,
+                         6465.009766,  6687.309570, 6628.809570, 6568.509766,
+                         13295.410156, 6731.409668, 6672.010254, 6610.810059,
+                         6834.009766,  6775.509766, 6715.209961, 0.010000,
+                         6879.009766,  6819.610352, 0.010000,    0.010000,
+                         6924.009766},
+                        {4320.009766, 0.010000,    0.010000,    4389.009766,
+                         4347.609863, 0.010000,    4458.009766, 4417.209961,
+                         4375.209961, 4527.009766, 4486.809570, 4445.409668,
+                         8998.809570, 4556.409668, 4515.609863, 4473.609863,
+                         4626.009766, 4585.809570, 4544.410156, 0.010000,
+                         4656.009766, 4615.209961, 0.010000,    0.010000,
+                         4686.009766},
+                        {4665.009766, 0.010000,    0.010000,    4734.009766,
+                         4695.609375, 0.010000,    4803.009766, 4765.209961,
+                         4726.209961, 4872.009766, 4834.809570, 4796.409668,
+                         9697.809570, 4904.409668, 4866.609863, 4827.609863,
+                         4974.009766, 4936.809570, 4898.410156, 0.010000,
+                         5007.009766, 4969.209961, 0.010000,    0.010000,
+                         5040.009766},
+                        {2366.110107, 0.010000,    0.010000,    2401.209961,
+                         2381.409912, 0.010000,    2436.310059, 2416.810059,
+                         2396.709961, 2471.410156, 2452.209961, 2432.409912,
+                         4918.509766, 2487.609863, 2468.110107, 2448.010010,
+                         2523.010010, 2503.810059, 2484.010010, 0.010000,
+                         2539.510010, 2520.010010, 0.010000,    0.010000,
+                         2556.010010},
+                        {2541.610107, 0.010000,    0.010000,    2576.710205,
+                         2558.409912, 0.010000,    2611.810059, 2593.810059,
+                         2575.209961, 2646.910156, 2629.209961, 2610.909912,
+                         5274.009766, 2664.609863, 2646.610107, 2628.010010,
+                         2700.010010, 2682.310059, 2664.010010, 0.010000,
+                         2718.010010, 2700.010010, 0.010000,    0.010000,
+                         2736.010010}},
+
+                       {{1701.320068, 0.020000,    0.020000,    1737.620117,
+                         1710.620117, 0.020000,    1773.920044, 1747.220093,
+                         1719.920044, 1810.220093, 1783.820068, 1756.819946,
+                         3575.719971, 1820.420044, 1793.719971, 1766.420044,
+                         1857.020142, 1830.619995, 1803.619995, 0.020000,
+                         1867.520020, 1840.820068, 0.020000,    0.020000,
+                         1878.020020},
+                        {1882.820068, 0.020000,    0.020000,    1919.120117,
+                         1893.620117, 0.020000,    1955.420044, 1930.220093,
+                         1904.420044, 1991.720093, 1966.820068, 1941.319946,
+                         3943.219971, 2003.420044, 1978.219971, 1952.420044,
+                         2040.020142, 2015.119995, 1989.620117, 0.020000,
+                         2052.020020, 2026.820068, 0.020000,    0.020000,
+                         2064.020020},
+                        {3802.820068, 0.020000,    0.020000,    3876.620117,
+                         3824.420166, 0.020000,    3950.420166, 3898.820068,
+                         3846.020020, 4024.220215, 3973.220215, 3921.020020,
+                         7965.620117, 4047.620117, 3996.020020, 3943.219727,
+                         4122.020020, 4071.020020, 4018.820068, 0.020000,
+                         4146.020020, 4094.419922, 0.020000,    0.020000,
+                         4170.020020},
+                        {4171.819824, 0.020000,    0.020000,    4245.620117,
+                         4196.420410, 0.020000,    4319.420410, 4270.819824,
+                         4221.020020, 4393.220215, 4345.220215, 4296.020020,
+                         8712.620117, 4419.620605, 4371.020020, 4321.219727,
+                         4494.020020, 4446.020020, 4396.819824, 0.020000,
+                         4521.020020, 4472.419922, 0.020000,    0.020000,
+                         4548.020020},
+                        {6316.520020,  0.020000,    0.020000,    6429.020020,
+                         6353.420410,  0.020000,    6541.520508, 6466.819824,
+                         6390.319824,  6654.020020, 6580.220215, 6504.620117,
+                         13193.718750, 6693.620605, 6618.919922, 6542.420410,
+                         6807.020020,  6733.220215, 6657.619629, 0.020000,
+                         6847.520020,  6772.819824, 0.020000,    0.020000,
+                         6888.020020},
+                        {6879.020020,  0.020000,    0.020000,    6991.520020,
+                         6920.420410,  0.020000,    7104.020508, 7033.820312,
+                         6961.819824,  7216.520020, 7147.220215, 7076.120117,
+                         14332.218750, 7260.620605, 7190.420410, 7118.420410,
+                         7374.020020,  7304.720215, 7233.619629, 0.020000,
+                         7419.020020,  7348.819824, 0.020000,    0.020000,
+                         7464.020020},
+                        {6464.120117,  0.020000,    0.020000,    6580.219727,
+                         6501.020020,  0.020000,    6696.319824, 6618.020020,
+                         6537.920410,  6812.419922, 6735.020508, 6655.819824,
+                         13503.319336, 6852.020020, 6773.720215, 6693.620117,
+                         6969.020020,  6891.620605, 6812.419922, 0.020000,
+                         7009.520020,  6931.220215, 0.020000,    0.020000,
+                         7050.020020},
+                        {7044.620117,  0.020000,    0.020000,    7160.720215,
+                         7086.020020,  0.020000,    7276.819824, 7203.020020,
+                         7127.420410,  7392.919434, 7320.020508, 7245.320312,
+                         14677.819336, 7437.020020, 7363.220215, 7287.620117,
+                         7554.020020,  7481.120605, 7406.420410, 0.020000,
+                         7599.020020,  7525.220215, 0.020000,    0.020000,
+                         7644.020020},
+                        {6611.719727,  0.020000,    0.020000,    6731.420410,
+                         6648.620117,  0.020000,    6851.119629, 6769.219727,
+                         6685.520020,  6970.819824, 6889.819824, 6807.020020,
+                         13812.919922, 7010.419922, 6928.520508, 6844.819824,
+                         7131.020020,  7050.020020, 6967.220215, 0.020000,
+                         7171.520020,  7089.620605, 0.020000,    0.020000,
+                         7212.020020},
+                        {7210.219727,  0.020000,    0.020000,    7329.920410,
+                         7251.620117,  0.020000,    7449.619629, 7372.220215,
+                         7293.020020,  7569.319824, 7492.819824, 7414.520020,
+                         15023.418945, 7613.419434, 7536.020508, 7456.820312,
+                         7734.020020,  7657.520020, 7579.220215, 0.020000,
+                         7779.020020,  7701.620605, 0.020000,    0.020000,
+                         7824.020020},
+                        {6759.319824,  0.020000,    0.020000,    6882.620117,
+                         6796.219727,  0.020000,    7005.919922, 6920.420410,
+                         6833.120117,  7129.220215, 7044.619629, 6958.219727,
+                         14122.519531, 7168.819824, 7083.319824, 6996.020020,
+                         7293.020020,  7208.419922, 7122.020508, 0.020000,
+                         7333.520020,  7248.020020, 0.020000,    0.020000,
+                         7374.020020},
+                        {7375.819824,  0.020000,    0.020000,    7499.120117,
+                         7417.219727,  0.020000,    7622.420410, 7541.420410,
+                         7458.620117,  7745.720215, 7665.619629, 7583.720215,
+                         15369.019531, 7789.819824, 7708.819824, 7626.020020,
+                         7914.020020,  7833.919434, 7752.020508, 0.020000,
+                         7959.020020,  7878.020020, 0.020000,    0.020000,
+                         8004.020020},
+                        {4982.420410,  0.020000,    0.020000,    5065.819824,
+                         5010.020020,  0.020000,    5149.220215, 5094.020020,
+                         5037.619629,  5232.620605, 5178.020020, 5122.220215,
+                         10381.219727, 5262.020020, 5206.819824, 5150.419922,
+                         5346.020020,  5291.419922, 5235.620117, 0.020000,
+                         5376.020020,  5320.819824, 0.020000,    0.020000,
+                         5406.020020},
+                        {5399.420410,  0.020000,    0.020000,    5482.820312,
+                         5430.020020,  0.020000,    5566.220215, 5514.020020,
+                         5460.619629,  5649.620605, 5598.020020, 5545.220215,
+                         11224.219727, 5682.020020, 5629.819824, 5576.419922,
+                         5766.020020,  5714.419922, 5661.620117, 0.020000,
+                         5799.020020,  5746.819824, 0.020000,    0.020000,
+                         5832.020020},
+                        {2733.320068, 0.020000,    0.020000,    2775.620117,
+                         2748.620117, 0.020000,    2817.920166, 2791.219971,
+                         2763.919922, 2860.220215, 2833.820068, 2806.820068,
+                         5681.720215, 2876.420166, 2849.719971, 2822.419922,
+                         2919.020020, 2892.619873, 2865.620117, 0.020000,
+                         2935.520020, 2908.820068, 0.020000,    0.020000,
+                         2952.020020},
+                        {2944.820068, 0.020000,    0.020000,    2987.120117,
+                         2961.620117, 0.020000,    3029.420166, 3004.220215,
+                         2978.419922, 3071.720215, 3046.820068, 3021.320068,
+                         6109.220215, 3089.420166, 3064.219971, 3038.419922,
+                         3132.020020, 3107.119873, 3081.620117, 0.020000,
+                         3150.020020, 3124.820068, 0.020000,    0.020000,
+                         3168.020020}},
+
+                       {{1924.530029, 0.030000,    0.030000,    1968.030029,
+                         1933.830078, 0.030000,    2011.530029, 1977.630127,
+                         1943.130127, 2055.030029, 2021.430054, 1987.230103,
+                         4050.929932, 2065.230225, 2031.330078, 1996.829956,
+                         2109.030029, 2075.430176, 2041.229980, 0.030000,
+                         2119.530029, 2085.630127, 0.030000,    0.030000,
+                         2130.030029},
+                        {2142.030029, 0.030000,    0.030000,    2185.530029,
+                         2152.830078, 0.030000,    2229.030029, 2196.630127,
+                         2163.630127, 2272.530029, 2240.430176, 2207.729980,
+                         4490.429688, 2284.230225, 2251.830078, 2218.830078,
+                         2328.030029, 2295.930176, 2263.229980, 0.030000,
+                         2340.030029, 2307.629883, 0.030000,    0.030000,
+                         2352.030029},
+                        {4321.229980, 0.030000,    0.030000,    4409.429688,
+                         4342.829590, 0.030000,    4497.629883, 4431.629883,
+                         4364.430176, 4585.829590, 4520.430176, 4453.829590,
+                         9060.030273, 4609.229980, 4543.229980, 4476.029785,
+                         4698.029785, 4632.630371, 4566.029785, 0.030000,
+                         4722.029785, 4656.029785, 0.030000,    0.030000,
+                         4746.029785},
+                        {4762.229980, 0.030000,    0.030000,    4850.429688,
+                         4786.829590, 0.030000,    4938.629883, 4875.629883,
+                         4811.430176, 5026.829590, 4964.430176, 4900.829590,
+                         9951.030273, 5053.229980, 4990.229980, 4926.029785,
+                         5142.029785, 5079.630371, 5016.029785, 0.030000,
+                         5169.029785, 5106.029785, 0.030000,    0.030000,
+                         5196.029785},
+                        {7202.129883,  0.030000,    0.030000,    7336.229492,
+                         7239.029785,  0.030000,    7470.329590, 7374.029785,
+                         7275.930176,  7604.429688, 7509.030273, 7411.829590,
+                         15051.330078, 7644.029785, 7547.729980, 7449.629883,
+                         7779.029785,  7683.630371, 7586.430176, 0.030000,
+                         7819.529785,  7723.229980, 0.030000,    0.030000,
+                         7860.029785},
+                        {7872.629883,  0.030000,    0.030000,    8006.729980,
+                         7914.029785,  0.030000,    8140.829590, 8049.029785,
+                         7955.430176,  8274.929688, 8184.030273, 8091.330078,
+                         16405.830078, 8319.030273, 8227.230469, 8133.629883,
+                         8454.030273,  8363.130859, 8270.430664, 0.030000,
+                         8499.030273,  8407.230469, 0.030000,    0.030000,
+                         8544.030273},
+                        {7349.729492,  0.030000,    0.030000,    7487.430176,
+                         7386.629883,  0.030000,    7625.129395, 7525.229980,
+                         7423.529785,  7762.829590, 7663.829590, 7563.029785,
+                         15360.929688, 7802.429688, 7702.530273, 7600.829590,
+                         7941.029785,  7842.029785, 7741.229980, 0.030000,
+                         7981.529785,  7881.630371, 0.030000,    0.030000,
+                         8022.029785},
+                        {8038.229492,  0.030000,    0.030000,    8175.930176,
+                         8079.629883,  0.030000,    8313.629883, 8218.230469,
+                         8121.029785,  8451.330078, 8356.830078, 8260.530273,
+                         16751.427734, 8495.429688, 8400.030273, 8302.831055,
+                         8634.030273,  8539.530273, 8443.230469, 0.030000,
+                         8679.030273,  8583.630859, 0.030000,    0.030000,
+                         8724.030273},
+                        {7497.329590,  0.030000,    0.030000,    7638.629883,
+                         7534.229492,  0.030000,    7779.930176, 7676.430176,
+                         7571.130371,  7921.229980, 7818.629395, 7714.229980,
+                         15670.530273, 7960.829590, 7857.329590, 7752.029785,
+                         8103.029785,  8000.429688, 7896.030273, 0.030000,
+                         8143.529785,  8040.029785, 0.030000,    0.030000,
+                         8184.029785},
+                        {8203.830078,  0.030000,    0.030000,    8345.129883,
+                         8245.229492,  0.030000,    8486.430664, 8387.430664,
+                         8286.630859,  8627.730469, 8529.629883, 8429.730469,
+                         17097.029297, 8671.830078, 8572.830078, 8472.030273,
+                         8814.030273,  8715.930664, 8616.030273, 0.030000,
+                         8859.030273,  8760.030273, 0.030000,    0.030000,
+                         8904.030273},
+                        {7644.930176,  0.030000,    0.030000,    7789.829590,
+                         7681.829590,  0.030000,    7934.729980, 7827.629883,
+                         7718.729980,  8079.630371, 7973.430176, 7865.430176,
+                         15980.130859, 8119.229980, 8012.129395, 7903.229980,
+                         8265.030273,  8158.830566, 8050.829590, 0.030000,
+                         8305.530273,  8198.430664, 0.030000,    0.030000,
+                         8346.030273},
+                        {8369.430664,  0.030000,    0.030000,    8514.331055,
+                         8410.830078,  0.030000,    8659.230469, 8556.629883,
+                         8452.231445,  8804.130859, 8702.430664, 8598.930664,
+                         17442.628906, 8848.230469, 8745.629883, 8641.230469,
+                         8994.030273,  8892.331055, 8788.830078, 0.030000,
+                         9039.030273,  8936.430664, 0.030000,    0.030000,
+                         9084.030273},
+                        {5644.829590,  0.030000,    0.030000,    5742.629883,
+                         5672.430176,  0.030000,    5840.430176, 5770.830078,
+                         5700.029785,  5938.229980, 5869.229980, 5799.029785,
+                         11763.630859, 5967.630371, 5898.029785, 5827.229980,
+                         6066.029785,  5997.029785, 5926.829590, 0.030000,
+                         6096.029785,  6026.430176, 0.030000,    0.030000,
+                         6126.029785},
+                        {6133.829590,  0.030000,    0.030000,    6231.629883,
+                         6164.430176,  0.030000,    6329.430176, 6262.830078,
+                         6195.029785,  6427.229980, 6361.229980, 6294.029785,
+                         12750.630859, 6459.630371, 6393.029785, 6325.229980,
+                         6558.029785,  6492.029785, 6424.829590, 0.030000,
+                         6591.029785,  6524.430176, 0.030000,    0.030000,
+                         6624.029785},
+                        {3100.530029, 0.030000,    0.030000,    3150.030029,
+                         3115.830078, 0.030000,    3199.530029, 3165.630127,
+                         3131.130127, 3249.030029, 3215.430176, 3181.230225,
+                         6444.930176, 3265.230225, 3231.330078, 3196.830078,
+                         3315.030029, 3281.430176, 3247.230225, 0.030000,
+                         3331.530029, 3297.630127, 0.030000,    0.030000,
+                         3348.030029},
+                        {3348.030029, 0.030000,    0.030000,    3397.530029,
+                         3364.830078, 0.030000,    3447.030029, 3414.630127,
+                         3381.630127, 3496.530029, 3464.430176, 3431.730225,
+                         6944.430176, 3514.230225, 3481.830078, 3448.830078,
+                         3564.030029, 3531.930176, 3499.230225, 0.030000,
+                         3582.030029, 3549.630127, 0.030000,    0.030000,
+                         3600.030029}},
+
+                       {{2147.739990, 0.040000,    0.040000,    2198.439941,
+                         2157.040039, 0.040000,    2249.140137, 2208.040039,
+                         2166.340088, 2299.840088, 2259.040039, 2217.640137,
+                         4526.140137, 2310.040039, 2268.940186, 2227.240234,
+                         2361.040039, 2320.240234, 2278.840088, 0.040000,
+                         2371.540039, 2330.440186, 0.040000,    0.040000,
+                         2382.040039},
+                        {2401.239990, 0.040000,    0.040000,    2451.939941,
+                         2412.040039, 0.040000,    2502.640137, 2463.040039,
+                         2422.840088, 2553.340088, 2514.040039, 2474.140137,
+                         5037.640137, 2565.040039, 2525.440186, 2485.240234,
+                         2616.040039, 2576.740234, 2536.840088, 0.040000,
+                         2628.040039, 2588.440186, 0.040000,    0.040000,
+                         2640.040039},
+                        {4839.640137,  0.040000,    0.040000,    4942.240234,
+                         4861.240234,  0.040000,    5044.839844, 4964.439941,
+                         4882.839844,  5147.440430, 5067.640137, 4986.640137,
+                         10154.440430, 5170.839844, 5090.440430, 5008.840332,
+                         5274.040039,  5194.240234, 5113.240234, 0.040000,
+                         5298.040039,  5217.640625, 0.040000,    0.040000,
+                         5322.040039},
+                        {5352.640137,  0.040000,    0.040000,    5455.240234,
+                         5377.240234,  0.040000,    5557.839844, 5480.439941,
+                         5401.839844,  5660.440430, 5583.640137, 5505.640137,
+                         11189.439453, 5686.839844, 5609.440430, 5530.840332,
+                         5790.040039,  5713.240234, 5635.240234, 0.040000,
+                         5817.040039,  5739.640625, 0.040000,    0.040000,
+                         5844.040039},
+                        {8087.740234,  0.040000,    0.040000,    8243.440430,
+                         8124.640625,  0.040000,    8399.139648, 8281.240234,
+                         8161.540039,  8554.840820, 8437.839844, 8319.040039,
+                         16908.937500, 8594.440430, 8476.540039, 8356.840820,
+                         8751.040039,  8634.040039, 8515.240234, 0.040000,
+                         8791.540039,  8673.640625, 0.040000,    0.040000,
+                         8832.040039},
+                        {8866.240234,  0.040000,    0.040000,    9021.940430,
+                         8907.640625,  0.040000,    9177.639648, 9064.240234,
+                         8949.040039,  9333.340820, 9220.839844, 9106.540039,
+                         18479.437500, 9377.440430, 9264.040039, 9148.840820,
+                         9534.040039,  9421.540039, 9307.240234, 0.040000,
+                         9579.040039,  9465.640625, 0.040000,    0.040000,
+                         9624.040039},
+                        {8235.339844,  0.040000,    0.040000,    8394.639648,
+                         8272.240234,  0.040000,    8553.940430, 8432.440430,
+                         8309.140625,  8713.240234, 8592.639648, 8470.240234,
+                         17218.539062, 8752.840820, 8631.339844, 8508.040039,
+                         8913.040039,  8792.440430, 8670.040039, 0.040000,
+                         8953.540039,  8832.040039, 0.040000,    0.040000,
+                         8994.040039},
+                        {9031.839844,  0.040000,    0.040000,    9191.139648,
+                         9073.240234,  0.040000,    9350.440430, 9233.440430,
+                         9114.640625,  9509.740234, 9393.639648, 9275.740234,
+                         18825.039062, 9553.839844, 9436.839844, 9318.040039,
+                         9714.040039,  9597.940430, 9480.040039, 0.040000,
+                         9759.040039,  9642.040039, 0.040000,    0.040000,
+                         9804.040039},
+                        {8382.940430,  0.040000,    0.040000,    8545.840820,
+                         8419.839844,  0.040000,    8708.740234, 8583.639648,
+                         8456.740234,  8871.640625, 8747.440430, 8621.440430,
+                         17528.138672, 8911.240234, 8786.139648, 8659.240234,
+                         9075.040039,  8950.840820, 8824.839844, 0.040000,
+                         9115.540039,  8990.440430, 0.040000,    0.040000,
+                         9156.040039},
+                        {9197.440430,  0.040000,    0.040000,    9360.340820,
+                         9238.839844,  0.040000,    9523.240234, 9402.639648,
+                         9280.240234,  9686.140625, 9566.440430, 9444.940430,
+                         19170.638672, 9730.240234, 9609.639648, 9487.240234,
+                         9894.040039,  9774.339844, 9652.839844, 0.040000,
+                         9939.040039,  9818.440430, 0.040000,    0.040000,
+                         9984.040039},
+                        {8530.540039,  0.040000,    0.040000,    8697.040039,
+                         8567.440430,  0.040000,    8863.540039, 8734.840820,
+                         8604.339844,  9030.040039, 8902.240234, 8772.639648,
+                         17837.740234, 9069.640625, 8940.940430, 8810.440430,
+                         9237.040039,  9109.240234, 8979.639648, 0.040000,
+                         9277.540039,  9148.840820, 0.040000,    0.040000,
+                         9318.040039},
+                        {9363.040039,  0.040000,    0.040000,    9529.540039,
+                         9404.440430,  0.040000,    9696.040039, 9571.840820,
+                         9445.839844,  9862.540039, 9739.240234, 9614.139648,
+                         19516.240234, 9906.640625, 9782.440430, 9656.440430,
+                         10074.040039, 9950.740234, 9825.639648, 0.040000,
+                         10119.040039, 9994.839844, 0.040000,    0.040000,
+                         10164.040039},
+                        {6307.240234,  0.040000,    0.040000,    6419.439941,
+                         6334.839844,  0.040000,    6531.640137, 6447.640137,
+                         6362.440430,  6643.839844, 6560.440430, 6475.840332,
+                         13146.040039, 6673.240234, 6589.240234, 6504.040039,
+                         6786.040039,  6702.640625, 6618.040039, 0.040000,
+                         6816.040039,  6732.040039, 0.040000,    0.040000,
+                         6846.040039},
+                        {6868.240234,  0.040000,    0.040000,    6980.439941,
+                         6898.839844,  0.040000,    7092.640137, 7011.640137,
+                         6929.440430,  7204.839844, 7124.440430, 7042.840332,
+                         14277.040039, 7237.240234, 7156.240234, 7074.040039,
+                         7350.040039,  7269.640625, 7188.040039, 0.040000,
+                         7383.040039,  7302.040039, 0.040000,    0.040000,
+                         7416.040039},
+                        {3467.739990, 0.040000,    0.040000,    3524.439941,
+                         3483.040039, 0.040000,    3581.140137, 3540.040039,
+                         3498.340088, 3637.840088, 3597.040039, 3555.640137,
+                         7208.140137, 3654.040039, 3612.940186, 3571.240234,
+                         3711.040039, 3670.240234, 3628.840088, 0.040000,
+                         3727.540039, 3686.440186, 0.040000,    0.040000,
+                         3744.040039},
+                        {3751.239990, 0.040000,    0.040000,    3807.939941,
+                         3768.040039, 0.040000,    3864.640137, 3825.040039,
+                         3784.840088, 3921.340088, 3882.040039, 3842.140137,
+                         7779.640137, 3939.040039, 3899.440186, 3859.240234,
+                         3996.040039, 3956.740234, 3916.840088, 0.040000,
+                         4014.040039, 3974.440186, 0.040000,    0.040000,
+                         4032.040039}}}}}));
+            CHECK(approxEq<float, float>(*op->getOutput(0), *expectedOutput));
+        }
+    }
+}
+
+} // namespace Aidge
-- 
GitLab
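
The CHECK above compares the computed output against the expected tensor
element-wise through approxEq. A minimal sketch of such a tolerance
comparison, written as a standalone helper with illustrative default
thresholds (not Aidge's actual implementation or defaults):

    #include <cmath>
    #include <cstddef>

    // Approximate element-wise equality: passes when, for every element,
    // |computed - expected| <= absolute + relative * |expected|.
    template <typename T>
    bool approxEqSketch(const T *computed, const T *expected, std::size_t size,
                        double relative = 1e-5, double absolute = 1e-8) {
        for (std::size_t i = 0; i < size; ++i) {
            const double diff = std::fabs(static_cast<double>(computed[i]) -
                                          static_cast<double>(expected[i]));
            if (diff > absolute + relative *
                           std::fabs(static_cast<double>(expected[i]))) {
                return false; // first out-of-tolerance element fails the check
            }
        }
        return true;
    }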


From a24858233c8483b66dc31d891b4b39a54ff0d45d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gr=C3=A9goire=20KUBLER?= <gregoire.kubler@proton.me>
Date: Mon, 27 Jan 2025 16:18:23 +0100
Subject: [PATCH 052/108] chore : cleanup headers

---
 include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp | 8 ++------
 src/operator/ConvImpl.cpp                               | 5 -----
 unit_tests/operator/Test_ClipImpl.cpp                   | 4 ++--
 unit_tests/operator/Test_ConvImpl.cpp                   | 1 +
 4 files changed, 5 insertions(+), 13 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
index 70377260..4e1861b4 100644
--- a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
@@ -13,16 +13,12 @@
 #define AIDGE_CPU_OPERATOR_CONVIMPL_KERNELS_H_
 
 #include <array>
-#include <memory>
-#include <tuple>
-#include <vector>
+#include <cstdint>
+#include <fmt/base.h>
 
-#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
 #include "aidge/backend/cpu/operator/ConvImpl.hpp"
-#include "aidge/operator/Conv.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
-#include "aidge/backend/cpu/data/GetCPUPtr.h"
 
 namespace Aidge {
 using std::array;
diff --git a/src/operator/ConvImpl.cpp b/src/operator/ConvImpl.cpp
index 782a58d3..ffbb75c1 100644
--- a/src/operator/ConvImpl.cpp
+++ b/src/operator/ConvImpl.cpp
@@ -13,14 +13,9 @@
 #include "aidge/backend/cpu/operator/ConvImpl_kernels.hpp"
 
 #include <cassert>
-#include <chrono>  // std::chrono::milliseconds
-#include <numeric> // std::accumulate
-#include <thread>  // std::this_thread::sleep_for
-#include <vector>
 
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
 #include "aidge/operator/Conv.hpp"
-#include "aidge/utils/Types.h"
 
 namespace Aidge {
 
diff --git a/unit_tests/operator/Test_ClipImpl.cpp b/unit_tests/operator/Test_ClipImpl.cpp
index 99147ac9..3d75ad78 100644
--- a/unit_tests/operator/Test_ClipImpl.cpp
+++ b/unit_tests/operator/Test_ClipImpl.cpp
@@ -315,5 +315,5 @@ TEST_CASE("[cpu/operator] Clip", "[Clip][CPU]")
         Log::info("total time: {}\n", duration.count());
     }
  }
-} // namespace Aidge
-}
\ No newline at end of file
+}
+}  // namespace Aidge
diff --git a/unit_tests/operator/Test_ConvImpl.cpp b/unit_tests/operator/Test_ConvImpl.cpp
index 69e806cb..59ec16dd 100644
--- a/unit_tests/operator/Test_ConvImpl.cpp
+++ b/unit_tests/operator/Test_ConvImpl.cpp
@@ -17,6 +17,7 @@
 #include "aidge/backend/cpu/operator/ConvImpl.hpp"
 #include "aidge/data/Data.hpp"  // DataType
 #include "aidge/data/Tensor.hpp"
+#include "aidge/filler/Filler.hpp"
 #include "aidge/graph/Node.hpp"
 #include "aidge/operator/Conv.hpp"
 #include "aidge/utils/TensorUtils.hpp"
-- 
GitLab


From 7bd9c550694f560301abe4bd04aaeb1c81fd8cea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gr=C3=A9goire=20KUBLER?= <gregoire.kubler@proton.me>
Date: Tue, 25 Feb 2025 13:31:12 +0000
Subject: [PATCH 053/108] chore : conv forward 1/2D formatting

---
 .../aidge/backend/cpu/operator/ConvImpl.hpp   |   1 +
 .../backend/cpu/operator/ConvImpl_kernels.hpp | 249 ++++++++++++------
 src/operator/ConvImpl.cpp                     |  21 +-
 3 files changed, 177 insertions(+), 94 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/ConvImpl.hpp b/include/aidge/backend/cpu/operator/ConvImpl.hpp
index 8bf11ac0..e480697b 100644
--- a/include/aidge/backend/cpu/operator/ConvImpl.hpp
+++ b/include/aidge/backend/cpu/operator/ConvImpl.hpp
@@ -20,6 +20,7 @@
 #include "aidge/utils/Types.h"
 
 namespace Aidge {
+
 // Operator implementation entry point for the backend
 using Conv1D_Op = Conv_Op<1>;
 using ConvImpl1D_cpu = OperatorImpl_cpu<Conv_Op<1>,
diff --git a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
index 4e1861b4..274f5f4f 100644
--- a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
@@ -37,16 +37,15 @@ using std::array;
  * @param output_ Output Tensor.
  */
 template <class I, class W, class B, class O>
-void ConvImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& strideDims,
-                            const std::array<DimSize_t, 1>& dilationDims,
-                            const std::array<DimSize_t, 1>& kernelDims,
-                            const std::array<DimSize_t, 3>& inputDims,
-                            DimSize_t outChannels,
-                            const void *input_,
-                            const void *weights_,
-                            const void *biases_,
-                            void *output_)
-{
+void ConvImpl1D_cpu_forward_kernel(const array<DimSize_t, 1> &strideDim,
+                                   const array<DimSize_t, 1> &dilationDim,
+                                   const array<DimSize_t, 1> &kernelDim,
+                                   const std::array<DimSize_t, 3> &inputDims,
+                                   DimSize_t outChannels,
+                                   const void *input_,
+                                   const void *weights_,
+                                   const void *biases_,
+                                   void *output_) {
     // FIXME: missing convolution attributes as arguments
     const I *input = static_cast<const I *>(input_);
     const W *weights = static_cast<const W *>(weights_);
@@ -54,34 +53,51 @@ void ConvImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& strideDims,
     O *output = static_cast<O *>(output_);
 
     // output H size
-    const std::size_t oxSize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilationDims[0]*(kernelDims[0] - 1) - 1 + strideDims[0]) /
-                                static_cast<float>(strideDims[0])));
-    const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1;
+    const std::size_t oxSize = static_cast<std::size_t>(std::floor(
+        static_cast<float>(inputDims[2] - dilationDim[0] * (kernelDim[0] - 1) -
+                           1 + strideDim[0]) /
+        static_cast<float>(strideDim[0])));
+    const DimSize_t dilated_kernel_x = dilationDim[0] * (kernelDim[0] - 1) + 1;
 
-    // TODO: kernel computation
-    // output (batch, outCh, Xout, Yout)
-    // input  (batch, inCh, Xin, Yin)
-    // weight (outCh, inCh, kernelX, kernelY)
-    // does not take Dilation attribute into account
     using signedsize = std::make_signed<std::size_t>::type;
     for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
         for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
-            const std::size_t oIndex = (outCh + batch*outChannels) * oxSize;
+            const std::size_t oIndex = (outCh + batch * outChannels) * oxSize;
             // If bias = nullptr, set B(0)
             B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
-            std::fill(output + oIndex, output+(oIndex+oxSize), biasVal);
+            std::fill(output + oIndex, output + (oIndex + oxSize), biasVal);
             for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
-                const std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2];
-                const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0];
+                const std::size_t iIndex =
+                    (inCh + batch * inputDims[1]) * inputDims[2];
+                const std::size_t wIndex =
+                    (inCh + outCh * inputDims[1]) * kernelDim[0];
                 for (std::size_t ox = 0; ox < oxSize; ++ox) {
-                    // const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]);
-                    // const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
-                    // const std::size_t sxMax = (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 : ((inputDims[2] + difx) > kernelDims[0] ? kernelDims[0] : inputDims[2] + difx);
+                    // const signedsize difx = static_cast<signedsize>(- ox *
+                    // strideDim[0]); const std::size_t sxMin =
+                    // static_cast<std::size_t>(std::max(difx, signedsize(0)));
+                    // const std::size_t sxMax =
+                    // (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 :
+                    // ((inputDims[2] + difx) > kernelDim[0] ? kernelDim[0]
+                    // : inputDims[2] + difx);
                     const std::size_t sxMin = 0;
                     const std::size_t sxMax = dilated_kernel_x;
                     const std::size_t oIndexFull = oIndex + ox;
-                    const signedsize ix = static_cast<signedsize>(ox * strideDims[0]);
+                    const signedsize ix =
+                        static_cast<signedsize>(ox * strideDim[0]);
+
+                    for (std::size_t sx = sxMin; sx * dilationDim[0] < sxMax;
+                         ++sx) {
+                        output[oIndexFull] +=
+                            weights[wIndex + sx] *
+                            input[iIndex + static_cast<std::size_t>(
+                                               ix + static_cast<signedsize>(
+                                                        sx * dilationDim[0]))];
+                    }
+                }
+            }
+        }
+    }
+}
 
 /**
  * @brief perform 1D backpropagation for the data input
@@ -119,9 +135,9 @@ void ConvImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& strideDims,
  * @param[inout] iGrad gradients of the input to update
  */
 template <class I, class W, class O>
-void conv1DBackwardInput(const DimSize_t &stride,
-                         const DimSize_t &dilation,
-                         const DimSize_t &kDim,
+void conv1DBackwardInput(const array<DimSize_t, 1> &stride,
+                         const array<DimSize_t, 1> &dilation,
+                         const array<DimSize_t, 1> &kDim,
                          const array<DimSize_t, 2> &kStrides,
                          const W *weights,
                          const array<DimSize_t, 3> &oDims,
@@ -434,16 +450,15 @@ REGISTRAR(ConvImpl1D_cpu,
  * @param output_ Output Tensor.
  */
 template <class I, class W, class B, class O>
-void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
-                            const std::array<DimSize_t, 2>& dilationDims,
-                            const std::array<DimSize_t, 2>& kernelDims,
-                            const std::array<DimSize_t, 4> &inputDims,
-                            DimSize_t outChannels,
-                            const void *input_,
-                            const void *weights_,
-                            const void *biases_,
-                            void *output_)
-{
+void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
+                                   const array<DimSize_t, 2> &dilationDims,
+                                   const array<DimSize_t, 2> &kernelDims,
+                                   const array<DimSize_t, 4> &inputDims,
+                                   DimSize_t outChannels,
+                                   const void *input_,
+                                   const void *weights_,
+                                   const void *biases_,
+                                   void *output_) {
     // FIXME: missing convolution attributes as arguments
     const I *input = static_cast<const I *>(input_);
     const W *weights = static_cast<const W *>(weights_);
@@ -451,59 +466,102 @@ void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
     O *output = static_cast<O *>(output_);
 
     // output H size
-    const DimSize_t dilated_kernel_x = dilationDims[0]*(kernelDims[0] - 1) + 1;
-    const std::size_t oxSize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[2] - dilated_kernel_x + strideDims[0]) /
-                                static_cast<float>(strideDims[0])));
+    const DimSize_t dilated_kernel_x =
+        dilationDims[0] * (kernelDims[0] - 1) + 1;
+    const std::size_t oxSize = static_cast<std::size_t>(std::floor(
+        static_cast<float>(inputDims[2] - dilated_kernel_x + strideDims[0]) /
+        static_cast<float>(strideDims[0])));
     // output W size
-    const DimSize_t dilated_kernel_y = dilationDims[1]*(kernelDims[1] - 1) + 1;
-    const std::size_t oySize =
-            static_cast<std::size_t>(std::floor(static_cast<float>(inputDims[3] - dilated_kernel_y + strideDims[1]) /
-                                static_cast<float>(strideDims[1])));
-
+    const DimSize_t dilated_kernel_y =
+        dilationDims[1] * (kernelDims[1] - 1) + 1;
+    const std::size_t oySize = static_cast<std::size_t>(std::floor(
+        static_cast<float>(inputDims[3] - dilated_kernel_y + strideDims[1]) /
+        static_cast<float>(strideDims[1])));
 
     // TODO: kernel computation
     // output (batch, outCh, Xout, Yout)
     // input  (batch, inCh, Xin, Yin)
     // weight (outCh, inCh, kernelX, kernelY)
     // does not take Dilation attribute into account
-    const std::size_t outChannels_s =  oxSize * oySize;
+    const std::size_t outChannels_s = oxSize * oySize;
 
     if (dilated_kernel_x == 3 && dilated_kernel_y == 3) {
         for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
             for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
                 // If bias = nullptr, set B(0)
                 B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
-                std::fill(output, output+outChannels_s, biasVal);
+                std::fill(output, output + outChannels_s, biasVal);
                 for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
-                    std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
-                    const std::size_t wIndex = (inCh + outCh*inputDims[1]) * 9;
-                    if (strideDims[0] == 1 && strideDims[1]==1) {
-                        for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=inputDims[3]) {
+                    std::size_t iIndex = (inCh + batch * inputDims[1]) *
+                                         inputDims[2] * inputDims[3];
+                    const std::size_t wIndex =
+                        (inCh + outCh * inputDims[1]) * 9;
+                    if (strideDims[0] == 1 && strideDims[1] == 1) {
+                        for (std::size_t ox = 0, oIndex = 0; ox < oxSize;
+                             ++ox, oIndex += oySize, iIndex -= inputDims[3]) {
                             for (std::size_t oy = 0; oy < oySize; ++oy) {
-                                output[oIndex + oy] += weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+1]+weights[wIndex+2]*input[iIndex+oy+2];
+                                output[oIndex + oy] +=
+                                    weights[wIndex + 0] * input[iIndex + oy] +
+                                    weights[wIndex + 1] *
+                                        input[iIndex + oy + 1] +
+                                    weights[wIndex + 2] *
+                                        input[iIndex + oy + 2];
                             }
-                            iIndex+=inputDims[3];
+                            iIndex += inputDims[3];
                             for (std::size_t oy = 0; oy < oySize; ++oy) {
-                                output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy]+weights[wIndex+4]*input[iIndex+oy+1]+weights[wIndex+5]*input[iIndex+oy+2];
+                                output[oIndex + oy] +=
+                                    weights[wIndex + 3] * input[iIndex + oy] +
+                                    weights[wIndex + 4] *
+                                        input[iIndex + oy + 1] +
+                                    weights[wIndex + 5] *
+                                        input[iIndex + oy + 2];
                             }
-                            iIndex+=inputDims[3];
+                            iIndex += inputDims[3];
                             for (std::size_t oy = 0; oy < oySize; ++oy) {
-                                output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy]+weights[wIndex+7]*input[iIndex+oy+1]+weights[wIndex+8]*input[iIndex+oy+2];
+                                output[oIndex + oy] +=
+                                    weights[wIndex + 6] * input[iIndex + oy] +
+                                    weights[wIndex + 7] *
+                                        input[iIndex + oy + 1] +
+                                    weights[wIndex + 8] *
+                                        input[iIndex + oy + 2];
                             }
                         }
                     } else {
-                        for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=(strideDims[0]-2)*inputDims[3]) {
+                        for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox,
+                                         oIndex += oySize,
+                                         iIndex += (strideDims[0] -
+                                                    2) * inputDims[3]) {
                             for (std::size_t oy = 0; oy < oySize; ++oy) {
-                                output[oIndex + oy] += weights[wIndex+0]*input[iIndex+oy*strideDims[1]]+weights[wIndex+1]*input[iIndex+oy*strideDims[1]+1]+weights[wIndex+2]*input[iIndex+oy*strideDims[1]+2];
+                                output[oIndex + oy] +=
+                                    weights[wIndex + 0] *
+                                        input[iIndex + oy * strideDims[1]] +
+                                    weights[wIndex + 1] *
+                                        input[iIndex + oy * strideDims[1] +
+                                              1] +
+                                    weights[wIndex + 2] *
+                                        input[iIndex + oy * strideDims[1] + 2];
                             }
-                            iIndex+=inputDims[3];
+                            iIndex += inputDims[3];
                             for (std::size_t oy = 0; oy < oySize; ++oy) {
-                                output[oIndex + oy] += weights[wIndex+3]*input[iIndex+oy*strideDims[1]]+weights[wIndex+4]*input[iIndex+oy*strideDims[1]+1]+weights[wIndex+5]*input[iIndex+oy*strideDims[1]+2];
+                                output[oIndex + oy] +=
+                                    weights[wIndex + 3] *
+                                        input[iIndex + oy * strideDims[1]] +
+                                    weights[wIndex + 4] *
+                                        input[iIndex + oy * strideDims[1] +
+                                              1] +
+                                    weights[wIndex + 5] *
+                                        input[iIndex + oy * strideDims[1] + 2];
                             }
-                            iIndex+=inputDims[3];
+                            iIndex += inputDims[3];
                             for (std::size_t oy = 0; oy < oySize; ++oy) {
-                                output[oIndex + oy] += weights[wIndex+6]*input[iIndex+oy*strideDims[1]]+weights[wIndex+7]*input[iIndex+oy*strideDims[1]+1]+weights[wIndex+8]*input[iIndex+oy*strideDims[1]+2];
+                                output[oIndex + oy] +=
+                                    weights[wIndex + 6] *
+                                        input[iIndex + oy * strideDims[1]] +
+                                    weights[wIndex + 7] *
+                                        input[iIndex + oy * strideDims[1] +
+                                              1] +
+                                    weights[wIndex + 8] *
+                                        input[iIndex + oy * strideDims[1] + 2];
                             }
                         }
                     }
@@ -516,18 +574,26 @@ void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
             for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
                 // If bias = nullptr, set B(0)
                 B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
-                std::fill(output, output+outChannels_s, biasVal);
+                std::fill(output, output + outChannels_s, biasVal);
                 for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
-                    std::size_t iIndex = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
-                    const std::size_t wIndex = (inCh + outCh*inputDims[1]);
+                    std::size_t iIndex = (inCh + batch * inputDims[1]) *
+                                         inputDims[2] * inputDims[3];
+                    const std::size_t wIndex = (inCh + outCh * inputDims[1]);
                     if (strideDims[0] == 1 && strideDims[1] == 1) {
-                        for (std::size_t oIndex = 0; oIndex < oxSize*oySize; ++oIndex, ++iIndex) {
+                        for (std::size_t oIndex = 0; oIndex < oxSize * oySize;
+                             ++oIndex, ++iIndex) {
                             output[oIndex] += weights[wIndex] * input[iIndex];
                         }
-                    } else  {
-                        for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=inputDims[3]*strideDims[0]) {
-                            for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) {
-                                output[oIndex + oy] += weights[wIndex+0]*input[iIndex+iy];
+                    } else {
+                        for (std::size_t ox = 0, oIndex = 0; ox < oxSize;
+                             ++ox,
+                                         oIndex += oySize,
+                                         iIndex +=
+                                         inputDims[3] * strideDims[0]) {
+                            for (std::size_t oy = 0, iy = 0; oy < oySize;
+                                 ++oy, iy += strideDims[1]) {
+                                output[oIndex + oy] +=
+                                    weights[wIndex + 0] * input[iIndex + iy];
                             }
                         }
                     }
@@ -540,21 +606,36 @@ void ConvImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
             for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
                 // If bias = nullptr, set B(0)
                 B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
-                std::fill(output, output+outChannels_s, biasVal);
+                std::fill(output, output + outChannels_s, biasVal);
                 for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
-                    std::size_t iIndex_channel = (inCh + batch*inputDims[1]) * inputDims[2] * inputDims[3];
-                    const std::size_t wIndex = (inCh + outCh*inputDims[1]) * kernelDims[0] * kernelDims[1];
+                    std::size_t iIndex_channel =
+                        (inCh + batch * inputDims[1]) * inputDims[2] *
+                        inputDims[3];
+                    const std::size_t wIndex = (inCh + outCh * inputDims[1]) *
+                                               kernelDims[0] * kernelDims[1];
 
                     // loop over each ouput line
-                    for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex_channel+=inputDims[3]*strideDims[0]) {
+                    for (std::size_t ox = 0, oIndex = 0; ox < oxSize;
+                         ++ox,
+                                     oIndex += oySize,
+                                     iIndex_channel +=
+                                     inputDims[3] * strideDims[0]) {
                         // loop over associated input line
-                        for (std::size_t ky = 0, ix = 0; ky < kernelDims[0]; ++ky, ix += inputDims[3]*dilationDims[0]) {
+                        for (std::size_t ky = 0, ix = 0; ky < kernelDims[0];
+                             ++ky, ix += inputDims[3] * dilationDims[0]) {
                             // loop over the entire line
-                            for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) {
-                                const std::size_t iIndex = iIndex_channel + ix + iy;
-                                // loop over elements assosicated with one output
-                                for (std::size_t kx = 0;  kx < kernelDims[0]; ++kx) {
-                                    output[oIndex + oy] += weights[wIndex+kernelDims[0]*ky+kx]*input[iIndex+kx*dilationDims[1]];
+                            for (std::size_t oy = 0, iy = 0; oy < oySize;
+                                 ++oy, iy += strideDims[1]) {
+                                const std::size_t iIndex =
+                                    iIndex_channel + ix + iy;
+                                // loop over elements associated with one
+                                // output
+                                for (std::size_t kx = 0; kx < kernelDims[0];
+                                     ++kx) {
+                                    output[oIndex + oy] +=
+                                        weights[wIndex + kernelDims[0] * ky +
+                                                kx] *
+                                        input[iIndex + kx * dilationDims[1]];
                                 }
                             }
                         }
diff --git a/src/operator/ConvImpl.cpp b/src/operator/ConvImpl.cpp
index ffbb75c1..d23a9968 100644
--- a/src/operator/ConvImpl.cpp
+++ b/src/operator/ConvImpl.cpp
@@ -40,16 +40,17 @@ void Aidge::ConvImpl1D_cpu::forward() {
     const auto& input2 = (op_.getInput(2)) ? op_.getInput(2)->refCastFrom(input2Fallback, *op_.getOutput(0)) : Tensor();
 
     // Call kernel
-    impl.forward(op_.strideDims(),
-            op_.dilationDims(),
-            op_.kernelDims(),
-            op_.getInput(0)->template dims<3>(), // input dimensions
-            dynamic_cast<const Conv_Op<1>&>(mOp).outChannels(), // outChannels
-            input0.getImpl()->rawPtr(), // input
-            input1.getImpl()->rawPtr(), // weight
-            op_.getInput(2) ? input2.getImpl()->rawPtr() : nullptr, // bias
-            getCPUPtr(mOp.getRawOutput(0)) // output
-            );
+    impl.forward(
+        op_.strideDims(),
+        op_.dilationDims(),
+        op_.kernelDims(),
+        op_.getInput(0)->template dims<3>(), // input dimensions
+        dynamic_cast<const Conv_Op<1> &>(mOp).outChannels(),    // outChannels
+        input0.getImpl()->rawPtr(),                             // input
+        input1.getImpl()->rawPtr(),                             // weight
+        op_.getInput(2) ? input2.getImpl()->rawPtr() : nullptr, // bias
+        getCPUPtr(mOp.getRawOutput(0))                          // output
+    );
 }
 
 template <> void ConvImpl1D_cpu::backward() {
-- 
GitLab
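
The restructured kernels keep three code paths (a 3x3 fast path, a 1x1
fast path, and the generic dilated loop), all sharing the same output
size computation. A minimal standalone sketch of that formula, using
illustrative names rather than the backend's API:

    #include <cstddef>
    #include <cstdio>

    // Output length along one spatial axis, as in the kernels above:
    // dilated kernel extent = dilation * (kernel - 1) + 1,
    // outSize = floor((in - dilatedExtent + stride) / stride).
    static std::size_t convOutSize(std::size_t in, std::size_t kernel,
                                   std::size_t stride, std::size_t dilation) {
        const std::size_t dilatedExtent = dilation * (kernel - 1) + 1;
        // Integer division floors for non-negative values, matching the
        // std::floor(float) computation in the kernels.
        return (in - dilatedExtent + stride) / stride;
    }

    int main() {
        // 5-wide input, 3-wide kernel, stride 1, dilation 1 -> 3 outputs;
        // with dilation 2 the kernel spans 5 inputs, leaving 1 output.
        std::printf("%zu %zu\n", convOutSize(5, 3, 1, 1),  // 3
                                 convOutSize(5, 3, 1, 2)); // 1
        return 0;
    }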


From b8ee3750aa33b5630d73ba05d56b928e00ab0a1f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gr=C3=A9goire=20KUBLER?= <gregoire.kubler@proton.me>
Date: Tue, 25 Feb 2025 14:10:42 +0000
Subject: [PATCH 054/108] chore : cleanup test expand

make setupTestExpand a static function
header cleanup
removed "using namespace Aidge" in favor of wrapping the tests in the Aidge namespace
---
 unit_tests/operator/Test_ExpandImpl.cpp | 26 ++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/unit_tests/operator/Test_ExpandImpl.cpp b/unit_tests/operator/Test_ExpandImpl.cpp
index 878c6081..ad30457d 100644
--- a/unit_tests/operator/Test_ExpandImpl.cpp
+++ b/unit_tests/operator/Test_ExpandImpl.cpp
@@ -13,20 +13,20 @@
 
 #include <catch2/catch_test_macros.hpp>
 
-#include "aidge/backend/cpu/data/TensorImpl.hpp"
-#include "aidge/backend/cpu/operator/ExpandImpl.hpp"
 #include "aidge/data/DataType.hpp"
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/Expand.hpp"
 #include "aidge/utils/ArrayHelpers.hpp"
 
-using std::shared_ptr;
 
-using namespace Aidge;
+namespace Aidge {
+
+using std::shared_ptr;
 
-void setupTestExpand(shared_ptr<Tensor> inputData,
-                     shared_ptr<Tensor> inputShape,
-                     shared_ptr<Expand_Op> &op) {
+static void setupTestExpand(shared_ptr<Tensor> inputData,
+                            shared_ptr<Tensor> inputShape,
+                            shared_ptr<Expand_Op> &op,
+                            Tensor &expectedOutput) {
 
     op->getOutput(0)->setDataType(inputData->dataType());
 
@@ -35,6 +35,9 @@ void setupTestExpand(shared_ptr<Tensor> inputData,
 
     inputShape->setBackend("cpu");
     op->associateInput(1, inputShape);
+
+    expectedOutput.setBackend("cpu");
+    expectedOutput.setDataType(DataType::Int32);
 }
 
 TEST_CASE("[cpu/operator] Expand(forward)", "[Expand][CPU]") {
@@ -49,7 +52,7 @@ TEST_CASE("[cpu/operator] Expand(forward)", "[Expand][CPU]") {
             Array4D<cpptype_t<DataType::Int32>, 1, 3, 4, 2>({{{{{1, 3}, {1, 3}, {1, 3}, {1, 3}},
                                         {{1, 3}, {1, 3}, {1, 3}, {1, 3}},
                                         {{1, 3}, {1, 3}, {1, 3}, {1, 3}}}}});
-        setupTestExpand(inputData, inputShape, op);
+        setupTestExpand(inputData, inputShape, op, expectedOutput);
 
         // forwardDims has already been tested in core
         CHECK(op->forwardDims(true));
@@ -63,7 +66,7 @@ TEST_CASE("[cpu/operator] Expand(forward)", "[Expand][CPU]") {
             std::make_shared<Tensor>(Array1D<std::int64_t, 2>({2, 3}));
         Tensor expectedOutput = Array3D<cpptype_t<DataType::Int32>, 2, 2, 3>(
             {{{{2, 1, 3}, {2, 1, 3}}, {{2, 1, 3}, {2, 1, 3}}}});
-        setupTestExpand(inputData, inputShape, op);
+        setupTestExpand(inputData, inputShape, op, expectedOutput);
 
         // forwardDims has already been tested in core
         CHECK(op->forwardDims(true));
@@ -77,7 +80,7 @@ TEST_CASE("[cpu/operator] Expand(forward)", "[Expand][CPU]") {
             std::make_shared<Tensor>(Array1D<std::int64_t, 1>({1}));
         Tensor expectedOutput =
             Array4D<cpptype_t<DataType::Int32>, 2, 1, 3, 1>({{{2, 1, 3}, {2, 1, 3}}});
-        setupTestExpand(inputData, inputShape, op);
+        setupTestExpand(inputData, inputShape, op, expectedOutput);
 
         // forwardDims has already been tested in core
         CHECK(op->forwardDims(true));
@@ -91,7 +94,7 @@ TEST_CASE("[cpu/operator] Expand(forward)", "[Expand][CPU]") {
             std::make_shared<Tensor>(Array1D<std::int64_t, 3>({2, 1, 1}));
         Tensor expectedOutput =
             Array4D<cpptype_t<DataType::Int32>, 1, 2, 3, 1>({{{{2, 1, 3}, {2, 1, 3}}}});
-        setupTestExpand(inputData, inputShape, op);
+        setupTestExpand(inputData, inputShape, op, expectedOutput);
 
         // forwardDims has already been tested in core
         CHECK(op->forwardDims(true));
@@ -101,3 +104,4 @@ TEST_CASE("[cpu/operator] Expand(forward)", "[Expand][CPU]") {
     SECTION("N-Dim to N-Dim") {}
     auto inputData = std::shared_ptr<Tensor>();
 }
+} // namespace Aidge
-- 
GitLab
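
The refactor above replaces a file-scope "using namespace Aidge" with a
test body wrapped in "namespace Aidge { ... }" and gives the shared setup
helper internal linkage. A minimal sketch of the pattern, with simplified
illustrative types rather than the real test code:

    namespace Aidge {

    struct FakeTensor { bool backendSet = false; }; // stand-in for Tensor

    // `static` gives the helper internal linkage, so several test files
    // can each define their own setup function without ODR conflicts,
    // while the enclosing namespace keeps unqualified Aidge names usable.
    static void setupSketch(FakeTensor &input, FakeTensor &expected) {
        input.backendSet = true;
        expected.backendSet = true;
    }

    } // namespace Aidge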


From da5f38fac1318ffb4187b1d2dafc402e2745372b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gr=C3=A9goire=20KUBLER?= <gregoire.kubler@proton.me>
Date: Wed, 5 Mar 2025 14:34:57 +0000
Subject: [PATCH 055/108] chore: delete dead commented code

---
 include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
index 274f5f4f..7ae9e45f 100644
--- a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
@@ -72,13 +72,6 @@ void ConvImpl1D_cpu_forward_kernel(const array<DimSize_t, 1> &strideDim,
                 const std::size_t wIndex =
                     (inCh + outCh * inputDims[1]) * kernelDim[0];
                 for (std::size_t ox = 0; ox < oxSize; ++ox) {
-                    // const signedsize difx = static_cast<signedsize>(- ox *
-                    // strideDim[0s); const std::size_t sxMin =
-                    // static_cast<std::size_t>(std::max(difx, signedsize(0)));
-                    // const std::size_t sxMax =
-                    // (static_cast<signedsize>(inputDims[2]) + difx) < 0 ? 0 :
-                    // ((inputDims[2] + difx) > kernelDim[0s[0] ? kernelDim[0s
-                    // : inputDims[2] + difx);
                     const std::size_t sxMin = 0;
                     const std::size_t sxMax = dilated_kernel_x;
                     const std::size_t oIndexFull = oIndex + ox;
-- 
GitLab


From 28ca848f333c2d3c5c231b487fba639e6e1be9fb Mon Sep 17 00:00:00 2001
From: Jerome Hue <jerome.hue@cea.fr>
Date: Fri, 14 Feb 2025 15:35:32 +0100
Subject: [PATCH 056/108] feat: Add surrogate backward function for Heaviside
 operator

---
 .../cpu/operator/HeavisideImpl_kernels.hpp    | 19 ++++++++++++++++++-
 src/operator/HeavisideImpl.cpp                | 19 ++++++++++++++++++-
 unit_tests/operator/Test_HeavisideImpl.cpp    | 11 +++++++++++
 3 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp b/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp
index 06d7fff8..0bbbddee 100644
--- a/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp
@@ -19,7 +19,6 @@
 #include "aidge/backend/cpu/operator/HeavisideImpl.hpp"
 #include "aidge/utils/ErrorHandling.hpp"
 
-
 namespace Aidge {
 
 template <class I, class O>
@@ -35,6 +34,24 @@ void HeavisideImplCpuForwardKernel(std::size_t inputLength,
     }
 }
 
+
+// Surrogate Gradient
+template <class O, class GO, class GI>
+void HeavisideImplCpuBackwardKernel(std::size_t inputLength, 
+                                    const void* output_,
+                                    const void* grad_output_,
+                                    void* grad_input_) {
+
+    const O* output = static_cast<const O*>(output_);
+    const GO* grad_output = static_cast<const GO*>(grad_output_);
+    GI* grad_input = static_cast<GI*>(grad_input_);
+
+    for (size_t i = 0; i < inputLength; ++i) {
+        // dx = dy * (1/PI) * (1 / (1 + (PI * x)^2))
+        grad_input[i] = (1 / M_PI) * grad_output[i] * static_cast<O>(1.0 / (1.0 + output[i] * output[i]));
+    }
+}
+
 // Kernels registration to implementation entry point
 REGISTRAR(HeavisideImplCpu,
           {DataType::Float32},
diff --git a/src/operator/HeavisideImpl.cpp b/src/operator/HeavisideImpl.cpp
index 56ceb9b0..2ead2978 100644
--- a/src/operator/HeavisideImpl.cpp
+++ b/src/operator/HeavisideImpl.cpp
@@ -32,6 +32,23 @@ template <> void Aidge::HeavisideImplCpu::forward() {
                  op_.value());
 }
 
-template <> void Aidge::HeavisideImplCpu::backward() {
+template <> 
+void Aidge::HeavisideImplCpu::backward() {
     AIDGE_THROW_OR_ABORT(std::runtime_error, "Heaviside backward not implemented yet");
+
+    // TODO: The following lines are assuming that the surrogate gradient is Atan
+    // remove that assumption by providing an attribute to Heaviside, 
+    // allowing the user to choose between different surrogate gradients.
+    
+    // const Heavisde_Op& op_ = dynamic_cast<const Heavisie_Op &>(mOp);
+
+
+
+    // ! backward of hs = forward of atan
+    //const auto impl = Registrar<HeavisideImplCpu>::create(getBestMatch(getRequiredSpec()));
+    // std::shared_ptr<Tensor> in0 = op_.getInput(0);
+    // std::shared_ptr<Tensor> out0 = op_.getOutput(0);
+  
+    //impl.forward()
 }
+
diff --git a/unit_tests/operator/Test_HeavisideImpl.cpp b/unit_tests/operator/Test_HeavisideImpl.cpp
index 4cbdf1a0..a0142513 100644
--- a/unit_tests/operator/Test_HeavisideImpl.cpp
+++ b/unit_tests/operator/Test_HeavisideImpl.cpp
@@ -11,6 +11,7 @@
 
 #include "aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp"
 
+#include <aidge/operator/Memorize.hpp>
 #include <memory>
 #include <cstdlib>
 #include <random>
@@ -22,6 +23,8 @@
 #include "aidge/graph/Node.hpp"
 #include "aidge/utils/TensorUtils.hpp"
 
+#include "aidge/operator/Add.hpp"
+
 namespace Aidge
 {
 
@@ -95,4 +98,12 @@ TEST_CASE("[cpu/operator] Heaviside(forward)", "[Heaviside][CPU]") {
         REQUIRE(approxEq<float>(*(op->getOutput(0)), *T1));
     }
 }
+
+TEST_CASE("[cpu/operator] Heaviside(backward)", "[Heaviside][CPU]") {
+
+    auto add = Add();
+    auto mem = Memorize(2);
+    auto hs = Heaviside(1);
+}
+
 }
-- 
GitLab


From 2d63028a6fd1416b8081d53bcb59b3f4e02e0cb3 Mon Sep 17 00:00:00 2001
From: Jerome Hue <jerome.hue@cea.fr>
Date: Fri, 28 Feb 2025 17:52:53 +0100
Subject: [PATCH 057/108] Add Heaviside backward surrogate and associated tests

---
 .../backend/cpu/operator/HeavisideImpl.hpp    |  2 +-
 .../cpu/operator/HeavisideImpl_kernels.hpp    |  5 ++--
 src/operator/HeavisideImpl.cpp                | 17 +++++------
 unit_tests/operator/Test_HeavisideImpl.cpp    | 28 ++++++++++++++++---
 4 files changed, 37 insertions(+), 15 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/HeavisideImpl.hpp b/include/aidge/backend/cpu/operator/HeavisideImpl.hpp
index 7a3ba9ad..877fa2a9 100644
--- a/include/aidge/backend/cpu/operator/HeavisideImpl.hpp
+++ b/include/aidge/backend/cpu/operator/HeavisideImpl.hpp
@@ -23,7 +23,7 @@ namespace Aidge {
 using HeavisideImplCpu =
     OperatorImpl_cpu<Heaviside_Op,
                      void(std::size_t, const void *, void *, const float),
-                     void(const float, std::size_t, const void *, void *)>;
+                     void(std::size_t, const void *, const void *, void *)>;
 
 // Implementation entry point registration for operator Heaviside
 REGISTRAR(Heaviside_Op, "cpu", HeavisideImplCpu::create);
diff --git a/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp b/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp
index 0bbbddee..7fc0eb0a 100644
--- a/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp
@@ -48,7 +48,8 @@ void HeavisideImplCpuBackwardKernel(std::size_t inputLength,
 
     for (size_t i = 0; i < inputLength; ++i) {
         // dx = dy * (1/PI) * (1 / (1 + (PI * x)^2))
-        grad_input[i] = (1 / M_PI) * grad_output[i] * static_cast<O>(1.0 / (1.0 + output[i] * output[i]));
+        // grad_input[i] = (1 / M_PI) * grad_output[i] * static_cast<O>(1.0 / (1.0 + (output[i] * output[i]) * (M_PI * M_PI)));
+        grad_input[i] = grad_output[i] * static_cast<O>(1.0 / (1.0 + (output[i] * output[i]) * (M_PI * M_PI)));
     }
 }
 
@@ -57,7 +58,7 @@ REGISTRAR(HeavisideImplCpu,
           {DataType::Float32},
           {ProdConso::inPlaceModel,
            Aidge::HeavisideImplCpuForwardKernel<float, float>,
-           nullptr});
+           Aidge::HeavisideImplCpuBackwardKernel<float, float, float>});
 } // namespace Aidge
 
 #endif // AIDGE_CPU_OPERATOR_HEAVISIDEIMPL_KERNELS_H__H_
diff --git a/src/operator/HeavisideImpl.cpp b/src/operator/HeavisideImpl.cpp
index 2ead2978..5bf77f87 100644
--- a/src/operator/HeavisideImpl.cpp
+++ b/src/operator/HeavisideImpl.cpp
@@ -34,21 +34,22 @@ template <> void Aidge::HeavisideImplCpu::forward() {
 
 template <> 
 void Aidge::HeavisideImplCpu::backward() {
-    AIDGE_THROW_OR_ABORT(std::runtime_error, "Heaviside backward not implemented yet");
 
     // TODO: The following lines are assuming that the surrogate gradient is Atan
     // remove that assumption by providing an attribute to Heaviside, 
     // allowing the user to choose between different surrogate gradients.
     
-    // const Heavisde_Op& op_ = dynamic_cast<const Heavisie_Op &>(mOp);
+    const Heaviside_Op& op_ = dynamic_cast<const Heaviside_Op &>(mOp);
 
+    const auto impl = Registrar<HeavisideImplCpu>::create(getBestMatch(getRequiredSpec()));
 
+    auto gra_int0 = op_.getInput(0)->grad();
+    auto gra_out0 = op_.getOutput(0)->grad();
 
-    // ! backward of hs = forward of atan
-    //const auto impl = Registrar<HeavisideImplCpu>::create(getBestMatch(getRequiredSpec()));
-    // std::shared_ptr<Tensor> in0 = op_.getInput(0);
-    // std::shared_ptr<Tensor> out0 = op_.getOutput(0);
-  
-    //impl.forward()
+    std::shared_ptr<Tensor> in0 = op_.getInput(0);
+    std::shared_ptr<Tensor> out0 = op_.getOutput(0);
+    AIDGE_ASSERT(out0, "missing output #0 for current {} operator", op_.type());
+
+    impl.backward(gra_int0->size(), getCPUPtr(in0), getCPUPtr(gra_out0), getCPUPtr(gra_int0));
 }
 
diff --git a/unit_tests/operator/Test_HeavisideImpl.cpp b/unit_tests/operator/Test_HeavisideImpl.cpp
index a0142513..515d6802 100644
--- a/unit_tests/operator/Test_HeavisideImpl.cpp
+++ b/unit_tests/operator/Test_HeavisideImpl.cpp
@@ -12,6 +12,7 @@
 #include "aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp"
 
 #include <aidge/operator/Memorize.hpp>
+#include <aidge/utils/Types.h>
 #include <memory>
 #include <cstdlib>
 #include <random>
@@ -100,10 +101,29 @@ TEST_CASE("[cpu/operator] Heaviside(forward)", "[Heaviside][CPU]") {
 }
 
 TEST_CASE("[cpu/operator] Heaviside(backward)", "[Heaviside][CPU]") {
+    auto hs = Heaviside(1.0f);
+    auto op = std::static_pointer_cast<OperatorTensor>(hs->getOperator());
+    op->setDataType(DataType::Float32);
+    op->setBackend("cpu");
 
-    auto add = Add();
-    auto mem = Memorize(2);
-    auto hs = Heaviside(1);
-}
+    auto input = Tensor(Array1D<float, 3>({1.0, -1.0, 1.0}));
+    input.setDataType(DataType::Float32);
+    input.setBackend("cpu");
+
+    auto grad = Tensor(Array1D<float, 3>({1.0, 1.0, 1.0}));
+    grad.setDataType(DataType::Float32);
+    grad.setBackend("cpu");
+
+    op->setInput(IOIndex_t(0), std::make_shared<Tensor>(input));
+    op->forward();
 
+    Log::info("Output : ");
+    op->getOutput(0)->print();
+
+    op->getOutput(0)->setGrad(std::make_shared<Tensor>(grad));
+    op->backward();
+
+    Log::info("Gradient : ");
+    op->getInput(0)->grad()->print();
+}
 }
-- 
GitLab


From 09bec2683d0312f8dc700d1e85f01dec254ceade Mon Sep 17 00:00:00 2001
From: Jerome Hue <jerome.hue@cea.fr>
Date: Mon, 3 Mar 2025 15:56:10 +0100
Subject: [PATCH 058/108] Improve test for Heaviside backward

---
 .../backend/cpu/operator/HeavisideImpl_kernels.hpp     | 10 +++++++---
 unit_tests/operator/Test_HeavisideImpl.cpp             |  7 ++-----
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp b/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp
index 7fc0eb0a..4e2a7db2 100644
--- a/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp
@@ -42,14 +42,18 @@ void HeavisideImplCpuBackwardKernel(std::size_t inputLength,
                                     const void* grad_output_,
                                     void* grad_input_) {
 
+    /*
+     * Heaviside is approximated by an arctan function for the backward pass:
+     * S ~= \frac{1}{\pi}\text{arctan}(\pi U \frac{\alpha}{2})
+     * \frac{dS}{dU} = \frac{\alpha}{2} \frac{1}{1+(\frac{\pi U \alpha}{2})^2}
+     * */
+
     const O* output = static_cast<const O*>(output_);
     const GO* grad_output = static_cast<const GO*>(grad_output_);
     GI* grad_input = static_cast<GI*>(grad_input_);
 
     for (size_t i = 0; i < inputLength; ++i) {
-        // dx = dy * (1/PI) * (1 / (1 + (PI * x)^2))
-        // grad_input[i] = (1 / M_PI) * grad_output[i] * static_cast<O>(1.0 / (1.0 + (output[i] * output[i]) * (M_PI * M_PI)));
-        grad_input[i] = grad_output[i] * static_cast<O>(1.0 / (1.0 + (output[i] * output[i]) * (M_PI * M_PI)));
+        grad_input[i] = grad_output[i] * static_cast<O>(1.0 / (1.0 + (output[i] * M_PI) * (output[i] * M_PI)));
     }
 }
 
diff --git a/unit_tests/operator/Test_HeavisideImpl.cpp b/unit_tests/operator/Test_HeavisideImpl.cpp
index 515d6802..e6aa38b8 100644
--- a/unit_tests/operator/Test_HeavisideImpl.cpp
+++ b/unit_tests/operator/Test_HeavisideImpl.cpp
@@ -117,13 +117,10 @@ TEST_CASE("[cpu/operator] Heaviside(backward)", "[Heaviside][CPU]") {
     op->setInput(IOIndex_t(0), std::make_shared<Tensor>(input));
     op->forward();
 
-    Log::info("Output : ");
-    op->getOutput(0)->print();
-
     op->getOutput(0)->setGrad(std::make_shared<Tensor>(grad));
     op->backward();
 
-    Log::info("Gradient : ");
-    op->getInput(0)->grad()->print();
+    auto expectedResult = Tensor(Array1D<float,3>({0.0920, 0.0920, 0.0920}));
+    REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), expectedResult));
 }
 }
-- 
GitLab
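
A cleaned-up statement of the derivation sketched in the kernel comment above,
with \alpha the slope parameter of the arctan surrogate (identifying the
registered kernel with \alpha = 2 is an observation, not something the patch
itself states):

    S \approx \frac{1}{\pi} \arctan\left(\frac{\pi U \alpha}{2}\right)

    \frac{dS}{dU} = \frac{\alpha}{2} \cdot \frac{1}{1 + \left(\frac{\pi U \alpha}{2}\right)^{2}}

For \alpha = 2 this reduces to dS/dU = 1 / (1 + (\pi U)^2), which is exactly
the expression computed in the loop body of the backward kernel.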


From eff8202099609940a906fe752dc50596bebd5c12 Mon Sep 17 00:00:00 2001
From: Jerome Hue <jerome.hue@cea.fr>
Date: Mon, 3 Mar 2025 16:44:35 +0100
Subject: [PATCH 059/108] Add randomness in Heaviside backward test

---
 src/operator/HeavisideImpl.cpp             |  1 -
 unit_tests/operator/Test_HeavisideImpl.cpp | 57 +++++++++++++++++-----
 2 files changed, 45 insertions(+), 13 deletions(-)

diff --git a/src/operator/HeavisideImpl.cpp b/src/operator/HeavisideImpl.cpp
index 5bf77f87..8349a0ad 100644
--- a/src/operator/HeavisideImpl.cpp
+++ b/src/operator/HeavisideImpl.cpp
@@ -48,7 +48,6 @@ void Aidge::HeavisideImplCpu::backward() {
 
     std::shared_ptr<Tensor> in0 = op_.getInput(0);
     std::shared_ptr<Tensor> out0 = op_.getOutput(0);
-    AIDGE_ASSERT(out0, "missing output #0 for current {} operator", op_.type());
 
     impl.backward(gra_int0->size(), getCPUPtr(in0), getCPUPtr(gra_out0), getCPUPtr(gra_int0));
 }
diff --git a/unit_tests/operator/Test_HeavisideImpl.cpp b/unit_tests/operator/Test_HeavisideImpl.cpp
index e6aa38b8..2743027c 100644
--- a/unit_tests/operator/Test_HeavisideImpl.cpp
+++ b/unit_tests/operator/Test_HeavisideImpl.cpp
@@ -100,27 +100,60 @@ TEST_CASE("[cpu/operator] Heaviside(forward)", "[Heaviside][CPU]") {
     }
 }
 
+// TODO: Make this work for random input.
 TEST_CASE("[cpu/operator] Heaviside(backward)", "[Heaviside][CPU]") {
+
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_real_distribution<float> valueDist(-2.0f, 2.0f);
+    std::uniform_int_distribution<std::size_t> sizeDist(5, 100);
+
+    const std::size_t tensorSize = sizeDist(gen);
+    
     auto hs = Heaviside(1.0f);
     auto op = std::static_pointer_cast<OperatorTensor>(hs->getOperator());
     op->setDataType(DataType::Float32);
     op->setBackend("cpu");
 
-    auto input = Tensor(Array1D<float, 3>({1.0, -1.0, 1.0}));
-    input.setDataType(DataType::Float32);
-    input.setBackend("cpu");
 
-    auto grad = Tensor(Array1D<float, 3>({1.0, 1.0, 1.0}));
-    grad.setDataType(DataType::Float32);
-    grad.setBackend("cpu");
+    auto inputTensor = std::make_shared<Tensor>(std::vector<std::size_t>{tensorSize});
+    inputTensor->setDataType(DataType::Float32);
+    inputTensor->setBackend("cpu");
+    auto* inputData = static_cast<float*>(inputTensor->getImpl()->rawPtr());
+    
+    for(std::size_t i = 0; i < tensorSize; ++i) {
+        inputData[i] = valueDist(gen);
+    }
+
+    auto gradTensor = std::make_shared<Tensor>(std::vector<std::size_t>{tensorSize});
+    gradTensor->setDataType(DataType::Float32);
+    gradTensor->setBackend("cpu");
+    auto* gradData = static_cast<float*>(gradTensor->getImpl()->rawPtr());
+    
+    for (std::size_t i = 0; i < tensorSize; ++i) {
+        gradData[i] = valueDist(gen);
+    }
 
-    op->setInput(IOIndex_t(0), std::make_shared<Tensor>(input));
+    op->setInput(IOIndex_t(0), inputTensor);
     op->forward();
-
-    op->getOutput(0)->setGrad(std::make_shared<Tensor>(grad));
+    
+    auto output = op->getOutput(0);
+    output->setGrad(gradTensor);
+    
+    // Backward pass
     op->backward();
-
-    auto expectedResult = Tensor(Array1D<float,3>({0.0920, 0.0920, 0.0920}));
-    REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), expectedResult));
+    
+    // Compute expected gradient manually
+    auto expectedGrad = std::make_shared<Tensor>(std::vector<std::size_t>{tensorSize});
+    expectedGrad->setDataType(DataType::Float32);
+    expectedGrad->setBackend("cpu");
+    auto* expectedGradData = static_cast<float*>(expectedGrad->getImpl()->rawPtr());
+    
+    for (std::size_t i = 0; i < tensorSize; ++i) {
+        expectedGradData[i] = gradData[i] * (1.0f / (1.0f + (inputData[i] * M_PI) * (inputData[i] * M_PI)));
+    }
+    
+    // Compare actual gradient with expected gradient
+    REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad));
 }
 }
-- 
GitLab


From c92b78e4c77c18165578a17caee985ad3a6108ce Mon Sep 17 00:00:00 2001
From: Jerome Hue <jerome.hue@cea.fr>
Date: Tue, 4 Mar 2025 10:09:25 +0100
Subject: [PATCH 060/108] Format file and remove unused variable

---
 src/operator/HeavisideImpl.cpp | 51 +++++++++++++++-------------------
 1 file changed, 23 insertions(+), 28 deletions(-)

diff --git a/src/operator/HeavisideImpl.cpp b/src/operator/HeavisideImpl.cpp
index 8349a0ad..3932eb33 100644
--- a/src/operator/HeavisideImpl.cpp
+++ b/src/operator/HeavisideImpl.cpp
@@ -13,42 +13,37 @@
 
 #include <stdexcept>
 
-#include "aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp"
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
+#include "aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp"
 #include "aidge/utils/ErrorHandling.hpp"
 
 template <> void Aidge::HeavisideImplCpu::forward() {
-    const Heaviside_Op &op_ = dynamic_cast<const Heaviside_Op &>(mOp);
-    std::shared_ptr<Tensor> input0 = op_.getInput(0);
-    std::shared_ptr<Tensor> output0 = op_.getOutput(0);
-    AIDGE_ASSERT(input0, "missing input #0");
-
-    const auto impl =
-        Registrar<HeavisideImplCpu>::create(getBestMatch(getRequiredSpec()));
-
-    impl.forward(input0->size(),
-                 getCPUPtr(mOp.getRawInput(0)),
-                 getCPUPtr(mOp.getRawOutput(0)),
-                 op_.value());
-}
+  const Heaviside_Op &op_ = dynamic_cast<const Heaviside_Op &>(mOp);
+  std::shared_ptr<Tensor> input0 = op_.getInput(0);
+  std::shared_ptr<Tensor> output0 = op_.getOutput(0);
+  AIDGE_ASSERT(input0, "missing input #0");
 
-template <> 
-void Aidge::HeavisideImplCpu::backward() {
+  const auto impl =
+      Registrar<HeavisideImplCpu>::create(getBestMatch(getRequiredSpec()));
 
-    // TODO: The following lines are assuming that the surrogate gradient is Atan
-    // remove that assumption by providing an attribute to Heaviside, 
-    // allowing the user to choose between different surrogate gradients.
-    
-    const Heaviside_Op& op_ = dynamic_cast<const Heaviside_Op &>(mOp);
+  impl.forward(input0->size(), getCPUPtr(mOp.getRawInput(0)),
+               getCPUPtr(mOp.getRawOutput(0)), op_.value());
+}
 
-    const auto impl = Registrar<HeavisideImplCpu>::create(getBestMatch(getRequiredSpec()));
+template <> void Aidge::HeavisideImplCpu::backward() {
 
-    auto gra_int0 = op_.getInput(0)->grad();
-    auto gra_out0 = op_.getOutput(0)->grad();
+  // TODO: The following lines are assuming that the surrogate gradient is Atan
+  // remove that assumption by providing an attribute to Heaviside,
+  // allowing the user to choose between different surrogate gradients.
 
-    std::shared_ptr<Tensor> in0 = op_.getInput(0);
-    std::shared_ptr<Tensor> out0 = op_.getOutput(0);
+  const Heaviside_Op &op_ = dynamic_cast<const Heaviside_Op &>(mOp);
+  const auto impl =
+      Registrar<HeavisideImplCpu>::create(getBestMatch(getRequiredSpec()));
 
-    impl.backward(gra_int0->size(), getCPUPtr(in0), getCPUPtr(gra_out0), getCPUPtr(gra_int0));
-}
+  auto in0 = op_.getInput(0);
+  auto gra_int0 = op_.getInput(0)->grad();
+  auto gra_out0 = op_.getOutput(0)->grad();
 
+  impl.backward(gra_int0->size(), getCPUPtr(in0), getCPUPtr(gra_out0),
+                getCPUPtr(gra_int0));
+}
-- 
GitLab


From 9c83962af9e7bd99b64a5d48dc8b518515e72333 Mon Sep 17 00:00:00 2001
From: Jerome Hue <jerome.hue@cea.fr>
Date: Tue, 4 Mar 2025 10:59:06 +0100
Subject: [PATCH 061/108] Improve Heaviside Backward Test

Compare the result of the surrogate gradient to the real Atan operator.
---
 unit_tests/operator/Test_HeavisideImpl.cpp | 36 +++++++++++++++++++++-
 1 file changed, 35 insertions(+), 1 deletion(-)

diff --git a/unit_tests/operator/Test_HeavisideImpl.cpp b/unit_tests/operator/Test_HeavisideImpl.cpp
index 2743027c..9241f0d3 100644
--- a/unit_tests/operator/Test_HeavisideImpl.cpp
+++ b/unit_tests/operator/Test_HeavisideImpl.cpp
@@ -11,7 +11,10 @@
 
 #include "aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp"
 
+#include <aidge/operator/Atan.hpp>
 #include <aidge/operator/Memorize.hpp>
+#include <aidge/operator/Mul.hpp>
+#include <aidge/operator/Producer.hpp>
 #include <aidge/utils/Types.h>
 #include <memory>
 #include <cstdlib>
@@ -100,7 +103,6 @@ TEST_CASE("[cpu/operator] Heaviside(forward)", "[Heaviside][CPU]") {
     }
 }
 
-// TODO: Make this work for random input.
 TEST_CASE("[cpu/operator] Heaviside(backward)", "[Heaviside][CPU]") {
 
     std::random_device rd;
@@ -115,6 +117,7 @@ TEST_CASE("[cpu/operator] Heaviside(backward)", "[Heaviside][CPU]") {
     op->setDataType(DataType::Float32);
     op->setBackend("cpu");
 
+        
 
     auto inputTensor = std::make_shared<Tensor>(std::vector<std::size_t>{tensorSize});
     inputTensor->setDataType(DataType::Float32);
@@ -125,6 +128,29 @@ TEST_CASE("[cpu/operator] Heaviside(backward)", "[Heaviside][CPU]") {
         inputData[i] = valueDist(gen);
     }
 
+    // Compare it to the real Atan implementation
+    auto mul = Mul();
+    auto pi = std::make_shared<Tensor>(Array1D<float,1>{M_PI});
+    auto producer = Producer(pi);
+    auto atan = Atan();
+    auto mulOp = std::static_pointer_cast<OperatorTensor>(mul->getOperator());
+    auto piOp = std::static_pointer_cast<OperatorTensor>(producer->getOperator());
+    auto atanOp = std::static_pointer_cast<OperatorTensor>(atan->getOperator());
+    mulOp->setBackend("cpu");
+    piOp->setBackend("cpu");
+    atanOp->setBackend("cpu");
+    mulOp->setDataType(DataType::Float32);
+    piOp->setDataType(DataType::Float32);
+    atanOp->setDataType(DataType::Float32);
+
+
+    producer->addChild(mul,0,0);
+    mulOp->setInput(IOIndex_t(1),  inputTensor);
+    mulOp->forward();
+    auto outmul = mulOp->getOutput(0);
+    atanOp->setInput(0, inputTensor);
+    atanOp->forward();
+
     auto gradTensor = std::make_shared<Tensor>(std::vector<std::size_t>{tensorSize});
     gradTensor->setDataType(DataType::Float32);
     gradTensor->setBackend("cpu");
@@ -142,6 +168,10 @@ TEST_CASE("[cpu/operator] Heaviside(backward)", "[Heaviside][CPU]") {
     
     // Backward pass
     op->backward();
+
+    atanOp->setOutput(0, outmul);
+    atanOp->getOutput(0)->setGrad(gradTensor);
+    atanOp->backward();
     
     // Compute expected gradient manually
     auto expectedGrad = std::make_shared<Tensor>(std::vector<std::size_t>{tensorSize});
@@ -155,5 +185,9 @@ TEST_CASE("[cpu/operator] Heaviside(backward)", "[Heaviside][CPU]") {
     
     // Compare actual gradient with expected gradient
     REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad));
+
+    // Compare Atan(pi*input) to expected Gradient
+    REQUIRE(approxEq<float>(*(atanOp->getInput(0)->grad()), *expectedGrad));
 }
+
 }
-- 
GitLab


From e7c9550decce4d66d2c35be106fd9ba222dbe4c8 Mon Sep 17 00:00:00 2001
From: Jerome Hue <jerome.hue@cea.fr>
Date: Tue, 4 Mar 2025 11:40:40 +0100
Subject: [PATCH 062/108] Fix includes

---
 unit_tests/operator/Test_HeavisideImpl.cpp | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/unit_tests/operator/Test_HeavisideImpl.cpp b/unit_tests/operator/Test_HeavisideImpl.cpp
index 9241f0d3..d3ed3826 100644
--- a/unit_tests/operator/Test_HeavisideImpl.cpp
+++ b/unit_tests/operator/Test_HeavisideImpl.cpp
@@ -11,23 +11,22 @@
 
 #include "aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp"
 
-#include <aidge/operator/Atan.hpp>
-#include <aidge/operator/Memorize.hpp>
-#include <aidge/operator/Mul.hpp>
-#include <aidge/operator/Producer.hpp>
-#include <aidge/utils/Types.h>
 #include <memory>
+#include <cmath>
 #include <cstdlib>
 #include <random>
 
 #include <catch2/catch_test_macros.hpp>
 
-#include "aidge/data/Tensor.hpp"
 #include "aidge/backend/cpu/operator/HeavisideImpl.hpp"
+#include "aidge/data/Tensor.hpp"
 #include "aidge/graph/Node.hpp"
+#include "aidge/operator/Atan.hpp"
+#include "aidge/operator/Mul.hpp"
+#include "aidge/operator/Producer.hpp"
 #include "aidge/utils/TensorUtils.hpp"
+#include "aidge/utils/Types.h"
 
-#include "aidge/operator/Add.hpp"
 
 namespace Aidge
 {
-- 
GitLab


From 3be714857faff91636fc0736b11654ae404c4ee8 Mon Sep 17 00:00:00 2001
From: Jerome Hue <jerome.hue@cea.fr>
Date: Mon, 17 Mar 2025 16:06:57 +0100
Subject: [PATCH 063/108] Include <cmath> in HeavisideImpl kernels

---
 include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp b/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp
index 4e2a7db2..92f12fbe 100644
--- a/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp
@@ -15,6 +15,7 @@
 #include "aidge/utils/Registrar.hpp"
 
 #include <cstddef> // std::size_t
+#include <cmath>
 
 #include "aidge/backend/cpu/operator/HeavisideImpl.hpp"
 #include "aidge/utils/ErrorHandling.hpp"
@@ -37,7 +38,7 @@ void HeavisideImplCpuForwardKernel(std::size_t inputLength,
 
 // Surrogate Gradient
 template <class O, class GO, class GI>
-void HeavisideImplCpuBackwardKernel(std::size_t inputLength, 
+void HeavisideImplCpuBackwardKernel(std::size_t inputLength,
                                     const void* output_,
                                     const void* grad_output_,
                                     void* grad_input_) {
-- 
GitLab


From e02f7007fe94c5865b38957b1054867df144704b Mon Sep 17 00:00:00 2001
From: Jerome Hue <jerome.hue@cea.fr>
Date: Mon, 17 Mar 2025 16:16:04 +0100
Subject: [PATCH 064/108] Define _USE_MATH_DEFINES in kernel header

---
 include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp b/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp
index 92f12fbe..03815dc2 100644
--- a/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp
@@ -15,6 +15,7 @@
 #include "aidge/utils/Registrar.hpp"
 
 #include <cstddef> // std::size_t
+#define _USE_MATH_DEFINES
 #include <cmath>
 
 #include "aidge/backend/cpu/operator/HeavisideImpl.hpp"
-- 
GitLab


From 3a44ecdcbb7c368d734b662b9e9a9fcfedd18c97 Mon Sep 17 00:00:00 2001
From: Jerome Hue <jerome.hue@cea.fr>
Date: Mon, 17 Mar 2025 16:23:43 +0100
Subject: [PATCH 065/108] Replace cmath with math.h

---
 include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp b/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp
index 03815dc2..f397927a 100644
--- a/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp
@@ -15,8 +15,7 @@
 #include "aidge/utils/Registrar.hpp"
 
 #include <cstddef> // std::size_t
-#define _USE_MATH_DEFINES
-#include <cmath>
+#include <math.h>
 
 #include "aidge/backend/cpu/operator/HeavisideImpl.hpp"
 #include "aidge/utils/ErrorHandling.hpp"
-- 
GitLab


From 6db22c4a9e06eb28553ba9038956efe5a8198e1b Mon Sep 17 00:00:00 2001
From: Jerome Hue <jerome.hue@cea.fr>
Date: Mon, 17 Mar 2025 16:30:08 +0100
Subject: [PATCH 066/108] Enable Math Constants and Use cmath Header

---
 CMakeLists.txt                                               | 3 +++
 include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 729853ee..21c5c6b9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -86,6 +86,9 @@ target_link_libraries(${module_name}
         _aidge_core # _ is added because we link the exported target and not the project
 )
 
+# Add definition _USE_MATH_DEFINES to enable math constant definitions from math.h/cmath.
+target_compile_definitions(${module_name} PRIVATE _USE_MATH_DEFINES)
+
 #Set target properties
 set_property(TARGET ${module_name} PROPERTY POSITION_INDEPENDENT_CODE ON)
 
diff --git a/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp b/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp
index f397927a..92f12fbe 100644
--- a/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp
@@ -15,7 +15,7 @@
 #include "aidge/utils/Registrar.hpp"
 
 #include <cstddef> // std::size_t
-#include <math.h>
+#include <cmath>
 
 #include "aidge/backend/cpu/operator/HeavisideImpl.hpp"
 #include "aidge/utils/ErrorHandling.hpp"
-- 
GitLab


From eeded572c5a5dfd97417b78e7743146706fe7e95 Mon Sep 17 00:00:00 2001
From: Jerome Hue <jerome.hue@cea.fr>
Date: Mon, 17 Mar 2025 16:38:14 +0100
Subject: [PATCH 067/108] Add _USE_MATH_DEFINES to unit tests

---
 unit_tests/CMakeLists.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/unit_tests/CMakeLists.txt b/unit_tests/CMakeLists.txt
index e1f261d0..571c96b9 100644
--- a/unit_tests/CMakeLists.txt
+++ b/unit_tests/CMakeLists.txt
@@ -21,6 +21,8 @@ file(GLOB_RECURSE src_files "*.cpp")
 
 add_executable(tests${module_name} ${src_files})
 
+target_compile_definitions(tests${module_name} PRIVATE _USE_MATH_DEFINES)
+
 target_link_libraries(tests${module_name} PRIVATE ${module_name})
 
 target_link_libraries(tests${module_name} PRIVATE Catch2::Catch2WithMain)
-- 
GitLab


From 9765d6e866ca8721eb389b3719c878db662be1f2 Mon Sep 17 00:00:00 2001
From: Jerome Hue <jerome.hue@cea.fr>
Date: Fri, 21 Mar 2025 16:02:51 +0100
Subject: [PATCH 068/108] Only define _USE_MATH_DEFINES on Windows platforms

The _USE_MATH_DEFINES macro is only needed on Windows to expose math
constants like M_PI in math.h/cmath.
---
 CMakeLists.txt            | 4 +++-
 unit_tests/CMakeLists.txt | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 21c5c6b9..6c87a89b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -87,7 +87,9 @@ target_link_libraries(${module_name}
 )
 
 # Add definition _USE_MATH_DEFINES to enable math constant definitions from math.h/cmath.
-target_compile_definitions(${module_name} PRIVATE _USE_MATH_DEFINES)
+if (WIN32)
+    target_compile_definitions(${module_name} PRIVATE _USE_MATH_DEFINES)
+endif()
 
 #Set target properties
 set_property(TARGET ${module_name} PROPERTY POSITION_INDEPENDENT_CODE ON)
diff --git a/unit_tests/CMakeLists.txt b/unit_tests/CMakeLists.txt
index 571c96b9..217cf8fb 100644
--- a/unit_tests/CMakeLists.txt
+++ b/unit_tests/CMakeLists.txt
@@ -21,7 +21,9 @@ file(GLOB_RECURSE src_files "*.cpp")
 
 add_executable(tests${module_name} ${src_files})
 
-target_compile_definitions(tests${module_name} PRIVATE _USE_MATH_DEFINES)
+if (WIN32)
+    target_compile_definitions(tests${module_name} PRIVATE _USE_MATH_DEFINES)
+endif()
 
 target_link_libraries(tests${module_name} PRIVATE ${module_name})
 
-- 
GitLab
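
For context: M_PI is not part of standard C++, and MSVC's <cmath> only exposes
it when _USE_MATH_DEFINES is defined before inclusion, which the CMake
definition above guarantees project-wide. A minimal in-source sketch of the
same idea (an alternative to the CMake route, not part of the patch):

    // Must appear before the first inclusion of <cmath>/<math.h> on MSVC.
    #define _USE_MATH_DEFINES
    #include <cmath>
    #include <iostream>

    // Defensive fallback in case the platform still lacks M_PI.
    #ifndef M_PI
    #define M_PI 3.14159265358979323846
    #endif

    int main() {
        std::cout << M_PI << '\n'; // 3.14159...
    }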


From d95abd33ca7b9a7ee742452e87277883983f45da Mon Sep 17 00:00:00 2001
From: cmoineau <cyril.moineau@cea.fr>
Date: Wed, 26 Mar 2025 09:47:41 +0000
Subject: [PATCH 069/108] Fix clip implementation + add ReLU int8.

---
 .../aidge/backend/cpu/operator/ClipImpl_kernels.hpp   | 11 ++++++-----
 .../aidge/backend/cpu/operator/ReLUImpl_kernels.hpp   |  5 ++++-
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/ClipImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ClipImpl_kernels.hpp
index 1afac469..f7a64585 100644
--- a/include/aidge/backend/cpu/operator/ClipImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ClipImpl_kernels.hpp
@@ -23,13 +23,14 @@ void ClipImpl_cpu_forward_kernel(
         float max_,
         const void* input_,
         const std::size_t length,
-        void* output_) 
+        void* output_)
 {
     const I* input = static_cast<const I*>(input_);
     O* output = static_cast<O*>(output_);
-
+    I minCasted = static_cast<I>(min_);
+    I maxCasted = static_cast<I>(max_);
     for (std::size_t i = 0; i < length; ++i) {
-        output[i] = std::min(std::max(static_cast<float>(input[i]), min_), max_);
+        output[i] = std::min(std::max(input[i], minCasted), maxCasted);
     }
 }
 
@@ -38,9 +39,9 @@ void ClipImpl_cpu_backward_kernel(
         float min_,
         float max_,
         const std::size_t length,
-        const void* input_, 
+        const void* input_,
         const void* grad_output_,
-		void* grad_input_)           
+		void* grad_input_)
 {
     const I* input = static_cast<const I*>(input_);
     const GO* grad_output = static_cast<const GO*>(grad_output_);
diff --git a/include/aidge/backend/cpu/operator/ReLUImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ReLUImpl_kernels.hpp
index bb5d7cc3..6b7c3c9c 100644
--- a/include/aidge/backend/cpu/operator/ReLUImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ReLUImpl_kernels.hpp
@@ -60,7 +60,10 @@ REGISTRAR(ReLUImpl_cpu,
     {ProdConso::inPlaceModel, Aidge::ReLUImpl_cpu_forward_kernel<double, double>, Aidge::ReLUImpl_cpu_backward_kernel<double, double, double>});
 REGISTRAR(ReLUImpl_cpu,
     {DataType::Int32},
-    {ProdConso::inPlaceModel, Aidge::ReLUImpl_cpu_forward_kernel<int32_t, int32_t>, Aidge::ReLUImpl_cpu_backward_kernel<int32_t, int32_t, int32_t>});
+    {ProdConso::inPlaceModel, Aidge::ReLUImpl_cpu_forward_kernel<int32_t, int32_t>, nullptr});
+REGISTRAR(ReLUImpl_cpu,
+    {DataType::Int8},
+    {ProdConso::inPlaceModel, Aidge::ReLUImpl_cpu_forward_kernel<int8_t, int8_t>, nullptr});
 }  // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_RELUIMPL_KERNELS_H_ */
-- 
GitLab
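
A sketch of the precision issue the reworked clip kernel avoids for wide
integer types (the values below are illustrative, not taken from the patch):

    #include <algorithm>
    #include <cstdint>
    #include <iostream>

    int main() {
        // 1999999937 is not exactly representable as float: it rounds up to
        // 2000000000.0f, equal to max_, so the old float round-trip "clips"
        // a value that is in fact below the bound.
        const std::int32_t v = 1999999937;
        const float min_ = 0.0f, max_ = 2000000000.0f;

        // Old kernel: clamp in float, then narrow back to the output type.
        const std::int32_t oldStyle = static_cast<std::int32_t>(
            std::min(std::max(static_cast<float>(v), min_), max_));

        // Fixed kernel: cast the bounds once into the input type and clamp
        // natively, so the data value itself is never converted.
        const std::int32_t newStyle =
            std::min(std::max(v, static_cast<std::int32_t>(min_)),
                     static_cast<std::int32_t>(max_));

        std::cout << oldStyle << '\n'; // 2000000000 (value was altered)
        std::cout << newStyle << '\n'; // 1999999937 (left unchanged)
    }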


From 73f1daebad81944320ced26280dd0d269c4358a3 Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Thu, 27 Mar 2025 09:45:38 +0100
Subject: [PATCH 070/108] Fixed duplicate header guard

---
 include/aidge/backend/cpu/operator/CryptoHashImpl.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/CryptoHashImpl.hpp b/include/aidge/backend/cpu/operator/CryptoHashImpl.hpp
index d7f07f99..8b616c1a 100644
--- a/include/aidge/backend/cpu/operator/CryptoHashImpl.hpp
+++ b/include/aidge/backend/cpu/operator/CryptoHashImpl.hpp
@@ -9,8 +9,8 @@
  *
  ********************************************************************************/
 
-#ifndef AIDGE_CPU_OPERATOR_TANHIMPL_H_
-#define AIDGE_CPU_OPERATOR_TANHIMPL_H_
+#ifndef AIDGE_CPU_OPERATOR_CRYPTOHASHIMPL_H_
+#define AIDGE_CPU_OPERATOR_CRYPTOHASHIMPL_H_
 
 #include "aidge/backend/cpu/operator/OperatorImpl.hpp"
 #include "aidge/operator/CryptoHash.hpp"
@@ -33,4 +33,4 @@ REGISTRAR(CryptoHash_Op, "cpu", Aidge::CryptoHashImpl_cpu::create);
 }  // namespace Aidge
 #endif
 
-#endif /* AIDGE_CPU_OPERATOR_TANHIMPL_H_ */
+#endif /* AIDGE_CPU_OPERATOR_CRYPTOHASHIMPL_H_ */
-- 
GitLab


From 0e1ca6fbf90f841dab2221a9fd4cdb7534bfb2a4 Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Thu, 27 Mar 2025 22:33:42 +0100
Subject: [PATCH 071/108] TopK initial implementation

---
 .../aidge/backend/cpu/operator/TopKImpl.hpp   | 40 +++++++++
 .../backend/cpu/operator/TopKImpl_kernels.hpp | 81 +++++++++++++++++++
 src/operator/TopKImpl.cpp                     | 43 ++++++++++
 3 files changed, 164 insertions(+)
 create mode 100644 include/aidge/backend/cpu/operator/TopKImpl.hpp
 create mode 100644 include/aidge/backend/cpu/operator/TopKImpl_kernels.hpp
 create mode 100644 src/operator/TopKImpl.cpp

diff --git a/include/aidge/backend/cpu/operator/TopKImpl.hpp b/include/aidge/backend/cpu/operator/TopKImpl.hpp
new file mode 100644
index 00000000..d3060156
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/TopKImpl.hpp
@@ -0,0 +1,40 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_TOPKIMPL_H_
+#define AIDGE_CPU_OPERATOR_TOPKIMPL_H_
+
+#include <array>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/TopK.hpp"
+#include "aidge/utils/Registrar.hpp"
+#include "aidge/utils/Types.h"
+
+namespace Aidge {
+// Operator implementation entry point for the backend
+using TopKImpl_cpu = OperatorImpl_cpu<TopK_Op,
+    void(int64_t,
+        bool,
+        bool,
+        IOIndex_t,
+        const std::vector<DimSize_t>&,
+        const void*,
+        void*)>;
+
+// Implementation entry point registration to Operator
+REGISTRAR(TopK_Op, "cpu", Aidge::TopKImpl_cpu::create);
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_TOPKIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/TopKImpl_kernels.hpp b/include/aidge/backend/cpu/operator/TopKImpl_kernels.hpp
new file mode 100644
index 00000000..3993bf0d
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/TopKImpl_kernels.hpp
@@ -0,0 +1,81 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_TOPKIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_TOPKIMPL_KERNELS_H_
+
+#include <algorithm>   // std::for_each
+#include <cstddef>     // std::size_t
+#include <cstdint>     // std::int32_t
+#include <functional>  //std::multiplies
+#include <numeric>     //std::accumulate
+#include <vector>
+
+#include "aidge/backend/cpu/operator/TopKImpl.hpp"
+#include "aidge/data/Data.hpp"
+#include "aidge/operator/TopK.hpp"
+#include "aidge/utils/Registrar.hpp"
+
+namespace Aidge {
+
+template <class I, class O>
+void TopKImpl_cpu_forward_kernel(int64_t axis,
+                                 bool largest,
+                                 bool /*sorted*/,
+                                 IOIndex_t k,
+                                 const std::vector<DimSize_t>& inputDims,
+                                 const void* input_,
+                                 void* output_)
+{
+    const I* input = static_cast<const I*>(input_);
+    O* output = static_cast<O*>(output_);
+
+    const std::size_t nb_dims = inputDims.size();
+    const std::size_t stride_pre = std::accumulate(inputDims.cbegin(), inputDims.cbegin() + axis, 1, std::multiplies<std::size_t>());
+    const std::size_t stride_post = std::accumulate(inputDims.crbegin(), inputDims.crbegin() + nb_dims -1 - axis, 1, std::multiplies<std::size_t>());
+
+    const std::size_t dim_i = inputDims[axis];
+    std::vector<I> buffer(dim_i);
+
+    for (std::size_t pre = 0; pre < stride_pre; ++pre) {
+        for (std::size_t post = 0; post < stride_post; ++post) {
+            const std::size_t idx_i = pre * dim_i * stride_post + post;
+            const std::size_t idx_o = pre * k * stride_post + post;
+
+            for (size_t i = 0; i < dim_i; ++i) {
+                buffer[i] = input[idx_i + i * stride_post];
+            }
+
+            if (largest) {
+                std::partial_sort(buffer.begin(), buffer.begin() + k, buffer.end(), std::greater<I>());
+            }
+            else {
+                std::partial_sort(buffer.begin(), buffer.begin() + k, buffer.end(), std::less<I>());
+            }
+
+            std::copy(buffer.begin(), buffer.begin() + k, output + idx_o);
+        }
+    }
+}
+
+// Kernels registration to implementation entry point
+REGISTRAR(TopKImpl_cpu,
+    {DataType::Float32},
+    {ProdConso::inPlaceModel, Aidge::TopKImpl_cpu_forward_kernel<float, float>, nullptr});
+REGISTRAR(TopKImpl_cpu,
+    {DataType::Float64},
+    {ProdConso::inPlaceModel, Aidge::TopKImpl_cpu_forward_kernel<double, double>, nullptr});
+REGISTRAR(TopKImpl_cpu,
+    {DataType::Int32},
+    {ProdConso::inPlaceModel, Aidge::TopKImpl_cpu_forward_kernel<int32_t, int32_t>, nullptr});
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_TOPKIMPL_KERNELS_H_ */
diff --git a/src/operator/TopKImpl.cpp b/src/operator/TopKImpl.cpp
new file mode 100644
index 00000000..80bb87bf
--- /dev/null
+++ b/src/operator/TopKImpl.cpp
@@ -0,0 +1,43 @@
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <memory>
+#include <vector>
+
+#include "aidge/utils/Types.h"
+#include "aidge/operator/TopK.hpp"
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+
+#include "aidge/backend/cpu/operator/TopKImpl.hpp"
+#include "aidge/backend/cpu/operator/TopKImpl_kernels.hpp"
+
+template <>
+void Aidge::TopKImpl_cpu::forward() {
+    const TopK_Op& op_ = dynamic_cast<const TopK_Op&>(mOp);
+    std::int32_t axis = (op_.axis() >= 0) ? op_.axis() : op_.getInput(0)->nbDims() + op_.axis();
+
+    // Find the correct kernel type
+    const auto impl = Registrar<TopKImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Call kernel
+    impl.forward(axis,
+                op_.largest(),
+                op_.sorted(),
+                op_.k(),
+                op_.getInput(0)->dims(),
+                op_.getInput(0)->getImpl()->rawPtr(),
+                op_.getOutput(0)->getImpl()->rawPtr());
+}
+
+template <>
+void Aidge::TopKImpl_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for TopK_Op on backend cpu");
+}
-- 
GitLab
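
A worked example of the stride decomposition the kernel relies on (the dims
are illustrative): for input dims {2, 3, 4} and axis = 1, the tensor is
traversed as stride_pre = 2 outer blocks times stride_post = 4 inner columns,
i.e. 8 independent slices of length dims[axis] = 3 along the sorted axis:

    #include <cstddef>
    #include <functional>
    #include <iostream>
    #include <numeric>
    #include <vector>

    int main() {
        const std::vector<std::size_t> dims{2, 3, 4};
        const std::size_t axis = 1;
        const std::size_t nbDims = dims.size();

        // Same accumulate expressions as the kernel, on concrete dims.
        const std::size_t stridePre =
            std::accumulate(dims.cbegin(), dims.cbegin() + axis,
                            std::size_t{1}, std::multiplies<std::size_t>());
        const std::size_t stridePost =
            std::accumulate(dims.crbegin(), dims.crbegin() + nbDims - 1 - axis,
                            std::size_t{1}, std::multiplies<std::size_t>());

        std::cout << stridePre << " x " << stridePost << " slices of length "
                  << dims[axis] << '\n'; // 2 x 4 slices of length 3
    }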


From 8f05ddb126704f6f07d2948a846db7b746068422 Mon Sep 17 00:00:00 2001
From: cmoineau <cyril.moineau@cea.fr>
Date: Fri, 28 Mar 2025 10:21:33 +0000
Subject: [PATCH 072/108] Update Pybind 2.10.4 -> 2.13.6

---
 cmake/PybindModuleCreation.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmake/PybindModuleCreation.cmake b/cmake/PybindModuleCreation.cmake
index a520039f..e3fe6a73 100644
--- a/cmake/PybindModuleCreation.cmake
+++ b/cmake/PybindModuleCreation.cmake
@@ -1,10 +1,10 @@
-function(generate_python_binding pybind_module_name target_to_bind) 
+function(generate_python_binding pybind_module_name target_to_bind)
 
     find_package(Python COMPONENTS Interpreter Development.Module)
 
     Include(FetchContent)
 
-    set(PYBIND_VERSION v2.10.4)
+    set(PYBIND_VERSION v2.13.6)
     message(STATUS "Retrieving pybind ${PYBIND_VERSION} from git")
 
     FetchContent_Declare(
-- 
GitLab


From 81dfdcdb3cd536104324f2c9354427d231b5b310 Mon Sep 17 00:00:00 2001
From: cmoineau <cyril.moineau@cea.fr>
Date: Fri, 28 Mar 2025 10:22:26 +0000
Subject: [PATCH 073/108] Remove fmt header from ConvImpl_kernels.hpp

---
 include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
index 7ae9e45f..29aac6dc 100644
--- a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
@@ -14,7 +14,6 @@
 
 #include <array>
 #include <cstdint>
-#include <fmt/base.h>
 
 #include "aidge/backend/cpu/operator/ConvImpl.hpp"
 #include "aidge/utils/Registrar.hpp"
-- 
GitLab


From f44d6cf3ce7121919430c0eefeafa3ff27ee7f1d Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Fri, 28 Mar 2025 12:23:03 +0100
Subject: [PATCH 074/108] Added handling of second output

---
 .../aidge/backend/cpu/operator/TopKImpl.hpp   |  1 +
 .../backend/cpu/operator/TopKImpl_kernels.hpp | 35 ++++++++++++++-----
 src/operator/TopKImpl.cpp                     |  3 +-
 3 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/TopKImpl.hpp b/include/aidge/backend/cpu/operator/TopKImpl.hpp
index d3060156..05849b76 100644
--- a/include/aidge/backend/cpu/operator/TopKImpl.hpp
+++ b/include/aidge/backend/cpu/operator/TopKImpl.hpp
@@ -31,6 +31,7 @@ using TopKImpl_cpu = OperatorImpl_cpu<TopK_Op,
         IOIndex_t,
         const std::vector<DimSize_t>&,
         const void*,
+        void*,
         void*)>;
 
 // Implementation entry point registration to Operator
diff --git a/include/aidge/backend/cpu/operator/TopKImpl_kernels.hpp b/include/aidge/backend/cpu/operator/TopKImpl_kernels.hpp
index 3993bf0d..efe3a603 100644
--- a/include/aidge/backend/cpu/operator/TopKImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/TopKImpl_kernels.hpp
@@ -33,17 +33,19 @@ void TopKImpl_cpu_forward_kernel(int64_t axis,
                                  IOIndex_t k,
                                  const std::vector<DimSize_t>& inputDims,
                                  const void* input_,
-                                 void* output_)
+                                 void* output_,
+                                 void* indices_)
 {
     const I* input = static_cast<const I*>(input_);
     O* output = static_cast<O*>(output_);
+    int64_t* indices = static_cast<int64_t*>(indices_);
 
     const std::size_t nb_dims = inputDims.size();
     const std::size_t stride_pre = std::accumulate(inputDims.cbegin(), inputDims.cbegin() + axis, 1, std::multiplies<std::size_t>());
     const std::size_t stride_post = std::accumulate(inputDims.crbegin(), inputDims.crbegin() + nb_dims -1 - axis, 1, std::multiplies<std::size_t>());
 
     const std::size_t dim_i = inputDims[axis];
-    std::vector<I> buffer(dim_i);
+    std::vector<std::pair<I, int64_t>> buffer(dim_i);
 
     for (std::size_t pre = 0; pre < stride_pre; ++pre) {
         for (std::size_t post = 0; post < stride_post; ++post) {
@@ -51,30 +53,45 @@ void TopKImpl_cpu_forward_kernel(int64_t axis,
             const std::size_t idx_o = pre * k * stride_post + post;
 
             for (size_t i = 0; i < dim_i; ++i) {
-                buffer[i] = input[idx_i + i * stride_post];
+                const auto idx = idx_i + i * stride_post;
+                buffer[i] = std::make_pair(input[idx], idx);
             }
 
             if (largest) {
-                std::partial_sort(buffer.begin(), buffer.begin() + k, buffer.end(), std::greater<I>());
+                std::partial_sort(buffer.begin(), buffer.begin() + k, buffer.end(),
+                    [](const auto& lhs, const auto& rhs) { return lhs.first > rhs.first; });
             }
             else {
-                std::partial_sort(buffer.begin(), buffer.begin() + k, buffer.end(), std::less<I>());
+                std::partial_sort(buffer.begin(), buffer.begin() + k, buffer.end(),
+                    [](const auto& lhs, const auto& rhs) { return lhs.first < rhs.first; });
             }
 
-            std::copy(buffer.begin(), buffer.begin() + k, output + idx_o);
+            for (size_t i = 0; i < k; ++i) {
+                output[idx_o + i] = buffer[i].first;
+                indices[idx_o + i] = buffer[i].second;
+            }
         }
     }
 }
 
 // Kernels registration to implementation entry point
 REGISTRAR(TopKImpl_cpu,
-    {DataType::Float32},
+    {
+        {{DataType::Float32}},
+        {{DataType::Float32}, {DataType::Int64}}
+    },
     {ProdConso::inPlaceModel, Aidge::TopKImpl_cpu_forward_kernel<float, float>, nullptr});
 REGISTRAR(TopKImpl_cpu,
-    {DataType::Float64},
+    {
+        {{DataType::Float64}},
+        {{DataType::Float64}, {DataType::Int64}}
+    },
     {ProdConso::inPlaceModel, Aidge::TopKImpl_cpu_forward_kernel<double, double>, nullptr});
 REGISTRAR(TopKImpl_cpu,
-    {DataType::Int32},
+    {
+        {{DataType::Int32}},
+        {{DataType::Int32}, {DataType::Int64}}
+    },
     {ProdConso::inPlaceModel, Aidge::TopKImpl_cpu_forward_kernel<int32_t, int32_t>, nullptr});
 }  // namespace Aidge
 
diff --git a/src/operator/TopKImpl.cpp b/src/operator/TopKImpl.cpp
index 80bb87bf..b84ca9d1 100644
--- a/src/operator/TopKImpl.cpp
+++ b/src/operator/TopKImpl.cpp
@@ -34,7 +34,8 @@ void Aidge::TopKImpl_cpu::forward() {
                 op_.k(),
                 op_.getInput(0)->dims(),
                 op_.getInput(0)->getImpl()->rawPtr(),
-                op_.getOutput(0)->getImpl()->rawPtr());
+                op_.getOutput(0)->getImpl()->rawPtr(),
+                op_.getOutput(1)->getImpl()->rawPtr());
 }
 
 template <>
-- 
GitLab


From 8cc9edbace4454b0f3307081361dcf236f7b5efa Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Fri, 28 Mar 2025 15:35:45 +0100
Subject: [PATCH 075/108] Fixed missing clamping

---
 .../backend/cpu/operator/SliceImpl_kernels.hpp      | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/SliceImpl_kernels.hpp b/include/aidge/backend/cpu/operator/SliceImpl_kernels.hpp
index d290c40f..9ae42534 100644
--- a/include/aidge/backend/cpu/operator/SliceImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/SliceImpl_kernels.hpp
@@ -48,13 +48,16 @@ void SliceImpl_cpu_forward_kernel(const std::vector<std::int64_t>& starts,
                                                 static_cast<DimSize_t>(starts[i]) :
                                                 static_cast<DimSize_t>(starts[i] + static_cast<std::int64_t>(inputDims[axis])),
                                          dims[axis]-1);
-        const DimSize_t end = ends[i] >= 0 ?
+        const DimSize_t end = std::min(ends[i] >= 0 ?
                                         static_cast<DimSize_t>(ends[i]) :
-                                        static_cast<DimSize_t>(ends[i] + static_cast<std::int64_t>(inputDims[axis]));
+                                        static_cast<DimSize_t>(ends[i] + static_cast<std::int64_t>(inputDims[axis])),
+                                         dims[axis]);
         const std::int64_t step = steps[i];
 
         const std::size_t sliceSize = static_cast<std::size_t>(std::ceil((static_cast<float>(end) - static_cast<float>(start)) / static_cast<float>(step)));
 
+        totalSize /= dims[axis];
+        totalSize *= sliceSize;
         outputAccumulation = new I[totalSize];
         const std::size_t stride_pre = std::accumulate(dims.cbegin(), dims.cbegin() + axis, 1, std::multiplies<std::size_t>());
         const std::size_t stride_post = std::accumulate(dims.crbegin(), dims.crbegin() + nbDims -1 - axis, 1, std::multiplies<std::size_t>());
@@ -62,17 +65,13 @@ void SliceImpl_cpu_forward_kernel(const std::vector<std::int64_t>& starts,
         {
             const std::size_t idx_in = outer * stride_post * dims[axis] + start * stride_post;
             const std::size_t idx_out = outer * stride_post * sliceSize;
-            std::size_t addedSlices = 0;
             for (std::size_t inner = 0; inner < sliceSize; ++inner)
             {
                 std::copy_n(std::next(inputAccumulation, idx_in + inner * step * stride_post),
                             stride_post,
-                            std::next(outputAccumulation, idx_out + addedSlices * stride_post));
-                addedSlices++;
+                            std::next(outputAccumulation, idx_out + inner * stride_post));
             }
         }
-        totalSize /= dims[axis];
-        totalSize *= sliceSize;
         dims[axis] = sliceSize;
 
         if (inputAccumulation != input) {
-- 
GitLab
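
A small standalone sketch of the bound normalization this patch completes
(normalizeEnd is a hypothetical helper, not a function from the source): a
negative end wraps around the dimension, and the added std::min keeps an
oversized end from running past it:

    #include <algorithm>
    #include <cstdint>
    #include <iostream>

    // Wrap a possibly negative ONNX-style end index, then clamp to [0, dim].
    std::size_t normalizeEnd(std::int64_t end, std::size_t dim) {
        const std::int64_t d = static_cast<std::int64_t>(dim);
        const std::int64_t wrapped = (end >= 0) ? end : end + d;
        return static_cast<std::size_t>(
            std::max<std::int64_t>(0, std::min(wrapped, d)));
    }

    int main() {
        std::cout << normalizeEnd(-2, 10) << '\n';  // 8
        std::cout << normalizeEnd(100, 10) << '\n'; // 10, clamped by the fix
    }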


From c1a6d7693f3a794749f3e1f48e2f765ca2836ea2 Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Fri, 28 Mar 2025 15:51:11 +0100
Subject: [PATCH 076/108] Fixed TopK registration

---
 include/aidge/backend/cpu.hpp                           | 1 +
 include/aidge/backend/cpu/operator/TopKImpl_kernels.hpp | 6 +++---
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/include/aidge/backend/cpu.hpp b/include/aidge/backend/cpu.hpp
index 5c1f9b11..b9334dca 100644
--- a/include/aidge/backend/cpu.hpp
+++ b/include/aidge/backend/cpu.hpp
@@ -58,6 +58,7 @@
 #include "aidge/backend/cpu/operator/SliceImpl.hpp"
 #include "aidge/backend/cpu/operator/SoftmaxImpl.hpp"
 #include "aidge/backend/cpu/operator/SubImpl.hpp"
+#include "aidge/backend/cpu/operator/TopKImpl.hpp"
 #include "aidge/backend/cpu/operator/TanhImpl.hpp"
 #include "aidge/backend/cpu/operator/WeightInterleavedImpl.hpp"
 
diff --git a/include/aidge/backend/cpu/operator/TopKImpl_kernels.hpp b/include/aidge/backend/cpu/operator/TopKImpl_kernels.hpp
index efe3a603..69d66913 100644
--- a/include/aidge/backend/cpu/operator/TopKImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/TopKImpl_kernels.hpp
@@ -77,19 +77,19 @@ void TopKImpl_cpu_forward_kernel(int64_t axis,
 // Kernels registration to implementation entry point
 REGISTRAR(TopKImpl_cpu,
     {
-        {{DataType::Float32}},
+        {{DataType::Float32}, {DataType::Any}},
         {{DataType::Float32}, {DataType::Int64}}
     },
     {ProdConso::inPlaceModel, Aidge::TopKImpl_cpu_forward_kernel<float, float>, nullptr});
 REGISTRAR(TopKImpl_cpu,
     {
-        {{DataType::Float64}},
+        {{DataType::Float64}, {DataType::Any}},
         {{DataType::Float64}, {DataType::Int64}}
     },
     {ProdConso::inPlaceModel, Aidge::TopKImpl_cpu_forward_kernel<double, double>, nullptr});
 REGISTRAR(TopKImpl_cpu,
     {
-        {{DataType::Int32}},
+        {{DataType::Int32}, {DataType::Any}},
         {{DataType::Int32}, {DataType::Int64}}
     },
     {ProdConso::inPlaceModel, Aidge::TopKImpl_cpu_forward_kernel<int32_t, int32_t>, nullptr});
-- 
GitLab


From 52287392d13c027d0a47a58c6ae3bb5fdf57385a Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Fri, 28 Mar 2025 16:09:15 +0100
Subject: [PATCH 077/108] Fixed indices to conform to ONNX

---
 include/aidge/backend/cpu/operator/TopKImpl_kernels.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/aidge/backend/cpu/operator/TopKImpl_kernels.hpp b/include/aidge/backend/cpu/operator/TopKImpl_kernels.hpp
index 69d66913..3d6bc3c8 100644
--- a/include/aidge/backend/cpu/operator/TopKImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/TopKImpl_kernels.hpp
@@ -54,7 +54,7 @@ void TopKImpl_cpu_forward_kernel(int64_t axis,
 
             for (size_t i = 0; i < dim_i; ++i) {
                 const auto idx = idx_i + i * stride_post;
-                buffer[i] = std::make_pair(input[idx], idx);
+                buffer[i] = std::make_pair(input[idx], i);
             }
 
             if (largest) {
-- 
GitLab
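
A condensed, single-slice view of the selection logic as it stands after this
fix (the buffer pairs each value with its position i along the axis, which is
what the ONNX Indices output expects; the data values are illustrative):

    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <utility>
    #include <vector>

    int main() {
        const std::vector<float> slice{0.3f, 2.5f, -1.0f, 1.7f, 0.9f};
        const std::size_t k = 2;
        const bool largest = true;

        std::vector<std::pair<float, std::int64_t>> buffer(slice.size());
        for (std::size_t i = 0; i < slice.size(); ++i)
            buffer[i] = std::make_pair(slice[i], static_cast<std::int64_t>(i));

        // Same partial_sort-on-pairs strategy as the kernel.
        std::partial_sort(buffer.begin(), buffer.begin() + k, buffer.end(),
            [largest](const auto& lhs, const auto& rhs) {
                return largest ? lhs.first > rhs.first : lhs.first < rhs.first;
            });

        for (std::size_t i = 0; i < k; ++i)
            std::cout << buffer[i].first << " @ axis index "
                      << buffer[i].second << '\n'; // 2.5 @ 1, then 1.7 @ 3
    }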


From a5c69ad94e12263d58578b594d43ace3a91effeb Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Fri, 28 Mar 2025 17:40:06 +0100
Subject: [PATCH 078/108] Added missing MaxPoolingImpl registration

---
 include/aidge/backend/cpu.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/aidge/backend/cpu.hpp b/include/aidge/backend/cpu.hpp
index b9334dca..1f9dd830 100644
--- a/include/aidge/backend/cpu.hpp
+++ b/include/aidge/backend/cpu.hpp
@@ -42,6 +42,7 @@
 #include "aidge/backend/cpu/operator/LeakyReLUImpl.hpp"
 #include "aidge/backend/cpu/operator/LnImpl.hpp"
 #include "aidge/backend/cpu/operator/MatMulImpl.hpp"
+#include "aidge/backend/cpu/operator/MaxPoolingImpl.hpp"
 #include "aidge/backend/cpu/operator/ModImpl.hpp"
 #include "aidge/backend/cpu/operator/MulImpl.hpp"
 #include "aidge/backend/cpu/operator/PadImpl.hpp"
-- 
GitLab


From afd3dee9b46a2dfb3b33e6fe192e2b7d975148d2 Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Mon, 31 Mar 2025 14:13:56 +0200
Subject: [PATCH 079/108] Fix handling of negative axis

---
 src/operator/ArgMaxImpl.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/operator/ArgMaxImpl.cpp b/src/operator/ArgMaxImpl.cpp
index b8fb85a7..5829070a 100644
--- a/src/operator/ArgMaxImpl.cpp
+++ b/src/operator/ArgMaxImpl.cpp
@@ -21,12 +21,13 @@
 template <>
 void Aidge::ArgMaxImpl_cpu::forward() {
     const ArgMax_Op& op_ = dynamic_cast<const ArgMax_Op&>(mOp);
+    std::int32_t axis = (op_.axis() >= 0) ? op_.axis() : op_.getInput(0)->nbDims() + op_.axis();
 
     // Find the correct kernel type
     const auto impl = Registrar<ArgMaxImpl_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Call kernel
-    impl.forward(op_.axis(),
+    impl.forward(axis,
                 op_.selectLastIndex(),
                 op_.getInput(0)->dims(),
                 op_.getInput(0)->getImpl()->rawPtr(),
-- 
GitLab
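
Note: the normalization above follows the usual ONNX convention where a
negative axis counts from the last dimension. A minimal standalone sketch of
the same computation, with worked checks matching the patch's expression:

    #include <cassert>
    #include <cstdint>

    // Map a possibly negative axis into [0, rank): axis >= 0 ? axis : rank + axis.
    std::int32_t normalizeAxis(std::int32_t axis, std::int32_t rank) {
        return (axis >= 0) ? axis : rank + axis;
    }

    int main() {
        assert(normalizeAxis(-1, 4) == 3); // last axis of a 4-D tensor
        assert(normalizeAxis(-4, 4) == 0); // first axis
        assert(normalizeAxis(2, 4) == 2);  // non-negative axes pass through
    }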


From 54455ecf15e428c0745534be0ef032af54ce35a1 Mon Sep 17 00:00:00 2001
From: NAUD Maxence <maxence.naud@cea.fr>
Date: Wed, 26 Mar 2025 17:10:26 +0100
Subject: [PATCH 080/108] fix: error caused by calling 'fma' with mixed input
 types; also simplify GlobalAveragePooling logic

---
 .../cpu/operator/GlobalAveragePoolingImpl.hpp |  3 +-
 .../GlobalAveragePoolingImpl_kernels.hpp      | 95 ++++++++-----------
 .../backend/cpu/operator/ReduceMeanImpl.hpp   |  1 -
 .../cpu/operator/ReduceMeanImpl_kernels.hpp   | 20 ++--
 src/operator/GlobalAveragePoolingImpl.cpp     |  6 +-
 5 files changed, 52 insertions(+), 73 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl.hpp b/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl.hpp
index 4e04b1a5..a71174c0 100644
--- a/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl.hpp
+++ b/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl.hpp
@@ -18,12 +18,11 @@
 #include "aidge/backend/cpu/operator/OperatorImpl.hpp"
 #include "aidge/operator/GlobalAveragePooling.hpp"
 #include "aidge/utils/Registrar.hpp"
-#include "aidge/utils/Types.h"
 
 namespace Aidge {
 // Operator implementation entry point for the backend
 using GlobalAveragePoolingImpl_cpu = OperatorImpl_cpu<GlobalAveragePooling_Op,
-    void(const std::vector<DimSize_t> &, const void *, void *)>;
+    void(const std::shared_ptr<Tensor>&, void *)>;
 
 // Implementation entry point registration to Operator
 REGISTRAR(GlobalAveragePooling_Op, "cpu", Aidge::GlobalAveragePoolingImpl_cpu::create);
diff --git a/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp
index 7a47ccf3..40dd3a69 100644
--- a/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp
@@ -12,92 +12,71 @@
 #ifndef AIDGE_CPU_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_KERNELS_H_
 #define AIDGE_CPU_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_KERNELS_H_
 
-#include <cstddef>
-#include <functional>  // std::multiplies
-#include <numeric>     // std::accumulate
+#include <cstddef>     // std::size_t
 #include <vector>
 
 #include "aidge/backend/cpu/operator/GlobalAveragePoolingImpl.hpp"
-#include "aidge/data/Data.hpp"
-#include "aidge/utils/ErrorHandling.hpp"
+#include "aidge/data/Tensor.hpp"
 #include "aidge/utils/Registrar.hpp"
 #include "aidge/utils/Types.h"
 
-
 namespace Aidge {
 
 template <typename T>
-typename std::enable_if<std::is_floating_point<T>::value, T>::type
-stableMean(const T* vec, size_t size) {
-  T mean = 0;
-  for (size_t i = 0; i < size; ++i) {
-    mean = std::fma<T>(vec[i] - mean, 1.0f / (i + 1), mean);
-  }
-  return mean;
+typename std::enable_if_t<std::is_floating_point<T>::value, T>
+static stableMean(const T* vec, std::size_t size) {
+    T mean{0};
+    for (std::size_t i = 0; i < size; ++i) {
+        mean = std::fma(vec[i] - mean, static_cast<T>(1) / static_cast<T>(i + 1), mean);
+    }
+    return mean;
 }
 
 // Specialization for integers: perform the mean computation in float
 template <typename T>
-typename std::enable_if<!std::is_floating_point<T>::value, double>::type
-stableMean(const T* vec, size_t size) {
-  double mean = 0;
-  for (size_t i = 0; i < size; ++i) {
-    mean = std::fma<double>(vec[i] - mean, 1.0f / (i + 1), mean);
-  }
-  return mean;
-}
-
-template <typename T>
-typename std::enable_if<std::is_floating_point<T>::value, T>::type
-castFromFloat(T value) {
-  return value;
-}
-
-template <typename T>
-typename std::enable_if<!std::is_floating_point<T>::value, T>::type
-castFromFloat(double value) {
-  return static_cast<T>(std::nearbyint(value));
+typename std::enable_if_t<!std::is_floating_point<T>::value, double>
+static stableMean(const T* vec, std::size_t size) {
+    double mean{0};
+    for (std::size_t i = 0; i < size; ++i) {
+        mean = std::fma<double>(static_cast<double>(vec[i]) - mean, 1.0 / static_cast<double>(i + 1), mean);
+    }
+    return mean;
 }
 
-template <class I, class O>
-void GlobalAveragePoolingImpl_cpu_forward_kernel(
-    const std::vector<DimSize_t> &dims, const void *input_, void *output_) {
-  // error checking
-    AIDGE_ASSERT(dims.size() >= 3,"GlobalAveragePool needs at least a 3 dimensions "
-                 "input, number of input dim : {}",
-                 dims.size());
+template <DataType DT_I, DataType DT_O = DT_I>
+void GlobalAveragePoolingImpl_cpu_forward_kernel(const std::shared_ptr<Tensor>& inputTensor, void *output_) {
 
-  // computation
-  const I *input = static_cast<const I *>(input_);
-  O *output = static_cast<O *>(output_);
+    // computation
+    using I = cpptype_t<DT_I>;
+    using O = cpptype_t<DT_O>;
+    const I *input = static_cast<const I *>(inputTensor->getImpl()->rawPtr());
+    O *output = static_cast<O *>(output_);
+    const auto& dims = inputTensor->dims();
 
-  DimSize_t nb_elems = std::accumulate(dims.begin(), dims.end(), std::size_t(1),
-                                       std::multiplies<std::size_t>());
+    const DimSize_t strides_channels = inputTensor->strides()[1];
 
-  const DimSize_t in_batch_nb_elems{nb_elems / dims[0]};
-  const DimSize_t in_channel_nb_elems{in_batch_nb_elems / dims[1]};
-  const DimSize_t out_batch_nb_elems{dims[1]};
-  // parse channel by channel and fill each output with the average of the
-  // values in the channel
-  for (DimSize_t batch = 0; batch < dims[0]; ++batch) {
-    for (DimSize_t channel = 0; channel < dims[1]; ++channel) {
-      const I *filter_start = std::next(
-          input, (batch * in_batch_nb_elems) + (channel * in_channel_nb_elems));
-      output[batch * out_batch_nb_elems + channel] = castFromFloat<O>(stableMean<I>(filter_start, in_channel_nb_elems));
+    // parse channel by channel and fill each output with the average of the
+    // values in the channel
+    std::size_t input_idx = 0;
+    std::size_t output_idx = 0;
+    for (DimSize_t batch = 0; batch < dims[0]; ++batch) {
+        for (DimSize_t channel = 0; channel < dims[1]; ++channel) {
+            output[output_idx++] = static_cast<O>(stableMean<I>(input + input_idx, strides_channels));
+            input_idx += strides_channels;
+        }
     }
-  }
 }
 
 // Kernels registration to implementation entry point
 REGISTRAR(GlobalAveragePoolingImpl_cpu,
     {DataType::Float32},
-    {ProdConso::defaultModel, Aidge::GlobalAveragePoolingImpl_cpu_forward_kernel<float, float>, nullptr});
+    {ProdConso::defaultModel, Aidge::GlobalAveragePoolingImpl_cpu_forward_kernel<DataType::Float32>, nullptr});
 REGISTRAR(GlobalAveragePoolingImpl_cpu,
     {DataType::Float64},
-    {ProdConso::defaultModel, Aidge::GlobalAveragePoolingImpl_cpu_forward_kernel<double, double>, nullptr});
+    {ProdConso::defaultModel, Aidge::GlobalAveragePoolingImpl_cpu_forward_kernel<DataType::Float64>, nullptr});
 REGISTRAR(GlobalAveragePoolingImpl_cpu,
     {DataType::Int32},
-    {ProdConso::defaultModel, Aidge::GlobalAveragePoolingImpl_cpu_forward_kernel<int32_t, int32_t>, nullptr});
+    {ProdConso::defaultModel, Aidge::GlobalAveragePoolingImpl_cpu_forward_kernel<DataType::Int32>, nullptr});
 } // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_GLOBALAVERAGEPOOLINGIMPL_KERNELS_H_ */
diff --git a/include/aidge/backend/cpu/operator/ReduceMeanImpl.hpp b/include/aidge/backend/cpu/operator/ReduceMeanImpl.hpp
index 1c50805d..d6c60c35 100644
--- a/include/aidge/backend/cpu/operator/ReduceMeanImpl.hpp
+++ b/include/aidge/backend/cpu/operator/ReduceMeanImpl.hpp
@@ -12,7 +12,6 @@
 #ifndef AIDGE_CPU_OPERATOR_REDUCEMEANIMPL_H_
 #define AIDGE_CPU_OPERATOR_REDUCEMEANIMPL_H_
 
-#include <array>
 #include <memory>
 #include <tuple>
 #include <vector>
diff --git a/include/aidge/backend/cpu/operator/ReduceMeanImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ReduceMeanImpl_kernels.hpp
index a1562322..73aa283d 100644
--- a/include/aidge/backend/cpu/operator/ReduceMeanImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ReduceMeanImpl_kernels.hpp
@@ -25,39 +25,39 @@
 #include "aidge/utils/Registrar.hpp"
 
 namespace Aidge {
-    
+
 template <typename T>
-using Acc_T = typename std::conditional<std::is_floating_point<T>::value, T, double>::type;
+using Acc_T = typename std::conditional_t<std::is_floating_point<T>::value, T, double>;
 
 template <typename T>
 typename std::enable_if<std::is_floating_point<T>::value, T>::type
-stableMean(const T* vec, size_t len, size_t stride) {
+stableMean(const T* vec, std::size_t len, std::size_t stride) {
   T mean = 0;
-  for (size_t i = 0; i < len; ++i) {
-    mean = std::fma<T>(vec[i * stride] - mean, 1.0f / (i + 1), mean);
+  for (std::size_t i = 0; i < len; ++i) {
+    mean = std::fma(vec[i * stride] - mean, static_cast<T>(1) / static_cast<T>(i + 1), mean);
   }
   return mean;
 }
 
 // Specialization for integers: perform the mean computation in float
 template <typename T>
-typename std::enable_if<!std::is_floating_point<T>::value, double>::type
-stableMean(const T* vec, size_t len, size_t stride) {
+typename std::enable_if_t<!std::is_floating_point<T>::value, double>
+stableMean(const T* vec, std::size_t len, std::size_t stride) {
   double mean = 0;
   for (size_t i = 0; i < len; ++i) {
-    mean = std::fma<double>(vec[i * stride] - mean, 1.0f / (i + 1), mean);
+    mean = std::fma<double>(static_cast<double>(vec[i * stride]) - mean, 1.0 / static_cast<double>(i + 1), mean);
   }
   return mean;
 }
 
 template <typename T>
-typename std::enable_if<std::is_floating_point<T>::value, T>::type
+typename std::enable_if_t<std::is_floating_point<T>::value, T>
 castFromFloat(T value) {
   return value;
 }
 
 template <typename T>
-typename std::enable_if<!std::is_floating_point<T>::value, T>::type
+typename std::enable_if_t<!std::is_floating_point<T>::value, T>
 castFromFloat(double value) {
   return static_cast<T>(std::nearbyint(value));
 }
diff --git a/src/operator/GlobalAveragePoolingImpl.cpp b/src/operator/GlobalAveragePoolingImpl.cpp
index c53f92e1..1b6d9a06 100644
--- a/src/operator/GlobalAveragePoolingImpl.cpp
+++ b/src/operator/GlobalAveragePoolingImpl.cpp
@@ -30,13 +30,15 @@ void Aidge::GlobalAveragePoolingImpl_cpu::forward()
     const GlobalAveragePooling_Op& op_ = static_cast<const GlobalAveragePooling_Op&>(mOp);
     // Check if input is provided
     AIDGE_ASSERT(op_.getInput(0), "missing input 0");
+    // error checking
+    AIDGE_ASSERT(op_.getInput(0)->nbDims() >= 3, "GlobalAveragePool needs at least a 3-dimensional "
+    "input. Got input dims {}", op_.getInput(0)->dims());
 
     // Find the correct kernel type
     const auto impl = Registrar<GlobalAveragePoolingImpl_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Call kernel
-    impl.forward(op_.getInput(0)->dims(),
-               op_.getInput(0)->getImpl()->rawPtr(),
+    impl.forward(op_.getInput(0),
                op_.getOutput(0)->getImpl()->rawPtr());
 }
 
-- 
GitLab
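
Note: stableMean computes a running mean, mean_(i+1) = mean_i + (x_i - mean_i) / (i + 1),
instead of sum-then-divide; this keeps the accumulator bounded on long
channels, and keeping all three std::fma operands a single type is what fixes
the mixed-type 'fma' error the subject refers to. A standalone sketch of the
recurrence with a worked check:

    #include <cassert>
    #include <cmath>
    #include <cstddef>

    // Running mean: after element i (0-based), 'mean' is the mean of x[0..i].
    double runningMean(const double* x, std::size_t n) {
        double mean = 0.0;
        for (std::size_t i = 0; i < n; ++i)
            mean = std::fma(x[i] - mean, 1.0 / static_cast<double>(i + 1), mean);
        return mean;
    }

    int main() {
        const double x[] = {1.0, 2.0, 3.0, 4.0};
        assert(std::fabs(runningMean(x, 4) - 2.5) < 1e-12); // (1+2+3+4)/4 = 2.5
    }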


From 9fea6486fd0dda540b38307ac962b1b3a68570b8 Mon Sep 17 00:00:00 2001
From: NAUD Maxence <maxence.naud@cea.fr>
Date: Wed, 26 Mar 2025 17:10:43 +0100
Subject: [PATCH 081/108] fix: use aidge 'approxEq' function instead of manual
 comparison and change CHECKs to REQUIREs in interpolation test

---
 unit_tests/data/Test_Interpolation.cpp      | 62 +++++++++++----------
 unit_tests/operator/Test_ReduceMeanImpl.cpp | 10 ++--
 unit_tests/scheduler/Test_Scheduler.cpp     | 16 +++---
 3 files changed, 47 insertions(+), 41 deletions(-)

diff --git a/unit_tests/data/Test_Interpolation.cpp b/unit_tests/data/Test_Interpolation.cpp
index 5c3b56f0..4886885d 100644
--- a/unit_tests/data/Test_Interpolation.cpp
+++ b/unit_tests/data/Test_Interpolation.cpp
@@ -9,15 +9,21 @@
  *
  ********************************************************************************/
 
-#include <aidge/backend/cpu/data/Interpolation.hpp>
-#include <aidge/data/Interpolation.hpp>
-#include <aidge/data/Tensor.hpp>
-#include <aidge/filler/Filler.hpp>
-#include <aidge/utils/Types.h>
-#include <catch2/catch_test_macros.hpp>
+#include <cmath>   // std::fabs
+#include <cstdlib> // std::abs
 #include <limits>
+#include <memory>
+#include <set>
+#include <vector>
+
+#include <catch2/catch_test_macros.hpp>
 
 #include "aidge/backend/cpu/data/Interpolation.hpp"
+#include "aidge/data/Interpolation.hpp"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/filler/Filler.hpp"
+#include "aidge/utils/Types.h"
+#include "aidge/utils/TensorUtils.hpp"
 
 namespace Aidge {
 
@@ -30,12 +36,12 @@ TEST_CASE("Interpolation", "[Interpolation][Data]") {
         SECTION("1D") {
             pointsToInterpolateInt =
                 std::set<Interpolation::Point<int>>({{{0}, 10}, {{1}, 20}});
-            CHECK(abs(InterpolationCPU::linear({0.5}, pointsToInterpolateInt) -
+            REQUIRE(std::abs(InterpolationCPU::linear({0.5}, pointsToInterpolateInt) -
                       15) <= std::numeric_limits<int>::epsilon());
 
             pointsToInterpolateFloat = std::set<Interpolation::Point<float>>(
                 {{{0}, .0F}, {{1}, 0.2F}});
-            CHECK(fabs(InterpolationCPU::linear({0.3},
+            REQUIRE(std::fabs(InterpolationCPU::linear({0.3},
                                                 pointsToInterpolateFloat) -
                        .06F) <= 1e-5);
         }
@@ -46,21 +52,21 @@ TEST_CASE("Interpolation", "[Interpolation][Data]") {
                                         {{14, 21}, 162.F},
                                         {{15, 20}, 210.F},
                                         {{15, 21}, 95.F}};
-            CHECK(fabs(InterpolationCPU::linear<float>(
-                           {14.5F, 20.2F},
-                           pointsToInterpolateFloat) -
-                       146.1) < 1e-5);
+            const Tensor interpolatedValue = Tensor(std::fabs(InterpolationCPU::linear<float>(
+                {14.5F, 20.2F},
+                pointsToInterpolateFloat)));
+            REQUIRE(approxEq<float>(interpolatedValue, Tensor(146.1f)));
             // pointsToInterpolateFloat = {{{0, 0}, .10F},
             //                             {{0, 1}, .20F},
             //                             {{1, 0}, .30F},
             //                             {{1, 1}, .40F}};
-            // CHECK(abs(InterpolationCPU::linear<float>({1.5, 0.5},
+            // REQUIRE(std::abs(InterpolationCPU::linear<float>({1.5, 0.5},
             //                                         pointsToInterpolateInt)
             //                                         -
             //           25) < std::numeric_limits<int>::epsilon());
 
             // pointsToInterpolateFloat = std::vector({0.1F, 0.2F, 0.3F,
-            // 0.4F}); CHECK(InterpolationCPU::linear(pointsToInterpolateFloat)
+            // 0.4F}); REQUIRE(InterpolationCPU::linear(pointsToInterpolateFloat)
             // == .25f);
         }
         SECTION("3D") {
@@ -72,7 +78,7 @@ TEST_CASE("Interpolation", "[Interpolation][Data]") {
                                         {{1, 0, 1}, .6F},
                                         {{1, 1, 0}, .7F},
                                         {{1, 1, 1}, .8F}};
-            CHECK(fabs(InterpolationCPU::linear({.5, .5, .5},
+            REQUIRE(std::fabs(InterpolationCPU::linear({.5, .5, .5},
                                                 pointsToInterpolateFloat) -
                        .45f) < 1e-5);
         }
@@ -94,7 +100,7 @@ TEST_CASE("Interpolation", "[Interpolation][Data]") {
                                             {{1, 1, 0, 1}, 1.4F},
                                             {{1, 1, 1, 0}, 1.5F},
                                             {{1, 1, 1, 1}, 1.6F}};
-                CHECK(fabs(InterpolationCPU::linear<float>(
+                REQUIRE(std::fabs(InterpolationCPU::linear<float>(
                                {.5, .5, .5, .5},
                                pointsToInterpolateFloat) -
                            .85f) < 0.0001);
@@ -139,25 +145,25 @@ TEST_CASE("Interpolation", "[Interpolation][Data]") {
                                                       {{4}, 5.0F}};
 
             SECTION("Floor") {
-                CHECK(InterpolationCPU::nearest(
+                REQUIRE(InterpolationCPU::nearest(
                           coordToInterpolate,
                           pointsToInterpolate,
                           Interpolation::Mode::Floor) == 1);
             }
             SECTION("Ceil") {
-                CHECK(InterpolationCPU::nearest(
+                REQUIRE(InterpolationCPU::nearest(
                           coordToInterpolate,
                           pointsToInterpolate,
                           Interpolation::Mode::Ceil) == 2);
             }
             SECTION("RoundPreferFloor") {
-                CHECK(InterpolationCPU::nearest(
+                REQUIRE(InterpolationCPU::nearest(
                           coordToInterpolate,
                           pointsToInterpolate,
                           Interpolation::Mode::RoundPreferFloor) == 1);
             }
             SECTION("RoundPreferCeil") {
-                CHECK(InterpolationCPU::nearest(
+                REQUIRE(InterpolationCPU::nearest(
                           coordToInterpolate,
                           pointsToInterpolate,
                           Interpolation::Mode::RoundPreferCeil) == 2);
@@ -172,26 +178,26 @@ TEST_CASE("Interpolation", "[Interpolation][Data]") {
                                    {{3, 3}, 50.0},
                                    {{3, 4}, 60.0}};
             SECTION("Floor") {
-                CHECK(InterpolationCPU::nearest(
+                REQUIRE(InterpolationCPU::nearest(
                           coordToInterpolate,
                           pointsToInterpolate,
                           Interpolation::Mode::Floor) == 30.);
             }
             SECTION("Ceil") {
-                CHECK(InterpolationCPU::nearest(
+                REQUIRE(InterpolationCPU::nearest(
                           coordToInterpolate,
                           pointsToInterpolate,
                           Interpolation::Mode::Ceil) == 60.);
             }
             SECTION("RoundPreferFloor") {
-                CHECK(InterpolationCPU::nearest(
+                REQUIRE(InterpolationCPU::nearest(
                           coordToInterpolate,
                           pointsToInterpolate,
                           Interpolation::Mode::RoundPreferFloor) ==
                       40.);
             }
             SECTION("RoundPreferCeil") {
-                CHECK(InterpolationCPU::nearest(
+                REQUIRE(InterpolationCPU::nearest(
                           coordToInterpolate,
                           pointsToInterpolate,
                           Interpolation::Mode::RoundPreferCeil) == 60.);
@@ -207,26 +213,26 @@ TEST_CASE("Interpolation", "[Interpolation][Data]") {
                                    {{2, 3, 4}, 50.0},
                                    {{3, 3, 4}, 60.0}};
             SECTION("Floor") {
-                CHECK(InterpolationCPU::nearest(
+                REQUIRE(InterpolationCPU::nearest(
                           coordToInterpolate,
                           pointsToInterpolate,
                           Interpolation::Mode::Floor) == 10.);
             }
             SECTION("Ceil") {
-                CHECK(InterpolationCPU::nearest(
+                REQUIRE(InterpolationCPU::nearest(
                           coordToInterpolate,
                           pointsToInterpolate,
                           Interpolation::Mode::Ceil) == 50.);
             }
             SECTION("RoundPreferFloor") {
-                CHECK(InterpolationCPU::nearest(
+                REQUIRE(InterpolationCPU::nearest(
                           coordToInterpolate,
                           pointsToInterpolate,
                           Interpolation::Mode::RoundPreferFloor) ==
                       30.);
             }
             SECTION("RoundPreferCeil") {
-                CHECK(InterpolationCPU::nearest(
+                REQUIRE(InterpolationCPU::nearest(
                           coordToInterpolate,
                           pointsToInterpolate,
                           Interpolation::Mode::RoundPreferCeil) == 30.);
diff --git a/unit_tests/operator/Test_ReduceMeanImpl.cpp b/unit_tests/operator/Test_ReduceMeanImpl.cpp
index 30ffeb0d..8841d677 100644
--- a/unit_tests/operator/Test_ReduceMeanImpl.cpp
+++ b/unit_tests/operator/Test_ReduceMeanImpl.cpp
@@ -156,7 +156,7 @@ TEST_CASE("[cpu/operator] ReduceMean(forward)", "[ReduceMean][CPU]") {
     }
     SECTION("KeepDims") {
         SECTION("test 1") {
-            std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> {
+            std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<cpptype_t<DataType::Float32>,3,2,2> {
                 {
                     {
                         { 5.0, 1.0 },
@@ -172,12 +172,12 @@ TEST_CASE("[cpu/operator] ReduceMean(forward)", "[ReduceMean][CPU]") {
                     }
                 }
             });
-            Tensor myOutput = Tensor(Array3D<float,3,1,2> {
+            Tensor myOutput = Tensor(Array3D<cpptype_t<DataType::Float32>,3,1,2> {
                 {
 
-                    {{ 12.5, 1.5 }},
-                    {{ 35.0, 1.5 }},
-                    {{ 57.5, 1.5 }}
+                    {{ 12.5f, 1.5f }},
+                    {{ 35.0f, 1.5f }},
+                    {{ 57.5f, 1.5f }}
                 }
             });
 
diff --git a/unit_tests/scheduler/Test_Scheduler.cpp b/unit_tests/scheduler/Test_Scheduler.cpp
index be87e8ac..eed4185d 100644
--- a/unit_tests/scheduler/Test_Scheduler.cpp
+++ b/unit_tests/scheduler/Test_Scheduler.cpp
@@ -482,7 +482,7 @@ TEST_CASE("[cpu/scheduler] Accumulate", "[scheduler]") {
                                  {{2.0, 3.0}, {4.0, 5.0}, {6.0, 7.0}}}});
 
     std::shared_ptr<Tensor> MemInit =
-        std::make_shared<Tensor>(Array2D<float, 3, 2>{
+        std::make_shared<Tensor>(Array2D<cpptype_t<DataType::Float32>, 3, 2>{
             {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}});
 
     auto meta = Accumulate(2, "accumulate");
@@ -517,14 +517,14 @@ TEST_CASE("[cpu/scheduler] Accumulate", "[scheduler]") {
     REQUIRE_NOTHROW(scheduler.forward(true));
 
     std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(
-        Array2D<float, 3, 2>{{{3.0, 5.0}, {7.0, 9.0}, {11.0, 13.0}}});
+        Array2D<cpptype_t<DataType::Float32>, 3, 2>{{{3.0, 5.0}, {7.0, 9.0}, {11.0, 13.0}}});
     std::shared_ptr<Tensor> output = std::static_pointer_cast<OperatorTensor>(pop_o->getOperator())->getOutput(0);
     REQUIRE(*output == *expectedOutput);
 }
 
 TEST_CASE("[cpu/scheduler] Branch", "[scheduler]") {
     std::shared_ptr<Tensor> in = std::make_shared<Tensor>(
-            Array2D<float, 2, 3>{{{1, 2, 3}, {4, 5, 6}}});
+            Array2D<cpptype_t<DataType::Float32>, 2, 3>{{{1, 2, 3}, {4, 5, 6}}});
 
     std::shared_ptr<GraphView> g = Sequential({
         Producer(in, "input"),
@@ -576,7 +576,7 @@ TEST_CASE("[cpu/scheduler] Branch", "[scheduler]") {
 #ifdef WITH_OPENSSL
 TEST_CASE("[cpu/scheduler] Select", "[scheduler]") {
     std::shared_ptr<Tensor> in = std::make_shared<Tensor>(
-            Array2D<float, 2, 3>{{{1, 2, 3}, {4, 5, 6}}});
+            Array2D<cpptype_t<DataType::Float32>, 2, 3>{{{1, 2, 3}, {4, 5, 6}}});
 
     std::shared_ptr<GraphView> g = Sequential({
         Producer(in, "input"),
@@ -605,21 +605,21 @@ TEST_CASE("[cpu/scheduler] Select", "[scheduler]") {
     scheduler.generateScheduling();
     scheduler.saveStaticSchedulingDiagram("select_scheduling");
     REQUIRE_NOTHROW(scheduler.forward(true));
-    
+
     g->save("select_forwarded");
 
     auto expectedOutputHash = std::make_shared<Tensor>(
-        Array1D<uint64_t, 4>{{0x1b7cf58dfe2dae24, 0x3bac903def4ce580, 0x5f5a347389d97f41, 0x2c2dc759abc6b61}});
+        Array1D<cpptype_t<DataType::UInt64>, 4>{{0x1b7cf58dfe2dae24, 0x3bac903def4ce580, 0x5f5a347389d97f41, 0x2c2dc759abc6b61}});
     auto outputHash = std::static_pointer_cast<OperatorTensor>(g->getNode("hash")->getOperator())->getOutput(0);
     REQUIRE(*outputHash == *expectedOutputHash);
 
     auto expectedOutputMod = std::make_shared<Tensor>(
-        Array1D<uint64_t, 4>{{2, 1, 1, 2}});
+        Array1D<cpptype_t<DataType::UInt64>, 4>{{2, 1, 1, 2}});
     auto outputMod = std::static_pointer_cast<OperatorTensor>(g->getNode("mod")->getOperator())->getOutput(0);
     REQUIRE(*outputMod == *expectedOutputMod);
 
     auto expectedOutput = std::make_shared<Tensor>(
-        Array2D<float, 2, 3>{{{std::sqrt(1), std::sqrt(2), std::sqrt(3)}, {std::sqrt(4), std::sqrt(5), std::sqrt(6)}}});
+        Array2D<cpptype_t<DataType::Float32>, 2, 3>{{{std::sqrt(1.0f), std::sqrt(2.0f), std::sqrt(3.0f)}, {std::sqrt(4.0f), std::sqrt(5.0f), std::sqrt(6.0f)}}});
     auto output = std::static_pointer_cast<OperatorTensor>(g->getNode("select")->getOperator())->getOutput(0);
     REQUIRE(*output == *expectedOutput);
 
-- 
GitLab


From cd824d73919d0204fba052c18dd4e3da5c23ee9f Mon Sep 17 00:00:00 2001
From: NAUD Maxence <maxence.naud@cea.fr>
Date: Mon, 31 Mar 2025 15:26:45 +0200
Subject: [PATCH 082/108] fix: move back 'castFromFloat' function into
 GlobalAvgPoolingImpl_kernels as 'static_cast' truncates and does not round as
 intended

---
 .../operator/GlobalAveragePoolingImpl_kernels.hpp  | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp
index 40dd3a69..cbe4f110 100644
--- a/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp
@@ -43,6 +43,18 @@ static stableMean(const T* vec, std::size_t size) {
     return mean;
 }
 
+template <typename T>
+typename std::enable_if_t<std::is_floating_point<T>::value, T>
+static castFromFloat(T value) {
+    return value;
+}
+
+template <typename T>
+typename std::enable_if_t<!std::is_floating_point<T>::value, T>
+static castFromFloat(double value) {
+    return static_cast<T>(std::nearbyint(value));
+}
+
 template <DataType DT_I, DataType DT_O = DT_I>
 void GlobalAveragePoolingImpl_cpu_forward_kernel(const std::shared_ptr<Tensor>& inputTensor, void *output_) {
 
@@ -61,7 +73,7 @@ void GlobalAveragePoolingImpl_cpu_forward_kernel(const std::shared_ptr<Tensor>&
     std::size_t output_idx = 0;
     for (DimSize_t batch = 0; batch < dims[0]; ++batch) {
         for (DimSize_t channel = 0; channel < dims[1]; ++channel) {
-            output[output_idx++] = static_cast<O>(stableMean<I>(input + input_idx, strides_channels));
+            output[output_idx++] = castFromFloat<O>(stableMean<I>(input + input_idx, strides_channels));
             input_idx += strides_channels;
         }
     }
-- 
GitLab
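
Note: the behavior difference named in the commit message is easy to check:
static_cast to an integer type truncates toward zero, while std::nearbyint
rounds to the nearest integer (ties to even in the default rounding mode). A
minimal standalone sketch:

    #include <cassert>
    #include <cmath>

    int main() {
        assert(static_cast<int>(2.7) == 2);                  // truncates
        assert(static_cast<int>(std::nearbyint(2.7)) == 3);  // rounds
        assert(static_cast<int>(-2.7) == -2);                // toward zero
        assert(static_cast<int>(std::nearbyint(-2.7)) == -3);
    }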


From 1ab29f55cd0ff892683e3387c5efbee8eff9caf5 Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Wed, 2 Apr 2025 09:05:05 +0200
Subject: [PATCH 083/108] Fix FC backward

---
 include/aidge/backend/cpu/operator/FCImpl_kernels.hpp | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
index c57f86e6..b77f749f 100644
--- a/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
@@ -136,15 +136,13 @@ void FCImpl_cpu_backward_kernel(const DimSize_t batchSize,
 
 
     // bias grad
-    if (biasesGrad == nullptr) { // no bias
-        std::fill(biasesGrad, biasesGrad + outputFeatureSize, B(0));
-    } else {
+    if (biasesGrad != nullptr) {
         for (std::size_t o = 0; o < outputFeatureSize; ++o) { // nb outputs
             B sum{0};
             for (std::size_t b = 0; b < batchSize; ++b) {
                 sum += input[b*outputFeatureSize + o];
             }
-            biasesGrad[o] = sum;
+            biasesGrad[o] += sum;
         }
     }
 
@@ -155,7 +153,7 @@ void FCImpl_cpu_backward_kernel(const DimSize_t batchSize,
             for (std::size_t b = 0; b < batchSize; ++b) {
                 sum += originalInput[b*inputFeatureSize + c]*input[b*outputFeatureSize + o];
             }
-            weightGrad[o*inputFeatureSize + c] = sum;
+            weightGrad[o*inputFeatureSize + c] += sum;
         }
     }
 
@@ -166,7 +164,7 @@ void FCImpl_cpu_backward_kernel(const DimSize_t batchSize,
             for (std::size_t o = 0; o < outputFeatureSize; ++o) {
                 sum += weight[o*inputFeatureSize + c] * input[b*outputFeatureSize + o];
             }
-            output[b*inputFeatureSize + c] = sum;
+            output[b*inputFeatureSize + c] += sum;
         }
     }
 }
-- 
GitLab
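
Note: changing '=' to '+=' makes the backward kernel accumulate into the
gradient buffers rather than overwrite them, the contract expected by chained
backward passes and gradient accumulation; the caller is then responsible for
zeroing gradients before a fresh pass (hence the removal of the std::fill,
which in the old code filled through a null biasesGrad anyway). A minimal
sketch of the accumulate contract, under those assumptions:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // Accumulate-style backward step: adds contributions instead of assigning.
    void accumulateGrad(std::vector<float>& grad, const std::vector<float>& contribution) {
        for (std::size_t i = 0; i < grad.size(); ++i)
            grad[i] += contribution[i]; // '+=', never '='
    }

    int main() {
        std::vector<float> grad(2, 0.0f); // zeroed by the caller before the pass
        accumulateGrad(grad, {1.0f, 2.0f});
        accumulateGrad(grad, {0.5f, 0.5f}); // e.g. a second micro-batch
        assert(grad[0] == 1.5f && grad[1] == 2.5f);
    }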


From bd42c132038d4135bd8dd04b0ed23601d57d8cd8 Mon Sep 17 00:00:00 2001
From: Mickael GUIBERT <mickael.guibert@cea.fr>
Date: Tue, 1 Apr 2025 14:08:26 +0000
Subject: [PATCH 084/108] [Feat] add: unit test for conv2d operator int32_t

---
 unit_tests/operator/Test_ConvImpl.cpp | 71 +++++++++++++++++++++++++++
 1 file changed, 71 insertions(+)

diff --git a/unit_tests/operator/Test_ConvImpl.cpp b/unit_tests/operator/Test_ConvImpl.cpp
index 59ec16dd..854789e3 100644
--- a/unit_tests/operator/Test_ConvImpl.cpp
+++ b/unit_tests/operator/Test_ConvImpl.cpp
@@ -21,6 +21,7 @@
 #include "aidge/graph/Node.hpp"
 #include "aidge/operator/Conv.hpp"
 #include "aidge/utils/TensorUtils.hpp"
+#include "aidge/operator/Pad.hpp"
 
 using namespace Aidge;
 
@@ -1646,6 +1647,76 @@ TEST_CASE("[cpu/operator] Conv(forward)", "[Conv][CPU]") {
             REQUIRE(approxEq<float>(*(conv_op.getOutput(0)),*expectedOutput, 1e-5f, 1e-6f));
         }
     }
+
+    SECTION("kernel size [7,7]") {
+        SECTION("stride [2,2], no dilation, with padding (3,3,3,3)") {
+            Conv_Op<2> conv_op = Conv_Op<2>({7,7}, {2,2});
+            std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array1D<int32_t,3*4*4> {
+               {
+               54, 46, 32, 24, 18, 13, 13, 17, 22, 8, 34, 37,
+               37, 36, 30, 31, 28, 32, 32, 29, 29, 24, 18, 16,
+               57, 63, 57, 42, 30, 20, 17, 30, 41, 52, 46, 38,
+               65, 52, 60, 60, 59, 61, 65, 70, 69, 69, 71, 67
+               }
+            });
+            myInput->resize(std::vector<std::size_t>({1,4,4,3}));
+            myInput->setDataFormat(DataFormat::NHWC);
+            myInput->setDataFormat(DataFormat::NCHW);
+            std::shared_ptr<Tensor> myBiases = std::make_shared<Tensor>(Array1D<int32_t,1> {
+                {18300}
+            });
+            std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array4D<int32_t,1,3,7,7> {
+                {{{{   0,   0,  -1,   0,   1,   0,  -1},
+                    {   0,   0,   0,   1,   1,   0,  -1},
+                    {   0,   0,   0,   1,   1,   1,   0},
+                    {   0,   1,   1,   0,   1,   1,   0},
+                    {   0,   1,   1,   1,   1,   1,   0},
+                    {   0,   1,   1,   1,   1,   0,  -1},
+                    {  -1,   0,   1,   2,   2,   0,  -1}},
+
+                   {{   0,   0,  -1,   0,   0,   0,  -1},
+                    {   0,   0,   0,   1,   1,   0,   0},
+                    {   0,   0,   1,   1,   1,   1,   0},
+                    {   0,   1,   1,   1,   1,   1,   1},
+                    {   0,   1,   1,   1,   1,   1,   0},
+                    {   0,   1,   1,   0,   1,   0,   0},
+                    {  -1,   0,   1,   1,   1,   0,  -1}},
+
+                   {{   0,  -1,  -1,   0,   1,   0,  -1},
+                    {   0,   1,   1,   2,   2,   1,   0},
+                    {   0,   1,   1,   2,   2,   1,   1},
+                    {   0,   1,   1,   1,   1,   1,   2},
+                    {  -1,   1,   1,   0,   1,   1,   1},
+                    {  -1,   1,   1,   0,   0,   0,   0},
+                    {  -1,   0,   1,   1,   1,   0,   0}}}}
+            });
+            std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array1D<int32_t,1> {
+                {
+                   19282 
+                }
+            });
+            Pad_Op<2> pad_op = Pad_Op<2>({3,3});
+            pad_op.setBackend("cpu");
+            pad_op.associateInput(0,myInput);
+            pad_op.setDataType(DataType::Int32);
+            pad_op.forwardDims();
+            pad_op.forward();
+
+            conv_op.associateInput(0, pad_op.getOutput(0));
+            conv_op.associateInput(1, myWeights);
+            conv_op.associateInput(2, myBiases);
+            conv_op.setBackend("cpu");
+            conv_op.setDataType(DataType::Int32);
+            conv_op.forwardDims();
+            conv_op.forward();
+            conv_op.getOutput(0)->resize(std::vector<std::size_t>({1}));
+            //conv_op.getOutput(0)->print();
+            //fmt::print("{:.^20}\n", "truth");
+            //(*expectedOutput).print();
+            REQUIRE(*(conv_op.getOutput(0)) == *expectedOutput);
+        }
+    }
+
 }
 
 template <DimSize_t DIM>
-- 
GitLab
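
Note: the usual output-size formula for a padded convolution is
out = floor((in + padBefore + padAfter - dilation*(kernel-1) - 1) / stride) + 1.
For the 4x4 input here with a 7x7 kernel and stride 2, padding of 3 on both
sides of a dimension gives 2 outputs per dimension, while padding of 3 on one
side only gives a single output (the test compares a single output element
after the resize). A quick standalone check of the arithmetic:

    #include <cassert>
    #include <cstddef>

    // Convolution output size along one spatial dimension (floor mode).
    std::size_t convOutSize(std::size_t in, std::size_t padBefore, std::size_t padAfter,
                            std::size_t kernel, std::size_t stride, std::size_t dilation = 1) {
        return (in + padBefore + padAfter - dilation * (kernel - 1) - 1) / stride + 1;
    }

    int main() {
        assert(convOutSize(4, 3, 3, 7, 2) == 2); // padded to 10: (10-7)/2 + 1 = 2
        assert(convOutSize(4, 3, 0, 7, 2) == 1); // padded to 7:  (7-7)/2 + 1 = 1
    }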


From f1813259f46a25a73f45fd0740a2e6ad43c6096a Mon Sep 17 00:00:00 2001
From: Adam Maroni <adamaroni@hotmail.fr>
Date: Sun, 23 Mar 2025 18:56:49 +0100
Subject: [PATCH 085/108] Refactoring of MaxPoolingImpl_kernels.hpp

---
 .../cpu/operator/MaxPoolingImpl_kernels.hpp   | 147 +++++++++++-------
 1 file changed, 87 insertions(+), 60 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
index 027fc02a..21eefb02 100644
--- a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
@@ -14,6 +14,7 @@
 
 #include <array>
 #include <cmath>
+#include <cstdint>
 #include <tuple>
 
 
@@ -34,75 +35,101 @@ namespace Aidge {
  * @param output_ Output Tensor.
  */
 template <class I, class O>
-void MaxPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideDims,
-                                        const std::array<DimSize_t, 2>& kernelDims,
-                                        const std::array<DimSize_t, 2>& dilations,
-                                        const bool ceilMode,
-                                        const std::array<DimSize_t, 4> &dims,
-                                        const void *input_,
-                                        void *output_) {
-    const I *input = static_cast<const I *>(input_);
-    O *output = static_cast<O *>(output_);
+void MaxPoolingImpl2D_cpu_forward_kernel(
+  const std::array<DimSize_t, 2>& strideDims,
+  const std::array<DimSize_t, 2>& kernelDims,
+  const std::array<DimSize_t, 2>& dilations,
+  const bool ceilMode,
+  const std::array<DimSize_t, 4> &dims,
+  const void *input_,
+  void *output_)
+{
+  const I *input = static_cast<const I *>(input_);
+  O *output = static_cast<O *>(output_);
 
-    // output H size
-    const std::size_t oxSize = 
-        ceilMode 
-        ? static_cast<std::size_t>(std::ceil(static_cast<float>(dims[2] - (kernelDims[0] - 1) * dilations[0] - 1 + strideDims[0]) /
-                                            static_cast<float>(strideDims[0])))
-        : static_cast<std::size_t>(std::floor(static_cast<float>(dims[2] - (kernelDims[0] - 1) * dilations[0] - 1 + strideDims[0]) /
-                                            static_cast<float>(strideDims[0])));
-    // output W size
-    const std::size_t oySize = 
-        ceilMode 
-        ? static_cast<std::size_t>(std::ceil(static_cast<float>(dims[3] - (kernelDims[1] - 1) * dilations[1] - 1 + strideDims[1]) /
-                                            static_cast<float>(strideDims[1])))
-        : static_cast<std::size_t>(std::floor(static_cast<float>(dims[3] - (kernelDims[1] - 1) * dilations[1] - 1 + strideDims[1]) /
-                                            static_cast<float>(strideDims[1])));
+  // output H size
+  auto hOut = static_cast<float>(
+    dims[2] - (kernelDims[0] - 1) * dilations[0] - 1 + strideDims[0]
+  ) / static_cast<float>(strideDims[0]);
+  const std::size_t outXSize = ceilMode
+    ? static_cast<std::size_t>(std::ceil(hOut))
+    : static_cast<std::size_t>(std::floor(hOut));
 
-    using signedsize = std::make_signed<std::size_t>::type;
-    for (std::size_t batch = 0; batch < dims[0]; ++batch) {
-        for (std::size_t ch = 0; ch < dims[1]; ++ch) {
-            const std::size_t oIndex = (ch + batch*dims[1]) * oxSize * oySize;
-            const std::size_t iIndex = (ch + batch*dims[1]) * dims[2] * dims[3];
-            for (std::size_t ox = 0; ox < oxSize; ++ox) {
-                const signedsize difx = static_cast<signedsize>(- ox * strideDims[0]);
-                const std::size_t sxMin = static_cast<std::size_t>(std::max(difx, signedsize(0)));
-                const std::size_t sxMax = (static_cast<signedsize>(dims[2]) + difx) < 0 ? 0 : ((dims[2] + difx) > kernelDims[0] ? kernelDims[0] : dims[2] + difx);
-                for (std::size_t oy = 0; oy < oySize; ++oy) {
-                    const signedsize dify = static_cast<signedsize>(- oy * strideDims[1]);
-                    const std::size_t syMin = static_cast<std::size_t>(std::max(dify, signedsize(0)));
-                    const std::size_t syMax = (static_cast<signedsize>(dims[3]) + dify) < 0 ? 0 : ((dims[3] + dify) > kernelDims[1] ? kernelDims[1] : dims[3] + dify);
-                    const std::size_t oIndexFull = oIndex + ox*oySize + oy;
-                    const std::size_t ix = ox * strideDims[0];
-                    const std::size_t iy = oy * strideDims[1];
+  // output W size
+  auto wOut = static_cast<float>( 
+      dims[3] - ( kernelDims[1] - 1) * dilations[1] - 1 + strideDims[1]
+    ) / static_cast<float>(strideDims[1]);
 
-                    I poolValue(0.0);
-                    bool valid = false;
+  const std::size_t outYSize = ceilMode
+    ? static_cast<std::size_t>(std::ceil(wOut))
+    : static_cast<std::size_t>(std::floor(wOut));
 
-                    for (unsigned int sy = syMin; sy < syMax; ++sy) {
-                        for (unsigned int sx = sxMin; sx < sxMax; ++sx) {
-                            // Apply dilation factor to kernel indices
-                            const std::size_t dilated_sx = sx * dilations[0];
-                            const std::size_t dilated_sy = sy * dilations[1];
+  using signedsize = std::make_signed<std::size_t>::type;
 
-                            // Ensure indices are within bounds
-                            if ((ix + dilated_sx) < dims[2] && (iy + dilated_sy) < dims[3]) {
-                                const I value = input[iIndex + (ix + dilated_sx) * dims[3] + (iy + dilated_sy)];
-
-                                if (!valid || value > poolValue) {
-                                    poolValue = value;
-                                    valid = true;
-                                }
-                            }
-                        }
-                    }
-                    output[oIndexFull] = poolValue;
+  for (std::size_t batch = 0; batch < dims[0]; ++batch){
+    for (std::size_t channel = 0; channel < dims[1]; ++channel){
+      auto batchChannelIndex = (channel + batch * dims[1]);
+      const std::size_t outputBaseIndex = batchChannelIndex * outXSize * outYSize;
+      const std::size_t inputBaseIndex = batchChannelIndex * dims[2] * dims[3];
+      for (std::size_t outX = 0; outX < outXSize; ++outX) {
+        const signedsize negStrideX = static_cast<signedsize>(
+          -outX * strideDims[0]
+        );
+        const std::size_t kernelXMin = static_cast<std::size_t>(
+          std::max(negStrideX, signedsize(0))
+        );
+        /* Compute kernelXMax */
+        std::size_t kernelXMax = dims[2] + negStrideX;
+        if ((static_cast<signedsize>(dims[2]) + negStrideX) < 0){
+          kernelXMax = 0;
+        }
+        else if (kernelXMax > kernelDims[0]){
+          kernelXMax = kernelDims[0];
+        }
+        for (std::size_t outY = 0; outY < outYSize; ++outY) {
+          const signedsize negStrideY = static_cast<signedsize>(-outY * strideDims[1]);
+          const std::size_t kernelYMin = static_cast<std::size_t>(
+            std::max(negStrideY, signedsize(0))
+          );
+          /* Compute kernelYMax */
+          std::size_t kernelYMax = dims[3] + negStrideY;
+          const std::size_t outputIndex = outputBaseIndex + outX * outYSize + outY;
+          const std::size_t strideXoffset = outX * strideDims[0];
+          const std::size_t strideYoffset = outY * strideDims[1];
+          I poolValue(0.0);
+          bool valid = false;
+          if (static_cast<signedsize>(dims[3]) + negStrideY < 0){
+            kernelYMax = 0;
+          }
+          else if(kernelYMax > kernelDims[1]){
+            kernelYMax = kernelDims[1];
+          }
+          for (unsigned int kY = kernelYMin; kY < kernelYMax ; ++kY){
+            for (unsigned int kX = kernelXMin; kX < kernelXMax; ++kX){
+              // Apply dilation factor to kernel indices
+              const std::size_t dilatedkernelX = kX * dilations[0];
+              const std::size_t dilatedkernelY = kY * dilations[1];
+              // Ensure indices are within bounds
+              auto inputXPostDilation = strideXoffset + dilatedkernelX;
+              auto inputYPostDilation = strideYoffset + dilatedkernelY;
+              if (inputXPostDilation < dims[2] && inputYPostDilation < dims[3]){
+                const I inputValue = input[
+		    inputBaseIndex + inputXPostDilation * dims[3] 
+		    + inputYPostDilation
+                ];
+                if (!valid || inputValue > poolValue) {
+                  poolValue = inputValue;
+                  valid = true;
                 }
+              }
             }
+          }
+          output[outputIndex] = poolValue;
         }
+      }
     }
-}
-
+  }
+} 
 
 // Kernels registration to implementation entry point
 REGISTRAR(MaxPoolingImpl2D_cpu,
-- 
GitLab
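
Note: hOut/wOut above implement out = (in - dilation*(kernel-1) - 1 + stride) / stride,
with ceilMode selecting between ceil and floor. For a 5-wide input, 2-wide
kernel, stride 2 and no dilation this is (5 - 1 - 1 + 2) / 2 = 2.5, i.e. 3
outputs with ceil mode and 2 without, which is why the ceil-mode gradients in
the next patch's tests reach the last row and column. A quick standalone
check:

    #include <cassert>
    #include <cmath>
    #include <cstddef>

    std::size_t poolOutSize(std::size_t in, std::size_t kernel, std::size_t stride,
                            std::size_t dilation, bool ceilMode) {
        const float out = static_cast<float>(in - (kernel - 1) * dilation - 1 + stride) /
                          static_cast<float>(stride);
        return static_cast<std::size_t>(ceilMode ? std::ceil(out) : std::floor(out));
    }

    int main() {
        assert(poolOutSize(5, 2, 2, 1, true) == 3);  // ceil(2.5)
        assert(poolOutSize(5, 2, 2, 1, false) == 2); // floor(2.5)
    }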


From 993b769e64cd2dd43f012ee336414a727d06e104 Mon Sep 17 00:00:00 2001
From: Adam Maroni <adamaroni@hotmail.fr>
Date: Wed, 26 Mar 2025 12:53:20 +0100
Subject: [PATCH 086/108] [Issue #250]: Add backward CPU implementation and
 unit tests for MaxPooling2D

---
 .../backend/cpu/operator/MaxPoolingImpl.hpp   |   7 +
 .../cpu/operator/MaxPoolingImpl_kernels.hpp   | 135 ++++++++++++-
 src/operator/MaxPoolingImpl.cpp               |  19 +-
 unit_tests/operator/Test_MaxPoolingImpl.cpp   | 185 +++++++++++++++++-
 4 files changed, 337 insertions(+), 9 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/MaxPoolingImpl.hpp b/include/aidge/backend/cpu/operator/MaxPoolingImpl.hpp
index 062088a1..804fb33a 100644
--- a/include/aidge/backend/cpu/operator/MaxPoolingImpl.hpp
+++ b/include/aidge/backend/cpu/operator/MaxPoolingImpl.hpp
@@ -27,6 +27,13 @@ namespace Aidge {
 // Operator implementation entry point for the backend
 using MaxPooling2D_Op = MaxPooling_Op<2>;
 using MaxPoolingImpl2D_cpu = OperatorImpl_cpu<MaxPooling_Op<2>,
+    void(const std::array<DimSize_t, 2>&,
+                            const std::array<DimSize_t, 2>&,
+                            const std::array<DimSize_t, 2>&,
+                            const bool,
+                            const std::array<DimSize_t, 4> &,
+                            const void *,
+                            void *),
     void(const std::array<DimSize_t, 2>&,
                             const std::array<DimSize_t, 2>&,
                             const std::array<DimSize_t, 2>&,
diff --git a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
index 21eefb02..9a52c149 100644
--- a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
@@ -114,8 +114,8 @@ void MaxPoolingImpl2D_cpu_forward_kernel(
               auto inputYPostDilation = strideYoffset + dilatedkernelY;
               if (inputXPostDilation < dims[2] && inputYPostDilation < dims[3]){
                 const I inputValue = input[
-		    inputBaseIndex + inputXPostDilation * dims[3] 
-		    + inputYPostDilation
+                inputBaseIndex + inputXPostDilation * dims[3] 
+                + inputYPostDilation
                 ];
                 if (!valid || inputValue > poolValue) {
                   poolValue = inputValue;
@@ -131,16 +131,141 @@ void MaxPoolingImpl2D_cpu_forward_kernel(
   }
 } 
 
+
+template <class I, class O>
+void MaxPoolingImpl2D_cpu_backward_kernel(
+  const std::array<DimSize_t, 2>& strideDims,
+  const std::array<DimSize_t, 2>& kernelDims,
+  const std::array<DimSize_t, 2>& dilations,
+  const bool ceilMode,
+  const std::array<DimSize_t, 4> &dims,
+  const void *input_,
+  void *grad_
+)
+{
+  const I *input = static_cast<const I *>(input_);
+  I *grad = static_cast<I *>(grad_);
+
+  // Fill the gradient with 0 to avoid garbage data
+  std::fill(grad,
+            grad + (dims[0] * dims[1] * dims[2] * dims[3]),
+            static_cast<I>(0)
+  );
+
+  // output H size
+  auto hOut = static_cast<float>(
+    dims[2] - (kernelDims[0] - 1) * dilations[0] - 1 + strideDims[0]
+  ) / static_cast<float>(strideDims[0]);
+  const std::size_t outXSize = ceilMode
+    ? static_cast<std::size_t>(std::ceil(hOut))
+    : static_cast<std::size_t>(std::floor(hOut));
+
+  // output W size
+  auto wOut = static_cast<float>( 
+      dims[3] - ( kernelDims[1] - 1) * dilations[1] - 1 + strideDims[1]
+    ) / static_cast<float>(strideDims[1]);
+
+  const std::size_t outYSize = ceilMode
+    ? static_cast<std::size_t>(std::ceil(wOut))
+    : static_cast<std::size_t>(std::floor(wOut));
+
+  using signedsize = std::make_signed<std::size_t>::type;
+
+  for (std::size_t batch = 0; batch < dims[0]; ++batch){
+    for (std::size_t channel = 0; channel < dims[1]; ++channel){
+      auto batchChannelIndex = (channel + batch * dims[1]);
+      const std::size_t inputBaseIndex = batchChannelIndex * dims[2] * dims[3];
+      for (std::size_t outX = 0; outX < outXSize; ++outX) {
+        const signedsize negStrideX = static_cast<signedsize>(
+          -outX * strideDims[0]
+        );
+        const std::size_t kernelXMin = static_cast<std::size_t>(
+          std::max(negStrideX, signedsize(0))
+        );
+        /* Compute kernelXMax */
+        std::size_t kernelXMax = dims[2] + negStrideX;
+        if ((static_cast<signedsize>(dims[2]) + negStrideX) < 0){
+          kernelXMax = 0;
+        }
+        else if (kernelXMax > kernelDims[0]){
+          kernelXMax = kernelDims[0];
+        }
+        for (std::size_t outY = 0; outY < outYSize; ++outY) {
+          const signedsize negStrideY = static_cast<signedsize>(-outY * strideDims[1]);
+          const std::size_t kernelYMin = static_cast<std::size_t>(
+            std::max(negStrideY, signedsize(0))
+          );
+          /* Compute kernelYMax */
+          std::size_t kernelYMax = dims[3] + negStrideY;
+          const std::size_t strideXoffset = outX * strideDims[0];
+          const std::size_t strideYoffset = outY * strideDims[1];
+          I poolValue(0.0);
+          bool valid = false;
+          if (static_cast<signedsize>(dims[3]) + negStrideY < 0){
+            kernelYMax = 0;
+          }
+          else if(kernelYMax > kernelDims[1]){
+            kernelYMax = kernelDims[1];
+          }
+          std::size_t saveIndex = 0;
+          for (unsigned int kY = kernelYMin; kY < kernelYMax ; ++kY){
+            for (unsigned int kX = kernelXMin; kX < kernelXMax; ++kX){
+              // Apply dilation factor to kernel indices
+              const std::size_t dilatedkernelX = kX * dilations[0];
+              const std::size_t dilatedkernelY = kY * dilations[1];
+              // Ensure indices are within bounds
+              auto inputXPostDilation = strideXoffset + dilatedkernelX;
+              auto inputYPostDilation = strideYoffset + dilatedkernelY;
+              if (inputXPostDilation < dims[2] && inputYPostDilation < dims[3]){
+                std::size_t inputIndex =
+                    inputBaseIndex + inputXPostDilation * dims[3]
+                    + inputYPostDilation;
+                const I inputValue = input[inputIndex];
+                if (!valid || inputValue > poolValue) {
+                  poolValue = inputValue;
+                  saveIndex = inputIndex;
+                  valid = true;
+                }
+              }
+            }
+          }
+          if (valid) {
+            grad[saveIndex]++;
+          }
+        }
+      }
+    }
+  }
+}
+
+
+
+
 // Kernels registration to implementation entry point
 REGISTRAR(MaxPoolingImpl2D_cpu,
     {DataType::Float32},
-    {ProdConso::inPlaceModel, Aidge::MaxPoolingImpl2D_cpu_forward_kernel<float, float>, nullptr});
+    {
+          ProdConso::inPlaceModel,
+          Aidge::MaxPoolingImpl2D_cpu_forward_kernel<float, float>,
+          Aidge::MaxPoolingImpl2D_cpu_backward_kernel<float, float>,
+    }
+);
 REGISTRAR(MaxPoolingImpl2D_cpu,
     {DataType::Float64},
-    {ProdConso::inPlaceModel, Aidge::MaxPoolingImpl2D_cpu_forward_kernel<double, double>, nullptr});
+    {
+          ProdConso::inPlaceModel,
+          Aidge::MaxPoolingImpl2D_cpu_forward_kernel<double, double>,
+          Aidge::MaxPoolingImpl2D_cpu_backward_kernel<double, double>,
+    }
+);
 REGISTRAR(MaxPoolingImpl2D_cpu,
     {DataType::Int32},
-    {ProdConso::inPlaceModel, Aidge::MaxPoolingImpl2D_cpu_forward_kernel<int32_t, int32_t>, nullptr});
+    {
+          ProdConso::inPlaceModel,
+          Aidge::MaxPoolingImpl2D_cpu_forward_kernel<int32_t, int32_t>,
+          Aidge::MaxPoolingImpl2D_cpu_backward_kernel<int32_t, int32_t>,
+    }
+);
 }  // namespace Aidge
 
 #endif /* AIDGE_CPU_OPERATOR_MaxPOOLINGIMPL_KERNELS_H_ */
diff --git a/src/operator/MaxPoolingImpl.cpp b/src/operator/MaxPoolingImpl.cpp
index 13ef75b0..42be049d 100644
--- a/src/operator/MaxPoolingImpl.cpp
+++ b/src/operator/MaxPoolingImpl.cpp
@@ -25,7 +25,8 @@ void Aidge::MaxPoolingImpl2D_cpu::forward() {
     AIDGE_ASSERT(op_.getInput(0), "missing input #0 in MaxPooling Operator.");
 
     // Find the correct kernel type
-    const auto impl = Registrar<MaxPoolingImpl2D_cpu>::create(getBestMatch(getRequiredSpec()));
+    const auto impl =
+        Registrar<MaxPoolingImpl2D_cpu>::create(getBestMatch(getRequiredSpec()));
 
     // Call kernel
     impl.forward(op_.strideDims(),
@@ -39,5 +40,19 @@ void Aidge::MaxPoolingImpl2D_cpu::forward() {
 
 template <>
 void Aidge::MaxPoolingImpl2D_cpu::backward() {
-    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for MaxPooling_Op<2> on backend cpu");
+    const auto& op_ = dynamic_cast<const MaxPooling_Op<2>&>(mOp);
+    AIDGE_ASSERT(op_.getInput(0), "missing input #0 in MaxPooling Operator.");
+
+    // Find the correct kernel type
+    const auto impl =
+        Registrar<MaxPoolingImpl2D_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Call kernel
+    impl.backward(op_.strideDims(),
+                  op_.kernelDims(),
+                  op_.dilations(),
+                  op_.ceilMode(),
+                  op_.getInput(0)->template dims<4>(),
+                  getCPUPtr(mOp.getRawInput(0)),
+                  op_.getInput(0)->grad()->getImpl()->rawPtr());
 }
diff --git a/unit_tests/operator/Test_MaxPoolingImpl.cpp b/unit_tests/operator/Test_MaxPoolingImpl.cpp
index d480fc30..2bc5e1ee 100644
--- a/unit_tests/operator/Test_MaxPoolingImpl.cpp
+++ b/unit_tests/operator/Test_MaxPoolingImpl.cpp
@@ -55,7 +55,11 @@ TEST_CASE("[cpu/operator] MaxPooling(forward)", "[MaxPooling][CPU]") {
         }
     });
     SECTION("Stride") {
-        std::shared_ptr<MaxPooling_Op<2>> op = std::make_shared<MaxPooling_Op<2>>(std::array<std::size_t, 2>({2,2}), std::array<std::size_t, 2>({2,2}));
+        std::shared_ptr<MaxPooling_Op<2>> op =
+		std::make_shared<MaxPooling_Op<2>>(
+			std::array<std::size_t, 2>({2, 2}),
+			std::array<std::size_t, 2>({2, 2})
+		);
 
         Tensor myOutput = Array4D<float,2,2,2,2> {
             {
@@ -172,4 +176,181 @@ TEST_CASE("[cpu/operator] MaxPooling(forward)", "[MaxPooling][CPU]") {
         op2->getOutput(0)->print();
         REQUIRE(*(op2->getOutput(0)) == *myOutput5);
     }
-}
\ No newline at end of file
+}
+
+
+
+TEST_CASE("[cpu/operator] MaxPooling(backward)", "[MaxPooling][CPU]") {
+    std::shared_ptr<Tensor> myInput = 
+	std::make_shared<Tensor>(Array4D<float,2,2,5,5> { //NCHW
+		{
+		    {
+			{{-0.3848,  0.2166, -0.4373,  0.6142,  0.5277},
+			 {0.7995,  0.3638, -1.4589, -1.0843,  1.0918},
+			 {0.7147,  0.0936, -1.2902,  1.2037,  0.4874},
+			 {-0.5981,  2.1184, -0.9175,  1.3859,  0.3305},
+			 {-1.7700,  0.0563, -0.3914,  0.0538, -0.3955}},
+
+			{{-3.1409, -0.4554,  0.0524,  2.2291,  0.4859},
+			 {-0.7465, -0.6567, -2.3703, -0.6386, -1.4152},
+			 { 2.2329, -0.5850,  0.0700,  1.2838, -1.7363},
+			 { 0.2139,  0.0624, -1.0689, -0.8221, -0.8038},
+			 { 0.1886, -0.7840, -0.2313,  0.2651, -1.6244}}
+		    },
+		    {
+			{{ 0.4371,  1.6417,  0.9129,  0.6325,  0.5438},
+			 {-2.3552, -0.8850, -0.0232, -0.5462, -1.2011},
+			 {1.7653, -1.6668, -1.0814,  0.6182,  1.2071},
+			 {0.9541, -0.5133,  0.8664, -0.8892,  1.4585},
+			 {1.0220, -0.5107,  0.1829, -0.2301, -0.4268}},
+
+			{{ 1.0429,  0.6279, -0.2875,  0.7187, -0.1500},
+			 {1.6041,  2.9635,  1.4172, -0.7517,  0.5441},
+			 {-0.2276,  0.0857,  0.6776, -0.1389, -0.0614},
+			 {-0.1547, -0.3435,  0.0650, -0.5095, -1.8073},
+			 {1.7217,  0.3999, -0.5953,  1.0604, -0.4126}}
+		    }
+		}
+	});
+    SECTION("Stride") {
+        std::shared_ptr<MaxPooling_Op<2>> op =
+		std::make_shared<MaxPooling_Op<2>>(
+			std::array<std::size_t, 2>({2,2}),
+			std::array<std::size_t, 2>({2,2})
+		);
+
+		Tensor grad = Array4D<float,2,2,5,5> {
+			{
+				{
+					{{0, 0, 0, 1, 0},
+					{1, 0, 0, 0, 0},
+					{0, 0, 0, 0, 0},
+					{0, 1, 0, 1, 0},
+					{0, 0, 0, 0, 0}},
+
+					{{0, 1, 0, 1, 0},
+					{0, 0, 0, 0, 0},
+					{1, 0, 0, 1, 0},
+					{0, 0, 0, 0, 0},
+					{0, 0, 0, 0, 0}}
+				},
+				{
+					{{0, 1, 1, 0, 0},
+					{0, 0, 0, 0, 0},
+					{1, 0, 0, 0, 0},
+					{0, 0, 1, 0, 0},
+					{0, 0, 0, 0, 0}},
+
+					{{0, 0, 0, 0, 0},
+					{0, 1, 1, 0, 0},
+					{0, 1, 1, 0, 0},
+					{0, 0, 0, 0, 0},
+					{0, 0, 0, 0, 0}}
+				}
+			}
+		};
+        op->associateInput(0,myInput);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cpu");
+        op->backward();
+        //op->getInput(0)->grad()->print();
+        REQUIRE(*(op->getInput(0)->grad()) == grad);
+    }
+    SECTION("Dilation"){
+        std::shared_ptr<Node> myMaxPool = MaxPooling({2,2}, "mycdw", {2,2}, {2,2}); // Dilation 2x2
+        auto op = std::static_pointer_cast<OperatorTensor>(myMaxPool -> getOperator());
+
+	Tensor grad = Array4D<float,2,2,5,5> {
+		{{{{0., 0., 0., 0., 1.},
+		  {0., 0., 0., 0., 0.},
+		  {2., 0., 0., 0., 1.},
+		  {0., 0., 0., 0., 0.},
+		  {0., 0., 0., 0., 0.}},
+
+		 {{0., 0., 0., 0., 1.},
+		  {0., 0., 0., 0., 0.},
+		  {2., 0., 1., 0., 0.},
+		  {0., 0., 0., 0., 0.},
+		  {0., 0., 0., 0., 0.}}},
+
+
+		{{{0., 0., 0., 0., 0.},
+		  {0., 0., 0., 0., 0.},
+		  {2., 0., 0., 0., 2.},
+		  {0., 0., 0., 0., 0.},
+		  {0., 0., 0., 0., 0.}},
+
+		 {{1., 0., 0., 0., 0.},
+		  {0., 0., 0., 0., 0.},
+		  {0., 0., 2., 0., 0.},
+		  {0., 0., 0., 0., 0.},
+		  {1., 0., 0., 0., 0.}}}}
+	};
+        myMaxPool->getOperator()->associateInput(0,myInput);
+        myMaxPool->getOperator()->setDataType(DataType::Float32);
+        myMaxPool->getOperator()->setBackend("cpu");
+        op->backward();
+        //op->getInput(0)->grad()->print();
+        REQUIRE(*(op->getInput(0)->grad()) == grad);
+    }
+    SECTION("Ceil mode"){
+		std::shared_ptr<Tensor> myInput4 =
+			std::make_shared<Tensor>(Array4D<float,1,1,5,5> { // NCHW
+				{{{
+					{ 1,  2,  3,  4,  5},
+					{ 6,  7,  8,  9, 10},
+					{11, 12, 13, 14, 15},
+					{16, 17, 18, 19, 20},
+					{21, 22, 23, 24, 25}
+				}}}
+			});
+
+		// MaxPool with ceil_mode = true
+		std::shared_ptr<Node> myMaxPool1 =
+			MaxPooling({2,2}, "mycdw", {2,2}, {1,1}, true);
+		auto op1 = std::static_pointer_cast<OperatorTensor>(
+			myMaxPool1 -> getOperator()
+		);
+		Tensor grad = Array4D<float,1,1,5,5> {
+			{{{
+				{0, 0, 0, 0, 0}, 
+				{0, 1, 0, 1, 1}, 
+				{0, 0, 0, 0, 0}, 
+				{0, 1, 0, 1, 1}, 
+				{0, 1, 0, 1, 1}
+			}}}
+		};
+
+		op1->associateInput(0, myInput4);
+		op1->setDataType(DataType::Float32);
+		op1->setBackend("cpu");
+		op1->backward();	
+		//op1->getInput(0)->grad()->print();
+		REQUIRE(*(op1->getInput(0)->grad()) == grad);
+
+		// MaxPool with ceil_mode = false
+		std::shared_ptr<Node> myMaxPool2 =
+			MaxPooling({2,2}, "mycdw", {2,2}, {1,1}, false);
+		auto op2 = std::static_pointer_cast<OperatorTensor>(
+				myMaxPool2 -> getOperator()
+			);
+
+		Tensor grad2 = Array4D<float,1,1,5,5> {
+			{{{
+				{0, 0, 0, 0, 0}, 
+				{0, 1, 0, 1, 0}, 
+				{0, 0, 0, 0, 0}, 
+				{0, 1, 0, 1, 0}, 
+				{0, 0, 0, 0, 0}
+			}}}
+		};
+
+		//op2->resetInput(0);
+		op2->associateInput(0, myInput4);
+		op2->setDataType(DataType::Float32);
+		op2->setBackend("cpu");
+		op2->backward();
+		//op2->getInput(0)->grad()->print();
+		REQUIRE(*(op2->getInput(0)->grad()) == grad2);
+	}
+}
-- 
GitLab


From 05f02dd391d597dff64a10caac830d25e727a3a4 Mon Sep 17 00:00:00 2001
From: NAUD Maxence <maxence.naud@cea.fr>
Date: Thu, 3 Apr 2025 12:08:29 +0000
Subject: [PATCH 087/108] add: functions to measure inference time and compute
 the outputs of a GraphView

---
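Note: a minimal usage sketch of the two helpers added below, assuming a
trivial one-operator GraphView. The aidge_core.sequential / aidge_core.ReLU
factory calls and the "input" name are illustrative assumptions, not part of
this patch:

    import numpy as np

    import aidge_core
    import aidge_backend_cpu  # registers the "cpu" backend the helpers select
    from aidge_backend_cpu import benchmark

    # Hypothetical model; any GraphView with known input dims works the same way.
    model = aidge_core.sequential([aidge_core.ReLU("relu")])
    x = np.random.rand(1, 3, 8, 8).astype(np.float32)

    timings = benchmark.measure_inference_time(model, [("input", x)],
                                               nb_warmup=5, nb_iterations=20)
    print(f"median inference time: {1e3 * sorted(timings)[len(timings) // 2]:.3f} ms")

    outputs = benchmark.compute_output(model, [("input", x)])  # one ndarray per output

Note also that the timings come from time.process_time(), i.e. CPU time
rather than wall-clock time.
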
 aidge_backend_cpu/benchmark.py | 40 ++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 aidge_backend_cpu/benchmark.py

diff --git a/aidge_backend_cpu/benchmark.py b/aidge_backend_cpu/benchmark.py
new file mode 100644
index 00000000..81dfc466
--- /dev/null
+++ b/aidge_backend_cpu/benchmark.py
@@ -0,0 +1,40 @@
+import time
+
+import numpy as np
+
+import aidge_core
+
+def prepare_model_scheduler_inputs(model: aidge_core.GraphView, input_data: list[tuple[str, np.ndarray]]) -> tuple[aidge_core.GraphView, aidge_core.SequentialScheduler, list[aidge_core.Tensor]]:
+    # update model and inputs backend
+    model.set_backend("cpu")
+    ordered_inputs = [aidge_core.Tensor(i[1]) for i in input_data]
+    for ordered_input in ordered_inputs:
+        ordered_input.set_backend("cpu")
+
+    scheduler = aidge_core.SequentialScheduler(model)
+    scheduler.generate_scheduling()
+
+    return model, scheduler, ordered_inputs
+
+
+def measure_inference_time(model: aidge_core.GraphView, input_data: list[tuple[str, np.ndarray]], nb_warmup: int = 10, nb_iterations: int = 50) -> list[float]:
+    model, scheduler, ordered_inputs = prepare_model_scheduler_inputs(model, input_data)
+
+    timings = []
+    # Warm-up runs first, then timed iterations.
+    for i in range(nb_warmup + nb_iterations):
+        if i < nb_warmup:
+            scheduler.forward(forward_dims=False, data=ordered_inputs)
+        else:
+            start = time.process_time()
+            scheduler.forward(forward_dims=False, data=ordered_inputs)
+            end = time.process_time()
+            timings.append(end - start)
+    return timings
+
+def compute_output(model: aidge_core.GraphView, input_data: list[tuple[str, np.ndarray]]) -> list[np.ndarray]:
+    model, scheduler, ordered_inputs = prepare_model_scheduler_inputs(model, input_data)
+
+    scheduler.forward(forward_dims=False, data=ordered_inputs)
+
+    return [np.array(t[0].get_operator().get_output(t[1])) for t in model.get_ordered_outputs()]
\ No newline at end of file
-- 
GitLab


From 5cf9facba0ade7f46bcaeb4f02983269109b2b27 Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Tue, 8 Apr 2025 13:08:59 +0200
Subject: [PATCH 088/108] Fixed backward for Sqrt and LeakyReLU

---
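Note: for reference, the formulas the corrected kernels implement, with x the
forward input, y the forward output and \delta the incoming output gradient:

    % LeakyReLU, slope \alpha: y_i = x_i if x_i > 0, else \alpha x_i
    \frac{\partial L}{\partial x_i} =
        \begin{cases} \delta_i & x_i > 0 \\ \alpha\,\delta_i & x_i \le 0 \end{cases}

    % Sqrt: y_i = \sqrt{x_i}, hence dy_i/dx_i = 1/(2\sqrt{x_i}) = 1/(2 y_i)
    \frac{\partial L}{\partial x_i} = \frac{0.5}{y_i}\,\delta_i

This is also why the Sqrt backward entry point now takes the forward output
as an extra argument: 1/(2y) reuses the already-computed square root instead
of recomputing it from the input.
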
 include/aidge/backend/cpu/operator/LeakyReLUImpl.hpp |  1 +
 .../backend/cpu/operator/LeakyReLUImpl_kernels.hpp   | 10 ++++++----
 include/aidge/backend/cpu/operator/SqrtImpl.hpp      |  2 +-
 .../aidge/backend/cpu/operator/SqrtImpl_kernels.hpp  | 12 +++++++-----
 src/operator/LeakyReLUImpl.cpp                       | 10 ++++++----
 src/operator/SqrtImpl.cpp                            |  2 ++
 6 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/LeakyReLUImpl.hpp b/include/aidge/backend/cpu/operator/LeakyReLUImpl.hpp
index 1e8c1a14..d4037901 100644
--- a/include/aidge/backend/cpu/operator/LeakyReLUImpl.hpp
+++ b/include/aidge/backend/cpu/operator/LeakyReLUImpl.hpp
@@ -32,6 +32,7 @@ using LeakyReLUImpl_cpu = OperatorImpl_cpu<LeakyReLU_Op,
     void(const float,
         std::size_t,
         const void*,
+        const void*,
         void*)>;
 
 // Implementation entry point registration to Operator
diff --git a/include/aidge/backend/cpu/operator/LeakyReLUImpl_kernels.hpp b/include/aidge/backend/cpu/operator/LeakyReLUImpl_kernels.hpp
index 7afd8298..1b4c3053 100644
--- a/include/aidge/backend/cpu/operator/LeakyReLUImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/LeakyReLUImpl_kernels.hpp
@@ -36,14 +36,16 @@ template <class I, class O>
 void LeakyReLUImpl_cpu_backward_kernel(const float negativeSlope_,
                                      std::size_t inputLength,
                                      const void* input_,
-                                     void* output_) {
+                                     const void* grad_output_,
+                                     void* grad_input_) {
 
-    const I* input = static_cast<const I*>(input_);
-    O* output = static_cast<O*>(output_);
+    const O* input = static_cast<const O*>(input_);
+    const I* grad_output = static_cast<const I*>(grad_output_);
+    O* grad_input = static_cast<O*>(grad_input_);
     const I negativeSlope = static_cast<const I>(negativeSlope_);
 
     for (std::size_t i = 0; i < inputLength; ++i) {
-        output[i] = (input[i] > 0) ? input[i] : negativeSlope*input[i];
+        grad_input[i] = (input[i] > 0) ? grad_output[i] : negativeSlope*grad_output[i];
     }
 }
 
diff --git a/include/aidge/backend/cpu/operator/SqrtImpl.hpp b/include/aidge/backend/cpu/operator/SqrtImpl.hpp
index dba75d1c..2f24277f 100644
--- a/include/aidge/backend/cpu/operator/SqrtImpl.hpp
+++ b/include/aidge/backend/cpu/operator/SqrtImpl.hpp
@@ -26,7 +26,7 @@ namespace Aidge {
 // Operator implementation entry point for the backend
 using SqrtImpl_cpu = OperatorImpl_cpu<Sqrt_Op,
     void(const std::size_t, const void*, void*),
-    void(const std::size_t, const void*, void*)>;
+    void(const std::size_t, const void*, const void*, void*)>;
 
 // Implementation entry point registration to Operator
 REGISTRAR(Sqrt_Op, "cpu", Aidge::SqrtImpl_cpu::create);
diff --git a/include/aidge/backend/cpu/operator/SqrtImpl_kernels.hpp b/include/aidge/backend/cpu/operator/SqrtImpl_kernels.hpp
index 1ce1ef9b..bccc195e 100644
--- a/include/aidge/backend/cpu/operator/SqrtImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/SqrtImpl_kernels.hpp
@@ -35,14 +35,16 @@ void SqrtImpl_cpu_forward_kernel(const std::size_t inputLength,
 
 template <class I, class O>
 void SqrtImpl_cpu_backward_kernel(const std::size_t inputLength,
-                                     const void* input_,
-                                     void* output_) {
+                                     const void* output_,
+                                     const void* grad_output_,
+                                     void* grad_input_) {
 
-    const I* input = static_cast<const I*>(input_);
-    O* output = static_cast<O*>(output_);
+    const I* output = static_cast<const I*>(output_);
+    const I* grad_output = static_cast<const I*>(grad_output_);
+    O* grad_input = static_cast<O*>(grad_input_);
 
     for (std::size_t i = 0; i < inputLength; ++i) {
-        output[i] = static_cast<O>(0.5/(std::sqrt(static_cast<float>(input[i]))));
+        grad_input[i] = static_cast<O>(0.5/output[i]) * grad_output[i];
     }
 }
 
diff --git a/src/operator/LeakyReLUImpl.cpp b/src/operator/LeakyReLUImpl.cpp
index 6c0802dd..2178ecc4 100644
--- a/src/operator/LeakyReLUImpl.cpp
+++ b/src/operator/LeakyReLUImpl.cpp
@@ -43,8 +43,9 @@ template <>
 void Aidge::LeakyReLUImpl_cpu::backward() {
     // reversing in and out Data for backprop
     const LeakyReLU_Op& op_ = dynamic_cast<const LeakyReLU_Op&>(mOp);
-    std::shared_ptr<Tensor> in0  = op_.getOutput(0)->grad();
-    std::shared_ptr<Tensor> out0 = op_.getInput(0)->grad();
+    std::shared_ptr<Tensor> in0 = op_.getInput(0);
+    std::shared_ptr<Tensor> out0grad = op_.getOutput(0)->grad();
+    std::shared_ptr<Tensor> in0grad = op_.getInput(0)->grad();
     AIDGE_ASSERT(in0, "missing input #0");
 
     // Find the correct kernel type
@@ -52,7 +53,8 @@ void Aidge::LeakyReLUImpl_cpu::backward() {
 
     // Call kernel
     impl.backward(op_.negativeSlope(),
-        in0->size(),
+        out0grad->size(),
         getCPUPtr(in0),
-        getCPUPtr(out0));
+        getCPUPtr(out0grad),
+        getCPUPtr(in0grad));
 }
\ No newline at end of file
diff --git a/src/operator/SqrtImpl.cpp b/src/operator/SqrtImpl.cpp
index 25bdb42f..d93bfe1f 100644
--- a/src/operator/SqrtImpl.cpp
+++ b/src/operator/SqrtImpl.cpp
@@ -40,6 +40,7 @@ template <>
 void Aidge::SqrtImpl_cpu::backward() {
     // reversing in and out Data for backprop
     const Sqrt_Op& op_ = dynamic_cast<const Sqrt_Op&>(mOp);
+    std::shared_ptr<Tensor> out0  = op_.getOutput(0);
     std::shared_ptr<Tensor> out0grad  = op_.getOutput(0)->grad();
     std::shared_ptr<Tensor> in0grad = op_.getInput(0)->grad();
     AIDGE_ASSERT(out0grad, "missing output #0");
@@ -49,6 +50,7 @@ void Aidge::SqrtImpl_cpu::backward() {
 
     // Call kernel
     impl.backward(out0grad->size(),
+        getCPUPtr(out0),
         getCPUPtr(out0grad),
         getCPUPtr(in0grad));
 }
\ No newline at end of file
-- 
GitLab


From 84e1e2adf8e5d6c72b5121d9296e226df0882c86 Mon Sep 17 00:00:00 2001
From: Maxence Naud <maxence.naud@cea.fr>
Date: Wed, 9 Apr 2025 12:47:59 +0000
Subject: [PATCH 089/108] upd: remove 'Aidge::' scope from ConvImpl::forward
 functions

---
 src/operator/ConvImpl.cpp | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/operator/ConvImpl.cpp b/src/operator/ConvImpl.cpp
index d23a9968..eae5f109 100644
--- a/src/operator/ConvImpl.cpp
+++ b/src/operator/ConvImpl.cpp
@@ -12,15 +12,18 @@
 #include "aidge/backend/cpu/operator/ConvImpl.hpp"
 #include "aidge/backend/cpu/operator/ConvImpl_kernels.hpp"
 
-#include <cassert>
+#include <memory>
+#include <vector>
 
 #include "aidge/backend/cpu/data/GetCPUPtr.h"
 #include "aidge/operator/Conv.hpp"
+#include "aidge/utils/ErrorHandling.hpp"
+#include "aidge/utils/Types.h"
 
 namespace Aidge {
 
 template <>
-void Aidge::ConvImpl1D_cpu::forward() {
+void ConvImpl1D_cpu::forward() {
     const auto& op_ = static_cast<const Conv_Op<1>&>(mOp);
 
     // FIXME: uncomment the following code once memory handling will work
@@ -53,7 +56,8 @@ void Aidge::ConvImpl1D_cpu::forward() {
     );
 }
 
-template <> void ConvImpl1D_cpu::backward() {
+template <>
+void ConvImpl1D_cpu::backward() {
     const auto &op = dynamic_cast<const Conv1D_Op &>(mOp);
     const auto &outputGrad = op.getOutput(0)->grad();
     AIDGE_ASSERT(outputGrad, "{}: missing output #0 gradient", op.type());
@@ -97,7 +101,7 @@ template <> void ConvImpl1D_cpu::backward() {
 }
 
 template <>
-void Aidge::ConvImpl2D_cpu::forward() {
+void ConvImpl2D_cpu::forward() {
     const auto& op_ = dynamic_cast<const Conv_Op<2>&>(mOp);
 
     // FIXME: uncomment the following code once memory handling will work
@@ -130,7 +134,8 @@ void Aidge::ConvImpl2D_cpu::forward() {
 }
 
 
-template <> void ConvImpl2D_cpu::backward() {
+template <>
+void ConvImpl2D_cpu::backward() {
     const auto &op = dynamic_cast<const Conv2D_Op &>(mOp);
     const auto &outputGrad = op.getOutput(0)->grad();
     AIDGE_ASSERT(outputGrad, "{}: missing output #0 gradient", op.type());
-- 
GitLab


From b57e889efde0fa9da398b45d8981fa4e464533af Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Thu, 3 Apr 2025 15:51:05 +0200
Subject: [PATCH 090/108] Added OpenMP

---
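Note: every kernel touched here uses the same guarded pattern: an OpenMP
parallel for with collapse(2) over the batch and channel loops, an if clause
so small workloads stay single-threaded, and an #ifdef _OPENMP guard so the
code still builds without OpenMP. A self-contained sketch of the pattern (the
threshold and loop body are illustrative; unsigned loop indices need an
OpenMP >= 3.0 compiler, see the Windows fix in the next patch):

    #include <cstddef>
    #include <vector>

    void scale(std::vector<float>& data, std::size_t nbBatch, std::size_t nbCh, float k) {
        const std::size_t stride = data.size() / (nbBatch * nbCh);
    #ifdef _OPENMP
        // Only spawn threads when the outer iteration space is large enough
        // to amortize their startup cost.
        #pragma omp parallel for collapse(2) if (nbBatch * nbCh > 32)
    #endif
        for (std::size_t batch = 0; batch < nbBatch; ++batch) {
            for (std::size_t ch = 0; ch < nbCh; ++ch) {
                const std::size_t base = (ch + batch * nbCh) * stride;
                for (std::size_t i = 0; i < stride; ++i)
                    data[base + i] *= k;  // each (batch, ch) writes a disjoint slice
            }
        }
    }

This is also why the kernels below recompute oIndex/iIndex from batch and ch
inside the loop body instead of advancing the output pointer across
iterations: collapsed iterations must be independent of execution order.
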
 CMakeLists.txt                                |  6 +++
 .../cpu/operator/AvgPoolingImpl_kernels.hpp   |  3 ++
 .../cpu/operator/BatchNormImpl_kernels.hpp    |  3 ++
 .../operator/ConvDepthWiseImpl_kernels.hpp    | 35 +++++++++------
 .../backend/cpu/operator/ConvImpl_kernels.hpp | 43 +++++++++++++------
 .../GlobalAveragePoolingImpl_kernels.hpp      | 19 +++++---
 .../cpu/operator/MaxPoolingImpl_kernels.hpp   |  3 ++
 .../cpu/operator/SoftmaxImpl_kernels.hpp      |  3 ++
 8 files changed, 83 insertions(+), 32 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6c87a89b..d2c1d0a7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -64,6 +64,8 @@ if(NOT $ENV{AIDGE_INSTALL} STREQUAL "")
 endif()
 find_package(aidge_core REQUIRED)
 
+find_package(OpenMP)
+
 find_package(OpenSSL QUIET)
 if(OpenSSL_FOUND)
     message(STATUS "OpenSSL found: ${OPENSSL_VERSION}")
@@ -86,6 +88,10 @@ target_link_libraries(${module_name}
         _aidge_core # _ is added because we link the exported target and not the project
 )
 
+if(OpenMP_CXX_FOUND)
+    target_link_libraries(${module_name} PUBLIC OpenMP::OpenMP_CXX)
+endif()
+
 # Add definition _USE_MATH_DEFINES to enable math constant definitions from math.h/cmath.
 if (WIN32)
     target_compile_definitions(${module_name} PRIVATE _USE_MATH_DEFINES)
diff --git a/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
index 1671759d..0d73cb91 100644
--- a/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
@@ -76,6 +76,9 @@ void AvgPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideD
 
     using signedsize = std::make_signed<std::size_t>::type;
 
+#ifdef _OPENMP
+    #pragma omp parallel for collapse(2) if (dims[0] * dims[1] > 32)
+#endif
     for (std::size_t batch = 0; batch < dims[0]; ++batch) {
         for (std::size_t ch = 0; ch < dims[1]; ++ch) {
             const std::size_t oIndex = (ch + batch * dims[1]) * oxSize * oySize;
diff --git a/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp b/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp
index cf97f737..7bb7971e 100644
--- a/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp
@@ -53,6 +53,9 @@ void BatchNormImpl2D_cpu_forward_kernel(float epsilon, float momentum, const std
     const DimSize_t featureMapSize = (dims.size() > 2) ? std::accumulate(dims.begin() + 2, dims.end(), 1, std::multiplies<DimSize_t>()) : 1;
 
     if ((freeze == true) || (momentum == 0.0f)) {
+#ifdef _OPENMP
+        #pragma omp parallel for collapse(2) if (nbBatch * nbChannels > 32)
+#endif
         for (std::size_t batch = 0; batch < nbBatch; ++batch) {
             for (std::size_t ch = 0; ch < nbChannels; ++ch) {
                 const std::size_t ioIndex = (ch + batch*nbChannels) * featureMapSize;
diff --git a/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp
index 906ea1ad..b16a819b 100644
--- a/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp
@@ -65,6 +65,9 @@ void ConvDepthWiseImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& stri
     // weight (outCh, ch, kernelX, kernelY)
     // does not take Dilation attribute into account
     using signedsize = std::make_signed<std::size_t>::type;
+#ifdef _OPENMP
+    #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] > 32)
+#endif
     for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
         for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
             const std::size_t oIndex = (ch + batch*inputDims[1]) * oxSize;
@@ -152,16 +155,19 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
     const std::size_t outChannels_s =  oxSize * oySize;
 
     if (dilated_kernel_x ==3 && dilated_kernel_y == 3) {
+#ifdef _OPENMP
+        #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] > 32)
+#endif
         for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
             for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
-
                 B biasVal = (biases != nullptr) ? biases[ch] : B(0);
 
+                std::size_t oIndex = (ch + batch*inputDims[1]) * outChannels_s;
                 std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
                 const std::size_t wIndex = ch * 9;
 
                 if (strideDims[0] == 1 && strideDims[1]==1) {
-                    for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=inputDims[3]) {
+                    for (std::size_t ox = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=inputDims[3]) {
                         for (std::size_t oy = 0; oy < oySize; ++oy) {
                             output[oIndex + oy] = biasVal + weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+1]+weights[wIndex+2]*input[iIndex+oy+2];
                         }
@@ -175,7 +181,7 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
                         }
                     }
                 } else {
-                    for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=(strideDims[0]-2)*inputDims[3]) {
+                    for (std::size_t ox = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=(strideDims[0]-2)*inputDims[3]) {
                         for (std::size_t oy = 0; oy < oySize; ++oy) {
                             output[oIndex + oy] = biasVal + weights[wIndex+0]*input[iIndex+oy*strideDims[1]]+weights[wIndex+1]*input[iIndex+oy*strideDims[1]+1]+weights[wIndex+2]*input[iIndex+oy*strideDims[1]+2];
                         }
@@ -189,24 +195,25 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
                         }
                     }
                 }
-                output += outChannels_s;
             }
         }
     } else if (dilated_kernel_x == 1 && dilated_kernel_y == 1) {
+#ifdef _OPENMP
+        #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] > 32)
+#endif
         for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
             for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
-
                 B biasVal = (biases != nullptr) ? biases[ch] : B(0);
 
+                std::size_t oIndex = (ch + batch*inputDims[1]) * outChannels_s;
                 std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
                 const std::size_t wIndex = ch;
 
                 if (strideDims[0] == 1 && strideDims[1] == 1) {
                     for (std::size_t i = iIndex; i < iIndex + oxSize*oySize; ++i) {
-                        output[i] = biasVal + weights[wIndex] * input[i];
+                        output[oIndex + i] = biasVal + weights[wIndex] * input[i];
                     }
                 } else  {
-                    std::size_t oIndex =  (ch + batch*inputDims[1]) * oxSize * oySize;
                     for (std::size_t ox = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=strideDims[0]*inputDims[3]) {
                         for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) {
                             output[oIndex + oy] = biasVal + weights[wIndex]*input[iIndex+iy];
@@ -216,19 +223,22 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
             }
         }
     } else {
+#ifdef _OPENMP
+        #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] > 32)
+#endif
         for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
             for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
-
-                B biasVal = (biases != nullptr) ? biases[ch] : B(0);
-                std::fill(output, output+outChannels_s, biasVal);
-
+                const std::size_t oIndex = (ch + batch*inputDims[1]) * outChannels_s;
                 const std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
                 const std::size_t wIndex = ch * kernelDims[0] * kernelDims[1];
 
+                B biasVal = (biases != nullptr) ? biases[ch] : B(0);
+                std::fill(output + oIndex, output + oIndex + outChannels_s, biasVal);
+
                 for (std::size_t ox = 0; ox < oxSize; ++ox) {
                     for (std::size_t oy = 0; oy < oySize; ++oy) {
 
-                        const std::size_t oIndexFull = ox*oySize + oy;
+                        const std::size_t oIndexFull = oIndex + ox*oySize + oy;
                         const std::size_t ix = ox * strideDims[0];
                         const std::size_t iy = oy * strideDims[1];
 
@@ -240,7 +250,6 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
                         }
                     }
                 }
-                output += outChannels_s;
             }
         }
     }
diff --git a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
index 29aac6dc..b1cd006e 100644
--- a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
@@ -59,6 +59,9 @@ void ConvImpl1D_cpu_forward_kernel(const array<DimSize_t, 1> &strideDim,
     const DimSize_t dilated_kernel_x = dilationDim[0] * (kernelDim[0] - 1) + 1;
 
     using signedsize = std::make_signed<std::size_t>::type;
+#ifdef _OPENMP
+    #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels > 32)
+#endif
     for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
         for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
             const std::size_t oIndex = (outCh + batch * outChannels) * oxSize;
@@ -478,18 +481,24 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
     const std::size_t outChannels_s = oxSize * oySize;
 
     if (dilated_kernel_x == 3 && dilated_kernel_y == 3) {
+#ifdef _OPENMP
+        #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels > 32)
+#endif
         for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
             for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
+                std::size_t oIndex = (outCh + batch*inputDims[1]) * outChannels_s;
+
                 // If bias = nullptr, set B(0)
                 B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
-                std::fill(output, output + outChannels_s, biasVal);
+                std::fill(output + oIndex, output + oIndex + outChannels_s, biasVal);
                 for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
+                    oIndex = (outCh + batch*inputDims[1]) * outChannels_s;
                     std::size_t iIndex = (inCh + batch * inputDims[1]) *
                                          inputDims[2] * inputDims[3];
                     const std::size_t wIndex =
                         (inCh + outCh * inputDims[1]) * 9;
                     if (strideDims[0] == 1 && strideDims[1] == 1) {
-                        for (std::size_t ox = 0, oIndex = 0; ox < oxSize;
+                        for (std::size_t ox = 0; ox < oxSize;
                              ++ox, oIndex += oySize, iIndex -= inputDims[3]) {
                             for (std::size_t oy = 0; oy < oySize; ++oy) {
                                 output[oIndex + oy] +=
@@ -519,7 +528,7 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
                             }
                         }
                     } else {
-                        for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox,
+                        for (std::size_t ox = 0; ox < oxSize; ++ox,
                                          oIndex += oySize,
                                          iIndex += (strideDims[0] -
                                                     2) * inputDims[3]) {
@@ -558,26 +567,30 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
                         }
                     }
                 }
-                output += outChannels_s;
             }
         }
     } else if (dilated_kernel_x == 1 && dilated_kernel_y == 1) {
+#ifdef _OPENMP
+        #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels > 32)
+#endif
         for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
             for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
+                std::size_t oIndex = (outCh + batch*inputDims[1]) * outChannels_s;
+
                 // If bias = nullptr, set B(0)
                 B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
-                std::fill(output, output + outChannels_s, biasVal);
+                std::fill(output + oIndex, output + oIndex + outChannels_s, biasVal);
                 for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
+                    oIndex = (outCh + batch*inputDims[1]) * outChannels_s;
                     std::size_t iIndex = (inCh + batch * inputDims[1]) *
                                          inputDims[2] * inputDims[3];
                     const std::size_t wIndex = (inCh + outCh * inputDims[1]);
                     if (strideDims[0] == 1 && strideDims[1] == 1) {
-                        for (std::size_t oIndex = 0; oIndex < oxSize * oySize;
-                             ++oIndex, ++iIndex) {
-                            output[oIndex] += weights[wIndex] * input[iIndex];
+                        for (std::size_t i = 0; i < outChannels_s; ++i) {
+                            output[oIndex + i] += weights[wIndex] * input[iIndex + i];
                         }
                     } else {
-                        for (std::size_t ox = 0, oIndex = 0; ox < oxSize;
+                        for (std::size_t ox = 0; ox < oxSize;
                              ++ox,
                                          oIndex += oySize,
                                          iIndex +=
@@ -590,16 +603,21 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
                         }
                     }
                 }
-                output += outChannels_s;
             }
         }
     } else {
+#ifdef _OPENMP
+        #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels > 32)
+#endif
         for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
             for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
+                std::size_t oIndex = (outCh + batch*inputDims[1]) * outChannels_s;
+
                 // If bias = nullptr, set B(0)
                 B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
-                std::fill(output, output + outChannels_s, biasVal);
+                std::fill(output + oIndex, output + oIndex + outChannels_s, biasVal);
                 for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
+                    oIndex = (outCh + batch*inputDims[1]) * outChannels_s;
                     std::size_t iIndex_channel =
                         (inCh + batch * inputDims[1]) * inputDims[2] *
                         inputDims[3];
@@ -607,7 +625,7 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
                                                kernelDims[0] * kernelDims[1];
 
                    // loop over each output line
-                    for (std::size_t ox = 0, oIndex = 0; ox < oxSize;
+                    for (std::size_t ox = 0; ox < oxSize;
                          ++ox,
                                      oIndex += oySize,
                                      iIndex_channel +=
@@ -633,7 +651,6 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
                         }
                     }
                 }
-                output += outChannels_s;
             }
         }
     }
diff --git a/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp
index cbe4f110..3915adb3 100644
--- a/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp
@@ -63,18 +63,25 @@ void GlobalAveragePoolingImpl_cpu_forward_kernel(const std::shared_ptr<Tensor>&
     using O = cpptype_t<DT_O>;
     const I *input = static_cast<const I *>(inputTensor->getImpl()->rawPtr());
     O *output = static_cast<O *>(output_);
-    const auto& dims = inputTensor->dims();
 
-    const DimSize_t strides_channels = inputTensor->strides()[1];
+    const auto& dims = inputTensor->dims();
+    DimSize_t nb_elems = std::accumulate(dims.begin(), dims.end(), std::size_t(1),
+                                         std::multiplies<std::size_t>());
+
+    const DimSize_t in_batch_nb_elems{nb_elems / dims[0]};
+    const DimSize_t in_channel_nb_elems{in_batch_nb_elems / dims[1]};
+    const DimSize_t out_batch_nb_elems{dims[1]};
 
     // parse channel by channel and fill each output with the average of the
     // values in the channel
-    std::size_t input_idx = 0;
-    std::size_t output_idx = 0;
+#ifdef _OPENMP
+    #pragma omp parallel for collapse(2) if (dims[0] * dims[1] > 32)
+#endif
     for (DimSize_t batch = 0; batch < dims[0]; ++batch) {
         for (DimSize_t channel = 0; channel < dims[1]; ++channel) {
-            output[output_idx++] = castFromFloat<O>(stableMean<I>(input + input_idx, strides_channels));
-            input_idx += strides_channels;
+            const I *filter_start = std::next(
+                input, (batch * in_batch_nb_elems) + (channel * in_channel_nb_elems));
+            output[batch * out_batch_nb_elems + channel] = castFromFloat<O>(stableMean<I>(filter_start, in_channel_nb_elems));
         }
     }
 }
diff --git a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
index 9a52c149..9772b0ab 100644
--- a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
@@ -66,6 +66,9 @@ void MaxPoolingImpl2D_cpu_forward_kernel(
 
   using signedsize = std::make_signed<std::size_t>::type;
 
+#ifdef _OPENMP
+    #pragma omp parallel for collapse(2) if (dims[0] * dims[1] > 32)
+#endif
   for (std::size_t batch = 0; batch < dims[0]; ++batch){
     for (std::size_t channel = 0; channel < dims[1]; ++channel){
       auto batchChannelIndex = (channel + batch * dims[1]);
diff --git a/include/aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp b/include/aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp
index 07486a48..e74f3518 100644
--- a/include/aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp
@@ -37,6 +37,9 @@ void SoftmaxImpl_cpu_forward_kernel(std::size_t axisIdx, const std::vector<DimSi
         preAxisElems *= inputDims[i];
     }
 
+#ifdef _OPENMP
+    #pragma omp parallel for collapse(2) if (preAxisElems * postAxisElems > 32)
+#endif
     for (std::size_t i = 0; i < preAxisElems; ++i) {
         for (std::size_t j = 0; j < postAxisElems; ++j) {
             I maxVal = input[i * inputDims[axisIdx] * postAxisElems + j];
-- 
GitLab


From c08a3fa7616efb157fc84f1a357f3ac5349e8a33 Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Fri, 4 Apr 2025 10:06:54 +0200
Subject: [PATCH 091/108] Fixed Windows build

---
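Note: MSVC only implements OpenMP 2.0, which requires the loop variable of a
parallel for to have a signed integer type. Hence the std::size_t induction
variables of the parallelized loops become int, with the bounds cast through
static_cast<int>; the loop bodies can keep using std::size_t arithmetic.
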
 .../cpu/operator/AvgPoolingImpl_kernels.hpp      |  4 ++--
 .../cpu/operator/BatchNormImpl_kernels.hpp       |  4 ++--
 .../cpu/operator/ConvDepthWiseImpl_kernels.hpp   | 16 ++++++++--------
 .../backend/cpu/operator/ConvImpl_kernels.hpp    | 16 ++++++++--------
 .../GlobalAveragePoolingImpl_kernels.hpp         |  4 ++--
 .../cpu/operator/MaxPoolingImpl_kernels.hpp      |  4 ++--
 .../backend/cpu/operator/SoftmaxImpl_kernels.hpp |  4 ++--
 7 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
index 0d73cb91..e7bc3a2b 100644
--- a/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
@@ -79,8 +79,8 @@ void AvgPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideD
 #ifdef _OPENMP
     #pragma omp parallel for collapse(2) if (dims[0] * dims[1] > 32)
 #endif
-    for (std::size_t batch = 0; batch < dims[0]; ++batch) {
-        for (std::size_t ch = 0; ch < dims[1]; ++ch) {
+    for (int batch = 0; batch < static_cast<int>(dims[0]); ++batch) {
+        for (int ch = 0; ch < static_cast<int>(dims[1]); ++ch) {
             const std::size_t oIndex = (ch + batch * dims[1]) * oxSize * oySize;
             const std::size_t iIndex = (ch + batch * dims[1]) * dims[2] * dims[3];
 
diff --git a/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp b/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp
index 7bb7971e..105a3300 100644
--- a/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp
@@ -56,8 +56,8 @@ void BatchNormImpl2D_cpu_forward_kernel(float epsilon, float momentum, const std
 #ifdef _OPENMP
         #pragma omp parallel for collapse(2) if (nbBatch * nbChannels > 32)
 #endif
-        for (std::size_t batch = 0; batch < nbBatch; ++batch) {
-            for (std::size_t ch = 0; ch < nbChannels; ++ch) {
+        for (int batch = 0; batch < static_cast<int>(nbBatch); ++batch) {
+            for (int ch = 0; ch < static_cast<int>(nbChannels); ++ch) {
                 const std::size_t ioIndex = (ch + batch*nbChannels) * featureMapSize;
                 std::fill(output + ioIndex, output + ioIndex + featureMapSize, shift[ch]);
                 const P var = std::sqrt(batchVar[ch] + static_cast<P>(epsilon));
diff --git a/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp
index b16a819b..3019b1d2 100644
--- a/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp
@@ -68,8 +68,8 @@ void ConvDepthWiseImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& stri
 #ifdef _OPENMP
     #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] > 32)
 #endif
-    for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
-        for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
+    for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
+        for (int ch = 0; ch < static_cast<int>(inputDims[1]); ++ch) {
             const std::size_t oIndex = (ch + batch*inputDims[1]) * oxSize;
             B biasVal = (biases != nullptr) ? biases[ch] : B(0);
             std::fill(output + oIndex, output+(oIndex+oxSize), biasVal);
@@ -158,8 +158,8 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
 #ifdef _OPENMP
         #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] > 32)
 #endif
-        for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
-            for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
+        for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
+            for (int ch = 0; ch < static_cast<int>(inputDims[1]); ++ch) {
                 B biasVal = (biases != nullptr) ? biases[ch] : B(0);
 
                 std::size_t oIndex = (ch + batch*inputDims[1]) * outChannels_s;
@@ -201,8 +201,8 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
 #ifdef _OPENMP
         #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] > 32)
 #endif
-        for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
-            for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
+        for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
+            for (int ch = 0; ch < static_cast<int>(inputDims[1]); ++ch) {
                 B biasVal = (biases != nullptr) ? biases[ch] : B(0);
 
                 std::size_t oIndex = (ch + batch*inputDims[1]) * outChannels_s;
@@ -226,8 +226,8 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
 #ifdef _OPENMP
         #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] > 32)
 #endif
-        for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
-            for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
+        for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
+            for (int ch = 0; ch < static_cast<int>(inputDims[1]); ++ch) {
                 const std::size_t oIndex = (ch + batch*inputDims[1]) * outChannels_s;
                 const std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
                 const std::size_t wIndex = ch * kernelDims[0] * kernelDims[1];
diff --git a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
index b1cd006e..d7276160 100644
--- a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
@@ -62,8 +62,8 @@ void ConvImpl1D_cpu_forward_kernel(const array<DimSize_t, 1> &strideDim,
 #ifdef _OPENMP
     #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels > 32)
 #endif
-    for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
-        for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
+    for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
+        for (int outCh = 0; outCh < static_cast<int>(outChannels); ++outCh) {
             const std::size_t oIndex = (outCh + batch * outChannels) * oxSize;
             // If bias = nullptr, set B(0)
             B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
@@ -484,8 +484,8 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
 #ifdef _OPENMP
         #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels > 32)
 #endif
-        for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
-            for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
+        for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
+            for (int outCh = 0; outCh < static_cast<int>(outChannels); ++outCh) {
                 std::size_t oIndex = (outCh + batch*inputDims[1]) * outChannels_s;
 
                 // If bias = nullptr, set B(0)
@@ -573,8 +573,8 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
 #ifdef _OPENMP
         #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels > 32)
 #endif
-        for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
-            for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
+        for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
+            for (int outCh = 0; outCh < static_cast<int>(outChannels); ++outCh) {
                 std::size_t oIndex = (outCh + batch*inputDims[1]) * outChannels_s;
 
                 // If bias = nullptr, set B(0)
@@ -609,8 +609,8 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
 #ifdef _OPENMP
         #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels > 32)
 #endif
-        for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
-            for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
+        for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
+            for (int outCh = 0; outCh < static_cast<int>(outChannels); ++outCh) {
                 std::size_t oIndex = (outCh + batch*inputDims[1]) * outChannels_s;
 
                 // If bias = nullptr, set B(0)
diff --git a/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp
index 3915adb3..8ff1ad08 100644
--- a/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp
@@ -77,8 +77,8 @@ void GlobalAveragePoolingImpl_cpu_forward_kernel(const std::shared_ptr<Tensor>&
 #ifdef _OPENMP
     #pragma omp parallel for collapse(2) if (dims[0] * dims[1] > 32)
 #endif
-    for (DimSize_t batch = 0; batch < dims[0]; ++batch) {
-        for (DimSize_t channel = 0; channel < dims[1]; ++channel) {
+    for (int batch = 0; batch < static_cast<int>(dims[0]); ++batch) {
+        for (int channel = 0; channel < static_cast<int>(dims[1]); ++channel) {
             const I *filter_start = std::next(
                 input, (batch * in_batch_nb_elems) + (channel * in_channel_nb_elems));
             output[batch * out_batch_nb_elems + channel] = castFromFloat<O>(stableMean<I>(filter_start, in_channel_nb_elems));
diff --git a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
index 9772b0ab..b5f219f9 100644
--- a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
@@ -69,8 +69,8 @@ void MaxPoolingImpl2D_cpu_forward_kernel(
 #ifdef _OPENMP
     #pragma omp parallel for collapse(2) if (dims[0] * dims[1] > 32)
 #endif
-  for (std::size_t batch = 0; batch < dims[0]; ++batch){
-    for (std::size_t channel = 0; channel < dims[1]; ++channel){
+  for (int batch = 0; batch < static_cast<int>(dims[0]); ++batch){
+    for (int channel = 0; channel < static_cast<int>(dims[1]); ++channel){
       auto batchChannelIndex = (channel + batch * dims[1]);
       const std::size_t outputBaseIndex = batchChannelIndex * outXSize * outYSize;
       const std::size_t inputBaseIndex = batchChannelIndex * dims[2] * dims[3];
diff --git a/include/aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp b/include/aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp
index e74f3518..ab6790e2 100644
--- a/include/aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp
@@ -40,8 +40,8 @@ void SoftmaxImpl_cpu_forward_kernel(std::size_t axisIdx, const std::vector<DimSi
 #ifdef _OPENMP
     #pragma omp parallel for collapse(2) if (preAxisElems * postAxisElems > 32)
 #endif
-    for (std::size_t i = 0; i < preAxisElems; ++i) {
-        for (std::size_t j = 0; j < postAxisElems; ++j) {
+    for (int i = 0; i < static_cast<int>(preAxisElems); ++i) {
+        for (int j = 0; j < static_cast<int>(postAxisElems); ++j) {
             I maxVal = input[i * inputDims[axisIdx] * postAxisElems + j];
             for (std::size_t k = 1; k < inputDims[axisIdx]; ++k) {
                 std::size_t inIdx = i * inputDims[axisIdx] * postAxisElems + k * postAxisElems + j;
-- 
GitLab


From 472e2c62c9ba5c76f1dc6bbe55686d85976aa28f Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Mon, 7 Apr 2025 14:23:28 +0200
Subject: [PATCH 092/108] Optimized FC impl

---
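Note: the rewrite drops the separate bias-initialization pass by seeding
std::inner_product with the bias value, which makes every (batch, out) output
cell independent and therefore safe to compute under a collapsed parallel
for. A reduced sketch of the resulting kernel shape (plain arrays, names
illustrative):

    #include <cstddef>
    #include <numeric>

    // out[o + b*outSize] = bias[o] + dot(input row b, weight row o);
    // each cell is written exactly once, so the collapsed loops are race-free.
    void fc_forward(const float* in, const float* w, const float* bias, float* out,
                    std::size_t batchSize, std::size_t inSize, std::size_t outSize) {
    #ifdef _OPENMP
        #pragma omp parallel for collapse(2) if (batchSize * outSize > 32)
    #endif
        for (int b = 0; b < static_cast<int>(batchSize); ++b) {
            for (int o = 0; o < static_cast<int>(outSize); ++o) {
                const float biasVal = bias ? bias[o] : 0.0f;
                out[o + b * outSize] = std::inner_product(
                    in + b * inSize, in + (b + 1) * inSize, // input row b
                    w + o * inSize,                         // weight row o
                    biasVal);                               // accumulator seeded with the bias
            }
        }
    }
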
 .../backend/cpu/operator/FCImpl_kernels.hpp   | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
index b77f749f..ca4d5def 100644
--- a/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
@@ -96,21 +96,16 @@ void FCImpl_cpu_forward_kernel(const DimSize_t batchSize,
     const B* biases = static_cast<const B*>(biases_);
     O* output = static_cast<O*>(output_);
 
-    if (biases == nullptr) {
-        std::fill(output, output+(batchSize*outputFeatureSize), B(0));
-    }
-    else {
-        for (std::size_t batch = 0; batch < batchSize; ++batch) {
-            std::copy(biases, biases+outputFeatureSize, output+(batch*outputFeatureSize));
-        }
-    }
-
-    for (std::size_t batch = 0; batch < batchSize; ++batch) {
-        for (std::size_t out = 0; out < outputFeatureSize; ++out) {
+#ifdef _OPENMP
+    #pragma omp parallel for collapse(2) if (batchSize * outputFeatureSize > 32)
+#endif
+    for (int batch = 0; batch < static_cast<int>(batchSize); ++batch) {
+        for (int out = 0; out < static_cast<int>(outputFeatureSize); ++out) {
+            const auto biasVal = (biases) ? biases[out] : B(0);
             output[out + batch*outputFeatureSize] = std::inner_product(input + batch*inputFeatureSize,
                                                         input + (batch + 1)*inputFeatureSize,
                                                         weights + out*inputFeatureSize,
-                                                        output[out + batch*outputFeatureSize]);
+                                                        biasVal);
         }
     }
 }
-- 
GitLab


From b9b5fa3abdccf83e77015908085eabfa7b033859 Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Mon, 7 Apr 2025 15:06:50 +0200
Subject: [PATCH 093/108] Fixed wrong offsets

---
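Note: in the NCHW output layout the base offset of output channel outCh in
batch 'batch' is

    oIndex = (outCh + batch * outChannels) * oxSize * oySize

The previous code multiplied batch by inputDims[1] (the *input* channel
count), which is only correct when the convolution preserves the channel
count; whenever inputDims[1] != outChannels, batches past the first were
written at the wrong offsets.
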
 .../cpu/operator/ConvDepthWiseImpl_kernels.hpp       |  4 ++--
 .../aidge/backend/cpu/operator/ConvImpl_kernels.hpp  | 12 ++++++------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp
index 3019b1d2..aac83b1b 100644
--- a/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp
@@ -210,8 +210,8 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
                 const std::size_t wIndex = ch;
 
                 if (strideDims[0] == 1 && strideDims[1] == 1) {
-                    for (std::size_t i = iIndex; i < iIndex + oxSize*oySize; ++i) {
-                        output[oIndex + i] = biasVal + weights[wIndex] * input[i];
+                    for (std::size_t i = 0; i < oxSize*oySize; ++i) {
+                        output[oIndex + i] = biasVal + weights[wIndex] * input[iIndex + i];
                     }
                 } else  {
                     for (std::size_t ox = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=strideDims[0]*inputDims[3]) {
diff --git a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
index d7276160..fc3904ad 100644
--- a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
@@ -486,13 +486,13 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
 #endif
         for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
             for (int outCh = 0; outCh < static_cast<int>(outChannels); ++outCh) {
-                std::size_t oIndex = (outCh + batch*inputDims[1]) * outChannels_s;
+                std::size_t oIndex = (outCh + batch*outChannels) * outChannels_s;
 
                 // If bias = nullptr, set B(0)
                 B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
                 std::fill(output + oIndex, output + oIndex + outChannels_s, biasVal);
                 for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
-                    oIndex = (outCh + batch*inputDims[1]) * outChannels_s;
+                    oIndex = (outCh + batch*outChannels) * outChannels_s;
                     std::size_t iIndex = (inCh + batch * inputDims[1]) *
                                          inputDims[2] * inputDims[3];
                     const std::size_t wIndex =
@@ -575,13 +575,13 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
 #endif
         for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
             for (int outCh = 0; outCh < static_cast<int>(outChannels); ++outCh) {
-                std::size_t oIndex = (outCh + batch*inputDims[1]) * outChannels_s;
+                std::size_t oIndex = (outCh + batch*outChannels) * outChannels_s;
 
                 // If bias = nullptr, set B(0)
                 B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
                 std::fill(output + oIndex, output + oIndex + outChannels_s, biasVal);
                 for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
-                    oIndex = (outCh + batch*inputDims[1]) * outChannels_s;
+                    oIndex = (outCh + batch*outChannels) * outChannels_s;
                     std::size_t iIndex = (inCh + batch * inputDims[1]) *
                                          inputDims[2] * inputDims[3];
                     const std::size_t wIndex = (inCh + outCh * inputDims[1]);
@@ -611,13 +611,13 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
 #endif
         for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
             for (int outCh = 0; outCh < static_cast<int>(outChannels); ++outCh) {
-                std::size_t oIndex = (outCh + batch*inputDims[1]) * outChannels_s;
+                std::size_t oIndex = (outCh + batch*outChannels) * outChannels_s;
 
                 // If bias = nullptr, set B(0)
                 B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
                 std::fill(output + oIndex, output + oIndex + outChannels_s, biasVal);
                 for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
-                    oIndex = (outCh + batch*inputDims[1]) * outChannels_s;
+                    oIndex = (outCh + batch*outChannels) * outChannels_s;
                     std::size_t iIndex_channel =
                         (inCh + batch * inputDims[1]) * inputDims[2] *
                         inputDims[3];
-- 
GitLab


From 6299d19022b79154ed34ca7b9090be10f2c8de8a Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Mon, 7 Apr 2025 15:13:08 +0200
Subject: [PATCH 094/108] Added OpenMP to MatMul

---
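Note: only the outer i loop (over the n output rows) is parallelized: each
thread then owns a disjoint range of rows, so the output[i*m + j] += ...
accumulation is race-free without atomics. Parallelizing the inner l
(reduction) loop instead would have several threads accumulating into the
same output elements.
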
 include/aidge/backend/cpu/operator/MatMulImpl_kernels.hpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/aidge/backend/cpu/operator/MatMulImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MatMulImpl_kernels.hpp
index 5fc13baf..ec55d31a 100644
--- a/include/aidge/backend/cpu/operator/MatMulImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/MatMulImpl_kernels.hpp
@@ -26,6 +26,9 @@ void MatMulImpl_cpu_forward_kernel(const std::size_t n, const std::size_t k, con
 
     std::memset(output, O(0), n * m * sizeof(O));
 
+#ifdef _OPENMP
+    #pragma omp parallel for if (n > 32)
+#endif
     for (std::size_t i = 0; i < n; ++i) {
         for (std::size_t l = 0; l < k; ++l) {
             for (std::size_t j = 0; j < m; ++j) {
-- 
GitLab


From 9545a3a7530aeb0636d0c993c71af5b2fa7b8b9b Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Mon, 7 Apr 2025 15:30:14 +0200
Subject: [PATCH 095/108] Fixed Windows build

---
 include/aidge/backend/cpu/operator/MatMulImpl_kernels.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/aidge/backend/cpu/operator/MatMulImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MatMulImpl_kernels.hpp
index ec55d31a..422020a6 100644
--- a/include/aidge/backend/cpu/operator/MatMulImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/MatMulImpl_kernels.hpp
@@ -29,7 +29,7 @@ void MatMulImpl_cpu_forward_kernel(const std::size_t n, const std::size_t k, con
 #ifdef _OPENMP
     #pragma omp parallel for if (n > 32)
 #endif
-    for (std::size_t i = 0; i < n; ++i) {
+    for (int i = 0; i < static_cast<int>(n); ++i) {
         for (std::size_t l = 0; l < k; ++l) {
             for (std::size_t j = 0; j < m; ++j) {
                 output[i*m + j] += static_cast<O>(input1[i*k + l] * input2[l*m + j]);
-- 
GitLab


From 77cd45748e96a401aca25c75cf19f3e98ab7bc25 Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Tue, 8 Apr 2025 09:41:08 +0200
Subject: [PATCH 096/108] Fix backward not being run in unit test

---
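Note: myProd -> addChild(gv) only connects the producer to the graph's input;
it does not make the node part of the GraphView itself, so the scheduler
built from gv never saw it, which appears to be why the backward pass was
silently skipped. gv->add(myProd) registers the producer in the scheduled
graph.
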
 unit_tests/scheduler/Test_Scheduler.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/unit_tests/scheduler/Test_Scheduler.cpp b/unit_tests/scheduler/Test_Scheduler.cpp
index eed4185d..0dfdbb30 100644
--- a/unit_tests/scheduler/Test_Scheduler.cpp
+++ b/unit_tests/scheduler/Test_Scheduler.cpp
@@ -436,6 +436,7 @@ TEST_CASE("[cpu/scheduler] SequentialScheduler(backward)", "[scheduler][backward
     // implem already set to default
     auto myProd = Producer(inputTensor, "prod");
     myProd -> addChild(gv);
+    gv->add(myProd);
     gv -> compile("cpu", DataType::Float32);
 
     SequentialScheduler scheduler(gv);
-- 
GitLab


From c1849ef26808f618c80fc81df7161f9b44e0111a Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Tue, 8 Apr 2025 19:38:46 +0200
Subject: [PATCH 097/108] Reduced OpenMP thresholds

---
 .../aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp | 2 +-
 .../aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp  | 2 +-
 .../backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp    | 8 ++++----
 include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp   | 8 ++++----
 include/aidge/backend/cpu/operator/FCImpl_kernels.hpp     | 2 +-
 .../cpu/operator/GlobalAveragePoolingImpl_kernels.hpp     | 2 +-
 include/aidge/backend/cpu/operator/MatMulImpl_kernels.hpp | 2 +-
 .../aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp | 2 +-
 .../aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp    | 2 +-
 9 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
index e7bc3a2b..f9cc13b5 100644
--- a/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
@@ -77,7 +77,7 @@ void AvgPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideD
     using signedsize = std::make_signed<std::size_t>::type;
 
 #ifdef _OPENMP
-    #pragma omp parallel for collapse(2) if (dims[0] * dims[1] > 32)
+    #pragma omp parallel for collapse(2) if (dims[0] * dims[1] >= 16)
 #endif
     for (int batch = 0; batch < static_cast<int>(dims[0]); ++batch) {
         for (int ch = 0; ch < static_cast<int>(dims[1]); ++ch) {
diff --git a/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp b/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp
index 105a3300..d1d7d529 100644
--- a/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp
@@ -54,7 +54,7 @@ void BatchNormImpl2D_cpu_forward_kernel(float epsilon, float momentum, const std
 
     if ((freeze == true) || (momentum == 0.0f)) {
 #ifdef _OPENMP
-        #pragma omp parallel for collapse(2) if (nbBatch * nbChannels > 32)
+        #pragma omp parallel for collapse(2) if (nbBatch * nbChannels >= 16)
 #endif
         for (int batch = 0; batch < static_cast<int>(nbBatch); ++batch) {
             for (int ch = 0; ch < static_cast<int>(nbChannels); ++ch) {
diff --git a/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp
index aac83b1b..0e2f5a72 100644
--- a/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp
@@ -66,7 +66,7 @@ void ConvDepthWiseImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& stri
     // does not take Dilation attribute into account
     using signedsize = std::make_signed<std::size_t>::type;
 #ifdef _OPENMP
-    #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] > 32)
+    #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] >= 16)
 #endif
     for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
         for (int ch = 0; ch < static_cast<int>(inputDims[1]); ++ch) {
@@ -156,7 +156,7 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
 
     if (dilated_kernel_x ==3 && dilated_kernel_y == 3) {
 #ifdef _OPENMP
-        #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] > 32)
+        #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] >= 16)
 #endif
         for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
             for (int ch = 0; ch < static_cast<int>(inputDims[1]); ++ch) {
@@ -199,7 +199,7 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
         }
     } else if (dilated_kernel_x == 1 && dilated_kernel_y == 1) {
 #ifdef _OPENMP
-        #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] > 32)
+        #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] >= 16)
 #endif
         for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
             for (int ch = 0; ch < static_cast<int>(inputDims[1]); ++ch) {
@@ -224,7 +224,7 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
         }
     } else {
 #ifdef _OPENMP
-        #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] > 32)
+        #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] >= 16)
 #endif
         for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
             for (int ch = 0; ch < static_cast<int>(inputDims[1]); ++ch) {
diff --git a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
index fc3904ad..e1e76a33 100644
--- a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
@@ -60,7 +60,7 @@ void ConvImpl1D_cpu_forward_kernel(const array<DimSize_t, 1> &strideDim,
 
     using signedsize = std::make_signed<std::size_t>::type;
 #ifdef _OPENMP
-    #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels > 32)
+    #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels >= 16)
 #endif
     for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
         for (int outCh = 0; outCh < static_cast<int>(outChannels); ++outCh) {
@@ -482,7 +482,7 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
 
     if (dilated_kernel_x == 3 && dilated_kernel_y == 3) {
 #ifdef _OPENMP
-        #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels > 32)
+        #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels >= 16)
 #endif
         for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
             for (int outCh = 0; outCh < static_cast<int>(outChannels); ++outCh) {
@@ -571,7 +571,7 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
         }
     } else if (dilated_kernel_x == 1 && dilated_kernel_y == 1) {
 #ifdef _OPENMP
-        #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels > 32)
+        #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels >= 16)
 #endif
         for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
             for (int outCh = 0; outCh < static_cast<int>(outChannels); ++outCh) {
@@ -607,7 +607,7 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
         }
     } else {
 #ifdef _OPENMP
-        #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels > 32)
+        #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels >= 16)
 #endif
         for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
             for (int outCh = 0; outCh < static_cast<int>(outChannels); ++outCh) {
diff --git a/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
index ca4d5def..b03e7f58 100644
--- a/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
@@ -97,7 +97,7 @@ void FCImpl_cpu_forward_kernel(const DimSize_t batchSize,
     O* output = static_cast<O*>(output_);
 
 #ifdef _OPENMP
-    #pragma omp parallel for collapse(2) if (batchSize * outputFeatureSize > 32)
+    #pragma omp parallel for collapse(2) if (batchSize * outputFeatureSize >= 16)
 #endif
     for (int batch = 0; batch < static_cast<int>(batchSize); ++batch) {
         for (int out = 0; out < static_cast<int>(outputFeatureSize); ++out) {
diff --git a/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp
index 8ff1ad08..3cab0ad9 100644
--- a/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp
@@ -75,7 +75,7 @@ void GlobalAveragePoolingImpl_cpu_forward_kernel(const std::shared_ptr<Tensor>&
     // parse channel by channel and fill each output with the average of the
     // values in the channel
 #ifdef _OPENMP
-    #pragma omp parallel for collapse(2) if (dims[0] * dims[1] > 32)
+    #pragma omp parallel for collapse(2) if (dims[0] * dims[1] >= 16)
 #endif
     for (int batch = 0; batch < static_cast<int>(dims[0]); ++batch) {
         for (int channel = 0; channel < static_cast<int>(dims[1]); ++channel) {
diff --git a/include/aidge/backend/cpu/operator/MatMulImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MatMulImpl_kernels.hpp
index 422020a6..adcc8ddc 100644
--- a/include/aidge/backend/cpu/operator/MatMulImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/MatMulImpl_kernels.hpp
@@ -27,7 +27,7 @@ void MatMulImpl_cpu_forward_kernel(const std::size_t n, const std::size_t k, con
     std::memset(output, O(0), n * m * sizeof(O));
 
 #ifdef _OPENMP
-    #pragma omp parallel for if (n > 32)
+    #pragma omp parallel for if (n >= 16)
 #endif
     for (int i = 0; i < static_cast<int>(n); ++i) {
         for (std::size_t l = 0; l < k; ++l) {
diff --git a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
index b5f219f9..7fe272d5 100644
--- a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
@@ -67,7 +67,7 @@ void MaxPoolingImpl2D_cpu_forward_kernel(
   using signedsize = std::make_signed<std::size_t>::type;
 
 #ifdef _OPENMP
-    #pragma omp parallel for collapse(2) if (dims[0] * dims[1] > 32)
+    #pragma omp parallel for collapse(2) if (dims[0] * dims[1] >= 16)
 #endif
   for (int batch = 0; batch < static_cast<int>(dims[0]); ++batch){
     for (int channel = 0; channel < static_cast<int>(dims[1]); ++channel){
diff --git a/include/aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp b/include/aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp
index ab6790e2..0e72710c 100644
--- a/include/aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp
@@ -38,7 +38,7 @@ void SoftmaxImpl_cpu_forward_kernel(std::size_t axisIdx, const std::vector<DimSi
     }
 
 #ifdef _OPENMP
-    #pragma omp parallel for collapse(2) if (preAxisElems * postAxisElems > 32)
+    #pragma omp parallel for collapse(2) if (preAxisElems * postAxisElems >= 16)
 #endif
     for (int i = 0; i < static_cast<int>(preAxisElems); ++i) {
         for (int j = 0; j < static_cast<int>(postAxisElems); ++j) {
-- 
GitLab


From 914cdda1a1c66a79b2248bcc309a2ade58300dfd Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Wed, 9 Apr 2025 15:34:16 +0200
Subject: [PATCH 098/108] Fixed compilation of other modules
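
With PRIVATE linkage, OpenMP is no longer part of the target's public
usage requirements, but the exported target can still reference the
imported OpenMP::OpenMP_CXX target (for a static library the PRIVATE
dependency is kept as a link-only requirement). The generated package
config therefore records whether this build used OpenMP
(AIDGE_REQUIRES_OPENMP) and, if so, calls find_dependency(OpenMP) so
that downstream modules consuming aidge_backend_cpu via find_package()
compile and link again.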

---
 CMakeLists.txt                    | 3 ++-
 aidge_backend_cpu-config.cmake.in | 4 ++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d2c1d0a7..ce1b5062 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -89,7 +89,8 @@ target_link_libraries(${module_name}
 )
 
 if(OpenMP_CXX_FOUND)
-    target_link_libraries(${module_name} PUBLIC OpenMP::OpenMP_CXX)
+    target_link_libraries(${module_name} PRIVATE OpenMP::OpenMP_CXX)
+    set(AIDGE_REQUIRES_OPENMP TRUE)
 endif()
 
 # Add definition _USE_MATH_DEFINES to enable math constant definitions from math.h/cmath.
diff --git a/aidge_backend_cpu-config.cmake.in b/aidge_backend_cpu-config.cmake.in
index 7582102c..35865c71 100644
--- a/aidge_backend_cpu-config.cmake.in
+++ b/aidge_backend_cpu-config.cmake.in
@@ -2,6 +2,10 @@
 
 include(CMakeFindDependencyMacro)
 find_dependency(aidge_core)
+set(AIDGE_REQUIRES_OPENMP @AIDGE_REQUIRES_OPENMP@)
+if (AIDGE_REQUIRES_OPENMP)
+    find_dependency(OpenMP)
+endif()
 set(AIDGE_REQUIRES_OPENSSL @AIDGE_REQUIRES_OPENSSL@)
 if (AIDGE_REQUIRES_OPENSSL)
     find_dependency(OpenSSL)
-- 
GitLab


From 7d8a52b4e02173bb391f31067ffdf257ea75c5aa Mon Sep 17 00:00:00 2001
From: Antoni Olivier <olivier.antoni@cea.fr>
Date: Thu, 10 Apr 2025 14:02:59 +0200
Subject: [PATCH 099/108] Fix update of gradient tensor dimensions
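
The output gradient was set before forwardDims(), i.e. before the
output dimensions were computed, so the gradient tensor could be
created with stale or empty dimensions. The tests now set the gradient
only once the dimensions are known, and allocate it directly with the
expected shape instead of resizing an empty tensor.

Condensed sketch of the corrected ordering used throughout the updated
tests (T0, T1 and newGrad are the tensors each test defines):

    op->associateInput(0, T0);
    op->associateInput(1, T1);
    op->forwardDims();                  // output dims are known from here on
    op->getOutput(0)->setGrad(newGrad); // grad shape can match the output
    op->backward();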

---
 unit_tests/operator/Test_AddImpl.cpp |  14 +-
 unit_tests/operator/Test_DivImpl.cpp |  13 +-
 unit_tests/operator/Test_MulImpl.cpp |  13 +-
 unit_tests/operator/Test_PowImpl.cpp | 974 ++++++++++++++-------------
 unit_tests/operator/Test_SubImpl.cpp |  12 +-
 5 files changed, 512 insertions(+), 514 deletions(-)

diff --git a/unit_tests/operator/Test_AddImpl.cpp b/unit_tests/operator/Test_AddImpl.cpp
index 4538b322..d9adb484 100644
--- a/unit_tests/operator/Test_AddImpl.cpp
+++ b/unit_tests/operator/Test_AddImpl.cpp
@@ -159,10 +159,10 @@ TEST_CASE("[cpu/operator] Add(backward)", "[Add][CPU]") {
 
         op->associateInput(0, T0);
         op->associateInput(1, T1);
-        op->getOutput(0)->setGrad(std::make_shared<Tensor>(
-            Array2D<float, 2, 3>({{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}})));
         op->forwardDims();
 
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>(
+            Array2D<float, 2, 3>({{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}})));
         op->backward();
 
         const Tensor expectedGrad0 =
@@ -194,8 +194,9 @@ TEST_CASE("[cpu/operator] Add(backward)", "[Add][CPU]") {
 
         op->associateInput(0, T0);
         op->associateInput(1, T1);
-        op->getOutput(0)->setGrad(newGrad);
         op->forwardDims();
+
+        op->getOutput(0)->setGrad(newGrad);
         op->backward();
 
         REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
@@ -236,9 +237,9 @@ TEST_CASE("[cpu/operator] Add(backward)", "[Add][CPU]") {
 
         op->associateInput(0, T0);
         op->associateInput(1, T1);
-        op->getOutput(0)->setGrad(newGrad);
         op->forwardDims();
 
+        op->getOutput(0)->setGrad(newGrad);
         op->backward();
 
         REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
@@ -290,8 +291,8 @@ TEST_CASE("[cpu/operator] Add(backward)", "[Add][CPU]") {
 
         op->associateInput(0, T0);
         op->associateInput(1, T1);
-        op->getOutput(0)->setGrad(newGrad);
         op->forwardDims();
+        op->getOutput(0)->setGrad(newGrad);
 
         op->backward();
 
@@ -364,8 +365,7 @@ TEST_CASE("[cpu/operator] Add(backward)", "[Add][CPU]") {
             val = dist(gen);
         }
 
-        op->getOutput(0)->setGrad(std::make_shared<Tensor>());
-        op->getOutput(0)->grad()->resize(outputDims);
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>(outputDims));
         op->getOutput(0)->grad()->getImpl()->setRawPtr(gradOutputData.data(),
                                                        expectedOutput.size());
 
diff --git a/unit_tests/operator/Test_DivImpl.cpp b/unit_tests/operator/Test_DivImpl.cpp
index 4e7657ed..f7993753 100644
--- a/unit_tests/operator/Test_DivImpl.cpp
+++ b/unit_tests/operator/Test_DivImpl.cpp
@@ -339,10 +339,10 @@ TEST_CASE("[CPU/Operator] Div(Backward)", "[Div][CPU][Backward]") {
 
         op->associateInput(0, T0);
         op->associateInput(1, T1);
-        op->getOutput(0)->setGrad(std::make_shared<Tensor>(
-            Array2D<float, 2, 3>({{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}})));
         op->forwardDims();
 
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>(
+            Array2D<float, 2, 3>({{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}})));
         op->backward();
 
         const Tensor expectedGrad0 =
@@ -373,9 +373,9 @@ TEST_CASE("[CPU/Operator] Div(Backward)", "[Div][CPU][Backward]") {
 
         op->associateInput(0, T0);
         op->associateInput(1, T1);
-        op->getOutput(0)->setGrad(newGrad);
         op->forwardDims();
 
+        op->getOutput(0)->setGrad(newGrad);
         op->backward();
 
         REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
@@ -415,9 +415,9 @@ TEST_CASE("[CPU/Operator] Div(Backward)", "[Div][CPU][Backward]") {
 
         op->associateInput(0, T0);
         op->associateInput(1, T1);
-        op->getOutput(0)->setGrad(newGrad);
         op->forwardDims();
 
+        op->getOutput(0)->setGrad(newGrad);
         op->backward();
 
         REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
@@ -471,9 +471,9 @@ TEST_CASE("[CPU/Operator] Div(Backward)", "[Div][CPU][Backward]") {
 
         op->associateInput(0, T0);
         op->associateInput(1, T1);
-        op->getOutput(0)->setGrad(newGrad);
         op->forwardDims();
 
+        op->getOutput(0)->setGrad(newGrad);
         op->backward();
 
         REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
@@ -545,8 +545,7 @@ TEST_CASE("[CPU/Operator] Div(Backward)", "[Div][CPU][Backward]") {
             val = dist(gen);
         }
 
-        op->getOutput(0)->setGrad(std::make_shared<Tensor>());
-        op->getOutput(0)->grad()->resize(outputDims);
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>(outputDims));
         op->getOutput(0)->grad()->getImpl()->setRawPtr(gradOutputData.data(),
                                                        expectedOutput.size());
 
diff --git a/unit_tests/operator/Test_MulImpl.cpp b/unit_tests/operator/Test_MulImpl.cpp
index 2937e949..a8e0fbdd 100644
--- a/unit_tests/operator/Test_MulImpl.cpp
+++ b/unit_tests/operator/Test_MulImpl.cpp
@@ -46,10 +46,10 @@ TEST_CASE("[CPU/Operator] Mul(Backward)", "[Mul][CPU][Backward]") {
 
         op->associateInput(0, T0);
         op->associateInput(1, T1);
-        op->getOutput(0)->setGrad(std::make_shared<Tensor>(
-            Array2D<float, 2, 3>({{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}})));
         op->forwardDims();
 
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>(
+            Array2D<float, 2, 3>({{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}})));
         op->backward();
 
         const Tensor expectedGrad0 =
@@ -80,9 +80,9 @@ TEST_CASE("[CPU/Operator] Mul(Backward)", "[Mul][CPU][Backward]") {
 
         op->associateInput(0, T0);
         op->associateInput(1, T1);
-        op->getOutput(0)->setGrad(newGrad);
         op->forwardDims();
 
+        op->getOutput(0)->setGrad(newGrad);
         op->backward();
 
         REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
@@ -122,9 +122,9 @@ TEST_CASE("[CPU/Operator] Mul(Backward)", "[Mul][CPU][Backward]") {
 
         op->associateInput(0, T0);
         op->associateInput(1, T1);
-        op->getOutput(0)->setGrad(newGrad);
         op->forwardDims();
 
+        op->getOutput(0)->setGrad(newGrad);
         op->backward();
 
         REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
@@ -176,9 +176,9 @@ TEST_CASE("[CPU/Operator] Mul(Backward)", "[Mul][CPU][Backward]") {
 
         op->associateInput(0, T0);
         op->associateInput(1, T1);
-        op->getOutput(0)->setGrad(newGrad);
         op->forwardDims();
 
+        op->getOutput(0)->setGrad(newGrad);
         op->backward();
 
         REQUIRE(approxEq<cpptype_t<DataType::Float32>>(*(op->getInput(0)->grad()), expectedGrad0));
@@ -250,8 +250,7 @@ TEST_CASE("[CPU/Operator] Mul(Backward)", "[Mul][CPU][Backward]") {
             val = dist(gen);
         }
 
-        op->getOutput(0)->setGrad(std::make_shared<Tensor>());
-        op->getOutput(0)->grad()->resize(outputDims);
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>(outputDims));
         op->getOutput(0)->grad()->getImpl()->setRawPtr(gradOutputData.data(),
                                                        expectedOutput.size());
 
diff --git a/unit_tests/operator/Test_PowImpl.cpp b/unit_tests/operator/Test_PowImpl.cpp
index 55a416c3..8f3b2c35 100644
--- a/unit_tests/operator/Test_PowImpl.cpp
+++ b/unit_tests/operator/Test_PowImpl.cpp
@@ -1,486 +1,488 @@
-/********************************************************************************
- * Copyright (c) 2023 CEA-List
- *
- * This program and the accompanying materials are made available under the
- * terms of the Eclipse Public License 2.0 which is available at
- * http://www.eclipse.org/legal/epl-2.0.
- *
- * SPDX-License-Identifier: EPL-2.0
- *
- ********************************************************************************/
-
-#include <chrono>      // std::micro, std::chrono::time_point,
-                       // std::chrono::system_clock, std::chrono::duration
-#include <cstddef>     // std::size_t
-#include <cstdint>     // std::uint16_t
-#include <functional>  // std::multiplies
-#include <memory>
-#include <numeric>     // std::accumulate
-#include <random>      // std::random_device, std::mt19937
-                       // std::uniform_int_distribution, std::uniform_real_distribution
-#include <vector>
-
-#include <catch2/catch_test_macros.hpp>
-#include <fmt/core.h>
-
-#include "aidge/backend/cpu/data/TensorImpl.hpp"
-#include "aidge/backend/cpu/operator/PowImpl.hpp"
-#include "aidge/data/Data.hpp"
-#include "aidge/data/Tensor.hpp"
-#include "aidge/operator/Pow.hpp"
-#include "aidge/utils/ArrayHelpers.hpp"
-#include "aidge/utils/TensorUtils.hpp"
-
-namespace Aidge {
-
-TEST_CASE("[cpu/operator] Pow", "[Pow][CPU]") {
-    constexpr std::uint16_t NBTRIALS = 10;
-    // Create a random number generator
-    std::random_device rd;
-    std::mt19937 gen(rd());
-    std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0 and 1
-    std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
-    std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
-    std::uniform_int_distribution<int> boolDist(0,1);
-
-    // Create MatPow Operator
-    std::shared_ptr<Node> myPow = Pow();
-    auto op = std::static_pointer_cast<OperatorTensor>(myPow-> getOperator());
-    op->setDataType(DataType::Float32);
-    op->setBackend("cpu");
-
-    // Create 2 input Tensors
-    std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>();
-    op->associateInput(0,T0);
-    T0->setDataType(DataType::Float32);
-    T0->setBackend("cpu");
-    std::shared_ptr<Tensor> T1 = std::make_shared<Tensor>();
-    op -> associateInput(1,T1);
-    T1->setDataType(DataType::Float32);
-    T1->setBackend("cpu");
-
-    // Create results Tensor
-    std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>();
-    Tres->setDataType(DataType::Float32);
-    Tres->setBackend("cpu");
-
-    // To measure execution time of 'MatPow_Op::forward()' member function call
-    std::chrono::time_point<std::chrono::system_clock> start;
-    std::chrono::time_point<std::chrono::system_clock> end;
-    std::chrono::duration<double, std::micro> duration{};
-
-    SECTION("PowImpl_cpu::forward()") {
-        SECTION("Scalar / Scalar") {
-
-        }
-        SECTION("Scalar / +1-D Tensor") {
-
-        }
-        SECTION("+1-D Tensor / +1-D Tensor - same dimensions") {
-            std::size_t number_of_operation = 0;
-
-            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
-                // generate 2 random Tensors
-                const std::size_t nbDims = nbDimsDist(gen);
-                std::vector<std::size_t> dims;
-                for (std::size_t i = 0; i < nbDims; ++i) {
-                    dims.push_back(dimSizeDist(gen));
-                }
-                const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
-                number_of_operation += nb_elements;
-
-                // without broadcasting
-                float* array0 = new float[nb_elements];
-                float* array1 = new float[nb_elements];
-                float* result = new float[nb_elements];
-
-                for (std::size_t i = 0; i < nb_elements; ++i) {
-                    array0[i] = valueDist(gen);
-                    array1[i] = valueDist(gen);
-                    result[i] = std::pow(array0[i], array1[i]);
-                }
-
-                // input0
-                T0->resize(dims);
-                T0 -> getImpl() -> setRawPtr(array0, nb_elements);
-
-                // input1
-                T1->resize(dims);
-                T1 -> getImpl() -> setRawPtr(array1, nb_elements);
-
-                // results
-                Tres->resize(dims);
-                Tres -> getImpl() -> setRawPtr(result, nb_elements);
-
-                op->forwardDims();
-                start = std::chrono::system_clock::now();
-                myPow->forward();
-                end = std::chrono::system_clock::now();
-                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
-
-                REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
-
-                delete[] array0;
-                delete[] array1;
-                delete[] result;
-
-                // with broadcasting
-            }
-            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
-            Log::info("total time: {} μs\n", duration.count());
-        }
-
-        SECTION("+1-D Tensor / +1-D Tensor - broadcasting") {
-            std::size_t number_of_operation = 0;
-
-            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
-                // generate 2 random Tensors
-                // handle dimensions, replace some dimensions with '1' to get broadcasting
-                constexpr std::size_t nbDims = 4;
-                std::vector<std::size_t> dims;
-                for (std::size_t i = 0; i < nbDims; ++i) {
-                    dims.push_back(dimSizeDist(gen));
-                }
-                std::vector<std::size_t> dims0 = dims;
-                std::vector<std::size_t> dims1 = dims;
-                std::vector<std::size_t> dimsOut = dims;
-                for (std::size_t i = 0; i < nbDims; ++i) {
-                    if (boolDist(gen)) {
-                        dims0[i] = 1;
-                    }
-                    if (boolDist(gen)) {
-                        dims1[i] = 1;
-                    }
-                    dimsOut[i] = (dims0[i] == 1) ? dims1[i] : dims0[i];
-                }
-
-                // create arrays and fill them with random values
-                float* array0 = new float[dims0[0]*dims0[1]*dims0[2]*dims0[3]];
-                float* array1 = new float[dims1[0]*dims1[1]*dims1[2]*dims1[3]];
-                float* result = new float[dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]];
-
-                for (std::size_t i = 0; i < dims0[0]*dims0[1]*dims0[2]*dims0[3]; ++i) {
-                    array0[i] = valueDist(gen);
-                }
-                for (std::size_t i = 0; i < dims1[0]*dims1[1]*dims1[2]*dims1[3]; ++i) {
-                    array1[i] = valueDist(gen);
-                }
-
-                // compute true result
-                const std::size_t strides0[nbDims] = {dims0[1]*dims0[2]*dims0[3], dims0[2]*dims0[3], dims0[3], 1};
-                const std::size_t strides1[nbDims] = {dims1[1]*dims1[2]*dims1[3], dims1[2]*dims1[3], dims1[3], 1};
-                for (std::size_t a = 0; a < dimsOut[0]; ++a) {
-                    for (std::size_t b = 0; b < dimsOut[1]; ++b) {
-                        const std::size_t idx0_0 = strides0[0] * ((dims0[0] > 1) ? a : 0)
-                                                    + strides0[1] * ((dims0[1] > 1) ? b : 0);
-                        const std::size_t idx1_0 = strides1[0] * ((dims1[0] > 1) ? a : 0)
-                                                    + strides1[1] * ((dims1[1] > 1) ? b : 0);
-                        for (std::size_t c = 0; c < dimsOut[2]; ++c) {
-                            const std::size_t idx_out = dimsOut[3] * (c + dimsOut[2] * (b + dimsOut[1] * a));
-                            for (std::size_t d = 0; d < dimsOut[3]; ++d) {
-                                std::size_t idx0 = idx0_0
-                                                    + strides0[2] * ((dims0[2] > 1) ? c : 0)
-                                                    + ((dims0[3] > 1) ? d : 0);
-                                std::size_t idx1 = idx1_0
-                                                    + strides1[2] * ((dims1[2] > 1) ? c : 0)
-                                                    + ((dims1[3] > 1) ? d : 0);
-                                result[idx_out + d] = std::pow(array0[idx0], array1[idx1]);
-                                // std::cout << "(" << idx0 << ", " << idx1 << ") -> " << array0[idx0] << " ** " << array1[idx1] << " -> " << idx_out + d << std::endl;
-                            }
-                        }
-                    }
-                }
-
-                // conversion to Aidge::Tensors
-                // input0
-                T0->resize(dims0);
-                T0 -> getImpl() -> setRawPtr(array0, dims0[0]*dims0[1]*dims0[2]*dims0[3]);
-
-                // input1
-                T1->resize(dims1);
-                T1 -> getImpl() -> setRawPtr(array1, dims1[0]*dims1[1]*dims1[2]*dims1[3]);
-
-                // results
-                Tres->resize(dimsOut);
-                Tres -> getImpl() -> setRawPtr(result, dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]);
-
-                // compute result
-                op->forwardDims();
-                start = std::chrono::system_clock::now();
-                myPow->forward();
-                end = std::chrono::system_clock::now();
-                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
-
-                // comparison between truth and computed result
-                REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
-
-                delete[] array0;
-                delete[] array1;
-                delete[] result;
-
-                const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
-                number_of_operation += nb_elements;
-            }
-            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
-            Log::info("total time: {} μs\n", duration.count());
-        }
-        SECTION("+1-D Tensor / 1-D Tensor") {
-            std::size_t number_of_operation = 0;
-            std::uniform_int_distribution<std::size_t> nbRemovedDimsDist(std::size_t(1), std::size_t(3));
-
-            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
-                // generate 2 random Tensors
-                // handle dimensions
-                constexpr std::size_t nbDims = 4;
-                std::vector<std::size_t> dims0(4);
-                for (std::size_t i = 0; i < nbDims; ++i) {
-                    dims0[i] = dimSizeDist(gen);
-                }
-                std::vector<std::size_t> dimsOut = dims0;
-                std::vector<std::size_t> dims1 = dims0;
-                for (std::size_t i = 0; i < nbDims; ++i) {
-                    if (boolDist(gen)) {
-                        dims1[i] = 1;
-                    }
-                }
-                dims1.erase(dims1.cbegin(), dims1.cbegin() + nbRemovedDimsDist(gen));
-
-                // create arrays and fill them with random values
-                float* array0 = new float[dims0[0]*dims0[1]*dims0[2]*dims0[3]];
-                std::size_t array1_size = std::accumulate(dims1.cbegin(), dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
-                float* array1 = new float[array1_size];
-                float* result = new float[dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]];
-
-                for (std::size_t i = 0; i < (dims0[0]*dims0[1]*dims0[2]*dims0[3]); ++i) {
-                    array0[i] = valueDist(gen);
-                }
-                for (std::size_t i = 0; i < array1_size; ++i) {
-                    array1[i] = valueDist(gen);
-                }
-
-                // compute true result
-                auto dims1_tmp = dims1;
-                dims1_tmp.insert(dims1_tmp.cbegin(), 4 - dims1_tmp.size(), std::size_t(1));
-
-                const std::size_t strides0[nbDims] = {dims0[1]*dims0[2]*dims0[3], dims0[2]*dims0[3], dims0[3], 1};
-                const std::size_t strides1[nbDims] = {dims1_tmp[1]*dims1_tmp[2]*dims1_tmp[3], dims1_tmp[2]*dims1_tmp[3], dims1_tmp[3], 1};
-                for (std::size_t a = 0; a < dimsOut[0]; ++a) {
-                    for (std::size_t b = 0; b < dimsOut[1]; ++b) {
-                        const std::size_t idx0_0 = strides0[0] * ((dims0[0] > 1) ? a : 0)
-                                                    + strides0[1] * ((dims0[1] > 1) ? b : 0);
-                        const std::size_t idx1_0 = strides1[0] * ((dims1_tmp[0] > 1) ? a : 0)
-                                                    + strides1[1] * ((dims1_tmp[1] > 1) ? b : 0);
-                        for (std::size_t c = 0; c < dimsOut[2]; ++c) {
-                            const std::size_t idx_out = dimsOut[3] * (c + dimsOut[2] * (b + dimsOut[1] * a));
-                            for (std::size_t d = 0; d < dimsOut[3]; ++d) {
-                                std::size_t idx0 = idx0_0
-                                                    + strides0[2] * ((dims0[2] > 1) ? c : 0)
-                                                    + ((dims0[3] > 1) ? d : 0);
-                                std::size_t idx1 = idx1_0
-                                                    + strides1[2] * ((dims1_tmp[2] > 1) ? c : 0)
-                                                    + ((dims1_tmp[3] > 1) ? d : 0);
-                                result[idx_out + d] = std::pow(array0[idx0], array1[idx1]);
-                                // std::cout << "(" << idx0 << ", " << idx1 << ") -> " << array0[idx0] << " ** " << array1[idx1] << " -> " << idx_out + d << std::endl;
-                            }
-                        }
-                    }
-                }
-
-                // conversion to Aidge::Tensors
-                // input0
-                T0->resize(dims0);
-                T0 -> getImpl() -> setRawPtr(array0, dims0[0]*dims0[1]*dims0[2]*dims0[3]);
-
-                // input1
-                T1->resize(dims1);
-                T1 -> getImpl() -> setRawPtr(array1, array1_size);
-
-                // results
-                Tres->resize(dimsOut);
-                Tres -> getImpl() -> setRawPtr(result, dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]);
-
-                // compute result
-                op->forwardDims();
-                start = std::chrono::system_clock::now();
-                myPow->forward();
-                end = std::chrono::system_clock::now();
-                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
-
-                // comparison between truth and computed result
-                REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
-
-                delete[] array0;
-                delete[] array1;
-                delete[] result;
-
-                const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
-                number_of_operation += nb_elements;
-            }
-
-            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
-            Log::info("total time: {} μs\n", duration.count());
-        }
-    }
-
-
-    SECTION("PowImpl_cpu::backward()") {
-        SECTION("3D Tensors") {
-            const auto input0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>(
-                {
-                    {
-                        {
-                            {2.0, 3.0},
-                            {4.0, 5.0}
-                        },
-                        {
-                            {6.0, 7.0},
-                            {8.0, 9.0}
-                        }
-                    }
-                }
-            ));
-            const auto input1 = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>(
-                {
-                    {
-                        {
-                            {1.0, 2.0},
-                            {3.0, 2.0}
-                        },
-                        {
-                            {2.0, 3.0},
-                            {1.0, 0.5}
-                        }
-                    }
-                }
-            ));
-            const auto gradOut = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>(
-                {
-                    {
-                        {
-                            {0.5, 1.0},
-                            {1.5, 2.0}
-                        },
-                        {
-                            {2.5, 3.0},
-                            {3.5, 4.0}
-                        }
-                    }
-                }
-            ));
-            const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>(
-                {
-                    {
-                        {
-                            {0.50000000,   6.00000000},
-                            {72.00000000,  20.00000000}
-                        },
-                        {
-                            {30.00000000, 441.00000000},
-                            {3.50000000,   0.66666669}
-                        }
-                    }
-                }
-            ));
-            const auto expectedGrad1 = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>(
-                {
-                    {
-                        {
-                            {  0.693147182, 9.88751030},
-                            {1.33084259e+02, 8.04718933e+01}
-                        },
-                        {
-                            {1.61258362e+02, 2.00234143e+03},
-                            {5.82243652e+01, 2.63666954e+01}
-                        }
-                    }
-                }
-            ));
-            for(const auto T: {input0, input1, gradOut, expectedGrad0, expectedGrad1})
-            {
-                    T->setBackend("cpu") ;
-                    T->setDataType(DataType::Float32);
-            }
-            std::shared_ptr<Node> powOp = Pow();
-            auto opr = std::static_pointer_cast<OperatorTensor>(powOp-> getOperator());
-            opr->setDataType(DataType::Float32);
-            opr->setBackend("cpu");
-            opr->associateInput(0, input0);
-            opr->associateInput(1, input1);
-            opr->getOutput(0)->setGrad(gradOut);
-            opr->forward();
-
-            powOp->backward();
-            REQUIRE(approxEq<float>(*(opr->getInput(0)->grad()), *expectedGrad0));
-            REQUIRE(approxEq<float>(*(opr->getInput(1)->grad()), *expectedGrad1));
-        }
-        SECTION("Broadcasting") {
-            const auto input0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
-                {
-                    {
-                        {
-                            {1.0, 2.0, 3.0},
-                            {4.0, 5.0, 6.0}
-                        },
-                        {
-                            {1.5, 2.5, 3.5},
-                            {4.5, 5.5, 6.5}
-                        }
-                    }
-                }
-            ));
-            const auto input1 = std::make_shared<Tensor>(Array1D<float, 3>(
-                {
-                    {0.1, 0.2, 0.3}
-                }
-            ));
-
-            const auto gradOut = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
-                {
-                    {
-                        {
-                            {1.0, 2.0, 3.0},
-                            {4.0, 5.0, 6.0}
-                        },
-                        {
-                            {6.0, 5.0, 4.0},
-                            {3.0, 2.0, 1.0}
-                        }
-                    }
-                }
-            ));
-            const Tensor expectedGrad0 = Array3D<float, 2, 2, 3>(
-                {
-                    {
-                        {
-                            {0.10000000, 0.22973967, 0.41711676},
-                            {0.11486985, 0.27594593, 0.51353097}
-                        },
-                        {
-                            {0.41655189, 0.48044977, 0.49926791},
-                            {0.07748720, 0.10227509, 0.08092485}
-                        }
-                    }
-                }
-            );
-            const Tensor expectedGrad1 = Array1D<float, 3>(
-                {
-                    {14.14779854, 22.99299049, 33.56402588}
-                }
-            );
-
-            std::shared_ptr<Node> powOp = Pow();
-            auto opr = std::static_pointer_cast<OperatorTensor>(powOp-> getOperator());
-            opr->setDataType(DataType::Float32);
-            opr->setBackend("cpu");
-            opr->associateInput(0, input0);
-            opr->associateInput(1, input1);
-            opr->getOutput(0)->setGrad(gradOut);
-            powOp->forward();
-
-            powOp->backward();
-            REQUIRE(approxEq<float>(*(opr->getInput(0)->grad()), expectedGrad0));
-            REQUIRE(approxEq<float>(*(opr->getInput(1)->grad()), expectedGrad1));
-        }
-    }
-}
-} // namespace Aidge
+/********************************************************************************
+ * Copyright (c) 2023 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <chrono>      // std::micro, std::chrono::time_point,
+                       // std::chrono::system_clock, std::chrono::duration
+#include <cstddef>     // std::size_t
+#include <cstdint>     // std::uint16_t
+#include <functional>  // std::multiplies
+#include <memory>
+#include <numeric>     // std::accumulate
+#include <random>      // std::random_device, std::mt19937
+                       // std::uniform_int_distribution, std::uniform_real_distribution
+#include <vector>
+
+#include <catch2/catch_test_macros.hpp>
+#include <fmt/core.h>
+
+#include "aidge/backend/cpu/data/TensorImpl.hpp"
+#include "aidge/backend/cpu/operator/PowImpl.hpp"
+#include "aidge/data/Data.hpp"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/operator/Pow.hpp"
+#include "aidge/utils/ArrayHelpers.hpp"
+#include "aidge/utils/TensorUtils.hpp"
+
+namespace Aidge {
+
+TEST_CASE("[cpu/operator] Pow", "[Pow][CPU]") {
+    constexpr std::uint16_t NBTRIALS = 10;
+    // Create a random number generator
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_real_distribution<float> valueDist(0.1f, 1.1f); // Random float distribution between 0.1 and 1.1
+    std::uniform_int_distribution<std::size_t> dimSizeDist(std::size_t(2), std::size_t(10));
+    std::uniform_int_distribution<std::size_t> nbDimsDist(std::size_t(1), std::size_t(5));
+    std::uniform_int_distribution<int> boolDist(0,1);
+
+    // Create Pow Operator
+    std::shared_ptr<Node> myPow = Pow();
+    auto op = std::static_pointer_cast<OperatorTensor>(myPow-> getOperator());
+    op->setDataType(DataType::Float32);
+    op->setBackend("cpu");
+
+    // Create 2 input Tensors
+    std::shared_ptr<Tensor> T0 = std::make_shared<Tensor>();
+    op->associateInput(0,T0);
+    T0->setDataType(DataType::Float32);
+    T0->setBackend("cpu");
+    std::shared_ptr<Tensor> T1 = std::make_shared<Tensor>();
+    op -> associateInput(1,T1);
+    T1->setDataType(DataType::Float32);
+    T1->setBackend("cpu");
+
+    // Create results Tensor
+    std::shared_ptr<Tensor> Tres = std::make_shared<Tensor>();
+    Tres->setDataType(DataType::Float32);
+    Tres->setBackend("cpu");
+
+    // To measure execution time of 'Pow_Op::forward()' member function call
+    std::chrono::time_point<std::chrono::system_clock> start;
+    std::chrono::time_point<std::chrono::system_clock> end;
+    std::chrono::duration<double, std::micro> duration{};
+
+    SECTION("PowImpl_cpu::forward()") {
+        SECTION("Scalar / Scalar") {
+
+        }
+        SECTION("Scalar / +1-D Tensor") {
+
+        }
+        SECTION("+1-D Tensor / +1-D Tensor - same dimensions") {
+            std::size_t number_of_operation = 0;
+
+            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+                // generate 2 random Tensors
+                const std::size_t nbDims = nbDimsDist(gen);
+                std::vector<std::size_t> dims;
+                for (std::size_t i = 0; i < nbDims; ++i) {
+                    dims.push_back(dimSizeDist(gen));
+                }
+                const std::size_t nb_elements = std::accumulate(dims.cbegin(), dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                number_of_operation += nb_elements;
+
+                // without broadcasting
+                float* array0 = new float[nb_elements];
+                float* array1 = new float[nb_elements];
+                float* result = new float[nb_elements];
+
+                for (std::size_t i = 0; i < nb_elements; ++i) {
+                    array0[i] = valueDist(gen);
+                    array1[i] = valueDist(gen);
+                    result[i] = std::pow(array0[i], array1[i]);
+                }
+
+                // input0
+                T0->resize(dims);
+                T0 -> getImpl() -> setRawPtr(array0, nb_elements);
+
+                // input1
+                T1->resize(dims);
+                T1 -> getImpl() -> setRawPtr(array1, nb_elements);
+
+                // results
+                Tres->resize(dims);
+                Tres -> getImpl() -> setRawPtr(result, nb_elements);
+
+                op->forwardDims();
+                start = std::chrono::system_clock::now();
+                myPow->forward();
+                end = std::chrono::system_clock::now();
+                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+                REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
+
+                delete[] array0;
+                delete[] array1;
+                delete[] result;
+
+                // with broadcasting
+            }
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {} μs\n", duration.count());
+        }
+
+        SECTION("+1-D Tensor / +1-D Tensor - broadcasting") {
+            std::size_t number_of_operation = 0;
+
+            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+                // generate 2 random Tensors
+                // handle dimensions, replace some dimensions with '1' to get broadcasting
+                constexpr std::size_t nbDims = 4;
+                std::vector<std::size_t> dims;
+                for (std::size_t i = 0; i < nbDims; ++i) {
+                    dims.push_back(dimSizeDist(gen));
+                }
+                std::vector<std::size_t> dims0 = dims;
+                std::vector<std::size_t> dims1 = dims;
+                std::vector<std::size_t> dimsOut = dims;
+                for (std::size_t i = 0; i < nbDims; ++i) {
+                    if (boolDist(gen)) {
+                        dims0[i] = 1;
+                    }
+                    if (boolDist(gen)) {
+                        dims1[i] = 1;
+                    }
+                    dimsOut[i] = (dims0[i] == 1) ? dims1[i] : dims0[i];
+                }
+
+                // create arrays and fill them with random values
+                float* array0 = new float[dims0[0]*dims0[1]*dims0[2]*dims0[3]];
+                float* array1 = new float[dims1[0]*dims1[1]*dims1[2]*dims1[3]];
+                float* result = new float[dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]];
+
+                for (std::size_t i = 0; i < dims0[0]*dims0[1]*dims0[2]*dims0[3]; ++i) {
+                    array0[i] = valueDist(gen);
+                }
+                for (std::size_t i = 0; i < dims1[0]*dims1[1]*dims1[2]*dims1[3]; ++i) {
+                    array1[i] = valueDist(gen);
+                }
+
+                // compute true result
+                const std::size_t strides0[nbDims] = {dims0[1]*dims0[2]*dims0[3], dims0[2]*dims0[3], dims0[3], 1};
+                const std::size_t strides1[nbDims] = {dims1[1]*dims1[2]*dims1[3], dims1[2]*dims1[3], dims1[3], 1};
+                for (std::size_t a = 0; a < dimsOut[0]; ++a) {
+                    for (std::size_t b = 0; b < dimsOut[1]; ++b) {
+                        const std::size_t idx0_0 = strides0[0] * ((dims0[0] > 1) ? a : 0)
+                                                    + strides0[1] * ((dims0[1] > 1) ? b : 0);
+                        const std::size_t idx1_0 = strides1[0] * ((dims1[0] > 1) ? a : 0)
+                                                    + strides1[1] * ((dims1[1] > 1) ? b : 0);
+                        for (std::size_t c = 0; c < dimsOut[2]; ++c) {
+                            const std::size_t idx_out = dimsOut[3] * (c + dimsOut[2] * (b + dimsOut[1] * a));
+                            for (std::size_t d = 0; d < dimsOut[3]; ++d) {
+                                std::size_t idx0 = idx0_0
+                                                    + strides0[2] * ((dims0[2] > 1) ? c : 0)
+                                                    + ((dims0[3] > 1) ? d : 0);
+                                std::size_t idx1 = idx1_0
+                                                    + strides1[2] * ((dims1[2] > 1) ? c : 0)
+                                                    + ((dims1[3] > 1) ? d : 0);
+                                result[idx_out + d] = std::pow(array0[idx0], array1[idx1]);
+                                // std::cout << "(" << idx0 << ", " << idx1 << ") -> " << array0[idx0] << " ** " << array1[idx1] << " -> " << idx_out + d << std::endl;
+                            }
+                        }
+                    }
+                }
+
+                // conversion to Aidge::Tensors
+                // input0
+                T0->resize(dims0);
+                T0 -> getImpl() -> setRawPtr(array0, dims0[0]*dims0[1]*dims0[2]*dims0[3]);
+
+                // input1
+                T1->resize(dims1);
+                T1 -> getImpl() -> setRawPtr(array1, dims1[0]*dims1[1]*dims1[2]*dims1[3]);
+
+                // results
+                Tres->resize(dimsOut);
+                Tres -> getImpl() -> setRawPtr(result, dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]);
+
+                // compute result
+                op->forwardDims();
+                start = std::chrono::system_clock::now();
+                myPow->forward();
+                end = std::chrono::system_clock::now();
+                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+                // comparison between truth and computed result
+                REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
+
+                delete[] array0;
+                delete[] array1;
+                delete[] result;
+
+                const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                number_of_operation += nb_elements;
+            }
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {} μs\n", duration.count());
+        }
+        SECTION("+1-D Tensor / 1-D Tensor") {
+            std::size_t number_of_operation = 0;
+            std::uniform_int_distribution<std::size_t> nbRemovedDimsDist(std::size_t(1), std::size_t(3));
+
+            for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
+                // generate 2 random Tensors
+                // handle dimensions
+                constexpr std::size_t nbDims = 4;
+                std::vector<std::size_t> dims0(4);
+                for (std::size_t i = 0; i < nbDims; ++i) {
+                    dims0[i] = dimSizeDist(gen);
+                }
+                std::vector<std::size_t> dimsOut = dims0;
+                std::vector<std::size_t> dims1 = dims0;
+                for (std::size_t i = 0; i < nbDims; ++i) {
+                    if (boolDist(gen)) {
+                        dims1[i] = 1;
+                    }
+                }
+                dims1.erase(dims1.cbegin(), dims1.cbegin() + nbRemovedDimsDist(gen));
+
+                // create arrays and fill them with random values
+                float* array0 = new float[dims0[0]*dims0[1]*dims0[2]*dims0[3]];
+                std::size_t array1_size = std::accumulate(dims1.cbegin(), dims1.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                float* array1 = new float[array1_size];
+                float* result = new float[dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]];
+
+                for (std::size_t i = 0; i < (dims0[0]*dims0[1]*dims0[2]*dims0[3]); ++i) {
+                    array0[i] = valueDist(gen);
+                }
+                for (std::size_t i = 0; i < array1_size; ++i) {
+                    array1[i] = valueDist(gen);
+                }
+
+                // compute true result
+                auto dims1_tmp = dims1;
+                dims1_tmp.insert(dims1_tmp.cbegin(), 4 - dims1_tmp.size(), std::size_t(1));
+
+                const std::size_t strides0[nbDims] = {dims0[1]*dims0[2]*dims0[3], dims0[2]*dims0[3], dims0[3], 1};
+                const std::size_t strides1[nbDims] = {dims1_tmp[1]*dims1_tmp[2]*dims1_tmp[3], dims1_tmp[2]*dims1_tmp[3], dims1_tmp[3], 1};
+                for (std::size_t a = 0; a < dimsOut[0]; ++a) {
+                    for (std::size_t b = 0; b < dimsOut[1]; ++b) {
+                        const std::size_t idx0_0 = strides0[0] * ((dims0[0] > 1) ? a : 0)
+                                                    + strides0[1] * ((dims0[1] > 1) ? b : 0);
+                        const std::size_t idx1_0 = strides1[0] * ((dims1_tmp[0] > 1) ? a : 0)
+                                                    + strides1[1] * ((dims1_tmp[1] > 1) ? b : 0);
+                        for (std::size_t c = 0; c < dimsOut[2]; ++c) {
+                            const std::size_t idx_out = dimsOut[3] * (c + dimsOut[2] * (b + dimsOut[1] * a));
+                            for (std::size_t d = 0; d < dimsOut[3]; ++d) {
+                                std::size_t idx0 = idx0_0
+                                                    + strides0[2] * ((dims0[2] > 1) ? c : 0)
+                                                    + ((dims0[3] > 1) ? d : 0);
+                                std::size_t idx1 = idx1_0
+                                                    + strides1[2] * ((dims1_tmp[2] > 1) ? c : 0)
+                                                    + ((dims1_tmp[3] > 1) ? d : 0);
+                                result[idx_out + d] = std::pow(array0[idx0], array1[idx1]);
+                                // std::cout << "(" << idx0 << ", " << idx1 << ") -> " << array0[idx0] << " ** " << array1[idx1] << " -> " << idx_out + d << std::endl;
+                            }
+                        }
+                    }
+                }
+
+                // conversion to Aidge::Tensors
+                // input0
+                T0->resize(dims0);
+                T0 -> getImpl() -> setRawPtr(array0, dims0[0]*dims0[1]*dims0[2]*dims0[3]);
+
+                // input1
+                T1->resize(dims1);
+                T1 -> getImpl() -> setRawPtr(array1, array1_size);
+
+                // results
+                Tres->resize(dimsOut);
+                Tres -> getImpl() -> setRawPtr(result, dimsOut[0]*dimsOut[1]*dimsOut[2]*dimsOut[3]);
+
+                // compute result
+                op->forwardDims();
+                start = std::chrono::system_clock::now();
+                myPow->forward();
+                end = std::chrono::system_clock::now();
+                duration += std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+
+                // comparison between truth and computed result
+                REQUIRE(approxEq<float>(*(op->getOutput(0)), *Tres));
+
+                delete[] array0;
+                delete[] array1;
+                delete[] result;
+
+                const std::size_t nb_elements = std::accumulate(dimsOut.cbegin(), dimsOut.cend(), std::size_t(1), std::multiplies<std::size_t>());
+                number_of_operation += nb_elements;
+            }
+
+            Log::info("number of elements over time spent: {}\n", (number_of_operation / duration.count()));
+            Log::info("total time: {} μs\n", duration.count());
+        }
+    }
+
+
+    SECTION("PowImpl_cpu::backward()") {
+        SECTION("3D Tensors") {
+            const auto input0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>(
+                {
+                    {
+                        {
+                            {2.0, 3.0},
+                            {4.0, 5.0}
+                        },
+                        {
+                            {6.0, 7.0},
+                            {8.0, 9.0}
+                        }
+                    }
+                }
+            ));
+            const auto input1 = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>(
+                {
+                    {
+                        {
+                            {1.0, 2.0},
+                            {3.0, 2.0}
+                        },
+                        {
+                            {2.0, 3.0},
+                            {1.0, 0.5}
+                        }
+                    }
+                }
+            ));
+            const auto gradOut = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>(
+                {
+                    {
+                        {
+                            {0.5, 1.0},
+                            {1.5, 2.0}
+                        },
+                        {
+                            {2.5, 3.0},
+                            {3.5, 4.0}
+                        }
+                    }
+                }
+            ));
+            const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>(
+                {
+                    {
+                        {
+                            {0.50000000,   6.00000000},
+                            {72.00000000,  20.00000000}
+                        },
+                        {
+                            {30.00000000, 441.00000000},
+                            {3.50000000,   0.66666669}
+                        }
+                    }
+                }
+            ));
+            const auto expectedGrad1 = std::make_shared<Tensor>(Array3D<float, 2, 2, 2>(
+                {
+                    {
+                        {
+                            {  0.693147182, 9.88751030},
+                            {1.33084259e+02, 8.04718933e+01}
+                        },
+                        {
+                            {1.61258362e+02, 2.00234143e+03},
+                            {5.82243652e+01, 2.63666954e+01}
+                        }
+                    }
+                }
+            ));
+            for(const auto T: {input0, input1, gradOut, expectedGrad0, expectedGrad1})
+            {
+                    T->setBackend("cpu") ;
+                    T->setDataType(DataType::Float32);
+            }
+            std::shared_ptr<Node> powOp = Pow();
+            auto opr = std::static_pointer_cast<OperatorTensor>(powOp-> getOperator());
+            opr->setDataType(DataType::Float32);
+            opr->setBackend("cpu");
+            opr->associateInput(0, input0);
+            opr->associateInput(1, input1);
+            opr->forward();
+
+            opr->getOutput(0)->setGrad(gradOut);
+            powOp->backward();
+
+            REQUIRE(approxEq<float>(*(opr->getInput(0)->grad()), *expectedGrad0));
+            REQUIRE(approxEq<float>(*(opr->getInput(1)->grad()), *expectedGrad1));
+        }
+        SECTION("Broadcasting") {
+            const auto input0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+                {
+                    {
+                        {
+                            {1.0, 2.0, 3.0},
+                            {4.0, 5.0, 6.0}
+                        },
+                        {
+                            {1.5, 2.5, 3.5},
+                            {4.5, 5.5, 6.5}
+                        }
+                    }
+                }
+            ));
+            const auto input1 = std::make_shared<Tensor>(Array1D<float, 3>(
+                {
+                    {0.1, 0.2, 0.3}
+                }
+            ));
+
+            const auto gradOut = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+                {
+                    {
+                        {
+                            {1.0, 2.0, 3.0},
+                            {4.0, 5.0, 6.0}
+                        },
+                        {
+                            {6.0, 5.0, 4.0},
+                            {3.0, 2.0, 1.0}
+                        }
+                    }
+                }
+            ));
+            const Tensor expectedGrad0 = Array3D<float, 2, 2, 3>(
+                {
+                    {
+                        {
+                            {0.10000000, 0.22973967, 0.41711676},
+                            {0.11486985, 0.27594593, 0.51353097}
+                        },
+                        {
+                            {0.41655189, 0.48044977, 0.49926791},
+                            {0.07748720, 0.10227509, 0.08092485}
+                        }
+                    }
+                }
+            );
+            const Tensor expectedGrad1 = Array1D<float, 3>(
+                {
+                    {14.14779854, 22.99299049, 33.56402588}
+                }
+            );
+
+            std::shared_ptr<Node> powOp = Pow();
+            auto opr = std::static_pointer_cast<OperatorTensor>(powOp->getOperator());
+            opr->setDataType(DataType::Float32);
+            opr->setBackend("cpu");
+            opr->associateInput(0, input0);
+            opr->associateInput(1, input1);
+            powOp->forward();
+
+            opr->getOutput(0)->setGrad(gradOut);
+            powOp->backward();
+
+            REQUIRE(approxEq<float>(*(opr->getInput(0)->grad()), expectedGrad0));
+            REQUIRE(approxEq<float>(*(opr->getInput(1)->grad()), expectedGrad1));
+        }
+    }
+}
+} // namespace Aidge
diff --git a/unit_tests/operator/Test_SubImpl.cpp b/unit_tests/operator/Test_SubImpl.cpp
index d9b6207b..f87f34d5 100644
--- a/unit_tests/operator/Test_SubImpl.cpp
+++ b/unit_tests/operator/Test_SubImpl.cpp
@@ -344,10 +344,10 @@ TEST_CASE("[CPU/Operator] Sub(Backward)", "[Sub][CPU][Backward]") {
 
         op->associateInput(0, T0);
         op->associateInput(1, T1);
-        op->getOutput(0)->setGrad(std::make_shared<Tensor>(
-            Array2D<float, 2, 3>({{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}})));
         op->forwardDims();
 
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>(
+            Array2D<float, 2, 3>({{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}})));
         mySub->backward();
 
         // For subtraction: grad_input0 = grad_output
@@ -387,9 +387,9 @@ TEST_CASE("[CPU/Operator] Sub(Backward)", "[Sub][CPU][Backward]") {
 
         op->associateInput(0, T0);
         op->associateInput(1, T1);
-        op->getOutput(0)->setGrad(newGrad);
         op->forwardDims();
 
+        op->getOutput(0)->setGrad(newGrad);
         mySub->backward();
 
         REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
@@ -434,14 +434,12 @@ TEST_CASE("[CPU/Operator] Sub(Backward)", "[Sub][CPU][Backward]") {
 
         op->associateInput(0, T0);
         op->associateInput(1, T1);
+        op->forwardDims();
 
         // Set gradient of output
-        op->getOutput(0)->setGrad(std::make_shared<Tensor>());
-        op->getOutput(0)->grad()->resize(outputDims);
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>(outputDims));
         op->getOutput(0)->grad()->getImpl()->setRawPtr(gradOutputData.data(), outputSize);
 
-        op->forwardDims();
-
         // Compute reference gradients
         std::vector<float> expectedGrad0(input0Size, 0.0f);
         std::vector<float> expectedGrad1(input1Size, 0.0f);
-- 
GitLab
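
Note: the Pow backward expectations above follow the analytic derivatives of
z = a^b, namely dz/da = b * a^(b-1) and dz/db = a^b * ln(a), each multiplied
by the incoming output gradient; in the broadcasting case the gradient of the
rank-1 exponent is additionally summed over the broadcast axes. The Sub test
reordering in the same patch moves setGrad() after forwardDims(), presumably
because forwardDims() (re)allocates the output tensor and would otherwise
discard a gradient set beforehand. A minimal standalone check of the first
expected Pow values (plain C++, independent of Aidge):

    // z = pow(a, b)  =>  dz/da = b * pow(a, b - 1),  dz/db = pow(a, b) * log(a)
    #include <cmath>
    #include <cstdio>

    int main() {
        const float a = 2.0f, b = 1.0f, gradOut = 0.5f;
        const float grad0 = gradOut * b * std::pow(a, b - 1.0f);    // 0.5
        const float grad1 = gradOut * std::pow(a, b) * std::log(a); // ~0.693147
        std::printf("grad0 = %f, grad1 = %f\n", grad0, grad1);
        return 0;
    }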


From a6373db1af2f60d4b3036cbb596cb04971b8817f Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Tue, 15 Apr 2025 15:19:24 +0200
Subject: [PATCH 100/108] Added OpenMP for Resize and TopK operators

---
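Note: the pragmas below use OpenMP's if() clause so that small tensors stay on
a single thread, where spawning a thread team would cost more than the work
saves; collapse(2) in the TopK kernel fuses the two outer loops into a single
parallel iteration space, and locals such as coordIn are moved inside the loop
body so each iteration stays thread-private. A minimal sketch of the pattern
(assumes compilation with OpenMP enabled; the function and names are
illustrative only, not part of Aidge):

    #include <cstddef>

    void scaleInPlace(float* data, std::size_t len, float factor) {
    #ifdef _OPENMP
        // Only parallelize when the workload amortizes thread start-up.
        #pragma omp parallel for if (len >= 16)
    #endif
        for (long long i = 0; i < static_cast<long long>(len); ++i) {
            data[i] *= factor;
        }
    }
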
 .../backend/cpu/operator/ResizeImpl_kernels.hpp  | 12 +++++++-----
 .../backend/cpu/operator/TopKImpl_kernels.hpp    |  3 +++
 src/data/Interpolation.cpp                       | 16 ++++++++--------
 3 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/ResizeImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ResizeImpl_kernels.hpp
index 6449417b..fe82f194 100644
--- a/include/aidge/backend/cpu/operator/ResizeImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ResizeImpl_kernels.hpp
@@ -50,12 +50,13 @@ void ResizeImpl_cpu_forward_kernel(
                                           outputDims.cend(),
                                           1,
                                           std::multiplies<DimSize_t>());
-    std::vector<float> coordInApprox(inputDims.size());
-    std::vector<std::size_t> coordIn(inputDims.size());
-    std::vector<DimSize_t> coordOut;
+
+#ifdef _OPENMP
+    #pragma omp parallel for if (outputLen >= 16)
+#endif
     for (DimSize_t idxFlatOut = 0; idxFlatOut < outputLen; ++idxFlatOut) {
-        coordOut = Tensor::toCoord(outputDims, idxFlatOut);
-        coordInApprox =
+        const auto coordOut = Tensor::toCoord(outputDims, idxFlatOut);
+        auto coordInApprox =
             Interpolation::untransformCoordinates(coordOut,
                                                   inputDims,
                                                   outputDims,
@@ -72,6 +73,7 @@ void ResizeImpl_cpu_forward_kernel(
                     coordInApprox[i] = std::ceil(coordInApprox[i] - 0.5f);
                 }
             }
+            std::vector<std::size_t> coordIn(inputDims.size());
             if (Tensor::isInBounds<float>(inputDims, coordInApprox)) {
                 for (std::size_t i = 0; i < coordInApprox.size(); ++i) {
                     coordIn[i] = static_cast<std::size_t>(coordInApprox[i]);
diff --git a/include/aidge/backend/cpu/operator/TopKImpl_kernels.hpp b/include/aidge/backend/cpu/operator/TopKImpl_kernels.hpp
index 3d6bc3c8..51568188 100644
--- a/include/aidge/backend/cpu/operator/TopKImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/TopKImpl_kernels.hpp
@@ -47,6 +47,9 @@ void TopKImpl_cpu_forward_kernel(int64_t axis,
     const std::size_t dim_i = inputDims[axis];
     std::vector<std::pair<I, int64_t>> buffer(dim_i);
 
+#ifdef _OPENMP
+    #pragma omp parallel for collapse(2) if (stride_pre * stride_post >= 16)
+#endif
     for (std::size_t pre = 0; pre < stride_pre; ++pre) {
         for (std::size_t post = 0; post < stride_post; ++post) {
             const std::size_t idx_i = pre * dim_i * stride_post + post;
diff --git a/src/data/Interpolation.cpp b/src/data/Interpolation.cpp
index fbf224d8..a5ab8f95 100644
--- a/src/data/Interpolation.cpp
+++ b/src/data/Interpolation.cpp
@@ -79,10 +79,10 @@ InterpolationCPU::linearRecurse(const std::vector<float> &coordToInterpolate,
             pointsCoords,
             alongDim);
     }
-    Log::debug("\nEntering linear recurse with {} points.", points.size());
-    Log::debug("Points : {}", extractPtCoords(points));
-    Log::debug("coordsToInterpolate : {}", coordToInterpolate);
-    Log::debug("alongDim : {}", alongDim);
+    //Log::debug("\nEntering linear recurse with {} points.", points.size());
+    //Log::debug("Points : {}", extractPtCoords(points));
+    //Log::debug("coordsToInterpolate : {}", coordToInterpolate);
+    //Log::debug("alongDim : {}", alongDim);
 
     ///////////////////
     // COMPUTATION
@@ -98,9 +98,9 @@ InterpolationCPU::linearRecurse(const std::vector<float> &coordToInterpolate,
             upperPoints.insert(point);
         }
     }
-    Log::debug("alongDim : {}", alongDim);
-    Log::debug("lowerPoints : {}", extractPtCoords(lowerPoints));
-    Log::debug("upperPoints : {}", extractPtCoords(upperPoints));
+    //Log::debug("alongDim : {}", alongDim);
+    //Log::debug("lowerPoints : {}", extractPtCoords(lowerPoints));
+    //Log::debug("upperPoints : {}", extractPtCoords(upperPoints));
 
     // Here are 3 cases
     // 1. upper/lowerPoints.size() == 0
@@ -174,7 +174,7 @@ InterpolationCPU::linearRecurse(const std::vector<float> &coordToInterpolate,
     // 0 is just a sanity check to ensure later that all dims have been
     // interpolate
     interpolatedPoint.first[alongDim] = 0;
-    Log::debug("successfully returned from alongDim : {}", alongDim);
+    //Log::debug("successfully returned from alongDim : {}", alongDim);
     return std::set<Point<T>>({interpolatedPoint});
 }
 
-- 
GitLab


From 2ca700ec2ef124e1b4c1aedcc63072965fb638e8 Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Tue, 15 Apr 2025 15:26:56 +0200
Subject: [PATCH 101/108] Fixed Windows build

---
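Note: MSVC only implements OpenMP 2.0, whose canonical loop form requires a
signed integral loop index, so the std::size_t/DimSize_t counters introduced in
the previous commit fail to compile under /openmp. Casting the bounds and
iterating over int, as in the hunks below, is the usual portable workaround.
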
 include/aidge/backend/cpu/operator/ResizeImpl_kernels.hpp | 2 +-
 include/aidge/backend/cpu/operator/TopKImpl_kernels.hpp   | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/ResizeImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ResizeImpl_kernels.hpp
index fe82f194..477f18cd 100644
--- a/include/aidge/backend/cpu/operator/ResizeImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ResizeImpl_kernels.hpp
@@ -54,7 +54,7 @@ void ResizeImpl_cpu_forward_kernel(
 #ifdef _OPENMP
     #pragma omp parallel for if (outputLen >= 16)
 #endif
-    for (DimSize_t idxFlatOut = 0; idxFlatOut < outputLen; ++idxFlatOut) {
+    for (int idxFlatOut = 0; idxFlatOut < static_cast<int>(outputLen); ++idxFlatOut) {
         const auto coordOut = Tensor::toCoord(outputDims, idxFlatOut);
         auto coordInApprox =
             Interpolation::untransformCoordinates(coordOut,
diff --git a/include/aidge/backend/cpu/operator/TopKImpl_kernels.hpp b/include/aidge/backend/cpu/operator/TopKImpl_kernels.hpp
index 51568188..9b219deb 100644
--- a/include/aidge/backend/cpu/operator/TopKImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/TopKImpl_kernels.hpp
@@ -50,8 +50,8 @@ void TopKImpl_cpu_forward_kernel(int64_t axis,
 #ifdef _OPENMP
     #pragma omp parallel for collapse(2) if (stride_pre * stride_post >= 16)
 #endif
-    for (std::size_t pre = 0; pre < stride_pre; ++pre) {
-        for (std::size_t post = 0; post < stride_post; ++post) {
+    for (int pre = 0; pre < static_cast<int>(stride_pre); ++pre) {
+        for (int post = 0; post < static_cast<int>(stride_post); ++post) {
             const std::size_t idx_i = pre * dim_i * stride_post + post;
             const std::size_t idx_o = pre * k * stride_post + post;
 
-- 
GitLab


From b854bc94fe2134997bfb2e25ff2fbc0d645bdb45 Mon Sep 17 00:00:00 2001
From: Antoni Olivier <olivier.antoni@cea.fr>
Date: Wed, 16 Apr 2025 14:10:45 +0200
Subject: [PATCH 102/108] Fix input gradient tensor in Backward method

---
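Note: these backward kernels previously overwrote the input gradient (and some
zero-filled it first). When a tensor fans out to several consumers, each
consumer's backward pass must add its contribution into the same gradient
buffer, with zeroing done once before backpropagation; hence the switch from
= to += and the removal of the per-kernel std::fill calls. A minimal sketch of
why last-writer-wins is wrong (plain C++, illustrative values only):

    #include <cstdio>

    // x feeds two branches: y1 = relu(x) and y2 = 2 * x, with loss = y1 + y2.
    int main() {
        const float x = 3.0f;
        float grad_x = 0.0f;                 // zeroed once before backprop
        grad_x += (x > 0.0f) ? 1.0f : 0.0f;  // contribution of the ReLU branch
        grad_x += 2.0f;                      // contribution of the linear branch
        std::printf("dL/dx = %g\n", grad_x); // 3 (overwriting would give 2)
        return 0;
    }
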
 include/aidge/backend/cpu/operator/AtanImpl_kernels.hpp     | 2 +-
 include/aidge/backend/cpu/operator/ClipImpl_kernels.hpp     | 2 +-
 .../aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp    | 2 +-
 .../aidge/backend/cpu/operator/LeakyReLUImpl_kernels.hpp    | 2 +-
 include/aidge/backend/cpu/operator/LnImpl_kernels.hpp       | 4 +---
 .../aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp   | 6 ------
 include/aidge/backend/cpu/operator/MulImpl_kernels.hpp      | 3 ---
 include/aidge/backend/cpu/operator/PowImpl_kernels.hpp      | 6 ------
 include/aidge/backend/cpu/operator/ReLUImpl_kernels.hpp     | 2 +-
 include/aidge/backend/cpu/operator/SigmoidImpl_kernels.hpp  | 2 +-
 include/aidge/backend/cpu/operator/SqrtImpl_kernels.hpp     | 2 +-
 include/aidge/backend/cpu/operator/SubImpl_kernels.hpp      | 3 ---
 include/aidge/backend/cpu/operator/TanhImpl_kernels.hpp     | 2 +-
 unit_tests/operator/Test_MaxPoolingImpl.cpp                 | 2 +-
 14 files changed, 10 insertions(+), 30 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/AtanImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AtanImpl_kernels.hpp
index 141e5b60..e82f34fc 100644
--- a/include/aidge/backend/cpu/operator/AtanImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/AtanImpl_kernels.hpp
@@ -43,7 +43,7 @@ void AtanImpl_cpu_backward_kernel(const std::size_t inputLength,
     // Apply the derivative of atan for each element in the input array
     for (size_t i = 0; i < inputLength; ++i) {
         // dx = dy * (1 / (1 + x^2))
-        grad_input[i] = grad_output[i] * static_cast<O>(1.0 / (1.0 + output[i] * output[i]));
+        grad_input[i] += grad_output[i] * static_cast<O>(1.0 / (1.0 + output[i] * output[i]));
     }
 }
 
diff --git a/include/aidge/backend/cpu/operator/ClipImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ClipImpl_kernels.hpp
index f7a64585..65bf5094 100644
--- a/include/aidge/backend/cpu/operator/ClipImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ClipImpl_kernels.hpp
@@ -48,7 +48,7 @@ void ClipImpl_cpu_backward_kernel(
     GI* grad_input = static_cast<GI*>(grad_input_);
 
     for (std::size_t i = 0; i < length; ++i) {
-        grad_input[i] = ((input[i] > min_) && (input[i] < max_)) ? grad_output[i] : 0;
+        grad_input[i] += ((input[i] > min_) && (input[i] < max_)) ? grad_output[i] : 0;
     }
 }
 
diff --git a/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp b/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp
index 92f12fbe..c823b294 100644
--- a/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/HeavisideImpl_kernels.hpp
@@ -54,7 +54,7 @@ void HeavisideImplCpuBackwardKernel(std::size_t inputLength,
     GI* grad_input = static_cast<GI*>(grad_input_);
 
     for (size_t i = 0; i < inputLength; ++i) {
-        grad_input[i] = grad_output[i] * static_cast<O>(1.0 / (1.0 + (output[i] * M_PI) * (output[i] * M_PI)));
+        grad_input[i] += grad_output[i] * static_cast<O>(1.0 / (1.0 + (output[i] * M_PI) * (output[i] * M_PI)));
     }
 }
 
diff --git a/include/aidge/backend/cpu/operator/LeakyReLUImpl_kernels.hpp b/include/aidge/backend/cpu/operator/LeakyReLUImpl_kernels.hpp
index 1b4c3053..236038c6 100644
--- a/include/aidge/backend/cpu/operator/LeakyReLUImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/LeakyReLUImpl_kernels.hpp
@@ -45,7 +45,7 @@ void LeakyReLUImpl_cpu_backward_kernel(const float negativeSlope_,
     const I negativeSlope = static_cast<const I>(negativeSlope_);
 
     for (std::size_t i = 0; i < inputLength; ++i) {
-        grad_input[i] = (input[i] > 0) ? grad_output[i] : negativeSlope*grad_output[i];
+        grad_input[i] += (input[i] > 0) ? grad_output[i] : negativeSlope*grad_output[i];
     }
 }
 
diff --git a/include/aidge/backend/cpu/operator/LnImpl_kernels.hpp b/include/aidge/backend/cpu/operator/LnImpl_kernels.hpp
index ee2864b6..8b57b417 100755
--- a/include/aidge/backend/cpu/operator/LnImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/LnImpl_kernels.hpp
@@ -48,9 +48,7 @@ void LnImpl_cpu_backward_kernel(const std::size_t inputLength,
 	
     for (std::size_t i = 0; i < inputLength; ++i) {
 		if (input[i] > I(eps)) {
-			grad_input[i] = grad_output[i] / input[i];
-		} else {
-			grad_input[i] = GI(0);
+			grad_input[i] += grad_output[i] / input[i];
 		}
     }
 }
diff --git a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
index 7fe272d5..3057878d 100644
--- a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
@@ -149,12 +149,6 @@ void MaxPoolingImpl2D_cpu_backward_kernel(
   const I *input = static_cast<const I *>(input_);
   I *grad = static_cast<I *>(grad_);
 
-  // Fill the gradient with 0 to avoid garbage data
-  std::fill(grad,
-	  grad + (dims[0] * dims[1] * dims[2] * dims[3]),
-	  static_cast<I>(0)
-  );
-
   // output H size
   auto hOut = static_cast<float>(
     dims[2] - (kernelDims[0] - 1) * dilations[0] - 1 + strideDims[0]
diff --git a/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp
index 36acb919..a88923fd 100644
--- a/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp
@@ -166,9 +166,6 @@ void MulImpl_cpu_backward_kernel(const std::size_t input0Length,
     auto* grad_input_0 = static_cast<I1*>(gradientInput0_);
     auto* grad_input_1 = static_cast<I2*>(gradientInput1_);
 
-    std::fill_n(grad_input_0, input0Length, static_cast<I1>(0));
-    std::fill_n(grad_input_1, input1Length, static_cast<I2>(0));
-
     // Broadcast dims0 and dims1 to match the shape of outputDims
     auto broadcastedDims0 = getBroadcastedDims(outputDims, dims0);
     auto broadcastedDims1 = getBroadcastedDims(outputDims, dims1);
diff --git a/include/aidge/backend/cpu/operator/PowImpl_kernels.hpp b/include/aidge/backend/cpu/operator/PowImpl_kernels.hpp
index cae10663..51fd1bb6 100644
--- a/include/aidge/backend/cpu/operator/PowImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/PowImpl_kernels.hpp
@@ -163,12 +163,6 @@ void PowImpl_cpu_backward_kernel(const std::vector<std::size_t>& input0Dims,
     I2* grad1 = static_cast<I2*>(gradientInput1_);
     const O* gradOut = static_cast<const O*>(gradOutput_);
 
-    // Fill input grads with zeros
-	std::size_t input0Elements = std::accumulate(input0Dims.cbegin(), input0Dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
-	std::fill(grad0, grad0 + input0Elements, I1(0));
-	std::size_t input1Elements = std::accumulate(input1Dims.cbegin(), input1Dims.cend(), std::size_t(1), std::multiplies<std::size_t>());
-	std::fill(grad1, grad1 + input1Elements, I2(0));
-
 	std::size_t totalElements = std::accumulate(outputDims.cbegin(), outputDims.cend(), std::size_t(1), std::multiplies<std::size_t>());
     for (size_t oIndex = 0; oIndex < totalElements; ++oIndex)
     {
diff --git a/include/aidge/backend/cpu/operator/ReLUImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ReLUImpl_kernels.hpp
index 6b7c3c9c..3789052c 100644
--- a/include/aidge/backend/cpu/operator/ReLUImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ReLUImpl_kernels.hpp
@@ -47,7 +47,7 @@ void ReLUImpl_cpu_backward_kernel(const std::size_t inputLength,
     const GO* grad_output = static_cast<const GO*>(grad_output_);
     GI* grad_input = static_cast<GI*>(grad_input_);
     for (std::size_t i = 0; i < inputLength; ++i) {
-        grad_input[i] = (input[i] > 0) ? grad_output[i] : 0;
+        grad_input[i] += (input[i] > 0) ? grad_output[i] : 0;
     }
 }
 
diff --git a/include/aidge/backend/cpu/operator/SigmoidImpl_kernels.hpp b/include/aidge/backend/cpu/operator/SigmoidImpl_kernels.hpp
index 83ad4575..b3446dba 100644
--- a/include/aidge/backend/cpu/operator/SigmoidImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/SigmoidImpl_kernels.hpp
@@ -43,7 +43,7 @@ void SigmoidImpl_cpu_backward_kernel(const std::size_t inputLength,
     const GO* grad_output = static_cast<const GO*>(grad_output_);
     GI* grad_input = static_cast<GI*>(grad_input_);
     for (std::size_t i = 0; i < inputLength; ++i) {
-        grad_input[i] = output[i] * (O(1) - output[i]) * grad_output[i];
+        grad_input[i] += output[i] * (O(1) - output[i]) * grad_output[i];
     }
 }
 
diff --git a/include/aidge/backend/cpu/operator/SqrtImpl_kernels.hpp b/include/aidge/backend/cpu/operator/SqrtImpl_kernels.hpp
index bccc195e..beddc74d 100644
--- a/include/aidge/backend/cpu/operator/SqrtImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/SqrtImpl_kernels.hpp
@@ -44,7 +44,7 @@ void SqrtImpl_cpu_backward_kernel(const std::size_t inputLength,
     O* grad_input = static_cast<O*>(grad_input_);
 
     for (std::size_t i = 0; i < inputLength; ++i) {
-        grad_input[i] = static_cast<O>(0.5/output[i]) * grad_output[i];
+        grad_input[i] += static_cast<O>(0.5/output[i]) * grad_output[i];
     }
 }
 
diff --git a/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp b/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp
index 8d3d80e9..751177a7 100644
--- a/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp
@@ -165,9 +165,6 @@ void SubImpl_cpu_backward_kernel(const std::size_t input0Length,
     auto* grad_input_0 = static_cast<I1*>(gradientInput0_);
     auto* grad_input_1 = static_cast<I2*>(gradientInput1_);
 
-    std::fill_n(grad_input_0, input0Length, static_cast<I1>(0));
-    std::fill_n(grad_input_1, input1Length, static_cast<I2>(0));
-
     auto broadcastedDims0 = getBroadcastedDims(outputDims, dims0);
     auto broadcastedDims1 = getBroadcastedDims(outputDims, dims1);
 
diff --git a/include/aidge/backend/cpu/operator/TanhImpl_kernels.hpp b/include/aidge/backend/cpu/operator/TanhImpl_kernels.hpp
index 49cfe9cb..ca4510d9 100644
--- a/include/aidge/backend/cpu/operator/TanhImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/TanhImpl_kernels.hpp
@@ -39,7 +39,7 @@ void TanhImpl_cpu_backward_kernel(const std::size_t inputLength,
     const GO* grad_output = static_cast<const GO*>(grad_output_);
     GI* grad_input = static_cast<GI*>(grad_input_);
     for (std::size_t i = 0; i < inputLength; ++i) {
-        grad_input[i] = (O(1) - output[i] * output[i]) * grad_output[i];
+        grad_input[i] += (O(1) - output[i] * output[i]) * grad_output[i];
     }
 }
 
diff --git a/unit_tests/operator/Test_MaxPoolingImpl.cpp b/unit_tests/operator/Test_MaxPoolingImpl.cpp
index 2bc5e1ee..e4d171c5 100644
--- a/unit_tests/operator/Test_MaxPoolingImpl.cpp
+++ b/unit_tests/operator/Test_MaxPoolingImpl.cpp
@@ -345,7 +345,7 @@ TEST_CASE("[cpu/operator] MaxPooling(backward)", "[MaxPooling][CPU]") {
 			}}}
 		};
 
-		//op2->resetInput(0);
+		myInput4->setGrad(nullptr);
 		op2->associateInput(0, myInput4);
 		op2->setDataType(DataType::Float32);
 		op2->setBackend("cpu");
-- 
GitLab


From ba9b463891a7d5751eba6f9313382355b3c7dee7 Mon Sep 17 00:00:00 2001
From: Marwa ABDELOUINISSE <marwa.abdelouinisse@cea.fr>
Date: Thu, 17 Apr 2025 11:09:29 +0000
Subject: [PATCH 103/108] add: Dropout operator | rm: unused variable warnings

---
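Note: the new kernel implements inverted dropout: each element is kept with
probability (1 - p) and survivors are scaled by 1 / (1 - p), so the expected
value of the output equals the input. This is why the unit test checks that
every surviving element equals exactly 1 / (1 - p). A minimal standalone
sketch (plain C++, fixed seed for reproducibility; independent of Aidge):

    #include <cstdio>
    #include <random>

    int main() {
        const float p = 0.5f;                        // drop probability
        std::mt19937 rng(42);
        std::bernoulli_distribution keep(1.0f - p);  // keep probability
        const float scale = 1.0f / (1.0f - p);
        for (int i = 0; i < 6; ++i) {
            const float out = keep(rng) ? 1.0f * scale : 0.0f;
            std::printf("%g ", out);                 // each value is 0 or 2
        }
        std::printf("\n");
        return 0;
    }
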
 include/aidge/backend/cpu.hpp                 |   3 +-
 .../backend/cpu/operator/DropoutImpl.hpp      |  35 ++++++
 .../cpu/operator/DropoutImpl_kernels.hpp      |  61 +++++++++++
 .../backend/cpu/operator/MulImpl_kernels.hpp  |   4 +-
 src/data/Interpolation.cpp                    |  15 +--
 src/operator/DropoutImpl.cpp                  |  49 +++++++++
 unit_tests/operator/Test_BitShift.cpp         |   8 +-
 .../operator/Test_ConstantOfShapeImpl.cpp     |   1 +
 unit_tests/operator/Test_ConvImpl.cpp         |   6 +-
 unit_tests/operator/Test_DivImpl.cpp          |   4 +-
 unit_tests/operator/Test_DropoutImpl.cpp      | 103 ++++++++++++++++++
 .../Test_GlobalAveragePoolingImpl.cpp         |   2 +-
 unit_tests/operator/Test_HeavisideImpl.cpp    |  18 +--
 unit_tests/operator/Test_MaxPoolingImpl.cpp   |  24 ++--
 .../operator/Test_WeightInterleavingImpl.cpp  |   2 -
 15 files changed, 292 insertions(+), 43 deletions(-)
 create mode 100644 include/aidge/backend/cpu/operator/DropoutImpl.hpp
 create mode 100644 include/aidge/backend/cpu/operator/DropoutImpl_kernels.hpp
 create mode 100644 src/operator/DropoutImpl.cpp
 create mode 100644 unit_tests/operator/Test_DropoutImpl.cpp

diff --git a/include/aidge/backend/cpu.hpp b/include/aidge/backend/cpu.hpp
index 1f9dd830..6d090403 100644
--- a/include/aidge/backend/cpu.hpp
+++ b/include/aidge/backend/cpu.hpp
@@ -31,6 +31,7 @@
 #include "aidge/backend/cpu/operator/ConstantOfShapeImpl.hpp"
 #include "aidge/backend/cpu/operator/CryptoHashImpl.hpp"
 #include "aidge/backend/cpu/operator/DivImpl.hpp"
+#include "aidge/backend/cpu/operator/DropoutImpl.hpp"
 #include "aidge/backend/cpu/operator/EqualImpl.hpp"
 #include "aidge/backend/cpu/operator/ErfImpl.hpp"
 #include "aidge/backend/cpu/operator/ExpandImpl.hpp"
@@ -65,4 +66,4 @@
 
 #include "aidge/backend/cpu/data/TensorImpl.hpp"
 
-#endif /* AIDGE_CPU_IMPORTS_H_ */
+#endif /* AIDGE_CPU_IMPORTS_H_ */
\ No newline at end of file
diff --git a/include/aidge/backend/cpu/operator/DropoutImpl.hpp b/include/aidge/backend/cpu/operator/DropoutImpl.hpp
new file mode 100644
index 00000000..e3f0d41e
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/DropoutImpl.hpp
@@ -0,0 +1,35 @@
+/********************************************************************************
+ * Copyright (c) 2025 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_DROPOUTIMPL_H_
+#define AIDGE_CPU_OPERATOR_DROPOUTIMPL_H_
+
+#include <cstddef> // std::size_t
+
+#include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+#include "aidge/operator/Dropout.hpp"
+#include "aidge/utils/Registrar.hpp"
+
+namespace Aidge {
+
+// Operator implementation entry point for the backend
+using DropoutImpl_cpu = OperatorImpl_cpu<Dropout_Op,
+    void(float,
+        std::size_t,
+        unsigned int,
+        const void*,
+        void*)>;
+
+// Implementation entry point registration to Operator
+REGISTRAR(Dropout_Op, "cpu", Aidge::DropoutImpl_cpu::create);
+
+}  // namespace Aidge
+
+#endif /* AIDGE_CPU_OPERATOR_DROPOUTIMPL_H_ */
diff --git a/include/aidge/backend/cpu/operator/DropoutImpl_kernels.hpp b/include/aidge/backend/cpu/operator/DropoutImpl_kernels.hpp
new file mode 100644
index 00000000..61e61680
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/DropoutImpl_kernels.hpp
@@ -0,0 +1,61 @@
+/********************************************************************************
+ * Copyright (c) 2025 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#ifndef AIDGE_CPU_OPERATOR_DROPOUTIMPL_KERNELS_H_
+#define AIDGE_CPU_OPERATOR_DROPOUTIMPL_KERNELS_H_
+
+#include <cstddef>   // std::size_t
+#include <memory>
+#include <random>
+
+#include "aidge/backend/cpu/operator/DropoutImpl.hpp"
+#include "aidge/data/DataType.hpp"
+#include "aidge/utils/Registrar.hpp"
+
+
+namespace Aidge {
+
+template <DataType DT_I, DataType DT_O = DT_I>
+void DropoutImpl_cpu_forward_kernel(float probability,
+                                    std::size_t nb_elements,
+                                    unsigned int seed,
+                                    const void* input_,
+                                    void* output_)
+{
+    using I = cpptype_t<DT_I>;
+    using O = cpptype_t<DT_O>;
+    const I *input = static_cast<const I *>(input_);
+    O *output = static_cast<O *>(output_);
+
+    // The seed is supplied by the caller rather than drawn here, so the
+    // kernel itself stays deterministic.
+    std::mt19937 rng(seed);
+    std::bernoulli_distribution bernoulli_dist(1.0f - probability); // Bernoulli over the keep probability (1 - p)
+
+    const I scale = I(1.0) / static_cast<I>(1.0f - probability);
+
+    for (std::size_t i = 0; i < nb_elements; ++i)
+    {
+        output[i] = bernoulli_dist(rng) ? static_cast<O>(input[i] * scale) : static_cast<O>(0.0);
+    }
+
+}
+
+REGISTRAR(DropoutImpl_cpu,
+          {DataType::Float32},
+          {ProdConso::defaultModel, DropoutImpl_cpu_forward_kernel<DataType::Float32>, nullptr});
+
+REGISTRAR(DropoutImpl_cpu,
+          {DataType::Float64},
+          {ProdConso::defaultModel, DropoutImpl_cpu_forward_kernel<DataType::Float64>, nullptr});
+
+} // namespace Aidge
+
+#endif  // AIDGE_CPU_OPERATOR_DROPOUTIMPL_KERNELS_H_
diff --git a/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp
index a88923fd..bda28f63 100644
--- a/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/MulImpl_kernels.hpp
@@ -148,8 +148,8 @@ void MulImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
 }
 
 template <class I1, class I2, class O>
-void MulImpl_cpu_backward_kernel(const std::size_t input0Length,
-                                  const std::size_t input1Length,
+void MulImpl_cpu_backward_kernel(const std::size_t /*input0Length*/,
+                                  const std::size_t /*input1Length*/,
                                   const std::size_t gradOutputLength,
                                   const std::vector<std::size_t>& dims0,
                                   const std::vector<std::size_t>& dims1,
diff --git a/src/data/Interpolation.cpp b/src/data/Interpolation.cpp
index a5ab8f95..24aeeb9f 100644
--- a/src/data/Interpolation.cpp
+++ b/src/data/Interpolation.cpp
@@ -39,13 +39,14 @@ InterpolationCPU::linearRecurse(const std::vector<float> &coordToInterpolate,
         return points;
     }
 
-    auto extractPtCoords = [](std::set<Point<T>> pts) -> std::set<Coords> {
-        std::set<Coords> result;
-        for (const auto &pt : pts) {
-            result.insert(pt.first);
-        }
-        return result;
-    };
+    // /!\ Warning: extractPtCoords seems to be unused now
+    // auto extractPtCoords = [](std::set<Point<T>> pts) -> std::set<Coords> {
+    //     std::set<Coords> result;
+    //     for (const auto &pt : pts) {
+    //         result.insert(pt.first);
+    //     }
+    //     return result;
+    // };
     ///////////////////
     // ERROR CHECKING
     if (alongDim > coordToInterpolate.size() || points.size() == 0) {
diff --git a/src/operator/DropoutImpl.cpp b/src/operator/DropoutImpl.cpp
new file mode 100644
index 00000000..6975ce68
--- /dev/null
+++ b/src/operator/DropoutImpl.cpp
@@ -0,0 +1,49 @@
+/********************************************************************************
+ * Copyright (c) 2025 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include "aidge/backend/cpu/operator/DropoutImpl.hpp"
+
+#include <stdexcept> // std::runtime_error
+#include <random>    // std::random_device
+
+#include "aidge/backend/cpu/data/GetCPUPtr.h"
+#include "aidge/data/Tensor.hpp"
+#include "aidge/operator/Dropout.hpp"
+#include "aidge/utils/ErrorHandling.hpp"
+#include "aidge/utils/Registrar.hpp"
+
+
+#include "aidge/backend/cpu/operator/DropoutImpl_kernels.hpp"
+
+template <>
+void Aidge::DropoutImpl_cpu::forward() {
+    const Dropout_Op& op_ = dynamic_cast<const Dropout_Op&>(mOp);
+    // Check if input is provided
+    AIDGE_ASSERT(op_.getInput(0), "missing input #0 in Dropout Operator.");
+
+    // Get random seed
+    const unsigned int seed = static_cast<unsigned int>(std::random_device{}());
+
+    // Find the correct kernel type
+    const auto impl = Registrar<DropoutImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Call kernel
+    impl.forward(op_.probability(),
+                op_.getInput(0)->size(),
+                seed,
+                std::static_pointer_cast<Tensor>(mOp.getRawInput(0))->getImpl()->rawPtr(),
+                std::static_pointer_cast<Tensor>(mOp.getRawOutput(0))->getImpl()->rawPtr());
+}
+
+template <>
+void Aidge::DropoutImpl_cpu::backward() {
+    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Dropout_Op on backend cpu");
+}
\ No newline at end of file
diff --git a/unit_tests/operator/Test_BitShift.cpp b/unit_tests/operator/Test_BitShift.cpp
index 9cce9d6d..8d69d410 100644
--- a/unit_tests/operator/Test_BitShift.cpp
+++ b/unit_tests/operator/Test_BitShift.cpp
@@ -143,7 +143,7 @@ TEST_CASE("[cpu/operator] BitShift_TEST", "[BitShift][CPU]") {
             auto op_r = std::static_pointer_cast<OperatorTensor>(RoundBitShift-> getOperator());
             op_r->setDataType(DataType::Int32);
             op_r->setBackend("cpu");
-        
+
             // Create 2 input Tensors
             std::shared_ptr<Tensor> T0_r = std::make_shared<Tensor>();
             op_r->associateInput(0,T0_r);
@@ -153,13 +153,13 @@ TEST_CASE("[cpu/operator] BitShift_TEST", "[BitShift][CPU]") {
             op_r -> associateInput(1,T1_r);
             T1_r->setDataType(DataType::Int32);
             T1_r->setBackend("cpu");
-        
+
             // Create results Tensor
             std::shared_ptr<Tensor> Tres_r = std::make_shared<Tensor>();
             Tres_r->setDataType(DataType::Int32);
             Tres_r->setBackend("cpu");
             std::size_t number_of_operation = 0;
-            
+
             for (std::uint16_t trial = 0; trial < NBTRIALS; ++trial) {
                 // generate 2 random Tensors
                 const std::size_t nbDims = nbDimsDist(gen);
@@ -174,7 +174,7 @@ TEST_CASE("[cpu/operator] BitShift_TEST", "[BitShift][CPU]") {
                 int* array0 = new int[nb_elements];
                 int* array1 = new int[nb_elements];
                 int* result = new int[nb_elements];
-                for (std::size_t i = 0; i < nb_elements; ++i) 
+                for (std::size_t i = 0; i < nb_elements; ++i)
                 {
                     array0[i] = valueDist(gen);
                     array1[i] = std::abs(valueDist(gen)); // bitshift is impossible with negative value
diff --git a/unit_tests/operator/Test_ConstantOfShapeImpl.cpp b/unit_tests/operator/Test_ConstantOfShapeImpl.cpp
index 6833d836..9af2ca11 100644
--- a/unit_tests/operator/Test_ConstantOfShapeImpl.cpp
+++ b/unit_tests/operator/Test_ConstantOfShapeImpl.cpp
@@ -27,6 +27,7 @@
 #include "aidge/data/Tensor.hpp"
 #include "aidge/filler/Filler.hpp"
 #include "aidge/operator/ConstantOfShape.hpp"
+#include "aidge/operator/OperatorTensor.hpp"
 #include "aidge/utils/TensorUtils.hpp"
 #include "aidge/utils/Types.h"
 
diff --git a/unit_tests/operator/Test_ConvImpl.cpp b/unit_tests/operator/Test_ConvImpl.cpp
index 854789e3..c7242bbb 100644
--- a/unit_tests/operator/Test_ConvImpl.cpp
+++ b/unit_tests/operator/Test_ConvImpl.cpp
@@ -1673,7 +1673,7 @@ TEST_CASE("[cpu/operator] Conv(forward)", "[Conv][CPU]") {
                     {   0,   1,   1,   1,   1,   1,   0},
                     {   0,   1,   1,   1,   1,   0,  -1},
                     {  -1,   0,   1,   2,   2,   0,  -1}},
-                 
+
                    {{   0,   0,  -1,   0,   0,   0,  -1},
                     {   0,   0,   0,   1,   1,   0,   0},
                     {   0,   0,   1,   1,   1,   1,   0},
@@ -1681,7 +1681,7 @@ TEST_CASE("[cpu/operator] Conv(forward)", "[Conv][CPU]") {
                     {   0,   1,   1,   1,   1,   1,   0},
                     {   0,   1,   1,   0,   1,   0,   0},
                     {  -1,   0,   1,   1,   1,   0,  -1}},
-                 
+
                    {{   0,  -1,  -1,   0,   1,   0,  -1},
                     {   0,   1,   1,   2,   2,   1,   0},
                     {   0,   1,   1,   2,   2,   1,   1},
@@ -1692,7 +1692,7 @@ TEST_CASE("[cpu/operator] Conv(forward)", "[Conv][CPU]") {
             });
             std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array1D<int32_t,1> {
                 {
-                   19282 
+                   19282
                 }
             });
             Pad_Op<2> pad_op = Pad_Op<2>({3,3});
diff --git a/unit_tests/operator/Test_DivImpl.cpp b/unit_tests/operator/Test_DivImpl.cpp
index f7993753..37d11599 100644
--- a/unit_tests/operator/Test_DivImpl.cpp
+++ b/unit_tests/operator/Test_DivImpl.cpp
@@ -561,10 +561,10 @@ TEST_CASE("[CPU/Operator] Div(Backward)", "[Div][CPU][Backward]") {
                         std::size_t in0Idx = w + 7 * (0 + 1 * (c + 2 * n));
                         std::size_t in1Idx = w + 7 * (h + 6 * c);
 
-                        expectedGrad0[in0Idx] += 
+                        expectedGrad0[in0Idx] +=
                             gradOutputData[outIdx] * (1.0f / input1Data[in1Idx]);
 
-                        expectedGrad1[in1Idx] += 
+                        expectedGrad1[in1Idx] +=
                             gradOutputData[outIdx] * (-input0Data[in0Idx] / (input1Data[in1Idx] * input1Data[in1Idx]));
                     }
                 }
diff --git a/unit_tests/operator/Test_DropoutImpl.cpp b/unit_tests/operator/Test_DropoutImpl.cpp
new file mode 100644
index 00000000..a3c10ead
--- /dev/null
+++ b/unit_tests/operator/Test_DropoutImpl.cpp
@@ -0,0 +1,103 @@
+/********************************************************************************
+ * Copyright (c) 2025 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+#include <cstddef>  // std::size_t
+#include <memory>
+
+#include <catch2/catch_test_macros.hpp>
+
+#include "aidge/data/Tensor.hpp"
+#include "aidge/operator/Dropout.hpp"
+#include "aidge/utils/TensorUtils.hpp"
+
+using namespace Aidge;
+
+TEST_CASE("[cpu/operator] Dropout(forward - inference mode / MC dropout)", "[Dropout][CPU]") {
+
+    SECTION("MC Dropout - check stochastic output and scaling") {
+        constexpr const std::size_t nb_elements = 6;
+        std::shared_ptr<Tensor> input = std::make_shared<Tensor>(Array1D<cpptype_t<DataType::Float32>,nb_elements> {
+            {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}
+        });
+
+        constexpr const float dropout_prob = 0.5f;
+        std::shared_ptr<Node> myDropout = Dropout(dropout_prob); // assumes dropout always active
+        auto op = std::static_pointer_cast<OperatorTensor>(myDropout->getOperator());
+
+        op->associateInput(0, input);
+        op->setBackend("cpu");
+        op->forwardDType();
+
+        myDropout->forward();
+        auto output = op->getOutput(0);
+
+        std::size_t num_zero = 0, num_scaled = 0;
+        constexpr const float scale = 1.0f / (1.0f - dropout_prob);
+
+        for (std::size_t i = 0; i < nb_elements; ++i) {
+            const float out = output->get<cpptype_t<DataType::Float32>>(i);
+            if (out == 0.0f)
+                ++num_zero;
+            else {
+                REQUIRE(approxEq<cpptype_t<DataType::Float32>>(Tensor(out),Tensor(scale)));  // scaled version of 1.0f
+                ++num_scaled;
+            }
+        }
+
+        // Ensure dropout is working
+        REQUIRE(num_zero + num_scaled == nb_elements);
+        REQUIRE(output->dims() == input->dims());  // TODO: test this in core module
+    }
+
+    SECTION("Stochasticity - multiple forward passes differ") {
+        // /!\ Warning: With too few elements, this test has a small
+        //     but real chance of failing.
+        constexpr const std::size_t nb_elements = 100;
+        std::shared_ptr<Tensor> input = std::make_shared<Tensor>(Array1D<cpptype_t<DataType::Float32>, nb_elements> {
+            {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
+             1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
+             1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
+             1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
+             1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
+             1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
+             1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
+             1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
+             1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
+             1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}
+        });
+
+        constexpr const float dropout_prob = 0.3f;
+        std::shared_ptr<Node> myDropout = Dropout(dropout_prob);
+        auto op = std::static_pointer_cast<OperatorTensor>(myDropout->getOperator());
+        op->associateInput(0, input);
+        op->setBackend("cpu");
+        op->forwardDType();
+
+        std::vector<cpptype_t<DataType::Float32>> run1, run2;
+
+        myDropout->forward();
+        auto out1 = op->getOutput(0);
+        for (std::size_t i = 0; i < nb_elements; ++i)
+            run1.push_back(out1->get<cpptype_t<DataType::Float32>>(i));
+
+        myDropout->forward();
+        auto out2 = op->getOutput(0);
+        for (std::size_t i = 0; i < nb_elements; ++i)
+            run2.push_back(out2->get<cpptype_t<DataType::Float32>>(i));
+
+        // Not all elements should be identical between the two runs
+        std::size_t same_count = 0;
+        for (std::size_t i = 0; i < nb_elements; ++i) {
+            if (run1[i] == run2[i])
+                same_count++;
+        }
+        REQUIRE(same_count < nb_elements);
+    }
+}
\ No newline at end of file
diff --git a/unit_tests/operator/Test_GlobalAveragePoolingImpl.cpp b/unit_tests/operator/Test_GlobalAveragePoolingImpl.cpp
index 0fd7d84b..0ecb3163 100644
--- a/unit_tests/operator/Test_GlobalAveragePoolingImpl.cpp
+++ b/unit_tests/operator/Test_GlobalAveragePoolingImpl.cpp
@@ -558,7 +558,7 @@ TEST_CASE("[cpu/operator] GlobalAveragePooling",
       Log::info("Number of operations : {}\n", number_of_operation);
       Log::info("Operation / µs = {}\n", number_of_operation / duration.count());
     }
-  
+
     SECTION("Simple test") {
       std::shared_ptr<Tensor> tensor =
           std::make_shared<Tensor>(Array4D<int32_t, 1, 1, 7, 7>{{{{
diff --git a/unit_tests/operator/Test_HeavisideImpl.cpp b/unit_tests/operator/Test_HeavisideImpl.cpp
index d3ed3826..16fad24d 100644
--- a/unit_tests/operator/Test_HeavisideImpl.cpp
+++ b/unit_tests/operator/Test_HeavisideImpl.cpp
@@ -110,19 +110,19 @@ TEST_CASE("[cpu/operator] Heaviside(backward)", "[Heaviside][CPU]") {
     std::uniform_int_distribution<std::size_t> sizeDist(5, 100);
 
     const std::size_t tensorSize = sizeDist(gen);
-    
+
     auto hs = Heaviside(1.0f);
     auto op = std::static_pointer_cast<OperatorTensor>(hs->getOperator());
     op->setDataType(DataType::Float32);
     op->setBackend("cpu");
 
-        
+
 
     auto inputTensor = std::make_shared<Tensor>(std::vector<std::size_t>{tensorSize});
     inputTensor->setDataType(DataType::Float32);
     inputTensor->setBackend("cpu");
     auto* inputData = static_cast<float*>(inputTensor->getImpl()->rawPtr());
-    
+
     for(std::size_t i = 0; i < tensorSize; ++i) {
         inputData[i] = valueDist(gen);
     }
@@ -154,34 +154,34 @@ TEST_CASE("[cpu/operator] Heaviside(backward)", "[Heaviside][CPU]") {
     gradTensor->setDataType(DataType::Float32);
     gradTensor->setBackend("cpu");
     auto* gradData = static_cast<float*>(gradTensor->getImpl()->rawPtr());
-    
+
     for (std::size_t i = 0; i < tensorSize; ++i) {
         gradData[i] = valueDist(gen);
     }
 
     op->setInput(IOIndex_t(0), inputTensor);
     op->forward();
-    
+
     auto output = op->getOutput(0);
     output->setGrad(gradTensor);
-    
+
     // Backward pass
     op->backward();
 
     atanOp->setOutput(0, outmul);
     atanOp->getOutput(0)->setGrad(gradTensor);
     atanOp->backward();
-    
+
     // Compute expected gradient manually
     auto expectedGrad = std::make_shared<Tensor>(std::vector<std::size_t>{tensorSize});
     expectedGrad->setDataType(DataType::Float32);
     expectedGrad->setBackend("cpu");
     auto* expectedGradData = static_cast<float*>(expectedGrad->getImpl()->rawPtr());
-    
+
     for (std::size_t i = 0; i < tensorSize; ++i) {
         expectedGradData[i] = gradData[i] * (1.0f / (1.0f + (inputData[i] * M_PI) * (inputData[i] * M_PI)));
     }
-    
+
     // Compare actual gradient with expected gradient
     REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad));
 
diff --git a/unit_tests/operator/Test_MaxPoolingImpl.cpp b/unit_tests/operator/Test_MaxPoolingImpl.cpp
index e4d171c5..57d4190e 100644
--- a/unit_tests/operator/Test_MaxPoolingImpl.cpp
+++ b/unit_tests/operator/Test_MaxPoolingImpl.cpp
@@ -181,7 +181,7 @@ TEST_CASE("[cpu/operator] MaxPooling(forward)", "[MaxPooling][CPU]") {
 
 
 TEST_CASE("[cpu/operator] MaxPooling(backward)", "[MaxPooling][CPU]") {
-    std::shared_ptr<Tensor> myInput = 
+    std::shared_ptr<Tensor> myInput =
 	std::make_shared<Tensor>(Array4D<float,2,2,5,5> { //NCHW
 		{
 		    {
@@ -252,7 +252,7 @@ TEST_CASE("[cpu/operator] MaxPooling(backward)", "[MaxPooling][CPU]") {
         op->associateInput(0,myInput);
         op->setDataType(DataType::Float32);
         op->setBackend("cpu");
-	op->backward();	
+	op->backward();
 	//op->getInput(0)->grad()->print();
         REQUIRE(*(op->getInput(0)->grad()) == grad);
     }
@@ -289,7 +289,7 @@ TEST_CASE("[cpu/operator] MaxPooling(backward)", "[MaxPooling][CPU]") {
         myMaxPool->getOperator()->associateInput(0,myInput);
         myMaxPool->getOperator()->setDataType(DataType::Float32);
         myMaxPool->getOperator()->setBackend("cpu");
-	op->backward();	
+	op->backward();
 	//op->getInput(0)->grad()->print();
         REQUIRE(*(op->getInput(0)->grad()) == grad);
     }
@@ -313,10 +313,10 @@ TEST_CASE("[cpu/operator] MaxPooling(backward)", "[MaxPooling][CPU]") {
 		);
 		Tensor grad = Array4D<float,1,1,5,5> {
 			{{{
-				{0, 0, 0, 0, 0}, 
-				{0, 1, 0, 1, 1}, 
-				{0, 0, 0, 0, 0}, 
-				{0, 1, 0, 1, 1}, 
+				{0, 0, 0, 0, 0},
+				{0, 1, 0, 1, 1},
+				{0, 0, 0, 0, 0},
+				{0, 1, 0, 1, 1},
 				{0, 1, 0, 1, 1}
 			}}}
 		};
@@ -324,7 +324,7 @@ TEST_CASE("[cpu/operator] MaxPooling(backward)", "[MaxPooling][CPU]") {
 		op1->associateInput(0, myInput4);
 		op1->setDataType(DataType::Float32);
 		op1->setBackend("cpu");
-		op1->backward();	
+		op1->backward();
 		//op1->getInput(0)->grad()->print();
 		REQUIRE(*(op1->getInput(0)->grad()) == grad);
 
@@ -337,10 +337,10 @@ TEST_CASE("[cpu/operator] MaxPooling(backward)", "[MaxPooling][CPU]") {
 
 		Tensor grad2 = Array4D<float,1,1,5,5> {
 			{{{
-				{0, 0, 0, 0, 0}, 
-				{0, 1, 0, 1, 0}, 
-				{0, 0, 0, 0, 0}, 
-				{0, 1, 0, 1, 0}, 
+				{0, 0, 0, 0, 0},
+				{0, 1, 0, 1, 0},
+				{0, 0, 0, 0, 0},
+				{0, 1, 0, 1, 0},
 				{0, 0, 0, 0, 0}
 			}}}
 		};
diff --git a/unit_tests/operator/Test_WeightInterleavingImpl.cpp b/unit_tests/operator/Test_WeightInterleavingImpl.cpp
index c95c8fca..3c111625 100644
--- a/unit_tests/operator/Test_WeightInterleavingImpl.cpp
+++ b/unit_tests/operator/Test_WeightInterleavingImpl.cpp
@@ -23,7 +23,6 @@
 using namespace Aidge;
 
 TEST_CASE("[cpu/operator] WeightInterleaving", "[WeightInterleaving][CPU]") {
-
     std::shared_ptr<Node> myWeightInterleaving = WeightInterleaving();
     auto opWeightInterleaving = std::static_pointer_cast<WeightInterleaving_Op>(myWeightInterleaving -> getOperator());
 
@@ -415,7 +414,6 @@ TEST_CASE("[cpu/operator] WeightInterleaving", "[WeightInterleaving][CPU]") {
 
         // Create convolution node
         std::shared_ptr<Node> conv = Conv(4, 2, {3, 3}, "conv1");
-
         // Place the weight tensor in the weight producer of the conv
         auto weightProducer = conv->getParent(1);
         weightProducer->getOperator()->setOutput(0, weight);
-- 
GitLab


From a20978225b1bfc8ae735171df620ea9741f4d947 Mon Sep 17 00:00:00 2001
From: NAUD Maxence <maxence.naud@cea.fr>
Date: Thu, 17 Apr 2025 08:27:10 +0000
Subject: [PATCH 104/108] add: missing import of 'benchmark.py' file to the
 Python package

---
 aidge_backend_cpu/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/aidge_backend_cpu/__init__.py b/aidge_backend_cpu/__init__.py
index bb320b2f..b88917a2 100644
--- a/aidge_backend_cpu/__init__.py
+++ b/aidge_backend_cpu/__init__.py
@@ -1,2 +1,3 @@
 import aidge_core
 from aidge_backend_cpu.aidge_backend_cpu import * # import so generated by PyBind
+from . import benchmark
-- 
GitLab


From 9c8427cb729d1e021d5cb820b8162f715ec0c816 Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Fri, 18 Apr 2025 14:13:36 +0200
Subject: [PATCH 105/108] Fixed typo

---
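Note: one of the kernel loops below iterated over the wrong spatial dimension
(kernelDims[0] instead of kernelDims[1]); for square kernels the two bounds
coincide, which is presumably why the bug went unnoticed until now.
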
 include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
index e1e76a33..d2b942f6 100644
--- a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
@@ -631,7 +631,7 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
                                      iIndex_channel +=
                                      inputDims[3] * strideDims[0]) {
                         // loop over associated input line
-                        for (std::size_t ky = 0, ix = 0; ky < kernelDims[0];
+                        for (std::size_t ky = 0, ix = 0; ky < kernelDims[1];
                              ++ky, ix += inputDims[3] * dilationDims[0]) {
                             // loop over the entire line
                             for (std::size_t oy = 0, iy = 0; oy < oySize;
-- 
GitLab


From fe6570e2f17cff547bd58535a93ef173705e7a49 Mon Sep 17 00:00:00 2001
From: cmoineau <cyril.moineau@cea.fr>
Date: Tue, 22 Apr 2025 11:01:07 +0000
Subject: [PATCH 106/108] Update 0.5.0 -> 0.6.0

---
 version.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/version.txt b/version.txt
index 8f0916f7..a918a2aa 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.5.0
+0.6.0
-- 
GitLab


From cdc5ce15daaf78ef7ae8a821e8fa3c6ca3efaa3e Mon Sep 17 00:00:00 2001
From: cmoineau <cyril.moineau@cea.fr>
Date: Wed, 23 Apr 2025 07:48:34 +0000
Subject: [PATCH 107/108] Add option -DPYBIND11_FINDPYTHON=ON.

---
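Note: PYBIND11_FINDPYTHON=ON makes pybind11 locate the interpreter through
CMake's modern FindPython module instead of the deprecated
FindPythonInterp/FindPythonLibs pair, which behaves more predictably inside
virtual environments.
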
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index 366a4825..ee54ca95 100644
--- a/setup.py
+++ b/setup.py
@@ -88,6 +88,7 @@ class AidgePkgBuild(build_ext):
             f"-DCMAKE_CXX_COMPILER={cxx_compiler}",
             f"-DENABLE_ASAN={asan}",
             "-DPYBIND=ON",
+            "-DPYBIND11_FINDPYTHON=ON",
             f"-DPYBIND_INSTALL_PREFIX:PATH={pybind_install_prefix}",
             "-DCMAKE_EXPORT_COMPILE_COMMANDS=1",
             "-DCOVERAGE=OFF",
-- 
GitLab


From eb3021b4e2aac37eca79a34422d71c5807d71c8b Mon Sep 17 00:00:00 2001
From: cmoineau <cyril.moineau@cea.fr>
Date: Wed, 23 Apr 2025 07:55:26 +0000
Subject: [PATCH 108/108] Update minimum version to python 3.10

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 39bed4d2..1e8869ad 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ description="CPU implementation of operators of the AIDGE framework"
 dependencies = [
     "numpy",
 ]
-requires-python = ">= 3.8"
+requires-python = ">= 3.10"
 readme = "README.md"
 license = { file = "LICENSE" }
 classifiers = [
-- 
GitLab