From f4a5d905f8793d03517269cb770603576d9b9e5e Mon Sep 17 00:00:00 2001
From: hrouis <houssemeddine.rouis92@gmail.com>
Date: Thu, 6 Feb 2025 10:56:31 +0100
Subject: [PATCH 1/8] add alpha and beta attr to FC impl

---
 include/aidge/backend/cpu/operator/FCImpl.hpp |  4 ++
 .../backend/cpu/operator/FCImpl_kernels.hpp   | 47 +++++++++-----
 src/operator/FCImpl.cpp                       |  4 ++
 unit_tests/operator/Test_FCImpl.cpp           | 63 +++++++++++++++++++
 4 files changed, 102 insertions(+), 16 deletions(-)

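For reference, a minimal standalone sketch of the forward semantics this patch
introduces, Y = alpha * (X . W^T) + beta * b. The helper name, layouts and values
below are illustrative only, not the Aidge kernel itself:

    #include <cstddef>
    #include <iostream>
    #include <numeric>
    #include <vector>

    // fcForward is a hypothetical stand-in for the patched kernel's math: each
    // output element is alpha times the input/weight dot product, plus beta
    // times the (optional) per-output-channel bias.
    void fcForward(std::size_t batch, std::size_t inF, std::size_t outF,
                   float alpha, float beta,
                   const std::vector<float>& x,  // (batch, inF), row-major
                   const std::vector<float>& w,  // (outF, inF), row-major
                   const std::vector<float>& b,  // (outF), may be empty
                   std::vector<float>& y) {      // (batch, outF)
        for (std::size_t n = 0; n < batch; ++n) {
            for (std::size_t o = 0; o < outF; ++o) {
                const float dot = std::inner_product(
                    x.begin() + n * inF, x.begin() + (n + 1) * inF,
                    w.begin() + o * inF, 0.0f);
                y[n * outF + o] = alpha * dot + (b.empty() ? 0.0f : beta * b[o]);
            }
        }
    }

    int main() {
        std::vector<float> x{1.f, 2.f, 3.f};                 // one batch, three features
        std::vector<float> w{1.f, 0.f, 1.f, 0.f, 1.f, 0.f};  // two output channels
        std::vector<float> b{10.f, 20.f};
        std::vector<float> y(2);
        fcForward(1, 3, 2, /*alpha=*/2.0f, /*beta=*/0.5f, x, w, b, y);
        std::cout << y[0] << ' ' << y[1] << '\n';            // prints: 13 14
    }
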
diff --git a/include/aidge/backend/cpu/operator/FCImpl.hpp b/include/aidge/backend/cpu/operator/FCImpl.hpp
index e82352d9..4daa522f 100644
--- a/include/aidge/backend/cpu/operator/FCImpl.hpp
+++ b/include/aidge/backend/cpu/operator/FCImpl.hpp
@@ -27,6 +27,8 @@ using FCImpl_cpu = OperatorImpl_cpu<FC_Op,
     void(const DimSize_t,
         const DimSize_t,
         const DimSize_t,
+        const float,
+        const float,
         const void *,
         const void *,
         const void *,
@@ -34,6 +36,8 @@ using FCImpl_cpu = OperatorImpl_cpu<FC_Op,
     void(const DimSize_t,
         const DimSize_t,
         const DimSize_t,
+        const float,
+        const float,
         const void *,
         const void *,
         const void *,
diff --git a/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
index c57f86e6..a1624bb5 100644
--- a/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
@@ -86,6 +86,8 @@ template <class I, class W, class B, class O>
 void FCImpl_cpu_forward_kernel(const DimSize_t batchSize,
                             const DimSize_t inputFeatureSize,
                             const DimSize_t outputFeatureSize,
+                            const float alpha_,
+                            const float beta_,
                             const void* input_,
                             const void* weights_,
                             const void* biases_,
@@ -96,21 +98,29 @@ void FCImpl_cpu_forward_kernel(const DimSize_t batchSize,
     const B* biases = static_cast<const B*>(biases_);
     O* output = static_cast<O*>(output_);
 
+    const O alpha = static_cast<O>(alpha_);
+    const O beta = static_cast<O>(beta_);
+
     if (biases == nullptr) {
-        std::fill(output, output+(batchSize*outputFeatureSize), B(0));
-    }
-    else {
+        std::fill(output, output + (batchSize * outputFeatureSize), O(0));
+    } else { // Initialize output with bias * beta
         for (std::size_t batch = 0; batch < batchSize; ++batch) {
-            std::copy(biases, biases+outputFeatureSize, output+(batch*outputFeatureSize));
+            std::transform(
+                biases, biases + outputFeatureSize, output + batch * outputFeatureSize,
+                [beta](const B& bias) { return beta * static_cast<O>(bias); }
+            );
         }
     }
 
+    // Perform matrix-vector multiplication with alpha scaling
     for (std::size_t batch = 0; batch < batchSize; ++batch) {
         for (std::size_t out = 0; out < outputFeatureSize; ++out) {
-            output[out + batch*outputFeatureSize] = std::inner_product(input + batch*inputFeatureSize,
-                                                        input + (batch + 1)*inputFeatureSize,
-                                                        weights + out*inputFeatureSize,
-                                                        output[out + batch*outputFeatureSize]);
+            output[out + batch * outputFeatureSize] += alpha * std::inner_product(
+                input + batch * inputFeatureSize,
+                input + (batch + 1) * inputFeatureSize,
+                weights + out * inputFeatureSize,
+                O(0) // Initialize accumulator to zero
+            );
         }
     }
 }
@@ -119,6 +129,8 @@ template <class I, class O, class W, class B>
 void FCImpl_cpu_backward_kernel(const DimSize_t batchSize,
                                 const DimSize_t inputFeatureSize,
                                 const DimSize_t outputFeatureSize,
+                                const float alpha_,
+                                const float beta_,
                                 const void* input_,
                                 const void* originalInput_,
                                 const void* weight_,
@@ -134,17 +146,20 @@ void FCImpl_cpu_backward_kernel(const DimSize_t batchSize,
     W* weightGrad   = static_cast<W*>(weightGrad_);
     B* biasesGrad   = static_cast<B*>(biasesGrad_);
 
+    // Coefficients
+    const O alpha = static_cast<O>(alpha_);
+    const O beta  = static_cast<O>(beta_);
 
     // bias grad
     if (biasesGrad == nullptr) { // no bias
         std::fill(biasesGrad, biasesGrad + outputFeatureSize, B(0));
     } else {
-        for (std::size_t o = 0; o < outputFeatureSize; ++o) { // nb outputs
+        for (std::size_t o = 0; o < outputFeatureSize; ++o) {
             B sum{0};
             for (std::size_t b = 0; b < batchSize; ++b) {
-                sum += input[b*outputFeatureSize + o];
+                sum += input[b * outputFeatureSize + o];
             }
-            biasesGrad[o] = sum;
+            biasesGrad[o] = beta * biasesGrad[o] + alpha * sum;
         }
     }
 
@@ -153,20 +168,20 @@ void FCImpl_cpu_backward_kernel(const DimSize_t batchSize,
         for (std::size_t c = 0; c < inputFeatureSize; ++c) {
             W sum{0};
             for (std::size_t b = 0; b < batchSize; ++b) {
-                sum += originalInput[b*inputFeatureSize + c]*input[b*outputFeatureSize + o];
+                sum += originalInput[b * inputFeatureSize + c] * input[b * outputFeatureSize + o];
             }
-            weightGrad[o*inputFeatureSize + c] = sum;
+            weightGrad[o * inputFeatureSize + c] = beta * weightGrad[o * inputFeatureSize + c] + alpha * sum;
         }
     }
 
-    // input grad
+    // Input gradient (output)
     for (std::size_t b = 0; b < batchSize; ++b) {
         for (std::size_t c = 0; c < inputFeatureSize; ++c) {
             O sum{0};
             for (std::size_t o = 0; o < outputFeatureSize; ++o) {
-                sum += weight[o*inputFeatureSize + c] * input[b*outputFeatureSize + o];
+                sum += weight[o * inputFeatureSize + c] * input[b * outputFeatureSize + o];
             }
-            output[b*inputFeatureSize + c] = sum;
+            output[b * inputFeatureSize + c] = alpha * sum; // Apply alpha (no accumulation needed)
         }
     }
 }
diff --git a/src/operator/FCImpl.cpp b/src/operator/FCImpl.cpp
index 35945271..144cb1cd 100644
--- a/src/operator/FCImpl.cpp
+++ b/src/operator/FCImpl.cpp
@@ -46,6 +46,8 @@ void Aidge::FCImpl_cpu::forward()
     impl.forward(batchSize,
         input1.dims()[1], // nb input features
         input1.dims()[0], // nb output features
+        op_.alpha(),
+        op_.beta(),
         input0.getImpl()->rawPtr(),
         input1.getImpl()->rawPtr(),
         (op_.getInput(2)) ? input2.getImpl()->rawPtr() : nullptr,
@@ -77,6 +79,8 @@ void Aidge::FCImpl_cpu::backward()
     impl.backward(batchSize,
         input1grad.dims()[1], // nb input features
         input1grad.dims()[0], // nb output features
+        op_.alpha(),
+        op_.beta(),
         getCPUPtr(fc_grad),
         getCPUPtr(op_.getInput(0)),
         getCPUPtr(mOp.getRawInput(1)),
diff --git a/unit_tests/operator/Test_FCImpl.cpp b/unit_tests/operator/Test_FCImpl.cpp
index 8ac0afc3..41f95abe 100644
--- a/unit_tests/operator/Test_FCImpl.cpp
+++ b/unit_tests/operator/Test_FCImpl.cpp
@@ -10,6 +10,7 @@
  ********************************************************************************/
 
 #include <memory>
+#include <iostream>
 
 #include <catch2/catch_test_macros.hpp>
 
@@ -108,4 +109,66 @@ TEST_CASE("[cpu/operator] FC(forward)", "[FC][CPU]") {
     }
 
     // std::cout << static_cast<Tensor>((*myFC->getOperator())["weight"])[0][0][0][0] << std::endl;
+}
+
+
+TEST_CASE("[cpu/oeprator] FC(backward)", "[FC][CPU]") {
+
+    std::shared_ptr<Tensor> myInput =
+            std::make_shared<Tensor>(Array2D<float, 4, 3>{
+                              {
+                              {0.55043954, -0.080161572, 0.18495631},
+                              {-0.82497174, -0.95155114, 0.25449812},
+                              {1.6508394, 0.2518357, -0.49999624},
+                              {0.82770473, 0.28659272, -0.11644308}
+                              }});
+    std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array2D<float, 2, 3>{
+            {{0.044322353, 1.9578923, -1.96035},
+            {-1.1458585, -0.8235659, 0.24195994}}});
+    std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float, 2>{{1.5327742,  0.90154403}});
+
+    std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float, 4, 2>{
+            {{1.0376441, 0.38158852},
+            {-0.86573052, 2.6920884},
+            {3.0791781, -1.3184667},
+            {2.3588469, -0.31109101}}});
+
+    std::shared_ptr<Node> myFC = FC(3, 2, false, "myfc");
+    auto op = std::static_pointer_cast<OperatorTensor>(myFC -> getOperator());
+    op -> associateInput(0, myInput);
+    op -> associateInput(1, myWeights);
+    op -> associateInput(2, myBias);
+    op -> setDataType(DataType::Float32);
+    op -> setBackend("cpu");
+    myFC->forward();
+    op->getOutput(0)->print();
+    REQUIRE(approxEq<float>(*(op->getOutput(0)), *myOutput));
+
+    // Backward
+    std::shared_ptr<Tensor> myOutputGrad =
+            std::make_shared<Tensor>(Array2D<float, 4, 2>{
+                              {
+                              {1.373911, -1.2312084},
+                              {0.24750818, -0.71446633},
+                              {-1.5132738, -0.23136522},
+                              {0.20452768, -1.2200259}
+                              }});
+    std::shared_ptr<Tensor> expectedInputGrad =
+            std::make_shared<Tensor>(Array2D<float, 4, 3>{
+                              {
+                              {1.4716856, 3.7039511, -2.9912496},
+                              {0.82964748, 1.0730045, -0.65807492},
+                              {0.19803995, -2.7722826, 2.9105654},
+                              {1.4070423, 1.4052149, -0.69614327}
+                              }});
+    std::shared_ptr<Tensor> expectedWeightsGrad = std::make_shared<Tensor>(Array2D<float, 2, 3>{
+            {{-1.7768159, -0.66813177, 1.0499192},
+              {-1.4800593, 0.37063029, -0.15180479}}});
+    std::shared_ptr<Tensor> expectedBiasGrad = std::make_shared<Tensor>(Array1D<float, 2>{{0.31267303, -3.397066 }});
+
+    op->getOutput(0)->setGrad(myOutputGrad);
+    myFC->backward();
+    REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedInputGrad));
+    REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedWeightsGrad));
+    REQUIRE(approxEq<float>(*(op->getInput(2)->grad()), *expectedBiasGrad));
 }
\ No newline at end of file
-- 
GitLab


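In Gemm-style notation, the backward pass as of this patch computes the following
(a sketch using the kernel's own (outF, inF) weight layout; dY is the incoming
gradient):

    dX = alpha * (dY * W)
    dW = beta * dW + alpha * (dY^T * X)
    dB = beta * dB + alpha * sum_b dY[b, :]

Patches 2 and 4 later simplify the dW and dB terms.
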
From a52c842f120e9b7c19de89ef9147d1ac0afea36d Mon Sep 17 00:00:00 2001
From: hrouis <houssemeddine.rouis92@gmail.com>
Date: Mon, 10 Feb 2025 15:27:36 +0100
Subject: [PATCH 2/8] support 2D bias for FC operator

---
 include/aidge/backend/cpu/operator/FCImpl.hpp |   2 +
 .../backend/cpu/operator/FCImpl_kernels.hpp   | 101 +++++-------------
 src/operator/FCImpl.cpp                       |   6 +-
 unit_tests/operator/Test_FCImpl.cpp           |  67 +++++++++++-
 4 files changed, 99 insertions(+), 77 deletions(-)

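A small sketch of the bias-initialization split this patch adds, under assumed
layouts (this is not the Aidge kernel): a 2D bias of shape
(batchSize, outputFeatureSize) seeds every output element with its own value,
while a 1D bias is broadcast across the batch.

    #include <algorithm>
    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Seed the output buffer with beta * bias before the matmul accumulates into it.
    void seedWithBias(std::size_t batch, std::size_t outF, bool biasBatched,
                      float beta, const std::vector<float>& bias,
                      std::vector<float>& out) {
        if (biasBatched) {
            // (batch, outF) bias: one pass over the whole buffer.
            std::transform(bias.begin(), bias.end(), out.begin(),
                           [beta](float v) { return beta * v; });
        } else {
            // 1D bias: replicate the same outF values for every batch row.
            for (std::size_t n = 0; n < batch; ++n) {
                std::transform(bias.begin(), bias.begin() + outF,
                               out.begin() + n * outF,
                               [beta](float v) { return beta * v; });
            }
        }
    }

    int main() {
        std::vector<float> bias1d{1.f, 2.f}, out(4);
        seedWithBias(2, 2, /*biasBatched=*/false, /*beta=*/0.5f, bias1d, out);
        std::cout << out[0] << ' ' << out[1] << ' '
                  << out[2] << ' ' << out[3] << '\n';  // prints: 0.5 1 0.5 1
    }
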
diff --git a/include/aidge/backend/cpu/operator/FCImpl.hpp b/include/aidge/backend/cpu/operator/FCImpl.hpp
index 4daa522f..b6e0d099 100644
--- a/include/aidge/backend/cpu/operator/FCImpl.hpp
+++ b/include/aidge/backend/cpu/operator/FCImpl.hpp
@@ -27,6 +27,7 @@ using FCImpl_cpu = OperatorImpl_cpu<FC_Op,
     void(const DimSize_t,
         const DimSize_t,
         const DimSize_t,
+        const bool,
         const float,
         const float,
         const void *,
@@ -36,6 +37,7 @@ using FCImpl_cpu = OperatorImpl_cpu<FC_Op,
     void(const DimSize_t,
         const DimSize_t,
         const DimSize_t,
+        const bool,
         const float,
         const float,
         const void *,
diff --git a/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
index a1624bb5..7635fca3 100644
--- a/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
@@ -18,74 +18,11 @@
 #include "aidge/utils/Registrar.hpp"
 
 namespace Aidge {
-// template <class I, class W, class B, class O>
-// void FCImpl_cpu_forward_kernel(const FC_Op::Attrs& attrs, const std::array<DimSize_t, 4>& dims,
-//                                    const void* input_, const void* weights_, const void* biases_, void* output_) {
-//     // FIXME: missing FC attributes as arguments
-//     const I* input = static_cast<const I*>(input_);
-//     const W* weights = static_cast<const W*>(weights_);
-//     const B* biases = static_cast<const B*>(biases_);
-//     O* output = static_cast<O*>(output_);
-
-//     for (std::size_t outIdx = 0; outIdx < outputFeatureSize; ++outIdx) {
-//         std::size_t oIndex = outIdx * dims[3];
-//         const B bias = std::get<0>(attrs) ? B(0) : biases[outIdx];
-//         for (std::size_t batch = 0; batch < dims[3]; ++batch) {
-//             output[oIndex + batch] = bias;
-//         }
-//     }
-
-//     for (std::size_t ix = 0; ix < dims[0]; ++ix) {
-//         for (std::size_t iy = 0; iy < dims[1]; ++iy) {
-//             for (std::size_t inCh = 0; inCh < dims[2]; ++inCh) {
-//                 const std::size_t iIndex = dims[3] * (inCh + dims[2] * (iy + dims[1] * ix));
-//                 for (std::size_t outCh = 0; outCh < outputFeatureSize; ++outCh) {
-//                     const std::size_t oIndex = dims[3] * outCh;
-//                     const std::size_t wIndex = (inCh + dims[2] * (iy + dims[1] * ix)) * outputFeatureSize +
-//                                           outCh;  // (iIndex*outputFeatureSize + oIndex)/dims[3];
-//                     for (std::size_t batch = 0; batch < dims[3]; ++batch) {
-//                         output[oIndex + batch] += weights[wIndex] * input[iIndex + batch];
-//                     }
-//                 }
-//             }
-//         }
-//     }
-// }
-
-// template <class I, class W, class B, class O>
-// void FCImpl_cpu_forward_kernel(const FC_Op::Attrs& attrs, const std::array<DimSize_t, 2>& dims,
-//                                    const void* input_, const void* weights_, const void* biases_, void* output_) {
-//     // FIXME: missing FC attributes as arguments
-//     const I* input = static_cast<const I*>(input_);
-//     const W* weights = static_cast<const W*>(weights_);
-//     const B* biases = static_cast<const B*>(biases_);
-//     O* output = static_cast<O*>(output_);
-
-//     // let's have I.dims() = [N, C, H, W] instead of [H, W, C, N]
-
-//     for (std::size_t outIdx = 0; outIdx < outputFeatureSize; ++outIdx) {
-//         std::size_t oIndex = outIdx * dims[0];
-//         const B bias = std::get<0>(attrs) ? B(0) : biases[outIdx];
-//         for (std::size_t batch = 0; batch < dims[0]; ++batch) {
-//             output[oIndex + batch] = bias;
-//         }
-//     }
-
-//     for (std::size_t batch = 0; batch < dims[0]; ++batch) {
-//         const std::size_t oIndex = dims[1] * batch;
-//         for (std::size_t i = 0; i < dims[1]; ++i) {
-//             for (std::size_t outCh = 0; outCh < outputFeatureSize; ++outCh) {
-//                 std::size_t wIndex = i * outputFeatureSize + outCh;  // (iIndex*outputFeatureSize + oIndex)/dims[3];
-//                 output[oIndex + outCh] += weights[wIndex] * input[i + batch];
-//             }
-//         }
-//     }
-// }
-
 template <class I, class W, class B, class O>
 void FCImpl_cpu_forward_kernel(const DimSize_t batchSize,
                             const DimSize_t inputFeatureSize,
                             const DimSize_t outputFeatureSize,
+                            const bool isBiasBatched,
                             const float alpha_,
                             const float beta_,
                             const void* input_,
@@ -104,12 +41,20 @@ void FCImpl_cpu_forward_kernel(const DimSize_t batchSize,
     if (biases == nullptr) {
         std::fill(output, output + (batchSize * outputFeatureSize), O(0));
     } else { // Initialize output with bias * beta
-        for (std::size_t batch = 0; batch < batchSize; ++batch) {
+        if (isBiasBatched) {
             std::transform(
-                biases, biases + outputFeatureSize, output + batch * outputFeatureSize,
+                biases, biases + batchSize * outputFeatureSize, output,
                 [beta](const B& bias) { return beta * static_cast<O>(bias); }
             );
         }
+        else {   // Bias 1D
+            for (std::size_t batch = 0; batch < batchSize; ++batch) {
+                std::transform(
+                    biases, biases + outputFeatureSize, output + batch * outputFeatureSize,
+                    [beta](const B& bias) { return beta * static_cast<O>(bias); }
+                );
+            }
+        }
     }
 
     // Perform matrix-vector multiplication with alpha scaling
@@ -129,6 +74,7 @@ template <class I, class O, class W, class B>
 void FCImpl_cpu_backward_kernel(const DimSize_t batchSize,
                                 const DimSize_t inputFeatureSize,
                                 const DimSize_t outputFeatureSize,
+                                const bool isBiasBatched,
                                 const float alpha_,
                                 const float beta_,
                                 const void* input_,
@@ -150,16 +96,23 @@ void FCImpl_cpu_backward_kernel(const DimSize_t batchSize,
     const O alpha = static_cast<O>(alpha_);
     const O beta  = static_cast<O>(beta_);
 
+    const DimSize_t biasSize = isBiasBatched ? batchSize * outputFeatureSize : outputFeatureSize;
     // bias grad
     if (biasesGrad == nullptr) { // no bias
-        std::fill(biasesGrad, biasesGrad + outputFeatureSize, B(0));
+        std::fill(biasesGrad, biasesGrad + biasSize, B(0));
     } else {
-        for (std::size_t o = 0; o < outputFeatureSize; ++o) {
-            B sum{0};
-            for (std::size_t b = 0; b < batchSize; ++b) {
-                sum += input[b * outputFeatureSize + o];
+        if (isBiasBatched) {
+            for (std::size_t o = 0; o < biasSize; ++o) {
+                biasesGrad[o] = beta * input[o];
+            }
+        } else { // BiasGrad 1D
+            for (std::size_t o = 0; o < outputFeatureSize; ++o) {
+                B sum{0};
+                for (std::size_t b = 0; b < batchSize; ++b) {
+                    sum += input[b * outputFeatureSize + o];
+                }
+                biasesGrad[o] = beta * sum;
             }
-            biasesGrad[o] = beta * biasesGrad[o] + alpha * sum;
         }
     }
 
@@ -170,7 +123,7 @@ void FCImpl_cpu_backward_kernel(const DimSize_t batchSize,
             for (std::size_t b = 0; b < batchSize; ++b) {
                 sum += originalInput[b * inputFeatureSize + c] * input[b * outputFeatureSize + o];
             }
-            weightGrad[o * inputFeatureSize + c] = beta * weightGrad[o * inputFeatureSize + c] + alpha * sum;
+            weightGrad[o * inputFeatureSize + c] = alpha * sum;
         }
     }
 
@@ -181,7 +134,7 @@ void FCImpl_cpu_backward_kernel(const DimSize_t batchSize,
             for (std::size_t o = 0; o < outputFeatureSize; ++o) {
                 sum += weight[o * inputFeatureSize + c] * input[b * outputFeatureSize + o];
             }
-            output[b * inputFeatureSize + c] = alpha * sum; // Apply alpha (no accumulation needed)
+            output[b * inputFeatureSize + c] = alpha * sum;
         }
     }
 }
diff --git a/src/operator/FCImpl.cpp b/src/operator/FCImpl.cpp
index 144cb1cd..4c1b6861 100644
--- a/src/operator/FCImpl.cpp
+++ b/src/operator/FCImpl.cpp
@@ -40,12 +40,13 @@ void Aidge::FCImpl_cpu::forward()
     const auto& input0 = op_.getInput(0)->refCastFrom(input0Fallback, *(op_.getOutput(0)));
     const auto& input1 = op_.getInput(1)->refCastFrom(input1Fallback, *(op_.getOutput(0)));
     const auto& input2 = (op_.getInput(2)) ? op_.getInput(2)->refCastFrom(input2Fallback, *(op_.getOutput(0))) : Tensor();
-
+    const bool isBiasBatched = input2.nbDims() == 2;
     // Call kernel
     const auto batchSize = (input0.dims().size() > 1) ? input0.dims()[0] : 1;
     impl.forward(batchSize,
         input1.dims()[1], // nb input features
         input1.dims()[0], // nb output features
+        isBiasBatched,
         op_.alpha(),
         op_.beta(),
         input0.getImpl()->rawPtr(),
@@ -73,12 +74,13 @@ void Aidge::FCImpl_cpu::backward()
     const auto& input0grad = op_.getInput(0)->grad()->refCastFrom(input0gradFallback, *(op_.getOutput(0)));
     const auto& input1grad = op_.getInput(1)->grad()->refCastFrom(input1gradFallback, *(op_.getOutput(0)));
     const auto& input2grad = (op_.getInput(2)) ? op_.getInput(2)->grad()->refCastFrom(input2gradFallback, *(op_.getOutput(0))) : Tensor();
-
+    const bool isBiasBatched = input2grad.nbDims() == 2;
     // Call kernel
     const auto batchSize = (input0grad.dims().size() > 1) ? input0grad.dims()[0] : 1;
     impl.backward(batchSize,
         input1grad.dims()[1], // nb input features
         input1grad.dims()[0], // nb output features
+        isBiasBatched,
         op_.alpha(),
         op_.beta(),
         getCPUPtr(fc_grad),
diff --git a/unit_tests/operator/Test_FCImpl.cpp b/unit_tests/operator/Test_FCImpl.cpp
index 41f95abe..324a8a18 100644
--- a/unit_tests/operator/Test_FCImpl.cpp
+++ b/unit_tests/operator/Test_FCImpl.cpp
@@ -113,7 +113,7 @@ TEST_CASE("[cpu/operator] FC(forward)", "[FC][CPU]") {
 
 
 TEST_CASE("[cpu/oeprator] FC(backward)", "[FC][CPU]") {
-
+  SECTION("2D Input 1D Bias"){
     std::shared_ptr<Tensor> myInput =
             std::make_shared<Tensor>(Array2D<float, 4, 3>{
                               {
@@ -171,4 +171,69 @@ TEST_CASE("[cpu/oeprator] FC(backward)", "[FC][CPU]") {
     REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedInputGrad));
     REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedWeightsGrad));
     REQUIRE(approxEq<float>(*(op->getInput(2)->grad()), *expectedBiasGrad));
+  }
+  SECTION("2D Input 2D Bias") {
+    std::shared_ptr<Tensor> myInput =
+            std::make_shared<Tensor>(Array2D<float, 4, 3>{
+						{
+						{1.2137502, -0.73264742, -0.64570695},
+						{0.85416597, -1.361271, -0.48093504},
+						{-1.2204953, -1.400124, 0.14018863},
+						{-1.1383502, -0.47965094, -0.64090657}
+			}});
+    std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array2D<float, 2, 3>{
+						{{0.12352414, -0.036496852, -0.15418212},
+						{-1.0330718, 0.011371522, -0.60658616}}});
+    std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array2D<float, 4, 2>{
+						{{-0.7607078, 0.54936022},
+						{-0.31278628, 0.68560582},
+						{0.78312093, 0.96373892},
+						{-1.2183768, -0.4587383}}});
+
+    std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float, 4, 2>{
+						{{0.17209265, -1.4664109},
+						{0.30229449, -0.86952901},
+						{0.14901026, 2.8016717},
+						{-0.65777171, 2.8892474}}});
+
+    std::shared_ptr<Node> myFC = FC(3, 2, false, "myfc", 2.0f, 0.5f);
+    auto op = std::static_pointer_cast<OperatorTensor>(myFC -> getOperator());
+    op -> associateInput(0, myInput);
+    op -> associateInput(1, myWeights);
+    op -> associateInput(2, myBias);
+    op -> setDataType(DataType::Float32);
+    op -> setBackend("cpu");
+    myFC->forward();
+    op->getOutput(0)->print();
+    REQUIRE(approxEq<float>(*(op->getOutput(0)), *myOutput));
+
+    // Backward
+    std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array2D<float, 4, 2>{
+						{
+						{0.19126743, -0.85291833},
+						{0.94577849, -1.0063207},
+						{-0.60322332, 0.65167785},
+						{-0.038923461, 0.94386256}
+						}});
+    std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array2D<float, 4, 3>{
+                              {
+							{1.8095039, -0.033359278, 0.97575688},
+							{2.312856, -0.091922671, 0.92919618},
+							{-1.4954853, 0.058852643, -0.60458499},
+							{-1.9597715, 0.024307476, -1.1330653}
+                              }});
+    std::shared_ptr<Tensor> expectedWeightsGrad = std::make_shared<Tensor>(Array2D<float, 2, 3>{
+							{{3.6410849, -1.1286706, -1.275959},
+							{-7.5292215, 1.2592185, 1.0422806}}});
+    std::shared_ptr<Tensor> expectedBiasGrad = std::make_shared<Tensor>(Array2D<float, 4, 2>{{{0.095633715, -0.42645916},
+							{0.47288924, -0.50316036},
+							{-0.30161166, 0.32583892},
+							{-0.01946173, 0.47193128 }}});
+
+    op->getOutput(0)->setGrad(myOutputGrad);
+    myFC->backward();
+    REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedInputGrad));
+    REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedWeightsGrad));
+    REQUIRE(approxEq<float>(*(op->getInput(2)->grad()), *expectedBiasGrad));
+  }
 }
\ No newline at end of file
-- 
GitLab


From a59730f1c82228864ff013c0f24b29621ccba54d Mon Sep 17 00:00:00 2001
From: hrouis <houssemeddine.rouis92@gmail.com>
Date: Tue, 11 Feb 2025 18:27:37 +0100
Subject: [PATCH 3/8] add transA and transB for FC

---
 include/aidge/backend/cpu/operator/FCImpl.hpp |   4 +
 .../backend/cpu/operator/FCImpl_kernels.hpp   |  89 +++--
 src/operator/FCImpl.cpp                       |  18 +-
 unit_tests/operator/Test_FCImpl.cpp           | 362 ++++++++++++------
 4 files changed, 304 insertions(+), 169 deletions(-)

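A self-contained sketch of the index remapping transA/transB add (illustrative
names, not the Aidge kernel): with transA the input is stored
(inputFeatureSize, batchSize) instead of (batchSize, inputFeatureSize), and with
transB the weights are stored (inputFeatureSize, outputFeatureSize) instead of
(outputFeatureSize, inputFeatureSize).

    #include <cstddef>
    #include <iostream>
    #include <vector>

    // dotRow picks the right flat index for either storage order and returns
    // the input-row / weight-row dot product used by the forward pass.
    float dotRow(bool transA, bool transB, std::size_t n, std::size_t o,
                 std::size_t batch, std::size_t inF, std::size_t outF,
                 const std::vector<float>& x, const std::vector<float>& w) {
        float sum = 0.f;
        for (std::size_t i = 0; i < inF; ++i) {
            const std::size_t xi = transA ? i * batch + n : n * inF + i;
            const std::size_t wi = transB ? i * outF + o : o * inF + i;
            sum += x[xi] * w[wi];
        }
        return sum;
    }

    int main() {
        // x is (2, 3) row-major; xT is its (3, 2) transpose. w is (2, 3); wT is (3, 2).
        std::vector<float> x{1, 2, 3, 4, 5, 6}, xT{1, 4, 2, 5, 3, 6};
        std::vector<float> w{1, 0, 1, 0, 1, 0}, wT{1, 0, 0, 1, 1, 0};
        // Both layouts must yield the same value for batch row 1, output 0: 4 + 6 = 10.
        std::cout << dotRow(false, false, 1, 0, 2, 3, 2, x, w) << ' '
                  << dotRow(true, true, 1, 0, 2, 3, 2, xT, wT) << '\n';
    }
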
diff --git a/include/aidge/backend/cpu/operator/FCImpl.hpp b/include/aidge/backend/cpu/operator/FCImpl.hpp
index b6e0d099..b01c220f 100644
--- a/include/aidge/backend/cpu/operator/FCImpl.hpp
+++ b/include/aidge/backend/cpu/operator/FCImpl.hpp
@@ -28,6 +28,8 @@ using FCImpl_cpu = OperatorImpl_cpu<FC_Op,
         const DimSize_t,
         const DimSize_t,
         const bool,
+        const bool,
+        const bool,
         const float,
         const float,
         const void *,
@@ -38,6 +40,8 @@ using FCImpl_cpu = OperatorImpl_cpu<FC_Op,
         const DimSize_t,
         const DimSize_t,
         const bool,
+        const bool,
+        const bool,
         const float,
         const float,
         const void *,
diff --git a/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
index 7635fca3..e928e9ea 100644
--- a/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
@@ -23,13 +23,14 @@ void FCImpl_cpu_forward_kernel(const DimSize_t batchSize,
                             const DimSize_t inputFeatureSize,
                             const DimSize_t outputFeatureSize,
                             const bool isBiasBatched,
+                            const bool transA,
+                            const bool transB,
                             const float alpha_,
                             const float beta_,
                             const void* input_,
                             const void* weights_,
                             const void* biases_,
                             void* output_) {
-    // FIXME: missing FC attributes as arguments
     const I* input = static_cast<const I*>(input_);
     const W* weights = static_cast<const W*>(weights_);
     const B* biases = static_cast<const B*>(biases_);
@@ -40,14 +41,13 @@ void FCImpl_cpu_forward_kernel(const DimSize_t batchSize,
 
     if (biases == nullptr) {
         std::fill(output, output + (batchSize * outputFeatureSize), O(0));
-    } else { // Initialize output with bias * beta
-        if (isBiasBatched) {
+    } else { 
+        if (isBiasBatched) { // Bias is (batchSize, outputFeatureSize)
             std::transform(
                 biases, biases + batchSize * outputFeatureSize, output,
                 [beta](const B& bias) { return beta * static_cast<O>(bias); }
             );
-        }
-        else {   // Bias 1D
+        } else { // Bias is 1D (outputFeatureSize)
             for (std::size_t batch = 0; batch < batchSize; ++batch) {
                 std::transform(
                     biases, biases + outputFeatureSize, output + batch * outputFeatureSize,
@@ -57,17 +57,18 @@ void FCImpl_cpu_forward_kernel(const DimSize_t batchSize,
         }
     }
 
-    // Perform matrix-vector multiplication with alpha scaling
     for (std::size_t batch = 0; batch < batchSize; ++batch) {
         for (std::size_t out = 0; out < outputFeatureSize; ++out) {
-            output[out + batch * outputFeatureSize] += alpha * std::inner_product(
-                input + batch * inputFeatureSize,
-                input + (batch + 1) * inputFeatureSize,
-                weights + out * inputFeatureSize,
-                O(0) // Initialize accumulator to zero
-            );
+            O sum = O(0);
+            for (std::size_t i = 0; i < inputFeatureSize; ++i) {
+                std::size_t inputIdx = transA ? (i * batchSize + batch) : (batch * inputFeatureSize + i);
+                std::size_t weightIdx = transB ? (i * outputFeatureSize + out) : (out * inputFeatureSize + i);
+                sum += static_cast<O>(input[inputIdx]) * static_cast<O>(weights[weightIdx]);
+            }
+            output[batch * outputFeatureSize + out] += alpha * sum;
         }
     }
+
 }
 
 template <class I, class O, class W, class B>
@@ -75,6 +76,8 @@ void FCImpl_cpu_backward_kernel(const DimSize_t batchSize,
                                 const DimSize_t inputFeatureSize,
                                 const DimSize_t outputFeatureSize,
                                 const bool isBiasBatched,
+                                const bool transA,
+                                const bool transB,
                                 const float alpha_,
                                 const float beta_,
                                 const void* input_,
@@ -85,58 +88,64 @@ void FCImpl_cpu_backward_kernel(const DimSize_t batchSize,
                                 void* biasesGrad_)
 {
     // FIXME: missing FC attributes as arguments
-    const I* input  = static_cast<const I*>(input_);
-    const I* originalInput  = static_cast<const I*>(originalInput_);
-    const W* weight = static_cast<const W*>(weight_);
-    O* output       = static_cast<O*>(output_);
-    W* weightGrad   = static_cast<W*>(weightGrad_);
-    B* biasesGrad   = static_cast<B*>(biasesGrad_);
-
-    // Coefficients
+    const I* outputGrad = static_cast<const I*>(input_); // dY
+    const I* originalInput = static_cast<const I*>(originalInput_); // X (Input in forward pass)
+    const W* weight = static_cast<const W*>(weight_); // W
+    O* inputGrad = static_cast<O*>(output_); // dX
+    W* weightGrad = static_cast<W*>(weightGrad_); // dW
+    B* biasesGrad = static_cast<B*>(biasesGrad_); // dB
+
     const O alpha = static_cast<O>(alpha_);
-    const O beta  = static_cast<O>(beta_);
-
-    const DimSize_t biasSize = isBiasBatched ? batchSize * outputFeatureSize : outputFeatureSize;
-    // bias grad
-    if (biasesGrad == nullptr) { // no bias
-        std::fill(biasesGrad, biasesGrad + biasSize, B(0));
-    } else {
-        if (isBiasBatched) {
-            for (std::size_t o = 0; o < biasSize; ++o) {
-                biasesGrad[o] = beta * input[o];
+    const O beta = static_cast<O>(beta_);
+
+    // Compute bias gradient: dB = beta * dY (reduced over the batch for a 1D bias)
+    if (biasesGrad != nullptr) {
+        if (isBiasBatched) { // Bias is (batchSize, outputFeatureSize)
+            for (std::size_t b = 0; b < batchSize; ++b) {
+                for (std::size_t o = 0; o < outputFeatureSize; ++o) {
+                    biasesGrad[b * outputFeatureSize + o] = beta * outputGrad[b * outputFeatureSize + o];
+                }
             }
-        } else { // BiasGrad 1D
+        } else { // Bias is 1D (outputFeatureSize)
             for (std::size_t o = 0; o < outputFeatureSize; ++o) {
-                B sum{0};
+                O sum{0};
                 for (std::size_t b = 0; b < batchSize; ++b) {
-                    sum += input[b * outputFeatureSize + o];
+                    sum += outputGrad[b * outputFeatureSize + o];
                 }
                 biasesGrad[o] = beta * sum;
             }
         }
     }
 
-    // weight grad
+    // Compute weight gradient: dW = alpha * (dY^T * X)
     for (std::size_t o = 0; o < outputFeatureSize; ++o) {
         for (std::size_t c = 0; c < inputFeatureSize; ++c) {
-            W sum{0};
+            O sum{0};
             for (std::size_t b = 0; b < batchSize; ++b) {
-                sum += originalInput[b * inputFeatureSize + c] * input[b * outputFeatureSize + o];
+                std::size_t inputIdx = transA ? (c * batchSize + b) : (b * inputFeatureSize + c);
+                std::size_t outputIdx = b * outputFeatureSize + o;
+                sum += originalInput[inputIdx] * outputGrad[outputIdx];
             }
-            weightGrad[o * inputFeatureSize + c] = alpha * sum;
+            std::size_t weightIdx = transB ? (c * outputFeatureSize + o) : (o * inputFeatureSize + c);
+            weightGrad[weightIdx] = alpha * sum;
         }
     }
 
-    // Input gradient (output)
+
+    // Compute input gradient: dX = alpha * (dY * W), with W in its (outF, inF) layout
     for (std::size_t b = 0; b < batchSize; ++b) {
         for (std::size_t c = 0; c < inputFeatureSize; ++c) {
             O sum{0};
             for (std::size_t o = 0; o < outputFeatureSize; ++o) {
-                sum += weight[o * inputFeatureSize + c] * input[b * outputFeatureSize + o];
+                std::size_t weightIdx = transB ? (c * outputFeatureSize + o) : (o * inputFeatureSize + c);
+                std::size_t outputIdx = b * outputFeatureSize + o;
+                sum += weight[weightIdx] * outputGrad[outputIdx];
             }
-            output[b * inputFeatureSize + c] = alpha * sum;
+            std::size_t inputIdx = transA ? (c * batchSize + b) : (b * inputFeatureSize + c);
+            inputGrad[inputIdx] = alpha * sum;
         }
     }
+
 }
 
 // Kernels registration to implementation entry point
diff --git a/src/operator/FCImpl.cpp b/src/operator/FCImpl.cpp
index 4c1b6861..2c56c97d 100644
--- a/src/operator/FCImpl.cpp
+++ b/src/operator/FCImpl.cpp
@@ -42,11 +42,14 @@ void Aidge::FCImpl_cpu::forward()
     const auto& input2 = (op_.getInput(2)) ? op_.getInput(2)->refCastFrom(input2Fallback, *(op_.getOutput(0))) : Tensor();
     const bool isBiasBatched = input2.nbDims() == 2;
     // Call kernel
-    const auto batchSize = (input0.dims().size() > 1) ? input0.dims()[0] : 1;
+    const DimSize_t nbInFeat = op_.transB() ? input1.dims()[0] : input1.dims()[1];
+    const auto batchSize = input0.size() / nbInFeat;
     impl.forward(batchSize,
-        input1.dims()[1], // nb input features
-        input1.dims()[0], // nb output features
+        nbInFeat, // nb input features
+        op_.transB() ? input1.dims()[1] : input1.dims()[0], // nb output features
         isBiasBatched,
+        op_.transA(),
+        op_.transB(),
         op_.alpha(),
         op_.beta(),
         input0.getImpl()->rawPtr(),
@@ -76,11 +79,14 @@ void Aidge::FCImpl_cpu::backward()
     const auto& input2grad = (op_.getInput(2)) ? op_.getInput(2)->grad()->refCastFrom(input2gradFallback, *(op_.getOutput(0))) : Tensor();
     const bool isBiasBatched = input2grad.nbDims() == 2;
     // Call kernel
-    const auto batchSize = (input0grad.dims().size() > 1) ? input0grad.dims()[0] : 1;
+    const DimSize_t nbInFeat = op_.transB() ? input1grad.dims()[0] : input1grad.dims()[1];
+    const auto batchSize = input0grad.size() / nbInFeat;
     impl.backward(batchSize,
-        input1grad.dims()[1], // nb input features
-        input1grad.dims()[0], // nb output features
+        nbInFeat, // nb input features
+        op_.transB() ? input1grad.dims()[1] : input1grad.dims()[0], // nb output features
         isBiasBatched,
+        op_.transA(),
+        op_.transB(),
         op_.alpha(),
         op_.beta(),
         getCPUPtr(fc_grad),
diff --git a/unit_tests/operator/Test_FCImpl.cpp b/unit_tests/operator/Test_FCImpl.cpp
index 324a8a18..0c6cfa00 100644
--- a/unit_tests/operator/Test_FCImpl.cpp
+++ b/unit_tests/operator/Test_FCImpl.cpp
@@ -10,7 +10,6 @@
  ********************************************************************************/
 
 #include <memory>
-#include <iostream>
 
 #include <catch2/catch_test_macros.hpp>
 
@@ -108,132 +107,249 @@ TEST_CASE("[cpu/operator] FC(forward)", "[FC][CPU]") {
         REQUIRE(*(op->getOutput(0)) == myOutput);
     }
 
+    SECTION("transA and transB") {
+        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float, 3, 4>{{
+                {1.2137502, 0.85416597, -1.2204953, -1.1383502},
+                {-0.73264742, -1.361271, -1.400124, -0.47965094},
+                {-0.64570695, -0.48093504, 0.14018863, -0.64090657}
+        }}); // Transposed Input (4x3 → 3x4)
+
+        std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array2D<float, 3, 2>{{
+                {0.12352414, -1.0330718},
+                {-0.036496852, 0.011371522},
+                {-0.15418212, -0.60658616}
+        }}); // Transposed Weights (2x3 → 3x2)
+
+        std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array2D<float, 4, 2>{{
+                {-0.7607078, 0.54936022},
+                {-0.31278628, 0.68560582},
+                {0.78312093, 0.96373892},
+                {-1.2183768, -0.4587383}
+        }}); // Bias remains the same (4x2)
+
+        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float, 4, 2>{{
+                {0.17209265, -1.4664109},
+                {0.30229449, -0.86952901},
+                {0.14901026, 2.8016717},
+                {-0.65777171, 2.8892474}
+        }}); // Expected Output
+
+        std::shared_ptr<Node> myFC = FC(3, 2, false, "myfc", 2.0f, 0.5f, true, true); // transA = true, transB = true
+
+        auto op = std::static_pointer_cast<OperatorTensor>(myFC->getOperator());
+        op->associateInput(0, myInput);
+        op->associateInput(1, myWeights);
+        op->associateInput(2, myBias);
+        op->setDataType(DataType::Float32);
+        op->setBackend("cpu");
+
+        myFC->forward();
+        op->getOutput(0)->print();
+
+        REQUIRE(approxEq<float>(*(op->getOutput(0)), *myOutput));
+    }
     // std::cout << static_cast<Tensor>((*myFC->getOperator())["weight"])[0][0][0][0] << std::endl;
 }
 
 
 TEST_CASE("[cpu/oeprator] FC(backward)", "[FC][CPU]") {
-  SECTION("2D Input 1D Bias"){
-    std::shared_ptr<Tensor> myInput =
-            std::make_shared<Tensor>(Array2D<float, 4, 3>{
-                              {
-                              {0.55043954, -0.080161572, 0.18495631},
-                              {-0.82497174, -0.95155114, 0.25449812},
-                              {1.6508394, 0.2518357, -0.49999624},
-                              {0.82770473, 0.28659272, -0.11644308}
-                              }});
-    std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array2D<float, 2, 3>{
-            {{0.044322353, 1.9578923, -1.96035},
-            {-1.1458585, -0.8235659, 0.24195994}}});
-    std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float, 2>{{1.5327742,  0.90154403}});
-
-    std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float, 4, 2>{
-            {{1.0376441, 0.38158852},
-            {-0.86573052, 2.6920884},
-            {3.0791781, -1.3184667},
-            {2.3588469, -0.31109101}}});
-
-    std::shared_ptr<Node> myFC = FC(3, 2, false, "myfc");
-    auto op = std::static_pointer_cast<OperatorTensor>(myFC -> getOperator());
-    op -> associateInput(0, myInput);
-    op -> associateInput(1, myWeights);
-    op -> associateInput(2, myBias);
-    op -> setDataType(DataType::Float32);
-    op -> setBackend("cpu");
-    myFC->forward();
-    op->getOutput(0)->print();
-    REQUIRE(approxEq<float>(*(op->getOutput(0)), *myOutput));
-
-    // Backward
-    std::shared_ptr<Tensor> myOutputGrad =
-            std::make_shared<Tensor>(Array2D<float, 4, 2>{
-                              {
-                              {1.373911, -1.2312084},
-                              {0.24750818, -0.71446633},
-                              {-1.5132738, -0.23136522},
-                              {0.20452768, -1.2200259}
-                              }});
-    std::shared_ptr<Tensor> expectedInputGrad =
-            std::make_shared<Tensor>(Array2D<float, 4, 3>{
-                              {
-                              {1.4716856, 3.7039511, -2.9912496},
-                              {0.82964748, 1.0730045, -0.65807492},
-                              {0.19803995, -2.7722826, 2.9105654},
-                              {1.4070423, 1.4052149, -0.69614327}
-                              }});
-    std::shared_ptr<Tensor> expectedWeightsGrad = std::make_shared<Tensor>(Array2D<float, 2, 3>{
-            {{-1.7768159, -0.66813177, 1.0499192},
-              {-1.4800593, 0.37063029, -0.15180479}}});
-    std::shared_ptr<Tensor> expectedBiasGrad = std::make_shared<Tensor>(Array1D<float, 2>{{0.31267303, -3.397066 }});
-
-    op->getOutput(0)->setGrad(myOutputGrad);
-    myFC->backward();
-    REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedInputGrad));
-    REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedWeightsGrad));
-    REQUIRE(approxEq<float>(*(op->getInput(2)->grad()), *expectedBiasGrad));
-  }
-  SECTION("2D Input 2D Bias") {
-    std::shared_ptr<Tensor> myInput =
-            std::make_shared<Tensor>(Array2D<float, 4, 3>{
-						{
-						{1.2137502, -0.73264742, -0.64570695},
-						{0.85416597, -1.361271, -0.48093504},
-						{-1.2204953, -1.400124, 0.14018863},
-						{-1.1383502, -0.47965094, -0.64090657}
+	SECTION("2D Input 1D Bias"){
+		std::shared_ptr<Tensor> myInput =
+				std::make_shared<Tensor>(Array2D<float, 4, 3>{
+								{
+								{0.55043954, -0.080161572, 0.18495631},
+								{-0.82497174, -0.95155114, 0.25449812},
+								{1.6508394, 0.2518357, -0.49999624},
+								{0.82770473, 0.28659272, -0.11644308}
+								}});
+		std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array2D<float, 2, 3>{
+				{{0.044322353, 1.9578923, -1.96035},
+				{-1.1458585, -0.8235659, 0.24195994}}});
+		std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float, 2>{{1.5327742,  0.90154403}});
+
+		std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float, 4, 2>{
+				{{1.0376441, 0.38158852},
+				{-0.86573052, 2.6920884},
+				{3.0791781, -1.3184667},
+				{2.3588469, -0.31109101}}});
+
+		std::shared_ptr<Node> myFC = FC(3, 2, false, "myfc");
+		auto op = std::static_pointer_cast<OperatorTensor>(myFC -> getOperator());
+		op -> associateInput(0, myInput);
+		op -> associateInput(1, myWeights);
+		op -> associateInput(2, myBias);
+		op -> setDataType(DataType::Float32);
+		op -> setBackend("cpu");
+		myFC->forward();
+		op->getOutput(0)->print();
+		REQUIRE(approxEq<float>(*(op->getOutput(0)), *myOutput));
+
+		// Backward
+		std::shared_ptr<Tensor> myOutputGrad =
+				std::make_shared<Tensor>(Array2D<float, 4, 2>{
+								{
+								{1.373911, -1.2312084},
+								{0.24750818, -0.71446633},
+								{-1.5132738, -0.23136522},
+								{0.20452768, -1.2200259}
+								}});
+		std::shared_ptr<Tensor> expectedInputGrad =
+				std::make_shared<Tensor>(Array2D<float, 4, 3>{
+								{
+								{1.4716856, 3.7039511, -2.9912496},
+								{0.82964748, 1.0730045, -0.65807492},
+								{0.19803995, -2.7722826, 2.9105654},
+								{1.4070423, 1.4052149, -0.69614327}
+								}});
+		std::shared_ptr<Tensor> expectedWeightsGrad = std::make_shared<Tensor>(Array2D<float, 2, 3>{
+				{{-1.7768159, -0.66813177, 1.0499192},
+				{-1.4800593, 0.37063029, -0.15180479}}});
+		std::shared_ptr<Tensor> expectedBiasGrad = std::make_shared<Tensor>(Array1D<float, 2>{{0.31267303, -3.397066 }});
+
+		op->getOutput(0)->setGrad(myOutputGrad);
+		myFC->backward();
+		REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedInputGrad));
+		REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedWeightsGrad));
+		REQUIRE(approxEq<float>(*(op->getInput(2)->grad()), *expectedBiasGrad));
+	}
+	SECTION("2D Input 2D Bias"){
+		std::shared_ptr<Tensor> myInput =
+				std::make_shared<Tensor>(Array2D<float, 4, 3>{
+							{
+							{1.2137502, -0.73264742, -0.64570695},
+							{0.85416597, -1.361271, -0.48093504},
+							{-1.2204953, -1.400124, 0.14018863},
+							{-1.1383502, -0.47965094, -0.64090657}
+				}});
+		std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array2D<float, 2, 3>{
+							{{0.12352414, -0.036496852, -0.15418212},
+							{-1.0330718, 0.011371522, -0.60658616}}});
+		std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array2D<float, 4, 2>{
+							{{-0.7607078, 0.54936022},
+							{-0.31278628, 0.68560582},
+							{0.78312093, 0.96373892},
+							{-1.2183768, -0.4587383}}});
+
+		std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float, 4, 2>{
+							{{0.17209265, -1.4664109},
+							{0.30229449, -0.86952901},
+							{0.14901026, 2.8016717},
+							{-0.65777171, 2.8892474}}});
+
+		std::shared_ptr<Node> myFC = FC(3, 2, false, "myfc", 2.0f, 0.5f);
+		auto op = std::static_pointer_cast<OperatorTensor>(myFC -> getOperator());
+		op -> associateInput(0, myInput);
+		op -> associateInput(1, myWeights);
+		op -> associateInput(2, myBias);
+		op -> setDataType(DataType::Float32);
+		op -> setBackend("cpu");
+		myFC->forward();
+		op->getOutput(0)->print();
+		REQUIRE(approxEq<float>(*(op->getOutput(0)), *myOutput));
+
+		// Backward
+		std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array2D<float, 4, 2>{
+							{
+							{0.19126743, -0.85291833},
+							{0.94577849, -1.0063207},
+							{-0.60322332, 0.65167785},
+							{-0.038923461, 0.94386256}
+							}});
+		std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array2D<float, 4, 3>{
+								{
+								{1.8095039, -0.033359278, 0.97575688},
+								{2.312856, -0.091922671, 0.92919618},
+								{-1.4954853, 0.058852643, -0.60458499},
+								{-1.9597715, 0.024307476, -1.1330653}
+								}});
+		std::shared_ptr<Tensor> expectedWeightsGrad = std::make_shared<Tensor>(Array2D<float, 2, 3>{
+								{{3.6410849, -1.1286706, -1.275959},
+								{-7.5292215, 1.2592185, 1.0422806}}});
+		std::shared_ptr<Tensor> expectedBiasGrad = std::make_shared<Tensor>(Array2D<float, 4, 2>{{{0.095633715, -0.42645916},
+								{0.47288924, -0.50316036},
+								{-0.30161166, 0.32583892},
+								{-0.01946173, 0.47193128 }}});
+
+		op->getOutput(0)->setGrad(myOutputGrad);
+		myFC->backward();
+		REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedInputGrad));
+		REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedWeightsGrad));
+		REQUIRE(approxEq<float>(*(op->getInput(2)->grad()), *expectedBiasGrad));
+	}
+	SECTION("transA and transB") {
+			std::shared_ptr<Tensor> myInput =
+			std::make_shared<Tensor>(Array2D<float, 3, 4>{{
+					{0.51234567, -1.23456789, 0.67891234, -0.43219876},
+					{1.87654321, -0.98765432, 1.34567890, -1.23456789},
+					{-0.67891234, 0.43219876, -1.87654321, 0.98765432}
 			}});
-    std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array2D<float, 2, 3>{
-						{{0.12352414, -0.036496852, -0.15418212},
-						{-1.0330718, 0.011371522, -0.60658616}}});
-    std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array2D<float, 4, 2>{
-						{{-0.7607078, 0.54936022},
-						{-0.31278628, 0.68560582},
-						{0.78312093, 0.96373892},
-						{-1.2183768, -0.4587383}}});
-
-    std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float, 4, 2>{
-						{{0.17209265, -1.4664109},
-						{0.30229449, -0.86952901},
-						{0.14901026, 2.8016717},
-						{-0.65777171, 2.8892474}}});
-
-    std::shared_ptr<Node> myFC = FC(3, 2, false, "myfc", 2.0f, 0.5f);
-    auto op = std::static_pointer_cast<OperatorTensor>(myFC -> getOperator());
-    op -> associateInput(0, myInput);
-    op -> associateInput(1, myWeights);
-    op -> associateInput(2, myBias);
-    op -> setDataType(DataType::Float32);
-    op -> setBackend("cpu");
-    myFC->forward();
-    op->getOutput(0)->print();
-    REQUIRE(approxEq<float>(*(op->getOutput(0)), *myOutput));
-
-    // Backward
-    std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array2D<float, 4, 2>{
-						{
-						{0.19126743, -0.85291833},
-						{0.94577849, -1.0063207},
-						{-0.60322332, 0.65167785},
-						{-0.038923461, 0.94386256}
-						}});
-    std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array2D<float, 4, 3>{
-                              {
-							{1.8095039, -0.033359278, 0.97575688},
-							{2.312856, -0.091922671, 0.92919618},
-							{-1.4954853, 0.058852643, -0.60458499},
-							{-1.9597715, 0.024307476, -1.1330653}
-                              }});
-    std::shared_ptr<Tensor> expectedWeightsGrad = std::make_shared<Tensor>(Array2D<float, 2, 3>{
-							{{3.6410849, -1.1286706, -1.275959},
-							{-7.5292215, 1.2592185, 1.0422806}}});
-    std::shared_ptr<Tensor> expectedBiasGrad = std::make_shared<Tensor>(Array2D<float, 4, 2>{{{0.095633715, -0.42645916},
-							{0.47288924, -0.50316036},
-							{-0.30161166, 0.32583892},
-							{-0.01946173, 0.47193128 }}});
-
-    op->getOutput(0)->setGrad(myOutputGrad);
-    myFC->backward();
-    REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedInputGrad));
-    REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedWeightsGrad));
-    REQUIRE(approxEq<float>(*(op->getInput(2)->grad()), *expectedBiasGrad));
-  }
+
+			std::shared_ptr<Tensor> myWeights =
+			std::make_shared<Tensor>(Array2D<float, 3, 2>{{
+					{0.12345678, -1.34567890},
+					{-0.87654321, 0.56789012},
+					{0.23456789, -0.45678901}
+			}});
+
+			std::shared_ptr<Tensor> myBias =
+			std::make_shared<Tensor>(Array2D<float, 4, 2>{{
+					{0.65432109, -0.54321098},
+					{1.23456789, -1.09876543},
+					{-0.32109876, 0.98765432},
+					{-0.87654321, 0.76543210}
+			}});
+
+			std::shared_ptr<Tensor> myOutput =
+			std::make_shared<Tensor>(Array2D<float, 4, 2>{{
+					{-3.1545789, 1.101069},
+					{2.2466557, 1.2566758},
+					{-3.2323616, 1.9093952},
+					{2.0826612, -0.75857949}
+			}});
+
+			std::shared_ptr<Node> myFC = FC(3, 2, false, "myfc", 2.0f, 0.5f, true, true);
+			auto op = std::static_pointer_cast<OperatorTensor>(myFC->getOperator());
+			op->associateInput(0, myInput);
+			op->associateInput(1, myWeights);
+			op->associateInput(2, myBias);
+			op->setDataType(DataType::Float32);
+			op->setBackend("cpu");
+
+			// Forward pass
+			myFC->forward();
+			op->getOutput(0)->print();
+			REQUIRE(approxEq<float>(*(op->getOutput(0)), *myOutput));
+
+			// Backward Pass
+			std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array2D<float, 4, 2>{{
+				{0.19876543, -0.65432109},
+				{0.76543210, -1.23456789},
+				{-0.43210987, 1.09876543},
+				{-0.98765432, 0.87654321}
+			}});
+
+			std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array2D<float, 3, 4>{{
+				{1.8100901, 3.5116596, -3.0638645, -2.6029568},
+				{-1.0916179, -2.7440662, 2.0054817, 2.7270038},
+				{0.69102132, 1.4869658, -1.206526, -1.2641346}
+			}});
+
+			std::shared_ptr<Tensor> expectedWeightsGrad = std::make_shared<Tensor>(Array2D<float, 3, 2>{{
+				{-1.4192863, 3.1120875},
+				{0.50970948, 0.77579576},
+				{0.062572584, -2.571022}
+			}});
+
+			std::shared_ptr<Tensor> expectedBiasGrad = std::make_shared<Tensor>(Array2D<float, 4, 2>{{
+				{0.099382713, -0.32716054},
+				{0.38271606, -0.61728394},
+				{-0.21605493, 0.54938269},
+				{-0.49382716, 0.43827161}
+			}});
+			op->getOutput(0)->setGrad(myOutputGrad);
+			myFC->backward();
+			REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedInputGrad));
+			REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedWeightsGrad));
+			REQUIRE(approxEq<float>(*(op->getInput(2)->grad()), *expectedBiasGrad));
+	}
 }
\ No newline at end of file
-- 
GitLab


From ef53239c7d5cc0a6e2fcc9a30437d478057bb0ee Mon Sep 17 00:00:00 2001
From: hrouis <houssemeddine.rouis92@gmail.com>
Date: Tue, 18 Feb 2025 15:51:13 +0100
Subject: [PATCH 4/8] support only bias of size outChannels for FC

---
 include/aidge/backend/cpu/operator/FCImpl.hpp |   2 -
 .../backend/cpu/operator/FCImpl_kernels.hpp   |  31 +--
 src/operator/FCImpl.cpp                       |   4 -
 unit_tests/operator/Test_FCImpl.cpp           | 207 ++++++------------
 4 files changed, 74 insertions(+), 170 deletions(-)

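With the bias restricted again to outChannels entries, its gradient reduces dY
over the batch before scaling. A small worked sketch (illustrative, not the
Aidge kernel):

    #include <cstddef>
    #include <iostream>
    #include <vector>

    int main() {
        const std::size_t batch = 2, outF = 2;
        const float beta = 0.5f;
        std::vector<float> dY{1.f, 2.f, 3.f, 4.f};  // (batch, outF), row-major
        std::vector<float> dB(outF);
        for (std::size_t o = 0; o < outF; ++o) {
            float sum = 0.f;
            for (std::size_t b = 0; b < batch; ++b)
                sum += dY[b * outF + o];            // reduce over the batch
            dB[o] = beta * sum;                     // mirrors biasesGrad[o] = beta * sum
        }
        std::cout << dB[0] << ' ' << dB[1] << '\n'; // prints: 2 3
    }
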
diff --git a/include/aidge/backend/cpu/operator/FCImpl.hpp b/include/aidge/backend/cpu/operator/FCImpl.hpp
index b01c220f..9249ba77 100644
--- a/include/aidge/backend/cpu/operator/FCImpl.hpp
+++ b/include/aidge/backend/cpu/operator/FCImpl.hpp
@@ -29,7 +29,6 @@ using FCImpl_cpu = OperatorImpl_cpu<FC_Op,
         const DimSize_t,
         const bool,
         const bool,
-        const bool,
         const float,
         const float,
         const void *,
@@ -41,7 +40,6 @@ using FCImpl_cpu = OperatorImpl_cpu<FC_Op,
         const DimSize_t,
         const bool,
         const bool,
-        const bool,
         const float,
         const float,
         const void *,
diff --git a/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
index e928e9ea..aa4ffa2e 100644
--- a/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
@@ -22,7 +22,6 @@ template <class I, class W, class B, class O>
 void FCImpl_cpu_forward_kernel(const DimSize_t batchSize,
                             const DimSize_t inputFeatureSize,
                             const DimSize_t outputFeatureSize,
-                            const bool isBiasBatched,
                             const bool transA,
                             const bool transB,
                             const float alpha_,
@@ -41,19 +40,12 @@ void FCImpl_cpu_forward_kernel(const DimSize_t batchSize,
 
     if (biases == nullptr) {
         std::fill(output, output + (batchSize * outputFeatureSize), O(0));
-    } else { 
-        if (isBiasBatched) { // Bias is (batchSize, outputFeatureSize)
+    } else {
+        for (std::size_t batch = 0; batch < batchSize; ++batch) {
             std::transform(
-                biases, biases + batchSize * outputFeatureSize, output,
+                biases, biases + outputFeatureSize, output + batch * outputFeatureSize,
                 [beta](const B& bias) { return beta * static_cast<O>(bias); }
             );
-        } else { // Bias is 1D (outputFeatureSize)
-            for (std::size_t batch = 0; batch < batchSize; ++batch) {
-                std::transform(
-                    biases, biases + outputFeatureSize, output + batch * outputFeatureSize,
-                    [beta](const B& bias) { return beta * static_cast<O>(bias); }
-                );
-            }
         }
     }
 
@@ -75,7 +67,6 @@ template <class I, class O, class W, class B>
 void FCImpl_cpu_backward_kernel(const DimSize_t batchSize,
                                 const DimSize_t inputFeatureSize,
                                 const DimSize_t outputFeatureSize,
-                                const bool isBiasBatched,
                                 const bool transA,
                                 const bool transB,
                                 const float alpha_,
@@ -100,20 +91,12 @@ void FCImpl_cpu_backward_kernel(const DimSize_t batchSize,
 
     // Compute bias gradient: dB = beta * dY (reduced over the batch for a 1D bias)
     if (biasesGrad != nullptr) {
-        if (isBiasBatched) { // Bias is (batchSize, outputFeatureSize)
+        for (std::size_t o = 0; o < outputFeatureSize; ++o) {
+            O sum{0};
             for (std::size_t b = 0; b < batchSize; ++b) {
-                for (std::size_t o = 0; o < outputFeatureSize; ++o) {
-                    biasesGrad[b * outputFeatureSize + o] = beta * outputGrad[b * outputFeatureSize + o];
-                }
-            }
-        } else { // Bias is 1D (outputFeatureSize)
-            for (std::size_t o = 0; o < outputFeatureSize; ++o) {
-                O sum{0};
-                for (std::size_t b = 0; b < batchSize; ++b) {
-                    sum += outputGrad[b * outputFeatureSize + o];
-                }
-                biasesGrad[o] = beta * sum;
+                sum += outputGrad[b * outputFeatureSize + o];
             }
+            biasesGrad[o] = beta * sum;
         }
     }
 
diff --git a/src/operator/FCImpl.cpp b/src/operator/FCImpl.cpp
index 2c56c97d..2a06803c 100644
--- a/src/operator/FCImpl.cpp
+++ b/src/operator/FCImpl.cpp
@@ -40,14 +40,12 @@ void Aidge::FCImpl_cpu::forward()
     const auto& input0 = op_.getInput(0)->refCastFrom(input0Fallback, *(op_.getOutput(0)));
     const auto& input1 = op_.getInput(1)->refCastFrom(input1Fallback, *(op_.getOutput(0)));
     const auto& input2 = (op_.getInput(2)) ? op_.getInput(2)->refCastFrom(input2Fallback, *(op_.getOutput(0))) : Tensor();
-    const bool isBiasBatched = input2.nbDims() == 2;
     // Call kernel
     const DimSize_t nbInFeat = op_.transB()? input1.dims()[0]:input1.dims()[1];
     const auto batchSize = input0.size() /nbInFeat;
     impl.forward(batchSize,
         op_.transB()? input1.dims()[0]:input1.dims()[1], // nb input features
         op_.transB()?input1.dims()[1]: input1.dims()[0], // nb output features
-        isBiasBatched,
         op_.transA(),
         op_.transB(),
         op_.alpha(),
@@ -77,14 +75,12 @@ void Aidge::FCImpl_cpu::backward()
     const auto& input0grad = op_.getInput(0)->grad()->refCastFrom(input0gradFallback, *(op_.getOutput(0)));
     const auto& input1grad = op_.getInput(1)->grad()->refCastFrom(input1gradFallback, *(op_.getOutput(0)));
     const auto& input2grad = (op_.getInput(2)) ? op_.getInput(2)->grad()->refCastFrom(input2gradFallback, *(op_.getOutput(0))) : Tensor();
-    const bool isBiasBatched = input2grad.nbDims() == 2;
     // Call kernel
     const DimSize_t nbInFeat = op_.transB()? input1grad.dims()[0]:input1grad.dims()[1];
     const auto batchSize = input0grad.size() /nbInFeat;
     impl.backward(batchSize,
         nbInFeat, // nb input features
         op_.transB()?input1grad.dims()[1]: input1grad.dims()[0], // nb output features
-        isBiasBatched,
         op_.transA(),
         op_.transB(),
         op_.alpha(),
diff --git a/unit_tests/operator/Test_FCImpl.cpp b/unit_tests/operator/Test_FCImpl.cpp
index 0c6cfa00..ea464f41 100644
--- a/unit_tests/operator/Test_FCImpl.cpp
+++ b/unit_tests/operator/Test_FCImpl.cpp
@@ -120,19 +120,16 @@ TEST_CASE("[cpu/oeprator] FC(forward)", "[FC][CPU]") {
                 {-0.15418212, -0.60658616}
         }}); // Transposed Weights (2x3 → 3x2)
 
-        std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array2D<float, 4, 2>{{
-                {-0.7607078, 0.54936022},
-                {-0.31278628, 0.68560582},
-                {0.78312093, 0.96373892},
-                {-1.2183768, -0.4587383}
-        }}); // Bias remains the same (4x2)
+		std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float, 2>{{
+				-0.31278628, 0.68560582
+		}});
 
-        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float, 4, 2>{{
-                {0.17209265, -1.4664109},
-                {0.30229449, -0.86952901},
-                {0.14901026, 2.8016717},
-                {-0.65777171, 2.8892474}
-        }}); // Expected Output
+		std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float, 4, 2>{{
+				{0.39605334, -1.3982881},
+				{0.30229449, -0.86952901},
+				{-0.39894336, 2.6626053},
+				{-0.20497644, 3.4614196}
+		}});
 
         std::shared_ptr<Node> myFC = FC(3, 2, false, "myfc", 2.0f, 0.5f, true, true); // transA = true, transB = true
 
@@ -212,144 +209,74 @@ TEST_CASE("[cpu/oeprator] FC(backward)", "[FC][CPU]") {
 		REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedWeightsGrad));
 		REQUIRE(approxEq<float>(*(op->getInput(2)->grad()), *expectedBiasGrad));
 	}
-	SECTION("2D Input 2D Bias"){
+	SECTION("transA and transB") {
 		std::shared_ptr<Tensor> myInput =
-				std::make_shared<Tensor>(Array2D<float, 4, 3>{
-							{
-							{1.2137502, -0.73264742, -0.64570695},
-							{0.85416597, -1.361271, -0.48093504},
-							{-1.2204953, -1.400124, 0.14018863},
-							{-1.1383502, -0.47965094, -0.64090657}
-				}});
-		std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array2D<float, 2, 3>{
-							{{0.12352414, -0.036496852, -0.15418212},
-							{-1.0330718, 0.011371522, -0.60658616}}});
-		std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array2D<float, 4, 2>{
-							{{-0.7607078, 0.54936022},
-							{-0.31278628, 0.68560582},
-							{0.78312093, 0.96373892},
-							{-1.2183768, -0.4587383}}});
-
-		std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float, 4, 2>{
-							{{0.17209265, -1.4664109},
-							{0.30229449, -0.86952901},
-							{0.14901026, 2.8016717},
-							{-0.65777171, 2.8892474}}});
-
-		std::shared_ptr<Node> myFC = FC(3, 2, false, "myfc", 2.0f, 0.5f);
-		auto op = std::static_pointer_cast<OperatorTensor>(myFC -> getOperator());
-		op -> associateInput(0, myInput);
-		op -> associateInput(1, myWeights);
-		op -> associateInput(2, myBias);
-		op -> setDataType(DataType::Float32);
-		op -> setBackend("cpu");
+		std::make_shared<Tensor>(Array2D<float, 3, 4>{{
+				{0.51234567, -1.23456789, 0.67891234, -0.43219876},
+				{1.87654321, -0.98765432, 1.34567890, -1.23456789},
+				{-0.67891234, 0.43219876, -1.87654321, 0.98765432}
+		}});
+
+		std::shared_ptr<Tensor> myWeights =
+		std::make_shared<Tensor>(Array2D<float, 3, 2>{{
+				{0.12345678, -1.34567890},
+				{-0.87654321, 0.56789012},
+				{0.23456789, -0.45678901}
+		}});
+
+		std::shared_ptr<Tensor> myBias =
+		std::make_shared<Tensor>(Array1D<float, 2>{
+				{0.65432109, -0.54321098}
+		});
+
+		std::shared_ptr<Tensor> myOutput =
+		std::make_shared<Tensor>(Array2D<float, 4, 2>{{
+				{-3.1545789, 1.101069},
+				{1.9565322, 1.534453},
+				{-2.7446516, 1.1439626},
+				{2.8480933, -1.412901}
+		}});
+
+		std::shared_ptr<Node> myFC = FC(3, 2, false, "myfc", 2.0f, 0.5f, true, true);
+		auto op = std::static_pointer_cast<OperatorTensor>(myFC->getOperator());
+		op->associateInput(0, myInput);
+		op->associateInput(1, myWeights);
+		op->associateInput(2, myBias);
+		op->setDataType(DataType::Float32);
+		op->setBackend("cpu");
+
+		// Forward pass
 		myFC->forward();
 		op->getOutput(0)->print();
 		REQUIRE(approxEq<float>(*(op->getOutput(0)), *myOutput));
 
-		// Backward
-		std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array2D<float, 4, 2>{
-							{
-							{0.19126743, -0.85291833},
-							{0.94577849, -1.0063207},
-							{-0.60322332, 0.65167785},
-							{-0.038923461, 0.94386256}
-							}});
-		std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array2D<float, 4, 3>{
-								{
-								{1.8095039, -0.033359278, 0.97575688},
-								{2.312856, -0.091922671, 0.92919618},
-								{-1.4954853, 0.058852643, -0.60458499},
-								{-1.9597715, 0.024307476, -1.1330653}
-								}});
-		std::shared_ptr<Tensor> expectedWeightsGrad = std::make_shared<Tensor>(Array2D<float, 2, 3>{
-								{{3.6410849, -1.1286706, -1.275959},
-								{-7.5292215, 1.2592185, 1.0422806}}});
-		std::shared_ptr<Tensor> expectedBiasGrad = std::make_shared<Tensor>(Array2D<float, 4, 2>{{{0.095633715, -0.42645916},
-								{0.47288924, -0.50316036},
-								{-0.30161166, 0.32583892},
-								{-0.01946173, 0.47193128 }}});
+		// Backward Pass
+		std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array2D<float, 4, 2>{{
+			{0.19876543, -0.65432109},
+			{0.76543210, -1.23456789},
+			{-0.43210987, 1.09876543},
+			{-0.98765432, 0.87654321}
+		}});
 
+		std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array2D<float, 3, 4>{{
+			{1.8100901, 3.5116596, -3.0638645, -2.6029568},
+			{-1.0916179, -2.7440662, 2.0054817, 2.7270038},
+			{0.69102132, 1.4869658, -1.206526, -1.2641346}
+		}});
+
+		std::shared_ptr<Tensor> expectedWeightsGrad = std::make_shared<Tensor>(Array2D<float, 3, 2>{{
+			{-1.4192863, 3.1120875},
+			{0.50970948, 0.77579576},
+			{0.062572584, -2.571022}
+		}});
+
+		std::shared_ptr<Tensor> expectedBiasGrad = std::make_shared<Tensor>(Array1D<float, 2>{
+			{-0.22778332,  0.04320982}
+		});
 		op->getOutput(0)->setGrad(myOutputGrad);
 		myFC->backward();
 		REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedInputGrad));
 		REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedWeightsGrad));
 		REQUIRE(approxEq<float>(*(op->getInput(2)->grad()), *expectedBiasGrad));
 	}
-	SECTION("transA and transB") {
-			std::shared_ptr<Tensor> myInput =
-			std::make_shared<Tensor>(Array2D<float, 3, 4>{{
-					{0.51234567, -1.23456789, 0.67891234, -0.43219876},
-					{1.87654321, -0.98765432, 1.34567890, -1.23456789},
-					{-0.67891234, 0.43219876, -1.87654321, 0.98765432}
-			}});
-
-			std::shared_ptr<Tensor> myWeights =
-			std::make_shared<Tensor>(Array2D<float, 3, 2>{{
-					{0.12345678, -1.34567890},
-					{-0.87654321, 0.56789012},
-					{0.23456789, -0.45678901}
-			}});
-
-			std::shared_ptr<Tensor> myBias =
-			std::make_shared<Tensor>(Array2D<float, 4, 2>{{
-					{0.65432109, -0.54321098},
-					{1.23456789, -1.09876543},
-					{-0.32109876, 0.98765432},
-					{-0.87654321, 0.76543210}
-			}});
-
-			std::shared_ptr<Tensor> myOutput =
-			std::make_shared<Tensor>(Array2D<float, 4, 2>{{
-					{-3.1545789, 1.101069},
-					{2.2466557, 1.2566758},
-					{-3.2323616, 1.9093952},
-					{2.0826612, -0.75857949}
-			}});
-
-			std::shared_ptr<Node> myFC = FC(3, 2, false, "myfc", 2.0f, 0.5f, true, true);
-			auto op = std::static_pointer_cast<OperatorTensor>(myFC->getOperator());
-			op->associateInput(0, myInput);
-			op->associateInput(1, myWeights);
-			op->associateInput(2, myBias);
-			op->setDataType(DataType::Float32);
-			op->setBackend("cpu");
-
-			// Forward pass
-			myFC->forward();
-			op->getOutput(0)->print();
-			REQUIRE(approxEq<float>(*(op->getOutput(0)), *myOutput));
-
-			// Backward Pass
-			std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array2D<float, 4, 2>{{
-				{0.19876543, -0.65432109},
-				{0.76543210, -1.23456789},
-				{-0.43210987, 1.09876543},
-				{-0.98765432, 0.87654321}
-			}});
-
-			std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array2D<float, 3, 4>{{
-				{1.8100901, 3.5116596, -3.0638645, -2.6029568},
-			{-1.0916179, -2.7440662, 2.0054817, 2.7270038},
-			{0.69102132, 1.4869658, -1.206526, -1.2641346}
-			}});
-
-			std::shared_ptr<Tensor> expectedWeightsGrad = std::make_shared<Tensor>(Array2D<float, 3, 2>{{
-				{-1.4192863, 3.1120875},
-				{0.50970948, 0.77579576},
-				{0.062572584, -2.571022}
-			}});
-
-			std::shared_ptr<Tensor> expectedBiasGrad = std::make_shared<Tensor>(Array2D<float, 4, 2>{{
-				{0.099382713, -0.32716054},
-				{0.38271606, -0.61728394},
-				{-0.21605493, 0.54938269},
-				{-0.49382716, 0.43827161}
-			}});
-			op->getOutput(0)->setGrad(myOutputGrad);
-			myFC->backward();
-			REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedInputGrad));
-			REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedWeightsGrad));
-			REQUIRE(approxEq<float>(*(op->getInput(2)->grad()), *expectedBiasGrad));
-	}
 }
\ No newline at end of file
-- 
GitLab
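
A note on the bias-gradient change above: with the batched-bias path removed,
the retained loop reduces the output gradient over the batch and scales the
result by beta. With B = batchSize and O = outputFeatureSize, it computes, for
each output feature o,

    \frac{\partial L}{\partial b_o} = \beta \sum_{b=0}^{B-1}
        \frac{\partial L}{\partial Y_{b,o}}, \qquad o = 0, \dots, O-1,

overwriting biasesGrad rather than accumulating into it (the surrounding
"dB = beta * dB + alpha * dY" comment, which this patch leaves as-is, does not
match what the loop does).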


From 74899817bf12f4d2ffb72e9e3d60543492b1101a Mon Sep 17 00:00:00 2001
From: hrouis <houssemeddine.rouis92@gmail.com>
Date: Fri, 21 Feb 2025 16:27:13 +0100
Subject: [PATCH 5/8] update FC factory call sites

---
 unit_tests/operator/Test_FCImpl.cpp       | 52 +++++++++++------------
 unit_tests/operator/Test_MetaOperator.cpp |  4 +-
 unit_tests/scheduler/Test_CastMove.cpp    |  4 +-
 unit_tests/scheduler/Test_Scheduler.cpp   |  6 +--
 4 files changed, 32 insertions(+), 34 deletions(-)
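
The call sites updated below suggest the FC factory now takes the scaling
factors and flags before the node name; read from the changed tests, the order
is FC(inChannels, outChannels, alpha, beta, noBias, transA, transB, name).
A before/after usage sketch (parameter names are inferred from the call sites
in this patch, not checked against FC.hpp):

    #include "aidge/operator/FC.hpp"
    using namespace Aidge;

    // Patch 4 ordering: scaling factors and transpose flags trailed the name
    // (shown for comparison only; this form no longer compiles after this patch)
    // auto fcOld = FC(3, 2, false, "myfc", 2.0f, 0.5f, true, true);

    // Patch 5 ordering: alpha, beta, noBias, transA, transB, then the name
    auto fcNew = FC(3, 2, 2.0f, 0.5f, false, true, true, "myfc");

    // Neutral defaults for a plain FC layer (alpha = beta = 1, no transposes)
    auto fcPlain = FC(75, 5, 1.0f, 1.0f, false, false, false, "fc");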

diff --git a/unit_tests/operator/Test_FCImpl.cpp b/unit_tests/operator/Test_FCImpl.cpp
index ea464f41..ce41ec1e 100644
--- a/unit_tests/operator/Test_FCImpl.cpp
+++ b/unit_tests/operator/Test_FCImpl.cpp
@@ -19,6 +19,7 @@
 #include "aidge/data/Tensor.hpp"
 #include "aidge/operator/FC.hpp"
 #include "aidge/utils/ArrayHelpers.hpp"
+#include "aidge/utils/TensorUtils.hpp"
 
 using namespace Aidge;
 
@@ -48,7 +49,7 @@ TEST_CASE("[cpu/oeprator] FC(forward)", "[FC][CPU]") {
     Tensor myOutput = Array2D<int, 2, 5>{
             {{23601, 23602, 23603, 23604, 23605}, {68601, 68602, 68603, 68604, 68605}}};
 
-    std::shared_ptr<Node> myFC = FC(75, 5, false, "myfc");
+    std::shared_ptr<Node> myFC = FC(75, 5, 1.0f, 1.0f, false, false, false, "myfc");
     auto op = std::static_pointer_cast<FC_Op>(myFC -> getOperator());
     op -> setDataType(DataType::Int32);
     op -> setBackend("cpu");
@@ -124,14 +125,14 @@ TEST_CASE("[cpu/oeprator] FC(forward)", "[FC][CPU]") {
 				-0.31278628, 0.68560582
 		}});
 
-		std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float, 4, 2>{{
+		Tensor myOutput = Tensor(Array2D<float, 4, 2>{{
 				{0.39605334, -1.3982881},
 				{0.30229449, -0.86952901},
 				{-0.39894336, 2.6626053},
 				{-0.20497644, 3.4614196}
 		}});
 
-        std::shared_ptr<Node> myFC = FC(3, 2, false, "myfc", 2.0f, 0.5f, true, true); // transA = true, transB = true
+        std::shared_ptr<Node> myFC = FC(3, 2, 2.0f, 0.5f, false, true, true, "myfc"); // transA = true, transB = true
 
         auto op = std::static_pointer_cast<OperatorTensor>(myFC->getOperator());
         op->associateInput(0, myInput);
@@ -141,9 +142,8 @@ TEST_CASE("[cpu/oeprator] FC(forward)", "[FC][CPU]") {
         op->setBackend("cpu");
 
         myFC->forward();
-        op->getOutput(0)->print();
 
-        REQUIRE(approxEq<float>(*(op->getOutput(0)), *myOutput));
+        REQUIRE(approxEq<float>(*(op->getOutput(0)), myOutput));
     }
     // std::cout << static_cast<Tensor>((*myFC->getOperator())["weight"])[0][0][0][0] << std::endl;
 }
@@ -164,13 +164,13 @@ TEST_CASE("[cpu/oeprator] FC(backward)", "[FC][CPU]") {
 	{			-1.1458585, -0.8235659, 0.24195994}}});
 		std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float, 2>{{1.5327742,  0.90154403}});
 
-		std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array2D<float, 4, 2>{
+		Tensor myOutput = Tensor(Array2D<float, 4, 2>{
 				{{1.0376441, 0.38158852},
 				{-0.86573052, 2.6920884},
 				{3.0791781, -1.3184667},
 				{2.3588469, -0.31109101}}});
 
-		std::shared_ptr<Node> myFC = FC(3, 2, false, "myfc");
+		std::shared_ptr<Node> myFC = FC(3, 2, 1.0f, 1.0f, false, false, false, "myfc");
 		auto op = std::static_pointer_cast<OperatorTensor>(myFC -> getOperator());
 		op -> associateInput(0, myInput);
 		op -> associateInput(1, myWeights);
@@ -178,8 +178,8 @@ TEST_CASE("[cpu/oeprator] FC(backward)", "[FC][CPU]") {
 		op -> setDataType(DataType::Float32);
 		op -> setBackend("cpu");
 		myFC->forward();
-		op->getOutput(0)->print();
-		REQUIRE(approxEq<float>(*(op->getOutput(0)), *myOutput));
+
+		REQUIRE(approxEq<float>(*(op->getOutput(0)), myOutput));
 
 		// Backward
 		std::shared_ptr<Tensor> myOutputGrad =
@@ -190,24 +190,23 @@ TEST_CASE("[cpu/oeprator] FC(backward)", "[FC][CPU]") {
 								{-1.5132738, -0.23136522},
 								{0.20452768, -1.2200259}
 								}});
-		std::shared_ptr<Tensor> expectedInputGrad =
-				std::make_shared<Tensor>(Array2D<float, 4, 3>{
+		Tensor expectedInputGrad = Tensor(Array2D<float, 4, 3>{
 								{
 								{1.4716856, 3.7039511, -2.9912496},
 								{0.82964748, 1.0730045, -0.65807492},
 								{0.19803995, -2.7722826, 2.9105654},
 								{1.4070423, 1.4052149, -0.69614327}
 								}});
-		std::shared_ptr<Tensor> expectedWeightsGrad = std::make_shared<Tensor>(Array2D<float, 2, 3>{
+		Tensor expectedWeightsGrad = Tensor(Array2D<float, 2, 3>{
 				{{-1.7768159, -0.66813177, 1.0499192},
 				{-1.4800593, 0.37063029, -0.15180479}}});
-		std::shared_ptr<Tensor> expectedBiasGrad = std::make_shared<Tensor>(Array1D<float, 2>{{0.31267303, -3.397066 }});
+		Tensor expectedBiasGrad = Tensor(Array1D<float, 2>{{0.31267303, -3.397066 }});
 
 		op->getOutput(0)->setGrad(myOutputGrad);
 		myFC->backward();
-		REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedInputGrad));
-		REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedWeightsGrad));
-		REQUIRE(approxEq<float>(*(op->getInput(2)->grad()), *expectedBiasGrad));
+		REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), expectedInputGrad));
+		REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), expectedWeightsGrad));
+		REQUIRE(approxEq<float>(*(op->getInput(2)->grad()), expectedBiasGrad));
 	}
 	SECTION("transA and transB") {
 		std::shared_ptr<Tensor> myInput =
@@ -229,15 +228,14 @@ TEST_CASE("[cpu/oeprator] FC(backward)", "[FC][CPU]") {
 				{0.65432109, -0.54321098}
 		});
 
-		std::shared_ptr<Tensor> myOutput =
-		std::make_shared<Tensor>(Array2D<float, 4, 2>{{
+		Tensor myOutput = Tensor(Array2D<float, 4, 2>{{
 				{-3.1545789, 1.101069},
 				{1.9565322, 1.534453},
 				{-2.7446516, 1.1439626},
 				{2.8480933, -1.412901}
 		}});
 
-		std::shared_ptr<Node> myFC = FC(3, 2, false, "myfc", 2.0f, 0.5f, true, true);
+		std::shared_ptr<Node> myFC = FC(3, 2, 2.0f, 0.5f, false, true, true, "myfc");
 		auto op = std::static_pointer_cast<OperatorTensor>(myFC->getOperator());
 		op->associateInput(0, myInput);
 		op->associateInput(1, myWeights);
@@ -247,8 +245,8 @@ TEST_CASE("[cpu/oeprator] FC(backward)", "[FC][CPU]") {
 
 		// Forward pass
 		myFC->forward();
-		op->getOutput(0)->print();
-		REQUIRE(approxEq<float>(*(op->getOutput(0)), *myOutput));
+
+		REQUIRE(approxEq<float>(*(op->getOutput(0)), myOutput));
 
 		// Backward Pass
 		std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array2D<float, 4, 2>{{
@@ -258,25 +256,25 @@ TEST_CASE("[cpu/oeprator] FC(backward)", "[FC][CPU]") {
 			{-0.98765432, 0.87654321}
 		}});
 
-		std::shared_ptr<Tensor> expectedInputGrad = std::make_shared<Tensor>(Array2D<float, 3, 4>{{
+		Tensor expectedInputGrad = Tensor(Array2D<float, 3, 4>{{
 			{1.8100901, 3.5116596, -3.0638645, -2.6029568},
 			{-1.0916179, -2.7440662, 2.0054817, 2.7270038},
 			{0.69102132, 1.4869658, -1.206526, -1.2641346}
 		}});
 
-		std::shared_ptr<Tensor> expectedWeightsGrad = std::make_shared<Tensor>(Array2D<float, 3, 2>{{
+		Tensor expectedWeightsGrad = Tensor(Array2D<float, 3, 2>{{
 			{-1.4192863, 3.1120875},
 			{0.50970948, 0.77579576},
 			{0.062572584, -2.571022}
 		}});
 
-		std::shared_ptr<Tensor> expectedBiasGrad = std::make_shared<Tensor>(Array1D<float, 2>{
+		Tensor expectedBiasGrad = Tensor(Array1D<float, 2>{
 			{-0.22778332,  0.04320982}
 		});
 		op->getOutput(0)->setGrad(myOutputGrad);
 		myFC->backward();
-		REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedInputGrad));
-		REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedWeightsGrad));
-		REQUIRE(approxEq<float>(*(op->getInput(2)->grad()), *expectedBiasGrad));
+		REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), expectedInputGrad));
+		REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), expectedWeightsGrad));
+		REQUIRE(approxEq<float>(*(op->getInput(2)->grad()), expectedBiasGrad));
 	}
 }
\ No newline at end of file
diff --git a/unit_tests/operator/Test_MetaOperator.cpp b/unit_tests/operator/Test_MetaOperator.cpp
index 7b0b80d8..925a264a 100644
--- a/unit_tests/operator/Test_MetaOperator.cpp
+++ b/unit_tests/operator/Test_MetaOperator.cpp
@@ -702,8 +702,8 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
         auto init = std::make_shared<Tensor>(Array2D<float, 2, 5>{});
         uniformFiller<float>(init, 0.0, 0.0);
 
-        auto fc1 = FC(inChannels, outChannels, true, "myfc");
-        auto fc2 = FC(outChannels, inChannels, true, "fc2");
+        auto fc1 = FC(inChannels, outChannels, 1.0f, 1.0f, true, false, false, "myfc");
+        auto fc2 = FC(outChannels, inChannels, 1.0f, 1.0f, true, false, false, "fc2");
         // NOTE: Account for init step by adding 1 to the max timestep
         // parameter.
         auto lif1 = Leaky(nbTimeSteps + 1, beta, threshold, LeakyReset::Subtraction, "leaky");
diff --git a/unit_tests/scheduler/Test_CastMove.cpp b/unit_tests/scheduler/Test_CastMove.cpp
index b78e864f..fdeb2d79 100644
--- a/unit_tests/scheduler/Test_CastMove.cpp
+++ b/unit_tests/scheduler/Test_CastMove.cpp
@@ -56,7 +56,7 @@ TEST_CASE("[cpu/castmove] CastMove(forward)") {
                     Conv(1, 3, {3, 3}, "conv1"),
                     Conv(3, 4, {1, 1}, "conv2"),
                     Conv(4, 3, {1, 1}, "conv3"),
-                    FC(27, 5, false, "fc")});
+                    FC(27, 5, 1.0f, 1.0f, false, false, false, "fc")});
 
         g->getNode("conv1")->getOperator()->setInput(0, inputTensor);
         g->getNode("conv1")->getOperator()->setInput(1, weight1);
@@ -158,7 +158,7 @@ TEST_CASE("[cpu/castmove] CastMove(forward)") {
                     Conv(1, 3, {3, 3}, "conv1"),
                     Conv(3, 4, {1, 1}, "conv2"),
                     Conv(4, 3, {1, 1}, "conv3"),
-                    FC(27, 5, false, "fc")});
+                    FC(27, 5, 1.0f, 1.0f, false, false, false, "fc")});
 
         g->getNode("conv1")->getOperator()->setInput(0, inputTensor);
         g->getNode("conv1")->getOperator()->setInput(1, weight1);
diff --git a/unit_tests/scheduler/Test_Scheduler.cpp b/unit_tests/scheduler/Test_Scheduler.cpp
index be87e8ac..e264f114 100644
--- a/unit_tests/scheduler/Test_Scheduler.cpp
+++ b/unit_tests/scheduler/Test_Scheduler.cpp
@@ -71,7 +71,7 @@ TEST_CASE("[cpu/scheduler] SequentialScheduler(forward)") {
                     Conv(1, 3, {3, 3}, "conv1"),
                     Conv(3, 4, {1, 1}, "conv2"),
                     Conv(4, 3, {1, 1}, "conv3"),
-                    FC(27, 5, false, "fc")});
+                    FC(27, 5, 1.0f, 1.0f, false, false, false, "fc")});
 
         g->getNode("conv1")->getOperator()->setInput(0, inputTensor);
         g->getNode("conv1")->getOperator()->setInput(1, weight1);
@@ -173,7 +173,7 @@ TEST_CASE("[cpu/scheduler] SequentialScheduler(forward)") {
                                 Conv(3, 3, {1, 1}, "conv1.3")}),
                             Add("add2"),
                             Conv(3, 2, {1, 1}, "conv2"),
-                            FC(18, 5, false, "out")});
+                            FC(18, 5, 1.0f, 1.0f, false, false, false, "out")});
 
         g->getNode("inputConv")->getOperator()->setInput(0, inputTensor);
         g->getNode("inputConv")->getOperator()->setInput(1, weight1);
@@ -321,7 +321,7 @@ TEST_CASE("[cpu/scheduler] SequentialScheduler(forward)") {
                     Conv(1, 3, {3, 3}, "conv1"),
                     Conv(3, 4, {1, 1}, "conv2"),
                     Conv(4, 3, {1, 1}, "conv3"),
-                    FC(27, 5, false, "fc")});
+                    FC(27, 5, 1.0f, 1.0f, false, false, false, "fc")});
 
         // g->getNode("conv1")->getOperator()->setInput(0, inputTensor);
         g->getNode("conv1")->getOperator()->setInput(1, weight1);
-- 
GitLab


From 0eb2f2b47cdbdb0ef9037a0a0a33c709927bb668 Mon Sep 17 00:00:00 2001
From: hrouis <houssemeddine.rouis92@gmail.com>
Date: Mon, 24 Feb 2025 11:22:26 +0100
Subject: [PATCH 6/8] add hints to support batched bias in FC

---
 .../backend/cpu/operator/FCImpl_kernels.hpp     | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)
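
The hints below sketch what a batched bias would change. With the current 1-D
bias of length outputFeatureSize, the output is initialized per batch with
beta times the same bias vector; a batched bias of shape
(batchSize, outputFeatureSize) would instead scale a distinct row per batch:

    Y^{(init)}_{b,o} = \beta \, \mathrm{bias}_{o}       % 1-D bias (current kernel)
    Y^{(init)}_{b,o} = \beta \, \mathrm{bias}_{b,o}     % batched bias (TODO path)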

diff --git a/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
index aa4ffa2e..3542c8cf 100644
--- a/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
@@ -41,6 +41,14 @@ void FCImpl_cpu_forward_kernel(const DimSize_t batchSize,
     if (biases == nullptr) {
         std::fill(output, output + (batchSize * outputFeatureSize), O(0));
     } else {
+        /* TODO: If we want to support batched biases
+           of shape (batchSize, outputFeatureSize):
+            std::transform(
+                biases, biases + batchSize * outputFeatureSize, output,
+                [beta](const B& bias) { return beta * static_cast<O>(bias); }
+            );
+
+        */
         for (std::size_t batch = 0; batch < batchSize; ++batch) {
             std::transform(
                 biases, biases + outputFeatureSize, output + batch * outputFeatureSize,
@@ -90,7 +98,14 @@ void FCImpl_cpu_backward_kernel(const DimSize_t batchSize,
     const O beta = static_cast<O>(beta_);
 
     // Compute bias gradient: dB = beta * dB + alpha * dY
-    if (biasesGrad != nullptr) { 
+    if (biasesGrad != nullptr) {
+        /* TODO: If we want to support batched biases of shape (batchSize, outputFeatureSize):
+            for (std::size_t b = 0; b < batchSize; ++b) {
+                for (std::size_t o = 0; o < outputFeatureSize; ++o) {
+                    biasesGrad[b * outputFeatureSize + o] = beta * outputGrad[b * outputFeatureSize + o];
+                }
+            }
+        */
         for (std::size_t o = 0; o < outputFeatureSize; ++o) {
             O sum{0};
             for (std::size_t b = 0; b < batchSize; ++b) {
-- 
GitLab


From 5c9c92b73d6c3384ebc08def25d9798e1af05a58 Mon Sep 17 00:00:00 2001
From: hrouis <houssemeddine.rouis92@gmail.com>
Date: Tue, 11 Mar 2025 12:02:39 +0100
Subject: [PATCH 7/8] add Sum operator

---
 include/aidge/backend/cpu.hpp                 |   1 +
 .../aidge/backend/cpu/operator/SumImpl.hpp    |  36 ++++
 .../backend/cpu/operator/SumImpl_kernels.hpp  |  59 ++++++
 src/operator/SumImpl.cpp                      |  71 +++++++
 unit_tests/operator/Test_SumImpl.cpp          | 176 ++++++++++++++++++
 5 files changed, 343 insertions(+)
 create mode 100644 include/aidge/backend/cpu/operator/SumImpl.hpp
 create mode 100644 include/aidge/backend/cpu/operator/SumImpl_kernels.hpp
 create mode 100644 src/operator/SumImpl.cpp
 create mode 100644 unit_tests/operator/Test_SumImpl.cpp
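
The Sum forward kernel below walks every output element, recovers its
multi-dimensional index with getMultiDimIndices, then re-flattens that index
against each input's dims with getFlattenedIndex, so size-1 dimensions are
broadcast NumPy-style. A minimal self-contained sketch of that indexing
scheme; the two helpers here are illustrative stand-ins written for this note,
not Aidge's actual implementations:

    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Flat index -> multi-dimensional index for a tensor of shape `dims`
    // (illustrative stand-in for getMultiDimIndices)
    std::vector<std::size_t> multiDimIndices(const std::vector<std::size_t>& dims,
                                             std::size_t flat) {
        std::vector<std::size_t> idx(dims.size(), 0);
        for (std::size_t d = dims.size(); d-- > 0;) {
            idx[d] = flat % dims[d];
            flat /= dims[d];
        }
        return idx;
    }

    // Multi-dimensional index -> flat index, clamping size-1 dims to 0 so
    // they broadcast (illustrative stand-in for getFlattenedIndex)
    std::size_t flattenedIndex(const std::vector<std::size_t>& dims,
                               const std::vector<std::size_t>& idx) {
        std::size_t flat = 0;
        for (std::size_t d = 0; d < dims.size(); ++d) {
            flat = flat * dims[d] + (dims[d] == 1 ? 0 : idx[d]);
        }
        return flat;
    }

    int main() {
        // Sum a (2,3) tensor with a (1,3) row vector, mirroring the kernel loop
        const std::vector<std::size_t> outDims{2, 3}, aDims{2, 3}, bDims{1, 3};
        const std::vector<int> a{1, 2, 3, 4, 5, 6}, b{10, 20, 30};
        std::vector<int> out(6, 0);
        for (std::size_t o = 0; o < out.size(); ++o) {
            const auto idx = multiDimIndices(outDims, o);
            out[o] = a[flattenedIndex(aDims, idx)] + b[flattenedIndex(bDims, idx)];
        }
        for (int v : out) std::cout << v << ' ';  // prints: 11 22 33 14 25 36
        std::cout << '\n';
        return 0;
    }

Built as-is this prints "11 22 33 14 25 36": the (1,3) row vector is re-read
for every row of the (2,3) input, which is what the kernel's per-element
flattening achieves for inputs with size-1 dims.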

diff --git a/include/aidge/backend/cpu.hpp b/include/aidge/backend/cpu.hpp
index 5c1f9b11..ecf111a7 100644
--- a/include/aidge/backend/cpu.hpp
+++ b/include/aidge/backend/cpu.hpp
@@ -58,6 +58,7 @@
 #include "aidge/backend/cpu/operator/SliceImpl.hpp"
 #include "aidge/backend/cpu/operator/SoftmaxImpl.hpp"
 #include "aidge/backend/cpu/operator/SubImpl.hpp"
+#include "aidge/backend/cpu/operator/SumImpl.hpp"
 #include "aidge/backend/cpu/operator/TanhImpl.hpp"
 #include "aidge/backend/cpu/operator/WeightInterleavedImpl.hpp"
 
diff --git a/include/aidge/backend/cpu/operator/SumImpl.hpp b/include/aidge/backend/cpu/operator/SumImpl.hpp
new file mode 100644
index 00000000..54d68979
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/SumImpl.hpp
@@ -0,0 +1,36 @@
+/********************************************************************************
+ * Copyright (c) 2025 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+ #ifndef AIDGE_CPU_OPERATOR_SUMIMPL_H_
+ #define AIDGE_CPU_OPERATOR_SUMIMPL_H_
+ 
+ #include <cstddef>  // std::size_t
+ #include <memory>   // std::unique_ptr, std::make_unique
+ #include <string>
+ #include <vector>
+ 
+ #include "aidge/backend/cpu/operator/OperatorImpl.hpp"
+ #include "aidge/operator/Sum.hpp"
+ #include "aidge/utils/Registrar.hpp"
+ #include "aidge/utils/Types.h"
+ 
+ namespace Aidge {
+ // Operator implementation entry point for the backend
+ using SumImpl_cpu = OperatorImpl_cpu<Sum_Op,
+        void(const std::vector<const void*>, const std::vector<std::vector<std::size_t>>&, const std::size_t, const std::vector<std::size_t>&, void*)>;
+
+
+ // Implementation entry point registration to Operator
+ REGISTRAR(Sum_Op, "cpu", Aidge::SumImpl_cpu::create);
+ }  // namespace Aidge
+ 
+ #endif /* AIDGE_CPU_OPERATOR_SUMIMPL_H_ */
+ 
\ No newline at end of file
diff --git a/include/aidge/backend/cpu/operator/SumImpl_kernels.hpp b/include/aidge/backend/cpu/operator/SumImpl_kernels.hpp
new file mode 100644
index 00000000..0c5e137e
--- /dev/null
+++ b/include/aidge/backend/cpu/operator/SumImpl_kernels.hpp
@@ -0,0 +1,59 @@
+/********************************************************************************
+ * Copyright (c) 2025 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
 + #ifndef AIDGE_CPU_OPERATOR_SUMIMPL_KERNELS_H_
 + #define AIDGE_CPU_OPERATOR_SUMIMPL_KERNELS_H_
+ 
+ #include "aidge/utils/Registrar.hpp"
+ 
+ #include <cstdint>     // std::int32_t, std::int64_t
+ 
+ #include "aidge/backend/cpu/data/Broadcasting.hpp"
+ #include "aidge/backend/cpu/operator/SumImpl.hpp"
+ 
+ namespace Aidge {
+ 
+ template <class I, class O>
+ void SumImpl_cpu_forward_kernel(const std::vector<const void*> inputs_, const std::vector<std::vector<std::size_t>>& inputDims, const std::size_t outputLength, const std::vector<std::size_t>& outDims, void* output_) {
+     std::vector<const I*> inputs;
+     for (const auto& input_ : inputs_) {
+         inputs.push_back(static_cast<const I*>(input_));
+     }
+     O* output = static_cast<O*>(output_);
+ 
+     for (std::size_t oIndex = 0; oIndex < outputLength; ++oIndex)
+     {
+         output[oIndex] = 0;
+         std::vector<size_t> indexes = getMultiDimIndices(outDims, oIndex);
+         for(std::size_t iIndex = 0; iIndex < inputs.size(); ++iIndex) {
+             std::size_t idx = getFlattenedIndex(inputDims[iIndex], indexes);
+             output[oIndex] += inputs[iIndex][idx];
+         }
+     }
+ }
+ 
+ // Kernels registration to implementation entry point
+ REGISTRAR(SumImpl_cpu,
+     {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float32}},
+     {ProdConso::inPlaceModel, Aidge::SumImpl_cpu_forward_kernel<float, float>, nullptr});
+ REGISTRAR(SumImpl_cpu,
+     {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Float64}},
+     {ProdConso::inPlaceModel, Aidge::SumImpl_cpu_forward_kernel<double, double>, nullptr});
+ REGISTRAR(SumImpl_cpu,
+     {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int32}},
+     {ProdConso::inPlaceModel, Aidge::SumImpl_cpu_forward_kernel<std::int32_t, std::int32_t>, nullptr});
+ REGISTRAR(SumImpl_cpu,
+     {ImplSpec::IOSpec{DataType::Any}, ImplSpec::IOSpec{DataType::Int64}},
+     {ProdConso::inPlaceModel, Aidge::SumImpl_cpu_forward_kernel<std::int64_t, std::int64_t>, nullptr});
+ }  // namespace Aidge
+ 
 + #endif /* AIDGE_CPU_OPERATOR_SUMIMPL_KERNELS_H_ */
+ 
\ No newline at end of file
diff --git a/src/operator/SumImpl.cpp b/src/operator/SumImpl.cpp
new file mode 100644
index 00000000..436fd78c
--- /dev/null
+++ b/src/operator/SumImpl.cpp
@@ -0,0 +1,71 @@
+/********************************************************************************
+ * Copyright (c) 2025 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+ #include "aidge/backend/cpu/operator/SumImpl.hpp"
+
+ #include <cassert>
+ #include <vector>
+ 
+ #include "aidge/backend/cpu/data/GetCPUPtr.h"
+ #include "aidge/backend/cpu/operator/SumImpl_kernels.hpp"
+ #include "aidge/data/Data.hpp"
+ #include "aidge/data/Tensor.hpp"
+ #include "aidge/utils/Types.h"
+ #include "aidge/utils/ErrorHandling.hpp"
+ 
+template <>
+void Aidge::SumImpl_cpu::forward() {
+    const Sum_Op& op = static_cast<const Sum_Op&>(mOp);
+    // Check inputs
+    AIDGE_ASSERT(op.getInput(0), "missing input in Sum operator");
+    AIDGE_ASSERT(op.getInput(0)->hasImpl(), "cannot run Sum forward because input#0 has no implementation.");
+    DataType datatypeFirstInput = op.getInput(0)->dataType();
+    for (IOIndex_t i = 1; i < op.nbInputs(); ++i) {
+        AIDGE_ASSERT(op.getInput(i), "missing input in Sum operator");
+        AIDGE_ASSERT(op.getInput(i)->hasImpl(), "cannot run Sum forward because the input#{} has no implementation.", i);
+        AIDGE_ASSERT(op.getInput(i)->dataType() == datatypeFirstInput, "Cannot sum inputs with different data types.");
+    }
+
+    // Find the correct kernel type
+    const auto impl = Registrar<SumImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Convert input data (no overhead if not needed!)
+    // TODO: right now, if needed, memory will be allocated/deallocated at each
+    // call to forward(). We might put the following shared_ptr as members of
+    // this class to avoid that.
+    const std::size_t nbDims = op.getOutput(0)->nbDims();
+    std::vector<std::vector<std::size_t>> inputsDims;
+    std::vector<const void*> opInputs;
+    std::vector<std::shared_ptr<Tensor>> inputsFallback(op.nbInputs());
+    for (IOIndex_t i = 0; i < op.nbInputs(); ++i) {
+        std::vector<std::size_t> inputDims(nbDims, 1);
+        auto dims = op.getInput(i)->dims();
+        // Right-align input dims with output dims (leading dims broadcast as 1)
+        for (std::size_t j = dims.size() - 1; j + 1 > 0; --j) {
+            std::size_t idx = nbDims - (dims.size() - j);
+            inputDims[idx] = dims[j];
+        }
+        inputsDims.push_back(inputDims);
+        const auto& input = op.getInput(i)->refCastFrom(inputsFallback[i], *op.getOutput(0));
+        opInputs.push_back(input.getImpl()->rawPtr());
+    }
+
+    impl.forward(opInputs,
+               inputsDims,
+               op.getOutput(0)->size(),
+               op.getOutput(0)->dims(),
+               getCPUPtr(op.getRawOutput(0)));
+}
+
+
+template <>
+void Aidge::SumImpl_cpu::backward() {
+}
\ No newline at end of file
diff --git a/unit_tests/operator/Test_SumImpl.cpp b/unit_tests/operator/Test_SumImpl.cpp
new file mode 100644
index 00000000..c3e81a90
--- /dev/null
+++ b/unit_tests/operator/Test_SumImpl.cpp
@@ -0,0 +1,176 @@
+/********************************************************************************
+ * Copyright (c) 2025 CEA-List
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0.
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ ********************************************************************************/
+
+ #include <catch2/catch_test_macros.hpp>
+
+ #include "aidge/data/Tensor.hpp"
+ #include "aidge/operator/Sum.hpp"
+ 
+ #include "aidge/backend/cpu.hpp"
+ 
+ using namespace Aidge;
+ 
+ TEST_CASE("[cpu/operator] Sum(forward)", "[Sum][CPU]") {
+     std::shared_ptr<Tensor> input1 = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
+         {                                       //
+             {                                   //
+                 {{20, 47},{21, 48},{22, 49}},   //
+                 {{23, 50},{24, 51},{25, 52}},   //
+                 {{26, 53},{27, 54},{28, 55}}    //
+             },                                  //
+             {                                   //
+                 {{29, 56},{30, 57},{31, 58}},   //
+                 {{32, 59},{33, 60},{34, 61}},   //
+                 {{35, 62},{36, 63},{37, 64}}    //
+             },                                  //
+             {                                   //
+                 {{38, 65},{39, 66},{40, 67}},   //
+                 {{41, 68},{42, 69},{43, 70}},   //
+                 {{44, 71},{45, 72},{46, 73}}    //
+             }                                   //
+         }                                       //
+     });                                         //
+ 
+     SECTION("One input") {
+         std::shared_ptr<Node> mySum = Sum(1);
+         auto op = std::static_pointer_cast<OperatorTensor>(mySum -> getOperator());
+         op->associateInput(0, input1);
+         op->setBackend("cpu");
+         op->setDataType(DataType::Int32);
+         mySum->forward();
+ 
+         REQUIRE(*(op->getOutput(0)) == *input1);
+     }
+ 
+     SECTION("Two inputs") {
+         std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
+             {
+                 {
+                     {{40,  94},{42,  96},{44,  98}},
+                     {{46, 100},{48, 102},{50, 104}},
+                     {{52, 106},{54, 108},{56, 110}}
+                 },
+                 {
+                     {{58, 112},{60, 114},{62, 116}},
+                     {{64, 118},{66, 120},{68, 122}},
+                     {{70, 124},{72, 126},{74, 128}}
+                 },
+                 {
+                     {{76, 130},{78, 132},{80, 134}},
+                     {{82, 136},{84, 138},{86, 140}},
+                     {{88, 142},{90, 144},{92, 146}}
+                 }
+             }
+         });
+ 
+         std::shared_ptr<Node> mySum = Sum(2);
+         auto op = std::static_pointer_cast<OperatorTensor>(mySum -> getOperator());
+         op->associateInput(0, input1);
+         op->associateInput(1, input1);
+         op->setBackend("cpu");
+         op->setDataType(DataType::Int32);
+         mySum->forward();
+ 
+         REQUIRE(*(op->getOutput(0)) == *expectedOutput);
+     }
+ 
+     SECTION("Three inputs") {
+         std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
+             {
+                 {
+                     {{ 60, 141},{ 63, 144},{ 66, 147}},
+                     {{ 69, 150},{ 72, 153},{ 75, 156}},
+                     {{ 78, 159},{ 81, 162},{ 84, 165}}
+                 },
+                 {
+                     {{ 87, 168},{ 90, 171},{ 93, 174}},
+                     {{ 96, 177},{ 99, 180},{102, 183}},
+                     {{105, 186},{108, 189},{111, 192}}
+                 },
+                 {
+                     {{114, 195},{117, 198},{120, 201}},
+                     {{123, 204},{126, 207},{129, 210}},
+                     {{132, 213},{135, 216},{138, 219}}
+                 }
+             }
+         });
+ 
+         std::shared_ptr<Node> mySum = Sum(3);
+         auto op = std::static_pointer_cast<OperatorTensor>(mySum -> getOperator());
+         op->associateInput(0, input1);
+         op->associateInput(1, input1);
+         op->associateInput(2, input1);
+         op->setDataType(DataType::Int32);
+         op->setBackend("cpu");
+         mySum->forward();
+ 
+         REQUIRE(*op->getOutput(0) == *expectedOutput);
+     }
+ 
+     SECTION("Broadcasting") {
+         std::shared_ptr<Tensor> input_0 = std::make_shared<Tensor>(Array4D<int,3,1,3,2> {
+         {                                       //
+             {                                   //
+                 {{0, 1},{2, 3},{4, 5}}          //
+             },                                  //
+             {                                   //
+                 {{6, 7},{8, 9},{10, 11}}        //
+             },                                  //
+             {                                   //
+                 {{12, 13},{14, 15},{16, 17}}    //
+             }                                   //
+         }                                       //
+         });                                     //
+         std::shared_ptr<Tensor> input_1 = std::make_shared<Tensor>(Array4D<int,1,3,3,2> {
+         {                                       //
+             {                                   //
+                 {{20, 21},{22, 23},{24, 25}},   //
+                 {{26, 27},{28, 29},{30, 31}},   //
+                 {{32, 33},{34, 35},{36, 37}}    //
+             }                                   //
+         }                                       //
+         });                                     //
+ 
+         std::shared_ptr<Tensor> input_2 = std::make_shared<Tensor>(Array1D<int,2> {{100,200}});  
+         std::shared_ptr<Tensor> expectedOutput = std::make_shared<Tensor>(Array4D<int,3,3,3,2> {
+             {                                               //
+                 {                                           //
+                     {{ 120, 222},{ 124, 226},{ 128, 230}},  //
+                     {{ 126, 228},{ 130, 232},{ 134, 236}},  //
+                     {{ 132, 234},{ 136, 238},{ 140, 242}}   //
+                 },                                          //
+                 {                                           //
+                     {{ 126, 228},{ 130, 232},{ 134, 236}},  //
+                     {{ 132, 234},{ 136, 238},{ 140, 242}},  //
+                     {{ 138, 240},{ 142, 244},{ 146, 248}}   //
+                 },                                          //
+                 {                                           //
+                     {{ 132, 234},{ 136, 238},{140, 242}},   //
+                     {{ 138, 240},{ 142, 244},{146, 248}},   //
+                     {{ 144, 246},{ 148, 250},{152, 254}}    //
+                 }                                           //
+             }                                               //
+         });                                                 //
+ 
+         std::shared_ptr<Node> mySum = Sum(3);
+         auto op = std::static_pointer_cast<OperatorTensor>(mySum -> getOperator());
+         op->associateInput(0, input_0);
+         op->associateInput(1, input_1);
+         op->associateInput(2, input_2);
+         op->setDataType(DataType::Int32);
+         op->setBackend("cpu");
+         mySum->forward();
+         op->getOutput(0)->print();
+         expectedOutput->print();
+         REQUIRE(*op->getOutput(0) == *expectedOutput);
+     }
+ }
+ 
\ No newline at end of file
-- 
GitLab


From 32c64230a062063744bac653ae473d8ab2e4e933 Mon Sep 17 00:00:00 2001
From: hrouis <houssemeddine.rouis92@gmail.com>
Date: Mon, 24 Mar 2025 13:49:45 +0100
Subject: [PATCH 8/8] remove transA and transB attrs from FC, to be handled in
 the TransposeFC metaop

---
 include/aidge/backend/cpu/operator/FCImpl.hpp |   4 -
 .../backend/cpu/operator/FCImpl_kernels.hpp   |  16 +--
 src/operator/FCImpl.cpp                       |  15 +--
 unit_tests/operator/Test_FCImpl.cpp           | 112 +-----------------
 unit_tests/operator/Test_MetaOperator.cpp     |   4 +-
 unit_tests/scheduler/Test_CastMove.cpp        |   4 +-
 unit_tests/scheduler/Test_Scheduler.cpp       |   6 +-
 7 files changed, 20 insertions(+), 141 deletions(-)
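
With the transpose flags gone, the CPU kernels below are back to a single
fixed layout: input X of shape (batchSize, inFeatures) and weights W of shape
(outFeatures, inFeatures). Per the forward kernel in this patch,

    Y_{b,o} = \beta \, \mathrm{bias}_{o}
            + \alpha \sum_{i=0}^{I-1} X_{b,i} \, W_{o,i},

and transposed operands are to be expressed upstream, presumably by composing
Transpose nodes with FC inside the TransposeFC meta-operator named in the
subject (that metaop is not part of this patch).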

diff --git a/include/aidge/backend/cpu/operator/FCImpl.hpp b/include/aidge/backend/cpu/operator/FCImpl.hpp
index 9249ba77..4daa522f 100644
--- a/include/aidge/backend/cpu/operator/FCImpl.hpp
+++ b/include/aidge/backend/cpu/operator/FCImpl.hpp
@@ -27,8 +27,6 @@ using FCImpl_cpu = OperatorImpl_cpu<FC_Op,
     void(const DimSize_t,
         const DimSize_t,
         const DimSize_t,
-        const bool,
-        const bool,
         const float,
         const float,
         const void *,
@@ -38,8 +36,6 @@ using FCImpl_cpu = OperatorImpl_cpu<FC_Op,
     void(const DimSize_t,
         const DimSize_t,
         const DimSize_t,
-        const bool,
-        const bool,
         const float,
         const float,
         const void *,
diff --git a/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
index 3542c8cf..873830d3 100644
--- a/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
@@ -22,8 +22,6 @@ template <class I, class W, class B, class O>
 void FCImpl_cpu_forward_kernel(const DimSize_t batchSize,
                             const DimSize_t inputFeatureSize,
                             const DimSize_t outputFeatureSize,
-                            const bool transA,
-                            const bool transB,
                             const float alpha_,
                             const float beta_,
                             const void* input_,
@@ -61,8 +59,8 @@ void FCImpl_cpu_forward_kernel(const DimSize_t batchSize,
         for (std::size_t out = 0; out < outputFeatureSize; ++out) {
             O sum = O(0);
             for (std::size_t i = 0; i < inputFeatureSize; ++i) {
-                std::size_t inputIdx = transA ? (i * batchSize + batch) : (batch * inputFeatureSize + i);
-                std::size_t weightIdx = transB ? (i * outputFeatureSize + out) : (out * inputFeatureSize + i);
+                std::size_t inputIdx = batch * inputFeatureSize + i;
+                std::size_t weightIdx = out * inputFeatureSize + i;
                 sum += static_cast<O>(input[inputIdx]) * static_cast<O>(weights[weightIdx]);
             }
             output[batch * outputFeatureSize + out] += alpha * sum;
@@ -75,8 +73,6 @@ template <class I, class O, class W, class B>
 void FCImpl_cpu_backward_kernel(const DimSize_t batchSize,
                                 const DimSize_t inputFeatureSize,
                                 const DimSize_t outputFeatureSize,
-                                const bool transA,
-                                const bool transB,
                                 const float alpha_,
                                 const float beta_,
                                 const void* input_,
@@ -120,11 +116,11 @@ void FCImpl_cpu_backward_kernel(const DimSize_t batchSize,
         for (std::size_t c = 0; c < inputFeatureSize; ++c) {
             O sum{0};
             for (std::size_t b = 0; b < batchSize; ++b) {
-                std::size_t inputIdx = transA ? (c * batchSize + b) : (b * inputFeatureSize + c);
+                std::size_t inputIdx = b * inputFeatureSize + c;
                 std::size_t outputIdx = b * outputFeatureSize + o;
                 sum += originalInput[inputIdx] * outputGrad[outputIdx];
             }
-            std::size_t weightIdx = transB ? (c * outputFeatureSize + o) : (o * inputFeatureSize + c);
+            std::size_t weightIdx = o * inputFeatureSize + c;
             weightGrad[weightIdx] = alpha * sum;
         }
     }
@@ -135,11 +131,11 @@ void FCImpl_cpu_backward_kernel(const DimSize_t batchSize,
         for (std::size_t c = 0; c < inputFeatureSize; ++c) {
             O sum{0};
             for (std::size_t o = 0; o < outputFeatureSize; ++o) {
-                std::size_t weightIdx = transB ? (c * outputFeatureSize + o) : (o * inputFeatureSize + c);
+                std::size_t weightIdx = o * inputFeatureSize + c;
                 std::size_t outputIdx = b * outputFeatureSize + o;
                 sum += weight[weightIdx] * outputGrad[outputIdx];
             }
-            std::size_t inputIdx = transA ? (c * batchSize + b) : (b * inputFeatureSize + c);
+            std::size_t inputIdx = b * inputFeatureSize + c;
             inputGrad[inputIdx] = alpha * sum;
         }
     }
diff --git a/src/operator/FCImpl.cpp b/src/operator/FCImpl.cpp
index 2a06803c..821f673d 100644
--- a/src/operator/FCImpl.cpp
+++ b/src/operator/FCImpl.cpp
@@ -41,13 +41,10 @@ void Aidge::FCImpl_cpu::forward()
     const auto& input1 = op_.getInput(1)->refCastFrom(input1Fallback, *(op_.getOutput(0)));
     const auto& input2 = (op_.getInput(2)) ? op_.getInput(2)->refCastFrom(input2Fallback, *(op_.getOutput(0))) : Tensor();
     // Call kernel
-    const DimSize_t nbInFeat = op_.transB()? input1.dims()[0]:input1.dims()[1];
-    const auto batchSize = input0.size() /nbInFeat;
+    const auto batchSize = input0.size() /input1.dims()[1];
     impl.forward(batchSize,
-        op_.transB()? input1.dims()[0]:input1.dims()[1], // nb input features
-        op_.transB()?input1.dims()[1]: input1.dims()[0], // nb output features
-        op_.transA(),
-        op_.transB(),
+        input1.dims()[1], // nb input features
+        input1.dims()[0], // nb output features
         op_.alpha(),
         op_.beta(),
         input0.getImpl()->rawPtr(),
@@ -76,13 +73,11 @@ void Aidge::FCImpl_cpu::backward()
     const auto& input1grad = op_.getInput(1)->grad()->refCastFrom(input1gradFallback, *(op_.getOutput(0)));
     const auto& input2grad = (op_.getInput(2)) ? op_.getInput(2)->grad()->refCastFrom(input2gradFallback, *(op_.getOutput(0))) : Tensor();
     // Call kernel
-    const DimSize_t nbInFeat = op_.transB()? input1grad.dims()[0]:input1grad.dims()[1];
+    const DimSize_t nbInFeat = input1grad.dims()[1];
     const auto batchSize = input0grad.size() /nbInFeat;
     impl.backward(batchSize,
         nbInFeat, // nb input features
-        op_.transB()?input1grad.dims()[1]: input1grad.dims()[0], // nb output features
-        op_.transA(),
-        op_.transB(),
+        input1grad.dims()[0], // nb output features
         op_.alpha(),
         op_.beta(),
         getCPUPtr(fc_grad),
diff --git a/unit_tests/operator/Test_FCImpl.cpp b/unit_tests/operator/Test_FCImpl.cpp
index ce41ec1e..a84ab63a 100644
--- a/unit_tests/operator/Test_FCImpl.cpp
+++ b/unit_tests/operator/Test_FCImpl.cpp
@@ -49,7 +49,7 @@ TEST_CASE("[cpu/oeprator] FC(forward)", "[FC][CPU]") {
     Tensor myOutput = Array2D<int, 2, 5>{
             {{23601, 23602, 23603, 23604, 23605}, {68601, 68602, 68603, 68604, 68605}}};
 
-    std::shared_ptr<Node> myFC = FC(75, 5, 1.0f, 1.0f, false, false, false, "myfc");
+    std::shared_ptr<Node> myFC = FC(75, 5, 1.0f, 1.0f, false, "myfc");
     auto op = std::static_pointer_cast<FC_Op>(myFC -> getOperator());
     op -> setDataType(DataType::Int32);
     op -> setBackend("cpu");
@@ -107,45 +107,6 @@ TEST_CASE("[cpu/oeprator] FC(forward)", "[FC][CPU]") {
         myFC->forward();
         REQUIRE(*(op->getOutput(0)) == myOutput);
     }
-
-    SECTION("transA and transB") {
-        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array2D<float, 3, 4>{{
-                {1.2137502, 0.85416597, -1.2204953, -1.1383502},
-                {-0.73264742, -1.361271, -1.400124, -0.47965094},
-                {-0.64570695, -0.48093504, 0.14018863, -0.64090657}
-        }}); // Transposed Input (4x3 → 3x4)
-
-        std::shared_ptr<Tensor> myWeights = std::make_shared<Tensor>(Array2D<float, 3, 2>{{
-                {0.12352414, -1.0330718},
-                {-0.036496852, 0.011371522},
-                {-0.15418212, -0.60658616}
-        }}); // Transposed Weights (2x3 → 3x2)
-
-		std::shared_ptr<Tensor> myBias = std::make_shared<Tensor>(Array1D<float, 2>{{
-				-0.31278628, 0.68560582
-		}});
-
-		Tensor myOutput = Tensor(Array2D<float, 4, 2>{{
-				{0.39605334, -1.3982881},
-				{0.30229449, -0.86952901},
-				{-0.39894336, 2.6626053},
-				{-0.20497644, 3.4614196}
-		}});
-
-        std::shared_ptr<Node> myFC = FC(3, 2, 2.0f, 0.5f, false, true, true, "myfc"); // transA = true, transB = true
-
-        auto op = std::static_pointer_cast<OperatorTensor>(myFC->getOperator());
-        op->associateInput(0, myInput);
-        op->associateInput(1, myWeights);
-        op->associateInput(2, myBias);
-        op->setDataType(DataType::Float32);
-        op->setBackend("cpu");
-
-        myFC->forward();
-
-        REQUIRE(approxEq<float>(*(op->getOutput(0)), myOutput));
-    }
-    // std::cout << static_cast<Tensor>((*myFC->getOperator())["weight"])[0][0][0][0] << std::endl;
 }
 
 
@@ -170,7 +131,7 @@ TEST_CASE("[cpu/oeprator] FC(backward)", "[FC][CPU]") {
 				{3.0791781, -1.3184667},
 				{2.3588469, -0.31109101}}});
 
-		std::shared_ptr<Node> myFC = FC(3, 2, 1.0f, 1.0f, false, false, false, "myfc");
+		std::shared_ptr<Node> myFC = FC(3, 2, 1.0f, 1.0f, false, "myfc");
 		auto op = std::static_pointer_cast<OperatorTensor>(myFC -> getOperator());
 		op -> associateInput(0, myInput);
 		op -> associateInput(1, myWeights);
@@ -208,73 +169,4 @@ TEST_CASE("[cpu/oeprator] FC(backward)", "[FC][CPU]") {
 		REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), expectedWeightsGrad));
 		REQUIRE(approxEq<float>(*(op->getInput(2)->grad()), expectedBiasGrad));
 	}
-	SECTION("transA and transB") {
-		std::shared_ptr<Tensor> myInput =
-		std::make_shared<Tensor>(Array2D<float, 3, 4>{{
-				{0.51234567, -1.23456789, 0.67891234, -0.43219876},
-				{1.87654321, -0.98765432, 1.34567890, -1.23456789},
-				{-0.67891234, 0.43219876, -1.87654321, 0.98765432}
-		}});
-
-		std::shared_ptr<Tensor> myWeights =
-		std::make_shared<Tensor>(Array2D<float, 3, 2>{{
-				{0.12345678, -1.34567890},
-				{-0.87654321, 0.56789012},
-				{0.23456789, -0.45678901}
-		}});
-
-		std::shared_ptr<Tensor> myBias =
-		std::make_shared<Tensor>(Array1D<float, 2>{
-				{0.65432109, -0.54321098}
-		});
-
-		Tensor myOutput = Tensor(Array2D<float, 4, 2>{{
-				{-3.1545789, 1.101069},
-				{1.9565322, 1.534453},
-				{-2.7446516, 1.1439626},
-				{2.8480933, -1.412901}
-		}});
-
-		std::shared_ptr<Node> myFC = FC(3, 2, 2.0f, 0.5f, false, true, true, "myfc");
-		auto op = std::static_pointer_cast<OperatorTensor>(myFC->getOperator());
-		op->associateInput(0, myInput);
-		op->associateInput(1, myWeights);
-		op->associateInput(2, myBias);
-		op->setDataType(DataType::Float32);
-		op->setBackend("cpu");
-
-		// Forward pass
-		myFC->forward();
-
-		REQUIRE(approxEq<float>(*(op->getOutput(0)), myOutput));
-
-		// Backward Pass
-		std::shared_ptr<Tensor> myOutputGrad = std::make_shared<Tensor>(Array2D<float, 4, 2>{{
-			{0.19876543, -0.65432109},
-			{0.76543210, -1.23456789},
-			{-0.43210987, 1.09876543},
-			{-0.98765432, 0.87654321}
-		}});
-
-		Tensor expectedInputGrad = Tensor(Array2D<float, 3, 4>{{
-			{1.8100901, 3.5116596, -3.0638645, -2.6029568},
-			{-1.0916179, -2.7440662, 2.0054817, 2.7270038},
-			{0.69102132, 1.4869658, -1.206526, -1.2641346}
-		}});
-
-		Tensor expectedWeightsGrad = Tensor(Array2D<float, 3, 2>{{
-			{-1.4192863, 3.1120875},
-			{0.50970948, 0.77579576},
-			{0.062572584, -2.571022}
-		}});
-
-		Tensor expectedBiasGrad = Tensor(Array1D<float, 2>{
-			{-0.22778332,  0.04320982}
-		});
-		op->getOutput(0)->setGrad(myOutputGrad);
-		myFC->backward();
-		REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), expectedInputGrad));
-		REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), expectedWeightsGrad));
-		REQUIRE(approxEq<float>(*(op->getInput(2)->grad()), expectedBiasGrad));
-	}
 }
\ No newline at end of file
diff --git a/unit_tests/operator/Test_MetaOperator.cpp b/unit_tests/operator/Test_MetaOperator.cpp
index 925a264a..7c1718dd 100644
--- a/unit_tests/operator/Test_MetaOperator.cpp
+++ b/unit_tests/operator/Test_MetaOperator.cpp
@@ -702,8 +702,8 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
         auto init = std::make_shared<Tensor>(Array2D<float, 2, 5>{});
         uniformFiller<float>(init, 0.0, 0.0);
 
-        auto fc1 = FC(inChannels, outChannels, 1.0f, 1.0f, true, false, false, "myfc");
-        auto fc2 = FC(outChannels, inChannels, 1.0f, 1.0f, true, false, false, "fc2");
+        auto fc1 = FC(inChannels, outChannels, 1.0f, 1.0f, true, "myfc");
+        auto fc2 = FC(outChannels, inChannels, 1.0f, 1.0f, true, "fc2");
         // NOTE: Account for init step by adding 1 to the max timestep
         // parameter.
         auto lif1 = Leaky(nbTimeSteps + 1, beta, threshold, LeakyReset::Subtraction, "leaky");
diff --git a/unit_tests/scheduler/Test_CastMove.cpp b/unit_tests/scheduler/Test_CastMove.cpp
index fdeb2d79..3f1538a4 100644
--- a/unit_tests/scheduler/Test_CastMove.cpp
+++ b/unit_tests/scheduler/Test_CastMove.cpp
@@ -56,7 +56,7 @@ TEST_CASE("[cpu/castmove] CastMove(forward)") {
                     Conv(1, 3, {3, 3}, "conv1"),
                     Conv(3, 4, {1, 1}, "conv2"),
                     Conv(4, 3, {1, 1}, "conv3"),
-                    FC(27, 5, 1.0f, 1.0f, false, false, false, "fc")});
+                    FC(27, 5, 1.0f, 1.0f, false, "fc")});
 
         g->getNode("conv1")->getOperator()->setInput(0, inputTensor);
         g->getNode("conv1")->getOperator()->setInput(1, weight1);
@@ -158,7 +158,7 @@ TEST_CASE("[cpu/castmove] CastMove(forward)") {
                     Conv(1, 3, {3, 3}, "conv1"),
                     Conv(3, 4, {1, 1}, "conv2"),
                     Conv(4, 3, {1, 1}, "conv3"),
-                    FC(27, 5, 1.0f, 1.0f, false, false, false, "fc")});
+                    FC(27, 5, 1.0f, 1.0f, false, "fc")});
 
         g->getNode("conv1")->getOperator()->setInput(0, inputTensor);
         g->getNode("conv1")->getOperator()->setInput(1, weight1);
diff --git a/unit_tests/scheduler/Test_Scheduler.cpp b/unit_tests/scheduler/Test_Scheduler.cpp
index e264f114..1361b8a2 100644
--- a/unit_tests/scheduler/Test_Scheduler.cpp
+++ b/unit_tests/scheduler/Test_Scheduler.cpp
@@ -71,7 +71,7 @@ TEST_CASE("[cpu/scheduler] SequentialScheduler(forward)") {
                     Conv(1, 3, {3, 3}, "conv1"),
                     Conv(3, 4, {1, 1}, "conv2"),
                     Conv(4, 3, {1, 1}, "conv3"),
-                    FC(27, 5, 1.0f, 1.0f, false, false, false, "fc")});
+                    FC(27, 5, 1.0f, 1.0f, false, "fc")});
 
         g->getNode("conv1")->getOperator()->setInput(0, inputTensor);
         g->getNode("conv1")->getOperator()->setInput(1, weight1);
@@ -173,7 +173,7 @@ TEST_CASE("[cpu/scheduler] SequentialScheduler(forward)") {
                                 Conv(3, 3, {1, 1}, "conv1.3")}),
                             Add("add2"),
                             Conv(3, 2, {1, 1}, "conv2"),
-                            FC(18, 5, 1.0f, 1.0f, false, false, false, "out")});
+                            FC(18, 5, 1.0f, 1.0f, false, "out")});
 
         g->getNode("inputConv")->getOperator()->setInput(0, inputTensor);
         g->getNode("inputConv")->getOperator()->setInput(1, weight1);
@@ -321,7 +321,7 @@ TEST_CASE("[cpu/scheduler] SequentialScheduler(forward)") {
                     Conv(1, 3, {3, 3}, "conv1"),
                     Conv(3, 4, {1, 1}, "conv2"),
                     Conv(4, 3, {1, 1}, "conv3"),
-                    FC(27, 5, 1.0f, 1.0f, false, false, false, "fc")});
+                    FC(27, 5, 1.0f, 1.0f, false, "fc")});
 
         // g->getNode("conv1")->getOperator()->setInput(0, inputTensor);
         g->getNode("conv1")->getOperator()->setInput(1, weight1);
-- 
GitLab