From 1b7c5b5b0a7c3d7f80c0ddec7e8a5a311648c7ee Mon Sep 17 00:00:00 2001
From: Jerome Hue <jerome.hue@cea.fr>
Date: Wed, 22 Jan 2025 15:39:38 +0100
Subject: [PATCH] Implement, register and test Sub backward on the CPU backend
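
Implement the backward pass of the Sub operator for the CPU backend:
extend the SubImpl_cpu entry point with a backward kernel signature,
register the Float32 backward kernel (the Float64 registration keeps a
nullptr backward for now), wire SubImpl_cpu::backward() to the kernel,
and add unit tests covering 2D/1D, 3D/1D and randomized 4D/3D broadcast
cases.

For C = A - B with broadcasting, the gradients are

    dL/dA = +dL/dC, summed over the dimensions broadcast on A
    dL/dB = -dL/dC, summed over the dimensions broadcast on B

For example, in the first test case A is 2x3, B has shape (3,) and
dL/dC is all ones, so dL/dB sums -1 over the two broadcast rows,
giving (-2, -2, -2).

The reduction performed by the kernel is equivalent to the following
sketch (illustrative only: it assumes contiguous row-major buffers
whose gradient entries start zeroed, and toInputIdx0/toInputIdx1 stand
in for the broadcast index mapping, not actual kernel helpers):

    for (std::size_t i = 0; i < gradOutputLength; ++i) {
        gradInput0[toInputIdx0(i)] += gradOutput[i];  // dC/dA = +1
        gradInput1[toInputIdx1(i)] -= gradOutput[i];  // dC/dB = -1
    }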

---
 .../aidge/backend/cpu/operator/SubImpl.hpp    |  14 +-
 .../backend/cpu/operator/SubImpl_kernels.hpp  |   4 +-
 src/operator/SubImpl.cpp                      |  29 ++-
 unit_tests/operator/Test_SubImpl.cpp          | 167 ++++++++++++++++++
 4 files changed, 209 insertions(+), 5 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/SubImpl.hpp b/include/aidge/backend/cpu/operator/SubImpl.hpp
index eed26ddc..064b5329 100644
--- a/include/aidge/backend/cpu/operator/SubImpl.hpp
+++ b/include/aidge/backend/cpu/operator/SubImpl.hpp
@@ -23,7 +23,19 @@
 namespace Aidge {
 // Operator implementation entry point for the backend
 using SubImpl_cpu = OperatorImpl_cpu<Sub_Op,
-    void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*,void*)>;
+    void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*),
+    void(const std::size_t,                // input0Length
+         const std::size_t,                // input1Length
+         const std::size_t,                // grad0Length
+         const std::vector<std::size_t>&,  // input0Dims
+         const std::vector<std::size_t>&,  // input1Dims
+         const std::vector<std::size_t>&,  // outputDims
+         const void*,                      // input0
+         const void*,                      // input1
+         const void*,                      // gradOutput
+         void*,                            // gradientInput0
+         void*)                            // gradientInput1
+>;
 
 // Implementation entry point registration to Operator
 REGISTRAR(Sub_Op, "cpu", Aidge::SubImpl_cpu::create);
diff --git a/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp b/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp
index a1954776..cb16c037 100644
--- a/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp
@@ -163,8 +163,6 @@ void SubImpl_cpu_backward_kernel(const std::size_t input0Length,
                                void* gradientInput0_,
                                void* gradientInput1_)
 {
-    const I1* input0 = static_cast<const I1*>(input0_);
-    const I2* input1 = static_cast<const I2*>(input1_);
     const O* grad_output = static_cast<const O*>(grad_output_);
     auto* grad_input_0 = static_cast<I1*>(gradientInput0_);
     auto* grad_input_1 = static_cast<I2*>(gradientInput1_);
@@ -202,7 +200,7 @@ void SubImpl_cpu_backward_kernel(const std::size_t input0Length,
 // Kernels registration to implementation entry point
 REGISTRAR(SubImpl_cpu,
     {DataType::Float32},
-    {ProdConso::inPlaceModel, Aidge::SubImpl_cpu_forward_kernel<float, float, float>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::SubImpl_cpu_forward_kernel<float, float, float>, Aidge::SubImpl_cpu_backward_kernel<float,float,float>});
 REGISTRAR(SubImpl_cpu,
     {DataType::Float64},
     {ProdConso::inPlaceModel, Aidge::SubImpl_cpu_forward_kernel<double, double, double>, nullptr});
diff --git a/src/operator/SubImpl.cpp b/src/operator/SubImpl.cpp
index e36abe2a..cce4e27a 100644
--- a/src/operator/SubImpl.cpp
+++ b/src/operator/SubImpl.cpp
@@ -41,5 +41,32 @@ void Aidge::SubImpl_cpu::forward() {
 
 template <>
 void Aidge::SubImpl_cpu::backward() {
-    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Sub_Op on backend cpu");
+
+    const Sub_Op& op_ = dynamic_cast<const Sub_Op&>(mOp);
+
+    auto in0 = op_.getInput(0);
+    auto in1 = op_.getInput(1);
+    auto in0grad = in0->grad();
+    auto in1grad = in1->grad();
+    auto out0grad = op_.getOutput(0)->grad();
+
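+    // For C = A - B (with broadcasting):
+    //   dL/dA = +dL/dC, summed over the dimensions broadcast on A
+    //   dL/dB = -dL/dC, summed over the dimensions broadcast on B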
+    // Find the correct kernel type
+    const auto impl = Registrar<SubImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Call kernel
+    impl.backward(/* input0Length */ in0grad->size(),
+                  /* input1Length */ in1grad->size(),
+                  /* grad0Length  */ out0grad->size(),
+                  /* input0Dims   */ in0->dims(),
+                  /* input1Dims   */ in1->dims(),
+                  /* outputDims   */ out0grad->dims(),
+                  /* input0       */ getCPUPtr(in0),
+                  /* input1       */ getCPUPtr(in1),
+                  /* gradOutput   */ getCPUPtr(out0grad),
+                  /* gradInput0   */ getCPUPtr(in0grad),
+                  /* gradInput1   */ getCPUPtr(in1grad));
+
 }
diff --git a/unit_tests/operator/Test_SubImpl.cpp b/unit_tests/operator/Test_SubImpl.cpp
index 1317e88a..d9b6207b 100644
--- a/unit_tests/operator/Test_SubImpl.cpp
+++ b/unit_tests/operator/Test_SubImpl.cpp
@@ -322,4 +322,171 @@ TEST_CASE("[cpu/operator] Sub", "[Sub][CPU]") {
         }
     }
 }
+
+
+TEST_CASE("[CPU/Operator] Sub(Backward)", "[Sub][CPU][Backward]") {
+    std::shared_ptr<Node> mySub = Sub();
+    auto op = std::static_pointer_cast<OperatorTensor>(mySub->getOperator());
+    op->setDataType(DataType::Float32);
+    op->setBackend("cpu");
+
+    SECTION("Case 1: 1D and 2D Tensors") {
+        const auto T0 = std::make_shared<Tensor>(
+            Array2D<float, 2, 3>({{{1, 2, 3}, {4, 5, 6}}}));
+
+        const auto T1 =
+            std::make_shared<Tensor>(Array1D<float, 3>({0.1, 0.2, 0.3}));
+
+        T0->setDataType(DataType::Float32);
+        T0->setBackend("cpu");
+        T1->setDataType(DataType::Float32);
+        T1->setBackend("cpu");
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>(
+            Array2D<float, 2, 3>({{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}})));
+        op->forwardDims();
+
+        mySub->backward();
+
+        // For subtraction: grad_input0 = grad_output
+        const auto expectedGrad0 = std::make_shared<Tensor>(
+            Array2D<float, 2, 3>({{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}}));
+
+        // For subtraction: grad_input1 = -grad_output (summed across broadcast dimensions)
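+        // Here the two broadcast rows each contribute -1, giving -2 per entry.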
+        const auto expectedGrad1 =
+            std::make_shared<Tensor>(Array1D<float, 3>({-2, -2, -2}));
+
+        REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
+        REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1));
+    }
+
+    SECTION("Case 2: 3D and 1D tensors") {
+        const auto T0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+            {{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}},
+              {{7.0, 8.0, 9.0}, {10.0, 11.0, 12.0}}}}));
+
+        const auto T1 =
+            std::make_shared<Tensor>(Array1D<float, 3>({0.3, 0.2, 0.1}));
+
+        const auto newGrad = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+            {{{{1, 1, 1}, {1, 1, 1}}, {{1, 1, 1}, {1, 1, 1}}}}));
+
+        const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+            {{{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}},
+              {{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}}}));
+
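+        // input1 broadcasts over 2 * 2 = 4 output positions, each contributing -1.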
+        const auto expectedGrad1 =
+            std::make_shared<Tensor>(Array1D<float, 3>({-4.0, -4.0, -4.0}));
+
+        for (auto T : {T0, T1, newGrad, expectedGrad0, expectedGrad1}) {
+            T->setBackend("cpu");
+            T->setDataType(DataType::Float32);
+        }
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(newGrad);
+        op->forwardDims();
+
+        mySub->backward();
+
+        REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
+        REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1));
+    }
+
+    SECTION("Case 3: Random values with broadcasting") {
+        // Use random values
+        std::vector<std::size_t> dims0 = {5, 2, 1, 7}; // First tensor
+        std::vector<std::size_t> dims1 = {2, 6, 7};    // Second tensor
+        std::vector<std::size_t> outputDims = {5, 2, 6, 7};
+
+        const auto input0Size = 5 * 2 * 1 * 7;
+        const auto input1Size = 2 * 6 * 7;
+        const auto outputSize = 5 * 2 * 6 * 7;
+
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_real_distribution<float> dist(0.1f, 1.0f);
+
+        std::vector<float> input0Data(input0Size);
+        std::vector<float> input1Data(input1Size);
+        std::vector<float> gradOutputData(outputSize);
+
+        // Fill with random values
+        for (auto &val : input0Data) val = dist(gen);
+        for (auto &val : input1Data) val = dist(gen);
+        for (auto &val : gradOutputData) val = dist(gen);
+
+        auto T0 = std::make_shared<Tensor>();
+        auto T1 = std::make_shared<Tensor>();
+
+        T0->setDataType(DataType::Float32);
+        T0->setBackend("cpu");
+        T0->resize(dims0);
+        T0->getImpl()->setRawPtr(input0Data.data(), input0Size);
+
+        T1->setDataType(DataType::Float32);
+        T1->setBackend("cpu");
+        T1->resize(dims1);
+        T1->getImpl()->setRawPtr(input1Data.data(), input1Size);
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+
+        // Set gradient of output
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>());
+        op->getOutput(0)->grad()->setDataType(DataType::Float32);
+        op->getOutput(0)->grad()->setBackend("cpu");
+        op->getOutput(0)->grad()->resize(outputDims);
+        op->getOutput(0)->grad()->getImpl()->setRawPtr(gradOutputData.data(), outputSize);
+
+        op->forwardDims();
+
+        // Compute reference gradients
+        std::vector<float> expectedGrad0(input0Size, 0.0f);
+        std::vector<float> expectedGrad1(input1Size, 0.0f);
+
+        for (std::size_t n = 0; n < 5; ++n) {
+            for (std::size_t c = 0; c < 2; ++c) {
+                for (std::size_t h = 0; h < 6; ++h) {
+                    for (std::size_t w = 0; w < 7; ++w) {
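+                        // Flat indices: input0 broadcasts over h (its dim 2
+                        // has size 1); input1 lacks the leading n dimension.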
+                        std::size_t outIdx = w + 7 * (h + 6 * (c + 2 * n));
+                        std::size_t in0Idx = w + 7 * (0 + 1 * (c + 2 * n));
+                        std::size_t in1Idx = w + 7 * (h + 6 * c);
+
+                        // Gradient for input0: grad_output
+                        expectedGrad0[in0Idx] += gradOutputData[outIdx];
+                        // Gradient for input1: -grad_output
+                        expectedGrad1[in1Idx] += -gradOutputData[outIdx];
+                    }
+                }
+            }
+        }
+
+        // Perform backward pass
+        mySub->backward();
+
+        auto expectedGrad0Tensor = std::make_shared<Tensor>();
+        expectedGrad0Tensor->resize(T0->dims());
+        expectedGrad0Tensor->setBackend("cpu");
+        expectedGrad0Tensor->setDataType(DataType::Float32);
+        expectedGrad0Tensor->getImpl()->setRawPtr(expectedGrad0.data(), expectedGrad0.size());
+
+        auto expectedGrad1Tensor = std::make_shared<Tensor>();
+        expectedGrad1Tensor->resize(T1->dims());
+        expectedGrad1Tensor->setBackend("cpu");
+        expectedGrad1Tensor->setDataType(DataType::Float32);
+        expectedGrad1Tensor->getImpl()->setRawPtr(expectedGrad1.data(), expectedGrad1.size());
+
+        // Verify backward pass
+        REQUIRE(approxEq<float>(*T0->grad(), *expectedGrad0Tensor));
+        REQUIRE(approxEq<float>(*T1->grad(), *expectedGrad1Tensor));
+    }
+}
 } // namespace Aidge
-- 
GitLab