diff --git a/include/aidge/backend/cpu/operator/AddImpl.hpp b/include/aidge/backend/cpu/operator/AddImpl.hpp
index ca04dff9164ecc8492d9263b32b60272dbbad395..cfb85ecfa6a4c65d89079dc23944d6d85d99a785 100644
--- a/include/aidge/backend/cpu/operator/AddImpl.hpp
+++ b/include/aidge/backend/cpu/operator/AddImpl.hpp
@@ -33,8 +33,6 @@ using AddImpl_cpu = OperatorImpl_cpu<Add_Op,
          const std::vector<std::size_t>&, 
          const std::vector<std::size_t>&, 
          const void*, 
-         const void*, 
-         const void*, 
          void*, 
          void*)
 >;
diff --git a/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp
index d6fff9b58d381895350998e11bae01684718ad3e..4be47849db2fd5ee4e21d59a4f1199f13f60b3a9 100644
--- a/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/AddImpl_kernels.hpp
@@ -154,15 +154,10 @@ void AddImpl_cpu_backward_kernel(const std::size_t input0Length,
                                const std::vector<std::size_t>& dims0,
                                const std::vector<std::size_t>& dims1,
                                const std::vector<std::size_t>& outputDims,
-                               const void* input0_,
-                               const void* input1_,
                                const void* grad_output_,
                                void* gradientInput0_,
                                void* gradientInput1_)
 {
-    // TODO: Remove input0/1 from the function
-    const I* input0 = static_cast<const I*>(input0_);
-    const I* input1 = static_cast<const I*>(input1_);
     const O* gradOutput = static_cast<const O*>(grad_output_);
     auto* gradInput0 = static_cast<I*>(gradientInput0_);
     auto* gradInput1 = static_cast<I*>(gradientInput1_);
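
Note on the hunks above: the forward inputs can be dropped from the Add backward kernel because addition has constant partial derivatives, d(a+b)/da = d(a+b)/db = 1, so each input gradient is just the output gradient reduced over any broadcast dimensions. A minimal sketch of the non-broadcast case, using a hypothetical addBackwardSameShape() that is not part of Aidge:

    // Sketch only: same-shape Add backward. Both input gradients are a
    // plain copy of the output gradient, since d(a + b)/da = d(a + b)/db = 1.
    #include <algorithm>
    #include <cstddef>

    template <class T>
    void addBackwardSameShape(const T* gradOutput, std::size_t n,
                              T* gradInput0, T* gradInput1) {
        std::copy(gradOutput, gradOutput + n, gradInput0);
        std::copy(gradOutput, gradOutput + n, gradInput1);
    }
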
diff --git a/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp
index ed838a94cc0c0238a870427c3b774b29f7818b09..d5e5561d02aacd8532f74d2bfd4ee2fb5a5b5dc3 100644
--- a/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp
@@ -25,6 +25,41 @@
 
 
 namespace Aidge {
+
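+// Single-pass, numerically stable running mean: m += (x[i] - m) / (i + 1)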
+template <typename T>
+typename std::enable_if<std::is_floating_point<T>::value, T>::type
+stableMean(const T* vec, size_t size) {
+  T mean = 0;
+  for (size_t i = 0; i < size; ++i) {
+    mean = std::fma<T>(vec[i] - mean, 1.0 / (i + 1), mean);
+  }
+  return mean;
+}
+
+// Specialization for integers: perform the mean computation in double
+template <typename T>
+typename std::enable_if<!std::is_floating_point<T>::value, double>::type
+stableMean(const T* vec, size_t size) {
+  double mean = 0;
+  for (size_t i = 0; i < size; ++i) {
+    mean = std::fma<double>(vec[i] - mean, 1.0 / (i + 1), mean);
+  }
+  return mean;
+}
+
+template <typename T>
+typename std::enable_if<std::is_floating_point<T>::value, T>::type
+castFromFloat(T value) {
+  return value;
+}
+
+template <typename T>
+typename std::enable_if<!std::is_floating_point<T>::value, T>::type
+castFromFloat(double value) {
+  return static_cast<T>(std::nearbyint(value));
+}
+
 template <class I, class O>
 void GlobalAveragePoolingImpl_cpu_forward_kernel(
     const std::vector<DimSize_t> &dims, const void *input_, void *output_) {
@@ -49,12 +84,7 @@ void GlobalAveragePoolingImpl_cpu_forward_kernel(
     for (DimSize_t channel = 0; channel < dims[1]; ++channel) {
       const I *filter_start = std::next(
           input, (batch * in_batch_nb_elems) + (channel * in_channel_nb_elems));
-      I mean = 0;
-      for (size_t i = 0; i < in_channel_nb_elems; ++i) {
-        // Single pass numerically stable mean, using the fmaf
-        mean = fmaf(filter_start[i] - mean, 1.0f/(i+1), mean);
-      }
-      output[batch * out_batch_nb_elems + channel] = mean;
+      output[batch * out_batch_nb_elems + channel] = castFromFloat<O>(stableMean<I>(filter_start, in_channel_nb_elems));
     }
   }
 }
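
Note: stableMean() is a single-pass running mean that updates its estimate as m += (x[i] - m) / (i + 1) instead of summing first and dividing at the end, which keeps intermediate values near the data's magnitude. A self-contained sketch of the recurrence, shown without std::fma:

    // Running-mean recurrence used by stableMean(), minus the fused
    // multiply-add: m starts at 0 and moves toward each sample by 1/(i+1).
    #include <cstdio>

    int main() {
        const float x[] = {1.0f, 2.0f, 3.0f, 4.0f};
        float mean = 0.0f;
        for (int i = 0; i < 4; ++i) {
            mean += (x[i] - mean) / static_cast<float>(i + 1);
        }
        std::printf("%f\n", mean);  // prints 2.500000 == (1+2+3+4)/4
        return 0;
    }
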
diff --git a/include/aidge/backend/cpu/operator/ReduceMeanImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ReduceMeanImpl_kernels.hpp
index 5a143164d7e4fa2585ea72c38eaaa123f215d21a..864b89c4fa4667b70e43ed7436382e30bc150745 100644
--- a/include/aidge/backend/cpu/operator/ReduceMeanImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ReduceMeanImpl_kernels.hpp
@@ -25,6 +25,41 @@
 #include "aidge/utils/Registrar.hpp"
 
 namespace Aidge {
+
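+// Single-pass, numerically stable running mean over a strided slice: m += (x[i*stride] - m) / (i + 1)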
+template <typename T>
+typename std::enable_if<std::is_floating_point<T>::value, T>::type
+stableMean(const T* vec, size_t len, size_t stride) {
+  T mean = 0;
+  for (size_t i = 0; i < len; ++i) {
+    mean = std::fma<T>(vec[i * stride] - mean, 1.0 / (i + 1), mean);
+  }
+  return mean;
+}
+
+// Specialization for integers: perform the mean computation in double
+template <typename T>
+typename std::enable_if<!std::is_floating_point<T>::value, double>::type
+stableMean(const T* vec, size_t len, size_t stride) {
+  double mean = 0;
+  for (size_t i = 0; i < len; ++i) {
+    mean = std::fma<double>(vec[i * stride] - mean, 1.0 / (i + 1), mean);
+  }
+  return mean;
+}
+
+template <typename T>
+typename std::enable_if<std::is_floating_point<T>::value, T>::type
+castFromFloat(T value) {
+  return value;
+}
+
+template <typename T>
+typename std::enable_if<!std::is_floating_point<T>::value, T>::type
+castFromFloat(double value) {
+  return static_cast<T>(std::nearbyint(value));
+}
+
 template <class I, class O>
 void ReduceMeanImpl_cpu_forward_kernel(const std::vector<std::int32_t>& axes,
                                     DimSize_t /*keepDims*/,
@@ -50,12 +85,7 @@ void ReduceMeanImpl_cpu_forward_kernel(const std::vector<std::int32_t>& axes,
             for (std::size_t post = 0; post < stride_post; ++post) {
                 const std::size_t idx_i = pre * dim_i * stride_post + post;
                 const std::size_t idx_o = pre * stride_post + post;
-                O mean = 0;
-                for (std::size_t i = 0; i < dim_i; ++i) {
-                    // Single pass numerically stable mean, using the fmaf
-                    mean = fmaf(input[idx_i + i*stride_post] - mean, 1.0f/(i+1), mean);
-                }
-                output[idx_o]  = mean;
+                output[idx_o] = castFromFloat<O>(stableMean(input + idx_i, dim_i, stride_post));
             }
         }
     } else {
@@ -72,8 +102,9 @@ void ReduceMeanImpl_cpu_forward_kernel(const std::vector<std::int32_t>& axes,
             stride_pre[i] = stride_pre[i-1]*inputDims[i-1];
         }
 
-        const I* inputAccumulation = input;
-        I* outputAccumulation = nullptr;
+        // The accumulation type is the return type of stableMean<I>(), which is always floating point
+        const decltype(stableMean<I>(input, 0, 0))* inputAccumulation = nullptr;
+        decltype(stableMean<I>(input, 0, 0))* outputAccumulation = nullptr;
 
         for (const auto& axisInt : axes) {
             const std::size_t a = static_cast<std::size_t>(axisInt);
@@ -84,23 +115,24 @@ void ReduceMeanImpl_cpu_forward_kernel(const std::vector<std::int32_t>& axes,
                 for (std::size_t post = 0; post < stride_post[a]; ++post) {
                     const std::size_t idx_i = pre * dim_i * stride_post[a] + post;
                     const std::size_t idx_o = pre * stride_post[a] + post;
-                    I mean = 0;
-                    for (std::size_t i = 0; i < dim_i; ++i) {
-                        // Single pass numerically stable mean, using the fmaf
-                        mean = fmaf(inputAccumulation[idx_i + i*stride_post[a]] - mean, 1.0f/(i+1), mean);
+                    if (inputAccumulation == nullptr) {
+                        outputAccumulation[idx_o] = stableMean<I>(input + idx_i, dim_i, stride_post[a]);
+                    }
+                    else {
+                        outputAccumulation[idx_o] = stableMean(inputAccumulation + idx_i, dim_i, stride_post[a]);
                     }
-                    outputAccumulation[idx_o] = mean;
                 }
             }
             std::for_each(stride_pre.get()+a+1, stride_pre.get()+nb_dims, [dim_i] (std::size_t& val) { val /= dim_i; });
-            if (inputAccumulation != input) {
+            if (inputAccumulation != nullptr) {
                 delete[] inputAccumulation;
             }
             inputAccumulation = outputAccumulation;
         }
 
-        // Copy elements from inputAccumulation to output while dividing by divisor
-        std::copy(inputAccumulation, inputAccumulation + outputElements, output);
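+        // Cast the accumulated means back to the output data type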
+        std::transform(inputAccumulation, inputAccumulation + outputElements, output,
+            [](auto value) { return castFromFloat<O>(value); });
         if (outputAccumulation) {
             delete[] outputAccumulation;
         }
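
Note: with several axes, the kernel reduces one axis at a time, swapping the inputAccumulation and outputAccumulation buffers between passes. Chaining means this way is exact because every slice along a reduced axis has the same length, so a mean of means equals the overall mean. A small standalone sketch of that property for a 2x3 input reduced over both axes:

    // Axis-by-axis reduction: mean over axis 1, then axis 0, equals the
    // mean over all six elements because the group sizes are uniform.
    #include <cstdio>

    int main() {
        const double in[2][3] = {{1, 2, 3}, {4, 5, 6}};
        double rowMeans[2];
        for (int r = 0; r < 2; ++r) {
            rowMeans[r] = (in[r][0] + in[r][1] + in[r][2]) / 3.0;
        }
        const double total = (rowMeans[0] + rowMeans[1]) / 2.0;
        std::printf("%f\n", total);  // prints 3.500000 == 21/6
        return 0;
    }
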
diff --git a/include/aidge/backend/cpu/operator/SubImpl.hpp b/include/aidge/backend/cpu/operator/SubImpl.hpp
index eed26ddcc9f57b3bb7796049a62f3f6be7de4eb5..1f94ff139c319916fed68120317c5f1931619495 100644
--- a/include/aidge/backend/cpu/operator/SubImpl.hpp
+++ b/include/aidge/backend/cpu/operator/SubImpl.hpp
@@ -15,15 +15,23 @@
 #include "aidge/backend/cpu/operator/OperatorImpl.hpp"
 #include "aidge/operator/Sub.hpp"
 #include "aidge/utils/Registrar.hpp"
-#include "aidge/utils/Types.h"
-#include "aidge/backend/cpu/data/GetCPUPtr.h"
-#include <memory>
+
 #include <vector>
 
 namespace Aidge {
 // Operator implementation entry point for the backend
 using SubImpl_cpu = OperatorImpl_cpu<Sub_Op,
-    void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*,void*)>;
+    void(std::vector<std::size_t>, std::vector<std::size_t>, const std::vector<std::size_t>&, const void*, const void*, void*),
+    void(const std::size_t,
+         const std::size_t,
+         const std::size_t,
+         const std::vector<std::size_t>&,
+         const std::vector<std::size_t>&,
+         const std::vector<std::size_t>&,
+         const void*,
+         void*,
+         void*)
+>;
 
 // Implementation entry point registration to Operator
 REGISTRAR(Sub_Op, "cpu", Aidge::SubImpl_cpu::create);
diff --git a/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp b/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp
index 1d789c3c8886d35ce6597d5704c76060bad196c1..8d3d80e93e2e2f95d00026b9f3d1a1ff48b8d17a 100644
--- a/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/SubImpl_kernels.hpp
@@ -42,6 +42,7 @@ void sub_contiguous_arrays(const std::size_t input1size,
 
 
 namespace Aidge {
+
 template <class I1, class I2, class O>
 void SubImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
                                 std::vector<std::size_t> dims1,
@@ -149,10 +150,55 @@ void SubImpl_cpu_forward_kernel(std::vector<std::size_t> dims0,
     }
 }
 
+template <class I1, class I2, class O>
+void SubImpl_cpu_backward_kernel(const std::size_t input0Length,
+                               const std::size_t input1Length,
+                               const std::size_t gradOutputLength,
+                               const std::vector<std::size_t>& dims0,
+                               const std::vector<std::size_t>& dims1,
+                               const std::vector<std::size_t>& outputDims,
+                               const void* grad_output_,
+                               void* gradientInput0_,
+                               void* gradientInput1_)
+{
+    const O* grad_output = static_cast<const O*>(grad_output_);
+    auto* grad_input_0 = static_cast<I1*>(gradientInput0_);
+    auto* grad_input_1 = static_cast<I2*>(gradientInput1_);
+
+    std::fill_n(grad_input_0, input0Length, static_cast<I1>(0));
+    std::fill_n(grad_input_1, input1Length, static_cast<I2>(0));
+
+    auto broadcastedDims0 = getBroadcastedDims(outputDims, dims0);
+    auto broadcastedDims1 = getBroadcastedDims(outputDims, dims1);
+
+    for (std::size_t i = 0; i < gradOutputLength; ++i) {
+        auto idxOutputGrad = getMultiDimIndices(outputDims, i);
+        std::vector<std::size_t> idxInput0(broadcastedDims0.size());
+        std::vector<std::size_t> idxInput1(broadcastedDims1.size());
+
+        for (std::size_t dimension = 0; dimension < broadcastedDims0.size(); ++dimension) {
+            idxInput0[dimension] = (broadcastedDims0[dimension] == 1) ? 0 : idxOutputGrad[dimension];
+        }
+
+        for (std::size_t dimension = 0; dimension < broadcastedDims1.size(); ++dimension) {
+            idxInput1[dimension] = (broadcastedDims1[dimension] == 1) ? 0 : idxOutputGrad[dimension];
+        }
+
+        auto idx0 = getFlattenedIndex(broadcastedDims0, idxInput0);
+        auto idx1 = getFlattenedIndex(broadcastedDims1, idxInput1);
+
+        // For subtraction: gradient of first input is 1 * grad_output
+        grad_input_0[idx0] += static_cast<I1>(grad_output[i]);
+        // For subtraction: gradient of second input is -1 * grad_output
+        grad_input_1[idx1] += static_cast<I2>(-grad_output[i]);
+    }
+}
+
+
 // Kernels registration to implementation entry point
 REGISTRAR(SubImpl_cpu,
     {DataType::Float32},
-    {ProdConso::inPlaceModel, Aidge::SubImpl_cpu_forward_kernel<float, float, float>, nullptr});
+    {ProdConso::inPlaceModel, Aidge::SubImpl_cpu_forward_kernel<float, float, float>, Aidge::SubImpl_cpu_backward_kernel<float,float,float>});
 REGISTRAR(SubImpl_cpu,
     {DataType::Float64},
     {ProdConso::inPlaceModel, Aidge::SubImpl_cpu_forward_kernel<double, double, double>, nullptr});
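
Note: for y = a - b, the backward rule is grad_a = +grad_y and grad_b = -grad_y, each summed over the dimensions where the corresponding input was broadcast. A minimal sketch, not the Aidge kernel, with b of shape [cols] broadcast across the rows of a; it reproduces the {-2, -2, -2} gradient expected by Case 1 of the tests below:

    // Sketch only: Sub backward with a broadcast second operand.
    #include <cstdio>

    int main() {
        const int rows = 2, cols = 3;
        const float gradY[2][3] = {{1, 1, 1}, {1, 1, 1}};  // dL/dy, all ones
        float gradB[3] = {0, 0, 0};  // b has shape [cols], broadcast over rows
        for (int r = 0; r < rows; ++r) {
            for (int c = 0; c < cols; ++c) {
                gradB[c] += -gradY[r][c];  // d(a - b)/db = -1, summed over rows
            }
        }
        std::printf("%g %g %g\n", gradB[0], gradB[1], gradB[2]);  // -2 -2 -2
        return 0;
    }
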
diff --git a/src/operator/AddImpl.cpp b/src/operator/AddImpl.cpp
index b027fb876c8e597c85d13fea0f1fb6dd1207a1d9..cff6128741db657136aca1006c0f273ce64aa87a 100644
--- a/src/operator/AddImpl.cpp
+++ b/src/operator/AddImpl.cpp
@@ -73,8 +73,6 @@ void Aidge::AddImpl_cpu::backward() {
                in0->dims(),
                in1->dims(),
                out0grad->dims(),
-               getCPUPtr(in0),
-               getCPUPtr(in1),
                getCPUPtr(out0grad),
                getCPUPtr(in0grad),
                getCPUPtr(in1grad));
diff --git a/src/operator/SubImpl.cpp b/src/operator/SubImpl.cpp
index e36abe2a9d68a2b56ab1777aa04b0e911df514c8..7f57bf2f1e16bbe6a6dd510f39463b611d925220 100644
--- a/src/operator/SubImpl.cpp
+++ b/src/operator/SubImpl.cpp
@@ -41,5 +41,27 @@ void Aidge::SubImpl_cpu::forward() {
 
 template <>
 void Aidge::SubImpl_cpu::backward() {
-    AIDGE_THROW_OR_ABORT(std::runtime_error, "Backward not yet implemented for Sub_Op on backend cpu");
+
+    const Sub_Op& op_ = dynamic_cast<const Sub_Op&>(mOp);
+
+    auto in0 = op_.getInput(0);
+    auto in1 = op_.getInput(1);
+    auto in0grad = in0->grad();
+    auto in1grad = in1->grad();
+    auto out0grad = op_.getOutput(0)->grad();
+
+    // Find the correct kernel type
+    const auto impl = Registrar<SubImpl_cpu>::create(getBestMatch(getRequiredSpec()));
+
+    // Call kernel
+    impl.backward(/* input0Length */ in0grad->size(),
+                  /* input1Length */ in1grad->size(),
+                  /* gradOutputLength */ out0grad->size(),
+                  /* input0Dims   */ in0->dims(),
+                  /* input1Dims   */ in1->dims(),
+                  /* outputDims   */ out0grad->dims(),
+                  /* gradOutput   */ getCPUPtr(out0grad),
+                  /* gradInput0   */ getCPUPtr(in0grad),
+                  /* gradInput1   */ getCPUPtr(in1grad));
+
 }
diff --git a/unit_tests/operator/Test_MetaOperator.cpp b/unit_tests/operator/Test_MetaOperator.cpp
index bb9027d3ec0d8eb05dbd555f69bd6e8f5147246b..de720f5bc7e3c18bcd725c676d458f30344d3b1a 100644
--- a/unit_tests/operator/Test_MetaOperator.cpp
+++ b/unit_tests/operator/Test_MetaOperator.cpp
@@ -706,7 +706,7 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
         auto fc2 = FC(outChannels, inChannels, true, "fc2");
         // NOTE: Account for init step by adding 1 to the max timestep
         // parameter.
-        auto lif1 = Leaky(nbTimeSteps + 1, beta, threshold, "leaky");
+        auto lif1 = Leaky(nbTimeSteps + 1, beta, threshold, LeakyReset::Subtraction, "leaky");
 
         // associateInput() does not work
         fc1->input(1).first->getOperator()->setOutput(0, myWeights);
@@ -775,7 +775,7 @@ TEST_CASE("[cpu/operator] MetaOperator", "[MetaOperator][CPU]") {
         const auto nbTimeSteps = dims[0];
         const auto beta = betaDist(gen);
 
-        auto myLeaky = Leaky(nbTimeSteps, beta, 1.0, "leaky");
+        auto myLeaky = Leaky(nbTimeSteps, beta, 1.0, LeakyReset::Subtraction, "leaky");
         auto op =
             std::static_pointer_cast<MetaOperator_Op>(myLeaky->getOperator());
         // auto stack = Stack(2);
diff --git a/unit_tests/operator/Test_SubImpl.cpp b/unit_tests/operator/Test_SubImpl.cpp
index 1317e88a371e9a6e7a3deae5b7f662a9cd879a60..d9b6207b2c47f1004097509c853e69a8160fc22a 100644
--- a/unit_tests/operator/Test_SubImpl.cpp
+++ b/unit_tests/operator/Test_SubImpl.cpp
@@ -322,4 +322,168 @@ TEST_CASE("[cpu/operator] Sub", "[Sub][CPU]") {
         }
     }
 }
+
+
+TEST_CASE("[CPU/Operator] Sub(Backward)", "[Sub][CPU][Backward]") {
+    std::shared_ptr<Node> mySub = Sub();
+    auto op = std::static_pointer_cast<OperatorTensor>(mySub->getOperator());
+    op->setDataType(DataType::Float32);
+    op->setBackend("cpu");
+
+    SECTION("Case 1: 1D and 2D Tensors") {
+        const auto T0 = std::make_shared<Tensor>(
+            Array2D<float, 2, 3>({{{1, 2, 3}, {4, 5, 6}}}));
+
+        const auto T1 =
+            std::make_shared<Tensor>(Array1D<float, 3>({0.1, 0.2, 0.3}));
+
+        T0->setDataType(DataType::Float32);
+        T0->setBackend("cpu");
+        T1->setDataType(DataType::Float32);
+        T1->setBackend("cpu");
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(std::make_shared<Tensor>(
+            Array2D<float, 2, 3>({{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}})));
+        op->forwardDims();
+
+        mySub->backward();
+
+        // For subtraction: grad_input0 = grad_output
+        const auto expectedGrad0 = std::make_shared<Tensor>(
+            Array2D<float, 2, 3>({{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}}));
+
+        // For subtraction: grad_input1 = -grad_output (summed across broadcast dimensions)
+        const auto expectedGrad1 =
+            std::make_shared<Tensor>(Array1D<float, 3>({-2, -2, -2}));
+
+        REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
+        REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1));
+    }
+
+    SECTION("Case 2: 3D and 1D tensors") {
+        const auto T0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+            {{{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}},
+              {{7.0, 8.0, 9.0}, {10.0, 11.0, 12.0}}}}));
+
+        const auto T1 =
+            std::make_shared<Tensor>(Array1D<float, 3>({0.3, 0.2, 0.1}));
+
+        const auto newGrad = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+            {{{{1, 1, 1}, {1, 1, 1}}, {{1, 1, 1}, {1, 1, 1}}}}));
+
+        const auto expectedGrad0 = std::make_shared<Tensor>(Array3D<float, 2, 2, 3>(
+            {{{{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}},
+              {{1.0, 1.0, 1.0}, {1.0, 1.0, 1.0}}}}));
+
+        const auto expectedGrad1 =
+            std::make_shared<Tensor>(Array1D<float, 3>({-4.0, -4.0, -4.0}));
+
+        for (auto T : {T0, T1, newGrad, expectedGrad0, expectedGrad1}) {
+            T->setBackend("cpu");
+            T->setDataType(DataType::Float32);
+        }
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+        op->getOutput(0)->setGrad(newGrad);
+        op->forwardDims();
+
+        mySub->backward();
+
+        REQUIRE(approxEq<float>(*(op->getInput(0)->grad()), *expectedGrad0));
+        REQUIRE(approxEq<float>(*(op->getInput(1)->grad()), *expectedGrad1));
+    }
+
+    SECTION("Case 3: Random values with broadcasting") {
+        // Use random values
+        std::vector<std::size_t> dims0 = {5, 2, 1, 7}; // First tensor
+        std::vector<std::size_t> dims1 = {2, 6, 7};    // Second tensor
+        std::vector<std::size_t> outputDims = {5, 2, 6, 7};
+
+        const auto input0Size = 5 * 2 * 1 * 7;
+        const auto input1Size = 2 * 6 * 7;
+        const auto outputSize = 5 * 2 * 6 * 7;
+
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_real_distribution<float> dist(0.1f, 1.0f);
+
+        std::vector<float> input0Data(input0Size);
+        std::vector<float> input1Data(input1Size);
+        std::vector<float> gradOutputData(outputSize);
+
+        // Fill with random values
+        for (auto &val : input0Data) val = dist(gen);
+        for (auto &val : input1Data) val = dist(gen);
+        for (auto &val : gradOutputData) val = dist(gen);
+
+        auto T0 = std::make_shared<Tensor>();
+        auto T1 = std::make_shared<Tensor>();
+
+        T0->setDataType(DataType::Float32);
+        T0->setBackend("cpu");
+        T0->resize(dims0);
+        T0->getImpl()->setRawPtr(input0Data.data(), input0Size);
+
+        T1->setDataType(DataType::Float32);
+        T1->setBackend("cpu");
+        T1->resize(dims1);
+        T1->getImpl()->setRawPtr(input1Data.data(), input1Size);
+
+        op->associateInput(0, T0);
+        op->associateInput(1, T1);
+
+        // Set gradient of output
+        auto outputGrad = std::make_shared<Tensor>();
+        outputGrad->setDataType(DataType::Float32);
+        outputGrad->setBackend("cpu");
+        outputGrad->resize(outputDims);
+        outputGrad->getImpl()->setRawPtr(gradOutputData.data(), outputSize);
+        op->getOutput(0)->setGrad(outputGrad);
+
+        op->forwardDims();
+
+        // Compute reference gradients
+        std::vector<float> expectedGrad0(input0Size, 0.0f);
+        std::vector<float> expectedGrad1(input1Size, 0.0f);
+
+        for (std::size_t n = 0; n < 5; ++n) {
+            for (std::size_t c = 0; c < 2; ++c) {
+                for (std::size_t h = 0; h < 6; ++h) {
+                    for (std::size_t w = 0; w < 7; ++w) {
+                        std::size_t outIdx = w + 7 * (h + 6 * (c + 2 * n));
+                        std::size_t in0Idx = w + 7 * (0 + 1 * (c + 2 * n));
+                        std::size_t in1Idx = w + 7 * (h + 6 * c);
+
+                        // Gradient for input0: grad_output
+                        expectedGrad0[in0Idx] += gradOutputData[outIdx];
+                        // Gradient for input1: -grad_output
+                        expectedGrad1[in1Idx] += -gradOutputData[outIdx];
+                    }
+                }
+            }
+        }
+
+        // Perform backward pass
+        mySub->backward();
+
+        auto expectedGrad0Tensor = std::make_shared<Tensor>();
+        expectedGrad0Tensor->resize(T0->dims());
+        expectedGrad0Tensor->setBackend("cpu");
+        expectedGrad0Tensor->setDataType(DataType::Float32);
+        expectedGrad0Tensor->getImpl()->setRawPtr(expectedGrad0.data(), expectedGrad0.size());
+
+        auto expectedGrad1Tensor = std::make_shared<Tensor>();
+        expectedGrad1Tensor->resize(T1->dims());
+        expectedGrad1Tensor->setBackend("cpu");
+        expectedGrad1Tensor->setDataType(DataType::Float32);
+        expectedGrad1Tensor->getImpl()->setRawPtr(expectedGrad1.data(), expectedGrad1.size());
+
+        // Verify backward pass
+        REQUIRE(approxEq<float>(*T0->grad(), *expectedGrad0Tensor));
+        REQUIRE(approxEq<float>(*T1->grad(), *expectedGrad1Tensor));
+    }
+}
 } // namespace Aidge