Skip to content
Snippets Groups Projects
Commit 869cc27f authored by Jerome Hue's avatar Jerome Hue
Browse files

Optimize backward kernels of Sub and Add

This optimization applies only when both input gradients have the same number of
dimensions, so no broadcasting is required.
We know this is the case in the Leaky node, for instance.
parent 382f6d47
No related branches found
No related tags found
No related merge requests found
......@@ -163,6 +163,15 @@ void AddImpl_cpu_backward_kernel(const std::size_t /*input0Length*/,
auto* gradInput0 = static_cast<I*>(gradientInput0_);
auto* gradInput1 = static_cast<I*>(gradientInput1_);
// simple elementwise gradient addition when no broadcasting is required
if (dims0 == dims1) {
for (std::size_t i = 0; i < gradOutputLength; ++i) {
gradInput0[i] += static_cast<I>(gradOutput[i]);
gradInput1[i] += static_cast<I>(gradOutput[i]);
}
return;
}
auto broadcastedDims0 = getBroadcastedDims(outputDims, dims0);
auto broadcastedDims1 = getBroadcastedDims(outputDims, dims1);
......
......@@ -165,6 +165,21 @@ void SubImpl_cpu_backward_kernel(const std::size_t /*input0Length*/,
auto* grad_input_0 = static_cast<I1*>(gradientInput0_);
auto* grad_input_1 = static_cast<I2*>(gradientInput1_);
// special case for equal dimensions, gradient can be computed directly
if (dims0 == dims1) {
const std::size_t contiguousSize = std::accumulate(
dims0.cbegin(), dims0.cend(),
static_cast<std::size_t>(1),
std::multiplies<std::size_t>()
);
for (std::size_t i = 0; i < contiguousSize; ++i) {
grad_input_0[i] += static_cast<I1>(grad_output[i]);
grad_input_1[i] += static_cast<I2>(-grad_output[i]);
}
return;
}
auto broadcastedDims0 = getBroadcastedDims(outputDims, dims0);
auto broadcastedDims1 = getBroadcastedDims(outputDims, dims1);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment