From 6faa813652c55dac15ae41457d30d90a9a0dcd22 Mon Sep 17 00:00:00 2001
From: NAUD Maxence <maxence.naud@cea.fr>
Date: Fri, 9 Feb 2024 09:53:07 +0000
Subject: [PATCH] [Add] small optimization: ReduceMean DIM == 1 direct path, const LeakyReLU slope

---
 .../LeakyReLUImpl_forward_kernels.hpp         |  2 +-
 .../ReduceMeanImpl_forward_kernels.hpp        | 96 +++++++++++--------
 2 files changed, 58 insertions(+), 40 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/LeakyReLUImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/LeakyReLUImpl_forward_kernels.hpp
index 761b9579..d10b32e1 100644
--- a/include/aidge/backend/cpu/operator/LeakyReLUImpl_forward_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/LeakyReLUImpl_forward_kernels.hpp
@@ -25,7 +25,7 @@ void LeakyReLUImpl_cpu_forward_kernel(const LeakyReLU_Op::Attrs& attrs,
 
     const I* input = static_cast<const I*>(input_);
     O* output = static_cast<O*>(output_);
-    I negativeSlope = static_cast<I>(std::get<0>(attrs));
+    const I negativeSlope = static_cast<const I>(std::get<0>(attrs));
 
     for (std::size_t i = 0; i < inputLenght; ++i) {
         output[i] = input[i] >= 0 ? input[i] : input[i] * negativeSlope;
diff --git a/include/aidge/backend/cpu/operator/ReduceMeanImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/ReduceMeanImpl_forward_kernels.hpp
index 71888aa5..bc9ada0f 100644
--- a/include/aidge/backend/cpu/operator/ReduceMeanImpl_forward_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ReduceMeanImpl_forward_kernels.hpp
@@ -33,56 +33,74 @@ void ReduceMeanImpl_cpu_forward_kernel(const typename ReduceMean_Op<DIM>::Attrs&
     O* output = static_cast<O*>(output_);
 
     const std::size_t nb_dims = inputDims.size();
-
     const std::size_t totalElements = std::accumulate(inputDims.cbegin(), inputDims.cend(), 1, std::multiplies<std::size_t>());
-    std::size_t outputElements = totalElements;
 
-    std::size_t *stride_post = new std::size_t[nb_dims];
-    stride_post[nb_dims - 1] = 1;
-    for (std::size_t i = nb_dims-2; i != static_cast<std::size_t>(-1); --i) {
-        stride_post[i] = stride_post[i+1]*inputDims[i+1];
-    }
-    std::size_t *stride_pre = new std::size_t[nb_dims];
-    stride_pre[0] = 1;
-    for (std::size_t i = 1; i < nb_dims; ++i) {
-        stride_pre[i] = stride_pre[i-1]*inputDims[i-1];
-    }
+    if (DIM == 1) {
+        const std::size_t stride_pre = std::accumulate(inputDims.cbegin(), inputDims.cbegin() + std::get<0>(attrs)[0], 1, std::multiplies<std::size_t>());
+        const std::size_t stride_post = std::accumulate(inputDims.crbegin(), inputDims.crbegin() + nb_dims -1 - std::get<0>(attrs)[0], 1, std::multiplies<std::size_t>());
 
-    const I* inputAccumulation = input;
-    I* outputAccumulation = nullptr;
-
-    for (const std::size_t& a : std::get<0>(attrs)) {
-        outputElements /= inputDims[a];
-        outputAccumulation = new I[outputElements];
-        const std::size_t dim_i = inputDims[a];
-        for (std::size_t pre = 0; pre < stride_pre[a]; ++pre) {
-            for (std::size_t post = 0; post < stride_post[a]; ++post) {
-                const std::size_t idx_i = pre * dim_i * stride_post[a] + post;
-                const std::size_t idx_o = pre * stride_post[a] + post;
-                outputAccumulation[idx_o] = inputAccumulation[idx_i];
+        const std::size_t dim_i = inputDims[std::get<0>(attrs)[0]];
+        for (std::size_t pre = 0; pre < stride_pre; ++pre) {
+            for (std::size_t post = 0; post < stride_post; ++post) {
+                const std::size_t idx_i = pre * dim_i * stride_post + post;
+                const std::size_t idx_o = pre * stride_post + post;
+                output[idx_o] = input[idx_i];
                 for (std::size_t i = 1; i < dim_i; ++i) {
-                    outputAccumulation[idx_o] += inputAccumulation[idx_i + i*stride_post[a]];
+                    output[idx_o] += input[idx_i + i*stride_post];
                 }
+                output[idx_o] /= dim_i;
             }
         }
-        std::for_each(stride_pre+a+1, stride_pre+nb_dims, [dim_i] (std::size_t& val) { val /= dim_i; });
-        if (inputAccumulation != input) {
-            delete[] inputAccumulation;
+    } else {
+        std::size_t outputElements = totalElements;
+
+        std::size_t *stride_post = new std::size_t[nb_dims];
+        stride_post[nb_dims - 1] = 1;
+        for (std::size_t i = nb_dims-2; i != static_cast<std::size_t>(-1); --i) {
+            stride_post[i] = stride_post[i+1]*inputDims[i+1];
+        }
+        std::size_t *stride_pre = new std::size_t[nb_dims];
+        stride_pre[0] = 1;
+        for (std::size_t i = 1; i < nb_dims; ++i) {
+            stride_pre[i] = stride_pre[i-1]*inputDims[i-1];
         }
-        inputAccumulation = outputAccumulation;
-    }
 
-    // Copy elements from inputAccumulation to output while dividing by divisor
-    I divisor = totalElements / outputElements;
-    std::transform(inputAccumulation, inputAccumulation + outputElements, output,
-                   [divisor](int element) { return element / divisor; });
-    if (outputAccumulation) {
-        delete[] outputAccumulation;
-    }
-    delete[] stride_post;
-    delete[] stride_pre;
+        const I* inputAccumulation = input;
+        I* outputAccumulation = nullptr;
 
+        for (const std::size_t& a : std::get<0>(attrs)) {
+            outputElements /= inputDims[a];
+            outputAccumulation = new I[outputElements];
+            const std::size_t dim_i = inputDims[a];
+            for (std::size_t pre = 0; pre < stride_pre[a]; ++pre) {
+                for (std::size_t post = 0; post < stride_post[a]; ++post) {
+                    const std::size_t idx_i = pre * dim_i * stride_post[a] + post;
+                    const std::size_t idx_o = pre * stride_post[a] + post;
+                    outputAccumulation[idx_o] = inputAccumulation[idx_i];
+                    for (std::size_t i = 1; i < dim_i; ++i) {
+                        outputAccumulation[idx_o] += inputAccumulation[idx_i + i*stride_post[a]];
+                    }
+                }
+            }
+            std::for_each(stride_pre+a+1, stride_pre+nb_dims, [dim_i] (std::size_t& val) { val /= dim_i; });
+            if (inputAccumulation != input) {
+                delete[] inputAccumulation;
+            }
+            inputAccumulation = outputAccumulation;
+        }
+
+        // Copy elements from inputAccumulation to output while dividing by divisor
+        I divisor = totalElements / outputElements;
+        std::transform(inputAccumulation, inputAccumulation + outputElements, output,
+                    [divisor](int element) { return element / divisor; });
+        if (outputAccumulation) {
+            delete[] outputAccumulation;
+        }
+        delete[] stride_post;
+        delete[] stride_pre;
+    }
 }
+
 namespace {
 // DIM = 1
 static Registrar<ReduceMeanImpl1DForward_cpu> registrarReduceMeanImplForward_1D_cpu_Float32(
-- 
GitLab