From 0c6edb5221964379bc5cc023a6c8651cf772ef26 Mon Sep 17 00:00:00 2001
From: NAUD Maxence <maxence.naud@cea.fr>
Date: Thu, 8 Feb 2024 17:41:42 +0000
Subject: [PATCH] [Fix] ReduceMean operator forward kernel with refactor

---
 .../ReduceMeanImpl_forward_kernels.hpp        |  87 ++++++++-------
 unit_tests/operator/Test_ReduceMeanImpl.cpp   | 100 +++++++++++++-----
 2 files changed, 114 insertions(+), 73 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/ReduceMeanImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/ReduceMeanImpl_forward_kernels.hpp
index d9202615..71888aa5 100644
--- a/include/aidge/backend/cpu/operator/ReduceMeanImpl_forward_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ReduceMeanImpl_forward_kernels.hpp
@@ -13,7 +13,7 @@
 #define AIDGE_CPU_OPERATOR_REDUCEMEANIMPL_FORWARD_KERNEL_H_
 
 #include <cstddef>
-#include <algorithm>
+#include <algorithm>   // std::copy, std::for_each
 #include <numeric>     //std::accumulate
 #include <functional>  //std::multiplies
 
@@ -32,57 +32,56 @@ void ReduceMeanImpl_cpu_forward_kernel(const typename ReduceMean_Op<DIM>::Attrs&
     const I* input = static_cast<const I*>(input_);
     O* output = static_cast<O*>(output_);
 
-    const DimSize_t keepDims = std::get<1>(attrs);
-    // Calculate the total number of elements in the input array
-    const std::size_t totalElements = std::accumulate(inputDims.cbegin(), inputDims.cend(), 1, std::multiplies<std::size_t>());
-
-
-    // Create a temporary arrays to store intermediate input/output for each Reduce op
-    std::vector<I> tempInArray(input, input + totalElements);
-    std::vector<I> tempOutArray(input, input + totalElements);
-    std::vector<std::size_t> currentDims = inputDims;
+    const std::size_t nb_dims = inputDims.size();
 
-    std::size_t addedElems = 0;
-    for(std::size_t i = 0; i < DIM ; ++i)
-    {
-		addedElems = 0;
-		const std::size_t axis = static_cast<std::size_t>(std::get<0>(attrs)[i]);
-
-		I* tempOutArrayPtr = tempOutArray.data();
+    const std::size_t totalElements = std::accumulate(inputDims.cbegin(), inputDims.cend(), 1, std::multiplies<std::size_t>());
+    std::size_t outputElements = totalElements;
 
-		std::size_t postAxisElems = 1;
-		for (std::size_t d = axis + 1; d < inputDims.size(); ++d) {
-			postAxisElems *= inputDims[d];
-		}
-		std::size_t preAxisElems = 1;
-		for (std::size_t d = 0; d < axis; ++d) {
-			preAxisElems *= inputDims[d];
-		}
+    std::size_t *stride_post = new std::size_t[nb_dims];
+    stride_post[nb_dims - 1] = 1;
+    for (std::size_t i = nb_dims-2; i != static_cast<std::size_t>(-1); --i) {
+        stride_post[i] = stride_post[i+1]*inputDims[i+1];
+    }
+    std::size_t *stride_pre = new std::size_t[nb_dims];
+    stride_pre[0] = 1;
+    for (std::size_t i = 1; i < nb_dims; ++i) {
+        stride_pre[i] = stride_pre[i-1]*inputDims[i-1];
+    }
 
-        for (std::size_t j=0; j<preAxisElems; ++j)
-        {
-            for (std::size_t k=0; k<postAxisElems; ++k)
-            {
-				// Compute the mean value for the element k of each stride
-                I mean = 0;
-                for(std::size_t l=0; l<currentDims[axis];l++)
-                {
-					std::size_t idx = j * (postAxisElems * currentDims[axis]) + l * postAxisElems + k;
-					mean += tempInArray[idx];
+    const I* inputAccumulation = input;
+    I* outputAccumulation = nullptr;
+
+    for (const std::size_t& a : std::get<0>(attrs)) {
+        outputElements /= inputDims[a];
+        outputAccumulation = new I[outputElements];
+        const std::size_t dim_i = inputDims[a];
+        for (std::size_t pre = 0; pre < stride_pre[a]; ++pre) {
+            for (std::size_t post = 0; post < stride_post[a]; ++post) {
+                const std::size_t idx_i = pre * dim_i * stride_post[a] + post;
+                const std::size_t idx_o = pre * stride_post[a] + post;
+                outputAccumulation[idx_o] = inputAccumulation[idx_i];
+                for (std::size_t i = 1; i < dim_i; ++i) {
+                    outputAccumulation[idx_o] += inputAccumulation[idx_i + i*stride_post[a]];
                 }
-                tempOutArrayPtr[addedElems] = mean / currentDims[axis];
-                addedElems++;
             }
         }
+        std::for_each(stride_pre+a+1, stride_pre+nb_dims, [dim_i] (std::size_t& val) { val /= dim_i; });
+        if (inputAccumulation != input) {
+            delete[] inputAccumulation;
+        }
+        inputAccumulation = outputAccumulation;
+    }
 
-        // Update the input for the next reduce operation
-        tempInArray.assign(tempOutArray.begin(), tempOutArray.begin() + addedElems);
-        if(keepDims)
-			currentDims[axis] = 1;
-        else if (currentDims.size()>1)
-			currentDims.erase(currentDims.begin()+axis);
+    // Copy elements from inputAccumulation to output while dividing by divisor
+    I divisor = totalElements / outputElements;
+    std::transform(inputAccumulation, inputAccumulation + outputElements, output,
+                   [divisor](int element) { return element / divisor; });
+    if (outputAccumulation) {
+        delete[] outputAccumulation;
     }
-	std::copy_n(tempInArray.cbegin(), addedElems, output);
+    delete[] stride_post;
+    delete[] stride_pre;
+
 }
 namespace {
 // DIM = 1
diff --git a/unit_tests/operator/Test_ReduceMeanImpl.cpp b/unit_tests/operator/Test_ReduceMeanImpl.cpp
index 2cde38d6..494b7a6a 100644
--- a/unit_tests/operator/Test_ReduceMeanImpl.cpp
+++ b/unit_tests/operator/Test_ReduceMeanImpl.cpp
@@ -22,41 +22,83 @@ using namespace Aidge;
 
 TEST_CASE("[cpu/operator] ReduceMean(forward)", "[ReduceMean][CPU]") {
     SECTION("KeepDims") {
-        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> {
-            {
-                {
-                    { 5.0, 1.0 },
-                    { 20.0, 2.0 }
-                },
+        SECTION("test 1") {
+            std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> {
                 {
-                    { 30.0, 1.0 },
-                    { 40.0, 2.0 }
-                },
+                    {
+                        { 5.0, 1.0 },
+                        { 20.0, 2.0 }
+                    },
+                    {
+                        { 30.0, 1.0 },
+                        { 40.0, 2.0 }
+                    },
+                    {
+                        { 55.0, 1.0 },
+                        { 60.0, 2.0 }
+                    }
+                }
+            });
+            Tensor myOutput = Tensor(Array3D<float,3,1,2> {
                 {
-                    { 55.0, 1.0 },
-                    { 60.0, 2.0 }
+
+                    {{ 12.5, 1.5 }},
+                    {{ 35.0, 1.5 }},
+                    {{ 57.5, 1.5 }}
                 }
-            }
-        });
-        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array3D<float,3,1,2> {
-            {
+            });
 
-                {{ 12.5, 1.5 }},
-                {{ 35.0, 1.5 }},
-                {{ 57.5, 1.5 }}
-            }
-        });
+            std::shared_ptr<Node> myReduceMean = ReduceMean({1}, 1);
+            auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator());
+            op->associateInput(0,myInput);
+            op->setDataType(DataType::Float32);
+            op->setBackend("cpu");
+            op->computeOutputDims();
+            myReduceMean->forward();
+            op->getOutput(0)->print();
 
-        std::shared_ptr<Node> myReduceMean = ReduceMean({1}, 1);
-        auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator());
-        op->associateInput(0,myInput);
-        op->setDataType(DataType::Float32);
-        op->setBackend("cpu");
-        op->computeOutputDims();
-        myReduceMean->forward();
-        op->getOutput(0)->print();
+            REQUIRE(*(op->getOutput(0)) == myOutput);
+        }
+        SECTION("test 2") {
+            std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,3,2> {
+                {
+                    {
+                        { 0.0, 0.0 },
+                        { 1.0, 1.0 },
+                        { 2.0, 2.0 }
+                    },
+                    {
+                        { 3.0, 3.0 },
+                        { 4.0, 4.0 },
+                        { 5.0, 5.0 }
+                    },
+                    {
+                        { 6.0, 6.0 },
+                        { 7.0, 7.0 },
+                        { 8.0, 8.0 }
+                    }
+                }
+            });
+            Tensor myOutput = Tensor(Array3D<float,3,1,1> {
+                {
 
-        REQUIRE(*(op->getOutput(0)) == *myOutput);
+                    {{ 1.0 }},
+                    {{ 4.0 }},
+                    {{ 7.0 }}
+                }
+            });
+
+            std::shared_ptr<Node> myReduceMean = ReduceMean({1, 2}, 1);
+            auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator());
+            op->associateInput(0,myInput);
+            op->setDataType(DataType::Float32);
+            op->setBackend("cpu");
+            op->computeOutputDims();
+            myReduceMean->forward();
+            myOutput.print();
+            op->getOutput(0)->print();
+            REQUIRE(*(op->getOutput(0)) == myOutput);
+        }
     }
     SECTION("not_KeepDims") {
         std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> {
-- 
GitLab