Commit 0c6edb52 authored by Maxence Naud

[Fix] ReduceMean operator forward kernel with refactor

parent d87663c3
2 merge requests: !50 version 0.2.0, !20 Vit operators
Pipeline #38688 failed
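
The refactor replaces the previous scheme, which copied the whole tensor into temporary std::vectors and divided after every axis, with a stride-based walk that accumulates raw sums and divides a single time at the end. Below is a minimal standalone sketch of the same stride arithmetic for a single axis; the names (reduceMeanAxis, shape) are hypothetical and do not match the kernel's actual signature:

    #include <cstddef>
    #include <vector>

    // Mean over one axis of a row-major tensor. For each (pre, post) pair,
    // the reduced elements sit stride_post apart, starting at
    // pre * dim * stride_post + post.
    std::vector<float> reduceMeanAxis(const std::vector<float>& in,
                                      const std::vector<std::size_t>& shape,
                                      std::size_t axis) {
        std::size_t stride_post = 1;   // number of elements after 'axis'
        for (std::size_t d = axis + 1; d < shape.size(); ++d) { stride_post *= shape[d]; }
        std::size_t stride_pre = 1;    // number of elements before 'axis'
        for (std::size_t d = 0; d < axis; ++d) { stride_pre *= shape[d]; }

        const std::size_t dim = shape[axis];
        std::vector<float> out(stride_pre * stride_post);
        for (std::size_t pre = 0; pre < stride_pre; ++pre) {
            for (std::size_t post = 0; post < stride_post; ++post) {
                float acc = 0.0f;
                for (std::size_t i = 0; i < dim; ++i) {
                    acc += in[pre * dim * stride_post + i * stride_post + post];
                }
                out[pre * stride_post + post] = acc / static_cast<float>(dim);
            }
        }
        return out;  // e.g. reduceMeanAxis(data, {3, 2, 2}, 1) reproduces "test 1" below
    }

The committed kernel applies this walk once per reduced axis but keeps raw sums between passes; dividing only once, by the product of the reduced dimensions, avoids compounding truncation when I is an integer type.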
@@ -13,7 +13,7 @@
 #define AIDGE_CPU_OPERATOR_REDUCEMEANIMPL_FORWARD_KERNEL_H_
 #include <cstddef>
-#include <algorithm>
+#include <algorithm>   // std::copy, std::for_each
 #include <numeric>     //std::accumulate
 #include <functional>  //std::multiplies
@@ -32,57 +32,56 @@ void ReduceMeanImpl_cpu_forward_kernel(const typename ReduceMean_Op<DIM>::Attrs&
     const I* input = static_cast<const I*>(input_);
     O* output = static_cast<O*>(output_);
 
-    const DimSize_t keepDims = std::get<1>(attrs);
-    // Calculate the total number of elements in the input array
-    const std::size_t totalElements = std::accumulate(inputDims.cbegin(), inputDims.cend(), 1, std::multiplies<std::size_t>());
-
-    // Create temporary arrays to store intermediate input/output for each Reduce op
-    std::vector<I> tempInArray(input, input + totalElements);
-    std::vector<I> tempOutArray(input, input + totalElements);
-    std::vector<std::size_t> currentDims = inputDims;
-
-    std::size_t addedElems = 0;
-    for (std::size_t i = 0; i < DIM; ++i) {
-        addedElems = 0;
-        const std::size_t axis = static_cast<std::size_t>(std::get<0>(attrs)[i]);
-        I* tempOutArrayPtr = tempOutArray.data();
-
-        std::size_t postAxisElems = 1;
-        for (std::size_t d = axis + 1; d < inputDims.size(); ++d) {
-            postAxisElems *= inputDims[d];
-        }
-        std::size_t preAxisElems = 1;
-        for (std::size_t d = 0; d < axis; ++d) {
-            preAxisElems *= inputDims[d];
-        }
-
-        for (std::size_t j = 0; j < preAxisElems; ++j) {
-            for (std::size_t k = 0; k < postAxisElems; ++k) {
-                // Compute the mean value for the element k of each stride
-                I mean = 0;
-                for (std::size_t l = 0; l < currentDims[axis]; ++l) {
-                    std::size_t idx = j * (postAxisElems * currentDims[axis]) + l * postAxisElems + k;
-                    mean += tempInArray[idx];
-                }
-                tempOutArrayPtr[addedElems] = mean / currentDims[axis];
-                addedElems++;
-            }
-        }
-
-        // Update the input for the next reduce operation
-        tempInArray.assign(tempOutArray.begin(), tempOutArray.begin() + addedElems);
-        if (keepDims)
-            currentDims[axis] = 1;
-        else if (currentDims.size() > 1)
-            currentDims.erase(currentDims.begin() + axis);
-    }
-    std::copy_n(tempInArray.cbegin(), addedElems, output);
+    const std::size_t nb_dims = inputDims.size();
+    const std::size_t totalElements = std::accumulate(inputDims.cbegin(), inputDims.cend(), 1, std::multiplies<std::size_t>());
+    std::size_t outputElements = totalElements;
+
+    // Number of elements after (stride_post) and before (stride_pre) each dimension
+    std::size_t *stride_post = new std::size_t[nb_dims];
+    stride_post[nb_dims - 1] = 1;
+    for (std::size_t i = nb_dims - 2; i != static_cast<std::size_t>(-1); --i) {
+        stride_post[i] = stride_post[i + 1] * inputDims[i + 1];
+    }
+    std::size_t *stride_pre = new std::size_t[nb_dims];
+    stride_pre[0] = 1;
+    for (std::size_t i = 1; i < nb_dims; ++i) {
+        stride_pre[i] = stride_pre[i - 1] * inputDims[i - 1];
+    }
+
+    const I* inputAccumulation = input;
+    I* outputAccumulation = nullptr;
+
+    // Accumulate raw sums along each axis to reduce, one axis at a time
+    for (const std::size_t& a : std::get<0>(attrs)) {
+        outputElements /= inputDims[a];
+        outputAccumulation = new I[outputElements];
+        const std::size_t dim_i = inputDims[a];
+        for (std::size_t pre = 0; pre < stride_pre[a]; ++pre) {
+            for (std::size_t post = 0; post < stride_post[a]; ++post) {
+                const std::size_t idx_i = pre * dim_i * stride_post[a] + post;
+                const std::size_t idx_o = pre * stride_post[a] + post;
+                outputAccumulation[idx_o] = inputAccumulation[idx_i];
+                for (std::size_t i = 1; i < dim_i; ++i) {
+                    outputAccumulation[idx_o] += inputAccumulation[idx_i + i * stride_post[a]];
+                }
+            }
+        }
+        // The reduced dimension disappears: rescale the pre-strides of later axes
+        std::for_each(stride_pre + a + 1, stride_pre + nb_dims, [dim_i](std::size_t& val) { val /= dim_i; });
+        if (inputAccumulation != input) {
+            delete[] inputAccumulation;
+        }
+        inputAccumulation = outputAccumulation;
+    }
+
+    // Copy elements from inputAccumulation to output while dividing by the divisor
+    I divisor = totalElements / outputElements;
+    std::transform(inputAccumulation, inputAccumulation + outputElements, output,
+                   [divisor](I element) { return element / divisor; });
+    if (outputAccumulation) {
+        delete[] outputAccumulation;
+    }
+    delete[] stride_post;
+    delete[] stride_pre;
 }
 namespace {
 // DIM = 1
...
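A quick worked example of the stride bookkeeping above, using the shape from "test 2" below (a {3, 3, 2} tensor reduced over axes {1, 2}): initially stride_pre = {1, 3, 9} and stride_post = {6, 2, 1}. Reducing axis 1 (dim_i = 3) rescales stride_pre[2] to 9 / 3 = 3, which is exactly the number of elements preceding axis 2 once that dimension has collapsed; stride_post needs no update because each entry only depends on dimensions after its axis, none of which have been reduced yet. This bookkeeping is consistent only when the axes attribute is sorted in increasing order. The final divisor is totalElements / outputElements = 18 / 3 = 6, the product of the two reduced dimensions.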
@@ -22,41 +22,83 @@ using namespace Aidge;
 TEST_CASE("[cpu/operator] ReduceMean(forward)", "[ReduceMean][CPU]") {
     SECTION("KeepDims") {
-        std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> {
-            {
-                {
-                    { 5.0, 1.0 },
-                    { 20.0, 2.0 }
-                },
-                {
-                    { 30.0, 1.0 },
-                    { 40.0, 2.0 }
-                },
-                {
-                    { 55.0, 1.0 },
-                    { 60.0, 2.0 }
-                }
-            }
-        });
-        std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array3D<float,3,1,2> {
-            {
-                {{ 12.5, 1.5 }},
-                {{ 35.0, 1.5 }},
-                {{ 57.5, 1.5 }}
-            }
-        });
-
-        std::shared_ptr<Node> myReduceMean = ReduceMean({1}, 1);
-        auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator());
-        op->associateInput(0,myInput);
-        op->setDataType(DataType::Float32);
-        op->setBackend("cpu");
-        op->computeOutputDims();
-        myReduceMean->forward();
-        op->getOutput(0)->print();
-
-        REQUIRE(*(op->getOutput(0)) == *myOutput);
+        SECTION("test 1") {
+            std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> {
+                {
+                    {
+                        { 5.0, 1.0 },
+                        { 20.0, 2.0 }
+                    },
+                    {
+                        { 30.0, 1.0 },
+                        { 40.0, 2.0 }
+                    },
+                    {
+                        { 55.0, 1.0 },
+                        { 60.0, 2.0 }
+                    }
+                }
+            });
+            Tensor myOutput = Tensor(Array3D<float,3,1,2> {
+                {
+                    {{ 12.5, 1.5 }},
+                    {{ 35.0, 1.5 }},
+                    {{ 57.5, 1.5 }}
+                }
+            });
+
+            std::shared_ptr<Node> myReduceMean = ReduceMean({1}, 1);
+            auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator());
+            op->associateInput(0,myInput);
+            op->setDataType(DataType::Float32);
+            op->setBackend("cpu");
+            op->computeOutputDims();
+            myReduceMean->forward();
+            op->getOutput(0)->print();
+
+            REQUIRE(*(op->getOutput(0)) == myOutput);
+        }
+        SECTION("test 2") {
+            std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,3,2> {
+                {
+                    {
+                        { 0.0, 0.0 },
+                        { 1.0, 1.0 },
+                        { 2.0, 2.0 }
+                    },
+                    {
+                        { 3.0, 3.0 },
+                        { 4.0, 4.0 },
+                        { 5.0, 5.0 }
+                    },
+                    {
+                        { 6.0, 6.0 },
+                        { 7.0, 7.0 },
+                        { 8.0, 8.0 }
+                    }
+                }
+            });
+            Tensor myOutput = Tensor(Array3D<float,3,1,1> {
+                {
+                    {{ 1.0 }},
+                    {{ 4.0 }},
+                    {{ 7.0 }}
+                }
+            });
+
+            std::shared_ptr<Node> myReduceMean = ReduceMean({1, 2}, 1);
+            auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator());
+            op->associateInput(0,myInput);
+            op->setDataType(DataType::Float32);
+            op->setBackend("cpu");
+            op->computeOutputDims();
+            myReduceMean->forward();
+            myOutput.print();
+            op->getOutput(0)->print();
+
+            REQUIRE(*(op->getOutput(0)) == myOutput);
+        }
     }
     SECTION("not_KeepDims") {
         std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> {
...
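As a quick sanity check on the expected tensors: in "test 1", reducing axis 1 of the 3x2x2 input averages the two rows of each 2x2 slice, so the first slice {5, 1}, {20, 2} yields {(5 + 20) / 2, (1 + 2) / 2} = {12.5, 1.5}. In "test 2", reducing axes {1, 2} averages all six entries of each 3x2 slice: the first slice holds 0, 0, 1, 1, 2, 2, whose mean is 6 / 6 = 1.0, matching the expected {{ 1.0 }}.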