From 0c6edb5221964379bc5cc023a6c8651cf772ef26 Mon Sep 17 00:00:00 2001 From: NAUD Maxence <maxence.naud@cea.fr> Date: Thu, 8 Feb 2024 17:41:42 +0000 Subject: [PATCH] [Fix] ReduceMean operator forward kernel with refactor --- .../ReduceMeanImpl_forward_kernels.hpp | 87 ++++++++------- unit_tests/operator/Test_ReduceMeanImpl.cpp | 100 +++++++++++++----- 2 files changed, 114 insertions(+), 73 deletions(-) diff --git a/include/aidge/backend/cpu/operator/ReduceMeanImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/ReduceMeanImpl_forward_kernels.hpp index d9202615..71888aa5 100644 --- a/include/aidge/backend/cpu/operator/ReduceMeanImpl_forward_kernels.hpp +++ b/include/aidge/backend/cpu/operator/ReduceMeanImpl_forward_kernels.hpp @@ -13,7 +13,7 @@ #define AIDGE_CPU_OPERATOR_REDUCEMEANIMPL_FORWARD_KERNEL_H_ #include <cstddef> -#include <algorithm> +#include <algorithm> // std::copy, std::for_each #include <numeric> //std::accumulate #include <functional> //std::multiplies @@ -32,57 +32,56 @@ void ReduceMeanImpl_cpu_forward_kernel(const typename ReduceMean_Op<DIM>::Attrs& const I* input = static_cast<const I*>(input_); O* output = static_cast<O*>(output_); - const DimSize_t keepDims = std::get<1>(attrs); - // Calculate the total number of elements in the input array - const std::size_t totalElements = std::accumulate(inputDims.cbegin(), inputDims.cend(), 1, std::multiplies<std::size_t>()); - - - // Create a temporary arrays to store intermediate input/output for each Reduce op - std::vector<I> tempInArray(input, input + totalElements); - std::vector<I> tempOutArray(input, input + totalElements); - std::vector<std::size_t> currentDims = inputDims; + const std::size_t nb_dims = inputDims.size(); - std::size_t addedElems = 0; - for(std::size_t i = 0; i < DIM ; ++i) - { - addedElems = 0; - const std::size_t axis = static_cast<std::size_t>(std::get<0>(attrs)[i]); - - I* tempOutArrayPtr = tempOutArray.data(); + const std::size_t totalElements = 
std::accumulate(inputDims.cbegin(), inputDims.cend(), 1, std::multiplies<std::size_t>()); + std::size_t outputElements = totalElements; - std::size_t postAxisElems = 1; - for (std::size_t d = axis + 1; d < inputDims.size(); ++d) { - postAxisElems *= inputDims[d]; - } - std::size_t preAxisElems = 1; - for (std::size_t d = 0; d < axis; ++d) { - preAxisElems *= inputDims[d]; - } + std::size_t *stride_post = new std::size_t[nb_dims]; + stride_post[nb_dims - 1] = 1; + for (std::size_t i = nb_dims-2; i != static_cast<std::size_t>(-1); --i) { + stride_post[i] = stride_post[i+1]*inputDims[i+1]; + } + std::size_t *stride_pre = new std::size_t[nb_dims]; + stride_pre[0] = 1; + for (std::size_t i = 1; i < nb_dims; ++i) { + stride_pre[i] = stride_pre[i-1]*inputDims[i-1]; + } - for (std::size_t j=0; j<preAxisElems; ++j) - { - for (std::size_t k=0; k<postAxisElems; ++k) - { - // Compute the mean value for the element k of each stride - I mean = 0; - for(std::size_t l=0; l<currentDims[axis];l++) - { - std::size_t idx = j * (postAxisElems * currentDims[axis]) + l * postAxisElems + k; - mean += tempInArray[idx]; + const I* inputAccumulation = input; + I* outputAccumulation = nullptr; + + for (const std::size_t& a : std::get<0>(attrs)) { + outputElements /= inputDims[a]; + outputAccumulation = new I[outputElements]; + const std::size_t dim_i = inputDims[a]; + for (std::size_t pre = 0; pre < stride_pre[a]; ++pre) { + for (std::size_t post = 0; post < stride_post[a]; ++post) { + const std::size_t idx_i = pre * dim_i * stride_post[a] + post; + const std::size_t idx_o = pre * stride_post[a] + post; + outputAccumulation[idx_o] = inputAccumulation[idx_i]; + for (std::size_t i = 1; i < dim_i; ++i) { + outputAccumulation[idx_o] += inputAccumulation[idx_i + i*stride_post[a]]; } - tempOutArrayPtr[addedElems] = mean / currentDims[axis]; - addedElems++; } } + std::for_each(stride_pre+a+1, stride_pre+nb_dims, [dim_i] (std::size_t& val) { val /= dim_i; }); + if (inputAccumulation != input) { + 
delete[] inputAccumulation; + } + inputAccumulation = outputAccumulation; + } - // Update the input for the next reduce operation - tempInArray.assign(tempOutArray.begin(), tempOutArray.begin() + addedElems); - if(keepDims) - currentDims[axis] = 1; - else if (currentDims.size()>1) - currentDims.erase(currentDims.begin()+axis); + // Copy elements from inputAccumulation to output while dividing by divisor + I divisor = totalElements / outputElements; + std::transform(inputAccumulation, inputAccumulation + outputElements, output, + [divisor](I element) { return element / divisor; }); + if (outputAccumulation) { + delete[] outputAccumulation; } - std::copy_n(tempInArray.cbegin(), addedElems, output); + delete[] stride_post; + delete[] stride_pre; + } namespace { // DIM = 1 diff --git a/unit_tests/operator/Test_ReduceMeanImpl.cpp b/unit_tests/operator/Test_ReduceMeanImpl.cpp index 2cde38d6..494b7a6a 100644 --- a/unit_tests/operator/Test_ReduceMeanImpl.cpp +++ b/unit_tests/operator/Test_ReduceMeanImpl.cpp @@ -22,41 +22,83 @@ using namespace Aidge; TEST_CASE("[cpu/operator] ReduceMean(forward)", "[ReduceMean][CPU]") { SECTION("KeepDims") { - std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> { - { - { - { 5.0, 1.0 }, - { 20.0, 2.0 } - }, + SECTION("test 1") { + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> { { - { 30.0, 1.0 }, - { 40.0, 2.0 } - }, + { + { 5.0, 1.0 }, + { 20.0, 2.0 } + }, + { + { 30.0, 1.0 }, + { 40.0, 2.0 } + }, + { + { 55.0, 1.0 }, + { 60.0, 2.0 } + } + } + }); + Tensor myOutput = Tensor(Array3D<float,3,1,2> { { - { 55.0, 1.0 }, - { 60.0, 2.0 } + + {{ 12.5, 1.5 }}, + {{ 35.0, 1.5 }}, + {{ 57.5, 1.5 }} } - } - }); - std::shared_ptr<Tensor> myOutput = std::make_shared<Tensor>(Array3D<float,3,1,2> { - { + }); - {{ 12.5, 1.5 }}, - {{ 35.0, 1.5 }}, - {{ 57.5, 1.5 }} - } - }); + std::shared_ptr<Node> myReduceMean = ReduceMean({1}, 1); + auto op = 
std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator()); + op->associateInput(0,myInput); + op->setDataType(DataType::Float32); + op->setBackend("cpu"); + op->computeOutputDims(); + myReduceMean->forward(); + op->getOutput(0)->print(); - std::shared_ptr<Node> myReduceMean = ReduceMean({1}, 1); - auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator()); - op->associateInput(0,myInput); - op->setDataType(DataType::Float32); - op->setBackend("cpu"); - op->computeOutputDims(); - myReduceMean->forward(); - op->getOutput(0)->print(); + REQUIRE(*(op->getOutput(0)) == myOutput); + } + SECTION("test 2") { + std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,3,2> { + { + { + { 0.0, 0.0 }, + { 1.0, 1.0 }, + { 2.0, 2.0 } + }, + { + { 3.0, 3.0 }, + { 4.0, 4.0 }, + { 5.0, 5.0 } + }, + { + { 6.0, 6.0 }, + { 7.0, 7.0 }, + { 8.0, 8.0 } + } + } + }); + Tensor myOutput = Tensor(Array3D<float,3,1,1> { + { - REQUIRE(*(op->getOutput(0)) == *myOutput); + {{ 1.0 }}, + {{ 4.0 }}, + {{ 7.0 }} + } + }); + + std::shared_ptr<Node> myReduceMean = ReduceMean({1, 2}, 1); + auto op = std::static_pointer_cast<OperatorTensor>(myReduceMean -> getOperator()); + op->associateInput(0,myInput); + op->setDataType(DataType::Float32); + op->setBackend("cpu"); + op->computeOutputDims(); + myReduceMean->forward(); + myOutput.print(); + op->getOutput(0)->print(); + REQUIRE(*(op->getOutput(0)) == myOutput); + } } SECTION("not_KeepDims") { std::shared_ptr<Tensor> myInput = std::make_shared<Tensor>(Array3D<float,3,2,2> { -- GitLab