From 6faa813652c55dac15ae41457d30d90a9a0dcd22 Mon Sep 17 00:00:00 2001 From: NAUD Maxence <maxence.naud@cea.fr> Date: Fri, 9 Feb 2024 09:53:07 +0000 Subject: [PATCH] [Add] small optimization --- .../LeakyReLUImpl_forward_kernels.hpp | 2 +- .../ReduceMeanImpl_forward_kernels.hpp | 96 +++++++++++-------- 2 files changed, 58 insertions(+), 40 deletions(-) diff --git a/include/aidge/backend/cpu/operator/LeakyReLUImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/LeakyReLUImpl_forward_kernels.hpp index 761b9579..d10b32e1 100644 --- a/include/aidge/backend/cpu/operator/LeakyReLUImpl_forward_kernels.hpp +++ b/include/aidge/backend/cpu/operator/LeakyReLUImpl_forward_kernels.hpp @@ -25,7 +25,7 @@ void LeakyReLUImpl_cpu_forward_kernel(const LeakyReLU_Op::Attrs& attrs, const I* input = static_cast<const I*>(input_); O* output = static_cast<O*>(output_); - I negativeSlope = static_cast<I>(std::get<0>(attrs)); + const I negativeSlope = static_cast<const I>(std::get<0>(attrs)); for (std::size_t i = 0; i < inputLenght; ++i) { output[i] = input[i] >= 0 ? input[i] : input[i] * negativeSlope; diff --git a/include/aidge/backend/cpu/operator/ReduceMeanImpl_forward_kernels.hpp b/include/aidge/backend/cpu/operator/ReduceMeanImpl_forward_kernels.hpp index 71888aa5..bc9ada0f 100644 --- a/include/aidge/backend/cpu/operator/ReduceMeanImpl_forward_kernels.hpp +++ b/include/aidge/backend/cpu/operator/ReduceMeanImpl_forward_kernels.hpp @@ -33,56 +33,74 @@ void ReduceMeanImpl_cpu_forward_kernel(const typename ReduceMean_Op<DIM>::Attrs& O* output = static_cast<O*>(output_); const std::size_t nb_dims = inputDims.size(); - const std::size_t totalElements = std::accumulate(inputDims.cbegin(), inputDims.cend(), 1, std::multiplies<std::size_t>()); - std::size_t outputElements = totalElements; - std::size_t *stride_post = new std::size_t[nb_dims]; - stride_post[nb_dims - 1] = 1; - for (std::size_t i = nb_dims-2; i != static_cast<std::size_t>(-1); --i) { - stride_post[i] = stride_post[i+1]*inputDims[i+1]; - } - std::size_t *stride_pre = new std::size_t[nb_dims]; - stride_pre[0] = 1; - for (std::size_t i = 1; i < nb_dims; ++i) { - stride_pre[i] = stride_pre[i-1]*inputDims[i-1]; - } + if (DIM == 1) { + const std::size_t stride_pre = std::accumulate(inputDims.cbegin(), inputDims.cbegin() + std::get<0>(attrs)[0], 1, std::multiplies<std::size_t>()); + const std::size_t stride_post = std::accumulate(inputDims.crbegin(), inputDims.crbegin() + nb_dims -1 - std::get<0>(attrs)[0], 1, std::multiplies<std::size_t>()); - const I* inputAccumulation = input; - I* outputAccumulation = nullptr; - - for (const std::size_t& a : std::get<0>(attrs)) { - outputElements /= inputDims[a]; - outputAccumulation = new I[outputElements]; - const std::size_t dim_i = inputDims[a]; - for (std::size_t pre = 0; pre < stride_pre[a]; ++pre) { - for (std::size_t post = 0; post < stride_post[a]; ++post) { - const std::size_t idx_i = pre * dim_i * stride_post[a] + post; - const std::size_t idx_o = pre * stride_post[a] + post; - outputAccumulation[idx_o] = inputAccumulation[idx_i]; + const std::size_t dim_i = inputDims[std::get<0>(attrs)[0]]; + for (std::size_t pre = 0; pre < stride_pre; ++pre) { + for (std::size_t post = 0; post < stride_post; ++post) { + const std::size_t idx_i = pre * dim_i * stride_post + post; + const std::size_t idx_o = pre * stride_post + post; + output[idx_o] = input[idx_i]; for (std::size_t i = 1; i < dim_i; ++i) { - outputAccumulation[idx_o] += inputAccumulation[idx_i + i*stride_post[a]]; + output[idx_o] += input[idx_i + i*stride_post]; } + output[idx_o] /= dim_i; } } - std::for_each(stride_pre+a+1, stride_pre+nb_dims, [dim_i] (std::size_t& val) { val /= dim_i; }); - if (inputAccumulation != input) { - delete[] inputAccumulation; + } else { + std::size_t outputElements = totalElements; + + std::size_t *stride_post = new std::size_t[nb_dims]; + stride_post[nb_dims - 1] = 1; + for (std::size_t i = nb_dims-2; i != static_cast<std::size_t>(-1); --i) { + stride_post[i] = stride_post[i+1]*inputDims[i+1]; + } + std::size_t *stride_pre = new std::size_t[nb_dims]; + stride_pre[0] = 1; + for (std::size_t i = 1; i < nb_dims; ++i) { + stride_pre[i] = stride_pre[i-1]*inputDims[i-1]; } - inputAccumulation = outputAccumulation; - } - // Copy elements from inputAccumulation to output while dividing by divisor - I divisor = totalElements / outputElements; - std::transform(inputAccumulation, inputAccumulation + outputElements, output, - [divisor](int element) { return element / divisor; }); - if (outputAccumulation) { - delete[] outputAccumulation; - } - delete[] stride_post; - delete[] stride_pre; + const I* inputAccumulation = input; + I* outputAccumulation = nullptr; + for (const std::size_t& a : std::get<0>(attrs)) { + outputElements /= inputDims[a]; + outputAccumulation = new I[outputElements]; + const std::size_t dim_i = inputDims[a]; + for (std::size_t pre = 0; pre < stride_pre[a]; ++pre) { + for (std::size_t post = 0; post < stride_post[a]; ++post) { + const std::size_t idx_i = pre * dim_i * stride_post[a] + post; + const std::size_t idx_o = pre * stride_post[a] + post; + outputAccumulation[idx_o] = inputAccumulation[idx_i]; + for (std::size_t i = 1; i < dim_i; ++i) { + outputAccumulation[idx_o] += inputAccumulation[idx_i + i*stride_post[a]]; + } + } + } + std::for_each(stride_pre+a+1, stride_pre+nb_dims, [dim_i] (std::size_t& val) { val /= dim_i; }); + if (inputAccumulation != input) { + delete[] inputAccumulation; + } + inputAccumulation = outputAccumulation; + } + + // Copy elements from inputAccumulation to output while dividing by divisor + I divisor = totalElements / outputElements; + std::transform(inputAccumulation, inputAccumulation + outputElements, output, + [divisor](int element) { return element / divisor; }); + if (outputAccumulation) { + delete[] outputAccumulation; + } + delete[] stride_post; + delete[] stride_pre; + } } + namespace { // DIM = 1 static Registrar<ReduceMeanImpl1DForward_cpu> registrarReduceMeanImplForward_1D_cpu_Float32( -- GitLab