From 2bb9f35b309bc9b910a78fdb81e5342399be4573 Mon Sep 17 00:00:00 2001 From: Olivier BICHLER <olivier.bichler@cea.fr> Date: Thu, 6 Feb 2025 10:04:25 +0100 Subject: [PATCH] Fix bug eclipse/aidge/aidge_backend_cpu#40 --- .../GlobalAveragePoolingImpl_kernels.hpp | 29 +++++++++++---- .../cpu/operator/ReduceMeanImpl_kernels.hpp | 36 ++++++++++++------- 2 files changed, 47 insertions(+), 18 deletions(-) diff --git a/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp index ed838a94..e396020f 100644 --- a/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp @@ -25,6 +25,28 @@ namespace Aidge { + +template <typename T> +typename std::enable_if<std::is_floating_point<T>::value, T>::type +stableMean(const T* vec, size_t size) { + T mean = 0; + for (size_t i = 0; i < size; ++i) { + mean = std::fma<T>(vec[i] - mean, 1.0f / (i + 1), mean); + } + return mean; +} + +// Specialization for integers: perform the mean computation in float +template <typename T> +typename std::enable_if<!std::is_floating_point<T>::value, T>::type +stableMean(const T* vec, size_t size) { + float mean = 0; + for (size_t i = 0; i < size; ++i) { + mean = std::fma<float>(vec[i] - mean, 1.0f / (i + 1), mean); + } + return mean; +} + template <class I, class O> void GlobalAveragePoolingImpl_cpu_forward_kernel( const std::vector<DimSize_t> &dims, const void *input_, void *output_) { @@ -49,12 +71,7 @@ void GlobalAveragePoolingImpl_cpu_forward_kernel( for (DimSize_t channel = 0; channel < dims[1]; ++channel) { const I *filter_start = std::next( input, (batch * in_batch_nb_elems) + (channel * in_channel_nb_elems)); - I mean = 0; - for (size_t i = 0; i < in_channel_nb_elems; ++i) { - // Single pass numerically stable mean, using the fmaf - mean = fmaf(filter_start[i] - mean, 1.0f/(i+1), mean); - } - output[batch * out_batch_nb_elems + channel] = mean; + output[batch * out_batch_nb_elems + channel] = stableMean<I>(filter_start, in_channel_nb_elems); } } } diff --git a/include/aidge/backend/cpu/operator/ReduceMeanImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ReduceMeanImpl_kernels.hpp index 5a143164..0ab23655 100644 --- a/include/aidge/backend/cpu/operator/ReduceMeanImpl_kernels.hpp +++ b/include/aidge/backend/cpu/operator/ReduceMeanImpl_kernels.hpp @@ -25,6 +25,28 @@ #include "aidge/utils/Registrar.hpp" namespace Aidge { + +template <typename T> +typename std::enable_if<std::is_floating_point<T>::value, T>::type +stableMean(const T* vec, size_t len, size_t stride) { + T mean = 0; + for (size_t i = 0; i < len; ++i) { + mean = std::fma<T>(vec[i * stride] - mean, 1.0f / (i + 1), mean); + } + return mean; +} + +// Specialization for integers: perform the mean computation in float +template <typename T> +typename std::enable_if<!std::is_floating_point<T>::value, T>::type +stableMean(const T* vec, size_t len, size_t stride) { + float mean = 0; + for (size_t i = 0; i < len; ++i) { + mean = std::fma<float>(vec[i * stride] - mean, 1.0f / (i + 1), mean); + } + return mean; +} + template <class I, class O> void ReduceMeanImpl_cpu_forward_kernel(const std::vector<std::int32_t>& axes, DimSize_t /*keepDims*/, @@ -50,12 +72,7 @@ void ReduceMeanImpl_cpu_forward_kernel(const std::vector<std::int32_t>& axes, for (std::size_t post = 0; post < stride_post; ++post) { const std::size_t idx_i = pre * dim_i * stride_post + post; const std::size_t idx_o = pre * stride_post + post; - O mean = 0; - for (std::size_t i = 0; i < dim_i; ++i) { - // Single pass numerically stable mean, using the fmaf - mean = fmaf(input[idx_i + i*stride_post] - mean, 1.0f/(i+1), mean); - } - output[idx_o] = mean; + output[idx_o] = stableMean<I>(input + idx_i, dim_i, stride_post); } } } else { @@ -84,12 +101,7 @@ void ReduceMeanImpl_cpu_forward_kernel(const std::vector<std::int32_t>& axes, for (std::size_t post = 0; post < stride_post[a]; ++post) { const std::size_t idx_i = pre * dim_i * stride_post[a] + post; const std::size_t idx_o = pre * stride_post[a] + post; - I mean = 0; - for (std::size_t i = 0; i < dim_i; ++i) { - // Single pass numerically stable mean, using the fmaf - mean = fmaf(inputAccumulation[idx_i + i*stride_post[a]] - mean, 1.0f/(i+1), mean); - } - outputAccumulation[idx_o] = mean; + outputAccumulation[idx_o] = stableMean<I>(inputAccumulation + idx_i, dim_i, stride_post[a]); } } std::for_each(stride_pre.get()+a+1, stride_pre.get()+nb_dims, [dim_i] (std::size_t& val) { val /= dim_i; }); -- GitLab