From c1849ef26808f618c80fc81df7161f9b44e0111a Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Tue, 8 Apr 2025 19:38:46 +0200
Subject: [PATCH] Reduced OpenMP thresholds

---
 .../aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp | 2 +-
 .../aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp  | 2 +-
 .../backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp    | 8 ++++----
 include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp   | 8 ++++----
 include/aidge/backend/cpu/operator/FCImpl_kernels.hpp     | 2 +-
 .../cpu/operator/GlobalAveragePoolingImpl_kernels.hpp     | 2 +-
 include/aidge/backend/cpu/operator/MatMulImpl_kernels.hpp | 2 +-
 .../aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp | 2 +-
 .../aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp    | 2 +-
 9 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
index e7bc3a2b..f9cc13b5 100644
--- a/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
@@ -77,7 +77,7 @@ void AvgPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideD
     using signedsize = std::make_signed<std::size_t>::type;
 
 #ifdef _OPENMP
-    #pragma omp parallel for collapse(2) if (dims[0] * dims[1] > 32)
+    #pragma omp parallel for collapse(2) if (dims[0] * dims[1] >= 16)
 #endif
     for (int batch = 0; batch < static_cast<int>(dims[0]); ++batch) {
         for (int ch = 0; ch < static_cast<int>(dims[1]); ++ch) {
diff --git a/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp b/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp
index 105a3300..d1d7d529 100644
--- a/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp
@@ -54,7 +54,7 @@ void BatchNormImpl2D_cpu_forward_kernel(float epsilon, float momentum, const std
 
     if ((freeze == true) || (momentum == 0.0f)) {
 #ifdef _OPENMP
-    #pragma omp parallel for collapse(2) if (nbBatch * nbChannels > 32)
+    #pragma omp parallel for collapse(2) if (nbBatch * nbChannels >= 16)
 #endif
         for (int batch = 0; batch < static_cast<int>(nbBatch); ++batch) {
             for (int ch = 0; ch < static_cast<int>(nbChannels); ++ch) {
diff --git a/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp
index aac83b1b..0e2f5a72 100644
--- a/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp
@@ -66,7 +66,7 @@ void ConvDepthWiseImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& stri
     // does not take Dilation attribute into account
     using signedsize = std::make_signed<std::size_t>::type;
 #ifdef _OPENMP
-    #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] > 32)
+    #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] >= 16)
 #endif
     for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
         for (int ch = 0; ch < static_cast<int>(inputDims[1]); ++ch) {
@@ -156,7 +156,7 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
 
     if (dilated_kernel_x ==3 && dilated_kernel_y == 3) {
 #ifdef _OPENMP
-    #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] > 32)
+    #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] >= 16)
 #endif
         for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
             for (int ch = 0; ch < static_cast<int>(inputDims[1]); ++ch) {
@@ -199,7 +199,7 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
         }
     } else if (dilated_kernel_x == 1 && dilated_kernel_y == 1) {
 #ifdef _OPENMP
-    #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] > 32)
+    #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] >= 16)
 #endif
         for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
             for (int ch = 0; ch < static_cast<int>(inputDims[1]); ++ch) {
@@ -224,7 +224,7 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
         }
     } else {
 #ifdef _OPENMP
-    #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] > 32)
+    #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] >= 16)
 #endif
         for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
             for (int ch = 0; ch < static_cast<int>(inputDims[1]); ++ch) {
diff --git a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
index fc3904ad..e1e76a33 100644
--- a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
@@ -60,7 +60,7 @@ void ConvImpl1D_cpu_forward_kernel(const array<DimSize_t, 1> &strideDim,
     using signedsize = std::make_signed<std::size_t>::type;
 
 #ifdef _OPENMP
-    #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels > 32)
+    #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels >= 16)
 #endif
     for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
         for (int outCh = 0; outCh < static_cast<int>(outChannels); ++outCh) {
@@ -482,7 +482,7 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
 
     if (dilated_kernel_x == 3 && dilated_kernel_y == 3) {
 #ifdef _OPENMP
-    #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels > 32)
+    #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels >= 16)
 #endif
         for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
             for (int outCh = 0; outCh < static_cast<int>(outChannels); ++outCh) {
@@ -571,7 +571,7 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
         }
     } else if (dilated_kernel_x == 1 && dilated_kernel_y == 1) {
 #ifdef _OPENMP
-    #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels > 32)
+    #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels >= 16)
 #endif
         for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
             for (int outCh = 0; outCh < static_cast<int>(outChannels); ++outCh) {
@@ -607,7 +607,7 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
         }
     } else {
 #ifdef _OPENMP
-    #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels > 32)
+    #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels >= 16)
 #endif
         for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
             for (int outCh = 0; outCh < static_cast<int>(outChannels); ++outCh) {
diff --git a/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
index ca4d5def..b03e7f58 100644
--- a/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
@@ -97,7 +97,7 @@ void FCImpl_cpu_forward_kernel(const DimSize_t batchSize,
     O* output = static_cast<O*>(output_);
 
 #ifdef _OPENMP
-    #pragma omp parallel for collapse(2) if (batchSize * outputFeatureSize > 32)
+    #pragma omp parallel for collapse(2) if (batchSize * outputFeatureSize >= 16)
 #endif
     for (int batch = 0; batch < static_cast<int>(batchSize); ++batch) {
         for (int out = 0; out < static_cast<int>(outputFeatureSize); ++out) {
diff --git a/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp
index 8ff1ad08..3cab0ad9 100644
--- a/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp
@@ -75,7 +75,7 @@ void GlobalAveragePoolingImpl_cpu_forward_kernel(const std::shared_ptr<Tensor>&
     // parse channel by channel and fill each output with the average of the
     // values in the channel
 #ifdef _OPENMP
-    #pragma omp parallel for collapse(2) if (dims[0] * dims[1] > 32)
+    #pragma omp parallel for collapse(2) if (dims[0] * dims[1] >= 16)
 #endif
     for (int batch = 0; batch < static_cast<int>(dims[0]); ++batch) {
         for (int channel = 0; channel < static_cast<int>(dims[1]); ++channel) {
diff --git a/include/aidge/backend/cpu/operator/MatMulImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MatMulImpl_kernels.hpp
index 422020a6..adcc8ddc 100644
--- a/include/aidge/backend/cpu/operator/MatMulImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/MatMulImpl_kernels.hpp
@@ -27,7 +27,7 @@ void MatMulImpl_cpu_forward_kernel(const std::size_t n, const std::size_t k, con
     std::memset(output, O(0), n * m * sizeof(O));
 
 #ifdef _OPENMP
-    #pragma omp parallel for if (n > 32)
+    #pragma omp parallel for if (n >= 16)
 #endif
     for (int i = 0; i < static_cast<int>(n); ++i) {
         for (std::size_t l = 0; l < k; ++l) {
diff --git a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
index b5f219f9..7fe272d5 100644
--- a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
@@ -67,7 +67,7 @@ void MaxPoolingImpl2D_cpu_forward_kernel(
     using signedsize = std::make_signed<std::size_t>::type;
 
 #ifdef _OPENMP
-    #pragma omp parallel for collapse(2) if (dims[0] * dims[1] > 32)
+    #pragma omp parallel for collapse(2) if (dims[0] * dims[1] >= 16)
 #endif
     for (int batch = 0; batch < static_cast<int>(dims[0]); ++batch){
         for (int channel = 0; channel < static_cast<int>(dims[1]); ++channel){
diff --git a/include/aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp b/include/aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp
index ab6790e2..0e72710c 100644
--- a/include/aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp
@@ -38,7 +38,7 @@ void SoftmaxImpl_cpu_forward_kernel(std::size_t axisIdx, const std::vector<DimSi
     }
 
 #ifdef _OPENMP
-    #pragma omp parallel for collapse(2) if (preAxisElems * postAxisElems > 32)
+    #pragma omp parallel for collapse(2) if (preAxisElems * postAxisElems >= 16)
 #endif
     for (int i = 0; i < static_cast<int>(preAxisElems); ++i) {
         for (int j = 0; j < static_cast<int>(postAxisElems); ++j) {
-- 
GitLab
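
Note on the idiom being tuned (illustration only, not part of the patch): each kernel guards its outer loop nest with an OpenMP "if" clause, so the collapsed loops run in parallel only when the product of the two trip counts reaches the threshold; below it, the loops execute sequentially and thread start-up overhead is avoided. Lowering the threshold from "> 32" to ">= 16" simply makes the parallel path activate for smaller batch-times-channel products. The sketch below shows the same conditional-parallelism pattern on a made-up kernel; the function, argument names, and workload are assumptions, only the pragma mirrors the patched code.

#include <cstddef>
#include <vector>

// Hypothetical kernel (not from the Aidge sources): with the if clause,
// OpenMP parallelizes the collapsed (batch, channel) loop nest only when
// batch * channels >= 16; smaller workloads stay sequential.
static void scale_kernel(std::vector<float>& data,
                         std::size_t batch, std::size_t channels,
                         std::size_t spatial, float factor)
{
#ifdef _OPENMP
    #pragma omp parallel for collapse(2) if (batch * channels >= 16)
#endif
    for (int b = 0; b < static_cast<int>(batch); ++b) {
        for (int c = 0; c < static_cast<int>(channels); ++c) {
            // contiguous NCHW-style layout: one spatial plane per (b, c) pair
            float* ptr = data.data()
                + (static_cast<std::size_t>(b) * channels + c) * spatial;
            for (std::size_t i = 0; i < spatial; ++i) {
                ptr[i] *= factor;   // trivial per-element work
            }
        }
    }
}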