diff --git a/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
index e7bc3a2b845d077877684f75d7980fcd1958eb6e..f9cc13b5b0be6e63aa2ac7da8d3eccbaf7c9cd2e 100644
--- a/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
@@ -77,7 +77,7 @@ void AvgPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideD
     using signedsize = std::make_signed<std::size_t>::type;

 #ifdef _OPENMP
-    #pragma omp parallel for collapse(2) if (dims[0] * dims[1] > 32)
+    #pragma omp parallel for collapse(2) if (dims[0] * dims[1] >= 16)
 #endif
     for (int batch = 0; batch < static_cast<int>(dims[0]); ++batch) {
         for (int ch = 0; ch < static_cast<int>(dims[1]); ++ch) {
diff --git a/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp b/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp
index 105a33007bae830507128804641b8feb16bc0848..d1d7d529756c1bbad2880579a5dac57ebd9e07c7 100644
--- a/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp
@@ -54,7 +54,7 @@ void BatchNormImpl2D_cpu_forward_kernel(float epsilon, float momentum, const std

     if ((freeze == true) || (momentum == 0.0f)) {
 #ifdef _OPENMP
-    #pragma omp parallel for collapse(2) if (nbBatch * nbChannels > 32)
+    #pragma omp parallel for collapse(2) if (nbBatch * nbChannels >= 16)
 #endif
         for (int batch = 0; batch < static_cast<int>(nbBatch); ++batch) {
             for (int ch = 0; ch < static_cast<int>(nbChannels); ++ch) {
diff --git a/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp
index aac83b1be928735972010ae3f3554d7d9bf487c3..0e2f5a72e4ad1a7e2c8bd239e43914642121965f 100644
--- a/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp
@@ -66,7 +66,7 @@ void ConvDepthWiseImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& stri
     // does not take Dilation attribute into account
     using signedsize = std::make_signed<std::size_t>::type;
 #ifdef _OPENMP
-    #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] > 32)
+    #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] >= 16)
 #endif
     for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
         for (int ch = 0; ch < static_cast<int>(inputDims[1]); ++ch) {
@@ -156,7 +156,7 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri

     if (dilated_kernel_x ==3 && dilated_kernel_y == 3) {
 #ifdef _OPENMP
-    #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] > 32)
+    #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] >= 16)
 #endif
         for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
             for (int ch = 0; ch < static_cast<int>(inputDims[1]); ++ch) {
@@ -199,7 +199,7 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
            }
        } else if (dilated_kernel_x == 1 && dilated_kernel_y == 1) {
 #ifdef _OPENMP
-    #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] > 32)
+    #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] >= 16)
 #endif
         for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
             for (int ch = 0; ch < static_cast<int>(inputDims[1]); ++ch) {
@@ -224,7 +224,7 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
            }
        } else {
 #ifdef _OPENMP
-    #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] > 32)
+    #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] >= 16)
 #endif
         for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
             for (int ch = 0; ch < static_cast<int>(inputDims[1]); ++ch) {
diff --git a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
index fc3904ad229dbd64ddf694fd7c5992558648d04b..e1e76a33120bb9536842a9f0db4cc789f8fe38a1 100644
--- a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
@@ -60,7 +60,7 @@ void ConvImpl1D_cpu_forward_kernel(const array<DimSize_t, 1> &strideDim,
     using signedsize = std::make_signed<std::size_t>::type;

 #ifdef _OPENMP
-    #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels > 32)
+    #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels >= 16)
 #endif
     for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
         for (int outCh = 0; outCh < static_cast<int>(outChannels); ++outCh) {
@@ -482,7 +482,7 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,

     if (dilated_kernel_x == 3 && dilated_kernel_y == 3) {
 #ifdef _OPENMP
-    #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels > 32)
+    #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels >= 16)
 #endif
         for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
             for (int outCh = 0; outCh < static_cast<int>(outChannels); ++outCh) {
@@ -571,7 +571,7 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
            }
        } else if (dilated_kernel_x == 1 && dilated_kernel_y == 1) {
 #ifdef _OPENMP
-    #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels > 32)
+    #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels >= 16)
 #endif
         for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
             for (int outCh = 0; outCh < static_cast<int>(outChannels); ++outCh) {
@@ -607,7 +607,7 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
            }
        } else {
 #ifdef _OPENMP
-    #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels > 32)
+    #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels >= 16)
 #endif
         for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
             for (int outCh = 0; outCh < static_cast<int>(outChannels); ++outCh) {
diff --git a/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
index ca4d5def783a2fae87ee55ae0b5007c795c8b599..b03e7f58c19b119ec72306f7d9979607a707cde7 100644
--- a/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
@@ -97,7 +97,7 @@ void FCImpl_cpu_forward_kernel(const DimSize_t batchSize,
     O* output = static_cast<O*>(output_);

 #ifdef _OPENMP
-    #pragma omp parallel for collapse(2) if (batchSize * outputFeatureSize > 32)
+    #pragma omp parallel for collapse(2) if (batchSize * outputFeatureSize >= 16)
 #endif
     for (int batch = 0; batch < static_cast<int>(batchSize); ++batch) {
         for (int out = 0; out < static_cast<int>(outputFeatureSize); ++out) {
diff --git a/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp
index 8ff1ad08b0897dbc3d89d67632a555effba73a85..3cab0ad9647a974170bf682fcf3b57b306bd76bd 100644
--- a/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp
@@ -75,7 +75,7 @@ void GlobalAveragePoolingImpl_cpu_forward_kernel(const std::shared_ptr<Tensor>&
     // parse channel by channel and fill each output with the average of the
     // values in the channel
 #ifdef _OPENMP
-    #pragma omp parallel for collapse(2) if (dims[0] * dims[1] > 32)
+    #pragma omp parallel for collapse(2) if (dims[0] * dims[1] >= 16)
 #endif
     for (int batch = 0; batch < static_cast<int>(dims[0]); ++batch) {
         for (int channel = 0; channel < static_cast<int>(dims[1]); ++channel) {
diff --git a/include/aidge/backend/cpu/operator/MatMulImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MatMulImpl_kernels.hpp
index 422020a61507b819f53affacb0326a2d24735cf3..adcc8ddc26a379e3a310aa1ab405841f7964037d 100644
--- a/include/aidge/backend/cpu/operator/MatMulImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/MatMulImpl_kernels.hpp
@@ -27,7 +27,7 @@ void MatMulImpl_cpu_forward_kernel(const std::size_t n, const std::size_t k, con
     std::memset(output, O(0), n * m * sizeof(O));

 #ifdef _OPENMP
-    #pragma omp parallel for if (n > 32)
+    #pragma omp parallel for if (n >= 16)
 #endif
     for (int i = 0; i < static_cast<int>(n); ++i) {
         for (std::size_t l = 0; l < k; ++l) {
diff --git a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
index b5f219f9086f387f86769a582c2a2cd6aaa42d9f..7fe272d5d23d6484ed03c6183a1972035aa1b563 100644
--- a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
@@ -67,7 +67,7 @@ void MaxPoolingImpl2D_cpu_forward_kernel(
     using signedsize = std::make_signed<std::size_t>::type;

 #ifdef _OPENMP
-    #pragma omp parallel for collapse(2) if (dims[0] * dims[1] > 32)
+    #pragma omp parallel for collapse(2) if (dims[0] * dims[1] >= 16)
 #endif
     for (int batch = 0; batch < static_cast<int>(dims[0]); ++batch){
         for (int channel = 0; channel < static_cast<int>(dims[1]); ++channel){
diff --git a/include/aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp b/include/aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp
index ab6790e257b04a2fb1ee3d3ed57c5c7220c6c456..0e72710cac4004876e8026ccdfbc38cb7c2618eb 100644
--- a/include/aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp
@@ -38,7 +38,7 @@ void SoftmaxImpl_cpu_forward_kernel(std::size_t axisIdx, const std::vector<DimSi
     }

 #ifdef _OPENMP
-    #pragma omp parallel for collapse(2) if (preAxisElems * postAxisElems > 32)
+    #pragma omp parallel for collapse(2) if (preAxisElems * postAxisElems >= 16)
 #endif
     for (int i = 0; i < static_cast<int>(preAxisElems); ++i) {
         for (int j = 0; j < static_cast<int>(postAxisElems); ++j) {
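
Every hunk above applies the same tuning: the OpenMP `if` clause gating the collapsed batch/channel loop nest is lowered from "more than 32" to "at least 16" work items, so mid-sized workloads become eligible for parallel execution while very small ones still avoid thread start-up overhead. Below is a minimal standalone sketch of the pattern, not taken from the Aidge headers; the function, variable names, and workload are illustrative only.

// Sketch of conditional OpenMP parallelisation over a batch x channel nest.
// Compile with -fopenmp to enable the pragma; without OpenMP the code is
// still valid and simply runs sequentially.
#include <cstddef>
#include <vector>

// Scales every element of a [nbBatch, nbChannels, channelSize] buffer.
// Assumes data.size() >= nbBatch * nbChannels * channelSize.
void scale_channels(std::vector<float>& data,
                    std::size_t nbBatch, std::size_t nbChannels,
                    std::size_t channelSize, float factor) {
#ifdef _OPENMP
    // The `if` clause is evaluated at runtime: the two collapsed outer loops
    // are only distributed across threads when they cover at least 16
    // (batch, channel) iterations; otherwise the loop stays sequential.
    #pragma omp parallel for collapse(2) if (nbBatch * nbChannels >= 16)
#endif
    for (int batch = 0; batch < static_cast<int>(nbBatch); ++batch) {
        for (int ch = 0; ch < static_cast<int>(nbChannels); ++ch) {
            const std::size_t offset =
                (static_cast<std::size_t>(batch) * nbChannels + ch) * channelSize;
            for (std::size_t i = 0; i < channelSize; ++i) {
                data[offset + i] *= factor;
            }
        }
    }
}

With the previous threshold (`> 32`), a nest of 1 batch x 16 channels always ran sequentially; with `>= 16` it can now be parallelised, while a nest smaller than 16 iterations still skips the cost of spawning a thread team.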