Skip to content
Snippets Groups Projects
Commit c1849ef2 authored by Olivier BICHLER
Browse files

Reduced OpenMP thresholds

parent 77cd4574
No related branches found
No related tags found
2 merge requests: !166 "Update 0.5.0 -> 0.6.0", !158 "Added OpenMP"
...@@ -77,7 +77,7 @@ void AvgPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideD ...@@ -77,7 +77,7 @@ void AvgPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideD
using signedsize = std::make_signed<std::size_t>::type; using signedsize = std::make_signed<std::size_t>::type;
#ifdef _OPENMP #ifdef _OPENMP
#pragma omp parallel for collapse(2) if (dims[0] * dims[1] > 32) #pragma omp parallel for collapse(2) if (dims[0] * dims[1] >= 16)
#endif #endif
for (int batch = 0; batch < static_cast<int>(dims[0]); ++batch) { for (int batch = 0; batch < static_cast<int>(dims[0]); ++batch) {
for (int ch = 0; ch < static_cast<int>(dims[1]); ++ch) { for (int ch = 0; ch < static_cast<int>(dims[1]); ++ch) {
......
...@@ -54,7 +54,7 @@ void BatchNormImpl2D_cpu_forward_kernel(float epsilon, float momentum, const std ...@@ -54,7 +54,7 @@ void BatchNormImpl2D_cpu_forward_kernel(float epsilon, float momentum, const std
if ((freeze == true) || (momentum == 0.0f)) { if ((freeze == true) || (momentum == 0.0f)) {
#ifdef _OPENMP #ifdef _OPENMP
#pragma omp parallel for collapse(2) if (nbBatch * nbChannels > 32) #pragma omp parallel for collapse(2) if (nbBatch * nbChannels >= 16)
#endif #endif
for (int batch = 0; batch < static_cast<int>(nbBatch); ++batch) { for (int batch = 0; batch < static_cast<int>(nbBatch); ++batch) {
for (int ch = 0; ch < static_cast<int>(nbChannels); ++ch) { for (int ch = 0; ch < static_cast<int>(nbChannels); ++ch) {
......
...@@ -66,7 +66,7 @@ void ConvDepthWiseImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& stri ...@@ -66,7 +66,7 @@ void ConvDepthWiseImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& stri
// does not take Dilation attribute into account // does not take Dilation attribute into account
using signedsize = std::make_signed<std::size_t>::type; using signedsize = std::make_signed<std::size_t>::type;
#ifdef _OPENMP #ifdef _OPENMP
#pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] > 32) #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] >= 16)
#endif #endif
for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) { for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
for (int ch = 0; ch < static_cast<int>(inputDims[1]); ++ch) { for (int ch = 0; ch < static_cast<int>(inputDims[1]); ++ch) {
...@@ -156,7 +156,7 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri ...@@ -156,7 +156,7 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
if (dilated_kernel_x ==3 && dilated_kernel_y == 3) { if (dilated_kernel_x ==3 && dilated_kernel_y == 3) {
#ifdef _OPENMP #ifdef _OPENMP
#pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] > 32) #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] >= 16)
#endif #endif
for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) { for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
for (int ch = 0; ch < static_cast<int>(inputDims[1]); ++ch) { for (int ch = 0; ch < static_cast<int>(inputDims[1]); ++ch) {
...@@ -199,7 +199,7 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri ...@@ -199,7 +199,7 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
} }
} else if (dilated_kernel_x == 1 && dilated_kernel_y == 1) { } else if (dilated_kernel_x == 1 && dilated_kernel_y == 1) {
#ifdef _OPENMP #ifdef _OPENMP
#pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] > 32) #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] >= 16)
#endif #endif
for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) { for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
for (int ch = 0; ch < static_cast<int>(inputDims[1]); ++ch) { for (int ch = 0; ch < static_cast<int>(inputDims[1]); ++ch) {
...@@ -224,7 +224,7 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri ...@@ -224,7 +224,7 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
} }
} else { } else {
#ifdef _OPENMP #ifdef _OPENMP
#pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] > 32) #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] >= 16)
#endif #endif
for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) { for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
for (int ch = 0; ch < static_cast<int>(inputDims[1]); ++ch) { for (int ch = 0; ch < static_cast<int>(inputDims[1]); ++ch) {
......
...@@ -60,7 +60,7 @@ void ConvImpl1D_cpu_forward_kernel(const array<DimSize_t, 1> &strideDim, ...@@ -60,7 +60,7 @@ void ConvImpl1D_cpu_forward_kernel(const array<DimSize_t, 1> &strideDim,
using signedsize = std::make_signed<std::size_t>::type; using signedsize = std::make_signed<std::size_t>::type;
#ifdef _OPENMP #ifdef _OPENMP
#pragma omp parallel for collapse(2) if (inputDims[0] * outChannels > 32) #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels >= 16)
#endif #endif
for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) { for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
for (int outCh = 0; outCh < static_cast<int>(outChannels); ++outCh) { for (int outCh = 0; outCh < static_cast<int>(outChannels); ++outCh) {
...@@ -482,7 +482,7 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims, ...@@ -482,7 +482,7 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
if (dilated_kernel_x == 3 && dilated_kernel_y == 3) { if (dilated_kernel_x == 3 && dilated_kernel_y == 3) {
#ifdef _OPENMP #ifdef _OPENMP
#pragma omp parallel for collapse(2) if (inputDims[0] * outChannels > 32) #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels >= 16)
#endif #endif
for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) { for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
for (int outCh = 0; outCh < static_cast<int>(outChannels); ++outCh) { for (int outCh = 0; outCh < static_cast<int>(outChannels); ++outCh) {
...@@ -571,7 +571,7 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims, ...@@ -571,7 +571,7 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
} }
} else if (dilated_kernel_x == 1 && dilated_kernel_y == 1) { } else if (dilated_kernel_x == 1 && dilated_kernel_y == 1) {
#ifdef _OPENMP #ifdef _OPENMP
#pragma omp parallel for collapse(2) if (inputDims[0] * outChannels > 32) #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels >= 16)
#endif #endif
for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) { for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
for (int outCh = 0; outCh < static_cast<int>(outChannels); ++outCh) { for (int outCh = 0; outCh < static_cast<int>(outChannels); ++outCh) {
...@@ -607,7 +607,7 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims, ...@@ -607,7 +607,7 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
} }
} else { } else {
#ifdef _OPENMP #ifdef _OPENMP
#pragma omp parallel for collapse(2) if (inputDims[0] * outChannels > 32) #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels >= 16)
#endif #endif
for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) { for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
for (int outCh = 0; outCh < static_cast<int>(outChannels); ++outCh) { for (int outCh = 0; outCh < static_cast<int>(outChannels); ++outCh) {
......
...@@ -97,7 +97,7 @@ void FCImpl_cpu_forward_kernel(const DimSize_t batchSize, ...@@ -97,7 +97,7 @@ void FCImpl_cpu_forward_kernel(const DimSize_t batchSize,
O* output = static_cast<O*>(output_); O* output = static_cast<O*>(output_);
#ifdef _OPENMP #ifdef _OPENMP
#pragma omp parallel for collapse(2) if (batchSize * outputFeatureSize > 32) #pragma omp parallel for collapse(2) if (batchSize * outputFeatureSize >= 16)
#endif #endif
for (int batch = 0; batch < static_cast<int>(batchSize); ++batch) { for (int batch = 0; batch < static_cast<int>(batchSize); ++batch) {
for (int out = 0; out < static_cast<int>(outputFeatureSize); ++out) { for (int out = 0; out < static_cast<int>(outputFeatureSize); ++out) {
......
...@@ -75,7 +75,7 @@ void GlobalAveragePoolingImpl_cpu_forward_kernel(const std::shared_ptr<Tensor>& ...@@ -75,7 +75,7 @@ void GlobalAveragePoolingImpl_cpu_forward_kernel(const std::shared_ptr<Tensor>&
// parse channel by channel and fill each output with the average of the // parse channel by channel and fill each output with the average of the
// values in the channel // values in the channel
#ifdef _OPENMP #ifdef _OPENMP
#pragma omp parallel for collapse(2) if (dims[0] * dims[1] > 32) #pragma omp parallel for collapse(2) if (dims[0] * dims[1] >= 16)
#endif #endif
for (int batch = 0; batch < static_cast<int>(dims[0]); ++batch) { for (int batch = 0; batch < static_cast<int>(dims[0]); ++batch) {
for (int channel = 0; channel < static_cast<int>(dims[1]); ++channel) { for (int channel = 0; channel < static_cast<int>(dims[1]); ++channel) {
......
...@@ -27,7 +27,7 @@ void MatMulImpl_cpu_forward_kernel(const std::size_t n, const std::size_t k, con ...@@ -27,7 +27,7 @@ void MatMulImpl_cpu_forward_kernel(const std::size_t n, const std::size_t k, con
std::memset(output, O(0), n * m * sizeof(O)); std::memset(output, O(0), n * m * sizeof(O));
#ifdef _OPENMP #ifdef _OPENMP
#pragma omp parallel for if (n > 32) #pragma omp parallel for if (n >= 16)
#endif #endif
for (int i = 0; i < static_cast<int>(n); ++i) { for (int i = 0; i < static_cast<int>(n); ++i) {
for (std::size_t l = 0; l < k; ++l) { for (std::size_t l = 0; l < k; ++l) {
......
...@@ -67,7 +67,7 @@ void MaxPoolingImpl2D_cpu_forward_kernel( ...@@ -67,7 +67,7 @@ void MaxPoolingImpl2D_cpu_forward_kernel(
using signedsize = std::make_signed<std::size_t>::type; using signedsize = std::make_signed<std::size_t>::type;
#ifdef _OPENMP #ifdef _OPENMP
#pragma omp parallel for collapse(2) if (dims[0] * dims[1] > 32) #pragma omp parallel for collapse(2) if (dims[0] * dims[1] >= 16)
#endif #endif
for (int batch = 0; batch < static_cast<int>(dims[0]); ++batch){ for (int batch = 0; batch < static_cast<int>(dims[0]); ++batch){
for (int channel = 0; channel < static_cast<int>(dims[1]); ++channel){ for (int channel = 0; channel < static_cast<int>(dims[1]); ++channel){
......
...@@ -38,7 +38,7 @@ void SoftmaxImpl_cpu_forward_kernel(std::size_t axisIdx, const std::vector<DimSi ...@@ -38,7 +38,7 @@ void SoftmaxImpl_cpu_forward_kernel(std::size_t axisIdx, const std::vector<DimSi
} }
#ifdef _OPENMP #ifdef _OPENMP
#pragma omp parallel for collapse(2) if (preAxisElems * postAxisElems > 32) #pragma omp parallel for collapse(2) if (preAxisElems * postAxisElems >= 16)
#endif #endif
for (int i = 0; i < static_cast<int>(preAxisElems); ++i) { for (int i = 0; i < static_cast<int>(preAxisElems); ++i) {
for (int j = 0; j < static_cast<int>(postAxisElems); ++j) { for (int j = 0; j < static_cast<int>(postAxisElems); ++j) {
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment