diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6c87a89b8ac1254f8bfb8fb990f8c03f7e593d61..d2c1d0a76053c3885aa2fdeb51383a5303c2c3db 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -64,6 +64,8 @@ if(NOT $ENV{AIDGE_INSTALL} STREQUAL "")
 endif()

 find_package(aidge_core REQUIRED)
+find_package(OpenMP)
+
 find_package(OpenSSL QUIET)
 if(OpenSSL_FOUND)
     message(STATUS "OpenSSL found: ${OPENSSL_VERSION}")
@@ -86,6 +88,10 @@ target_link_libraries(${module_name}
         _aidge_core # _ is added because we link the exported target and not the project
 )

+if(OpenMP_CXX_FOUND)
+    target_link_libraries(${module_name} PUBLIC OpenMP::OpenMP_CXX)
+endif()
+
 # Add definition _USE_MATH_DEFINES to enable math constant definitions from math.h/cmath.
 if (WIN32)
     target_compile_definitions(${module_name} PRIVATE _USE_MATH_DEFINES)
diff --git a/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
index 1671759d25a5965ceca57fc1167534d7986282c4..0d73cb912a0b8218c29d1f533b674a8ea5005d26 100644
--- a/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
@@ -76,6 +76,9 @@ void AvgPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideD

     using signedsize = std::make_signed<std::size_t>::type;

+#ifdef _OPENMP
+    #pragma omp parallel for collapse(2) if (dims[0] * dims[1] > 32)
+#endif
     for (std::size_t batch = 0; batch < dims[0]; ++batch) {
         for (std::size_t ch = 0; ch < dims[1]; ++ch) {
             const std::size_t oIndex = (ch + batch * dims[1]) * oxSize * oySize;
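Every kernel hunk below applies the same guarded pragma, so the pattern is worth spelling out once. `find_package(OpenMP)` is deliberately not `REQUIRED`: when OpenMP is absent, the target is simply not linked against `OpenMP::OpenMP_CXX`, the compiler never defines `_OPENMP`, and the `#ifdef` strips the pragma so the loop nest builds and runs serially as before. A minimal standalone sketch of the pattern (the function and variable names here are illustrative, not taken from the kernels):

```cpp
#include <cstddef>
#include <vector>

// Sketch of the guarded-pragma pattern used by every kernel in this patch.
// data must hold at least nbBatch * nbChannels * featureMapSize elements.
void scale_channels(std::vector<float>& data, std::size_t nbBatch,
                    std::size_t nbChannels, std::size_t featureMapSize) {
#ifdef _OPENMP
    // collapse(2) fuses the batch and channel loops into one iteration space
    // of nbBatch*nbChannels chunks; the if() clause keeps execution serial
    // at runtime when the workload is too small to amortize thread startup.
    #pragma omp parallel for collapse(2) if (nbBatch * nbChannels > 32)
#endif
    for (std::size_t batch = 0; batch < nbBatch; ++batch) {
        for (std::size_t ch = 0; ch < nbChannels; ++ch) {
            const std::size_t base = (ch + batch * nbChannels) * featureMapSize;
            for (std::size_t i = 0; i < featureMapSize; ++i) {
                data[base + i] *= 0.5f; // stand-in for the real kernel body
            }
        }
    }
}
```

`collapse(2)` is what lets batch-1 inference profit from threading: with a single batch the outer loop alone offers one unit of work, but the fused batch-by-channel space still exposes `nbChannels` independent chunks.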
diff --git a/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp b/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp
index cf97f7372ac528ef28d0f378beb2650af32bfa30..7bb7971e35916b5ecf9c59ad3fe55965f53eed91 100644
--- a/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp
@@ -53,6 +53,9 @@ void BatchNormImpl2D_cpu_forward_kernel(float epsilon, float momentum, const std
     const DimSize_t featureMapSize = (dims.size() > 2) ? std::accumulate(dims.begin() + 2, dims.end(), 1, std::multiplies<DimSize_t>()) : 1;

     if ((freeze == true) || (momentum == 0.0f)) {
+#ifdef _OPENMP
+        #pragma omp parallel for collapse(2) if (nbBatch * nbChannels > 32)
+#endif
         for (std::size_t batch = 0; batch < nbBatch; ++batch) {
             for (std::size_t ch = 0; ch < nbChannels; ++ch) {
                 const std::size_t ioIndex = (ch + batch*nbChannels) * featureMapSize;
diff --git a/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp
index 906ea1adf744353372c844fd3e16b9dbd13e7f7d..b16a819b4b42127dddda0659099018a494d06bc9 100644
--- a/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp
@@ -65,6 +65,9 @@ void ConvDepthWiseImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& stri
     // weight (outCh, ch, kernelX, kernelY)
     // does not take Dilation attribute into account
     using signedsize = std::make_signed<std::size_t>::type;
+#ifdef _OPENMP
+    #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] > 32)
+#endif
     for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
         for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
             const std::size_t oIndex = (ch + batch*inputDims[1]) * oxSize;
@@ -152,16 +155,20 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
     const std::size_t outChannels_s = oxSize * oySize;

     if (dilated_kernel_x ==3 && dilated_kernel_y == 3) {
+#ifdef _OPENMP
+        #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] > 32)
+#endif
         for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
             for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
                 B biasVal = (biases != nullptr) ? biases[ch] : B(0);
+                std::size_t oIndex = (ch + batch*inputDims[1]) * outChannels_s;
                 std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
                 const std::size_t wIndex = ch * 9;
                 if (strideDims[0] == 1 && strideDims[1]==1) {
-                    for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=inputDims[3]) {
+                    for (std::size_t ox = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=inputDims[3]) {
                         for (std::size_t oy = 0; oy < oySize; ++oy) {
                             output[oIndex + oy] = biasVal + weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+1]+weights[wIndex+2]*input[iIndex+oy+2];
                         }
@@ -175,7 +182,7 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
                         }
                     }
                 } else {
-                    for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=(strideDims[0]-2)*inputDims[3]) {
+                    for (std::size_t ox = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=(strideDims[0]-2)*inputDims[3]) {
                         for (std::size_t oy = 0; oy < oySize; ++oy) {
                             output[oIndex + oy] = biasVal + weights[wIndex+0]*input[iIndex+oy*strideDims[1]]+weights[wIndex+1]*input[iIndex+oy*strideDims[1]+1]+weights[wIndex+2]*input[iIndex+oy*strideDims[1]+2];
                         }
@@ -189,24 +196,26 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
                         }
                     }
                 }
-                output += outChannels_s;
             }
         }
     } else if (dilated_kernel_x == 1 && dilated_kernel_y == 1) {
+#ifdef _OPENMP
+        #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] > 32)
+#endif
         for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
             for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
                 B biasVal = (biases != nullptr) ? biases[ch] : B(0);
+                std::size_t oIndex = (ch + batch*inputDims[1]) * outChannels_s;
                 std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
                 const std::size_t wIndex = ch;
                 if (strideDims[0] == 1 && strideDims[1] == 1) {
-                    for (std::size_t i = iIndex; i < iIndex + oxSize*oySize; ++i) {
-                        output[i] = biasVal + weights[wIndex] * input[i];
+                    for (std::size_t i = 0; i < oxSize*oySize; ++i) {
+                        output[oIndex + i] = biasVal + weights[wIndex] * input[iIndex + i];
                     }
                 } else {
-                    std::size_t oIndex = (ch + batch*inputDims[1]) * oxSize * oySize;
                     for (std::size_t ox = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=strideDims[0]*inputDims[3]) {
                         for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) {
                             output[oIndex + oy] = biasVal + weights[wIndex]*input[iIndex+iy];
@@ -216,19 +225,22 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
             }
         }
     } else {
+#ifdef _OPENMP
+        #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] > 32)
+#endif
         for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
             for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
-
-                B biasVal = (biases != nullptr) ? biases[ch] : B(0);
-                std::fill(output, output+outChannels_s, biasVal);
-
+                const std::size_t oIndex = (ch + batch*inputDims[1]) * outChannels_s;
                 const std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
                 const std::size_t wIndex = ch * kernelDims[0] * kernelDims[1];
+                B biasVal = (biases != nullptr) ? biases[ch] : B(0);
+                std::fill(output + oIndex, output + oIndex + outChannels_s, biasVal);
+
                 for (std::size_t ox = 0; ox < oxSize; ++ox) {
                     for (std::size_t oy = 0; oy < oySize; ++oy) {
-                        const std::size_t oIndexFull = ox*oySize + oy;
+                        const std::size_t oIndexFull = oIndex + ox*oySize + oy;
                         const std::size_t ix = ox * strideDims[0];
                         const std::size_t iy = oy * strideDims[1];
@@ -240,7 +252,6 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
                         }
                     }
                 }
-                output += outChannels_s;
             }
         }
     }
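The interesting part of the two convolution files is not the pragma but the indexing rewrite that makes it legal. The old kernels advanced a running output cursor (`output += outChannels_s;` after every (batch, channel) iteration), a loop-carried dependency that forbids running iterations concurrently. The patch instead derives each iteration's base offset `oIndex` from the loop indices alone, so iterations become independent. A reduced before/after sketch (illustrative names, not the actual kernels):

```cpp
#include <cstddef>

// Before (sketch): a running cursor creates a loop-carried dependency,
// so this nest cannot be parallelized safely.
void fill_serial(float *output, std::size_t nbBatch, std::size_t nbCh,
                 std::size_t channelSize, float value) {
    for (std::size_t batch = 0; batch < nbBatch; ++batch) {
        for (std::size_t ch = 0; ch < nbCh; ++ch) {
            for (std::size_t i = 0; i < channelSize; ++i)
                output[i] = value;
            output += channelSize; // next iteration depends on this update
        }
    }
}

// After (sketch): the offset is a pure function of (batch, ch), so any
// iteration can run on any thread.
void fill_parallel(float *output, std::size_t nbBatch, std::size_t nbCh,
                   std::size_t channelSize, float value) {
#ifdef _OPENMP
    #pragma omp parallel for collapse(2) if (nbBatch * nbCh > 32)
#endif
    for (std::size_t batch = 0; batch < nbBatch; ++batch) {
        for (std::size_t ch = 0; ch < nbCh; ++ch) {
            const std::size_t oIndex = (ch + batch * nbCh) * channelSize;
            for (std::size_t i = 0; i < channelSize; ++i)
                output[oIndex + i] = value;
        }
    }
}
```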
diff --git a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
index 29aac6dc585e41d39f0d03b3035a5294848f8436..b1cd006eae2d0ed9a224b8bdbf5b267472795720 100644
--- a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
@@ -59,6 +59,9 @@ void ConvImpl1D_cpu_forward_kernel(const array<DimSize_t, 1> &strideDim,
     const DimSize_t dilated_kernel_x = dilationDim[0] * (kernelDim[0] - 1) + 1;

     using signedsize = std::make_signed<std::size_t>::type;
+#ifdef _OPENMP
+    #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels > 32)
+#endif
     for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
         for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
             const std::size_t oIndex = (outCh + batch * outChannels) * oxSize;
@@ -478,18 +481,24 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
     const std::size_t outChannels_s = oxSize * oySize;

     if (dilated_kernel_x == 3 && dilated_kernel_y == 3) {
+#ifdef _OPENMP
+        #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels > 32)
+#endif
         for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
             for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
+                std::size_t oIndex = (outCh + batch * outChannels) * outChannels_s;
+                // If bias = nullptr, set B(0)
                 B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
-                std::fill(output, output + outChannels_s, biasVal);
+                std::fill(output + oIndex, output + oIndex + outChannels_s, biasVal);
                 for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
+                    oIndex = (outCh + batch * outChannels) * outChannels_s;
                     std::size_t iIndex = (inCh + batch * inputDims[1]) *
                                          inputDims[2] * inputDims[3];
                     const std::size_t wIndex = (inCh + outCh * inputDims[1]) * 9;
                     if (strideDims[0] == 1 && strideDims[1] == 1) {
-                        for (std::size_t ox = 0, oIndex = 0; ox < oxSize;
+                        for (std::size_t ox = 0; ox < oxSize;
                              ++ox, oIndex += oySize, iIndex -= inputDims[3]) {
                             for (std::size_t oy = 0; oy < oySize; ++oy) {
                                 output[oIndex + oy] +=
@@ -519,7 +528,7 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
                             }
                         }
                     } else {
-                        for (std::size_t ox = 0, oIndex = 0; ox < oxSize;
+                        for (std::size_t ox = 0; ox < oxSize;
                              ++ox,
                                   oIndex += oySize,
                                   iIndex += (strideDims[0] - 2) * inputDims[3]) {
@@ -558,26 +567,30 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
                             }
                         }
                     }
-                output += outChannels_s;
             }
         }
     } else if (dilated_kernel_x == 1 && dilated_kernel_y == 1) {
+#ifdef _OPENMP
+        #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels > 32)
+#endif
         for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
             for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
+                std::size_t oIndex = (outCh + batch * outChannels) * outChannels_s;
+                // If bias = nullptr, set B(0)
                 B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
-                std::fill(output, output + outChannels_s, biasVal);
+                std::fill(output + oIndex, output + oIndex + outChannels_s, biasVal);
                 for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
+                    oIndex = (outCh + batch * outChannels) * outChannels_s;
                     std::size_t iIndex = (inCh + batch * inputDims[1]) *
                                          inputDims[2] * inputDims[3];
                     const std::size_t wIndex = (inCh + outCh * inputDims[1]);
                     if (strideDims[0] == 1 && strideDims[1] == 1) {
-                        for (std::size_t oIndex = 0; oIndex < oxSize * oySize;
-                             ++oIndex, ++iIndex) {
-                            output[oIndex] += weights[wIndex] * input[iIndex];
+                        for (std::size_t i = 0; i < outChannels_s; ++i) {
+                            output[oIndex + i] += weights[wIndex] * input[iIndex + i];
                         }
                     } else {
-                        for (std::size_t ox = 0, oIndex = 0; ox < oxSize;
+                        for (std::size_t ox = 0; ox < oxSize;
                              ++ox,
                              oIndex += oySize,
                              iIndex +=
@@ -590,16 +603,21 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
                         }
                     }
                 }
-                output += outChannels_s;
             }
         }
     } else {
+#ifdef _OPENMP
+        #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels > 32)
+#endif
         for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
             for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
+                std::size_t oIndex = (outCh + batch * outChannels) * outChannels_s;
+                // If bias = nullptr, set B(0)
                 B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
-                std::fill(output, output + outChannels_s, biasVal);
+                std::fill(output + oIndex, output + oIndex + outChannels_s, biasVal);
                 for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
+                    oIndex = (outCh + batch * outChannels) * outChannels_s;
                     std::size_t iIndex_channel = (inCh + batch * inputDims[1]) *
                                                  inputDims[2] * inputDims[3];
@@ -607,7 +625,7 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
                     const std::size_t wIndex = (inCh + outCh * inputDims[1]) *
                                                kernelDims[0] * kernelDims[1];

                     // loop over each ouput line
-                    for (std::size_t ox = 0, oIndex = 0; ox < oxSize;
+                    for (std::size_t ox = 0; ox < oxSize;
                          ++ox,
                          oIndex += oySize,
                          iIndex_channel +=
@@ -633,7 +651,6 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
                         }
                     }
                 }
             }
-            output += outChannels_s;
         }
     }
diff --git a/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp
index cbe4f110fc74f387625132c4f0872123814c1a62..3915adb3a0fcfd8cef0bf78761b2169272e1c211 100644
--- a/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp
@@ -63,18 +63,25 @@ void GlobalAveragePoolingImpl_cpu_forward_kernel(const std::shared_ptr<Tensor>&
     using O = cpptype_t<DT_O>;
     const I *input = static_cast<const I *>(inputTensor->getImpl()->rawPtr());
     O *output = static_cast<O *>(output_);

-    const auto& dims = inputTensor->dims();
-    const DimSize_t strides_channels = inputTensor->strides()[1];
+    const auto& dims = inputTensor->dims();
+    DimSize_t nb_elems = std::accumulate(dims.begin(), dims.end(), std::size_t(1),
+                                         std::multiplies<std::size_t>());
+
+    const DimSize_t in_batch_nb_elems{nb_elems / dims[0]};
+    const DimSize_t in_channel_nb_elems{in_batch_nb_elems / dims[1]};
+    const DimSize_t out_batch_nb_elems{dims[1]};

     // parse channel by channel and fill each output with the average of the
     // values in the channel
-    std::size_t input_idx = 0;
-    std::size_t output_idx = 0;
+#ifdef _OPENMP
+    #pragma omp parallel for collapse(2) if (dims[0] * dims[1] > 32)
+#endif
     for (DimSize_t batch = 0; batch < dims[0]; ++batch) {
         for (DimSize_t channel = 0; channel < dims[1]; ++channel) {
-            output[output_idx++] = castFromFloat<O>(stableMean<I>(input + input_idx, strides_channels));
-            input_idx += strides_channels;
+            const I *filter_start = std::next(
+                input, (batch * in_batch_nb_elems) + (channel * in_channel_nb_elems));
+            output[batch * out_batch_nb_elems + channel] = castFromFloat<O>(stableMean<I>(filter_start, in_channel_nb_elems));
         }
     }
 }
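The GlobalAveragePooling rewrite follows the same recipe: the running `input_idx`/`output_idx` cursors become offsets computed from `(batch, channel)`, and the per-channel element counts are derived once from `dims` instead of the tensor's strides. Reduced to a standalone sketch over a contiguous NCHW buffer (a plain accumulation stands in for the repository's `stableMean`; names are illustrative):

```cpp
#include <cstddef>
#include <vector>

// Global average pooling over an NCHW buffer: one output value per
// (batch, channel), each computed independently, hence the collapse(2).
std::vector<float> global_avg_pool(const std::vector<float>& input,
                                   std::size_t nbBatch, std::size_t nbCh,
                                   std::size_t spatialSize) {
    std::vector<float> output(nbBatch * nbCh);
#ifdef _OPENMP
    #pragma omp parallel for collapse(2) if (nbBatch * nbCh > 32)
#endif
    for (std::size_t batch = 0; batch < nbBatch; ++batch) {
        for (std::size_t ch = 0; ch < nbCh; ++ch) {
            const float *start = input.data() + (batch * nbCh + ch) * spatialSize;
            float sum = 0.f; // plain mean here; the real kernel uses stableMean
            for (std::size_t i = 0; i < spatialSize; ++i)
                sum += start[i];
            output[batch * nbCh + ch] = sum / static_cast<float>(spatialSize);
        }
    }
    return output;
}
```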
diff --git a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
index 9a52c1491d1fb16302779b799d10c8286086a3c2..9772b0abd2dfd0e5a6b2e11a856f734592e478db 100644
--- a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
@@ -66,6 +66,9 @@ void MaxPoolingImpl2D_cpu_forward_kernel(

     using signedsize = std::make_signed<std::size_t>::type;

+#ifdef _OPENMP
+    #pragma omp parallel for collapse(2) if (dims[0] * dims[1] > 32)
+#endif
     for (std::size_t batch = 0; batch < dims[0]; ++batch){
         for (std::size_t channel = 0; channel < dims[1]; ++channel){
             auto batchChannelIndex = (channel + batch * dims[1]);
diff --git a/include/aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp b/include/aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp
index 07486a48f1b8cf29f6a6ef8aa934a9decdbafef7..e74f3518bf0f394d17d89542b2a2221047beb0af 100644
--- a/include/aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp
@@ -37,6 +37,9 @@ void SoftmaxImpl_cpu_forward_kernel(std::size_t axisIdx, const std::vector<DimSi
         preAxisElems *= inputDims[i];
     }

+#ifdef _OPENMP
+    #pragma omp parallel for collapse(2) if (preAxisElems * postAxisElems > 32)
+#endif
     for (std::size_t i = 0; i < preAxisElems; ++i) {
         for (std::size_t j = 0; j < postAxisElems; ++j) {
             I maxVal = input[i * inputDims[axisIdx] * postAxisElems + j];
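A closing note on the `if (... > 32)` clauses: OpenMP's `if` clause turns parallelization into a runtime decision, so small tensors keep running serially and avoid thread-team startup costs; the threshold of 32 batch-times-channel work items is a heuristic, not a measured optimum. A tiny standalone demo of the semantics (the `work` value is illustrative):

```cpp
#include <cstddef>
#include <cstdio>
#ifdef _OPENMP
#include <omp.h>
#endif

int main() {
    const std::size_t work = 8; // below the patch's >32 threshold
#ifdef _OPENMP
    // The if() clause is evaluated at runtime: false here, so OpenMP runs
    // the loop with a team of one thread even in an OpenMP-enabled build.
    #pragma omp parallel for if (work > 32)
#endif
    for (std::size_t i = 0; i < work; ++i) {
#ifdef _OPENMP
        std::printf("i=%zu, team size=%d\n", i, omp_get_num_threads());
#else
        std::printf("i=%zu (serial build)\n", i);
#endif
    }
    return 0;
}
```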