Commit b57e889e authored by Olivier BICHLER

Added OpenMP

parent 84e1e2ad
2 merge requests: !166 Update 0.5.0 -> 0.6.0, !158 Added OpenMP
@@ -64,6 +64,8 @@ if(NOT $ENV{AIDGE_INSTALL} STREQUAL "")
 endif()
 find_package(aidge_core REQUIRED)
+find_package(OpenMP)
 find_package(OpenSSL QUIET)
 if(OpenSSL_FOUND)
     message(STATUS "OpenSSL found: ${OPENSSL_VERSION}")
@@ -86,6 +88,10 @@ target_link_libraries(${module_name}
     _aidge_core # _ is added because we link the exported target and not the project
 )
+if(OpenMP_CXX_FOUND)
+    target_link_libraries(${module_name} PUBLIC OpenMP::OpenMP_CXX)
+endif()
 # Add definition _USE_MATH_DEFINES to enable math constant definitions from math.h/cmath.
 if (WIN32)
     target_compile_definitions(${module_name} PRIVATE _USE_MATH_DEFINES)
...
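Because find_package(OpenMP) is called without REQUIRED and the OpenMP::OpenMP_CXX link is wrapped in if(OpenMP_CXX_FOUND), the backend still builds when no OpenMP runtime is available and the kernels below simply fall back to their sequential loops. As a rough illustration (not part of this commit; the program is hypothetical), one can check at runtime whether a translation unit was compiled with OpenMP enabled:

    #include <cstdio>

    #ifdef _OPENMP
    #include <omp.h>
    #endif

    int main() {
        // _OPENMP is defined by the compiler whenever the OpenMP flags
        // propagated by linking against OpenMP::OpenMP_CXX are active.
    #ifdef _OPENMP
        std::printf("OpenMP enabled, up to %d threads\n", omp_get_max_threads());
    #else
        std::printf("OpenMP disabled, kernels run sequentially\n");
    #endif
        return 0;
    }

When OpenMP is active, the thread count used by the parallel regions can be capped with the standard OMP_NUM_THREADS environment variable.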
@@ -76,6 +76,9 @@ void AvgPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideD
     using signedsize = std::make_signed<std::size_t>::type;
+#ifdef _OPENMP
+#pragma omp parallel for collapse(2) if (dims[0] * dims[1] > 32)
+#endif
     for (std::size_t batch = 0; batch < dims[0]; ++batch) {
         for (std::size_t ch = 0; ch < dims[1]; ++ch) {
             const std::size_t oIndex = (ch + batch * dims[1]) * oxSize * oySize;
...
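Every kernel in this commit is parallelized with the same idiom: the outer batch/channel loop nest gets a guarded `#pragma omp parallel for collapse(2)` with an `if` clause that keeps small workloads sequential. A minimal, self-contained sketch of that idiom (illustrative only; the function and its dimensions are made up):

    #include <cstddef>
    #include <vector>

    // collapse(2) fuses the perfectly nested batch/channel loops into one
    // iteration space, and the if() clause keeps small workloads sequential
    // so thread start-up costs are not paid for tiny tensors.
    void scale_per_channel(std::vector<float>& data, std::size_t nbBatch,
                           std::size_t nbChannels, std::size_t featureMapSize) {
    #ifdef _OPENMP
    #pragma omp parallel for collapse(2) if (nbBatch * nbChannels > 32)
    #endif
        for (std::size_t batch = 0; batch < nbBatch; ++batch) {
            for (std::size_t ch = 0; ch < nbChannels; ++ch) {
                // Each (batch, ch) pair touches a disjoint slice, so the
                // fused iterations are independent of one another.
                const std::size_t offset = (ch + batch * nbChannels) * featureMapSize;
                for (std::size_t i = 0; i < featureMapSize; ++i) {
                    data[offset + i] *= 0.5f;
                }
            }
        }
    }

The `#ifdef _OPENMP` guard keeps builds without OpenMP free of unknown-pragma warnings; the `> 32` threshold used throughout the commit is a heuristic cut-off, not a tuned constant.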
@@ -53,6 +53,9 @@ void BatchNormImpl2D_cpu_forward_kernel(float epsilon, float momentum, const std
     const DimSize_t featureMapSize = (dims.size() > 2) ? std::accumulate(dims.begin() + 2, dims.end(), 1, std::multiplies<DimSize_t>()) : 1;
     if ((freeze == true) || (momentum == 0.0f)) {
+#ifdef _OPENMP
+#pragma omp parallel for collapse(2) if (nbBatch * nbChannels > 32)
+#endif
         for (std::size_t batch = 0; batch < nbBatch; ++batch) {
             for (std::size_t ch = 0; ch < nbChannels; ++ch) {
                 const std::size_t ioIndex = (ch + batch*nbChannels) * featureMapSize;
...
@@ -65,6 +65,9 @@ void ConvDepthWiseImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& stri
     // weight (outCh, ch, kernelX, kernelY)
     // does not take Dilation attribute into account
     using signedsize = std::make_signed<std::size_t>::type;
+#ifdef _OPENMP
+#pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] > 32)
+#endif
     for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
         for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
             const std::size_t oIndex = (ch + batch*inputDims[1]) * oxSize;
@@ -152,16 +155,19 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
     const std::size_t outChannels_s = oxSize * oySize;
     if (dilated_kernel_x ==3 && dilated_kernel_y == 3) {
+#ifdef _OPENMP
+#pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] > 32)
+#endif
         for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
             for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
                 B biasVal = (biases != nullptr) ? biases[ch] : B(0);
+                std::size_t oIndex = (ch + batch*inputDims[1]) * outChannels_s;
                 std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
                 const std::size_t wIndex = ch * 9;
                 if (strideDims[0] == 1 && strideDims[1]==1) {
-                    for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=inputDims[3]) {
+                    for (std::size_t ox = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=inputDims[3]) {
                         for (std::size_t oy = 0; oy < oySize; ++oy) {
                             output[oIndex + oy] = biasVal + weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+1]+weights[wIndex+2]*input[iIndex+oy+2];
                         }
@@ -175,7 +181,7 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
                     }
                 }
                 } else {
-                    for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=(strideDims[0]-2)*inputDims[3]) {
+                    for (std::size_t ox = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=(strideDims[0]-2)*inputDims[3]) {
                         for (std::size_t oy = 0; oy < oySize; ++oy) {
                             output[oIndex + oy] = biasVal + weights[wIndex+0]*input[iIndex+oy*strideDims[1]]+weights[wIndex+1]*input[iIndex+oy*strideDims[1]+1]+weights[wIndex+2]*input[iIndex+oy*strideDims[1]+2];
                         }
@@ -189,24 +195,25 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
                     }
                 }
             }
-            output += outChannels_s;
         }
     }
 } else if (dilated_kernel_x == 1 && dilated_kernel_y == 1) {
+#ifdef _OPENMP
+#pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] > 32)
+#endif
     for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
         for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
             B biasVal = (biases != nullptr) ? biases[ch] : B(0);
+            std::size_t oIndex = (ch + batch*inputDims[1]) * outChannels_s;
             std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
             const std::size_t wIndex = ch;
             if (strideDims[0] == 1 && strideDims[1] == 1) {
                 for (std::size_t i = iIndex; i < iIndex + oxSize*oySize; ++i) {
-                    output[i] = biasVal + weights[wIndex] * input[i];
+                    output[oIndex + i] = biasVal + weights[wIndex] * input[i];
                 }
             } else {
-                std::size_t oIndex = (ch + batch*inputDims[1]) * oxSize * oySize;
                 for (std::size_t ox = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=strideDims[0]*inputDims[3]) {
                     for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) {
                         output[oIndex + oy] = biasVal + weights[wIndex]*input[iIndex+iy];
@@ -216,19 +223,22 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
                 }
             }
         } else {
+#ifdef _OPENMP
+#pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] > 32)
+#endif
             for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
                 for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
-                    B biasVal = (biases != nullptr) ? biases[ch] : B(0);
-                    std::fill(output, output+outChannels_s, biasVal);
+                    const std::size_t oIndex = (ch + batch*inputDims[1]) * outChannels_s;
                     const std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
                     const std::size_t wIndex = ch * kernelDims[0] * kernelDims[1];
+                    B biasVal = (biases != nullptr) ? biases[ch] : B(0);
+                    std::fill(output + oIndex, output + oIndex + outChannels_s, biasVal);
                     for (std::size_t ox = 0; ox < oxSize; ++ox) {
                         for (std::size_t oy = 0; oy < oySize; ++oy) {
-                            const std::size_t oIndexFull = ox*oySize + oy;
+                            const std::size_t oIndexFull = oIndex + ox*oySize + oy;
                             const std::size_t ix = ox * strideDims[0];
                             const std::size_t iy = oy * strideDims[1];
@@ -240,7 +250,6 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
                         }
                     }
                 }
-                output += outChannels_s;
             }
         }
     }
...
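Besides adding the pragmas, these convolution hunks rework the indexing: the running `output += outChannels_s;` pointer bump is removed and each (batch, ch) iteration now derives its own `oIndex` offset. That is what makes the loop nest legal to parallelize, because a pointer advanced once per iteration is a loop-carried dependency. A contrived before/after sketch (hypothetical names and buffers, not the Aidge kernels themselves):

    #include <cstddef>

    // Sequential-only style: 'out' is advanced every (batch, ch) iteration, so
    // the pointer seen by one iteration depends on all previous ones. Under
    // "#pragma omp parallel for collapse(2)" this would be a data race.
    void fill_running_pointer(float* out, std::size_t nbBatch, std::size_t nbCh,
                              std::size_t planeSize, float value) {
        for (std::size_t batch = 0; batch < nbBatch; ++batch) {
            for (std::size_t ch = 0; ch < nbCh; ++ch) {
                for (std::size_t i = 0; i < planeSize; ++i) {
                    out[i] = value;
                }
                out += planeSize; // loop-carried dependency
            }
        }
    }

    // Parallel-friendly style used by this commit: every iteration derives its
    // own offset from (batch, ch), so iterations are independent.
    void fill_indexed(float* out, std::size_t nbBatch, std::size_t nbCh,
                      std::size_t planeSize, float value) {
    #ifdef _OPENMP
    #pragma omp parallel for collapse(2) if (nbBatch * nbCh > 32)
    #endif
        for (std::size_t batch = 0; batch < nbBatch; ++batch) {
            for (std::size_t ch = 0; ch < nbCh; ++ch) {
                const std::size_t oIndex = (ch + batch * nbCh) * planeSize;
                for (std::size_t i = 0; i < planeSize; ++i) {
                    out[oIndex + i] = value;
                }
            }
        }
    }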
@@ -59,6 +59,9 @@ void ConvImpl1D_cpu_forward_kernel(const array<DimSize_t, 1> &strideDim,
     const DimSize_t dilated_kernel_x = dilationDim[0] * (kernelDim[0] - 1) + 1;
     using signedsize = std::make_signed<std::size_t>::type;
+#ifdef _OPENMP
+#pragma omp parallel for collapse(2) if (inputDims[0] * outChannels > 32)
+#endif
     for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
         for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
             const std::size_t oIndex = (outCh + batch * outChannels) * oxSize;
@@ -478,18 +481,24 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
     const std::size_t outChannels_s = oxSize * oySize;
     if (dilated_kernel_x == 3 && dilated_kernel_y == 3) {
+#ifdef _OPENMP
+#pragma omp parallel for collapse(2) if (inputDims[0] * outChannels > 32)
+#endif
         for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
             for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
+                std::size_t oIndex = (outCh + batch*inputDims[1]) * outChannels_s;
                 // If bias = nullptr, set B(0)
                 B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
-                std::fill(output, output + outChannels_s, biasVal);
+                std::fill(output + oIndex, output + oIndex + outChannels_s, biasVal);
                 for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
+                    oIndex = (outCh + batch*inputDims[1]) * outChannels_s;
                     std::size_t iIndex = (inCh + batch * inputDims[1]) *
                                          inputDims[2] * inputDims[3];
                     const std::size_t wIndex =
                         (inCh + outCh * inputDims[1]) * 9;
                     if (strideDims[0] == 1 && strideDims[1] == 1) {
-                        for (std::size_t ox = 0, oIndex = 0; ox < oxSize;
+                        for (std::size_t ox = 0; ox < oxSize;
                              ++ox, oIndex += oySize, iIndex -= inputDims[3]) {
                             for (std::size_t oy = 0; oy < oySize; ++oy) {
                                 output[oIndex + oy] +=
@@ -519,7 +528,7 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
                             }
                         }
                     } else {
-                        for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox,
+                        for (std::size_t ox = 0; ox < oxSize; ++ox,
                              oIndex += oySize,
                              iIndex += (strideDims[0] -
                                         2) * inputDims[3]) {
@@ -558,26 +567,30 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
                     }
                 }
             }
-            output += outChannels_s;
         }
     }
 } else if (dilated_kernel_x == 1 && dilated_kernel_y == 1) {
+#ifdef _OPENMP
+#pragma omp parallel for collapse(2) if (inputDims[0] * outChannels > 32)
+#endif
     for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
         for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
+            std::size_t oIndex = (outCh + batch*inputDims[1]) * outChannels_s;
             // If bias = nullptr, set B(0)
             B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
-            std::fill(output, output + outChannels_s, biasVal);
+            std::fill(output + oIndex, output + oIndex + outChannels_s, biasVal);
             for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
+                oIndex = (outCh + batch*inputDims[1]) * outChannels_s;
                 std::size_t iIndex = (inCh + batch * inputDims[1]) *
                                      inputDims[2] * inputDims[3];
                 const std::size_t wIndex = (inCh + outCh * inputDims[1]);
                 if (strideDims[0] == 1 && strideDims[1] == 1) {
-                    for (std::size_t oIndex = 0; oIndex < oxSize * oySize;
-                         ++oIndex, ++iIndex) {
-                        output[oIndex] += weights[wIndex] * input[iIndex];
+                    for (std::size_t i = 0; i < outChannels_s; ++i) {
+                        output[oIndex + i] += weights[wIndex] * input[iIndex + i];
                     }
                 } else {
-                    for (std::size_t ox = 0, oIndex = 0; ox < oxSize;
+                    for (std::size_t ox = 0; ox < oxSize;
                          ++ox,
                          oIndex += oySize,
                          iIndex +=
@@ -590,16 +603,21 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
                     }
                 }
             }
-            output += outChannels_s;
         }
     }
 } else {
+#ifdef _OPENMP
+#pragma omp parallel for collapse(2) if (inputDims[0] * outChannels > 32)
+#endif
     for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
         for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
+            std::size_t oIndex = (outCh + batch*inputDims[1]) * outChannels_s;
            // If bias = nullptr, set B(0)
             B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
-            std::fill(output, output + outChannels_s, biasVal);
+            std::fill(output + oIndex, output + oIndex + outChannels_s, biasVal);
             for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
+                oIndex = (outCh + batch*inputDims[1]) * outChannels_s;
                 std::size_t iIndex_channel =
                     (inCh + batch * inputDims[1]) * inputDims[2] *
                     inputDims[3];
@@ -607,7 +625,7 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
                     kernelDims[0] * kernelDims[1];
                 // loop over each ouput line
-                for (std::size_t ox = 0, oIndex = 0; ox < oxSize;
+                for (std::size_t ox = 0; ox < oxSize;
                      ++ox,
                      oIndex += oySize,
                      iIndex_channel +=
@@ -633,7 +651,6 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
                         }
                     }
                 }
-                output += outChannels_s;
             }
         }
     }
...
@@ -63,18 +63,25 @@ void GlobalAveragePoolingImpl_cpu_forward_kernel(const std::shared_ptr<Tensor>&
     using O = cpptype_t<DT_O>;
     const I *input = static_cast<const I *>(inputTensor->getImpl()->rawPtr());
     O *output = static_cast<O *>(output_);
     const auto& dims = inputTensor->dims();
-    const DimSize_t strides_channels = inputTensor->strides()[1];
+    DimSize_t nb_elems = std::accumulate(dims.begin(), dims.end(), std::size_t(1),
+                                         std::multiplies<std::size_t>());
+    const DimSize_t in_batch_nb_elems{nb_elems / dims[0]};
+    const DimSize_t in_channel_nb_elems{in_batch_nb_elems / dims[1]};
+    const DimSize_t out_batch_nb_elems{dims[1]};
     // parse channel by channel and fill each output with the average of the
     // values in the channel
-    std::size_t input_idx = 0;
-    std::size_t output_idx = 0;
+#ifdef _OPENMP
+#pragma omp parallel for collapse(2) if (dims[0] * dims[1] > 32)
+#endif
     for (DimSize_t batch = 0; batch < dims[0]; ++batch) {
         for (DimSize_t channel = 0; channel < dims[1]; ++channel) {
-            output[output_idx++] = castFromFloat<O>(stableMean<I>(input + input_idx, strides_channels));
-            input_idx += strides_channels;
+            const I *filter_start = std::next(
+                input, (batch * in_batch_nb_elems) + (channel * in_channel_nb_elems));
+            output[batch * out_batch_nb_elems + channel] = castFromFloat<O>(stableMean<I>(filter_start, in_channel_nb_elems));
         }
     }
 }
...
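The GlobalAveragePooling kernel gets the same treatment: the running input_idx/output_idx counters are replaced by offsets computed directly from (batch, channel), with the per-channel element count derived from the tensor dimensions. A standalone sketch of that indexing on an NCHW buffer (plain mean instead of Aidge's stableMean; function and names are hypothetical):

    #include <cstddef>
    #include <numeric>
    #include <vector>

    // For an input of shape (N, C, H, W) flattened in row-major order, channel
    // (batch, ch) starts at batch*(C*H*W) + ch*(H*W) and holds H*W elements;
    // the output has shape (N, C).
    std::vector<float> global_avg_pool(const std::vector<float>& in,
                                       std::size_t N, std::size_t C,
                                       std::size_t H, std::size_t W) {
        const std::size_t inBatchElems = C * H * W;   // in_batch_nb_elems
        const std::size_t inChannelElems = H * W;     // in_channel_nb_elems
        std::vector<float> out(N * C);
    #ifdef _OPENMP
    #pragma omp parallel for collapse(2) if (N * C > 32)
    #endif
        for (std::size_t batch = 0; batch < N; ++batch) {
            for (std::size_t ch = 0; ch < C; ++ch) {
                const float* start = in.data() + batch * inBatchElems + ch * inChannelElems;
                out[batch * C + ch] = std::accumulate(start, start + inChannelElems, 0.0f)
                                      / static_cast<float>(inChannelElems);
            }
        }
        return out;
    }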
@@ -66,6 +66,9 @@ void MaxPoolingImpl2D_cpu_forward_kernel(
     using signedsize = std::make_signed<std::size_t>::type;
+#ifdef _OPENMP
+#pragma omp parallel for collapse(2) if (dims[0] * dims[1] > 32)
+#endif
     for (std::size_t batch = 0; batch < dims[0]; ++batch){
         for (std::size_t channel = 0; channel < dims[1]; ++channel){
             auto batchChannelIndex = (channel + batch * dims[1]);
...
@@ -37,6 +37,9 @@ void SoftmaxImpl_cpu_forward_kernel(std::size_t axisIdx, const std::vector<DimSi
         preAxisElems *= inputDims[i];
     }
+#ifdef _OPENMP
+#pragma omp parallel for collapse(2) if (preAxisElems * postAxisElems > 32)
+#endif
     for (std::size_t i = 0; i < preAxisElems; ++i) {
         for (std::size_t j = 0; j < postAxisElems; ++j) {
             I maxVal = input[i * inputDims[axisIdx] * postAxisElems + j];
...