diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6c87a89b8ac1254f8bfb8fb990f8c03f7e593d61..ce1b50629a3e0ca97c986e7b3ce8d3df743f75e3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -64,6 +64,8 @@ if(NOT $ENV{AIDGE_INSTALL} STREQUAL "")
 endif()
 find_package(aidge_core REQUIRED)
 
+find_package(OpenMP)
+
 find_package(OpenSSL QUIET)
 if(OpenSSL_FOUND)
     message(STATUS "OpenSSL found: ${OPENSSL_VERSION}")
@@ -86,6 +88,11 @@ target_link_libraries(${module_name}
         _aidge_core # _ is added because we link the exported target and not the project
 )
 
+if(OpenMP_CXX_FOUND)
+    target_link_libraries(${module_name} PRIVATE OpenMP::OpenMP_CXX)
+    set(AIDGE_REQUIRES_OPENMP TRUE)
+endif()
+
 # Add definition _USE_MATH_DEFINES to enable math constant definitions from math.h/cmath.
 if (WIN32)
     target_compile_definitions(${module_name} PRIVATE _USE_MATH_DEFINES)
diff --git a/aidge_backend_cpu-config.cmake.in b/aidge_backend_cpu-config.cmake.in
index 7582102c24a551db7f346e1b614d7dcaa4940b1d..35865c71a87aebbb04abe6cd964f54e0f08029a0 100644
--- a/aidge_backend_cpu-config.cmake.in
+++ b/aidge_backend_cpu-config.cmake.in
@@ -2,6 +2,10 @@
 
 include(CMakeFindDependencyMacro)
 find_dependency(aidge_core)
+set(AIDGE_REQUIRES_OPENMP @AIDGE_REQUIRES_OPENMP@)
+if (AIDGE_REQUIRES_OPENMP)
+    find_dependency(OpenMP)
+endif()
 set(AIDGE_REQUIRES_OPENSSL @AIDGE_REQUIRES_OPENSSL@)
 if (AIDGE_REQUIRES_OPENSSL)
     find_dependency(OpenSSL)
diff --git a/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
index 1671759d25a5965ceca57fc1167534d7986282c4..f9cc13b5b0be6e63aa2ac7da8d3eccbaf7c9cd2e 100644
--- a/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/AvgPoolingImpl_kernels.hpp
@@ -76,8 +76,11 @@ void AvgPoolingImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& strideD
 
     using signedsize = std::make_signed<std::size_t>::type;
 
-    for (std::size_t batch = 0; batch < dims[0]; ++batch) {
-        for (std::size_t ch = 0; ch < dims[1]; ++ch) {
+#ifdef _OPENMP
+    #pragma omp parallel for collapse(2) if (dims[0] * dims[1] >= 16)
+#endif
+    for (int batch = 0; batch < static_cast<int>(dims[0]); ++batch) {
+        for (int ch = 0; ch < static_cast<int>(dims[1]); ++ch) {
             const std::size_t oIndex = (ch + batch * dims[1]) * oxSize * oySize;
             const std::size_t iIndex = (ch + batch * dims[1]) * dims[2] * dims[3];
 
diff --git a/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp b/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp
index cf97f7372ac528ef28d0f378beb2650af32bfa30..d1d7d529756c1bbad2880579a5dac57ebd9e07c7 100644
--- a/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/BatchNormImpl_kernels.hpp
@@ -53,8 +53,11 @@ void BatchNormImpl2D_cpu_forward_kernel(float epsilon, float momentum, const std
     const DimSize_t featureMapSize = (dims.size() > 2) ? std::accumulate(dims.begin() + 2, dims.end(), 1, std::multiplies<DimSize_t>()) : 1;
 
     if ((freeze == true) || (momentum == 0.0f)) {
-        for (std::size_t batch = 0; batch < nbBatch; ++batch) {
-            for (std::size_t ch = 0; ch < nbChannels; ++ch) {
+#ifdef _OPENMP
+        #pragma omp parallel for collapse(2) if (nbBatch * nbChannels >= 16)
+#endif
+        for (int batch = 0; batch < static_cast<int>(nbBatch); ++batch) {
+            for (int ch = 0; ch < static_cast<int>(nbChannels); ++ch) {
                 const std::size_t ioIndex = (ch + batch*nbChannels) * featureMapSize;
                 std::fill(output + ioIndex, output + ioIndex + featureMapSize, shift[ch]);
                 const P var = std::sqrt(batchVar[ch] + static_cast<P>(epsilon));
diff --git a/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp
index 906ea1adf744353372c844fd3e16b9dbd13e7f7d..0e2f5a72e4ad1a7e2c8bd239e43914642121965f 100644
--- a/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ConvDepthWiseImpl_kernels.hpp
@@ -65,8 +65,11 @@ void ConvDepthWiseImpl1D_cpu_forward_kernel(const std::array<DimSize_t, 1>& stri
     // weight (outCh, ch, kernelX, kernelY)
     // does not take Dilation attribute into account
     using signedsize = std::make_signed<std::size_t>::type;
-    for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
-        for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
+#ifdef _OPENMP
+    #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] >= 16)
+#endif
+    for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
+        for (int ch = 0; ch < static_cast<int>(inputDims[1]); ++ch) {
             const std::size_t oIndex = (ch + batch*inputDims[1]) * oxSize;
             B biasVal = (biases != nullptr) ? biases[ch] : B(0);
             std::fill(output + oIndex, output+(oIndex+oxSize), biasVal);
@@ -152,16 +155,19 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
     const std::size_t outChannels_s =  oxSize * oySize;
 
     if (dilated_kernel_x ==3 && dilated_kernel_y == 3) {
-        for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
-            for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
-
+#ifdef _OPENMP
+        #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] >= 16)
+#endif
+        for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
+            for (int ch = 0; ch < static_cast<int>(inputDims[1]); ++ch) {
                 B biasVal = (biases != nullptr) ? biases[ch] : B(0);
 
+                std::size_t oIndex = (ch + batch*inputDims[1]) * outChannels_s;
                 std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
                 const std::size_t wIndex = ch * 9;
 
                 if (strideDims[0] == 1 && strideDims[1]==1) {
-                    for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=inputDims[3]) {
+                    for (std::size_t ox = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex-=inputDims[3]) {
                         for (std::size_t oy = 0; oy < oySize; ++oy) {
                             output[oIndex + oy] = biasVal + weights[wIndex+0]*input[iIndex+oy]+weights[wIndex+1]*input[iIndex+oy+1]+weights[wIndex+2]*input[iIndex+oy+2];
                         }
@@ -175,7 +181,7 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
                         }
                     }
                 } else {
-                    for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=(strideDims[0]-2)*inputDims[3]) {
+                    for (std::size_t ox = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=(strideDims[0]-2)*inputDims[3]) {
                         for (std::size_t oy = 0; oy < oySize; ++oy) {
                             output[oIndex + oy] = biasVal + weights[wIndex+0]*input[iIndex+oy*strideDims[1]]+weights[wIndex+1]*input[iIndex+oy*strideDims[1]+1]+weights[wIndex+2]*input[iIndex+oy*strideDims[1]+2];
                         }
@@ -189,24 +195,25 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
                         }
                     }
                 }
-                output += outChannels_s;
             }
         }
     } else if (dilated_kernel_x == 1 && dilated_kernel_y == 1) {
-        for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
-            for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
-
+#ifdef _OPENMP
+        #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] >= 16)
+#endif
+        for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
+            for (int ch = 0; ch < static_cast<int>(inputDims[1]); ++ch) {
                 B biasVal = (biases != nullptr) ? biases[ch] : B(0);
 
+                std::size_t oIndex = (ch + batch*inputDims[1]) * outChannels_s;
                 std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
                 const std::size_t wIndex = ch;
 
                 if (strideDims[0] == 1 && strideDims[1] == 1) {
-                    for (std::size_t i = iIndex; i < iIndex + oxSize*oySize; ++i) {
-                        output[i] = biasVal + weights[wIndex] * input[i];
+                    for (std::size_t i = 0; i < oxSize*oySize; ++i) {
+                        output[oIndex + i] = biasVal + weights[wIndex] * input[iIndex + i];
                     }
                 } else  {
-                    std::size_t oIndex =  (ch + batch*inputDims[1]) * oxSize * oySize;
                     for (std::size_t ox = 0; ox < oxSize; ++ox, oIndex+=oySize, iIndex+=strideDims[0]*inputDims[3]) {
                         for (std::size_t oy = 0, iy = 0; oy < oySize; ++oy, iy+=strideDims[1]) {
                             output[oIndex + oy] = biasVal + weights[wIndex]*input[iIndex+iy];
@@ -216,19 +223,22 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
             }
         }
     } else {
-        for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
-            for (std::size_t ch = 0; ch < inputDims[1]; ++ch) {
-
-                B biasVal = (biases != nullptr) ? biases[ch] : B(0);
-                std::fill(output, output+outChannels_s, biasVal);
-
+#ifdef _OPENMP
+        #pragma omp parallel for collapse(2) if (inputDims[0] * inputDims[1] >= 16)
+#endif
+        for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
+            for (int ch = 0; ch < static_cast<int>(inputDims[1]); ++ch) {
+                const std::size_t oIndex = (ch + batch*inputDims[1]) * outChannels_s;
                 const std::size_t iIndex = (ch + batch*inputDims[1]) * inputDims[2] * inputDims[3];
                 const std::size_t wIndex = ch * kernelDims[0] * kernelDims[1];
 
+                B biasVal = (biases != nullptr) ? biases[ch] : B(0);
+                std::fill(output + oIndex, output + oIndex + outChannels_s, biasVal);
+
                 for (std::size_t ox = 0; ox < oxSize; ++ox) {
                     for (std::size_t oy = 0; oy < oySize; ++oy) {
 
-                        const std::size_t oIndexFull = ox*oySize + oy;
+                        const std::size_t oIndexFull = oIndex + ox*oySize + oy;
                         const std::size_t ix = ox * strideDims[0];
                         const std::size_t iy = oy * strideDims[1];
 
@@ -240,7 +250,6 @@ void ConvDepthWiseImpl2D_cpu_forward_kernel(const std::array<DimSize_t, 2>& stri
                         }
                     }
                 }
-                output += outChannels_s;
             }
         }
     }
diff --git a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
index 29aac6dc585e41d39f0d03b3035a5294848f8436..e1e76a33120bb9536842a9f0db4cc789f8fe38a1 100644
--- a/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/ConvImpl_kernels.hpp
@@ -59,8 +59,11 @@ void ConvImpl1D_cpu_forward_kernel(const array<DimSize_t, 1> &strideDim,
     const DimSize_t dilated_kernel_x = dilationDim[0] * (kernelDim[0] - 1) + 1;
 
     using signedsize = std::make_signed<std::size_t>::type;
-    for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
-        for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
+#ifdef _OPENMP
+    #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels >= 16)
+#endif
+    for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
+        for (int outCh = 0; outCh < static_cast<int>(outChannels); ++outCh) {
             const std::size_t oIndex = (outCh + batch * outChannels) * oxSize;
             // If bias = nullptr, set B(0)
             B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
@@ -478,18 +481,24 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
     const std::size_t outChannels_s = oxSize * oySize;
 
     if (dilated_kernel_x == 3 && dilated_kernel_y == 3) {
-        for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
-            for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
+#ifdef _OPENMP
+        #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels >= 16)
+#endif
+        for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
+            for (int outCh = 0; outCh < static_cast<int>(outChannels); ++outCh) {
+                std::size_t oIndex = (outCh + batch*outChannels) * outChannels_s;
+
                 // If bias = nullptr, set B(0)
                 B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
-                std::fill(output, output + outChannels_s, biasVal);
+                std::fill(output + oIndex, output + oIndex + outChannels_s, biasVal);
                 for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
+                    oIndex = (outCh + batch*outChannels) * outChannels_s;
                     std::size_t iIndex = (inCh + batch * inputDims[1]) *
                                          inputDims[2] * inputDims[3];
                     const std::size_t wIndex =
                         (inCh + outCh * inputDims[1]) * 9;
                     if (strideDims[0] == 1 && strideDims[1] == 1) {
-                        for (std::size_t ox = 0, oIndex = 0; ox < oxSize;
+                        for (std::size_t ox = 0; ox < oxSize;
                              ++ox, oIndex += oySize, iIndex -= inputDims[3]) {
                             for (std::size_t oy = 0; oy < oySize; ++oy) {
                                 output[oIndex + oy] +=
@@ -519,7 +528,7 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
                             }
                         }
                     } else {
-                        for (std::size_t ox = 0, oIndex = 0; ox < oxSize; ++ox,
+                        for (std::size_t ox = 0; ox < oxSize; ++ox,
                                          oIndex += oySize,
                                          iIndex += (strideDims[0] -
                                                     2) * inputDims[3]) {
@@ -558,26 +567,30 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
                         }
                     }
                 }
-                output += outChannels_s;
             }
         }
     } else if (dilated_kernel_x == 1 && dilated_kernel_y == 1) {
-        for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
-            for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
+#ifdef _OPENMP
+        #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels >= 16)
+#endif
+        for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
+            for (int outCh = 0; outCh < static_cast<int>(outChannels); ++outCh) {
+                std::size_t oIndex = (outCh + batch*outChannels) * outChannels_s;
+
                 // If bias = nullptr, set B(0)
                 B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
-                std::fill(output, output + outChannels_s, biasVal);
+                std::fill(output + oIndex, output + oIndex + outChannels_s, biasVal);
                 for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
+                    oIndex = (outCh + batch*outChannels) * outChannels_s;
                     std::size_t iIndex = (inCh + batch * inputDims[1]) *
                                          inputDims[2] * inputDims[3];
                     const std::size_t wIndex = (inCh + outCh * inputDims[1]);
                     if (strideDims[0] == 1 && strideDims[1] == 1) {
-                        for (std::size_t oIndex = 0; oIndex < oxSize * oySize;
-                             ++oIndex, ++iIndex) {
-                            output[oIndex] += weights[wIndex] * input[iIndex];
+                        for (std::size_t i = 0; i < outChannels_s; ++i) {
+                            output[oIndex + i] += weights[wIndex] * input[iIndex + i];
                         }
                     } else {
-                        for (std::size_t ox = 0, oIndex = 0; ox < oxSize;
+                        for (std::size_t ox = 0; ox < oxSize;
                              ++ox,
                                          oIndex += oySize,
                                          iIndex +=
@@ -590,16 +603,21 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
                         }
                     }
                 }
-                output += outChannels_s;
             }
         }
     } else {
-        for (std::size_t batch = 0; batch < inputDims[0]; ++batch) {
-            for (std::size_t outCh = 0; outCh < outChannels; ++outCh) {
+#ifdef _OPENMP
+        #pragma omp parallel for collapse(2) if (inputDims[0] * outChannels >= 16)
+#endif
+        for (int batch = 0; batch < static_cast<int>(inputDims[0]); ++batch) {
+            for (int outCh = 0; outCh < static_cast<int>(outChannels); ++outCh) {
+                std::size_t oIndex = (outCh + batch*outChannels) * outChannels_s;
+
                 // If bias = nullptr, set B(0)
                 B biasVal = (biases != nullptr) ? biases[outCh] : B(0);
-                std::fill(output, output + outChannels_s, biasVal);
+                std::fill(output + oIndex, output + oIndex + outChannels_s, biasVal);
                 for (std::size_t inCh = 0; inCh < inputDims[1]; ++inCh) {
+                    oIndex = (outCh + batch*outChannels) * outChannels_s;
                     std::size_t iIndex_channel =
                         (inCh + batch * inputDims[1]) * inputDims[2] *
                         inputDims[3];
@@ -607,7 +625,7 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
                                                kernelDims[0] * kernelDims[1];
 
                     // loop over each ouput line
-                    for (std::size_t ox = 0, oIndex = 0; ox < oxSize;
+                    for (std::size_t ox = 0; ox < oxSize;
                          ++ox,
                                      oIndex += oySize,
                                      iIndex_channel +=
@@ -633,7 +651,6 @@ void ConvImpl2D_cpu_forward_kernel(const array<DimSize_t, 2> &strideDims,
                         }
                     }
                 }
-                output += outChannels_s;
             }
         }
     }
diff --git a/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
index b77f749f9d81af7ab2b94d078eca941218b3cd6f..b03e7f58c19b119ec72306f7d9979607a707cde7 100644
--- a/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/FCImpl_kernels.hpp
@@ -96,21 +96,16 @@ void FCImpl_cpu_forward_kernel(const DimSize_t batchSize,
     const B* biases = static_cast<const B*>(biases_);
     O* output = static_cast<O*>(output_);
 
-    if (biases == nullptr) {
-        std::fill(output, output+(batchSize*outputFeatureSize), B(0));
-    }
-    else {
-        for (std::size_t batch = 0; batch < batchSize; ++batch) {
-            std::copy(biases, biases+outputFeatureSize, output+(batch*outputFeatureSize));
-        }
-    }
-
-    for (std::size_t batch = 0; batch < batchSize; ++batch) {
-        for (std::size_t out = 0; out < outputFeatureSize; ++out) {
+#ifdef _OPENMP
+    #pragma omp parallel for collapse(2) if (batchSize * outputFeatureSize >= 16)
+#endif
+    for (int batch = 0; batch < static_cast<int>(batchSize); ++batch) {
+        for (int out = 0; out < static_cast<int>(outputFeatureSize); ++out) {
+            const auto biasVal = (biases) ? biases[out] : B(0);
             output[out + batch*outputFeatureSize] = std::inner_product(input + batch*inputFeatureSize,
                                                         input + (batch + 1)*inputFeatureSize,
                                                         weights + out*inputFeatureSize,
-                                                        output[out + batch*outputFeatureSize]);
+                                                        biasVal);
         }
     }
 }
diff --git a/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp
index cbe4f110fc74f387625132c4f0872123814c1a62..3cab0ad9647a974170bf682fcf3b57b306bd76bd 100644
--- a/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/GlobalAveragePoolingImpl_kernels.hpp
@@ -63,18 +63,25 @@ void GlobalAveragePoolingImpl_cpu_forward_kernel(const std::shared_ptr<Tensor>&
     using O = cpptype_t<DT_O>;
     const I *input = static_cast<const I *>(inputTensor->getImpl()->rawPtr());
     O *output = static_cast<O *>(output_);
-    const auto& dims = inputTensor->dims();
 
-    const DimSize_t strides_channels = inputTensor->strides()[1];
+    const auto& dims = inputTensor->dims();
+    DimSize_t nb_elems = std::accumulate(dims.begin(), dims.end(), std::size_t(1),
+                                         std::multiplies<std::size_t>());
+
+    const DimSize_t in_batch_nb_elems{nb_elems / dims[0]};
+    const DimSize_t in_channel_nb_elems{in_batch_nb_elems / dims[1]};
+    const DimSize_t out_batch_nb_elems{dims[1]};
 
     // parse channel by channel and fill each output with the average of the
     // values in the channel
-    std::size_t input_idx = 0;
-    std::size_t output_idx = 0;
-    for (DimSize_t batch = 0; batch < dims[0]; ++batch) {
-        for (DimSize_t channel = 0; channel < dims[1]; ++channel) {
-            output[output_idx++] = castFromFloat<O>(stableMean<I>(input + input_idx, strides_channels));
-            input_idx += strides_channels;
+#ifdef _OPENMP
+    #pragma omp parallel for collapse(2) if (dims[0] * dims[1] >= 16)
+#endif
+    for (int batch = 0; batch < static_cast<int>(dims[0]); ++batch) {
+        for (int channel = 0; channel < static_cast<int>(dims[1]); ++channel) {
+            const I *filter_start = std::next(
+                input, (batch * in_batch_nb_elems) + (channel * in_channel_nb_elems));
+            output[batch * out_batch_nb_elems + channel] = castFromFloat<O>(stableMean<I>(filter_start, in_channel_nb_elems));
         }
     }
 }
diff --git a/include/aidge/backend/cpu/operator/MatMulImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MatMulImpl_kernels.hpp
index 5fc13baf49b1d0606eb4af5a54eec83fa5dce22a..adcc8ddc26a379e3a310aa1ab405841f7964037d 100644
--- a/include/aidge/backend/cpu/operator/MatMulImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/MatMulImpl_kernels.hpp
@@ -26,7 +26,10 @@ void MatMulImpl_cpu_forward_kernel(const std::size_t n, const std::size_t k, con
 
     std::memset(output, O(0), n * m * sizeof(O));
 
-    for (std::size_t i = 0; i < n; ++i) {
+#ifdef _OPENMP
+    #pragma omp parallel for if (n >= 16)
+#endif
+    for (int i = 0; i < static_cast<int>(n); ++i) {
         for (std::size_t l = 0; l < k; ++l) {
             for (std::size_t j = 0; j < m; ++j) {
                 output[i*m + j] += static_cast<O>(input1[i*k + l] * input2[l*m + j]);
diff --git a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
index 9a52c1491d1fb16302779b799d10c8286086a3c2..7fe272d5d23d6484ed03c6183a1972035aa1b563 100644
--- a/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/MaxPoolingImpl_kernels.hpp
@@ -66,8 +66,11 @@ void MaxPoolingImpl2D_cpu_forward_kernel(
 
   using signedsize = std::make_signed<std::size_t>::type;
 
-  for (std::size_t batch = 0; batch < dims[0]; ++batch){
-    for (std::size_t channel = 0; channel < dims[1]; ++channel){
+#ifdef _OPENMP
+  #pragma omp parallel for collapse(2) if (dims[0] * dims[1] >= 16)
+#endif
+  for (int batch = 0; batch < static_cast<int>(dims[0]); ++batch){
+    for (int channel = 0; channel < static_cast<int>(dims[1]); ++channel){
       auto batchChannelIndex = (channel + batch * dims[1]);
       const std::size_t outputBaseIndex = batchChannelIndex * outXSize * outYSize;
       const std::size_t inputBaseIndex = batchChannelIndex * dims[2] * dims[3];
diff --git a/include/aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp b/include/aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp
index 07486a48f1b8cf29f6a6ef8aa934a9decdbafef7..0e72710cac4004876e8026ccdfbc38cb7c2618eb 100644
--- a/include/aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp
+++ b/include/aidge/backend/cpu/operator/SoftmaxImpl_kernels.hpp
@@ -37,8 +37,11 @@ void SoftmaxImpl_cpu_forward_kernel(std::size_t axisIdx, const std::vector<DimSi
         preAxisElems *= inputDims[i];
     }
 
-    for (std::size_t i = 0; i < preAxisElems; ++i) {
-        for (std::size_t j = 0; j < postAxisElems; ++j) {
+#ifdef _OPENMP
+    #pragma omp parallel for collapse(2) if (preAxisElems * postAxisElems >= 16)
+#endif
+    for (int i = 0; i < static_cast<int>(preAxisElems); ++i) {
+        for (int j = 0; j < static_cast<int>(postAxisElems); ++j) {
             I maxVal = input[i * inputDims[axisIdx] * postAxisElems + j];
             for (std::size_t k = 1; k < inputDims[axisIdx]; ++k) {
                 std::size_t inIdx = i * inputDims[axisIdx] * postAxisElems + k * postAxisElems + j;
diff --git a/unit_tests/scheduler/Test_Scheduler.cpp b/unit_tests/scheduler/Test_Scheduler.cpp
index eed4185d7ac98107f6811f38d7f37851cb6801af..0dfdbb304f6b593903165b7566c68dad71f0b8a4 100644
--- a/unit_tests/scheduler/Test_Scheduler.cpp
+++ b/unit_tests/scheduler/Test_Scheduler.cpp
@@ -436,6 +436,7 @@ TEST_CASE("[cpu/scheduler] SequentialScheduler(backward)", "[scheduler][backward
     // implem already set to default
     auto myProd = Producer(inputTensor, "prod");
     myProd -> addChild(gv);
+    gv->add(myProd);
     gv -> compile("cpu", DataType::Float32);
 
     SequentialScheduler scheduler(gv);