diff --git a/aidge_export_cpp/kernels/convolution.hpp b/aidge_export_cpp/kernels/convolution.hpp
index 6ea9f0579b84dd5a28a5ea66a778326fcd9c84ce..5855654b39d5d7faf09e81735fbe80fa248ace94 100644
--- a/aidge_export_cpp/kernels/convolution.hpp
+++ b/aidge_export_cpp/kernels/convolution.hpp
@@ -48,7 +48,9 @@ void convolution_forward(
                     0, DILATED_KERNEL_HEIGHT);
         const int iy = (oy * STRIDE_Y) - PADDING_Y;
 
+#ifdef _OPENMP
 #pragma omp parallel for collapse(2)
+#endif
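+        // collapse(2) fuses the ox and output loops into a single parallel
+        // iteration space; each iteration owns its accumulator and writes a
+        // distinct output element, so no synchronisation is needed.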
         for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
             for (int output = 0; output < NB_OUTPUTS; ++output) {
                 // moved to inner loop for collapsing -->
@@ -62,7 +64,7 @@ void convolution_forward(
                 const int ix = (ox * STRIDE_X) - PADDING_X;
 
                 const int oPos = (ox + OUTPUTS_WIDTH * oy);
-                int oOffset = NB_OUTPUTS * oPos;
+                const int oOffset = NB_OUTPUTS * oPos;
 
                 // <--
                 // Check if the biases are defined
@@ -77,7 +79,7 @@ void convolution_forward(
                     }
 
                     const int iPos = ix + CHANNELS_WIDTH * (iy + sy*DILATION_Y);
-                    int iOffset = NB_CHANNELS * iPos;
+                    const int iOffset = NB_CHANNELS * iPos;
 
                     const int wOffset = (output*KERNEL_HEIGHT + sy) * KERNEL_WIDTH * NB_CHANNELS;
 
@@ -98,7 +100,7 @@ void convolution_forward(
                                 continue;
                             }
 
-                            int iOffsetInRange = iOffset
+                            const int iOffsetInRange = iOffset
                                 + sx * DILATION_X * NB_CHANNELS;
 
                             macsOnRange<NB_CHANNELS>(
@@ -157,4 +159,158 @@ void convolution_forward(
                         (inputs, outputs, weights, b, rescaling);
 }
 
+template<int NB_CHANNELS, 
+         int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
+         int NB_OUTPUTS,
+         int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
+         int PADDING_Y, int PADDING_X,
+         int STRIDE_Y, int STRIDE_X,
+         int DILATION_Y, int DILATION_X,
+         int KERNEL_HEIGHT, int KERNEL_WIDTH,
+         ActivationFunction_T ACTIVATION,
+         typename Input_T, typename Output_T,
+         typename Weight_T, typename Bias_T,
+         typename Rescaling_T>
+__attribute__((always_inline)) inline
+void convolution_depthwise_forward(
+    const Input_T* __restrict inputs,
+    Output_T* __restrict outputs,
+    const Weight_T* __restrict weights,
+    const Bias_T* __restrict biases,
+    const Rescaling_T& __restrict rescaling)
+{
+    static_assert(NB_OUTPUTS % NB_CHANNELS == 0,
+        "NB_OUTPUTS should be a multiple of NB_CHANNELS.");
+
+    constexpr int DILATED_KERNEL_HEIGHT 
+            = KERNEL_HEIGHT + (DILATION_Y - 1) * (KERNEL_HEIGHT - 1);
+
+    constexpr int DILATED_KERNEL_WIDTH 
+            = KERNEL_WIDTH + (DILATION_X - 1) * (KERNEL_WIDTH - 1);
+
+    constexpr int OUTPUTS_HEIGHT_NOPAD
+        = (CHANNELS_HEIGHT - DILATION_Y * (KERNEL_HEIGHT - 1) - 1 + STRIDE_Y) / STRIDE_Y;
+    constexpr int OUTPUTS_WIDTH_NOPAD
+        = (CHANNELS_WIDTH - DILATION_X * (KERNEL_WIDTH - 1) - 1 + STRIDE_X) / STRIDE_X;
+
+    for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
+        const int syMin = (PADDING_Y == 0) ? 0
+            : max(PADDING_Y - (oy * STRIDE_Y), 0);
+        const int syMax = (PADDING_Y == 0
+                && OUTPUTS_HEIGHT == OUTPUTS_HEIGHT_NOPAD) ? DILATED_KERNEL_HEIGHT
+            : clamp(CHANNELS_HEIGHT + PADDING_Y - (oy * STRIDE_Y), 
+                    0, DILATED_KERNEL_HEIGHT);
+        const int iy = (oy * STRIDE_Y) - PADDING_Y;
+
+#ifdef _OPENMP
+#pragma omp parallel for collapse(2)
+#endif
+        for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
+            for (int output = 0; output < NB_OUTPUTS; ++output) {
+                // moved to inner loop for collapsing -->
+                const int sxMin = (PADDING_X == 0) ? 0
+                    : max(PADDING_X - (ox * STRIDE_X), 0);
+                const int sxMax = (PADDING_X == 0
+                        && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
+                            ? DILATED_KERNEL_WIDTH
+                    : clamp(CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X), 
+                            0, DILATED_KERNEL_WIDTH);
+                const int ix = (ox * STRIDE_X) - PADDING_X;
+
+                const int oPos = (ox + OUTPUTS_WIDTH * oy);
+                const int oOffset = NB_OUTPUTS * oPos;
+                // <--
+
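+                // Depthwise mapping: each output map reads a single input
+                // channel. With a channel multiplier m = NB_OUTPUTS / NB_CHANNELS
+                // (an integer, per the static_assert above), outputs
+                // [c*m, (c+1)*m) all read input channel c; when m == 1 this
+                // reduces to channel == output.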
+                const int channel = (output * NB_CHANNELS) / NB_OUTPUTS;
+
+                Bias_T weightedSum = biases ? biases[output] : 0;
+
+                for (int sy = 0; sy < KERNEL_HEIGHT; ++sy) {
+                    if ((PADDING_Y != 0
+                            || OUTPUTS_HEIGHT != OUTPUTS_HEIGHT_NOPAD)
+                        && ((sy*DILATION_Y < syMin) || (sy*DILATION_Y >= syMax)))
+                    {
+                        continue;
+                    }
+
+                    const int iPos = ix + CHANNELS_WIDTH * (iy + sy*DILATION_Y);
+                    const int iOffset = NB_CHANNELS * iPos;
+
+                    const int wOffset = (output*KERNEL_HEIGHT + sy) 
+                                        * KERNEL_WIDTH;
+
+                    if (DILATION_X == 1 && ((PADDING_X == 0
+                            && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
+                        || sxMax - sxMin == KERNEL_WIDTH))
+                    {
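+                        // Fast path: the whole kernel row is in range, so
+                        // accumulate KERNEL_WIDTH MACs in one call, with the
+                        // NB_CHANNELS template argument covering the NHWC
+                        // channel stride between consecutive kernel columns.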
+                        macsOnRange<KERNEL_WIDTH, NB_CHANNELS>(
+                            inputs + iOffset + channel, 
+                            weights + wOffset, 
+                            weightedSum);
+                    }
+                    else {
+                        for (int sx = 0; sx < KERNEL_WIDTH; ++sx) {
+                            if ((PADDING_X != 0
+                                    || OUTPUTS_WIDTH != OUTPUTS_WIDTH_NOPAD)
+                                && ((sx*DILATION_X < sxMin) || (sx*DILATION_X >= sxMax)))
+                            {
+                                continue;
+                            }
+
+                            const int iOffsetInRange = iOffset
+                                + sx * DILATION_X * NB_CHANNELS;
+
+                            weightedSum += inputs[iOffsetInRange + channel]
+                                * weights[wOffset + sx];
+                        }
+                    }
+                }
+
+                outputs[oOffset + output] = activation_forward_value<Output_T>(
+                    weightedSum, output, ACTIVATION, rescaling);
+            }
+        }
+    }
+}
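+
+// Illustrative instantiation (a sketch, not code the export generator emits
+// verbatim): a 3x3 depthwise convolution over a 3-channel 12x12 NHWC input,
+// stride 1, dilation 1, no padding, producing a 3-channel 10x10 output. The
+// `Linear` activation enumerator is assumed to be available from the
+// included headers.
+//
+//   convolution_depthwise_forward<3, 12, 12,   // NB_CHANNELS, in H, in W
+//                                 3, 10, 10,   // NB_OUTPUTS, out H, out W
+//                                 0, 0, 1, 1,  // padding Y/X, stride Y/X
+//                                 1, 1, 3, 3,  // dilation Y/X, kernel H/W
+//                                 Linear>
+//       (inputs, outputs, weights, biases, rescaling);
+//
+// Passing nullptr for the biases dispatches to the bias-less overload below.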
+
+// Overload used when biases are not given to the convolution
+template<int NB_CHANNELS,
+         int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
+         int NB_OUTPUTS,
+         int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
+         int PADDING_Y, int PADDING_X,
+         int STRIDE_Y, int STRIDE_X,
+         int DILATION_Y, int DILATION_X,
+         int KERNEL_HEIGHT, int KERNEL_WIDTH,
+         ActivationFunction_T ACTIVATION,
+         typename Input_T, typename Output_T,
+         typename Weight_T,
+         typename Rescaling_T>
+__attribute__((always_inline)) inline
+void convolution_depthwise_forward(
+    const Input_T* __restrict inputs,
+    Output_T* __restrict outputs,
+    const Weight_T* __restrict weights,
+    std::nullptr_t __restrict,
+    const Rescaling_T& __restrict rescaling)
+{
+    const float* b = nullptr;
+
+    convolution_depthwise_forward<NB_CHANNELS,
+                        CHANNELS_HEIGHT,
+                        CHANNELS_WIDTH,
+                        NB_OUTPUTS,
+                        OUTPUTS_HEIGHT,
+                        OUTPUTS_WIDTH,
+                        PADDING_Y,
+                        PADDING_X,
+                        STRIDE_Y,
+                        STRIDE_X,
+                        DILATION_Y,
+                        DILATION_X,
+                        KERNEL_HEIGHT,
+                        KERNEL_WIDTH,
+                        ACTIVATION>
+                        (inputs, outputs, weights, b, rescaling);
+}
+
 #endif  // __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION__
diff --git a/aidge_export_cpp/kernels/fullyconnected.hpp b/aidge_export_cpp/kernels/fullyconnected.hpp
index 2780de2deadd9c519597627e64eb5bedeee948f8..60805e7b90fa29ba00c6736bb8771985aeca19b4 100644
--- a/aidge_export_cpp/kernels/fullyconnected.hpp
+++ b/aidge_export_cpp/kernels/fullyconnected.hpp
@@ -28,6 +28,9 @@ void fullyconnected_forward (
     // It is only an issue if the FC was after a flatten layer.
     // Otherwise it is not an issue for the other FC because CHANNELS_WIDTH = CHANNELS_HEIGHT = 1
     // Solution: Add a system to check dataformat
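+    // Each och iteration owns its weightedSum and writes only outputs[och],
+    // so the loop parallelises without synchronisation.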
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
     for (int och = 0; och < NB_OUTPUTS; och++) {
 
         Bias_T weightedSum = (biases) ? biases[och] : Bias_T(0);
@@ -45,7 +48,9 @@ void fullyconnected_forward (
     }
 /*
 Here the kernel to use with inputs in NHWC and weights in NHWC
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
     for (int och = 0; och < NB_OUTPUTS; och++) {
 
         Bias_T weightedSum = (biases) ? biases[och] : Bias_T(0);
diff --git a/aidge_export_cpp/kernels/leakyrelu.hpp b/aidge_export_cpp/kernels/leakyrelu.hpp
index 07352cd2bf8d73eb1bb3afdcca381fcec4729bbd..5e6598d8fe5d43d9ae9320498289577ab7695e97 100644
--- a/aidge_export_cpp/kernels/leakyrelu.hpp
+++ b/aidge_export_cpp/kernels/leakyrelu.hpp
@@ -11,7 +11,9 @@ void leakyrelu_forward (
     Output_T* __restrict outputs,
     const float negative_slope)
 {
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
     for (int i = 0; i < NB_DATA; ++i) {
         if (inputs[i] >= 0) {
             outputs[i] = inputs[i];
diff --git a/aidge_export_cpp/kernels/pooling.hpp b/aidge_export_cpp/kernels/pooling.hpp
index a86fd4196a9f6e19f45dbdc4f1035c1e94e7d285..30fa766abbeded7eb55caf01902c216d95a2ed17 100644
--- a/aidge_export_cpp/kernels/pooling.hpp
+++ b/aidge_export_cpp/kernels/pooling.hpp
@@ -36,7 +36,9 @@ void pooling_forward(
                     0, POOL_HEIGHT);
         const int iy = (oy * STRIDE_Y) - PADDING_Y;
 
+#ifdef _OPENMP
 #pragma omp parallel for collapse(2)
+#endif
         for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
             for (int output = 0; output < NB_OUTPUTS; ++output) {
                 // moved to inner loop for collapsing -->
diff --git a/aidge_export_cpp/operators.py b/aidge_export_cpp/operators.py
index 26ca62155401707573d9625ad91a9b63cb1b4d2b..cb7a09c9fa01eee433f2cbcdb09cc66dc0bb17a6 100644
--- a/aidge_export_cpp/operators.py
+++ b/aidge_export_cpp/operators.py
@@ -187,6 +187,39 @@ class PaddedConvCPP(ExportNodeCpp):
 
         _setup_conv2D(self)
 
+@ExportLibCpp.register("ConvDepthWise2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
+class ConvDepthWiseCPP(ExportNodeCpp):
+    def __init__(self, node, mem_info):
+        super().__init__(node, mem_info)
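+        # The "depthwise" flag makes convolution_forward.jinja emit a call to
+        # convolution_depthwise_forward instead of convolution_forward.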
+        self.attributes["depthwise"] = True
+
+        # No padding with ConvDepthWise
+        # Use PaddedConvDepthWise to add a padding attribute
+        self.attributes["padding"] = [0, 0]
+
+        _setup_conv2D(self)
+
+@ExportLibCpp.register_metaop("PaddedConvDepthWise2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
+class PaddedConvDepthWiseCPP(ExportNodeCpp):
+    def __init__(self, node, mem_info):
+        super().__init__(node, mem_info)
+        self.attributes["depthwise"] = True
+
+        # TODO: find a way to retrieve attributes from a meta operator
+        for n in self.operator.get_micro_graph().get_nodes():
+            if n.type() == "Pad2D":
+                self.attributes["padding"] = n.get_operator(
+                ).attr.begin_end_borders
+            if n.type() == "ConvDepthWise2D":
+                self.attributes["kernel_dims"] = n.get_operator(
+                ).attr.kernel_dims
+                self.attributes["stride_dims"] = n.get_operator(
+                ).attr.stride_dims
+                self.attributes["dilation_dims"] = n.get_operator(
+                ).attr.dilation_dims
+
+        _setup_conv2D(self)
+
 def _setup_elemwise_op(elemwise, op):
     """Common code (template and kernel setup) shared across all the different elementWise operator (Add, Sub,...)."""
 
diff --git a/aidge_export_cpp/templates/kernel_forward/convolution_forward.jinja b/aidge_export_cpp/templates/kernel_forward/convolution_forward.jinja
index 421013b9590dabe6ee0ac12f969494913414a530..7d0af8c6f75df47825e67a8b47258c3f8469fc6a 100644
--- a/aidge_export_cpp/templates/kernel_forward/convolution_forward.jinja
+++ b/aidge_export_cpp/templates/kernel_forward/convolution_forward.jinja
@@ -1,6 +1,6 @@
 {% filter indent(width=4, first=False) %}
 {% include "./_mem_offset.jinja" %}
-convolution_forward<{{ in_name[0]|upper }}_NB_CHANNELS,
+convolution{{ "_depthwise" if depthwise is defined else "" }}_forward<{{ in_name[0]|upper }}_NB_CHANNELS,
                     {{ in_name[0]|upper }}_IN_HEIGHT,
                     {{ in_name[0]|upper }}_IN_WIDTH,
                     {{ out_name[0]|upper }}_NB_OUTPUTS,
diff --git a/aidge_export_cpp/unit_tests/test_export.py b/aidge_export_cpp/unit_tests/test_export.py
index 607778d23deda862db73f5908fd1caa6ccc1d95b..65db07e33c68e4900476f97d92879ad7ca3ed4e2 100644
--- a/aidge_export_cpp/unit_tests/test_export.py
+++ b/aidge_export_cpp/unit_tests/test_export.py
@@ -410,6 +410,14 @@ class test_operator_export(unittest.TestCase):
 
         self.unit_test_export(model, "Conv2D", [[1, 3, 12, 12]], False, False)
 
+    def test_export_convDepthWise2D(self):
+        print("ConvDepthWise2D")
+        model = aidge_core.sequential([
+            aidge_core.ConvDepthWise2D(nb_channels=3, kernel_dims=(3, 3), name="conv")
+        ])
+
+        self.unit_test_export(model, "ConvDepthWise2D", [[1, 3, 12, 12]], False, False)
+
     def test_export_max_pooling(self):
         print("MaxPooling2D")
         model = aidge_core.sequential([