From 490df27c9d52446b805470d843fd47ec2ad60fb6 Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Mon, 28 Apr 2025 14:55:19 +0200
Subject: [PATCH 1/6] Added ConvDepthWise support

---
 aidge_export_cpp/kernels/convolution.hpp      | 116 +++++++++++++++++-
 aidge_export_cpp/operators.py                 |  33 +++++
 .../kernel_forward/convolution_forward.jinja  |   2 +-
 aidge_export_cpp/unit_tests/test_export.py    |   8 ++
 4 files changed, 155 insertions(+), 4 deletions(-)

diff --git a/aidge_export_cpp/kernels/convolution.hpp b/aidge_export_cpp/kernels/convolution.hpp
index 6ea9f05..38c8ad7 100644
--- a/aidge_export_cpp/kernels/convolution.hpp
+++ b/aidge_export_cpp/kernels/convolution.hpp
@@ -62,7 +62,7 @@ void convolution_forward(
                 const int ix = (ox * STRIDE_X) - PADDING_X;
 
                 const int oPos = (ox + OUTPUTS_WIDTH * oy);
-                int oOffset = NB_OUTPUTS * oPos;
+                const int oOffset = NB_OUTPUTS * oPos;
 
                 // <--
                 // Check if the biases are defined
@@ -77,7 +77,7 @@ void convolution_forward(
                     }
 
                     const int iPos = ix + CHANNELS_WIDTH * (iy + sy*DILATION_Y);
-                    int iOffset = NB_CHANNELS * iPos;
+                    const int iOffset = NB_CHANNELS * iPos;
 
                     const int wOffset = (output*KERNEL_HEIGHT + sy) * KERNEL_WIDTH * NB_CHANNELS;
 
@@ -98,7 +98,7 @@ void convolution_forward(
                                 continue;
                             }
 
-                            int iOffsetInRange = iOffset
+                            const int iOffsetInRange = iOffset
                                 + sx * DILATION_X * NB_CHANNELS;
 
                             macsOnRange<NB_CHANNELS>(
@@ -157,4 +157,114 @@ void convolution_forward(
                         (inputs, outputs, weights, b, rescaling);
 }
 
+template<int NB_CHANNELS, 
+         int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
+         int NB_OUTPUTS,
+         int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
+         int PADDING_Y, int PADDING_X,
+         int STRIDE_Y, int STRIDE_X,
+         int DILATION_Y, int DILATION_X,
+         int KERNEL_HEIGHT, int KERNEL_WIDTH,
+         ActivationFunction_T ACTIVATION,
+         typename Input_T, typename Output_T,
+         typename Weight_T, typename Bias_T,
+         typename Rescaling_T>
+__attribute__((always_inline)) inline
+void convolution_depthwise_forward(
+    const Input_T* __restrict inputs,
+    Output_T* __restrict outputs,
+    const Weight_T* __restrict weights,
+    const Bias_T* __restrict biases,
+    const Rescaling_T& __restrict rescaling)
+{
+    static_assert(NB_OUTPUTS % NB_CHANNELS == 0,
+        "NB_OUTPUTS should be a multiple of NB_CHANNELS.");
+    static_assert(DILATION_Y == 1,
+        "DILATION_Y != 1 not supported.");
+    static_assert(DILATION_X == 1,
+        "DILATION_X != 1 not supported.");
+
+    constexpr int OUTPUTS_HEIGHT_NOPAD
+        = (CHANNELS_HEIGHT - KERNEL_HEIGHT + STRIDE_Y) / STRIDE_Y;
+    constexpr int OUTPUTS_WIDTH_NOPAD
+        = (CHANNELS_WIDTH - KERNEL_WIDTH + STRIDE_X) / STRIDE_X;
+
+    for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
+        const int syMin = (PADDING_Y == 0) ? 0
+            : max(PADDING_Y - (oy * STRIDE_Y), 0);
+        const int syMax = (PADDING_Y == 0
+                && OUTPUTS_HEIGHT == OUTPUTS_HEIGHT_NOPAD) ? KERNEL_HEIGHT
+            : clamp(CHANNELS_HEIGHT + PADDING_Y - (oy * STRIDE_Y), 
+                    0, KERNEL_HEIGHT);
+        const int iy = (oy * STRIDE_Y) - PADDING_Y;
+
+#pragma omp parallel for collapse(2)
+        for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
+            for (int output = 0; output < NB_OUTPUTS; ++output) {
+                // moved to inner loop for collapsing -->
+                const int sxMin = (PADDING_X == 0) ? 0
+                    : max(PADDING_X - (ox * STRIDE_X), 0);
+                const int sxMax = (PADDING_X == 0
+                        && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
+                            ? KERNEL_WIDTH
+                    : clamp(CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X), 
+                            0, KERNEL_WIDTH);
+                const int ix = (ox * STRIDE_X) - PADDING_X;
+
+                const int oPos = (ox + OUTPUTS_WIDTH * oy);
+                const int oOffset = NB_OUTPUTS * oPos;
+                // <--
+
+                const int channel = (output * NB_CHANNELS) / NB_OUTPUTS;
+
+                Bias_T weightedSum = biases ? biases[output] : 0;
+
+                for (int sy = 0; sy < KERNEL_HEIGHT; ++sy) {
+                    if ((PADDING_Y != 0
+                            || OUTPUTS_HEIGHT != OUTPUTS_HEIGHT_NOPAD)
+                        && sy >= syMax - syMin)
+                    {
+                        break;
+                    }
+
+                    const int iPos = ((sxMin + ix)
+                                        + CHANNELS_WIDTH * (iy + syMin + sy));
+                    int iOffset = NB_CHANNELS * iPos;
+
+                    const int wOffset = (sxMin
+                        + KERNEL_WIDTH * (syMin + sy + KERNEL_HEIGHT * output));
+
+                    if ((PADDING_X == 0
+                            && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
+                        || sxMax - sxMin == KERNEL_WIDTH)
+                    {
+                        macsOnRange<KERNEL_WIDTH, NB_CHANNELS>(
+                            inputs + iOffset + channel, 
+                            weights + wOffset, 
+                            weightedSum);
+                    }
+                    else {
+                        for (int sx = 0; sx < KERNEL_WIDTH; ++sx) {
+                            if ((PADDING_X != 0
+                                    || OUTPUTS_WIDTH != OUTPUTS_WIDTH_NOPAD)
+                                && sx >= sxMax - sxMin)
+                            {
+                                break;
+                            }
+
+                            const int iOffsetInRange = iOffset
+                                + sx * NB_CHANNELS;
+
+                            weightedSum += inputs[iOffsetInRange + channel]
+                                * weights[wOffset + sx];
+                        }
+                    }
+                }
+
+                outputs[oOffset + output] = activation_forward_value<Output_T>(weightedSum, output, ACTIVATION, rescaling);
+            }
+        }
+    }
+}
+
 #endif  // __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION__
diff --git a/aidge_export_cpp/operators.py b/aidge_export_cpp/operators.py
index 26ca621..b6121e4 100644
--- a/aidge_export_cpp/operators.py
+++ b/aidge_export_cpp/operators.py
@@ -187,6 +187,39 @@ class PaddedConvCPP(ExportNodeCpp):
 
         _setup_conv2D(self)
 
+@ExportLibCpp.register("ConvDepthWise2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
+class ConvCPP(ExportNodeCpp):
+    def __init__(self, node, mem_info):
+        super().__init__(node, mem_info)
+        self.attributes["depthwise"] = True
+
+        # No padding with Conv
+        # Use PaddedConv to add padding attribute
+        self.attributes["padding"] = [0, 0]
+
+        _setup_conv2D(self)
+
+@ExportLibCpp.register_metaop("PaddedConvDepthWise2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
+class PaddedConvCPP(ExportNodeCpp):
+    def __init__(self, node, mem_info):
+        super().__init__(node, mem_info)
+        self.attributes["depthwise"] = True
+
+        # TODO find a way to retrieve attr for meta op
+        for n in self.operator.get_micro_graph().get_nodes():
+            if n.type() == "Pad2D":
+                self.attributes["padding"] = n.get_operator(
+                ).attr.begin_end_borders
+            if n.type() == "ConvDepthWise2D":
+                self.attributes["kernel_dims"] = n.get_operator(
+                ).attr.kernel_dims
+                self.attributes["stride_dims"] = n.get_operator(
+                ).attr.stride_dims
+                self.attributes["dilation_dims"] = n.get_operator(
+                ).attr.dilation_dims
+
+        _setup_conv2D(self)
+
 def _setup_elemwise_op(elemwise, op):
     """Common code (template and kernel setup) shared across all the different elementWise operator (Add, Sub,...)."""
 
diff --git a/aidge_export_cpp/templates/kernel_forward/convolution_forward.jinja b/aidge_export_cpp/templates/kernel_forward/convolution_forward.jinja
index 421013b..7d0af8c 100644
--- a/aidge_export_cpp/templates/kernel_forward/convolution_forward.jinja
+++ b/aidge_export_cpp/templates/kernel_forward/convolution_forward.jinja
@@ -1,6 +1,6 @@
 {% filter indent(width=4, first=False) %}
 {% include "./_mem_offset.jinja" %}
-convolution_forward<{{ in_name[0]|upper }}_NB_CHANNELS,
+convolution{{ "_depthwise" if depthwise is defined else "" }}_forward<{{ in_name[0]|upper }}_NB_CHANNELS,
                     {{ in_name[0]|upper }}_IN_HEIGHT,
                     {{ in_name[0]|upper }}_IN_WIDTH,
                     {{ out_name[0]|upper }}_NB_OUTPUTS,
diff --git a/aidge_export_cpp/unit_tests/test_export.py b/aidge_export_cpp/unit_tests/test_export.py
index 607778d..387c595 100644
--- a/aidge_export_cpp/unit_tests/test_export.py
+++ b/aidge_export_cpp/unit_tests/test_export.py
@@ -410,6 +410,14 @@ class test_operator_export(unittest.TestCase):
 
         self.unit_test_export(model, "Conv2D", [[1, 3, 12, 12]], False, False)
 
+    def test_export_convDepthWise2D(self):
+        print("Conv2D")
+        model = aidge_core.sequential([
+            aidge_core.ConvDepthWise2D(nb_channels=3, kernel_dims=(3, 3), name="conv")
+        ])
+
+        self.unit_test_export(model, "ConvDepthWise2D", [[1, 3, 12, 12]], False, False)
+
     def test_export_max_pooling(self):
         print("MaxPooling2D")
         model = aidge_core.sequential([
-- 
GitLab


From 9c03c1f6bb5c14f948929aae9716ef010e923bba Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Mon, 28 Apr 2025 15:01:16 +0200
Subject: [PATCH 2/6] Added dilation support

---
 aidge_export_cpp/kernels/convolution.hpp | 45 ++++++++++++------------
 1 file changed, 23 insertions(+), 22 deletions(-)

diff --git a/aidge_export_cpp/kernels/convolution.hpp b/aidge_export_cpp/kernels/convolution.hpp
index 38c8ad7..40f22c6 100644
--- a/aidge_export_cpp/kernels/convolution.hpp
+++ b/aidge_export_cpp/kernels/convolution.hpp
@@ -179,23 +179,25 @@ void convolution_depthwise_forward(
 {
     static_assert(NB_OUTPUTS % NB_CHANNELS == 0,
         "NB_OUTPUTS should be a multiple of NB_CHANNELS.");
-    static_assert(DILATION_Y == 1,
-        "DILATION_Y != 1 not supported.");
-    static_assert(DILATION_X == 1,
-        "DILATION_X != 1 not supported.");
+
+    constexpr int DILATED_KERNEL_HEIGHT 
+            = KERNEL_HEIGHT + (DILATION_Y - 1) * (KERNEL_HEIGHT - 1);
+
+    constexpr int DILATED_KERNEL_WIDTH 
+            = KERNEL_WIDTH + (DILATION_X - 1) * (KERNEL_WIDTH - 1);
 
     constexpr int OUTPUTS_HEIGHT_NOPAD
-        = (CHANNELS_HEIGHT - KERNEL_HEIGHT + STRIDE_Y) / STRIDE_Y;
+        = (CHANNELS_HEIGHT - DILATION_Y * (KERNEL_HEIGHT - 1) - 1 + STRIDE_Y) / STRIDE_Y;
     constexpr int OUTPUTS_WIDTH_NOPAD
-        = (CHANNELS_WIDTH - KERNEL_WIDTH + STRIDE_X) / STRIDE_X;
+        = (CHANNELS_WIDTH - DILATION_X * (KERNEL_WIDTH - 1) - 1 + STRIDE_X) / STRIDE_X;
 
     for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
         const int syMin = (PADDING_Y == 0) ? 0
             : max(PADDING_Y - (oy * STRIDE_Y), 0);
         const int syMax = (PADDING_Y == 0
-                && OUTPUTS_HEIGHT == OUTPUTS_HEIGHT_NOPAD) ? KERNEL_HEIGHT
+                && OUTPUTS_HEIGHT == OUTPUTS_HEIGHT_NOPAD) ? DILATED_KERNEL_HEIGHT
             : clamp(CHANNELS_HEIGHT + PADDING_Y - (oy * STRIDE_Y), 
-                    0, KERNEL_HEIGHT);
+                    0, DILATED_KERNEL_HEIGHT);
         const int iy = (oy * STRIDE_Y) - PADDING_Y;
 
 #pragma omp parallel for collapse(2)
@@ -206,9 +208,9 @@ void convolution_depthwise_forward(
                     : max(PADDING_X - (ox * STRIDE_X), 0);
                 const int sxMax = (PADDING_X == 0
                         && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
-                            ? KERNEL_WIDTH
+                            ? DILATED_KERNEL_WIDTH
                     : clamp(CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X), 
-                            0, KERNEL_WIDTH);
+                            0, DILATED_KERNEL_WIDTH);
                 const int ix = (ox * STRIDE_X) - PADDING_X;
 
                 const int oPos = (ox + OUTPUTS_WIDTH * oy);
@@ -222,21 +224,20 @@ void convolution_depthwise_forward(
                 for (int sy = 0; sy < KERNEL_HEIGHT; ++sy) {
                     if ((PADDING_Y != 0
                             || OUTPUTS_HEIGHT != OUTPUTS_HEIGHT_NOPAD)
-                        && sy >= syMax - syMin)
+                        && ((sy*DILATION_Y < syMin) || (sy*DILATION_Y >= syMax)))
                     {
-                        break;
+                        continue;
                     }
 
-                    const int iPos = ((sxMin + ix)
-                                        + CHANNELS_WIDTH * (iy + syMin + sy));
-                    int iOffset = NB_CHANNELS * iPos;
+                    const int iPos = ix + CHANNELS_WIDTH * (iy + sy*DILATION_Y);
+                    const int iOffset = NB_CHANNELS * iPos;
 
-                    const int wOffset = (sxMin
-                        + KERNEL_WIDTH * (syMin + sy + KERNEL_HEIGHT * output));
+                    const int wOffset = (output*KERNEL_HEIGHT + sy) 
+                                        * KERNEL_WIDTH * NB_CHANNELS;
 
-                    if ((PADDING_X == 0
+                    if (DILATION_X == 1 && ((PADDING_X == 0
                             && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
-                        || sxMax - sxMin == KERNEL_WIDTH)
+                        || sxMax - sxMin == KERNEL_WIDTH))
                     {
                         macsOnRange<KERNEL_WIDTH, NB_CHANNELS>(
                             inputs + iOffset + channel, 
@@ -247,13 +248,13 @@ void convolution_depthwise_forward(
                         for (int sx = 0; sx < KERNEL_WIDTH; ++sx) {
                             if ((PADDING_X != 0
                                     || OUTPUTS_WIDTH != OUTPUTS_WIDTH_NOPAD)
-                                && sx >= sxMax - sxMin)
+                                && ((sx*DILATION_X < sxMin) || (sx*DILATION_X >= sxMax)))
                             {
-                                break;
+                                continue;
                             }
 
                             const int iOffsetInRange = iOffset
-                                + sx * NB_CHANNELS;
+                                + sx * DILATION_X * NB_CHANNELS;
 
                             weightedSum += inputs[iOffsetInRange + channel]
                                 * weights[wOffset + sx];
-- 
GitLab


From e37444e768679c5e43f21c73f8587fcfd8d1500a Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Mon, 28 Apr 2025 15:06:20 +0200
Subject: [PATCH 3/6] Added _OPENMP guards

---
 aidge_export_cpp/kernels/convolution.hpp    | 4 ++++
 aidge_export_cpp/kernels/fullyconnected.hpp | 5 +++++
 aidge_export_cpp/kernels/leakyrelu.hpp      | 2 ++
 aidge_export_cpp/kernels/pooling.hpp        | 2 ++
 4 files changed, 13 insertions(+)

diff --git a/aidge_export_cpp/kernels/convolution.hpp b/aidge_export_cpp/kernels/convolution.hpp
index 40f22c6..7df55ab 100644
--- a/aidge_export_cpp/kernels/convolution.hpp
+++ b/aidge_export_cpp/kernels/convolution.hpp
@@ -48,7 +48,9 @@ void convolution_forward(
                     0, DILATED_KERNEL_HEIGHT);
         const int iy = (oy * STRIDE_Y) - PADDING_Y;
 
+#ifdef _OPENMP
 #pragma omp parallel for collapse(2)
+#endif
         for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
             for (int output = 0; output < NB_OUTPUTS; ++output) {
                 // moved to inner loop for collapsing -->
@@ -200,7 +202,9 @@ void convolution_depthwise_forward(
                     0, DILATED_KERNEL_HEIGHT);
         const int iy = (oy * STRIDE_Y) - PADDING_Y;
 
+#ifdef _OPENMP
 #pragma omp parallel for collapse(2)
+#endif
         for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
             for (int output = 0; output < NB_OUTPUTS; ++output) {
                 // moved to inner loop for collapsing -->
diff --git a/aidge_export_cpp/kernels/fullyconnected.hpp b/aidge_export_cpp/kernels/fullyconnected.hpp
index 2780de2..60805e7 100644
--- a/aidge_export_cpp/kernels/fullyconnected.hpp
+++ b/aidge_export_cpp/kernels/fullyconnected.hpp
@@ -28,6 +28,9 @@ void fullyconnected_forward (
     // It is only an issue if the FC was after a flatten layer.
     // Otherwise it is not an issue for the other FC because CHANNELS_WIDTH = CHANNELS_HEIGHT = 1
     // Solution: Add a system to check dataformat
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
     for (int och = 0; och < NB_OUTPUTS; och++) {
 
         Bias_T weightedSum = (biases) ? biases[och] : Bias_T(0);
@@ -45,7 +48,9 @@ void fullyconnected_forward (
     }
 /*
 Here the kernel to use with inputs in NHWC and weights in NHWC
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
     for (int och = 0; och < NB_OUTPUTS; och++) {
 
         Bias_T weightedSum = (biases) ? biases[och] : Bias_T(0);
diff --git a/aidge_export_cpp/kernels/leakyrelu.hpp b/aidge_export_cpp/kernels/leakyrelu.hpp
index 07352cd..5e6598d 100644
--- a/aidge_export_cpp/kernels/leakyrelu.hpp
+++ b/aidge_export_cpp/kernels/leakyrelu.hpp
@@ -11,7 +11,9 @@ void leakyrelu_forward (
     Output_T* __restrict outputs,
     const float negative_slope)
 {
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
     for (int i = 0; i < NB_DATA; ++i) {
         if (inputs[i] >= 0) {
             outputs[i] = inputs[i];
diff --git a/aidge_export_cpp/kernels/pooling.hpp b/aidge_export_cpp/kernels/pooling.hpp
index a86fd41..30fa766 100644
--- a/aidge_export_cpp/kernels/pooling.hpp
+++ b/aidge_export_cpp/kernels/pooling.hpp
@@ -36,7 +36,9 @@ void pooling_forward(
                     0, POOL_HEIGHT);
         const int iy = (oy * STRIDE_Y) - PADDING_Y;
 
+#ifdef _OPENMP
 #pragma omp parallel for collapse(2)
+#endif
         for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
             for (int output = 0; output < NB_OUTPUTS; ++output) {
                 // moved to inner loop for collapsing -->
-- 
GitLab


From 7c75fde34e0301225762f663d55763254f9cb43a Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Mon, 28 Apr 2025 15:35:18 +0200
Subject: [PATCH 4/6] Fixed bug

---
 aidge_export_cpp/kernels/convolution.hpp   | 2 +-
 aidge_export_cpp/unit_tests/test_export.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/aidge_export_cpp/kernels/convolution.hpp b/aidge_export_cpp/kernels/convolution.hpp
index 7df55ab..1669ee8 100644
--- a/aidge_export_cpp/kernels/convolution.hpp
+++ b/aidge_export_cpp/kernels/convolution.hpp
@@ -237,7 +237,7 @@ void convolution_depthwise_forward(
                     const int iOffset = NB_CHANNELS * iPos;
 
                     const int wOffset = (output*KERNEL_HEIGHT + sy) 
-                                        * KERNEL_WIDTH * NB_CHANNELS;
+                                        * KERNEL_WIDTH;
 
                     if (DILATION_X == 1 && ((PADDING_X == 0
                             && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
diff --git a/aidge_export_cpp/unit_tests/test_export.py b/aidge_export_cpp/unit_tests/test_export.py
index 387c595..65db07e 100644
--- a/aidge_export_cpp/unit_tests/test_export.py
+++ b/aidge_export_cpp/unit_tests/test_export.py
@@ -411,7 +411,7 @@ class test_operator_export(unittest.TestCase):
         self.unit_test_export(model, "Conv2D", [[1, 3, 12, 12]], False, False)
 
     def test_export_convDepthWise2D(self):
-        print("Conv2D")
+        print("ConvDepthWise2D")
         model = aidge_core.sequential([
             aidge_core.ConvDepthWise2D(nb_channels=3, kernel_dims=(3, 3), name="conv")
         ])
-- 
GitLab


From cda4b85e8437f5bd722f8451bf765f15f031edb7 Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Mon, 28 Apr 2025 16:20:24 +0200
Subject: [PATCH 5/6] Added no bias indirection

---
 aidge_export_cpp/kernels/convolution.hpp | 41 ++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/aidge_export_cpp/kernels/convolution.hpp b/aidge_export_cpp/kernels/convolution.hpp
index 1669ee8..5855654 100644
--- a/aidge_export_cpp/kernels/convolution.hpp
+++ b/aidge_export_cpp/kernels/convolution.hpp
@@ -272,4 +272,45 @@ void convolution_depthwise_forward(
     }
 }
 
+// Template specialization when biases are not given to the convolution
+template<int NB_CHANNELS,
+         int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
+         int NB_OUTPUTS,
+         int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
+         int PADDING_Y, int PADDING_X,
+         int STRIDE_Y, int STRIDE_X,
+         int DILATION_Y, int DILATION_X,
+         int KERNEL_HEIGHT, int KERNEL_WIDTH,
+         ActivationFunction_T ACTIVATION,
+         typename Input_T, typename Output_T,
+         typename Weight_T,
+         typename Rescaling_T>
+__attribute__((always_inline)) inline
+void convolution_depthwise_forward(
+    const Input_T* __restrict inputs,
+    Output_T* __restrict outputs,
+    const Weight_T* __restrict weights,
+    std::nullptr_t __restrict,
+    const Rescaling_T& __restrict rescaling)
+{
+    const float* b = nullptr;
+
+    convolution_depthwise_forward<NB_CHANNELS,
+                        CHANNELS_HEIGHT,
+                        CHANNELS_WIDTH,
+                        NB_OUTPUTS,
+                        OUTPUTS_HEIGHT,
+                        OUTPUTS_WIDTH,
+                        PADDING_Y,
+                        PADDING_X,
+                        STRIDE_Y,
+                        STRIDE_X,
+                        DILATION_Y,
+                        DILATION_X,
+                        KERNEL_HEIGHT,
+                        KERNEL_WIDTH,
+                        ACTIVATION>
+                        (inputs, outputs, weights, b, rescaling);
+}
+
 #endif  // __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION__
-- 
GitLab


From 5f41dd757c5926e4c6b67f0275886e11f616bdf5 Mon Sep 17 00:00:00 2001
From: Olivier BICHLER <olivier.bichler@cea.fr>
Date: Mon, 28 Apr 2025 16:30:34 +0200
Subject: [PATCH 6/6] Fixed duplicate name

---
 aidge_export_cpp/operators.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/aidge_export_cpp/operators.py b/aidge_export_cpp/operators.py
index b6121e4..cb7a09c 100644
--- a/aidge_export_cpp/operators.py
+++ b/aidge_export_cpp/operators.py
@@ -188,7 +188,7 @@ class PaddedConvCPP(ExportNodeCpp):
         _setup_conv2D(self)
 
 @ExportLibCpp.register("ConvDepthWise2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
-class ConvCPP(ExportNodeCpp):
+class ConvDepthWiseCPP(ExportNodeCpp):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
         self.attributes["depthwise"] = True
@@ -200,7 +200,7 @@ class ConvCPP(ExportNodeCpp):
         _setup_conv2D(self)
 
 @ExportLibCpp.register_metaop("PaddedConvDepthWise2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
-class PaddedConvCPP(ExportNodeCpp):
+class PaddedConvDepthWiseCPP(ExportNodeCpp):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
         self.attributes["depthwise"] = True
-- 
GitLab