Commit 490df27c authored by Olivier BICHLER

Added ConvDepthWise support

parent 07226da3
2 merge requests: !49 (0.3.1), !40 (Add ConvDepthWise support)
Pipeline #71606 passed
@@ -62,7 +62,7 @@ void convolution_forward(
            const int ix = (ox * STRIDE_X) - PADDING_X;
            const int oPos = (ox + OUTPUTS_WIDTH * oy);
-           int oOffset = NB_OUTPUTS * oPos;
+           const int oOffset = NB_OUTPUTS * oPos;
            // <--

            // Check if the biases are defined
@@ -77,7 +77,7 @@ void convolution_forward(
            }

            const int iPos = ix + CHANNELS_WIDTH * (iy + sy*DILATION_Y);
-           int iOffset = NB_CHANNELS * iPos;
+           const int iOffset = NB_CHANNELS * iPos;

            const int wOffset = (output*KERNEL_HEIGHT + sy) * KERNEL_WIDTH * NB_CHANNELS;
@@ -98,7 +98,7 @@ void convolution_forward(
                    continue;
                }

-               int iOffsetInRange = iOffset
+               const int iOffsetInRange = iOffset
                    + sx * DILATION_X * NB_CHANNELS;

                macsOnRange<NB_CHANNELS>(
@@ -157,4 +157,114 @@ void convolution_forward(
        (inputs, outputs, weights, b, rescaling);
}

template<int NB_CHANNELS,
         int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
         int NB_OUTPUTS,
         int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
         int PADDING_Y, int PADDING_X,
         int STRIDE_Y, int STRIDE_X,
         int DILATION_Y, int DILATION_X,
         int KERNEL_HEIGHT, int KERNEL_WIDTH,
         ActivationFunction_T ACTIVATION,
         typename Input_T, typename Output_T,
         typename Weight_T, typename Bias_T,
         typename Rescaling_T>
__attribute__((always_inline)) inline
void convolution_depthwise_forward(
    const Input_T* __restrict inputs,
    Output_T* __restrict outputs,
    const Weight_T* __restrict weights,
    const Bias_T* __restrict biases,
    const Rescaling_T& __restrict rescaling)
{
    static_assert(NB_OUTPUTS % NB_CHANNELS == 0,
        "NB_OUTPUTS should be a multiple of NB_CHANNELS.");
    static_assert(DILATION_Y == 1,
        "DILATION_Y != 1 not supported.");
    static_assert(DILATION_X == 1,
        "DILATION_X != 1 not supported.");

    constexpr int OUTPUTS_HEIGHT_NOPAD
        = (CHANNELS_HEIGHT - KERNEL_HEIGHT + STRIDE_Y) / STRIDE_Y;
    constexpr int OUTPUTS_WIDTH_NOPAD
        = (CHANNELS_WIDTH - KERNEL_WIDTH + STRIDE_X) / STRIDE_X;

    for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
        const int syMin = (PADDING_Y == 0) ? 0
            : max(PADDING_Y - (oy * STRIDE_Y), 0);
        const int syMax = (PADDING_Y == 0
                && OUTPUTS_HEIGHT == OUTPUTS_HEIGHT_NOPAD) ? KERNEL_HEIGHT
            : clamp(CHANNELS_HEIGHT + PADDING_Y - (oy * STRIDE_Y),
                    0, KERNEL_HEIGHT);
        const int iy = (oy * STRIDE_Y) - PADDING_Y;

#pragma omp parallel for collapse(2)
        for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
            for (int output = 0; output < NB_OUTPUTS; ++output) {
                // moved to inner loop for collapsing -->
                const int sxMin = (PADDING_X == 0) ? 0
                    : max(PADDING_X - (ox * STRIDE_X), 0);
                const int sxMax = (PADDING_X == 0
                        && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
                    ? KERNEL_WIDTH
                    : clamp(CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X),
                            0, KERNEL_WIDTH);
                const int ix = (ox * STRIDE_X) - PADDING_X;
                const int oPos = (ox + OUTPUTS_WIDTH * oy);
                const int oOffset = NB_OUTPUTS * oPos;
                // <--

                // Depthwise mapping: each output channel reads a single
                // input channel (NB_OUTPUTS / NB_CHANNELS is the multiplier).
                const int channel = (output * NB_CHANNELS) / NB_OUTPUTS;

                Bias_T weightedSum = biases ? biases[output] : 0;

                for (int sy = 0; sy < KERNEL_HEIGHT; ++sy) {
                    if ((PADDING_Y != 0
                            || OUTPUTS_HEIGHT != OUTPUTS_HEIGHT_NOPAD)
                        && sy >= syMax - syMin)
                    {
                        break;
                    }

                    const int iPos = ((sxMin + ix)
                        + CHANNELS_WIDTH * (iy + syMin + sy));
                    int iOffset = NB_CHANNELS * iPos;

                    const int wOffset = (sxMin
                        + KERNEL_WIDTH * (syMin + sy + KERNEL_HEIGHT * output));

                    if ((PADDING_X == 0
                            && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
                        || sxMax - sxMin == KERNEL_WIDTH)
                    {
                        macsOnRange<KERNEL_WIDTH, NB_CHANNELS>(
                            inputs + iOffset + channel,
                            weights + wOffset,
                            weightedSum);
                    }
                    else {
                        for (int sx = 0; sx < KERNEL_WIDTH; ++sx) {
                            if ((PADDING_X != 0
                                    || OUTPUTS_WIDTH != OUTPUTS_WIDTH_NOPAD)
                                && sx >= sxMax - sxMin)
                            {
                                break;
                            }

                            const int iOffsetInRange = iOffset
                                + sx * NB_CHANNELS;

                            weightedSum += inputs[iOffsetInRange + channel]
                                * weights[wOffset + sx];
                        }
                    }
                }

                outputs[oOffset + output] = activation_forward_value<Output_T>(
                    weightedSum, output, ACTIVATION, rescaling);
            }
        }
    }
}
#endif // __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION__
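
For intuition, the new kernel can be cross-checked against a naive reference. The sketch below is not part of the commit: it is a plain NumPy model of the same computation, assuming the kernel's HWC (channels-last) layout, unit dilation, and `NB_OUTPUTS` a multiple of `NB_CHANNELS`, with output channel `o` reading input channel `(o * NB_CHANNELS) // NB_OUTPUTS` exactly as in the C++ loop.

```python
import numpy as np

def depthwise_conv2d_ref(inputs, weights, biases=None,
                         stride=(1, 1), padding=(0, 0)):
    """Naive depthwise convolution in HWC (channels-last) layout.

    inputs:  (H, W, C) array
    weights: (O, KH, KW) array, one 2D kernel per output channel
    biases:  optional (O,) array
    """
    H, W, C = inputs.shape
    O, KH, KW = weights.shape
    assert O % C == 0, "O must be a multiple of C"
    sy, sx = stride
    py, px = padding
    padded = np.pad(inputs, ((py, py), (px, px), (0, 0)))
    out_h = (H + 2 * py - KH) // sy + 1
    out_w = (W + 2 * px - KW) // sx + 1
    out = np.zeros((out_h, out_w, O))
    for oy in range(out_h):
        for ox in range(out_w):
            for o in range(O):
                # Same mapping as the C++ kernel:
                # channel = (output * NB_CHANNELS) / NB_OUTPUTS
                c = (o * C) // O
                patch = padded[oy*sy:oy*sy+KH, ox*sx:ox*sx+KW, c]
                out[oy, ox, o] = (patch * weights[o]).sum()
                if biases is not None:
                    out[oy, ox, o] += biases[o]
    return out
```

With `O == C` (channel multiplier 1) this reduces to the classic depthwise case exercised by the new `test_export_convDepthWise2D` unit test further down.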
@@ -187,6 +187,39 @@ class PaddedConvCPP(ExportNodeCpp):
        _setup_conv2D(self)

@ExportLibCpp.register("ConvDepthWise2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
class ConvCPP(ExportNodeCpp):
    def __init__(self, node, mem_info):
        super().__init__(node, mem_info)
        self.attributes["depthwise"] = True
        # ConvDepthWise2D has no padding of its own;
        # use PaddedConvDepthWise2D to add a padding attribute.
        self.attributes["padding"] = [0, 0]
        _setup_conv2D(self)

@ExportLibCpp.register_metaop("PaddedConvDepthWise2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
class PaddedConvCPP(ExportNodeCpp):
    def __init__(self, node, mem_info):
        super().__init__(node, mem_info)
        self.attributes["depthwise"] = True
        # TODO: find a way to retrieve attributes for a meta operator
        for n in self.operator.get_micro_graph().get_nodes():
            if n.type() == "Pad2D":
                self.attributes["padding"] = n.get_operator().attr.begin_end_borders
            if n.type() == "ConvDepthWise2D":
                self.attributes["kernel_dims"] = n.get_operator().attr.kernel_dims
                self.attributes["stride_dims"] = n.get_operator().attr.stride_dims
                self.attributes["dilation_dims"] = n.get_operator().attr.dilation_dims
        _setup_conv2D(self)
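
A usage sketch (not part of the commit) of the two registrations: a bare `ConvDepthWise2D` node is dispatched to the first class, while a padding-fused metaop is dispatched to the second. The `aidge_core.PaddedConvDepthWise2D` factory and its keyword names below are assumptions by analogy with the `PaddedConv2D` path earlier in this file; check them against the aidge_core API.

```python
import aidge_core

# Bare depthwise conv: matches the "ConvDepthWise2D" registration.
plain = aidge_core.sequential([
    aidge_core.ConvDepthWise2D(nb_channels=3, kernel_dims=(3, 3), name="dw")
])

# Fused Pad2D + ConvDepthWise2D metaop: matches "PaddedConvDepthWise2D".
# Factory name and padding_dims layout are assumed, not confirmed.
padded = aidge_core.sequential([
    aidge_core.PaddedConvDepthWise2D(nb_channels=3, kernel_dims=(3, 3),
                                     padding_dims=(1, 1, 1, 1), name="dw_pad")
])
```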
def _setup_elemwise_op(elemwise, op):
    """Common code (template and kernel setup) shared across the different element-wise operators (Add, Sub, ...)."""
......
{% filter indent(width=4, first=False) %}
{% include "./_mem_offset.jinja" %}
-    convolution_forward<{{ in_name[0]|upper }}_NB_CHANNELS,
+    convolution{{ "_depthwise" if depthwise is defined else "" }}_forward<{{ in_name[0]|upper }}_NB_CHANNELS,
                         {{ in_name[0]|upper }}_IN_HEIGHT,
                         {{ in_name[0]|upper }}_IN_WIDTH,
                         {{ out_name[0]|upper }}_NB_OUTPUTS,
......
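
The one-line template change can be sanity-checked outside the export pipeline. A minimal standalone snippet using the `jinja2` package (illustrative only, not part of the commit):

```python
from jinja2 import Template

t = Template('convolution{{ "_depthwise" if depthwise is defined else "" }}_forward')

print(t.render(depthwise=True))  # -> convolution_depthwise_forward
print(t.render())                # -> convolution_forward
```

Because only the depthwise export classes set `self.attributes["depthwise"]`, regular `Conv2D` nodes keep rendering the original `convolution_forward` call.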
@@ -410,6 +410,14 @@ class test_operator_export(unittest.TestCase):
        self.unit_test_export(model, "Conv2D", [[1, 3, 12, 12]], False, False)

    def test_export_convDepthWise2D(self):
        print("ConvDepthWise2D")
        model = aidge_core.sequential([
            aidge_core.ConvDepthWise2D(nb_channels=3, kernel_dims=(3, 3), name="conv")
        ])
        self.unit_test_export(model, "ConvDepthWise2D", [[1, 3, 12, 12]], False, False)

    def test_export_max_pooling(self):
        print("MaxPooling2D")
        model = aidge_core.sequential([
......