Commit af87b9e5 authored by Matthew Newson

added parameters groups

parent 9f86629b
Pipeline #73948 failed
@@ -6,159 +6,77 @@
 #include "network/utils.hpp"
 #include "network/macs.hpp"
 #include "network/activation_utils.hpp"
+#include <omp.h>
+// Weights index en NHWC
+constexpr int inds_pos(int n, int c, int h, int w, int N, int C, int H, int W) {
+    return n * (H * W * C) +
+           h * (W * C) +
+           w * C +
+           c;
+}
+// Image index in CHW
+constexpr int inds_pos(int c, int h, int w, int C, int H, int W) {
+    return c * (H * W) +
+           h * W +
+           w;
+}
 template<int NB_CHANNELS,
-         int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
-         int NB_OUTPUTS,
-         int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
+         int IN_HEIGHT, int IN_WIDTH,
+         int NB_OUTPUTS, int GROUPS,
+         int OUT_HEIGHT, int OUT_WIDTH,
          int PADDING_Y, int PADDING_X,
          int STRIDE_Y, int STRIDE_X,
          int DILATION_Y, int DILATION_X,
          int KERNEL_HEIGHT, int KERNEL_WIDTH,
         ActivationFunction_T ACTIVATION,
         typename Input_T, typename Output_T,
         typename Weight_T, typename Bias_T,
         typename Rescaling_T>
 __attribute__((always_inline)) inline
 void convolution_depthwise_forward(
     const Input_T* __restrict inputs,
     Output_T* __restrict outputs,
     const Weight_T* __restrict weights,
     const Bias_T* __restrict biases,
     const Rescaling_T& __restrict rescaling)
 {
-    static_assert(NB_OUTPUTS % NB_CHANNELS == 0,
-        "NB_OUTPUTS should be a multiple of NB_CHANNELS.");
-    constexpr int DILATED_KERNEL_HEIGHT
-            = KERNEL_HEIGHT + (DILATION_Y - 1) * (KERNEL_HEIGHT - 1);
-    constexpr int DILATED_KERNEL_WIDTH
-            = KERNEL_WIDTH + (DILATION_X - 1) * (KERNEL_WIDTH - 1);
-    constexpr int OUTPUTS_HEIGHT_NOPAD
-        = (CHANNELS_HEIGHT - DILATION_Y * (KERNEL_HEIGHT - 1) - 1 + STRIDE_Y) / STRIDE_Y;
-    constexpr int OUTPUTS_WIDTH_NOPAD
-        = (CHANNELS_WIDTH - DILATION_X * (KERNEL_WIDTH - 1) - 1 + STRIDE_X) / STRIDE_X;
-    for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
-        const int syMin = (PADDING_Y == 0) ? 0
-            : max(PADDING_Y - (oy * STRIDE_Y), 0);
-        const int syMax = (PADDING_Y == 0
-                && OUTPUTS_HEIGHT == OUTPUTS_HEIGHT_NOPAD) ? DILATED_KERNEL_HEIGHT
-            : clamp(CHANNELS_HEIGHT + PADDING_Y - (oy * STRIDE_Y),
-                    0, DILATED_KERNEL_HEIGHT);
-        const int iy = (oy * STRIDE_Y) - PADDING_Y;
-#ifdef _OPENMP
-#pragma omp parallel for collapse(2)
-#endif
-        for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
-            for (int output = 0; output < NB_OUTPUTS; ++output) {
-                // moved to inner loop for collapsing -->
-                const int sxMin = (PADDING_X == 0) ? 0
-                    : max(PADDING_X - (ox * STRIDE_X), 0);
-                const int sxMax = (PADDING_X == 0
-                        && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
-                            ? DILATED_KERNEL_WIDTH
-                    : clamp(CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X),
-                            0, DILATED_KERNEL_WIDTH);
-                const int ix = (ox * STRIDE_X) - PADDING_X;
-                const int oPos = (ox + OUTPUTS_WIDTH * oy);
-                const int oOffset = NB_OUTPUTS * oPos;
-                // <--
-                const int channel = (output * NB_CHANNELS) / NB_OUTPUTS;
-                Bias_T weightedSum = biases ? biases[output] : 0;
-                for (int sy = 0; sy < KERNEL_HEIGHT; ++sy) {
-                    if ((PADDING_Y != 0
-                            || OUTPUTS_HEIGHT != OUTPUTS_HEIGHT_NOPAD)
-                        && ((sy*DILATION_Y < syMin) || (sy*DILATION_Y >= syMax)))
-                    {
-                        continue;
-                    }
-                    const int iPos = ix + CHANNELS_WIDTH * (iy + sy*DILATION_Y);
-                    const int iOffset = NB_CHANNELS * iPos;
-                    const int wOffset = (output*KERNEL_HEIGHT + sy)
-                        * KERNEL_WIDTH;
-                    if (DILATION_X == 1 && ((PADDING_X == 0
-                        && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
-                        || sxMax - sxMin == KERNEL_WIDTH))
-                    {
-                        macsOnRange<KERNEL_WIDTH, NB_CHANNELS>(
-                            inputs + iOffset + channel,
-                            weights + wOffset,
-                            weightedSum);
-                    }
-                    else {
-                        for (int sx = 0; sx < KERNEL_WIDTH; ++sx) {
-                            if ((PADDING_X != 0
-                                    || OUTPUTS_WIDTH != OUTPUTS_WIDTH_NOPAD)
-                                && ((sx*DILATION_X < sxMin) || (sx*DILATION_X >= sxMax)))
-                            {
-                                continue;
-                            }
-                            const int iOffsetInRange = iOffset
-                                + sx * DILATION_X * NB_CHANNELS;
-                            weightedSum += inputs[iOffsetInRange + channel]
-                                * weights[wOffset + sx];
-                        }
-                    }
-                }
-                outputs[oOffset + output] = activation_forward_value<Output_T>(weightedSum, output, ACTIVATION, rescaling);
-            }
-        }
+    int c_in_g = NB_CHANNELS / GROUPS;
+    int c_out_g = NB_OUTPUTS / GROUPS;
+#pragma omp parallel for
+    for (int oc = 0; oc < NB_OUTPUTS; oc++) {
+        int g_oc = oc / c_out_g;
+#pragma omp parallel for
+        for (int i = 0; i < OUT_HEIGHT; ++i) {
+#pragma omp parallel for
+            for (int j = 0; j < OUT_WIDTH; ++j) {
+                Output_T value = biases[oc];
+#pragma omp parallel for
+                for (int ic = g_oc * c_in_g; ic < (g_oc + 1) * c_in_g; ++ic) {
+#pragma omp parallel for
+                    for (int m = 0; m < KERNEL_HEIGHT; ++m) {
+#pragma omp parallel for
+                        for (int n = 0; n < KERNEL_WIDTH; ++n) {
+                            int i_p = i * STRIDE_X - PADDING_X + m * DILATION_X;
+                            int j_p = j * STRIDE_Y - PADDING_Y + n * DILATION_Y;
+                            if (i_p >= 0 && i_p < IN_HEIGHT && j_p >= 0 && j_p < IN_WIDTH) {
+                                value += weights[inds_pos(oc, ic % c_in_g, m, n, NB_OUTPUTS, c_in_g, KERNEL_HEIGHT, KERNEL_WIDTH)] *
+                                         inputs[inds_pos(ic, i_p, j_p, NB_CHANNELS, IN_HEIGHT, IN_WIDTH)];
+                            }
+                        }
+                    }
+                }
+                outputs[inds_pos(oc, i, j, NB_OUTPUTS, OUT_HEIGHT, OUT_WIDTH)] = activation_forward_value<Output_T>(value, oc, ACTIVATION, rescaling);
+            }
+        }
     }
 }
-// Template specialization when biases are not given to the convolution
-template<int NB_CHANNELS,
-         int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
-         int NB_OUTPUTS,
-         int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
-         int PADDING_Y, int PADDING_X,
-         int STRIDE_Y, int STRIDE_X,
-         int DILATION_Y, int DILATION_X,
-         int KERNEL_HEIGHT, int KERNEL_WIDTH,
-         ActivationFunction_T ACTIVATION,
-         typename Input_T, typename Output_T,
-         typename Weight_T,
-         typename Rescaling_T>
-__attribute__((always_inline)) inline
-void convolution_depthwise_forward(
-    const Input_T* __restrict inputs,
-    Output_T* __restrict outputs,
-    const Weight_T* __restrict weights,
-    std::nullptr_t __restrict,
-    const Rescaling_T& __restrict rescaling)
-{
-    const float* b = nullptr;
-    convolution_depthwise_forward<NB_CHANNELS,
-                                  CHANNELS_HEIGHT,
-                                  CHANNELS_WIDTH,
-                                  NB_OUTPUTS,
-                                  OUTPUTS_HEIGHT,
-                                  OUTPUTS_WIDTH,
-                                  PADDING_Y,
-                                  PADDING_X,
-                                  STRIDE_Y,
-                                  STRIDE_X,
-                                  DILATION_Y,
-                                  DILATION_X,
-                                  KERNEL_HEIGHT,
-                                  KERNEL_WIDTH,
-                                  ACTIVATION>
-        (inputs, outputs, weights, b, rescaling);
-}
 #endif // __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION_DEPTHWISE__
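The rewritten kernel replaces the old pointer-offset arithmetic with the two constexpr inds_pos helpers (NHWC for weights, CHW for activations) and partitions channels by group: output channel oc belongs to group g_oc = oc / c_out_g and only reads the c_in_g input channels of that group. The standalone sketch below is not part of the commit; it re-derives the same index formulas under different helper names and checks the two configurations the Python exports use, GROUPS == 1 (ordinary convolution, every input channel contributes) and GROUPS == number of channels (depthwise, one channel per group). All dimension values are made up for the example.

#include <cassert>
#include <iostream>

// Same index formulas as the committed inds_pos() helpers, renamed so the
// sketch stands alone: weights are laid out NHWC, activations CHW.
constexpr int w_pos(int n, int c, int h, int w, int C, int H, int W) {
    return n * (H * W * C) + h * (W * C) + w * C + c;
}
constexpr int i_pos(int c, int h, int w, int H, int W) {
    return c * (H * W) + h * W + w;
}

int main() {
    constexpr int NB_CHANNELS = 8, NB_OUTPUTS = 8;      // made-up layer size

    // Depthwise configuration (ConvDw export): one group per channel.
    {
        constexpr int GROUPS  = NB_CHANNELS;
        constexpr int c_in_g  = NB_CHANNELS / GROUPS;   // == 1
        constexpr int c_out_g = NB_OUTPUTS / GROUPS;    // == 1
        static_assert(c_in_g == 1 && c_out_g == 1, "depthwise: one channel per group");
        for (int oc = 0; oc < NB_OUTPUTS; ++oc) {
            const int g_oc = oc / c_out_g;              // group of this output channel
            assert(g_oc * c_in_g == oc);                // it reads only its own input channel
        }
    }

    // Standard configuration (Conv export): a single group.
    {
        constexpr int GROUPS = 1;
        static_assert(NB_CHANNELS / GROUPS == NB_CHANNELS,
                      "groups == 1: every input channel feeds every output channel");
    }

    // Index helpers on a 3x3 single-channel filter and a 5x5 CHW feature map.
    std::cout << w_pos(2, 0, 1, 1, /*C=*/1, /*H=*/3, /*W=*/3) << "\n";  // prints 22
    std::cout << i_pos(3, 4, 2, /*H=*/5, /*W=*/5) << "\n";              // prints 97
    return 0;
}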
@@ -16,6 +16,9 @@ class Conv(ExportNodeCpp):
         self.attributes["rescaling"] = "NoScaling"
         self.attributes["shift_value"] = 0
+        ## Groups
+        self.attributes["groups"] = 1
         # Browse the metaop to update kernel attributes
         ConvNode = get_node_from_metaop(node, "Conv2D")
         self.attributes["kernel_dims"] = ConvNode[0].get_operator().attr.kernel_dims
......
@@ -17,6 +17,9 @@ class ConvDw(ExportNodeCpp):
         self.attributes["rescaling"] = "NoScaling"
         self.attributes["shift_value"] = 0
+        ## Groups
+        self.attributes["groups"] = self.attributes["out_chan"][0]
         # Browse the metaop to update kernel attributes
         ConvDwNode = get_node_from_metaop(node, "ConvDepthWise2D")
         self.attributes["kernel_dims"] = ConvDwNode[0].get_operator().attr.kernel_dims
......
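For the depthwise export, groups is set to the number of output channels, so the kernel's c_in_g = NB_CHANNELS / GROUPS evaluates to 1 and the NHWC weight index collapses to one contiguous KERNEL_HEIGHT x KERNEL_WIDTH filter per output channel. A small standalone check of that collapse (not part of the export; the helper is renamed and the sizes are illustrative):

#include <cassert>

// NHWC weight-index formula from the kernel, renamed for this standalone check.
constexpr int w_pos(int n, int c, int h, int w, int C, int H, int W) {
    return n * (H * W * C) + h * (W * C) + w * C + c;
}

int main() {
    // Hypothetical depthwise layer: groups == out_chan, hence c_in_g == 1.
    constexpr int NB_OUTPUTS = 4, KH = 3, KW = 3, c_in_g = 1;
    for (int oc = 0; oc < NB_OUTPUTS; ++oc)
        for (int m = 0; m < KH; ++m)
            for (int n = 0; n < KW; ++n)
                // With a single input channel per group the index is simply one
                // contiguous KH x KW filter per output channel.
                assert(w_pos(oc, 0, m, n, c_in_g, KH, KW) == (oc * KH + m) * KW + n);
    return 0;
}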
@@ -5,6 +5,7 @@
 {# For layer configuration -#}
 {% include "./_def_io.jinja" %}
 {% include "./_meminfo.jinja" %}
+#define {{ name|upper }}_GROUPS {{ groups }}
 #define {{ name|upper }}_PADDING_Y {{ padding[0] }}
 #define {{ name|upper }}_PADDING_X {{ padding[1] }}
 #define {{ name|upper }}_STRIDE_Y {{ stride_dims[0] }}
......
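The configuration template now emits a _GROUPS define alongside the padding and stride defines, and the forward-call template that follows passes it as an additional template argument. As a rough illustration only, for a hypothetical layer named conv1 with groups = 1 and unit padding and stride, the visible part of the generated header might look like:

// Rough illustration only: possible generated configuration block for a
// hypothetical layer named "conv1". The real header also contains the I/O
// and memory defines emitted by _def_io.jinja and _meminfo.jinja.
#define CONV1_GROUPS    1
#define CONV1_PADDING_Y 1
#define CONV1_PADDING_X 1
#define CONV1_STRIDE_Y  1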
@@ -4,6 +4,7 @@ convolution{{ "_depthwise" if depthwise is defined else "" }}_forward<{{ in_name
     {{ in_name[0]|upper }}_IN_HEIGHT,
     {{ in_name[0]|upper }}_IN_WIDTH,
     {{ out_name[0]|upper }}_NB_OUTPUTS,
+    {{name|upper}}_GROUPS,
     {{ out_name[0]|upper }}_OUT_HEIGHT,
     {{ out_name[0]|upper }}_OUT_WIDTH,
     {{name|upper}}_PADDING_Y,
......