diff --git a/aidge_export_cpp/export_utils.py b/aidge_export_cpp/export_utils.py
index 4cbc38d7d43d56893dd093455310f891226023d6..e22524fb9058dfb4c8b023d0df8fbe11e2ff791b 100644
--- a/aidge_export_cpp/export_utils.py
+++ b/aidge_export_cpp/export_utils.py
@@ -27,6 +27,12 @@ def cpp_fuse_to_metaops(graph_view: aidge_core.GraphView):
         "ConvAct": "(QConv|Conv2D)->ReLU",
         "PadConvAct": "PadConv->ReLU",
 
+        # ConvDw
+        "QConvDw": "ConvDepthWise2D->(Quantizer|QMul)",
+        "PadConvDw": "(QConvDw|ConvDepthWise2D)<-Pad2D",
+        "ConvDwAct": "(QConvDw|ConvDepthWise2D)->ReLU",
+        "PadConvDwAct": "PadConvDw->ReLU",
+
         # Max Pooling
         "PadMaxPool": "MaxPooling2D<-Pad2D",
         "MaxPoolAct": "MaxPooling2D->ReLU",
diff --git a/aidge_export_cpp/kernels/convolution.hpp b/aidge_export_cpp/kernels/convolution.hpp
index 0648d80f2b891c9b10cc6653649221974379db55..ed62401e69ff8d53b23ba9f88917bb54acd3740a 100644
--- a/aidge_export_cpp/kernels/convolution.hpp
+++ b/aidge_export_cpp/kernels/convolution.hpp
@@ -159,158 +159,4 @@ void convolution_forward(
         (inputs, outputs, weights, b, rescaling);
 }
 
-template<int NB_CHANNELS,
-         int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
-         int NB_OUTPUTS,
-         int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
-         int PADDING_Y, int PADDING_X,
-         int STRIDE_Y, int STRIDE_X,
-         int DILATION_Y, int DILATION_X,
-         int KERNEL_HEIGHT, int KERNEL_WIDTH,
-         ActivationFunction_T ACTIVATION,
-         typename Input_T, typename Output_T,
-         typename Weight_T, typename Bias_T,
-         typename Rescaling_T>
-__attribute__((always_inline)) inline
-void convolution_depthwise_forward(
-    const Input_T* __restrict inputs,
-    Output_T* __restrict outputs,
-    const Weight_T* __restrict weights,
-    const Bias_T* __restrict biases,
-    const Rescaling_T& __restrict rescaling)
-{
-    static_assert(NB_OUTPUTS % NB_CHANNELS == 0,
-        "NB_OUTPUTS should be a multiple of NB_CHANNELS.");
-
-    constexpr int DILATED_KERNEL_HEIGHT
-            = KERNEL_HEIGHT + (DILATION_Y - 1) * (KERNEL_HEIGHT - 1);
-
-    constexpr int DILATED_KERNEL_WIDTH
-            = KERNEL_WIDTH + (DILATION_X - 1) * (KERNEL_WIDTH - 1);
-
-    constexpr int OUTPUTS_HEIGHT_NOPAD
-        = (CHANNELS_HEIGHT - DILATION_Y * (KERNEL_HEIGHT - 1) - 1 + STRIDE_Y) / STRIDE_Y;
-    constexpr int OUTPUTS_WIDTH_NOPAD
-        = (CHANNELS_WIDTH - DILATION_X * (KERNEL_WIDTH - 1) - 1 + STRIDE_X) / STRIDE_X;
-
-    for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
-        const int syMin = (PADDING_Y == 0) ? 0
-            : max(PADDING_Y - (oy * STRIDE_Y), 0);
-        const int syMax = (PADDING_Y == 0
-                && OUTPUTS_HEIGHT == OUTPUTS_HEIGHT_NOPAD) ? DILATED_KERNEL_HEIGHT
-            : clamp(CHANNELS_HEIGHT + PADDING_Y - (oy * STRIDE_Y),
-                    0, DILATED_KERNEL_HEIGHT);
-        const int iy = (oy * STRIDE_Y) - PADDING_Y;
-
-#ifdef _OPENMP
-#pragma omp parallel for collapse(2)
-#endif
-        for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
-            for (int output = 0; output < NB_OUTPUTS; ++output) {
-                // moved to inner loop for collapsing -->
-                const int sxMin = (PADDING_X == 0) ? 0
-                    : max(PADDING_X - (ox * STRIDE_X), 0);
-                const int sxMax = (PADDING_X == 0
-                        && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
-                            ? DILATED_KERNEL_WIDTH
-                    : clamp(CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X),
-                            0, DILATED_KERNEL_WIDTH);
-                const int ix = (ox * STRIDE_X) - PADDING_X;
-
-                const int oPos = (ox + OUTPUTS_WIDTH * oy);
-                const int oOffset = NB_OUTPUTS * oPos;
-                // <--
-
-                const int channel = (output * NB_CHANNELS) / NB_OUTPUTS;
-
-                Bias_T weightedSum = biases ? biases[output] : 0;
-
-                for (int sy = 0; sy < KERNEL_HEIGHT; ++sy) {
-                    if ((PADDING_Y != 0
-                            || OUTPUTS_HEIGHT != OUTPUTS_HEIGHT_NOPAD)
-                        && ((sy*DILATION_Y < syMin) || (sy*DILATION_Y >= syMax)))
-                    {
-                        continue;
-                    }
-
-                    const int iPos = ix + CHANNELS_WIDTH * (iy + sy*DILATION_Y);
-                    const int iOffset = NB_CHANNELS * iPos;
-
-                    const int wOffset = (output*KERNEL_HEIGHT + sy)
-                        * KERNEL_WIDTH;
-
-                    if (DILATION_X == 1 && ((PADDING_X == 0
-                            && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
-                        || sxMax - sxMin == KERNEL_WIDTH))
-                    {
-                        macsOnRange<KERNEL_WIDTH, NB_CHANNELS>(
-                            inputs + iOffset + channel,
-                            weights + wOffset,
-                            weightedSum);
-                    }
-                    else {
-                        for (int sx = 0; sx < KERNEL_WIDTH; ++sx) {
-                            if ((PADDING_X != 0
-                                    || OUTPUTS_WIDTH != OUTPUTS_WIDTH_NOPAD)
-                                && ((sx*DILATION_X < sxMin) || (sx*DILATION_X >= sxMax)))
-                            {
-                                continue;
-                            }
-
-                            const int iOffsetInRange = iOffset
-                                + sx * DILATION_X * NB_CHANNELS;
-
-                            weightedSum += inputs[iOffsetInRange + channel]
-                                * weights[wOffset + sx];
-                        }
-                    }
-                }
-
-                outputs[oOffset + output] = activation_forward_value<Output_T>(weightedSum, output, ACTIVATION, rescaling);
-            }
-        }
-    }
-}
-
-// Template specialization when biases are not given to the convolution
-template<int NB_CHANNELS,
-         int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
-         int NB_OUTPUTS,
-         int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
-         int PADDING_Y, int PADDING_X,
-         int STRIDE_Y, int STRIDE_X,
-         int DILATION_Y, int DILATION_X,
-         int KERNEL_HEIGHT, int KERNEL_WIDTH,
-         ActivationFunction_T ACTIVATION,
-         typename Input_T, typename Output_T,
-         typename Weight_T,
-         typename Rescaling_T>
-__attribute__((always_inline)) inline
-void convolution_depthwise_forward(
-    const Input_T* __restrict inputs,
-    Output_T* __restrict outputs,
-    const Weight_T* __restrict weights,
-    std::nullptr_t __restrict,
-    const Rescaling_T& __restrict rescaling)
-{
-    const float* b = nullptr;
-
-    convolution_depthwise_forward<NB_CHANNELS,
-                                  CHANNELS_HEIGHT,
-                                  CHANNELS_WIDTH,
-                                  NB_OUTPUTS,
-                                  OUTPUTS_HEIGHT,
-                                  OUTPUTS_WIDTH,
-                                  PADDING_Y,
-                                  PADDING_X,
-                                  STRIDE_Y,
-                                  STRIDE_X,
-                                  DILATION_Y,
-                                  DILATION_X,
-                                  KERNEL_HEIGHT,
-                                  KERNEL_WIDTH,
-                                  ACTIVATION>
-        (inputs, outputs, weights, b, rescaling);
-}
-
 #endif  // __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION__
diff --git a/aidge_export_cpp/kernels/convolution_depthwise.hpp b/aidge_export_cpp/kernels/convolution_depthwise.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..244dd86bc01be7142474380f1e3393ce32446aaf
--- /dev/null
+++ b/aidge_export_cpp/kernels/convolution_depthwise.hpp
@@ -0,0 +1,164 @@
+#ifndef __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION_DEPTHWISE__
+#define __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION_DEPTHWISE__
+
+#include "network/typedefs.hpp"
+#include "network/rescaling_utils.hpp"
+#include "network/utils.hpp"
+#include "network/macs.hpp"
+#include "network/activation_utils.hpp"
+
+template<int NB_CHANNELS,
+         int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
+         int NB_OUTPUTS,
+         int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
+         int PADDING_Y, int PADDING_X,
+         int STRIDE_Y, int STRIDE_X,
+         int DILATION_Y, int DILATION_X,
+         int KERNEL_HEIGHT, int KERNEL_WIDTH,
+         ActivationFunction_T ACTIVATION,
+         typename Input_T, typename Output_T,
+         typename Weight_T, typename Bias_T,
+         typename Rescaling_T>
+__attribute__((always_inline)) inline
+void convolution_depthwise_forward(
+    const Input_T* __restrict inputs,
+    Output_T* __restrict outputs,
+    const Weight_T* __restrict weights,
+    const Bias_T* __restrict biases,
+    const Rescaling_T& __restrict rescaling)
+{
+    static_assert(NB_OUTPUTS % NB_CHANNELS == 0,
+        "NB_OUTPUTS should be a multiple of NB_CHANNELS.");
"NB_OUTPUTS should be a multiple of NB_CHANNELS."); + + constexpr int DILATED_KERNEL_HEIGHT + = KERNEL_HEIGHT + (DILATION_Y - 1) * (KERNEL_HEIGHT - 1); + + constexpr int DILATED_KERNEL_WIDTH + = KERNEL_WIDTH + (DILATION_X - 1) * (KERNEL_WIDTH - 1); + + constexpr int OUTPUTS_HEIGHT_NOPAD + = (CHANNELS_HEIGHT - DILATION_Y * (KERNEL_HEIGHT - 1) - 1 + STRIDE_Y) / STRIDE_Y; + constexpr int OUTPUTS_WIDTH_NOPAD + = (CHANNELS_WIDTH - DILATION_X * (KERNEL_WIDTH - 1) - 1 + STRIDE_X) / STRIDE_X; + + for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) { + const int syMin = (PADDING_Y == 0) ? 0 + : max(PADDING_Y - (oy * STRIDE_Y), 0); + const int syMax = (PADDING_Y == 0 + && OUTPUTS_HEIGHT == OUTPUTS_HEIGHT_NOPAD) ? DILATED_KERNEL_HEIGHT + : clamp(CHANNELS_HEIGHT + PADDING_Y - (oy * STRIDE_Y), + 0, DILATED_KERNEL_HEIGHT); + const int iy = (oy * STRIDE_Y) - PADDING_Y; + +#ifdef _OPENMP +#pragma omp parallel for collapse(2) +#endif + for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) { + for (int output = 0; output < NB_OUTPUTS; ++output) { + // moved to inner loop for collapsing --> + const int sxMin = (PADDING_X == 0) ? 0 + : max(PADDING_X - (ox * STRIDE_X), 0); + const int sxMax = (PADDING_X == 0 + && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD) + ? DILATED_KERNEL_WIDTH + : clamp(CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X), + 0, DILATED_KERNEL_WIDTH); + const int ix = (ox * STRIDE_X) - PADDING_X; + + const int oPos = (ox + OUTPUTS_WIDTH * oy); + const int oOffset = NB_OUTPUTS * oPos; + // <-- + + const int channel = (output * NB_CHANNELS) / NB_OUTPUTS; + + Bias_T weightedSum = biases ? biases[output] : 0; + + for (int sy = 0; sy < KERNEL_HEIGHT; ++sy) { + if ((PADDING_Y != 0 + || OUTPUTS_HEIGHT != OUTPUTS_HEIGHT_NOPAD) + && ((sy*DILATION_Y < syMin) || (sy*DILATION_Y >= syMax))) + { + continue; + } + + const int iPos = ix + CHANNELS_WIDTH * (iy + sy*DILATION_Y); + const int iOffset = NB_CHANNELS * iPos; + + const int wOffset = (output*KERNEL_HEIGHT + sy) + * KERNEL_WIDTH; + + if (DILATION_X == 1 && ((PADDING_X == 0 + && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD) + || sxMax - sxMin == KERNEL_WIDTH)) + { + macsOnRange<KERNEL_WIDTH, NB_CHANNELS>( + inputs + iOffset + channel, + weights + wOffset, + weightedSum); + } + else { + for (int sx = 0; sx < KERNEL_WIDTH; ++sx) { + if ((PADDING_X != 0 + || OUTPUTS_WIDTH != OUTPUTS_WIDTH_NOPAD) + && ((sx*DILATION_X < sxMin) || (sx*DILATION_X >= sxMax))) + { + continue; + } + + const int iOffsetInRange = iOffset + + sx * DILATION_X * NB_CHANNELS; + + weightedSum += inputs[iOffsetInRange + channel] + * weights[wOffset + sx]; + } + } + } + + outputs[oOffset + output] = activation_forward_value<Output_T>(weightedSum, output, ACTIVATION, rescaling); + } + } + } +} + +// Template specialization when biases are not given to the convolution +template<int NB_CHANNELS, + int CHANNELS_HEIGHT, int CHANNELS_WIDTH, + int NB_OUTPUTS, + int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH, + int PADDING_Y, int PADDING_X, + int STRIDE_Y, int STRIDE_X, + int DILATION_Y, int DILATION_X, + int KERNEL_HEIGHT, int KERNEL_WIDTH, + ActivationFunction_T ACTIVATION, + typename Input_T, typename Output_T, + typename Weight_T, + typename Rescaling_T> +__attribute__((always_inline)) inline +void convolution_depthwise_forward( + const Input_T* __restrict inputs, + Output_T* __restrict outputs, + const Weight_T* __restrict weights, + std::nullptr_t __restrict, + const Rescaling_T& __restrict rescaling) +{ + const float* b = nullptr; + + convolution_depthwise_forward<NB_CHANNELS, + CHANNELS_HEIGHT, + CHANNELS_WIDTH, + NB_OUTPUTS, + 
+                                  OUTPUTS_HEIGHT,
+                                  OUTPUTS_WIDTH,
+                                  PADDING_Y,
+                                  PADDING_X,
+                                  STRIDE_Y,
+                                  STRIDE_X,
+                                  DILATION_Y,
+                                  DILATION_X,
+                                  KERNEL_HEIGHT,
+                                  KERNEL_WIDTH,
+                                  ACTIVATION>
+        (inputs, outputs, weights, b, rescaling);
+}
+
+#endif // __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION_DEPTHWISE__
diff --git a/aidge_export_cpp/operators/ConvDw.py b/aidge_export_cpp/operators/ConvDw.py
new file mode 100644
index 0000000000000000000000000000000000000000..41e4fdf07708bdfb891b9827784f2654114f5b6e
--- /dev/null
+++ b/aidge_export_cpp/operators/ConvDw.py
@@ -0,0 +1,82 @@
+import aidge_core
+from aidge_core.export_utils import ExportNodeCpp, get_node_from_metaop
+from aidge_export_cpp import ROOT, ExportLibCpp, set_scaling_attributes
+
+@ExportLibCpp.register("ConvDepthWise2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
+class ConvDw(ExportNodeCpp):
+    def __init__(self, node, mem_info):
+        super().__init__(node, mem_info)
+
+        # Initialize kernel attributes
+        self.attributes["padding"] = [0, 0, 0, 0]
+        self.attributes["activation"] = "Linear"
+        self.attributes["depthwise"] = True
+        self.attributes["aidge_cmp"] = node.attributes().has_attr("aidge_cmp")
+
+        ## Scaling
+        self.attributes["rescaling"] = "NoScaling"
+        self.attributes["shift_value"] = 0
+
+        # Browse the metaop to update kernel attributes
+        ConvDwNode = get_node_from_metaop(node, "ConvDepthWise2D")
+        self.attributes["kernel_dims"] = ConvDwNode[0].get_operator().attr.kernel_dims
+        self.attributes["stride_dims"] = ConvDwNode[0].get_operator().attr.stride_dims
+        self.attributes["dilation_dims"] = ConvDwNode[0].get_operator().attr.dilation_dims
+
+        # Template for layer configuration file generation
+        self.config_template = str(ROOT / "templates" / "configuration" / "convolution_config.jinja")
+
+        # Template for layer call function generation within the forward file
+        self.forward_template = str(ROOT / "templates" / "kernel_forward" / "convolution_forward.jinja")
+
+        # Files to include within the generated forward.cpp file
+        self.include_list = []
+
+        # Path to the kernel(s) files to copy
+        self.add_kernel_to_copy(ROOT / "kernels" / "convolution_depthwise.hpp")
+        self.add_kernel_to_copy(ROOT / "static" / "macs.hpp", "include/network", fwd_include=False)
+
+        # Include aidge outputs within the fwd file
+        if self.attributes["aidge_cmp"]:
+            self.include_list.append("network/utils.hpp")  # aidge_cmp function
+            self.include_list.append("data/aidge_outputs/" + node.name() + ".hpp")
+
+
+@ExportLibCpp.register_metaop("QConvDw", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
+class QConvDw(ConvDw):
+    def __init__(self, node, mem_info):
+        super().__init__(node, mem_info)
+
+        # Look for Quantizer node and set shift and coef export node attributes
+        set_scaling_attributes(self, node)
+
+        ## Set the scaling type
+        if self.attributes["shift_value"] != 0:
+            self.attributes["rescaling"] = "SingleShiftScaling"
+
+
+@ExportLibCpp.register_metaop("PadConvDw", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
+class PadConvDw(QConvDw):
+    def __init__(self, node, mem_info):
+        super().__init__(node, mem_info)
+
+        # Browse the metaop to update kernel attributes
+        PadNode = get_node_from_metaop(node, "Pad2D")
+        self.attributes["padding"] = PadNode[0].get_operator().attr.begin_end_borders
+
+
+@ExportLibCpp.register_metaop("ConvDwAct", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
+class ConvDwAct(QConvDw):
+    def __init__(self, node, mem_info):
+        super().__init__(node, mem_info)
+
+        # Browse the metaop to update kernel attributes
+        if get_node_from_metaop(node, "ReLU"):
+            self.attributes["activation"] = "Rectifier"
+        else:
+            aidge_core.Log.error(f"{node.type()} activation is not yet supported.")
+
+@ExportLibCpp.register_metaop("PadConvDwAct", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
+class PadConvDwAct(PadConvDw, ConvDwAct):
+    def __init__(self, node, mem_info):
+        super().__init__(node, mem_info)
diff --git a/aidge_export_cpp/templates/configuration/convolution_config.jinja b/aidge_export_cpp/templates/configuration/convolution_config.jinja
index f1a57db1b7511d270c3ab7d62a87008735a12df3..b72df4d10f5342f661e921f4b2a7dbaf79d32e85 100644
--- a/aidge_export_cpp/templates/configuration/convolution_config.jinja
+++ b/aidge_export_cpp/templates/configuration/convolution_config.jinja
@@ -17,7 +17,8 @@
 {% include "./_rescaling.jinja" %}
 
 {#- Calculate sizes #}
-{%- set weights_size = out_chan[0] * in_chan[0] * kernel_dims[1] * kernel_dims[0] %}
+{%- set weights_size = out_chan[0] * kernel_dims[1] * kernel_dims[0] if depthwise is defined
+                       else out_chan[0] * in_chan[0] * kernel_dims[1] * kernel_dims[0] %}
 #define {{ name|upper }}_WEIGHTS_SIZE {{ weights_size }}
 #define {{ name|upper }}_BIASES_SIZE {{ out_chan[0] }}
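
Note for reviewers (not part of the diff): a minimal sketch of how the relocated
convolution_depthwise_forward kernel would be instantiated, for a hypothetical
16-channel 32x32 float input with a 3x3 kernel, stride 1, padding 1, dilation 1.
In a real export, the generated forward.cpp emits this call from
convolution_forward.jinja; the `Linear` enum value and the `NoScaling` helper are
assumed to come from network/typedefs.hpp and network/rescaling_utils.hpp, as for
the existing (non-depthwise) convolution kernel.

    #include "kernels/convolution_depthwise.hpp"

    // Buffers in HWC layout (channels innermost), matching the kernel's indexing.
    static float inputs[32 * 32 * 16];
    static float outputs[32 * 32 * 16];
    static float weights[16 * 3 * 3];   // one 3x3 filter per channel
    static float biases[16];

    void depthwise_forward_example() {
        static const NoScaling rescaling = {};  // assumed helper from rescaling_utils.hpp

        convolution_depthwise_forward<16, 32, 32,  // NB_CHANNELS, CHANNELS_HEIGHT, CHANNELS_WIDTH
                                      16, 32, 32,  // NB_OUTPUTS, OUTPUTS_HEIGHT, OUTPUTS_WIDTH
                                      1, 1,        // PADDING_Y, PADDING_X
                                      1, 1,        // STRIDE_Y, STRIDE_X
                                      1, 1,        // DILATION_Y, DILATION_X
                                      3, 3,        // KERNEL_HEIGHT, KERNEL_WIDTH
                                      Linear>      // assumed ActivationFunction_T value
            (inputs, outputs, weights, biases, rescaling);
    }

Passing `nullptr` as the bias argument instead selects the std::nullptr_t overload
added by this patch, which forwards a null `const float*` so that the
`biases ? biases[output] : 0` test in the main kernel skips the bias add.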