Merge branch 'low_bit_support' into 'dev'

Low bit support See merge request !16

Merge branch 'low_bit_support' into 'dev'
Low bit support See merge request !16
5f1820a1 · Maxence Naud · 1a4b72ef · 10bfb0ac · 5f1820a1 · 5f1820a1
Commit 5f1820a1 authored 6 months ago by Maxence Naud
--- a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Convolution/LowbitConv.hpp
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Convolution/LowbitConv.hpp
+/*
+    (C) Copyright 2017 CEA LIST. All Rights Reserved.
+    Contributor(s): N2D2 Team
+    This software is governed by the CeCILL-C license under French law and
+    abiding by the rules of distribution of free software.  You can  use,
+    modify and/ or redistribute the software under the terms of the CeCILL-C
+    license as circulated by CEA, CNRS and INRIA at the following URL
+    "http://www.cecill.info".
+    As a counterpart to the access to the source code and  rights to copy,
+    modify and redistribute granted by the license, users are provided only
+    with a limited warranty  and the software's author,  the holder of the
+    economic rights,  and the successive licensors  have only  limited
+    liability.
+    The fact that you are presently reading this means that you have had
+    knowledge of the CeCILL-C license and that you accept its terms.
+*/
+#ifndef __N2D2_EXPORT_ARM_CONV_CUSTOM_HPP__
+#define __N2D2_EXPORT_ARM_CONV_CUSTOM_HPP__
+#include <cmath>
+#include "kernels/typedefs.hpp"
+#include "assert.h"
+#include "utils.hpp"
+#include "kernels/Macs.hpp"
+#include "kernels/subkernels_functions.hpp"
+namespace N2D2_Export {
+/**
+ * @brief   Convolution kernel working on packed (8 bits or fewer) data.
+ * @details For every output position (oy, ox) and every output channel, the
+ *          sum is initialised with the channel bias, the kernel window is
+ *          accumulated with macsOnRange(), the result goes through sat() and
+ *          is stored with compact_data_during_loop(). compact_data_end_loop()
+ *          is called once per output position so that a partially filled
+ *          packing accumulator is flushed before the next position.
+ *
+ *          Offsets are derived from INPUTS_BYTE / WEIGHTS_BYTE, the storage
+ *          size computed for the NB_CHANNELS values of one pixel.
+ *          NOTE(review): these offsets are added to Input_T* / Weight_T*
+ *          pointers, which presumably assumes one-byte storage types - confirm.
+ *
+ * @param[in]  inputs     Packed input feature map
+ * @param[out] outputs    Output vector, written sequentially
+ * @param[in]  biasses    One bias per output channel
+ * @param[in]  weights    Packed weights, indexed by output channel, kernel row, kernel column
+ * @param[in]  rescaling  Rescaling object forwarded to sat()
+ * @returns               None
+ */
+template<int NB_CHANNELS, 
+         int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
+         int NB_OUTPUTS, 
+         int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
+         int PADDING_Y, int PADDING_X,
+         int STRIDE_Y, int STRIDE_X,
+         int KERNEL_HEIGHT, int KERNEL_WIDTH,
+         ActivationFunction_T ACTIVATION,
+        //  // Memory mapping: inputs
+        //  int INPUT_MEM_CONT_OFFSET,
+        //  int INPUT_MEM_CONT_SIZE,
+        //  int INPUT_MEM_WRAP_OFFSET,
+        //  int INPUT_MEM_WRAP_SIZE,
+        //  int INPUT_MEM_STRIDE,
+        //  // Memory mapping: outputs
+        //  int OUTPUT_MEM_CONT_OFFSET,
+        //  int OUTPUT_MEM_CONT_SIZE,
+        //  int OUTPUT_MEM_WRAP_OFFSET,
+        //  int OUTPUT_MEM_WRAP_SIZE,
+        //  int OUTPUT_MEM_STRIDE,
+         typename Sum_T, typename Input_T, typename Output_T, 
+         typename Weight_T, typename Bias_T, typename Rescaling_T>
+__attribute__((always_inline)) inline static
+void lowbitconvcellPropagate(const Input_T* __restrict inputs,
+                                      Output_T* __restrict outputs,
+                                      const Bias_T* __restrict biasses,
+                                      const Weight_T* __restrict weights,
+                                      const Rescaling_T& __restrict rescaling) 
+{
+    PackSupport infoPack = {0, 0};
+
+    constexpr int digits_in = std::numeric_limits<Input_T>::digits;
+    constexpr int digits_wt = std::numeric_limits<Weight_T>::digits;
+    static_assert(digits_in > 0 && digits_wt > 0,
+                  "Input_T and Weight_T need a std::numeric_limits specialization with digits > 0");
+
+    // Round the bit width to the width used for packing: 1, 2, 4 or 8 bits
+    // (any width of 8 or more maps to 8). Integer arithmetic only, so the
+    // values are genuine constant expressions on every compiler.
+    constexpr int bits_norm_in = 8 / (8 / ((digits_in < 8) ? digits_in : 8));
+    constexpr int bits_norm_wt = 8 / (8 / ((digits_wt < 8) ? digits_wt : 8));
+
+    // Storage size of the NB_CHANNELS values of one pixel.
+    // NOTE(review): the "% 8" term is kept from the original formula; when the
+    // channel bits are not a multiple of 8 it yields more than ceil(bits / 8)
+    // (e.g. 7 bits -> 2) - confirm it matches the packing done by the exporter.
+    constexpr int in_bits = NB_CHANNELS * bits_norm_in;
+    constexpr int wt_bits = NB_CHANNELS * bits_norm_wt;
+    constexpr int INPUTS_BYTE = (in_bits + in_bits % 8 + 7) / 8;
+    constexpr int WEIGHTS_BYTE = (wt_bits + wt_bits % 8 + 7) / 8;
+
+    int outputOffset = 0;
+    int iy = 0;
+    for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
+        // Kernel rows [syMin, syMax) fall inside the input for this output row
+        const int syMin = (PADDING_Y == 0) ? 0 : max(PADDING_Y - iy, 0);
+        const int syMax = (PADDING_Y == 0) ? KERNEL_HEIGHT 
+                                           : clamp(CHANNELS_HEIGHT + PADDING_Y - iy, 
+                                                   0, KERNEL_HEIGHT);
+        int ix = 0;
+        for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
+            // Kernel columns [sxMin, sxMax) fall inside the input for this output column
+            const int sxMin = (PADDING_X == 0) ? 0 : max(PADDING_X - ix, 0);
+            const int sxMax = (PADDING_X == 0) ? KERNEL_WIDTH 
+                                               : clamp(CHANNELS_WIDTH + PADDING_X - ix,  
+                                                       0, KERNEL_WIDTH);
+            for (int och = 0; och < NB_OUTPUTS; ++och) {
+                Sum_T weightedSum = biasses[och];
+                for (int sy = 0; sy < KERNEL_HEIGHT; ++sy) {
+                    if (PADDING_Y != 0 && (sy < syMin || sy >= syMax)) {
+                        continue;
+                    }
+                    const int inputsOffset = (iy + sy - PADDING_Y) * CHANNELS_WIDTH * INPUTS_BYTE
+                                             + (ix - PADDING_X) * INPUTS_BYTE;
+                    const int weightsOffset = och * KERNEL_HEIGHT * KERNEL_WIDTH * WEIGHTS_BYTE
+                                              + sy * KERNEL_WIDTH * WEIGHTS_BYTE;
+                    // Without horizontal padding, and when one pixel fills whole
+                    // bytes for both inputs and weights, a full kernel row is
+                    // processed in a single macsOnRange() call.
+                    if (PADDING_X == 0
+                        && (NB_CHANNELS * bits_norm_wt % 8 == 0)
+                        && (NB_CHANNELS * bits_norm_in % 8 == 0)) {
+                        macsOnRange<KERNEL_WIDTH * NB_CHANNELS>(inputs + inputsOffset,
+                                                                weights + weightsOffset,
+                                                                weightedSum);
+                    } 
+                    else {
+                        // Otherwise accumulate pixel by pixel, skipping padded columns
+                        for (int sx = 0; sx < KERNEL_WIDTH; ++sx) {
+                            if(sx < sxMin || sx >= sxMax) {
+                                continue;
+                            }
+                            macsOnRange<NB_CHANNELS>(inputs + inputsOffset + sx * INPUTS_BYTE,
+                                                     weights + weightsOffset + sx * WEIGHTS_BYTE,
+                                                     weightedSum);
+                        }
+                    }
+                }
+                Output_T output = sat<Output_T>(weightedSum,och, ACTIVATION, rescaling);
+                compact_data_during_loop(output, outputs, outputOffset, infoPack);
+            }
+            // Flush what is left in the packing accumulator for this position
+            compact_data_end_loop(outputs, outputOffset, infoPack);
+            ix += STRIDE_X;
+        }
+        iy += STRIDE_Y;
+    }
+}
+}   // N2D2_Export
+#endif  // __N2D2_EXPORT_ARM_CONV_CUSTOM_HPP__
--- a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/FullyConnected/LowbitFc.hpp
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/FullyConnected/LowbitFc.hpp
+/*
+    (C) Copyright 2017 CEA LIST. All Rights Reserved.
+    Contributor(s): N2D2 Team
+    This software is governed by the CeCILL-C license under French law and
+    abiding by the rules of distribution of free software.  You can  use,
+    modify and/ or redistribute the software under the terms of the CeCILL-C
+    license as circulated by CEA, CNRS and INRIA at the following URL
+    "http://www.cecill.info".
+    As a counterpart to the access to the source code and  rights to copy,
+    modify and redistribute granted by the license, users are provided only
+    with a limited warranty  and the software's author,  the holder of the
+    economic rights,  and the successive licensors  have only  limited
+    liability.
+    The fact that you are presently reading this means that you have had
+    knowledge of the CeCILL-C license and that you accept its terms.
+*/
+#ifndef __N2D2_EXPORT_CPP_CUSTOMFC_HPP__
+#define __N2D2_EXPORT_CPP_CUSTOMFC_HPP__
+#include <cmath>
+#include "kernels/typedefs.hpp"
+#include "assert.h"
+#include "utils.hpp"
+#include "kernels/Macs.hpp"
+#include "kernels/subkernels_functions.hpp"
+namespace N2D2_Export {
+/**
+ * @brief   Fully connected kernel working on packed (low-bit) data.
+ * @details For every output channel, the sum is initialised with the channel
+ *          bias, then macsOnRange() accumulates NB_CHANNELS products for each
+ *          input position (iy, ix). The result goes through sat() and is
+ *          stored with compact_data_during_loop(); compact_data_end_loop()
+ *          flushes the packing accumulator once all outputs are produced.
+ *
+ *          NOTE(review): INPUTS_BYTE / WEIGHTS_BYTE are derived from the raw
+ *          std::numeric_limits<>::digits values here - confirm this is the
+ *          intended width for every type this kernel is instantiated with.
+ *
+ * @param[in]  inputs     Packed input vector
+ * @param[out] outputs    Output vector, written sequentially
+ * @param[in]  biasses    One bias per output channel
+ * @param[in]  weights    Packed weights, indexed by output channel then input position
+ * @param[in]  rescaling  Rescaling object forwarded to sat()
+ * @returns               None
+ */
+template<int NB_CHANNELS, int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
+         int NB_OUTPUTS, int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
+         ActivationFunction_T ACTIVATION,
+         typename Sum_T, typename Input_T, typename Output_T, 
+         typename Weight_T, typename Bias_T, typename Rescaling_T>
+__attribute__((always_inline)) inline static
+void lowbitfccellPropagate(const Input_T* __restrict inputs,
+                                    Output_T* __restrict outputs,
+                                    const Bias_T* __restrict biasses,
+                                    const Weight_T* __restrict weights,
+                                    const Rescaling_T& __restrict rescaling)
+{
+    static_assert(OUTPUTS_HEIGHT == 1, "Outputs height should be 1");
+    static_assert(OUTPUTS_WIDTH == 1, "Outputs width should be 1");
+    PackSupport infoPack = {0, 0};
+
+    // Storage size of the NB_CHANNELS values of one input position, computed
+    // with integer arithmetic so that it is a genuine constant expression.
+    // NOTE(review): the "% 8" term is kept from the original formula - confirm
+    // it matches the packing done by the exporter.
+    constexpr int in_bits = NB_CHANNELS * std::numeric_limits<Input_T>::digits;
+    constexpr int wt_bits = NB_CHANNELS * std::numeric_limits<Weight_T>::digits;
+    constexpr int INPUTS_BYTE = (in_bits + in_bits % 8 + 7) / 8;
+    constexpr int WEIGHTS_BYTE = (wt_bits + wt_bits % 8 + 7) / 8;
+
+    int outputOffset = 0;
+    for (int och = 0; och < NB_OUTPUTS; ++och) {
+        Sum_T weightedSum = biasses[och];
+        for (int iy = 0; iy < CHANNELS_HEIGHT; ++iy) {
+            for (int ix = 0; ix < CHANNELS_WIDTH; ++ix) {
+                const int weightsOffset = CHANNELS_HEIGHT * CHANNELS_WIDTH * WEIGHTS_BYTE * och 
+                                            + (CHANNELS_WIDTH * iy + ix) * WEIGHTS_BYTE;
+                const int inputsOffset = (CHANNELS_WIDTH * iy + ix) * INPUTS_BYTE;
+                macsOnRange<NB_CHANNELS>(inputs + inputsOffset,
+                                         weights + weightsOffset, 
+                                         weightedSum);
+            }
+        }
+        Output_T output = sat<Output_T>(weightedSum,och, ACTIVATION, rescaling);
+        compact_data_during_loop(output, outputs, outputOffset, infoPack);
+    }
+    // Flush what is left in the packing accumulator
+    compact_data_end_loop(outputs, outputOffset, infoPack);
+}
+}   // N2D2_Export
+#endif  // __N2D2_EXPORT_CPP_CUSTOMFC_HPP__
--- a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Pooling/LowbitPooling.hpp
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Pooling/LowbitPooling.hpp
+/*
+    (C) Copyright 2017 CEA LIST. All Rights Reserved.
+    Contributor(s): N2D2 Team
+    This software is governed by the CeCILL-C license under French law and
+    abiding by the rules of distribution of free software.  You can  use,
+    modify and/ or redistribute the software under the terms of the CeCILL-C
+    license as circulated by CEA, CNRS and INRIA at the following URL
+    "http://www.cecill.info".
+    As a counterpart to the access to the source code and  rights to copy,
+    modify and redistribute granted by the license, users are provided only
+    with a limited warranty  and the software's author,  the holder of the
+    economic rights,  and the successive licensors  have only  limited
+    liability.
+    The fact that you are presently reading this means that you have had
+    knowledge of the CeCILL-C license and that you accept its terms.
+*/
+#ifndef __N2D2_EXPORT_CPP_CUSTOMPOOLING_HPP__
+#define __N2D2_EXPORT_CPP_CUSTOMPOOLING_HPP__
+#include <cmath>
+#include "kernels/typedefs.hpp"
+#include "assert.h"
+#include "utils.hpp"
+#include "kernels/Macs.hpp"
+#include "kernels/subkernels_functions.hpp"
+namespace N2D2_Export {
+/**
+ * @brief   Max pooling kernel working on packed (low-bit) data.
+ * @details For every output position, the channels are processed by groups of
+ *          nb_data storage units (as given by get_pool_nbData()). For each
+ *          group, parallelMaxPooling() folds every valid input of the pooling
+ *          window into maxVal, then storeMaxPooling() writes the group.
+ *
+ * @param[in]  inputs   Packed input feature map
+ * @param[out] outputs  Packed output feature map, written sequentially
+ * @returns             None
+ */
+template<int NB_CHANNELS, int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
+        int NB_OUTPUTS, int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
+        int PADDING_Y, int PADDING_X,
+        int STRIDE_Y, int STRIDE_X,
+        int KERNEL_HEIGHT, int KERNEL_WIDTH,
+        Pooling_T POOLING, ActivationFunction_T ACTIVATION,
+        typename Input_T, typename Output_T>
+__attribute__((always_inline)) inline static
+void lowbitpoolcellPropagate(const Input_T* __restrict inputs,
+                                    Output_T* __restrict outputs)
+{
+    static_assert(std::is_same<Input_T, Output_T>::value, "Input_T and Output_T must be the same.");
+    static_assert(NB_CHANNELS == NB_OUTPUTS, "nb_channels should be equal to nb_outputs.");
+    static_assert(POOLING == Max , "Only supports Max pooling.");
+    static_assert(ACTIVATION == Linear, "Only supports a Linear activation.");
+    PackSupport infoPack = {0, 0};
+
+    // Storage size of the channels of one pixel, computed with integer
+    // arithmetic so that it is a genuine constant expression.
+    // NOTE(review): the "% 8" term is kept from the original formula - confirm
+    // it matches the packing done by the exporter.
+    constexpr int in_bits = NB_CHANNELS * std::numeric_limits<Input_T>::digits;
+    constexpr int out_bits = NB_OUTPUTS * std::numeric_limits<Output_T>::digits;
+    constexpr int INPUTS_BYTE = (in_bits + in_bits % 8 + 7) / 8;
+    constexpr int OUTPUTS_BYTE = (out_bits + out_bits % 8 + 7) / 8;
+
+    int outputOffset = 0;
+    int iy = 0;
+    for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
+        // Kernel rows [syMin, syMax) fall inside the input for this output row
+        const int syMin = (PADDING_Y == 0) ? 0 : max(PADDING_Y - iy, 0);
+        const int syMax = (PADDING_Y == 0) ? KERNEL_HEIGHT 
+                                        : clamp(CHANNELS_HEIGHT + PADDING_Y - iy, 
+                                                0, KERNEL_HEIGHT);
+        int ix = 0;
+        for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
+            // Kernel columns [sxMin, sxMax) fall inside the input for this output column
+            const int sxMin = (PADDING_X == 0) ? 0 : max(PADDING_X - ix, 0);
+            const int sxMax = (PADDING_X == 0) ? KERNEL_WIDTH 
+                                            : clamp(CHANNELS_WIDTH + PADDING_X - ix,  
+                                                    0, KERNEL_WIDTH);
+            int och_c = 0;
+            while (och_c < OUTPUTS_BYTE) {
+                // NOTE(review): for every width other than signed 32 bits the
+                // running maximum is a uint32_t starting at 0, signed inputs
+                // included - confirm this is the intended initial value.
+                typename std::conditional<(!std::is_unsigned<Input_T>::value && 
+                        std::numeric_limits<Input_T>::digits == 32), int32_t, uint32_t>::type maxVal;
+                maxVal = std::numeric_limits<decltype(maxVal)>::lowest();
+                int nb_data = min(OUTPUTS_BYTE-och_c, get_pool_nbData(std::numeric_limits<Input_T>::digits));
+                for (int sy = 0; sy < KERNEL_HEIGHT; ++sy) {
+                    if (PADDING_Y != 0 && (sy < syMin || sy >= syMax)) {
+                        continue;
+                    }
+                    const int inputsOffset = (iy + sy - PADDING_Y) * CHANNELS_WIDTH * INPUTS_BYTE
+                                            + (ix - PADDING_X) * INPUTS_BYTE + och_c;
+                    for (int sx = 0; sx < KERNEL_WIDTH; ++sx) {
+                        if(sx < sxMin || sx >= sxMax) {
+                            continue;
+                        }
+                        parallelMaxPooling(inputs + inputsOffset + sx*INPUTS_BYTE, maxVal, nb_data);
+                    }
+                }
+                storeMaxPooling(outputs, outputOffset, maxVal, nb_data);
+                och_c += nb_data;
+            }
+            ix += STRIDE_X;
+        }
+        iy += STRIDE_Y;
+    }
+}
+}
+#endif
\ No newline at end of file
--- a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/Macs.hpp
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/Macs.hpp
--- a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/nn_scaling_functions.hpp
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/nn_scaling_functions.hpp
@@ -27,15 +27,15 @@
 namespace N2D2_Export {
-static int64_t toInt64(uint32_t lo, uint32_t hi) {
+// static int64_t toInt64(uint32_t lo, uint32_t hi) {
-    return (int64_t) (((uint64_t) hi) << 32ull) | ((uint64_t) lo);
+//     return (int64_t) (((uint64_t) hi) << 32ull) | ((uint64_t) lo);
-}
+// }
-static int64_t smlal(int32_t lhs, int32_t rhs, 
+// static int64_t smlal(int32_t lhs, int32_t rhs, 
-                     uint32_t accumLo, uint32_t accumHi) 
+//                      uint32_t accumLo, uint32_t accumHi) 
-{
+// {
-    return ((int64_t) lhs) * ((int64_t) rhs) + toInt64(accumLo, accumHi);
+//     return ((int64_t) lhs) * ((int64_t) rhs) + toInt64(accumLo, accumHi);
-}
+// }
 // ---------------------------------------------------
 // ------------------- No Scaling --------------------

--- a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/subkernels_functions.hpp
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/subkernels_functions.hpp
+/**
+ ******************************************************************************
+ * @file     subkernels_functions.hpp
+ * @brief    Header file for the network subkernels
+ * 
+ ******************************************************************************
+ * @attention
+ * 
+ * (C) Copyright 2021 CEA LIST. All Rights Reserved.
+ *  Contributor(s): Vincent TEMPLIER (vincent.templier@cea.fr)
+ * 
+ * This file is not part of the open source version of N2D2 and is NOT under
+ * the CeCILL-C license. This code is the property of the CEA. It can not be
+ * copied or disseminated without its authorization.
+ * 
+ ******************************************************************************
+ */
+#ifndef __SUBKERNELS_FUNCTIONS_H__
+#define __SUBKERNELS_FUNCTIONS_H__
+#include <cstring>
+#include <cmsis_compiler.h>
+#include "typedefs.hpp"
+#include "assert.h"
+// ----------------------------------------------------------------------------
+// -------------------------- Compression functions ---------------------------
+// ----------------------------------------------------------------------------
+/**
+ * @brief   Compact data during a loop with an accumulator (values below 8 bits)
+ * @details The masked value is OR-ed into the accumulator of infoPack and the
+ *          slot counter is incremented. When all the slots of the accumulator
+ *          are taken, the accumulator is written to outputs[outputOffset],
+ *          outputOffset is advanced and the accumulator is reset. Otherwise
+ *          the accumulator is shifted left by the number of bits of one value
+ *          to make room for the next one.
+ *          compact_data_end_loop() has to be called after the loop to store a
+ *          partially filled accumulator.
+ * 
+ * @param[in]     value        Value to be stored in the accumulator
+ * @param[in,out] outputs      Pointer to compressed output vector
+ * @param[in,out] outputOffset Current output index, advanced when the accumulator is stored
+ * @param[in,out] infoPack     Object containing the accumulator and its slot counter
+ * @returns                    None
+ * 
+ */
+template<typename Output_T, typename std::enable_if_t<std::numeric_limits<Output_T>::digits < 8, int> = 0>
+__attribute__((always_inline)) static inline
+void compact_data_during_loop (Output_T value,
+                               Output_T* __restrict outputs,
+                               int& outputOffset,
+                               PackSupport& infoPack)
+{
+    // This overload is only selected when digits < 8, so no runtime check is needed.
+    constexpr int nbBits = std::numeric_limits<Output_T>::digits;
+    constexpr uint8_t mask = (1U << nbBits) - 1;
+    // Number of values per accumulator: ceil(8 / nbBits), in integer arithmetic
+    constexpr uint8_t nbSlot = (8 + nbBits - 1) / nbBits;
+
+    infoPack.accumulator |= value.value & mask;
+    infoPack.cptAccumulator += 1;
+    if (infoPack.cptAccumulator == nbSlot) {
+        // Accumulator full: store it and start a new one
+        outputs[outputOffset] = (Output_T) infoPack.accumulator;
+        ++outputOffset;
+        infoPack.cptAccumulator = 0;
+        infoPack.accumulator = 0;
+    }
+    else {
+        // Make room for the next value
+        infoPack.accumulator <<= nbBits;
+    }
+}
+/**
+ * @brief   Store a value of 8 bits or more in the output vector
+ * @details No packing is needed for these widths: the value is written to
+ *          outputs[outputOffset] and outputOffset is advanced, so that
+ *          successive calls fill successive outputs. infoPack is unused.
+ * 
+ * @param[in]     value        Value to be stored
+ * @param[in,out] outputs      Pointer to the output vector
+ * @param[in,out] outputOffset Current output index, advanced by one
+ * @param[in,out] infoPack     Unused for these widths
+ * @returns                    None
+ * 
+ */
+template<typename Output_T, typename std::enable_if_t<std::numeric_limits<Output_T>::digits >= 8, int> = 0>
+__attribute__((always_inline)) static inline
+void compact_data_during_loop (const Output_T value,
+                               Output_T* __restrict outputs,
+                               int& outputOffset,
+                               PackSupport& infoPack)
+{
+    (void) infoPack;
+    outputs[outputOffset] = value;
+    // Without this increment every call would overwrite the same output
+    ++outputOffset;
+}
+/**
+ * @brief   Compact data after a loop with an accumulator (values below 8 bits)
+ * @details It may happen that the accumulator is not completely filled
+ *          after calling "compact_data_during_loop", in which case the
+ *          values it holds have not been saved in the outputs yet. This
+ *          function shifts the accumulator left (adding zeros) until every
+ *          slot is accounted for, stores it in outputs[outputOffset],
+ *          advances outputOffset and resets the accumulator.
+ *          Nothing is done when the accumulator is empty.
+ *          This function should always be called at the end of a loop
+ *          where "compact_data_during_loop" is called
+ * 
+ * @param[in,out] outputs      Pointer to compressed output vector
+ * @param[in,out] outputOffset Current output index
+ * @param[in,out] infoPack     Object containing the accumulator
+ * @returns                    None
+ * 
+ */
+template<typename Output_T, typename std::enable_if_t<std::numeric_limits<Output_T>::digits < 8, int> = 0>
+__attribute__((always_inline)) static inline
+void compact_data_end_loop (Output_T* __restrict outputs,
+                            int& outputOffset,
+                            PackSupport& infoPack)
+{
+    // Nothing pending: every accumulated value has already been stored
+    if (infoPack.cptAccumulator == 0) {
+        return;
+    }
+
+    // This overload is only selected when digits < 8, so no runtime check is needed.
+    constexpr int nbBits = std::numeric_limits<Output_T>::digits;
+    // Number of values per accumulator: ceil(8 / nbBits), in integer arithmetic
+    constexpr unsigned int nbSlot = (8 + nbBits - 1) / nbBits;
+
+    // compact_data_during_loop() already shifted the accumulator once after
+    // the last value, hence the first increment without a shift.
+    infoPack.cptAccumulator += 1;
+    while (infoPack.cptAccumulator < nbSlot) {
+        infoPack.accumulator <<= nbBits;
+        infoPack.cptAccumulator += 1;
+    }
+    outputs[outputOffset] = infoPack.accumulator;
+    ++outputOffset;
+    infoPack.cptAccumulator = 0;
+    infoPack.accumulator = 0;
+}
+// Overload for values of 8 bits or more: nothing is left to flush after the
+// loop, so this function intentionally does nothing.
+template<typename Output_T, typename std::enable_if_t<std::numeric_limits<Output_T>::digits >= 8, int> = 0>
+__attribute__((always_inline)) static inline
+void compact_data_end_loop (Output_T* __restrict outputs,
+                            int& outputOffset,
+                            PackSupport& infoPack)
+{
+    (void) outputs;
+    (void) outputOffset;
+    (void) infoPack;
+}
+// ----------------------------------------------------------------------------
+// ------------------------- Pooling subfunctions -----------------------------
+// ------------------------------ Max Pooling ---------------------------------
+// ----------------------------------------------------------------------------
+/**
+ * @brief   Number of storage units handled by one max pooling step
+ * @param[in]  nbBits  Number of bits of the pooled data type
+ * @returns            4 for 8 bits, 2 for 4 or 16 bits, 1 for any other width
+ */
+__attribute__((always_inline)) static inline
+int get_pool_nbData (const int nbBits)
+{
+    if (nbBits == 8) {
+        return 4;
+    }
+    if (nbBits == 4 || nbBits == 16) {
+        return 2;
+    }
+    return 1;
+}
+/**
+ * @brief   Store the result of a 4-bit max pooling step
+ * @details maxVal holds one 4-bit value in the low nibble of each byte. The
+ *          nibbles are gathered two by two into bits [7:0] and [23:16], then
+ *          nb_data of these bytes are written to outputs, advancing
+ *          outputOffset by one for each of them.
+ *
+ * @param[in,out] outputs      Pointer to compressed output vector
+ * @param[in,out] outputOffset Current output index
+ * @param[in]     maxVal       Pooling result, one value per byte
+ * @param[in]     nb_data      Number of bytes to store (1 or 2)
+ * @returns                    None
+ */
+template<typename Output_T,
+    typename std::enable_if<std::numeric_limits<Output_T>::digits == 4>::type* = nullptr>
+__attribute__((always_inline)) static inline
+void storeMaxPooling (Output_T* __restrict outputs,
+                      int& outputOffset,
+                      const uint32_t maxVal,
+                      const int nb_data)
+{
+    assert(nb_data == 2 || nb_data == 1);
+
+    // Gather the nibbles in pairs, e.g. 0x0A050403 -> 0x00A50043
+    const uint32_t upperNibbles = (maxVal & 0x0F000F00) >> 4;
+    const uint32_t lowerNibbles = maxVal & 0x000F000F;
+    const uint32_t gathered = upperNibbles | lowerNibbles;
+
+    // One packed byte sits at the bottom of each 16-bit half
+    int shift = 0;
+    for (int stored = 0; stored < nb_data; ++stored) {
+        outputs[outputOffset] = (uint8_t) ((gathered >> shift) & 0xFF);
+        outputOffset += 1;
+        shift += 16;
+    }
+}
+/**
+ * @brief   Store the result of an 8-bit max pooling step
+ * @details Copies the nb_data lowest-addressed bytes of maxVal to
+ *          outputs + outputOffset and advances outputOffset by nb_data, like
+ *          the 4-bit overload does, so that successive calls fill successive
+ *          outputs.
+ *
+ * @param[in,out] outputs      Pointer to the output vector
+ * @param[in,out] outputOffset Current output index, advanced by nb_data
+ * @param[in]     maxVal       Pooling result, one value per byte
+ * @param[in]     nb_data      Number of bytes to store
+ * @returns                    None
+ */
+template<typename Output_T,
+    typename std::enable_if<std::numeric_limits<Output_T>::digits == 8>::type* = nullptr>
+__attribute__((always_inline)) static inline
+void storeMaxPooling (Output_T* __restrict outputs,
+                      int& outputOffset,
+                      const uint32_t maxVal,
+                      const int nb_data)
+{
+    // Write at the current offset (not at the start of outputs) and advance it
+    memcpy(outputs + outputOffset, &maxVal, nb_data*sizeof(uint8_t));
+    outputOffset += nb_data;
+}
+/**
+ * @brief   Max pooling step on unsigned 16-bit data
+ * @details Loads nb_data halfwords from inputs and updates maxVal with
+ *          __UQSUB16 followed by __UQADD16 against the loaded word.
+ *
+ * @param[in]     inputs   Pointer to the input data
+ * @param[in,out] maxVal   Running maximum, updated in place
+ * @param[in]     nb_data  Number of halfwords to load (1 or 2)
+ * @returns                None
+ */
+template<typename Input_T,
+         typename std::enable_if<(std::is_unsigned<Input_T>::value
+         && std::numeric_limits<Input_T>::digits == 16)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+void parallelMaxPooling (const Input_T* __restrict inputs,
+                         uint32_t& maxVal,
+                         const int nb_data)
+{
+    assert(nb_data == 1 || nb_data == 2);
+
+    // Lanes that are not loaded stay at zero
+    uint32_t lanes = 0;
+    memcpy(&lanes, inputs, nb_data*sizeof(uint16_t));
+
+    // Saturating (max - in) then + in
+    const uint32_t excess = __UQSUB16(maxVal, lanes);
+    maxVal = __UQADD16(excess, lanes);
+}
+/**
+ * @brief   Max pooling step on signed 16-bit data
+ * @details Loads nb_data halfwords from inputs and updates maxVal with
+ *          __SSUB16, __SEL against 0 and __SADD16. The three intrinsics are
+ *          kept as separate statements in this order, as in the other signed
+ *          overloads.
+ *          maxVal is taken by reference, like in the unsigned overloads, so
+ *          that the caller actually receives the updated maximum.
+ *
+ * @param[in]     inputs   Pointer to the input data
+ * @param[in,out] maxVal   Running maximum, updated in place
+ * @param[in]     nb_data  Number of halfwords to load (1 or 2)
+ * @returns                None
+ */
+template<typename Input_T,
+         typename std::enable_if<(!std::is_unsigned<Input_T>::value
+         && std::numeric_limits<Input_T>::digits == 16)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+void parallelMaxPooling (const Input_T* __restrict inputs,
+                         uint32_t& maxVal,
+                         const int nb_data)
+{
+    assert(nb_data == 2 || nb_data == 1);
+    uint32_t in = 0;
+    memcpy((void*) &in, inputs, nb_data*sizeof(uint16_t));
+    maxVal = __SSUB16(maxVal, in);
+    maxVal = __SEL(maxVal, 0);
+    maxVal = __SADD16(maxVal, in);
+}
+/**
+ * @brief   Max pooling step on unsigned 8-bit data
+ * @details Loads nb_data bytes from inputs and updates maxVal with
+ *          __UQSUB8 followed by __UQADD8 against the loaded word.
+ *
+ * @param[in]     inputs   Pointer to the input data
+ * @param[in,out] maxVal   Running maximum, updated in place
+ * @param[in]     nb_data  Number of bytes to load (1 to 4)
+ * @returns                None
+ */
+template<typename Input_T,
+         typename std::enable_if<(std::is_unsigned<Input_T>::value
+         && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+void parallelMaxPooling (const Input_T* __restrict inputs,
+                         uint32_t& maxVal,
+                         const int nb_data)
+{
+    assert(nb_data >= 1 && nb_data <= 4);
+
+    // Lanes that are not loaded stay at zero
+    uint32_t lanes = 0;
+    memcpy(&lanes, inputs, nb_data*sizeof(uint8_t));
+
+    // Saturating (max - in) then + in
+    const uint32_t excess = __UQSUB8(maxVal, lanes);
+    maxVal = __UQADD8(excess, lanes);
+}
+/**
+ * @brief   Max pooling step on signed 8-bit data
+ * @details Loads nb_data bytes from inputs and updates maxVal with
+ *          __SSUB8, __SEL against 0 and __SADD8. The three intrinsics are
+ *          kept as separate statements in this order, as in the other signed
+ *          overloads.
+ *          maxVal is taken by reference, like in the unsigned overloads, so
+ *          that the caller actually receives the updated maximum.
+ *
+ * @param[in]     inputs   Pointer to the input data
+ * @param[in,out] maxVal   Running maximum, updated in place
+ * @param[in]     nb_data  Number of bytes to load (1 to 4)
+ * @returns                None
+ */
+template<typename Input_T,
+         typename std::enable_if<(!std::is_unsigned<Input_T>::value
+         && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+void parallelMaxPooling (const Input_T* __restrict inputs,
+                         uint32_t& maxVal,
+                         const int nb_data)
+{
+    assert(nb_data <= 4 && nb_data >= 1);
+    uint32_t in = 0;
+    memcpy((void*) &in, inputs, nb_data*sizeof(uint8_t));
+    maxVal = __SSUB8(maxVal, in);
+    maxVal = __SEL(maxVal, 0);
+    maxVal = __SADD8(maxVal, in);
+}
+/**
+ * @brief   Max pooling step on unsigned 4-bit data
+ * @details Loads nb_data bytes from inputs, spreads their nibbles so that
+ *          each one ends up in the low half of its own byte, then updates
+ *          maxVal with __UQSUB8 followed by __UQADD8.
+ *
+ * @param[in]     inputs   Pointer to the packed input data
+ * @param[in,out] maxVal   Running maximum, one value per byte, updated in place
+ * @param[in]     nb_data  Number of bytes to load (1 or 2)
+ * @returns                None
+ */
+template<typename Input_T,
+         typename std::enable_if<(std::is_unsigned<Input_T>::value
+         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+void parallelMaxPooling (const Input_T* __restrict inputs,
+                         uint32_t& maxVal,
+                         const int nb_data)
+{
+    assert(nb_data == 1 || nb_data == 2);
+
+    uint32_t packed = 0;
+    memcpy(&packed, inputs, nb_data*sizeof(uint8_t));
+
+    // Spread the nibbles, e.g. 0x0000ABCD -> 0x00AB00CD -> 0x0A0B0C0D
+    const uint32_t bytesApart = (packed | packed << 8) & 0xFF00FF;
+    const uint32_t lanes = (bytesApart | bytesApart << 4) & 0xF0F0F0F;
+
+    // Saturating (max - in) then + in
+    const uint32_t excess = __UQSUB8(maxVal, lanes);
+    maxVal = __UQADD8(excess, lanes);
+}
+/**
+ * @brief   Max pooling step on signed 4-bit data
+ * @details Loads nb_data bytes from inputs, spreads their nibbles so that
+ *          each one ends up in the low half of its own byte, sign-extends
+ *          every nibble to 8 bits, then updates maxVal with __SSUB8, __SEL
+ *          against 0 and __SADD8. The three intrinsics are kept as separate
+ *          statements in this order, as in the other signed overloads.
+ *          maxVal is taken by reference, like in the unsigned overloads, so
+ *          that the caller actually receives the updated maximum.
+ *
+ * @param[in]     inputs   Pointer to the packed input data
+ * @param[in,out] maxVal   Running maximum, one value per byte, updated in place
+ * @param[in]     nb_data  Number of bytes to load (1 or 2)
+ * @returns                None
+ */
+template<typename Input_T,
+         typename std::enable_if<(!std::is_unsigned<Input_T>::value
+         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
+__attribute__((always_inline)) static inline
+void parallelMaxPooling (const Input_T* __restrict inputs,
+                         uint32_t& maxVal,
+                         const int nb_data)
+{
+    assert(nb_data == 2 || nb_data == 1);
+    uint32_t in = 0;
+    memcpy((void*) &in, inputs, nb_data*sizeof(uint8_t));
+    // Spread the nibbles, e.g. 0x0000ABCD -> 0x00AB00CD -> 0x0A0B0C0D
+    in = (in | in << 8) & 0xFF00FF;
+    in = (in | in << 4) & 0xF0F0F0F;
+    // Sign-extend each nibble to 8 bits, e.g. 0x0F -> 0xFF and 0x07 -> 0x07
+    in += 0x78787878;
+    in ^= 0x78787878;
+    maxVal = __SSUB8(maxVal, in);
+    maxVal = __SEL(maxVal, 0);
+    maxVal = __SADD8(maxVal, in);
+}
+#endif
\ No newline at end of file
--- a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/swar_arm_acle.h
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/swar_arm_acle.h
+/**
+ ******************************************************************************
+ * @file     swar_arm_acle.h
+ * @brief    Complete ARM Non-NEON ACLE intrinsics for Cortex m7 and m4
+ * 
+ ******************************************************************************
+ * @attention
+ * 
+ * (C) Copyright 2021 CEA LIST. All Rights Reserved.
+ *  Contributor(s): Vincent TEMPLIER (vincent.templier@cea.fr)
+ *                  Philippe DORE (philippe.dore@cea.fr)
+ * 
+ * This file is not part of the open source version of N2D2 and is NOT under
+ * the CeCILL-C license. This code is the property of the CEA. It can not be
+ * copied or disseminated without its authorization.
+ * 
+ ******************************************************************************
+ */
+#ifndef _SWAR_ARM_ACLE_H
+#define _SWAR_ARM_ACLE_H
+#include <cmsis_compiler.h>
+#include "assert.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+/**
+ * @brief   Rotate right and perform dual extracted 8-bit to 16-bit signed addition
+ * @details This function rotates op2, extracts two 8-bit values from op2 (at bit positions [7:0] and [23:16]), 
+ *          sign-extends them to 16 bits each, and adds the results to op1.
+ *          Emits a single "sxtab16 ..., ROR #ror" instruction.
+ * @param[in]  op1  Two 16-bit values in op1[15:0] and op1[31:16]
+ * @param[in]  op2  Two 8-bit values in op2[7:0] and op2[23:16] to be sign-extended
+ * @param[in]  ror  Number of bits to rotate op2. Must be 0, 8, 16 or 24 (checked by the assert).
+ *                  It is handed to the assembler through an "i" (immediate) operand, so it
+ *                  has to be known at compile time where the function is inlined
+ * @returns         The addition of op1 and op2, where op2 has been rotated, the 8-bit values in op2[7:0] 
+ *                  and op2[23:16] have been extracted and sign-extended prior to the addition
+ * 
+ */
+__attribute__((always_inline)) __STATIC_INLINE 
+int32_t __SXTAB16_RORn (const int32_t op1, const int32_t op2, const int8_t ror)
+{
+    int32_t result;
+    assert((ror == 0) || (ror == 8) || (ror == 16) || (ror == 24));
+    __ASM volatile ("sxtab16 %0, %1, %2, ROR %3" : "=r" (result) : "r" (op1) , "r" (op2) , "i" (ror) );
+    return result;
+}
+/**
+ * @brief   Rotate right, dual extract 8-bits and sign extend each to 16-bits
+ * @details Emits a single "sxtb16 ..., ROR #ror" instruction.
+ * @param[in]  op1  Two 8-bit values in op1[7:0] and op1[23:16] to be sign-extended
+ * @param[in]  ror  Number of bits to rotate op1. Must be 0, 8, 16 or 24 (checked by the assert).
+ *                  It is handed to the assembler through an "i" (immediate) operand, so it
+ *                  has to be known at compile time where the function is inlined
+ * @returns         The 8-bit values sign-extended to 16-bit values
+ * 
+ */
+__attribute__((always_inline)) __STATIC_INLINE 
+int32_t __SXTB16_RORn (const int32_t op1, const int8_t ror)
+{
+    int32_t result;
+    assert((ror == 0) || (ror == 8) || (ror == 16) || (ror == 24));
+    __ASM volatile ("sxtb16 %0, %1, ROR %2" : "=r" (result) : "r" (op1), "i" (ror) );
+    return result;
+}
+/**
+ * @brief   Rotate right and perform dual extracted 8-bit to 16-bit zero-extended addition
+ * @details This function rotates op2, extracts two 8-bit values from op2 (at bit positions [7:0] and [23:16]), 
+ *          zero-extends them to 16 bits each, and adds the results to op1.
+ *          Emits a single "uxtab16 ..., ROR #ror" instruction.
+ * @param[in]  op1  Two 16-bit values in op1[15:0] and op1[31:16]
+ * @param[in]  op2  Two 8-bit values in op2[7:0] and op2[23:16] to be zero-extended
+ * @param[in]  ror  Number of bits to rotate op2. Must be 0, 8, 16 or 24 (checked by the assert).
+ *                  It is handed to the assembler through an "i" (immediate) operand, so it
+ *                  has to be known at compile time where the function is inlined
+ * @returns         The addition of op1 and op2, where op2 has been rotated, the 8-bit values in op2[7:0] 
+ *                  and op2[23:16] have been extracted and zero-extended prior to the addition
+ * 
+ */
+__attribute__((always_inline)) __STATIC_INLINE 
+uint32_t __UXTAB16_RORn (const uint32_t op1, const uint32_t op2, const int8_t ror)
+{
+    uint32_t result;
+    assert((ror == 0) || (ror == 8) || (ror == 16) || (ror == 24));
+    __ASM volatile ("uxtab16 %0, %1, %2, ROR %3" : "=r" (result) : "r" (op1) , "r" (op2) , "i" (ror) );
+    return result;
+}
+/**
+ * @brief   Rotate right, dual extract 8-bits and zero extend each to 16-bits
+ * @details Emits a single "uxtb16 ..., ROR #ror" instruction.
+ * @param[in]  op1  Two 8-bit values in op1[7:0] and op1[23:16] to be zero-extended
+ * @param[in]  ror  Number of bits to rotate op1. Must be 0, 8, 16 or 24 (checked by the assert).
+ *                  It is handed to the assembler through an "i" (immediate) operand, so it
+ *                  has to be known at compile time where the function is inlined
+ * @returns         The 8-bit values zero-extended to 16-bit values
+ * 
+ */
+__attribute__((always_inline)) __STATIC_INLINE 
+uint32_t __UXTB16_RORn (const uint32_t op1, const int8_t ror)
+{
+    uint32_t result;
+    assert((ror == 0) || (ror == 8) || (ror == 16) || (ror == 24));
+    __ASM volatile ("uxtb16 %0, %1, ROR %2" : "=r" (result) : "r" (op1), "i" (ror) );
+    return result;
+}
+/**
+ * @brief   Sign extend Halfword
+ * @details Issues "sxth <result>, op1": the low halfword of op1 is sign-extended
+ *          to 32 bits. The prototype is unsigned, so the caller receives the
+ *          sign-extended bit pattern as a uint32_t
+ * @param[in]  op1  Word whose bits [15:0] are sign-extended
+ * @returns         The sign-extended 32-bit value
+ *
+ */
+__attribute__((always_inline)) __STATIC_INLINE
+uint32_t __SXTH (const uint32_t op1)
+{
+    uint32_t extended;
+    __ASM volatile ("sxth %[dst], %[src]"
+                    : [dst] "=r" (extended)
+                    : [src] "r" (op1) );
+    return extended;
+}
+/**
+ * @brief   Zero extend Halfword
+ * @details Issues "uxth <result>, op1": the low halfword of op1 is zero-extended
+ *          to 32 bits
+ * @param[in]  op1  Word whose bits [15:0] are zero-extended
+ * @returns         The zero-extended 32-bit value
+ *
+ */
+__attribute__((always_inline)) __STATIC_INLINE
+uint32_t __UXTH (const uint32_t op1)
+{
+    uint32_t extended;
+    __ASM volatile ("uxth %[dst], %[src]"
+                    : [dst] "=r" (extended)
+                    : [src] "r" (op1) );
+    return extended;
+}
+/**
+ * @brief   Sign extend the low halfword of a rotated word
+ * @details Issues "sxth <result>, op1, ROR ror": op1 is rotated right by ror bits,
+ *          then bits [15:0] of the rotated value are sign-extended to 32 bits.
+ *          The prototype is unsigned, so the caller receives the sign-extended
+ *          bit pattern as a uint32_t
+ * @param[in]  op1  Word to rotate and sign-extend
+ * @param[in]  ror  Number of bits to rotate op1. The assertion admits 0, 8, 16 and 24.
+ *                  It is bound with an "i" (immediate) asm constraint, so it has to be
+ *                  a compile-time constant at the point where the function is inlined
+ * @returns         The sign-extended 32-bit value
+ *
+ */
+__attribute__((always_inline)) __STATIC_INLINE
+uint32_t __SXTH_RORn (const uint32_t op1, const int8_t ror)
+{
+    uint32_t extended;
+    /* Only byte-aligned rotations are allowed */
+    assert((ror == 0) || (ror == 8) || (ror == 16) || (ror == 24));
+    __ASM volatile ("sxth %[dst], %[src], ROR %[rot]"
+                    : [dst] "=r" (extended)
+                    : [src] "r" (op1), [rot] "i" (ror) );
+    return extended;
+}
+/**
+ * @brief   Zero extend the low halfword of a rotated word
+ * @details Issues "uxth <result>, op1, ROR ror": op1 is rotated right by ror bits,
+ *          then bits [15:0] of the rotated value are zero-extended to 32 bits
+ * @param[in]  op1  Word to rotate and zero-extend
+ * @param[in]  ror  Number of bits to rotate op1. The assertion admits 0, 8, 16 and 24.
+ *                  It is bound with an "i" (immediate) asm constraint, so it has to be
+ *                  a compile-time constant at the point where the function is inlined
+ * @returns         The zero-extended 32-bit value
+ *
+ */
+__attribute__((always_inline)) __STATIC_INLINE
+uint32_t __UXTH_RORn (const uint32_t op1, const int8_t ror)
+{
+    uint32_t extended;
+    /* Only byte-aligned rotations are allowed */
+    assert((ror == 0) || (ror == 8) || (ror == 16) || (ror == 24));
+    __ASM volatile ("uxth %[dst], %[src], ROR %[rot]"
+                    : [dst] "=r" (extended)
+                    : [src] "r" (op1), [rot] "i" (ror) );
+    return extended;
+}
+/**
+ * @brief   Sign extend Byte
+ * @details Issues "sxtb <result>, op1": the low byte of op1 is sign-extended
+ *          to 32 bits. The prototype is unsigned, so the caller receives the
+ *          sign-extended bit pattern as a uint32_t
+ * @param[in]  op1  Word whose bits [7:0] are sign-extended
+ * @returns         The sign-extended 32-bit value
+ *
+ */
+__attribute__((always_inline)) __STATIC_INLINE
+uint32_t __SXTB (const uint32_t op1)
+{
+    uint32_t extended;
+    __ASM volatile ("sxtb %[dst], %[src]"
+                    : [dst] "=r" (extended)
+                    : [src] "r" (op1) );
+    return extended;
+}
+/**
+ * @brief   Zero extend Byte
+ * @details Issues "uxtb <result>, op1": the low byte of op1 is zero-extended
+ *          to 32 bits
+ * @param[in]  op1  Word whose bits [7:0] are zero-extended
+ * @returns         The zero-extended 32-bit value
+ *
+ */
+__attribute__((always_inline)) __STATIC_INLINE
+uint32_t __UXTB (const uint32_t op1)
+{
+    uint32_t extended;
+    __ASM volatile ("uxtb %[dst], %[src]"
+                    : [dst] "=r" (extended)
+                    : [src] "r" (op1) );
+    return extended;
+}
+/**
+ * @brief   Sign extend the low byte of a rotated word
+ * @details Issues "sxtb <result>, op1, ROR ror": op1 is rotated right by ror bits,
+ *          then bits [7:0] of the rotated value are sign-extended to 32 bits.
+ *          The prototype is unsigned, so the caller receives the sign-extended
+ *          bit pattern as a uint32_t
+ * @param[in]  op1  Word to rotate and sign-extend
+ * @param[in]  ror  Number of bits to rotate op1. The assertion admits 0, 8, 16 and 24.
+ *                  It is bound with an "i" (immediate) asm constraint, so it has to be
+ *                  a compile-time constant at the point where the function is inlined
+ * @returns         The sign-extended 32-bit value
+ *
+ */
+__attribute__((always_inline)) __STATIC_INLINE
+uint32_t __SXTB_RORn (const uint32_t op1, const int8_t ror)
+{
+    uint32_t extended;
+    /* Only byte-aligned rotations are allowed */
+    assert((ror == 0) || (ror == 8) || (ror == 16) || (ror == 24));
+    __ASM volatile ("sxtb %[dst], %[src], ROR %[rot]"
+                    : [dst] "=r" (extended)
+                    : [src] "r" (op1), [rot] "i" (ror) );
+    return extended;
+}
+/**
+ * @brief   Zero extend the low byte of a rotated word
+ * @details Issues "uxtb <result>, op1, ROR ror": op1 is rotated right by ror bits,
+ *          then bits [7:0] of the rotated value are zero-extended to 32 bits
+ * @param[in]  op1  Word to rotate and zero-extend
+ * @param[in]  ror  Number of bits to rotate op1. The assertion admits 0, 8, 16 and 24.
+ *                  It is bound with an "i" (immediate) asm constraint, so it has to be
+ *                  a compile-time constant at the point where the function is inlined
+ * @returns         The zero-extended 32-bit value
+ *
+ */
+__attribute__((always_inline)) __STATIC_INLINE
+uint32_t __UXTB_RORn (const uint32_t op1, const int8_t ror)
+{
+    uint32_t extended;
+    /* Only byte-aligned rotations are allowed */
+    assert((ror == 0) || (ror == 8) || (ror == 16) || (ror == 24));
+    __ASM volatile ("uxtb %[dst], %[src], ROR %[rot]"
+                    : [dst] "=r" (extended)
+                    : [src] "r" (op1), [rot] "i" (ror) );
+    return extended;
+}
+/**
+ * @brief   Signed Bit Field Extract
+ * @details Copies width adjacent bits of op1, starting at bit position lsb, into the
+ *          least significant bits of the result and sign extends them to 32 bits
+ * @param[in]  op1    Value to be extracted
+ * @param[in]  lsb    Position of the least significant bit of the bit field, 0..31.
+ *                    Bound with an "i" (immediate) asm constraint, so it has to be
+ *                    a compile-time constant where the function is inlined
+ * @param[in]  width  Width of the bit field, 1..(32 - lsb).
+ *                    Bound with an "i" (immediate) asm constraint, so it has to be
+ *                    a compile-time constant where the function is inlined
+ * @returns           Extracted bitfield and sign extended to 32 bits
+ *
+ */
+__attribute__((always_inline)) __STATIC_INLINE
+int32_t __SBFX (const uint32_t op1, const int8_t lsb, const int8_t width)
+{
+    int32_t result;
+    /* SBFX encodes lsb in [0, 31] and width in [1, 32 - lsb]: a field that reaches
+       bit 31 (width == 32 - lsb) is valid, a zero-width field is not. */
+    assert((lsb >= 0) && (lsb < 32) && (width >= 1) && (width <= 32-lsb));
+    __ASM volatile ("sbfx %0, %1, %2, %3" : "=r" (result) : "r" (op1), "i" (lsb), "i" (width) );
+    return result;
+}
+/**
+ * @brief   Unsigned Bit Field Extract
+ * @details Copies width adjacent bits of op1, starting at bit position lsb, into the
+ *          least significant bits of the result and zero extends them to 32 bits
+ * @param[in]  op1    Value to be extracted
+ * @param[in]  lsb    Position of the least significant bit of the bit field, 0..31.
+ *                    Bound with an "i" (immediate) asm constraint, so it has to be
+ *                    a compile-time constant where the function is inlined
+ * @param[in]  width  Width of the bit field, 1..(32 - lsb).
+ *                    Bound with an "i" (immediate) asm constraint, so it has to be
+ *                    a compile-time constant where the function is inlined
+ * @returns           Extracted bitfield and zero extended to 32 bits
+ *
+ */
+__attribute__((always_inline)) __STATIC_INLINE
+uint32_t __UBFX (const uint32_t op1, const int8_t lsb, const int8_t width)
+{
+    uint32_t result;
+    /* UBFX encodes lsb in [0, 31] and width in [1, 32 - lsb]: a field that reaches
+       bit 31 (width == 32 - lsb) is valid, a zero-width field is not. */
+    assert((lsb >= 0) && (lsb < 32) && (width >= 1) && (width <= 32-lsb));
+    __ASM volatile ("ubfx %0, %1, %2, %3" : "=r" (result) : "r" (op1), "i" (lsb), "i" (width) );
+    return result;
+}
+/**
+ * @brief   Bit Field Insert
+ * @details Replaces width bits of op2, starting at bit position lsb, with the
+ *          width least significant bits of op1 (op1[width-1:0]).
+ *          Other bits in op2 are unchanged
+ * @param[in]  op1    Source value; its low width bits are inserted
+ * @param[in]  op2    Destination value. It is passed by value: the caller's variable
+ *                    is not modified, the updated word is the return value
+ * @param[in]  lsb    Position of the least significant bit of the bit field, 0..31.
+ *                    Bound with an "i" (immediate) asm constraint, so it has to be
+ *                    a compile-time constant where the function is inlined
+ * @param[in]  width  Width of the bit field, 1..(32 - lsb).
+ *                    Bound with an "i" (immediate) asm constraint, so it has to be
+ *                    a compile-time constant where the function is inlined
+ * @returns           op2 with bits [lsb+width-1:lsb] replaced by op1[width-1:0]
+ *
+ */
+__attribute__((always_inline)) __STATIC_INLINE
+uint32_t __BFI (uint32_t op1, uint32_t op2, const int8_t lsb, const int8_t width)
+{
+    /* BFI encodes lsb in [0, 31] and width in [1, 32 - lsb]: a field that reaches
+       bit 31 (width == 32 - lsb) is valid, a zero-width field is not. */
+    assert((lsb >= 0) && (lsb < 32) && (width >= 1) && (width <= 32-lsb));
+    /* "+r" already declares op2 as both input and output of the instruction, so no
+       additional matching "0" (op2) input operand is listed: it would be redundant
+       and is not accepted by every compiler. */
+    __ASM volatile ("bfi %0, %1, %2, %3" : "+r" (op2) : "r" (op1), "i" (lsb), "i" (width) );
+    return op2;
+}
+/**
+ * @brief   Signed Divide
+ * @details Issues "sdiv <result>, op1, op2", i.e. a signed integer division of
+ *          op1 by op2. The prototype uses uint32_t for both operands and for the
+ *          result, so signed values travel through it as raw 32-bit patterns
+ * @param[in]  op1  Value to be divided
+ * @param[in]  op2  Divisor
+ * @returns         The signed quotient op1/op2, returned as a uint32_t bit pattern
+ *
+ */
+__attribute__((always_inline)) __STATIC_INLINE
+uint32_t __SDIV (const uint32_t op1, const uint32_t op2)
+{
+    uint32_t quotient;
+    __ASM volatile ("sdiv %[quo], %[num], %[den]"
+                    : [quo] "=r" (quotient)
+                    : [num] "r" (op1), [den] "r" (op2) );
+    return quotient;
+}
+/**
+ * @brief   Unsigned Divide
+ * @details Issues "udiv <result>, op1, op2", i.e. an unsigned integer division of
+ *          op1 by op2
+ * @param[in]  op1  Value to be divided
+ * @param[in]  op2  Divisor
+ * @returns         The unsigned quotient op1/op2
+ *
+ */
+__attribute__((always_inline)) __STATIC_INLINE
+uint32_t __UDIV (const uint32_t op1, const uint32_t op2)
+{
+    uint32_t quotient;
+    __ASM volatile ("udiv %[quo], %[num], %[den]"
+                    : [quo] "=r" (quotient)
+                    : [num] "r" (op1), [den] "r" (op2) );
+    return quotient;
+}
+#ifdef __cplusplus
+}
+#endif
+#endif
--- a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/typedefs.h
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/typedefs.h
-/*
-    (C) Copyright 2015 CEA LIST. All Rights Reserved.
-    Contributor(s): N2D2 Team
-    This software is governed by the CeCILL-C license under French law and
-    abiding by the rules of distribution of free software.  You can  use,
-    modify and/ or redistribute the software under the terms of the CeCILL-C
-    license as circulated by CEA, CNRS and INRIA at the following URL
-    "http://www.cecill.info".
-    As a counterpart to the access to the source code and  rights to copy,
-    modify and redistribute granted by the license, users are provided only
-    with a limited warranty  and the software's author,  the holder of the
-    economic rights,  and the successive licensors  have only  limited
-    liability.
-    The fact that you are presently reading this means that you have had
-    knowledge of the CeCILL-C license and that you accept its terms.
-*/
-#ifndef __N2D2_TYPEDEFS_H__
-#define __N2D2_TYPEDEFS_H__
-#include <stdint.h>
-typedef enum {
-    HWC,
-    CHW
-} Format_T;
-typedef enum {
-    Logistic,
-    LogisticWithLoss,
-    FastSigmoid,
-    Tanh,
-    TanhLeCun,
-    Saturation,
-    Rectifier,
-    Linear,
-    Softplus
-} ActivationFunction_T;
-typedef enum {
-    Max,
-    Average
-} Pooling_T;
-typedef enum {
-    Sum,
-    Mult
-} OpMode_T;
-typedef enum {
-    PerLayer,
-    PerInput,
-    PerChannel
-} CoeffMode_T;
-#endif // __N2D2_TYPEDEFS_H__
--- a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/typedefs.hpp
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/typedefs.hpp
--- a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/utils.hpp
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/utils.hpp
@@ -28,7 +28,7 @@
 #include <cstring>
 #include <limits>
-#include "typedefs.h"
+#include "kernels/typedefs.hpp"
 namespace N2D2_Export {

--- a/aidge_export_arm_cortexm/_Aidge_Arm/templates/configuration/conv_config.jinja
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/templates/configuration/conv_config.jinja
@@ -2,8 +2,7 @@
 #ifndef {{ name|upper }}_LAYER_H
 #define {{ name|upper }}_LAYER_H
-#include "typedefs.h"
+#include "kernels/nn_scaling_functions.hpp"
-#include "nn_scaling_functions.hpp"
 {% include "./_def_io.jinja" %}
 {% include "./_meminfo.jinja" %}

--- a/aidge_export_arm_cortexm/_Aidge_Arm/templates/configuration/fc_config.jinja
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/templates/configuration/fc_config.jinja
@@ -2,8 +2,7 @@
 #ifndef {{ name|upper }}_LAYER_H
 #define {{ name|upper }}_LAYER_H
-#include "typedefs.h"
+#include "kernels/nn_scaling_functions.hpp"
-#include "nn_scaling_functions.hpp"
 {# For layer configuration -#}
 {% include "./_def_io.jinja" %}

--- a/aidge_export_arm_cortexm/_Aidge_Arm/templates/configuration/pool_config.jinja
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/templates/configuration/pool_config.jinja
@@ -2,7 +2,7 @@
 #ifndef {{ name|upper }}_LAYER_H
 #define {{ name|upper }}_LAYER_H
-#include "typedefs.h"
+{# #include "typedefs.h" #}
 {# For layer configuration -#}
 {% include "./_def_io.jinja" %}

--- a/aidge_export_arm_cortexm/_Aidge_Arm/templates/forward_call/forward.jinja
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/templates/forward_call/forward.jinja
+#include <stdint.h>
+#ifdef SAVE_OUTPUTS
+#include <sys/types.h>
+#include <sys/stat.h>
+#endif
+#include "include/forward.hpp"
+// Layer & memory configurations
+{%- for header in headers %}
+#include "{{ header }}"
+{%- endfor %}
+// blabla
+// Memory block
+{%- if mem_section == None %}
+static {{mem_ctype}} mem[{{peak_mem}}];
+{%- else %}
+static {{mem_ctype}} mem[{{peak_mem}}] __attribute__((section("{{ mem_section }}")));
+{%- endif %}
+{# Forward function: the signature gets one `const <inputs_dtype[i]>* <inputs_name[i]>`
+   parameter per entry of inputs_name, followed by one `<outputs_dtype[o]>** <outputs_name[o]>_ptr`
+   parameter per entry of outputs_name. The body emits every entry of `actions` in order and
+   then stores each output buffer through its `_ptr` parameter. #}
+{#- Support multiple inputs with different datatypes and multiple outputs with different datatypes.
+    NOTE(review): every input parameter is emitted with a trailing comma, so the generated
+    signature presumably relies on outputs_name being non-empty; confirm against the exporter. -#}
+void {{ func_name }} (
+    {%- for i in range(inputs_name | length) -%}
+    const {{ inputs_dtype[i] }}* {{ inputs_name[i] }},
+    {%- endfor -%}
+    {%- for o in range(outputs_name | length) -%}
+    {{ outputs_dtype[o] }}** {{ outputs_name[o] }}_ptr{% if not loop.last %}, {% endif %}
+    {%- endfor -%})
+{
+    {%- for action in actions %}
+    {{ action }}
+    {%- endfor %}
+    {%- for output_name in outputs_name %}
+    *{{ output_name }}_ptr = {{ output_name }};
+    {%- endfor %}
+}
--- a/aidge_export_arm_cortexm/_Aidge_Arm/templates/forward_call/lowbit_conv_kernel.jinja
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/templates/forward_call/lowbit_conv_kernel.jinja
+{#- Forward-call fragment for a low-bit convolution layer: after the shared memory-offset
+    include, emits one call to N2D2_Export::lowbitconvcellPropagate. The template arguments
+    come from the upper-cased in_name[0] / out_name[0] / layer name macros, and the call
+    arguments are in_name[0], out_name[0], in_name[2], in_name[1] and <NAME>_SCALING.
+    NOTE(review): in_name[2] / in_name[1] are presumably the bias and weight buffers; confirm
+    against the kernel signature. -#}
+{% filter indent(width=4, first=False) %}
+{% include "./_mem_offset.jinja" %}
+N2D2_Export::lowbitconvcellPropagate<{{ in_name[0]|upper }}_NB_CHANNELS,
+                               {{ in_name[0]|upper }}_IN_HEIGHT,
+                               {{ in_name[0]|upper }}_IN_WIDTH,
+                               {{ out_name[0]|upper }}_NB_OUTPUTS,
+                               {{ out_name[0]|upper }}_OUT_HEIGHT,
+                               {{ out_name[0]|upper }}_OUT_WIDTH,
+                               {{ name|upper }}_PADDING_Y,
+                               {{ name|upper }}_PADDING_X,
+                               {{ name|upper }}_STRIDE_Y,
+                               {{ name|upper }}_STRIDE_X,
+                               {{ name|upper }}_KERNEL_HEIGHT,
+                               {{ name|upper }}_KERNEL_WIDTH,
+                               {{ name|upper }}_ACTIVATION, data<32>>
+                               ({{in_name[0]}}, {{out_name[0]}}, {{in_name[2]}}, {{in_name[1]}}, {{ name|upper }}_SCALING);
+{% endfilter %}
--- a/aidge_export_arm_cortexm/_Aidge_Arm/templates/forward_call/lowbit_fc_kernel.jinja
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/templates/forward_call/lowbit_fc_kernel.jinja
+{#- Forward-call fragment for a low-bit fully-connected layer: after the shared memory-offset
+    include, emits one call to N2D2_Export::lowbitfccellPropagate. The template arguments
+    come from the upper-cased in_name[0] / out_name[0] / layer name macros, and the call
+    arguments are in_name[0], out_name[0], in_name[2], in_name[1] and <NAME>_SCALING.
+    NOTE(review): in_name[2] / in_name[1] are presumably the bias and weight buffers; confirm
+    against the kernel signature. -#}
+{% filter indent(width=4, first=False) %}
+{% include "./_mem_offset.jinja" %}
+N2D2_Export::lowbitfccellPropagate<{{ in_name[0] | upper }}_NB_CHANNELS,
+                             {{ in_name[0] | upper }}_IN_HEIGHT,
+                             {{ in_name[0] | upper }}_IN_WIDTH,
+                             {{ out_name[0] | upper }}_NB_OUTPUTS,
+                             {{ out_name[0] | upper }}_OUT_HEIGHT,
+                             {{ out_name[0] | upper }}_OUT_WIDTH,
+                             {{name|upper}}_ACTIVATION,
+                             data<32>>
+                             ({{ in_name[0] }}, {{ out_name[0] }}, {{ in_name[2] }}, {{ in_name[1] }}, {{ name | upper }}_SCALING);
+{% endfilter %}
\ No newline at end of file
--- a/aidge_export_arm_cortexm/_Aidge_Arm/templates/forward_call/lowbit_pool_kernel.jinja
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/templates/forward_call/lowbit_pool_kernel.jinja
+{#- Forward-call fragment for a low-bit pooling layer: after the shared memory-offset
+    include, emits one call to N2D2_Export::lowbitpoolcellPropagate. The template arguments
+    come from the upper-cased in_name[0] / out_name[0] / layer name macros, and the call
+    takes only the input buffer in_name[0] and the output buffer out_name[0]. -#}
+{% filter indent(width=4, first=False) %}
+{% include "./_mem_offset.jinja" %}
+N2D2_Export::lowbitpoolcellPropagate<{{ in_name[0]|upper }}_NB_CHANNELS,
+                               {{ in_name[0]|upper }}_IN_HEIGHT,
+                               {{ in_name[0]|upper }}_IN_WIDTH,
+                               {{ out_name[0]|upper }}_NB_OUTPUTS,
+                               {{ out_name[0]|upper }}_OUT_HEIGHT,
+                               {{ out_name[0]|upper }}_OUT_WIDTH,
+                               {{ name|upper }}_PADDING_Y,
+                               {{ name|upper }}_PADDING_X,
+                               {{ name|upper }}_STRIDE_Y,
+                               {{ name|upper }}_STRIDE_X,
+                               {{ name|upper }}_KERNEL_HEIGHT,
+                               {{ name|upper }}_KERNEL_WIDTH,
+                               {{ name|upper }}_POOLING_TYPE,
+                               {{ name|upper }}_ACTIVATION>
+                               ({{in_name[0]}}, {{out_name[0]}});
+{% endfilter %}
--- a/aidge_export_arm_cortexm/boards/stm32/F7/Drivers/CMSIS/Device/ST/STM32F7xx/Include/stm32f746xx.h
+++ b/aidge_export_arm_cortexm/boards/stm32/F7/Drivers/CMSIS/Device/ST/STM32F7xx/Include/stm32f746xx.h
--- a/aidge_export_arm_cortexm/boards/stm32/F7/Drivers/CMSIS/Device/ST/STM32F7xx/Include/stm32f7xx.h
+++ b/aidge_export_arm_cortexm/boards/stm32/F7/Drivers/CMSIS/Device/ST/STM32F7xx/Include/stm32f7xx.h
+/**
+  ******************************************************************************
+  * @file    stm32f7xx.h
+  * @author  MCD Application Team
+  * @brief   CMSIS STM32F7xx Device Peripheral Access Layer Header File.
+  *
+  *          The file is the unique include file that the application programmer
+  *          is using in the C source code, usually in main.c. This file contains:
+  *           - Configuration section that allows to select:
+  *              - The STM32F7xx device used in the target application
+  *              - To use or not the peripheral's drivers in application code(i.e.
+  *                code will be based on direct access to peripheral's registers
+  *                rather than drivers API), this option is controlled by
+  *                "#define USE_HAL_DRIVER"
+  *
+  ******************************************************************************
+  * @attention
+  *
+  * Copyright (c) 2016 STMicroelectronics.
+  * All rights reserved.
+  *
+  * This software is licensed under terms that can be found in the LICENSE file
+  * in the root directory of this software component.
+  * If no LICENSE file comes with this software, it is provided AS-IS.
+  *
+  ******************************************************************************
+  */
+/** @addtogroup CMSIS
+  * @{
+  */
+/** @addtogroup stm32f7xx
+  * @{
+  */
+#ifndef __STM32F7xx_H
+#define __STM32F7xx_H
+#ifdef __cplusplus
+ extern "C" {
+#endif /* __cplusplus */
+/** @addtogroup Library_configuration_section
+  * @{
+  */
+/**
+  * @brief STM32 Family
+  */
+#if !defined  (STM32F7)
+#define STM32F7
+#endif /* STM32F7 */
+/* Uncomment the line below according to the target STM32 device used in your
+   application
+  */
+#if !defined (STM32F756xx) && !defined (STM32F746xx) && !defined (STM32F745xx) && !defined (STM32F765xx) && \
+    !defined (STM32F767xx) && !defined (STM32F769xx) && !defined (STM32F777xx) && !defined (STM32F779xx) && \
+    !defined (STM32F722xx) && !defined (STM32F723xx) && !defined (STM32F732xx) && !defined (STM32F733xx) && \
+    !defined (STM32F730xx) && !defined (STM32F750xx)
+  /* #define STM32F756xx */   /*!< STM32F756VG, STM32F756ZG, STM32F756ZG, STM32F756IG, STM32F756BG,
+                                   STM32F756NG Devices */
+  /* #define STM32F746xx */   /*!< STM32F746VE, STM32F746VG, STM32F746ZE, STM32F746ZG, STM32F746IE, STM32F746IG,
+                                   STM32F746BE, STM32F746BG, STM32F746NE, STM32F746NG Devices */
+  /* #define STM32F745xx */   /*!< STM32F745VE, STM32F745VG, STM32F745ZG, STM32F745ZE, STM32F745IE, STM32F745IG Devices */
+  /* #define STM32F765xx */   /*!< STM32F765BI, STM32F765BG, STM32F765NI, STM32F765NG, STM32F765II, STM32F765IG,
+                                   STM32F765ZI, STM32F765ZG, STM32F765VI, STM32F765VG Devices */
+  /* #define STM32F767xx */   /*!< STM32F767BG, STM32F767BI, STM32F767IG, STM32F767II, STM32F767NG, STM32F767NI,
+                                   STM32F767VG, STM32F767VI, STM32F767ZG, STM32F767ZI Devices */
+  /* #define STM32F769xx */   /*!< STM32F769AG, STM32F769AI, STM32F769BG, STM32F769BI, STM32F769IG, STM32F769II,
+                                   STM32F769NG, STM32F769NI, STM32F768AI Devices */
+  /* #define STM32F777xx */   /*!< STM32F777VI, STM32F777ZI, STM32F777II, STM32F777BI, STM32F777NI Devices */
+  /* #define STM32F779xx */   /*!< STM32F779II, STM32F779BI, STM32F779NI, STM32F779AI, STM32F778AI Devices */
+  /* #define STM32F722xx */   /*!< STM32F722IE, STM32F722ZE, STM32F722VE, STM32F722RE, STM32F722IC, STM32F722ZC,
+                                   STM32F722VC, STM32F722RC Devices */
+  /* #define STM32F723xx */   /*!< STM32F723IE, STM32F723ZE, STM32F723VE, STM32F723IC, STM32F723ZC, STM32F723VC Devices */
+  /* #define STM32F732xx */   /*!< STM32F732IE, STM32F732ZE, STM32F732VE, STM32F732RE Devices */
+  /* #define STM32F733xx */   /*!< STM32F733IE, STM32F733ZE, STM32F733VE Devices */
+  /* #define STM32F730xx */   /*!< STM32F730R, STM32F730V, STM32F730Z, STM32F730I Devices */
+  /* #define STM32F750xx */   /*!< STM32F750V, STM32F750Z, STM32F750N Devices */
+#endif
+/*  Tip: To avoid modifying this file each time you need to switch between these
+        devices, you can define the device in your toolchain compiler preprocessor.
+  */
+#if !defined  (USE_HAL_DRIVER)
+/**
+ * @brief Comment the line below if you will not use the peripherals drivers.
+   In this case, these drivers will not be included and the application code will
+   be based on direct access to peripherals registers
+   */
+  /*#define USE_HAL_DRIVER */
+#endif /* USE_HAL_DRIVER */
+/**
+  * @brief CMSIS Device version number V1.2.9
+  */
+#define __STM32F7_CMSIS_VERSION_MAIN   (0x01) /*!< [31:24] main version */
+#define __STM32F7_CMSIS_VERSION_SUB1   (0x02) /*!< [23:16] sub1 version */
+#define __STM32F7_CMSIS_VERSION_SUB2   (0x09) /*!< [15:8]  sub2 version */
+#define __STM32F7_CMSIS_VERSION_RC     (0x00) /*!< [7:0]  release candidate */
+#define __STM32F7_CMSIS_VERSION        ((__STM32F7_CMSIS_VERSION_MAIN << 24)\
+                                       |(__STM32F7_CMSIS_VERSION_SUB1 << 16)\
+                                       |(__STM32F7_CMSIS_VERSION_SUB2 << 8 )\
+                                       |(__STM32F7_CMSIS_VERSION_RC))
+/**
+  * @}
+  */
+/** @addtogroup Device_Included
+  * @{
+  */
+#if defined(STM32F722xx)
+  #include "stm32f722xx.h"
+#elif defined(STM32F723xx)
+  #include "stm32f723xx.h"
+#elif defined(STM32F732xx)
+  #include "stm32f732xx.h"
+#elif defined(STM32F733xx)
+  #include "stm32f733xx.h"
+#elif defined(STM32F756xx)
+  #include "stm32f756xx.h"
+#elif defined(STM32F746xx)
+  #include "stm32f746xx.h"
+#elif defined(STM32F745xx)
+  #include "stm32f745xx.h"
+#elif defined(STM32F765xx)
+  #include "stm32f765xx.h"
+#elif defined(STM32F767xx)
+  #include "stm32f767xx.h"
+#elif defined(STM32F769xx)
+  #include "stm32f769xx.h"
+#elif defined(STM32F777xx)
+  #include "stm32f777xx.h"
+#elif defined(STM32F779xx)
+  #include "stm32f779xx.h"
+#elif defined(STM32F730xx)
+  #include "stm32f730xx.h"
+#elif defined(STM32F750xx)
+  #include "stm32f750xx.h"
+#else
+ #error "Please select first the target STM32F7xx device used in your application (in stm32f7xx.h file)"
+#endif
+/**
+  * @}
+  */
+/** @addtogroup Exported_types
+  * @{
+  */
+typedef enum
+{
+  RESET = 0U,      /*!< Numeric value 0 */
+  SET = !RESET     /*!< Logical negation of RESET, i.e. 1 */
+} FlagStatus, ITStatus;   /*!< The same enumeration is exposed under both type names */
+typedef enum
+{
+  DISABLE = 0U,        /*!< Numeric value 0 */
+  ENABLE = !DISABLE    /*!< Logical negation of DISABLE, i.e. 1 */
+} FunctionalState;
+#define IS_FUNCTIONAL_STATE(STATE) (((STATE) == DISABLE) || ((STATE) == ENABLE))   /*!< True only for the two enumerators above; STATE is evaluated up to twice */
+typedef enum
+{
+  SUCCESS = 0U,        /*!< Numeric value 0: a successful status therefore tests false in a condition */
+  ERROR = !SUCCESS     /*!< Logical negation of SUCCESS, i.e. 1 */
+} ErrorStatus;
+/**
+  * @}
+  */
+/** @addtogroup Exported_macro
+  * @{
+  */
+#define SET_BIT(REG, BIT)     ((REG) |= (BIT))      /*!< OR the BIT mask into REG (plain read-modify-write) */
+#define CLEAR_BIT(REG, BIT)   ((REG) &= ~(BIT))     /*!< Clear in REG every bit that is set in the BIT mask */
+#define READ_BIT(REG, BIT)    ((REG) & (BIT))       /*!< Value of REG masked with BIT; REG is not modified */
+#define CLEAR_REG(REG)        ((REG) = (0x0))       /*!< Write zero to REG */
+#define WRITE_REG(REG, VAL)   ((REG) = (VAL))       /*!< Write VAL to REG */
+#define READ_REG(REG)         ((REG))               /*!< Value of REG */
+#define MODIFY_REG(REG, CLEARMASK, SETMASK)  WRITE_REG((REG), (((READ_REG(REG)) & (~(CLEARMASK))) | (SETMASK)))   /*!< Clear the CLEARMASK bits, then OR in SETMASK, and write the result back; REG appears twice in the expansion */
+#define POSITION_VAL(VAL)     (__CLZ(__RBIT(VAL)))  /*!< __CLZ applied to __RBIT(VAL); presumably the index of the least significant set bit of VAL, given the usual CMSIS meaning of these intrinsics */
+/* Use of CMSIS compiler intrinsics for register exclusive access */
+/* Atomic 32-bit register access macro to set one or several bits.
+ * Each attempt loads REG with __LDREXW, ORs in BIT and writes the result back with
+ * __STREXW; the attempt is repeated for as long as __STREXW returns a non-zero value.
+ * The expansion declares a temporary named `val`, so the REG and BIT expressions
+ * must not themselves refer to an identifier called `val`. */
+#define ATOMIC_SET_BIT(REG, BIT)                             \
+  do {                                                       \
+    uint32_t val;                                            \
+    do {                                                     \
+      val = __LDREXW((__IO uint32_t *)&(REG)) | (BIT);       \
+    } while ((__STREXW(val,(__IO uint32_t *)&(REG))) != 0U); \
+  } while(0)
+/* Atomic 32-bit register access macro to clear one or several bits.
+ * Each attempt loads REG with __LDREXW, masks out the bits set in BIT and writes the
+ * result back with __STREXW; the attempt is repeated for as long as __STREXW returns
+ * a non-zero value.
+ * The expansion declares a temporary named `val`, so the REG and BIT expressions
+ * must not themselves refer to an identifier called `val`. */
+#define ATOMIC_CLEAR_BIT(REG, BIT)                           \
+  do {                                                       \
+    uint32_t val;                                            \
+    do {                                                     \
+      val = __LDREXW((__IO uint32_t *)&(REG)) & ~(BIT);      \
+    } while ((__STREXW(val,(__IO uint32_t *)&(REG))) != 0U); \
+  } while(0)
+/* Atomic 32-bit register access macro to clear and set one or several bits.
+ * Each attempt loads REG with __LDREXW, masks out the CLEARMSK bits, ORs in SETMASK
+ * and writes the result back with __STREXW; the attempt is repeated for as long as
+ * __STREXW returns a non-zero value.
+ * The expansion declares a temporary named `val`, so the REG, CLEARMSK and SETMASK
+ * expressions must not themselves refer to an identifier called `val`. */
+#define ATOMIC_MODIFY_REG(REG, CLEARMSK, SETMASK)                          \
+  do {                                                                     \
+    uint32_t val;                                                          \
+    do {                                                                   \
+      val = (__LDREXW((__IO uint32_t *)&(REG)) & ~(CLEARMSK)) | (SETMASK); \
+    } while ((__STREXW(val,(__IO uint32_t *)&(REG))) != 0U);               \
+  } while(0)
+/* Atomic 16-bit register access macro to set one or several bits.
+ * Each attempt loads REG with __LDREXH, ORs in BIT and writes the result back with
+ * __STREXH; the attempt is repeated for as long as __STREXH returns a non-zero value.
+ * The expansion declares a temporary named `val`, so the REG and BIT expressions
+ * must not themselves refer to an identifier called `val`. */
+#define ATOMIC_SETH_BIT(REG, BIT)                            \
+  do {                                                       \
+    uint16_t val;                                            \
+    do {                                                     \
+      val = __LDREXH((__IO uint16_t *)&(REG)) | (BIT);       \
+    } while ((__STREXH(val,(__IO uint16_t *)&(REG))) != 0U); \
+  } while(0)
+/* Atomic 16-bit register access macro to clear one or several bits.
+ * Each attempt loads REG with __LDREXH, masks out the bits set in BIT and writes the
+ * result back with __STREXH; the attempt is repeated for as long as __STREXH returns
+ * a non-zero value.
+ * The expansion declares a temporary named `val`, so the REG and BIT expressions
+ * must not themselves refer to an identifier called `val`. */
+#define ATOMIC_CLEARH_BIT(REG, BIT)                          \
+  do {                                                       \
+    uint16_t val;                                            \
+    do {                                                     \
+      val = __LDREXH((__IO uint16_t *)&(REG)) & ~(BIT);      \
+    } while ((__STREXH(val,(__IO uint16_t *)&(REG))) != 0U); \
+  } while(0)
+/* Atomic 16-bit register access macro to clear and set one or several bits.
+ * Each attempt loads REG with __LDREXH, masks out the CLEARMSK bits, ORs in SETMASK
+ * and writes the result back with __STREXH; the attempt is repeated for as long as
+ * __STREXH returns a non-zero value.
+ * The expansion declares a temporary named `val`, so the REG, CLEARMSK and SETMASK
+ * expressions must not themselves refer to an identifier called `val`. */
+#define ATOMIC_MODIFYH_REG(REG, CLEARMSK, SETMASK)                         \
+  do {                                                                     \
+    uint16_t val;                                                          \
+    do {                                                                   \
+      val = (__LDREXH((__IO uint16_t *)&(REG)) & ~(CLEARMSK)) | (SETMASK); \
+    } while ((__STREXH(val,(__IO uint16_t *)&(REG))) != 0U);               \
+  } while(0)
+/**
+  * @}
+  */
+#ifdef USE_HAL_DRIVER
+ #include "stm32f7xx_hal.h"
+#endif /* USE_HAL_DRIVER */
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+#endif /* __STM32F7xx_H */
+/**
+  * @}
+  */
+/**
+  * @}
+  */
--- a/aidge_export_arm_cortexm/boards/stm32/F7/Drivers/CMSIS/Device/ST/STM32F7xx/Include/system_stm32f7xx.h
+++ b/aidge_export_arm_cortexm/boards/stm32/F7/Drivers/CMSIS/Device/ST/STM32F7xx/Include/system_stm32f7xx.h