diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Convolution/Conv.hpp b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Convolution/Conv.hpp
index 450106b1dc1bd7e7ba32f1129213019c107fd91a..fc1b97e504aa4ebd9683b5269cebd48718c2d248 100644
--- a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Convolution/Conv.hpp
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Convolution/Conv.hpp
@@ -21,15 +21,14 @@
 #ifndef __N2D2_EXPORT_CPP_CONV_HPP__
 #define __N2D2_EXPORT_CPP_CONV_HPP__
 
-#include "typedefs.h"
-#include "assert.h"
-#include "utils.hpp"
+#include "kernels/typedefs.hpp"
+#include "kernels/utils.hpp"
 #include "kernels/Macs.hpp"
 
 namespace N2D2_Export {
 
 
-template<int NB_CHANNELS, 
+template<int NB_CHANNELS,
          int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
          int NB_OUTPUTS,
          int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
@@ -69,7 +68,7 @@ void convcellPropagate(const Input_T* __restrict inputs,
             : max(PADDING_Y - (oy * STRIDE_Y), 0);
         const int syMax = (PADDING_Y == 0
                 && OUTPUTS_HEIGHT == OUTPUTS_HEIGHT_NOPAD) ? KERNEL_HEIGHT
-            : clamp(CHANNELS_HEIGHT + PADDING_Y - (oy * STRIDE_Y), 
+            : clamp(CHANNELS_HEIGHT + PADDING_Y - (oy * STRIDE_Y),
                     0, KERNEL_HEIGHT);
         const int iy = (oy * STRIDE_Y) - PADDING_Y;
 
@@ -78,7 +77,7 @@ void convcellPropagate(const Input_T* __restrict inputs,
                 : max(PADDING_X - (ox * STRIDE_X), 0);
             const int sxMax = (PADDING_X == 0
                     && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD) ? KERNEL_WIDTH
-                : clamp(CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X), 
+                : clamp(CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X),
                         0, KERNEL_WIDTH);
             const int ix = (ox * STRIDE_X) - PADDING_X;
 
@@ -132,8 +131,8 @@ void convcellPropagate(const Input_T* __restrict inputs,
                                 || sxMax - sxMin == KERNEL_WIDTH)))
                     {
                         macsOnRange<KERNEL_WIDTH * NB_CHANNELS>(
-                            inputs + iOffset, 
-                            weights + wOffset, 
+                            inputs + iOffset,
+                            weights + wOffset,
                             weightedSum);
                     }
                     else {
@@ -158,8 +157,8 @@ void convcellPropagate(const Input_T* __restrict inputs,
 
                             macsOnRange<NB_CHANNELS>(
                                 // same input line so no wrapping can occur
-                                inputs + iOffsetInRange, 
-                                weights + wOffset + sx * NB_CHANNELS, 
+                                inputs + iOffsetInRange,
+                                weights + wOffset + sx * NB_CHANNELS,
                                 weightedSum);
                         }
                     }
diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/FullyConnected/Fc.hpp b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/FullyConnected/Fc.hpp
index 9dce7efb5d2ac3b9ee287f82fbb51e46e00d5365..528a6f10ea70a8a9b2643ccbe6eec23301ff9c7b 100644
--- a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/FullyConnected/Fc.hpp
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/FullyConnected/Fc.hpp
@@ -21,15 +21,14 @@
 #ifndef __N2D2_EXPORT_CPP_FC_HPP__
 #define __N2D2_EXPORT_CPP_FC_HPP__
 
-#include "typedefs.h"
-#include "assert.h"
-#include "utils.hpp"
+#include "kernels/typedefs.hpp"
+#include "kernels/utils.hpp"
 #include "kernels/Macs.hpp"
 
 namespace N2D2_Export {
 
 
-template<int NB_CHANNELS, 
+template<int NB_CHANNELS,
          int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
          int NB_OUTPUTS,
          int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
@@ -84,8 +83,8 @@ void fccellPropagate(const Input_T* __restrict inputs,
 
             if (!wrapInRange && INPUT_MEM_STRIDE == NB_CHANNELS) {
                 macsOnRange<NB_CHANNELS * CHANNELS_WIDTH>(
-                    inputs + iOffset, 
-                    weights + wOffset, 
+                    inputs + iOffset,
+                    weights + wOffset,
                     weightedSum);
             }
             else {
@@ -102,7 +101,7 @@ void fccellPropagate(const Input_T* __restrict inputs,
 
                     macsOnRange<NB_CHANNELS>(
                         inputs + iOffsetInRange,
-                        weights + wOffset + ix * NB_CHANNELS, 
+                        weights + wOffset + ix * NB_CHANNELS,
                         weightedSum);
                 }
             }
diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Pooling/Pooling.hpp b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Pooling/Pooling.hpp
index 15126719090027775dcd3c9204d9233232e4bedf..30ee9e316e755ebb44b2775627a3a2cdf66e13a5 100644
--- a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Pooling/Pooling.hpp
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Pooling/Pooling.hpp
@@ -23,13 +23,13 @@
 
 #include <cassert>
 #include <cstdio>
-#include "typedefs.h"
-#include "utils.hpp"
+#include "kernels/typedefs.hpp"
+#include "kernels/utils.hpp"
 
 namespace N2D2_Export {
 
 
-template<int NB_CHANNELS, 
+template<int NB_CHANNELS,
         int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
         int NB_OUTPUTS,
         int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
@@ -74,7 +74,7 @@ void poolcellPropagate(const Input_T* __restrict inputs,
             : max(PADDING_Y - (oy * STRIDE_Y), 0);
         const int syMax = (PADDING_Y == 0
                 && OUTPUTS_HEIGHT == OUTPUTS_HEIGHT_NOPAD) ? POOL_HEIGHT
-            : clamp(CHANNELS_HEIGHT + PADDING_Y - (oy * STRIDE_Y), 
+            : clamp(CHANNELS_HEIGHT + PADDING_Y - (oy * STRIDE_Y),
                     0, POOL_HEIGHT);
         const int iy = (oy * STRIDE_Y) - PADDING_Y;
 
@@ -86,7 +86,7 @@ void poolcellPropagate(const Input_T* __restrict inputs,
                 const int sxMax = (PADDING_X == 0
                         && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
                             ? POOL_WIDTH
-                    : clamp(CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X), 
+                    : clamp(CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X),
                             0, POOL_WIDTH);
                 const int ix = (ox * STRIDE_X) - PADDING_X;
 
diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/Macs.hpp b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/Macs.hpp
index a0a1f85fe87c7a27ad685cdce9b49314d41e13bd..58a930536d770d4b4e43c316767efd5c8c102194 100644
--- a/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/Macs.hpp
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/kernels/Utils/Macs.hpp
@@ -1,3262 +1,211 @@
-// /*
-//     (C) Copyright 2017 CEA LIST. All Rights Reserved.
-//     Contributor(s): N2D2 Team
+/*
+    (C) Copyright 2017 CEA LIST. All Rights Reserved.
+    Contributor(s): N2D2 Team
 
-//     This software is governed by the CeCILL-C license under French law and
-//     abiding by the rules of distribution of free software.  You can  use,
-//     modify and/ or redistribute the software under the terms of the CeCILL-C
-//     license as circulated by CEA, CNRS and INRIA at the following URL
-//     "http://www.cecill.info".
+    This software is governed by the CeCILL-C license under French law and
+    abiding by the rules of distribution of free software.  You can  use,
+    modify and/ or redistribute the software under the terms of the CeCILL-C
+    license as circulated by CEA, CNRS and INRIA at the following URL
+    "http://www.cecill.info".
 
-//     As a counterpart to the access to the source code and  rights to copy,
-//     modify and redistribute granted by the license, users are provided only
-//     with a limited warranty  and the software's author,  the holder of the
-//     economic rights,  and the successive licensors  have only  limited
-//     liability.
+    As a counterpart to the access to the source code and  rights to copy,
+    modify and redistribute granted by the license, users are provided only
+    with a limited warranty  and the software's author,  the holder of the
+    economic rights,  and the successive licensors  have only  limited
+    liability.
 
-//     The fact that you are presently reading this means that you have had
-//     knowledge of the CeCILL-C license and that you accept its terms.
-// */
+    The fact that you are presently reading this means that you have had
+    knowledge of the CeCILL-C license and that you accept its terms.
+*/
 
-// #ifndef __N2D2_EXPORT_CPP_MACS_HPP__
-// #define __N2D2_EXPORT_CPP_MACS_HPP__
+#ifndef __N2D2_EXPORT_CPP_MACS_HPP__
+#define __N2D2_EXPORT_CPP_MACS_HPP__
 
-// #include <cstdint>
-// #include <limits>
-// #include <type_traits>
-// #include <cmsis_compiler.h>
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+#include <cmsis_compiler.h>
 
-// #include "swar_arm_acle.h"
+namespace N2D2_Export {
 
-// namespace N2D2_Export {
 
-
-// template<typename Input_T>
-// inline static
-// uint32_t XTB16(uint32_t val) 
-// {
-//     return std::is_unsigned<Input_T>::value ? __UXTB16(val) : __SXTB16(val);
-// }
-
-// template<int INPUTS_INC = 1,
-//          int WEIGHTS_INC = 1,
-//          typename Input_T,
-//          typename Weight_T,
-//          typename Sum_T>
-// inline static
-// Sum_T dualMac(const Input_T* __restrict inputs, 
-//               const Weight_T* __restrict weights, 
-//               Sum_T weightedSum) 
-// {
-//     weightedSum += inputs[0] * weights[0]
-//         + inputs[INPUTS_INC] * weights[WEIGHTS_INC];
-
-//     return weightedSum;
-// }
-
-// template<int INPUTS_INC = 1,
-//          int WEIGHTS_INC = 1,
-//          typename Input_T,
-//          typename Weight_T,
-//          typename Sum_T,
-//          typename std::enable_if<std::is_floating_point<Input_T>::value>::type* = nullptr>
-// inline static
-// Sum_T quadMac(const Input_T* __restrict inputs, 
-//               const Weight_T* __restrict weights, 
-//               Sum_T weightedSum) 
-// {
-//     weightedSum += inputs[0*INPUTS_INC] * weights[0*WEIGHTS_INC]
-//         + inputs[1*INPUTS_INC] * weights[1*WEIGHTS_INC]
-//         + inputs[2*INPUTS_INC] * weights[2*WEIGHTS_INC]
-//         + inputs[3*INPUTS_INC] * weights[3*WEIGHTS_INC];
-
-//     return weightedSum;
-// }
-
-// template<int INPUTS_INC = 1,
-//          int WEIGHTS_INC = 1,
-//          typename Input_T,
-//          typename Weight_T,
-//          typename Sum_T,
-//          typename std::enable_if<!std::is_floating_point<Input_T>::value>::type* = nullptr>
-// inline static
-// Sum_T quadMac(const Input_T* __restrict inputs, 
-//               const Weight_T* __restrict weights, 
-//               Sum_T weightedSum) 
-// {
-//     if(INPUTS_INC != 1 || WEIGHTS_INC != 1) {
-//         weightedSum += inputs[0*INPUTS_INC] * weights[0*WEIGHTS_INC]
-//             + inputs[1*INPUTS_INC] * weights[1*WEIGHTS_INC]
-//             + inputs[2*INPUTS_INC] * weights[2*WEIGHTS_INC]
-//             + inputs[3*INPUTS_INC] * weights[3*WEIGHTS_INC];
-
-//         return weightedSum;
-//     }
-
-//     // Inputs loading & preparation
-//     uint32_t in;
-//     memcpy((void*) &in, inputs, sizeof(in));
-    
-//     uint32_t in1 = XTB16<Input_T>(in);
-//     uint32_t in2 = XTB16<Input_T>(in >> 8);
-    
-//     // Weights loading & preparation
-//     uint32_t wt;
-//     memcpy((void*) &wt, weights, sizeof(wt));
-    
-//     uint32_t wt1 = XTB16<Weight_T>(wt);
-//     uint32_t wt2 = XTB16<Weight_T>(wt >> 8);
-
-//     // Computation
-//     if(std::is_same<Sum_T, int32_t>::value) {
-//         weightedSum = __SMLAD(in1, wt1, weightedSum);
-//         weightedSum = __SMLAD(in2, wt2, weightedSum);
-//     }
-//     else {
-//         weightedSum = __SMLALD(in1, wt1, weightedSum);
-//         weightedSum = __SMLALD(in2, wt2, weightedSum);
-        
-//     }
-    
-//     return weightedSum;
-// }
-
-
-
-
-// // ----------------------------------------------------------------------------
-// // -------------- MAC computing functions for kernel 4W-4A --------------------
-// // ----------------------------------------------------------------------------
-
-// /**
-//  * @brief   Unsigned mono mac operation (4W/4A version)
-//  * @details Performs one mac operation for signed 4-bits weights
-//  *          and unsigned 4-bits inputs.
-//  * 
-//  * @tparam  Input_T     Input type (should be udata<4>)
-//  * @tparam  Weight_T    Weight type (should be data<4>)
-//  * 
-//  * @param[in]      inputs          Pointer to input vector
-//  * @param[in]      weights         Pointer to kernel weights
-//  * @param[in,out]  weightedSum     Accumulating sum from the 
-//  *                                 previous mac operations
-//  * @returns                        Updated weightedSum with 
-//  *                                 the result of the dual mac operation
-//  */
-// template<typename Input_T, typename Weight_T, typename Sum_T,
-//          typename std::enable_if<(std::is_unsigned<Input_T>::value
-//          && std::numeric_limits<Weight_T>::digits == 4
-//          && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
-// __attribute__((always_inline)) static inline
-// Sum_T monoMac(const Input_T* __restrict inputs,
-//               const Weight_T* __restrict weights,
-//               Sum_T weightedSum)
-// {
-//     weightedSum += __UBFX(inputs[0], 4, 4) * __SBFX(weights[0], 4, 4);
-//     return weightedSum;
-// }
-
-// /**
-//  * @brief   Signed mono mac operation (4W/4A version)
-//  * @details Performs one mac operation for signed 4-bits weights
-//  *          and signed 4-bits inputs.
-//  * 
-//  * @tparam  Input_T     Input type (should be data<4>)
-//  * @tparam  Weight_T    Weight type (should be data<4>)
-//  * 
-//  * @param[in]      inputs          Pointer to input vector
-//  * @param[in]      weights         Pointer to kernel weights
-//  * @param[in,out]  weightedSum     Accumulating sum from the 
-//  *                                 previous mac operations
-//  * @returns                        Updated weightedSum with 
-//  *                                 the result of the dual mac operation
-//  */
-// template<typename Input_T, typename Weight_T, typename Sum_T,
-//          typename std::enable_if<(!std::is_unsigned<Input_T>::value
-//          && std::numeric_limits<Weight_T>::digits == 4
-//          && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
-// __attribute__((always_inline)) static inline
-// Sum_T monoMac(const Input_T* __restrict inputs,
-//               const Weight_T* __restrict weights,
-//               Sum_T weightedSum)
-// {
-//     weightedSum += __SBFX(inputs[0], 4, 4) * __SBFX(weights[0], 4, 4);
-//     return weightedSum;
-// }
-
-// /**
-//  * @brief   Unsigned dual mac operation (4W/4A version)
-//  * @details Performs two mac operations for signed 4-bits weights
-//  *          and unsigned 4-bits inputs. Extracts the two 4-bits weights
-//  *          from a stored 8-bits weight and associates them into 
-//  *          a 32-bits value. Then extracts the two 4-bits inputs
-//  *          from a stored 8-bits input and associates them into 
-//  *          a 32-bits value. Finally performs a dual mac operation 
-//  *          with the __SMLAD instruction
-//  * 
-//  * @tparam  Input_T     Input type (should be udata<4>)
-//  * @tparam  Weight_T    Weight type (should be data<4>)
-//  * 
-//  * @param[in]      inputs          Pointer to compressed input vector
-//  * @param[in]      weights         Pointer to compressed kernel weights
-//  * @param[in,out]  weightedSum     Accumulating sum from the 
-//  *                                 previous mac operations
-//  * @returns                        Updated weightedSum with 
-//  *                                 the result of the dual mac operation
-//  */
-// template<typename Input_T, typename Weight_T, typename Sum_T,
-//          typename std::enable_if<(std::is_unsigned<Input_T>::value
-//          && std::numeric_limits<Weight_T>::digits == 4
-//          && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
-// __attribute__((always_inline)) static inline
-// Sum_T dualMac(const Input_T* __restrict inputs,
-//               const Weight_T* __restrict weights,
-//               Sum_T weightedSum)
-// {
-//     uint8_t wt;
-//     std::memcpy((void*) &wt, weights, sizeof(wt));
-
-//     int32_t w0 = __SBFX(wt, 0, 4);
-//     int32_t w1 = __SBFX(wt, 4, 4);
-//     uint32_t wght = __BFI(w1, w0, 16, 16);
-
-//     uint8_t in;
-//     std::memcpy((void*) &in, inputs, sizeof(in));
-
-//     int32_t a0 = __UBFX(in, 0, 4);
-//     int32_t a1 = __UBFX(in, 4, 4);
-//     uint32_t act = __BFI(a1, a0, 16, 16);
-
-//     weightedSum = __SMLAD(act, wght, weightedSum);
-
-//     return weightedSum;
-// }
-
-// /**
-//  * @brief   Signed dual mac operation (4W/4A version)
-//  * @details Performs two mac operations for signed 4-bits weights
-//  *          and signed 4-bits inputs. Extracts the two 4-bits weights
-//  *          from a stored 8-bits weight and associates them into 
-//  *          a 32-bits value. Then extracts the two 4-bits inputs
-//  *          from a stored 8-bits input and associates them into 
-//  *          a 32-bits value. Finally performs a dual mac operation 
-//  *          with the __SMLAD instruction
-//  * 
-//  * @tparam  Input_T     Input type (should be data<4>)
-//  * @tparam  Weight_T    Weight type (should be data<4>)
-//  * 
-//  * @param[in]      inputs          Pointer to compressed input vector
-//  * @param[in]      weights         Pointer to compressed kernel weights
-//  * @param[in,out]  weightedSum     Accumulating sum from the 
-//  *                                 previous mac operations
-//  * @returns                        Updated weightedSum with 
-//  *                                 the result of the dual mac operation
-//  */
-// template<typename Input_T, typename Weight_T, typename Sum_T,
-//          typename std::enable_if<(!std::is_unsigned<Input_T>::value
-//          && std::numeric_limits<Weight_T>::digits == 4
-//          && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
-// __attribute__((always_inline)) static inline
-// Sum_T dualMac(const Input_T* __restrict inputs,
-//               const Weight_T* __restrict weights,
-//               Sum_T weightedSum)
-// {
-//     uint8_t wt;
-//     std::memcpy((void*) &wt, weights, sizeof(wt));
-
-//     int32_t w0 = __SBFX(wt, 0, 4);
-//     int32_t w1 = __SBFX(wt, 4, 4);
-//     uint32_t wght = __BFI(w1, w0, 16, 16);
-
-//     uint8_t in;
-//     std::memcpy((void*) &in, inputs, sizeof(in));
-
-//     int32_t a0 = __SBFX(in, 0, 4);
-//     int32_t a1 = __SBFX(in, 4, 4);
-//     uint32_t act = __BFI(a1, a0, 16, 16);
-
-//     weightedSum = __SMLAD(act, wght, weightedSum);
-
-//     return weightedSum;
-// }
-
-// /**
-//  * @brief   Unsigned quad mac operation (4W/4A version)
-//  * @details Performs four mac operations for signed 4-bits weights
-//  *          and unsigned 4-bits inputs. Extracts the four 4-bits weights
-//  *          from two stored 8-bits weights and associates them into 
-//  *          two 32-bits values. Then extracts the four 4-bits inputs
-//  *          from two stored 8-bits inputs and associates them into 
-//  *          two 32-bits values. Finally performs a double dual mac operation 
-//  *          with the __SMLAD instruction
-//  * 
-//  * @tparam  Input_T     Input type (should be udata<4>)
-//  * @tparam  Weight_T    Weight type (should be data<4>)
-//  * 
-//  * @param[in]      inputs          Pointer to compressed input vector
-//  * @param[in]      weights         Pointer to compressed kernel weights
-//  * @param[in,out]  weightedSum     Accumulating sum from the 
-//  *                                 previous mac operations
-//  * @returns                        Updated weightedSum with 
-//  *                                 the result of the quad mac operation
-//  */
-// template<typename Input_T, typename Weight_T, typename Sum_T,
-//          typename std::enable_if<(std::is_unsigned<Input_T>::value
-//          && std::numeric_limits<Weight_T>::digits == 4
-//          && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
-// __attribute__((always_inline)) static inline
-// Sum_T quadMac(const Input_T* __restrict inputs,
-//               const Weight_T* __restrict weights,
-//               Sum_T weightedSum)
-// {
-//     uint16_t wt;
-//     std::memcpy((void*) &wt, weights, sizeof(wt));
-
-//     int32_t w0 = __SBFX(wt, 0, 4);
-//     int32_t w1 = __SBFX(wt, 4, 4);
-//     int32_t w2 = __SBFX(wt, 8, 4);
-//     int32_t w3 = __SBFX(wt, 12, 4);
-
-//     uint32_t evenW1 = __BFI(w2, w0, 16, 16);
-//     uint32_t oddW1  = __BFI(w3, w1, 16, 16);
-
-//     uint16_t in;
-//     std::memcpy((void*) &in, inputs, sizeof(in));
-
-//     int32_t a0 = __UBFX(in, 0, 4);
-//     int32_t a1 = __UBFX(in, 4, 4);
-//     int32_t a2 = __UBFX(in, 8, 4);
-//     int32_t a3 = __UBFX(in, 12, 4);
-
-//     uint32_t evenA1 = __BFI(a2, a0, 16, 16);
-//     uint32_t oddA1  = __BFI(a3, a1, 16, 16);
-
-//     weightedSum = __SMLAD(evenA1, evenW1, weightedSum);
-//     weightedSum = __SMLAD(oddA1, oddW1, weightedSum);
-
-//     return weightedSum;
-// }
-
-// /**
-//  * @brief   Signed quad mac operation (4W/4A version)
-//  * @details Performs four mac operations for signed 4-bits weights
-//  *          and signed 4-bits inputs. Extracts the four 4-bits weights
-//  *          from two stored 8-bits weights and associates them into 
-//  *          two 32-bits values. Then extracts the four 4-bits inputs
-//  *          from two stored 8-bits inputs and associates them into 
-//  *          two 32-bits values. Finally performs a double dual mac operation 
-//  *          with the __SMLAD instruction
-//  * 
-//  * @tparam  Input_T     Input type (should be data<4>)
-//  * @tparam  Weight_T    Weight type (should be data<4>)
-//  * 
-//  * @param[in]      inputs          Pointer to compressed input vector
-//  * @param[in]      weights         Pointer to compressed kernel weights
-//  * @param[in,out]  weightedSum     Accumulating sum from the 
-//  *                                 previous mac operations
-//  * @returns                        Updated weightedSum with 
-//  *                                 the result of the quad mac operation
-//  */
-// template<typename Input_T, typename Weight_T, typename Sum_T,
-//          typename std::enable_if<(!std::is_unsigned<Input_T>::value
-//          && std::numeric_limits<Weight_T>::digits == 4
-//          && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
-// __attribute__((always_inline)) static inline
-// Sum_T quadMac(const Input_T* __restrict inputs,
-//               const Weight_T* __restrict weights,
-//               Sum_T weightedSum)
-// {
-//     uint16_t wt;
-//     std::memcpy((void*) &wt, weights, sizeof(wt));
-
-//     int32_t w0 = __SBFX(wt, 0, 4);
-//     int32_t w1 = __SBFX(wt, 4, 4);
-//     int32_t w2 = __SBFX(wt, 8, 4);
-//     int32_t w3 = __SBFX(wt, 12, 4);
-
-//     uint32_t evenW1 = __PKHBT(w2, w0, 16);
-//     uint32_t oddW1  = __PKHBT(w3, w1, 16);
-
-//     uint16_t in;
-//     std::memcpy((void*) &in, inputs, sizeof(in));
-
-//     int32_t a0 = __SBFX(in, 0, 4);
-//     int32_t a1 = __SBFX(in, 4, 4);
-//     int32_t a2 = __SBFX(in, 8, 4);
-//     int32_t a3 = __SBFX(in, 12, 4);
-    
-//     uint32_t evenA1 = __PKHBT(a2, a0, 16);
-//     uint32_t oddA1  = __PKHBT(a3, a1, 16);
-
-//     weightedSum = __SMLAD(evenA1, evenW1, weightedSum);
-//     weightedSum = __SMLAD(oddA1, oddW1, weightedSum);
-
-//     return weightedSum;
-// }
-
-// /**
-//  * @brief   Unsigned octo mac operation (4W/4A version)
-//  * @details Performs eight mac operations for signed 4-bits weights
-//  *          and unsigned 4-bits inputs. Extracts the eight 4-bits weights
-//  *          from four stored 8-bits weights and associates them into 
-//  *          four 32-bits values. Then extracts the eight 4-bits inputs
-//  *          from four stored 8-bits inputs and associates them into 
-//  *          four 32-bits values. Finally performs a quadruple dual mac operation 
-//  *          with the __SMLAD instruction
-//  * 
-//  * @tparam  Input_T     Input type (should be udata<4>)
-//  * @tparam  Weight_T    Weight type (should be data<4>)
-//  * 
-//  * @param[in]      inputs          Pointer to compressed input vector
-//  * @param[in]      weights         Pointer to compressed kernel weights
-//  * @param[in,out]  weightedSum     Accumulating sum from the 
-//  *                                 previous mac operations
-//  * @returns                        Updated weightedSum with 
-//  *                                 the result of the octo mac operation
-//  */
-// // template<typename Input_T, typename Weight_T,
-// //          typename std::enable_if<(std::is_unsigned<Input_T>::value
-// //          && std::numeric_limits<Weight_T>::digits == 4
-// //          && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
-// // __attribute__((always_inline)) static inline
-// // Sum_T octoMac(const Input_T* __restrict inputs,
-// //               const Weight_T* __restrict weights,
-// //               Sum_T weightedSum)
-// // {
-// //     uint32_t wt;
-// //     std::memcpy((void*) &wt, weights, sizeof(wt));
-
-// //     int32_t w0 = __SBFX(wt, 0, 4);
-// //     int32_t w1 = __SBFX(wt, 4, 4);
-// //     int32_t w2 = __SBFX(wt, 8, 4);
-// //     int32_t w3 = __SBFX(wt, 12, 4);
-// //     int32_t w4 = __SBFX(wt, 16, 4);
-// //     int32_t w5 = __SBFX(wt, 20, 4);
-// //     int32_t w6 = __SBFX(wt, 24, 4);
-// //     int32_t w7 = __SBFX(wt, 28, 4);
-
-// //     // uint32_t weight0 = __BFI(w4, w0, 16, 16);
-// //     // uint32_t weight1 = __BFI(w5, w1, 16, 16);
-// //     // uint32_t weight2 = __BFI(w6, w2, 16, 16);
-// //     // uint32_t weight3 = __BFI(w7, w3, 16, 16);
-
-// //     uint32_t weight0 = __PKHBT(w0, w4, 16);
-// //     uint32_t weight1 = __PKHBT(w1, w5, 16);
-// //     uint32_t weight2 = __PKHBT(w2, w6, 16);
-// //     uint32_t weight3 = __PKHBT(w3, w7, 16);
-
-// //     uint32_t in;
-// //     std::memcpy((void*) &in, inputs, sizeof(in));
-
-// //     uint32_t act0 = in & 0xF000F;
-// //     uint32_t act1 = (in >> 4) & 0xF000F;
-// //     uint32_t act2 = (in >> 8) & 0xF000F;
-// //     uint32_t act3 = (in >> 12) & 0xF000F;
-
-// //     weightedSum = __SMLAD(act0, weight0, weightedSum);
-// //     weightedSum = __SMLAD(act1, weight1, weightedSum);
-// //     weightedSum = __SMLAD(act2, weight2, weightedSum);
-// //     weightedSum = __SMLAD(act3, weight3, weightedSum);
-
-// //     return weightedSum;
-// // }
-
-// // template<typename Input_T, typename Weight_T,
-// //          typename std::enable_if<(std::is_unsigned<Input_T>::value
-// //          && std::numeric_limits<Weight_T>::digits == 4
-// //          && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
-// // __attribute__((always_inline)) static inline
-// // Sum_T octoMac(const Input_T* __restrict inputs,
-// //               const Weight_T* __restrict weights,
-// //               Sum_T weightedSum)
-// // {
-// //     union n2d2_dataword wt;
-// //     std::memcpy((void*) &wt, weights, sizeof(wt));
-
-// //     union n2d2_udataword in;
-// //     std::memcpy((void*) &in, inputs, sizeof(in));
-
-// //     for (int i = 0; i < 4; ++i) {
-// //         weightedSum += (data<32>)(in.half_bytes[i].fields.op0) * wt.half_bytes[i].fields.op0;
-// //         weightedSum += (data<32>)(in.half_bytes[i].fields.op1) * wt.half_bytes[i].fields.op1;
-// //     }
-
-// //     // weightedSum += (data<32>)(in.half_bytes[0].fields.op0) * wt.half_bytes[0].fields.op0;
-// //     // weightedSum += (data<32>)(in.half_bytes[0].fields.op1) * wt.half_bytes[0].fields.op1;
-// //     // weightedSum += (data<32>)(in.half_bytes[1].fields.op0) * wt.half_bytes[1].fields.op0;
-// //     // weightedSum += (data<32>)(in.half_bytes[1].fields.op1) * wt.half_bytes[1].fields.op1;
-// //     // weightedSum += (data<32>)(in.half_bytes[2].fields.op0) * wt.half_bytes[2].fields.op0;
-// //     // weightedSum += (data<32>)(in.half_bytes[2].fields.op1) * wt.half_bytes[2].fields.op1;
-// //     // weightedSum += (data<32>)(in.half_bytes[3].fields.op0) * wt.half_bytes[3].fields.op0;
-// //     // weightedSum += (data<32>)(in.half_bytes[3].fields.op1) * wt.half_bytes[3].fields.op1;
-
-// //     return weightedSum;
-// // }
-
-// template<typename Input_T, typename Weight_T, typename Sum_T,
-//          typename std::enable_if<(std::is_unsigned<Input_T>::value
-//          && std::numeric_limits<Weight_T>::digits == 4
-//          && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
-// __attribute__((always_inline)) static inline
-// Sum_T octoMac(const Input_T* __restrict inputs,
-//               const Weight_T* __restrict weights,
-//               Sum_T weightedSum)
-// {
-//     uint32_t wt;
-//     memcpy((void*) &wt, weights, sizeof(wt));
-
-//     // Works with weights * 4096 (weights << 12)
-//     const uint32_t WeightMask = 0xF000F000;
-//     uint32_t weight0 = WeightMask & (wt << 12);
-//     uint32_t weight1 = WeightMask & (wt << 8);
-//     uint32_t weight2 = WeightMask & (wt << 4);
-//     uint32_t weight3 = WeightMask & (wt);
-
-//     uint32_t in;
-//     memcpy((void*) &in, inputs, sizeof(in));
-
-//     const uint32_t ActMask = 0x000F000F; // to explicit instructions
-//     uint32_t act0 = in & ActMask;
-//     // Expect second operand shift
-//     uint32_t act1 = ActMask & (in >> 4);
-//     uint32_t act2 = ActMask & (in >> 8);
-//     uint32_t act3 = ActMask & (in >> 12);
-
-//     Sum_T sum = 0;
-//     sum = __SMLAD(act0, weight0, sum);
-//     sum = __SMLAD(act1, weight1, sum);
-//     sum = __SMLAD(act2, weight2, sum);
-//     sum = __SMLAD(act3, weight3, sum);
-
-//     return weightedSum + (sum >> 12);
-// }
-
-// /**
-//  * @brief   Signed octo mac operation (4W/4A version)
-//  * @details Performs eight mac operations for signed 4-bits weights
-//  *          and signed 4-bits inputs. Extracts the eight 4-bits weights
-//  *          from four stored 8-bits weights and associates them into 
-//  *          four 32-bits values. Then extracts the eight 4-bits inputs
-//  *          from four stored 8-bits inputs and associates them into 
-//  *          four 32-bits values. Finally performs a quadruple dual mac operation 
-//  *          with the __SMLAD instruction
-//  * 
-//  * @tparam  Input_T     Input type (should be data<4>)
-//  * @tparam  Weight_T    Weight type (should be data<4>)
-//  * 
-//  * @param[in]      inputs          Pointer to compressed input vector
-//  * @param[in]      weights         Pointer to compressed kernel weights
-//  * @param[in,out]  weightedSum     Accumulating sum from the 
-//  *                                 previous mac operations
-//  * @returns                        Updated weightedSum with 
-//  *                                 the result of the octo mac operation
-//  */
-// template<typename Input_T, typename Weight_T, typename Sum_T,
-//          typename std::enable_if<(!std::is_unsigned<Input_T>::value
-//          && std::numeric_limits<Weight_T>::digits == 4
-//          && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
-// __attribute__((always_inline)) static inline
-// Sum_T octoMac(const Input_T* __restrict inputs,
-//               const Weight_T* __restrict weights,
-//               Sum_T weightedSum)
-// {
-//     uint32_t wt;
-//     std::memcpy((void*) &wt, weights, sizeof(wt));
-
-//     int32_t w0 = __SBFX(wt, 0, 4);
-//     int32_t w1 = __SBFX(wt, 4, 4);
-//     int32_t w2 = __SBFX(wt, 8, 4);
-//     int32_t w3 = __SBFX(wt, 12, 4);
-//     int32_t w4 = __SBFX(wt, 16, 4);
-//     int32_t w5 = __SBFX(wt, 20, 4);
-//     int32_t w6 = __SBFX(wt, 24, 4);
-//     int32_t w7 = __SBFX(wt, 28, 4);
-
-//     uint32_t evenW1 = __PKHBT(w2, w0, 16);
-//     uint32_t oddW1  = __PKHBT(w3, w1, 16);
-//     uint32_t evenW2 = __PKHBT(w6, w4, 16);
-//     uint32_t oddW2  = __PKHBT(w7, w5, 16);
-
-//     uint32_t in;
-//     std::memcpy((void*) &in, inputs, sizeof(in));
-
-//     int32_t a0 = __SBFX(in, 0, 4);
-//     int32_t a1 = __SBFX(in, 4, 4);
-//     int32_t a2 = __SBFX(in, 8, 4);
-//     int32_t a3 = __SBFX(in, 12, 4);
-//     int32_t a4 = __SBFX(in, 16, 4);
-//     int32_t a5 = __SBFX(in, 20, 4);
-//     int32_t a6 = __SBFX(in, 24, 4);
-//     int32_t a7 = __SBFX(in, 28, 4);
-
-//     uint32_t evenA1 = __PKHBT(a2, a0, 16);
-//     uint32_t oddA1  = __PKHBT(a3, a1, 16);
-//     uint32_t evenA2 = __PKHBT(a6, a4, 16);
-//     uint32_t oddA2  = __PKHBT(a7, a5, 16);
-
-//     weightedSum = __SMLAD(evenA1, evenW1, weightedSum);
-//     weightedSum = __SMLAD(oddA1, oddW1, weightedSum);
-//     weightedSum = __SMLAD(evenA2, evenW2, weightedSum);
-//     weightedSum = __SMLAD(oddA2, oddW2, weightedSum);
-
-//     return weightedSum;
-// }
-
-
-// // template<typename Input_T, typename Weight_T, typename Sum_T,
-// //          typename std::enable_if<(std::is_unsigned<Input_T>::value
-// //          && std::numeric_limits<Weight_T>::digits == 4
-// //          && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
-// // void macsOnParallel(const Input_T* __restrict inputs,
-// //                     const Weight_T* __restrict weights,
-// //                     Sum_T* weightedSums,
-// //                     const int nb_data)
-// // {
-// //     uint32_t wt = 0;
-// //     std::memcpy((void*) &wt, weights, ceil((double)nb_data/2));
-
-// //     uint32_t in = 0;
-// //     std::memcpy((void*) &in, inputs, ceil((double)nb_data/2));
-
-// //     for (int i = 0; i < nb_data; ++i) {
-// //         weightedSums[i] += __SBFX(wt, 4*i, 4) * __UBFX(in, 4*i, 4);
-// //     }
-// // }
-
-// // template<typename Input_T, typename Weight_T, typename Sum_T,
-// //          typename std::enable_if<(!std::is_unsigned<Input_T>::value
-// //          && std::numeric_limits<Weight_T>::digits == 4
-// //          && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
-// // void macsOnParallel(const Input_T* __restrict inputs,
-// //                     const Weight_T* __restrict weights,
-// //                     Sum_T* weightedSums,
-// //                     const int nb_data)
-// // {
-// //     uint32_t wt = 0;
-// //     std::memcpy((void*) &wt, weights, ceil((double)nb_data/2));
-
-// //     uint32_t in = 0;
-// //     std::memcpy((void*) &in, inputs, ceil((double)nb_data/2));
-
-// //     for (int i = 0; i < nb_data; ++i) {
-// //         weightedSums[i] += __SBFX(wt, 4*i, 4) * __SBFX(in, 4*i, 4);
-// //     }
-// // }
-
-
-
-
-// // **************************************************************************
-// // * Multiply-accumulate the values in inputs and weights for NB_ITERATIONS *
-// // **************************************************************************
-
-// template<int NB_ITERATIONS,
-//          int INPUTS_INC = 1,
-//          int WEIGHTS_INC = 1,
-//          class Input_T, 
-//          class Weight_T,
-//          class Sum_T,
-//          typename std::enable_if<(NB_ITERATIONS == 0)>::type* = nullptr>
-// inline static 
-// void macsOnRange(const Input_T* __restrict /*inputs*/, 
-//                  const Weight_T* __restrict /*weights*/, 
-//                  Sum_T& __restrict /*weightedSum*/) 
-// {
-//     // Nothing to do
-// }
-
-// template<int NB_ITERATIONS,
-//          int INPUTS_INC = 1,
-//          int WEIGHTS_INC = 1,
-//          class Input_T, 
-//          class Weight_T,
-//          class Sum_T,
-//          typename std::enable_if<(NB_ITERATIONS == 1)>::type* = nullptr>
-// inline static 
-// void macsOnRange(const Input_T* __restrict inputs, 
-//                  const Weight_T* __restrict weights, 
-//                  Sum_T& __restrict weightedSum) 
-// {
-//     weightedSum += (*weights) * (*inputs);
-// }
-
-// template<int NB_ITERATIONS,
-//          int INPUTS_INC = 1,
-//          int WEIGHTS_INC = 1,
-//          class Input_T, 
-//          class Weight_T,
-//          class Sum_T,
-//          typename std::enable_if<(NB_ITERATIONS >= 2 && NB_ITERATIONS < 4)>::type* = nullptr>
-// inline static 
-// void macsOnRange(const Input_T* __restrict inputs, 
-//                  const Weight_T* __restrict weights, 
-//                  Sum_T& __restrict weightedSum) 
-// {
-//     weightedSum = dualMac<INPUTS_INC, WEIGHTS_INC>(inputs, weights, weightedSum);
-//     macsOnRange<NB_ITERATIONS - 2, INPUTS_INC, WEIGHTS_INC>(inputs + 2*INPUTS_INC, 
-//                                                             weights + 2*WEIGHTS_INC, 
-//                                                             weightedSum);
-// }
-
-// /**
-//  * @brief   MACs Processing
-//  * @details Performs NB_ITERATIONS MACs operations, storing results into the
-//  *          weightedSum variable. 
-//  * 
-//  * @tparam  NB_ITERATIONS   Number of MACs to perform
-//  * @tparam  INPUTS_INC      Input Stride
-//  * @tparam  WEIGHTS_INC     Weights Stride
-//  * @tparam  Input_T         Input Type
-//  * 
-//  * @param   inputs          Pointer to inputs vector
-//  * @param   weights         Pointer to weights vector
-//  * @param   weightedSum     Pointer to weightedSum
-// */
-// template<int NB_ITERATIONS,
-//          int INPUTS_INC = 1,
-//          int WEIGHTS_INC = 1,
-//          class Input_T, 
-//          class Weight_T,
-//          class Sum_T,
-//          typename std::enable_if<(NB_ITERATIONS >= 4)>::type* = nullptr>
-// inline static 
-// void macsOnRange(const Input_T* __restrict inputs, 
-//                  const Weight_T* __restrict weights, 
-//                  Sum_T& __restrict weightedSum) 
-// {
-//     weightedSum = quadMac<INPUTS_INC, WEIGHTS_INC>(inputs, weights, weightedSum);
-//     macsOnRange<NB_ITERATIONS - 4, INPUTS_INC, WEIGHTS_INC>(inputs + 4*INPUTS_INC, 
-//                                                             weights + 4*WEIGHTS_INC, 
-//                                                             weightedSum);
-// }
-
-
-// template<int NB_ITERATIONS, typename Input_T, typename Weight_T, typename Sum_T,
-//          typename std::enable_if<(NB_ITERATIONS >= 2 && NB_ITERATIONS < 4 && std::numeric_limits<Weight_T>::digits > 1)>::type* = nullptr>
-// __attribute__((always_inline)) static inline
-// void macsOnRange(const Input_T* __restrict inputs,
-//                  const Weight_T* __restrict weights,
-//                  Sum_T& weightedSum)
-// {
-//     constexpr unsigned int idxI 
-//         = (std::numeric_limits<Input_T>::digits > 4) ? 2 : 1;
-//     constexpr unsigned int idxW 
-//         = (std::numeric_limits<Weight_T>::digits > 4) ? 2 : 1;
-
-//     weightedSum = dualMac(inputs, weights, weightedSum);
-//     macsOnRange<NB_ITERATIONS - 2>(inputs + idxI, weights + idxW, weightedSum);
-// }
-
-// template<int NB_ITERATIONS, typename Input_T, typename Weight_T, typename Sum_T,
-//          typename std::enable_if<NB_ITERATIONS >= 4 
-//          && (std::numeric_limits<Weight_T>::digits > 4)>::type* = nullptr>
-// __attribute__((always_inline)) static inline
-// void macsOnRange(const Input_T* __restrict inputs,
-//                  const Weight_T* __restrict weights,
-//                  Sum_T& weightedSum)
-// {
-//     constexpr unsigned int idxI 
-//         = (std::numeric_limits<Input_T>::digits > 4) 
-//           ? 4 : (std::numeric_limits<Input_T>::digits == 4) ? 2 : 1;
-
-//     constexpr unsigned int idxW = 4;
-
-//     weightedSum = quadMac(inputs, weights, weightedSum);
-//     macsOnRange<NB_ITERATIONS - 4>(inputs + idxI, weights + idxW, weightedSum);
-// }
-
-// template<int NB_ITERATIONS, typename Input_T, typename Weight_T, typename Sum_T,
-//          typename std::enable_if<(NB_ITERATIONS >= 4 && NB_ITERATIONS < 8) 
-//          && (std::numeric_limits<Weight_T>::digits == 4)>::type* = nullptr>
-// __attribute__((always_inline)) static inline
-// void macsOnRange(const Input_T* __restrict inputs,
-//                  const Weight_T* __restrict weights,
-//                  Sum_T& weightedSum)
-// {
-//     constexpr unsigned int idxI 
-//         = (std::numeric_limits<Input_T>::digits > 4) 
-//           ? 4 : (std::numeric_limits<Input_T>::digits == 4) ? 2 : 1;
-
-//     constexpr unsigned int idxW = 2;
-
-//     weightedSum = quadMac(inputs, weights, weightedSum);
-//     macsOnRange<NB_ITERATIONS - 4>(inputs + idxI, weights + idxW, weightedSum);
-// }
-
-// template<int NB_ITERATIONS, typename Input_T, typename Weight_T, typename Sum_T,
-//          typename std::enable_if<NB_ITERATIONS >= 8 
-//          && (std::numeric_limits<Weight_T>::digits == 4)>::type* = nullptr>
-// __attribute__((always_inline)) static inline
-// void macsOnRange(const Input_T* __restrict inputs,
-//                  const Weight_T* __restrict weights,
-//                  Sum_T& weightedSum)
-// {
-//     constexpr unsigned int idxI 
-//         = (std::numeric_limits<Input_T>::digits > 4) 
-//           ? 8 : (std::numeric_limits<Input_T>::digits == 4) 
-//             ? 4 : (std::numeric_limits<Input_T>::digits == 2)
-//               ? 2 : 1;
-
-//     constexpr unsigned int idxW = 4;
-
-//     weightedSum = octoMac(inputs, weights, weightedSum);
-//     macsOnRange<NB_ITERATIONS - 8>(inputs + idxI, weights + idxW, weightedSum);
-// }
-
-
-// }   // N2D2_Export
-
-// #endif  // __N2D2_EXPORT_CPP_MACS_HPP__
-
-
-
-
-/**
- ******************************************************************************
- * @file     mac_functions.hpp
- * @brief    Mac operation functions for ARM Cortex m7 and m4
- *           This file provides different functions to perform
- *           signed and unsigned mac operations. Those functions can calculate
- *           up to eight mac operations at once.
- *           The file also provides two general mac operation which can be
- *           used in other files, especially in Network.hpp
- * 
- ******************************************************************************
- * @attention
- * 
- * (C) Copyright 2021 CEA LIST. All Rights Reserved.
- *  Contributor(s): Vincent TEMPLIER (vincent.templier@cea.fr)
- *                  Philippe DORE (philippe.dore@cea.fr)
- *                  David BRIAND (david.briand@cea.fr)
- * 
- * This file is not part of the open source version of N2D2 and is NOT under
- * the CeCILL-C license. This code is the property of the CEA. It can not be
- * copied or disseminated without its authorization.
- * 
- ******************************************************************************
- */
-
-#ifndef __N2D2_MAC_FUNCTIONS_HPP__
-#define __N2D2_MAC_FUNCTIONS_HPP__
-
-#include <cstring>
-#include "swar_arm_acle.h"
-#include "kernels/typedefs.hpp"
-
-
-// ----------------------------------------------------------------------------
-// --------------- MAC computing functions for all kernels --------------------
-// ----------------------------------------------------------------------------
-
-
-// ----------------------------------------------------------------------------
-// -------------- MAC computing functions for kernel 8W-8A --------------------
-// ----------------------------------------------------------------------------
-
-/**
- * @brief   Mono mac operation (8W/8A version)
- * @details Performs one mac operation for signed 8-bits weights
- *          and 8-bits inputs (signed or not).
- * 
- * @tparam  Input_T     Input type (udata<8> or data<8>)
- * @tparam  Weight_T    Weight type (should be data<8>)
- * 
- * @param[in]      inputs          Pointer to input vector
- * @param[in]      weights         Pointer to kernel weights
- * @param[in,out]  weightedSum     Accumulating sum from the 
- *                                 previous mac operations
- * @returns                        Updated weightedSum with 
- *                                 the result of the dual mac operation
- */
-template<typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 8
-         && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-Sum_T monoMac(const Input_T* __restrict inputs,
-              const Weight_T* __restrict weights,
-              Sum_T weightedSum)
-{
-    weightedSum += (Sum_T)inputs[0] * weights[0];
-    return weightedSum;
-}
-
-/**
- * @brief   Dual mac operation (8W/8A version)
- * @details Performs two mac operations for signed 8-bits weights
- *          and 8-bits inputs (signed or not).
- * 
- * @tparam  Input_T     Input type (udata<8> or data<8>)
- * @tparam  Weight_T    Weight type (should be data<8>)
- * 
- * @param[in]      inputs          Pointer to input vector
- * @param[in]      weights         Pointer to kernel weights
- * @param[in,out]  weightedSum     Accumulating sum from the 
- *                                 previous mac operations
- * @returns                        Updated weightedSum with 
- *                                 the result of the dual mac operation
- */
-template<typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 8
-         && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-Sum_T dualMac(const Input_T* __restrict inputs,
-              const Weight_T* __restrict weights,
-              Sum_T weightedSum)
-{
-    weightedSum += (Sum_T)inputs[0] * weights[0] + (Sum_T)inputs[1] * weights[1];
-    return weightedSum;
-}
-
-/**
- * @brief   Unsigned quad mac operation (8W/8A version)
- * @details Performs four mac operations for signed 8-bits weights
- *          and unsigned 8-bits inputs. Sign extends four 8-bits weights
- *          and associates them into two 32-bits values. Then zero extends 
- *          four 8-bits inputs and associates them into two 32-bits values. 
- *          Finally performs a double dual mac operation 
- *          with the __SMLAD instruction.
- * 
- * @tparam  Input_T     Input type (should be udata<8>)
- * @tparam  Weight_T    Weight type (should be data<8>)
- * 
- * @param[in]      inputs          Pointer to input vector
- * @param[in]      weights         Pointer to kernel weights
- * @param[in,out]  weightedSum     Accumulating sum from the 
- *                                 previous mac operations
- * @returns                        Updated weightedSum with 
- *                                 the result of the quad mac operation
- */
-template<typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::is_unsigned<Input_T>::value
-         && std::numeric_limits<Weight_T>::digits == 8
-         && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-Sum_T quadMac(const Input_T* __restrict inputs,
-              const Weight_T* __restrict weights,
-              Sum_T weightedSum)
-{
-    uint32_t in;
-    std::memcpy((void*) &in, inputs, sizeof(in));
-
-    uint32_t in1 = __UXTB16(in);
-    uint32_t in2 = __UXTB16_RORn(in, 8);
-
-    uint32_t wt;
-    std::memcpy((void*) &wt, weights, sizeof(wt));
-
-    uint32_t wt1 = __SXTB16(wt);
-    uint32_t wt2 = __SXTB16_RORn(wt, 8);
-
-    weightedSum = __SMLAD(in1, wt1, weightedSum);
-    weightedSum = __SMLAD(in2, wt2, weightedSum);
-    
-    return weightedSum;
-}
-
-/**
- * @brief   Signed quad mac operation (8W/8A version)
- * @details Performs four mac operations for signed 8-bits weights
- *          and signed 8-bits inputs. Sign extends four 8-bits weights
- *          and associates them into two 32-bits values. Then sign extends 
- *          four 8-bits inputs and associates them into two 32-bits values. 
- *          Finally performs a double dual mac operation 
- *          with the __SMLAD instruction.
- * 
- * @tparam  Input_T     Input type (should be data<8>)
- * @tparam  Weight_T    Weight type (should be data<8>)
- * 
- * @param[in]      inputs          Pointer to input vector
- * @param[in]      weights         Pointer to kernel weights
- * @param[in,out]  weightedSum     Accumulating sum from the 
- *                                 previous mac operations
- * @returns                        Updated weightedSum with 
- *                                 the result of the quad mac operation
- */
-template<typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(!std::is_unsigned<Input_T>::value
-         && std::numeric_limits<Weight_T>::digits == 8
-         && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-Sum_T quadMac(const Input_T* __restrict inputs,
-              const Weight_T* __restrict weights,
-              Sum_T weightedSum)
-{
-    uint32_t in;
-    std::memcpy((void*) &in, inputs, sizeof(in));
-
-    uint32_t in1 = __SXTB16(in);
-    uint32_t in2 = __SXTB16_RORn(in, 8);
-
-    uint32_t wt;
-    std::memcpy((void*) &wt, weights, sizeof(wt));
-
-    uint32_t wt1 = __SXTB16(wt);
-    uint32_t wt2 = __SXTB16_RORn(wt, 8);
-
-    weightedSum = __SMLAD(in1, wt1, weightedSum);
-    weightedSum = __SMLAD(in2, wt2, weightedSum);
-    
-    return weightedSum;
-}
-
-template<typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 8
-         && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr>
-void macsOnParallel(const Input_T* __restrict inputs,
-                    const Weight_T* __restrict weights,
-                    Sum_T* weightedSums,
-                    const int nb_data)
-{
-    union n2d2_dataword wt = {0};
-    std::memcpy((void*) &wt, weights, nb_data);
-
-    typename std::conditional<(!std::is_unsigned<Input_T>::value), 
-            union n2d2_dataword, union n2d2_udataword>::type in = {0};
-    std::memcpy((void*) &in, inputs, nb_data);
-
-    for (int i = 0; i < nb_data; ++i) {
-        weightedSums[i] += (Sum_T)wt.bytes[i] * in.bytes[i];
-    }
-}
-
-
-
-// ----------------------------------------------------------------------------
-// -------------- MAC computing functions for kernel 4W-8A --------------------
-// ----------------------------------------------------------------------------
-
-/**
- * @brief   Mono mac operation (4W/8A version)
- * @details Performs one mac operation for signed 4-bits weights
- *          and 8-bits inputs (signed or not).
- * 
- * @tparam  Input_T     Input type (udata<8> or data<8>)
- * @tparam  Weight_T    Weight type (should be data<4>)
- * 
- * @param[in]      inputs          Pointer to input vector
- * @param[in]      weights         Pointer to kernel weights
- * @param[in,out]  weightedSum     Accumulating sum from the 
- *                                 previous mac operations
- * @returns                        Updated weightedSum with 
- *                                 the result of the dual mac operation
- */
-template<typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 4
-         && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-Sum_T monoMac(const Input_T* __restrict inputs,
-              const Weight_T* __restrict weights,
-              Sum_T weightedSum)
-{
-    weightedSum += (Sum_T)inputs[0] * __SBFX(weights[0], 4, 4);
-    return weightedSum;
-}
-
-/**
- * @brief   Unsigned dual mac operation (4W/8A version)
- * @details Performs two mac operations for signed 4-bits weights
- *          and unsigned 8-bits inputs. Extracts the two 4-bits weights
- *          from a stored 8-bits weight and associates them into 
- *          a 32-bits value. Then zero extends two 8-bits inputs and 
- *          associates them into a 32-bits value. Finally performs a
- *          dual mac operation with the __SMLAD instruction
- * 
- * @tparam  Input_T     Input type (should be udata<8>)
- * @tparam  Weight_T    Weight type (should be data<4>)
- * 
- * @param[in]      inputs          Pointer to input vector
- * @param[in]      weights         Pointer to compressed kernel weights
- * @param[in,out]  weightedSum     Accumulating sum from the 
- *                                 previous mac operations
- * @returns                        Updated weightedSum with 
- *                                 the result of the dual mac operation
- */
-template<typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::is_unsigned<Input_T>::value
-         && std::numeric_limits<Weight_T>::digits == 4
-         && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-Sum_T dualMac(const Input_T* __restrict inputs,
-              const Weight_T* __restrict weights,
-              Sum_T weightedSum)
-{
-    uint8_t wt;
-    std::memcpy((void*) &wt, weights, sizeof(wt));
-
-    int32_t w0 = __SBFX(wt, 0, 4);
-    int32_t w1 = __SBFX(wt, 4, 4);
-    uint32_t wght = __BFI(w0, w1, 16, 16);
-
-    uint16_t in;
-    std::memcpy((void*) &in, inputs, sizeof(in));
-    
-    uint32_t act = ((in << 8) | in);
-    act = __UXTB16(act);
-    
-    weightedSum = __SMLAD(act, wght, weightedSum);
-
-    return weightedSum;
-}
-
-/**
- * @brief   Signed dual mac operation (4W/8A version)
- * @details Performs two mac operations for signed 4-bits weights
- *          and signed 8-bits inputs. Extracts the two 4-bits weights
- *          from a stored 8-bits weight and associates them into 
- *          a 32-bits value. Then sign extends two 8-bits inputs and 
- *          associates them into a 32-bits value. Finally performs a
- *          dual mac operation with the __SMLAD instruction
- * 
- * @tparam  Input_T     Input type (should be data<8>)
- * @tparam  Weight_T    Weight type (should be data<4>)
- * 
- * @param[in]      inputs          Pointer to input vector
- * @param[in]      weights         Pointer to compressed kernel weights
- * @param[in,out]  weightedSum     Accumulating sum from the 
- *                                 previous mac operations
- * @returns                        Updated weightedSum with 
- *                                 the result of the dual mac operation
- */
-template<typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(!std::is_unsigned<Input_T>::value
-         && std::numeric_limits<Weight_T>::digits == 4
-         && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-Sum_T dualMac(const Input_T* __restrict inputs,
-              const Weight_T* __restrict weights,
-              Sum_T weightedSum)
-{
-    uint8_t wt;
-    std::memcpy((void*) &wt, weights, sizeof(wt));
-
-    int32_t w0 = __SBFX(wt, 0, 4);
-    int32_t w1 = __SBFX(wt, 4, 4);
-    uint32_t wght = __BFI(w0, w1, 16, 16);
-
-    uint16_t in;
-    std::memcpy((void*) &in, inputs, sizeof(in));
-    
-    uint32_t act = ((in << 8) | in);
-    act = __SXTB16(act);
-    
-    weightedSum = __SMLAD(act, wght, weightedSum);
-
-    return weightedSum;
-}
-
-/**
- * @brief   Unsigned quad mac operation (4W/8A version)
- * @details Performs four mac operations for signed 4-bits weights
- *          and unsigned 8-bits inputs. Extracts the four 4-bits weights
- *          from two stored 8-bits weights and associates them into 
- *          two 32-bits values. Then zero extends four 8-bits inputs and 
- *          associates them into two 32-bits values. Finally performs a
- *          double dual mac operation with the __SMLAD instruction
- * 
- * @tparam  Input_T     Input type (should be udata<8>)
- * @tparam  Weight_T    Weight type (should be data<4>)
- * 
- * @param[in]      inputs          Pointer to input vector
- * @param[in]      weights         Pointer to compressed kernel weights
- * @param[in,out]  weightedSum     Accumulating sum from the 
- *                                 previous mac operations
- * @returns                        Updated weightedSum with 
- *                                 the result of the quad mac operation
- */
-template<typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::is_unsigned<Input_T>::value
-         && std::numeric_limits<Weight_T>::digits == 4
-         && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-Sum_T quadMac(const Input_T* __restrict inputs,
-              const Weight_T* __restrict weights,
-              Sum_T weightedSum)
-{
-    uint16_t wt;
-    std::memcpy((void*) &wt, weights, sizeof(wt));
-
-    int32_t w0 = __SBFX(wt, 0, 4);
-    int32_t w1 = __SBFX(wt, 4, 4);
-    int32_t w2 = __SBFX(wt, 8, 4);
-    int32_t w3 = __SBFX(wt, 12, 4);
-
-    uint32_t evenW1 = __PKHBT(w0, w2, 16);
-    uint32_t oddW1  = __PKHBT(w1, w3, 16);
-
-    uint32_t in;
-    std::memcpy((void*) &in, inputs, sizeof(in));
-
-    uint32_t evenA1 = __UXTB16(in);
-    uint32_t oddA1  = __UXTB16_RORn(in, 8);
-
-    weightedSum = __SMLAD(evenA1, oddW1, weightedSum);
-    weightedSum = __SMLAD(oddA1, evenW1, weightedSum);
-
-    return weightedSum;
-}
-
-/**
- * @brief   Signed quad mac operation (4W/8A version)
- * @details Performs four mac operations for signed 4-bits weights
- *          and signed 8-bits inputs. Extracts the four 4-bits weights
- *          from two stored 8-bits weights and associates them into 
- *          two 32-bits values. Then sign extends four 8-bits inputs and 
- *          associates them into two 32-bits values. Finally performs a
- *          double dual mac operation with the __SMLAD instruction
- * 
- * @tparam  Input_T     Input type (should be data<8>)
- * @tparam  Weight_T    Weight type (should be data<4>)
- * 
- * @param[in]      inputs          Pointer to input vector
- * @param[in]      weights         Pointer to compressed kernel weights
- * @param[in,out]  weightedSum     Accumulating sum from the 
- *                                 previous mac operations
- * @returns                        Updated weightedSum with 
- *                                 the result of the quad mac operation
- */
-template<typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(!std::is_unsigned<Input_T>::value
-         && std::numeric_limits<Weight_T>::digits == 4
-         && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-Sum_T quadMac(const Input_T* __restrict inputs,
-              const Weight_T* __restrict weights,
-              Sum_T weightedSum)
-{
-    uint16_t wt;
-    std::memcpy((void*) &wt, weights, sizeof(wt));
-
-    int32_t w0 = __SBFX(wt, 0, 4);
-    int32_t w1 = __SBFX(wt, 4, 4);
-    int32_t w2 = __SBFX(wt, 8, 4);
-    int32_t w3 = __SBFX(wt, 12, 4);
-
-    uint32_t evenW1 = __BFI(w2, w0, 16, 16);
-    uint32_t oddW1  = __BFI(w3, w1, 16, 16);
-
-    uint32_t in;
-    std::memcpy((void*) &in, inputs, sizeof(in));
-
-    uint32_t evenA1 = __SXTB16(in);
-    uint32_t oddA1  = __SXTB16_RORn(in, 8);
-
-    weightedSum = __SMLAD(evenA1, oddW1, weightedSum);
-    weightedSum = __SMLAD(oddA1, evenW1, weightedSum);
-
-    return weightedSum;
-}
-
-/**
- * @brief   Unsigned octo mac operation (4W/8A version)
- * @details Performs eight mac operations for signed 4-bits weights
- *          and unsigned 8-bits inputs. Extracts the eight 4-bits weights
- *          from four stored 8-bits weights and associates them into 
- *          four 32-bits values. Then zero extends eights 8-bits inputs and 
- *          associates them into four 32-bits values. Finally performs a
- *          quadruple dual mac operation with the __SMLAD instruction
- * 
- * @tparam  Input_T     Input type (should be udata<8>)
- * @tparam  Weight_T    Weight type (should be data<4>)
- * 
- * @param[in]      inputs          Pointer to input vector
- * @param[in]      weights         Pointer to compressed kernel weights
- * @param[in,out]  weightedSum     Accumulating sum from the 
- *                                 previous mac operations
- * @returns                        Updated weightedSum with 
- *                                 the result of the octo mac operation
- */
-template<typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::is_unsigned<Input_T>::value
-         && std::numeric_limits<Weight_T>::digits == 4
-         && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-Sum_T octoMac(const Input_T* __restrict inputs,
-              const Weight_T* __restrict weights,
-              Sum_T weightedSum)
-{
-    // uint32_t wt;
-    // std::memcpy((void*) &wt, weights, sizeof(wt));
-
-    // int32_t w0 = __SBFX(wt, 0, 4);
-    // int32_t w1 = __SBFX(wt, 4, 4);
-    // int32_t w2 = __SBFX(wt, 8, 4);
-    // int32_t w3 = __SBFX(wt, 12, 4);
-    // int32_t w4 = __SBFX(wt, 16, 4);
-    // int32_t w5 = __SBFX(wt, 20, 4);
-    // int32_t w6 = __SBFX(wt, 24, 4);
-    // int32_t w7 = __SBFX(wt, 28, 4);
-
-    // // uint32_t evenW1 = __BFI(w2, w0, 16, 16);
-    // // uint32_t oddW1  = __BFI(w3, w1, 16, 16);
-    // // uint32_t evenW2 = __BFI(w6, w4, 16, 16);
-    // // uint32_t oddW2  = __BFI(w7, w5, 16, 16);
-
-    // uint32_t evenW1 = __PKHBT(w0, w2, 16);
-    // uint32_t oddW1  = __PKHBT(w1, w3, 16);
-    // uint32_t evenW2 = __PKHBT(w4, w6, 16);
-    // uint32_t oddW2  = __PKHBT(w5, w7, 16);
-
-    // uint32_t in1, in2;
-    // std::memcpy((void*) &in1, inputs, sizeof(in1));
-    // std::memcpy((void*) &in2, (inputs + 4), sizeof(in2));
-    
-    // uint32_t evenA1 = __UXTB16(in1);
-    // uint32_t oddA1  = __UXTB16_RORn(in1, 8);
-    // uint32_t evenA2 = __UXTB16(in2);
-    // uint32_t oddA2  = __UXTB16_RORn(in2, 8);
-    
-    // weightedSum = __SMLAD(evenA1, oddW1, weightedSum);
-    // weightedSum = __SMLAD(oddA1, evenW1, weightedSum);
-    // weightedSum = __SMLAD(evenA2, oddW2, weightedSum);
-    // weightedSum = __SMLAD(oddA2, evenW2, weightedSum);
-
-    // 2nd implementation
-    // union n2d2_dataword wt;
-    // std::memcpy((void*) &wt, weights, sizeof(wt));
-
-    // union n2d2_udataword in1, in2;
-    // std::memcpy((void*) &in1, inputs, sizeof(in1));
-    // std::memcpy((void*) &in2, inputs + 4, sizeof(in2));
-
-    // weightedSum += (data<32>)(in1.bytes[0]) * wt.half_bytes[0].fields.op1;
-    // weightedSum += (data<32>)(in1.bytes[1]) * wt.half_bytes[0].fields.op0;
-    // weightedSum += (data<32>)(in1.bytes[2]) * wt.half_bytes[1].fields.op1;
-    // weightedSum += (data<32>)(in1.bytes[3]) * wt.half_bytes[1].fields.op0;
-    // weightedSum += (data<32>)(in2.bytes[0]) * wt.half_bytes[2].fields.op1;
-    // weightedSum += (data<32>)(in2.bytes[1]) * wt.half_bytes[2].fields.op0;
-    // weightedSum += (data<32>)(in2.bytes[2]) * wt.half_bytes[3].fields.op1;
-    // weightedSum += (data<32>)(in2.bytes[3]) * wt.half_bytes[3].fields.op0;
-
-    uint32_t wt;
-    memcpy((void*) &wt, weights, sizeof(wt));
-
-    // Works with weights * 4096 (weights << 12)
-    const uint32_t WeightMask = 0xF000F000;
-    uint32_t weight0 = WeightMask & (wt << 12);
-    uint32_t weight1 = WeightMask & (wt << 8);
-    uint32_t weight2 = WeightMask & (wt << 4);
-    uint32_t weight3 = WeightMask & (wt);
-
-    uint32_t in1, in2;
-    std::memcpy((void*) &in1, inputs, sizeof(in1));
-    std::memcpy((void*) &in2, (inputs + 4), sizeof(in2));
-
-    uint32_t in_a = __PKHBT(in1, in2, 16);
-    uint32_t in_b = __PKHTB(in2, in1, 16);
-    
-    uint32_t evenA1 = __UXTB16(in_a);
-    uint32_t oddA1  = __UXTB16_RORn(in_a, 8);
-    uint32_t evenA2 = __UXTB16(in_b);
-    uint32_t oddA2  = __UXTB16_RORn(in_b, 8);
-
-    Sum_T sum = 0;
-    sum = __SMLAD(oddA1, weight0, sum);
-    sum = __SMLAD(evenA1, weight1, sum);
-    sum = __SMLAD(oddA2, weight2, sum);
-    sum = __SMLAD(evenA2, weight3, sum);
-    weightedSum += sum >> 12;
-
-    return weightedSum;
-}
-
-/**
- * @brief   Signed octo mac operation (4W/8A version)
- * @details Performs eight mac operations for signed 4-bits weights
- *          and signed 8-bits inputs. Extracts the eight 4-bits weights
- *          from four stored 8-bits weights and associates them into 
- *          four 32-bits values. Then sign extends eights 8-bits inputs and 
- *          associates them into four 32-bits values. Finally performs a
- *          quadruple dual mac operation with the __SMLAD instruction
- * 
- * @tparam  Input_T     Input type (should be data<8>)
- * @tparam  Weight_T    Weight type (should be data<4>)
- * 
- * @param[in]      inputs          Pointer to input vector
- * @param[in]      weights         Pointer to compressed kernel weights
- * @param[in,out]  weightedSum     Accumulating sum from the 
- *                                 previous mac operations
- * @returns                        Updated weightedSum with 
- *                                 the result of the octo mac operation
- */
-template<typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(!std::is_unsigned<Input_T>::value
-         && std::numeric_limits<Weight_T>::digits == 4
-         && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-Sum_T octoMac(const Input_T* __restrict inputs,
-              const Weight_T* __restrict weights,
-              Sum_T weightedSum)
-{
-    uint32_t wt;
-    std::memcpy((void*) &wt, weights, sizeof(wt));
-
-    int32_t w0 = __SBFX(wt, 0, 4);
-    int32_t w1 = __SBFX(wt, 4, 4);
-    int32_t w2 = __SBFX(wt, 8, 4);
-    int32_t w3 = __SBFX(wt, 12, 4);
-    int32_t w4 = __SBFX(wt, 16, 4);
-    int32_t w5 = __SBFX(wt, 20, 4);
-    int32_t w6 = __SBFX(wt, 24, 4);
-    int32_t w7 = __SBFX(wt, 28, 4);
-
-    uint32_t evenW1 = __BFI(w2, w0, 16, 16);
-    uint32_t oddW1  = __BFI(w3, w1, 16, 16);
-    uint32_t evenW2 = __BFI(w6, w4, 16, 16);
-    uint32_t oddW2  = __BFI(w7, w5, 16, 16);
-
-    uint32_t in1, in2;
-    std::memcpy((void*) &in1, inputs, sizeof(in1));
-    std::memcpy((void*) &in2, (inputs + 4), sizeof(in2));
-    
-    uint32_t evenA1 = __SXTB16(in1);
-    uint32_t oddA1  = __SXTB16_RORn(in1, 8);
-    uint32_t evenA2 = __SXTB16(in2);
-    uint32_t oddA2  = __SXTB16_RORn(in2, 8);
-    
-    weightedSum = __SMLAD(evenA1, oddW1, weightedSum);
-    weightedSum = __SMLAD(oddA1, evenW1, weightedSum);
-    weightedSum = __SMLAD(evenA2, oddW2, weightedSum);
-    weightedSum = __SMLAD(oddA2, evenW2, weightedSum);
-
-    return weightedSum;
-}
-
-
-template<typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(
-            std::numeric_limits<Weight_T>::digits == 4 && 
-            std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr>
-void macsOnParallel(const Input_T* __restrict inputs,
-                    const Weight_T* __restrict weights,
-                    Sum_T* weightedSums,
-                    const int nb_data)
-{
-    uint32_t wt = 0;
-    std::memcpy((void*) &wt, weights, ceil((double)nb_data/2));
-
-    for (int i = 0; i < nb_data; ++i) {
-        weightedSums[i] += __SBFX(wt, 4*i, 4) * inputs[i];
-    }
-}
-
-
-// ----------------------------------------------------------------------------
-// -------------- MAC computing functions for kernel 4W-4A --------------------
-// ----------------------------------------------------------------------------
-
-/**
- * @brief   Unsigned mono mac operation (4W/4A version)
- * @details Performs one mac operation for signed 4-bits weights
- *          and unsigned 4-bits inputs.
- * 
- * @tparam  Input_T     Input type (should be udata<4>)
- * @tparam  Weight_T    Weight type (should be data<4>)
- * 
- * @param[in]      inputs          Pointer to input vector
- * @param[in]      weights         Pointer to kernel weights
- * @param[in,out]  weightedSum     Accumulating sum from the 
- *                                 previous mac operations
- * @returns                        Updated weightedSum with 
- *                                 the result of the dual mac operation
- */
-template<typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::is_unsigned<Input_T>::value
-         && std::numeric_limits<Weight_T>::digits == 4
-         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-Sum_T monoMac(const Input_T* __restrict inputs,
-              const Weight_T* __restrict weights,
-              Sum_T weightedSum)
-{
-    weightedSum += __UBFX(inputs[0], 4, 4) * __SBFX(weights[0], 4, 4);
-    return weightedSum;
-}
-
-/**
- * @brief   Signed mono mac operation (4W/4A version)
- * @details Performs one mac operation for signed 4-bits weights
- *          and signed 4-bits inputs.
- * 
- * @tparam  Input_T     Input type (should be data<4>)
- * @tparam  Weight_T    Weight type (should be data<4>)
- * 
- * @param[in]      inputs          Pointer to input vector
- * @param[in]      weights         Pointer to kernel weights
- * @param[in,out]  weightedSum     Accumulating sum from the 
- *                                 previous mac operations
- * @returns                        Updated weightedSum with 
- *                                 the result of the dual mac operation
- */
-template<typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(!std::is_unsigned<Input_T>::value
-         && std::numeric_limits<Weight_T>::digits == 4
-         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-Sum_T monoMac(const Input_T* __restrict inputs,
-              const Weight_T* __restrict weights,
-              Sum_T weightedSum)
-{
-    weightedSum += __SBFX(inputs[0], 4, 4) * __SBFX(weights[0], 4, 4);
-    return weightedSum;
-}
-
-/**
- * @brief   Unsigned dual mac operation (4W/4A version)
- * @details Performs two mac operations for signed 4-bits weights
- *          and unsigned 4-bits inputs. Extracts the two 4-bits weights
- *          from a stored 8-bits weight and associates them into 
- *          a 32-bits value. Then extracts the two 4-bits inputs
- *          from a stored 8-bits input and associates them into 
- *          a 32-bits value. Finally performs a dual mac operation 
- *          with the __SMLAD instruction
- * 
- * @tparam  Input_T     Input type (should be udata<4>)
- * @tparam  Weight_T    Weight type (should be data<4>)
- * 
- * @param[in]      inputs          Pointer to compressed input vector
- * @param[in]      weights         Pointer to compressed kernel weights
- * @param[in,out]  weightedSum     Accumulating sum from the 
- *                                 previous mac operations
- * @returns                        Updated weightedSum with 
- *                                 the result of the dual mac operation
- */
-template<typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::is_unsigned<Input_T>::value
-         && std::numeric_limits<Weight_T>::digits == 4
-         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-Sum_T dualMac(const Input_T* __restrict inputs,
-              const Weight_T* __restrict weights,
-              Sum_T weightedSum)
-{
-    uint8_t wt;
-    std::memcpy((void*) &wt, weights, sizeof(wt));
-
-    int32_t w0 = __SBFX(wt, 0, 4);
-    int32_t w1 = __SBFX(wt, 4, 4);
-    uint32_t wght = __BFI(w1, w0, 16, 16);
-
-    uint8_t in;
-    std::memcpy((void*) &in, inputs, sizeof(in));
-
-    int32_t a0 = __UBFX(in, 0, 4);
-    int32_t a1 = __UBFX(in, 4, 4);
-    uint32_t act = __BFI(a1, a0, 16, 16);
-
-    weightedSum = __SMLAD(act, wght, weightedSum);
-
-    return weightedSum;
-}
-
-/**
- * @brief   Signed dual mac operation (4W/4A version)
- * @details Performs two mac operations for signed 4-bits weights
- *          and signed 4-bits inputs. Extracts the two 4-bits weights
- *          from a stored 8-bits weight and associates them into 
- *          a 32-bits value. Then extracts the two 4-bits inputs
- *          from a stored 8-bits input and associates them into 
- *          a 32-bits value. Finally performs a dual mac operation 
- *          with the __SMLAD instruction
- * 
- * @tparam  Input_T     Input type (should be data<4>)
- * @tparam  Weight_T    Weight type (should be data<4>)
- * 
- * @param[in]      inputs          Pointer to compressed input vector
- * @param[in]      weights         Pointer to compressed kernel weights
- * @param[in,out]  weightedSum     Accumulating sum from the 
- *                                 previous mac operations
- * @returns                        Updated weightedSum with 
- *                                 the result of the dual mac operation
- */
-template<typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(!std::is_unsigned<Input_T>::value
-         && std::numeric_limits<Weight_T>::digits == 4
-         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-Sum_T dualMac(const Input_T* __restrict inputs,
-              const Weight_T* __restrict weights,
-              Sum_T weightedSum)
-{
-    uint8_t wt;
-    std::memcpy((void*) &wt, weights, sizeof(wt));
-
-    int32_t w0 = __SBFX(wt, 0, 4);
-    int32_t w1 = __SBFX(wt, 4, 4);
-    uint32_t wght = __BFI(w1, w0, 16, 16);
-
-    uint8_t in;
-    std::memcpy((void*) &in, inputs, sizeof(in));
-
-    int32_t a0 = __SBFX(in, 0, 4);
-    int32_t a1 = __SBFX(in, 4, 4);
-    uint32_t act = __BFI(a1, a0, 16, 16);
-
-    weightedSum = __SMLAD(act, wght, weightedSum);
-
-    return weightedSum;
-}
-
-/**
- * @brief   Unsigned quad mac operation (4W/4A version)
- * @details Performs four mac operations for signed 4-bits weights
- *          and unsigned 4-bits inputs. Extracts the four 4-bits weights
- *          from two stored 8-bits weights and associates them into 
- *          two 32-bits values. Then extracts the four 4-bits inputs
- *          from two stored 8-bits inputs and associates them into 
- *          two 32-bits values. Finally performs a double dual mac operation 
- *          with the __SMLAD instruction
- * 
- * @tparam  Input_T     Input type (should be udata<4>)
- * @tparam  Weight_T    Weight type (should be data<4>)
- * 
- * @param[in]      inputs          Pointer to compressed input vector
- * @param[in]      weights         Pointer to compressed kernel weights
- * @param[in,out]  weightedSum     Accumulating sum from the 
- *                                 previous mac operations
- * @returns                        Updated weightedSum with 
- *                                 the result of the quad mac operation
- */
-template<typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::is_unsigned<Input_T>::value
-         && std::numeric_limits<Weight_T>::digits == 4
-         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-Sum_T quadMac(const Input_T* __restrict inputs,
-              const Weight_T* __restrict weights,
-              Sum_T weightedSum)
-{
-    uint16_t wt;
-    std::memcpy((void*) &wt, weights, sizeof(wt));
-
-    int32_t w0 = __SBFX(wt, 0, 4);
-    int32_t w1 = __SBFX(wt, 4, 4);
-    int32_t w2 = __SBFX(wt, 8, 4);
-    int32_t w3 = __SBFX(wt, 12, 4);
-
-    uint32_t evenW1 = __BFI(w2, w0, 16, 16);
-    uint32_t oddW1  = __BFI(w3, w1, 16, 16);
-
-    uint16_t in;
-    std::memcpy((void*) &in, inputs, sizeof(in));
-
-    int32_t a0 = __UBFX(in, 0, 4);
-    int32_t a1 = __UBFX(in, 4, 4);
-    int32_t a2 = __UBFX(in, 8, 4);
-    int32_t a3 = __UBFX(in, 12, 4);
-
-    uint32_t evenA1 = __BFI(a2, a0, 16, 16);
-    uint32_t oddA1  = __BFI(a3, a1, 16, 16);
-
-    weightedSum = __SMLAD(evenA1, evenW1, weightedSum);
-    weightedSum = __SMLAD(oddA1, oddW1, weightedSum);
-
-    return weightedSum;
-}
-
-/**
- * @brief   Signed quad mac operation (4W/4A version)
- * @details Performs four mac operations for signed 4-bits weights
- *          and signed 4-bits inputs. Extracts the four 4-bits weights
- *          from two stored 8-bits weights and associates them into 
- *          two 32-bits values. Then extracts the four 4-bits inputs
- *          from two stored 8-bits inputs and associates them into 
- *          two 32-bits values. Finally performs a double dual mac operation 
- *          with the __SMLAD instruction
- * 
- * @tparam  Input_T     Input type (should be data<4>)
- * @tparam  Weight_T    Weight type (should be data<4>)
- * 
- * @param[in]      inputs          Pointer to compressed input vector
- * @param[in]      weights         Pointer to compressed kernel weights
- * @param[in,out]  weightedSum     Accumulating sum from the 
- *                                 previous mac operations
- * @returns                        Updated weightedSum with 
- *                                 the result of the quad mac operation
- */
-template<typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(!std::is_unsigned<Input_T>::value
-         && std::numeric_limits<Weight_T>::digits == 4
-         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-Sum_T quadMac(const Input_T* __restrict inputs,
-              const Weight_T* __restrict weights,
-              Sum_T weightedSum)
-{
-    uint16_t wt;
-    std::memcpy((void*) &wt, weights, sizeof(wt));
-
-    int32_t w0 = __SBFX(wt, 0, 4);
-    int32_t w1 = __SBFX(wt, 4, 4);
-    int32_t w2 = __SBFX(wt, 8, 4);
-    int32_t w3 = __SBFX(wt, 12, 4);
-
-    uint32_t evenW1 = __PKHBT(w2, w0, 16);
-    uint32_t oddW1  = __PKHBT(w3, w1, 16);
-
-    uint16_t in;
-    std::memcpy((void*) &in, inputs, sizeof(in));
-
-    int32_t a0 = __SBFX(in, 0, 4);
-    int32_t a1 = __SBFX(in, 4, 4);
-    int32_t a2 = __SBFX(in, 8, 4);
-    int32_t a3 = __SBFX(in, 12, 4);
-    
-    uint32_t evenA1 = __PKHBT(a2, a0, 16);
-    uint32_t oddA1  = __PKHBT(a3, a1, 16);
-
-    weightedSum = __SMLAD(evenA1, evenW1, weightedSum);
-    weightedSum = __SMLAD(oddA1, oddW1, weightedSum);
-
-    return weightedSum;
-}
-
-/**
- * @brief   Unsigned octo mac operation (4W/4A version)
- * @details Performs eight mac operations for signed 4-bits weights
- *          and unsigned 4-bits inputs. Extracts the eight 4-bits weights
- *          from four stored 8-bits weights and associates them into 
- *          four 32-bits values. Then extracts the eight 4-bits inputs
- *          from four stored 8-bits inputs and associates them into 
- *          four 32-bits values. Finally performs a quadruple dual mac operation 
- *          with the __SMLAD instruction
- * 
- * @tparam  Input_T     Input type (should be udata<4>)
- * @tparam  Weight_T    Weight type (should be data<4>)
- * 
- * @param[in]      inputs          Pointer to compressed input vector
- * @param[in]      weights         Pointer to compressed kernel weights
- * @param[in,out]  weightedSum     Accumulating sum from the 
- *                                 previous mac operations
- * @returns                        Updated weightedSum with 
- *                                 the result of the octo mac operation
- */
-// template<typename Input_T, typename Weight_T,
-//          typename std::enable_if<(std::is_unsigned<Input_T>::value
-//          && std::numeric_limits<Weight_T>::digits == 4
-//          && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
-// __attribute__((always_inline)) static inline
-// Sum_T octoMac(const Input_T* __restrict inputs,
-//               const Weight_T* __restrict weights,
-//               Sum_T weightedSum)
-// {
-//     uint32_t wt;
-//     std::memcpy((void*) &wt, weights, sizeof(wt));
-
-//     int32_t w0 = __SBFX(wt, 0, 4);
-//     int32_t w1 = __SBFX(wt, 4, 4);
-//     int32_t w2 = __SBFX(wt, 8, 4);
-//     int32_t w3 = __SBFX(wt, 12, 4);
-//     int32_t w4 = __SBFX(wt, 16, 4);
-//     int32_t w5 = __SBFX(wt, 20, 4);
-//     int32_t w6 = __SBFX(wt, 24, 4);
-//     int32_t w7 = __SBFX(wt, 28, 4);
-
-//     // uint32_t weight0 = __BFI(w4, w0, 16, 16);
-//     // uint32_t weight1 = __BFI(w5, w1, 16, 16);
-//     // uint32_t weight2 = __BFI(w6, w2, 16, 16);
-//     // uint32_t weight3 = __BFI(w7, w3, 16, 16);
-
-//     uint32_t weight0 = __PKHBT(w0, w4, 16);
-//     uint32_t weight1 = __PKHBT(w1, w5, 16);
-//     uint32_t weight2 = __PKHBT(w2, w6, 16);
-//     uint32_t weight3 = __PKHBT(w3, w7, 16);
-
-//     uint32_t in;
-//     std::memcpy((void*) &in, inputs, sizeof(in));
-
-//     uint32_t act0 = in & 0xF000F;
-//     uint32_t act1 = (in >> 4) & 0xF000F;
-//     uint32_t act2 = (in >> 8) & 0xF000F;
-//     uint32_t act3 = (in >> 12) & 0xF000F;
-
-//     weightedSum = __SMLAD(act0, weight0, weightedSum);
-//     weightedSum = __SMLAD(act1, weight1, weightedSum);
-//     weightedSum = __SMLAD(act2, weight2, weightedSum);
-//     weightedSum = __SMLAD(act3, weight3, weightedSum);
-
-//     return weightedSum;
-// }
-
-// template<typename Input_T, typename Weight_T,
-//          typename std::enable_if<(std::is_unsigned<Input_T>::value
-//          && std::numeric_limits<Weight_T>::digits == 4
-//          && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
-// __attribute__((always_inline)) static inline
-// Sum_T octoMac(const Input_T* __restrict inputs,
-//               const Weight_T* __restrict weights,
-//               Sum_T weightedSum)
-// {
-//     union n2d2_dataword wt;
-//     std::memcpy((void*) &wt, weights, sizeof(wt));
-
-//     union n2d2_udataword in;
-//     std::memcpy((void*) &in, inputs, sizeof(in));
-
-//     for (int i = 0; i < 4; ++i) {
-//         weightedSum += (data<32>)(in.half_bytes[i].fields.op0) * wt.half_bytes[i].fields.op0;
-//         weightedSum += (data<32>)(in.half_bytes[i].fields.op1) * wt.half_bytes[i].fields.op1;
-//     }
-
-//     // weightedSum += (data<32>)(in.half_bytes[0].fields.op0) * wt.half_bytes[0].fields.op0;
-//     // weightedSum += (data<32>)(in.half_bytes[0].fields.op1) * wt.half_bytes[0].fields.op1;
-//     // weightedSum += (data<32>)(in.half_bytes[1].fields.op0) * wt.half_bytes[1].fields.op0;
-//     // weightedSum += (data<32>)(in.half_bytes[1].fields.op1) * wt.half_bytes[1].fields.op1;
-//     // weightedSum += (data<32>)(in.half_bytes[2].fields.op0) * wt.half_bytes[2].fields.op0;
-//     // weightedSum += (data<32>)(in.half_bytes[2].fields.op1) * wt.half_bytes[2].fields.op1;
-//     // weightedSum += (data<32>)(in.half_bytes[3].fields.op0) * wt.half_bytes[3].fields.op0;
-//     // weightedSum += (data<32>)(in.half_bytes[3].fields.op1) * wt.half_bytes[3].fields.op1;
-
-//     return weightedSum;
-// }
-
-template<typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::is_unsigned<Input_T>::value
-         && std::numeric_limits<Weight_T>::digits == 4
-         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-Sum_T octoMac(const Input_T* __restrict inputs,
-              const Weight_T* __restrict weights,
-              Sum_T weightedSum)
-{
-    uint32_t wt;
-    memcpy((void*) &wt, weights, sizeof(wt));
-
-    // Works with weights * 4096 (weights << 12)
-    const uint32_t WeightMask = 0xF000F000;
-    uint32_t weight0 = WeightMask & (wt << 12);
-    uint32_t weight1 = WeightMask & (wt << 8);
-    uint32_t weight2 = WeightMask & (wt << 4);
-    uint32_t weight3 = WeightMask & (wt);
-
-    uint32_t in;
-    memcpy((void*) &in, inputs, sizeof(in));
-
-    const uint32_t ActMask = 0x000F000F; // to explicit instructions
-    uint32_t act0 = in & ActMask;
-    // Expect second operand shift
-    uint32_t act1 = ActMask & (in >> 4);
-    uint32_t act2 = ActMask & (in >> 8);
-    uint32_t act3 = ActMask & (in >> 12);
-
-    Sum_T sum = 0;
-    sum = __SMLAD(act0, weight0, sum);
-    sum = __SMLAD(act1, weight1, sum);
-    sum = __SMLAD(act2, weight2, sum);
-    sum = __SMLAD(act3, weight3, sum);
-
-    return weightedSum + (sum >> 12);
-}
-
-/**
- * @brief   Signed octo mac operation (4W/4A version)
- * @details Performs eight mac operations for signed 4-bits weights
- *          and signed 4-bits inputs. Extracts the eight 4-bits weights
- *          from four stored 8-bits weights and associates them into 
- *          four 32-bits values. Then extracts the eight 4-bits inputs
- *          from four stored 8-bits inputs and associates them into 
- *          four 32-bits values. Finally performs a quadruple dual mac operation 
- *          with the __SMLAD instruction
- * 
- * @tparam  Input_T     Input type (should be data<4>)
- * @tparam  Weight_T    Weight type (should be data<4>)
- * 
- * @param[in]      inputs          Pointer to compressed input vector
- * @param[in]      weights         Pointer to compressed kernel weights
- * @param[in,out]  weightedSum     Accumulating sum from the 
- *                                 previous mac operations
- * @returns                        Updated weightedSum with 
- *                                 the result of the octo mac operation
- */
-template<typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(!std::is_unsigned<Input_T>::value
-         && std::numeric_limits<Weight_T>::digits == 4
-         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-Sum_T octoMac(const Input_T* __restrict inputs,
-              const Weight_T* __restrict weights,
-              Sum_T weightedSum)
-{
-    uint32_t wt;
-    std::memcpy((void*) &wt, weights, sizeof(wt));
-
-    int32_t w0 = __SBFX(wt, 0, 4);
-    int32_t w1 = __SBFX(wt, 4, 4);
-    int32_t w2 = __SBFX(wt, 8, 4);
-    int32_t w3 = __SBFX(wt, 12, 4);
-    int32_t w4 = __SBFX(wt, 16, 4);
-    int32_t w5 = __SBFX(wt, 20, 4);
-    int32_t w6 = __SBFX(wt, 24, 4);
-    int32_t w7 = __SBFX(wt, 28, 4);
-
-    uint32_t evenW1 = __PKHBT(w2, w0, 16);
-    uint32_t oddW1  = __PKHBT(w3, w1, 16);
-    uint32_t evenW2 = __PKHBT(w6, w4, 16);
-    uint32_t oddW2  = __PKHBT(w7, w5, 16);
-
-    uint32_t in;
-    std::memcpy((void*) &in, inputs, sizeof(in));
-
-    int32_t a0 = __SBFX(in, 0, 4);
-    int32_t a1 = __SBFX(in, 4, 4);
-    int32_t a2 = __SBFX(in, 8, 4);
-    int32_t a3 = __SBFX(in, 12, 4);
-    int32_t a4 = __SBFX(in, 16, 4);
-    int32_t a5 = __SBFX(in, 20, 4);
-    int32_t a6 = __SBFX(in, 24, 4);
-    int32_t a7 = __SBFX(in, 28, 4);
-
-    uint32_t evenA1 = __PKHBT(a2, a0, 16);
-    uint32_t oddA1  = __PKHBT(a3, a1, 16);
-    uint32_t evenA2 = __PKHBT(a6, a4, 16);
-    uint32_t oddA2  = __PKHBT(a7, a5, 16);
-
-    weightedSum = __SMLAD(evenA1, evenW1, weightedSum);
-    weightedSum = __SMLAD(oddA1, oddW1, weightedSum);
-    weightedSum = __SMLAD(evenA2, evenW2, weightedSum);
-    weightedSum = __SMLAD(oddA2, oddW2, weightedSum);
-
-    return weightedSum;
-}
-
-
-template<typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::is_unsigned<Input_T>::value
-         && std::numeric_limits<Weight_T>::digits == 4
-         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
-void macsOnParallel(const Input_T* __restrict inputs,
-                    const Weight_T* __restrict weights,
-                    Sum_T* weightedSums,
-                    const int nb_data)
-{
-    uint32_t wt = 0;
-    std::memcpy((void*) &wt, weights, ceil((double)nb_data/2));
-
-    uint32_t in = 0;
-    std::memcpy((void*) &in, inputs, ceil((double)nb_data/2));
-
-    for (int i = 0; i < nb_data; ++i) {
-        weightedSums[i] += __SBFX(wt, 4*i, 4) * __UBFX(in, 4*i, 4);
-    }
-}
-
-template<typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(!std::is_unsigned<Input_T>::value
-         && std::numeric_limits<Weight_T>::digits == 4
-         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
-void macsOnParallel(const Input_T* __restrict inputs,
-                    const Weight_T* __restrict weights,
-                    Sum_T* weightedSums,
-                    const int nb_data)
-{
-    uint32_t wt = 0;
-    std::memcpy((void*) &wt, weights, ceil((double)nb_data/2));
-
-    uint32_t in = 0;
-    std::memcpy((void*) &in, inputs, ceil((double)nb_data/2));
-
-    for (int i = 0; i < nb_data; ++i) {
-        weightedSums[i] += __SBFX(wt, 4*i, 4) * __SBFX(in, 4*i, 4);
-    }
-}
-
-
-// ----------------------------------------------------------------------------
-// ------------------ Notes about performing MAC operations -------------------
-// --------------------------- with 1-bit weights -----------------------------
-// ----------------------------------------------------------------------------
-
-/**
- * @note How to perform MAC operations with 1-bit weight
- * 
- * Working with an 1-bit weight means working only with two possible values 
- * for each weight. Thus, it has been defined a convention that will be used 
- * in the following functions in this file.
- * Convention: when the value of a weight is 0, it means 1
- *             when the value of a weight is 1, it means -1
- * 
- * Example: let's take a simple dual MAC operation
- *          weightedSum = w0 * a0 + w1 * a1;
- * 
- * if w0 = 0x00 and w1 = 0x01 then weightedSum should be:
- *          weightedSum = a0 - a1;
- * 
- * To easily perform MAC operations and use as often as possible
- * SIMD instructions to parallelize and speed up MAC calculations, most of
- * the following functions use the same scheme:
- * 
- *  - Perform a parallel subtraction of 0 and the weights
- *      Some SIMD instructions as __USUB16 and __USUB8 can perform 
- *      parallel subtractions and activate a Greater or Equal flag (GE) if
- *      the results of each subtraction is positive. 
- *      Thus, if the result of 0 - w0 >= 0 ==> GE[0] = 1
- *                             0 - w0 < 0  ==> GE[0] = 0
- *      (the results of the subtractions are not saved because only the 
- *       GE flags trigger is required)
- * 
- *  - Use of the __SEL instruction to read the GE flags
- *      The __SEL can select an input from two values according to the
- *      the GE flag provided by the previous subtraction. In the case of 
- *      the 1W/8A project, the two possible values selected by __SEL are
- *      (+input) or (-input). Thus, __SEL is often used like "__SEL(in, -in)"
- *      The results of __SEL are saved as MAC results
- * 
- *  - Addition of the accumuling sums with the results of the MAC operations
- *      Use of __SADD16 or __SADD8 for signed additions
- * 
- */
-
-// ----------------------------------------------------------------------------
-// ----------------- MAC computing functions for kernel -----------------------
-// ------------------------------- 1W / 8A ------------------------------------
-// ------------------------------- 1W / 7A ------------------------------------
-// ------------------------------- 1W / 6A ------------------------------------
-// ------------------------------- 1W / 5A ------------------------------------
-// ----------------------------------------------------------------------------
-
-template<typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
-         && std::numeric_limits<Input_T>::digits <= 8 
-         && std::numeric_limits<Input_T>::digits > 4)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-Sum_T monoMac (const Input_T* __restrict inputs,
-               const Weight_T* __restrict weights,
-               Sum_T weightedSum)
-{
-    weightedSum += (weights[0].fields.op7) ? (Sum_T)(-(inputs[0])) : (Sum_T)(inputs[0]);
-    return weightedSum;
-}
-
-template<int NB_ITERATIONS,
-         typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
-         && std::numeric_limits<Input_T>::digits <= 8 
-         && std::numeric_limits<Input_T>::digits > 4
-         && NB_ITERATIONS == 2)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-void macsOnRange (const Input_T* __restrict inputs,
-                  const Weight_T* __restrict weights,
-                  Sum_T& weightedSum)
-{
-    weightedSum += (weights[0].fields.op7) ? (Sum_T)(-(inputs[0])) : (Sum_T)(inputs[0]);
-    weightedSum += (weights[0].fields.op6) ? (Sum_T)(-(inputs[1])) : (Sum_T)(inputs[1]);
-}
-
-template<int NB_ITERATIONS,
-         typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
-         && std::numeric_limits<Input_T>::digits <= 8 
-         && std::numeric_limits<Input_T>::digits > 4
-         && NB_ITERATIONS == 3)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-void macsOnRange (const Input_T* __restrict inputs,
-                  const Weight_T* __restrict weights,
-                  Sum_T& weightedSum)
-{
-    weightedSum += (weights[0].fields.op7) ? (Sum_T)(-(inputs[0])) : (Sum_T)(inputs[0]);
-    weightedSum += (weights[0].fields.op6) ? (Sum_T)(-(inputs[1])) : (Sum_T)(inputs[1]);
-    weightedSum += (weights[0].fields.op5) ? (Sum_T)(-(inputs[2])) : (Sum_T)(inputs[2]);
-}
-
-template<int NB_ITERATIONS,
-         typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
-         && std::numeric_limits<Input_T>::digits <= 8 
-         && std::numeric_limits<Input_T>::digits > 4
-         && NB_ITERATIONS == 4)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-void macsOnRange (const Input_T* __restrict inputs,
-                  const Weight_T* __restrict weights,
-                  Sum_T& weightedSum)
-{
-    weightedSum += (weights[0].fields.op7) ? (Sum_T)(-(inputs[0])) : (Sum_T)(inputs[0]);
-    weightedSum += (weights[0].fields.op6) ? (Sum_T)(-(inputs[1])) : (Sum_T)(inputs[1]);
-    weightedSum += (weights[0].fields.op5) ? (Sum_T)(-(inputs[2])) : (Sum_T)(inputs[2]);
-    weightedSum += (weights[0].fields.op4) ? (Sum_T)(-(inputs[3])) : (Sum_T)(inputs[3]);
-}
-
-template<int NB_ITERATIONS,
-         typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
-         && std::numeric_limits<Input_T>::digits <= 8 
-         && std::numeric_limits<Input_T>::digits > 4
-         && NB_ITERATIONS == 5)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-void macsOnRange (const Input_T* __restrict inputs,
-                  const Weight_T* __restrict weights,
-                  Sum_T& weightedSum)
-{
-    weightedSum += (weights[0].fields.op7) ? (Sum_T)(-(inputs[0])) : (Sum_T)(inputs[0]);
-    weightedSum += (weights[0].fields.op6) ? (Sum_T)(-(inputs[1])) : (Sum_T)(inputs[1]);
-    weightedSum += (weights[0].fields.op5) ? (Sum_T)(-(inputs[2])) : (Sum_T)(inputs[2]);
-    weightedSum += (weights[0].fields.op4) ? (Sum_T)(-(inputs[3])) : (Sum_T)(inputs[3]);
-    weightedSum += (weights[0].fields.op3) ? (Sum_T)(-(inputs[4])) : (Sum_T)(inputs[4]);
-}
-
-template<int NB_ITERATIONS,
-         typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
-         && std::numeric_limits<Input_T>::digits <= 8 
-         && std::numeric_limits<Input_T>::digits > 4
-         && NB_ITERATIONS == 6)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-void macsOnRange (const Input_T* __restrict inputs,
-                  const Weight_T* __restrict weights,
-                  Sum_T& weightedSum)
-{
-    weightedSum += (weights[0].fields.op7) ? (Sum_T)(-(inputs[0])) : (Sum_T)(inputs[0]);
-    weightedSum += (weights[0].fields.op6) ? (Sum_T)(-(inputs[1])) : (Sum_T)(inputs[1]);
-    weightedSum += (weights[0].fields.op5) ? (Sum_T)(-(inputs[2])) : (Sum_T)(inputs[2]);
-    weightedSum += (weights[0].fields.op4) ? (Sum_T)(-(inputs[3])) : (Sum_T)(inputs[3]);
-    weightedSum += (weights[0].fields.op3) ? (Sum_T)(-(inputs[4])) : (Sum_T)(inputs[4]);
-    weightedSum += (weights[0].fields.op2) ? (Sum_T)(-(inputs[5])) : (Sum_T)(inputs[5]);
-}
-
-template<int NB_ITERATIONS,
-         typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
-         && std::numeric_limits<Input_T>::digits <= 8 
-         && std::numeric_limits<Input_T>::digits > 4
-         && NB_ITERATIONS == 7)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-void macsOnRange (const Input_T* __restrict inputs,
-                  const Weight_T* __restrict weights,
-                  Sum_T& weightedSum)
-{
-    weightedSum += (weights[0].fields.op7) ? (Sum_T)(-(inputs[0])) : (Sum_T)(inputs[0]);
-    weightedSum += (weights[0].fields.op6) ? (Sum_T)(-(inputs[1])) : (Sum_T)(inputs[1]);
-    weightedSum += (weights[0].fields.op5) ? (Sum_T)(-(inputs[2])) : (Sum_T)(inputs[2]);
-    weightedSum += (weights[0].fields.op4) ? (Sum_T)(-(inputs[3])) : (Sum_T)(inputs[3]);
-    weightedSum += (weights[0].fields.op3) ? (Sum_T)(-(inputs[4])) : (Sum_T)(inputs[4]);
-    weightedSum += (weights[0].fields.op2) ? (Sum_T)(-(inputs[5])) : (Sum_T)(inputs[5]);
-    weightedSum += (weights[0].fields.op1) ? (Sum_T)(-(inputs[6])) : (Sum_T)(inputs[6]);
-}
-
-
-// ----------------------------------------------------------------------------
-// ----------------- MAC computing functions for kernel -----------------------
-// ------------------------------- 1W / 8A ------------------------------------
-// ----------------------------------------------------------------------------
-
-template<typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
-         && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-Sum_T octoMac (const Input_T* __restrict inputs,
-                  const Weight_T* __restrict weights,
-                  Sum_T weightedSum)
-{
-    uint32_t mac_result = 0;
-    uint32_t in;
-    uint32_t wt = 0;
-    std::memcpy((void*) &wt, weights, 1);
-    wt |= wt << 16;
-
-    memcpy((void*) &in, inputs, sizeof(in));
-    uint32_t evenA1 = __UXTB16(in);
-    uint32_t oddA1  = __UXTB16_RORn(in, 8);
-    uint32_t neg_evenA1 = __SSUB16(0, evenA1);
-    uint32_t neg_oddA1 = __SSUB16(0, oddA1);
-
-    __USUB16(0, wt & 0x40001);  
-    mac_result = __SEL(evenA1, neg_evenA1);
-    weightedSum = __SADD16(mac_result, weightedSum);  
-
-    __USUB16(0, wt & 0x80002);  
-    mac_result = __SEL(oddA1, neg_oddA1);
-    weightedSum = __SADD16(mac_result, weightedSum);  
-
-
-    memcpy((void*) &in, inputs + 4, sizeof(in));
-    evenA1 = __UXTB16(in);
-    oddA1  = __UXTB16_RORn(in, 8);
-    neg_evenA1 = __SSUB16(0, evenA1);
-    neg_oddA1 = __SSUB16(0, oddA1);
-
-    __USUB16(0, wt & 0x400010);  
-    mac_result = __SEL(evenA1, neg_evenA1);
-    weightedSum = __SADD16(mac_result, weightedSum);  
-
-    __USUB16(0, wt & 0x800020);  
-    mac_result = __SEL(oddA1, neg_oddA1);
-    weightedSum = __SADD16(mac_result, weightedSum);  
-
-    return weightedSum;
-}
-
-template<typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
-         && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-Sum_T quadquadMac (const Input_T* __restrict inputs,
-                   const Weight_T* __restrict weights,
-                   Sum_T weightedSum)
-{
-    uint32_t mac_result = 0;
-    uint32_t in;
-    uint32_t wt = 0;
-    std::memcpy((void*) &wt, weights, 2);
-    wt |= wt << 16;
-    
-    memcpy((void*) &in, inputs, sizeof(in));
-    uint32_t evenA1 = __UXTB16(in);
-    uint32_t oddA1  = __UXTB16_RORn(in, 8);
-    uint32_t neg_evenA1 = __SSUB16(0, evenA1);
-    uint32_t neg_oddA1 = __SSUB16(0, oddA1);
-
-    __USUB16(0, wt & 0x40001);  
-    mac_result = __SEL(evenA1, neg_evenA1);
-    weightedSum = __SADD16(mac_result, weightedSum);  
-
-    __USUB16(0, wt & 0x80002);  
-    mac_result = __SEL(oddA1, neg_oddA1);
-    weightedSum = __SADD16(mac_result, weightedSum);  
-
-
-    memcpy((void*) &in, inputs + 4, sizeof(in));
-    evenA1 = __UXTB16(in);
-    oddA1  = __UXTB16_RORn(in, 8);
-    neg_evenA1 = __SSUB16(0, evenA1);
-    neg_oddA1 = __SSUB16(0, oddA1);
-
-    __USUB16(0, wt & 0x400010);  
-    mac_result = __SEL(evenA1, neg_evenA1);
-    weightedSum = __SADD16(mac_result, weightedSum);  
-
-    __USUB16(0, wt & 0x800020);  
-    mac_result = __SEL(oddA1, neg_oddA1);
-    weightedSum = __SADD16(mac_result, weightedSum); 
-
-
-    memcpy((void*) &in, inputs + 8, sizeof(in));
-    evenA1 = __UXTB16(in);
-    oddA1  = __UXTB16_RORn(in, 8);
-    neg_evenA1 = __SSUB16(0, evenA1);
-    neg_oddA1 = __SSUB16(0, oddA1);
-
-    __USUB16(0, wt & 0x4000100);  
-    mac_result = __SEL(evenA1, neg_evenA1);
-    weightedSum = __SADD16(mac_result, weightedSum);  
-
-    __USUB16(0, wt & 0x8000200);  
-    mac_result = __SEL(oddA1, neg_oddA1);
-    weightedSum = __SADD16(mac_result, weightedSum);  
-
-
-    memcpy((void*) &in, inputs + 12, sizeof(in));
-    evenA1 = __UXTB16(in);
-    oddA1  = __UXTB16_RORn(in, 8);
-    neg_evenA1 = __SSUB16(0, evenA1);
-    neg_oddA1 = __SSUB16(0, oddA1);
-
-    __USUB16(0, wt & 0x40001000);  
-    mac_result = __SEL(evenA1, neg_evenA1);
-    weightedSum = __SADD16(mac_result, weightedSum);  
-
-    __USUB16(0, wt & 0x80002000);  
-    mac_result = __SEL(oddA1, neg_oddA1);
-    weightedSum = __SADD16(mac_result, weightedSum);
-
-    return weightedSum;
-}
-
-template<typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
-         && std::numeric_limits<Input_T>::digits == 8)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-Sum_T octoquadMac (const Input_T* __restrict inputs,
-                   const Weight_T* __restrict weights,
-                   Sum_T weightedSum)
-{
-    uint32_t mac_result = 0;
-    uint32_t in;
-    uint32_t wt;
-    memcpy((void*) &wt, weights, 4);
-    uint32_t wt1 = __PKHBT(wt, wt, 16);
-    uint32_t wt2 = __PKHTB(wt, wt, 16);
-
-    memcpy((void*) &in, inputs, sizeof(in));
-    uint32_t evenA1 = __UXTB16(in);
-    uint32_t oddA1  = __UXTB16_RORn(in, 8);
-    uint32_t neg_evenA1 = __SSUB16(0, evenA1);
-    uint32_t neg_oddA1 = __SSUB16(0, oddA1);
-
-    __USUB16(0, wt & 0x40001);  
-    mac_result = __SEL(evenA1, neg_evenA1);
-    weightedSum = __SADD16(mac_result, weightedSum);  
-
-    __USUB16(0, wt & 0x80002);  
-    mac_result = __SEL(oddA1, neg_oddA1);
-    weightedSum = __SADD16(mac_result, weightedSum);  
-
-
-    memcpy((void*) &in, inputs + 4, sizeof(in));
-    evenA1 = __UXTB16(in);
-    oddA1  = __UXTB16_RORn(in, 8);
-    neg_evenA1 = __SSUB16(0, evenA1);
-    neg_oddA1 = __SSUB16(0, oddA1);
-
-    __USUB16(0, wt & 0x400010);  
-    mac_result = __SEL(evenA1, neg_evenA1);
-    weightedSum = __SADD16(mac_result, weightedSum);  
-
-    __USUB16(0, wt & 0x800020);  
-    mac_result = __SEL(oddA1, neg_oddA1);
-    weightedSum = __SADD16(mac_result, weightedSum); 
-
-
-    memcpy((void*) &in, inputs + 8, sizeof(in));
-    evenA1 = __UXTB16(in);
-    oddA1  = __UXTB16_RORn(in, 8);
-    neg_evenA1 = __SSUB16(0, evenA1);
-    neg_oddA1 = __SSUB16(0, oddA1);
-
-    __USUB16(0, wt & 0x4000100);  
-    mac_result = __SEL(evenA1, neg_evenA1);
-    weightedSum = __SADD16(mac_result, weightedSum);  
-
-    __USUB16(0, wt & 0x8000200);  
-    mac_result = __SEL(oddA1, neg_oddA1);
-    weightedSum = __SADD16(mac_result, weightedSum);  
-
-
-    memcpy((void*) &in, inputs + 12, sizeof(in));
-    evenA1 = __UXTB16(in);
-    oddA1  = __UXTB16_RORn(in, 8);
-    neg_evenA1 = __SSUB16(0, evenA1);
-    neg_oddA1 = __SSUB16(0, oddA1);
-
-    __USUB16(0, wt & 0x40001000);  
-    mac_result = __SEL(evenA1, neg_evenA1);
-    weightedSum = __SADD16(mac_result, weightedSum);  
-
-    __USUB16(0, wt & 0x80002000);  
-    mac_result = __SEL(oddA1, neg_oddA1);
-    weightedSum = __SADD16(mac_result, weightedSum);
-
-
-    memcpy((void*) &in, inputs + 16, sizeof(in));
-    evenA1 = __UXTB16(in);
-    oddA1  = __UXTB16_RORn(in, 8);
-    neg_evenA1 = __SSUB16(0, evenA1);
-    neg_oddA1 = __SSUB16(0, oddA1);
-
-    __USUB16(0, wt2 & 0x40001);  
-    mac_result = __SEL(evenA1, neg_evenA1);
-    weightedSum = __SADD16(mac_result, weightedSum);  
-
-    __USUB16(0, wt2 & 0x80002);  
-    mac_result = __SEL(oddA1, neg_oddA1);
-    weightedSum = __SADD16(mac_result, weightedSum);  
-
-
-    memcpy((void*) &in, inputs + 20, sizeof(in));
-    evenA1 = __UXTB16(in);
-    oddA1  = __UXTB16_RORn(in, 8);
-    neg_evenA1 = __SSUB16(0, evenA1);
-    neg_oddA1 = __SSUB16(0, oddA1);
-
-    __USUB16(0, wt2 & 0x400010);  
-    mac_result = __SEL(evenA1, neg_evenA1);
-    weightedSum = __SADD16(mac_result, weightedSum);  
-
-    __USUB16(0, wt2 & 0x800020);  
-    mac_result = __SEL(oddA1, neg_oddA1);
-    weightedSum = __SADD16(mac_result, weightedSum); 
-
-
-    memcpy((void*) &in, inputs + 24, sizeof(in));
-    evenA1 = __UXTB16(in);
-    oddA1  = __UXTB16_RORn(in, 8);
-    neg_evenA1 = __SSUB16(0, evenA1);
-    neg_oddA1 = __SSUB16(0, oddA1);
-
-    __USUB16(0, wt2 & 0x4000100);  
-    mac_result = __SEL(evenA1, neg_evenA1);
-    weightedSum = __SADD16(mac_result, weightedSum);  
-
-    __USUB16(0, wt2 & 0x8000200);  
-    mac_result = __SEL(oddA1, neg_oddA1);
-    weightedSum = __SADD16(mac_result, weightedSum);  
-
-
-    memcpy((void*) &in, inputs + 28, sizeof(in));
-    evenA1 = __UXTB16(in);
-    oddA1  = __UXTB16_RORn(in, 8);
-    neg_evenA1 = __SSUB16(0, evenA1);
-    neg_oddA1 = __SSUB16(0, oddA1);
-
-    __USUB16(0, wt2 & 0x40001000);  
-    mac_result = __SEL(evenA1, neg_evenA1);
-    weightedSum = __SADD16(mac_result, weightedSum);  
-
-    __USUB16(0, wt2 & 0x80002000);  
-    mac_result = __SEL(oddA1, neg_oddA1);
-    weightedSum = __SADD16(mac_result, weightedSum);
-
-    return weightedSum;
-}
-
-// ----------------------------------------------------------------------------
-// ----------------- MAC computing functions for kernel -----------------------
-// ------------------------------- 1W / 7A ------------------------------------
-// ----------------------------------------------------------------------------
-
-template<typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
-         && std::numeric_limits<Input_T>::digits == 7)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-Sum_T octoMac (const Input_T* __restrict inputs,
-                  const Weight_T* __restrict weights,
-                  Sum_T weightedSum)
-{
-    uint32_t mac_result = 0;
-    uint32_t in;
-    uint32_t neg_in;
-    uint32_t wt = 0;
-    std::memcpy((void*) &wt, weights, 1);
-    wt |= wt << 8;
-    wt |= wt << 16;
-
-    memcpy((void*) &in, inputs, sizeof(in));
-
-    // Sign extend 
-    if (!std::is_unsigned<Input_T>::value)
-        in = (in + 0xC0C0C0C0) ^ 0xC0C0C0C0;
-
-    neg_in = __SSUB8(0, in);
-    __USUB8(0, wt & 0x08040201);  
-    mac_result = __SEL(in, neg_in);
-    weightedSum = __SXTAB16(weightedSum, mac_result);
-    weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); 
-
-    memcpy((void*) &in, inputs + 4, sizeof(in));
-
-    // Sign extend 
-    if (!std::is_unsigned<Input_T>::value)
-        in = (in + 0xC0C0C0C0) ^ 0xC0C0C0C0;
-
-    neg_in = __SSUB8(0, in);
-    __USUB8(0, wt & 0x80402010);  
-    mac_result = __SEL(in, neg_in);
-    weightedSum = __SXTAB16(weightedSum, mac_result);
-    weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); 
-
-    return weightedSum;
-}
-
-template<typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
-         && std::numeric_limits<Input_T>::digits == 7)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-Sum_T quadquadMac (const Input_T* __restrict inputs,
-                   const Weight_T* __restrict weights,
-                   Sum_T weightedSum)
-{
-    uint32_t mac_result = 0;
-    uint32_t in;
-    uint32_t neg_in;
-    uint32_t wt = 0;
-    std::memcpy((void*) &wt, weights, 4);
-
-    memcpy((void*) &in, inputs, sizeof(in));
-    neg_in = __SSUB8(0, in);
-    __USUB8(0, wt & 0x01010101);  
-    mac_result = __SEL(in, neg_in);
-    weightedSum = __SXTAB16(weightedSum, mac_result);
-    weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); 
-
-    memcpy((void*) &in, inputs + 4, sizeof(in));
-    neg_in = __SSUB8(0, in);
-    __USUB8(0, wt & 0x02020202);  
-    mac_result = __SEL(in, neg_in);
-    weightedSum = __SXTAB16(weightedSum, mac_result);
-    weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); 
-
-    memcpy((void*) &in, inputs + 8, sizeof(in));
-    neg_in = __SSUB8(0, in);
-    __USUB8(0, wt & 0x04040404);  
-    mac_result = __SEL(in, neg_in);
-    weightedSum = __SXTAB16(weightedSum, mac_result);
-    weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); 
-
-    memcpy((void*) &in, inputs + 12, sizeof(in));
-    neg_in = __SSUB8(0, in);
-    __USUB8(0, wt & 0x08080808);  
-    mac_result = __SEL(in, neg_in);
-    weightedSum = __SXTAB16(weightedSum, mac_result);
-    weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); 
-
-    return weightedSum;
-}
-
-template<typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
-         && std::numeric_limits<Input_T>::digits == 7)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-Sum_T octoquadMac (const Input_T* __restrict inputs,
-                   const Weight_T* __restrict weights,
-                   Sum_T weightedSum)
-{
-    uint32_t mac_result = 0;
-    uint32_t in;
-    uint32_t neg_in;
-    uint32_t wt = 0;
-    std::memcpy((void*) &wt, weights, 4);
-
-    memcpy((void*) &in, inputs, sizeof(in));
-    neg_in = __SSUB8(0, in);
-    __USUB8(0, wt & 0x01010101);  
-    mac_result = __SEL(in, neg_in);
-    weightedSum = __SXTAB16(weightedSum, mac_result);
-    weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); 
-
-    memcpy((void*) &in, inputs + 4, sizeof(in));
-    neg_in = __SSUB8(0, in);
-    __USUB8(0, wt & 0x02020202);  
-    mac_result = __SEL(in, neg_in);
-    weightedSum = __SXTAB16(weightedSum, mac_result);
-    weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); 
-
-    memcpy((void*) &in, inputs + 8, sizeof(in));
-    neg_in = __SSUB8(0, in);
-    __USUB8(0, wt & 0x04040404);  
-    mac_result = __SEL(in, neg_in);
-    weightedSum = __SXTAB16(weightedSum, mac_result);
-    weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); 
-
-    memcpy((void*) &in, inputs + 12, sizeof(in));
-    neg_in = __SSUB8(0, in);
-    __USUB8(0, wt & 0x08080808);  
-    mac_result = __SEL(in, neg_in);
-    weightedSum = __SXTAB16(weightedSum, mac_result);
-    weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); 
-
-    memcpy((void*) &in, inputs + 16, sizeof(in));
-    neg_in = __SSUB8(0, in);
-    __USUB8(0, wt & 0x10101010);  
-    mac_result = __SEL(in, neg_in);
-    weightedSum = __SXTAB16(weightedSum, mac_result);
-    weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); 
-
-    memcpy((void*) &in, inputs + 20, sizeof(in));
-    neg_in = __SSUB8(0, in);
-    __USUB8(0, wt & 0x20202020);  
-    mac_result = __SEL(in, neg_in);
-    weightedSum = __SXTAB16(weightedSum, mac_result);
-    weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); 
-
-    memcpy((void*) &in, inputs + 24, sizeof(in));
-    neg_in = __SSUB8(0, in);
-    __USUB8(0, wt & 0x40404040);  
-    mac_result = __SEL(in, neg_in);
-    weightedSum = __SXTAB16(weightedSum, mac_result);
-    weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); 
-
-    memcpy((void*) &in, inputs + 28, sizeof(in));
-    neg_in = __SSUB8(0, in);
-    __USUB8(0, wt & 0x80808080);  
-    mac_result = __SEL(in, neg_in);
-    weightedSum = __SXTAB16(weightedSum, mac_result);
-    weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); 
-
-    return weightedSum;
-}
-
-// ----------------------------------------------------------------------------
-// ----------------- MAC computing functions for kernel -----------------------
-// ------------------------------- 1W / 5A ------------------------------------
-// ----------------------------------------------------------------------------
-
-template<typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
-         && std::numeric_limits<Input_T>::digits == 5)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-Sum_T octoMac (const Input_T* __restrict inputs,
-                  const Weight_T* __restrict weights,
-                  Sum_T weightedSum)
-{
-    uint32_t sum = 0;
-    uint32_t mac_result = 0;
-    uint32_t in;
-    uint32_t neg_in;
-    uint32_t wt = 0;
-    std::memcpy((void*) &wt, weights, 1);
-    wt |= wt << 8;
-    wt |= wt << 16;
-
-    memcpy((void*) &in, inputs, sizeof(in));
-
-    // Sign extend 
-    if (!std::is_unsigned<Input_T>::value)
-        in = (in + 0x70707070) ^ 0x70707070;
-
-    neg_in = __SSUB8(0, in);
-    __USUB8(0, wt & 0x08040201);  
-    sum = __SEL(in, neg_in);
-
-    memcpy((void*) &in, inputs + 4, sizeof(in));
-
-    // Sign extend 
-    if (!std::is_unsigned<Input_T>::value)
-        in = (in + 0x70707070) ^ 0x70707070;
-
-    neg_in = __SSUB8(0, in);
-    __USUB8(0, wt & 0x80402010);  
-    mac_result = __SEL(in, neg_in);
-      
-    sum = __QADD8(sum, mac_result);
-
-    return weightedSum;
-}
-
-template<typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
-         && std::numeric_limits<Input_T>::digits == 5)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-Sum_T quadquadMac (const Input_T* __restrict inputs,
-                   const Weight_T* __restrict weights,
-                   Sum_T weightedSum)
-{
-    uint32_t sum = 0;
-    uint32_t mac_result = 0;
-    uint32_t in;
-    uint32_t neg_in;
-    uint32_t wt = 0;
-    std::memcpy((void*) &wt, weights, 2);
-
-    memcpy((void*) &in, inputs, sizeof(in));
-    neg_in = __SSUB8(0, in);
-    __USUB8(0, wt & 0x01010101);  
-    sum = __SEL(in, neg_in);
-
-    memcpy((void*) &in, inputs + 4, sizeof(in));
-    neg_in = __SSUB8(0, in);
-    __USUB8(0, wt & 0x02020202);   
-    mac_result = __SEL(in, neg_in);
-    sum = __QADD8(sum, mac_result);
-
-    memcpy((void*) &in, inputs + 8, sizeof(in));
-    neg_in = __SSUB8(0, in);
-    __USUB8(0, wt & 0x04040404);   
-    mac_result = __SEL(in, neg_in);
-    sum = __QADD8(sum, mac_result);
-
-    memcpy((void*) &in, inputs + 12, sizeof(in));
-    neg_in = __SSUB8(0, in);
-    __USUB8(0, wt & 0x08080808);   
-    mac_result = __SEL(in, neg_in);
-    sum = __QADD8(sum, mac_result);
-
-    weightedSum = __SXTAB16(weightedSum, sum);
-    weightedSum = __SXTAB16_RORn(weightedSum, sum, 8);
-
-    return weightedSum;
-}
-
-template<typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
-         && std::numeric_limits<Input_T>::digits == 5)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-Sum_T octoquadMac (const Input_T* __restrict inputs,
-                   const Weight_T* __restrict weights,
-                   Sum_T weightedSum)
-{
-    uint32_t sum = 0;
-    uint32_t mac_result = 0;
-    uint32_t in;
-    uint32_t neg_in;
-    uint32_t wt = 0;
-    std::memcpy((void*) &wt, weights, 4);
-
-    memcpy((void*) &in, inputs, sizeof(in));
-    neg_in = __SSUB8(0, in);
-    __USUB8(0, wt & 0x01010101);  
-    sum = __SEL(in, neg_in);
-
-    memcpy((void*) &in, inputs + 4, sizeof(in));
-    neg_in = __SSUB8(0, in);
-    __USUB8(0, wt & 0x02020202);   
-    mac_result = __SEL(in, neg_in);
-    sum = __QADD8(sum, mac_result);
-
-    memcpy((void*) &in, inputs + 8, sizeof(in));
-    neg_in = __SSUB8(0, in);
-    __USUB8(0, wt & 0x04040404);   
-    mac_result = __SEL(in, neg_in);
-    sum = __QADD8(sum, mac_result);
-
-    memcpy((void*) &in, inputs + 12, sizeof(in));
-    neg_in = __SSUB8(0, in);
-    __USUB8(0, wt & 0x08080808);   
-    mac_result = __SEL(in, neg_in);
-    sum = __QADD8(sum, mac_result);
-
-    memcpy((void*) &in, inputs + 16, sizeof(in));
-    neg_in = __SSUB8(0, in);
-    __USUB8(0, wt & 0x10101010);  
-    mac_result = __SEL(in, neg_in);
-    sum = __QADD8(sum, mac_result);
-
-    memcpy((void*) &in, inputs + 20, sizeof(in));
-    neg_in = __SSUB8(0, in);
-    __USUB8(0, wt & 0x20202020);   
-    mac_result = __SEL(in, neg_in);
-    sum = __QADD8(sum, mac_result);
-
-    memcpy((void*) &in, inputs + 24, sizeof(in));
-    neg_in = __SSUB8(0, in);
-    __USUB8(0, wt & 0x40404040);   
-    mac_result = __SEL(in, neg_in);
-    sum = __QADD8(sum, mac_result);
-
-    memcpy((void*) &in, inputs + 28, sizeof(in));
-    neg_in = __SSUB8(0, in);
-    __USUB8(0, wt & 0x80808080);   
-    mac_result = __SEL(in, neg_in);
-    sum = __QADD8(sum, mac_result);
-
-    weightedSum = __SXTAB16(weightedSum, sum);
-    weightedSum = __SXTAB16_RORn(weightedSum, sum, 8);
-
-    return weightedSum;
-}
-
-
-template<int NB_ITERATIONS,
-         typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
-         && std::numeric_limits<Input_T>::digits <= 8 
-         && std::numeric_limits<Input_T>::digits > 4
-         && NB_ITERATIONS >= 8 && NB_ITERATIONS < 16)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-void macsOnRange (const Input_T* __restrict inputs,
-                  const Weight_T* __restrict weights,
-                  Sum_T& weightedSum)
-{
-    weightedSum = octoMac(inputs, weights, weightedSum);
-    macsOnRange<NB_ITERATIONS-8>(inputs + 8, weights + 1, weightedSum);
-}
-
-template<int NB_ITERATIONS,
-         typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
-         && std::numeric_limits<Input_T>::digits <= 8 
-         && std::numeric_limits<Input_T>::digits > 4
-         && NB_ITERATIONS >= 16 && NB_ITERATIONS < 32)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-void macsOnRange (const Input_T* __restrict inputs,
-                  const Weight_T* __restrict weights,
-                  Sum_T& weightedSum)
+template<typename Input_T>
+inline static
+uint32_t XTB16(uint32_t val)
 {
-    weightedSum = quadquadMac(inputs, weights, weightedSum);
-    macsOnRange<NB_ITERATIONS-16>(inputs + 16, weights + 2, weightedSum);
+    return std::is_unsigned<Input_T>::value ? __UXTB16(val) : __SXTB16(val);
 }
 
-template<int NB_ITERATIONS,
-         typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
-         && std::numeric_limits<Input_T>::digits <= 8 
-         && std::numeric_limits<Input_T>::digits > 4
-         && NB_ITERATIONS >= 32)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-void macsOnRange (const Input_T* __restrict inputs,
-                  const Weight_T* __restrict weights,
-                  Sum_T& weightedSum)
+template<int INPUTS_INC = 1,
+         int WEIGHTS_INC = 1,
+         typename Input_T,
+         typename Weight_T,
+         typename Sum_T>
+inline static
+Sum_T dualMac(const Input_T* __restrict inputs,
+              const Weight_T* __restrict weights,
+              Sum_T weightedSum)
 {
-    weightedSum = octoquadMac(inputs, weights, weightedSum);
-    macsOnRange<NB_ITERATIONS-32>(inputs + 32, weights + 4, weightedSum);
-}
-
+    weightedSum += inputs[0] * weights[0]
+        + inputs[INPUTS_INC] * weights[WEIGHTS_INC];
 
-// ----------------------------------------------------------------------------
-// ----------------- MAC computing functions for kernel -----------------------
-// ------------------------------- 1W / 4A ------------------------------------
-// ----------------------------------------------------------------------------
-
-template<typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
-         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-Sum_T monoMac (const Input_T* __restrict inputs,
-               const Weight_T* __restrict weights,
-               Sum_T weightedSum)
-{
-    weightedSum += (weights[0].fields.op7) ? (Sum_T)(-(inputs[0].fields.op1)) : (Sum_T)(inputs[0].fields.op1);
     return weightedSum;
 }
 
-template<int NB_ITERATIONS,
-         typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
-         && std::numeric_limits<Input_T>::digits == 4
-         && NB_ITERATIONS == 2)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-void macsOnRange (const Input_T* __restrict inputs,
-                  const Weight_T* __restrict weights,
-                  Sum_T& weightedSum)
-{
-    weightedSum += (weights[0].fields.op7) ? (Sum_T)(-(inputs[0].fields.op1)) : (Sum_T)(inputs[0].fields.op1);
-    weightedSum += (weights[0].fields.op6) ? (Sum_T)(-(inputs[0].fields.op0)) : (Sum_T)(inputs[0].fields.op0);
-}
-
-template<int NB_ITERATIONS,
-         typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
-         && std::numeric_limits<Input_T>::digits == 4
-         && NB_ITERATIONS == 3)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-void macsOnRange (const Input_T* __restrict inputs,
-                  const Weight_T* __restrict weights,
-                  Sum_T& weightedSum)
-{
-    weightedSum += (weights[0].fields.op7) ? (Sum_T)(-(inputs[0].fields.op1)) : (Sum_T)(inputs[0].fields.op1);
-    weightedSum += (weights[0].fields.op6) ? (Sum_T)(-(inputs[0].fields.op0)) : (Sum_T)(inputs[0].fields.op0);
-    weightedSum += (weights[0].fields.op5) ? (Sum_T)(-(inputs[1].fields.op1)) : (Sum_T)(inputs[1].fields.op1);
-}
-
-template<int NB_ITERATIONS,
-         typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
-         && std::numeric_limits<Input_T>::digits == 4
-         && NB_ITERATIONS == 4)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-void macsOnRange (const Input_T* __restrict inputs,
-                  const Weight_T* __restrict weights,
-                  Sum_T& weightedSum)
-{
-    weightedSum += (weights[0].fields.op7) ? (Sum_T)(-(inputs[0].fields.op1)) : (Sum_T)(inputs[0].fields.op1);
-    weightedSum += (weights[0].fields.op6) ? (Sum_T)(-(inputs[0].fields.op0)) : (Sum_T)(inputs[0].fields.op0);
-    weightedSum += (weights[0].fields.op5) ? (Sum_T)(-(inputs[1].fields.op1)) : (Sum_T)(inputs[1].fields.op1);
-    weightedSum += (weights[0].fields.op4) ? (Sum_T)(-(inputs[1].fields.op0)) : (Sum_T)(inputs[1].fields.op0);
-}
-
-template<int NB_ITERATIONS,
-         typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
-         && std::numeric_limits<Input_T>::digits == 4
-         && NB_ITERATIONS == 5)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-void macsOnRange (const Input_T* __restrict inputs,
-                  const Weight_T* __restrict weights,
-                  Sum_T& weightedSum)
-{
-    weightedSum += (weights[0].fields.op7) ? (Sum_T)(-(inputs[0].fields.op1)) : (Sum_T)(inputs[0].fields.op1);
-    weightedSum += (weights[0].fields.op6) ? (Sum_T)(-(inputs[0].fields.op0)) : (Sum_T)(inputs[0].fields.op0);
-    weightedSum += (weights[0].fields.op5) ? (Sum_T)(-(inputs[1].fields.op1)) : (Sum_T)(inputs[1].fields.op1);
-    weightedSum += (weights[0].fields.op4) ? (Sum_T)(-(inputs[1].fields.op0)) : (Sum_T)(inputs[1].fields.op0);
-    weightedSum += (weights[0].fields.op3) ? (Sum_T)(-(inputs[2].fields.op1)) : (Sum_T)(inputs[2].fields.op1);
-}
-
-template<int NB_ITERATIONS,
-         typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
-         && std::numeric_limits<Input_T>::digits == 4
-         && NB_ITERATIONS == 6)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-void macsOnRange (const Input_T* __restrict inputs,
-                  const Weight_T* __restrict weights,
-                  Sum_T& weightedSum)
-{
-    weightedSum += (weights[0].fields.op7) ? (Sum_T)(-(inputs[0].fields.op1)) : (Sum_T)(inputs[0].fields.op1);
-    weightedSum += (weights[0].fields.op6) ? (Sum_T)(-(inputs[0].fields.op0)) : (Sum_T)(inputs[0].fields.op0);
-    weightedSum += (weights[0].fields.op5) ? (Sum_T)(-(inputs[1].fields.op1)) : (Sum_T)(inputs[1].fields.op1);
-    weightedSum += (weights[0].fields.op4) ? (Sum_T)(-(inputs[1].fields.op0)) : (Sum_T)(inputs[1].fields.op0);
-    weightedSum += (weights[0].fields.op3) ? (Sum_T)(-(inputs[2].fields.op1)) : (Sum_T)(inputs[2].fields.op1);
-    weightedSum += (weights[0].fields.op2) ? (Sum_T)(-(inputs[2].fields.op0)) : (Sum_T)(inputs[2].fields.op0);
-}
-
-template<int NB_ITERATIONS,
-         typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
-         && std::numeric_limits<Input_T>::digits == 4
-         && NB_ITERATIONS == 7)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-void macsOnRange (const Input_T* __restrict inputs,
-                  const Weight_T* __restrict weights,
-                  Sum_T& weightedSum)
-{
-    weightedSum += (weights[0].fields.op7) ? (Sum_T)(-(inputs[0].fields.op1)) : (Sum_T)(inputs[0].fields.op1);
-    weightedSum += (weights[0].fields.op6) ? (Sum_T)(-(inputs[0].fields.op0)) : (Sum_T)(inputs[0].fields.op0);
-    weightedSum += (weights[0].fields.op5) ? (Sum_T)(-(inputs[1].fields.op1)) : (Sum_T)(inputs[1].fields.op1);
-    weightedSum += (weights[0].fields.op4) ? (Sum_T)(-(inputs[1].fields.op0)) : (Sum_T)(inputs[1].fields.op0);
-    weightedSum += (weights[0].fields.op3) ? (Sum_T)(-(inputs[2].fields.op1)) : (Sum_T)(inputs[2].fields.op1);
-    weightedSum += (weights[0].fields.op2) ? (Sum_T)(-(inputs[2].fields.op0)) : (Sum_T)(inputs[2].fields.op0);
-    weightedSum += (weights[0].fields.op1) ? (Sum_T)(-(inputs[3].fields.op1)) : (Sum_T)(inputs[3].fields.op1);
-}
-
-template<typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
-         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-Sum_T octoMac (const Input_T* __restrict inputs,
-                  const Weight_T* __restrict weights,
-                  Sum_T weightedSum)
+template<int INPUTS_INC = 1,
+         int WEIGHTS_INC = 1,
+         typename Input_T,
+         typename Weight_T,
+         typename Sum_T,
+         typename std::enable_if<std::is_floating_point<Input_T>::value>::type* = nullptr>
+inline static
+Sum_T quadMac(const Input_T* __restrict inputs,
+              const Weight_T* __restrict weights,
+              Sum_T weightedSum)
 {
-    uint32_t sum = 0;
-    uint32_t mac_result = 0;
-    uint32_t in;
-    uint32_t neg_in;
-    uint32_t wt = 0;
-    std::memcpy((void*) &wt, weights, 1);
-    wt |= wt << 8;
-    wt |= wt << 16;
-
-    memcpy((void*) &in, inputs, sizeof(in));
-
-    neg_in = __SSUB8(0, in & 0x0F0F0F0F);
-    __USUB8(0, wt & 0x40100401);   
-    mac_result = __SEL(in, neg_in);
-    sum = __QADD8(sum, mac_result);
-
-    neg_in = __SSUB8(0, (in >> 4) & 0xF0F0F0F0);
-    __USUB8(0, wt & 0x80200802);   
-    mac_result = __SEL(in, neg_in);
-    sum = __QADD8(sum, mac_result);
+    weightedSum += inputs[0*INPUTS_INC] * weights[0*WEIGHTS_INC]
+        + inputs[1*INPUTS_INC] * weights[1*WEIGHTS_INC]
+        + inputs[2*INPUTS_INC] * weights[2*WEIGHTS_INC]
+        + inputs[3*INPUTS_INC] * weights[3*WEIGHTS_INC];
 
     return weightedSum;
 }
 
-template<typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
-         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-Sum_T quadquadMac (const Input_T* __restrict inputs,
-                   const Weight_T* __restrict weights,
-                   Sum_T weightedSum)
+template<int INPUTS_INC = 1,
+         int WEIGHTS_INC = 1,
+         typename Input_T,
+         typename Weight_T,
+         typename Sum_T,
+         typename std::enable_if<!std::is_floating_point<Input_T>::value>::type* = nullptr>
+inline static
+Sum_T quadMac(const Input_T* __restrict inputs,
+              const Weight_T* __restrict weights,
+              Sum_T weightedSum)
 {
-    uint32_t sum = 0;
-    uint32_t mac_result = 0;
-    uint32_t in;
-    uint32_t neg_in;
-    uint32_t wt = 0;
-    std::memcpy((void*) &wt, weights, 4);
-
-    memcpy((void*) &in, inputs, sizeof(in));
-
-    neg_in = __SSUB8(0, in & 0x0F0F0F0F);
-    __USUB8(0, wt & 0x01010101);   
-    mac_result = __SEL(in, neg_in);
-    sum = __QADD8(sum, mac_result);
-
-    neg_in = __SSUB8(0, (in >> 4) & 0x0F0F0F0F);
-    __USUB8(0, wt & 0x02020202);   
-    mac_result = __SEL(in, neg_in);
-    sum = __QADD8(sum, mac_result);
+    if(INPUTS_INC != 1 || WEIGHTS_INC != 1) {
+        weightedSum += inputs[0*INPUTS_INC] * weights[0*WEIGHTS_INC]
+            + inputs[1*INPUTS_INC] * weights[1*WEIGHTS_INC]
+            + inputs[2*INPUTS_INC] * weights[2*WEIGHTS_INC]
+            + inputs[3*INPUTS_INC] * weights[3*WEIGHTS_INC];
 
-    memcpy((void*) &in, inputs + 4, sizeof(in));
-
-    neg_in = __SSUB8(0, in & 0x0F0F0F0F);
-    __USUB8(0, wt & 0x04040404);   
-    mac_result = __SEL(in, neg_in);
-    sum = __QADD8(sum, mac_result);
-
-    neg_in = __SSUB8(0, (in >> 4) & 0x0F0F0F0F);
-    __USUB8(0, wt & 0x08080808);   
-    mac_result = __SEL(in, neg_in);
-    sum = __QADD8(sum, mac_result);
-
-
-    weightedSum = __SXTAB16(weightedSum, sum);
-    weightedSum = __SXTAB16_RORn(weightedSum, sum, 8);
-
-    return weightedSum;
-}
+        return weightedSum;
+    }
 
-template<typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
-         && std::numeric_limits<Input_T>::digits == 4)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-Sum_T octoquadMac (const Input_T* __restrict inputs,
-                   const Weight_T* __restrict weights,
-                   Sum_T weightedSum)
-{
-    uint32_t sum = 0;
-    uint32_t mac_result = 0;
+    // Inputs loading & preparation
     uint32_t in;
-    uint32_t neg_in;
-    uint32_t wt = 0;
-    std::memcpy((void*) &wt, weights, 4);
-
     memcpy((void*) &in, inputs, sizeof(in));
 
-    neg_in = __SSUB8(0, in & 0x0F0F0F0F);
-    __USUB8(0, wt & 0x01010101);   
-    mac_result = __SEL(in, neg_in);
-    sum = __QADD8(sum, mac_result);
-
-    neg_in = __SSUB8(0, (in >> 4) & 0x0F0F0F0F);
-    __USUB8(0, wt & 0x02020202);   
-    mac_result = __SEL(in, neg_in);
-    sum = __QADD8(sum, mac_result);
-
-    memcpy((void*) &in, inputs + 4, sizeof(in));
-
-    neg_in = __SSUB8(0, in & 0x0F0F0F0F);
-    __USUB8(0, wt & 0x04040404);   
-    mac_result = __SEL(in, neg_in);
-    sum = __QADD8(sum, mac_result);
-
-    neg_in = __SSUB8(0, (in >> 4) & 0x0F0F0F0F);
-    __USUB8(0, wt & 0x08080808);   
-    mac_result = __SEL(in, neg_in);
-    sum = __QADD8(sum, mac_result);
-
-    memcpy((void*) &in, inputs + 8, sizeof(in));
-
-    neg_in = __SSUB8(0, in & 0x0F0F0F0F);
-    __USUB8(0, wt & 0x10101010);   
-    mac_result = __SEL(in, neg_in);
-    sum = __QADD8(sum, mac_result);
+    uint32_t in1 = XTB16<Input_T>(in);
+    uint32_t in2 = XTB16<Input_T>(in >> 8);
 
-    neg_in = __SSUB8(0, (in >> 4) & 0x0F0F0F0F);
-    __USUB8(0, wt & 0x20202020);   
-    mac_result = __SEL(in, neg_in);
-    sum = __QADD8(sum, mac_result);
-
-    memcpy((void*) &in, inputs + 12, sizeof(in));
+    // Weights loading & preparation
+    uint32_t wt;
+    memcpy((void*) &wt, weights, sizeof(wt));
 
-    neg_in = __SSUB8(0, in & 0x0F0F0F0F);
-    __USUB8(0, wt & 0x40404040);   
-    mac_result = __SEL(in, neg_in);
-    sum = __QADD8(sum, mac_result);
+    uint32_t wt1 = XTB16<Weight_T>(wt);
+    uint32_t wt2 = XTB16<Weight_T>(wt >> 8);
 
-    neg_in = __SSUB8(0, (in >> 4) & 0x0F0F0F0F);
-    __USUB8(0, wt & 0x80808080);   
-    mac_result = __SEL(in, neg_in);
-    sum = __QADD8(sum, mac_result);
+    // Computation
+    if(std::is_same<Sum_T, int32_t>::value) {
+        weightedSum = __SMLAD(in1, wt1, weightedSum);
+        weightedSum = __SMLAD(in2, wt2, weightedSum);
+    }
+    else {
+        weightedSum = __SMLALD(in1, wt1, weightedSum);
+        weightedSum = __SMLALD(in2, wt2, weightedSum);
 
-    weightedSum = __SXTAB16(weightedSum, sum);
-    weightedSum = __SXTAB16_RORn(weightedSum, sum, 8);
+    }
 
     return weightedSum;
 }
 
 
-template<int NB_ITERATIONS,
-         typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
-         && std::numeric_limits<Input_T>::digits == 4
-         && NB_ITERATIONS >= 8 && NB_ITERATIONS < 16)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-void macsOnRange (const Input_T* __restrict inputs,
-                  const Weight_T* __restrict weights,
-                  Sum_T& weightedSum)
-{
-    weightedSum = octoMac(inputs, weights, weightedSum);
-    macsOnRange<NB_ITERATIONS-8>(inputs + 4, weights + 1, weightedSum);
-}
 
-template<int NB_ITERATIONS,
-         typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
-         && std::numeric_limits<Input_T>::digits == 4
-         && NB_ITERATIONS >= 16 && NB_ITERATIONS < 32)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-void macsOnRange (const Input_T* __restrict inputs,
-                  const Weight_T* __restrict weights,
-                  Sum_T& weightedSum)
-{
-    weightedSum = quadquadMac(inputs, weights, weightedSum);
-    macsOnRange<NB_ITERATIONS-16>(inputs + 8, weights + 2, weightedSum);
-}
+// **************************************************************************
+// * Multiply-accumulate the values in inputs and weights for NB_ITERATIONS *
+// **************************************************************************
 
 template<int NB_ITERATIONS,
-         typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(std::numeric_limits<Weight_T>::digits == 1
-         && std::numeric_limits<Input_T>::digits == 4
-         && NB_ITERATIONS >= 32)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-void macsOnRange (const Input_T* __restrict inputs,
-                  const Weight_T* __restrict weights,
-                  Sum_T& weightedSum)
-{
-    weightedSum = octoquadMac(inputs, weights, weightedSum);
-    macsOnRange<NB_ITERATIONS-32>(inputs + 16, weights + 4, weightedSum);
-}
-
-
-// ----------------------------------------------------------------------------
-// -------------- MAC computing functions for kernel 1W-7A --------------------
-// ----------------------------------------------------------------------------
-
-template<typename Input_T,
-         typename std::enable_if<(std::numeric_limits<Input_T>::digits == 7)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-uint32_t quadMacInter(const Input_T* __restrict inputs,
-                      const uint32_t weight,
-                      uint32_t weightedSum)
-{
-    uint32_t in;
-    memcpy((void*) &in, inputs, sizeof(in));
-
-    // Sign extend 
-    if (!std::is_unsigned<Input_T>::value)
-        in = (in + 0xC0C0C0C0) ^ 0xC0C0C0C0;
-
-    uint32_t neg_in = __SSUB8(0, in);
-
-    __USUB8(0, weight);  
-    uint32_t mac_result = __SEL(in, neg_in);
-      
-    uint32_t evenA1 = __SXTB16(mac_result);
-    uint32_t oddA1  = __SXTB16_RORn(mac_result, 8);
-
-    weightedSum = __SADD16(evenA1, weightedSum);  
-    weightedSum = __SADD16(oddA1, weightedSum);  
-
-    return weightedSum;
-}
-
-template<typename Input_T,
-         typename std::enable_if<(std::numeric_limits<Input_T>::digits == 7)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-uint32_t quadMacInterV2(const Input_T* __restrict inputs,
-                      const uint32_t weight,
-                      uint32_t weightedSum)
-{
-    uint32_t in;
-    memcpy((void*) &in, inputs, sizeof(in));
-
-    // Sign extend 
-    if (!std::is_unsigned<Input_T>::value)
-        in = (in + 0xC0C0C0C0) ^ 0xC0C0C0C0;
-
-    uint32_t neg_in = __SSUB8(0, in);
-
-    __USUB8(0, weight);  
-    uint32_t mac_result = __SEL(in, neg_in);
-      
-    weightedSum = __SXTAB16(weightedSum, mac_result);
-    weightedSum = __SXTAB16_RORn(weightedSum, mac_result, 8); 
-
-    return weightedSum;
-}
-
-
-// ----------------------------------------------------------------------------
-// -------------- MAC computing functions for kernel 1W-5A --------------------
-// ----------------------------------------------------------------------------
-
-template<typename Input_T,
-         typename std::enable_if<(std::numeric_limits<Input_T>::digits == 5)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-uint32_t quadMacInter(const Input_T* __restrict inputs,
-                      const uint32_t weight,
-                      uint32_t weightedSum)
-{
-    uint32_t in;
-    memcpy((void*) &in, inputs, sizeof(in));
-
-    // Sign extend 
-    if (!std::is_unsigned<Input_T>::value)
-        in = (in + 0x70707070) ^ 0x70707070;
-
-    uint32_t neg_in = __SSUB8(0, in);
-
-    __USUB8(0, weight);  
-    uint32_t mac_result = __SEL(in, neg_in);
-      
-    weightedSum = __QADD8(weightedSum, mac_result);
-
-    return weightedSum;
-}
-
-
-// ----------------------------------------------------------------------------
-// ------------------- MAC computing general functions ------------------------
-// ----------------------------------------------------------------------------
-
-template<int NB_ITERATIONS, typename Input_T, typename Weight_T, typename Sum_T,
+         int INPUTS_INC = 1,
+         int WEIGHTS_INC = 1,
+         class Input_T,
+         class Weight_T,
+         class Sum_T,
          typename std::enable_if<(NB_ITERATIONS == 0)>::type* = nullptr>
-__attribute__((always_inline)) static inline
+inline static
 void macsOnRange(const Input_T* __restrict /*inputs*/,
                  const Weight_T* __restrict /*weights*/,
-                 Sum_T& /*weightedSum*/)
+                 Sum_T& __restrict /*weightedSum*/)
 {
-    // Nothing should happen
+    // Nothing to do
 }
 
-template<int NB_ITERATIONS, typename Input_T, typename Weight_T, typename Sum_T,
+template<int NB_ITERATIONS,
+         int INPUTS_INC = 1,
+         int WEIGHTS_INC = 1,
+         class Input_T,
+         class Weight_T,
+         class Sum_T,
          typename std::enable_if<(NB_ITERATIONS == 1)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-void macsOnRange(const Input_T* __restrict inputs,
-                 const Weight_T* __restrict weights,
-                 Sum_T& weightedSum)
-{
-    weightedSum = monoMac(inputs, weights, weightedSum);
-}
-
-template<int NB_ITERATIONS, typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(NB_ITERATIONS >= 2 && NB_ITERATIONS < 4 && std::numeric_limits<Weight_T>::digits > 1)>::type* = nullptr>
-__attribute__((always_inline)) static inline
+inline static
 void macsOnRange(const Input_T* __restrict inputs,
                  const Weight_T* __restrict weights,
-                 Sum_T& weightedSum)
+                 Sum_T& __restrict weightedSum)
 {
-    constexpr unsigned int idxI 
-        = (std::numeric_limits<Input_T>::digits > 4) ? 2 : 1;
-    constexpr unsigned int idxW 
-        = (std::numeric_limits<Weight_T>::digits > 4) ? 2 : 1;
-
-    weightedSum = dualMac(inputs, weights, weightedSum);
-    macsOnRange<NB_ITERATIONS - 2>(inputs + idxI, weights + idxW, weightedSum);
+    weightedSum += (*weights) * (*inputs);
 }
 
-template<int NB_ITERATIONS, typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<NB_ITERATIONS >= 4 
-         && (std::numeric_limits<Weight_T>::digits > 4)>::type* = nullptr>
-__attribute__((always_inline)) static inline
+template<int NB_ITERATIONS,
+         int INPUTS_INC = 1,
+         int WEIGHTS_INC = 1,
+         class Input_T,
+         class Weight_T,
+         class Sum_T,
+         typename std::enable_if<(NB_ITERATIONS >= 2 && NB_ITERATIONS < 4)>::type* = nullptr>
+inline static
 void macsOnRange(const Input_T* __restrict inputs,
                  const Weight_T* __restrict weights,
-                 Sum_T& weightedSum)
-{
-    constexpr unsigned int idxI 
-        = (std::numeric_limits<Input_T>::digits > 4) 
-          ? 4 : (std::numeric_limits<Input_T>::digits == 4) ? 2 : 1;
-
-    constexpr unsigned int idxW = 4;
-
-    weightedSum = quadMac(inputs, weights, weightedSum);
-    macsOnRange<NB_ITERATIONS - 4>(inputs + idxI, weights + idxW, weightedSum);
-}
-
-template<int NB_ITERATIONS, typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<(NB_ITERATIONS >= 4 && NB_ITERATIONS < 8) 
-         && (std::numeric_limits<Weight_T>::digits == 4)>::type* = nullptr>
-__attribute__((always_inline)) static inline
+                 Sum_T& __restrict weightedSum)
+{
+    weightedSum = dualMac<INPUTS_INC, WEIGHTS_INC>(inputs, weights, weightedSum);
+    macsOnRange<NB_ITERATIONS - 2, INPUTS_INC, WEIGHTS_INC>(inputs + 2*INPUTS_INC,
+                                                            weights + 2*WEIGHTS_INC,
+                                                            weightedSum);
+}
+
+/**
+ * @brief   MACs Processing
+ * @details Performs NB_ITERATIONS MACs operations, storing results into the
+ *          weightedSum variable.
+ *
+ * @tparam  NB_ITERATIONS   Number of MACs to perform
+ * @tparam  INPUTS_INC      Input Stride
+ * @tparam  WEIGHTS_INC     Weights Stride
+ * @tparam  Input_T         Input Type
+ *
+ * @param   inputs          Pointer to inputs vector
+ * @param   weights         Pointer to weights vector
+ * @param   weightedSum     Pointer to weightedSum
+*/
+template<int NB_ITERATIONS,
+         int INPUTS_INC = 1,
+         int WEIGHTS_INC = 1,
+         class Input_T,
+         class Weight_T,
+         class Sum_T,
+         typename std::enable_if<(NB_ITERATIONS >= 4)>::type* = nullptr>
+inline static
 void macsOnRange(const Input_T* __restrict inputs,
                  const Weight_T* __restrict weights,
-                 Sum_T& weightedSum)
+                 Sum_T& __restrict weightedSum)
 {
-    constexpr unsigned int idxI 
-        = (std::numeric_limits<Input_T>::digits > 4) 
-          ? 4 : (std::numeric_limits<Input_T>::digits == 4) ? 2 : 1;
-
-    constexpr unsigned int idxW = 2;
-
-    weightedSum = quadMac(inputs, weights, weightedSum);
-    macsOnRange<NB_ITERATIONS - 4>(inputs + idxI, weights + idxW, weightedSum);
+    weightedSum = quadMac<INPUTS_INC, WEIGHTS_INC>(inputs, weights, weightedSum);
+    macsOnRange<NB_ITERATIONS - 4, INPUTS_INC, WEIGHTS_INC>(inputs + 4*INPUTS_INC,
+                                                            weights + 4*WEIGHTS_INC,
+                                                            weightedSum);
 }
 
-template<int NB_ITERATIONS, typename Input_T, typename Weight_T, typename Sum_T,
-         typename std::enable_if<NB_ITERATIONS >= 8 
-         && (std::numeric_limits<Weight_T>::digits == 4)>::type* = nullptr>
-__attribute__((always_inline)) static inline
-void macsOnRange(const Input_T* __restrict inputs,
-                 const Weight_T* __restrict weights,
-                 Sum_T& weightedSum)
-{
-    constexpr unsigned int idxI 
-        = (std::numeric_limits<Input_T>::digits > 4) 
-          ? 8 : (std::numeric_limits<Input_T>::digits == 4) 
-            ? 4 : (std::numeric_limits<Input_T>::digits == 2)
-              ? 2 : 1;
-
-    constexpr unsigned int idxW = 4;
-
-    weightedSum = octoMac(inputs, weights, weightedSum);
-    macsOnRange<NB_ITERATIONS - 8>(inputs + idxI, weights + idxW, weightedSum);
-}
 
+}   // N2D2_Export
 
-#endif // __N2D2_MAC_FUNCTIONS_HPP__
+#endif  // __N2D2_EXPORT_CPP_MACS_HPP__
diff --git a/aidge_export_arm_cortexm/_Aidge_Arm/templates/configuration/pool_config.jinja b/aidge_export_arm_cortexm/_Aidge_Arm/templates/configuration/pool_config.jinja
index 363e6f9bcd3f83460a3a2533ce6e243c887442fd..1900612b65ab2c2c581886acd3894659233c64b6 100644
--- a/aidge_export_arm_cortexm/_Aidge_Arm/templates/configuration/pool_config.jinja
+++ b/aidge_export_arm_cortexm/_Aidge_Arm/templates/configuration/pool_config.jinja
@@ -2,7 +2,7 @@
 #ifndef {{ name|upper }}_LAYER_H
 #define {{ name|upper }}_LAYER_H
 
-{# #include "typedefs.h" #}
+{# #include "kernels/typedefs.h" #}
 
 {# For layer configuration -#}
 {% include "./_def_io.jinja" %}
diff --git a/aidge_export_arm_cortexm/boards/stm32/H7/Inc/assert.h b/aidge_export_arm_cortexm/boards/stm32/H7/Inc/assert.h
index bb0ec19d8544bf8c8830cb85da00f7e065422cff..fab32bda495f9a19850ebd1e38830e9f0bdd858e 100644
--- a/aidge_export_arm_cortexm/boards/stm32/H7/Inc/assert.h
+++ b/aidge_export_arm_cortexm/boards/stm32/H7/Inc/assert.h
@@ -1,33 +1,33 @@
-/*
-    (C) Copyright 2019 CEA LIST. All Rights Reserved.
-    Contributor(s): Olivier BICHLER (olivier.bichler@cea.fr)
+// /*
+//     (C) Copyright 2019 CEA LIST. All Rights Reserved.
+//     Contributor(s): Olivier BICHLER (olivier.bichler@cea.fr)
 
-    This file is not part of the open source version of N2D2 and is NOT under
-    the CeCILL-C license. This code is the property of the CEA. It can not be
-    copied or disseminated without its authorization.
-*/
-#ifndef ASSERT_H
-#define ASSERT_H
+//     This file is not part of the open source version of N2D2 and is NOT under
+//     the CeCILL-C license. This code is the property of the CEA. It can not be
+//     copied or disseminated without its authorization.
+// */
+// #ifndef ASSERT_H
+// #define ASSERT_H
 
-#ifdef __cplusplus
-extern "C" {
-#endif 
+// #ifdef __cplusplus
+// extern "C" {
+// #endif
 
-inline void assert_failure(const char* msg, const char* file, int line) {
-//    printf("Assert failure: %s in %s:%d.\r\n", msg, file, line);
-    while(1) {}
-}
+// inline void assert_failure(const char* msg, const char* file, int line) {
+// //    printf("Assert failure: %s in %s:%d.\r\n", msg, file, line);
+//     while(1) {}
+// }
 
-#ifdef NDEBUG
-#define assert(test) ((void)0)
-#define assertm(test, msg) ((void)0)
-#else
-#define assert(test) do { if(!(test)) { assert_failure("error", __FILE__, __LINE__); } } while(0)
-#define assertm(test, msg) do { if(!(test)) { assert_failure(msg, __FILE__, __LINE__); } } while(0)
-#endif 
+// #ifdef NDEBUG
+// #define assert(test) ((void)0)
+// #define assertm(test, msg) ((void)0)
+// #else
+// #define assert(test) do { if(!(test)) { assert_failure("error", __FILE__, __LINE__); } } while(0)
+// #define assertm(test, msg) do { if(!(test)) { assert_failure(msg, __FILE__, __LINE__); } } while(0)
+// #endif
 
-#ifdef __cplusplus
-}
-#endif 
+// #ifdef __cplusplus
+// }
+// #endif
 
-#endif
+// #endif
diff --git a/aidge_export_arm_cortexm/export.py b/aidge_export_arm_cortexm/export.py
index fe7884eb3fa18e6a62bf9ad9bd792d403d1191b8..ebac2d68929399bab55234684a5d35c6c246a9dd 100644
--- a/aidge_export_arm_cortexm/export.py
+++ b/aidge_export_arm_cortexm/export.py
@@ -42,12 +42,11 @@ def gen_board_files(path:str, board:str)->None:
         raise ValueError(f"Board {board} is not supported, supported board are:\n\t-{joint_board_str}")
 
     if isinstance(path, str): path = Path(path)
-    # Create dnn directory is not exist
+    # Create dnn directory if not exist
     dnn_folder = path / "dnn"
     os.makedirs(str(dnn_folder), exist_ok=True)
 
     # Determine which board the user wants
     # to select correct config
-
     # Copy all static files in the export
     shutil.copytree(BOARDS_MAP[board], str(path), dirs_exist_ok=True)
diff --git a/aidge_export_arm_cortexm/operators.py b/aidge_export_arm_cortexm/operators.py
index 5f11b8fe909a6cc57b3d31d654402159021c06e9..2903edefc53199337fb4a658a6fc27ac2ed6b7cc 100644
--- a/aidge_export_arm_cortexm/operators.py
+++ b/aidge_export_arm_cortexm/operators.py
@@ -86,11 +86,11 @@ class Producer_ARMCortexM(ExportNode):
 
     def __init__(self, node, mem_info, conversion_map = datatype_converter_aidge2arm):
         super().__init__(node, mem_info, conversion_map)
-        
+
         weights = self.operator.get_output(0)
 
         self.values = np.array(weights).reshape(-1)
-        
+
 
     def export(self, export_folder: Path):
         header_path = f"include/parameters/{self.attributes['name']}.hpp"
@@ -261,7 +261,7 @@ class Pad_ARMCortexM(ExportNodeCpp):
 
 
 @ExportLibAidgeARM.register("ReLU", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
-class ReLU_ARMCortexM(ExportNodeCpp):
+class ReLU_ARMCortexM_float32(ExportNodeCpp):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
 
@@ -274,7 +274,7 @@ class ReLU_ARMCortexM(ExportNodeCpp):
         ]
 
 @ExportLibAidgeARM.register("Conv2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
-class Conv_ARMCortexM(ExportNodeCpp):
+class Conv_ARMCortexM_float32(ExportNodeCpp):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
         self.attributes["activation"] = "Linear"
@@ -287,27 +287,33 @@ class Conv_ARMCortexM(ExportNodeCpp):
         self.forward_template = str(ROOT / "_Aidge_Arm" / "templates" / "forward_call" / "conv_kernel.jinja")
         self.include_list = []
         self.kernels_to_copy = [
-            str(ROOT / "_Aidge_Arm" / "kernels" / "Convolution" / "Conv.hpp")
+            str(ROOT / "_Aidge_Arm" / "kernels" / "Convolution" / "Conv.hpp"),
+            str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "Macs.hpp"),
+            # str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "swar_arm_acle.h"),
+            str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "nn_scaling_functions.hpp"),
+            str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "typedefs.hpp"),
+            str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "utils.hpp"),
+            str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "assert.h"),
         ]
 
 @ExportLibAidgeARM.register_generic("ArmPadConv2D", aidge_core.ImplSpec([
                                                                 aidge_core.IOSpec(aidge_core.dtype.any),               # Input[0] : Input Spec
                                                                 aidge_core.IOSpec(aidge_core.dtype.dual_int4),      # Input[1] : Weight Spec
                                                                 aidge_core.IOSpec(aidge_core.dtype.int32)           # Input[2] : Bias Spec
-                                                            ], 
+                                                            ],
                                                             [
                                                                 aidge_core.IOSpec(aidge_core.dtype.any)       # Output[0] : Output spec
                                                             ]))
 class PadConvScaling_ARMCortexM(ExportNodeCpp):
     def __init__(self, node, mem_info, conversion_map = datatype_converter_aidge2arm):
         super().__init__(node, mem_info, conversion_map)
-        
+
         self.attributes["activation"] = "Linear"
-        
+
         self.attributes["padding"] = [0, 0]
         if self.operator.attr.has_attr("Pad2D_0"):
             self.attributes["padding"] = self.operator.attr.get_attr("Pad2D_0").get_attr("begin_end_borders")
-        
+
         self.attributes["kernel_dims"] = self.operator.attr.get_attr("Conv2D_0").get_attr("kernel_dims")
         self.attributes["stride_dims"] = self.operator.attr.get_attr("Conv2D_0").get_attr("stride_dims")
         self.attributes["dilation_dims"] = self.operator.attr.get_attr("Conv2D_0").get_attr("dilation_dims")
@@ -329,14 +335,15 @@ class PadConvScaling_ARMCortexM(ExportNodeCpp):
         self.forward_template = str(ROOT / "_Aidge_Arm" / "templates" / "forward_call" / "lowbit_conv_kernel.jinja")
         self.include_list = []
         self.kernels_to_copy = [
-            str(ROOT / "_Aidge_Arm" / "kernels" / "Convolution" / "LowbitConv.hpp"), 
+            str(ROOT / "_Aidge_Arm" / "kernels" / "Convolution" / "LowbitConv.hpp"),
             str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "aidge_supportfunctions.h"),
             str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "Macs.hpp"),
             str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "nn_scaling_functions.hpp"),
             str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "subkernels_functions.hpp"),
             str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "swar_arm_acle.h"),
             str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "typedefs.hpp"),
-            str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "utils.hpp")
+            str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "utils.hpp"),
+            str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "assert.h")
         ]
 
 
@@ -344,20 +351,20 @@ class PadConvScaling_ARMCortexM(ExportNodeCpp):
                                                                 aidge_core.IOSpec(aidge_core.dtype.any),               # Input[0] : Input Spec
                                                                 aidge_core.IOSpec(aidge_core.dtype.dual_int4),      # Input[1] : Weight Spec
                                                                 aidge_core.IOSpec(aidge_core.dtype.int32)           # Input[2] : Bias Spec
-                                                            ], 
+                                                            ],
                                                             [
                                                                 aidge_core.IOSpec(aidge_core.dtype.any)       # Output[0] : Output spec
                                                             ]))
 class ConvScaling_ARMCortexM(ExportNodeCpp):
     def __init__(self, node, mem_info, conversion_map = datatype_converter_aidge2arm):
         super().__init__(node, mem_info, conversion_map)
-        
+
         self.attributes["activation"] = "Linear"
-        
+
         self.attributes["padding"] = [0, 0]
         if self.operator.attr.has_attr("Pad2D_0"):
             self.attributes["padding"] = self.operator.attr.get_attr("Pad2D_0").get_attr("begin_end_borders")
-        
+
         self.attributes["kernel_dims"] = self.operator.attr.get_attr("Conv2D_0").get_attr("kernel_dims")
         self.attributes["stride_dims"] = self.operator.attr.get_attr("Conv2D_0").get_attr("stride_dims")
         self.attributes["dilation_dims"] = self.operator.attr.get_attr("Conv2D_0").get_attr("dilation_dims")
@@ -378,14 +385,15 @@ class ConvScaling_ARMCortexM(ExportNodeCpp):
         self.forward_template = str(ROOT / "_Aidge_Arm" / "templates" / "forward_call" / "lowbit_conv_kernel.jinja")
         self.include_list = []
         self.kernels_to_copy = [
-            str(ROOT / "_Aidge_Arm" / "kernels" / "Convolution" / "LowbitConv.hpp"), 
+            str(ROOT / "_Aidge_Arm" / "kernels" / "Convolution" / "LowbitConv.hpp"),
             str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "aidge_supportfunctions.h"),
             str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "Macs.hpp"),
             str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "nn_scaling_functions.hpp"),
             str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "subkernels_functions.hpp"),
             str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "swar_arm_acle.h"),
             str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "typedefs.hpp"),
-            str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "utils.hpp")
+            str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "utils.hpp"),
+            str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "assert.h")
         ]
 
 
@@ -393,11 +401,11 @@ class ConvScaling_ARMCortexM(ExportNodeCpp):
                                                                 aidge_core.IOSpec(aidge_core.dtype.any),            # Input[0] : Input Spec
                                                                 aidge_core.IOSpec(aidge_core.dtype.dual_int4),      # Input[1] : Weight Spec
                                                                 aidge_core.IOSpec(aidge_core.dtype.int32)           # Input[2] : Bias Spec
-                                                            ], 
+                                                            ],
                                                             [
                                                                 aidge_core.IOSpec(aidge_core.dtype.any)       # Output[0] : Output spec
                                                             ]))
-class FCScaling_ARMCortexM(ExportNodeCpp):
+class FCScaling_ARMCortexM_int4(ExportNodeCpp):
     def __init__(self, node, mem_info, conversion_map = datatype_converter_aidge2arm):
         super().__init__(node, mem_info, conversion_map)
         self.attributes["activation"] = "Linear"
@@ -424,38 +432,42 @@ class FCScaling_ARMCortexM(ExportNodeCpp):
             str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "subkernels_functions.hpp"),
             str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "swar_arm_acle.h"),
             str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "typedefs.hpp"),
-            str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "utils.hpp")
-        ]
-
-@ExportLibAidgeARM.register("MaxPooling2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
-class LowbitPooling_ARMCortexM(ExportNodeCpp):
-    def __init__(self, node, mem_info, conversion_map = datatype_converter_aidge2arm):
-        super().__init__(node, mem_info, conversion_map)
-
-        self.attributes["activation"] = "Linear"
-        self.attributes["pool_type"] = "Max"
-        # No padding with MaxPooling or AvgPooling
-        # Use PaddedMaxPooling/PaddedAvgPooling to add padding attribute
-        self.attributes["padding"] = [0, 0]
-
-        self.attributes["kernel_dims"] = node.get_operator().attr.kernel_dims
-        self.attributes["stride_dims"] = node.get_operator().attr.stride_dims
-
-        self.config_template = str(ROOT / "_Aidge_Arm" / "templates" / "configuration" / "pool_config.jinja")
-        self.forward_template = str(ROOT / "_Aidge_Arm" / "templates" / "forward_call" / "lowbit_pool_kernel.jinja")
-        self.include_list = []
-        self.kernels_to_copy = [
-            str(ROOT / "_Aidge_Arm" / "kernels" / "Pooling" / "LowbitPooling.hpp")
+            str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "utils.hpp"),
+            str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "assert.h")
         ]
 
-        self.kernels_to_copy = [
-            str(ROOT / "_Aidge_Arm" / "kernels" / "Pooling" / "LowbitPooling.hpp"),
-            str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "aidge_supportfunctions.h"),
-            str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "Macs.hpp"),
-            str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "subkernels_functions.hpp"),
-            str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "typedefs.hpp"),
-            str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "utils.hpp")
-        ]
+# FIXME This take the precedence on float32 kernel due to poor management of IOSpec
+# Need to update the IOSpec
+# @ExportLibAidgeARM.register("MaxPooling2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.any)))
+# class LowbitPooling_ARMCortexM(ExportNodeCpp):
+#     def __init__(self, node, mem_info, conversion_map = datatype_converter_aidge2arm):
+#         super().__init__(node, mem_info, conversion_map)
+
+#         self.attributes["activation"] = "Linear"
+#         self.attributes["pool_type"] = "Max"
+#         # No padding with MaxPooling or AvgPooling
+#         # Use PaddedMaxPooling/PaddedAvgPooling to add padding attribute
+#         self.attributes["padding"] = [0, 0]
+
+#         self.attributes["kernel_dims"] = node.get_operator().attr.kernel_dims
+#         self.attributes["stride_dims"] = node.get_operator().attr.stride_dims
+
+#         self.config_template = str(ROOT / "_Aidge_Arm" / "templates" / "configuration" / "pool_config.jinja")
+#         self.forward_template = str(ROOT / "_Aidge_Arm" / "templates" / "forward_call" / "lowbit_pool_kernel.jinja")
+#         self.include_list = []
+#         self.kernels_to_copy = [
+#             str(ROOT / "_Aidge_Arm" / "kernels" / "Pooling" / "LowbitPooling.hpp")
+#         ]
+
+#         self.kernels_to_copy = [
+#             str(ROOT / "_Aidge_Arm" / "kernels" / "Pooling" / "LowbitPooling.hpp"),
+#             str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "aidge_supportfunctions.h"),
+#             str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "Macs.hpp"),
+#             str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "swar_arm_acle.h"),
+#             str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "subkernels_functions.hpp"),
+#             str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "typedefs.hpp"),
+#             str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "utils.hpp")
+#         ]
 
 
 
@@ -464,11 +476,11 @@ class LowbitPooling_ARMCortexM(ExportNodeCpp):
                                                                 aidge_core.IOSpec(aidge_core.dtype.any),    # Input[0] : Input Spec
                                                                 aidge_core.IOSpec(aidge_core.dtype.int4),   # Input[1] : Weight Spec
                                                                 aidge_core.IOSpec(aidge_core.dtype.any)     # Input[2] : Bias Spec
-                                                            ], 
+                                                            ],
                                                             [
                                                                 aidge_core.IOSpec(aidge_core.dtype.int4) # Output[0] : Output spec
                                                             ]))
-class Conv_ARMCortexM(ExportNodeCpp):
+class Conv_ARMCortexM_int4(ExportNodeCpp):
     def __init__(self, node, mem_info, conversion_map = datatype_converter_aidge2arm):
         super().__init__(node, mem_info, conversion_map)
         self.attributes["activation"] = "Linear"
@@ -481,14 +493,15 @@ class Conv_ARMCortexM(ExportNodeCpp):
         self.forward_template = str(ROOT / "_Aidge_Arm" / "templates" / "forward_call" / "lowbit_conv_kernel.jinja")
         self.include_list = []
         self.kernels_to_copy = [
-            str(ROOT / "_Aidge_Arm" / "kernels" / "Convolution" / "LowbitConv.hpp"), 
+            str(ROOT / "_Aidge_Arm" / "kernels" / "Convolution" / "LowbitConv.hpp"),
             str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "aidge_supportfunctions.h"),
             str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "Macs.hpp"),
             str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "nn_scaling_functions.hpp"),
             str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "subkernels_functions.hpp"),
             str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "swar_arm_acle.h"),
             str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "typedefs.hpp"),
-            str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "utils.hpp")
+            str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "utils.hpp"),
+            str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "assert.h")
         ]
 
 @ExportLibAidgeARM.register("ConvDepthWise2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
@@ -574,14 +587,15 @@ class Pooling_ARMCortexM(ExportNodeCpp):
         self.forward_template = str(ROOT / "_Aidge_Arm" / "templates" / "forward_call" / "pool_kernel.jinja")
         self.include_list = []
         self.kernels_to_copy = [
-            str(ROOT / "_Aidge_Arm" / "kernels" / "Pooling" / "Pooling.hpp")
+            str(ROOT / "_Aidge_Arm" / "kernels" / "Pooling" / "Pooling.hpp"),
+            str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "typedefs.hpp")
         ]
         self.kernel = node.get_operator().attr.kernel_dims
         self.stride = node.get_operator().attr.stride_dims
 
 
 @ExportLibAidgeARM.register("FC", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
-class FC_ARMCortexM(ExportNodeCpp):
+class FC_ARMCortexM_float32(ExportNodeCpp):
     def __init__(self, node, mem_info):
         super().__init__(node, mem_info)
         self.attributes["activation"] = "Linear"
@@ -594,7 +608,13 @@ class FC_ARMCortexM(ExportNodeCpp):
         self.forward_template = str(ROOT / "_Aidge_Arm" / "templates" / "forward_call" / "fc_kernel.jinja")
         self.include_list = []
         self.kernels_to_copy = [
-            str(ROOT / "_Aidge_Arm" / "kernels" / "FullyConnected" / "Fc.hpp")
+            str(ROOT / "_Aidge_Arm" / "kernels" / "FullyConnected" / "Fc.hpp"),
+            str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "Macs.hpp"),
+            # str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "swar_arm_acle.h"),
+            str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "nn_scaling_functions.hpp"),
+            str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "typedefs.hpp"),
+            str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "utils.hpp"),
+            str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "assert.h")
         ]
 
 @ExportLibAidgeARM.register("MaxPooling2D", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))
@@ -624,7 +644,13 @@ class FC_ARMCortexM(ExportNodeCpp):
         self.forward_template = str(ROOT / "_Aidge_Arm" / "templates" / "forward_call" / "fc_kernel.jinja")
         self.include_list = []
         self.kernels_to_copy = [
-            str(ROOT / "_Aidge_Arm" / "kernels" / "FullyConnected" / "Fc.hpp")
+            str(ROOT / "_Aidge_Arm" / "kernels" / "FullyConnected" / "Fc.hpp"),
+            str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "Macs.hpp"),
+            # str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "swar_arm_acle.h"),
+            str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "nn_scaling_functions.hpp"),
+            str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "typedefs.hpp"),
+            str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "utils.hpp"),
+            str(ROOT / "_Aidge_Arm" / "kernels" / "Utils" / "assert.h")
         ]
 
 @ExportLibAidgeARM.register("Add", aidge_core.ImplSpec(aidge_core.IOSpec(aidge_core.dtype.float32)))